In [15]:
#Import Libraries
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, Polygon
import os
In [16]:
# Import Folders
data_folder = os.path.abspath("data")
output_folder = os.path.abspath("output")
#check if outfolder exists if not create it
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
In [18]:
#Declare Columns
columns = [
    'sid',
    'stage',
    'asset_id',
    'category_id',
    'subcategory_id',
    'state_code',
    'district_code',
    'block_code',
    'panchayat_code',
    'work_financial_year',
    'completion_financial_year',
    'uuid',
    'ser_time',
    'status',
    'validated_timestamp'
]
#Read Files
chunksize = 500000
total_saved = total_read = 0
records = 0
df_list = []
files_folder = os.path.join(data_folder, 'Scraped Files')
files = os.listdir(files_folder)
files = [os.path.join(files_folder, i) for i in files if i.split('.')[-1] == 'xml']
files = files
file_count = 0
for file in files:
    file_count = file_count + 1
    with open(os.path.join(files_folder, file), 'r') as myfile:
        data = myfile.read().replace('\n', '')
        #Get Column Names
        items = pd.DataFrame(data.split('<item>')[1:])
        if items.shape[0] > 0:
            columns = items[0].values[0].split('<span class="atr-name">')[1:]
            columns = [i.split('</span>')[0] for i in columns]
            #Get All Attributes
            attributes = None
            for column in columns:
                df = items[0].str.split('<span class="atr-name">' + column + '</span>').str[1].str.split('<span class="atr-value">').str[1].str.split('</span>').str[0]
                if attributes is None:
                    attributes = pd.DataFrame(df)
                    attributes.columns = [column]
                else:
                    attributes[column] = df
                df = None
            #Get Geometry
            geometry = pd.DataFrame(items[0].str.split('<georss:point>').str[1])
            geometry = geometry[geometry[0].notnull()]
            geometry = pd.DataFrame(geometry[0].str.split('</georss:point>').str[0].str.split().tolist())
            geometry = geometry.apply(pd.to_numeric)
            geometry.columns = ['latitude', 'longitude']
            #geometry['latitude'] = geometry['latitude'].astype(float)
            #geometry['longitude'] = geometry['longitude'].astype(float)
            #Make Combined files
            df = pd.concat([attributes, geometry], axis=1, join='inner')
            df_list.append(df)
            records = records + df.shape[0]
    #When files size increases Create merged File and dump it to drive
    #print(records)
    if records > chunksize or file_count == len(files):
        df = pd.concat(df_list)
        df = df[(df['longitude'].notnull()) & (df['latitude'].notnull())]
        df_list = []
        records = 0
        _chunksize = df.shape[0]
        total_read = total_saved + _chunksize
        print('Saving a chunk size of ' + str(chunksize) + ' | Total read till now ' + str(total_read) + ' | Total Files Read till now ' + str(file_count))
        out_filename = str(total_saved) + '-' + str(total_read) + '.csv'
        out_filepath = os.path.join(output_folder, out_filename)
        df.to_csv(out_filepath, encoding='utf-8', index=False)
        #Change dataframe to geospatial and check by exporting to shp
        geometry = [Point(xy) for xy in zip(df.longitude, df.latitude)]
        crs = {'init': 'epsg:4326'}
        geo_df = gpd.GeoDataFrame(df, crs=crs, geometry=geometry)
        geo_df.to_file(driver='ESRI Shapefile', filename=out_filepath.split('.')[0]+'.shp')
        total_saved = total_read
        df = None
        geo_df = None
#All done
print('All done Total saved records ' + str(total_saved))
C:\Users\sandyjones\AppData\Local\conda\conda\envs\geo\lib\site-packages\ipykernel_launcher.py:63: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

Saving a chunk size of 500000 | Total read till now 530724 | Total Files Read till now 1031
Saving a chunk size of 500000 | Total read till now 1039186 | Total Files Read till now 1164
Saving a chunk size of 500000 | Total read till now 1551728 | Total Files Read till now 1235
Saving a chunk size of 500000 | Total read till now 2063111 | Total Files Read till now 1273
Saving a chunk size of 500000 | Total read till now 2576537 | Total Files Read till now 1322
Saving a chunk size of 500000 | Total read till now 3097678 | Total Files Read till now 1377
Saving a chunk size of 500000 | Total read till now 3615384 | Total Files Read till now 1413
Saving a chunk size of 500000 | Total read till now 4150566 | Total Files Read till now 1449
Saving a chunk size of 500000 | Total read till now 4653311 | Total Files Read till now 1482
Saving a chunk size of 500000 | Total read till now 5186465 | Total Files Read till now 1519
Saving a chunk size of 500000 | Total read till now 5695786 | Total Files Read till now 1547
Saving a chunk size of 500000 | Total read till now 6199690 | Total Files Read till now 1587
Saving a chunk size of 500000 | Total read till now 6725592 | Total Files Read till now 1596
Saving a chunk size of 500000 | Total read till now 7238643 | Total Files Read till now 1628
Saving a chunk size of 500000 | Total read till now 7777195 | Total Files Read till now 1665
Saving a chunk size of 500000 | Total read till now 8290816 | Total Files Read till now 1671
Saving a chunk size of 500000 | Total read till now 8815856 | Total Files Read till now 1691
Saving a chunk size of 500000 | Total read till now 9358338 | Total Files Read till now 1732
Saving a chunk size of 500000 | Total read till now 9896930 | Total Files Read till now 1741
Saving a chunk size of 500000 | Total read till now 10445724 | Total Files Read till now 1747
Saving a chunk size of 500000 | Total read till now 10968248 | Total Files Read till now 1768
Saving a chunk size of 500000 | Total read till now 11542772 | Total Files Read till now 1808
Saving a chunk size of 500000 | Total read till now 12045458 | Total Files Read till now 1816
Saving a chunk size of 500000 | Total read till now 12546107 | Total Files Read till now 1825
Saving a chunk size of 500000 | Total read till now 13055647 | Total Files Read till now 1841
Saving a chunk size of 500000 | Total read till now 13555649 | Total Files Read till now 1879
Saving a chunk size of 500000 | Total read till now 14106175 | Total Files Read till now 1887
Saving a chunk size of 500000 | Total read till now 14609724 | Total Files Read till now 1893
Saving a chunk size of 500000 | Total read till now 15128993 | Total Files Read till now 1911
Saving a chunk size of 500000 | Total read till now 15673851 | Total Files Read till now 1952
Saving a chunk size of 500000 | Total read till now 16184755 | Total Files Read till now 1958
Saving a chunk size of 500000 | Total read till now 16744195 | Total Files Read till now 1963
Saving a chunk size of 500000 | Total read till now 17282714 | Total Files Read till now 1981
Saving a chunk size of 500000 | Total read till now 17843119 | Total Files Read till now 2030
Saving a chunk size of 500000 | Total read till now 18351116 | Total Files Read till now 2046
Saving a chunk size of 500000 | Total read till now 18957137 | Total Files Read till now 2101
Saving a chunk size of 500000 | Total read till now 19461924 | Total Files Read till now 2114
Saving a chunk size of 500000 | Total read till now 19965520 | Total Files Read till now 2125
Saving a chunk size of 500000 | Total read till now 20492934 | Total Files Read till now 2186
Saving a chunk size of 500000 | Total read till now 21015279 | Total Files Read till now 2196
Saving a chunk size of 500000 | Total read till now 21553901 | Total Files Read till now 2254
Saving a chunk size of 500000 | Total read till now 22090818 | Total Files Read till now 2267
Saving a chunk size of 500000 | Total read till now 22612038 | Total Files Read till now 2326
Saving a chunk size of 500000 | Total read till now 23136424 | Total Files Read till now 2339
Saving a chunk size of 500000 | Total read till now 23651544 | Total Files Read till now 2402
Saving a chunk size of 500000 | Total read till now 24205407 | Total Files Read till now 2464
Saving a chunk size of 500000 | Total read till now 24709573 | Total Files Read till now 2477
Saving a chunk size of 500000 | Total read till now 25245198 | Total Files Read till now 2537
Saving a chunk size of 500000 | Total read till now 25810643 | Total Files Read till now 2553
Saving a chunk size of 500000 | Total read till now 26336732 | Total Files Read till now 2621
Saving a chunk size of 500000 | Total read till now 26852248 | Total Files Read till now 2696
Saving a chunk size of 500000 | Total read till now 27378586 | Total Files Read till now 2835
Saving a chunk size of 500000 | Total read till now 27886975 | Total Files Read till now 2912
Saving a chunk size of 500000 | Total read till now 28421895 | Total Files Read till now 3050
Saving a chunk size of 500000 | Total read till now 28974617 | Total Files Read till now 3122
Saving a chunk size of 500000 | Total read till now 29594164 | Total Files Read till now 3193
Saving a chunk size of 500000 | Total read till now 30111313 | Total Files Read till now 3202
Saving a chunk size of 500000 | Total read till now 30657878 | Total Files Read till now 3345
Saving a chunk size of 500000 | Total read till now 31190908 | Total Files Read till now 3700
Saving a chunk size of 500000 | Total read till now 31613796 | Total Files Read till now 4896
All done Total saved records 31613796
In [ ]:
 
In [ ]: