#Declare Columns
# Declared attribute column names, in order.
# NOTE: the per-file parsing loop further down rebinds `columns` to the names
# it finds in each file's first item, so this list only holds until then.
columns = [
    # identifiers
    'sid', 'stage', 'asset_id', 'category_id', 'subcategory_id',
    # location codes
    'state_code', 'district_code', 'block_code', 'panchayat_code',
    # financial years
    'work_financial_year', 'completion_financial_year',
    # remaining fields
    'uuid', 'ser_time', 'status', 'validated_timestamp',
]
#Read Files
# Flush threshold: buffered rows are written out once they exceed this count.
chunksize = 500000
# Rows written to disk so far / rows accounted for including the current chunk.
total_saved = total_read = 0
# Rows currently buffered in df_list and not yet written.
records = 0
df_list = []
files_folder = os.path.join(data_folder, 'Scraped Files')
# Full paths of the folder entries whose extension is exactly '.xml'.
# os.path.splitext is used so that an entry named just 'xml' or '.xml' (no
# real extension) is not picked up, and the list is sorted because
# os.listdir returns entries in arbitrary order -- sorting makes the chunk
# contents and output file names reproducible between runs.
files = sorted(
    os.path.join(files_folder, name)
    for name in os.listdir(files_folder)
    if os.path.splitext(name)[1] == '.xml'
)
# Number of files processed so far.
file_count = 0
# Parse each scraped XML file into a table of attributes plus coordinates,
# buffer the tables in df_list, and flush the buffer to a CSV/shapefile pair
# once more than `chunksize` rows are pending or the last file has been read.
for file_count, file in enumerate(files, start=1):
    # Entries of `files` are already joined with files_folder, so they are
    # opened as-is; joining a second time duplicates a relative folder prefix.
    # NOTE(review): no encoding is given, so the platform default applies --
    # confirm the scraped files are written in that encoding.
    with open(file, 'r') as myfile:
        data = myfile.read().replace('\n', '')
    #Get Column Names
    # One row per <item> element; the text before the first <item> is dropped.
    items = pd.DataFrame(data.split('<item>')[1:])
    if items.shape[0] > 0:
        # Attribute names are read from the first item only and then looked
        # up in every item of the file.
        columns = items[0].values[0].split('<span class="atr-name">')[1:]
        columns = [i.split('</span>')[0] for i in columns]
        #Get All Attributes
        attributes = None
        for column in columns:
            # Text of the first atr-value span that follows this atr-name span.
            df = (
                items[0]
                .str.split('<span class="atr-name">' + column + '</span>').str[1]
                .str.split('<span class="atr-value">').str[1]
                .str.split('</span>').str[0]
            )
            if attributes is None:
                attributes = pd.DataFrame(df)
                attributes.columns = [column]
            else:
                attributes[column] = df
            df = None
        #Get Geometry
        # Keep only the items that carry a <georss:point>; the filtered Series
        # retains the original item index.
        points = items[0].str.split('<georss:point>').str[1]
        points = points[points.notnull()]
        # A file in which no item has a point contributes no rows (and would
        # otherwise fail when the two column names are assigned below).
        if not points.empty:
            coords = points.str.split('</georss:point>').str[0].str.split()
            # Reuse the item index so the inner join below pairs every
            # coordinate with the attributes of the same item. A fresh 0..n-1
            # index would shift rows as soon as one item has no point.
            geometry = pd.DataFrame(coords.tolist(), index=coords.index)
            geometry = geometry.apply(pd.to_numeric)
            geometry.columns = ['latitude', 'longitude']
            #Make Combined files
            df = pd.concat([attributes, geometry], axis=1, join='inner')
            df_list.append(df)
            records = records + df.shape[0]
    #When files size increases Create merged File and dump it to drive
    # `records > 0` keeps the final-file flush from concatenating an empty
    # buffer when nothing is pending.
    if records > 0 and (records > chunksize or file_count == len(files)):
        df = pd.concat(df_list)
        df = df[(df['longitude'].notnull()) & (df['latitude'].notnull())]
        df_list = []
        records = 0
        # Rows actually written in this chunk (after dropping null coordinates).
        _chunksize = df.shape[0]
        total_read = total_saved + _chunksize
        print('Saving a chunk size of ' + str(_chunksize) + ' | Total read till now ' + str(total_read) + ' | Total Files Read till now ' + str(file_count))
        # Output files are named "<first row>-<last row>" of the running total.
        out_filename = str(total_saved) + '-' + str(total_read) + '.csv'
        out_filepath = os.path.join(output_folder, out_filename)
        df.to_csv(out_filepath, encoding='utf-8', index=False)
        #Change dataframe to geospatial and check by exporting to shp
        # Point takes (x, y), i.e. (longitude, latitude).
        geometry = [Point(xy) for xy in zip(df.longitude, df.latitude)]
        crs = {'init': 'epsg:4326'}
        geo_df = gpd.GeoDataFrame(df, crs=crs, geometry=geometry)
        # splitext strips only the trailing '.csv'; splitting on the first '.'
        # would truncate paths such as './out/0-10.csv' or 'v1.2/0-10.csv'.
        geo_df.to_file(driver='ESRI Shapefile', filename=os.path.splitext(out_filepath)[0] + '.shp')
        total_saved = total_read
        # Drop the references so the chunk can be freed before the next one.
        df = None
        geo_df = None
#All done
print('All done Total saved records ' + str(total_saved))