# Import Libraries
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, Polygon
import lxml
import os
import glob
import time
import datetime
import json
import itertools
# Set Output Folder
output_folder = os.path.abspath("output")
if not os.path.exists(output_folder):
os.makedirs(output_folder)
# Import data folders
data_folder = os.path.abspath("data")
# Helping Functions
def remove_consecutive_duplicates(x):
return ''.join(i for i, _ in itertools.groupby(x))
# States of India
states_file = os.path.join(data_folder, 'allStateofIndia2018_07_15_05_49_44_241.csv')
states_df = pd.read_csv(states_file, delimiter=';')
states_df = states_df[['State Name(In English)', 'Census 2011 Code']]
states_df.columns = ['State Name', 'state code 2011']
states_df.loc[states_df['State Name'] == 'TELANGANA', 'state code 2011'] = 28 #Telangana Fix
states_df['State Name'] = states_df['State Name'].str.upper()
states_df
# Read physical progress Data
physical_progress_folder = os.path.join(data_folder, "Physical Progress Reports", "output")
physical_progress_files = os.listdir(physical_progress_folder)
physical_progress_files = [os.path.join(physical_progress_folder, x) for x in physical_progress_files if x.split('.')[-1] == 'csv'][:-1]
df_list = []
for file in physical_progress_files:
df = pd.read_csv(file, usecols=[0, 7, 8, 9, 10, 11])
df.columns = ['Panchayat Name', 'Total Houses completed', 'Financial Year', 'Scheme', 'State Name', 'District Name']
df_list.append(df)
print(file)
physical_df = pd.concat(df_list).reset_index()
physical_df = physical_df[physical_df['Panchayat Name'].notnull()]
physical_df['Panchayat Name'] = physical_df['Panchayat Name'].str.upper()
physical_df['Panchayat Name'] = physical_df['Panchayat Name'].str.replace('PANCHAYAT', '').str.replace('GP', '')
physical_df['District Name'] = physical_df['District Name'].str.upper()
physical_df['State Name'] = physical_df['State Name'].str.upper()
physical_df = physical_df[['Panchayat Name','State Name', 'District Name']]
physical_df
# Read All Central Scheme Financial progress Data
centr_folder = os.path.join(data_folder, "Financial Progress Reports", "All Central Scheme-done", "output")
centr_files = os.listdir(centr_folder)
centr_files = [os.path.join(centr_folder, x) for x in centr_files if x.split('.')[-1] == 'csv'][:-1]
df_list = []
for file in centr_files:
df = pd.read_csv(file)
df_list.append(df)
print(file)
centr_df = pd.concat(df_list).reset_index()
centr_df['Scheme'] = 'All Central Scheme'
centr_df['Panchayat Name'] = centr_df['A']
centr_df = centr_df[centr_df['Panchayat Name'].notnull()]
centr_df['Panchayat Name'] = centr_df['Panchayat Name'].str.upper()
centr_df['Panchayat Name'] = centr_df['Panchayat Name'].str.replace('PANCHAYAT', '').str.replace('GP', '')
centr_df['District Name'] = centr_df['District Name'].str.upper()
centr_df['State Name'] = centr_df['State Name'].str.upper()
centr_df = centr_df[['Panchayat Name','State Name', 'District Name']]
centr_df
# Read All IAY New Construction Data
iay_folder = os.path.join(data_folder, "Financial Progress Reports", "IAY New Construction -done", "output")
iay_files = os.listdir(iay_folder)
iay_files = [os.path.join(iay_folder, x) for x in iay_files if x.split('.')[-1] == 'csv'][:-1]
df_list = []
for file in iay_files:
df = pd.read_csv(file)
df_list.append(df)
print(file)
iay_df = pd.concat(df_list).reset_index()
iay_df['Scheme'] = 'IAY New Construction'
iay_df = iay_df[iay_df.columns[1:-1]]
iay_df['Panchayat Name'] = iay_df['A']
iay_df = iay_df[iay_df.columns[1:]]
iay_df = iay_df[iay_df['Panchayat Name'].notnull()]
iay_df['Panchayat Name'] = iay_df['Panchayat Name'].str.upper()
iay_df['Panchayat Name'] = iay_df['Panchayat Name'].str.replace('PANCHAYAT', '').str.replace('GP', '')
iay_df['District Name'] = iay_df['District Name'].str.upper()
iay_df['State Name'] = iay_df['State Name'].str.upper()
iay_df = iay_df[['Panchayat Name','State Name', 'District Name']]
iay_df
# Read All PMAY New Construction Data
pmay_folder = os.path.join(data_folder, "Financial Progress Reports", "Pradhan Mantri Awaas Yojana -done")
pmay_files = os.listdir(pmay_folder)
pmay_files = [os.path.join(pmay_folder, x) for x in pmay_files if x.split('.')[-1] == 'csv'][:-1]
df_list = []
for file in pmay_files:
df = pd.read_csv(file)
df_list.append(df)
print(file)
pmay_df = pd.concat(df_list).reset_index()
pmay_df['Scheme'] = 'Pradhan Mantri Awaas Yojana'
pmay_df = pmay_df[pmay_df.columns[1:-1]]
pmay_df['Panchayat Name'] = pmay_df['A']
pmay_df = pmay_df[pmay_df.columns[1:]]
pmay_df = pmay_df[pmay_df['Panchayat Name'].notnull()]
pmay_df['Panchayat Name'] = pmay_df['Panchayat Name'].str.upper()
pmay_df['Panchayat Name'] = pmay_df['Panchayat Name'].str.replace('PANCHAYAT', '').str.replace('GP', '')
pmay_df['District Name'] = pmay_df['District Name'].str.upper()
pmay_df['State Name'] = pmay_df['State Name'].str.upper()
pmay_df = pmay_df[['Panchayat Name','State Name', 'District Name']]
pmay_df
togeocode_df = pd.concat([physical_df, centr_df, iay_df, pmay_df])
togeocode_df = togeocode_df.drop_duplicates().reset_index()
togeocode_df
# Read Scrapped Data
match_folder = os.path.join(data_folder, "VILLAGE_ASSEMBLY_PC_match")
file_path = os.path.join(match_folder, "merge.xlsx")
match_df = pd.read_excel(file_path)
match_df = match_df[['Village Name', 'Localbody Name', 'District Name', 'State Name', 'Assembly Constituency ECI Code']]
match_df = match_df.drop_duplicates().reset_index()
match_df = pd.merge(match_df, states_df, how='inner', on=['State Name']).drop_duplicates().reset_index()
match_df = match_df[match_df.columns[2:]]
match_df
# Join by the column and try to filter non geocodes
match_df['district_y'] = match_df['District Name'].str.upper().str.replace('([^A-Za-z]+)', '').apply(remove_consecutive_duplicates)
match_df['village_y'] = match_df['Localbody Name'].str.upper().str.replace('([^A-Za-z]+)', '').apply(remove_consecutive_duplicates)
togeocode_df['district_y'] = togeocode_df['District Name'].str.upper().str.replace('([^A-Za-z]+)', '').apply(remove_consecutive_duplicates)
togeocode_df['village_y'] = togeocode_df['Panchayat Name'].str.upper().str.replace('([^A-Za-z]+)', '').apply(remove_consecutive_duplicates)
geocoded_df = pd.merge(togeocode_df, match_df, how='left', left_on=['village_y', 'district_y'], right_on = ['village_y', 'district_y'])
nongeocoded_df = geocoded_df[geocoded_df['state code 2011'].isnull()]
nongeocoded_df = nongeocoded_df[nongeocoded_df.columns[1:6]]
nongeocoded_df = nongeocoded_df.drop_duplicates().reset_index()
geocoded_df = geocoded_df[geocoded_df['state code 2011'].notnull()][['Panchayat Name', 'District Name_x', 'State Name_x', 'Assembly Constituency ECI Code', 'state code 2011']]
geocoded_df.columns = ['panchayat', 'district', 'state', 'ac_code', 'state_code']
geocoded_df = geocoded_df.drop_duplicates().reset_index()
match_df['village_y'] = match_df['Village Name'].str.upper().str.replace('([^A-Za-z]+)', '').apply(remove_consecutive_duplicates)
geocoded_df1 = pd.merge(nongeocoded_df, match_df, how='left', left_on=['village_y', 'district_y'], right_on = ['village_y', 'district_y'])
nongeocoded_df = geocoded_df1[geocoded_df1['state code 2011'].isnull()]
nongeocoded_df = nongeocoded_df[nongeocoded_df.columns[1:6]]
nongeocoded_df = nongeocoded_df.drop_duplicates().reset_index()
nongeocoded_df = nongeocoded_df[nongeocoded_df.columns[1:4]]
nongeocoded_df.columns = ['panchayat', 'state', 'district']
geocoded_df1 = geocoded_df1[geocoded_df1['state code 2011'].notnull()][['Panchayat Name', 'District Name_x', 'State Name_x', 'Assembly Constituency ECI Code', 'state code 2011']]
geocoded_df1.columns = ['panchayat', 'district', 'state', 'ac_code', 'state_code']
geocoded_df = pd.concat([geocoded_df, geocoded_df1]).drop_duplicates().reset_index()
geocoded_df = geocoded_df[['panchayat', 'district', 'state', 'ac_code', 'state_code']]
geocoded_df
nongeocoded_df
#Save to CSV
geocoded_df.to_csv(os.path.join(output_folder, 'match_geocoded.csv'), encoding='utf-8', index=False)
nongeocoded_df.to_csv(os.path.join(output_folder, 'nongeocoded.csv'), encoding='utf-8', index=False)