#Import Libraries
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, Polygon
import json
import requests
import math
import os
import time
# Locate the per-year 'Data User_*' folders that live under ./data
data_folder = os.path.abspath('data')
# Unpack one absolute path per yearly folder, in chronological order
(d2011, d2012, d2013, d2014, d2015, d2016, d2017) = (
    os.path.join(data_folder, year_dir)
    for year_dir in (
        'Data User_2011_12',
        'Data User_2012_13',
        'Data User_2013_14',
        'Data User_2014_15',
        'Data User_2015_16',
        'Data User_2016_17',
        'Data User_2017_18',
    )
)
# Ordered list iterated by the yearly readers
folders = [d2011, d2012, d2013, d2014, d2015, d2016, d2017]
# Set Output Folder
output_folder = os.path.abspath("output")
# exist_ok=True replaces the exists()/makedirs() pair: no race between the
# check and the creation, and re-running the script is a no-op.
os.makedirs(output_folder, exist_ok=True)
# Read Data for all year function
def read_all_year(file, source_folders=None):
    """Load ``file`` from every yearly folder and stack the rows.

    Args:
        file: Name of the CSV file expected inside each folder.
        source_folders: Folders to read from, in order. When omitted, the
            module-level ``folders`` list is used.

    Returns:
        A DataFrame holding the rows of each per-folder CSV, concatenated in
        folder order. Row labels are NOT renumbered: every file keeps its own
        0..n-1 index, so labels repeat across folders.

    Raises:
        FileNotFoundError: if ``file`` is missing from one of the folders.
        ValueError: if there is nothing to concatenate (empty folder list).
    """
    if source_folders is None:
        source_folders = folders
    yearly_frames = [
        pd.read_csv(os.path.join(folder, file)) for folder in source_folders
    ]
    # Deliberately no ignore_index: callers may rely on the per-file labels.
    return pd.concat(yearly_frames)
# Read College Data
# Stack college.csv from every year and keep the most recent row per college id.
df = read_all_year('college.csv')
df = df.drop_duplicates(['id'], keep='last')
df1 = read_all_year('college_institution.csv')
# NOTE(review): DataFrame.join(on='id') matches df['id'] against df1's row
# index (the per-file row numbers kept by read_all_year), not against a column
# of df1 -- confirm this is the intended key.
df = df.join(df1, on='id', how='left', lsuffix='_x', rsuffix='_y', sort=False).drop_duplicates(['id'], keep='last').reset_index(drop=True)
# Restore the college table's own id from its suffixed copy.
df['id'] = df['id_x']
df = df.drop(columns=['id_x'])
# A row counts as geocoded only when BOTH coordinates are positive
# (a NaN coordinate compares False, so missing values count as not geocoded).
has_coords = (df['latitude'] > 0) & (df['longitude'] > 0)
geocoded_df = df[has_coords].drop_duplicates(['id'], keep='last').reset_index(drop=True)
# The remainder is the exact complement of the geocoded set. The previous mask
# (~lat_ok & ~lon_ok) silently dropped rows that had only one valid coordinate.
df = df[~has_coords].drop_duplicates(['id'], keep='last').reset_index(drop=True)
# Non-numeric pin codes become NaN so the column can be used as a numeric merge key.
df['pin_code'] = pd.to_numeric(df['pin_code'], errors='coerce')
# Read Pincode Data
# Folder layout: data/Downloaded/Post/<layer>/<layer>.shp
downloaded_folder = os.path.join(data_folder, 'Downloaded')
post_folder = os.path.join(downloaded_folder, 'Post')
headpo_folder = os.path.join(post_folder, 'HeadPO')
subpo_folder = os.path.join(post_folder, 'SubPO')
general_folder = os.path.join(post_folder, 'GeneralPO')
branch_folder = os.path.join(post_folder, 'BranchPO')


def _as_pincode_table(layer, pincode_field):
    """Keep the pincode and LATITUDE/LONGITUDE columns of ``layer``, renamed
    to ``pincode``, ``lat`` and ``long``."""
    table = layer[[pincode_field, 'LATITUDE', 'LONGITUDE']]
    table.columns = ['pincode', 'lat', 'long']
    return table


# These three layers are reduced straight from their attribute columns.
headpo_df = _as_pincode_table(
    gpd.read_file(os.path.join(headpo_folder, 'HeadPO.shp')), 'pincode')
subpo_df = _as_pincode_table(
    gpd.read_file(os.path.join(subpo_folder, 'SubPO.shp')), 'pincode')
general_df = _as_pincode_table(
    gpd.read_file(os.path.join(general_folder, 'GeneralPO.shp')), 'pincode')

# BranchPO differs: LATITUDE/LONGITUDE are filled from the point geometry and
# the pincode field is spelled in upper case.
branch_df = gpd.read_file(os.path.join(branch_folder, 'BranchPO.shp'))
branch_df['LATITUDE'] = branch_df.geometry.y
branch_df['LONGITUDE'] = branch_df.geometry.x
branch_df = _as_pincode_table(branch_df, 'PINCODE')

# Stack all four layers into one pincode -> (lat, long) lookup table.
df_list = [headpo_df, subpo_df, general_df, branch_df]
post_df = pd.concat(df_list)
# Join pincode
# Left-merge so every still-ungeocoded college is kept; a pin code that matches
# several post-office rows yields several rows for the same college id.
df = pd.merge(df, post_df, how='left', left_on='pin_code', right_on='pincode')
# Adopt the matched post-office coordinates as the college coordinates.
df['latitude'] = df['lat']
df['longitude'] = df['long']
df = df.drop(columns=['pincode', 'lat', 'long'])
# Geocoded means BOTH coordinates are positive (NaN compares False).
has_coords = (df['latitude'] > 0) & (df['longitude'] > 0)
geocoded1_df = df[has_coords].drop_duplicates(['id'], keep='last').reset_index(drop=True)
geocoded_df = pd.concat([geocoded_df, geocoded1_df])
# Remainder = colleges with no usable match at all. Selecting by id (instead of
# the old ~lat_ok & ~lon_ok row mask) keeps rows with a single valid coordinate
# and excludes colleges already geocoded through another matching row.
df = df[~df['id'].isin(geocoded1_df['id'])].drop_duplicates(['id'], keep='last').reset_index(drop=True)
# to csv
to_geocode = os.path.join(output_folder, "Colleges_non_geocoded.csv")
# Export of the rows that still lack coordinates. Presumably left disabled so a
# re-run does not overwrite the file that is read back below -- TODO confirm.
#df.to_csv(to_geocode, encoding='utf-8', index=False)
# Read the file back; it is expected to carry '_latitude' / '_longitude'
# columns (presumably added by an external geocoding step -- TODO confirm).
df = pd.read_csv(to_geocode, encoding="ISO-8859-1")
df['latitude'] = df['_latitude']
df['longitude'] = df['_longitude']
# Keep only rows where both coordinates are positive, one row per college id.
df = df[(df['latitude'] > 0) & (df['longitude'] > 0)]
df = df.drop_duplicates(['id'], keep='first').reset_index(drop=True)
# ignore_index gives the combined table unique row labels; the pieces being
# stacked each start at 0, so labels would otherwise repeat.
df = pd.concat([geocoded_df, df], ignore_index=True)
df = df[['id', 'latitude', 'longitude']]
df.to_csv(os.path.join(output_folder, "Colleges_geocoded.csv"), encoding='utf-8', index=False)
# Change dataframe to geospatial and check by exporting to shp
# Shapely points take (x, y), i.e. (longitude, latitude).
geometry = [Point(lon, lat) for lon, lat in zip(df.longitude, df.latitude)]
# Authority string instead of the deprecated init-style {'init': 'epsg:4326'}
# dict, which emits a FutureWarning on current geopandas/pyproj.
crs = 'EPSG:4326'
geo_df = gpd.GeoDataFrame(df, crs=crs, geometry=geometry)
out_file = os.path.join(output_folder, "Colleges_geocoded.shp")
geo_df.to_file(driver='ESRI Shapefile', filename=out_file)