#Import Libraries
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, Polygon
import json
import requests
import math
import os
import time
# Import folders
data_folder = os.path.abspath('data')
d2014 = os.path.join(data_folder, 'Data User_2014_15')
d2015 = os.path.join(data_folder, 'Data User_2015_16')
d2016 = os.path.join(data_folder, 'Data User_2016_17')
folders = [d2014, d2015, d2016]
# Set Output Folder
output_folder = os.path.abspath("output")
if not os.path.exists(output_folder):
os.makedirs(output_folder)
# Read Data for all year function
def read_all_year(file):
df = None
df_list = []
for folder in folders:
df1 = pd.read_csv(os.path.join(folder, file))
df_list.append(df1)
df = pd.concat(df_list)
return df
# Read Data
df = read_all_year('university.csv')
geocoded_df = df[ ( ( df['latitude'] > 0) & ( df['longitude'] > 0) ) ].drop_duplicates(['id'], keep='last').reset_index(drop=True)
df = df[ ( np.invert( df['latitude'] > 0) & np.invert( df['longitude'] > 0) ) ].drop_duplicates(['id'], keep='last').reset_index(drop=True)
# Read Pincode Data
df_list = []
downloaded_folder = os.path.join(data_folder, 'Downloaded')
post_folder = os.path.join(downloaded_folder, 'Post')
headpo_folder = os.path.join(post_folder, 'HeadPO')
headpo_df = gpd.read_file(os.path.join(headpo_folder, 'HeadPO.shp'))
headpo_df = headpo_df[['pincode','LATITUDE', 'LONGITUDE']]
headpo_df.columns = ['pincode','lat', 'long']
df_list.append(headpo_df)
subpo_folder = os.path.join(post_folder, 'SubPO')
subpo_df = gpd.read_file(os.path.join(subpo_folder, 'SubPO.shp'))
subpo_df = subpo_df[['pincode','LATITUDE', 'LONGITUDE']]
subpo_df.columns = ['pincode','lat', 'long']
df_list.append(subpo_df)
general_folder = os.path.join(post_folder, 'GeneralPO')
general_df = gpd.read_file(os.path.join(general_folder, 'GeneralPO.shp'))
general_df = general_df[['pincode','LATITUDE', 'LONGITUDE']]
general_df.columns = ['pincode','lat', 'long']
df_list.append(general_df)
branch_folder = os.path.join(post_folder, 'BranchPO')
branch_df = gpd.read_file(os.path.join(branch_folder, 'BranchPO.shp'))
branch_df['LATITUDE'] = branch_df.geometry.y
branch_df['LONGITUDE'] = branch_df.geometry.x
branch_df = branch_df[['PINCODE','LATITUDE', 'LONGITUDE']]
branch_df.columns = ['pincode','lat', 'long']
df_list.append(branch_df)
post_df = pd.concat(df_list)
# Join pincode
df = pd.merge(df, post_df, how = 'left', left_on = 'pin_code', right_on = 'pincode')
df['latitude'] = df['lat']
df['longitude'] = df['long']
df = df.drop(columns=['pincode', 'lat', 'long'])
geocoded1_df = df[ ( ( df['latitude'] > 0) & ( df['longitude'] > 0) ) ].drop_duplicates(['id'], keep='last').reset_index(drop=True)
geocoded_df = pd.concat([geocoded_df, geocoded1_df])
df = df[ ( np.invert( df['latitude'] > 0) & np.invert( df['longitude'] > 0) ) ].drop_duplicates(['id'], keep='last').reset_index(drop=True)
# to csv
to_geocode = os.path.join(output_folder, "university_non_geocoded.csv")
df.to_csv(to_geocode, encoding='utf-8', index=False)
# Get Geocoded universities
df = pd.read_csv(to_geocode)
df = pd.concat([geocoded_df, df])
df = df[['id', 'latitude', 'longitude']]
df.to_csv(os.path.join(output_folder, "university_geocoded.csv"), encoding='utf-8', index=False)
df
#Change dataframe to geospatial and check by exporting to shp
geometry = [Point(xy) for xy in zip(df.longitude, df.latitude)]
crs = {'init': 'epsg:4326'}
geo_df = gpd.GeoDataFrame(df, crs=crs, geometry=geometry)
out_file = os.path.join(output_folder, "university_geocoded.shp")
geo_df.to_file(driver='ESRI Shapefile', filename=out_file)
# Standalore Universities
df = read_all_year('standalone_institution.csv')
geocoded_df = df[ ( ( df['latitude'] > 0) & ( df['longitude'] > 0) ) ].drop_duplicates(['id'], keep='last').reset_index(drop=True)
df = df[ ( np.invert( df['latitude'] > 0) & np.invert( df['longitude'] > 0) ) ].drop_duplicates(['id'], keep='last').reset_index(drop=True)
# Standalore Universities
# to csv
to_geocode = os.path.join(output_folder, "standalone_non_geocoded.csv")
#df.to_csv(to_geocode, encoding='utf-8', index=False)
# Standalore Universities
df = pd.read_csv(to_geocode)
df = df[ ( ( df['latitude'] > 0) & ( df['longitude'] > 0) ) ]
df = df.drop_duplicates(['id'], keep='first').reset_index(drop=True)
df = pd.concat([geocoded_df, df])
df = df[['id', 'latitude', 'longitude']]
df.to_csv(os.path.join(output_folder, "standalone_geocoded.csv"), encoding='utf-8', index=False)
df
#Change dataframe to geospatial and check by exporting to shp
geometry = [Point(xy) for xy in zip(df.longitude, df.latitude)]
crs = {'init': 'epsg:4326'}
geo_df = gpd.GeoDataFrame(df, crs=crs, geometry=geometry)
out_file = os.path.join(output_folder, "standalone_geocoded.shp")
geo_df.to_file(driver='ESRI Shapefile', filename=out_file)