In [179]:
#Import Libraries
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, Polygon
import json
import requests
import math
import os
import time
In [174]:
# Import folders
data_folder = os.path.abspath('data')
d2014 = os.path.join(data_folder, 'Data User_2014_15')
d2015 = os.path.join(data_folder, 'Data User_2015_16')
d2016 = os.path.join(data_folder, 'Data User_2016_17')
folders = [d2014, d2015, d2016]
# Set Output Folder
output_folder = os.path.abspath("output")
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
In [167]:
# Read Data for all year function
def read_all_year(file):
    df = None
    df_list = []
    for folder in folders:
        df1 = pd.read_csv(os.path.join(folder, file))
        df_list.append(df1)
    df = pd.concat(df_list)
    return df
In [168]:
# Read College Data
df = read_all_year('college.csv')
df = df.drop_duplicates(['id'], keep='last')
df1 = read_all_year('college_institution.csv')
#df1 = df1[( (df1['latitude'] > 0) & (df1['longitude'] > 0) ) ]
#df1 = df1.drop_duplicates(['id'], keep='last')
#df2 = df2.drop_duplicates(['id'], keep='last')
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:2850: DtypeWarning: Columns (44) have mixed types. Specify dtype option on import or set low_memory=False.
  if self.run_code(code, result):
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:2850: DtypeWarning: Columns (37,38,39,40,44,45,49) have mixed types. Specify dtype option on import or set low_memory=False.
  if self.run_code(code, result):
In [169]:
df = df.join(df1, on='id', how='left', lsuffix='_x', rsuffix='_y', sort=False).drop_duplicates(['id'], keep='last').reset_index(drop=True)
df['id'] = df['id_x']
df = df.drop(columns=['id_x'])
geocoded_df = df[ ( ( df['latitude']  > 0) & ( df['longitude'] > 0) ) ].drop_duplicates(['id'], keep='last').reset_index(drop=True)
df = df[ ( np.invert( df['latitude']  > 0) & np.invert( df['longitude'] > 0) ) ].drop_duplicates(['id'], keep='last').reset_index(drop=True)
In [170]:
# Read Pincode Data
df_list = []
downloaded_folder = os.path.join(data_folder, 'Downloaded')
post_folder = os.path.join(downloaded_folder, 'Post')
headpo_folder = os.path.join(post_folder, 'HeadPO')
headpo_df = gpd.read_file(os.path.join(headpo_folder, 'HeadPO.shp'))
headpo_df = headpo_df[['pincode','LATITUDE', 'LONGITUDE']]
headpo_df.columns = ['pincode','lat', 'long']
df_list.append(headpo_df)
subpo_folder = os.path.join(post_folder, 'SubPO')
subpo_df = gpd.read_file(os.path.join(subpo_folder, 'SubPO.shp'))
subpo_df = subpo_df[['pincode','LATITUDE', 'LONGITUDE']]
subpo_df.columns = ['pincode','lat', 'long']
df_list.append(subpo_df)
general_folder = os.path.join(post_folder, 'GeneralPO')
general_df = gpd.read_file(os.path.join(general_folder, 'GeneralPO.shp'))
general_df = general_df[['pincode','LATITUDE', 'LONGITUDE']]
general_df.columns = ['pincode','lat', 'long']
df_list.append(general_df)
branch_folder = os.path.join(post_folder, 'BranchPO')
branch_df = gpd.read_file(os.path.join(branch_folder, 'BranchPO.shp'))
branch_df['LATITUDE'] = branch_df.geometry.y
branch_df['LONGITUDE'] = branch_df.geometry.x
branch_df = branch_df[['PINCODE','LATITUDE', 'LONGITUDE']]
branch_df.columns = ['pincode','lat', 'long']
df_list.append(branch_df)
post_df = pd.concat(df_list)
In [171]:
# Join pincode
df = pd.merge(df, post_df, how = 'left', left_on = 'pin_code', right_on = 'pincode')
df['latitude'] = df['lat']
df['longitude'] = df['long']
df = df.drop(columns=['pincode', 'lat', 'long'])
geocoded1_df = df[ ( ( df['latitude']  > 0) & ( df['longitude'] > 0) ) ].drop_duplicates(['id'], keep='last').reset_index(drop=True)
geocoded_df = pd.concat([geocoded_df, geocoded1_df])
df = df[ ( np.invert( df['latitude']  > 0) & np.invert( df['longitude'] > 0) ) ].drop_duplicates(['id'], keep='last').reset_index(drop=True)
In [176]:
# to csv
to_geocode = os.path.join(output_folder, "Colleges_non_geocoded.csv")
#df.to_csv(to_geocode, encoding='utf-8', index=False)
In [177]:
df = pd.read_csv(to_geocode)
df = df[ ( ( df['latitude']  > 0) & ( df['longitude'] > 0) ) ]
df = df.drop_duplicates(['id'], keep='first').reset_index(drop=True)
df = pd.concat([geocoded_df, df])
df = df[['id', 'latitude', 'longitude']]
df.to_csv(os.path.join(output_folder, "Colleges_geocoded.csv"), encoding='utf-8', index=False)
df
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:2728: DtypeWarning: Columns (8,9,11,13,18,22,23,24,25,28,29,32,35,38,39,40,41,42,44,47,48,51,52,55) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
Out[177]:
id latitude longitude
0 24513 26.937001 93.817001
1 11043 16.402857 74.362316
2 8168 29.291724 74.568297
3 192 9.388276 78.744013
4 16746 10.520010 76.210010
5 63 21.620010 71.230010
6 58 22.456587 83.235465
7 169 13.039339 80.268372
8 202 19.100010 73.200010
9 368 22.123450 82.123450
10 434 11.000010 77.000030
11 477 20.020020 73.501230
12 601 24.180944 85.891744
13 764 27.416673 77.646385
14 869 11.016844 76.955832
15 23858 29.650801 75.824230
16 1060 8.810010 78.140010
17 6447 14.455743 78.806638
18 6621 16.250010 80.250010
19 6639 32.250868 75.670357
20 6161 12.907841 77.573178
21 6164 30.100001 75.170001
22 6227 15.475536 78.463716
23 6219 18.520310 73.856710
24 6273 11.802194 79.763349
25 6262 23.260001 68.260001
26 6309 13.603543 79.447861
27 7133 21.978390 82.648340
28 7159 23.256672 75.689980
29 7204 28.300951 77.718024
... ... ... ...
17254 57731 23.306308 77.360108
17255 57732 23.311401 77.356201
17256 57733 17.988738 79.603249
17257 57734 17.323562 78.557526
17258 57737 24.872649 74.639947
17259 57741 25.599311 81.641258
17260 57742 25.407596 83.219824
17261 57743 24.812179 82.344828
17262 57745 26.142461 83.638125
17263 57752 18.399836 76.560050
17264 57751 18.801019 75.159957
17265 57750 26.885897 80.851982
17266 57756 26.028693 80.590467
17267 57755 26.873150 75.816645
17268 57760 28.948700 78.675900
17269 57759 17.504923 78.487790
17270 57761 23.666164 74.025289
17271 57764 30.935599 76.407314
17272 57763 30.060003 78.997745
17273 57765 17.657600 80.896000
17274 57766 19.880460 75.324495
17275 57767 28.613422 77.364655
17276 57769 8.570479 76.889794
17277 57772 13.010446 77.614570
17278 57789 18.631887 73.847832
17279 57792 21.139607 78.797290
17280 57793 23.153671 79.795959
17281 57794 31.174760 76.018293
17282 57795 30.271411 78.081306
17283 57797 26.487067 80.349160

34179 rows × 3 columns

In [ ]:
 
In [180]:
#Change dataframe to geospatial and check by exporting to shp
geometry = [Point(xy) for xy in zip(df.longitude, df.latitude)]
crs = {'init': 'epsg:4326'}
geo_df = gpd.GeoDataFrame(df, crs=crs, geometry=geometry)
out_file = os.path.join(output_folder, "Colleges_geocoded.shp")
geo_df.to_file(driver='ESRI Shapefile', filename=out_file)
In [ ]: