In [45]:
#Import Libraries
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, Polygon
import json
import requests
import math
import os
import time
In [22]:
# Import folders
data_folder = os.path.abspath('data')
d2014 = os.path.join(data_folder, 'Data User_2014_15')
d2015 = os.path.join(data_folder, 'Data User_2015_16')
d2016 = os.path.join(data_folder, 'Data User_2016_17')
folders = [d2014, d2015, d2016]
# Set Output Folder
output_folder = os.path.abspath("output")
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
In [23]:
# Read Data for all year function
def read_all_year(file):
    df = None
    df_list = []
    for folder in folders:
        df1 = pd.read_csv(os.path.join(folder, file))
        df_list.append(df1)
    df = pd.concat(df_list)
    return df
In [24]:
# Read Data
df = read_all_year('university.csv')
geocoded_df = df[ ( ( df['latitude']  > 0) & ( df['longitude'] > 0) ) ].drop_duplicates(['id'], keep='last').reset_index(drop=True)
df = df[ ( np.invert( df['latitude']  > 0) & np.invert( df['longitude'] > 0) ) ].drop_duplicates(['id'], keep='last').reset_index(drop=True)
In [25]:
# Read Pincode Data
df_list = []
downloaded_folder = os.path.join(data_folder, 'Downloaded')
post_folder = os.path.join(downloaded_folder, 'Post')
headpo_folder = os.path.join(post_folder, 'HeadPO')
headpo_df = gpd.read_file(os.path.join(headpo_folder, 'HeadPO.shp'))
headpo_df = headpo_df[['pincode','LATITUDE', 'LONGITUDE']]
headpo_df.columns = ['pincode','lat', 'long']
df_list.append(headpo_df)
subpo_folder = os.path.join(post_folder, 'SubPO')
subpo_df = gpd.read_file(os.path.join(subpo_folder, 'SubPO.shp'))
subpo_df = subpo_df[['pincode','LATITUDE', 'LONGITUDE']]
subpo_df.columns = ['pincode','lat', 'long']
df_list.append(subpo_df)
general_folder = os.path.join(post_folder, 'GeneralPO')
general_df = gpd.read_file(os.path.join(general_folder, 'GeneralPO.shp'))
general_df = general_df[['pincode','LATITUDE', 'LONGITUDE']]
general_df.columns = ['pincode','lat', 'long']
df_list.append(general_df)
branch_folder = os.path.join(post_folder, 'BranchPO')
branch_df = gpd.read_file(os.path.join(branch_folder, 'BranchPO.shp'))
branch_df['LATITUDE'] = branch_df.geometry.y
branch_df['LONGITUDE'] = branch_df.geometry.x
branch_df = branch_df[['PINCODE','LATITUDE', 'LONGITUDE']]
branch_df.columns = ['pincode','lat', 'long']
df_list.append(branch_df)
post_df = pd.concat(df_list)
In [26]:
# Join pincode
df = pd.merge(df, post_df, how = 'left', left_on = 'pin_code', right_on = 'pincode')
df['latitude'] = df['lat']
df['longitude'] = df['long']
df = df.drop(columns=['pincode', 'lat', 'long'])
geocoded1_df = df[ ( ( df['latitude']  > 0) & ( df['longitude'] > 0) ) ].drop_duplicates(['id'], keep='last').reset_index(drop=True)
geocoded_df = pd.concat([geocoded_df, geocoded1_df])
df = df[ ( np.invert( df['latitude']  > 0) & np.invert( df['longitude'] > 0) ) ].drop_duplicates(['id'], keep='last').reset_index(drop=True)
In [28]:
# to csv
to_geocode = os.path.join(output_folder, "university_non_geocoded.csv")
df.to_csv(to_geocode, encoding='utf-8', index=False)
In [43]:
# Get Geocoded universities
df = pd.read_csv(to_geocode)
df = pd.concat([geocoded_df, df])
df = df[['id', 'latitude', 'longitude']]
df.to_csv(os.path.join(output_folder, "university_geocoded.csv"), encoding='utf-8', index=False)
df
Out[43]:
id latitude longitude
0 323 18.533300 73.866700
1 147 23.000000 72.000000
2 82 19.000000 82.000000
3 4 17.000000 73.000000
4 767 27.080001 93.400001
5 504 26.280010 80.210010
6 546 25.540001 81.850000
7 404 26.493410 75.273810
8 402 24.580010 73.680010
9 667 28.470000 77.030000
10 709 29.869745 73.942707
11 171 29.430001 77.200001
12 710 25.026110 85.413330
13 732 26.487764 75.203417
14 706 23.584795 72.954487
15 660 31.532210 75.917210
16 105 28.546010 77.273010
17 640 27.976436 76.387210
18 645 30.173450 77.382410
19 90 22.000050 82.000130
20 287 23.250010 68.250010
21 764 27.338930 88.606510
22 422 26.880010 75.810010
23 74 25.617010 85.167010
24 243 23.250001 77.694260
25 813 23.100010 77.250010
26 220 13.021860 77.567142
27 371 30.410106 74.440521
28 503 25.000010 83.000010
29 692 26.236150 72.021890
... ... ... ...
233 520 28.317794 79.443642
234 542 28.905528 78.065051
235 164 29.989671 76.781157
236 41 17.209830 77.602610
237 543 28.973720 77.641110
238 501 26.845110 80.941780
239 114 28.551220 77.208170
240 564 30.356049 77.921233
241 7 17.429970 78.417420
242 381 30.902390 75.814690
243 98 28.744110 77.145000
244 876 13.158816 77.572080
0 272 18.547456 73.807629
1 541 28.472720 77.482002
2 493 23.844716 91.428499
3 376 31.633374 74.827248
4 495 23.763673 91.260385
5 828 19.310522 84.827506
6 229 16.830626 75.640583
7 550 25.830749 82.681763
8 358 20.484786 85.789775
9 737 30.192774 78.164742
10 346 23.800644 92.727956
11 141 23.239165 72.638549
12 24 17.604163 78.546920
13 659 30.386539 76.324167
14 386 30.386539 76.324167
15 760 13.555559 80.026880
16 290 22.806404 81.749488
17 343 25.567454 91.899960

884 rows × 3 columns

In [47]:
#Change dataframe to geospatial and check by exporting to shp
geometry = [Point(xy) for xy in zip(df.longitude, df.latitude)]
crs = {'init': 'epsg:4326'}
geo_df = gpd.GeoDataFrame(df, crs=crs, geometry=geometry)
out_file = os.path.join(output_folder, "university_geocoded.shp")
geo_df.to_file(driver='ESRI Shapefile', filename=out_file)
In [58]:
# Standalore Universities
df = read_all_year('standalone_institution.csv')
geocoded_df = df[ ( ( df['latitude']  > 0) & ( df['longitude'] > 0) ) ].drop_duplicates(['id'], keep='last').reset_index(drop=True)
df = df[ ( np.invert( df['latitude']  > 0) & np.invert( df['longitude'] > 0) ) ].drop_duplicates(['id'], keep='last').reset_index(drop=True)
In [59]:
# Standalore Universities
# to csv
to_geocode = os.path.join(output_folder, "standalone_non_geocoded.csv")
#df.to_csv(to_geocode, encoding='utf-8', index=False)
In [60]:
# Standalore Universities
df = pd.read_csv(to_geocode)
df = df[ ( ( df['latitude']  > 0) & ( df['longitude'] > 0) ) ]
df = df.drop_duplicates(['id'], keep='first').reset_index(drop=True)
df = pd.concat([geocoded_df, df])
df = df[['id', 'latitude', 'longitude']]
df.to_csv(os.path.join(output_folder, "standalone_geocoded.csv"), encoding='utf-8', index=False)
df
Out[60]:
id latitude longitude
0 14518 28.587225 77.227095
1 6466 15.366900 75.128268
2 6464 15.366900 75.128268
3 3596 10.000000 72.000000
4 14622 14.235650 70.214589
5 989 29.676140 76.741250
6 12408 20.568950 75.356890
7 10863 18.493866 73.871494
8 389 28.461110 77.491410
9 4433 23.256720 75.256720
10 12616 13.790001 76.280001
11 6830 14.280010 76.200010
12 6751 12.923050 77.484651
13 1099 30.122310 77.855010
14 14753 28.250032 77.380007
15 4524 23.568990 75.256890
16 2080 20.000000 68.000000
17 2451 29.961655 75.123310
18 2435 31.000000 74.000000
19 6699 16.226710 76.236720
20 6559 16.000000 75.000000
21 4457 10.782788 76.008996
22 12286 14.000000 76.000000
23 4441 28.123450 80.123450
24 2893 38.000000 68.000000
25 12279 13.256470 75.236470
26 14843 28.480818 77.577682
27 347 28.280570 77.290430
28 12660 14.180000 76.540000
29 12276 14.180000 76.540000
... ... ... ...
130 3526 17.326828 78.274666
131 1648 26.264776 82.072706
132 10215 21.385554 78.921752
133 8222 9.453285 77.802417
134 6110 22.530987 72.799109
135 14012 21.871514 73.503052
136 6884 15.751870 76.192864
137 10221 19.138251 77.320956
138 10080 18.679164 73.896865
139 14045 27.106176 78.588167
140 7818 13.075239 79.655824
141 8101 9.925201 78.119775
142 3451 13.628756 79.419179
143 13198 19.664062 78.532011
144 12979 17.156175 82.286138
145 9815 20.100330 79.114568
146 3910 12.827145 76.759622
147 9290 19.138251 77.320956
148 9359 19.876165 75.343314
149 14303 15.505723 80.049922
150 12415 12.971599 77.594563
151 9560 20.466345 79.983103
152 13356 17.189853 78.648384
153 3874 31.533240 76.892273
154 6667 16.172536 75.655721
155 15889 16.757173 81.679963
156 13060 16.289009 80.262797
157 9841 16.852397 74.581477
158 12140 9.318327 76.611085
159 16466 16.455396 80.025546

5694 rows × 3 columns

In [61]:
#Change dataframe to geospatial and check by exporting to shp
geometry = [Point(xy) for xy in zip(df.longitude, df.latitude)]
crs = {'init': 'epsg:4326'}
geo_df = gpd.GeoDataFrame(df, crs=crs, geometry=geometry)
out_file = os.path.join(output_folder, "standalone_geocoded.shp")
geo_df.to_file(driver='ESRI Shapefile', filename=out_file)
In [ ]:
 
In [ ]: