In [1]:
#Import Libraries
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, Polygon
import json
import requests
import math
import os
import time
In [2]:
# Import folders
data_folder = os.path.abspath('data')
d2014 = os.path.join(data_folder, 'Data User_2014_15')
d2015 = os.path.join(data_folder, 'Data User_2015_16')
d2016 = os.path.join(data_folder, 'Data User_2016_17')
folders = [d2014, d2015, d2016]
# Set Output Folder
output_folder = os.path.abspath("output")
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
In [3]:
# Read Data for all year function
def read_all_year(file):
    df = None
    df_list = []
    for folder in folders:
        df1 = pd.read_csv(os.path.join(folder, file))
        df_list.append(df1)
    df = pd.concat(df_list)
    return df
In [23]:
# Read Data
df = read_all_year('university.csv')
geocoded_df = df[ ( ( df['latitude']  > 0) & ( df['longitude'] > 0) ) ].drop_duplicates(['id'], keep='last').reset_index(drop=True)
df = df[ ( np.invert( df['latitude']  > 0) & np.invert( df['longitude'] > 0) ) ].drop_duplicates(['id'], keep='last').reset_index(drop=True)
C:\Users\sandyjones\AppData\Local\conda\conda\envs\geo\lib\site-packages\ipykernel_launcher.py:8: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  
In [24]:
# Read Pincode Data
df_list = []
downloaded_folder = os.path.join(data_folder, 'Downloaded')
post_folder = os.path.join(downloaded_folder, 'Post')
headpo_folder = os.path.join(post_folder, 'HeadPO')
headpo_df = gpd.read_file(os.path.join(headpo_folder, 'HeadPO.shp'))
headpo_df = headpo_df[['pincode','LATITUDE', 'LONGITUDE']]
headpo_df.columns = ['pincode','lat', 'long']
df_list.append(headpo_df)
subpo_folder = os.path.join(post_folder, 'SubPO')
subpo_df = gpd.read_file(os.path.join(subpo_folder, 'SubPO.shp'))
subpo_df = subpo_df[['pincode','LATITUDE', 'LONGITUDE']]
subpo_df.columns = ['pincode','lat', 'long']
df_list.append(subpo_df)
general_folder = os.path.join(post_folder, 'GeneralPO')
general_df = gpd.read_file(os.path.join(general_folder, 'GeneralPO.shp'))
general_df = general_df[['pincode','LATITUDE', 'LONGITUDE']]
general_df.columns = ['pincode','lat', 'long']
df_list.append(general_df)
branch_folder = os.path.join(post_folder, 'BranchPO')
branch_df = gpd.read_file(os.path.join(branch_folder, 'BranchPO.shp'))
branch_df['LATITUDE'] = branch_df.geometry.y
branch_df['LONGITUDE'] = branch_df.geometry.x
branch_df = branch_df[['PINCODE','LATITUDE', 'LONGITUDE']]
branch_df.columns = ['pincode','lat', 'long']
df_list.append(branch_df)
post_df = pd.concat(df_list)
In [25]:
# Join pincode
df = pd.merge(df, post_df, how = 'left', left_on = 'pin_code', right_on = 'pincode')
df['latitude'] = df['lat']
df['longitude'] = df['long']
df = df.drop(columns=['pincode', 'lat', 'long'])
geocoded1_df = df[ ( ( df['latitude']  > 0) & ( df['longitude'] > 0) ) ].drop_duplicates(['id'], keep='last').reset_index(drop=True)
geocoded_df = pd.concat([geocoded_df, geocoded1_df])
df = df[ ( np.invert( df['latitude']  > 0) & np.invert( df['longitude'] > 0) ) ].drop_duplicates(['id'], keep='last').reset_index(drop=True)
In [26]:
# to csv
to_geocode = os.path.join(output_folder, "university_non_geocoded.csv")
#df.to_csv(to_geocode, encoding='utf-8', index=False)
In [28]:
# Get Geocoded universities
df = pd.read_csv(to_geocode, encoding = "ISO-8859-1")
df['latitude'] = df['_latitude']
df['longitude'] = df['_longitude']
df = pd.concat([geocoded_df, df])
df = df[['id', 'latitude', 'longitude']]
df.to_csv(os.path.join(output_folder, "university_geocoded.csv"), encoding='utf-8', index=False)
df
C:\Users\sandyjones\AppData\Local\conda\conda\envs\geo\lib\site-packages\ipykernel_launcher.py:5: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  """
Out[28]:
id latitude longitude
0 323 18.533300 73.866700
1 147 23.000000 72.000000
2 82 19.000000 82.000000
3 4 17.000000 73.000000
4 767 27.080001 93.400001
5 504 26.280010 80.210010
6 546 25.540001 81.850000
7 404 26.493410 75.273810
8 402 24.580010 73.680010
9 667 28.470000 77.030000
10 709 29.869745 73.942707
11 171 29.430001 77.200001
12 710 25.026110 85.413330
13 732 26.487764 75.203417
14 706 23.584795 72.954487
15 660 31.532210 75.917210
16 105 28.546010 77.273010
17 640 27.976436 76.387210
18 645 30.173450 77.382410
19 90 22.000050 82.000130
20 287 23.250010 68.250010
21 764 27.338930 88.606510
22 422 26.880010 75.810010
23 74 25.617010 85.167010
24 243 23.250001 77.694260
25 813 23.100010 77.250010
26 220 13.021860 77.567142
27 371 30.410106 74.440521
28 503 25.000010 83.000010
29 692 26.236150 72.021890
... ... ... ...
233 520 28.317794 79.443642
234 542 28.905528 78.065051
235 164 29.989671 76.781157
236 41 17.209830 77.602610
237 543 28.973720 77.641110
238 501 26.845110 80.941780
239 114 28.551220 77.208170
240 564 30.356049 77.921233
241 7 17.429970 78.417420
242 381 30.902390 75.814690
243 98 28.744110 77.145000
244 876 13.158816 77.572080
0 493 23.844716 91.428499
1 376 31.633374 74.827248
2 272 23.288350 77.275715
3 541 28.472720 77.482002
4 828 19.310522 84.827506
5 229 16.830626 75.640583
6 495 23.761059 91.265598
7 550 25.830749 82.681763
8 358 20.485206 85.791650
9 737 30.192774 78.164742
10 346 23.800644 92.727956
11 141 23.240727 72.654945
12 24 17.394557 78.498526
13 659 15.034025 75.175530
14 386 30.386539 76.324167
15 760 13.555559 80.026880
16 343 25.567454 91.899960
17 290 22.806404 81.749488

884 rows × 3 columns

In [9]:
#Change dataframe to geospatial and check by exporting to shp
geometry = [Point(xy) for xy in zip(df.longitude, df.latitude)]
crs = {'init': 'epsg:4326'}
geo_df = gpd.GeoDataFrame(df, crs=crs, geometry=geometry)
out_file = os.path.join(output_folder, "university_geocoded.shp")
geo_df.to_file(driver='ESRI Shapefile', filename=out_file)
In [30]:
# Standalore Universities
df = read_all_year('standalone_institution.csv')
geocoded_df = df[ ( ( df['latitude']  > 0) & ( df['longitude'] > 0) ) ].drop_duplicates(['id'], keep='last').reset_index(drop=True)
df = df[ ( np.invert( df['latitude']  > 0) & np.invert( df['longitude'] > 0) ) ].drop_duplicates(['id'], keep='last').reset_index(drop=True)
C:\Users\sandyjones\AppData\Local\conda\conda\envs\geo\lib\site-packages\ipykernel_launcher.py:8: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  
In [32]:
# Standalore Universities
# to csv
to_geocode = os.path.join(output_folder, "standalone_non_geocoded.csv")
#df.to_csv(to_geocode, encoding='utf-8', index=False)
In [35]:
# Standalore Universities
df = pd.read_csv(to_geocode, encoding = "ISO-8859-1")
df['latitude'] = df['_latitude']
df['longitude'] = df['_longitude']
df = df[ ( ( df['latitude']  > 0) & ( df['longitude'] > 0) ) ]
df = df.drop_duplicates(['id'], keep='first').reset_index(drop=True)
df = pd.concat([geocoded_df, df])
df = df[['id', 'latitude', 'longitude']]
df.to_csv(os.path.join(output_folder, "standalone_geocoded.csv"), encoding='utf-8', index=False)
df
C:\Users\sandyjones\AppData\Local\conda\conda\envs\geo\lib\site-packages\ipykernel_launcher.py:7: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  import sys
Out[35]:
id latitude longitude
0 14518 28.587225 77.227095
1 6466 15.366900 75.128268
2 6464 15.366900 75.128268
3 3596 10.000000 72.000000
4 14622 14.235650 70.214589
5 989 29.676140 76.741250
6 12408 20.568950 75.356890
7 10863 18.493866 73.871494
8 389 28.461110 77.491410
9 4433 23.256720 75.256720
10 12616 13.790001 76.280001
11 6830 14.280010 76.200010
12 6751 12.923050 77.484651
13 1099 30.122310 77.855010
14 14753 28.250032 77.380007
15 4524 23.568990 75.256890
16 2080 20.000000 68.000000
17 2451 29.961655 75.123310
18 2435 31.000000 74.000000
19 6699 16.226710 76.236720
20 6559 16.000000 75.000000
21 4457 10.782788 76.008996
22 12286 14.000000 76.000000
23 4441 28.123450 80.123450
24 2893 38.000000 68.000000
25 12279 13.256470 75.236470
26 14843 28.480818 77.577682
27 347 28.280570 77.290430
28 12660 14.180000 76.540000
29 12276 14.180000 76.540000
... ... ... ...
3670 13224 17.321832 78.454296
3671 10812 18.520466 73.856741
3672 5039 31.276393 75.554326
3673 4906 31.514318 75.911483
3674 16610 18.867974 77.172559
3675 2032 20.503057 76.198081
3676 16206 20.128240 75.483305
3677 15237 25.776207 84.153540
3678 14518 28.587364 77.227452
3679 2474 30.694538 74.648591
3680 2463 31.133893 76.120398
3681 13239 17.334960 78.570601
3682 15515 21.500740 83.885025
3683 1824 18.406296 77.134299
3684 3549 17.335802 78.573527
3685 7496 31.326015 75.576183
3686 13313 9.988525 76.320148
3687 2424 30.404733 75.873030
3688 16667 22.117117 84.046209
3689 4862 21.878500 84.027800
3690 16606 17.546250 78.283393
3691 4932 30.900965 75.857276
3692 12153 8.958650 76.681950
3693 16644 17.546250 78.283393
3694 3502 17.488186 78.382262
3695 3563 17.486750 78.319960
3696 13229 17.400043 78.458975
3697 2411 32.066312 75.443656
3698 16669 28.876556 77.603947
3699 13196 18.875595 79.459138

9234 rows × 3 columns

In [36]:
#Change dataframe to geospatial and check by exporting to shp
geometry = [Point(xy) for xy in zip(df.longitude, df.latitude)]
crs = {'init': 'epsg:4326'}
geo_df = gpd.GeoDataFrame(df, crs=crs, geometry=geometry)
out_file = os.path.join(output_folder, "standalone_geocoded.shp")
geo_df.to_file(driver='ESRI Shapefile', filename=out_file)
In [ ]:
 
In [ ]: