In [12]:
#Import Libraries
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, Polygon
import json
import requests
import math
import os
import time
In [13]:
# Import folders
data_folder = os.path.abspath('data')

d2011 = os.path.join(data_folder, 'Data User_2011_12')
d2012 = os.path.join(data_folder, 'Data User_2012_13')
d2013 = os.path.join(data_folder, 'Data User_2013_14')
d2014 = os.path.join(data_folder, 'Data User_2014_15')
d2015 = os.path.join(data_folder, 'Data User_2015_16')
d2016 = os.path.join(data_folder, 'Data User_2016_17')
d2017 = os.path.join(data_folder, 'Data User_2017_18')
folders = [d2011, d2012, d2013, d2014, d2015, d2016, d2017]
# Set Output Folder
output_folder = os.path.abspath("output")
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
In [14]:
# Read Data for all year function
def read_all_year(file):
    df = None
    df_list = []
    for folder in folders:
        df1 = pd.read_csv(os.path.join(folder, file))
        df_list.append(df1)
    df = pd.concat(df_list)
    return df
In [15]:
# Read College Data
df = read_all_year('college.csv')
df = df.drop_duplicates(['id'], keep='last')
df1 = read_all_year('college_institution.csv')
#df1 = df1[( (df1['latitude'] > 0) & (df1['longitude'] > 0) ) ]
#df1 = df1.drop_duplicates(['id'], keep='last')
#df2 = df2.drop_duplicates(['id'], keep='last')
C:\Users\sandyjones\AppData\Local\conda\conda\envs\geo\lib\site-packages\IPython\core\interactiveshell.py:2903: DtypeWarning: Columns (37,38,39,40,41) have mixed types. Specify dtype option on import or set low_memory=False.
  if self.run_code(code, result):
C:\Users\sandyjones\AppData\Local\conda\conda\envs\geo\lib\site-packages\IPython\core\interactiveshell.py:2903: DtypeWarning: Columns (48) have mixed types. Specify dtype option on import or set low_memory=False.
  if self.run_code(code, result):
C:\Users\sandyjones\AppData\Local\conda\conda\envs\geo\lib\site-packages\IPython\core\interactiveshell.py:2903: DtypeWarning: Columns (44) have mixed types. Specify dtype option on import or set low_memory=False.
  if self.run_code(code, result):
C:\Users\sandyjones\AppData\Local\conda\conda\envs\geo\lib\site-packages\IPython\core\interactiveshell.py:2903: DtypeWarning: Columns (37,38,39,40,44,45,49) have mixed types. Specify dtype option on import or set low_memory=False.
  if self.run_code(code, result):
C:\Users\sandyjones\AppData\Local\conda\conda\envs\geo\lib\site-packages\IPython\core\interactiveshell.py:2903: DtypeWarning: Columns (37,38,39,40,44,45) have mixed types. Specify dtype option on import or set low_memory=False.
  if self.run_code(code, result):
C:\Users\sandyjones\AppData\Local\conda\conda\envs\geo\lib\site-packages\ipykernel_launcher.py:8: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  
In [16]:
df = df.join(df1, on='id', how='left', lsuffix='_x', rsuffix='_y', sort=False).drop_duplicates(['id'], keep='last').reset_index(drop=True)
df['id'] = df['id_x']
df = df.drop(columns=['id_x'])
geocoded_df = df[ ( ( df['latitude']  > 0) & ( df['longitude'] > 0) ) ].drop_duplicates(['id'], keep='last').reset_index(drop=True)
df = df[ ( np.invert( df['latitude']  > 0) & np.invert( df['longitude'] > 0) ) ].drop_duplicates(['id'], keep='last').reset_index(drop=True)
df['pin_code']   = pd.to_numeric(df['pin_code'], errors='coerce')
In [17]:
# Read Pincode Data
df_list = []
downloaded_folder = os.path.join(data_folder, 'Downloaded')
post_folder = os.path.join(downloaded_folder, 'Post')
headpo_folder = os.path.join(post_folder, 'HeadPO')
headpo_df = gpd.read_file(os.path.join(headpo_folder, 'HeadPO.shp'))
headpo_df = headpo_df[['pincode','LATITUDE', 'LONGITUDE']]
headpo_df.columns = ['pincode','lat', 'long']
df_list.append(headpo_df)
subpo_folder = os.path.join(post_folder, 'SubPO')
subpo_df = gpd.read_file(os.path.join(subpo_folder, 'SubPO.shp'))
subpo_df = subpo_df[['pincode','LATITUDE', 'LONGITUDE']]
subpo_df.columns = ['pincode','lat', 'long']
df_list.append(subpo_df)
general_folder = os.path.join(post_folder, 'GeneralPO')
general_df = gpd.read_file(os.path.join(general_folder, 'GeneralPO.shp'))
general_df = general_df[['pincode','LATITUDE', 'LONGITUDE']]
general_df.columns = ['pincode','lat', 'long']
df_list.append(general_df)
branch_folder = os.path.join(post_folder, 'BranchPO')
branch_df = gpd.read_file(os.path.join(branch_folder, 'BranchPO.shp'))
branch_df['LATITUDE'] = branch_df.geometry.y
branch_df['LONGITUDE'] = branch_df.geometry.x
branch_df = branch_df[['PINCODE','LATITUDE', 'LONGITUDE']]
branch_df.columns = ['pincode','lat', 'long']
df_list.append(branch_df)
post_df = pd.concat(df_list)
In [18]:
# Join pincode
df = pd.merge(df, post_df, how = 'left', left_on = 'pin_code', right_on = 'pincode')
df['latitude'] = df['lat']
df['longitude'] = df['long']
df = df.drop(columns=['pincode', 'lat', 'long'])
geocoded1_df = df[ ( ( df['latitude']  > 0) & ( df['longitude'] > 0) ) ].drop_duplicates(['id'], keep='last').reset_index(drop=True)
geocoded_df = pd.concat([geocoded_df, geocoded1_df])
df = df[ ( np.invert( df['latitude']  > 0) & np.invert( df['longitude'] > 0) ) ].drop_duplicates(['id'], keep='last').reset_index(drop=True)
In [19]:
# to csv
to_geocode = os.path.join(output_folder, "Colleges_non_geocoded.csv")
#df.to_csv(to_geocode, encoding='utf-8', index=False)
In [20]:
df = pd.read_csv(to_geocode, encoding = "ISO-8859-1")
df['latitude'] = df['_latitude']
df['longitude'] = df['_longitude']
df = df[ ( ( df['latitude']  > 0) & ( df['longitude'] > 0) ) ]
df = df.drop_duplicates(['id'], keep='first').reset_index(drop=True)
df = pd.concat([geocoded_df, df])
df = df[['id', 'latitude', 'longitude']]
df.to_csv(os.path.join(output_folder, "Colleges_geocoded.csv"), encoding='utf-8', index=False)
df
C:\Users\sandyjones\AppData\Local\conda\conda\envs\geo\lib\site-packages\IPython\core\interactiveshell.py:2785: DtypeWarning: Columns (41,51) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
C:\Users\sandyjones\AppData\Local\conda\conda\envs\geo\lib\site-packages\ipykernel_launcher.py:6: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  
Out[20]:
id latitude longitude
0 8937 17.372404 78.510087
1 118 24.174051 72.433099
2 294 20.764111 73.691277
3 33092 28.443849 76.774039
4 99 21.025305 83.081451
5 122 21.483308 70.333292
6 204 22.111155 72.221157
7 236 23.030006 72.458999
8 254 22.189030 71.634075
9 196 19.532552 73.093771
10 311 22.377476 72.999201
11 680 23.271521 73.172341
12 9382 28.059010 80.092310
13 6136 21.520010 71.830010
14 6149 17.392136 78.462495
15 6155 18.310010 73.550010
16 6165 16.945772 81.409743
17 6183 17.210420 74.555250
18 6190 25.540001 86.000050
19 6206 21.548673 73.009712
20 6250 17.800010 83.400010
21 6319 20.000120 78.460001
22 6336 13.245120 74.245120
23 9416 18.223230 75.124250
24 17096 29.948776 78.165397
25 17099 24.047348 74.768086
26 17117 22.624036 75.797733
27 17360 17.231641 82.203154
28 7200 22.340001 88.200001
29 13776 13.426080 77.730401
... ... ... ...
18622 58182 23.235526 77.400148
18623 58184 24.118587 75.044495
18624 58186 24.911314 80.231839
18625 58187 24.989987 79.043095
18626 58192 14.616700 75.616700
18627 58129 18.020099 77.259293
18628 58195 9.951145 76.630626
18629 58188 22.600796 72.820252
18630 58130 17.872103 76.947022
18631 58119 16.423535 76.935498
18632 58154 16.423535 76.935498
18633 58199 25.122114 75.814770
18634 58201 26.287562 75.715514
18635 58205 27.318153 77.076593
18636 58207 15.695110 73.722626
18637 58210 21.092397 71.772001
18638 58208 8.782482 77.613654
18639 58212 24.112719 71.780827
18640 58213 8.532488 77.035186
18641 58215 18.024258 75.519930
18642 58214 21.754022 72.131970
18643 58197 17.567628 76.566162
18644 58222 21.012075 78.246649
18645 58221 18.302055 76.663349
18646 58223 18.969862 73.327506
18647 58227 20.987774 77.757465
18648 58226 18.648296 73.775804
18649 58228 29.622841 76.934057
18650 58229 30.627111 74.261420
18651 58211 21.055564 70.529468

45818 rows × 3 columns

In [21]:
#Change dataframe to geospatial and check by exporting to shp
geometry = [Point(xy) for xy in zip(df.longitude, df.latitude)]
crs = {'init': 'epsg:4326'}
geo_df = gpd.GeoDataFrame(df, crs=crs, geometry=geometry)
out_file = os.path.join(output_folder, "Colleges_geocoded.shp")
geo_df.to_file(driver='ESRI Shapefile', filename=out_file)
In [ ]: