In [21]:
# Import Librariesa
import pandas as pd
import geopandas as gpd
import lxml
import os
import glob
import time
import datetime
import json
import math
In [22]:
# Set Output Folder
output_folder = os.path.abspath("output")
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
In [23]:
# Import data folders
data_folder = os.path.abspath("data")
d2011 = os.path.join(data_folder, 'Data User_2011_12')
d2012 = os.path.join(data_folder, 'Data User_2012_13')
d2013 = os.path.join(data_folder, 'Data User_2013_14')
d2014 = os.path.join(data_folder, 'Data User_2014_15')
d2015 = os.path.join(data_folder, 'Data User_2015_16')
d2016 = os.path.join(data_folder, 'Data User_2016_17')
d2017 = os.path.join(data_folder, 'Data User_2017_18')
folders = [d2011, d2012, d2013, d2014, d2015, d2016, d2017]
download_folder = os.path.join(data_folder, "Downloaded")
geocoded_data_folder = os.path.join(data_folder, "Geocoded")
## Make Noise
print("data found for these themes:")
i = 0
for folder in folders:
    print(i,") ",folder)
    i+=1
data found for these themes:
0 )  E:\workspace\sourav sarkar\Task 18 - Join new AISHE data\data\Data User_2011_12
1 )  E:\workspace\sourav sarkar\Task 18 - Join new AISHE data\data\Data User_2012_13
2 )  E:\workspace\sourav sarkar\Task 18 - Join new AISHE data\data\Data User_2013_14
3 )  E:\workspace\sourav sarkar\Task 18 - Join new AISHE data\data\Data User_2014_15
4 )  E:\workspace\sourav sarkar\Task 18 - Join new AISHE data\data\Data User_2015_16
5 )  E:\workspace\sourav sarkar\Task 18 - Join new AISHE data\data\Data User_2016_17
6 )  E:\workspace\sourav sarkar\Task 18 - Join new AISHE data\data\Data User_2017_18
In [37]:
# Read College Data
df_list = []
for folder in folders:
    print("--Reading data from folder ", folder)
    ## Read General data
    college = os.path.join(folder, "college.csv")
    cdf = pd.read_csv(college)
    cdf

    ## Read Institution data
    college_institution = os.path.join(folder, "college_institution.csv")
    cidf = pd.read_csv(college_institution)
    cidf

    ## Read Geocoded data
    #geocoded_data_folder = os.path.join(folder, "Geocoded")
    Colleges_geocoded = os.path.join(geocoded_data_folder, "Colleges_geocoded.shp.csv")
    cgdf = pd.read_csv(Colleges_geocoded)
    cgdf

    ## merge data
    colleges_merge_df = pd.merge(cgdf, cdf, on=['id'], how='inner')
    colleges_merge_df = pd.merge(colleges_merge_df, cidf, on=['id'], how='inner')

    ## create Datetime Column
    colleges_merge_df['year'] = colleges_merge_df['year_of_establishment'].astype(float).fillna(0.0).astype(int)
    colleges_merge_df = colleges_merge_df[colleges_merge_df['year'] > 1700]
    colleges_merge_df['month'] = 9
    colleges_merge_df['day'] = 30
    colleges_merge_df['datetime'] = pd.to_datetime(colleges_merge_df[['year', 'month', 'day']])
    
    df_list.append(colleges_merge_df)
    
df = pd.concat(df_list)
df.drop('remarks', axis=1, inplace=True)
# Output to CSV
df.to_csv(os.path.join(output_folder, "Colleges_Data_Joined.csv"), line_terminator='\n',encoding='utf-8', index=False)
--Reading data from folder  E:\workspace\sourav sarkar\Task 18 - Join new AISHE data\data\Data User_2011_12
C:\Users\sandyjones\AppData\Local\conda\conda\envs\geo\lib\site-packages\IPython\core\interactiveshell.py:2785: DtypeWarning: Columns (37,38,39,40,41) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
--Reading data from folder  E:\workspace\sourav sarkar\Task 18 - Join new AISHE data\data\Data User_2012_13
--Reading data from folder  E:\workspace\sourav sarkar\Task 18 - Join new AISHE data\data\Data User_2013_14
C:\Users\sandyjones\AppData\Local\conda\conda\envs\geo\lib\site-packages\IPython\core\interactiveshell.py:2785: DtypeWarning: Columns (48) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
--Reading data from folder  E:\workspace\sourav sarkar\Task 18 - Join new AISHE data\data\Data User_2014_15
C:\Users\sandyjones\AppData\Local\conda\conda\envs\geo\lib\site-packages\IPython\core\interactiveshell.py:2785: DtypeWarning: Columns (44) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
--Reading data from folder  E:\workspace\sourav sarkar\Task 18 - Join new AISHE data\data\Data User_2015_16
--Reading data from folder  E:\workspace\sourav sarkar\Task 18 - Join new AISHE data\data\Data User_2016_17
C:\Users\sandyjones\AppData\Local\conda\conda\envs\geo\lib\site-packages\IPython\core\interactiveshell.py:2785: DtypeWarning: Columns (37,38,39,40,44,45,49) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
--Reading data from folder  E:\workspace\sourav sarkar\Task 18 - Join new AISHE data\data\Data User_2017_18
C:\Users\sandyjones\AppData\Local\conda\conda\envs\geo\lib\site-packages\IPython\core\interactiveshell.py:2785: DtypeWarning: Columns (37,38,39,40,44,45) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
C:\Users\sandyjones\AppData\Local\conda\conda\envs\geo\lib\site-packages\ipykernel_launcher.py:34: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

In [38]:
# Read University data
df_list = []
for folder in folders:
    print("--Reading data from folder ", folder)
    ## Read General data
    university = os.path.join(folder, "university.csv")
    udf = pd.read_csv(university)
    udf

    ## Read Geocoded data
    #geocoded_data_folder = os.path.join(data_folder, "Geocoded")
    university_geocoded = os.path.join(geocoded_data_folder, "university_geocoded.shp.csv")
    ugdf = pd.read_csv(university_geocoded)
    ugdf

    ## merge data
    university_merge_df = pd.merge(ugdf, udf, on=['id'], how='inner')

    ## create Datetime Column
    university_merge_df['year'] = university_merge_df['year_of_establishment'].astype(float).fillna(0.0).astype(int)
    university_merge_df = university_merge_df[university_merge_df['year'] > 2000]
    university_merge_df['month'] = 9
    university_merge_df['day'] = 30
    university_merge_df['datetime'] = pd.to_datetime(university_merge_df[['year', 'month', 'day']])

    df_list.append(university_merge_df)
    
df = pd.concat(df_list)
df.drop('remarks', axis=1, inplace=True)
# Output to CSV
df.to_csv(os.path.join(output_folder, "Universities_Data_Joined.csv"), line_terminator='\n', encoding='utf-8', index=False)
--Reading data from folder  E:\workspace\sourav sarkar\Task 18 - Join new AISHE data\data\Data User_2011_12
--Reading data from folder  E:\workspace\sourav sarkar\Task 18 - Join new AISHE data\data\Data User_2012_13
--Reading data from folder  E:\workspace\sourav sarkar\Task 18 - Join new AISHE data\data\Data User_2013_14
--Reading data from folder  E:\workspace\sourav sarkar\Task 18 - Join new AISHE data\data\Data User_2014_15
--Reading data from folder  E:\workspace\sourav sarkar\Task 18 - Join new AISHE data\data\Data User_2015_16
--Reading data from folder  E:\workspace\sourav sarkar\Task 18 - Join new AISHE data\data\Data User_2016_17
--Reading data from folder  E:\workspace\sourav sarkar\Task 18 - Join new AISHE data\data\Data User_2017_18
C:\Users\sandyjones\AppData\Local\conda\conda\envs\geo\lib\site-packages\ipykernel_launcher.py:28: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

In [35]:
# Read Standalone Institute data
df_list = []
for folder in folders:
    print("--Reading data from folder ", folder)
    ## Read Institution data
    standalone_institution = os.path.join(folder, "standalone_institution.csv")
    sidf = pd.read_csv(standalone_institution)
    sidf

    ## Read Geocoded data
    standalone_geocoded = os.path.join(geocoded_data_folder, "standalone_geocoded.shp.csv")
    sgdf = pd.read_csv(standalone_geocoded)
    sgdf

    ## merge data
    standalone_merge_df = pd.merge(sgdf, sidf, on=['id'], how='inner')

    ## create Datetime Column
    standalone_merge_df['year'] = standalone_merge_df['year_of_establishment'].astype(float).fillna(0.0).astype(int)
    standalone_merge_df = standalone_merge_df[standalone_merge_df['year'] > 2000]
    standalone_merge_df['month'] = 9
    standalone_merge_df['day'] = 30
    standalone_merge_df['datetime'] = pd.to_datetime(standalone_merge_df[['year', 'month', 'day']])

    df_list.append(standalone_merge_df)
    
df = pd.concat(df_list)
df.drop('remarks', axis=1, inplace=True)
# Output to CSV
df.to_csv(os.path.join(output_folder, "Standalone_Data_Joined.csv"), line_terminator='\n', encoding='utf-8', index=False)
--Reading data from folder  E:\workspace\sourav sarkar\Task 18 - Join new AISHE data\data\Data User_2011_12
--Reading data from folder  E:\workspace\sourav sarkar\Task 18 - Join new AISHE data\data\Data User_2012_13
--Reading data from folder  E:\workspace\sourav sarkar\Task 18 - Join new AISHE data\data\Data User_2013_14
--Reading data from folder  E:\workspace\sourav sarkar\Task 18 - Join new AISHE data\data\Data User_2014_15
--Reading data from folder  E:\workspace\sourav sarkar\Task 18 - Join new AISHE data\data\Data User_2015_16
--Reading data from folder  E:\workspace\sourav sarkar\Task 18 - Join new AISHE data\data\Data User_2016_17
--Reading data from folder  E:\workspace\sourav sarkar\Task 18 - Join new AISHE data\data\Data User_2017_18
C:\Users\sandyjones\AppData\Local\conda\conda\envs\geo\lib\site-packages\ipykernel_launcher.py:27: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

In [36]:
df
Out[36]:
AC_NAME AC_NO DIST_NAME DT_CODE OBJECTID PC_ID PC_NAME PC_NO STATUS ST_CODE ... staff_quarter_available staff_quarter_id state_code student_hostel_available survey_year university_id website year year_of_establishment year_of_recognition
0 New Delhi 40 NaN NaN 4 704 NEW DELHI 4 NaN 7 ... False NaN 27 True 2011 NaN www.coppaniv.com 2006 2006.0 2006.0
1 Malshiras (SC) 254 SOLAPUR 30.0 43 2743 MADHA 43 NaN 27 ... False NaN 27 True 2011 NaN www.coppaniv.com 2006 2006.0 2006.0
2 New Delhi 40 NaN NaN 4 704 NEW DELHI 4 NaN 7 ... False NaN 29 False 2011 NaN NaN 2005 2005.0 2005.0
3 New Delhi 40 NaN NaN 4 704 NEW DELHI 4 NaN 7 ... False NaN 27 False 2011 NaN www.priyadarhiniss.org.in 2005 2005.0 2005.0
5 New Delhi 40 NaN NaN 4 704 NEW DELHI 4 NaN 7 ... True 11816.0 36 True 2011 NaN www.fernandezhospital.com 2005 2005.0 2006.0
6 Medchal 43 RANGAREDDI 6.0 7 2807 MALKAJGIRI 7 NaN 28 ... True 11816.0 36 True 2011 NaN www.fernandezhospital.com 2005 2005.0 2006.0
10 Hubli-Dharwad-Central 73 DHARWAD 9.0 11 2911 DHARWAD 11 NaN 29 ... False NaN 29 False 2011 NaN www.kssgadag.org 2005 2005.0 2005.0
11 Hubli-Dharwad-Central 73 DHARWAD 9.0 11 2911 DHARWAD 11 NaN 29 ... False NaN 29 False 2011 NaN www.kssdedhubli.com 2004 2004.0 2004.0
16 Hubli-Dharwad-Central 73 DHARWAD 9.0 11 2911 DHARWAD 11 NaN 29 ... False NaN 29 True 2011 NaN NaN 2004 2004.0 2004.0
17 Hubli-Dharwad-Central 73 DHARWAD 9.0 11 2911 DHARWAD 11 NaN 29 ... False NaN 29 False 2011 NaN www.sanatrust.com 2005 2005.0 2005.0
18 Hubli-Dharwad-Central 73 DHARWAD 9.0 11 2911 DHARWAD 11 NaN 29 ... False NaN 29 True 2011 NaN www.vinsnursing.com 2001 2001.0 2003.0
19 Hubli-Dharwad-Central 73 DHARWAD 9.0 11 2911 DHARWAD 11 NaN 29 ... False NaN 29 False 2011 NaN www.gpthubli.com 2009 2009.0 2009.0
20 Hubli-Dharwad-Central 73 DHARWAD 9.0 11 2911 DHARWAD 11 NaN 29 ... False NaN 29 False 2011 NaN NaN 2005 2005.0 2005.0
24 Hubli-Dharwad-Central 73 DHARWAD 9.0 11 2911 DHARWAD 11 NaN 29 ... False NaN 29 False 2011 NaN NaN 2003 2003.0 2003.0
25 Hubli-Dharwad-Central 73 DHARWAD 9.0 11 2911 DHARWAD 11 NaN 29 ... True 11096.0 29 True 2011 240 klesnursing.com 2003 2003.0 2003.0
26 Hubli-Dharwad-Central 73 DHARWAD 9.0 11 2911 DHARWAD 11 NaN 29 ... True 11673.0 29 True 2011 NaN martinluther.in 2004 2004.0 2004.0
27 Nilokheri(SC) 19 KARNAL 6.0 5 605 KARNAL 5 NaN 6 ... False NaN 6 False 2011 NaN www.indologyseenk.com 2008 2008.0 2008.0
30 Sillod 104 AURANGABAD 19.0 18 2718 JALNA 18 NaN 27 ... False NaN 27 False 2011 NaN WWW.VASANTRAOMORE.ORG 2011 2011.0 2011.0
31 Erandol 16 JALGAON 3.0 3 2703 JALGAON 3 NaN 27 ... False NaN 27 False 2011 NaN WWW.VASANTRAOMORE.ORG 2011 2011.0 2011.0
32 Sillod 104 AURANGABAD 19.0 18 2718 JALNA 18 NaN 27 ... False NaN 29 False 2011 NaN dte.kar.nic.in 2009 2009.0 2009.0
33 Parvati 212 PUNE 25.0 34 2734 PUNE 34 NaN 27 ... False NaN 27 False 2011 NaN www.micedu.org 2009 2009.0 2009.0
39 Parvati 212 PUNE 25.0 34 2734 PUNE 34 NaN 27 ... False NaN 27 True 2011 NaN www.instituteofpharmacy.org 2003 2003.0 2003.0
40 Jewar 63 GAUTAM BUDDHA NAGAR * 10.0 13 913 GAUTAM BUDDHA NAGAR 13 NaN 9 ... False NaN 9 True 2011 506 www.piprams.com 2005 2005.0 2005.0
41 Jewar 63 GAUTAM BUDDHA NAGAR * 10.0 13 913 GAUTAM BUDDHA NAGAR 13 NaN 9 ... True 16368.0 9 True 2011 NaN NaN 2005 2005.0 2005.0
42 Jewar 63 GAUTAM BUDDHA NAGAR * 10.0 13 913 GAUTAM BUDDHA NAGAR 13 NaN 9 ... True 16368.0 9 True 2011 NaN NaN 2005 2005.0 2005.0
43 Jewar 63 GAUTAM BUDDHA NAGAR * 10.0 13 913 GAUTAM BUDDHA NAGAR 13 NaN 9 ... False NaN 9 True 2011 NaN WWW.LLOYDBUSINESSSCHOOL.COM 2008 2008.0 2008.0
46 Ratlam Rural (ST) 219 RATLAM 20.0 24 2324 RATLAM (ST) 24 NaN 23 ... True 22144.0 9 True 2011 NaN www.sbbgi.ac.in 2010 2010.0 2011.0
47 Ratlam Rural (ST) 219 RATLAM 20.0 24 2324 RATLAM (ST) 24 NaN 23 ... False NaN 33 False 2011 NaN NaN 2001 2001.0 2001.0
51 Ratlam Rural (ST) 219 RATLAM 20.0 24 2324 RATLAM (ST) 24 NaN 23 ... False NaN 24 True 2011 NaN www.dietdahod.org 2002 2002.0 2002.0
52 Ratlam Rural (ST) 219 RATLAM 20.0 24 2324 RATLAM (ST) 24 NaN 23 ... False NaN 33 False 2011 NaN www.btcangels-apc.com 2011 2011.0 2011.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7081 BHANDARIPOKHARI 43 BHADRAK * 9.0 7 2107 BHADRAK (SC) 7 NaN 21 ... True 77692.0 21 True 2017 NaN NaN 2006 2006.0 2006.0
7083 Wyra (ST) 115 KHAMMAM 10.0 17 2817 KHAMMAM 17 NaN 28 ... False NaN 36 False 2017 NaN www.skec.ac.in 2001 2001.0 2001.0
7084 KUSHMANDI (SC) 37 DAKSHIN DINAJPUR * 5.0 6 1906 BALURGHAT 6 NaN 19 ... False NaN 19 False 2017 NaN www.bttc.org 2013 2013.0 2014.0
7086 Ellenabad 46 SIRSA 11.0 3 603 SIRSA (SC) 3 NaN 6 ... True 86333.0 6 True 2017 NaN www.gpchopta.ac.in 2002 2002.0 2002.0
7092 BARANAGAR 113 NORTH 24 PARGANAS 11.0 16 1916 DUM DUM 16 NaN 19 ... False NaN 19 False 2017 NaN NaN 2009 2009.0 2009.0
7093 Krishna Nagar 60 NaN NaN 3 703 EAST DELHI 3 NaN 7 ... False NaN 7 False 2017 NaN www.saiinstituteforgirls.com 2004 2004.0 2004.0
7094 JAGATSINGHPUR 104 JAGATSINGHAPUR * 11.0 16 2116 JAGATSINGHPUR (SC) 16 NaN 21 ... False NaN 21 True 2017 NaN NaN 2010 2010.0 2010.0
7095 JAGATSINGHPUR 104 JAGATSINGHAPUR * 11.0 16 2116 JAGATSINGHPUR (SC) 16 NaN 21 ... True 85202.0 21 True 2017 NaN NaN 2006 2006.0 2006.0
7099 Shrawasti 290 SHRAWASTI * 51.0 58 958 SHRAWASTI 58 NaN 9 ... True 78460.0 9 False 2017 NaN NaN 2016 2016.0 2016.0
7100 BHADERWAH 54 DODA 9.0 5 0 UDHAMPUR 5 Pre delimitation 1 ... False NaN 1 False 2017 NaN NaN 2012 2012.0 2013.0
7101 Loni 53 GHAZIABAD 9.0 12 912 GHAZIABAD 12 NaN 9 ... False NaN 9 True 2017 NaN www.rcim.co.in 2008 2008.0 2008.0
7103 Charkop 161 MUMBAI (SUBURBAN) * 22.0 26 2726 MUMBAI NORTH 26 NaN 27 ... False NaN 27 False 2017 NaN www.sirjpgi.com 2008 2008.0 2008.0
7104 Rampur (SC) 66 SHIMLA 11.0 2 202 MANDI 2 NaN 2 ... False NaN 2 True 2017 NaN NaN 2007 2007.0 NaN
7105 DIGAPAHANDI 134 GANJAM 19.0 20 2120 BERHAMPUR 20 NaN 21 ... True 82169.0 21 True 2017 NaN www.ose.org.in 2009 2009.0 2009.0
7106 Nalbari 59 NALBARI 7.0 8 1808 MANGALDOI 8 Pre delimitation 18 ... False NaN 18 True 2017 NaN NaN 2003 2003.0 2003.0
7108 GOPALPUR 132 GANJAM 19.0 20 2120 BERHAMPUR 20 NaN 21 ... False NaN 21 True 2017 NaN www.bsetberhampur.ac.in 2004 2004.0 2004.0
7109 GOPALPUR 132 GANJAM 19.0 20 2120 BERHAMPUR 20 NaN 21 ... True 87527.0 21 True 2017 NaN www.mttsn.org 2012 2012.0 2012.0
7110 RAIRAKHOL 18 SAMBALPUR 3.0 3 2103 SAMBALPUR 3 NaN 21 ... False NaN 21 True 2017 NaN sambalpurnursingcollege.org 2015 2015.0 2015.0
7111 Jakhanian 373 GHAZIPUR 65.0 75 975 GHAZIPUR 75 NaN 9 ... False NaN 9 True 2017 NaN NaN 2010 2010.0 2010.0
7113 Jamwa Ramgarh (ST) 48 JAIPUR 12.0 6 806 JAIPUR RURAL 6 NaN 8 ... False NaN 8 False 2017 NaN jmcpjamwaramgarh@gmail.com 2010 2010.0 2010.0
7114 Kalina 175 MUMBAI (SUBURBAN) * 22.0 29 2729 MUMBAI NORTH-CENTRAL 29 NaN 27 ... False NaN 27 False 2017 NaN www.bsm-uks.edu.in 2012 2012.0 2012.0
7116 BANGRIPOSI (ST) 29 MAYURBHANJ 7.0 5 2105 MAYURBHANJ (ST) 5 NaN 21 ... False NaN 21 True 2017 NaN www.bijubikash.com 2008 2008.0 2008.0
7117 Rudauli 271 BARABANKI 46.0 54 954 FAIZABAD 54 NaN 9 ... True 87931.0 9 True 2017 NaN iims.org.in 2013 2013.0 2013.0
7119 Behror 62 ALWAR 6.0 8 808 ALWAR 8 NaN 8 ... False NaN 8 True 2017 NaN www.bknmvidyapeeth.org 2006 2006.0 2006.0
7120 Ajmer North 100 AJMER 21.0 13 813 AJMER 13 NaN 8 ... False NaN 8 False 2017 NaN NaN 2007 2007.0 2007.0
7123 Basti Sadar 310 BASTI 55.0 61 961 BASTI 61 NaN 9 ... True 87924.0 9 True 2017 NaN NaN 2014 2014.0 2014.0
7124 Nongkrem (ST) 22 EAST KHASI HILLS 6.0 1 1701 SHILLONG (ST) 1 NaN 17 ... False NaN 17 True 2017 0340 www.woodlandinstituteofnursing.in 2007 2007.0 2007.0
7125 Karwan 64 HYDERABAD 5.0 9 2809 HYDERABAD 9 NaN 28 ... False NaN 36 True 2017 NaN - - 2004 2004.0 2009.0
7126 Ludhiana Central 63 LUDHIANA 9.0 7 307 LUDHIANA 7 NaN 3 ... False NaN 3 True 2017 NaN NaN 2010 2010.0 2010.0
7127 Serilingampally 52 RANGAREDDI 6.0 10 2810 CHEVELLA 10 NaN 28 ... True 86936.0 36 True 2017 NaN www.mythrinursingschool.com 2003 2003.0 2005.0

28572 rows × 62 columns

In [ ]: