# Import Librariesa
import pandas as pd
import geopandas as gpd
import lxml
import os
import glob
import time
import datetime
import json
import math
# Set Output Folder
output_folder = os.path.abspath("output")
if not os.path.exists(output_folder):
os.makedirs(output_folder)
# Import data folders
data_folder = os.path.abspath("data")
d2011 = os.path.join(data_folder, 'Data User_2011_12')
d2012 = os.path.join(data_folder, 'Data User_2012_13')
d2013 = os.path.join(data_folder, 'Data User_2013_14')
d2014 = os.path.join(data_folder, 'Data User_2014_15')
d2015 = os.path.join(data_folder, 'Data User_2015_16')
d2016 = os.path.join(data_folder, 'Data User_2016_17')
d2017 = os.path.join(data_folder, 'Data User_2017_18')
folders = [d2011, d2012, d2013, d2014, d2015, d2016, d2017]
download_folder = os.path.join(data_folder, "Downloaded")
geocoded_data_folder = os.path.join(data_folder, "Geocoded")
## Make Noise
print("data found for these themes:")
i = 0
for folder in folders:
print(i,") ",folder)
i+=1
# Read College Data
df_list = []
for folder in folders:
print("--Reading data from folder ", folder)
## Read General data
college = os.path.join(folder, "college.csv")
cdf = pd.read_csv(college)
cdf
## Read Institution data
college_institution = os.path.join(folder, "college_institution.csv")
cidf = pd.read_csv(college_institution)
cidf
## Read Geocoded data
#geocoded_data_folder = os.path.join(folder, "Geocoded")
Colleges_geocoded = os.path.join(geocoded_data_folder, "Colleges_geocoded.shp.csv")
cgdf = pd.read_csv(Colleges_geocoded)
cgdf
## merge data
colleges_merge_df = pd.merge(cgdf, cdf, on=['id'], how='inner')
colleges_merge_df = pd.merge(colleges_merge_df, cidf, on=['id'], how='inner')
## create Datetime Column
colleges_merge_df['year'] = colleges_merge_df['year_of_establishment'].astype(float).fillna(0.0).astype(int)
colleges_merge_df = colleges_merge_df[colleges_merge_df['year'] > 1700]
colleges_merge_df['month'] = 9
colleges_merge_df['day'] = 30
colleges_merge_df['datetime'] = pd.to_datetime(colleges_merge_df[['year', 'month', 'day']])
df_list.append(colleges_merge_df)
df = pd.concat(df_list)
df.drop('remarks', axis=1, inplace=True)
# Output to CSV
df.to_csv(os.path.join(output_folder, "Colleges_Data_Joined.csv"), line_terminator='\n',encoding='utf-8', index=False)
# Read University data
df_list = []
for folder in folders:
print("--Reading data from folder ", folder)
## Read General data
university = os.path.join(folder, "university.csv")
udf = pd.read_csv(university)
udf
## Read Geocoded data
#geocoded_data_folder = os.path.join(data_folder, "Geocoded")
university_geocoded = os.path.join(geocoded_data_folder, "university_geocoded.shp.csv")
ugdf = pd.read_csv(university_geocoded)
ugdf
## merge data
university_merge_df = pd.merge(ugdf, udf, on=['id'], how='inner')
## create Datetime Column
university_merge_df['year'] = university_merge_df['year_of_establishment'].astype(float).fillna(0.0).astype(int)
university_merge_df = university_merge_df[university_merge_df['year'] > 2000]
university_merge_df['month'] = 9
university_merge_df['day'] = 30
university_merge_df['datetime'] = pd.to_datetime(university_merge_df[['year', 'month', 'day']])
df_list.append(university_merge_df)
df = pd.concat(df_list)
df.drop('remarks', axis=1, inplace=True)
# Output to CSV
df.to_csv(os.path.join(output_folder, "Universities_Data_Joined.csv"), line_terminator='\n', encoding='utf-8', index=False)
# Read Standalone Institute data
df_list = []
for folder in folders:
print("--Reading data from folder ", folder)
## Read Institution data
standalone_institution = os.path.join(folder, "standalone_institution.csv")
sidf = pd.read_csv(standalone_institution)
sidf
## Read Geocoded data
standalone_geocoded = os.path.join(geocoded_data_folder, "standalone_geocoded.shp.csv")
sgdf = pd.read_csv(standalone_geocoded)
sgdf
## merge data
standalone_merge_df = pd.merge(sgdf, sidf, on=['id'], how='inner')
## create Datetime Column
standalone_merge_df['year'] = standalone_merge_df['year_of_establishment'].astype(float).fillna(0.0).astype(int)
standalone_merge_df = standalone_merge_df[standalone_merge_df['year'] > 2000]
standalone_merge_df['month'] = 9
standalone_merge_df['day'] = 30
standalone_merge_df['datetime'] = pd.to_datetime(standalone_merge_df[['year', 'month', 'day']])
df_list.append(standalone_merge_df)
df = pd.concat(df_list)
df.drop('remarks', axis=1, inplace=True)
# Output to CSV
df.to_csv(os.path.join(output_folder, "Standalone_Data_Joined.csv"), line_terminator='\n', encoding='utf-8', index=False)
df