#Import Libraries
import pandas as pd
import numpy as np
import lxml
import os
# --- Folder layout -------------------------------------------------------
# Raw scraped CSVs live under Scrapped/<year>/; cleaned output is written
# to Output/<year>/.  (Folder name "Scrapped" [sic] must match the scraper.)
scraped_folder = 'Scrapped'
output_folder = 'Output'

year = "2016-2017"
year_folder = os.path.join(scraped_folder, year)
cleaned_folder = os.path.join(output_folder, year)

# exist_ok=True avoids the check-then-create race of os.path.exists() and
# creates the parent Output/ folder and the year subfolder in one call.
os.makedirs(cleaned_folder, exist_ok=True)

# Get list of scraped files for the selected year
scraped_files = os.listdir(year_folder)
# --- Load every scraped CSV for the year into one DataFrame --------------
# Collect the per-file frames in a list and concatenate once at the end:
# calling pd.concat inside the loop copies the accumulated data on every
# iteration (O(n^2) in total rows).
frames = []
for sfile in scraped_files:
    sfile_path = os.path.join(year_folder, sfile)
    frames.append(pd.read_csv(sfile_path))

# ignore_index=True renumbers rows 0..N-1, replacing the separate
# reset_index(drop=True) step.
df = pd.concat(frames, ignore_index=True)

# Save the merged raw data.  The file is named after the financial year
# taken from the first row — assumes every file in the year folder carries
# the same 'Financial Year' value (TODO confirm against the scraper).
financial_year = df['Financial Year'].values[0]
df.to_csv(os.path.join(cleaned_folder, financial_year + '_Scrapped_Data.csv'),
          encoding='utf-8', index=False)
df
# --- Cleaning ------------------------------------------------------------
## Remove unwanted data and columns
# Rows where helper column '0' is non-null are scraping artifacts; keep
# only the rest, then drop the helper columns that carry no real data.
df = df[df['0'].isnull()]
df = df.drop(columns=['0', 'Sl.No.'])

## Get list of columns to group by
# Every column except 'Category' identifies an entity; the Category values
# get pivoted into one binary indicator column each.
columns = list(df.columns)
columns.remove('Category')

## Create binary indicator variables (vectorized: boolean mask -> 0/1,
## replacing the assign-zero-then-loc-update two-step)
Categories = list(df['Category'].unique())
for Category in Categories:
    df[Category] = (df['Category'] == Category).astype(int)

## Group the dataframe and sum the binary variables
# NaNs in the grouping columns would silently drop rows from groupby, so
# fill with a sentinel first.
# NOTE(review): the -999 sentinel remains in the saved output — confirm
# downstream consumers expect that.
df = df.fillna(-999).groupby(columns).sum().reset_index()

## Convert counts greater than one back to one
# An entity may appear under the same category several times; clamp the
# summed indicators to 0/1 in a single vectorized call.
df[Categories] = df[Categories].clip(upper=1)

## Save to file
financial_year = df['Financial Year'].values[0]
df.to_csv(os.path.join(cleaned_folder,
                       financial_year + '_Scrapped_Cleaned_Data.csv'),
          encoding='utf-8', index=False)
df