In [ ]:
#Import Libraries
import pandas as pd
import numpy as np
import lxml
import os
In [ ]:
#Folders
scraped_folder = 'Scrapped'
output_folder = 'Output'
year = "2016-2017"
year_folder = os.path.join(scraped_folder, year)
cleaned_folder = os.path.join(output_folder, year)
# exist_ok=True avoids the check-then-create race and covers both folders
os.makedirs(output_folder, exist_ok=True)
os.makedirs(cleaned_folder, exist_ok=True)

#Get list of scraped files
scraped_files = os.listdir(year_folder)

#Read every scraped CSV, then concatenate ONCE.
#(Calling pd.concat inside the loop copies the accumulated frame each
# iteration — quadratic time; building a list first is linear.)
frames = [pd.read_csv(os.path.join(year_folder, sfile)) for sfile in scraped_files]
if not frames:
    # Fail early with a clear message instead of a cryptic concat/indexing error
    raise FileNotFoundError('No scraped files found in ' + year_folder)
# ignore_index=True replaces the separate reset_index(drop=True) step
df = pd.concat(frames, ignore_index=True)

#Save to file, named after the financial year of the first row
# (all files in this folder are assumed to share one financial year)
financial_year = df['Financial Year'].values[0]
df.to_csv(os.path.join(cleaned_folder, financial_year + '_Scrapped_Data.csv'),
          encoding='utf-8', index=False)
In [ ]:
# Inspect the combined dataframe (bare expression uses the rich HTML display)
df
In [ ]:
#Cleaning
## Keep only rows where helper column '0' is null, then drop bookkeeping columns
df = df[df['0'].isnull()]
df = df.drop(columns=['0', 'Sl.No.'])

## Every column except 'Category' identifies a unique record
group_cols = [col for col in df.columns if col != 'Category']

## One 0/1 indicator column per distinct category,
## added in order of first appearance so the output column order is stable
category_values = list(df['Category'].unique())
for cat in category_values:
    df[cat] = (df['Category'] == cat).astype(int)

## Collapse duplicate records, summing the indicators.
## NaNs are filled with a sentinel first so groupby does not drop those rows.
df = df.fillna(-999).groupby(group_cols).sum().reset_index()

## A record can match the same category more than once — clamp indicators to 1
for cat in category_values:
    df.loc[df[cat] > 0, cat] = 1

## Save to file
financial_year = df['Financial Year'].values[0]
df.to_csv(os.path.join(cleaned_folder, financial_year+'_Scrapped_Cleaned_Data.csv'), encoding='utf-8', index=False)
In [ ]:
# Inspect the cleaned, one-hot-encoded dataframe
df
In [ ]: