In [ ]:
#Import Libraries
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException,TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from selenium.common.exceptions import StaleElementReferenceException
import pandas as pd
import lxml
import os
import time
import datetime
import json
In [ ]:
#Start Timmer
start = time.time()

#Set Output Folder
output_folder = datetime.datetime.now().strftime("Date-%Y-%m-%d-Time-%H-%M-%S")
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
In [ ]:
#Start Chrome
driver = webdriver.Chrome()
driver.implicitly_wait(20) # seconds

#Prepare URL
protocol = "http://"
url = protocol + "indiawater.gov.in/imisreports/Reports/Physical/rpt_RWS_TargetAchievement_S.aspx?"
params = {
    'Rep': 0,
    'RP': 'Y',
    'APP': 'IMIS'
}
for key, value in params.items():
    url = url + "&" + key + "=" + str(value)

#Playing with headers
my_referer = url
UserAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
mod_headers = {'referer': my_referer, 'User-Agent':UserAgent}

#Start Chrome
driver.get(url)
In [ ]:
#Find Category Selector
financial_year_selector = "ctl00$ContentPlaceHolder$ddfinyear"
state_selector = "ctl00$ContentPlaceHolder$ddState"
cat_selector = "ctl00$ContentPlaceHolder$ddcat"
detail_checkbox = "ctl00$ContentPlaceHolder$chk_detail"
show_button = "ctl00$ContentPlaceHolder$btnGO"
last_col_head = "ctl00_ContentPlaceHolder_rpt_ctl00_lblnoofhabwithcov75to100popcov"
district_radio = "ctl00_ContentPlaceHolder_ddDivisionCat_0"
district_selector = "ctl00$ContentPlaceHolder$dddistrict"

financial_year = "2014-2015"
year_query = ".//option[@value='" + financial_year + "']"
reset = ".//option[@value='-1']"
cat_reset = ".//option[@value='Total']"

def del_show():
    driver.execute_script("return document.getElementsByName('" + show_button + "')[0].remove()")
def check_show():
    WebDriverWait( driver, 2 ).until( EC.visibility_of_element_located( ( By.NAME, show_button ) ) )
    
def detail_check():
    try:
        element = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.NAME, detail_checkbox))
        )
        if not element.is_selected():
            #wait start
            del_show()
            element.click()
            #wait end
            check_show()
    except Exception as e:
        print("Error while checking details: ", e)
        pass

def reset_page():
    
    #Get Page
    driver.get(url)
    
    #Get Page
    driver.get(url)
    
    #wait end
    check_show()
        
    #Select Year
    if (driver.find_element_by_name(financial_year_selector).get_attribute("value") == '-1' ):
        #wait
        time.sleep(1)
        del_show()
        element = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.NAME, financial_year_selector))
        )
        element.find_elements_by_xpath(".//option")[-1].click
        #wait end
        check_show()
    #Reset now
    #wait
    time.sleep(1)
    del_show()
    element = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.NAME, financial_year_selector))
    )
    element.find_element_by_xpath(reset).click()
    #wait end
    check_show()
    #Select year
    #wait
    time.sleep(1)
    del_show()
    element = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.NAME, financial_year_selector))
    )
    element.find_element_by_xpath(year_query).click()
    #wait end
    check_show()

    #Reset State
    #wait
    time.sleep(1)
    if not (driver.find_element_by_name(state_selector).get_attribute("value") == '-1' ):
        #wait
        del_show()
        element = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.NAME, state_selector))
        ) 
        element.find_element_by_xpath(reset).click()
        #wait end
        check_show()

    #Reset Category
    #wait
    time.sleep(1)
    if not (driver.find_element_by_name(cat_selector).get_attribute("value") == 'Total' ):
        #wait
        del_show()
        element = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.NAME, cat_selector))
        ) 
        element.find_element_by_xpath(cat_reset).click()
        #wait end
        check_show()

    #Checkbox details
    time.sleep(1)
    detail_check()

    #Cick Show Button
    time.sleep(2)
    element = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.NAME, show_button))
    )
    element.click()

    #Wait for Table to Load fully
    element_present = EC.presence_of_element_located((By.ID, "tableReportTable"))            
    WebDriverWait(driver, 5).until(element_present)
    time.sleep(2)

#Reset Page
reset_page()

#Get all data Categories
table = driver.find_element_by_id('tableReportTable')
head_cats = table.find_elements_by_xpath(".//thead//tr[@class='evenrowcolor']/th[@colspan='3']")[1:]
sub_cat = table.find_elements_by_xpath(".//tr[@class='oddrowcolor']/th")[:3]

#Get habitation Categories #loop over each head_cat
habit_Categories = []
for x in head_cats:
    habit_Categories.append(x.text)
#Remove not needed cats
habit_Categories = habit_Categories[1:]
print ("Total habit_Categories to scrape data for: ",json.dumps(habit_Categories, sort_keys=True, indent=4))

#Get Sub habitation Categories #loop over each sub_cat
sub_Categories = []
for x in sub_cat:
    sub_Categories.append(x.text)
print ("Total sub_Categories to scrape data for: ",json.dumps(sub_Categories, sort_keys=True, indent=4))
        
#Build Financial Year List
financial_years = driver.find_element_by_name(financial_year_selector)
options = financial_years.find_elements_by_xpath(".//option")
#Delete Total
options = options[1:]
financial_years = {}
#loop over each option
for x in options:
    financial_years[x.get_attribute('value')] = x.text
print ("Total financial_year to scrape data for: ",json.dumps(financial_years, sort_keys=True, indent=4))

#Build Category List
temp_list = driver.find_element_by_name(cat_selector).find_elements_by_xpath(".//option")[1:]
categories = []
#loop over each option
for x in temp_list:
    d = {}
    d['text'] = x.text
    d['value'] = x.get_attribute('value')
    categories.append(d)
del temp_list
print ("Total Categories to scrape data for: ",json.dumps(categories, sort_keys=True, indent=4))

#Build State List
temp_list = driver.find_element_by_name(state_selector).find_elements_by_xpath(".//option")[1:]
states = []
#loop over each option
for x in temp_list:
    d = {}
    d['text'] = x.text
    d['value'] = x.get_attribute('value')
    states.append(d)
    #print("State index:",i,d)
del temp_list
print ("Total States to scrape data for: ",json.dumps(states, sort_keys=True, indent=4))

#Build State Dict
temp_list = driver.find_element_by_name(state_selector).find_elements_by_xpath(".//option")[1:]
state_dict = {}
for x in temp_list:
    state_dict[x.text] = x.get_attribute('value')
del temp_list

#Build State District Dict
driver.implicitly_wait(3) # seconds
state_district_dict = {}
for state in states:
    try:
        state_query = ".//option[@value='" + state['value'] + "']"
        
        #Select State
        #wait
        time.sleep(1)
        del_show()
        try:
            driver.execute_script("return document.getElementsByName('" + district_selector + "')[0].remove()")
        except:
            pass
        element = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.NAME, state_selector))
        )
        element.find_element_by_xpath(state_query).click()
        #wait end    
        check_show()
        try:
            WebDriverWait( driver, 3 ).until( EC.visibility_of_element_located( ( By.NAME, district_selector ) ) )
        except:
            #Cick Show Button
            element = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.NAME, show_button))
            )
            element.click()
            element = WebDriverWait(driver, 2).until(
                EC.visibility_of_element_located((By.ID, district_radio))
            )
            
            element = WebDriverWait(driver, 2).until(
                EC.element_to_be_clickable((By.ID, district_radio))
            )
            element.click()
            WebDriverWait( driver, 5 ).until( EC.visibility_of_element_located( ( By.NAME, district_selector ) ) )
        
        #Build District Dict
        temp_list = driver.find_element_by_name(district_selector).find_elements_by_xpath(".//option")[1:]
        district_dict = {}
        for x in temp_list:
            district_dict[x.text] = x.get_attribute('value')
        del temp_list
        
        #Append to State Dict
        state_district_dict[state['text']] = district_dict
        
    except Exception as e:
        print("Error while Building state_district_dict: ", e)
        pass
driver.implicitly_wait(20) # seconds
print (" State District Index: ",json.dumps(state_district_dict, sort_keys=True, indent=4))
In [ ]:
df = None
cat_i = 5 #Increase to skip Category
cat_i_count = len(categories)
while cat_i < cat_i_count:
    row_category_dict = categories[cat_i]
    row_category = row_category_dict['text']
    cat_key = row_category_dict['value']
    cat_query = ".//option[@value='" + cat_key + "']"
    print("\n\n\nAAAAAAAAA\n Scraping Data ->  For category -> ",cat_key,": ",row_category)

    #Reset Page
    reset_page()
    
    #Select Category
    #wait
    time.sleep(1)
    element = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.NAME, cat_selector))
    )
    element.find_element_by_xpath(cat_query).click()

    #Checkbox details
    time.sleep(1)
    detail_check()
    
    #Remove Previous Table if any
    try:
        driver.execute_script('document.getElementById("tableReportTable").remove()')
        time.sleep(.5)
    except:
        pass

    #Cick Show Button
    time.sleep(2)
    element = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.NAME, show_button))
    )
    element.click()

    data_for_states = 0
    try:
        #Wait for Table to Load fully
        element_present = EC.presence_of_element_located((By.ID, "tableReportTable"))            
        WebDriverWait(driver, 5).until(element_present)
        time.sleep(2)
        #Build Data State List
        temp_list = driver.find_element_by_id('tableReportTable').find_elements_by_xpath(".//tbody/tr")[:-1]
        data_state_list = []
        i = 0
        for x in temp_list:
            xt = x.find_elements_by_xpath(".//td")[1].text.upper()
            data_state_list.append(xt)
            print("-Data Available in this category -> for State -> ",i,": ",xt)
            i+=1
        del temp_list
        print("-Data Available in this category for Total States: ",len(data_state_list))
        #All done so we have data
        data_for_states = 1
    except:
        print("-No Data found for: ","category",row_category)
        pass
    #Start scraping data for states
    if data_for_states > 0:
        state_i = 2 #Increase to skip states
        state_i_count = len(data_state_list)
        while state_i < state_i_count:
            row_state = data_state_list[state_i]
            state_query = ".//option[@value='" + state_dict[row_state] + "']"
            print("\n\n\nAAAAAAAAA\n -Scraping Data  ->  For category: ",row_category," -> for State: ",state_i," -> ",row_state)
            
            #Reset Page
            reset_page()

            #Select State
            #wait
            time.sleep(1)
            del_show()
            element = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.NAME, state_selector))
            )
            element.find_element_by_xpath(state_query).click()
            #wait end    
            check_show()

            #Select Category
            #wait
            time.sleep(1)
            element = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.NAME, cat_selector))
            )
            element.find_element_by_xpath(cat_query).click()

            #Checkbox details
            time.sleep(1)
            detail_check()

            #Remove Previous Table if any
            try:
                driver.execute_script('document.getElementById("tableReportTable").remove()')
                time.sleep(.5)
            except:
                pass

            #Cick Show Button
            time.sleep(2)
            element = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.NAME, show_button))
            )
            element.click()
            
            #Check if data loaded at district level
            driver.implicitly_wait(3) # seconds
            try:
                WebDriverWait( driver, 3 ).until( EC.visibility_of_element_located( ( By.NAME, district_selector ) ) )
            except:
                element = WebDriverWait(driver, 2).until(
                    EC.visibility_of_element_located((By.ID, district_radio))
                )

                element = WebDriverWait(driver, 2).until(
                    EC.element_to_be_clickable((By.ID, district_radio))
                )
                element.click()
                WebDriverWait( driver, 5 ).until( EC.visibility_of_element_located( ( By.NAME, district_selector ) ) )
                #Cick Show Button
                time.sleep(2)
                element = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.NAME, show_button))
                )
                element.click()
            driver.implicitly_wait(20) # seconds
            
            data_for_districts = 0
            try:
                #Wait for Table to Load fully
                element_present = EC.presence_of_element_located((By.ID, "tableReportTable"))            
                WebDriverWait(driver, 5).until(element_present)
                time.sleep(2)
                #Build Data State List
                temp_list = driver.find_element_by_id('tableReportTable').find_elements_by_xpath(".//tbody/tr")[:-1]
                data_district_list = []
                for x in temp_list:
                    xt = x.find_elements_by_xpath(".//td")[1].text.upper()
                    data_district_list.append(xt)
                    print("--Data Available in this category -> this state -> for District: ",xt)
                del temp_list
                print("-Data Available in this category -> this state -> for Total District: ",len(data_district_list))
                #All done so we have data
                data_for_districts = 1
            except:
                print("-No Data found -> for: ","category",row_category)
                pass
            
            #Start scraping data for Districts
            if data_for_districts > 0:
                for row_district in data_district_list:
                    dist_match = 0
                    try:
                        district_query = ".//option[@value='" + state_district_dict[row_state][row_district] + "']"
                        print("\n\n\nAAAAAAAAA\n -Scraping Data  ->  For category: ",row_category," -> for State: ",row_state," -> for District: ",row_district)
                        dist_match = 1
                    except:
                        pass
                    if dist_match > 0:
                        def data_table_form():
                            #Reset Page
                            reset_page()

                            #Select State
                            #wait
                            time.sleep(1)
                            del_show()
                            element = WebDriverWait(driver, 10).until(
                                EC.element_to_be_clickable((By.NAME, state_selector))
                            )
                            element.find_element_by_xpath(state_query).click()
                            #wait end    
                            check_show()

                            #Check if data loaded at district level
                            driver.implicitly_wait(3) # seconds
                            try:
                                WebDriverWait( driver, 3 ).until( EC.visibility_of_element_located( ( By.NAME, district_selector ) ) )
                            except:
                                #Cick Show Button
                                element = WebDriverWait(driver, 10).until(
                                    EC.element_to_be_clickable((By.NAME, show_button))
                                )
                                element.click()
                                #Check District Radio button now
                                element = WebDriverWait(driver, 2).until(
                                    EC.visibility_of_element_located((By.ID, district_radio))
                                )
                                element = WebDriverWait(driver, 2).until(
                                    EC.element_to_be_clickable((By.ID, district_radio))
                                )
                                element.click()
                                WebDriverWait( driver, 5 ).until(
                                    EC.visibility_of_element_located( ( By.NAME, district_selector ) ) 
                                )
                                #Cick Show Button
                                time.sleep(2)
                                element = WebDriverWait(driver, 10).until(
                                    EC.element_to_be_clickable((By.NAME, show_button))
                                )
                                element.click()
                            driver.implicitly_wait(20) # seconds

                            #Select District
                            #wait
                            time.sleep(2)
                            del_show()
                            element = WebDriverWait(driver, 10).until(
                                EC.element_to_be_clickable((By.NAME, district_selector))
                            )
                            element.find_element_by_xpath(district_query).click()
                            #wait end    
                            check_show()

                            #Select Category
                            #wait
                            time.sleep(1)
                            element = WebDriverWait(driver, 10).until(
                                EC.element_to_be_clickable((By.NAME, cat_selector))
                            )
                            element.find_element_by_xpath(cat_query).click()

                            #Checkbox details
                            time.sleep(1)
                            detail_check()

                            #Remove Previous Table if any
                            try:
                                driver.execute_script('document.getElementById("tableReportTable").remove()')
                                time.sleep(.5)
                            except:
                                pass

                            #Cick Show Button
                            time.sleep(2)
                            element = WebDriverWait(driver, 10).until(
                                EC.element_to_be_clickable((By.NAME, show_button))
                            )
                            element.click()

                        data_table_form()

                        data_for_scraping = 0
                        try:
                            #Wait for Table to Load fully
                            element_present = EC.presence_of_element_located((By.ID, "tableReportTable"))            
                            WebDriverWait(driver, 5).until(element_present)
                            time.sleep(2)
                            data_rows_count = len(driver.find_element_by_id('tableReportTable').find_elements_by_xpath(".//tbody/tr")[:-1])
                            if len(driver.find_element_by_id('tableReportTable').find_elements_by_xpath(".//tbody/tr")[:-1]) > 0:
                                #All done so we have data
                                data_for_scraping = 1
                        except Exception as e:
                            print("-No Data found for this category -> this state -> this District | Error: ",str(e))
                            pass

                        #Start scraping data
                        if data_for_scraping > 0:
                            #Loop over each row                        
                            row = 0
                            while row < data_rows_count and data_for_scraping > 0:
                                data_1 = 0
                                try:
                                    #Wait for Table to Load fully
                                    columns = driver.find_element_by_id('tableReportTable').find_elements_by_xpath(".//tbody/tr")[row].find_elements_by_xpath(".//td")
                                    data_1 = 1
                                except Exception as e:
                                    print("!Encountered Error retrying | Error: ",str(e))
                                    data_table_form()
                                    try:
                                        #Wait for Table to Load fully
                                        element_present = EC.presence_of_element_located((By.ID, "tableReportTable"))            
                                        WebDriverWait(driver, 5).until(element_present)
                                        columns = driver.find_element_by_id('tableReportTable').find_elements_by_xpath(".//tbody/tr")[row].find_elements_by_xpath(".//td")
                                        data_1 = 1
                                    except Exception as e:
                                        data_1 = 0
                                        pass
                                if data_1 > 0:
                                    row_block = columns[1].text.strip()
                                    print("----Scraping Data  ->  For category: ",row_category," -> for State: ",row_state," -> for District: ",row_district," -> for Block: ",row_block)
                                    #Loop over all Data columns
                                    column = len(columns) - 15 #5
                                    for h_cat in habit_Categories:
                                        for s_cat in sub_Categories:
                                            try:
                                                cur_col = driver.find_element_by_id('tableReportTable').find_elements_by_xpath(".//tbody/tr")[row].find_elements_by_xpath(".//td")[column]
                                                data_2 = 1
                                            except Exception as e:
                                                print("!Encountered Error Reading Ccolumn | Error: ",str(e))
                                                data_table_form()
                                                try:
                                                    #Wait for Table to Load fully
                                                    element_present = EC.presence_of_element_located((By.ID, "tableReportTable"))            
                                                    WebDriverWait(driver, 5).until(element_present)
                                                    data_2 = 1
                                                except Exception as e:
                                                    data_2 = 0
                                                    print("!Encountered Error while Reading this column ",str(e))
                                                    pass
                                            #Check if Link is clickable otherwise nothing
                                            if data_2 > 0 and ( not (cur_col.text == '0') ) :
                                                #Try to click on link to data
                                                try:
                                                    cur_col.find_element_by_tag_name('a').click()
                                                    #Wait for Table to Load fully
                                                    element_present = EC.presence_of_element_located((By.CLASS_NAME, "lnkPages"))
                                                    WebDriverWait(driver, 30).until(element_present)                        
                                                    data_pages = len(driver.find_elements_by_class_name("lnkPages"))
                                                    #print("-----Total Data Pages: ",data_pages)
                                                    #Testing only 
                                                    #data_pages = 1

                                                    i = 0
                                                    while i < data_pages:        
                                                        driver.execute_script('document.getElementById("tableReportTable").remove()')
                                                        driver.find_elements_by_class_name("lnkPages")[i].click()

                                                        #Wait for Table to Load fully
                                                        element_present = EC.presence_of_element_located((By.ID, "tableReportTable"))
                                                        WebDriverWait(driver, 40).until(element_present)

                                                        #Once Loaded Create DataFrame from Table
                                                        html_table = driver.find_element_by_id('tableReportTable').get_attribute('outerHTML')
                                                        df1 = pd.read_html(html_table, skiprows=0)[0]

                                                        #Put Additional Information
                                                        df1['Financial Year'] = financial_year                                                
                                                        df1['Category'] = row_category
                                                        df1['State'] = row_state
                                                        df1['District'] = row_district
                                                        df1['Block'] = row_block
                                                        df1['Habit Category'] = h_cat                            
                                                        df1['Sub Category'] = s_cat

                                                        #Check if master dataframe is empty then copy otherwise concat new Dataframe
                                                        if df is not None:            
                                                            df = pd.concat([df,df1])
                                                        else:
                                                            df = df1.copy()

                                                        #Next Data Page
                                                        i+=1

                                                    #Go Back to previous page for next column
                                                    driver.execute_script("window.history.go(-1)")

                                                #If Encounters error
                                                except Exception as e:
                                                    print("!Encountered Error Going back | Error: ",str(e))
                                                    data_table_form()
                                                    try:
                                                        #Wait for Table to Load fully
                                                        element_present = EC.presence_of_element_located((By.ID, "tableReportTable"))            
                                                        WebDriverWait(driver, 5).until(element_present)
                                                    except Exception as e:
                                                        data_for_scraping = 0
                                                        print("!Encountered Error while recovering from error Skipping this district ",str(e))
                                                        pass

                                            #Next Column it is loop in subcategory
                                            column+=1
                                        #Loop completed for Habitat category
                                #Loop completed for whole Block
                                row+=1
            #state loop Closes
            state_i+=1
    #Cat Loop Closes
    cat_i+=1
In [ ]:
df
In [ ]:
#Save Scrapped File
df.to_csv(os.path.join(output_folder, financial_year+'_Scrapped_Data.csv'), encoding='utf-8', index=False)
In [ ]:
#Read Edited File
df = pd.read_csv(os.path.join(output_folder, financial_year+'_Scrapped_Data.csv'), encoding='utf-8')
In [ ]:
#Cleaner

#Reset Index
df = df.reset_index(drop=True)
df = df.drop(columns=['Sl.No.'])

#Get List of Columns to Group
columns = list(df.columns)
columns.remove('Category')

#Create Binary Variables
cat_i = 0 #Increase to skip states
cat_i_count = len(categories)
while cat_i < cat_i_count:
    row_category_dict = categories[cat_i]
    row_category = row_category_dict['text']
    cat_key = row_category_dict['value']
    df[row_category] = 0
    df.loc[df['Category'] == row_category, row_category] = 1
    cat_i+=1
    
#Group the dataframe and sum the binary variables
df = df.fillna(-999).groupby(columns).sum().reset_index()

#Save to file
df.to_csv(os.path.join(output_folder, financial_year+'_Scrapped_Cleaned_Data.csv'), encoding='utf-8', index=False)
In [ ]: