#Import Libraries
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException,TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from selenium.common.exceptions import StaleElementReferenceException
import pandas as pd
import lxml
import os
import time
import datetime
import json
#Start Timmer
start = time.time()
#Set Output Folder
output_folder = datetime.datetime.now().strftime("Date-%Y-%m-%d-Time-%H-%M-%S")
if not os.path.exists(output_folder):
os.makedirs(output_folder)
#Start Chrome
driver = webdriver.Chrome()
driver.implicitly_wait(20) # seconds
#Prepare URL
protocol = "http://"
url = protocol + "indiawater.gov.in/imisreports/Reports/Physical/rpt_RWS_TargetAchievement_S.aspx?"
params = {
'Rep': 0,
'RP': 'Y',
'APP': 'IMIS'
}
for key, value in params.items():
url = url + "&" + key + "=" + str(value)
#Playing with headers
my_referer = url
UserAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
mod_headers = {'referer': my_referer, 'User-Agent':UserAgent}
#Start Chrome
driver.get(url)
#Find Category Selector
financial_year_selector = "ctl00$ContentPlaceHolder$ddfinyear"
state_selector = "ctl00$ContentPlaceHolder$ddState"
cat_selector = "ctl00$ContentPlaceHolder$ddcat"
detail_checkbox = "ctl00$ContentPlaceHolder$chk_detail"
show_button = "ctl00$ContentPlaceHolder$btnGO"
last_col_head = "ctl00_ContentPlaceHolder_rpt_ctl00_lblnoofhabwithcov75to100popcov"
district_radio = "ctl00_ContentPlaceHolder_ddDivisionCat_0"
district_selector = "ctl00$ContentPlaceHolder$dddistrict"
financial_year = "2014-2015"
year_query = ".//option[@value='" + financial_year + "']"
reset = ".//option[@value='-1']"
cat_reset = ".//option[@value='Total']"
def del_show():
driver.execute_script("return document.getElementsByName('" + show_button + "')[0].remove()")
def check_show():
WebDriverWait( driver, 2 ).until( EC.visibility_of_element_located( ( By.NAME, show_button ) ) )
def detail_check():
try:
element = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.NAME, detail_checkbox))
)
if not element.is_selected():
#wait start
del_show()
element.click()
#wait end
check_show()
except Exception as e:
print("Error while checking details: ", e)
pass
def reset_page():
#Get Page
driver.get(url)
#Get Page
driver.get(url)
#wait end
check_show()
#Select Year
if (driver.find_element_by_name(financial_year_selector).get_attribute("value") == '-1' ):
#wait
time.sleep(1)
del_show()
element = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.NAME, financial_year_selector))
)
element.find_elements_by_xpath(".//option")[-1].click
#wait end
check_show()
#Reset now
#wait
time.sleep(1)
del_show()
element = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.NAME, financial_year_selector))
)
element.find_element_by_xpath(reset).click()
#wait end
check_show()
#Select year
#wait
time.sleep(1)
del_show()
element = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.NAME, financial_year_selector))
)
element.find_element_by_xpath(year_query).click()
#wait end
check_show()
#Reset State
#wait
time.sleep(1)
if not (driver.find_element_by_name(state_selector).get_attribute("value") == '-1' ):
#wait
del_show()
element = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.NAME, state_selector))
)
element.find_element_by_xpath(reset).click()
#wait end
check_show()
#Reset Category
#wait
time.sleep(1)
if not (driver.find_element_by_name(cat_selector).get_attribute("value") == 'Total' ):
#wait
del_show()
element = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.NAME, cat_selector))
)
element.find_element_by_xpath(cat_reset).click()
#wait end
check_show()
#Checkbox details
time.sleep(1)
detail_check()
#Cick Show Button
time.sleep(2)
element = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.NAME, show_button))
)
element.click()
#Wait for Table to Load fully
element_present = EC.presence_of_element_located((By.ID, "tableReportTable"))
WebDriverWait(driver, 5).until(element_present)
time.sleep(2)
#Reset Page
reset_page()
#Get all data Categories
table = driver.find_element_by_id('tableReportTable')
head_cats = table.find_elements_by_xpath(".//thead//tr[@class='evenrowcolor']/th[@colspan='3']")[1:]
sub_cat = table.find_elements_by_xpath(".//tr[@class='oddrowcolor']/th")[:3]
#Get habitation Categories #loop over each head_cat
habit_Categories = []
for x in head_cats:
habit_Categories.append(x.text)
#Remove not needed cats
habit_Categories = habit_Categories[1:]
print ("Total habit_Categories to scrape data for: ",json.dumps(habit_Categories, sort_keys=True, indent=4))
#Get Sub habitation Categories #loop over each sub_cat
sub_Categories = []
for x in sub_cat:
sub_Categories.append(x.text)
print ("Total sub_Categories to scrape data for: ",json.dumps(sub_Categories, sort_keys=True, indent=4))
#Build Financial Year List
financial_years = driver.find_element_by_name(financial_year_selector)
options = financial_years.find_elements_by_xpath(".//option")
#Delete Total
options = options[1:]
financial_years = {}
#loop over each option
for x in options:
financial_years[x.get_attribute('value')] = x.text
print ("Total financial_year to scrape data for: ",json.dumps(financial_years, sort_keys=True, indent=4))
#Build Category List
temp_list = driver.find_element_by_name(cat_selector).find_elements_by_xpath(".//option")[1:]
categories = []
#loop over each option
for x in temp_list:
d = {}
d['text'] = x.text
d['value'] = x.get_attribute('value')
categories.append(d)
del temp_list
print ("Total Categories to scrape data for: ",json.dumps(categories, sort_keys=True, indent=4))
#Build State List
temp_list = driver.find_element_by_name(state_selector).find_elements_by_xpath(".//option")[1:]
states = []
#loop over each option
for x in temp_list:
d = {}
d['text'] = x.text
d['value'] = x.get_attribute('value')
states.append(d)
#print("State index:",i,d)
del temp_list
print ("Total States to scrape data for: ",json.dumps(states, sort_keys=True, indent=4))
#Build State Dict
temp_list = driver.find_element_by_name(state_selector).find_elements_by_xpath(".//option")[1:]
state_dict = {}
for x in temp_list:
state_dict[x.text] = x.get_attribute('value')
del temp_list
#Build State District Dict
driver.implicitly_wait(3) # seconds
state_district_dict = {}
for state in states:
try:
state_query = ".//option[@value='" + state['value'] + "']"
#Select State
#wait
time.sleep(1)
del_show()
try:
driver.execute_script("return document.getElementsByName('" + district_selector + "')[0].remove()")
except:
pass
element = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.NAME, state_selector))
)
element.find_element_by_xpath(state_query).click()
#wait end
check_show()
try:
WebDriverWait( driver, 3 ).until( EC.visibility_of_element_located( ( By.NAME, district_selector ) ) )
except:
#Cick Show Button
element = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.NAME, show_button))
)
element.click()
element = WebDriverWait(driver, 2).until(
EC.visibility_of_element_located((By.ID, district_radio))
)
element = WebDriverWait(driver, 2).until(
EC.element_to_be_clickable((By.ID, district_radio))
)
element.click()
WebDriverWait( driver, 5 ).until( EC.visibility_of_element_located( ( By.NAME, district_selector ) ) )
#Build District Dict
temp_list = driver.find_element_by_name(district_selector).find_elements_by_xpath(".//option")[1:]
district_dict = {}
for x in temp_list:
district_dict[x.text] = x.get_attribute('value')
del temp_list
#Append to State Dict
state_district_dict[state['text']] = district_dict
except Exception as e:
print("Error while Building state_district_dict: ", e)
pass
driver.implicitly_wait(20) # seconds
print (" State District Index: ",json.dumps(state_district_dict, sort_keys=True, indent=4))
df = None
cat_i = 5 #Increase to skip Category
cat_i_count = len(categories)
while cat_i < cat_i_count:
row_category_dict = categories[cat_i]
row_category = row_category_dict['text']
cat_key = row_category_dict['value']
cat_query = ".//option[@value='" + cat_key + "']"
print("\n\n\nAAAAAAAAA\n Scraping Data -> For category -> ",cat_key,": ",row_category)
#Reset Page
reset_page()
#Select Category
#wait
time.sleep(1)
element = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.NAME, cat_selector))
)
element.find_element_by_xpath(cat_query).click()
#Checkbox details
time.sleep(1)
detail_check()
#Remove Previous Table if any
try:
driver.execute_script('document.getElementById("tableReportTable").remove()')
time.sleep(.5)
except:
pass
#Cick Show Button
time.sleep(2)
element = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.NAME, show_button))
)
element.click()
data_for_states = 0
try:
#Wait for Table to Load fully
element_present = EC.presence_of_element_located((By.ID, "tableReportTable"))
WebDriverWait(driver, 5).until(element_present)
time.sleep(2)
#Build Data State List
temp_list = driver.find_element_by_id('tableReportTable').find_elements_by_xpath(".//tbody/tr")[:-1]
data_state_list = []
i = 0
for x in temp_list:
xt = x.find_elements_by_xpath(".//td")[1].text.upper()
data_state_list.append(xt)
print("-Data Available in this category -> for State -> ",i,": ",xt)
i+=1
del temp_list
print("-Data Available in this category for Total States: ",len(data_state_list))
#All done so we have data
data_for_states = 1
except:
print("-No Data found for: ","category",row_category)
pass
#Start scraping data for states
if data_for_states > 0:
state_i = 2 #Increase to skip states
state_i_count = len(data_state_list)
while state_i < state_i_count:
row_state = data_state_list[state_i]
state_query = ".//option[@value='" + state_dict[row_state] + "']"
print("\n\n\nAAAAAAAAA\n -Scraping Data -> For category: ",row_category," -> for State: ",state_i," -> ",row_state)
#Reset Page
reset_page()
#Select State
#wait
time.sleep(1)
del_show()
element = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.NAME, state_selector))
)
element.find_element_by_xpath(state_query).click()
#wait end
check_show()
#Select Category
#wait
time.sleep(1)
element = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.NAME, cat_selector))
)
element.find_element_by_xpath(cat_query).click()
#Checkbox details
time.sleep(1)
detail_check()
#Remove Previous Table if any
try:
driver.execute_script('document.getElementById("tableReportTable").remove()')
time.sleep(.5)
except:
pass
#Cick Show Button
time.sleep(2)
element = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.NAME, show_button))
)
element.click()
#Check if data loaded at district level
driver.implicitly_wait(3) # seconds
try:
WebDriverWait( driver, 3 ).until( EC.visibility_of_element_located( ( By.NAME, district_selector ) ) )
except:
element = WebDriverWait(driver, 2).until(
EC.visibility_of_element_located((By.ID, district_radio))
)
element = WebDriverWait(driver, 2).until(
EC.element_to_be_clickable((By.ID, district_radio))
)
element.click()
WebDriverWait( driver, 5 ).until( EC.visibility_of_element_located( ( By.NAME, district_selector ) ) )
#Cick Show Button
time.sleep(2)
element = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.NAME, show_button))
)
element.click()
driver.implicitly_wait(20) # seconds
data_for_districts = 0
try:
#Wait for Table to Load fully
element_present = EC.presence_of_element_located((By.ID, "tableReportTable"))
WebDriverWait(driver, 5).until(element_present)
time.sleep(2)
#Build Data State List
temp_list = driver.find_element_by_id('tableReportTable').find_elements_by_xpath(".//tbody/tr")[:-1]
data_district_list = []
for x in temp_list:
xt = x.find_elements_by_xpath(".//td")[1].text.upper()
data_district_list.append(xt)
print("--Data Available in this category -> this state -> for District: ",xt)
del temp_list
print("-Data Available in this category -> this state -> for Total District: ",len(data_district_list))
#All done so we have data
data_for_districts = 1
except:
print("-No Data found -> for: ","category",row_category)
pass
#Start scraping data for Districts
if data_for_districts > 0:
for row_district in data_district_list:
dist_match = 0
try:
district_query = ".//option[@value='" + state_district_dict[row_state][row_district] + "']"
print("\n\n\nAAAAAAAAA\n -Scraping Data -> For category: ",row_category," -> for State: ",row_state," -> for District: ",row_district)
dist_match = 1
except:
pass
if dist_match > 0:
def data_table_form():
#Reset Page
reset_page()
#Select State
#wait
time.sleep(1)
del_show()
element = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.NAME, state_selector))
)
element.find_element_by_xpath(state_query).click()
#wait end
check_show()
#Check if data loaded at district level
driver.implicitly_wait(3) # seconds
try:
WebDriverWait( driver, 3 ).until( EC.visibility_of_element_located( ( By.NAME, district_selector ) ) )
except:
#Cick Show Button
element = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.NAME, show_button))
)
element.click()
#Check District Radio button now
element = WebDriverWait(driver, 2).until(
EC.visibility_of_element_located((By.ID, district_radio))
)
element = WebDriverWait(driver, 2).until(
EC.element_to_be_clickable((By.ID, district_radio))
)
element.click()
WebDriverWait( driver, 5 ).until(
EC.visibility_of_element_located( ( By.NAME, district_selector ) )
)
#Cick Show Button
time.sleep(2)
element = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.NAME, show_button))
)
element.click()
driver.implicitly_wait(20) # seconds
#Select District
#wait
time.sleep(2)
del_show()
element = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.NAME, district_selector))
)
element.find_element_by_xpath(district_query).click()
#wait end
check_show()
#Select Category
#wait
time.sleep(1)
element = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.NAME, cat_selector))
)
element.find_element_by_xpath(cat_query).click()
#Checkbox details
time.sleep(1)
detail_check()
#Remove Previous Table if any
try:
driver.execute_script('document.getElementById("tableReportTable").remove()')
time.sleep(.5)
except:
pass
#Cick Show Button
time.sleep(2)
element = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.NAME, show_button))
)
element.click()
data_table_form()
data_for_scraping = 0
try:
#Wait for Table to Load fully
element_present = EC.presence_of_element_located((By.ID, "tableReportTable"))
WebDriverWait(driver, 5).until(element_present)
time.sleep(2)
data_rows_count = len(driver.find_element_by_id('tableReportTable').find_elements_by_xpath(".//tbody/tr")[:-1])
if len(driver.find_element_by_id('tableReportTable').find_elements_by_xpath(".//tbody/tr")[:-1]) > 0:
#All done so we have data
data_for_scraping = 1
except Exception as e:
print("-No Data found for this category -> this state -> this District | Error: ",str(e))
pass
#Start scraping data
if data_for_scraping > 0:
#Loop over each row
row = 0
while row < data_rows_count and data_for_scraping > 0:
data_1 = 0
try:
#Wait for Table to Load fully
columns = driver.find_element_by_id('tableReportTable').find_elements_by_xpath(".//tbody/tr")[row].find_elements_by_xpath(".//td")
data_1 = 1
except Exception as e:
print("!Encountered Error retrying | Error: ",str(e))
data_table_form()
try:
#Wait for Table to Load fully
element_present = EC.presence_of_element_located((By.ID, "tableReportTable"))
WebDriverWait(driver, 5).until(element_present)
columns = driver.find_element_by_id('tableReportTable').find_elements_by_xpath(".//tbody/tr")[row].find_elements_by_xpath(".//td")
data_1 = 1
except Exception as e:
data_1 = 0
pass
if data_1 > 0:
row_block = columns[1].text.strip()
print("----Scraping Data -> For category: ",row_category," -> for State: ",row_state," -> for District: ",row_district," -> for Block: ",row_block)
#Loop over all Data columns
column = len(columns) - 15 #5
for h_cat in habit_Categories:
for s_cat in sub_Categories:
try:
cur_col = driver.find_element_by_id('tableReportTable').find_elements_by_xpath(".//tbody/tr")[row].find_elements_by_xpath(".//td")[column]
data_2 = 1
except Exception as e:
print("!Encountered Error Reading Ccolumn | Error: ",str(e))
data_table_form()
try:
#Wait for Table to Load fully
element_present = EC.presence_of_element_located((By.ID, "tableReportTable"))
WebDriverWait(driver, 5).until(element_present)
data_2 = 1
except Exception as e:
data_2 = 0
print("!Encountered Error while Reading this column ",str(e))
pass
#Check if Link is clickable otherwise nothing
if data_2 > 0 and ( not (cur_col.text == '0') ) :
#Try to click on link to data
try:
cur_col.find_element_by_tag_name('a').click()
#Wait for Table to Load fully
element_present = EC.presence_of_element_located((By.CLASS_NAME, "lnkPages"))
WebDriverWait(driver, 30).until(element_present)
data_pages = len(driver.find_elements_by_class_name("lnkPages"))
#print("-----Total Data Pages: ",data_pages)
#Testing only
#data_pages = 1
i = 0
while i < data_pages:
driver.execute_script('document.getElementById("tableReportTable").remove()')
driver.find_elements_by_class_name("lnkPages")[i].click()
#Wait for Table to Load fully
element_present = EC.presence_of_element_located((By.ID, "tableReportTable"))
WebDriverWait(driver, 40).until(element_present)
#Once Loaded Create DataFrame from Table
html_table = driver.find_element_by_id('tableReportTable').get_attribute('outerHTML')
df1 = pd.read_html(html_table, skiprows=0)[0]
#Put Additional Information
df1['Financial Year'] = financial_year
df1['Category'] = row_category
df1['State'] = row_state
df1['District'] = row_district
df1['Block'] = row_block
df1['Habit Category'] = h_cat
df1['Sub Category'] = s_cat
#Check if master dataframe is empty then copy otherwise concat new Dataframe
if df is not None:
df = pd.concat([df,df1])
else:
df = df1.copy()
#Next Data Page
i+=1
#Go Back to previous page for next column
driver.execute_script("window.history.go(-1)")
#If Encounters error
except Exception as e:
print("!Encountered Error Going back | Error: ",str(e))
data_table_form()
try:
#Wait for Table to Load fully
element_present = EC.presence_of_element_located((By.ID, "tableReportTable"))
WebDriverWait(driver, 5).until(element_present)
except Exception as e:
data_for_scraping = 0
print("!Encountered Error while recovering from error Skipping this district ",str(e))
pass
#Next Column it is loop in subcategory
column+=1
#Loop completed for Habitat category
#Loop completed for whole Block
row+=1
#state loop Closes
state_i+=1
#Cat Loop Closes
cat_i+=1
df
#Save Scrapped File
df.to_csv(os.path.join(output_folder, financial_year+'_Scrapped_Data.csv'), encoding='utf-8', index=False)
#Read Edited File
df = pd.read_csv(os.path.join(output_folder, financial_year+'_Scrapped_Data.csv'), encoding='utf-8')
#Cleaner
#Reset Index
df = df.reset_index(drop=True)
df = df.drop(columns=['Sl.No.'])
#Get List of Columns to Group
columns = list(df.columns)
columns.remove('Category')
#Create Binary Variables
cat_i = 0 #Increase to skip states
cat_i_count = len(categories)
while cat_i < cat_i_count:
row_category_dict = categories[cat_i]
row_category = row_category_dict['text']
cat_key = row_category_dict['value']
df[row_category] = 0
df.loc[df['Category'] == row_category, row_category] = 1
cat_i+=1
#Group the dataframe and sum the binary variables
df = df.fillna(-999).groupby(columns).sum().reset_index()
#Save to file
df.to_csv(os.path.join(output_folder, financial_year+'_Scrapped_Cleaned_Data.csv'), encoding='utf-8', index=False)