#Import Libraries
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException,TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException
import pandas as pd
import lxml
import os
import time
import datetime
import json
# Start timer so total scrape duration can be measured.
start = time.time()

# Timestamped output folder for this run's CSV files.
output_folder = datetime.datetime.now().strftime("Date-%Y-%m-%d-Time-%H-%M-%S")
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Launch Chrome with a generous implicit wait for slow report pages.
driver = webdriver.Chrome()
driver.implicitly_wait(20)  # seconds

# Build the report URL.
protocol = "https://"
url = protocol + "rhreporting.nic.in/netiay/FinancialProgressReport/finProgress_newRpt.aspx"
params = {
    'params': 0
}
# NOTE(review): query-string appending is disabled — the loop body is a
# deliberate no-op kept for reference.
for key, value in params.items():
    url = url #+ "&" + key + "=" + str(value)

# Custom headers (currently unused — plain Selenium does not accept
# per-request headers; kept for a possible requests-based fallback).
my_referer = url
UserAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
mod_headers = {'referer': my_referer, 'User-Agent':UserAgent}

# First page load.
driver.get(url)

# ASP.NET control names / CSS class used throughout the scrape.
year_select_name = 'ctl00$ContentPlaceHolder1$ddl_finYear'
radio_name = 'ctl00$ContentPlaceHolder1$rdlScheme'
excel_button = 'ctl00$ContentPlaceHolder1$btnExport'
content_table_class = 'table_s'
#Build a list of {text, value} dicts from a <select> element's options
def build_list(element_name):
    """Return the options of the <select> named *element_name* as dicts.

    Skips the first option (index 0), which is the placeholder entry.
    Each returned dict has the option's visible ``text`` and its
    ``value`` attribute.

    Reads the module-global ``driver``.
    """
    # find_element_by_* / find_elements_by_* were removed in Selenium 4;
    # use the By-locator API instead (By is already imported above).
    options = driver.find_element(By.NAME, element_name).find_elements(By.XPATH, ".//option")[1:]
    return [{'text': opt.text, 'value': opt.get_attribute('value')} for opt in options]
def check_table():
    """Block (up to 2 s) until the report table (class ``table_s``) is in the DOM."""
    locator = (By.CSS_SELECTOR, '.'+content_table_class)
    WebDriverWait(driver, 2).until(EC.presence_of_element_located(locator))
#Helpers for the Excel-export button: remove it from the DOM / wait for it
def delete_excel():
    """Remove the Excel-export button from the page via injected JavaScript."""
    script = "return document.getElementsByName('" + excel_button + "')[0].remove()"
    driver.execute_script(script)
def check_excel():
    """Wait (up to 2 s) for the Excel-export button to become visible."""
    waiter = WebDriverWait(driver, 2)
    waiter.until(EC.visibility_of_element_located((By.NAME, excel_button)))
# ---- Main scrape ---------------------------------------------------------
# Drill down Year -> Scheme -> State -> District -> Block, collecting one
# DataFrame per block-level report page, then write one CSV per year.
# find_element_by_* calls were removed in Selenium 4 and are replaced with
# the By-locator API throughout.
driver.get(url)

# Wait for the year <select>, then pick the placeholder to force a known state.
element = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.NAME, year_select_name))
)
element.find_element(By.XPATH, ".//option[@value='0']").click()

# NOTE(review): only the first year is scraped ([0:1]); widen or drop the
# slice to cover more years.
yearlist = build_list(year_select_name)[0:1]
print(yearlist)

for year in yearlist:
    print("--Scrapping data for Year: ", year['text'])
    # Reload the report page so the ASP.NET form is in a clean state.
    driver.get(url)
    # Reset the year selection to the placeholder, then pick the target year.
    element = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.NAME, year_select_name))
    )
    element.find_element(By.XPATH, ".//option[@value='0']").click()
    year_query = ".//option[@value='" + year['value'] + "']"
    element = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.NAME, year_select_name))
    )
    element.find_element(By.XPATH, year_query).click()
    # Per-block DataFrames accumulated for this year.
    df_list = []
    # Select the scheme radio button; index -2 matches the "All Central
    # Scheme" label tagged onto the rows below — TODO confirm against the page.
    element = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.NAME, radio_name))
    )
    driver.find_elements(By.NAME, radio_name)[-2].click()
    # State-level table: rows [3:-2] hold the state drill-down links.
    check_table()
    temp_list = driver.find_element(By.CLASS_NAME, content_table_class).find_elements(By.XPATH, ".//tr")[3:-2]
    states = []
    for row in temp_list:
        try:
            td = row.find_elements(By.XPATH, ".//td")[1]
            states.append({
                'name': td.text,
                'link': td.find_element(By.TAG_NAME, "a").get_attribute("href"),
            })
        except Exception as e:
            # Best-effort: skip header/summary rows that carry no link.
            print("Error while parsing table data |", e)
    for state in states:
        print("----Scrapping data for State: ", state['name'])
        driver.get(state['link'])
        check_table()
        # District-level table: rows [4:-1] hold the district links.
        temp_list = driver.find_element(By.CLASS_NAME, content_table_class).find_elements(By.XPATH, ".//tr")[4:-1]
        districts = []
        for row in temp_list:
            td = row.find_elements(By.XPATH, ".//td")[1]
            districts.append({
                'name': td.text,
                'link': td.find_element(By.TAG_NAME, "a").get_attribute("href"),
            })
        for district in districts:
            print("------Scrapping data for District: ", district['name'])
            driver.get(district['link'])
            check_table()
            # Block-level table: rows [4:-1] hold the block links.
            temp_list = driver.find_element(By.CLASS_NAME, content_table_class).find_elements(By.XPATH, ".//tr")[4:-1]
            blocks = []
            for row in temp_list:
                td = row.find_elements(By.XPATH, ".//td")[1]
                blocks.append({
                    'name': td.text,
                    'link': td.find_element(By.TAG_NAME, "a").get_attribute("href"),
                })
            for block in blocks:
                print("--------Scrapping data for Block: ", block['name'])
                driver.get(block['link'])
                check_table()
                # Parse the rendered table: header is row index 3; drop the
                # trailing totals row.
                html_table = driver.find_element(By.CLASS_NAME, content_table_class).get_attribute('outerHTML')
                df = pd.read_html(html_table, header=3)[0]
                df = df[:-1]
                # Tag every row with its position in the hierarchy.
                df['Financial Year'] = year['text']
                df['State Name'] = state['name']
                df['District Name'] = district['name']
                df['Block Name'] = block['name']
                df['Scheme'] = "All Central Scheme"
                df_list.append(df)
    # Merge all block-level frames for this year, drop the serial-number
    # column, and write one CSV into the timestamped output folder.
    df = pd.concat(df_list)
    df.drop(df.columns[0], axis=1, inplace=True)
    df = df.reset_index()
    df.to_csv(os.path.join(output_folder, year['text']+'_Scrapped_Data.csv'), encoding='utf-8', index=False)