In [10]:
# Import libraries
from selenium import webdriver
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    StaleElementReferenceException,
)
import pandas as pd
import lxml  # HTML parser backend for pandas.read_html
import os
import time
import datetime
import json
In [11]:
# Start timer
start = time.time()

# Set output folder, named with the current timestamp
output_folder = datetime.datetime.now().strftime("Date-%Y-%m-%d-Time-%H-%M-%S")
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
In [12]:
# Launch Chrome with a generous implicit wait
driver = webdriver.Chrome()
driver.implicitly_wait(20)  # seconds

# Prepare URL
protocol = "https://"
url = protocol + "rhreporting.nic.in/netiay/FinancialProgressReport/finProgress_newRpt.aspx"
params = {
    'params': 0
}
for key, value in params.items():
    url = url  # + "&" + key + "=" + str(value)  # query-string parameters unused for this report

# Custom headers (note: plain Selenium cannot attach these to its requests; kept for reference)
my_referer = url
UserAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
mod_headers = {'referer': my_referer, 'User-Agent': UserAgent}

# Open the report page
driver.get(url)
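A minimal sketch, as an aside: `mod_headers` above is built but never consumed, since plain Selenium cannot attach custom request headers. The user agent alone could be applied through ChromeOptions (the referer would need a proxying layer such as selenium-wire, not shown):
In [ ]:
# Sketch only: apply the custom user agent via ChromeOptions
options = webdriver.ChromeOptions()
options.add_argument("--user-agent=" + UserAgent)
driver = webdriver.Chrome(options=options)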
In [13]:
# ASP.NET control names and the CSS class of the report table
year_select_name = 'ctl00$ContentPlaceHolder1$ddl_finYear'
radio_name = 'ctl00$ContentPlaceHolder1$rdlScheme'
excel_button = 'ctl00$ContentPlaceHolder1$btnExport'
content_table_class = 'table_s'
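As a side note, the `Select` helper imported above is never used; the cells below click `<option>` elements through XPath instead. An equivalent sketch with `Select` (same dropdown, `'0'` being the placeholder value clicked later) would be:
In [ ]:
# Sketch only: pick a dropdown option by value with the Select helper
# instead of clicking the <option> node via XPath.
year_dropdown = Select(driver.find_element(By.NAME, year_select_name))
year_dropdown.select_by_value('0')  # reset to the placeholder option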
In [14]:
# Build a list of {text, value} dicts from a <select> element's options,
# skipping the placeholder first option
def build_list(element_name):
    temp_list = driver.find_element(By.NAME, element_name).find_elements(By.XPATH, ".//option")[1:]
    _list = []
    # Loop over each option
    for x in temp_list:
        d = {}
        d['text'] = x.text
        d['value'] = x.get_attribute('value')
        _list.append(d)
    return _list

def check_table():
    # Wait for the report table to finish loading
    element_present = EC.presence_of_element_located((By.CSS_SELECTOR, '.' + content_table_class))
    WebDriverWait(driver, 2).until(element_present)

# Helpers for the Excel-export button (defined for optional use; not called below)
def delete_excel():
    driver.execute_script("return document.getElementsByName('" + excel_button + "')[0].remove()")
def check_excel():
    WebDriverWait(driver, 2).until(EC.visibility_of_element_located((By.NAME, excel_button)))
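`StaleElementReferenceException` is imported above but never handled; on ASP.NET postbacks the table can re-render mid-read. A hypothetical retry wrapper (the names here are illustrative, not part of the original run) could look like:
In [ ]:
# Hypothetical helper: retry a lookup when the DOM re-renders mid-read.
def with_retry(fn, attempts=3, delay=1):
    for attempt in range(attempts):
        try:
            return fn()
        except (StaleElementReferenceException, TimeoutException):
            if attempt == attempts - 1:
                raise
            time.sleep(delay)

# Example: table = with_retry(lambda: driver.find_element(By.CLASS_NAME, content_table_class))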
In [ ]:
# Open the report page
driver.get(url)
element = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.NAME, year_select_name))
)
element.find_element(By.XPATH, ".//option[@value='0']").click()
yearlist = build_list(year_select_name)[0:1]  # limit this run to the first financial year
print(yearlist)
for year in yearlist:
    print("--Scraping data for Year: ", year['text'])
    # Reload the report page for a clean state
    driver.get(url)
    # Year selection: reset to the placeholder, then pick the target year
    element = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.NAME, year_select_name))
    )
    element.find_element(By.XPATH, ".//option[@value='0']").click()
    year_query = ".//option[@value='" + year['value'] + "']"
    element = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.NAME, year_select_name))
    )
    element.find_element(By.XPATH, year_query).click()
    
    # DataStore: one dataframe per block, concatenated at the end
    df_list = []

    element = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.NAME, radio_name))
    )
    # The second-to-last radio button selects the "All Central Scheme" report
    driver.find_elements(By.NAME, radio_name)[-2].click()

    # State selection: skip the header rows and the two footer rows
    check_table()
    temp_list = driver.find_element(By.CLASS_NAME, content_table_class).find_elements(By.XPATH, ".//tr")[3:-2]
    states = []
    for row in temp_list:
        try:
            x = {}
            td = row.find_elements(By.XPATH, ".//td")[1]
            x['name'] = td.text
            x['link'] = td.find_element(By.TAG_NAME, "a").get_attribute("href")
            states.append(x)
        except Exception as e:
            print("Error while parsing table data |", e)
    for state in states:
        print("----Scraping data for State: ", state['name'])
        driver.get(state['link'])
        check_table()
        temp_list = driver.find_element(By.CLASS_NAME, content_table_class).find_elements(By.XPATH, ".//tr")[4:-1]
        districts = []
        for row in temp_list:
            x = {}
            x['name'] = row.find_elements(By.XPATH, ".//td")[1].text
            x['link'] = row.find_elements(By.XPATH, ".//td")[1].find_element(By.TAG_NAME, "a").get_attribute("href")
            districts.append(x)
        for district in districts:
            print("------Scraping data for District: ", district['name'])
            driver.get(district['link'])
            check_table()
            temp_list = driver.find_element(By.CLASS_NAME, content_table_class).find_elements(By.XPATH, ".//tr")[4:-1]
            blocks = []
            for row in temp_list:
                x = {}
                x['name'] = row.find_elements(By.XPATH, ".//td")[1].text
                x['link'] = row.find_elements(By.XPATH, ".//td")[1].find_element(By.TAG_NAME, "a").get_attribute("href")
                blocks.append(x)
            
            for block in blocks:
                print("--------Scraping data for Block: ", block['name'])
                driver.get(block['link'])
                check_table()
                html_table = driver.find_element(By.CLASS_NAME, content_table_class).get_attribute('outerHTML')
                # header=3: the column headers sit on the fourth table row; [:-1] drops the trailing totals row
                df = pd.read_html(html_table, header=3)[0]
                df = df[:-1]
                df['Financial Year'] = year['text']
                df['State Name'] = state['name']
                df['District Name'] = district['name']
                df['Block Name'] = block['name']
                df['Scheme'] = "All Central Scheme"
                df_list.append(df)

    # Merge all block dataframes into one master dataframe
    df = pd.concat(df_list)
    df.drop(df.columns[0], axis=1, inplace=True)  # drop the first (serial-number) column
    df = df.reset_index(drop=True)
    # Save the scraped file
    df.to_csv(os.path.join(output_folder, year['text'] + '_Scraped_Data.csv'), encoding='utf-8', index=False)
[{'text': '2010-2011', 'value': '2010-2011 '}]
--Scraping data for Year:  2010-2011
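The `start` timestamp captured in the first cell is never read back; a closing cell along these lines (an assumption, not part of the original run) would report the elapsed time:
In [ ]:
# Report total elapsed time using the timer started in the first cell.
elapsed = time.time() - start
print("Total scraping time: %.1f seconds" % elapsed)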