In [17]:
#Import Libraries
import datetime
import io
import json
import os
import time
from urllib.parse import urlencode

import lxml
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException,TimeoutException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait

In [18]:
#Start Timer
# Wall-clock reference taken when this cell runs.
start = time.time()

#Set Output Folder
# One folder per run, named after the timestamp at which this cell executes.
output_folder = datetime.datetime.now().strftime("Date-%Y-%m-%d-Time-%H-%M-%S")
# exist_ok=True creates the folder if needed and is a no-op when it is
# already there, so no separate (race-prone) os.path.exists() check is needed.
os.makedirs(output_folder, exist_ok=True)

In [19]:
#Start Chrome
driver = webdriver.Chrome()
# NOTE(review): this implicit wait stays active for every element lookup,
# including the lookups made inside the explicit WebDriverWait calls used in
# later cells. Selenium's documentation advises against mixing implicit and
# explicit waits because the effective timeouts become hard to predict.
driver.implicitly_wait(20) # seconds

#Prepare URL
protocol = "https://"
base_url = protocol + "rhreporting.nic.in/netiay/PhysicalProgressReport/YearWiseHouseCompletionReport.aspx"
params = {
    'params': 0
}
# urlencode escapes keys/values and joins the pairs with "&". Concatenating
# "&key=value" straight after the "?" produced "...aspx?&params=0" before.
url = base_url + "?" + urlencode(params)

#Playing with headers
# These values are only assembled here; nothing in this cell passes them on
# to the Selenium session.
my_referer = url
UserAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
mod_headers = {'referer': my_referer, 'User-Agent':UserAgent}

#Open the report page
driver.get(url)

In [20]:
# Select List
year_select_name = 'ctl00$ContentPlaceHolder1$ddlFinYear'
scheme_select_name = 'ctl00$ContentPlaceHolder1$ddlScheme'
state_select_name = 'ctl00$ContentPlaceHolder1$ddlState'
district_select_name = 'ctl00$ContentPlaceHolder1$ddlDistrict'
block_select_name = 'ctl00$ContentPlaceHolder1$ddlBlock'
submit_button_name = 'ctl00$ContentPlaceHolder1$btnSubmit'
content_table_pane = 'ContentPlaceHolder1_gvDataPanelItem'
content_table_head_id = 'ContentPlaceHolder1_gvDataCopy'
content_table_id = 'ContentPlaceHolder1_gvData'

In [21]:
#Build the option list of any dropdown (year, scheme, state, district, block)
def build_list(element_name):
    """Return the options of a <select> element, skipping its first option.

    Parameters
    ----------
    element_name : str
        ``name`` attribute of the <select>; it is looked up through the
        module-level ``driver``.

    Returns
    -------
    list of dict
        One ``{'text': ..., 'value': ...}`` dict per <option>, in the order
        the browser returns them. The first option is dropped (presumably a
        placeholder entry - confirm against the page).
    """
    # find_element(By.<kind>, ...) is used instead of the find_element_by_*
    # shortcuts, which were removed in Selenium 4.3.
    select_element = driver.find_element(By.NAME, element_name)
    options = select_element.find_elements(By.XPATH, ".//option")[1:]
    return [
        {'text': option.text, 'value': option.get_attribute('value')}
        for option in options
    ]

#Submit-button helpers
def delete_submit():
    """Remove the submit button from the DOM of the page currently loaded.

    The scraping cell calls this before changing a dropdown and calls
    ``check_submit`` afterwards (presumably so that the button reappearing
    signals that the page has been re-rendered - confirm).
    """
    # The control name is handed over as a script argument instead of being
    # spliced into the JavaScript source, so no quoting issues can arise.
    driver.execute_script(
        "return document.getElementsByName(arguments[0])[0].remove()",
        submit_button_name,
    )


def check_submit(timeout=2):
    """Block until the submit button is visible.

    Parameters
    ----------
    timeout : int or float, optional
        Seconds handed to ``WebDriverWait``; defaults to 2.

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        Raised by ``WebDriverWait.until`` when the button does not become
        visible in time.
    """
    WebDriverWait(driver, timeout).until(
        EC.visibility_of_element_located((By.NAME, submit_button_name))
    )
    
#Result-area helpers
def delete_content():
    """Remove the first element carrying the CSS class ``col-lg-10`` from the DOM.

    The scraping cell calls this right before pressing the submit button
    (presumably so that a results table found afterwards is a fresh one -
    confirm).
    """
    driver.execute_script("return document.getElementsByClassName('col-lg-10')[0].remove()")


def check_error(timeout=2):
    """Block until the element with id ``ContentPlaceHolder1_lbl_data_msg`` is visible.

    Parameters
    ----------
    timeout : int or float, optional
        Seconds handed to ``WebDriverWait``; defaults to 2.

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        Raised by ``WebDriverWait.until`` when the element does not become
        visible in time.
    """
    WebDriverWait(driver, timeout).until(
        EC.visibility_of_element_located((By.ID, "ContentPlaceHolder1_lbl_data_msg"))
    )
    

In [22]:
def get_total_houselist(timeout=2):
    """Read the results table and return its last column as a list.

    Parameters
    ----------
    timeout : int or float, optional
        Seconds to wait for the table (id ``content_table_id``) to be present
        in the DOM; defaults to 2.

    Returns
    -------
    list
        Values of the table's last column, with the first and the last parsed
        row left out (presumably a sub-header row and a totals row - confirm
        against the page).

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        Raised by ``WebDriverWait.until`` when the table does not appear in
        time.
    """
    # until() hands back the element found by the condition, so no second
    # lookup (and none of the removed find_element_by_* shortcuts) is needed.
    table_element = WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.ID, content_table_id))
    )
    html_table = table_element.get_attribute('outerHTML')
    # read_html gets a file-like object: passing literal HTML as a plain
    # string is deprecated in recent pandas releases.
    table = pd.read_html(io.StringIO(html_table), skiprows=0, header=0)[0]
    body_rows = table.iloc[1:-1]
    total_houses_completed = list(body_rows.iloc[:, -1])
    return total_houses_completed

In [24]:
#Open the report page
driver.get(url)


def _select_option(select_name, option_value):
    """Pick one <option> (by its value) in the dropdown named ``select_name``.

    The submit button is removed before the click and awaited afterwards via
    delete_submit() / check_submit().
    """
    delete_submit()
    dropdown = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.NAME, select_name))
    )
    # find_element(By.XPATH, ...) replaces find_element_by_xpath, which was
    # removed in Selenium 4.3.
    dropdown.find_element(By.XPATH, ".//option[@value='" + option_value + "']").click()
    check_submit()


def _submit_filters():
    """Drop the currently displayed content, then click the submit button."""
    delete_content()
    submit_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.NAME, submit_button_name))
    )
    submit_button.click()


def _drill_down(select_name, option_value):
    """Select an option, submit, and return the last-column values of the new table."""
    _select_option(select_name, option_value)
    _submit_filters()
    return get_total_houselist()


def _read_block_table(year, scheme, state, district, block):
    """Parse the results table on screen into a DataFrame tagged with the active filters."""
    table_element = WebDriverWait(driver, 2).until(
        EC.presence_of_element_located((By.ID, content_table_id))
    )
    html_table = table_element.get_attribute('outerHTML')
    # File-like input: literal HTML strings are deprecated as read_html input.
    table = pd.read_html(io.StringIO(html_table), skiprows=0, header=0)[0]
    # First and last parsed rows are discarded, as in get_total_houselist().
    # .copy() makes the column assignments below act on an independent frame
    # instead of a slice (avoids SettingWithCopyWarning).
    table = table.iloc[1:-1].copy()
    table['Financial Year'] = year['text']
    table['Scheme'] = scheme['text']
    table['State Name'] = state['text']
    table['District Name'] = district['text']
    table['Block Name'] = block['text']
    return table


def _scrape_year(year, frames):
    """Walk scheme -> state -> district -> block and append one table per block to ``frames``.

    A state/district/block is only opened when the matching entry of the
    parent table's last column is greater than zero.
    NOTE(review): this assumes the rows returned by get_total_houselist()
    line up one-to-one with the dropdown options from build_list() - confirm.
    """
    #Scheme Selection
    for scheme in build_list(scheme_select_name)[2:]:
        print("--//--Scrapping data for scheme: ", scheme['text'])
        state_totals = _drill_down(scheme_select_name, scheme['value'])

        #state Selection
        for state_index, state in enumerate(build_list(state_select_name)):
            print(state_index)
            print("--//--//--Scrapping data for state: ", state['text'])
            if not state_totals[state_index] > 0:
                continue
            district_totals = _drill_down(state_select_name, state['value'])

            #district Selection
            for district_index, district in enumerate(build_list(district_select_name)):
                print("--//--//--//--Scrapping data for district: ", district['text'])
                if not district_totals[district_index] > 0:
                    continue
                block_totals = _drill_down(district_select_name, district['value'])

                #block Selection
                for block_index, block in enumerate(build_list(block_select_name)):
                    print("--//--//--//--//--Scrapping data for block: ", block['text'])
                    if not block_totals[block_index] > 0:
                        continue
                    _select_option(block_select_name, block['value'])
                    _submit_filters()
                    try:
                        frames.append(_read_block_table(year, scheme, state, district, block))
                    except Exception as e:
                        # A single unreadable table must not stop the crawl.
                        print("Error while table data |", e)


#Year Selection
year_dropdown = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.NAME, year_select_name))
)
year_dropdown.find_element(By.XPATH, ".//option[@value='0']").click()
# Only the second entry of the option list is scraped (slice [1:2]).
yearlist = build_list(year_select_name)[1:2]
for year in yearlist:
    print("--Scrapping data for Year: ", year['text'])
    _select_option(year_select_name, year['value'])

    #DataStore
    df_list = []
    try:
        _scrape_year(year, df_list)
    except Exception:
        # The error is re-raised after whatever was collected has been saved.
        print("Scrape interrupted for Year: ", year['text'], "| tables collected so far:", len(df_list))
        raise
    finally:
        # Runs on success and on failure, so tables gathered before an error
        # (e.g. a TimeoutException) are written to disk instead of being lost.
        if df_list:
            #merge all dataframes to master dataframe
            df = pd.concat(df_list)
            df = df.reset_index().drop(columns=['index', '#SNo'])
            df['#SNo'] = df.index + 1
            #Save Scrapped File
            df.to_csv(os.path.join(output_folder, year['text']+'_Scrapped_Data.csv'), encoding='utf-8', index=False)
        else:
            # pd.concat raises on an empty list, so there is nothing to write.
            print("No table data collected for Year: ", year['text'])

--Scrapping data for Year:  2011-2012
--//--Scrapping data for scheme:  ALL STATE SCHEMES
0
--//--//--Scrapping data for state:  ARUNACHAL PRADESH
1
--//--//--Scrapping data for state:  ASSAM
2
--//--//--Scrapping data for state:  BIHAR
3
--//--//--Scrapping data for state:  CHHATTISGARH
4
--//--//--Scrapping data for state:  GOA
5
--//--//--Scrapping data for state:  GUJARAT
6
--//--//--Scrapping data for state:  HARYANA
7
--//--//--Scrapping data for state:  HIMACHAL PRADESH
8
--//--//--Scrapping data for state:  JAMMU AND KASHMIR
9
--//--//--Scrapping data for state:  JHARKHAND
--//--//--//--Scrapping data for district:  BOKARO
--//--//--//--Scrapping data for district:  CHATRA
--//--//--//--Scrapping data for district:  DEOGHAR
--//--//--//--Scrapping data for district:  DHANBAD
--//--//--//--Scrapping data for district:  DUMKA
--//--//--//--Scrapping data for district:  EAST SINGHBUM
--//--//--//--//--Scrapping data for block:  BAHRAGORA
--//--//--//--//--Scrapping data for block:

--//--//--//--//--Scrapping data for block:  PUNASA
--//--//--//--Scrapping data for district:  KHARGONE
--//--//--//--//--Scrapping data for block:  BARWAH
--//--//--//--//--Scrapping data for block:  BHAGVANPURA
--//--//--//--//--Scrapping data for block:  BHIKANGAON
--//--//--//--//--Scrapping data for block:  GOGANWA
--//--//--//--//--Scrapping data for block:  KASRAWAD
--//--//--//--//--Scrapping data for block:  KHARGONE
--//--//--//--//--Scrapping data for block:  MAHESHWAR
--//--//--//--//--Scrapping data for block:  SEGAON
--//--//--//--//--Scrapping data for block:  ZIRANYA
--//--//--//--Scrapping data for district:  MANDLA
--//--//--//--//--Scrapping data for block:  BICHHIYA
--//--//--//--//--Scrapping data for block:  BIJADANDI
--//--//--//--//--Scrapping data for block:  GHUGHRI
--//--//--//--//--Scrapping data for block:  MANDLA
--//--//--//--//--Scrapping data for block:  MAWAI
--//--//--//--//--Scrapping data for block:  MOHGAON
--//--//--//--//--Scrapping data for blo

--//--//--//--//--Scrapping data for block:  PATUR
--//--//--//--//--Scrapping data for block:  TELHARA
--//--//--//--Scrapping data for district:  AMRAVATI
--//--//--//--Scrapping data for district:  AURANGABAD
--//--//--//--Scrapping data for district:  BEED
--//--//--//--Scrapping data for district:  BHANDARA
--//--//--//--Scrapping data for district:  BULDHANA
--//--//--//--Scrapping data for district:  CHANDRAPUR
--//--//--//--Scrapping data for district:  DHULE
--//--//--//--Scrapping data for district:  GADCHIROLI
--//--//--//--Scrapping data for district:  GONDIA
--//--//--//--Scrapping data for district:  HINGOLI
--//--//--//--Scrapping data for district:  JALGAON
--//--//--//--Scrapping data for district:  JALNA
--//--//--//--Scrapping data for district:  KOLHAPUR
--//--//--//--Scrapping data for district:  LATUR
--//--//--//--//--Scrapping data for block:  AHMADPUR
--//--//--//--//--Scrapping data for block:  AUSA
--//--//--//--//--Scrapping data for block:  CHAKUR
--//--//-

--//--//--//--//--Scrapping data for block:  निम्बाहेड़ा
--//--//--//--//--Scrapping data for block:  बड़ीसादड़ी
--//--//--//--//--Scrapping data for block:  बेगूँ
--//--//--//--//--Scrapping data for block:  भदेसर
--//--//--//--//--Scrapping data for block:  भूपालसागर
--//--//--//--//--Scrapping data for block:  भैसरोड़गढ
--//--//--//--//--Scrapping data for block:  राशमी
--//--//--//--Scrapping data for district:  CHURU
--//--//--//--//--Scrapping data for block:  BIDASAR
--//--//--//--//--Scrapping data for block:  चूरु
--//--//--//--//--Scrapping data for block:  तारानगर
--//--//--//--//--Scrapping data for block:  रतनगढ
--//--//--//--//--Scrapping data for block:  राजगढ
--//--//--//--//--Scrapping data for block:  सरदारशहर
--//--//--//--//--Scrapping data for block:  सुजानगढ
--//--//--//--Scrapping data for district:  DAUSA
--//--//--//--//--Scrapping data for block:  LAWAAN
--//--//--//--//--Scrapping data for block:  द‍ौसा
--//--//--//--//--Scrapping data for block:  बांदीकुई
--

--//--//--//--Scrapping data for district:  SAWAI MADHOPUR
--//--//--//--//--Scrapping data for block:  CHAUTH KA BARWARA
--//--//--//--//--Scrapping data for block:  खण्डार
--//--//--//--//--Scrapping data for block:  गंगापुर सिटी
--//--//--//--//--Scrapping data for block:  बामनवास
--//--//--//--//--Scrapping data for block:  ब‍ौंली
--//--//--//--//--Scrapping data for block:  सवाई माध‍ोपुर
--//--//--//--Scrapping data for district:  SIKAR
--//--//--//--Scrapping data for district:  SIROHI
--//--//--//--//--Scrapping data for block:  आबुरोड 
--//--//--//--//--Scrapping data for block:  पिण्डवाडा 
--//--//--//--//--Scrapping data for block:  रेवदर 
--//--//--//--//--Scrapping data for block:  शिवगंज 
--//--//--//--//--Scrapping data for block:  सिरोही 
--//--//--//--Scrapping data for district:  SRI GANGANAGAR
--//--//--//--//--Scrapping data for block:  GHARSANA
--//--//--//--//--Scrapping data for block:  SRIVIJAYNAGR
--//--//--//--//--Scrapping data for block:  अनूपगढ
--//--//--//-

0
--//--//--Scrapping data for state:  ARUNACHAL PRADESH
1
--//--//--Scrapping data for state:  ASSAM
2
--//--//--Scrapping data for state:  BIHAR
3
--//--//--Scrapping data for state:  CHHATTISGARH
4
--//--//--Scrapping data for state:  GOA
5
--//--//--Scrapping data for state:  GUJARAT
6
--//--//--Scrapping data for state:  HARYANA
7
--//--//--Scrapping data for state:  HIMACHAL PRADESH
8
--//--//--Scrapping data for state:  JAMMU AND KASHMIR
9
--//--//--Scrapping data for state:  JHARKHAND
10
--//--//--Scrapping data for state:  KERALA
11
--//--//--Scrapping data for state:  MADHYA PRADESH
12
--//--//--Scrapping data for state:  MAHARASHTRA
13
--//--//--Scrapping data for state:  MANIPUR
14
--//--//--Scrapping data for state:  MEGHALAYA
15
--//--//--Scrapping data for state:  MIZORAM
16
--//--//--Scrapping data for state:  NAGALAND
17
--//--//--Scrapping data for state:  ODISHA
18
--//--//--Scrapping data for state:  PUNJAB
19
--//--//--Scrapping data for state:  RAJASTHAN
20
--//--

--//--//--//--//--Scrapping data for block:  नदबई
--//--//--//--//--Scrapping data for block:  बयाना
--//--//--//--//--Scrapping data for block:  रुपवास
--//--//--//--//--Scrapping data for block:  वैर
--//--//--//--//--Scrapping data for block:  सेवर
--//--//--//--Scrapping data for district:  BHILWARA
--//--//--//--//--Scrapping data for block:  BIJOLIYA
--//--//--//--//--Scrapping data for block:  आसीन्द
--//--//--//--//--Scrapping data for block:  कोटडी
--//--//--//--//--Scrapping data for block:  जहाजपुर
--//--//--//--//--Scrapping data for block:  बनेडा
--//--//--//--//--Scrapping data for block:  माण्डल
--//--//--//--//--Scrapping data for block:  माण्डलगढ
--//--//--//--//--Scrapping data for block:  रायपुर
--//--//--//--//--Scrapping data for block:  शाहपुरा
--//--//--//--//--Scrapping data for block:  सहाडा
--//--//--//--//--Scrapping data for block:  सुवाणा
--//--//--//--//--Scrapping data for block:  हुरडा
--//--//--//--Scrapping data for district:  BIKANER
--//--//--//--//-

--//--//--//--//--Scrapping data for block:  कुचामन
--//--//--//--//--Scrapping data for block:  जायल
--//--//--//--//--Scrapping data for block:  डीडवाना
--//--//--//--//--Scrapping data for block:  डेगाना
--//--//--//--//--Scrapping data for block:  नाग‍ौर
--//--//--//--//--Scrapping data for block:  परबतसर
--//--//--//--//--Scrapping data for block:  मकराना
--//--//--//--//--Scrapping data for block:  मूण्डवा
--//--//--//--//--Scrapping data for block:  मेड़ता
--//--//--//--//--Scrapping data for block:  लाडनूं
--//--//--//--Scrapping data for district:  PALI
--//--//--//--//--Scrapping data for block:  ज्ैातरण
--//--//--//--//--Scrapping data for block:  देसूरी
--//--//--//--//--Scrapping data for block:  पाली
--//--//--//--//--Scrapping data for block:  बाली
--//--//--//--//--Scrapping data for block:  मारवाड़ जंक्शन
--//--//--//--//--Scrapping data for block:  रानी
--//--//--//--//--Scrapping data for block:  रायपुर
--//--//--//--//--Scrapping data for block:  रोहत
--//--//--//--

--//--//--//--//--Scrapping data for block:  ASARGANJ
--//--//--//--//--Scrapping data for block:  BARIYARPUR
--//--//--//--//--Scrapping data for block:  DHARHARA
--//--//--//--//--Scrapping data for block:  JAMALPUR
--//--//--//--//--Scrapping data for block:  KHARAGPUR
--//--//--//--//--Scrapping data for block:  MUNGER SADAR
--//--//--//--//--Scrapping data for block:  SANGRAMPUR
--//--//--//--//--Scrapping data for block:  TARAPUR
--//--//--//--//--Scrapping data for block:  TETIABAMBAR
--//--//--//--Scrapping data for district:  MUZAFFARPUR
--//--//--//--Scrapping data for district:  NALANDA
--//--//--//--Scrapping data for district:  NAWADA
--//--//--//--Scrapping data for district:  PASHCHIM CHAMPARAN
--//--//--//--Scrapping data for district:  PATNA
--//--//--//--Scrapping data for district:  PURBI CHAMPARAN
--//--//--//--Scrapping data for district:  PURNIA
--//--//--//--Scrapping data for district:  ROHTAS
--//--//--//--Scrapping data for district:  SAHARSA
--//--//--//--Scr

TimeoutException: Message: 


In [8]:
# Stack the per-block tables accumulated in df_list (filled by the scraping
# cell) into one frame. This relies on df_list still being in the kernel.
df = pd.concat(df_list)

In [9]:
# Show the concatenated frame as this cell's output.
df

Unnamed: 0,#SNo,Panchayat Name,Houses completed for prior to 2006-2007,Houses completed for 2007-2008,Houses completed for 2008-2009,Houses completed for 2009-2010,Houses completed for 2010-2011,Houses completed for 2011-2012,Total Houses completed in 2011-2012,Financial Year,Scheme,State Name,District Name,Block Name
1,1.0,BAKUL,0,0,0,0,0,0,0,2011-2012,IAY NEW CONSTRUCTION,ASSAM,DIBRUGARH,LAHOWAL
2,2.0,CHIRINGHULLA,0,0,0,0,0,0,0,2011-2012,IAY NEW CONSTRUCTION,ASSAM,DIBRUGARH,LAHOWAL
3,3.0,HILOIDHARI,0,0,0,0,0,0,0,2011-2012,IAY NEW CONSTRUCTION,ASSAM,DIBRUGARH,LAHOWAL
4,4.0,IKRATALI,0,0,0,0,0,0,0,2011-2012,IAY NEW CONSTRUCTION,ASSAM,DIBRUGARH,LAHOWAL
5,5.0,LAHOWAL,0,0,0,0,0,0,0,2011-2012,IAY NEW CONSTRUCTION,ASSAM,DIBRUGARH,LAHOWAL
6,6.0,MAIJAN,0,0,0,0,0,0,0,2011-2012,IAY NEW CONSTRUCTION,ASSAM,DIBRUGARH,LAHOWAL
7,7.0,MODERKHAT,0,0,0,0,0,0,0,2011-2012,IAY NEW CONSTRUCTION,ASSAM,DIBRUGARH,LAHOWAL
8,8.0,MOHANBARI,0,0,0,0,0,0,0,2011-2012,IAY NEW CONSTRUCTION,ASSAM,DIBRUGARH,LAHOWAL
9,9.0,NIZKANAI,0,0,0,0,0,0,0,2011-2012,IAY NEW CONSTRUCTION,ASSAM,DIBRUGARH,LAHOWAL
10,10.0,PHUKONORKHAT,0,0,0,0,0,1,1,2011-2012,IAY NEW CONSTRUCTION,ASSAM,DIBRUGARH,LAHOWAL


In [10]:
# Combine every collected table into one master frame, discarding both the
# old positional index and the per-table '#SNo' numbering.
df = (
    pd.concat(df_list)
    .reset_index()
    .drop(columns=['index', '#SNo'])
)
# Fresh 1-based serial number; it is added as the last column.
df['#SNo'] = df.index + 1
# Write the result to <output_folder>/<year>_Scrapped_Data.csv
csv_path = os.path.join(output_folder, year['text']+'_Scrapped_Data.csv')
df.to_csv(csv_path, encoding='utf-8', index=False)