# Import Libraries
import selenium
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import pandas as pd
import os
import json
import requests
import time
import datetime
from collections import defaultdict
import lxml  # parser backend used by pd.read_html below
# Set Data Folder
data_folder = os.path.abspath('data')
# Set Output Folder
output_folder = os.path.abspath("output")
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
# Start Timer
start = time.time()
# Set a timestamped output folder for this run
cur_output_folder = os.path.join(output_folder, datetime.datetime.now().strftime("Date-%Y-%m-%d-Time-%H-%M-%S"))
if not os.path.exists(cur_output_folder):
    os.makedirs(cur_output_folder)
# Start Chrome
driver = webdriver.Chrome()
driver.implicitly_wait(2)  # seconds; applies to every find_element(s) lookup
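# Optional sketch: the same driver could be started headless (assumption: a
# Chrome version that supports the --headless flag; not used in this run):
# from selenium.webdriver.chrome.options import Options
# chrome_options = Options()
# chrome_options.add_argument('--headless')
# driver = webdriver.Chrome(chrome_options=chrome_options)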
# Prepare URL
protocol = "http://"
url = protocol + "nregarep2.nic.in/netnrega/dynamic2/dynamicreport_new4.aspx"
'''params = {
    'Rep': 0,
    'RP': 'Y',
    'APP': 'IMIS'
}
for key, value in params.items():
    url = url + "&" + key + "=" + str(value)'''
# Prepare request headers (Selenium itself ignores these; they only matter for direct requests calls)
my_referer = url
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
mod_headers = {'referer': my_referer, 'User-Agent': user_agent}
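# A sketch of how mod_headers could be used, assuming the report page were
# fetched directly with requests instead of through Selenium:
# response = requests.get(url, headers=mod_headers)
# html = response.text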
# Load the report page
driver.get(url)
financial_year = "2012"  # must match an option value in the DdlstFinYear dropdown
def view_data():
    # Click the "View Data" control once it becomes clickable
    element = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.ID, 'viewDummy'))
    )
    element.click()
# Reset View to India level
def reset_view():
    driver.get(url)
    time.sleep(2)
    # Toggle the financial-year dropdown (last option, then first) to refresh the page state
    element = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.ID, 'DdlstFinYear'))
    )
    element.find_elements_by_xpath(".//option")[-1].click()
    time.sleep(1)
    element.find_elements_by_xpath(".//option")[0].click()
    view_data()
    # Select Year
    driver.find_element_by_id("DdlstFinYear").find_element_by_xpath(".//option[@value='" + financial_year + "']").click()
    # Select Level
    element = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.ID, 'regionselect'))
    )
    element.find_elements_by_xpath(".//option")[-1].click()
'''# Close Open Drawers
open_drawers = driver.find_elements_by_xpath(".//li[@class='statebox']//button[text()='-']")
for x in open_drawers:
    x.click()'''
def open_all_gps(j):
    # Expand district j and every block under it so all GP (Gram Panchayat) rows are visible
    district = driver.find_elements_by_xpath(".//ul[@class='districtul']")[0].find_elements_by_xpath(".//li[@class='districtbox']")[j]
    print("----#--#--Expanding District", district.text[2:])
    district.click()
    district.find_element_by_xpath(".//button[text()='+']").click()
    blocks = driver.find_elements_by_xpath(".//ul[@class='blockul']")[j].find_elements_by_xpath(".//li[@class='blockbox']")
    # Loop over all Blocks
    for k, block in enumerate(blocks):
        # Re-locate the block each iteration to avoid stale references after the DOM updates
        block = driver.find_elements_by_xpath(".//ul[@class='blockul']")[j].find_elements_by_xpath(".//li[@class='blockbox']")[k]
        print("----#--#--#--Expanding Block", block.text[2:])
        block.find_element_by_xpath(".//button[text()='+']").click()
        time.sleep(.5)
        # If no panchayat rows appeared under this block, click the block itself and wait again
        html = driver.find_elements_by_xpath(".//ul[@class='districtul']")[0].get_attribute('innerHTML')
        html = html.split('<li class="districtbox">')[j].split('<li class="blockbox">')[k + 1]
        if len(html.split('<li class="panchbox">')) < 2:
            block.click()
            time.sleep(.5)
    # All GPs are now visible
    print("----#--All GPs are now visible")
def check_checkboxes():
    # Check every report-column checkbox
    checkboxes = driver.find_elements_by_xpath(".//ul[@class='checklist']")[0].find_elements_by_xpath(".//input[@type='checkbox']")
    for x in checkboxes:
        x.click()
    # Uncheck unwanted Checkboxes
    checkboxes = driver.find_elements_by_xpath(".//li[@id='filtertable']")[0].find_elements_by_xpath(".//input[@type='checkbox']")
    for x in checkboxes:
        x.click()
    checkboxes = driver.find_elements_by_xpath(".//div[@id='UpdatePanelmiddle']")[0].find_elements_by_xpath(".//input[@type='checkbox']")[-6:]
    for x in checkboxes:
        x.click()
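# check_checkboxes assumes a freshly reset page, since click() toggles state.
# A sketch of a state-aware alternative using WebElement.is_selected() (unused here):
def set_checkbox(box, checked=True):
    # Click only when the current state differs from the desired one
    if box.is_selected() != checked:
        box.click()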
# Get State List
reset_view()
states = driver.find_elements_by_xpath(".//li[@class='statebox']/label")
#states = states[0:2]
print("----Total states to scrape data for:", len(states))
skip_index = 2
df_list = []
# Loop over each state
for i, state in enumerate(states):
    if i > skip_index:
        reset_view()
        # Re-locate the state label after the reset to avoid a stale reference
        state = driver.find_elements_by_xpath(".//li[@class='statebox']/label")[i]
        print("----#--Scraping Data for State", state.text)
        #state.click()
        driver.find_elements_by_xpath(".//li[@class='statebox']")[i].find_element_by_xpath(".//button[text()='+']").click()  # Open State
        districts = driver.find_elements_by_xpath(".//ul[@class='districtul']")[0].find_elements_by_xpath(".//li[@class='districtbox']")
        # Loop over all districts (only the index j is used; district handles go stale after reset_view)
        for j, district in enumerate(districts):
            reset_view()
            driver.find_elements_by_xpath(".//li[@class='statebox']")[i].find_element_by_xpath(".//button[text()='+']").click()  # Open State
            open_all_gps(j)
            check_checkboxes()
            time.sleep(1)
            view_data()
            # Scrape Table
            element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.ID, "GrdVwGenReport"))
            )
            html_table = element.get_attribute('outerHTML')
            df = pd.read_html(html_table, skiprows=0, header=0)[0][:-1]  # [:-1] drops the table's final row
'''print("----#--Scraping Filters Data")
opts = list(range(1,6))
for l in opts:
reset_view()
driver.find_elements_by_xpath(".//li[@class='statebox']")[i].find_element_by_xpath(".//button[text()='+']").click()
open_all_gps(j)
driver.find_element_by_id("ChkLstFieldsWork_W_0").click()
driver.find_element_by_id("ChkLstFieldsWork_W_1").click()
driver.find_element_by_id("ChkBox4").click()
select = driver.find_element_by_id("DdlstChkBox4").find_elements_by_xpath(".//option")[l]
select.click()
filter = select.text.strip()
print("----#--Scraping Filters Data for filter:", filter)
time.sleep(1)
view_data()
# Scrape Table
element = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.ID, "GrdVwGenReport"))
)
html_table = element.get_attribute('outerHTML')
df1 = pd.read_html(html_table, skiprows=0, header=0)[0][:-1]
cols = df1.columns[-2:]
df1 = df1[cols]
cols = [ filter+"_Work_status_"+col for col in cols ]
df1.columns = cols
df = pd.concat([df, df1], axis=1)'''
            # All scraped for this district: tag the financial year and collect
            df['financial_year'] = financial_year
            df_list.append(df)
    else:
        print("----Skipped State Index:", i)
df = pd.concat(df_list)
# Save Scraped File
df.to_csv(os.path.join(cur_output_folder, str(financial_year) + '_Scraped_Data.csv'), encoding='utf-8', index=False)
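# A minimal use of the timer started at the top: report elapsed wall-clock time
print("----Total time elapsed (seconds):", round(time.time() - start, 2))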
df