#Import Libraries
import calendar
import datetime
import json
import os
import time
from io import StringIO

import lxml
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from dateutil.relativedelta import relativedelta
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException,TimeoutException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
# Start timer
start = time.time()

# Set output folders.
# A timestamped folder name was tried earlier and is not needed:
#   datetime.datetime.now().strftime("Date-%Y-%m-%d-Time-%H-%M-%S")
output_folder = 'Scrapped Data'
# exist_ok=True replaces the separate exists() check, so there is no window
# between checking for the folder and creating it.
os.makedirs(output_folder, exist_ok=True)
temp_folder = 'Temp'
os.makedirs(temp_folder, exist_ok=True)

# Start Chrome, pointing its download directory at the temp folder
chromeOptions = webdriver.ChromeOptions()
prefs = {"download.default_directory": os.path.abspath(temp_folder)}
chromeOptions.add_experimental_option("prefs", prefs)
# `options=` is the supported keyword; `chrome_options=` is deprecated.
driver = webdriver.Chrome(options=chromeOptions)
driver.implicitly_wait(20)  # seconds

# Prepare URL
protocol = "http://"
url = protocol + "nregarep2.nic.in/netnrega/dynamic2/DynamicReport_new4.aspx"
params = {
    'key': 'value'
}
# Query-string building is currently disabled:
#for key, value in params.items():
#    url = url + "&" + key + "=" + str(value)

# Header values (defined here; not passed to the driver in this section)
my_referer = url
UserAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
mod_headers = {'referer': my_referer, 'User-Agent':UserAgent}

# Open the report page
driver.get(url)
# ---------------------------------------------------------------------------
# Page element IDs and XPath queries
# ---------------------------------------------------------------------------

# Element IDs
year_selector_id = "DdlstFinYear"
details_selector_id = "regionselect"
view_button_id = "viewDummy"
pre_2011_data_button_id = "LinkButton1"
data_table_id = "GrdVwGenReport"

# XPath queries
gp_query = ".//option[@value='panchayat']"
state_boxes_query = ".//li[@class='statebox']"
# Matches accordion buttons whose text is '+'
accordion_buttons_query = (
    ".//button"
    "[@type='button']"
    "[@class='accordion']"
    "[text()='+']"
)

# ---------------------------------------------------------------------------
# Worker participation details
# ---------------------------------------------------------------------------

# Checkbox IDs that check_simple_indicators() clicks one by one
simplecheck = ["ChkLstFieldsWorkerA_0", "ChkLstFieldsWorkerA_1"]

# Further element IDs, grouped by name; presumably category-wise and
# month-wise inputs -- TODO confirm where they are used
category_dropdwn = ["TxtBox1", "TxtBox9"]
monthwise_dropdwn = ["TxtBox5", "TxtBox6"]
def load_panchayat(i):
    """Load panchayat names for one state by opening its accordions.

    Clicks the '+' button of the state box at index ``i``, waits for the
    matching district list to become visible, expands every '+' button
    inside that list (pausing one second after each click), and finally
    clicks the state's ``accordcheck`` label.

    Uses the module-level ``driver``, ``state_boxes_query`` and
    ``accordion_buttons_query``.

    Args:
        i: Zero-based index into the elements matched by
            ``state_boxes_query``.

    Raises:
        IndexError: If fewer than ``i + 1`` state boxes are found.
        TimeoutException: If the district list or the state checkbox is
            not ready within 10 seconds.
    """
    ## Open state
    # find_element(s)(By.XPATH, ...) replaces the find_element(s)_by_xpath
    # helpers, which newer Selenium releases no longer provide.
    state_box = driver.find_elements(By.XPATH, state_boxes_query)[i]
    state_box.find_element(By.XPATH, accordion_buttons_query).click()

    ## Wait for accordions
    check_list_query = ".//ul[@class='districtul'][position()='" + str(i+1) + "']"
    district_list = WebDriverWait(driver, 10).until(
        EC.visibility_of_element_located((By.XPATH, check_list_query))
    )
    for button in district_list.find_elements(By.XPATH, accordion_buttons_query):
        # Build a fresh chain per button: a reused ActionChains object can
        # keep previously queued moves (some Selenium versions do not clear
        # them in perform()), replaying them on every iteration.
        ActionChains(driver).move_to_element(button).perform()
        button.click()
        # Pause after each click before touching the next button
        time.sleep(1)

    ## Check state
    check_box_query = ".//li[@class='statebox'][position()='" + str(i+1) + "']/label[@class='accordcheck']"
    state_check_box = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, check_box_query))
    )
    state_check_box.click()
def check_simple_indicators():
    """Click every checkbox whose element ID is listed in ``simplecheck``.

    Uses the module-level ``driver`` and pauses one second after each click.
    """
    for x_id in simplecheck:
        # find_element(By.ID, ...) replaces find_element_by_id, which newer
        # Selenium releases no longer provide.
        driver.find_element(By.ID, x_id).click()
        time.sleep(1)
def get_home_page(year_query):
    """Load the report page and select the year and panchayat detail level.

    Steps, all performed on the module-level ``driver``:
      1. Open ``url``.
      2. Click the second option of the year selector, then the option
         matched by ``year_query``.
      3. Remove the first element with class ``accordion`` from the page.
      4. Pick the 'panchayat' option in the detail-level selector.
      5. Wait until an accordion '+' button is visible.

    Args:
        year_query: XPath that locates the ``<option>`` of the wanted year.

    Returns:
        Always ``1``.

    Raises:
        TimeoutException: If any awaited element is not ready within
            10 seconds.
    """
    # Open the report page
    # NOTE(review): the page is requested twice in a row; this looks like a
    # copy-paste duplicate but may be a deliberate workaround -- confirm
    # before removing.
    driver.get(url)
    driver.get(url)

    # Select year: first click the second option of the dropdown ...
    year_selector = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.ID, year_selector_id))
    )
    # find_element(s)(By.XPATH, ...) replaces the find_element(s)_by_xpath
    # helpers, which newer Selenium releases no longer provide.
    year_selector.find_elements(By.XPATH, ".//option")[1].click()
    # ... then click the requested year
    year_option = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, year_query))
    )
    year_option.click()

    # Select level of detail
    ## Remove the first accordion element from the DOM
    driver.execute_script('document.getElementsByClassName("accordion")[0].remove()')
    ## Select GP
    details_selector = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.ID, details_selector_id))
    )
    details_selector.find_element(By.XPATH, gp_query).click()

    ## Wait for accordions
    WebDriverWait(driver, 10).until(
        EC.visibility_of_element_located((By.XPATH, accordion_buttons_query))
    )
    return 1
# Init master data frame; after the loop it holds all scraped state tables
df = None
# Most recently scraped state table; stays None when no state is processed,
# so the final display line cannot hit an unbound name
df1 = None
# Per-state tables collected during the loop
state_frames = []

# Select year
year = "2013-2014"
year_query = ".//select[@id='" + year_selector_id + "']/option[text()='" + year + "']"

# Init page
load_page = get_home_page(year_query)

# Get total number of states
state_count = len(driver.find_elements(By.XPATH, state_boxes_query))

# Upper bound on the number of states scraped in one run
max_states = 1

# Loop over each state
i = 0
while i < state_count and i < max_states:
    # Reload the page so every state starts from a clean form
    load_page = get_home_page(year_query)
    # Load panchayats
    load_panchayat(i)
    # Get simple indicators
    ## Check the indicator boxes
    check_simple_indicators()
    # Click view button
    driver.find_element(By.ID, view_button_id).click()
    # Wait for the table to be loaded
    element = WebDriverWait(driver, 30).until(
        EC.visibility_of_element_located((By.ID, data_table_id))
    )
    html_table = element.get_attribute('outerHTML')
    # pandas deprecates passing literal HTML text, so wrap it in StringIO.
    # [:-1] drops the last row of the parsed table (presumably a totals
    # row -- TODO confirm).
    df1 = pd.read_html(StringIO(html_table), skiprows=0, header=0)[0][:-1]
    state_frames.append(df1)
    # State loop over
    i += 1

# Combine the per-state tables into the master data frame
if state_frames:
    df = pd.concat(state_frames, ignore_index=True)

df1