In [11]:
#Import Libraries
import numpy as np
import pandas as pd
import geopandas as gpd
import json
import requests
import math
import os
import time
In [12]:
# Import Folders
data_folder = os.path.abspath("data")
output_folder = os.path.abspath("output")
#check if outfolder exists if not create it
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
In [13]:
#Declarations
req_frmt = 'pjson'
geo_frmt = 'geojson'
crs = {'init': 'epsg:4326'}

#Playing with headers
my_referer = 'http://www.myplan.ie/'
UserAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'

#layers
Layers = ['https://maps.housing.gov.ie/arcgis/rest/services/MyPlan/AdministrativeBoundaries/MapServer/0',
          'https://maps.housing.gov.ie/arcgis/rest/services/MyPlan/AdministrativeBoundaries/MapServer/1',
          'https://maps.housing.gov.ie/arcgis/rest/services/MyPlan/AdministrativeBoundaries/MapServer/2',
          'https://maps.housing.gov.ie/arcgis/rest/services/MyPlan/AdministrativeBoundaries/MapServer/3',
          'https://maps.housing.gov.ie/arcgis/rest/services/MyPlan/AdministrativeBoundaries/MapServer/4',
          'https://maps.housing.gov.ie/arcgis/rest/services/MyPlan/AdministrativeBoundaries/MapServer/5',
          'https://maps.housing.gov.ie/arcgis/rest/services/MyPlan/AdministrativeBoundaries/MapServer/6',
          'https://maps.housing.gov.ie/arcgis/rest/services/MyPlan/AdministrativeBoundaries/MapServer/7',
         ]

#Skip layers by this
Layers = Layers[0:]
In [14]:
def getmetadata(_url):
    url = _url + '?f=' + req_frmt    
    print("--Getting Metadata fo layer with url", url)
    response = requests.get(url, headers={'referer': my_referer, 'User-Agent':UserAgent}, timeout=10)
    if response.status_code == 200:
        metadata = response.json()
        return metadata
    else:
        print("--Error in getting Metadata for layer")
        return 0
    
def getfeatcount(_url):
    query = 'where=1%3D1&text=&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&relationParam=&outFields=&returnGeometry=false&returnTrueCurves=false&maxAllowableOffset=&geometryPrecision=&outSR=&returnIdsOnly=false&returnCountOnly=true&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&gdbVersion=&returnDistinctValues=false&resultOffset=&resultRecordCount='
    url = _url + '/query?' + query + '&f=' + req_frmt
    response = requests.get(url, headers={'referer': my_referer, 'User-Agent':UserAgent}, timeout=120)
    if response.status_code == 200:
        response = response.json()
        feat_count = response['count']
        return feat_count
    else:
        print("--Error in getting Featurecount for layer:", layername)
        return 0
In [15]:
# Scrape Layers
for _url in Layers:
    # Get Metadata
    metadata = getmetadata(_url)
    if not metadata == 0:
        # Metadata to variables
        layername = metadata["name"]
        layer_folder = os.path.join(output_folder, layername)
        #check if outfolder exists if not create it
        if not os.path.exists(layer_folder):
            os.makedirs(layer_folder)
        _max_rec = int(metadata['maxRecordCount'])#/2)
        idfield = metadata["fields"][0]['name']
        feat_count = getfeatcount(_url)
        #Start scraping From server
        print("---Strating to scrape data for layer:", layername)
        offset = -1
        i = 1
        while offset < feat_count: #loop
            max_rec = _max_rec
            scraped = 0 
            while scraped == 0 and max_rec >= 1:
                #Query Server
                lowerlimit = str(offset)
                upperlimit = str(offset+max_rec+1)
                url = _url + '/query?where=' +  idfield + '%3E' + lowerlimit + '%20and%20' +  idfield + '%3C' + upperlimit + '&f=' + geo_frmt + '&where=1%3D1&outFields=*&returnGeometry=true&outSR=4326&returnIdsOnly=false&returnCountOnly=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&gdbVersion=&returnDistinctValues=false&returnTrueCurves=true'
                print("---",url)
                #get Data to Memory
                status_code = 0
                try:
                    response = requests.get(url, headers={'referer': my_referer, 'User-Agent':UserAgent}, timeout=240)
                    status_code = response.status_code
                except:
                    pass
                err_code = 0
                try:
                    err_code = response.json()['error']['code']
                except:
                    pass
                print("--Status Code: ",status_code," | error code:",err_code)
                if status_code == 200 and err_code == 0:
                    #save json
                    response = response.json()
                    out_file = os.path.join(layer_folder, 'query-' + str(i) + '.json')
                    with open(out_file, 'w') as outfile:
                        json.dump(response, outfile)
                    #all done
                    scraped = 1
                    #Increase variable for next loop
                    i+=1
                    offset+=max_rec
                    print ("---Scraping ",layername," now progress: ", min(offset, feat_count), " Scrapped out of Total ", feat_count)
                else:                            
                    print("---Error in Scraping ", layername, " | Url: ", url, " | Rescraping ")
                    max_rec = int(max_rec / 2)
        print("\n")
--Getting Metadata fo layer with url https://maps.housing.gov.ie/arcgis/rest/services/MyPlan/AdministrativeBoundaries/MapServer/0?f=pjson
---Strating to scrape data for layer: Regional Assemblies (NUTS II)
--- https://maps.housing.gov.ie/arcgis/rest/services/MyPlan/AdministrativeBoundaries/MapServer/0/query?where=OBJECTID%3E-1%20and%20OBJECTID%3C1000&f=geojson&where=1%3D1&outFields=*&returnGeometry=true&outSR=4326&returnIdsOnly=false&returnCountOnly=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&gdbVersion=&returnDistinctValues=false&returnTrueCurves=true
--Status Code:  200  | error code: 0
---Scraping  Regional Assemblies (NUTS II)  now progress:  2  Scrapped out of Total  2


--Getting Metadata fo layer with url https://maps.housing.gov.ie/arcgis/rest/services/MyPlan/AdministrativeBoundaries/MapServer/1?f=pjson
---Strating to scrape data for layer: Regional Authorities (NUTS III)
--- https://maps.housing.gov.ie/arcgis/rest/services/MyPlan/AdministrativeBoundaries/MapServer/1/query?where=OBJECTID%3E-1%20and%20OBJECTID%3C1000&f=geojson&where=1%3D1&outFields=*&returnGeometry=true&outSR=4326&returnIdsOnly=false&returnCountOnly=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&gdbVersion=&returnDistinctValues=false&returnTrueCurves=true
--Status Code:  200  | error code: 0
---Scraping  Regional Authorities (NUTS III)  now progress:  9  Scrapped out of Total  9


--Getting Metadata fo layer with url https://maps.housing.gov.ie/arcgis/rest/services/MyPlan/AdministrativeBoundaries/MapServer/2?f=pjson
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
C:\ProgramData\Anaconda3\lib\site-packages\urllib3\connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    379             try:  # Python 2.7, use buffering of HTTP responses
--> 380                 httplib_response = conn.getresponse(buffering=True)
    381             except TypeError:  # Python 2.6 and older, Python 3

TypeError: getresponse() got an unexpected keyword argument 'buffering'

During handling of the above exception, another exception occurred:

WantReadError                             Traceback (most recent call last)
C:\ProgramData\Anaconda3\lib\site-packages\urllib3\contrib\pyopenssl.py in recv_into(self, *args, **kwargs)
    279         try:
--> 280             return self.connection.recv_into(*args, **kwargs)
    281         except OpenSSL.SSL.SysCallError as e:

C:\ProgramData\Anaconda3\lib\site-packages\OpenSSL\SSL.py in recv_into(self, buffer, nbytes, flags)
   1714             result = _lib.SSL_read(self._ssl, buf, nbytes)
-> 1715         self._raise_ssl_error(self._ssl, result)
   1716 

C:\ProgramData\Anaconda3\lib\site-packages\OpenSSL\SSL.py in _raise_ssl_error(self, ssl, result)
   1520         if error == _lib.SSL_ERROR_WANT_READ:
-> 1521             raise WantReadError()
   1522         elif error == _lib.SSL_ERROR_WANT_WRITE:

WantReadError: 

During handling of the above exception, another exception occurred:

KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-15-dd0cf3feae59> in <module>()
      2 for _url in Layers:
      3     # Get Metadata
----> 4     metadata = getmetadata(_url)
      5     if not metadata == 0:
      6         # Metadata to variables

<ipython-input-14-3fa337b8f2dd> in getmetadata(_url)
      2     url = _url + '?f=' + req_frmt
      3     print("--Getting Metadata fo layer with url", url)
----> 4     response = requests.get(url, headers={'referer': my_referer, 'User-Agent':UserAgent}, timeout=10)
      5     if response.status_code == 200:
      6         metadata = response.json()

C:\ProgramData\Anaconda3\lib\site-packages\requests\api.py in get(url, params, **kwargs)
     70 
     71     kwargs.setdefault('allow_redirects', True)
---> 72     return request('get', url, params=params, **kwargs)
     73 
     74 

C:\ProgramData\Anaconda3\lib\site-packages\requests\api.py in request(method, url, **kwargs)
     56     # cases, and look like a memory leak in others.
     57     with sessions.Session() as session:
---> 58         return session.request(method=method, url=url, **kwargs)
     59 
     60 

C:\ProgramData\Anaconda3\lib\site-packages\requests\sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
    506         }
    507         send_kwargs.update(settings)
--> 508         resp = self.send(prep, **send_kwargs)
    509 
    510         return resp

C:\ProgramData\Anaconda3\lib\site-packages\requests\sessions.py in send(self, request, **kwargs)
    616 
    617         # Send the request
--> 618         r = adapter.send(request, **kwargs)
    619 
    620         # Total elapsed time of the request (approximately)

C:\ProgramData\Anaconda3\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
    438                     decode_content=False,
    439                     retries=self.max_retries,
--> 440                     timeout=timeout
    441                 )
    442 

C:\ProgramData\Anaconda3\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    599                                                   timeout=timeout_obj,
    600                                                   body=body, headers=headers,
--> 601                                                   chunked=chunked)
    602 
    603             # If we're going to release the connection in ``finally:``, then

C:\ProgramData\Anaconda3\lib\site-packages\urllib3\connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    381             except TypeError:  # Python 2.6 and older, Python 3
    382                 try:
--> 383                     httplib_response = conn.getresponse()
    384                 except Exception as e:
    385                     # Remove the TypeError from the exception chain in Python 3;

C:\ProgramData\Anaconda3\lib\http\client.py in getresponse(self)
   1329         try:
   1330             try:
-> 1331                 response.begin()
   1332             except ConnectionError:
   1333                 self.close()

C:\ProgramData\Anaconda3\lib\http\client.py in begin(self)
    295         # read until we get a non-100 response
    296         while True:
--> 297             version, status, reason = self._read_status()
    298             if status != CONTINUE:
    299                 break

C:\ProgramData\Anaconda3\lib\http\client.py in _read_status(self)
    256 
    257     def _read_status(self):
--> 258         line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
    259         if len(line) > _MAXLINE:
    260             raise LineTooLong("status line")

C:\ProgramData\Anaconda3\lib\socket.py in readinto(self, b)
    584         while True:
    585             try:
--> 586                 return self._sock.recv_into(b)
    587             except timeout:
    588                 self._timeout_occurred = True

C:\ProgramData\Anaconda3\lib\site-packages\urllib3\contrib\pyopenssl.py in recv_into(self, *args, **kwargs)
    290                 raise
    291         except OpenSSL.SSL.WantReadError:
--> 292             rd = util.wait_for_read(self.socket, self.socket.gettimeout())
    293             if not rd:
    294                 raise timeout('The read operation timed out')

C:\ProgramData\Anaconda3\lib\site-packages\urllib3\util\wait.py in wait_for_read(socks, timeout)
     31     or optionally a single socket if passed in. Returns a list of
     32     sockets that can be read from immediately. """
---> 33     return _wait_for_io_events(socks, EVENT_READ, timeout)
     34 
     35 

C:\ProgramData\Anaconda3\lib\site-packages\urllib3\util\wait.py in _wait_for_io_events(socks, events, timeout)
     24             selector.register(sock, events)
     25         return [key[0].fileobj for key in
---> 26                 selector.select(timeout) if key[1] & events]
     27 
     28 

C:\ProgramData\Anaconda3\lib\site-packages\urllib3\util\selectors.py in select(self, timeout)
    318             ready = []
    319             r, w, _ = _syscall_wrapper(self._select, True, self._readers,
--> 320                                        self._writers, timeout)
    321             r = set(r)
    322             w = set(w)

C:\ProgramData\Anaconda3\lib\site-packages\urllib3\util\selectors.py in _syscall_wrapper(func, _, *args, **kwargs)
     62         and recalculate their timeouts. """
     63         try:
---> 64             return func(*args, **kwargs)
     65         except (OSError, IOError, select.error) as e:
     66             errcode = None

C:\ProgramData\Anaconda3\lib\site-packages\urllib3\util\selectors.py in _select(self, r, w, timeout)
    308         def _select(self, r, w, timeout=None):
    309             """ Wrapper for select.select because timeout is a positional arg """
--> 310             return select.select(r, w, [], timeout)
    311 
    312         def select(self, timeout=None):

KeyboardInterrupt: