Skip to main content
2 of 2
Removed dead link; improved title
Jamal
  • 35.2k
  • 13
  • 134
  • 238

URL and source page scraper

The code does seem a bit repetitive in places, such as the `parenturlscraper` and `childurlscraper` functions.

Does anyone have any recommendations for improving my code and condensing it a little?

In essence, the code scrapes this site and populates a table with details about each crash, geocoding the location data extracted from the site using Google.

__version__ = '0.1'
__author__ = 'antmancoder'

# Importing of modules required for the script to run successfully
import scraperwiki
import lxml.html
import urlparse
import urllib2
import dateutil.parser
from geopy import geocoders

# Introduction of various global variables required throughout the running of the code
urlstem = "http://planecrashinfo.com"  # site root; all relative links resolve against this
urlyeardb = "database.htm"  # index page listing one link per year
yearsource = urlparse.urljoin(urlstem, urlyeardb)  # absolute URL of the year index
yearlist = []  # filled by parenturlscraper(): one URL per year page
sourcepageurl = []  # filled by childurlscraper(): one URL per individual crash page


def parenturlscraper():
    """Collect the per-year database page URLs from 'planecrashinfo.com/database'.

    Appends one absolute URL per year link found on the index page to the
    module-level ``yearlist``.
    """
    page = scraperwiki.scrape(yearsource)
    doc = lxml.html.fromstring(page)

    # Every year link on the index page lives inside a table cell.
    for anchor in doc.cssselect('td a'):
        yearlist.append(urlparse.urljoin(urlstem, anchor.attrib['href']))

def childurlscraper():
    """Scrape the individual crash-page URLs from every year page in ``yearlist``.

    Appends one absolute URL per crash link to the module-level
    ``sourcepageurl`` list.
    """
    for yearurl in yearlist:
        html = scraperwiki.scrape(yearurl)
        root = lxml.html.fromstring(html)
        # Resolve each crash link against the year page it appeared on.
        # (The original mutated the loop variable `url`, so every urljoin
        # used the *previous* result as its base, and it depended on a
        # hard-coded 34-character slice of the URL to work at all.)
        for href in root.cssselect('td a'):
            sourcepageurl.append(urlparse.urljoin(yearurl, href.attrib['href']))

def sourcepagescraper(): 
    """Function scrapes respective data for each accident and placed it into DB"""
    for url in sourcepageurl:
        try: 
            html = scraperwiki.scrape(url)
            root = lxml.html.fromstring(html)
            for tr in root.cssselect("body"):
                tds = tr.cssselect("td")
                location = coorlookup(tds[7].text_content())
                for td in tds:
                    crashinfo = {}
                    crashinfo['url'] = url
                    crashinfo['date'] = dateutil.parser.parse(tds[3].text_content()).date()
                    crashinfo['time'] = tds[5].text_content()
                    crashinfo['location'] = tds[7].text_content()
                    crashinfo['latitude'] = location[1][0]
                    crashinfo['longitude'] = location[1][1]
                    crashinfo['operator'] = tds[9].text_content()
                    crashinfo['flight no'] = tds[11].text_content()
                    crashinfo['route'] = tds[13].text_content()
                    crashinfo['aircraft type'] = tds[15].text_content()
                    crashinfo['registration'] = tds[17].text_content()
                    crashinfo['cn ln'] = tds[19].text_content()
                    crashinfo['aboard'] = tds[21].text_content()
                    crashinfo['fatalities'] = tds[23].text_content()
                    crashinfo['ground'] = tds[25].text_content()
                    crashinfo['summary'] = tds[27].text_content()

                scraperwiki.sqlite.save(unique_keys=['url'], data=crashinfo)
        except urllib2.HTTPError, err:
            if err.code == 404:
                continue

def coorlookup(location):
    """Geocode a free-text crash location string via Google.

    Returns the geopy result (a ``(place, (lat, lon))`` tuple when
    ``exactly_one=True``); on any geocoding failure returns
    ``("", ("", ""))`` so callers always receive the same shape.
    """
    geocoder = geocoders.Google()
    try:
        return geocoder.geocode(location, exactly_one=True)
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt are
        # no longer swallowed; geocoder errors (unparseable location, quota,
        # network) still fall back to an empty placeholder.
        return ("", ("", ""))

# Guard the driver so importing this module (e.g. to reuse coorlookup)
# does not kick off a full scrape as a side effect.
if __name__ == '__main__':
    # The three stages must run in order: years -> crash pages -> records.
    parenturlscraper()
    childurlscraper()
    sourcepagescraper()