I've been working on speeding up my web scraping with the asyncio library. I have a working solution, but I'm unsure how Pythonic it is, or whether I'm using the library properly. Any input would be appreciated.

import aiohttp
import asyncio
import requests
from lxml import etree


@asyncio.coroutine
def get(*args, **kwargs):
    """
    A wrapper method for aiohttp's get method. Taken from Georges Dubus' article at
    http://compiletoi.net/fast-scraping-in-python-with-asyncio.html
    """
    response = yield from aiohttp.request('GET', *args, **kwargs)
    return (yield from response.read_and_close())


@asyncio.coroutine
def extract_text(url):
    """
    Given the url for a chapter, extract the relevant text from it
    :param url: the url for the chapter to scrape
    :return: a string containing the chapter's text
    """
    sem = asyncio.Semaphore(5)
    with (yield from sem):
        page = yield from get(url)

    tree = etree.HTML(page)
    paragraphs = tree.findall('.//*/div[@class="entry-content"]/p')[1: -1]
    return b'\n'.join(etree.tostring(paragraph) for paragraph in paragraphs)


def generate_links():
    """
    Generate the links to each of the chapters
    :return: A list of strings containing every url to visit
    """
    start_url = 'https://twigserial.wordpress.com/'
    base_url = 'https://twigserial.wordpress.com/category/story/'
    tree = etree.HTML(requests.get(start_url).text)
    xpath = './/*/option[@class="level-2"]/text()'
    return [base_url + suffix.strip() for suffix in tree.xpath(xpath)]


@asyncio.coroutine
def run():
    links = generate_links()
    chapters = []

    for f in asyncio.as_completed([extract_text(link) for link in links]):
        result = yield from f
        chapters.append(result)

    return chapters


def main():
    loop = asyncio.get_event_loop()
    chapters = loop.run_until_complete(run())
    print(len(chapters))


if __name__ == '__main__':
    main()
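
For reference, here is a minimal sketch of the same fetch-with-a-semaphore pattern in the later async/await syntax (Python 3.5+). The ClientSession usage reflects my understanding of aiohttp's newer API, and the function names are illustrative only; note that in this sketch the semaphore is created once and shared, so the concurrency limit applies across all requests:

import asyncio

import aiohttp


async def fetch(session, url, sem):
    # The shared semaphore caps how many requests are in flight at once.
    async with sem:
        async with session.get(url) as response:
            return await response.read()


async def fetch_all(urls, limit=5):
    # One semaphore for every request, created once in the caller.
    sem = asyncio.Semaphore(limit)
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch(session, url, sem) for url in urls))


# Usage: pages = asyncio.run(fetch_all(generate_links()))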
