Some quick suggestions:

The requests module can urlencode strings for you if you use the params keyword:

import requests

cities = ["Menlo Park, CA"]
pages = range(1, 3)
url = "https://www.facebook.com/careers/jobs/"

for city in cities:
    for page in pages:
        params = {"page": page, "results_per_page": 100, "locations[0]": city}
        response = requests.get(url, params=params)
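If you want to check what was actually sent, the final, encoded URL is available on the response object. Just a quick illustration (the exact encoding of the spaces and brackets may look slightly different):

print(response.url)
# something like:
# https://www.facebook.com/careers/jobs/?page=1&results_per_page=100&locations%5B0%5D=Menlo+Park%2C+CA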

Organize your code using functions. This allows you to give them a readable name (and even add docstrings).

def get_job_infos(response):
    """Parse the content of the request to get all job postings"""
    page_soup = BeautifulSoup(response.text, 'lxml')
    job_containers = page_soup.find_all("a", "_69jm")

    # Select all 100 jobs containers from a single page
    for container in job_containers:
        site = page_soup.find("title").text
        title = container.find("div", "_69jo").text
        location = container.find("div", "_1n-z _6hy- _21-h").text
        job_link = "https://www.facebook.com" + container.get("href")
        yield site, title, location, job_link

This is a generator over which you can iterate. Using the lxml parser is usually faster.
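For example, you can unpack the yielded tuples directly in a loop (just an illustration, assuming response is one of the responses fetched above):

for site, title, location, job_link in get_job_infos(response):
    print(f"{title} ({location}): {job_link}")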

Note that csv can write multiple rows at once using writer.writerows, which takes any iterable of rows:

with open('facebook_job_list.csv', 'a', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(get_job_infos(response))

This way you only have to open the file once per page, instead of a hundred times. Even better would be to make the whole thing a generator, so you can write all rows while opening the file only once:

def get_all_jobs(url, cities, pages):
    for city in cities:
        for page in pages:
            params = {"page": page, "results_per_page": 100, "locations[0]": city}
            response = requests.get(url, params=params)
            # check status code

            yield from get_job_infos(response)

            # rate throttling, etc here
            ...

if __name__ == "__main__":
    cities = ["Menlo Park, CA", ...]
    pages = range(1, 3)
    url = "https://www.facebook.com/careers/jobs/"

    with open('facebook_job_list.csv', "w", newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["Website", "Title", "Location", "Job URL"])
        writer.writerows(get_all_jobs(url, cities, pages))
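The two placeholder comments are worth filling in before running this for real. A minimal sketch of one way to do it, using raise_for_status and a fixed pause between requests (the delay parameter is only an illustration, not something your code needs verbatim):

import time

def get_all_jobs(url, cities, pages, delay=1.0):
    for city in cities:
        for page in pages:
            params = {"page": page, "results_per_page": 100, "locations[0]": city}
            response = requests.get(url, params=params)
            # Fail loudly on 4xx/5xx instead of trying to parse an error page
            response.raise_for_status()

            yield from get_job_infos(response)

            # Crude rate throttling: pause between requests
            time.sleep(delay)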
This way the get_all_jobs generator will yield jobs as it is being iterated over, getting the next page when needed.
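To see that laziness in action, itertools.islice can take just a handful of rows, and only the requests needed to produce them are made. A quick sketch, reusing url, cities and pages from the block above:

from itertools import islice

# Only as many pages are requested as are needed to yield five rows
first_five = list(islice(get_all_jobs(url, cities, pages), 5))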
