I built a web scraper that pulls job listings from Facebook, and I want to break the code into functions I can reuse for other websites. The current structure works, but I think it would be cleaner and more efficient as functions; I'm getting stuck on how to divide them up. (It only pulls two pages right now, for testing.) The working script is below, followed by a rough sketch of the decomposition I'm considering.
from time import time, sleep
from requests import get
from random import randint
from IPython.core.display import clear_output
from warnings import warn
from bs4 import BeautifulSoup
import csv
# Range of only 2 pages, for testing
pages = [str(i) for i in range(1, 3)]

# Cities, URL-encoded ("%2C%20" is ", ")
cities = ["Menlo%20Park%2C%20CA",
          "Fremont%2C%20CA",
          "Los%20Angeles%2C%20CA",
          "Mountain%20View%2C%20CA",
          "Northridge%2C%20CA",
          "Redmond%2C%20WA",
          "San%20Francisco%2C%20CA",
          "Santa%20Clara%2C%20CA",
          "Seattle%2C%20WA",
          "Woodland%20Hills%2C%20CA"]

# Prepare to monitor the loop
start_time = time()
requests = 0

# Write the CSV header once
with open('facebook_job_list.csv', 'w', newline='') as f:
    header = csv.writer(f)
    header.writerow(["Website", "Title", "Location", "Job URL"])

for page in pages:
    for c in cities:
        # Request the HTML page
        response = get("https://www.facebook.com/careers/jobs/?page=" + page +
                       "&results_per_page=100&locations[0]=" + c)

        # Pause the loop for between 8 and 15 seconds
        sleep(randint(8, 15))

        # Monitor the frequency of requests
        requests += 1
        elapsed_time = time() - start_time
        print("Request: {}; Frequency: {} requests/s".format(requests, requests / elapsed_time))
        clear_output(wait=True)

        # Warn on non-200 status codes
        if response.status_code != 200:
            warn("Request: {}; Status code: {}".format(requests, response.status_code))

        # Break the loop if the number of requests is greater than expected
        if requests > 2:
            warn("Number of requests was greater than expected.")
            break

        # Parse the content of the response with BeautifulSoup
        page_soup = BeautifulSoup(response.text, 'html.parser')

        # Select all 100 job containers from a single page
        job_containers = page_soup.find_all("a", "_69jm")
        for container in job_containers:
            site = page_soup.find("title").text
            title = container.find("div", "_69jo").text
            location = container.find("div", "_1n-z _6hy- _21-h").text
            job_link = "https://www.facebook.com" + container.get("href")

            # Append one row per job
            with open('facebook_job_list.csv', 'a', newline='') as f:
                rows = csv.writer(f)
                rows.writerow([site, title, location, job_link])
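
Here is the rough direction I was thinking for the refactor. This is only a minimal sketch: build_url, fetch_page, parse_jobs, and write_rows are placeholder names I made up, the min_delay/max_delay parameters are my own addition, and I haven't tested it against the live site.

    from time import sleep
    from random import randint
    from urllib.parse import quote
    from warnings import warn
    import csv

    from requests import get
    from bs4 import BeautifulSoup


    def build_url(page, city):
        """Build the careers URL for one page/city pair, URL-encoding the city."""
        return ("https://www.facebook.com/careers/jobs/?page=" + page +
                "&results_per_page=100&locations[0]=" + quote(city))


    def fetch_page(url, min_delay=8, max_delay=15):
        """Fetch one page, sleeping a random interval to throttle requests."""
        response = get(url)
        sleep(randint(min_delay, max_delay))
        if response.status_code != 200:
            warn("Status code: {} for {}".format(response.status_code, url))
        return response


    def parse_jobs(html):
        """Yield (site, title, location, link) tuples from one page of HTML."""
        soup = BeautifulSoup(html, 'html.parser')
        site = soup.find("title").text
        for container in soup.find_all("a", "_69jm"):
            title = container.find("div", "_69jo").text
            location = container.find("div", "_1n-z _6hy- _21-h").text
            job_link = "https://www.facebook.com" + container.get("href")
            yield site, title, location, job_link


    def write_rows(path, rows, header=None):
        """Write rows to a CSV file; if a header is given, start a fresh file."""
        mode = 'w' if header else 'a'
        with open(path, mode, newline='') as f:
            writer = csv.writer(f)
            if header:
                writer.writerow(header)
            writer.writerows(rows)


    # Driver loop: plain city names, since build_url handles the encoding
    write_rows('facebook_job_list.csv', [],
               header=["Website", "Title", "Location", "Job URL"])
    for page in [str(i) for i in range(1, 3)]:
        for city in ["Menlo Park, CA", "Fremont, CA"]:
            response = fetch_page(build_url(page, city))
            write_rows('facebook_job_list.csv', parse_jobs(response.text))

I left the request counting and clear_output monitoring out of the sketch; I assume that would stay in the driver loop. Does this split make sense, or is there a better way to carve it up for reuse on other sites?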