This is my first time web scraping, and here is the code I whipped up:
from bs4 import BeautifulSoup
import requests
import time
keywords = ['python']
for n in range(1000):
res = requests.get(f"https://stackoverflow.com/questions?tab=newest&page={n}")
time.sleep(3) # Sleep to avoid getting rate limited again
soup = BeautifulSoup(res.text, "html.parser")
questions = soup.select(".question-summary") # List of all question summaries in the current page
for que in questions:
found = False
tagged = False
q = que.select_one('.question-hyperlink').getText() # Store the title of the question
for a in que.find_all('a', href=True):
u = a['href'] # Store the link
if u.split('/')[1] == 'questions' and u.split('/')[2] != 'tagged': # If this link is a question and not a tag
res2 = requests.get("https://stackoverflow.com" + u) # Send request for that question
time.sleep(3) # Extra precaution to avoid getting rate limited again
soup2 = BeautifulSoup(res2.text, "html.parser")
body = str(soup2.select(".s-prose")) # This is the body of the question
if any(key in body for key in keywords):
found = True
if 'tagged/python' in u:
tagged = True
if found and not tagged:
print(q)
My code basically scrapes Stack Overflow posts newest first, and prints out all the posts that has the keyword "python" in its body, but no python tag. I want to know, did I implement the algorithm optimally? Can you show me where to improve?