I have written a script that scrapes articles: it extracts the title, subtitle, link (href), and time of publication. The scraped information is converted to a pandas DataFrame, and the link to the next page is returned as well, so that the scraper can move on page after page.
Everything works as expected, but I feel there should be an easier, or more elegant, way of loading each subsequent page within the main function; I sketch one alternative I was considering after the listing.
import requests
import pandas as pd
from bs4 import BeautifulSoup
from time import sleep

def read_page(url):
    r = requests.get(url)
    return BeautifulSoup(r.content, "lxml")

def news_scraper(soup):
    BASE = "https://www.pravda.com.ua"
    container = []
    for i in soup.select("div.news.news_all > div"):
        container.append(
            [
                i.a.text,  # title
                i.find(class_="article__subtitle").text,  # subtitle
                i.div.text,  # time
                BASE + i.a["href"],  # link
            ]
        )
    dataframe = pd.DataFrame(container, columns=["title", "subtitle", "time", "link"])
    # Build a full timestamp from the date embedded in the link and the time column.
    dataframe["date"] = (
        dataframe["link"]
        .str.extract(r"(\d{4}/\d{2}/\d{2})")[0]
        .str.cat(dataframe["time"], sep=" ")
    )
    # On the last page select_one returns None, so the subscript raises TypeError.
    next_page = soup.select_one("div.archive-navigation > a.button.button_next")["href"]
    return dataframe.drop("time", axis=1), BASE + next_page

def main(start_url):
    print(start_url)
    results = []
    soup = read_page(start_url)
    df, next_page = news_scraper(soup)
    results.append(df)
    while next_page:
        print(next_page)
        try:
            soup = read_page(next_page)
            df, next_page = news_scraper(soup)
            results.append(df)
        except (TypeError, requests.RequestException):
            # No "next" button on the last page, or the request failed: stop crawling.
            next_page = None
        sleep(1)
    return pd.concat(results, ignore_index=True)

if __name__ == "__main__":
    df = main("https://www.pravda.com.ua/archives/date_24122019/")
    assert df.shape == (120, 4)
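
One restructuring I was considering is to push the pagination into a generator, so that main reduces to a single concat call. The sketch below is untested and the scrape_pages name is made up; it reuses read_page and news_scraper unchanged and stops the same way, by catching the TypeError raised when the last page has no "next" button.

def scrape_pages(start_url):
    # Hypothetical generator: yields one DataFrame per archive page.
    url = start_url
    while url:
        print(url)
        try:
            df, url = news_scraper(read_page(url))
        except (TypeError, requests.RequestException):
            break  # no "next" button left, or the request failed
        yield df
        sleep(1)

def main(start_url):
    # pd.concat accepts any iterable of DataFrames, including a generator.
    return pd.concat(scrape_pages(start_url), ignore_index=True)

This would keep the fetch/parse logic in one place and remove the duplicated first-page block, but I am not sure it is actually clearer, which is why I am asking.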