Scraping next page using BeautifulSoup

I have created a script for scraping news articles: for each article it extracts the title, subtitle, link (href), and time of publication. The scraped records are collected into a pandas DataFrame, and the link to the next page is returned as well, so the scraper can parse page after page.

Everything works as expected, though I feel there should be an easier, or more elegant, way of loading the subsequent pages within the main function; a rough sketch of one idea I had is included after the code.


import requests
import pandas as pd
from bs4 import BeautifulSoup
from time import sleep


def read_page(url):
    r = requests.get(url, timeout=10)  # avoid hanging indefinitely on a stalled request
    return BeautifulSoup(r.content, "lxml")


def news_scraper(soup):
    BASE = "https://www.pravda.com.ua"
    container = []
    for i in soup.select("div.news.news_all > div"):
        container.append(
            [
                i.a.text,  # title
                i.find(class_="article__subtitle").text,  # subtitle
                i.div.text,  # time
                BASE + i.a["href"],  # link
            ]
        )

    dataframe = pd.DataFrame(container, columns=["title", "subtitle", "time", "link"])
    dataframe["date"] = (
        dataframe["link"]
        .str.extract("(\d{4}/\d{2}/\d{2})")[0]
        .str.cat(dataframe["time"], sep=" ")
    )
    next_page = soup.select_one("div.archive-navigation > a.button.button_next")["href"]

    return dataframe.drop("time", axis=1), BASE + next_page


def main(start_url):
    print(start_url)

    results = []

    soup = read_page(start_url)
    df, next_page = news_scraper(soup)
    results.append(df)

    while next_page:
        print(next_page)

        try:
            soup = read_page(next_page)
            df, next_page = news_scraper(soup)
            results.append(df)
        except (TypeError, requests.RequestException):
            # TypeError: no "next" button on the last page;
            # RequestException: the request itself failed
            next_page = None

        sleep(1)

    return pd.concat(results, ignore_index=True)


if __name__ == "__main__":
    df = main("https://www.pravda.com.ua/archives/date_24122019/")
    assert df.shape == (120, 4)  # it's true as of today, 12.26.2019
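
For what it's worth, the idea I had was to pull the pagination into a generator so that main only consumes pages. This is a rough, untested sketch; it assumes news_scraper is changed to return only the DataFrame, with the next-page lookup moved into the generator:

def iter_pages(start_url):
    """Yield the soup of each archive page, following the "next" button."""
    base = "https://www.pravda.com.ua"
    url = start_url
    while url:
        soup = read_page(url)
        yield soup
        nav = soup.select_one("div.archive-navigation > a.button.button_next")
        url = base + nav["href"] if nav else None  # None ends the loop cleanly
        sleep(1)


def main(start_url):
    # here news_scraper would return just the DataFrame, not a (df, next_page) pair
    frames = [news_scraper(soup) for soup in iter_pages(start_url)]
    return pd.concat(frames, ignore_index=True)

I am not sure whether this is actually cleaner, which is partly why I am asking.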