Skip to main content
edited tags
Link
200_success
  • 145.6k
  • 22
  • 191
  • 481
Source Link

Design Pattern: Builder - BeautifulSoup directory navigation and scraping

I wrote a class on top of BeautifulSoup, using the builder design pattern, that allows navigation of the NCEP data directory.

There are a couple of navigation methods: navto, which simply builds upon the base URL to return a new instance, and inav, which is useful when the URLs are likely to change due to temporal updates.

import os
import re
import time
from urllib.parse import urlsplit, urlunsplit

import pandas as pd
import requests
from bs4 import BeautifulSoup

# for context TSRAGR is TAF code for a thunderstorm with hail

class TSragr:
    """Navigate an HTML directory listing and download the files it links to.

    Each instance represents one directory URL together with the text of the
    anchors found on that page, held in a pandas ``Series``.  The navigation
    methods (``navto``, ``navup``, ``inav``) fetch a new page and return a new
    instance; slicing with ``[]`` narrows the listing without re-fetching.
    """

    # Seconds passed to ``requests.get`` so a stalled server cannot hang the
    # caller indefinitely.  Override on a subclass or instance if needed.
    timeout: float = 60

    def __init__(self, base_url: str = None) -> None:
        """Fetch *base_url* and record the text of every anchor on the page.

        Args:
            base_url: URL of the directory listing.  Although the signature
                defaults to ``None``, a real URL is required for the request
                to succeed.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        self._baseurl = base_url

        response = requests.get(base_url, timeout=self.timeout)
        response.raise_for_status()

        anchors = BeautifulSoup(response.content, "lxml").find_all("a")

        # Drop the leading "Parent Directory" link when present.  The emptiness
        # check avoids an IndexError on a page that contains no anchors.
        if anchors and anchors[0].text == "Parent Directory":
            anchors = anchors[1:]

        # Explicit object dtype keeps an empty listing well-defined.
        self._soup = pd.Series([a.text for a in anchors], dtype=object)

    def __repr__(self) -> str:
        """Return the directory URL followed by the listing's own repr."""
        return f"{self.url}\n" + repr(self._soup)

    def __getitem__(self, args) -> "TSragr":
        """Return a new instance whose listing is narrowed to ``args``.

        The selection is forwarded to the underlying ``Series``.  The receiver
        is left untouched and no network request is made.  A key that selects
        a single entry still yields a one-element listing so the result keeps
        working with ``download`` and ``inav``.
        """
        selected = self._soup[args]
        if not isinstance(selected, pd.Series):
            # A scalar key returned a bare value; keep it under its own label.
            selected = pd.Series([selected], index=[args], dtype=object)

        # Bypass __init__ so that narrowing does not trigger another fetch.
        clone = type(self).__new__(type(self))
        clone._baseurl = self._baseurl
        clone._soup = selected
        return clone

    @property
    def url(self) -> str:
        """The directory URL, always terminated by a single appended ``/``."""
        url = self._baseurl
        if not url.endswith("/"):
            url = url + "/"
        return url

    def _child_url(self, *args: str) -> str:
        """Join path segments onto ``url`` without producing doubled slashes."""
        return self.url + "/".join(part.strip("/") for part in args)

    def _parent_url(self) -> str:
        """Return the URL one directory above ``url``.

        Only the path component is shortened, so the scheme and host are
        preserved; the site root is its own parent.
        """
        parts = urlsplit(self.url)
        trimmed = parts.path.rstrip("/")
        parent_path = trimmed.rsplit("/", 1)[0] + "/"
        return urlunsplit(parts._replace(path=parent_path))

    @staticmethod
    def _local_path(save_to: str, filename: str) -> str:
        """Build the destination path for *filename* inside *save_to*."""
        return os.path.join(save_to, filename)

    def navto(self, *args: str) -> "TSragr":
        """Fetch and return the sub-directory reached by the given segments."""
        return TSragr(self._child_url(*args))

    def navup(self) -> "TSragr":
        """Fetch and return the parent directory of the current one."""
        return TSragr(self._parent_url())

    def inav(self, index: int) -> "TSragr":
        """Fetch and return the entry stored under *index* in the listing.

        The lookup goes through ``Series[...]``, so *index* refers to the
        labels printed by ``repr`` (which stay non-contiguous after slicing).
        """
        return TSragr(self.url + self._soup[index])

    def download(self, save_to: str = "./", wait: float = 10) -> None:
        """Download every entry of the current listing into *save_to*.

        Args:
            save_to: Existing directory that receives the files; a trailing
                separator is optional.
            wait: Pause after each file, in minutes (the value is multiplied
                by 60 before being handed to ``time.sleep``).

        Raises:
            requests.HTTPError: if any file request returns an error status.
        """
        for filename in self._soup:
            url = self.url + filename
            # Message text (including its spelling) is kept unchanged so the
            # output matches earlier runs.
            print("DOWNLAODING FILE")
            local_filename = self._local_path(save_to, filename)
            with requests.get(url, stream=True, timeout=self.timeout) as r:
                r.raise_for_status()
                with open(local_filename, "wb") as f:
                    # Stream in chunks so large files are never held in memory.
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)

            print("FILE SAVED")
            time.sleep(60 * wait)


usage

>>> from wxlab.scrape import TSragr
>>> ragr = TSragr("https://nomads.ncep.noaa.gov/pub/data")
>>> ragr
https://nomads.ncep.noaa.gov/pub/data/
0     DSRC
1    nccf/
dtype: object
>>> ragr.navto("nccf")
https://nomads.ncep.noaa.gov/pub/data/nccf/
0            charts/
1               com/
2              dcom/
3    nonoperational/
4              pcom/
5             radar/
dtype: object
>>> ragr.navto("nccf","com")
https://nomads.ncep.noaa.gov/pub/data/nccf/com/
0               557ww/
1     amsu_estimation/
2                 aqm/
3                arch/
4               blend/
            ...       
61                uvi/
62               wave/
63        wave_nfcens/
64                wfs/
65          wsa_enlil/
Length: 66, dtype: object
>>> ragr.navto("nccf","com","blend")
https://nomads.ncep.noaa.gov/pub/data/nccf/com/blend/
0    prod/
1    v4.0/
dtype: object
>>> ragr.navto("nccf","com","blend","prod")
https://nomads.ncep.noaa.gov/pub/data/nccf/com/blend/prod/
0    blend.20220604/
1    blend.20220605/
dtype: object
>>> ragr.navto("nccf","com","blend","prod").inav(0)
https://nomads.ncep.noaa.gov/pub/data/nccf/com/blend/prod/blend.20220604/
0     00/
1     01/
2     02/
3     03/
4     04/
5     05/
6     06/
7     07/
8     08/
9     09/
10    10/
11    11/
12    12/
13    13/
14    14/
15    15/
16    16/
17    17/
18    18/
19    19/
20    20/
21    21/
22    22/
23    23/
dtype: object
>>> ragr.navto("nccf","com","blend","prod").inav(0).inav(0)
https://nomads.ncep.noaa.gov/pub/data/nccf/com/blend/prod/blend.20220604/00/
0    core/
1     qmd/
2    text/
dtype: object
>>> ragr.navto("nccf","com","blend","prod").inav(0).inav(0).navto("core")
https://nomads.ncep.noaa.gov/pub/data/nccf/com/blend/prod/blend.20220604/00/core/
0           blend.t00z.core.f001.ak.grib2
1       blend.t00z.core.f001.ak.grib2.idx
2           blend.t00z.core.f001.co.grib2
3       blend.t00z.core.f001.co.grib2.idx
4           blend.t00z.core.f001.gu.grib2
                      ...                
1148        blend.t00z.core.f264.oc.grib2
1149    blend.t00z.core.f264.oc.grib2.idx
1150        blend.t00z.core.f264.pr.grib2
1151    blend.t00z.core.f264.pr.grib2.idx
1152                                 ls-l
Length: 1153, dtype: object
>>> ragr.navto("nccf","com","blend","prod").inav(0).inav(0).navto("core")[0:6:2]
https://nomads.ncep.noaa.gov/pub/data/nccf/com/blend/prod/blend.20220604/00/core/
0    blend.t00z.core.f001.ak.grib2
2    blend.t00z.core.f001.co.grib2
4    blend.t00z.core.f001.gu.grib2
dtype: object
>>> ragr.navto("nccf","com","blend","prod").inav(0).inav(0).navto("core")[0:6:2].download(save_to="/media/external/data/", wait=1)
DOWNLAODING FILE
FILE SAVED
DOWNLAODING FILE
FILE SAVED