[Update (2025)] It's been a while, so I'm not sure how useful this still is, but I thought I'd add some of my evolution of thought on this. I basically ended up hard-coding the header column indices; it was just easier to parse.

    def _build_tag_map(self, td):
        '''Collect tag names and their hrefs from a single <td> cell.'''
        tag_list = []
        tag_url_list = []
        for tag in td.find_all('a'):
            text = tag.text.strip()
            tag_list.append(text)
            tag_url_list.append(tag['href'])
        return tag_list, tag_url_list


    def _parse_row(self, row):
        '''Parse one <tr> into a dict keyed by self.col_headers.'''
        parsed_data = []
        cells = row.find_all('td')
        # hard-coded column positions for this particular table layout
        skip_cols = [0, 3, 5, 6]     # columns with nothing worth keeping
        text_cols = [1, 2, 7]        # plain-text cells that may also carry a link
        tag_col = 4                  # cell holding the tag links
        view_activity_col = 9        # cell holding the "view activity" link

        for ix, td in enumerate(cells):
            if ix in skip_cols:
                continue

            if ix in text_cols:
                parsed_data.append(td.text.replace('\n', '').strip())
                link = td.find('a')
                parsed_data.append(link['href'] if link else '')

            elif ix == tag_col:
                tags, tag_urls = self._build_tag_map(td)
                self.log.debug(f"{tags = }")
                self.log.debug(f"{tag_urls = }")
                parsed_data.append(tags)
                parsed_data.append(tag_urls)

            elif ix == view_activity_col:
                parsed_data.append("view activity")
                link = td.find('a')
                parsed_data.append(link['href'] if link else '')

            else:
                parsed_data.append(td.text.strip())

        return dict(zip(self.col_headers, parsed_data))
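
One thing worth noting about the final dict(zip(...)) step: zip silently truncates to the shorter of its inputs, so a mismatch between col_headers and the parsed cells drops data without raising. A tiny standalone illustration (the header names here are placeholders of my own):

headers = ['title', 'title_url', 'author']
parsed_data = ['Dune', '/book/1']         # one value short

print(dict(zip(headers, parsed_data)))    # {'title': 'Dune', 'title_url': '/book/1'}
# 'author' is silently dropped; zip(headers, parsed_data, strict=True)
# (Python 3.10+) raises ValueError instead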

For reference, the earlier version of the parser:

import os
from pathlib import Path

from bs4 import BeautifulSoup, SoupStrainer


class ParseTools:
    def __init__(self):
        pass

    def is_empty_dir(self, dir_name: str) -> bool:
        return not any(os.scandir(str(dir_name)))

    def get_cache_files(self, cache_dir):
        '''Get the cached html file paths from cache_dir.'''
        if self.is_empty_dir(cache_dir):
            return []
        # Path is portable; it resolves to WindowsPath on Windows anyway
        files = [Path(cache_dir, file) for file in os.listdir(cache_dir)]
        return [entry for entry in files
                if entry.is_file() and entry.suffix == '.html']


class HTMLFileParser(ParseTools):
    def __init__(self, **args):
        ParseTools.__init__(self)
        self.__dict__.update(args)
        self.html_rows = []

    def start_file_parser(self):
        # self.cache_files comes in as part of the **args
        for file in self.cache_files:
            rows_on_page = self.get_rows_from_page_file(file)
            self.html_rows.extend(rows_on_page)

    def get_rows_from_page_file(self, file):
        # SoupStrainer keeps only the <tr> elements, which keeps the parse cheap
        strainer = SoupStrainer("tr")
        with open(file, 'r', encoding='utf-8') as f:
            strained_soup = BeautifulSoup(f.read(), features="html.parser", parse_only=strainer)
        # find_all('tr') returns just the row tags, without the stray
        # whitespace strings the raw soup would also yield when iterated
        return strained_soup.find_all('tr')

    def rebuild_row(self, row):
        new_row = []
        # .select('td') skips <th> header cells; cells that only hold
        # star-rating divs or images are skipped as well
        for col in row.select('td'):
            if col.select('div.stars, img'):
                continue
            if col.a:
                new_row.append(self.handle_links(col))
            elif not col.text or not col.text.strip():
                new_row.append(['NaN'])
            else:
                new_row.append(self.clean_tag_text(col))
        return new_row

    def handle_links(self, col):
        '''Return a list of (name, url) tuples, one per <a> in the cell.'''
        links = col.find_all('a', href=True)
        return [(self.clean_tag_text(tag), self.complete_url(tag['href'])) for tag in links]

    def complete_url(self, href_frag):
        return self.shelf_url + '/' + href_frag

    def clean_tag_text(self, tag):
        return tag.text.strip()
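
A minimal sketch of how these classes might be driven, assuming the definitions above are in scope; the cache directory and shelf URL are placeholders of my own, not from the original scraper:

tools = ParseTools()
cache_files = tools.get_cache_files('cache')   # directory of saved pages

parser = HTMLFileParser(cache_files=cache_files,
                        shelf_url='https://www.example.com/shelf')
parser.start_file_parser()

# each cached <tr> becomes a list of cleaned strings / (name, url) tuples
table = [parser.rebuild_row(row) for row in parser.html_rows]
print(f'parsed {len(table)} rows')

For real-world URLs, urllib.parse.urljoin would handle stray slashes more robustly than plain string concatenation, but the concatenation in complete_url matches the original.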
