[Update (2025)] It's been a while, so I'm not sure this is still useful, but I thought I'd add some of the evolution of my thinking on this. I basically ended up hard-coding the header column indices; it was just easier to parse that way.
def _build_tag_map(self, td):
    """Collect the text and link target of every anchor inside a cell.

    Args:
        td: Cell element exposing ``find_all('a')``.  Each anchor it
            returns must provide ``.text`` and a dict-style ``.get`` for
            attributes (presumably a BeautifulSoup ``Tag`` -- confirm
            against the caller).

    Returns:
        A ``(tag_list, tag_url_list)`` tuple of two parallel lists: the
        whitespace-stripped anchor texts and the matching ``href`` values.
        An anchor without an ``href`` attribute contributes ``''``.
    """
    tag_list = []
    tag_url_list = []
    for anchor in td.find_all('a'):
        tag_list.append(anchor.text.strip())
        # ``.get`` rather than ``anchor['href']``: an anchor lacking the
        # attribute would otherwise raise KeyError and abort the whole row.
        # Falling back to '' keeps both lists the same length and aligned.
        tag_url_list.append(anchor.get('href', ''))
    return tag_list, tag_url_list
def _parse_row(self, row):
    """Turn one table row into a dict keyed by ``self.col_headers``.

    The cells returned by ``row.find_all('td')`` are handled by their
    hard-coded position:

    * columns 0, 3, 5 and 6 are ignored;
    * columns 1, 2 and 7 yield two values each: the cell text (newlines
      removed, then stripped) and the ``href`` of the cell's first anchor;
    * column 4 yields two values: the lists returned by
      ``self._build_tag_map``;
    * column 9 yields the literal ``"view activity"`` and the ``href`` of
      the cell's first anchor;
    * any other column yields its stripped text.

    Args:
        row: Row element exposing ``find_all('td')`` (presumably a
            BeautifulSoup ``Tag`` -- confirm against the caller).

    Returns:
        A dict pairing ``self.col_headers`` positionally with the values
        collected above.  ``zip`` stops at the shorter of the two, so any
        surplus headers or values are silently dropped.
    """
    skip_cols = {0, 3, 5, 6}
    text_cols = {1, 2, 7}
    tag_col = 4
    view_activity_col = 9

    def first_href(cell):
        # '' both when the cell has no anchor and when the anchor carries
        # no href attribute (plain ``link['href']`` would raise KeyError).
        link = cell.find('a')
        return link.get('href', '') if link else ''

    parsed_data = []
    for ix, td in enumerate(row.find_all('td')):
        if ix in skip_cols:
            continue
        if ix in text_cols:
            parsed_data.append(td.text.replace('\n', '').strip())
            parsed_data.append(first_href(td))
        elif ix == tag_col:
            tags, tag_urls = self._build_tag_map(td)
            self.log.debug(f"{tags = }")
            self.log.debug(f"{tag_urls = }")
            parsed_data.append(tags)
            parsed_data.append(tag_urls)
        elif ix == view_activity_col:
            # The cell's own text is replaced by a fixed label; only the
            # link target is taken from the markup.
            parsed_data.append("view activity")
            parsed_data.append(first_href(td))
        else:
            parsed_data.append(td.text.strip())
    return dict(zip(self.col_headers, parsed_data))