I have written code that collects all URLs on a webpage and puts them in a set, and I would like tips on simple changes I can make to improve its performance.
# Collect same-host page URLs from html_doc into urls_set, normalizing each
# href (resolve relative paths, strip query/fragment/trailing slash) and
# skipping malformed links and excluded file extensions.
#
# Name the parser explicitly: the no-argument form emits
# GuessedAtParserWarning and picks whichever parser happens to be installed.
soup = BeautifulSoup(html_doc, 'html.parser')

# href=True filters out <a> tags with no href attribute during matching,
# removing the per-link None check from the Python loop.
for link in soup.find_all('a', href=True):
    url = link['href']

    # Skip malformed hrefs containing spaces or stray angle brackets.
    if ' ' in url or '<' in url or '>' in url:
        continue

    if url.startswith('//'):
        # Protocol-relative URL: prepend a scheme. NOTE: the previous
        # str.replace('//', 'http://') rewrote EVERY '//', corrupting any
        # URL whose path also contained '//'.
        url = 'http:' + url
    elif url.startswith('/'):
        # Host-relative path: resolve against the current host.
        # (elif: a '//' URL also starts with '/', but was already handled.)
        url = hostname + url

    # Only same-host URLs are kept, so reject foreign hosts before doing
    # any further trimming work. Trimming below only ever shortens the
    # suffix, so it cannot change the outcome of this prefix test.
    if not url.startswith(hostname):
        continue

    # Drop the query string and fragment. partition scans once and is a
    # no-op when the separator is absent, replacing the 'in' + split pair.
    url = url.partition('?')[0].partition('#')[0]

    if url.endswith('/'):
        url = url[:-1]

    # excluded_extensions is expected to be a tuple — endswith accepts a
    # tuple of suffixes in a single call.
    if url.endswith(excluded_extensions):
        continue

    urls_set.add(url)