I wanted to create a simple function that can read and return the HTML content from a specified URL. This is what reading here and there led me to:
from socket import timeout
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError
def get_html_content(url, max_attempt=3):
    """Fetch the body of *url*, retrying on transient failures.

    Args:
        url: The URL to fetch.
        max_attempt: How many attempts to make before giving up (default 3).

    Returns:
        The raw response body (bytes) on success, or the empty string ""
        if every attempt failed -- callers test for the "" sentinel.
    """
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    for attempt in range(1, max_attempt + 1):
        try:
            # Context manager guarantees the connection is closed even if
            # read() raises -- the original leaked the response object.
            with urlopen(req, timeout=10) as html_page:
                return html_page.read()
        except HTTPError as e:
            # HTTPError must be caught BEFORE URLError: it is a subclass,
            # so a plain isinstance/except chain in the other order would
            # report both messages for a single HTTP error.
            print("The server couldn't fulfill the request....attempt %d/%d"
                  % (attempt, max_attempt))
            print('Error code: ', e.code)
        except timeout:
            # socket.timeout can escape urlopen()/read() directly.
            print('timeout...attempt %d/%d' % (attempt, max_attempt))
        except URLError as e:
            print("We failed to reach a server....attempt %d/%d"
                  % (attempt, max_attempt))
            print('Reason: ', e.reason)
    # All attempts exhausted: return the "" sentinel the caller checks for.
    return ""
I would use this function to fetch and parse the content of many URLs. If content == "" (meaning every attempt failed), I would raise an exception -- after first writing to a file whatever I had already successfully gathered.