I recently wrote a testing tool (called plink) for retrieving all the links from a website (and then retrieving links from the linked pages, and so on). Essentially, the tool is designed to be pointed at a site, and it will give you a summary of any broken links it finds.
It includes whitelisting/blacklisting and allows you to specify the depth of recursion.
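For reference, a typical invocation looks something like this (the domain is just a placeholder; the flags are defined in __main__.py below):

python -m plink https://example.com --depth 2 --blacklist twitter.com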
The project also contains unit tests, but I feel those are out of scope for this review.
I'm fairly new to the Python ecosystem, having only written small single-file scripts in the past. I'm a .NET developer by day, and am trying to branch out a little. All advice is welcome! Thanks :)
Here's a directory tree if you need it:
src
└── plink
    ├── __init__.py
    ├── __main__.py
    ├── analyser.py
    ├── options.py
    └── result.py
tests
└── [tests_are_here]
__main__.py includes the entry point, "splash screen" and argument parsing:
import argparse
import pkg_resources
from plink.analyser import Analyser
from plink.options import Options
from termcolor import colored
def print_splash_screen(verbose):
logo = '''
_ _ _
| (_) | |
_ __ | |_ _ __ | | __
| '_ \| | | '_ \| |/ /
| |_) | | | | | | <
| .__/|_|_|_| |_|_|\_\
| |
|_|
'''
print(colored(logo, "magenta"))
if verbose:
print(colored("plink v" + pkg_resources.get_distribution('plink-url').version, "cyan"))
def initialise(arguments):
    return Options(
        whitelist=arguments.whitelist,
        blacklist=arguments.blacklist,
        depth=arguments.depth,
        start_url=arguments.start_url,
        verbose=arguments.verbose,
    )
def main():
    # Retrieve, validate and parse arguments from the CLI
parser = argparse.ArgumentParser()
parser.add_argument("start_url", type=str, help="Initial URL to analyse")
parser.add_argument("--whitelist", "-w", help="Domains to analyse. Can only be provided if blacklist is missing", nargs="*")
parser.add_argument("--blacklist", "-b", help="Domains to skip, Can only be provided if whitelist is missing", nargs="*")
parser.add_argument("--depth", "-d", type=int, help="Number of analysis cycles", const=3, default=3, nargs="?")
parser.add_argument("--verbose", "-v", action="store_true", help="Give more output (including potential errors, and a list of individual URL scans)")
args = parser.parse_args()
    # Whitelist and blacklist are mutually exclusive
    if args.whitelist is not None and args.blacklist is not None:
        parser.error("Provide either a whitelist or a blacklist, not both")
print_splash_screen(args.verbose)
options = initialise(args)
    analyser = Analyser(options)
    analyser.analyse()
if __name__ == '__main__':
main()
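One thing I wasn't sure about: argparse has built-in support for mutually exclusive arguments, so I suspect the manual check above could be replaced with something like this sketch:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("start_url", type=str, help="Initial URL to analyse")
# argparse itself rejects --whitelist and --blacklist appearing together
group = parser.add_mutually_exclusive_group()
group.add_argument("--whitelist", "-w", nargs="*", help="Domains to analyse")
group.add_argument("--blacklist", "-b", nargs="*", help="Domains to skip")
args = parser.parse_args()

That would also give a standard argparse usage message instead of my hand-written error, but I don't know if it's considered better style.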
options.py holds a dataclass with the program's configuration options:
from dataclasses import dataclass, field
@dataclass
class Options:
whitelist: list[str] = field(default_factory=list)
blacklist: list[str] = field(default_factory=list)
depth: int = 3
start_url: str = ""
verbose: bool = False
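One wrinkle I noticed while writing this up: when the flags are omitted, argparse hands None through initialise(), so whitelist and blacklist can actually be None at runtime despite these hints (and analyse() relies on that with its is not None checks). A sketch of hints that match the real usage, assuming None should mean "not provided":

from dataclasses import dataclass
from typing import Optional

@dataclass
class Options:
    # None means the flag was not provided, which analyse() treats
    # differently from an empty list
    whitelist: Optional[list[str]] = None
    blacklist: Optional[list[str]] = None
    depth: int = 3
    start_url: str = ""
    verbose: bool = False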
analyser.py is where the main functionality is housed at the moment:
import requests
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from plink.result import Result
from termcolor import colored
class Analyser:
def __init__(self, options):
self.options = options
    def retrieve_content_by_url(self, url):
        # Time out rather than hanging forever, and raise on HTTP errors
        # (4xx/5xx) so that broken links are counted as failures
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text
    def retrieve_links_from_html(self, html, base_url):
        soup = BeautifulSoup(html, features="html.parser")
        # href=True skips anchors without an href attribute, which would
        # otherwise raise a KeyError in the comprehension below
        link_tags = soup.find_all('a', href=True)
        links = [tag['href'] for tag in link_tags]
        absolute_links = [urljoin(base_url, link) for link in links]
        return absolute_links
def analyse_url(self, url, depth):
try:
content = self.retrieve_content_by_url(url)
links = self.retrieve_links_from_html(content, url)
if self.options.verbose:
print(colored((depth * " ") + "Success: " + url, "green"))
return Result(links=links, status="Success")
except Exception as ex:
if self.options.verbose:
print(colored((depth * " ") + str(ex), "red"))
print(colored((depth * " ") + "Fail: " + url, "red"))
return Result(status="Fail")
    def find_domain_from_url(self, url):
        domain = urlparse(url).netloc
        # str.removeprefix needs Python 3.9+, the same baseline as the list[str] hints
        return domain.removeprefix("www.")
def compare_domains_from_urls(self, url1, url2):
formatted_url1 = self.find_domain_from_url(url1)
formatted_url2 = self.find_domain_from_url(url2)
return formatted_url1.lower() == formatted_url2.lower()
    def check_whitelist(self, url):
        # True if the URL's domain matches any whitelisted domain
        return any(self.compare_domains_from_urls(url, w) for w in self.options.whitelist)

    def check_blacklist(self, url):
        # True if the URL's domain matches none of the blacklisted domains
        return not any(self.compare_domains_from_urls(url, b) for b in self.options.blacklist)
    def print_summary(self, urls_analysed):
        number_of_successful_urls = sum(1 for _, status in urls_analysed if status == "Success")
        number_of_failed_urls = len(urls_analysed) - number_of_successful_urls
        print(colored(f"{number_of_successful_urls} successful URLs", "green"))
        print(colored(f"{number_of_failed_urls} failed URLs", "red"))
    def analyse(self):
        use_whitelist = self.options.whitelist is not None
        use_blacklist = self.options.blacklist is not None
        analysed_urls = []
        urls_to_analyse = [self.options.start_url]
        # Each "step" is the next level of depth
        for depth in range(self.options.depth):
            # Snapshot the queue, so urls_to_analyse can be mutated while iterating
            urls_in_step = urls_to_analyse[:]
            for url in urls_in_step:
                # Skip anything that has already been analysed
                if url not in [url_result[0] for url_result in analysed_urls]:
                    # Analyse the URL if it passes the whitelist/blacklist
                    # (or if neither list is in use)
                    if (use_whitelist and self.check_whitelist(url)) \
                            or (use_blacklist and self.check_blacklist(url)) \
                            or (not use_blacklist and not use_whitelist):
                        result = self.analyse_url(url, depth)
                        # Move the URL from the queue to the analysed list
                        analysed_urls.append((url, result.status))
                        urls_to_analyse.remove(url)
                        # Queue any newly discovered links for the next step
                        for url_to_analyse in result.links:
                            if url_to_analyse not in urls_to_analyse:
                                urls_to_analyse.append(url_to_analyse)
        self.print_summary(analysed_urls)
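While writing this up I also wondered whether analyse() would read better as a conventional breadth-first traversal with a queue and a visited set, which would avoid the linear scans over analysed_urls and urls_to_analyse. A rough sketch of what I mean, with the whitelist/blacklist filtering omitted (analyse_bfs is just an illustrative name, not part of the project):

from collections import deque

def analyse_bfs(analyser, start_url, max_depth):
    # Queue of (url, depth) pairs still waiting to be fetched
    queue = deque([(start_url, 0)])
    # Every URL ever queued, so nothing is fetched or queued twice
    seen = {start_url}
    results = []
    while queue:
        url, depth = queue.popleft()
        result = analyser.analyse_url(url, depth)
        results.append((url, result.status))
        # Only queue children that would still be within the depth limit
        if depth + 1 < max_depth:
            for link in result.links:
                if link not in seen:
                    seen.add(link)
                    queue.append((link, depth + 1))
    return results

I'd be interested to hear whether that structure is actually preferable here.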
result.py holds a dataclass with the results of a single URL scan:
from dataclasses import dataclass, field
@dataclass
class Result:
links: list[str] = field(default_factory=list)
status: str = ""
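Finally, I wasn't sure about using plain "Success"/"Fail" strings for status, since print_summary compares against them as magic values. An Enum sketch I considered (the FAIL default is an assumption on my part; the current code defaults to an empty string):

from dataclasses import dataclass, field
from enum import Enum

class Status(Enum):
    SUCCESS = "Success"
    FAIL = "Fail"

@dataclass
class Result:
    links: list[str] = field(default_factory=list)
    # Defaulting to FAIL is a design choice, not what the code above does
    status: Status = Status.FAIL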
Thanks!