
I am trying to build a web crawler to extract all the links on a webpage. I have created two Python files: scanner.py (which defines the class) and vulnerability-scanner.py (which creates the object and runs it). When I run the script, it keeps running and never stops. I am unable to find the error. Help me solve this.

Here is my source code:

scanner.py

import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama

class Scanner:

    colorama.init()

    def __init__(self, url):
        self.target_url = url
        self.target_links = []

    def is_valid(self, url):
        parsed = urlparse(url)
        return bool(parsed.netloc) and bool(parsed.scheme)

    def get_all_website_links(self, url):

        GREEN = colorama.Fore.GREEN
        WHITE = colorama.Fore.WHITE
        RESET = colorama.Fore.RESET

        urls = set()
        internal_urls = set()
        external_urls = set()
        domain_name = urlparse(url).netloc
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        for a_tag in soup.findAll("a"):
            href = a_tag.attrs.get("href")
            if href == "" or href is None:
                continue
            href = urljoin(url, href)
            parsed_href = urlparse(href)
            href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path

            if not self.is_valid(href):
                continue
            if href in internal_urls:
                continue
            if domain_name not in href:
                if href not in external_urls:
                    print(f"{WHITE}[*] External link: {href}{RESET}")
                    external_urls.add(href)
                continue
            print(f"{GREEN}[*] Internal link: {href}{RESET}")
            urls.add(href)
            internal_urls.add(href)
        return urls

    def crawl(self, url):
        href_links = self.get_all_website_links(url)
        for link in href_links:
            print(link)
            self.crawl(link)

vulnerability-scanner.py

import scanner

target_url = "https://hack.me/"
vul_scanner = scanner.Scanner(target_url)
vul_scanner.crawl(target_url)

1 Answer

The following part is (almost) an infinite recursion:

for link in href_links:
    print(link)
    self.crawl(link)

I believe you added this with the intention of crawling the links found on each page, but you didn't add a stopping condition. (Currently the recursion only stops when it reaches a page with no links at all.) Note also that internal_urls is a local variable inside get_all_website_links, so the crawler never remembers which pages it has already visited across calls and keeps re-crawling the same pages.

One possible stopping condition is to set a predefined maximum number of levels (a depth limit) to crawl.

Something like this in your __init__ function:

def __init__(self, url):
    self.target_url = url
    self.target_links = []
    # you could go a step further and take this as an argument to the constructor (i.e. the __init__ function)
    self.max_parse_levels = 5
    self.cur_parse_levels = 0
.
.
.

def crawl(self, url):
    # stop once the maximum crawl depth has been reached
    if self.cur_parse_levels >= self.max_parse_levels:
        return
    self.cur_parse_levels += 1
    href_links = self.get_all_website_links(url)
    for link in href_links:
        print(link)
        self.crawl(link)
    self.cur_parse_levels -= 1
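
Another stopping condition, instead of (or in addition to) the depth limit, is to make the crawler remember which pages it has already visited and skip them, so the recursion ends once every reachable page has been seen once. A minimal sketch, assuming you add a visited_urls set to the class (this attribute is not in your current code):

def __init__(self, url):
    self.target_url = url
    self.target_links = []
    self.visited_urls = set()  # hypothetical attribute: every URL crawled so far

def crawl(self, url):
    # skip pages we have already crawled, so the recursion eventually runs out of new links
    if url in self.visited_urls:
        return
    self.visited_urls.add(url)
    href_links = self.get_all_website_links(url)
    for link in href_links:
        print(link)
        self.crawl(link)

Unlike the depth limit, this approach still visits every internal page, but only once each, which is usually what you want for a link scanner.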