I've built a web crawler that starts at an origin URL and crawls the web from there, either breadth-first (BFS) or depth-first (DFS). Everything works correctly, but the performance is horrendous. I think the major cause is my use of synchronous requests: the crawler is built on BeautifulSoup and the Requests library, so nothing happens asynchronously.
I've tried AsyncIO and a couple of other ways of making this asynchronous, but they've given me a lot of trouble. Any advice on how to do that, or other recommendations for improving performance, would be much appreciated.
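For context, this is roughly the kind of concurrent fetching I'm hoping to end up with; a minimal sketch using concurrent.futures together with the Requests library I already depend on (the fetch and fetch_all helpers are illustrative and not part of my crawler):

from concurrent.futures import ThreadPoolExecutor, as_completed

import requests


def fetch(url):
    # Fetch a single page; return (url, html), or (url, None) on failure or timeout
    try:
        response = requests.get(url, timeout=3)
        return url, response.text
    except requests.RequestException:
        return url, None


def fetch_all(urls, max_workers=10):
    # Fetch a batch of URLs concurrently and map each URL to its HTML (or None)
    results = {}
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(fetch, url) for url in urls]
        for future in as_completed(futures):
            url, html = future.result()
            results[url] = html
    return results

The idea would be to gather a batch of links and fetch them together rather than calling requests.get one URL at a time, but I'm not sure how best to fit that into the crawler below.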
BFS Usage:
python3 Webcrawler.py [origin_url] BFS [#_nodes_to_crawl] 0 [keyword_to_find]
DFS Usage:
python3 Webcrawler.py [origin_url] DFS [#_nodes_to_crawl] [depth_limit] [keyword_to_find]
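For example, a breadth-first crawl of 50 pages looking for the keyword "python" (the URL and keyword here are only placeholders):

python3 Webcrawler.py https://example.com BFS 50 0 python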
Webcrawler.py
import urllib
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import requests
import collections
from Graph import Graph
from Node import Node
import sys
from time import gmtime, strftime
from timeout import timeout
from multiprocessing import Pool
from multiprocessing import Process
import json
import pdb

class WebCrawler:
    def __init__(self, originUrl, method, totalNodes, depthLimit=None, keyword=None):
        self.originUrl = originUrl
        self.method = method
        self.totalNodes = int(totalNodes)
        self.nodeCount = 0
        self.depthLimit = int(depthLimit) if depthLimit is not None else None
        self.currentDepth = 0
        self.keyword = keyword
        self.keywordUrls = []
        self.nodeUrlMap = {}
        self.nodesToVisit = []
        self.visitedUrls = set()
        self.graph = Graph()
        self.nodeIndex = 0
        self.storeCookie()
        originTitle = self.getTitle(originUrl)
        startNode = Node(originUrl, None, originTitle)
        self.crawl(startNode)

    def crawl(self, node):
        print("crawl(): " + strftime("%H:%M:%S", gmtime()))
        visited = node.url in self.visitedUrls
        if not visited:
            self.graph.addNode(node, self.nodeIndex)
            self.nodeIndex += 1
            self.nodeCount += 1
            self.visitedUrls.add(node.url)
        if node.sourceNodes:  # If this is not the starting node
            sourceNode = node.sourceNodes.pop()
            if sourceNode.index is not None and node.index is not None:
                self.graph.addEdge(sourceNode.index, node.index)  # Add an edge between sourceNode and node
        if not visited:
            soup = self.generateSoup(node.url)
            hasKeyword = self.checkForKeyword(soup, node.url)
            if hasKeyword:
                node.keyword = True
            links = self.findLinks(soup)
            links = self.validLinks(links)
            links = set(links)  # Remove duplicate links
            if links:
                if self.method == "BFS":
                    self.bfs(node, links)
                else:  # DFS
                    self.currentDepth += 1
                    if self.currentDepth >= self.depthLimit:  # If depth limit reached, getNextNode (up a level)
                        self.currentDepth = 0  # Reset currentDepth
                        self.getNextNode()
                    else:  # Otherwise, keep going deeper
                        self.dfs(node, links)
            else:  # No links present
                self.getNextNode()
        else:  # Avoid infinite loop
            self.getNextNode()

    def validLinks(self, links):
        print("validLinks(): " + strftime("%H:%M:%S", gmtime()))
        validLinks = []
        for link in links:
            # Only add links while there is still room
            if self.nodeCount + len(validLinks) <= self.totalNodes:
                if self.isValidUrl(link):
                    validLinks.append(link)
        return validLinks

    def isValidUrl(self, url):
        print("isValidUrl(): " + strftime("%H:%M:%S", gmtime()))
        extensionBlacklist = ["zip", "dmg", "msi", "tar", "exe", "sisx"]
        for x in extensionBlacklist:
            if x in url:
                return False
        if "http" not in url: return False
        parsed_url = urlparse(url)
        if not bool(parsed_url.scheme): return False
        try:
            self.testRequest(url)
        except Exception:  # Any failure or timeout means the URL is not crawlable
            return False
        return True

    @timeout(3)
    def testRequest(self, url):
        requests.get(url)

    def getNextNode(self):
        print("getNextNode(): " + strftime("%H:%M:%S", gmtime()))
        if self.nodesToVisit and not self.nodeLimitReached():
            # We use the same data structure to store urlsToVisit for BFS and DFS,
            # and pop elements off the same way. How the elements are added is
            # what's important.
            nextNode = self.nodesToVisit.pop()
            self.crawl(nextNode)
        else:  # Crawl is over
            self.printGraph()

    def printGraph(self):
        for node in self.graph.nodes:
            print("\nNode:")
            if node.title:
                print("Index: " + str(node.index))
                print("Title: " + node.title)
            print("URL: " + node.url)
            print("Keyword: " + str(node.keyword))
        if self.graph.edges:
            print("\nEdges:")
            edgeCount = 0
            for e in self.graph.edges:
                print("Source: " + str(e.source) + " Target: " + str(e.target))
        if self.keywordUrls:
            print("\nKeyword URLs:")
            for k in self.keywordUrls:
                print("URL: " + k)
        print("\nJSON:")
        print(self.jsonSerialize())

    def jsonSerialize(self):
        for n in self.graph.nodes:
            n.sourceNodes = []
        self.graph.edges = list(self.graph.edges)
        return json.dumps(self.graph, default=lambda o: o.__dict__)

    def storeCookie(self):
        # Store graph as cookie (do this one)
        pass

    def nodeLimitReached(self):
        return self.nodeCount >= self.totalNodes

    # Convert URL into soup
    def generateSoup(self, url):
        print("generateSoup(): " + strftime("%H:%M:%S", gmtime()))
        sourceCode = requests.get(url)
        plainText = sourceCode.text
        soup = BeautifulSoup(plainText, "html.parser")
        return soup

    # Parse soup to find links
    def findLinks(self, soup):
        print("findLinks(): " + strftime("%H:%M:%S", gmtime()))
        links = soup.findAll('a')
        hrefs = []
        for link in links:
            href = link.get('href', '')
            hrefs.append(href)
        return hrefs

    def getTitle(self, url):
        print("getTitle(): " + strftime("%H:%M:%S", gmtime()))
        soup = self.generateSoup(url)
        titles = soup.findAll('title')
        if titles:
            title = str(titles[0]).replace("<title>", "")
            title = title.replace("</title>", "")
            return title

    def bfs(self, currentNode, links):
        print("bfs(): " + strftime("%H:%M:%S", gmtime()))
        for link in links:
            # If url is not already visited, and nodesToVisit+nodeCount hasn't exceeded totalNodes
            if link not in self.visitedUrls and self.nodeCount + len(self.nodesToVisit) <= self.totalNodes:
                title = self.getTitle(link)
                # The Node constructor already records currentNode as a source
                newNode = Node(link, [currentNode], title)
                self.nodesToVisit.insert(0, newNode)
                self.nodeUrlMap[link] = newNode
            elif link in self.nodeUrlMap:  # Repeat URL, get existing node
                existingNode = self.nodeUrlMap[link]
                existingNode.sourceNodes.insert(0, currentNode)
                self.nodesToVisit.insert(0, existingNode)
        self.getNextNode()

    def dfs(self, currentNode, links):
        print("dfs(): " + strftime("%H:%M:%S", gmtime()))
        for link in links:
            if link not in self.visitedUrls:
                title = self.getTitle(link)
                # The Node constructor already records currentNode as a source
                newNode = Node(link, [currentNode], title)
                self.nodesToVisit.append(newNode)
                self.nodeUrlMap[link] = newNode  # Track the node so repeat URLs reuse it
            elif link in self.nodeUrlMap:  # Repeat URL, get existing node
                existingNode = self.nodeUrlMap[link]
                existingNode.sourceNodes.append(currentNode)
                self.nodesToVisit.append(existingNode)
        self.getNextNode()

    def checkForKeyword(self, soup, url):
        # If keyword found in soup, append url to keywordUrls
        if soup.body and soup.body.findAll(text=self.keyword):
            self.keywordUrls.append(url)
            return True
        return False


if __name__ == '__main__':
    webCrawler = WebCrawler(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5])
Graph.py
from Edge import Edge


class Graph:
    def __init__(self, nodes=None, edges=None):
        # Avoid mutable default arguments; they would be shared across instances
        self.nodes = nodes if nodes is not None else []
        self.edges = edges if edges is not None else set()

    def addNode(self, node, nodeIndex):
        node.index = nodeIndex
        self.nodes.append(node)

    def addEdge(self, sourceNodeIdx, targetNodeIdx):
        edge = Edge(sourceNodeIdx, targetNodeIdx)
        self.edges.add(edge)
Node.py
class Node:
    def __init__(self, url, sourceNodes, title, index=None):
        self.index = index
        self.url = url
        self.sourceNodes = sourceNodes
        self.title = title
        self.keyword = False
Edge.py
class Edge:
    def __init__(self, source, target):
        self.source = source
        self.target = target

    def __eq__(self, other):
        return self.source == other.source and self.target == other.target

    def __hash__(self):
        return hash((self.source, self.target))