I have written a web scraping script that uses Selenium to crawl blog content from a list of URLs. The script processes the URLs in batches of 1,000 and uses multithreading (ThreadPoolExecutor) to improve throughput. It also handles graceful termination with signal handlers so that progress is saved if the run is interrupted.
Key Features of the Code:
- Headless Chrome Driver: Runs Chrome without a visible browser window to cut rendering overhead.
- Blocking Media Files: Uses the Chrome DevTools Protocol to block images and video so unnecessary resources are never downloaded.
- Multithreading: Processes multiple URLs simultaneously to reduce execution time.
- Progress Saving: Saves intermediate results to a CSV file during execution and before exiting.
- Error Handling and Logging: Captures errors and logs details for debugging.
Issue:
Despite these optimizations, the execution time is still slower than expected when processing a large number of URLs. Each URL takes several seconds to fetch content, which adds up significantly for thousands of URLs.
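To put that in perspective (assuming roughly 5 seconds per URL, which matches what I observe): with 8 worker threads, 10,000 URLs works out to about 10,000 × 5 s / 8 ≈ 6,250 s, i.e. roughly 1 hour 45 minutes, and that is before counting the Chrome startup cost that is currently paid for every single URL.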
Questions:
- How can I further reduce execution time for this multi-page crawling script?
- Are there any specific optimizations I can apply to improve Selenium's performance, especially when handling iframes and dynamic content? (One direction I am considering is sketched after the script below.)
```python
import multiprocessing
from concurrent.futures import ThreadPoolExecutor, as_completed
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
import logging
import random
import signal
import sys

# logging setup
logging.basicConfig(filename='crawler.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# headless Chrome options shared by every driver instance
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

def start_driver():
    driver = webdriver.Chrome(options=chrome_options)
    driver.execute_cdp_cmd('Network.enable', {})
    try:
        # block heavy media downloads through the Chrome DevTools Protocol
        driver.execute_cdp_cmd('Network.setBlockedURLs', {
            "urls": ["*.png", "*.jpg", "*.jpeg", "*.gif", "*.webp", "*.mp4", "*.avi", "*.mkv", "*.mov"]
        })
    except Exception as e:
        logging.error(f"Error setting blocked URLs: {e}")
    return driver

# scraping
def crawl_blog_content(url):
    driver = start_driver()
    try:
        driver.get(url)
        # wait for the blog iframe, then switch into it
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "iframe"))
        )
        iframe = driver.find_element(By.CSS_SELECTOR, "iframe")
        driver.switch_to.frame(iframe)
        # wait for the main content container inside the iframe
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div.se-main-container"))
        )
        content = driver.find_element(By.CSS_SELECTOR, "div.se-main-container").text
        time.sleep(random.uniform(1, 5))  # random delay so requests are not fired too aggressively
        return content
    except Exception as e:
        logging.error(f"Error while fetching content from {url}: {e}")
        return None
    finally:
        driver.quit()

# thread
def process_urls(urls):
    results = []
    with ThreadPoolExecutor(max_workers=8) as executor:
        future_to_url = {executor.submit(crawl_blog_content, url): url for url in urls}
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                content = future.result()
                if content:
                    results.append((url, content))
                    logging.info(f"Successfully crawled: {url}")
            except Exception as exc:
                logging.error(f"Error fetching {url}: {exc}")
    return results

# result
global_results = []
output_file = 'contents_202101.csv'

# temp save
def save_progress():
    if global_results:
        temp_df = pd.DataFrame(global_results, columns=['URL', 'Content'])
        temp_df.to_csv(output_file, index=False)
        logging.info(f"Progress saved with {len(global_results)} entries.")

# exit
def signal_handler(sig, frame):
    logging.info("Termination signal received. Saving progress...")
    save_progress()
    sys.exit(0)

signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)

if __name__ == "__main__":
    input_file = 'url_202101.csv'
    urls_df = pd.read_csv(input_file)
    urls = urls_df['URL'].tolist()
    batch_size = 1000

    # batch
    url_chunks = [urls[i:i + batch_size] for i in range(0, len(urls), batch_size)]

    for idx, chunk in enumerate(url_chunks):
        logging.info(f"Processing batch {idx + 1}/{len(url_chunks)}")
        results = process_urls(chunk)
        global_results.extend(results)
        save_progress()
        logging.info(f"Batch {idx + 1} saved with {len(global_results)} entries.")

    save_progress()
    logging.info(f"Final results saved to {output_file} with {len(global_results)} entries.")
```