Python 3 URL Harvester
Here’s a basic Python 3 web crawler that collects URLs and follows each URL it collects to find even more URLs:
from collections import deque
from urllib.parse import urlparse, urljoin

import requests
from bs4 import BeautifulSoup
def get_urls(url, timeout=10):
    """Fetch *url* and return the set of absolute URLs it links to.

    Args:
        url: Page to download and scan for ``<a href="...">`` links.
        timeout: Seconds to wait for the HTTP response (default 10, so a
            hung server cannot stall the whole crawl indefinitely).

    Returns:
        A set of absolute URLs — relative hrefs are resolved against *url*.
        Empty on any network or HTTP error.
    """
    urls = set()
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx as failures instead of scraping links out of an
        # error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        for link in soup.find_all('a', href=True):
            # urljoin resolves relative hrefs (e.g. "/about") against the
            # page URL and leaves already-absolute hrefs untouched.
            urls.add(urljoin(url, link['href']))
    # Narrowed from bare `Exception`: only network/HTTP failures are
    # expected here; programming errors should not be silently swallowed.
    except requests.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
    return urls
def crawl(url, depth):
    """Breadth-first crawl starting at *url*, following links up to *depth* hops.

    Args:
        url: Starting URL (link distance 0).
        depth: Maximum link distance from the start page to visit.

    Returns:
        The set of URLs visited.
    """
    visited = set()
    # deque gives O(1) popleft(); the original list.pop(0) shifts every
    # remaining element and is O(n) per dequeue.
    urls_to_visit = deque([(url, 0)])
    while urls_to_visit:
        current_url, current_depth = urls_to_visit.popleft()
        if current_url in visited or current_depth > depth:
            continue
        print(f"Crawling {current_url}")
        visited.add(current_url)
        new_urls = get_urls(current_url)
        # Links found at the depth limit would only be discarded by the
        # check above; skip enqueueing them (and anything already visited)
        # to keep the queue small. The visited set is unchanged.
        if current_depth < depth:
            urls_to_visit.extend(
                (new_url, current_depth + 1)
                for new_url in new_urls
                if new_url not in visited
            )
    return visited
if __name__ == "__main__":
    # Interactive entry point: prompt for the crawl parameters, run the
    # crawl, then report every URL that was visited.
    start_url = input("Enter the starting URL: ")
    max_depth = int(input("Enter the maximum depth to crawl: "))
    print("Crawling...")
    results = crawl(start_url, max_depth)
    print("Visited URLs:")
    for result in results:
        print(result)
This script uses the requests library to fetch web pages and the BeautifulSoup library to parse HTML. Starting from a given URL, it follows the links found on each page breadth-first, up to a specified depth.
Make sure you have the requests
and beautifulsoup4
libraries installed. You can install them via pip:
pip install requests beautifulsoup4
Please note that this is a basic web crawler and may need further customization or optimization for specific use cases. Additionally, be mindful of the websites you crawl and ensure you comply with their robots.txt file and terms of service.