Python 3 URL Harvester
Here’s a basic Python 3 web crawler that collects URLs and follows each URL it collects to find even more URLs:
from collections import deque
from urllib.parse import urlparse, urljoin

import requests
from bs4 import BeautifulSoup
def get_urls(url, timeout=10):
    """Fetch *url* and return the set of absolute URLs it links to.

    Args:
        url: Page to download and scan for ``<a href="...">`` links.
        timeout: Seconds to wait for the HTTP response (default 10, so a
            hung server cannot stall the whole crawl indefinitely).

    Returns:
        A set of absolute URLs — relative hrefs are resolved against *url*.
        Empty on any network or HTTP error.
    """
    urls = set()
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx as failures instead of scraping links out of an
        # error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        for link in soup.find_all('a', href=True):
            # urljoin resolves relative hrefs (e.g. "/about") against the
            # page URL and leaves already-absolute hrefs untouched.
            urls.add(urljoin(url, link['href']))
    # Narrowed from bare `Exception`: only network/HTTP failures are
    # expected here; programming errors should not be silently swallowed.
    except requests.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
    return urls
def crawl(url, depth):
    """Breadth-first crawl starting at *url*, following links up to *depth* hops.

    Args:
        url: Starting URL (link distance 0).
        depth: Maximum link distance from the start page to visit.

    Returns:
        The set of URLs visited.
    """
    visited = set()
    # deque gives O(1) popleft(); the original list.pop(0) shifts every
    # remaining element and is O(n) per dequeue.
    urls_to_visit = deque([(url, 0)])
    while urls_to_visit:
        current_url, current_depth = urls_to_visit.popleft()
        if current_url in visited or current_depth > depth:
            continue
        print(f"Crawling {current_url}")
        visited.add(current_url)
        new_urls = get_urls(current_url)
        # Links found at the depth limit would only be discarded by the
        # check above; skip enqueueing them (and anything already visited)
        # to keep the queue small. The visited set is unchanged.
        if current_depth < depth:
            urls_to_visit.extend(
                (new_url, current_depth + 1)
                for new_url in new_urls
                if new_url not in visited
            )
    return visited
if __name__ == "__main__":
    # Interactive entry point: prompt for the crawl parameters, run the
    # crawl, then report every URL that was visited.
    start_url = input("Enter the starting URL: ")
    max_depth = int(input("Enter the maximum depth to crawl: "))
    print("Crawling...")
    results = crawl(start_url, max_depth)
    print("Visited URLs:")
    for result in results:
        print(result)
This script uses the requests library to fetch web pages and the BeautifulSoup library to parse HTML. Starting from a given URL, it follows the links found on each page breadth-first, up to a specified depth.
Make sure you have the requests
and beautifulsoup4
libraries installed. You can install them via pip:
pip install requests beautifulsoup4
Please note that this is a basic web crawler and may need further customization or optimization for specific use cases. Additionally, be mindful of the websites you crawl and ensure you comply with their robots.txt file and terms of service.