Python
Utility
Crawlcat
The agile crawler for modern SEOs. A lightweight, headless Python script to extract metadata, status codes, and internal links without the bloat of enterprise tools.
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin, urldefrag
from collections import deque

# Configuration
START_URL = 'https://example.com'
MAX_PAGES = 50


def crawl_site(start_url, max_pages=MAX_PAGES):
    """Breadth-first crawl of internal pages starting at *start_url*.

    Fetches up to *max_pages* pages, recording URL, HTTP status code,
    <title>, and first <h1> for each, then writes the rows to
    crawl_results.csv in the working directory.

    Args:
        start_url: Absolute URL where the crawl begins. Only discovered
            URLs containing this string are followed (treated as internal).
        max_pages: Upper bound on pages fetched (defaults to MAX_PAGES).
    """
    visited = set()
    seen = {start_url}          # every URL ever enqueued — prevents duplicate queue entries
    queue = deque([start_url])  # deque: O(1) popleft vs list.pop(0)'s O(n)
    results = []
    print(f"[*] Starting crawl on {start_url}...")
    while queue and len(visited) < max_pages:
        url = queue.popleft()
        if url in visited:
            continue
        try:
            res = requests.get(url, timeout=10)
            soup = BeautifulSoup(res.text, 'html.parser')

            # Extract data, guarding against pages missing <title> or <h1>
            title = soup.title.string if soup.title else ''
            h1_tag = soup.find('h1')  # look up once instead of twice
            h1 = h1_tag.text.strip() if h1_tag else ''
            results.append({
                'URL': url,
                'Status': res.status_code,
                'Title': title,
                'H1': h1
            })
            visited.add(url)
            print(f"[+] Crawled: {url}")

            # Enqueue unseen internal links. Strip #fragments so the same
            # page isn't re-crawled once per anchor.
            for link in soup.find_all('a', href=True):
                full_url = urldefrag(urljoin(url, link['href']))[0]
                if start_url in full_url and full_url not in seen:
                    seen.add(full_url)
                    queue.append(full_url)
        except Exception as e:  # broad on purpose: keep crawling past any single bad page
            print(f"[!] Error: {e}")

    # Save to CSV
    df = pd.DataFrame(results)
    df.to_csv('crawl_results.csv', index=False)
    print("[*] Crawl complete. Saved to crawl_results.csv")


if __name__ == "__main__":
    crawl_site(START_URL)
Installation
1. Install Dependencies
pip install requests beautifulsoup4 pandas
2. Run Script
python crawlcat.py