About Services Works Books Tools Insights Book Strategy
Back to Arsenal
Python Utility

Crawlcat

The agile crawler for modern SEOs. A lightweight, headless Python script to extract metadata, status codes, and internal links without the bloat of enterprise tools.

crawlcat.py
from collections import deque
from urllib.parse import urljoin, urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Configuration
START_URL = 'https://example.com'  # seed URL; the crawl is restricted to this site
MAX_PAGES = 50  # hard cap on how many pages are fetched before stopping

def crawl_site(start_url, max_pages=MAX_PAGES):
    """Breadth-first crawl of the site at *start_url*, saving results to CSV.

    For each fetched page (up to *max_pages*) the URL, HTTP status code,
    <title> text and first <h1> text are recorded, then the collected rows
    are written to ``crawl_results.csv`` in the working directory.

    Args:
        start_url: Absolute URL where the crawl begins. Only links whose
            host (netloc) matches this URL's host are followed.
        max_pages: Maximum number of fetch attempts (default ``MAX_PAGES``).
            Failed fetches count toward the limit so a dead site cannot
            loop forever.
    """
    start_host = urlparse(start_url).netloc
    seen = {start_url}          # every URL ever queued — prevents duplicate queue entries
    queue = deque([start_url])  # deque: O(1) popleft vs list.pop(0)'s O(n)
    results = []
    attempted = 0

    print(f"[*] Starting crawl on {start_url}...")

    while queue and attempted < max_pages:
        url = queue.popleft()
        attempted += 1

        try:
            res = requests.get(url, timeout=10)
        except requests.RequestException as e:
            # Narrow catch: only network/HTTP transport errors are expected here.
            print(f"[!] Error: {e}")
            continue

        soup = BeautifulSoup(res.text, 'html.parser')

        # NOTE: soup.title.string is None for an empty <title>; preserved as-is.
        title = soup.title.string if soup.title else ''
        h1_tag = soup.find('h1')  # look up once instead of twice
        h1 = h1_tag.text.strip() if h1_tag else ''

        results.append({
            'URL': url,
            'Status': res.status_code,
            'Title': title,
            'H1': h1,
        })

        print(f"[+] Crawled: {url}")

        # Enqueue same-host links only. Host comparison replaces the original
        # fragile substring test, which also matched external URLs that merely
        # contained the start URL.
        for link in soup.find_all('a', href=True):
            full_url = urljoin(url, link['href'])
            if urlparse(full_url).netloc == start_host and full_url not in seen:
                seen.add(full_url)
                queue.append(full_url)

    # Save to CSV
    df = pd.DataFrame(results)
    df.to_csv('crawl_results.csv', index=False)
    print("[*] Crawl complete. Saved to crawl_results.csv")

if __name__ == "__main__":
    # Entry point: run the crawl with the module-level configuration
    # when executed as a script (not when imported).
    crawl_site(START_URL)

Installation

1. Install Dependencies

pip install requests beautifulsoup4 pandas

2. Run Script

python crawlcat.py
Download Script