About Services Works Books Tools Insights Book Strategy
Back to Arsenal
Python Utility

Log File Analyzer

A lightweight Python script to parse server access logs. Identify crawl budget waste, spot zombie pages, and visualize bot behavior patterns without uploading sensitive data to the cloud.

log_analyzer.py
import re
import pandas as pd
from collections import Counter

# Configuration
LOG_FILE = 'access.log'  # path to the server access log this script parses
BOT_AGENTS = ['Googlebot', 'Bingbot', 'DuckDuckBot']  # UA substrings counted as crawler traffic (matched case-insensitively below)

# Combined Log Format, compiled once at import time instead of per line.
# The size field is (\d+|-) because servers write "-" when no bytes were
# sent (e.g. 304 Not Modified) -- a plain \d+ silently drops those lines.
_LOG_PATTERN = re.compile(
    r'(\d+\.\d+\.\d+\.\d+) - - \[(.*?)\] "(.*?)" (\d+) (\d+|-) "(.*?)" "(.*?)"'
)

def parse_log_line(line):
    """Extracts IP, Timestamp, Method, URL, Status, UserAgent.

    Returns a 7-tuple of strings
    (IP, Time, Request, Status, Size, Referrer, UA) when `line` matches
    the combined log format, or None for unparseable lines.
    """
    match = _LOG_PATTERN.match(line)
    return match.groups() if match else None

def analyze_logs(file_path):
    """Parse an access log and print bot-traffic statistics.

    Reads `file_path` line by line, keeps lines that parse_log_line
    recognizes, then prints total hits, top crawled URLs, and the status
    code distribution for user agents listed in BOT_AGENTS.
    """
    print(f"[*] Analyzing {file_path}...")
    data = []

    # errors='replace' keeps the run alive when the log contains stray
    # non-UTF-8 bytes, which is common in raw server logs; the default
    # (strict) decoding would raise UnicodeDecodeError mid-file.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            parsed = parse_log_line(line)
            if parsed:
                data.append(parsed)

    df = pd.DataFrame(data, columns=['IP', 'Time', 'Request', 'Status', 'Size', 'Referrer', 'UA'])

    # Filter for Bots: case-insensitive UA substring match.
    # na=False treats any missing UA value as "not a bot" instead of NaN.
    bot_hits = df[df['UA'].str.contains('|'.join(BOT_AGENTS), case=False, na=False)]

    print(f"[+] Total Bot Hits: {len(bot_hits)}")
    print("[+] Top Crawled URLs:")
    print(bot_hits['Request'].value_counts().head(10))

    # Status Code Distribution
    print("\n[+] Status Code Distribution:")
    print(bot_hits['Status'].value_counts())

# Script entry point: analyze the configured log file when run directly.
if __name__ == "__main__":
    analyze_logs(LOG_FILE)

Installation

1. Install Dependencies

pip install pandas

2. Run Script

python log_analyzer.py

Why use this?

  • Privacy First: Logs never leave your local machine.
  • Bot Filtering: Surface hits from known crawler user-agent strings (note: user agents can be spoofed, so pair this with reverse-DNS checks for true verification).
  • Status Auditing: Instantly spot 404s and 5xx errors being crawled.
Download .py File