Python
Utility
Log File Analyzer
A lightweight Python script to parse server access logs. Identify crawl budget waste, spot zombie pages, and summarize bot behavior patterns without uploading sensitive data to the cloud.
import re
from collections import Counter

import pandas as pd

# Configuration
LOG_FILE = 'access.log'
BOT_AGENTS = ['Googlebot', 'Bingbot', 'DuckDuckBot']

# Combined Log Format pattern, compiled once at module level since it is
# applied to every line. The size field is (\d+|-) rather than (\d+):
# servers log "-" for bodiless responses (e.g. 304 Not Modified), and the
# stricter pattern silently discarded all of those lines.
LOG_PATTERN = re.compile(
    r'(\d+\.\d+\.\d+\.\d+) - - \[(.*?)\] "(.*?)" (\d+) (\d+|-) "(.*?)" "(.*?)"'
)


def parse_log_line(line):
    """Extracts IP, Timestamp, Method, URL, Status, UserAgent.

    Returns a 7-tuple of strings
    (IP, Time, Request, Status, Size, Referrer, UA), or None when the
    line does not match the expected combined log format.
    """
    match = LOG_PATTERN.match(line)
    if match:
        return match.groups()
    return None


def analyze_logs(file_path):
    """Parse the access log at *file_path* and print bot-crawl statistics.

    Prints total bot hits, the 10 most-crawled request lines, and the
    HTTP status-code distribution for hits whose User-Agent matches any
    entry in BOT_AGENTS (case-insensitive substring match).
    """
    print(f"[*] Analyzing {file_path}...")
    data = []
    with open(file_path, 'r') as f:
        for line in f:
            parsed = parse_log_line(line)
            if parsed:
                data.append(parsed)
    df = pd.DataFrame(
        data,
        columns=['IP', 'Time', 'Request', 'Status', 'Size', 'Referrer', 'UA'],
    )

    # Filter for Bots. na=False is defensive: a missing UA counts as
    # "not a bot" instead of propagating NaN into the boolean mask.
    bot_hits = df[df['UA'].str.contains('|'.join(BOT_AGENTS), case=False, na=False)]

    print(f"[+] Total Bot Hits: {len(bot_hits)}")
    print("[+] Top Crawled URLs:")
    print(bot_hits['Request'].value_counts().head(10))

    # Status Code Distribution
    print("\n[+] Status Code Distribution:")
    print(bot_hits['Status'].value_counts())


if __name__ == "__main__":
    analyze_logs(LOG_FILE)
Installation
1. Install Dependencies
pip install pandas
2. Run Script
python log_analyzer.py
Why use this?
- Privacy First: Logs never leave your local machine.
- Bot Filtering: Isolate hits from major crawlers by User-Agent string. (Note: matching is UA-based only — spoofed agents are not verified against reverse DNS.)
- Status Auditing: Instantly spot 404s and 5xx errors being crawled.