#lang scheme/base

;; Analyse apache log files using the standard debian log file format.

(require (lib "pregexp.ss")
         scheme/control
         scheme/cmdline
         (planet zwizwa/plt/share))

;; Creates the structure like define-struct with the additional identifiers:
;;   make-refs-entry : create a struct which registers reference
;;   -refs           : parameter containing a hash table of unique references
;;
;; The struct definition below is disabled; string->entry currently
;; produces plain vectors with the same field order.
;;
;; (define-struct entry (ip day time request status referrer agent))


;; TOOLS

;; (take n lst) -> list
;; Returns the first `n' elements of `lst', or all of `lst' when it
;; holds fewer than `n' elements.  Note the argument order: count
;; first, list second.
(define (take n lst)
  (if (or (null? lst)
          (zero? n))
      '()
      (cons (car lst)
            (take (sub1 n) (cdr lst)))))


;; PARSER

;; Ad-hoc parser for current logfile format, which is I think the
;; apache (or debian) default.  All entries are accepted by the
;; standard 'read.  This is then used in a 2nd pass.

;; (string->entry str) -> vector or #f
;;
;; Parses one log line by calling `read' nine times on it, in order:
;; ip, two ignored fields, the bracketed timestamp, request, status,
;; size, referrer and agent.
;;
;; Returns #f when `bot?' accepts the agent/ip pair.  Otherwise returns
;;   #(ip (year month day) (hour minute) request status referrer agent)
;; where every field is a string.  If the timestamp does not split
;; into exactly six parts on ":" and "/", a warning is printed and the
;; date/time components are #f.
;;
;; Errors raised by `read' or by the field conversions are not handled
;; here; they propagate to the caller.
(define (string->entry str)
  (define port (open-input-string str))

  ;; Read the next datum from the line and normalise it to a string.
  (define (x)
    (let ((item (read port)))
      (cond
       ((string? item) item)
       ((list? item) ;; it's the date
        ;; A bracketed timestamp reads as a list; only its first
        ;; element is kept, so anything after it inside the brackets
        ;; is dropped.
        (symbol->string (car item)))
       (else
        ;; easier for 2nd parsing pass
        (format "~s" item)))))

  ;; let* (not let) so the fields are read strictly left to right.
  (let* ((ip       (x))
         (ident    (x)) ;; unused; presumably the identd column -- verify
         (user     (x)) ;; unused; presumably the userid column -- verify
         (date-str (x))
         (request  (x))
         (status   (x))
         (size     (x)) ;; read to advance the port; not stored
         (referrer (x))
         (agent    (x)))
    (close-input-port port)
    (if (bot? agent ip)
        #f
        (let-values (((day month year hour minute second)
                      (let ((split (pregexp-split "[:/]" date-str)))
                        (if (= 6 (length split))
                            (apply values split)
                            (begin
                              (printf "warning: ~a\n" date-str)
                              ;; Must be six *values*, not one list:
                              ;; let-values binds six identifiers and
                              ;; would otherwise raise an arity error.
                              (values #f #f #f #f #f #f))))))
          (vector ip
                  (list year month day)
                  (list hour minute)
                  request status referrer agent)))))

;; (logfile->table filename) -> list of entry vectors
;;
;; Reads `filename' line by line and parses each line with
;; string->entry.  Lines rejected by the bot filter (#f) are skipped.
;; Lines whose parsing raises an error are reported on the current
;; output port and skipped.  Entries are consed onto the accumulator,
;; so the result is in reverse file order.
(define (logfile->table filename)
  (with-input-from-file filename
    (lambda ()
      (let next ((entries '()))
        (let ((line (read-line)))
          (if (eof-object? line)
              entries
              (let ((parsed
                     ;; exn:fail? rather than a catch-all predicate, so
                     ;; that user breaks (exn:break) are not swallowed.
                     (with-handlers ((exn:fail?
                                      (lambda (ex)
                                        (printf "error parsing line: ~a\n" line)
                                        #f)))
                       (string->entry line))))
                (if parsed
                    (next (cons parsed entries))
                    (next entries)))))))))


;; BOT FILTER

;; Client addresses whose requests are always discarded.
(define bot-ips
  '(;; Hetzner monitor
    "213.133.113.83"
    "213.133.113.82"
    ;; misc obscure bots
    "212.87.231.173"
    ))

;; Exact User-Agent strings whose requests are discarded.  Matching is
;; by whole-string equality (hash lookup in make-bot?), not substring.
;; Some strings appear twice; that is harmless because the list is
;; loaded into a hash table.
(define bot-agents
  '("ia_archiver (+http://www.alexa.com/site/help/webmasters; crawler@alexa.com)"
    "Hatena Antenna/0.5 (http://a.hatena.ne.jp/help)"
    "Spider/5.0"
    "Mozilla/5.0 (compatible; Charlotte/1.1; http://www.searchme.com/support/)"
    "Baiduspider+(+http://www.baidu.com/search/spider.htm)"
    "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)"
    "Mozilla/5.0 (compatible; Yahoo! Slurp/3.0; http://help.yahoo.com/help/us/ysearch/slurp)"
    "Mozilla/4.0 (compatible; NaverBot/1.0; http://help.naver.com/customer_webtxt_02.jsp)"
    "Mozilla/5.0 (compatible; DotBot/1.1; http://www.dotnetdotcom.org/, crawler@dotnetdotcom.org)"
    "kalooga/KaloogaBot (Kalooga; http://www.kalooga.com/info.html?page=crawler)"
    "Yanga WorldSearch Bot v1.1/beta (http://www.yanga.co.uk/)"
    "MLBot (www.metadatalabs.com/mlbot)"
    "AISearchBot (Email: aisearchbot@gmail.com; If your web site doesn't want to be crawled, please send us a email.)"
    "SurveyBot/2.3 (Whois Source)"
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.8.1) VoilaBot BETA 1.2 (support.voilabot@orange-ftgroup.com)"
    "Mozilla/5.0 (compatible; DKIMRepBot/1.0; +http://www.dkim-reputation.org)"
    "Mozilla/5.0 (compatible; Exabot/3.0; +http://www.exabot.com/go/robot)"
    "msnbot/2.0b (+http://search.msn.com/msnbot.htm)"
    "Googlebot-Image/1.0"
    "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; Girafabot; girafabot at girafa dot com; http://www.girafa.com)"
    "T-Mobile Dash Mozilla/4.0 (compatible; MSIE 4.01; Windows CE; Smartphone; 320x240;) MSNBOT-MOBILE/1.1 (+http://search.msn.com/msnbot.htm)"
    "yacybot (x86 Windows XP 5.1; java 1.6.0_13; Europe/de) http://yacy.net/bot.html"
    "MLBot (www.metadatalabs.com/mlbot)"
    "AISearchBot (Email: aisearchbot@gmail.com; If your web site doesn't want to be crawled, please send us a email.)"
    "Yeti/1.0 (NHN Corp.; http://help.naver.com/robots/)"
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.8.1) VoilaBot BETA 1.2 (support.voilabot@orange-ftgroup.com)"
    "SAMSUNG-SGH-E250/1.0 Profile/MIDP-2.0 Configuration/CLDC-1.1 UP.Browser/6.2.3.3.c.1.101 (GUI) MMP/2.0 (compatible; Googlebot-Mobile/2.1; +http://www.google.com/bot.html)"
    "TailsweepBlogCrawler/Tailsweep-2.5-SNAPSHOT (http://www.tailsweep.com/; bot at [tailsweep] dot com)"
    "Mozilla/5.0 (Twiceler-0.9 http://www.cuil.com/twiceler/robot.html)"
    "DoCoMo/2.0 N905i(c100;TB;W24H16) (compatible; Googlebot-Mobile/2.1; +http://www.google.com/bot.html)"
    "TailsweepBlogCrawler/Tailsweep-2.6-SNAPSHOT (http://www.tailsweep.com/; bot at [tailsweep] dot com)"
    "msnbot-media/1.1 (+http://search.msn.com/msnbot.htm)"
    "Gaisbot/3.0+(robot06@gais.cs.ccu.edu.tw;+http://gais.cs.ccu.edu.tw/robot.php)"
    "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
    "msnbot/1.1 (+http://search.msn.com/msnbot.htm)"
    ))

;; (make-bot? agent-list ip-list) -> (agent ip -> boolean)
;;
;; Loads both lists into hash tables once and returns a predicate that
;; is true when `agent' is in agent-list or `ip' is in ip-list.
(define (make-bot? agent-list ip-list)
  (let ((agents (make-hash))
        (ips    (make-hash)))
    (for ((a agent-list))  (hash-set! agents a #t))
    (for ((ip ip-list))    (hash-set! ips ip #t))
    (lambda (agent ip)
      (or (hash-ref agents agent (lambda () #f))
          (hash-ref ips ip (lambda () #f))))))

;; Bot predicate over the lists above; used by string->entry.
(define bot? (make-bot? bot-agents bot-ips))


;; QUERIES

;; Select entries that refer to a certain object.
;;   (entries (ip-refs) "1.2.3.4" entry-time entry-request)
;;
;; Looks `object' up in `field-hash', takes the referring entries via
;; reflist-refs, and for each of them returns a list with one element
;; per accessor in `gimme', each passed through reflist-object.
;; reflist-refs / reflist-object are not defined in this file
;; (presumably provided by the zwizwa/plt/share module -- verify).
;; hash-ref is called without a default, so an unknown `object' raises.
(define (entries field-hash object . gimme)
  (let ((entries (reflist-refs (hash-ref field-hash object))))
    (for/list ((e entries))
      (for/list ((g gimme))
        (reflist-object (g e))))))

;; (popular hash) -> list of (count object)
;;
;; For every value in `hash' builds (list <number of refs> <object>)
;; and sorts the result by descending reference count.
(define (popular hash)
  (sort
   (for/list (((_ entry) hash))
     (list (length (reflist-refs entry))
           (reflist-object entry)))
   > #:key car))


;; TEST

;; Scratch state filled in by test1 / test2.
(define logfile #f)
(define columns #f)
(define expr #f)

(define (test-table)
  (logfile->table "/tmp/access.log"))

;; Shares the parsed table and stores the shared table and the
;; per-column hashes in `logfile' and `columns'.
(define (test1)
  (let-values (((shared-table hashes)
                (table-share (test-table))))
    (set! logfile shared-table)
    (set! columns hashes)))

;; Stores the result of table->let on the parsed table in `expr'.
(define (test2)
  (set! expr (table->let (test-table))))

;; (test)

;; `columns' is only populated by test1.  Guard the lookup so loading
;; the module does not fail on (vector-ref #f 0) when test1 has not
;; been run.
(when columns
  (take 10 (popular (vector-ref columns 0))))  ;; ips

(test2)