#lang racket (require racket/date) ;; (require "regexp.ss") (provide foldl-apache-log) ;; Maybe human-readable logfiles are not such a great idea after all... ;; All atoms are space separated: ;; w - non-quoted word ;; q - quoted string ;; d - date/dtime ;; _ - whitespace ;; w -> 1 value ;; d -> 7 values ;; q -> 2 values (2nd one is last matched char in string. FIXME: how?) ;; Quoted string. Don't include quotes to make it easier to work with ;; non-quoted and quoted entries in the original logfile. In the SQL ;; all will be quoted later just by adding the outer quotes. ;; In a pregexp, the '\' character needs to be quoted. Together with ;; string quoting, that's four '\' characters to match one in the ;; string for the q expression. (define w "(\\S+?)") (define d "\\[(.*?)/(.*?)/(.*?):(.*?):(.*?):(.*?)\\s+?(\\S+?)\\]") (define q "\"((\\\\.|[^\"])*)\"") (define _ "\\s+?") ;; Concatenate regexp fragements, interleaved with whitespace ;; separator regexp. Anchor at beginning of string. (define (regexp-line separator . strs) (apply string-append (cons "^" (cdr (append* (for/list ((s strs)) (list separator s))))))) (define (log-template . strs) (byte-pregexp (string->bytes/utf-8 (apply regexp-line _ strs)))) (define (bytes->date b) (or #f (case (string->symbol (bytes->string/utf-8 b)) ((Jan) 1) ((Feb) 2) ((Mar) 3) ((Apr) 4) ((May) 5) ((Jun) 6) ((Jul) 7) ((Aug) 8) ((Sep) 9) ((Oct) 10) ((Nov) 11) ((Dec) 12) (else (error 'date "~s" b))))) (define (bytes->int bs) (for/fold ((acc 0)) ((b (in-bytes bs))) (+ (* 10 acc) (- b 48)))) ;; (date->unix #"1" #"2" #"3" #"4" #"Dec" #"1999") (define (or-error . args) (if (null? args) #f (with-handlers ((void (lambda _ (apply or-error (cdr args))))) ((car args))))) (define (bytes->tz tz) (define (x offset char) (- (bytes-ref tz offset) char)) (let ((s (x 0 43)) (h2 (x 1 48)) (h1 (x 2 48)) (m2 (x 3 48)) (m1 (x 4 48))) (let ((sign (if (zero? s) 1 -1)) (hour (+ (* 10 h2) h1)) (min (+ (* 10 m2) m1))) (* 60 (* sign (+ (* 60 hour) min)))))) (define (seconds s m h D M Y tz) (date->seconds (date s m h D M Y 0 0 0 tz))) (define (date->unix s m h D M Y tz) (let ((s (bytes->int s)) (m (bytes->int m)) (h (bytes->int h)) (D (bytes->int D)) (M (bytes->date M)) (Y (bytes->int Y)) (tz (bytes->tz tz))) (or-error (lambda () (seconds s m h D M Y tz)) (lambda () (seconds s m (+ h 1) D M Y tz)) ;; DST workaround (lambda () (display (format "WARNING: can't handle date ~a\n" (list s m h D M Y tz)) (current-error-port)) 0 ;; what else? )))) (define (date->mysql s m h D M Y tz) ;; FIXME: not using time zone.. maybe this can be passed to mysql in a different way? (let ((s (bytes->int s)) (m (bytes->int m)) (h (bytes->int h)) (D (bytes->int D)) (M (bytes->date M)) (Y (bytes->int Y)) (tz (bytes->tz tz))) (let ((date (string->bytes/utf-8 (format "~a-~a-~a ~a-~a-~a" Y M D h m s)))) ;; (display (format "date: ~a\n" date) (current-error-port)) date))) ;; FIXME: the r are due to the double parens in the quoted string matcher. ;; LogFormat "%v:%p %h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"" vhost_combined (define vhost-combined (log-template w w w w d q w w q q)) (define (parse-apache-vhost-combined line) (let ((m (regexp-match vhost-combined line))) (and m (let-values (((whole_match vhost ip ident user day month year hour minute second timezone request r0 status size referrer r1 agent r2) (apply values m))) (vector ip (date->mysql second minute hour day month year timezone) request status referrer agent))))) ;; (date->seconds (date 3 52 19 25 11 1973 0 0 0 0)) ;; LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"" combined (define combined (log-template w w w d q w w q q)) (define (parse-apache-combined line) (let ((m (regexp-match combined line))) (and m (let-values (((whole_match ip ident user day month year hour minute second timezone request r1 status size referrer r2 agent r3) (apply values m))) (vector ip (date->mysql second minute hour day month year timezone) request status referrer agent))))) ;; LogFormat "%h %l %u %t \"%r\" %>s %b" common (define common (log-template w w w d q w w)) (define (parse-apache-common line) (let ((m (regexp-match common line))) (and m (let-values (((whole_match ip ident user day month year hour minute second timezone request r1 status size) (apply values m))) (vector ip (date->mysql second minute hour day month year timezone) request status #"" #""))))) ;; Expose traversal as left fold. (define-syntax-rule (rot! l) (set! l (append (cdr l) (list (car l))))) (define (foldl-apache-log fn init [port (current-input-port)] [filename ""]) (define parsers (list parse-apache-vhost-combined parse-apache-combined parse-apache-common)) (define (parse l n [tries (length parsers)]) (when (zero? tries) (error 'parse-appache-invalid-format "~a: ~a: ~s" filename (+ n 1) l)) (let ((p ((car parsers) l))) (if p p (begin (rot! parsers) (parse l n (sub1 tries)))))) (for/fold ((s init)) ((l (in-lines port)) (n (in-naturals))) (fn (parse l n) s))) ; (define log (open-input-file "/tmp/access.log")) ; (define line (read-bytes-line log))