#lang scheme/base ;; Parse apache logs and dump out a set of tables. This replaces all ;; strings with ids and records unique strings in a table. ;; The interfaces use generators instead of (lazy) lists, as this ;; seems more appropriate when we're interfacing with unix pipes. (require "apachelog-regexp.ss") (require racket/generator) (provide (all-defined-out)) ;; TOOLS (define (make-progress) (let ((count 0)) (lambda _ (set! count (add1 count)) (when (zero? (modulo count 59)) (display (format "\r~s " count) (current-error-port)))))) ;; SCHEMA PRINTER (define (id-prefix table) (string->symbol (format "id_~a" table))) (define (id-type id) (case id ((date) 'DATETIME) (else 'MEDIUMINT))) (define (id-split? id) (eq? (id-type id) 'MEDIUMINT)) (define (print-entry-table table ids) (printf "CREATE TABLE ~a (\n" table) (printf " ~a MEDIUMINT NOT NULL AUTO_INCREMENT,\n" (id-prefix table)) ;; let DB generate these (for ((id ids)) (printf " ~a ~a NOT NULL,\n" (if (id-split? id) (id-prefix id) id) (id-type id))) (printf " PRIMARY KEY (~a)\n" (id-prefix table)) (printf ");\n")) (define (print-id-table name) (define id (id-prefix name)) (define auto-increment "") ;; /AUTO_INCREMENT (printf "CREATE TABLE ~a (\n" name) (printf " ~a MEDIUMINT NOT NULL ~a,\n" id auto-increment) (printf " ~a TEXT NOT NULL,\n" name) (printf " PRIMARY KEY (~a)\n" id) (printf ");\n")) (define (print-schema table fields) (when #t ;; FIXME: remove this! (printf "DROP TABLE ~a;\n" table) (for ((f fields)) (when (id-split? f) (printf "DROP TABLE ~a;\n" f)))) (print-entry-table table fields) (for ((f fields)) (when (id-split? f) (print-id-table f)))) ;; INSERT PRINTER ;(define (format-sanitize_ bytes) ; (let ((bytes (regexp-replace* #"'" bytes "\""))) ; bytes)) ;; Should be ok.. (define (format-sanitize bytes) bytes) (define (format-tuple lst) (define (format-element el) (cond ((or (bytes? el)) (format "\"~a\"" (format-sanitize el))) (else (format "~a" el)))) (define (prefix-parens lst) (map (lambda (el) (list ", " el)) lst)) (define (flatten lst) (if (list? lst) (apply append (map flatten lst)) (list lst))) (apply string-append (flatten (list "(" (cdr ;; drop leading paren (flatten (prefix-parens (map format-element lst)))) ")")))) (define (print-insert table fields values) (printf "INSERT INTO ~a ~a values ~a;\n" table (format-tuple fields) (format-tuple values))) (define (print-insert-idtable table id val) (define id_name (id-prefix table)) (print-insert table (list id_name table) (list id val))) ;; ID generation. This uses a hash table for keeping track of IDs. ;; Call (new! id) when a new entry was generated. (define (identify! hash val new!) (define (handle-new) (let ((id (hash-count hash))) (hash-set! hash val id) (new! id val) id)) (hash-ref hash val handle-new)) ;; Imperative iterator on top of fold. (define (for-each-apache-log fn port infile) (foldl-apache-log (lambda (record state) (fn record)) #f port infile)) ;; This layout is hardcoded in the apachelog-regexp.ss module. (define main-table 'entry) (define main-columns '(ip date req stat ref agent)) (define column-indices (make-hash (for/list ((c main-columns) (n (in-naturals))) (cons c n)))) (define (print-log infile [port (open-input-file infile)]) (define progress (make-progress)) (define hashes (build-vector (hash-count column-indices) (lambda _ (make-hash)))) ;; When a new item is detected, it is immediately printed to stdout. (define (print-insert-table table) (lambda (id val) (print-insert-idtable table id val))) ;; CREATE (print-schema main-table main-columns) ;; INSERT (for-each-apache-log (lambda (record) ;; Map record entries to IDs, possibly adding IDs to the ID ;; tables, which prints out an INSERT for the table. (define indirect (for/list (((col index) column-indices)) (let ((hash (vector-ref hashes index)) (val (vector-ref record index))) (if (not (id-split? col)) (cons col val) (cons (id-prefix col) (identify! hash val (print-insert-table col))))))) ;; Print INSERT for the main table, using IDs instead of the ;; original values. (print-insert main-table (map car indirect) (map cdr indirect)) (progress)) port infile)) ;; Interfaces use for-each instead of folds since we're using a ;; generator approach here isntead of a functional one. (define (test) (print-log "/tmp/access.log")) ; (test) (define (stdin) (print-log "" (current-input-port))) (stdin)