#lang scheme/base (require xml scheme/pretty (planet dherman/zip/unzip)) ;; OpenOffice's .odt is a ZIP archive containing multiple xml files. ;; This requires the 'flat XML' .fodt format. ;; The current document. (define doc (make-parameter #f)) ;; LOAD JAR ODT (define (load-odt filename) (unzip-entry filename (read-zip-directory filename) #"content.xml" (lambda (name dir port) (xml->xexpr (document-element (read-xml port)))))) ;; LOAD/SAVE FLAT XML ODT ONLY! (define (load-xml filename) (with-input-from-file filename read-xml)) (define (load-xml/xexpr filename) (xml->xexpr ;; easier to work with (document-element ;; only top element, ignore prolog and misc (load-xml filename)))) (define (to-file file thunk) (when (file-exists? file) (delete-file file)) (with-output-to-file file thunk)) (define (save-se filename [xexpr (doc)]) (to-file filename (lambda () (display ";; -*- scheme -*-\n") (pretty-print xexpr)))) (define (save-xml/xexpr filename [xexpr (doc)]) (to-file filename (lambda () (write-xml/content (xexpr->xml xexpr))))) ;; QUERIES. ;; These assume current doc. Add param later if necessary. ;; Lookup a sub-element. (define (ref el [tag #f]) (if tag (assoc tag (cddr el)) (map car (cddr el)))) ;; list (define (ref/ el tag . tags) (if (null? tags) (ref el tag) (apply ref/ (ref el tag) tags))) (define (text) (ref/ (doc) 'office:body 'office:text)) (define (unique-elements el) (define tags (map car (cddr el))) (define h (make-hash)) (for ((t tags)) (hash-set! h t #f)) (hash-map h (lambda (k v) k))) ;; Test ;; (define fodt "/home/tom/maricela/marcia.fodt") ;; (define fodt "/tmp/test.fodt") ;; (doc (load-odt fodt)) ;; (save-odt "/tmp/test.fodt") ;; Styles: ;; Represent paragraph and text styles separately. A style is modeled ;; as an expression transformer. (define (id x) x) ;(define *paragraph* (make-hash `((Standard . ,id)))) ;(define *text* (make-hash `((Standard . ,id)))) (define (compose0 . args) (if (null? args) id (apply compose args))) (define (aref assoc-list tag) (cond ((assoc tag assoc-list) => cadr) (else #f))) ;; Turn an assoc list and a bunch of keys into a wrapper procedure, ;; which wraps the associated values around an expression. (define (wrapper asq . tags) (define (ref tag) (let ((v (aref asq tag))) (if v (lambda (x) (cons (string->symbol v) x)) id))) (apply compose0 (map ref tags))) ((wrapper '((fo:font-style "abc") (fo:font-weight "def")) 'fo:font-style 'fo:font-weight) 123) ;; => (abc (def 123)) ;; Automatic style dereference. ;; If a style contains a style:text-properties item with a ;; fo:font-style item, we map to that. Other things are ignored. (define (style-mapper) (define *styles* (make-hash)) (hash-set! *styles* "Standard" id) (for-each (match-lambda (('style:style sa . prop) (let ((tp (or (aref prop 'style:text-properties) '()))) (hash-set! *styles* (aref sa 'style:name) (wrapper tp 'style:text-underline-style 'fo:font-style 'fo:font-weight))))) (cddr (ref/ (doc) 'office:automatic-styles))) (lambda (key) (hash-ref *styles* key))) (require mzlib/match) ;; old style easier for tagged lists ;; A guessterpreter for Melissa's markup style. ; (define (simplify-markup) ;; Simplify markup. If it's standard, map it to a string. Otherwise ;; wrap strings in tags, derived from paragraph and span markups. (define (simplified-pars) (define style (style-mapper)) ;; Apply expression wrapper for this style. (define (do-style attrib lst) ((style (aref attrib 'text:style-name)) lst)) ;; Fix nesting for nonstandard paragraphs. This merges span and ;; paragraph markup into a single consistent kind. (define (merge-par/span lst) (if (and (pair? lst) (symbol? (car lst))) (list lst) lst)) ;; Support some basic markup tags. (define (simplify-par par) (map (match-lambda (('text:soft-page-break props) '(page)) (('text:s prop) `(spaces ,(string->number (or (aref prop 'text:c) "1")))) (('text:tab . _) '(tab)) (('text:span attrib . lst) (do-style attrib lst)) (o o)) par)) (filter id (map (match-lambda (('text:p attrib . lst) (merge-par/span (do-style attrib (simplify-par lst)))) (('text:sequence-decls . _) #f) (other (printf "ignored: ~a\n" other) #f)) (cddr (text))))) (define (text-line lst) (for/list ((l lst)) (filter string? l))) (define (textfile) (apply string-append (for/list ((l (text-line (simplified-pars)))) (format "~a\n" (apply string-append l))))) ;; (doc (load-odt "/home/tom/test.odt")) (doc (load-odt "/home/tom/LOCD_draft.odt")) (save-se "/tmp/test.se")