;;;; Generate training and test data for TnT tagger experiments ;;;; Stuart Shieber ;;;; Tue Jan 29 15:12:26 PST 2002 (in-package :tsdb) ;;;;--------------------------------------------------------------------- ;;;; Auxiliary functions ;;; derivation-terminalp ;;; -------------------- ;;; ;;; Holds if the derivation deriv is a terminal symbol instance. (defun derivation-terminalp (deriv) (stringp (derivation-root deriv))) ;;; derivation-terminal ;;; ------------------- ;;; ;;; Returns the terminal symbol associated with a terminal (leaf) of a ;;; derivation. If the flag normalize is T, replaces all spaces with ;;; underscores so that the terminal symbol will be readable asa ;;; single symbol. (defun derivation-terminal (deriv &key (normalize nil)) (and (derivation-terminalp deriv) (let* ((string (derivation-root deriv))) (if normalize (substitute #\_ #\Space string) string)))) ;;; derivation-preterminalp ;;; ----------------------- ;;; ;;; Holds if deriv is a preterminal derivation, that is, is the ;;; immediate parent of a terminal derivation. (defun derivation-preterminalp (deriv) (and (derivation-unaryp deriv) (derivation-terminalp (nth-daughter 0 deriv)))) ;;; derivation-preterminal ;;; ---------------------- ;;; Returns the preterminal symbol associated with a preterminal ;;; derivation. This is generated by looking up the type of the ;;; instance in the lexicon. (defun derivation-preterminal (deriv) (and (derivation-preterminalp deriv) (type-of-lexical-entry (derivation-root deriv)))) ;;; nth-daughter ;;; ------------ ;;; Returns the nth daughter subderivation for a derivation. Like ;;; nth(), indexing is zero-based and it returns nil if no such ;;; daughter exists. (defun nth-daughter (n deriv) (nth n (derivation-daughters deriv))) ;;; derivation-unaryp ;;; ----------------- ;;; Holds if deriv has just a single child. (defun derivation-unaryp (deriv) (and (nth-daughter 0 deriv) (not (nth-daughter 1 deriv)))) ;;; type-of-lexical-entry ;;; --------------------- ;;; Returns the (preterminal) type of a lexical entry by lookup in the ;;; lexicon. Hard error if no type is available from the lexicon. ;;; Essentials of this function (sans error checking) were provided by ;;; Stephan Oepen. (defun type-of-lexical-entry (instance &optional package) (let* ((instance (lkb::get-lex-entry-from-id instance)) (tdfs (and instance (lkb::lex-entry-full-fs instance))) (dag (and tdfs (lkb::tdfs-indef tdfs))) (type (when dag (lkb::type-of-fs dag)))) (if (null type) (error "Null lexical entry type ~s~%" instance) (if package (intern type package) type)))) ;;;;--------------------------------------------------------------------- ;;;; Derivation processing ;;; tnt-process-deriv ;;; ----------------- ;;; Generates tagger data for a single derivation deriv, written to ;;; the specified stream (or standard output if none provided). (defun tnt-process-deriv (deriv &optional (stream t)) (cond ((derivation-terminalp deriv) (error "Terminal without preterminal ~s~%" deriv)) ((derivation-preterminalp deriv) (format stream "~(~20a~c~a~c~%~)" (derivation-terminal (nth-daughter 0 deriv) :normalize t) #\tab (derivation-preterminal deriv) #\tab)) #+:null ((inflectional-rule-p deriv) (format stream "~(~20a~c~30a~c~@[~a~]~c~%~)" (derivation-terminal (nth-daughter 0 (nth-daughter 0 deriv)) :normalize t) #\tab (derivation-preterminal (nth-daughter 0 deriv)) #\tab (when (inflectional-rule-p deriv) (derivation-root deriv)) #\tab)) ((derivation-daughters deriv) (mapc (lambda (daughter) (tnt-process-deriv daughter stream)) (derivation-daughters deriv))) (t (error "Unrecognized derivation form ~s~%" deriv)))) ;;; export-trees-for-tnt ;;; -------------------- ;;; Generates TnT data for all active derivations for the provided ;;; data (defaulting to *tsdb-data*), restricted by the provided ;;; condition and written to files in the directory provided as path. (defun export-trees-for-tnt (&key (data *tsdb-data*) (condition *statistics-select-condition*) path) (let* ((target (or path (format nil "/eo/e5/users/sshieber/tnt/data/~a/" (substitute #\. #\/ data)))) (*package* (find-package :lkb))) ;; delete any previous results and make a new directory for these #+:allegro (ignore-errors (mkdir target)) (loop with cutoffincrement = 100 for cutoff from 0 by cutoffincrement for items = (analyze data :thorough '(:derivation) :condition (format nil "(i-id >= ~a) and (i-id < ~a) ~@[and (~a)~] and (t-active == 1)" cutoff (+ cutoff cutoffincrement) condition)) while items ;; for each item (sentence) do (loop for item in items do ;; extract a bunch of properties of the item (let* ((input (or (get-field :o-input item) (get-field :i-input item))) (i-id (get-field :i-id item)) (parse-id (get-field :parse-id item)) (results (get-field :results item)) (trees (select '("t-version") '(:integer) "tree" (format nil "parse-id == ~a" parse-id) data :sort :parse-id)) (version (loop for tree in trees maximize (get-field :t-version tree))) (active (let ((foo (select '("result-id") '(:integer) "preference" (format nil "parse-id == ~a && t-version == ~d" parse-id version) data))) (loop for bar in foo collect (get-field :result-id bar))))) ;; announce we're working on the item (format t "~d: ~d active tree~p (of ~d)~%" i-id (length active) (length active) (length results)) (if (/= (length active) 1) ;; skip if there isn't exactly one active tree (format t "...skipped~%") (progn ;; make a subdirectory for the output (mkdir (format nil "~a/~d" target i-id)) ;; for each parse of the item (loop for result in results do ;; get the derivation (let* ((id (get-field :result-id result)) (derivation (get-field :derivation result))) ;; if there is one, open a file for the output (if derivation (let ((active? (member id active :test #'eql))) (with-open-file (stream (format nil "~a/~d/~d~a" target i-id id (if active? "A" "")) :direction :output :if-exists :supersede :if-does-not-exist :create) ;; print a header (format stream "%% TnT data from LinGO Redwoods (~a by ~a)~%~ %% ~d tree ~a(of ~d) for item # ~d~%~ %% ~a~%" (current-time) (current-user) id (if active? "[active] " "") (length results) i-id input) ;; print sentence start tags (format stream "~20a~:*~30a~:*~%~20a~:*~30a~%" "-s-") ;; print the tagger data (tnt-process-deriv derivation stream) ;; print sentence end tags (format stream "~20a~:*~30a~:*~%~20a~:*~30a~%" "-e-") ))))))))))))