;;; -*- Mode: LISP; Syntax: Common-Lisp; Package: LKB -*- ;;; Copyright (c) 2000--2002 ;;; John Carroll, Ann Copestake, Robert Malouf, Stephan Oepen, Ben Waldron; ;;; see `LICENSE' for conditions. (in-package :lkb) ;; ;; (bmw) where is this code called from? ;; (defun preprocess-for-pet (string &optional tagger) (if (and tagger (consp tagger) (keywordp (first tagger))) (multiple-value-bind (tokens length) (case (first tagger) (:tnt (apply #'tag-tnt (preprocessor:preprocess string :format :list :verbose nil) (rest tagger)))) (loop for (id start end form surface . tags) in tokens for token = (format nil "(~d, ~d, ~d, 1, \"~a\" \"~a\", 0, \"null\",~ ~{ ~s ~,4f~})" id start end (preprocessor:x-escape-string form) (preprocessor:x-escape-string surface) tags) collect token into tokens finally (return (values (format nil "~{~a~^ ~}" tokens) length)))) (preprocessor:preprocess string :format :pet :verbose nil))) (defparameter *tagger-application* '((:tnt "tnt -z100 /user/oe/src/tnt/models/wsj -"))) (defun tag-tnt (tokens &optional run &key (n 1)) (labels ((commentp (string) (and (>= (length string) 2) (characterp (char string 0)) (char= (char string 0) #\%) (characterp (char string 1)) (char= (char string 1) #\%)))) (let* ((run (or run (loop for run in *tagger-application* when (eq (first run) :tnt) return (first (rest run))) "tnt -z100 /user/oe/src/tnt/models/wsj -")) (command (format nil "exec ~a" run)) (input (format nil "/tmp/.tnt.in.~a" (current-user))) (output (format nil "/tmp/.tnt.out.~a" (current-user))) (length 0) analyses) (with-open-file (stream input :direction :output :if-exists :supersede) (loop with i = -1 for token in tokens for start = (second token) unless (= i start) do (setf i start) (incf length) (format stream "~a~%" (fifth token)) finally (format stream "~%~%"))) (run-process command :wait t :input input :output output :if-output-exists :supersede :error-output "/dev/null" :if-error-output-exists :append) (with-open-file (stream output :direction :input) (loop with buffer = (make-array 512 :element-type 'character :adjustable t :fill-pointer 0) with i = 0 for string = (read-line stream nil nil) while (and string (not (zerop (length string)))) unless (commentp string) do (incf i) (loop with foo = nil with n = 0 initially (setf (fill-pointer buffer) 0) for c across string when (char= c #\tab) do (when (not (zerop (fill-pointer buffer))) (push (if (and (evenp n) (not (zerop n))) (read-from-string (copy-seq buffer)) (copy-seq buffer)) foo) (setf (fill-pointer buffer) 0) (incf n)) else do (vector-push c buffer) finally (when (not (zerop (fill-pointer buffer))) (push (read-from-string (copy-seq buffer)) foo)) (when foo (push (nreverse foo) analyses))))) (loop with tags = (make-array (length analyses)) for analysis in (nreverse analyses) for i from 0 do (setf (aref tags i) (rest analysis)) finally (loop for token in tokens for analysis = (aref tags (second token)) do (nconc token (loop with n = (* 2 n) for foo in analysis while (< 0 n) collect foo do (decf n))))) (values tokens length)))) ;;(defvar saf:*gmap*) (defvar saf:*config*) (defvar *saf-v* -1) (defvar *HIDDEN-smaf-id-to-edge-id* nil) ;; MUST reset when initializing tchart (defvar *smaf-id-to-edge-id* nil) ;; MUST reset when initializing tchart (defvar *chart-node* -1) ;; MUST reset when initializing tchart (defvar *smaf-node-to-chart-node* nil) ;; MUST reset when initializing tchart ;; hack: passes fallback edges from SAF-tchart construction function ;; to parse function (defvar *fallback-medges* nil) ;; ;; SAF -> tchart ;; (defun saf-setup-morphs (saf) (saf-to-tchart saf)) ;; (defun xml-saf-to-tchart (xml) (saf-to-tchart (smaf:xml-to-saf-object xml))) (defun new-tchart () (setf *tchart* (make-array (list *chart-limit* 2) :initial-element nil)) (setf *tchart-max* 0) (setf *fallback-medges* nil) (setf *smaf-id-to-edge-id* nil) (setf *HIDDEN-smaf-id-to-edge-id* nil) ) (defun get-edge-by-id (id &optional (tchart *tchart*)) (find id (get-edges tchart) :key #'edge-id :test #'=)) ;; used in test for duplicate chart/tchart edges (defun dup-edge= (x y) (cond ;; token edges ((and (typep x 'token-edge) (typep y 'token-edge)) (token-edge= x y)) ;; morph edges ((and (typep x 'morpho-stem-edge) (typep y 'morpho-stem-edge)) nil ;; NOT YET IMPLEMENTED ) ;; chart edges ((and (typep x 'edge) (typep y 'edge)) nil) ;; NOT YET IMPLEMENTED ;; different edge types (t (error "edges expected")))) (defun clean-tchart (&optional (tchart *tchart*)) (let ((dups (get-duplicate-edge-sets tchart))) ;(print-tchart) (replace-dup-children dups) ;(print-tchart) (replace-dup-edges dups tchart) ;(print-tchart) )) (defun replace-dup-edges (dups &optional (tchart *tchart*)) (loop for i from 0 upto (1- *chart-limit*) do (setf (aref tchart i 0) (loop for e in (aref tchart i 0) if (member e dups :key #'car) collect e)) (setf (aref tchart i 1) (loop for e in (aref tchart i 1) if (member e dups :key #'car) collect e)))) (defun replace-dup-children (dups) (loop for dup in dups for e = (car dup) for children = (edge-children e) for new-children = (loop for child in children for new-child = (get-dup-edge child dups) ;; do (unless (eq child new-child) ;; (format t "~&; Warning: altering edge ~a (replacing child ~a by duplicate edge ~a)" e child new-child)) collect new-child) do (setf (edge-children e) new-children))) (defun get-dup-edge (e dups) (car (get-dup-set e dups))) (defun get-dup-set (e dups) (find e dups :test #'dup-edge= :key #'car)) (defun get-duplicate-edge-sets (&optional (tchart *tchart*)) (loop with dups for e in (reverse (get-edges tchart)) for dup = (get-dup-set e dups) do (cond (dup ;; duplicate (format t "~&WARNING: pruning duplicate edge ~a" e) (setf (cdr (last dup)) (list e)) ) (t (push (list e) dups))) finally (return dups))) ;; generate sentence strings for all token paths through tchart (defun tchart-to-sentence-strings nil (loop with edge-paths = (get-edge-paths-x2y 0 *tchart-max*) for edge-path in edge-paths collect (with-output-to-string (s) (princ (edge-string (pop edge-path)) s) (loop for edge in edge-path for string = (edge-string edge) do (princ #\Space s) (princ string s))))) ;; generate sentence strings for all token paths through S(M)AF XML input (defun xml-to-sentence-strings (xml &optional (stream t)) (if (lxml:xml-whitespace-p xml) (return-from xml-to-sentence-strings)) (if (null xml) (return-from xml-to-sentence-strings)) (let ((len (length xml))) (when (string= "" (subseq xml (1- len) len)) (setf xml (subseq xml 0 (1- len))))) (when xml (let* ((saf (saf:xml-to-saf-object xml)) (id (saf:saf-id saf))) (saf-to-tchart saf) (loop for sent in (tchart-to-sentence-strings) for str = (format nil "~%;~a~%~a" id sent) do (princ str stream) ;collect str )))) ;; read file containing one S(M)AF XML per line, and generate set sentence test items ;; (for itsdb) (defun file-to-sentence-strings (filename) (with-open-file (s filename :external-format :utf-8) (with-open-file (s-out (format nil "~a.items" filename) :direction :output :if-exists :supersede :external-format :utf-8) (loop for line = (read-line s nil nil) while line ;append do (handler-case (xml-to-sentence-strings line s-out) #+:allegro (EXCL:INTERRUPT-SIGNAL () (error "Interrupt-Signal")) (error (condition) (format t "~&Error: ~A~%whilst processing ~a~%" condition line)) ) )))) ;; use to process set of files containing SAF XML segments into sentences #+:allegro (defun file-pattern-to-sentence-strings (pattern) (require :osi) (loop for filename in (excl.osi:command-output (format nil "ls ~a" pattern)) do (format t "~%; [processing file ~a]" filename) (file-to-sentence-strings filename) )) ;; if set, perform cleanup operations after converting ;; SAF to tchart (defvar *clean-tchart-p* nil) (defun saf-to-tchart (saf &key (filter #'identity)) (new-tchart) (initialize-smaf-node-to-chart-node saf) (saf-lattice-to-tchart (smaf:saf-lattice saf) :filter filter :addressing (smaf:saf-meta-addressing (smaf:saf-meta saf))) (if *clean-tchart-p* (clean-tchart *tchart*)) ;; FIXME! disabled until bug fixed in clean-tchart *tchart*) ;; init=0 ... final=latticeSize (defun initialize-smaf-node-to-chart-node (saf) (let* ((lattice (smaf:saf-lattice saf)) (init (and lattice (smaf:saf-lattice-start-node lattice))) (final (and lattice (smaf:saf-lattice-end-node lattice))) (edges (and lattice (smaf:saf-lattice-edges lattice))) (meta (smaf:saf-meta saf)) (addressing (smaf:saf-meta-addressing meta)) ) (setf *chart-node* -1) (setf *smaf-node-to-chart-node* nil) (when lattice (if init (and (push (cons init 0) *smaf-node-to-chart-node*) (setf *chart-node* 0)) (format t "~%WARNING: no init node in SMAF lattice")) (if final (push (cons final (smaf:get-smaf-lattice-size saf)) *smaf-node-to-chart-node*) (format t "~%WARNING: no final node in SMAF lattice")) ;; create nicely ordered mapping into chart nodes (loop for e in (smaf:sort-edges-by-from edges :addressing addressing) for source = (smaf:saf-edge-source e) for target = (smaf:saf-edge-target e) do (smaf-node-to-chart-node source) (smaf-node-to-chart-node target)) ;(print *smaf-node-to-chart-node*) ))) ;; token edge -> t1 ... ;; morph edge -> m1 ... (defun initialize-smaf-id-to-edge-id-from-tchart nil (setf *smaf-id-to-edge-id* nil) ;; tedges (loop for tedge in (get-tedges) for id = (edge-id tedge) do (push (cons (format nil "t~a" id) id) *smaf-id-to-edge-id*)) ;; medges (loop for medge in (get-medges) for id = (edge-id medge) do (push (cons (format nil "m~a" id) id) *smaf-id-to-edge-id*)) *smaf-id-to-edge-id*) (defun saf-lattice-to-tchart (saf-lattice &key (filter #'identity) addressing) (loop for e in (loop for f in (and saf-lattice (smaf:saf-lattice-edges saf-lattice)) when (funcall filter f) collect f) if (string= (saf:l-edgeType e) "tok") collect e into toks else if (string= (saf:l-edgeType e) "tok+morph") collect e into tokMorphs else if (string= (saf:l-edgeType e) "morph") collect e into morphs else do (format t "~&WARNING: SMAF edge ~a has unknown edgeType '~a' (allowed values: 'tok' 'tok+morph' 'morph')" (smaf:saf-edge-id e) (saf:l-edgeType e)) finally (loop for e in toks do (augment-tchart-from-saf-edge e #'saf-edge-to-tedge addressing)) (loop for e in tokMorphs do (augment-tchart-from-saf-edge e #'saf-edge-to-tedge addressing) (augment-tchart-from-saf-edge e #'saf-edge-to-medge addressing)) (loop for e in morphs do (augment-tchart-from-saf-edge e #'saf-edge-to-medge addressing)) )) #+:null (defun next-tchart-edge-id (&optional (tchart *tchart*)) (let ((edges (get-edges tchart))) (if edges (apply #'max (mapcar #'edge-id edges)) 0))) ;; to do: replace global *tchart* + *tchart-max* + ??? with objects (defun augment-tchart-from-saf-edge (saf-edge fn addressing) (loop with edges = (funcall fn saf-edge addressing) for edge in edges if (smaf:saf-fs-feature-value2 (smaf:saf-edge-l-content saf-edge) :|fallback|) do ;; fallback edges are stored and resurrected during parsing if ;; lexical lookup fails (push edge *fallback-medges*) else do (augment-tchart-from-saf-edge-aux edge))) (defun augment-tchart-from-saf-edge-aux (edge) (let ((from (edge-from edge)) (to (edge-to edge))) (unless (and (integerp from) (integerp to)) (format t "~&WARNING: ignoring malformed chart edge '~a' (from='~a', to='~a')" (edge-id edge) from to) (return-from augment-tchart-from-saf-edge-aux)) (push edge (aref *tchart* to 0)) (push edge (aref *tchart* from 1)) (when (> to *tchart-max*) ;; (format t "~%WARNING: increasing *tchart-max* to ~a" to) (setf *tchart-max* to))) *tchart*) ;; [bmw] very basic unknown word mechanism (doesn't require SMAF XML input) ;; set to grammar type for unknown words ;; (must also set *fallback-pos-p* to T) (defvar smaf:*unknown-word-type* nil) ;; generate fallback edges, then add them to tchart (defun augment-tchart-with-fallback-morphop nil (loop for medge in (get-fallback-morphop-edges) for children = (edge-children medge) ;; for grammar-type = ;; (dag-type ;; (tdfs-indef ;; (lex-entry-full-fs ;; (slot-value medge 'l-content)))) do (format t "~&;;; WARNING: adding fallback edge ~a for unknown token ~a" (edge-id medge) children) (add-edge-to-tchart medge))) ;; generate fallback edges (defun get-fallback-morphop-edges nil (or (and smaf:*unknown-word-type* ;; very basic mechanism, same for all unknown words (loop for tedge in (get-unanalysed-and-unspanned-tedges) for e-from = (edge-from tedge) for e-to = (edge-to tedge) for children = (list tedge) for leaf-edges = children for children-words = (loop for l in leaf-edges collect (token-edge-string l)) for cfrom = (edge-cfrom tedge) for cto = (edge-cto tedge) for form = (str-list-2-str children-words) for stem = (string-upcase form) for dummy-entry = (get-dummy-unexpanded-lex-entry stem :unifs (list (cons :|type| (2-str smaf:*unknown-word-type*))) :gmap '((:|type| NIL :sym))) collect (make-morpho-stem-edge :id (next-edge) :children children :leaves (loop for x in leaf-edges collect (edge-string x)) :from e-from :to e-to :cfrom cfrom :cto cto :string form :word (string-upcase form) :current (string-upcase form) :stem stem :partial-tree nil :l-content (cons :full dummy-entry) )) ) (and t ;; more sophisticated mechanism ;; (eg. map POS tags into grammar types) (loop with tedges = (get-unanalysed-and-unspanned-tedges) for medge in *fallback-medges* for children = (edge-children medge) when (intersection children tedges) ;; eg. any children are unanalysed collect medge)))) ;; and slot edge into *tchart* from/to array (defun add-edge-to-tchart (edge) (let ((from (edge-from edge)) (to (edge-to edge))) (push edge (aref *tchart* to 0)) (push edge (aref *tchart* from 1)) (when (> to *tchart-max*) ;; (format t "~%WARNING: increasing *tchart-max* to ~a" to) (setf *tchart-max* to))) *tchart*) ;; input: edge of type 'tok' or 'tok+morph' (defun saf-edge-to-tedge (saf-edge addressing) (unless (or (string= "tok" (saf:l-edgeType saf-edge)) (string= "tok+morph" (saf:l-edgeType saf-edge))) (error "edgeType='tok' expected (got '~a')" (saf:l-edgeType saf-edge))) (with-slots (smaf:id smaf:source smaf:target smaf:from smaf:to smaf:l-content) saf-edge (let* ((tokenStr (smaf:saf-fs-feature-value2 smaf:l-content :|tokenStr|)) (e-id (if (string= "tok" (saf:l-edgeType saf-edge)) (smaf-id-to-edge-id smaf:id) (HIDDEN-smaf-id-to-edge-id smaf:id :token))) (tedge (make-token-edge :id e-id :from (smaf-node-to-chart-node smaf:source) :to (smaf-node-to-chart-node smaf:target) :string tokenStr :cfrom (smaf:point-to-char-point smaf:from addressing) :cto (smaf:point-to-char-point smaf:to addressing) :word (string-upcase tokenStr) :leaves (list tokenStr)))) (list tedge)))) (defun HIDDEN-smaf-id-to-edge-id (smaf-id hidden) (let* ((h-str (case hidden (:token "T") (t (error "expected :token, got ~S" hidden)))) (id (concatenate 'string h-str smaf-id)) (match (cdr (assoc id *HIDDEN-smaf-id-to-edge-id* :test #'string=)))) (if match (return-from HIDDEN-smaf-id-to-edge-id match)) ;; new edge id (push (cons id (incf *edge-id*)) *HIDDEN-smaf-id-to-edge-id*) *edge-id*)) (defun smaf-id-to-edge-id (smaf-id) (let ((match (cdr (assoc smaf-id *smaf-id-to-edge-id* :test #'string=)))) (if match (return-from smaf-id-to-edge-id match)) ;; new edge id (push (cons smaf-id (incf *edge-id*)) *smaf-id-to-edge-id*) *edge-id*)) (defun edge-id-to-smaf-id (edge-id) (let ((match (car (rassoc edge-id *smaf-id-to-edge-id* :test #'equalp)))) (if match (return-from edge-id-to-smaf-id match)))) ;; init/final dealt with via correct initialization of mapping (defun smaf-node-to-chart-node (smaf-node) (let ((match (cdr (assoc smaf-node *smaf-node-to-chart-node* :test #'string=)))) (if match (return-from smaf-node-to-chart-node match)) ;; new chart node (push (cons smaf-node (incf *chart-node*)) *smaf-node-to-chart-node*) *chart-node*)) ;; id: first char ignored, rest gives integer #+:null (defun id-to-int (id &key (generate t)) (handler-case (let ((i (if id (parse-integer (subseq id 1))))) (if i (and (setf *edge-id* (max i *edge-id*)) i) (if generate (- (incf *edge-id*))))) (error (condition) (error "unable to convert malformed id `~a': ~a" id condition)))) ;; input: saf edge of type 'tok' or 'tok+morph' (defun saf-edge-to-medge (saf-edge addressing) (unless (or (string= "morph" (saf:l-edgeType saf-edge)) (string= "tok+morph" (saf:l-edgeType saf-edge))) (error "'morph' edge expected (got '~a')" (saf:l-edgeType saf-edge))) ;; assumes tedges already in chart (with-slots (smaf:id smaf:source smaf:target smaf:deps smaf:l-content smaf:from smaf:to) saf-edge (let* ((children (cond ((string= "tok+morph" (saf:l-edgeType saf-edge)) ;; find hidden 'tok' edge (let ((child (smaf-id-to-token-edge smaf:id (get-tedges *tchart*) :hidden :token))) (if child (list child)))) (t ;; derive child edges from saf deps (loop for d in smaf:deps for tedge = (smaf-id-to-token-edge d (get-tedges *tchart*)) if tedge collect tedge else do (format t "~&WARNING: missing SMAF edge '~a' (child of '~a')" d smaf:id))))) (leaf-edges children) ;;fix me (children-words (loop for l in leaf-edges collect (token-edge-string l))) (form (str-list-2-str children-words)) (stem (string-upcase (or (smaf:saf-fs-feature-value2 smaf:l-content :|stem|) form))) (partialTree (smaf:saf-fs-feature-value2 smaf:l-content :|partialTree|)) (partialTree2 (smaf:saf-fs-feature-value2 smaf:l-content :|+partialTree|)) (gmap-unifs (smaf:get-gmap-unifs smaf:l-content)) (dummy-entry (get-dummy-unexpanded-lex-entry form :unifs gmap-unifs :gmap (saf:config-gmap saf:*config*) :rmrs (smaf:saf-fs-feature-value2 smaf:l-content :|rmrs|) )) (inject (get-inject gmap-unifs (saf:config-gmap saf:*config*))) (l-content (if (smaf:saf-fs-feature-value2 (smaf:saf-edge-l-content saf-edge) :|inject|) (and inject (cons :inject inject)) (and dummy-entry (cons :full dummy-entry)))) (e-from (smaf-node-to-chart-node smaf:source)) (e-to (smaf-node-to-chart-node smaf:target)) (cfrom (or (smaf:point-to-char-point smaf:from addressing) (get-min-edge-cfrom children))) (cto (or (smaf:point-to-char-point smaf:to addressing) (get-max-edge-cto children))) (partial-tree (or (smaf:saf-fs-partial-tree-2-list-partial-tree partialTree) (saf-plus-2-list-partial-tree partialTree2) )) err-flag medges medge ) (unless leaf-edges (format t "~&ERROR: no leaf-edges for SMAF edge '~a'" smaf:id) (return-from saf-edge-to-medge)) (unless (or stem dummy-entry) (format t "~&WARNING: no stem/gType for SMAF edge '~a'" smaf:id) (setf err-flag t)) (unless (integerp e-from) (format t "~&WARNING: missing source for SMAF edge '~a'" smaf:id) (setf err-flag t)) (unless (integerp e-to) (format t "~&WARNING: missing target for SMAF edge '~a'" smaf:id) (setf err-flag t)) (when (and (integerp e-from) (integerp e-to) (not (= e-from (leaf-edges-from leaf-edges)))) (format t "~&WARNING: source mismatch between SMAF edge '~a' and it's daughters" smaf:id) (setf err-flag t)) (when (and (integerp e-from) (integerp e-to) (not (= e-to (leaf-edges-to leaf-edges)))) (format t "~&WARNING: target mismatch between SMAF edge '~a' and it's daughters" smaf:id) (setf err-flag t)) (unless err-flag (push (setf medge (make-morpho-stem-edge :id (smaf-id-to-edge-id smaf:id) :children children :leaves (loop for x in leaf-edges collect (edge-string x)) :from e-from :to e-to :cfrom cfrom :cto cto :string form :word (string-upcase form) :current (string-upcase form) :stem stem :partial-tree partial-tree :l-content l-content )) medges) (when (smaf:saf-fs-feature-value2 (smaf:saf-edge-l-content saf-edge) :|analyseMorph|) (loop with morph-analyses = (get-morph-analyses (string-upcase form)) for (stem . partial-tree) in morph-analyses when partial-tree do (push (make-morpho-stem-edge :id (next-edge) :children (list medge) :leaves (loop for x in leaf-edges collect (edge-string x)) :from e-from :to e-to :cfrom cfrom :cto cto :string form :word (string-upcase form) :current stem :stem stem :partial-tree partial-tree :l-content l-content ) medges)))) medges))) (defun get-inject (unifs gmap) (loop for (feat . val) in unifs for (feat2 path2 type) = (find feat gmap :key #'first) for path = ;;HACK! *ersatz-carg-path* overrides :|carg| path (or (and (eq feat :|carg|) saf:*ersatz-carg-path*) path2) for decoded-val = (if (eq type :sym) (intern val) val) do (setf feat2 feat2) ;; hack to get rid of compiler warning unless (and (eq feat :|carg|) (null saf:*ersatz-carg-path*)) ;;HACK! see above collect (make-unification :lhs (make-path :typed-feature-list path) :rhs (make-u-value :type decoded-val)))) (defun smaf-id-to-token-edge (id tedges &key hidden) (if hidden (find (HIDDEN-smaf-id-to-edge-id id hidden) tedges :key #'token-edge-id :test #'=) (find (smaf-id-to-edge-id id) tedges :key #'token-edge-id :test #'=))) #+:null (defun str-2-int (str) (if str (parse-integer str))) ;; ;; ; ;; (defun mps-str (x) (or (encode-mixed-as-str x) "")) (defvar mrs::*main-semantics-path*) ;(GET-DUMMY-UNEXPANDED-LEX-ENTRY "LDA" ; :UNIFS ((:|carg| . "bmw[Li+].CC(C)[N-]C(C)C") (:|pred|) ; (:|type| . "n_proper_nale")) ; :GMAP ((:|carg| (SYNSEM LKEYS KEYREL CARG) :STR) ; (:|pred| (SYNSEM LKEYS KEYREL PRED) :SYM) (:|type| NIL :SYM))) ;; mixed up code: unifs shadowing unifs! (defun get-dummy-unexpanded-lex-entry (orth &key unifs gmap rmrs) (let* ((unifs2 (loop for (key . val) in unifs for (c-dummy c-path c-type) = (find key gmap :key #'car) for val2 = (mps-str (if (eq c-type :sym) (intern val) val)) collect (list val2 c-path '(MIXED)) do (setf c-dummy c-dummy))) (lex-entry (get-dummy-unexpanded-lex-entry2 orth :unifs unifs2)) ;; need: mrs:s:*semi* and mrs::*meta-semi* (mrs (and rmrs (mrs::convert-rmrs-to-mrs rmrs))) (rels (and mrs (mrs::psoa-liszt mrs))) (rels-unifs (and rels (mrs::create-unifs-from-rels2 rels mrs::*main-semantics-path*)))) (when lex-entry (setf (lex-entry-unifs lex-entry) (append rels-unifs (lex-entry-unifs lex-entry)))) lex-entry)) ;(GET-DUMMY-UNEXPANDED-LEX-ENTRY2 "LDA" ; :UNIFS (("\"bmw[Li+].CC(C)[N-]C(C)C\"" (SYNSEM LKEYS KEYREL CARG) (MIXED)) ; ("" (SYNSEM LKEYS KEYREL PRED) (MIXED)) ; ("n_proper_nale" NIL (MIXED)))) ;; (val path type)* (defun get-dummy-unexpanded-lex-entry2 (orth &key unifs) (when unifs (loop with dfn = '((:ID :|name| "" (SYM)) (:ORTH :|orthography| "" (STR-RAWLST)) (:UNIFS :|orthography| "(stem)" (STR-LST))) with vals = (list "" orth) with keys = '(:|name| :|orthography|);;! for (val path type) in unifs for key from 0 do (push (list :unifs key (format nil "~S" path) type) dfn) (push val vals) (push key keys) finally (return (make-psort-struct2 vals keys :dfn dfn))))) (defun read-smaf-conf (x) (format t "~&;;; reading SMAF config file '~a'" x) (setf saf:*config* (saf:conf-read-file x)) ; (saf:get-saf-l-map x) t) ;; (defun run-parse-server (&rest rest) (apply 'saf:run-parse-server rest)) (defun run-fspp-server (&rest rest) (apply 'saf:run-fspp-server rest)) ;; ;; print token counts for all unanalysed tokens (defun report-unanalysed-tokens nil (let ((hash (make-hash-table :test #'equalp)) count-toks) (loop for tok in *unanalysed-tokens* for count = (or (gethash tok hash) 0) do (setf (gethash tok hash) (1+ count))) (setf count-toks (loop for tok being each hash-key in hash for count = (gethash tok hash) collect (cons count tok))) (loop for (count . tok) in (sort count-toks #'> :key #'car) do (format t "~%~a ~a" count tok)))) ;; ;; batch processing ;; SENTENCE -> PARSE ;; (defun process-standoff-sentence-file (filename &key show-parse) (process-saf-file-sentences filename :show-parse show-parse)) (defun process-saf-file-sentences (filename &key (show-parse t) reset-unanalysed-tokens &allow-other-keys) (with-open-file (ofile (merge-pathnames (make-pathname :name (format nil "~a.out" (pathname-name (pathname filename)))) (pathname filename)) :direction :output :if-exists :supersede :if-does-not-exist :create) (format t "~&;;; Input sentence file: ~a" filename) (format t "~&;;; Output file: ~a" (namestring ofile)) (process-saf-sentences (saf:xml-to-saf-object (saf:read-file-to-string filename) :dir (pathname-directory (pathname filename))) :ostream ofile :show-parse show-parse :reset-unanalysed-tokens reset-unanalysed-tokens))) (defun process-saf-sentences (saf &key (ostream t) show-parse reset-unanalysed-tokens pprint) (let* ((textfilename (saf:saf-meta-document (saf:saf-meta saf))) (text (if textfilename (saf:read-file-to-string textfilename)))) (format t "~&;;; Data file: ~a" (saf:saf-meta-document (saf:saf-meta saf))) (format ostream "~a" (saf:saf-header (saf:make-saf-meta :addressing :|char| :document (saf:saf-meta-document (saf:saf-meta saf))))) (when reset-unanalysed-tokens (setf *unanalysed-tokens* nil)) (loop for s in (sort (loop for e in (saf:saf-lattice-edges (saf:saf-lattice saf)) when (eq :|sentence| (saf:saf-edge-type e)) collect e) #'< :key #'(lambda (x) (or (saf:point-to-char-point (saf:saf-edge-from x) :|char|) -1))) for from = (saf:saf-edge-from s) for to = (saf:saf-edge-to s) unless (and from to) do (format t "~&~%CANNOT PROCESS SENTENCE ~a due to null pointer: from=~a to=~a" (saf:saf-edge-id s) from to) when (and from to) do (format t "~&~%PROCESSING SENTENCE ~a: ~& ~a" (saf:saf-edge-id s) (saf:x-span text from to (saf:saf-meta-addressing (saf:saf-meta saf))) ) (time (handler-case (cond ((saf:saf-meta-document (saf:saf-meta saf)) (let ((*generate-messages-for-all-unanalysed-tokens* t) ;(*char-map-add-offset* ;(point-to-char-point (saf-edge-from s) :|char|)) ) (setf saf:*char-map-add-offset* (saf:point-to-char-point (saf:saf-edge-from s) :|char|)) (x-parse text (saf:saf-edge-from s) (saf:saf-edge-to s) (saf:saf-meta-addressing (saf:saf-meta saf)) :document (saf:saf-meta-document (saf:saf-meta saf)) :char-map #'saf:char-map-add-x :show-parse show-parse))) (t (x-parse (saf:saf-edge-content s) nil nil nil :document nil :show-parse show-parse))) (storage-condition (condition) (format t "~&Memory allocation problem: ~A" condition)) #+:allegro (EXCL:INTERRUPT-SIGNAL () (error "Interrupt-Signal")) (error (condition) (format t "~&Error: ~A" condition)) )) (saf:dump-sentence-analyses s :stream ostream :pprint pprint)) (format ostream "~&"))) (defvar saf:*document* nil) (defvar *text* nil) (defun x-parse (text from to addressing &key document (char-map #'identity) (show-parse t)) (unless (preprocessor:preprocessor-initialized-p) (error "please load preprocessor")) (setf saf:*document* document) (let ((str (cond ((and from to addressing) (saf:x-span text from to addressing)) ((and (null from) (null to) (null addressing)) text) (t (error "from/to/addressing=~a/~a/~a" from to addressing)))) (preprocessor:*local-to-global-point-mapping* char-map) (*text* text) (old-x-fspp-global (preprocessor:x-fspp-global preprocessor:*preprocessor*)) ) (setf *sentence* str) ; ; (format t "~%~%=.~a.~%~%" preprocessor:*local-to-global-point-mapping*) ; (parse (preprocessor:x-preprocess str :format :maf) show-parse) (setf (preprocessor:x-fspp-global preprocessor:*preprocessor*) old-x-fspp-global) t)) (defvar saf:*morph-rule-map* nil) ;; [for testing purposes] ;(defvar smaf:*morph-rule-map* ; '(("M1" . "THIRD_SG_FIN_VERB_ORULE") ; ("M2" . "PUNCT_PERIOD_ORULE") ; ("M0" . "PLUR_NOUN_ORULE"))) ;; support for external morphology built up from separate slots ;; and where external morph names are mapped into grammal rules (defun saf-plus-2-list-partial-tree (plus-list) (loop for x2 in plus-list for x = (or (cdr (assoc x2 saf:*morph-rule-map* :test #'string=)) x2) collect (list (intern x :lkb) nil))) ;; ;; (defun concatenate-strings (x) (apply #'concatenate (cons 'string x))) (defun split-str-on-spc (str) (mapcar #'car (split-on-spc str))) ;; return list of (WORD-STRING FROM TO) ;; where FROM, TO are char offsets (defun split-on-spc (preprocessed-string) (remove "" (loop with c-list = (coerce preprocessed-string 'list) with c-list-word with from = 0 for c in c-list for i from 1 to (length c-list) if (char= c #\Space) collect (list (coerce (nreverse c-list-word) 'string) from (1- i)) into words and do (setf from i) and do (setf c-list-word nil) else do (push c c-list-word) finally (return (append words (list (list (coerce (nreverse c-list-word) 'string) from i))))) :key #'car :test #'string=)) ;; ;; XML test suite ;; (defvar *xml-test-neg* nil) (defvar *xml-test-pos* nil) (defun xml-test-initialise nil (setf *xml-test-neg* nil) (setf *xml-test-pos* nil)) (defun xml-test-file (filename) (handler-case (with-open-file (s filename) (s-xml:parse-xml s) (push filename *xml-test-pos*)) (error (condition) (format t "~&FILENAME: ~a Error: ~A" filename condition) (push (cons filename condition) *xml-test-neg*)))) #+:allegro (defun xml-test-files (pattern) (xml-test-initialise) (loop for filename in (excl.osi:command-output (format nil "ls ~a" pattern)) do (xml-test-file filename) )) ;; ;; #+:null (defvar *maf-token-id-counter* 0) ;;; ;;; BASIC XML TO MAF TOKENS ;;; also MAF TOKENS <-> TCHART #+:null (defun tchart-to-maf-tokens (&optional (tchart *tchart*)) (tchart-to-maf tchart :wordforms nil)) ;;; ;;; MAF TOKENS TO MAF WORDFORMS ;;; ;;; ;;; tchart -> maf-wordforms.xml ;;; #+:null (defun tchart-to-maf-wordforms (&optional (tchart *tchart*)) (tchart-to-maf tchart :wordforms t)) ;;; ;;; MAF to tchart mapping ;;; (defun leaf-edges-from (leaf-edges) ; (unless leaf-edges ; (error "leaf-edges is null")) (and leaf-edges (apply #'min (mapcar #'edge-from leaf-edges)))) (defun leaf-edges-to (leaf-edges) ; (unless leaf-edges ; (error "leaf-edges is null")) (and leaf-edges (apply #'max (mapcar #'edge-to leaf-edges)))) ;;; ;;; tchart to (S)(M)AF mapping ;;; (defun tchart-to-saf (&optional (tchart *tchart*) &key (wordforms t)) (tchart-to-maf tchart :wordforms wordforms :saf t)) (defun tchart-to-maf (&optional (tchart *tchart*) &key (wordforms t) saf) (initialize-smaf-id-to-edge-id-from-tchart) (let* ((strm (make-string-output-stream)) (tedges (get-tedges tchart)) (medges (if wordforms (get-medges tchart)))) (format strm "~a" (smaf:saf-header (smaf:make-saf-meta :document nil :addressing :|char|))) (format strm "~a" (fsm-xml tedges medges :saf saf)) (if saf (format strm "") (format strm "")) (get-output-stream-string strm))) (defun fsm-xml (tedges medges &key saf) (let* ((strm (make-string-output-stream)) (v-min (loop for x in tedges minimize (edge-from x))) (v-max (1+ (loop for x in tedges maximize (edge-from x))))) (format strm "" v-min v-max) ;; states (loop for i from v-min to v-max do (format strm "" i)) ;; token edges (loop for tedge in tedges do (format strm "~a" (tedge-to-token-xml tedge :saf saf))) ;; wordform edges (loop for medge in medges do (format strm "~a" (medge-to-wordform-xml medge :saf saf))) (format strm "") (get-output-stream-string strm))) (defun medge-to-wordform-xml (medge &key saf) (with-slots (from to string stem partial-tree id) medge (cond (saf (concatenate 'string (format nil "" ;(if (caar partial-tree) ; (cl-ppcre:regex-replace "_INFL_RULE$" (string (caar partial-tree)) "") ;"") (edge-id-to-smaf-id id) (edge-to-tokens-id-str medge) from to) (format nil "") (format nil "~a" (stem-to-fs stem)) (if partial-tree ;(format nil "~a" (partial-tree-to-fs-lazy partial-tree))) (format nil "~a" (partial-tree-to-fs partial-tree))) (format nil "") (format nil ""))) (t (concatenate 'string (format nil "" (edge-to-tokens-id-str medge) from to) (format nil "") (format nil "~a" (stem-to-fs stem)) (if partial-tree (format nil "~a" (partial-tree-to-fs partial-tree))) ;(format nil "~a" (partial-tree-to-fs-lazy partial-tree))) (format nil "") (format nil "")) )))) ;; store as lisp list text (defun partial-tree-to-fs (p-tree) (format nil "~a" (partial-tree-to-fs2 p-tree))) (defun partial-tree-to-fs2 (p-tree) (if (null p-tree) "" (format nil "~a~a" (partial-tree-elt-to-fs (car p-tree)) (partial-tree-to-fs2 (cdr p-tree))))) (defun partial-tree-elt-to-fs (p-tree-elt) (format nil "~a~a" (xml-escape (format nil "~a" (first p-tree-elt))) (xml-escape (format nil "~a" (second p-tree-elt))))) (defun partial-tree-to-fs-lazy (p-tree) (concatenate 'string (format nil "") (xml-escape (format nil "~S" p-tree)) (format nil ""))) (defun stem-to-fs (stem) (concatenate 'string (format nil "") (xml-escape (format nil "~a" stem)) (format nil ""))) (defun edge-to-leaf-token-edges (edge) (cond ((token-edge-p edge) (edge-id edge)) (t (loop with children = (or (edge-children edge) (edge-tchildren edge) (error "children or tchildren expected in edge ~a" edge)) with tedges = (extract-descendant-tedges children) for tedge in tedges collect tedge)))) (defun edge-to-tokens-id-str (edge) (concatenate-strings (cdr (loop for tedge in (edge-to-leaf-token-edges edge) for id = (edge-id tedge) collect " " collect (edge-id-to-smaf-id id))))) (defun extract-descendant-tedges (children) (loop for child in children append (cond ((edge-children child) (extract-descendant-tedges (edge-children child))) ((edge-tchildren child) (extract-descendant-tedges (edge-tchildren child))) (t (if (token-edge-p child) (list child) (error "unexpected non-token-edge leaf edge ~a" child)))))) (defun tedge-to-token-xml (tedge &key saf) ;; use cfrom/cto in place of xfrom/xto (with-slots (id from to string cfrom cto) tedge (cond (saf (format nil "" (xml-escape (format nil "t~a" id)) (xml-escape (2-str (or cfrom "?"))) (xml-escape (2-str (or cto "?"))) (xml-escape string) (xml-escape (or (2-str from) "?")) (xml-escape (or (2-str to) "?"))) ) (t (format nil "" (xml-escape (format nil "t~a" id)) (xml-escape (2-str (or cfrom "?"))) (xml-escape (2-str (or cto "?"))) (xml-escape string) (xml-escape (or (2-str from) "?")) (xml-escape (or (2-str to) "?"))) )))) ;; assume xpoint order is string order for now (defun x< (x y) (string< x y)) (defun x> (x y) (string> x y)) (defun x= (x y) (string= x y)) (defun char-offset-to-xpoint (i) (unless (integerp i) (error "char offset must be integer")) (format nil "/1/1.~a" i)) (defun xpoint-to-char-offset (xp) (unless (stringp xp) (error "expected Xpoint as string")) (unless (and (>= (length xp) 5) (string= (subseq xp 0 5) "/1/1.")) ;;temporary hack (return-from xpoint-to-char-offset -1) (error "unhandled Xpoint ~a (work in progress)" xp)) (read-from-string (subseq xp 5))) ;; assume for now XML root element contains only CDATA (defun xpoint-range (xml xrange) (let ((x-from (car xrange)) (x-to (cdr xrange))) (let* ((p-xml (s-xml:parse-xml-string xml)) (text (second (car (member '|text| p-xml :key #'car)))) (c-from (xpoint-to-char-offset x-from)) (c-to (xpoint-to-char-offset x-to))) (unless (and (>= c-from 0) (<= c-to (length text))) (error "Xpoint out of range")) (subseq text c-from c-to))))