;;; Copyright (c) 1991-2018 ;;; John Carroll, Ann Copestake, Robert Malouf, Stephan Oepen,Ben Waldron ;;; see `LICENSE' for conditions. ;;; AAC April 2005 - rationalisation of morphology and MWEs ;;; - also preparation for chart output from tokeniser ;;; Notes 1. removed ltemplates - I don't have a grammar etc to test ;;; this mechanism, but I rather think it will be superseded by the ;;; new approach when we get this fully implemented ;;; 2. sensible messages when missing lexical entries (or unknown words) ;;; 3. more or less minor FIXes as below (unify-in) (in-package :lkb) ;;; This file implements the chart parser. ;;; The chart data structures: ;;; ;;; The token chart is a 2D array with dimension 0 being a vertex number v, and ;;; dimension 1 holding the edges ending at vertex v (0) / the edges starting at ;;; vertex v (1). ;;; ;;; The parse chart is a 2D array with dimensions being the start vertex and the ;;; end vertex. (defvar *parser-rules* nil) (defvar *parser-lexical-rules* nil) ;;; *chart-limit* is defined in globals.lsp (defvar *text* nil) (defvar *sentence* nil) (defvar *sentence-number* nil) (defparameter *tchart-max* 0 "set by the tokeniser to the maximum vertex in the tchart") (defparameter *chart-max* 0 "set by the stem edge addition code to the maximum vertex in the chart") (defvar *chart-dependencies*) (defvar *chart-generation-counter* 0 "counter used by the user interface to make sure parse tree windows etc have the current chart") (defparameter *unpacking-scoring-hook* nil) ;; [bmw] some globals for customization of the 'parse' function (defvar *generate-messages-for-all-unanalysed-tokens* nil) (defvar *unanalysed-tokens* nil) (defvar *maf-p* nil) (defvar *abort-parse-after-morphosyntax* nil) (defvar *abort-parse-after-lexical-lookup* nil) (defvar *abort-parse-if-no-spanning-morph-edge-path* nil) (defvar *fallback-pos-p* nil) ;;; ;;; [bmw] HOWTO collect into *unanalysed-tokens* tokens for which lexical lookup ;;; failed: ;;; - set *generate-messages-for-all-unanalysed-tokens* to T ;;; - set *clean-tchart-p* to T (ensures no duplicate token edges) ;;; - ... parse some input ... ;;; - look in *unanalysed-tokens* for bag of token instances ;;; ;;; ;;; recently added variant: use active key-driven parsing strategy; this seems ;;; to ;;; ;;; - perform better than the default unidirectional passive breadth-first ;;; search (up to 40 % time reduction on longer VerbMobil sentences); ;;; - simplify best-first search and generalize to more than just binary ;;; branching rules; ;;; - outperform passive best-first mode modestly. ;;; ;;; until tested somewhat better, the active parser is in experimental state, ;;; disabled by default, and hidden in `active.lsp'. ;;; (20-jun-99 - oe) ;;; ;;; now that we have tested the active parser a bit (and people had the time ;;; to get used to the idea of active parsing :-), turn it on by default. ;;; (20-jan-00 - oe) (defvar *active-parsing-p* t) ;;; ;;; *chart* contains lists of chart-configurations ;;; ;;; for now it seems best to keep the token and morphophonology ;;; chart separate from the `real' one. It seems possible that ;;; the number of vertices in *chart* may end up being ;;; greater than the number in *tchart* if we allow compounding. ;;; Keeping the charts distinct also avoids having to filter ;;; out the token edges and the morphop edges. 
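;;;
;;; For illustration only (a sketch, not part of the original code; `tedge'
;;; and `cc' stand for a token edge and its chart-configuration wrapper):
;;; a token edge spanning vertices 2-3 is indexed twice in *tchart*, once
;;; under its end vertex and once under its start vertex, whereas a passive
;;; parse edge over the same span lives in a single *chart* cell.
#|
(let ((cc (make-chart-configuration :begin 2 :edge tedge :end 3)))
  (push cc (aref *tchart* 3 0))   ; edges ending at vertex 3
  (push cc (aref *tchart* 2 1)))  ; edges starting at vertex 2
;; a passive parse edge over the same span would instead be stored as
;; (push cc (aref *chart* 2 3))
|#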
;;; (defvar *chart* nil) (defvar *tchart* nil) (defvar *parse-record* nil) (defstruct (chart-configuration) ;; this structure is specialised for the active parser (see active.lsp) ;; begin is a vertex - edge is an edge begin edge roots end) ;;; ;;; globally keep track of _all_ edges; the registry needs to be reset whenever ;;; we start a new context of computation, e.g. a top-level call to the parser ;;; or generator. ;;; _fix_me_ ;;; it would be tempting to encapsulate more information in the registry, i.e. ;;; the running counter(s) for edges of various types. (8-dec-06; oe) ;;; #| (defparameter *edge-registry* (make-hash-table)) (defun purge-edge-registry () (when (hash-table-p *edge-registry*) (clrhash *edge-registry*))) (defun register-edge (edge) (if (hash-table-p *edge-registry*) (setf (gethash (edge-id edge) *edge-registry*) edge) edge)) (defun retrieve-edge (&key id) (when (numberp id) (gethash id *edge-registry*))) |# ;;; hash table changed to a simple list of edges since edge registry is hardly ever ;;; looked up - JAC 27-May-2020 (defparameter *edge-registry* nil) (defun purge-edge-registry () (setq *edge-registry* nil)) (defun register-edge (edge) (push edge *edge-registry*) edge) (defun retrieve-edge (&key id) (when (numberp id) (find id *edge-registry* :key #'edge-id))) ;;; an edge is a structure ;;; it has the following properties: ;;; id - unique ;;; score - for best-first ;;; category - eg S VP etc - now a type ;;; rule - either the word (storms etc) or the grammar rule itself which has ;;; been applied to create that edge ;;; dag - the dag associated with the constituent ;;; odag - something to do with packing? ;;; dag-restricted - for packing ;;; leaves - orthography of whatever this edge has been formed from ;;; lex-ids - like leaves, but identifiers of whole structures ;;; parents - for active parsing ;;; children - daughter edges ;;; tchildren - token daughter edges (for debugging) ;;; orth-tdfs - fs containing orthography ;;; partial-tree - specification of affixes etc ;;; from - vertex ;;; to - vertex ;;; label - ? ;;; head - used in maxent.lisp ;;; cfrom - character position ;;; cto - character position ;;; string - original string - potentially redundant ;;; because it could be recovered from cfrom and cto ;;; but useful to keep it locally ;;; mrs - generation etc ;;; ;;; `flags', `foo,' `bar', and `baz' are junk slots for various pieces of code ;;; (e.g. the Redwoods annotation tool and PCFG estimation) to use at their ;;; discretion. (28-oct-02; oe) ;;; ;;; packed ;;; equivalent ;;; frozen ;;; adjuncts ;;; unpacking ;;; ;;; _note_ ;;; as of december 2006, some of the derived structures (viz. `dotted-edge' and ;;; `g-edge' further down in this file) have custom constructors too, hence it ;;; is important to propagate any changes to `edge' slots and arguments to its ;;; constructor into those structure definitions too. 
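;;;
;;; For illustration only (a sketch; `some-rule', `result-tdfs', `e1' and
;;; `e2' are hypothetical stand-ins for a rule, a result feature structure
;;; and two daughter edges): the custom constructor defined below derives
;;; from/to, cfrom/cto and lnk from the daughter edges when they are not
;;; supplied explicitly.
#|
(make-edge :id (next-edge)
           :rule some-rule                 ; the rule applied, or a word string
           :category (indef-type-of-tdfs result-tdfs)
           :dag result-tdfs
           :children (list e1 e2)          ; from/to, cfrom/cto, lnk default from these
           :leaves (append (edge-leaves e1) (edge-leaves e2)))
|#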
(22-dec-06; oe) ;;; (defstruct (edge (:constructor make-edge-x (&key id score category rule dag odag (dag-restricted (when dag (restrict-fs (tdfs-indef dag)))) leaves lex-ids parents children tchildren orth-tdfs partial-tree (from (when (edge-p (first children)) (edge-from (first children)))) (to (when (edge-p (first (last children))) (edge-to (first (last children))))) label head (lnk (let ((first (first children)) (last (first (last children)))) (when (and (edge-p first) (edge-p last)) (mrs::combine-lnks (edge-lnk first) (edge-lnk last))))) (cfrom (if (edge-p (first children)) (edge-cfrom (first children)) -1)) (cto (if (edge-p (first (last children))) (edge-cto (first (last children))) -1)) string mrs flags foo bar baz packed equivalent frozen adjuncts unpacking))) id score category rule dag odag dag-restricted leaves lex-ids parents children tchildren orth-tdfs partial-tree from to label head lnk cfrom cto string mrs flags foo bar baz packed equivalent frozen adjuncts unpacking) (defstruct (token-edge (:include edge)) word maf-id xfrom xto) (defstruct (morpho-stem-edge (:include edge)) word stem current l-content) (defun make-edge (&rest rest) (declare (dynamic-extent rest)) (register-edge (apply #'make-edge-x rest))) (defmethod print-object ((instance chart-configuration) stream) (format stream "[~S ~S]" (chart-configuration-edge instance) (chart-configuration-roots instance))) (defmethod print-object ((instance edge) stream) (format stream "#[Edge # ~d: `~(~a~)' <~{~a~^ ~}>" (edge-id instance) (let ((rule (edge-rule instance))) (if rule (typecase rule (string rule) (rule (rule-id rule))) (edge-category instance))) (loop for child in (edge-children instance) collect (if (edge-p child) (edge-id child) "_"))) (when (or (edge-packed instance) (edge-equivalent instance)) (format stream " {~{~a~^ ~}}" (loop for edge in (append (edge-packed instance) (edge-equivalent instance)) collect (edge-id edge)))) (when (edge-lnk instance) (format stream " ") (mrs:output-lnk (edge-lnk instance) :stream stream)) ;; print cfrom/cto (with-slots (cfrom cto) instance (format stream "~:[~2*~; (~A c ~A) ~]" (and cfrom cto) cfrom cto)) (format stream "]")) (defmethod print-object ((instance token-edge) stream) (format stream "#[Token edge # ~d: ~S ~S ~S" (token-edge-id instance) (token-edge-word instance) (token-edge-from instance) (token-edge-to instance)) ;; print cfrom/cto (with-slots (cfrom cto) instance (format stream "~:[~2*~; (~A c ~A) ~]" (and cfrom cto) cfrom cto)) (format stream "]")) (defmethod print-object ((instance morpho-stem-edge) stream) (format stream "#[Morph edge # ~d: ~S ~S ~S ~S ~A" (morpho-stem-edge-id instance) (morpho-stem-edge-word instance) (morpho-stem-edge-current instance) (morpho-stem-edge-from instance) (morpho-stem-edge-to instance) (morpho-stem-edge-partial-tree instance)) ;; print cfrom/cto (with-slots (cfrom cto) instance (format stream "~:[~2*~; (~A c ~A) ~]" (and cfrom cto) cfrom cto)) (format stream "]")) (defvar *edge-id* 0) ;;; ;;; when unpacking, edges are a lot cheaper, as we are deterministic at this ;;; point (e.g. there will be very few failing unifications, if any). hence, ;;; impose a separate limit for unpacking, although it could be argued that we ;;; should count on top of the edges built already (like PET does). i prefer ;;; independent counting, however, since active edges (in generation at least) ;;; are also included in the edge count, but we could dispose of them when we ;;; move on into unpacking. 
;;; (26-nov-04; oe)
;;;
(defparameter %edge-allowance% 0)

(defun next-edge (&optional type)
  ;; return new id for a passive edge (not active)
  ;; edges created during unpacking are accounted for separately and do not count
  ;; towards the overall *maximum-number-of-edges* limit - this gives more control
  ;; over unpacking
  (if (eq type :unpack)
      (incf (statistics-uedges *statistics*))
      (incf (statistics-pedges *statistics*)))
  (if (eq type :unpack)
      (when (>= (incf %edge-allowance%) *unpack-edge-allowance*)
        (error "Hit limit of ~a edges built during unpacking ~
                (see documentation for *unpack-edge-allowance*)"
               *unpack-edge-allowance*))
      (when (>= *edge-id* (+ *maximum-number-of-edges* %edge-allowance%))
        (error "Hit limit of ~a chart edges, possibly due to a runaway rule ~
                (see documentation for *maximum-number-of-edges*)"
               *maximum-number-of-edges*)))
  (incf *edge-id*))

;;;
;;; keep track of lexical items used in each parse: whenever we use the same
;;; lexical entry the second time, we better copy it first to avoid spurious
;;; reentrancies. this mechanism should suffice to remove a test in the parser
;;; that currently checks for duplicates among the edges feeding into a rule.
;;; --- come to think of it, this might also be useful in resetting temporary
;;; pointers in feature structures that are part of the grammar ...
;;;
(defvar *lexical-entries-used* nil)

;;; *****************************************************************
;;;
;;; Agenda stuff - the agenda is represented as a heap (as in Cormen,
;;; Leiserson, and Rivest). The heap is a tree stored in an array: the car of
;;; each array element is the key and the cdr is the value.
;;;
;;; ******************************************************************

(defmacro heap-size (a) `(the fixnum (aref ,a 0)))

(defun heapify (a i)
  (macrolet ((left (i) `(the fixnum (* (the fixnum ,i) 2)))
             (right (i) `(the fixnum (1+ (* (the fixnum ,i) 2)))))
    (let* ((l (left i))
           (r (right i))
           ;; largest of node i and its left child; the right child is
           ;; considered below
           (largest (if (and (<= l (heap-size a))
                             (> (car (aref a l)) (car (aref a i))))
                        l
                        i)))
      (when (and (<= r (heap-size a))
                 (> (car (aref a r)) (car (aref a largest))))
        (setq largest r))
      (unless (= largest i)
        (rotatef (aref a i) (aref a largest))
        (heapify a largest)))))

(defun agenda-insert (a key value)
  (macrolet ((parent (i) `(the fixnum (floor (the fixnum ,i) 2))))
    (incf (heap-size a))
    (when (>= (heap-size a) (array-dimension a 0))
      (if (>= (array-dimension a 0) *maximum-number-of-tasks*)
          (error "Hit limit of ~a pending tasks ~
                  (see documentation for *maximum-number-of-tasks*)"
                 *maximum-number-of-tasks*)
          (setq a (adjust-array a
                                (min (* (array-dimension a 0) 2)
                                     *maximum-number-of-tasks*)
                                :initial-element nil))))
    (loop
        with i = (heap-size a)
        while (and (> i 1) (< (car (aref a (parent i))) key))
        do
          (setf (aref a i) (aref a (parent i)))
          (setf i (parent i))
        finally (setf (aref a i) (cons key value)))
    value))

(defun agenda-extract-max (a)
  (when (< (heap-size a) 1)
    (error "Agenda size is less than 1. Something's wrong with the parser."))
  (let* ((max (aref a 1))
         (key (car max))
         (value (cdr max)))
    (setf (aref a 1) (aref a (heap-size a)))
    (decf (heap-size a))
    (heapify a 1)
    (setf (cdr max) nil) ; allow value to be GCed
    (values value key)))

(defun new-agenda ()
  (let ((a (make-array (min 4 *maximum-number-of-tasks*) ; must start small and expand as needed
                       :adjustable t :initial-element nil)))
    (setf (heap-size a) 0)
    a))

(defun agenda-empty-p (a) (zerop (heap-size a)))

(defun flush-agenda (a)
  (when (> (array-dimension a 0) *maximum-number-of-tasks*) ; user has reduced the limit
    (setq a (adjust-array a *maximum-number-of-tasks*)))
  (when (cdr (aref a 1))
    ;; agenda contains old value(s), so clear completely to allow them to be GCed
    (fill a nil))
  (setf (heap-size a) 0))

(defmacro do-agenda ((var agenda &optional result) &body body) ; like dolist
  (let ((a (gensym)) (n (gensym)))
    `(loop
         with ,a = ,agenda
         for ,n from 1 to (heap-size ,a)
         for ,var = (cdr (aref ,a ,n))
         do ,@body
         finally (return ,result))))

(defmacro with-agenda (priority &body body)
  ;; inelegant repetition of .body. in the expansion - but we don't want to
  ;; unnecessarily heap-allocate a closure in the non-agenda case
  (let ((p (gensym)))
    `(let ((,p ,priority))
       (if ,p
           (agenda-insert *agenda* ,p #'(lambda () ,@body))
           (progn ,@body)))))

(defvar *agenda* (new-agenda))

(defun clear-chart nil
  (purge-edge-registry)
  (incf *chart-generation-counter*)
  (setf *parse-record* nil)
  (let ((clen (1+ *chart-limit*)))
    (unless (and (arrayp *chart*) (= (array-dimension *chart* 0) clen)
                 (arrayp *tchart*) (= (array-dimension *tchart* 0) clen))
      (setq *chart* (make-array (list clen clen)))
      (setq *tchart* (make-array (list clen 2)))))
  (dotimes (i (array-total-size *chart*))
    (setf (row-major-aref *chart* i) nil))
  (dotimes (i (array-total-size *tchart*))
    (setf (row-major-aref *tchart* i) nil))
  (setf *tchart-max* 0)
  (setf *chart-max* 0)
  (setf *edge-id* 0)
  (setf %edge-allowance% 0)
  (when *active-parsing-p* (clear-achart)))

(defvar *cached-category-abbs* nil
  "variable used in output to avoid recomputation of tree nodes")

#+(or :allegro :lispworks :ccl :sbcl)
(defvar *parser-lock* (mp:make-process-lock))

(defmacro with-parser-lock ((&optional foo) &body body)
  (declare (ignore foo))
  #+(or :allegro :lispworks :ccl :sbcl)
  `(mp:with-process-lock (*parser-lock*) ,@body)
  #-(or :allegro :lispworks :ccl :sbcl)
  `(progn ,@body))

;;;
;;; satisfy measurement fetish: list used to store (cpu) time used to find
;;; individual readings: bottom element is start time for parse(), topmost is
;;; end time; in best-first mode, additional elements record time for finding
;;; an analysis, one per reading. (24-feb-99 - oe)
;;;
(defparameter *parse-times* nil)

(defparameter *show-parse-p* t)

(defvar *brackets-list* nil)

;;; ****************************************************************
;;;
;;; Entry point to this group of functions is parse, which is passed the
;;; sentence as a list of strings and is called from the top level
;;;
;;; ****************************************************************

;;; Revised approach to tokenisation, morphology and MWEs
;;; AAC April 2005
;;; The chart is used for all of these, replacing various
;;; subsidiary structures.
;;;
;;; The standard system has four phases:
;;;
;;; Phase 1 - the token list is used to instantiate the chart
;;; for simple tokenisers. For more complex cases, the tokeniser
;;; outputs a chart (possibly in XML if a separate tokeniser is used).
;;; *tchart* is instantiated with token-edges ;;; ;;; Phase 2 - the morphophonology component instantiates the ;;; chart with possible stems plus affixes. In the revised ;;; version of the LKB this is done on a rule-by-rule basis. ;;; *tchart* is instantiated with morpho-stem edges ;;; The edges are associated with a partially specified derivation tree. ;;; In the case where an external morphophonology component operates ;;; (such as sppp) *tchart* is instantiated with morpho-stem-edges ;;; with stem set ;;; ;;; Phase 3 - lexical lookup including MWE lookup ;;; - instantiate *chart* with normal edges ;;; ;;; Phase 4 - parsing - application of morphosyntax and lexical rules ;;; along with grammar rules. The morphosyntax rules respect ;;; the partially specified derivation tree from phase 2. ;;; Instantiate the chart with edges ;;; ;;; The following are possible variations (in principle) ;;; ;;; Input source: ;;; 1a) user input from LKB ;;; 1b) real input (with markup) ;;; 1c) test suite input ;;; ;;; Preprocessor: ;;; 2a) tokens only: `standard' preprocessor (strings) ;;; : `char' preprocessor (marked up with character positions) ;;; 2b) tokens and external morphophonology ;;; i) partial-tree morphology (current sppp case) ;;; :with-tokeniser-partial-tree ;;; ii) `word' morphosyntax ;;; i.e., morphology specified as a chart with morphemes separated ;;; :with-tokeniser-retokenise ;;; Partial tree morphology (for 2a case): ;;; 3a) (new) internal LKB morphology (one rule at a time) ;;; the default ;;; 3b) external morphology (one rule at a time) ;;; :external-rule-by-rule ;;; 3c) external morphology (full partial tree, as old LKB) ;;; :external-partial-tree (defun check-morph-options (input) ;;; make sure that plausible functions have been specified ;;; try and deal with all ickinesses until interface is ;;; clarified here (unless (member *morph-option* '(:default :distinct-mphon :external-rule-by-rule :external-partial-tree :with-tokeniser-partial-tree :with-tokeniser-retokenise)) (format t "~%Unrecognised *morph-option* switch ~A: resetting to default" *morph-option*) (setf *morph-option* :default)) (when (and (listp input) (consp (first input)) (loop for token in input thereis (assoc :analyses token))) ;;; current test for sppp (setf *morph-option* :with-tokeniser-partial-tree)) (if (or (eql *morph-option* :external-rule-by-rule) (eql *morph-option* :external-partial-tree)) (unless *foreign-morph-fn* (format t "~%*morph-option* ~A requires external morphology function *foreign-morph-fn* which is not supplied - resetting *morph-option* to default" *morph-option*) (setf *morph-option* :default)) (when *foreign-morph-fn* (format t "~%*morph-option* ~A requires *foreign-morph-fn* to be nil - resetting it" *morph-option*) (setf *foreign-morph-fn* nil)))) (defun first-only-p-normalise (x) (cond ((null x) nil) ((and (numberp x) (<= x 0)) nil) ((numberp x) (ceiling x)) (t 1))) ;; supported input: ;; - basic: "the" "dog" "barks" ;; - bracketed: "Kim" "(" "(" "likes" ")" "Sandy" ")" ;; - chared: #S(CHARED-WORD :WORD "The" :CFROM 0 :CTO 2) ;; #S(CHARED-WORD :WORD "cat" :CFROM 4 :CTO 6) ;; #S(CHARED-WORD :WORD "barks" :CFROM 8 :CTO 12)) ;; - s(m)af xml: "LKB20:46:50 1/18/2006 (UTC)THEDOGBARK((PLUR_NOUN_ORULE "BARKS"))BARK((THIRD_SG_FIN_VERB_ORULE "BARKS"))" ;; - SAF object: #[SAF] ;; - sppp: ((:END . 1) ;; (:START . 0) ;; (:ANALYSES ;; ((:RULES) ;; (:INFLECTION) ;; (:STEM . "kim"))) ;; (:TO . 3) ;; (:FROM . 0) ;; (:FORM . "kim")) ;; ((:END . 2) ;; (:START . 1) ;; (:ANALYSES ;; ((:RULES ;; ((:FORM . 
"sleeps") ;; (:ID . PLUR_NOUN_INFL_RULE)) ;; ((:FORM . "sleeps.") ;; (:ID . PUNCT_PERIOD_RULE))) ;; (:INFLECTION) ;; (:STEM . "sleep")) ;; ((:RULES ;; ((:FORM . "sleeps") ;; (:ID . THIRD_SG_FIN_VERB_INFL_RULE)) ;; ((:FORM . "sleeps.") ;; (:ID . PUNCT_PERIOD_RULE))) ;; (:INFLECTION) ;; (:STEM . "sleep"))) ;; (:TO . 10) ;; (:FROM . 4) ;; (:FORM . "sleeps.")) (defun parse (input &optional (show-parse-p *show-parse-p*) (first-only-p *first-only-p*)) ;; ;; keep track of mutual dependencies between various configurations: ;; - input bracketing is only available in passive mode; ;; - passive best-first restricted to unary and binary rules. ;; ;; if input is determined to be XML, read in as SMAF object (when (xml-p input) (setf input (smaf:xml-to-saf-object input))) (check-morph-options input) (reset-statistics) (when (smaf::saf-p input) (setf input (saf::instantiate-l-content input smaf::*config*))) ;; ;; shadow global variable to allow best-first mode to decrement ;; for each result found; no need for additional result count. ;; (22-jan-00 - oe) (let* ((*first-only-p* (first-only-p-normalise first-only-p)) (*active-parsing-p* (if *bracketing-p* nil *active-parsing-p*)) (*first-only-p* (if (and *first-only-p* (null *active-parsing-p*) (greater-than-binary-p)) (progn (format t "~&Passive best-first mode only available for ~ unary and binary rules.~%~ Disabling best-first mode: binding ~ *first-only-p* to `nil'.~%") nil) *first-only-p*)) ;; (input (if (xml-p input) ; JAC 2019-03-09 - removed since already done above ;; (smaf::xml-to-saf-object input) ;; input)) (maf-p (smaf::saf-p input)) (*maf-p* maf-p) ;; ;; input originating from SPPP is a set of SPPP tokens ;; (spppp (and (listp input) (consp (first input)))) length-user-input) (when (stringp input) (format t "~&WARNING: [parse] invalid input: '~a'" input) (return-from parse)) ;; eg. 
user-input -> ("the" "dog" "barks") (multiple-value-bind (user-input brackets-list) (if (and *bracketing-p* (not maf-p) (not spppp)) (initialise-bracket-list input) (values input nil)) (setf length-user-input (cond (maf-p (smaf::get-smaf-lattice-size user-input)) (spppp (loop for token in input maximize (rest (assoc :end token)))) (t (length user-input)))) (when (> length-user-input *chart-limit*) (error "Sentence `~a' is longer than the ~a word maximum ~ (see documentation for *chart-limit*)" user-input *chart-limit*)) (let* ((*brackets-list* brackets-list) (*parser-rules* (get-matching-rules nil nil)) (*parser-rules* (loop for rule in *parser-rules* unless (member (rule-id rule) *parse-ignore-rules*) collect rule)) (*parser-lexical-rules* (get-matching-lex-rules nil)) (*lexical-entries-used* nil) (*minimal-vertex* 0) (*maximal-vertex* length-user-input)) (declare (special *minimal-vertex* *maximal-vertex*)) (with-parser-lock () (flush-agenda *agenda*) (clear-chart) (setf *cached-category-abbs* nil) (setf *parse-record* nil) (setf *parse-times* (list (get-internal-run-time))) (let ((*safe-not-to-copy-p* t)) ;; instantiate token chart (instantiate-chart-with-tokens user-input) (ecase *morph-option* (:default (instantiate-chart-with-morphop)) (:distinct-mphon (instantiate-chart-with-morphop)) (:external-rule-by-rule (instantiate-chart-with-morphop)) ;; *foreign-morph-fn* is set and will be called (:external-partial-tree (instantiate-chart-with-morpho-stem-edges)) (:with-tokeniser-partial-tree nil) (:with-tokeniser-retokenise nil)) (when *generate-messages-for-all-unanalysed-tokens* (generate-messages-for-all-unanalysed-tokens *tchart*)) (when *abort-parse-after-morphosyntax* (return-from parse (get-parse-return-values))) (when *fallback-pos-p* (augment-tchart-with-fallback-morphop)) (when (and *abort-parse-if-no-spanning-morph-edge-path* (not (medge-spanning-path-p))) ;; [bmw] would it be safe to make this behaviour the default? ;; that is: if, after adding morpho-stem edges, there is no path ;; through lattice that consists of morpho-stem edges, we should ;; be able to assume that any parse attemp will fail... (format t "~&;;; WARNING: No morph edge paths span input. Aborting parse.") (return-from parse (get-parse-return-values))) (instantiate-chart-with-stems-and-multiwords) (when *abort-parse-after-lexical-lookup* (return-from parse (get-parse-return-values))) (catch :best-first (add-words-to-chart (if (and *first-only-p* (null *active-parsing-p*)) (cons *minimal-vertex* *maximal-vertex*) nil)) (if *active-parsing-p* (complete-chart) (loop until (agenda-empty-p *agenda*) do (funcall (agenda-extract-max *agenda*))))) ;; record time for parse forest construction, before attempting to extract parses (push (get-internal-run-time) *parse-times*) ;; look for complete parses, but only if we didn't already (i.e. 
;; we're not in the mode that attempts to find a parse quickly without
;; necessarily constructing the complete parse forest)
            (unless (and *first-only-p* (null *unpacking-scoring-hook*))
              (setq *parse-record*
                (parses-from-spanning-edges
                 (find-spanning-edges *minimal-vertex* *maximal-vertex*)
                 *first-only-p*)))))
        (when show-parse-p (show-parse))
        (values (statistics-etasks *statistics*)
                (statistics-stasks *statistics*)
                -1
                (statistics-ftasks *statistics*)
                (statistics-mtasks *statistics*))))))

(defun get-parse-return-values nil
  (push (get-internal-run-time) *parse-times*)
  (values (statistics-etasks *statistics*)
          (statistics-stasks *statistics*)
          -1
          (statistics-ftasks *statistics*)
          (statistics-mtasks *statistics*)))

(defun file-xml-p (filename)
  #+:s-xml ;; [bmw] return false if :s-xml not compiled in
  (xml-p
   (with-open-file (istream filename :direction :input)
     (read-line istream nil "")))
  #-:s-xml
  nil)

(defun xml-p (input)
  (and (stringp input) (> (length input) 4)
       (string= "<?xml" (subseq input 0 5))))

(defun add-token-edge (base-word word from to cfrom cto)
  (when (> to *tchart-max*)
    (setf *tchart-max* to))
  (let ((existing-ccs (aref *tchart* to 0))
        (edge (make-token-edge
               :id (next-edge)
               :string base-word
               :word word
               :leaves (list base-word)
               :from from :to to
               :cfrom cfrom :cto cto)))
    (unless (token-edge-match edge existing-ccs)
      (let ((cc (make-chart-configuration :begin from :edge edge :end to)))
        (push cc (aref *tchart* to 0))
        (push cc (aref *tchart* from 1)))
      edge)))

;; [bmw] factored into token-edge-match + token-edge=

(defun token-edge-match (edge cclist)
  (member-if #'(lambda (x) (token-edge= edge x)) cclist
             :key #'chart-configuration-edge))

(defun token-edge= (edge1 edge2)
  (and (eql (edge-from edge1) (edge-from edge2))
       (eql (edge-to edge1) (edge-to edge2))
       (equal (token-edge-word edge1) (token-edge-word edge2))))

;;; *****************************************************
;;;
;;; Morphophonology
;;;
;;; *****************************************************

;;; We have a chart instantiated with token-edges.
;;; This phase calls the LKB built-in spelling component (or some other
;;; morphological engine), which ultimately provides a stem and a tree of
;;; rules to apply to it.
;;; The revised LKB morph code does this one rule at a time, but the case
;;; where a complete analysis is done in one step is also supported.
;;;
;;; partial-tree handling
;;;
;;;
;;; FIX
;;; currently we assume that a partial tree is in fact a
;;; list of partial-tree nodes, but this will have to change
;;; when we allow for compounding.
To try and make this simpler ;;; the code mostly abstracts from the partial tree encoding details ;;; search for WARNING for exceptions (defmacro make-pt-node (rule str) `(list ,rule ,str)) (defmacro pt-node-rule (ptnode) `(car ,ptnode)) (defmacro pt-node-string (ptnode) `(cadr ,ptnode)) (defmacro add-pt-node-to-tree (ptnode partial-tree) ;;; sometimes called with partial-tree being nil `(cons ,ptnode ,partial-tree)) (defmacro max-one-deep-p (partial-tree) ;;; may be called with partial-tree being nil `(not (cdr ,partial-tree))) (defmacro partial-trees-equal (partial-tree1 partial-tree2) `(tree-equal ,partial-tree1 ,partial-tree2)) ;;; end partial tree handling ;;; start of code specific to default *morph-option* (i.e., :default ;;; or :distinct-mphon) ;;; or :external-rule-by-rule (defparameter *morph-agenda* nil) (defparameter *morphophon-cache* nil) ;; [bmw] ensure only unanalysed token edges are sent to add-morpho-partial-edges ;; (necessary in case SMAF input contains some morph edges) (defun instantiate-chart-with-morphop nil (loop for token-edge in (get-unanalysed-tedges *tchart*) for word = (token-edge-word token-edge) do (add-morpho-partial-edges word token-edge)) *tchart*) #| Redoing this from the initial version: Partial-tree is a set of partial trees (although for now, unpack when we add a stem edge because I don't want to rewrite the rest of the code immediately) complication 1 - analyses that have no morphological effect, contributed by irregular forms (or otherwise) call on "BET" with rules nil - find passive, psp, past recurse on BET with rules (passive psp past) - if nothing can feed any of these rules, we will stop there complication 2 - rules that extend the length of the string - e.g. Spanish stems are longer than the inflected forms. As with null effects, this doesn't matter unless we get into a recursive situation. complication 3 - as we're analysing, we want to block impossible analyses based on the rule feeding, but when we return to a string having analysed with different rules, we may need to redo. This led to a potential bug in the old code. If affixes always shortened the string, we could order them so that we never revisited a string we'd analysed. i.e. we could search breadth first, longest string first. This isn't the case, but it's probably close enough that the potential reanalysis won't hurt. Use an agenda so that we can investigate possibilities breadth first. Abstract example: initial agenda contains ((str (nil))) this is popped (morph-analyse-reg-irreg str) => ((str1 . r1a) (str1 . r1b) (str2 . r2a) (str2 . r2b) (str . r3a) (str . r3b)) In the agenda, these are packed and sort by length of string - assume str1 > str and str2 < str for the special case where the morphological analysis allows a zero affixation, we bypassing the agenda mechanism and add them as partial trees to give the set of partial trees (nil (r2a.str) (r2b.str)) (note r1a.str is an abbreviation for (r1a str) here - i.e. the nodes in partial trees are actually lists of rule and form) note also that we need to have the element nil in the partial-tree-set suppose that r1a feeds r2a.str and r1b.str feeds r2a.str but none of the others feed agenda ((str1 ((r1a.str) (r1b.str) (r1a.str r2a.str) (r1b.str r2a.str)) (str2 ((r2a.str) (r2b.str)))) str1 is top, so we investigate it. agenda is ((str2 ((r2a.str) (r2b.str)))) since we pop Suppose: (morph-analyse-reg-irreg str1) => ((str2 . 
r2c)) We add to the agenda for str2 ((str2 ((r2a.str) (r2b.str) (r2c.str1 r1a.str) (r2c.str1 r1b.str)))) Because the morphophonology could have affixed forms that were shorter than the stem, we can't guarantee not to redo work. For instance, in the example above, suppose (morph-analyse-reg-irreg str2) => ((str1 . r2d)) we've taken str1 off the agenda, so we'll end up reanalysing it. However, the morphophon results are cached, so the effects are relatively limited. |# ;;; Morphology agenda (defmacro make-morph-agenda-item (str partial-tree-set) `(cons ,str ,partial-tree-set)) (defmacro morph-agenda-string (agenda-item) `(car ,agenda-item)) (defmacro morph-agenda-partial-tree-set (agenda-item) `(cdr ,agenda-item)) (defmacro morph-agenda-latest-rules (agenda-item) ;;; WARNING: assumes partial tree is a list! `(loop for pt in (morph-agenda-partial-tree-set ,agenda-item) collect (pt-node-rule (car pt)))) (defun morph-agenda-match (str) (dolist (item *morph-agenda*) (when (string-equal str (morph-agenda-string item)) (return item)))) (defun insert-morph-agenda-item (str partial-tree-set) ;;; if the string matches an existing item, then add the ;;; partial trees into that record. Otherwise, insert ;;; in string length order, longest first (incf (statistics-mtasks *statistics*)) (if *morph-agenda* (let* ((strlength (length str)) (checked nil) (new-agenda (do ((remainder *morph-agenda* (cdr remainder))) ((not remainder) nil) (let ((item (car remainder))) (cond ((> strlength (length (morph-agenda-string item))) (return (append (nreverse checked) (list (make-morph-agenda-item str partial-tree-set)) remainder))) ((string-equal str (morph-agenda-string item)) (setf (morph-agenda-partial-tree-set item) (append partial-tree-set (morph-agenda-partial-tree-set item))) ;;; don't think we need to test for duplicates here (return *morph-agenda*)) (t (push item checked))))))) (setf *morph-agenda* (or new-agenda (nreverse (cons (make-morph-agenda-item str partial-tree-set) checked))))) (setf *morph-agenda* (list (make-morph-agenda-item str partial-tree-set))))) ;;; end agenda functions ;;; Main code (defun add-morpho-partial-edges (unanalysed token-edge) (setf *morph-agenda* nil) (setf *morphophon-cache* nil) (insert-morph-agenda-item unanalysed (list nil)) (loop (unless *morph-agenda* (return nil)) (let* ((agenda-item (pop *morph-agenda*))) (analyse-agenda-item agenda-item token-edge)))) ;; [bmw] this function, based on add-morpho-partial-edges, added ;; in order to collect morph analyses as required in SMAF code (defvar *partial-trees* nil) (defun get-morph-analyses (unanalysed) (let ((*partial-trees* nil)) (setf *morph-agenda* nil) (setf *morphophon-cache* nil) (insert-morph-agenda-item unanalysed (list nil)) (loop (unless *morph-agenda* (return nil)) (let* ((agenda-item (pop *morph-agenda*))) (analyse-agenda-item agenda-item nil :bottom-out-fn #'get-valid-morph-analyses))) *partial-trees*)) ;; [bmw] customised to allow alternative "bottom-out-fn" at bottom of recursion ;; (to allow use by get-morph-analyses fn) (defun analyse-agenda-item (agenda-item token-edge &key (bottom-out-fn #'add-as-unanalysed)) (let* ((unanalysed (morph-agenda-string agenda-item)) (partial-tree-set (morph-agenda-partial-tree-set agenda-item)) (last-rule-ids (morph-agenda-latest-rules agenda-item)) (partial-trees-plus-nulls partial-tree-set)) (when (dolist (last-rule-id last-rule-ids) (when (or (null last-rule-id) (spelling-rule-feeds-p last-rule-id)) (return t))) ;;; no point continuing to morph-analyse if there's no ;;; rule that 
changes morphology that can feed the ;;; one we've seen (let ((one-steps (morph-analyse-reg-irreg unanalysed))) (setf partial-trees-plus-nulls (null-morphophon-combinations unanalysed (loop for morph-pair in one-steps when (string-equal (car morph-pair) unanalysed) collect (cdr morph-pair)) partial-tree-set)) ;;; (morph-analyse-reg-irreg "CANNED") returns ;;; (("CAN" . PAST-V_IRULE) ("CAN" . PSP-V_IRULE) ("CANNED" . MY_RULE)) ;;; the new analyses get merged here with each other and the agenda (dolist (morph-pair one-steps) (unless (string-equal (car morph-pair) unanalysed) (let* ((newstr (car morph-pair)) (rule (cdr morph-pair)) (rule-entry (or (get-lex-rule-entry rule) (error "~%Rule ~A returned from morph analyser is undefined (error in grammar loading?)" rule))) (ptnode (make-pt-node rule unanalysed)) (new-partial-tree-set (loop for partial-tree in partial-trees-plus-nulls when (or (null partial-tree) (check-nosp-feeding (get-lex-rule-entry (pt-node-rule (car partial-tree))) ;;; WARNING rule-entry)) collect (progn (when (>= (length partial-tree) *maximal-lex-rule-applications*) (error "Hit limit on morphological rule applications in ~a, ~ possibly due to a runaway rule (see documentation for ~ *maximal-lex-rule-applications*)" partial-tree)) (add-pt-node-to-tree ptnode partial-tree))))) (when new-partial-tree-set (insert-morph-agenda-item newstr new-partial-tree-set))))))) (funcall bottom-out-fn unanalysed token-edge partial-trees-plus-nulls))) (defun get-valid-morph-analyses (word token-edge partial-tree-set) (declare (ignore token-edge)) (when (lookup-word *lexicon* word) (loop for partial-tree in partial-tree-set do (push (cons word partial-tree) *partial-trees*)))) (defun add-as-unanalysed (unanalysed token-edge partial-tree-set) ;;; try and terminate the recursion, recording the analyses via ;;; the partial tree mechanism. Unpack the partial trees and add a ;;; new edge for each for now. 
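  ;;; e.g. (a hypothetical run): for an input token "DOGS", this is first
  ;;; called with unanalysed = "DOGS" and partial-tree-set = (NIL), adding an
  ;;; edge only if "DOGS" itself is a lexeme; a later call has
  ;;; unanalysed = "DOG" and partial-tree-set = (((PLUR_NOUN_ORULE "DOGS"))),
  ;;; giving a stem edge whose partial tree records the pending plural rule.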
(when (lookup-word *lexicon* unanalysed) (dolist (partial-tree partial-tree-set) (add-morpho-stem-edge-from-token-edge unanalysed partial-tree token-edge)))) (defun morph-analyse-reg-irreg (unanalysed) (or (cdr (assoc unanalysed *morphophon-cache* :test #'string-equal)) (let* ((res (if *foreign-morph-fn* (apply *foreign-morph-fn* (list unanalysed)) (append (find-irregular-morphs unanalysed) (one-step-morph-analyse unanalysed)))) (filtered-res (loop for poss in res when (morph-not-blocked-p unanalysed (car poss) (cdr poss)) collect poss))) (push (cons unanalysed filtered-res) *morphophon-cache*) filtered-res))) (defun morph-not-blocked-p (surface underlying rule) ;;; given a pattern (* ed) (!ty !tied) (e ed) (!t!v!c !t!v!c!ced) ;;; we may only want to match the most specific case ;;; for instance, given `hated' we don't want to match this to the ;;; stem `hat' because it should have the (!t!v!c !t!v!c!ced) pattern ;;; Similarly, if we have an irregular form `slept' we don't ;;; want to match `sleeped' ;;; *irregular-forms-only-p* (if only for backward compatibility) (or *foreign-morph-fn* ;;; we assume no blocking with external morphology (if (or *most-specific-only-p* *irregular-forms-only-p*) (let* ((irreg-surface-list (morph-matches-irreg-list underlying rule))) (if irreg-surface-list ;;; only OK if the specified form is one of the irregular ;;; forms (member surface irreg-surface-list :test #'equal) ;; no irreg form (if *most-specific-only-p* ;;; then we need to see if we have the most specific pattern ;;; which we can just get by calling morph-generate ;;; A more sophisticated version might allow multiple results ;;; here (let ((most-specific (morph-generate underlying rule))) (equal surface most-specific)) ;;; otherwise only *irregular-forms-only-p* ;;; is set and we're only blocking by irregulars, so ;;; we're OK here t))) ;;; we're not doing any blocking so it's OK t))) (defun null-morphophon-combinations (unanalysed null-rules partial-tree-set) ;;; we have a set of rules that can apply to a string without ;;; morphophonological effect. While such rules would normally be ;;; specified as lexical, special cases of morphophonologically active ;;; rules (irregulars in particular) may arise here. We filter these ;;; out from the usual agenda mechanism, to prevent looping. ;;; ;;; returns a set of partial-trees (append partial-tree-set (apply-morph-null-rules partial-tree-set null-rules unanalysed))) (defun apply-morph-null-rules (partial-tree-set null-rules unanalysed) (let ((new-pt-set (loop for null-rule in null-rules for rule-entry = (get-lex-rule-entry null-rule) for ptnode = (list null-rule unanalysed) nconc (loop for partial-tree in partial-tree-set when (or (null partial-tree) (check-nosp-feeding (get-lex-rule-entry (pt-node-rule (car partial-tree))) ;;; WARNING rule-entry)) collect (progn (when (>= (length partial-tree) *maximal-lex-rule-applications*) (error "Hit limit on morphological rule applications in ~a, ~ possibly due to a runaway rule (see documentation for ~ *maximal-lex-rule-applications*)" partial-tree)) (add-pt-node-to-tree ptnode partial-tree)))))) (if new-pt-set (append new-pt-set (apply-morph-null-rules new-pt-set null-rules unanalysed)) nil))) ;;; end stuff specific to the :default, :distinct-mphon ;;; and :external-rule-by-rule case (defun add-morpho-stem-edge-from-token-edge (stem partial-tree token-edge) ;;; This is called from the version with the rule-by-rule morphology ;;; (for the irregs and as the base case with regular morphology). 
;;; It is also called when we have a version of morphology ;;; giving full partial trees and we're using the tokeniser ;;; (:external-partial-tree is the value of *morph-option*). ;;; But most of the work is done in add-morpho-stem-edge ;;; which is called from the sppp stuff (i.e. external ;;; tokeniser and partial tree morphology). ;;; The macro with-slots establishes a lexical environment for ;;; referring to the slots in token-edge as though they were variables ;;; with the given slot name. (with-slots (from to word string cfrom cto leaves) token-edge (add-morpho-stem-edge stem partial-tree from to word string cfrom cto leaves token-edge))) (defun add-morpho-stem-edge (stem partial-tree from to word string cfrom cto tleaves tedge) ;;; this is the (re)entry point for the sppp code ;;; ;;; Copy most of token-edge ;;; Add partial-tree and stem ;;; Put into chart at same places (if (and (or (member *morph-option* '(:default :distinct-mphon :external-rule-by-rule)) ;;; if we're proceeding rule by rule we've already ;;; done this checking (and (lookup-word *lexicon* stem) (or (max-one-deep-p partial-tree) (check-rule-filter-morph partial-tree string)))) (not (morpho-stem-edge-match2 from to stem partial-tree nil (aref *tchart* from 1)))) (let* ((new-edge (make-morpho-stem-edge :id (next-edge) :word word :string string :stem stem :current stem :partial-tree partial-tree :leaves tleaves :children (if (token-edge-p tedge) (list tedge)) :from from :to to :cfrom cfrom :cto cto)) (cc (make-chart-configuration :begin from :edge new-edge :end to))) (push cc (aref *tchart* from 1)) (push cc (aref *tchart* to 0)) new-edge) nil)) (defun morpho-stem-edge= (edge1 edge2) (and (morpho-stem-edge-p edge1) ;; both medges (morpho-stem-edge-p edge2) (= (edge-from edge1) (edge-from edge2)) ;; from (= (edge-to edge1) (edge-to edge2)) ;; to (equal (morpho-stem-edge-stem edge1) ;; stem (morpho-stem-edge-stem edge2)) (partial-trees-equal (edge-partial-tree edge1) ;; partial-tree (edge-partial-tree edge2)) (equalp (morpho-stem-edge-l-content edge1) ;; l-content (morpho-stem-edge-l-content edge2)) )) ;; [bmw] alternative to above code, where we don't pass the entire first medge ;; for efficiency reasons (defun morpho-stem-edge-match2 (from to stem partial-tree l-content cclist) (member-if #'(lambda (x) (morpho-stem-edge=2 from to stem partial-tree l-content x)) cclist :key #'chart-configuration-edge)) (defun morpho-stem-edge=2 (from to stem partial-tree l-content edge2) (and (morpho-stem-edge-p edge2) ;; medge (= from (edge-from edge2)) ;; from (= to (edge-to edge2)) ;; to (equal stem (morpho-stem-edge-stem edge2)) ;; stem (partial-trees-equal partial-tree (edge-partial-tree edge2)) ;; partial tree (equalp l-content (morpho-stem-edge-l-content edge2)) ;; l-content )) ;;; ************************************************************* ;;; ;;; This alternative version is for the one step morphophonology case ;;; where the foreign-morph-fn returns a full partial-tree ;;; and a stem - this is called when ;;; :external-partial-tree is the value of *morph-option* (defun instantiate-chart-with-morpho-stem-edges nil (dotimes (current *tchart-max*) (let ((token-ccs (aref *tchart* current 1))) (dolist (token-cc token-ccs) (let* ((token-edge (chart-configuration-edge token-cc)) (word (token-edge-word token-edge)) (morph-specs (union (filter-for-irregs (remove-duplicates (apply *foreign-morph-fn* (list word)) :test #'equalp)) ;; filter-for-irregs is in rules.lsp (find-irregular-morphs-old word) :test #'equalp))) (dolist (morph-spec 
morph-specs) (add-morpho-stem-edge-from-token-edge (car morph-spec) (cdr morph-spec) token-edge))))))) ;;; *foreign-morph-fn* is assumed to return something like ;;; ("A" (RULE1 "AB") (RULE2 "ABC")) ;;; as the old morph-analyse did ;;; ************************************************************* ;;; this stuff is only for the case where we have not done the ;;; morphophonology rule by rule (defun check-rule-filter-morph (partial-tree string) ;;; This is called with a partial-tree that will be ;;; tried later on in the parser proper. ;;; Make sure we can get actual rules, and, if we can, ;;; filter impossible combinations now according to the ;;; rule filter. ;;; ;;; WARNING: assumes partial tree is a list! (let ((rule-entries (loop for rule-info in partial-tree collect (let* ((rule-id (pt-node-rule rule-info)) (rule-entry (get-lex-rule-entry rule-id))) (or rule-entry (progn (format t "~%Warning: rule ~A specified by ~ morphology for ~A was not found" rule-id string) (return-from check-rule-filter-morph nil))))))) (check-rule-filter-morph-aux rule-entries))) (defun check-rule-filter-morph-aux (rule-list) (loop for (first . rest) on rule-list while rest always (check-nosp-feeding (first rest) first))) ;;; end of Phase 2 code ;;; ***************************************************** ;;; ;;; Lexical lookup and MWEs - Phase 3 ;;; ;;; ***************************************************** ;;; multi-word entries may have affixation on any member ;;; but they span several vertices. It's therefore ;;; necessary to treat them as individual words with respect to ;;; the spelling component. ;;; Putative multiword entries are checked ;;; when we get to the rightmost element. (defun instantiate-chart-with-stems-and-multiwords nil (dotimes (current *tchart-max*) (dolist (cc (aref *tchart* current 1)) (let ((edge (chart-configuration-edge cc))) (when (morpho-stem-edge-p edge) (add-single-and-multiple-senses edge))))) ;;; FIX - the active-parser doesn't necessarily ;;; put the stem edges directly onto the chart - need ;;; to find out how else to check them to get warning messages (unless (and *active-parsing-p* (or *first-only-p* *chart-dependencies*)) (check-stem-coverage *tchart-max*))) (defun add-single-and-multiple-senses (morpho-stem-edge) ;;; (format t "~%add-single-and-multiple-senses: ~A ~A" (edge-from morpho-stem-edge) (edge-to morpho-stem-edge)) (let* ((to (edge-to morpho-stem-edge)) (edge-stem (morpho-stem-edge-stem morpho-stem-edge)) (from (edge-from morpho-stem-edge)) (cfrom (edge-cfrom morpho-stem-edge)) (cto (edge-cto morpho-stem-edge)) (partial-tree (edge-partial-tree morpho-stem-edge)) (edge-string (edge-string morpho-stem-edge)) (edge-entries (get-edge-entries morpho-stem-edge))) ;;; FIX when the precheck code is written the entries ;;; will be on the edges (dolist (entry edge-entries) (let ((entry-orth-list (mapcar #'string-upcase (lex-entry-orth entry)))) (if (cdr entry-orth-list) ;; multi-word (check-multi-word entry-orth-list edge-stem edge-string entry from to cto partial-tree morpho-stem-edge) (add-stem-edge edge-stem edge-string from to cfrom cto partial-tree entry morpho-stem-edge)))))) ;;; FIX - put back unify-in ;; [bmw] l-content takes precedence over morph analysis of stem ;; FIXME: make precedence configurable (defun get-edge-entries (morpho-stem-edge) (with-slots (l-content stem) morpho-stem-edge (let ((l-content-type (car l-content)) (l-content-value (cdr l-content)) entries) (case l-content-type (:full ;; we use l-content in place of lexical lookup (setf entries (list 
l-content-value))) (:inject ;; we inject unifs into result of lexical lookup (setf entries (get-safe-unexpanded-lex-entry stem)) ;; if we have something to inject, collect copies of entries, ;; each with adjusted lex-entry-unifs (if l-content-value (setf entries (loop for e-orig in (get-safe-unexpanded-lex-entry stem) collect (get-injected-lex-entry e-orig l-content-value)) )) ) (t ;; default (and non-SMAF) case (setf entries (get-safe-unexpanded-lex-entry stem)))) entries))) ;; ensure all ersatzes are copies (defun get-safe-unexpanded-lex-entry (stem) (loop with entries = (get-unexpanded-lex-entry stem) for entry in entries for i from 0 when (lex-entry-is-ersatz entry) do (setf (nth i entries) (copy-lex-entry entry)) (get-expanded-lex-entry entry) ;; so we don't break tsdb's *lex-ids-used* hack finally (return entries))) ;; takes lex-entry + injection specs (defun get-injected-lex-entry (e-orig l-content-injects) (inject-lex-entry e-orig l-content-injects :e-orig e-orig)) (defun inject-lex-entry (e l-content-injects &key e-orig) (setf (lex-entry-full-fs e) nil) ;; ensure NO full-fs yet (setf (lex-entry-unifs e) (inject-unifs l-content-injects (lex-entry-unifs e) :entry e-orig)) e) (defvar smaf::*ersatz-carg-path*) ;; tests whether lex-entry should be treated as an ersatz ;; (ersatz's undergo CARG-injection) ;; an ersatz is an lex entry containing a token that looks like "...ersatz" (defun lex-entry-is-ersatz (entry) ;; if parameter is unset ersatzes get no special treatment (unless smaf::*ersatz-carg-path* (return-from lex-entry-is-ersatz nil)) (loop for word in (lex-entry-orth entry) for len = (if (stringp word) (length word) 0) when (and (> len 6) (string= "ersatz" (string-downcase (subseq word (max 0 (- len 6)) len)))) return t)) ;; [bmw] non-destructively, return result of injecting 'inject-unifs' into 'unifs' (defun inject-unifs (inject-unifs unifs &key entry) (loop for unif in unifs for lhs = (unification-lhs unif) for inject-unif = (find lhs inject-unifs :key #'unification-lhs :test #'equalp) if inject-unif collect inject-unif into new-unifs and do (if entry (format t "~&; WARNING: redefining ~a in copy of lex entry ~a" (unifs-to-tdl-body (list inject-unif)) (to-tdl entry))) else collect unif into new-unifs finally (return (append inject-unifs new-unifs)))) (defun add-stem-edge (edge-stem edge-string from to cfrom cto partial-tree entry dtr) #+:arboretum (declare (special *mal-active-p*)) (let* ((expanded-entry (get-expanded-lex-entry entry))) ;; side effect of calling get-lex-entry-from-id is ;; instantiation of :full-fs slot (when (and expanded-entry #+:arboretum (or *mal-active-p* (not (mal-lex-entry-p expanded-entry)))) (let* ((new-fs (copy-lex-fs-as-needed (lex-entry-full-fs entry) :cfrom cfrom :cto cto)) (new-edge (make-edge :id (next-edge) :category (indef-type-of-tdfs new-fs) :rule edge-stem :dag new-fs :leaves (list edge-string) :lex-ids (list (lex-entry-id entry)) :from from :to to :tchildren (list dtr) :partial-tree partial-tree :string edge-string :cfrom cfrom :cto cto)) (cc (make-chart-configuration :begin from :edge new-edge :end to))) (when (> to *chart-max*) (setf *chart-max* to)) (cond (*active-parsing-p* (lexical-task nil cc)) (t (push cc (aref *chart* from to)))))))) ;; fix_me (too much potential copying) (defun copy-lex-fs-as-needed (tdfs &key cfrom cto) (when *characterize-p* (setf tdfs (set-characterization-tdfs tdfs cfrom cto))) (cond ((or *recording-word* (smember tdfs *lexical-entries-used*)) (copy-tdfs-completely tdfs)) (t (push tdfs 
*lexical-entries-used*) tdfs))) (defun check-multi-word (entry-orth-list edge-stem edge-string unexpanded-entry from to cto partial-tree dtr) ;;; we have a possible match on a multiword ;;; i.e., something which corresponds to multiple tokens ;;; according to the tokeniser. For every valid multiword ;;; we find, we will add an edge to the chart via add-stem-edge ;;; ;;; a multiword entry gives a list of stems ;;; (let* ((inflection-position (lex-entry-infl-pos unexpanded-entry)) (number-of-words (length entry-orth-list)) (to-be-accounted-for (reverse entry-orth-list)) (current-entry-stem (car to-be-accounted-for))) (unless (string-equal current-entry-stem edge-stem) (return-from check-multi-word nil)) ;; only check multi-words when we have the rightmost (when (< to number-of-words) (return-from check-multi-word nil)) ; too near start of sentence ;; because of tokeniser ambiguity this is not a perfect check ;; but the more complex cases will be caught below #| (when (and partial-tree (not (eql inflection-position number-of-words))) (return-from check-multi-word nil)) ; inflection not allowed here |# (check-multi-and-add-edges entry-orth-list (cdr to-be-accounted-for) from nil to cto inflection-position unexpanded-entry (list partial-tree) edge-string dtr t :ersatz-p (lex-entry-is-ersatz unexpanded-entry)))) (defun check-multi-and-add-edges (entry-orth-list remaining-words from cfrom to cto inflection-position unexpanded-entry partial-tree-set amalgamated-string dtr first-p &key cargs ersatz-p) ;; check we have some match on each element ;; and find partial tree(s) on inflected position ;; this is called initially when we've got a match on the rightmost ;; element. We add edges for each match. Note that because ;; tokenisation is now non-deterministic, and because ;; there may be ambiguity in the morphology, we may end up with several ;; edges being added. ;; e.g. ("a" "b" "c") as entry-orth-list with inflection ;; position 2 would match a set of contiguous edges where ;; edge 1 had stem "a" and no partial tree ;; edge 2 had stem "b" and an optional partial tree ;; edge 3 we already know has stem "c" when this is called ;; ;; Note that, despite the name `inflection position' an MWE like ;; "chest pain" would match "chest painless" (if the inflection position ;; was set to 2 and there was a productive affix "less", despite the ;; fact that "less" would be derivational. This would have to be controlled ;; by the grammar (presumably by whatever mechanism was used to block ;; derivation occurring in the wrong place anyway). ;; Revised version allows a nil setting for infl-pos (see ERG ;; user-fns) In this case, multiple elements of the multiword are ;; allowed to have partial-trees - these are amalgamated according ;; to the ordering allowed by the rule-filter. ;; This is needed for punctuation in the current ERG - e.g. for (Palo Alto) ;; Only one ;; possibility is returned - this should really be FIXed so that ;; multiple possibilities are allowed and return multiple edges. ;; Affixation is only allowed for the leftmost or rightmost ;; element of the multiword, but this code currently allows the leftmost ;; element to have a suffix and/or the rightmost a prefix ;; thus allowing `Palo. 
Alto' - FIX ;; aac oct 20 2006 (if remaining-words (let ((ccs (aref *tchart* from 0)) (entry-stem (car remaining-words))) (dolist (cc ccs) (let* ((edge (chart-configuration-edge cc)) ;; get carg, if ersatz (carg-extra (and ersatz-p (get-carg-from-edge edge)))) (when (and (morpho-stem-edge-p edge) (equal entry-stem (morpho-stem-edge-stem edge)) (if inflection-position ;; behaviour should be unchanged from ;; pre oct 20 2006 version (if (eql (length remaining-words) inflection-position) (progn (setf partial-tree-set (list (morpho-stem-edge-partial-tree edge))) t) (not (morpho-stem-edge-partial-tree edge))) ;; no infl-pos, post oct 20 2006 (or (not (morpho-stem-edge-partial-tree edge)) (if (or first-p (not (cdr remaining-words))) ;; at beginning/end of multiword (progn (setf partial-tree-set (cons (morpho-stem-edge-partial-tree edge) partial-tree-set)) t) ;; else nil)))) (if carg-extra (push carg-extra cargs)) ;; collect cargs, if ersatz (check-multi-and-add-edges entry-orth-list (cdr remaining-words) (edge-from edge) (edge-cfrom edge) to cto inflection-position unexpanded-entry partial-tree-set (concatenate 'string (edge-string edge) " " amalgamated-string) dtr nil :cargs cargs :ersatz-p ersatz-p ))))) (let* ((amalgamated-partial-tree (combine-partial-trees partial-tree-set)) carg-final) ;; inject cargs into CARG, if ersatz (when ersatz-p (setf carg-final (get-carg-from-entry unexpanded-entry)) (if carg-final (setf cargs (append cargs (list carg-final)))) (inject-cargs unexpanded-entry cargs)) (add-stem-edge (format nil "~{~A ~}" entry-orth-list) amalgamated-string ;; FIX - when cfrom cto is universal we can replace this ;; by a lookup in the original characters. Currently ;; this is a bit of a hack since ;; we just guess that the strings of the individual words ;; were split by spaces. from to cfrom cto amalgamated-partial-tree unexpanded-entry dtr)))) ;; extract carg from an edge, via l-content unifs (defun get-carg-from-edge (edge) (and (typep edge 'morpho-stem-edge) (let* ((l-content (morpho-stem-edge-l-content edge)) (unifs (cdr l-content))) (get-carg-from-unifs unifs)))) ;; extract carg from an (unexpanded) entry, via unifs (defun get-carg-from-entry (entry) (let ((unifs (lex-entry-unifs entry))) (get-carg-from-unifs unifs))) ;; extract carg from a set of unifs (defun get-carg-from-unifs (unifs &key (carg-path (smaf::ersatz-carg-path))) (let* ((carg-unif (find carg-path unifs :test #'(lambda (x y) (equalp (path-typed-feature-list (unification-lhs y)) x)))) (carg (and carg-unif (u-value-type (unification-rhs carg-unif))))) carg)) ;; map cargs to string and inject into entry CARG (defun inject-cargs (entry cargs &key (carg-slot-name :|carg|)) (inject-lex-entry entry (get-inject (list (cons carg-slot-name (str-list-2-str cargs))) (smaf::gmap)))) (defun combine-partial-trees (partial-tree-set) ;;; WARNING: assumes partial tree is a list ;;; takes a list of partial trees and returns a single ;;; partial tree (if (cdr partial-tree-set) (stable-sort (apply #'append partial-tree-set) #'partial-tree-order :key #'car) (car partial-tree-set))) (defun partial-tree-order (rule-id1 rule-id2) (let ((re1 (get-lex-rule-entry rule-id1)) (re2 (get-lex-rule-entry rule-id2))) (and (check-nosp-feeding re2 re1) (not (check-nosp-feeding re1 re2))))) ;;; ************************************************** ;;; ;;; Unknown words / messages about missing words ;;; ;;; ************************************************** #| The idea is to allow for ambiguity in the token input. 
We scan initially, recording how far we can get in res-array, which stores t at index n if there is a potential span between 0 and n. If we get to a point where there's a gap, we generate a warning and perhaps an unknown word, treat the gap as filled and go on from there. |# (defun check-stem-coverage (max) (let ((res-array (make-array (+ max 1) :initial-element nil))) (setf (aref res-array 0) t) (check-stem-coverage-aux 0 max res-array) (generate-messages-and-unknown-words res-array max))) (defun generate-messages-and-unknown-words (res-array max-dim) ;; (format t "~%generate-messages-and-unknown-words ~A" res-array) (unless (aref res-array max-dim) (dotimes (current (1+ max-dim)) (unless (aref res-array current) (let ((token-ccs (aref *tchart* current 0))) (dolist (cc token-ccs) (let ((token-entry (chart-configuration-edge cc))) (unless *generate-messages-for-all-unanalysed-tokens* (format t "~%No analysis found corresponding to token ~a-~a ~A" (token-edge-from token-entry) (token-edge-to token-entry) (token-edge-word token-entry))) ))) ;;; FIX (generate-unknown-word-entries stem-string) (setf (aref res-array current) t) (check-stem-coverage-aux current max-dim res-array) (generate-messages-and-unknown-words res-array max-dim) (return))))) (defun generate-messages-for-all-unanalysed-tokens (tchart) (declare (ignore tchart)) ;; [bmw] need to encapsulate *tchart* and associated variables in an object ;; before we can start passing it around... (loop for tedge in (get-unanalysed-and-unspanned-tedges) do (with-slots (from to word) tedge (push word *unanalysed-tokens*) (format t "~&No lexical analysis found corresponding to token ~a-~a ~A" from to word)))) (defun check-stem-coverage-aux (start max res-array) ;; (format t "~%check-stem-coverage-aux ~A ~A ~A" start max res-array) (loop for end from 1 to max do (let ((stem-ccs (aref *chart* start end)) (end-points nil)) (dolist (cc stem-ccs) (let ((end-point (chart-configuration-end cc))) (unless (member end-point end-points) (push end-point end-points) (setf (aref res-array end-point) t)))) (dolist (end end-points) (check-stem-coverage-aux end max res-array))))) ;;; ***************************************************** ;;; ;;; Morphosyntax interaction with phase 4 ;;; ;;; Morphological and lexical rule application ;;; ;;; ***************************************************** ;;; We now have a chart with edges corresponding to stems which ;;; may need some morphological rules to be added. ;;; This is complicated because the morphological rules can be ;;; interleaved with arbitrary numbers of lexical rules. ;;; ;;; We want to combine this with parsing proper. ;;; Ordinary grammar rules are not allowed to apply to things with a ;;; partial-tree. Spelling-change rules are only allowed to apply ;;; when the next thing on the partial-tree is them. Lexical ;;; rules can apply any time. 
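;;;
;;; As an illustrative sketch of this interaction (hypothetical edge and rule
;;; names): a stem edge whose partial-tree is ((PLUR_NOUN_ORULE "DOGS")) may
;;; only undergo the PLUR_NOUN_ORULE spelling rule next; the resulting edge
;;; carries the remaining (here empty) partial tree, and only an edge with an
;;; empty partial tree is visible to ordinary grammar rules.
#|
(multiple-value-bind (rule orth-tdfs remaining)
    (find-spelling-info stem-edge)  ; stem-edge carries the partial tree above
  ;; rule      => the rule object for PLUR_NOUN_ORULE
  ;; orth-tdfs => orthography tdfs built from the form "DOGS"
  ;; remaining => nil, i.e. no further morphological commitments
  (list rule orth-tdfs remaining))
|#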
(defun find-spelling-info (edge) ;;; WARNING - assumes partial tree is a list (let* ((partial-tree (edge-partial-tree edge))) ;;; list of (rule "FORM") (if partial-tree (let* ((current-rule-info (first partial-tree)) (new-orth (pt-node-string current-rule-info)) (orth-tdfs (when new-orth (make-orth-tdfs new-orth))) (rule-id (pt-node-rule current-rule-info)) (rule-entry (get-lex-rule-entry rule-id))) (if rule-entry (values rule-entry orth-tdfs (rest partial-tree)) (progn (format t "~%Warning: rule ~A specified by ~ morphology was not found" rule-id) nil)))))) (defun apply-immediate-spelling-rule (rule orth-tdfs remaining-tree left-vertex child-edge right-vertex f) ;;; this is a special case of apply-immediate-grammar-rule #+:pdebug (format t "~&apply-immediate-spelling-rule(): `~(~a~) <-- ~d~^ [~d -- ~d]" (rule-id rule) (edge-id child-edge) left-vertex right-vertex) (when (and (check-rule-filter rule (edge-rule child-edge) 0) (restrictors-compatible-p (car (rule-daughters-restricted-reversed rule)) (edge-dag-restricted child-edge))) (multiple-value-bind (unification-result first-failed-p) (evaluate-unifications rule (list (edge-dag child-edge)) orth-tdfs nil nil (edge-cfrom child-edge) (edge-cto child-edge)) (if unification-result (let* ((new-edge (make-edge :id (next-edge) :category (indef-type-of-tdfs unification-result) :rule rule :children (list child-edge) :dag unification-result :lex-ids (edge-lex-ids child-edge) :leaves (edge-leaves child-edge) :partial-tree remaining-tree))) #+pdebug (format t " ... success.~%") (activate-context left-vertex new-edge right-vertex f) t) (progn #+pdebug (format t " ... ~:[fail~;throw~].~%" first-failed-p) (if first-failed-p nil t)))))) ;;; ************************************************************** ;;; ;;; Parsing - Phase 4 proper ;;; ;;; ************************************************************* (defun add-words-to-chart (f) (dotimes (end *chart-max*) (dotimes (start *chart-max*) (let ((stem-ccs (aref *chart* start (1+ end)))) (dolist (stem-cc stem-ccs) (let ((edge (chart-configuration-edge stem-cc))) (add-word edge f (chart-configuration-begin stem-cc) (1+ end)))))))) (defun add-word (edge f left-vertex right-vertex) (declare (ignore left-vertex)) (unless *active-parsing-p* (with-agenda (when f (lex-priority edge)) (activate-context-no-add (edge-from edge) edge right-vertex f)))) (defun unify-in-word (tdfs word-string) (declare (ignore word-string)) tdfs) (defun activate-context (left-vertex edge right-vertex f) #+:pdebug (format t "~&activate-context(): edge # ~d: [~d -- ~d];~%" (edge-id edge) left-vertex right-vertex) (add-to-chart left-vertex edge right-vertex f) (activate-context-no-add left-vertex edge right-vertex f)) (defun activate-context-no-add (left-vertex edge right-vertex f) ;; when we have a partial-tree specification on the edge, one ;; option is the corresponding morphological rule ;; When this is applied, the new edge has a record ;; that removes that element from the partial-tree (multiple-value-bind (spelling-rule orth-tdfs remaining-morph) (find-spelling-info edge) (when spelling-rule (apply-immediate-spelling-rule spelling-rule orth-tdfs remaining-morph left-vertex edge right-vertex f)) (dolist (rule (if spelling-rule (loop for lr in *parser-lexical-rules* when (check-sp-lr-feeding spelling-rule lr) collect lr) *parser-rules*)) ;; when we have a partial-tree specification, we have all ;; lexical rules here and no parser rules ;; ;; grammar rule application is attempted ;; when we've got all the bits (try-grammar-rule-left 
          rule ;; avoid a call to reverse here in a fairly tight loop
          (rule-daughters-restricted-reversed rule)
          left-vertex right-vertex (list edge) f
          (1- (length (the list (rule-daughters-apply-order rule)))))
      ;; when we don't build up the chart in strict left-to-right
      ;; order (as when we're doing a best-first search), we need to
      ;; check for rule applying to the right as well as to the left.
      ;; WARNING: this will only work correctly if all rules are no
      ;; more than binary branching!!
      (when (and f (cdr (rule-daughters-restricted rule)))
        (try-grammar-rule-right rule (rule-daughters-restricted rule)
                                left-vertex right-vertex (list edge) f 0)))))

(defun add-to-chart (left edge right f)
  (let ((config (make-chart-configuration :begin left :edge edge :end right)))
    (push config (aref *chart* left right))
    ;; are we in quick first-only (non-exhaustive) mode and did we just find a
    ;; spanning config?
    (when (and f (eql left (car f)) (eql right (cdr f))
               (not (edge-partial-tree edge)))
      (let ((results (parses-from-spanning-edges (list edge) *first-only-p*)))
        (when results
          (setq *parse-record* (append *parse-record* results))
          (decf *first-only-p* (length results))
          (when (< *first-only-p* 1) (throw :best-first t)))))))

(defun try-grammar-rule-left (rule rule-restricted-list left-vertex right-vertex
                              child-edge-list f n)
  ;; Application of a grammar rule: every time an edge is added to the chart,
  ;; a check is made to see whether its addition passes the rule application
  ;; filter. If yes, collect the dags associated with the children, perform
  ;; the unifications specified by the rule, and if the unification(s) succeed,
  ;; create a new edge (for the mother), record its dag and associated
  ;; information, add this to the chart, and invoke the same process
  ;; recursively.
  (declare (type fixnum n))
  #+:pdebug
  (format t "~&try-grammar-rule-left(): `~(~a~) [~d] <-- ~{~d~^ ~} [~d -- ~d]~%"
          (rule-id rule) (length rule-restricted-list)
          (loop for e in child-edge-list collect (edge-id e))
          left-vertex right-vertex)
  (if (and (check-rule-filter rule (edge-rule (car child-edge-list)) n)
           (restrictors-compatible-p
            (car rule-restricted-list)
            (edge-dag-restricted (car child-edge-list))))
      (if (cdr rule-restricted-list) ; do we need more daughters?
          ;; look leftwards for the next daughter, i.e. at edges ending at
          ;; left-vertex
          (loop
              for l from 0 to (1- left-vertex)
              for entry = (aref *chart* l left-vertex)
              do
                (dolist (config entry)
                  (unless (edge-partial-tree (chart-configuration-edge config))
                    (unless
                        ;; inner recursive call returns nil in cases when first
                        ;; unif attempt fails - if this happens there's no point
                        ;; continuing with other alternatives here
                        (try-grammar-rule-left rule
                                               (cdr rule-restricted-list)
                                               (chart-configuration-begin config)
                                               right-vertex
                                               (cons (chart-configuration-edge config)
                                                     child-edge-list)
                                               f (1- n))
                      #+:pdebug
                      (format t "~&try-grammar-rule-left(): `~(~a~) [~d] <-- ~{~d~^ ~} [~d -- ~d] ... throw~%"
                              (rule-id rule) (length rule-restricted-list)
                              (loop for e in child-edge-list collect (edge-id e))
                              left-vertex right-vertex)
                      ;; nil returned from the inner call is a signal
                      ;; that unification of the edge we're triggering off
                      ;; failed, so success with any combination is impossible
                      #-:vanilla (return-from try-grammar-rule-left nil)
                      #+:vanilla t)))
              finally (return t)) ; return t, because we don't want an outer loop to throw
          ;; we've got all the bits
          (with-agenda (when f (rule-priority rule))
            (apply-immediate-grammar-rule rule left-vertex right-vertex
                                          child-edge-list f t)))
      (progn (incf (statistics-ftasks *statistics*)) t)))

(defun try-grammar-rule-right (rule rule-restricted-list left-vertex right-vertex
                               child-edge-list f n)
  (declare (type fixnum n))
  (if (and (check-rule-filter rule (edge-rule (car child-edge-list)) n)
           (restrictors-compatible-p
            (car rule-restricted-list)
            (edge-dag-restricted (car child-edge-list))))
      (if (cdr rule-restricted-list)
          ;; look rightwards for the next daughter, i.e. at edges starting at
          ;; right-vertex
          (loop
              for r from (1+ right-vertex) to *chart-max*
              for entry = (aref *chart* right-vertex r)
              do
                (dolist (config entry)
                  (unless (edge-partial-tree (chart-configuration-edge config))
                    (unless
                        (try-grammar-rule-right rule
                                                (cdr rule-restricted-list)
                                                left-vertex
                                                (chart-configuration-end config)
                                                (cons (chart-configuration-edge config)
                                                      child-edge-list)
                                                f (1+ n))
                      #-:vanilla (return-from try-grammar-rule-right nil)
                      #+:vanilla t)))
              finally (return t))
          ;; we've got all the bits
          (with-agenda (when f (rule-priority rule))
            (apply-immediate-grammar-rule rule left-vertex right-vertex
                                          child-edge-list f nil)))
      (progn (incf (statistics-ftasks *statistics*)) t)))

(defparameter *debugging* nil)

(defun apply-immediate-grammar-rule (rule left-vertex right-vertex
                                     child-edge-list f backwardp)
  ;; attempt to apply a grammar rule when we have all the parts which match
  ;; its daughter categories
  (cond
   ((and *brackets-list*
         (not (consistent-bracketing-p
               (mapcar #'(lambda (edge) (cons (edge-from edge) (edge-to edge)))
                       child-edge-list)
               *brackets-list*)))
    t) ; t because we don't want the first-failed-p effect
   (t
    #+:pdebug
    (format t "~&try-immediate-grammar-rule(): `~(~a~) <-- ~{~d~^ ~} [~d -- ~d]"
            (rule-id rule) (loop for e in child-edge-list collect (edge-id e))
            left-vertex right-vertex)
    (let ((child-edge-list-reversed (reverse child-edge-list)))
      (multiple-value-bind (unification-result first-failed-p)
          (evaluate-unifications rule
                                 (mapcar #'edge-dag child-edge-list-reversed)
                                 nil
                                 child-edge-list-reversed
                                 backwardp
                                 (loop for edge in child-edge-list
                                     minimize (edge-cfrom edge))
                                 (loop for edge in child-edge-list
                                     maximize (edge-cto edge)))
        (if unification-result
            (let* ((edge-list
                    (if backwardp child-edge-list child-edge-list-reversed))
                   (new-edge
                    (make-edge :id (next-edge)
                               :category (indef-type-of-tdfs unification-result)
                               :rule rule
                               :children edge-list
                               :dag unification-result
                               :lex-ids (mapcan
                                         #'(lambda (child)
                                             (copy-list (edge-lex-ids child)))
                                         edge-list)
                               :leaves (mapcan
                                        #'(lambda (child)
                                            (copy-list (edge-leaves child)))
                                        edge-list)
                               :partial-tree (edge-partial-tree (first edge-list))
                               ;; should be unary-rule if there is a partial
                               ;; tree (worry about compounds later ...)
                               ;; WARNING
                               )))
              #+pdebug (format t " ... success.~%")
              (activate-context left-vertex new-edge right-vertex f)
              t)
            (progn
              #+pdebug
              (format t " ... 
~:[fail~;throw~].~%" first-failed-p) (if first-failed-p nil t)))))))) (defun evaluate-unifications (rule child-fs-list &optional nu-orth child-edge-list backwardp cfrom cto) ;; nu-orth is the new orthography if the unifications relate to a morphological ;; process - to be inserted into the result fs (let ((current-tdfs (rule-full-fs rule)) (rule-daughter-order (if backwardp (rule-daughters-order-reversed rule) (cdr (rule-order rule)))) (n -1) (new-orth-fs ;; shouldn't really make a new tdfs here because we won't need it if a unification ;; fails, but we must do it outside the unification context since they cannot nest (when nu-orth (if (tdfs-p nu-orth) nu-orth (make-orth-tdfs nu-orth))))) (declare (type fixnum n)) (with-unification-context (ignore) (dolist (rule-feat rule-daughter-order) (incf n) (let ((child-edge (pop child-edge-list))) (cond ((zerop n)) ((x-restrict-and-compatible-p (if (listp rule-feat) (x-existing-dag-at-end-of (deref-dag (tdfs-indef current-tdfs)) rule-feat) (x-get-dag-value (deref-dag (tdfs-indef current-tdfs)) rule-feat)) (edge-dag-restricted child-edge))) (t (incf (statistics-ftasks *statistics*)) (return-from evaluate-unifications nil)))) (incf (statistics-etasks *statistics*)) (let ((child (pop child-fs-list))) ;; if two daughters are eq, avoid the possibility of the unifier's subgraph ;; sharing code creating spurious coreferences in the result (when (member child child-fs-list :test #'eq) (setq child (copy-tdfs-completely child))) (if (setq current-tdfs (yadu current-tdfs (create-temp-parsing-tdfs child rule-feat))) (incf (statistics-stasks *statistics*)) (return-from evaluate-unifications (values nil (zerop n)))))) ; first attempt failed? ;; if (car (rule-order rule)) is nil then tdfs-at-end-of will return the ;; entire structure (let ((result (tdfs-at-end-of (car (rule-order rule)) current-tdfs))) (when new-orth-fs (setq result (yadu result new-orth-fs))) (when result (restrict-and-copy-tdfs result :cfrom cfrom :cto cto)))))) (defun create-temp-parsing-tdfs (tdfs flist) (if (null flist) tdfs (let ((indef-dag (create-dag)) (tail nil)) (unify-list-path flist indef-dag (tdfs-indef tdfs)) (when (tdfs-tail tdfs) (let ((path (create-path-from-feature-list (if (listp flist) flist (list flist))))) (loop for tail-element in (tdfs-tail tdfs) do (push (add-path-to-tail path tail-element) tail)))) (make-tdfs :indef indef-dag :tail tail)))) ;;; *********************************************************** ;;; ;;; Finishing off ;;; ;;; ********************************************************** (defun find-spanning-edges (start-vertex end-vertex) ;; return all complete edges between start and end vertices (loop for c in (aref *chart* start-vertex end-vertex) unless (edge-partial-tree (chart-configuration-edge c)) collect (chart-configuration-edge c))) (defparameter *additional-root-condition* nil "defined in mrs/idioms.lisp") (defun filter-root-edges (edge start-symbols &optional (full-check-p t)) ;; if any of the start-symbols unify then this may be a parse. ;; NB despite the name of the function, the 1st arg is definitely a single edge. ;; If full-check-p is nil, skip any additional root condition test. ;; Adding a quick check here unlikely to filter out many candidates. 
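;; Illustrative call (hypothetical start-symbol name `root'):
;;   (filter-root-edges edge '(root))  =>  (list edge) if the edge's dag is
;;   compatible with a root condition, otherwise nil.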
;; Ensure any changes here remain compatible with tsdb/lisp/derivations.lisp (and (loop for start-symbol in start-symbols for rtdfs = (get-tdfs-given-id start-symbol) ; might be a type or a root entry thereis (and rtdfs (yaduablep rtdfs (edge-dag edge)))) (if (and full-check-p *additional-root-condition*) (funcall *additional-root-condition* (edge-dag edge)) t) (list edge))) (defun compute-root-edges (edges start-symbols) (if (and start-symbols *substantive-roots-p*) (loop for edge in edges nconc (loop ;; by this stage we know that at least one of the roots is unifiable - use ;; the first of these to build a single new top edge for start-symbol in start-symbols for rtdfs = (get-tdfs-given-id start-symbol) ; might be a type or a root entry when rtdfs do (let ((unif (yadu rtdfs (edge-dag edge)))) (when unif (let ((new-edge (make-edge :id (next-edge) :category (indef-type-of-tdfs unif) :rule start-symbol :dag (copy-tdfs-elements unif) :dag-restricted (restrict-fs (tdfs-indef unif)) :children (list edge) :leaves (edge-leaves edge) :lex-ids (edge-lex-ids edge) :from (edge-from edge) :to (edge-to edge)))) (add-to-chart (edge-from edge) new-edge (edge-to edge) nil) ; don't (recursively) check for success (return (list new-edge))))))) edges)) ;;; *************************************************************** ;;; ;;; TTY printout of chart ;;; chart edges are ordered on: right vertex, left vertex, edge id ;;; ;;; *************************************************************** (defun print-chart (&key frozen concise (stream t)) (format stream "~%> chart dump:~%") (when *chart* (loop for end from 1 to *chart-max* do (print-chart-entry end (loop for start from 0 to (1- end) append (aref *chart* start end)) :frozen frozen :concise concise :stream stream))) (terpri stream)) (defun print-tchart (&key frozen concise (stream t)) (format stream "~%> token/spelling chart dump:~%") (when *tchart* (loop for end from 1 to *tchart-max* do (print-chart-entry end (aref *tchart* end 0) :frozen frozen :concise concise :stream stream))) (terpri stream)) (defun print-chart-entry (vertex item &key frozen concise (stream t)) (when item (terpri stream) (dolist (configuration (sort (copy-list item) #'(lambda (span1 span2) (cond ((eql (chart-configuration-begin span1) (chart-configuration-begin span2)) (< (edge-id (chart-configuration-edge span1)) (edge-id (chart-configuration-edge span2)))) (t (< (chart-configuration-begin span1) (chart-configuration-begin span2))))))) (print-chart-item configuration vertex :frozen frozen :concise concise :stream stream)))) (defun print-chart-item (item &optional end &key (frozen nil frozenp) concise (stream t)) (declare (ignore concise)) (let ((edge (if (edge-p item) item (chart-configuration-edge item))) (begin (unless (edge-p item) (chart-configuration-begin item))) (roots (unless (edge-p item) (chart-configuration-roots item)))) (when (or (null frozenp) (eq (edge-frozen edge) frozen)) (format stream "~&~:[~2*~;~A-~A ~][~A] ~A => ~A~A [~{~A~^ ~}]" (and begin end) begin end (edge-id edge) (if (token-edge-p edge) (token-edge-word edge) (if (morpho-stem-edge-p edge) (format nil "~A+~{~A ~}" (morpho-stem-edge-stem edge) (mapcar #'(lambda (rule-spec) (concise-rule-name (pt-node-rule rule-spec))) (morpho-stem-edge-partial-tree edge))) ;;; WARNING - assume partial-tree is a list (concise-edge-label edge))) (edge-leaves edge) (cond ;; ((token-edge-p edge) " T") (roots "*") (t "")) (or (loop for child in (edge-children edge) collect (edge-id child)) (loop for child in (edge-tchildren edge) 
collect (edge-id child)))) (format stream "~:[~2*~; ~:[+~;~]~d~]" (edge-frozen edge) (and (edge-frozen edge) (minusp (edge-frozen edge))) (edge-frozen edge)) ;; ;; if applicable, print out compact summary of packings (9-aug-99 - oe) ;; (when (or (edge-equivalent edge) (edge-packed edge)) (let ((edge (first (or (edge-equivalent edge) (edge-packed edge))))) (format stream " { [~d < ~{~d~^ ~}" (edge-id edge) (loop for child in (edge-children edge) collect (edge-id child)))) (loop for edge in (rest (edge-equivalent edge)) do (format stream "; ~d < ~{~d~^ ~}" (edge-id edge) (loop for child in (edge-children edge) collect (edge-id child)))) (loop for edge in (if (edge-equivalent edge) (edge-packed edge) (rest (edge-packed edge))) do (format stream "; ~d < ~{~d~^ ~}" (edge-id edge) (loop for child in (edge-children edge) collect (edge-id child)))) (format stream "]")) ;; print cfrom/cto (when (token-edge-p edge) (with-slots (cfrom cto) edge (format stream "~:[~2*~; <~A c ~A> ~]" (and cfrom cto) cfrom cto))) ;; print xfrom/xto (when (token-edge-p edge) (with-slots (xfrom xto) edge (format stream "~:[~2*~; <~A x ~A> ~]" (and xfrom xto) xfrom xto))) (format stream "~%")))) (defun concise-edge-label (edge) (if (rule-p (edge-rule edge)) (rule-id (edge-rule edge)) (first (edge-lex-ids edge)))) ;;; ************************************************************** ;;; ;;; Parsing sentences from file (when the fine system is just too ;;; much ...) ;;; ;;; ************************************************************** (defun parse-sentences (&optional input-file (output-file nil output-file-p) &rest rest) ;; if output-file is nil or t, then output is suppressed or goes to standard-output, ;; respectively (unless input-file (setq input-file (or (ask-user-for-existing-pathname "Sentence file?") (return-from parse-sentences)))) (cond ((not (probe-file input-file)) (show-message-window (format nil "Input file `~a' does not exist" input-file)) (return-from parse-sentences)) ((file-xml-p input-file) ; if xml input, assume SAF XML (return-from parse-sentences (apply #'process-saf-file-sentences input-file rest))) (t (setq input-file (truename input-file)))) ;; (unless output-file-p (setq output-file (or (ask-user-for-new-pathname "Output file?" 
input-file) (return-from parse-sentences)))) (setq output-file (cond ((member output-file '(nil t)) output-file) ((probe-file output-file)) (t (merge-pathnames output-file)))) (when (equal output-file input-file) (show-message-window (format nil "Not permitted to overwrite the input file `~a'" input-file)) (return-from parse-sentences)) ;; (with-open-file (istream input-file :direction :input) (if (member output-file '(nil t)) (batch-parse-sentences istream output-file #'extract-fine-system-sentence) (with-open-file (ostream output-file :direction :output :if-exists :supersede :if-does-not-exist :create) (batch-parse-sentences istream ostream #'extract-fine-system-sentence))))) (defparameter *do-something-with-parse* nil) (defparameter *lex-ids-used* nil) (defparameter *parse-input* nil) (defparameter *ostream* nil) (defun batch-parse-sentences (istream ostream &optional access-fn) ;; if *do-something-with-parse* is bound to a function, then this is called after parsing ;; each sentence; otherwise the value of ostream determines what is output: either ;; nil (only final summary, to standard output), t (progress and summary to standard output), ;; or a stream (ditto to the stream) (setf *unanalysed-tokens* nil) (clear-type-cache) (format t "~%;;; Parsing test file~%") (force-output) (let ((nsent 0) (edge-total 0) (parse-total 0) (*print-right-margin* 300) raw-sentence (start-time (get-internal-run-time)) #+:sbcl (gc-bytes (sb-ext:bytes-consed-between-gcs))) #+:sbcl (setf (sb-ext:bytes-consed-between-gcs) (truncate (* #+:x86-64 1 #-:x86-64 0.5 (expt 2 30)))) ; double usual (unwind-protect (loop (setq raw-sentence (read-line istream nil 'eof)) (when (eq raw-sentence 'eof) (unless (fboundp *do-something-with-parse*) (let ((out (or ostream t))) (format out "~&~%;;; Total CPU time: ~A msecs~%" (round (* (- (get-internal-run-time) start-time) 1000) internal-time-units-per-second)) (format out ";;; Mean edges: ~,2F~%" (/ edge-total nsent)) (format out ";;; Mean parses: ~,2F~%" (/ parse-total nsent)))) (format t "~%;;; Finished test file~%") (finish-output) (lkb-beep) (return)) (let ((sentence (string-trim '(#\Space #\Tab #\Newline) (if access-fn (funcall access-fn raw-sentence) raw-sentence)))) (unless (or (zerop (length sentence)) (char= (char sentence 0) #\;)) (incf nsent) (unless (fboundp *do-something-with-parse*) ;; if we're doing something else, let that function control output (when ostream (format ostream "~&~A ~A " nsent sentence) (force-output (if (eq ostream t) nil ostream)))) (let ((parse-input (split-into-words (if (fboundp 'preprocess-sentence-string) (funcall (symbol-function 'preprocess-sentence-string) sentence) sentence))) ;; ask for recycling of safe dags from the pool ;; NB lexical entries must not contain safe dags - so expand-psort-entry ;; and friends must rebind *safe-not-to-copy-p* to nil #+:pooling (*dag-recycling-p* t)) #+:pooling (reset-pools #+:gdebug :forcep #+:gdebug t) (#-:gdebug handler-case #+:gdebug progn (progn (parse parse-input nil) (setf *sentence-number* nsent) (setf *sentence* sentence) (setf *parse-input* parse-input) (setf *ostream* ostream) (if (fboundp *do-something-with-parse*) (funcall *do-something-with-parse*) (when ostream (format ostream "~A ~A~%" (length *parse-record*) *edge-id*)))) #-:gdebug (storage-condition (condition) (format t "~&Memory allocation problem: ~A caused by ~A~%" condition raw-sentence)) #+(and (not :gdebug) (or :allegro :sbcl :ccl)) (#+:allegro excl:interrupt-signal #+:sbcl sb-sys:interactive-interrupt #+:ccl 
ccl:interrupt-signal-condition () (error "Interrupt signalled")) #-:gdebug (error (condition) (format t "~&Error: ~A caused by `~A'~%" condition raw-sentence))) (incf parse-total (length *parse-record*)) (incf edge-total *edge-id*))))) (setq *lex-ids-used* (collect-expanded-lex-ids *lexicon*)) (flush-agenda *agenda*) ; clear out agenda in case of early exit from last parse #+:pooling (reset-pools #+:gdebug :forcep #+:gdebug t) (clear-chart) ; prevent access to any dags in recycling pool #+:sbcl (setf (sb-ext:bytes-consed-between-gcs) gc-bytes) #+:sbcl (sb-ext:gc :full t) ; release unused memory, shrinking process size ))) (defun edge-count nil (let ((distinct-parse-edges nil)) (dolist (p *parse-record*) (setq distinct-parse-edges (local-parse-tsdb-distinct-edges p distinct-parse-edges))) (length distinct-parse-edges))) (defun local-parse-tsdb-distinct-edges (edge found) ;; same as parse-tsdb-distinct-edges but don't want ;; to necessarily load tsdb (pushnew edge found :test #'eq) (when (and (edge-children edge) (not (lexical-rule-p (edge-rule edge)))) (dolist (c (edge-children edge)) (setq found (local-parse-tsdb-distinct-edges c found)))) found) ;;; ************************************************************* ;;; ;;; generator utilities and structures ;;; ;;; ************************************************************ ;;; extracting a list of lexical entries used in a parse ;;; used for testing the generation lexical lookup algorithm (defun retrieve-lex-from-parses nil (loop for edge in *parse-record* collect (edge-lex-ids edge))) ; (collect-parse-base (car *parse-record*)) (defun collect-parse-base (edge-rec) ;;; takes a top edge, returns a list of ;;; lexical identifiers, unary-rule-list pairs (if (or (cdr (edge-lex-ids edge-rec)) (and (rule-p (edge-rule edge-rec)) (not (lexical-rule-p (edge-rule edge-rec))))) (loop for child in (edge-children edge-rec) append (collect-parse-base child)) (list (cons (car (edge-lex-ids edge-rec)) (nreverse (collect-unary-rule-names edge-rec)))))) (defun collect-unary-rule-names (edge-rec) (when (cdr (edge-children edge-rec)) (error "~%Should be unary edge ~A" edge-rec)) (if (edge-children edge-rec) (cons (rule-id (edge-rule edge-rec)) (collect-unary-rule-names (car (edge-children edge-rec)))))) ;;; edge structures (defstruct (dotted-edge (:include edge) (:constructor make-dotted-edge-x (&key id score category rule dag odag (dag-restricted (when dag (restrict-fs (tdfs-indef dag)))) leaves lex-ids parents children tchildren orth-tdfs partial-tree (from (when (edge-p (first children)) (edge-from (first children)))) (to (when (edge-p (first (last children))) (edge-to (first (last children))))) label head (lnk (let ((first (first children)) (last (first (last children)))) (when (and (edge-p first) (edge-p last)) (mrs::combine-lnks (edge-lnk first) (edge-lnk last))))) (cfrom (when (edge-p (first children)) (edge-cfrom (first children)))) (cto (when (edge-p (first (last children))) (edge-cto (first (last children))))) string mrs flags foo bar baz packed equivalent frozen adjuncts unpacking res needed))) res ; feature name of mother in rule dag in active edges needed ; ordered list of names of daughter features still to be found ) (defun make-dotted-edge (&rest rest) (declare (dynamic-extent rest)) (register-edge (apply #'make-dotted-edge-x rest))) (defstruct (g-edge (:include dotted-edge) (:constructor make-g-edge-x (&key id score category rule dag odag (dag-restricted (when dag (restrict-fs (tdfs-indef dag)))) leaves lex-ids parents children tchildren 
orth-tdfs partial-tree (from (when (edge-p (first children)) (edge-from (first children)))) (to (when (edge-p (first (last children))) (edge-to (first (last children))))) label head (lnk (let ((first (first children)) (last (first (last children)))) (when (and (edge-p first) (edge-p last)) (mrs::combine-lnks (edge-lnk first) (edge-lnk last))))) (cfrom (when (edge-p (first children)) (edge-cfrom (first children)))) (cto (when (edge-p (first (last children))) (edge-cto (first (last children))))) string mrs flags foo bar baz packed equivalent frozen adjuncts unpacking res needed rels-covered index lexemes mod-index accessible))) ;; category, id, rule, leaves: filled in, not used in generation algorithm ;; itself, but needed for debug output, chart display etc rels-covered ; set of relations generated so far index lexemes ; non-ordered set of found-lex structures mod-index ; 0-based index to modifier under intersective rule instantiation accessible ; indices accessible in this edge ) (defun make-g-edge (&rest rest) (declare (dynamic-extent rest)) (register-edge (apply #'make-g-edge-x rest))) ;;; morphology for the generator (defun apply-morph-rule (rule fs fs-restricted new-orth &optional daughter) #-:debug (declare (ignore daughter)) ;; ;; _fix_me_ ;; (let* ((qc (restrictors-compatible-p (car (rule-daughters-restricted rule)) fs-restricted)) (result (and qc (evaluate-unifications rule (list fs) new-orth)))) #+:debug (when qc (format t "apply-morph-rule(): ~a + ~a: ~:[nil~;t~] ~:[nil~;t~]~%" (rule-id rule) (if (rule-p daughter) (rule-id daughter) daughter) qc result)) result)) ;; ;; [bmw] some utility functions ;; (defun get-edges (&optional (tchart *tchart*)) (loop for i from 1 to (1- *chart-limit*) for ccs-incident = (aref tchart i 0) append (loop for cc in ccs-incident for edge = (chart-configuration-edge cc) when (edge-p edge) collect edge))) (defun get-tedges (&optional (tchart *tchart*)) (loop for i from 1 to (1- *chart-limit*) for ccs-incident = (aref tchart i 0) append (loop for cc in ccs-incident for edge = (chart-configuration-edge cc) when (token-edge-p edge) collect edge))) (defun get-medges (&optional (tchart *tchart*)) (loop for i from 1 to (1- *chart-limit*) for ccs-incident = (aref tchart i 0) append (loop for cc in ccs-incident for edge = (chart-configuration-edge cc) when (morpho-stem-edge-p edge) collect edge))) (defun get-unanalysed-tedges (&optional (tchart *tchart*)) ;; [bmw] return tedges with no medge ancestors (let ((tedges (get-tedges tchart)) (medges (get-medges tchart))) (loop for medge in medges for tchildren = (edge-children medge) do (setf tedges (set-difference tedges tchildren))) tedges)) (defun get-unanalysed-and-unspanned-tedges nil ;; [bmw] return tedges with no accompanying medge and for which there ;; exists no alternative lattice path of medges (loop for tedge in (get-unanalysed-tedges) for source = (edge-from tedge) for target = (edge-to tedge) unless (medge-spanned-p source target) collect tedge)) (defun get-min-edge-cfrom (edges) (when edges (loop for e in edges minimize (edge-cfrom e)))) (defun get-max-edge-cto (edges) (when edges (loop for e in edges maximize (edge-cto e))))
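;;; Illustrative usage sketch for the utility functions above; kept inside a
;;; block comment so nothing is executed at load time.  It assumes some input
;;; has already been parsed (so *tchart* is populated) and that the tokeniser
;;; recorded character positions (cfrom/cto).
#|
(let ((tedges (get-tedges)))
  (format t "~&~d token edge(s), spanning characters ~a-~a~%"
          (length tedges)
          (get-min-edge-cfrom tedges)
          (get-max-edge-cto tedges))
  (format t "~&tokens with no lexical analysis: ~a~%"
          (mapcar #'token-edge-word (get-unanalysed-and-unspanned-tedges))))
|#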