;;; Copyright (c) 1991-2022
;;; John Carroll, Ann Copestake, Robert Malouf, Stephan Oepen, Ben Waldron
;;; see `LICENSE' for conditions.

;;; AAC April 2005 - rationalisation of morphology and MWEs
;;; - also preparation for chart output from tokeniser
;;; Notes 1. removed ltemplates - I don't have a grammar etc to test
;;; this mechanism, but I rather think it will be superseded by the
;;; new approach when we get this fully implemented
;;; 2. sensible messages when missing lexical entries (or unknown words)
;;; 3. more or less minor FIXes as below (unify-in)

(in-package :lkb)

;;; This file implements the chart parser.

;;; The chart data structures:
;;;
;;; The token chart is a 2D array with dimension 0 being a vertex number v, and
;;; dimension 1 holding the edges ending at vertex v (0) / the edges starting at
;;; vertex v (1).
;;;
;;; The parse chart is a 2D array with dimensions being the start vertex and the
;;; end vertex.

(defvar *parser-rules* nil)
(defvar *parser-lexical-rules* nil)

(defvar *sentence* nil)
(defvar *sentence-number* nil)

(defvar *tchart-max* 0
  "set by the tokeniser to the maximum vertex in the token chart")
(defvar *chart-max* 0
  "set by the stem edge addition code to the maximum vertex in the parse chart")

(defvar *chart-dependencies*)

(defvar *chart-generation-counter* 0
  "counter used by the user interface to make sure parse tree windows etc have the current parse chart")

(defparameter *chart-packing-p* t) ; JAC 2-Aug-17: changed to t, having fixed a few issues

(defparameter *unpacking-scoring-hook* nil)

(defparameter *cm-debug* nil) ; t/1: rule applications; 2: +regex matching; 3: +rule attempts

;; [bmw] some globals for customising the behaviour of the 'parse' function
(defvar *generate-messages-for-all-unanalysed-tokens* nil)
(defvar *unanalysed-tokens* nil)
(defvar *abort-parse-after-morphosyntax* nil)
(defvar *abort-parse-after-lexical-lookup* nil)
(defvar *abort-parse-if-no-spanning-morph-edge-path* nil)
(defvar *fallback-pos-p* nil)

;;;
;;; [bmw] HOWTO collect into *unanalysed-tokens* tokens for which lexical lookup
;;; failed:
;;; - set *generate-messages-for-all-unanalysed-tokens* to T
;;; - set *clean-tchart-p* to T (ensures no duplicate token edges)
;;; - ... parse some input ...
;;; - look in *unanalysed-tokens* for bag of token instances
;;;

;;;
;;; turn on the active key-driven parsing strategy, since it:
;;;
;;; - performs better than the default unidirectional passive breadth-first
;;;   search (up to 40% time reduction on longer VerbMobil sentences);
;;; - simplifies best-first search and generalizes to more than just binary
;;;   branching rules;
;;; - outperforms passive in best-first mode.
;;;                                                        (20-jan-00 - oe)
;;;
(defvar *active-parsing-p* t)

;;;
;;; each cell in *chart* contains a list of edges (used to be a list of
;;; chart-configuration structures)
;;;
;;; for now it seems best to keep the token and morphophonology chart
;;; (*tchart*) separate from the `real' one (*chart*). It seems possible
;;; that the number of vertices in *chart* may end up being
;;; greater than the number in *tchart* if we allow compounding.
;;; Keeping the charts distinct also avoids having to filter
;;; out the token edges and the morphop edges.
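;;;
;;; An illustrative sketch (not part of the parser - the helper name is
;;; invented) of how the two charts are indexed, following the conventions
;;; described above:
#|
(defun sketch-print-charts ()
  ;; token/morphophonology chart: (aref *tchart* v 0) holds the edges ending at
  ;; vertex v, (aref *tchart* v 1) the edges starting at vertex v
  (dotimes (v (1+ *tchart-max*))
    (format t "~&tchart vertex ~a: ending ~a / starting ~a"
            v (aref *tchart* v 0) (aref *tchart* v 1)))
  ;; parse chart: (aref *chart* from to) holds the edges spanning from..to
  (dotimes (from *chart-max*)
    (loop for to from (1+ from) to *chart-max*
          when (aref *chart* from to)
          do (format t "~&chart ~a-~a: ~a" from to (aref *chart* from to)))))
|#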
;;; (defvar *chart* (make-array '(1 1) :initial-element nil)) (declaim (type (simple-array t (* *)) *chart*) #+:sbcl (sb-ext:always-bound *chart*)) (defvar *tchart* (make-array '(1 2) :initial-element nil)) (declaim (type (simple-array t (* 2)) *tchart*) #+:sbcl (sb-ext:always-bound *tchart*)) (defvar *parse-record* nil) ;;; ;;; The 'edge registry' globally kept track of edges; however it had only one programmatic ;;; user - and now none, after a change to more efficiently get edges from the generator ;;; result. Therefore I'm disabling it. ;;; More generally, we should try to minimise the use of such global data structures since ;;; they can lead to memory leaks; setting them to nil does not guarantee timely garbage ;;; collection of their contents (for example if we're unlucky and some of their memory ;;; cells have been promoted to a long-lived generation). JAC 22-Mar-2021 ;;; (defparameter *edge-registry* nil) (defun purge-edge-registry () (setq *edge-registry* nil)) #| (defun register-edge (edge) ;; (push edge *edge-registry*) edge) (defun retrieve-edge (&key id) ;; JAC 22-Mar-2021: get edges with find-[gen-]edge-... instead ;; (when (numberp id) ;; (find id *edge-registry* :key #'edge-id)) (declare (ignore id)) nil) |# ;;; An edge is a structure with the following slots: ;;; ;;; dag - the dag associated with the constituent ;;; dag-restricted - quickcheck vector corresponding to the dag ;;; rule - either the word ("storms" etc) or the grammar rule itself which has ;;; been applied to create that edge ;;; open ;;; forwardp ;;; partial-tree - specification of affixes etc ;;; frozen - used by packing machinery ;;; packed ;;; equivalent ;;; unpacking ;;; children - daughter edges ;;; from - vertex ;;; to - vertex ;;; id - unique ;;; score - for best-first ;;; category - a type ;;; odag - dag at bottom of chain of lexical rules ;;; leaves - orthography of whatever this edge has been formed from ;;; lex-ids - like leaves, but identifiers of whole structures ;;; parents - for propagating freezing in packing ;;; orth-tdfs - fs containing orthography ;;; label - used to stash a human-readable name for the edge ;;; head - used in maxent.lisp ;;; lnk ;;; cfrom - character position ;;; cto - character position ;;; string - original string - potentially redundant ;;; because it could be recovered from cfrom and cto ;;; but useful to keep it locally ;;; mrs - generation etc ;;; tchildren - token daughter edges (for debugging) ;;; ;;; `flags', `foo,' `bar', and `baz' are scratch slots for various pieces of code ;;; (e.g. the Redwoods annotation tool and PCFG estimation) to use at their ;;; discretion. (28-oct-02; oe) ;;; ;;; 'foo' plays an important role in unpacking, so care must be taken if it is ;;; used elsewhere. ;;; ;;; NB when a structure 'inherits' from another with :include (e.g. g-edge ;;; inheriting from edge), if we want the same slot defaults as computed in the ;;; constructor for the structure being inherited from then we must take care to ;;; repeat these defaults in the inheriting structure's constructor. 
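;;;
;;; A minimal usage sketch of the edge accessors created by the structure
;;; definitions below (it assumes a grammar is loaded that analyses this
;;; input - the example sentence is arbitrary):
#|
(parse '("the" "dog" "barks") nil)
(let ((edge (first *parse-record*)))
  (list (edge-id edge) (edge-from edge) (edge-to edge)
        (edge-leaves edge) (edge-category edge)))
;; => the edge's id, its chart vertices, its leaf strings and its category (a type)
|#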
(declaim (inline make-edge make-token-edge make-morpho-stem-edge make-g-edge)) (defstruct (edge (:constructor make-edge (&key dag dag-restricted rule open forwardp partial-tree frozen packed equivalent unpacking children (from (when (edge-p (first children)) (edge-from (first children)))) (to (when (edge-p (first (last children))) (edge-to (first (last children))))) id score category odag leaves lex-ids parents orth-tdfs label head (lnk (let ((first (first children)) (last (first (last children)))) (when (and (edge-p first) (edge-p last)) (mrs::combine-lnks (edge-lnk first) (edge-lnk last))))) (cfrom (if (edge-p (first children)) (edge-cfrom (first children)) -1)) (cto (if (edge-p (first (last children))) (edge-cto (first (last children))) -1)) string mrs tchildren flags foo bar baz))) dag dag-restricted rule open forwardp partial-tree frozen packed equivalent unpacking children from to id score category odag leaves lex-ids parents orth-tdfs label head lnk cfrom cto string mrs tchildren flags foo bar baz) (defstruct (token-edge (:include edge)) word maf-id xfrom xto) (defstruct (morpho-stem-edge (:include edge)) word stem current l-content) (defstruct (g-edge (:include edge) (:constructor make-g-edge (&key dag dag-restricted rule open forwardp partial-tree frozen packed equivalent unpacking children from to id score category odag leaves lex-ids parents orth-tdfs label head lnk (cfrom -1) (cto -1) string mrs tchildren flags foo bar baz res needed rels-covered index lexemes accessible mod-index adjuncts))) res ; feature name of mother in rule dag in active edges needed ; ordered list of names of daughter features still to be found rels-covered ; set of relations accounted for by edge index ; semantic index of FS lexemes ; non-ordered set of found-lex structures accessible ; semantic indices accessible in this edge mod-index ; 0-based index to modifier under intersective rule instantiation adjuncts) #+:sbcl (declaim (sb-ext:freeze-type edge token-edge morpho-stem-edge g-edge)) (defmethod print-object ((instance edge) stream) (format stream "#[Edge # ~d: `~(~a~)' <~{~a~^ ~}>" (edge-id instance) (let ((rule (edge-rule instance))) (if rule (typecase rule ((or string symbol) rule) (rule (rule-id rule))) (edge-category instance))) (loop for child in (edge-children instance) collect (if (edge-p child) (edge-id child) "_"))) (when (or (edge-packed instance) (edge-equivalent instance)) (format stream " {~{~a~^ ~}}" (loop for edge in (append (edge-packed instance) (edge-equivalent instance)) collect (edge-id edge)))) (when (edge-lnk instance) (format stream " ") (mrs:output-lnk (edge-lnk instance) :stream stream)) (with-slots (cfrom cto) instance (format stream "~:[~2*~; (~A c ~A) ~]" (and cfrom cto) cfrom cto)) (format stream "]")) (defmethod print-object ((instance token-edge) stream) (format stream "#[Token edge # ~d: ~S ~S ~S" (token-edge-id instance) (token-edge-word instance) (token-edge-from instance) (token-edge-to instance)) (with-slots (cfrom cto) instance (format stream "~:[~2*~; (~A c ~A) ~]" (and cfrom cto) cfrom cto)) (format stream "]")) (defmethod print-object ((instance morpho-stem-edge) stream) (format stream "#[Morph edge # ~d: ~S ~S ~S ~S ~A" (morpho-stem-edge-id instance) (morpho-stem-edge-word instance) (morpho-stem-edge-current instance) (morpho-stem-edge-from instance) (morpho-stem-edge-to instance) (morpho-stem-edge-partial-tree instance)) (with-slots (cfrom cto) instance (format stream "~:[~2*~; (~A c ~A) ~]" (and cfrom cto) cfrom cto)) (format stream "]")) ;;; ;;; when 
unpacking, edges are a lot cheaper, as we are deterministic at this ;;; point (e.g. there will be very few failing unifications, if any). hence, ;;; impose a separate limit for unpacking, although it could be argued that we ;;; should count on top of the edges built already (like PET does). i prefer ;;; independent counting, however, since active edges (in generation at least) ;;; are also included in the edge count, but we could dispose of them when we ;;; move on into unpacking. (26-nov-04; oe) ;;; (defparameter %edge-allowance% 0) (defvar *edge-id* 0) (defun next-edge (&optional type) ;; return new id for a passive edge (not active) ;; edges created during unpacking are accounted for separately and do not count ;; towards the overall *maximum-number-of-edges* limit - this gives more control ;; over unpacking (if (eq type :unpack) (incf (statistics-uedges *statistics*)) (incf (statistics-pedges *statistics*))) (if (eq type :unpack) (when (>= (incf %edge-allowance%) *unpack-edge-allowance*) (error "Hit limit of ~a edges built during unpacking ~ (see documentation for *unpack-edge-allowance*)" *unpack-edge-allowance*)) (when (>= *edge-id* (+ *maximum-number-of-edges* %edge-allowance%)) (error "Hit limit of ~a chart edges, possibly due to a runaway rule ~ (see documentation for *maximum-number-of-edges*)" *maximum-number-of-edges*))) (incf *edge-id*)) ;;; ;;; Keep track of lexical entries used in each multi-word: whenever we use the same ;;; lexical entry the second time, we copy it first to avoid spurious reentrancies. ;;; The variable has no global value since it should be bound to nil afresh for ;;; each multi-word (or non-multi-word) stem ;;; (defvar *lexical-entries-used*) ;;; Agenda handling - based on a priority queue implemented as a max-heap: a ;;; balanced binary tree stored in flattened form in an array (see e.g. ;;; "Introduction to Algorithms" by Cormen, Leiserson, Rivest & Stein). Each heap ;;; element consists of a numeric key (the priority) and an associated data value, ;;; held in the array as a cons pair. Priorities are coerced to single-floats on ;;; insertion to avoid generic arithmetic comparison operations. ;;; ;;; To allow the heap to resize transparently, there is a level of indirection ;;; between the agenda (a structure) and the heap (a simple-vector inside the ;;; structure). This indirection could have been pushed down and hidden within the ;;; Lisp system by having the agenda itself be an adjustable array - but then ;;; every heap element access in heapify and agenda-insert would incur ;;; unacceptable overhead. 
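;;;
;;; A minimal usage sketch of the agenda operations defined below (the
;;; priorities and values are arbitrary; the highest priority is extracted
;;; first):
#|
(let ((agenda (new-agenda)))
  (agenda-insert agenda 1.0 'low)
  (agenda-insert agenda 5 'high)     ; priorities are coerced to single-floats
  (agenda-insert agenda 2.5 'middle)
  (list (agenda-extract-max agenda)  ; => HIGH
        (agenda-extract-max agenda)  ; => MIDDLE
        (agenda-empty-p agenda)))    ; => NIL, since LOW is still queued
|#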
(defstruct (agenda (:constructor new-agenda)) (a (vector 0 nil nil nil) :type simple-vector)) (defun resize-agenda (agenda n) (setf (agenda-a agenda) ;; could instead call adjust-array (without the :displaced-to option), but that's ;; unnecessarily heavyweight (replace (make-array n :initial-element nil) (agenda-a agenda))) agenda) (defmacro heap-size (a) `(the fixnum (svref ,a 0))) (defun heapify (a i) (declare (simple-vector a) (type (and fixnum unsigned-byte) i)) (macrolet ((left (i) `(* ,i 2)) (right (i) `(1+ (* ,i 2))) (key-> (x y) `(> (the single-float (car ,x)) (the single-float (car ,y))))) ;; improved version of CLRS max-heapify that in most cases performs fewer array ;; writes: instead of exchanging nodes pairwise along the traversal path, move each ;; node one level upwards and fill the final 'hole' with the original index i node (loop with heap-size = (heap-size a) with a-top = (svref a i) for a-largest = a-top for largest of-type fixnum = i for l of-type fixnum = (left i) for r of-type fixnum = (right i) do (when (and (<= l heap-size) (key-> (svref a l) a-top)) (setq largest l a-largest (svref a l))) (when (and (<= r heap-size) (key-> (svref a r) a-largest)) (setq largest r a-largest (svref a r))) (setf (svref a i) a-largest) (if (= largest i) (loop-finish) (setq i largest))))) (declaim (fixnum *maximum-number-of-tasks*)) (defun agenda-insert (agenda key value) (let* ((a (agenda-a agenda)) (heap-size (incf (heap-size a)))) (declare (type (and fixnum unsigned-byte) heap-size)) (when (>= heap-size (length a)) (if (>= (length a) *maximum-number-of-tasks*) (error "Hit limit of ~a pending tasks ~ (see documentation for *maximum-number-of-tasks*)" *maximum-number-of-tasks*) (resize-agenda agenda (min (* (length a) 2) *maximum-number-of-tasks*)))) (let ((a (agenda-a agenda))) ; in case agenda was just resized (unless (< heap-size (length a)) ; help compiler elide bounds checks (error "Inconsistency in AGENDA-INSERT")) (macrolet ((parent (i) `(floor ,i 2))) (loop with key = (coerce key 'single-float) with parent of-type fixnum with i of-type fixnum = heap-size while (and (> i 1) (progn (setq parent (parent i)) (< (the single-float (car (svref a parent))) key))) do (setf (svref a i) (svref a parent)) (setq i parent) finally (setf (svref a i) (cons key value))))))) (defun agenda-extract-max (agenda) (let* ((a (agenda-a agenda)) (heap-size (heap-size a))) (when (< heap-size 1) (error "Inconsistency - attempt to extract an element from an empty agenda")) (let* ((max (svref a 1)) (key (car max)) (value (cdr max))) (setf (svref a 1) (svref a heap-size)) (setf (heap-size a) (1- heap-size)) (heapify a 1) ;; allow value to be GCed. 
Instead of setting the vector element to nil we zap ;; the data item reference in the cons pair, which gets rid of the reference in ;; previous heap locations and also in previous heap vectors that this one was ;; resized from (particularly important if those vectors have got tenured) (setf (cdr max) nil) (values value key)))) (defun agenda-empty-p (agenda) (zerop (heap-size (agenda-a agenda)))) (defun flush-agenda (agenda) (when (> (length (agenda-a agenda)) *maximum-number-of-tasks*) ;; seems the user has reduced the agenda size limit, so follow their wishes (resize-agenda agenda (max 4 *maximum-number-of-tasks*))) (let ((a (agenda-a agenda))) (when (> (heap-size a) 0) ;; heap isn't empty (probably due to last run being interrupted): reset it ;; and zap references to old values, as in agenda-extract-max (setf (heap-size a) 0) (loop for i from 1 below (length a) for e = (svref a i) when (consp e) do (setf (cdr e) nil)))) agenda) (defmacro do-agenda ((var agenda &optional result) &body body) ;; like dolist syntactically, though elements are presented in indeterminate order (let ((a (gensym)) (n (gensym))) `(loop with ,a = (agenda-a ,agenda) for ,n from 1 to (heap-size ,a) for ,var = (cdr (svref ,a ,n)) do (locally ,@body) finally (return ,result)))) (defmacro with-agenda (priority &body body) ;; if .priority. evaluates to non-nil, add .body. to the global agenda, otherwise ;; execute it immediately (i.e. agenda-less processing). Can't cover both cases ;; without repeating .body. in the macro expansion: if we were to turn .body. into a ;; local function we can't declare it dynamic-extent in case it's going to be added ;; to the agenda; on the other hand if it's going to be executed immediately we ;; would want it to be declared dynamic-extent to avoid unnecessary heap allocation (let ((p (gensym))) `(let ((,p ,priority)) (if ,p (agenda-insert *agenda* ,p #'(lambda () ,@body)) (locally ,@body))))) (defvar *agenda* (new-agenda)) (defun clear-chart nil (purge-edge-registry) (incf *chart-generation-counter*) (setf *parse-record* nil) (let ((nvertices (1+ *chart-limit*))) (unless (and (arrayp *chart*) (= (array-dimension *chart* 0) nvertices) (= (array-dimension *chart* 1) nvertices) (arrayp *tchart*) (= (array-dimension *tchart* 0) nvertices) (= (array-dimension *tchart* 1) 2)) (setq *chart* (make-array (list nvertices nvertices))) (setq *tchart* (make-array (list nvertices 2))))) (dotimes (i (array-total-size *chart*)) (setf (row-major-aref *chart* i) nil)) (dotimes (i (array-total-size *tchart*)) (setf (row-major-aref *tchart* i) nil)) (setf *tchart-max* 0) (setf *chart-max* 0) (setf *edge-id* 0) (setf %edge-allowance% 0) (when *active-parsing-p* (clear-achart))) #+(or :allegro :lispworks :ccl :sbcl) (defvar *parser-lock* (mp:make-process-lock)) (defmacro with-parser-lock ((&optional foo) &body body) (declare (ignore foo)) #+(or :allegro :lispworks :ccl :sbcl) `(mp:with-process-lock (*parser-lock*) ,@body) #-(or :allegro :lispworks :ccl :sbcl) `(progn ,@body)) ;;; ;;; satisfy measurement fetish: list used to store (cpu) time used to find ;;; individual readings: bottom element is start time for parse(), topmost is ;;; end time; in best-first mode, additional elements record time for finding ;;; an analysis, one per reading. 
(24-feb-99 - oe) ;;; (defparameter *parse-times* nil) (defparameter *show-parse-p* t) (defvar *brackets-list* nil) (defvar *minimal-vertex* 0) (defvar *maximal-vertex* 0) (declaim (fixnum *minimal-vertex* *maximal-vertex*) #+:sbcl (sb-ext:always-bound *minimal-vertex* *maximal-vertex*)) ;;; **************************************************************** ;;; ;;; Entry point to this group of functions is parse which is passed the ;;; sentence as a list of strings and is called from the top level ;;; ;;; **************************************************************** ;;; Revised approach to tokenisation, morphology and MWES ;;; AAC April 2005 ;;; The chart is used for all of these replacing various ;;; subsidiary structures. ;;; ;;; The standard system has four phases: ;;; ;;; Phase 1 - token list is used to instantiate the chart ;;; for simple tokenisers. For more complex cases, the tokeniser ;;; outputs a chart (possibly in XML if a separate tokeniser). ;;; *tchart* is instantiated with token-edges ;;; ;;; Phase 2 - the morphophonology component instantiates the ;;; chart with possible stems plus affixes. In the revised ;;; version of the LKB this is done on a rule-by-rule basis. ;;; *tchart* is instantiated with morpho-stem edges ;;; The edges are associated with a partially specified derivation tree. ;;; In the case where an external morphophonology component operates ;;; (such as sppp) *tchart* is instantiated with morpho-stem-edges ;;; with stem set ;;; ;;; Phase 3 - lexical lookup including MWE lookup ;;; - instantiate *chart* with normal edges ;;; ;;; Phase 4 - parsing - application of morphosyntax and lexical rules ;;; along with grammar rules. The morphosyntax rules respect ;;; the partially specified derivation tree from phase 2. ;;; Instantiate the chart with edges ;;; ;;; The following are possible variations (in principle) ;;; ;;; Input source: ;;; 1a) user input from LKB ;;; 1b) real input (with markup) ;;; 1c) test suite input ;;; ;;; Preprocessor: ;;; 2a) tokens only: `standard' preprocessor (strings) ;;; : `char' preprocessor (marked up with character positions) ;;; 2b) tokens and external morphophonology ;;; i) partial-tree morphology (current sppp case) ;;; :with-tokeniser-partial-tree ;;; ii) `word' morphosyntax ;;; i.e., morphology specified as a chart with morphemes separated ;;; :with-tokeniser-retokenise ;;; Partial tree morphology (for 2a case): ;;; 3a) (new) internal LKB morphology (one rule at a time) ;;; the default ;;; 3b) external morphology (one rule at a time) ;;; :external-rule-by-rule ;;; 3c) external morphology (full partial tree, as old LKB) ;;; :external-partial-tree (defun check-morph-options (input) ;; make sure that *morph-option* and *foreign-morph-fn* are valid and compatible ;; with type of input (unless (member *morph-option* '(:default :distinct-mphon :external-rule-by-rule :external-partial-tree :with-tokeniser-partial-tree :with-tokeniser-retokenise)) (format t "~%Unrecognised *morph-option* ~S - using :default instead" *morph-option*) (setf *morph-option* :default)) (when (and (listp input) (consp (first input)) (loop for token in input thereis (assoc :analyses token))) (setf *morph-option* :with-tokeniser-partial-tree)) (if (or (eq *morph-option* :external-rule-by-rule) (eq *morph-option* :external-partial-tree)) (unless *foreign-morph-fn* (format t "~%*morph-option* ~S requires *foreign-morph-fn* to be non-nil - using :default instead" *morph-option*) (setf *morph-option* :default)) (when *foreign-morph-fn* (format t "~%*morph-option* ~S 
requires *foreign-morph-fn* to be nil - ignoring it" *morph-option*)
        (setf *foreign-morph-fn* nil))))

(defun first-only-p-normalise (x)
  (cond ((null x) nil)
        ((numberp x) (max (ceiling x) 0))
        (t 1)))

(defun check-parse-input-length (maf-p sppp-p input)
  (let ((length
         (cond (maf-p (smaf::get-smaf-lattice-size input))
               (sppp-p (loop for token in input
                             maximize (rest (assoc :end token))))
               (t (length input)))))
    (when (> length *chart-limit*)
      (error "Sentence `~a' is longer than the ~a word maximum (see documentation for *chart-limit*)"
             input *chart-limit*))))

;; supported input:
;; - basic: "the" "dog" "barks"
;; - bracketed: "Kim" "(" "(" "likes" ")" "Sandy" ")"
;; - chared: #S(CHARED-WORD :WORD "The" :CFROM 0 :CTO 2)
;;           #S(CHARED-WORD :WORD "cat" :CFROM 4 :CTO 6)
;;           #S(CHARED-WORD :WORD "barks" :CFROM 8 :CTO 12)
;; - s(m)af xml: an XML-encoded SMAF lattice, e.g. one produced by the LKB
;;           (20:46:50 1/18/2006 UTC) encoding the tokens THE / DOG / BARKS with the
;;           morphological analyses BARK ((PLUR_NOUN_ORULE "BARKS")) and
;;           BARK ((THIRD_SG_FIN_VERB_ORULE "BARKS"))
;; - SAF object: #[SAF]
;; - sppp: ((:END . 1)
;;          (:START . 0)
;;          (:ANALYSES
;;           ((:RULES)
;;            (:INFLECTION)
;;            (:STEM . "kim")))
;;          (:TO . 3)
;;          (:FROM . 0)
;;          (:FORM . "kim"))
;;         ((:END . 2)
;;          (:START . 1)
;;          (:ANALYSES
;;           ((:RULES
;;             ((:FORM . "sleeps")
;;              (:ID . PLUR_NOUN_INFL_RULE))
;;             ((:FORM . "sleeps.")
;;              (:ID . PUNCT_PERIOD_RULE)))
;;            (:INFLECTION)
;;            (:STEM . "sleep"))
;;           ((:RULES
;;             ((:FORM . "sleeps")
;;              (:ID . THIRD_SG_FIN_VERB_INFL_RULE))
;;             ((:FORM . "sleeps.")
;;              (:ID . PUNCT_PERIOD_RULE)))
;;            (:INFLECTION)
;;            (:STEM . "sleep")))
;;          (:TO . 10)
;;          (:FROM . 4)
;;          (:FORM . "sleeps."))

(defun parse (input &optional (show-parse-p *show-parse-p*)
                              (first-only-p *first-only-p*))
  (cond ((xml-p input)
         ;; if input is XML then convert it to a SMAF object
         (setq input (smaf:xml-to-saf-object input)))
        ((stringp input)
         ;; simplest form of input is ("the" "dog" "barks"), not "the dog barks"
         (error "Parse function may not be applied to a non-XML string")
         (return-from parse)))
  (when (smaf::saf-p input)
    (setq input (saf::instantiate-l-content input smaf::*config*)))
  ;; keep track of mutual dependencies between various configurations:
  ;; - input bracketing is only available in passive mode;
  ;; - passive best-first restricted to unary and binary rules.
;; ;; rebind *first-only-p* to allow best-first mode to decrement it for each ;; result found, avoiding the need for an additional result count variable (let* ((*morph-option* *morph-option*) (*foreign-morph-fn* *foreign-morph-fn*) (*active-parsing-p* (if *bracketing-p* nil *active-parsing-p*)) (*first-only-p* (first-only-p-normalise first-only-p)) (*first-only-p* (if (and *first-only-p* (not *active-parsing-p*) (greater-than-binary-p)) (progn (format t "~&Warning: passive best-first mode only available for unary and binary rules; ~ therefore disabling best-first~%") nil) *first-only-p*)) (maf-p (smaf::saf-p input)) (sppp-p (and (listp input) (consp (first input))))) (check-morph-options input) (multiple-value-bind (input *brackets-list*) (if (and *bracketing-p* (not maf-p) (not sppp-p)) (initialise-bracket-list input) (values input nil)) (check-parse-input-length maf-p sppp-p input) (let ((*parser-rules* (loop for rule in (get-matching-rules nil nil) unless (member (rule-id rule) *parse-ignore-rules* :test #'eq) do (setf (rule-spanning-only-p rule) (member (rule-id rule) *spanning-only-rules* :test #'eq)) and collect rule)) (*parser-lexical-rules* (get-matching-lex-rules nil)) (*safe-not-to-copy-p* t)) (with-parser-lock () (unwind-protect (progn (flush-agenda *agenda*) (clear-chart) (reset-statistics) (setf *parse-record* nil) (setf *parse-times* (list (get-internal-run-time))) (instantiate-chart-with-tokens input) (ecase *morph-option* (:default (instantiate-chart-with-morphop)) (:distinct-mphon (instantiate-chart-with-morphop)) (:external-rule-by-rule (instantiate-chart-with-morphop)) (:external-partial-tree ; *foreign-morph-fn* is set and will be called (instantiate-chart-with-morpho-stem-edges)) (:with-tokeniser-partial-tree nil) (:with-tokeniser-retokenise nil)) (when *generate-messages-for-all-unanalysed-tokens* (generate-messages-for-all-unanalysed-tokens *tchart*)) (when *abort-parse-after-morphosyntax* (return-from parse (get-parse-return-values))) (when *fallback-pos-p* (augment-tchart-with-fallback-morphop)) (when (and *abort-parse-if-no-spanning-morph-edge-path* (not (medge-spanning-path-p))) ;; [bmw] would it be safe to make this behaviour the default? ;; JAC - no, because apparent morph gaps in the token chart could be filled ;; by generic LEs when instantiating the parse chart (format t "~&Warning: halting parse since no morph edge path spans input~%") (return-from parse (get-parse-return-values))) (instantiate-chart-with-stems-and-multiwords) (when *abort-parse-after-lexical-lookup* (return-from parse (get-parse-return-values))) (let ((*minimal-vertex* 0) (*maximal-vertex* *chart-max*)) (catch :best-first (if *active-parsing-p* (active-chart-parse) (loop initially (add-words-to-chart (and *first-only-p* (cons *minimal-vertex* *maximal-vertex*))) until (agenda-empty-p *agenda*) do (funcall (agenda-extract-max *agenda*))))) ;; record time for parse forest construction, before extracting parses (push (get-internal-run-time) *parse-times*) ;; look for complete parses, but only if we didn't already (i.e. 
  ;; we're not in the mode that attempts to find a parse quickly without
  ;; necessarily constructing the complete parse forest)
                (unless (and *first-only-p* (null *unpacking-scoring-hook*))
                  (setq *parse-record*
                    (parses-from-spanning-edges
                      (find-spanning-edges *minimal-vertex* *maximal-vertex*)
                      *first-only-p*)))))
            (flush-agenda *agenda*)))
        (when show-parse-p (show-parse))
        (values (statistics-etasks *statistics*)
                (statistics-stasks *statistics*)
                -1
                (statistics-ftasks *statistics*)
                (statistics-mtasks *statistics*))))))

(defun get-parse-return-values nil
  (push (get-internal-run-time) *parse-times*)
  (values (statistics-etasks *statistics*)
          (statistics-stasks *statistics*)
          -1
          (statistics-ftasks *statistics*)
          (statistics-mtasks *statistics*)))

(defun file-xml-p (filename)
  #+:s-xml ;; [bmw] return false if :s-xml not compiled in
  (xml-p
   (with-open-file (istream filename :direction :input)
     (read-line istream nil "")))
  #-:s-xml nil)

(defun xml-p (input)
  ;; does the input string start with an XML declaration? NB the exact prefix
  ;; test here and the head of replace-dag-cfrom-cto below are conservative
  ;; reconstructions - the original text was garbled at this point
  (and (stringp input) (> (length input) 4)
       (string= "<?xml" input :end2 5)))

(defun replace-dag-cfrom-cto (dag cfrom cto)
  ;; destructively replace the values of any CFROM/CTO features in dag with the
  ;; given character positions
  (labels ((replace-in-arcs (arcs)
             (loop for arc in arcs
                   for dag2 = (dag-arc-value arc)
                   for type = (or (dag-new-type dag2) (dag-type dag2))
                   do
                   (cond
                    ((and cfrom (eq (dag-arc-attribute arc) 'CFROM)
                          (string-type-p type))
                     ;; (format t "~&set ~a -> ~a~%" (dag-arc-attribute arc) cfrom)
                     (setf (dag-new-type dag2) (princ-to-string cfrom)))
                    ((and cto (eq (dag-arc-attribute arc) 'CTO)
                          (string-type-p type))
                     ;; (format t "~&set ~a -> ~a~%" (dag-arc-attribute arc) cto)
                     (setf (dag-new-type dag2) (princ-to-string cto)))
                    (t (replace-dag-cfrom-cto dag2 cfrom cto))))))
    (replace-in-arcs (dag-arcs dag))
    (replace-in-arcs (dag-comp-arcs dag))
    dag))

(defun instantiate-chart-with-tokens (input)
  ;; The input has been pre-processed from one of a variety of input formats:
  ;; SMAF, SPPP, YY, tokens-with-character-offsets, plain list of tokens. Extract
  ;; from this the required info to initialise token chart, possibly also including
  ;; morph edges.
  ;; fix_me - need a cleaner method for determining what the input format is.
  ;; JAC Apr 2023: for the former 3 formats (SMAF, SPPP, YY) when they provide
  ;; morphosyntactic information, there is no consistent way of subsequently applying
  ;; token mapping rules: morph information is not visible to the rules so it wouldn't
  ;; survive intact. In any case, it makes no sense to re-tokenise after morphological
  ;; analysis. Therefore the LKB does not provide this processing path.
  (cond
   ((smaf::saf-p input) ; SMAF input?
    (saf-setup-morphs input))
   ((and (consp (first input))
         (loop for x in input thereis (assoc :analyses x)))
    ;; SPPP format with morphological analysis already done externally
    (sppp-setup-morphs input))
   ((and (consp (first input))
         (loop for x in input thereis (assoc :inflection x)))
    ;; YY format - assumed always to include morphological analysis
    (yy-setup-morphs input))
   ((consp (first input))
    ;; SPPP format without morphological analysis, so may apply token mapping rules
    (sppp-setup-morphs input)
    (when *token-type* (perform-token-mapping)))
   (t
    ;; tokens as plain strings (e.g. "the") or chared-word structures
    (loop with charedp = (chared-word-p (first input))
          for token in input
          and start from 0
          and end from 1
          do
          (let* ((base-word (if charedp (chared-word-word token) token))
                 (word (string-upcase base-word))
                 (cfrom (if charedp (chared-word-cfrom token) -1))
                 (cto (if charedp (chared-word-cto token) -1)))
            (add-token-edge base-word word start end cfrom cto)))
    (when *token-type* (perform-token-mapping)))))

(defun add-token-edge (base-word word from to cfrom cto)
  ;; e.g.
(add-token-edge "The" "THE" 0 1 nil nil) (when (> to *tchart-max*) (setf *tchart-max* to)) (let (; (existing-edges (aref *tchart* to 0)) ; JAC 29-Nov-2023 - see below (edge (make-token-edge :id (next-edge) :string base-word :word word :leaves (list base-word) :from from :to to :cfrom cfrom :cto cto))) (unless nil ; JAC 29-Nov-2023 - removed sloppy (token-edge-match edge existing-edges) (push edge (aref *tchart* to 0)) (push edge (aref *tchart* from 1)) edge))) ;; JAC 29-Nov-2023 - no longer used ;; (defun token-edge-match (edge existing-edges) ;; (find edge existing-edges :test #'token-edge=)) (defun token-edge= (edge1 edge2) (and (= (edge-from edge1) (edge-from edge2)) (= (edge-to edge1) (edge-to edge2)) (equal (token-edge-word edge1) (token-edge-word edge2)))) (defun find-tedge-given-id (id) (loop for s from 0 to (1- *tchart-max*) do (loop for edge in (aref *tchart* s 1) when (eql (edge-id edge) id) do (return-from find-tedge-given-id edge)))) ;;; ***************************************************** ;;; ;;; Morphophonology ;;; ;;; ***************************************************** ;;; We have a chart instantiated with token-edges ;;; This phase calls the LKB built-in spelling component (or some other ;;; morphological engine), which ultimately provides a stem and a tree of ;;; rules to apply to it. ;;; The revised LKB morph code does this one rule at a time but the case ;;; where a complete analysis is done in one step is also supported. ;;; ;;; partial-tree handling ;;; ;;; ;;; FIX ;;; currently we assume that a partial tree is in fact a ;;; list of partial-tree nodes, but this will have to change ;;; when we allow for compounding. To try and make this simpler ;;; the code mostly abstracts from the partial tree encoding details ;;; search for WARNING for exceptions (defmacro make-pt-node (rule str) `(list ,rule ,str)) (defmacro pt-node-rule (ptnode) `(car ,ptnode)) (defmacro pt-node-string (ptnode) `(cadr ,ptnode)) (defmacro add-pt-node-to-tree (ptnode partial-tree) ;;; sometimes called with partial-tree being nil `(cons ,ptnode ,partial-tree)) (defmacro max-one-deep-p (partial-tree) ;;; may be called with partial-tree being nil `(not (cdr ,partial-tree))) (defmacro partial-trees-equal (partial-tree1 partial-tree2) `(tree-equal ,partial-tree1 ,partial-tree2)) ;;; end partial tree handling ;;; start of code specific to default *morph-option* (i.e., :default ;;; or :distinct-mphon) ;;; or :external-rule-by-rule (defparameter *morph-agenda* nil) (defparameter *morphophon-cache* nil) ;; [bmw] ensure only unanalysed token edges are sent to add-morpho-partial-edges ;; (necessary in case SMAF input contains some morph edges) (defun instantiate-chart-with-morphop nil (loop for token-edge in (get-unanalysed-tedges *tchart*) for word = (token-edge-word token-edge) do (add-morpho-partial-edges word token-edge)) *tchart*) #| Redoing this from the initial version: Partial-tree is a set of partial trees (although for now, unpack when we add a stem edge because I don't want to rewrite the rest of the code immediately) complication 1 - analyses that have no morphological effect, contributed by irregular forms (or otherwise) call on "BET" with rules nil - find passive, psp, past recurse on BET with rules (passive psp past) - if nothing can feed any of these rules, we will stop there complication 2 - rules that extend the length of the string - e.g. Spanish stems are longer than the inflected forms. As with null effects, this doesn't matter unless we get into a recursive situation. 
complication 3 - as we're analysing, we want to block impossible analyses based on the rule feeding, but when we return to a string having analysed with different rules, we may need to redo. This led to a potential bug in the old code. If affixes always shortened the string, we could order them so that we never revisited a string we'd analysed. i.e. we could search breadth first, longest string first. This isn't the case, but it's probably close enough that the potential reanalysis won't hurt. Use an agenda so that we can investigate possibilities breadth first. Abstract example: initial agenda contains ((str (nil))) this is popped (morph-analyse-reg-irreg str) => ((str1 . r1a) (str1 . r1b) (str2 . r2a) (str2 . r2b) (str . r3a) (str . r3b)) In the agenda, these are packed and sort by length of string - assume str1 > str and str2 < str for the special case where the morphological analysis allows a zero affixation, we bypassing the agenda mechanism and add them as partial trees to give the set of partial trees (nil (r2a.str) (r2b.str)) (note r1a.str is an abbreviation for (r1a str) here - i.e. the nodes in partial trees are actually lists of rule and form) note also that we need to have the element nil in the partial-tree-set suppose that r1a feeds r2a.str and r1b.str feeds r2a.str but none of the others feed agenda ((str1 ((r1a.str) (r1b.str) (r1a.str r2a.str) (r1b.str r2a.str)) (str2 ((r2a.str) (r2b.str)))) str1 is top, so we investigate it. agenda is ((str2 ((r2a.str) (r2b.str)))) since we pop Suppose: (morph-analyse-reg-irreg str1) => ((str2 . r2c)) We add to the agenda for str2 ((str2 ((r2a.str) (r2b.str) (r2c.str1 r1a.str) (r2c.str1 r1b.str)))) Because the morphophonology could have affixed forms that were shorter than the stem, we can't guarantee not to redo work. For instance, in the example above, suppose (morph-analyse-reg-irreg str2) => ((str1 . r2d)) we've taken str1 off the agenda, so we'll end up reanalysing it. However, the morphophon results are cached, so the effects are relatively limited. |# ;;; Morphology agenda (defmacro make-morph-agenda-item (str partial-tree-set) `(cons ,str ,partial-tree-set)) (defmacro morph-agenda-string (agenda-item) `(car ,agenda-item)) (defmacro morph-agenda-partial-tree-set (agenda-item) `(cdr ,agenda-item)) (defmacro morph-agenda-latest-rules (agenda-item) ;;; WARNING: assumes partial tree is a list! `(loop for pt in (morph-agenda-partial-tree-set ,agenda-item) collect (pt-node-rule (car pt)))) (defun morph-agenda-match (str) (dolist (item *morph-agenda*) (when (string-equal str (morph-agenda-string item)) (return item)))) (defun insert-morph-agenda-item (str partial-tree-set) ;;; if the string matches an existing item, then add the ;;; partial trees into that record. 
Otherwise, insert ;;; in string length order, longest first (incf (statistics-mtasks *statistics*)) (if *morph-agenda* (let* ((strlength (length str)) (checked nil) (new-agenda (do ((remainder *morph-agenda* (cdr remainder))) ((not remainder) nil) (let ((item (car remainder))) (cond ((> strlength (length (morph-agenda-string item))) (return (append (nreverse checked) (list (make-morph-agenda-item str partial-tree-set)) remainder))) ((string-equal str (morph-agenda-string item)) (setf (morph-agenda-partial-tree-set item) (append partial-tree-set (morph-agenda-partial-tree-set item))) ;;; don't think we need to test for duplicates here (return *morph-agenda*)) (t (push item checked))))))) (setf *morph-agenda* (or new-agenda (nreverse (cons (make-morph-agenda-item str partial-tree-set) checked))))) (setf *morph-agenda* (list (make-morph-agenda-item str partial-tree-set))))) ;;; end agenda functions ;;; Main code (defun add-morpho-partial-edges (unanalysed token-edge) (setf *morph-agenda* nil) (setf *morphophon-cache* nil) (insert-morph-agenda-item unanalysed (list nil)) (loop (unless *morph-agenda* (return nil)) (let* ((agenda-item (pop *morph-agenda*))) (analyse-agenda-item agenda-item token-edge)))) (defvar *partial-trees* nil) (defun get-morph-analyses (unanalysed) ;; [bmw] function is based on add-morpho-partial-edges, added ;; in order to collect morph analyses as required in SMAF code (let ((*partial-trees* nil)) (setf *morph-agenda* nil) (setf *morphophon-cache* nil) (insert-morph-agenda-item unanalysed (list nil)) (loop (unless *morph-agenda* (return nil)) (let* ((agenda-item (pop *morph-agenda*))) (analyse-agenda-item agenda-item nil :bottom-out-fn #'get-valid-morph-analyses))) *partial-trees*)) (defun analyse-agenda-item (agenda-item token-edge &key (bottom-out-fn #'add-as-unanalysed)) ;; [bmw] customised to allow alternative "bottom-out-fn" at bottom of recursion ;; (to allow use by get-morph-analyses fn) (let* ((unanalysed (morph-agenda-string agenda-item)) (partial-tree-set (morph-agenda-partial-tree-set agenda-item)) (last-rule-ids (morph-agenda-latest-rules agenda-item)) (partial-trees-plus-nulls partial-tree-set)) (when (dolist (last-rule-id last-rule-ids nil) (when (or (null last-rule-id) (spelling-rule-feeds-p last-rule-id)) ;; only continue to morph-analyse if there's some rule that changes ;; morphology that can feed the one we've seen (return t))) (let ((one-steps (morph-analyse-reg-irreg unanalysed))) ;; (morph-analyse-reg-irreg "CANNED") returns ;; (("CAN" . PAST-V_IRULE) ("CAN" . PSP-V_IRULE) ("CANNED" . 
MY_RULE)) ;; the new analyses get merged here with each other and the agenda (setf partial-trees-plus-nulls (null-morphophon-combinations unanalysed (loop for morph-pair in one-steps when (string-equal (car morph-pair) unanalysed) collect (cdr morph-pair)) partial-tree-set)) (dolist (morph-pair one-steps) (unless (string-equal (car morph-pair) unanalysed) (let* ((newstr (car morph-pair)) (rule (cdr morph-pair)) (rule-entry (or (get-lex-rule-entry rule) (error "~%Rule ~A returned from morph analyser is undefined (error in grammar loading?)" rule))) (ptnode (make-pt-node rule unanalysed)) (new-partial-tree-set (loop for partial-tree in partial-trees-plus-nulls when (or (null partial-tree) (check-nosp-feeding (get-lex-rule-entry (pt-node-rule (car partial-tree))) ;;; WARNING rule-entry)) collect (progn (when (>= (length partial-tree) *maximal-lex-rule-applications*) (error "Hit limit on morphological rule applications in ~a, ~ possibly due to a runaway rule (see documentation for ~ *maximal-lex-rule-applications*)" partial-tree)) (add-pt-node-to-tree ptnode partial-tree))))) (when new-partial-tree-set (insert-morph-agenda-item newstr new-partial-tree-set))))))) (funcall bottom-out-fn unanalysed token-edge ;; we were getting duplicates in this list, untidy fix - JAC 30-Jun-2022 (remove-duplicates partial-trees-plus-nulls :test #'equal)))) (defun get-valid-morph-analyses (word token-edge partial-tree-set) (declare (ignore token-edge)) (when (lookup-word *lexicon* word) (loop for partial-tree in partial-tree-set do (push (cons word partial-tree) *partial-trees*)))) (defun add-as-unanalysed (unanalysed token-edge partial-tree-set) ;;; try and terminate the recursion, recording the analyses via ;;; the partial tree mechanism. Unpack the partial trees and add a ;;; new edge for each for now. 
(when (lookup-word *lexicon* unanalysed) (dolist (partial-tree partial-tree-set) (add-morpho-stem-edge-from-token-edge unanalysed partial-tree token-edge)))) (defun morph-analyse-reg-irreg (unanalysed) (or (cdr (assoc unanalysed *morphophon-cache* :test #'string-equal)) (let* ((res (if *foreign-morph-fn* (apply *foreign-morph-fn* (list unanalysed)) (append (find-irregular-morphs unanalysed) (one-step-morph-analyse unanalysed)))) (filtered-res (loop for poss in res when (morph-not-blocked-p unanalysed (car poss) (cdr poss)) collect poss))) (push (cons unanalysed filtered-res) *morphophon-cache*) filtered-res))) (defun morph-not-blocked-p (surface underlying rule) ;;; given a pattern (* ed) (!ty !tied) (e ed) (!t!v!c !t!v!c!ced) ;;; we may only want to match the most specific case ;;; for instance, given `hated' we don't want to match this to the ;;; stem `hat' because it should have the (!t!v!c !t!v!c!ced) pattern ;;; Similarly, if we have an irregular form `slept' we don't ;;; want to match `sleeped' ;;; *irregular-forms-only-p* (if only for backward compatibility) (or *foreign-morph-fn* ;;; we assume no blocking with external morphology (if (or *most-specific-only-p* *irregular-forms-only-p*) (let* ((irreg-surface-list (morph-matches-irreg-list underlying rule))) (if irreg-surface-list ;;; only OK if the specified form is one of the irregular ;;; forms (member surface irreg-surface-list :test #'equal) ;; no irreg form (if *most-specific-only-p* ;;; then we need to see if we have the most specific pattern ;;; which we can just get by calling morph-generate ;;; A more sophisticated version might allow multiple results ;;; here (let ((most-specific (morph-generate underlying rule))) (equal surface most-specific)) ;;; otherwise only *irregular-forms-only-p* ;;; is set and we're only blocking by irregulars, so ;;; we're OK here t))) ;;; we're not doing any blocking so it's OK t))) (defun null-morphophon-combinations (unanalysed null-rules partial-tree-set) ;;; we have a set of rules that can apply to a string without ;;; morphophonological effect. While such rules would normally be ;;; specified as lexical, special cases of morphophonologically active ;;; rules (irregulars in particular) may arise here. We filter these ;;; out from the usual agenda mechanism, to prevent looping. ;;; ;;; returns a set of partial-trees (append partial-tree-set (apply-morph-null-rules partial-tree-set null-rules unanalysed))) (defun apply-morph-null-rules (partial-tree-set null-rules unanalysed) (let ((new-pt-set (loop for null-rule in null-rules for rule-entry = (get-lex-rule-entry null-rule) for ptnode = (list null-rule unanalysed) nconc (loop for partial-tree in partial-tree-set when (or (null partial-tree) (check-nosp-feeding (get-lex-rule-entry (pt-node-rule (car partial-tree))) ;;; WARNING rule-entry)) collect (progn (when (>= (length partial-tree) *maximal-lex-rule-applications*) (error "Hit limit on morphological rule applications in ~a, ~ possibly due to a runaway rule (see documentation for ~ *maximal-lex-rule-applications*)" partial-tree)) (add-pt-node-to-tree ptnode partial-tree)))))) (if new-pt-set (append new-pt-set (apply-morph-null-rules new-pt-set null-rules unanalysed)) nil))) ;;; end of code specific to the :default, :distinct-mphon ;;; and :external-rule-by-rule case (defun add-morpho-stem-edge-from-token-edge (stem partial-tree token-edge) ;; This is called from the version with the rule-by-rule morphology ;; (for the irregs and as the base case with regular morphology). 
;; It is also called when we have a version of morphology ;; giving full partial trees and we're using the tokeniser ;; (:external-partial-tree is the value of *morph-option*). ;; But most of the work is done in add-morpho-stem-edge ;; which is called from the sppp stuff (i.e. external ;; tokeniser and partial tree morphology). (with-slots (from to word string cfrom cto leaves) token-edge (add-morpho-stem-edge stem partial-tree from to word string cfrom cto leaves token-edge))) (defun add-morpho-stem-edge (stem partial-tree from to word string cfrom cto tleaves tedge) (if (and (or (member *morph-option* '(:default :distinct-mphon :external-rule-by-rule)) ;; if we're proceeding rule by rule we've already done this checking (and (lookup-word *lexicon* stem) (or (max-one-deep-p partial-tree) (check-rule-filter-morph partial-tree string)))) ;; removed check below for pre-existing edge that's similar; the check is wrong ;; since it would filter out edges produced during token mapping that are similar ;; but have different FSes; the check may originally have been added to work ;; around problem of duplicate analyses in analyse-agenda-item - JAC 30-Jun-2022 ;; (not (morpho-stem-edge-match2 ;; from to stem partial-tree nil (aref *tchart* from 1))) ) (let ((new-edge (make-morpho-stem-edge :id (next-edge) :dag (if (token-edge-p tedge) (token-edge-dag tedge)) :word word :string string :stem stem :current stem :partial-tree partial-tree :leaves tleaves :children (if (token-edge-p tedge) (list tedge)) :from from :to to :cfrom cfrom :cto cto))) (push new-edge (aref *tchart* from 1)) (push new-edge (aref *tchart* to 0)) new-edge) nil)) ;;; see comment above about morpho-stem-edge-match2 ;; ;; (defun morpho-stem-edge= (edge1 edge2) ;; (and (morpho-stem-edge-p edge1) ; both medges ;; (morpho-stem-edge-p edge2) ;; (= (edge-from edge1) (edge-from edge2)) ;; (= (edge-to edge1) (edge-to edge2)) ;; (equal (morpho-stem-edge-stem edge1) ;; (morpho-stem-edge-stem edge2)) ;; (partial-trees-equal (edge-partial-tree edge1) ;; (edge-partial-tree edge2)) ;; (equalp (morpho-stem-edge-l-content edge1) ;; (morpho-stem-edge-l-content edge2)))) ;; ;; (defun morpho-stem-edge-match2 (from to stem partial-tree l-content existing-edges) ;; ;; is there an existing edge with fields matching the first 5 arguments? 
;; (member-if #'(lambda (x) ;; (morpho-stem-edge=2 from to stem partial-tree l-content x)) ;; existing-edges)) ;; ;; (defun morpho-stem-edge=2 (from to stem partial-tree l-content edge2) ;; (and (morpho-stem-edge-p edge2) ; medge ;; (= from (edge-from edge2)) ;; (= to (edge-to edge2)) ;; (equal stem (morpho-stem-edge-stem edge2)) ;; stem ;; (partial-trees-equal partial-tree (edge-partial-tree edge2)) ;; (equalp l-content (morpho-stem-edge-l-content edge2)))) ;;; ************************************************************* ;;; ;;; This alternative version is for the one step morphophonology case ;;; where the foreign-morph-fn returns a full partial-tree ;;; and a stem - this is called when the value of *morph-option* ;;; is :external-partial-tree ;;; *foreign-morph-fn* is assumed to return something like ;;; ("A" (RULE1 "AB") (RULE2 "ABC")) ;;; as the old morph-analyse did (defun instantiate-chart-with-morpho-stem-edges nil (dotimes (current *tchart-max*) (dolist (token-edge (aref *tchart* current 1)) (let* ((word (token-edge-word token-edge)) (morph-specs (union (filter-for-irregs ; in rules.lsp (remove-duplicates (apply *foreign-morph-fn* (list word)) :test #'equalp)) (find-irregular-morphs-old word) :test #'equalp))) (dolist (morph-spec morph-specs) (add-morpho-stem-edge-from-token-edge (car morph-spec) (cdr morph-spec) token-edge)))))) ;;; ************************************************************* ;;; Code below is only for the case where we have not done the ;;; morphophonology rule by rule (defun check-rule-filter-morph (partial-tree string) ;; This is called with a partial-tree that will be ;; tried later on in the parser proper. ;; Make sure we can get actual rules, and, if we can, ;; filter impossible combinations now according to the ;; rule filter. (let ((rule-entries (loop for rule-info in partial-tree collect (let* ((rule-id (pt-node-rule rule-info)) (rule-entry (get-lex-rule-entry rule-id))) (or rule-entry (progn (format t "~%Warning: rule ~A specified by ~ morphology for ~A was not found" rule-id string) (return-from check-rule-filter-morph nil))))))) (check-rule-filter-morph-aux rule-entries))) (defun check-rule-filter-morph-aux (rule-list) (loop for (first . rest) on rule-list while rest always (check-nosp-feeding (first rest) first))) ;;; ***************************************************** ;;; ;;; Lexical lookup and MWEs - Phase 3 ;;; ;;; ***************************************************** ;;; multi-word entries may have affixation on any member ;;; but they span several vertices. It's therefore ;;; necessary to treat them as individual words with respect to ;;; the spelling component. ;;; Putative multiword entries are checked ;;; when we get to the rightmost element. 
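;;;
;;; For example (an illustrative sketch - the entry id `ad_hoc_1' and its
;;; content are hypothetical): an entry counts as a multi-word whenever its
;;; orthography list has more than one element, and such entries are only
;;; checked from the edge whose stem matches their rightmost element.
#|
(let ((entry (get-lex-entry-from-id 'ad_hoc_1)))
  (when (and entry (cdr (lex-entry-orth entry))) ; more than one word form?
    (list (lex-entry-orth entry)                 ; e.g. ("ad" "hoc")
          (lex-entry-infl-pos entry))))          ; element allowed to inflect, or NIL
|#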
(defun instantiate-chart-with-stems-and-multiwords nil (when *cm-debug* (format t "~&Instantiating lexical entries in parse chart~%")) (loop for s from 0 to (1- *tchart-max*) do (loop for edge in (aref *tchart* s 1) do ;; set *chart-max* from token edges in case *tchart-max* is incorrect due to ;; token mapping deleting edges at right boundary ;; NB analogous case of left boundary not dealt with (setq *chart-max* (max *chart-max* (edge-to edge))) (etypecase edge (token-edge (when *token-type* ;; edge contains an instantiated *token-type* feature structure (add-generic-lexical-entries edge))) (morpho-stem-edge (add-single-and-multiple-senses edge))))) ;; fix_me - check-stem-coverage was called when the full chart was available, but ;; the function seems to hang on some inputs; moreover, medge-spanning-path-p ;; (which may be called by parse just before this point) duplicates some of its ;; processing - JAC 25-Jun-2022 ;; (check-stem-coverage *tchart-max*) ) (defmethod lookup-word ((lexicon (eql 'identity)) orth &key cache) (declare (ignore lexicon cache)) orth) (defun add-generic-lexical-entries (tedge) (let* ((word (token-edge-word tedge)) (string (token-edge-string tedge)) (from (token-edge-from tedge)) (to (token-edge-to tedge)) (cfrom (token-edge-cfrom tedge)) (cto (token-edge-cto tedge)) (pt (token-edge-partial-tree tedge)) ; only applicable for YY mode (stem-and-pt-list (if pt ; e.g. from YY format preprocessor analysis (list (cons word pt)) (let ((*lexicon* 'identity)) (get-morph-analyses word)))) (glex (lex "gle"))) ;; Most grammars that perform token mapping also define a generic lexical entry file. ;; Therefore check that the file has been loaded correctly (i.e. as a _sub-lexicon_) ;; since this is easy to get wrong. The grammarian should supply an empty file if ;; there really are no generic lexical entries. (unless glex (error "Could not find sub-lexicon \"gle\" when looking for generic lexical entries")) (loop for gid in (collect-psort-ids glex) for entry = (get-lex-entry-from-id gid) do (loop for (stem . partial-tree) in stem-and-pt-list do (let ((*lexical-entries-used* nil)) (add-stem-edge stem string from to cfrom cto partial-tree entry (list tedge))))))) (defun add-single-and-multiple-senses (morpho-stem-edge) (let ((edge-stem (morpho-stem-edge-stem morpho-stem-edge)) (edge-string (edge-string morpho-stem-edge)) (from (edge-from morpho-stem-edge)) (to (edge-to morpho-stem-edge)) (cfrom (edge-cfrom morpho-stem-edge)) (cto (edge-cto morpho-stem-edge)) (partial-tree (edge-partial-tree morpho-stem-edge))) (dolist (entry (get-edge-entries morpho-stem-edge)) (let ((*lexical-entries-used* nil)) (if (cdr (lex-entry-orth entry)) ; a multi-word? 
(check-multi-word (mapcar #'string-upcase (lex-entry-orth entry)) edge-stem edge-string entry from to cto partial-tree (list morpho-stem-edge)) (add-stem-edge edge-stem edge-string from to cfrom cto partial-tree entry (list morpho-stem-edge))))))) (defun get-edge-entries (morpho-stem-edge) ;; [bmw] l-content takes precedence over morph analysis of stem ;; FIXME: make precedence configurable (with-slots (l-content stem) morpho-stem-edge (let ((l-content-type (car l-content)) (l-content-value (cdr l-content)) entries) (case l-content-type (:full ;; we use l-content in place of lexical lookup (setf entries (list l-content-value))) (:inject ;; we inject unifs into result of lexical lookup (setf entries (get-safe-unexpanded-lex-entry stem)) ;; if we have something to inject, collect copies of entries, ;; each with adjusted lex-entry-unifs (when l-content-value (setf entries (loop for e-orig in (get-safe-unexpanded-lex-entry stem) collect (get-injected-lex-entry e-orig l-content-value))))) (t ;; default (and non-SMAF) case (setf entries (get-safe-unexpanded-lex-entry stem)))) entries))) (defun get-safe-unexpanded-lex-entry (stem) ;; ensure all ersatzes are copies (loop with entries = (get-unexpanded-lex-entry stem) for entry in entries for i from 0 when (lex-entry-is-ersatz entry) do (setf (nth i entries) (copy-lex-entry entry)) (get-expanded-lex-entry entry) ;; so we don't break tsdb's *lex-ids-used* hack finally (return entries))) (defun get-injected-lex-entry (e-orig l-content-injects) ;; takes lex-entry + injection specs (inject-lex-entry e-orig l-content-injects :e-orig e-orig)) (defun inject-lex-entry (e l-content-injects &key e-orig) (setf (lex-entry-full-fs e) nil) ; ensure NO full-fs yet (setf (lex-entry-unifs e) (inject-unifs l-content-injects (lex-entry-unifs e) :entry e-orig)) e) (defparameter smaf::*ersatz-carg-path* nil) ; if false, ersatzes get no special treatment (defun lex-entry-is-ersatz (entry) ;; test whether lex-entry should be treated as an ersatz ;; (ersatz's undergo CARG-injection) ;; an ersatz is an lex entry containing a token that looks like "...ersatz" (unless smaf::*ersatz-carg-path* (return-from lex-entry-is-ersatz nil)) (loop for word in (lex-entry-orth entry) thereis (and (stringp word) (eql (mismatch "ersatz" word :from-end t :key #'char-equal) 0)))) (defun inject-unifs (inject-unifs unifs &key entry) ;; [bmw] return result of non-destructively injecting 'inject-unifs' into 'unifs' (loop for unif in unifs for lhs = (unification-lhs unif) for inject-unif = (find lhs inject-unifs :key #'unification-lhs :test #'equalp) if inject-unif collect inject-unif into new-unifs and do (if entry (format t "~&; WARNING: redefining ~a in copy of lex entry ~a" (unifs-to-tdl-body (list inject-unif)) (to-tdl entry))) else collect unif into new-unifs finally (return (append inject-unifs new-unifs)))) (defun add-stem-edge (edge-stem edge-string from to cfrom cto partial-tree entry dtrs) #+:arboretum (declare (special *mal-active-p*)) (let ((expanded-entry (get-expanded-lex-entry entry))) (when (and expanded-entry #+:arboretum (or *mal-active-p* (not (mal-lex-entry-p expanded-entry)))) (let* ((fs (copy-lex-fs-as-needed (lex-entry-full-fs entry))) (new-fs (cond ((edge-dag (car dtrs)) (unify-in-tokens-list fs (mapcar #'edge-dag dtrs))) (*characterize-p* (set-characterization-tdfs fs cfrom cto)) (t fs)))) (when new-fs (let ((new-edge (make-edge :id (next-edge) :category (indef-type-of-tdfs new-fs) :rule edge-stem :dag new-fs :dag-restricted (restrict-fs (tdfs-indef new-fs)) :leaves 
(list edge-string) :lex-ids (list (lex-entry-id entry)) :from from :to to :tchildren dtrs :partial-tree partial-tree :string edge-string :cfrom cfrom :cto cto))) (when *cm-debug* (format t "~&Adding edge ~D for lexical entry ~(~A~) `~A'~@[ requiring ~{~(~A~)~^, ~}~]~%" (edge-id new-edge) (lex-entry-id entry) edge-string (mapcar #'car partial-tree))) (push new-edge (aref *chart* from to)))))))) (defun copy-lex-fs-as-needed (tdfs) ;; prevent spurious reentrancies between multiple uses of the same lexical entry in ;; a single multi-word (could happen in reduplication for instance) (cond ((member tdfs *lexical-entries-used* :test #'eq) (copy-tdfs-completely tdfs)) (t (push tdfs *lexical-entries-used*) tdfs))) (defun unify-in-tokens-list (fs tdags) ;; unify token dags into lexical entry fs (let ((*recording-fail-paths-p* nil)) ; quickcheck not used inside here (multiple-value-bind (tlist tlast) (build-list-dag (mapcar #'tdfs-indef tdags)) (with-unification-context (fs) (let ((*unify-debug* (and (numberp *cm-debug*) (>= *cm-debug* 2)))) (when *unify-debug* (format t "~&Attempting to unify token FS into new lexical edge")) (and (setq fs (yadu! fs (make-nondefault-tdfs tlist) *lexicon-tokens-path*)) (setq fs (yadu! fs (make-nondefault-tdfs tlast) *lexicon-last-token-path*)) (copy-tdfs-elements fs))))))) (defun check-multi-word (entry-orth-list edge-stem edge-string unexpanded-entry from to cto partial-tree dtrs) ;; we have a possible match on a multiword ;; i.e., something which corresponds to multiple tokens ;; according to the tokeniser. For every valid multiword ;; we find, we will add an edge to the chart via add-stem-edge ;; ;; a multiword entry gives a list of stems ;; (let* ((inflection-position (lex-entry-infl-pos unexpanded-entry)) (number-of-words (length entry-orth-list)) (to-be-accounted-for (reverse entry-orth-list)) (current-entry-stem (car to-be-accounted-for))) (unless (string-equal current-entry-stem edge-stem) (return-from check-multi-word nil)) ;; only check multi-words when we have the rightmost (when (< to number-of-words) (return-from check-multi-word nil)) ; too near start of sentence ;; because of tokeniser ambiguity this is not a perfect check ;; but the more complex cases will be caught below #| (when (and partial-tree (not (eql inflection-position number-of-words))) (return-from check-multi-word nil)) ; inflection not allowed here |# (check-multi-and-add-edges entry-orth-list (cdr to-be-accounted-for) from nil to cto inflection-position unexpanded-entry (list partial-tree) edge-string dtrs t :ersatz-p (lex-entry-is-ersatz unexpanded-entry)))) (defun check-multi-and-add-edges (entry-orth-list remaining-words from cfrom to cto inflection-position unexpanded-entry partial-tree-set amalgamated-string dtrs first-p &key cargs ersatz-p) ;; check we have some match on each element ;; and find partial tree(s) on inflected position ;; this is called initially when we've got a match on the rightmost ;; element. We add edges for each match. Note that because ;; tokenisation is now non-deterministic, and because ;; there may be ambiguity in the morphology, we may end up with several ;; edges being added. ;; e.g. 
("a" "b" "c") as entry-orth-list with inflection ;; position 2 would match a set of contiguous edges where ;; edge 1 had stem "a" and no partial tree ;; edge 2 had stem "b" and an optional partial tree ;; edge 3 we already know has stem "c" when this is called ;; ;; Note that, despite the name `inflection position' an MWE like ;; "chest pain" would match "chest painless" (if the inflection position ;; was set to 2 and there was a productive affix "less", despite the ;; fact that "less" would be derivational. This would have to be controlled ;; by the grammar (presumably by whatever mechanism was used to block ;; derivation occurring in the wrong place anyway). ;; Revised version allows a nil setting for infl-pos (see ERG ;; user-fns) In this case, multiple elements of the multiword are ;; allowed to have partial-trees - these are amalgamated according ;; to the ordering allowed by the rule-filter. ;; This is needed for punctuation in the current ERG - e.g. for (Palo Alto) ;; Only one possibility is returned - this should really be FIXed so that ;; multiple possibilities are allowed and return multiple edges. ;; Affixation is only allowed for the leftmost or rightmost ;; element of the multiword, but this code currently allows the leftmost ;; element to have a suffix and/or the rightmost a prefix ;; thus allowing `Palo. Alto' - FIX ;; aac oct 20 2006 (if remaining-words (let ((entry-stem (car remaining-words))) (dolist (edge (aref *tchart* from 0)) (let ((carg-extra (and ersatz-p ; get carg, if ersatz (get-carg-from-edge edge)))) (when (and (morpho-stem-edge-p edge) (equal entry-stem (morpho-stem-edge-stem edge)) (if inflection-position ;; behaviour should be unchanged from ;; pre oct 20 2006 version (if (= (length remaining-words) inflection-position) (progn (setf partial-tree-set (list (morpho-stem-edge-partial-tree edge))) t) (not (morpho-stem-edge-partial-tree edge))) ;; no infl-pos, post oct 20 2006 (or (not (morpho-stem-edge-partial-tree edge)) (if (or first-p (not (cdr remaining-words))) ;; at beginning/end of multiword (progn (setf partial-tree-set (cons (morpho-stem-edge-partial-tree edge) partial-tree-set)) t) nil)))) (when carg-extra (push carg-extra cargs)) ; collect cargs, if ersatz (check-multi-and-add-edges entry-orth-list (cdr remaining-words) (edge-from edge) (edge-cfrom edge) to cto inflection-position unexpanded-entry partial-tree-set (concatenate 'string (edge-string edge) " " amalgamated-string) (cons edge dtrs) nil :cargs cargs :ersatz-p ersatz-p))))) (let* ((amalgamated-partial-tree (combine-partial-trees partial-tree-set)) carg-final) ;; inject cargs into CARG, if ersatz (when ersatz-p (setf carg-final (get-carg-from-entry unexpanded-entry)) (if carg-final (setf cargs (append cargs (list carg-final)))) (inject-cargs unexpanded-entry cargs)) (add-stem-edge (format nil "~{~A ~}" entry-orth-list) amalgamated-string ;; FIX - when cfrom cto is universal we can replace this ;; by a lookup in the original characters. Currently ;; this is a bit of a hack since ;; we just guess that the strings of the individual words ;; were split by spaces. 
from to cfrom cto amalgamated-partial-tree unexpanded-entry dtrs)))) (defun get-carg-from-edge (edge) ;; extract carg from an edge, via l-content unifs (and (typep edge 'morpho-stem-edge) (let* ((l-content (morpho-stem-edge-l-content edge)) (unifs (cdr l-content))) (get-carg-from-unifs unifs)))) (defun get-carg-from-entry (entry) ;; extract carg from an (unexpanded) entry, via unifs (let ((unifs (lex-entry-unifs entry))) (get-carg-from-unifs unifs))) (defun get-carg-from-unifs (unifs &key (carg-path (smaf::ersatz-carg-path))) ;; extract carg from a set of unifs (let* ((carg-unif (find carg-path unifs :test #'(lambda (x y) (equalp (path-typed-feature-list (unification-lhs y)) x)))) (carg (and carg-unif (u-value-type (unification-rhs carg-unif))))) carg)) (defun inject-cargs (entry cargs &key (carg-slot-name :|carg|)) ;; map cargs to string and inject into entry CARG (inject-lex-entry entry (get-inject (list (cons carg-slot-name (str-list-2-str cargs))) (smaf::gmap)))) (defun combine-partial-trees (partial-tree-set) ;; take a list of partial trees and return a single partial tree (if (cdr partial-tree-set) (stable-sort (mapcan #'copy-list partial-tree-set) #'partial-tree-order :key #'car) (car partial-tree-set))) (defun partial-tree-order (rule-id1 rule-id2) (let ((re1 (get-lex-rule-entry rule-id1)) (re2 (get-lex-rule-entry rule-id2))) (and (check-nosp-feeding re2 re1) (not (check-nosp-feeding re1 re2))))) ;;; ************************************************** ;;; ;;; Unknown words / messages about missing words ;;; ;;; ************************************************** ;;; The idea is to allow for ambiguity in the token input. We scan ;;; initially, recording how far we can get in res-array, which stores ;;; t at index n if there is a potential span between 0 and n. If we get ;;; to a point where there's a gap, we generate a warning and perhaps ;;; an unknown word, treat the gap as filled and go on from there. (defun check-stem-coverage (max) (let ((res-array (make-array (+ max 1) :initial-element nil))) (setf (svref res-array 0) t) (check-stem-coverage-aux 0 max res-array) (generate-messages-and-unknown-words res-array max))) (defun generate-messages-and-unknown-words (res-array max-dim) ;; (format t "~%generate-messages-and-unknown-words ~A" res-array) (unless (svref res-array max-dim) (dotimes (current (1+ max-dim)) (unless (svref res-array current) (dolist (token-entry (aref *tchart* current 0)) (unless *generate-messages-for-all-unanalysed-tokens* (format t "~%No analysis found corresponding to token ~a-~a ~A" (token-edge-from token-entry) (token-edge-to token-entry) (token-edge-word token-entry)))) ;; fix_me (generate-unknown-word-entries stem-string) (setf (svref res-array current) t) (check-stem-coverage-aux current max-dim res-array) (generate-messages-and-unknown-words res-array max-dim) (return))))) (defun generate-messages-for-all-unanalysed-tokens (tchart) (declare (ignore tchart)) ;; [bmw] need to encapsulate *tchart* and associated variables in an object ;; before we can start passing it around... 
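  ;; Hedged usage sketch (assumes a grammar has been loaded and an input has
  ;; just been parsed, so *tchart* is populated): the leftover tokens can also
  ;; be inspected directly at the REPL, e.g.
  ;;   (mapcar #'token-edge-word (get-unanalysed-and-unspanned-tedges))
  ;; returns the words of the token edges that have no morpho-stem analysis
  ;; and are not bypassed by an alternative path of medges.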
(loop for tedge in (get-unanalysed-and-unspanned-tedges) do (with-slots (from to word) tedge (push word *unanalysed-tokens*) (format t "~&No lexical analysis found corresponding to token ~a-~a ~A" from to word)))) (defun check-stem-coverage-aux (start max res-array) ;; (format t "~%check-stem-coverage-aux ~A ~A ~A" start max res-array) (loop for end from 1 to max do (let ((stem-edges (aref *chart* start end)) (end-points nil)) (dolist (edge stem-edges) (let ((end-point (edge-to edge))) (unless (member end-point end-points) (push end-point end-points) (setf (aref res-array end-point) t)))) (dolist (end end-points) (check-stem-coverage-aux end max res-array))))) ;;; ***************************************************** ;;; ;;; Morphosyntax interaction with phase 4 ;;; ;;; Morphological and lexical rule application ;;; ;;; ***************************************************** ;;; We now have a chart with edges corresponding to stems which ;;; may need some morphological rules to be added. ;;; This is complicated because the morphological rules can be ;;; interleaved with arbitrary numbers of lexical rules. ;;; ;;; We want to combine this with parsing proper. ;;; Ordinary grammar rules are not allowed to apply to things with a ;;; partial-tree. Spelling-change rules are only allowed to apply ;;; when the next thing on the partial-tree is them. Lexical ;;; rules can apply any time. (defun find-spelling-info (edge) (let ((partial-tree (edge-partial-tree edge))) ;; list of (rule "FORM") (when partial-tree (let* ((current-rule-info (first partial-tree)) (new-orth (pt-node-string current-rule-info)) (orth-tdfs (when new-orth (make-orth-tdfs new-orth))) (rule-id (pt-node-rule current-rule-info)) (rule-entry (get-lex-rule-entry rule-id))) (if rule-entry (values rule-entry orth-tdfs (rest partial-tree)) (progn (format t "~%Warning: rule ~A specified by ~ morphology was not found" rule-id) nil)))))) (defun apply-immediate-spelling-rule (rule orth-tdfs remaining-tree left-vertex child-edge right-vertex f) ;; this is a special case of apply-immediate-grammar-rule #+:pdebug (format t "~&apply-immediate-spelling-rule(): `~(~a~) <-- ~d~^ [~d -- ~d]" (rule-id rule) (edge-id child-edge) left-vertex right-vertex) (when (and (check-rule-filter rule (edge-rule child-edge) 0) (restrictors-compatible-p (car (last (rule-daughters-restricted rule))) (edge-dag-restricted child-edge))) (multiple-value-bind (unification-result first-failed-p) (evaluate-unifications rule (list (edge-dag child-edge)) orth-tdfs nil nil (edge-cfrom child-edge) (edge-cto child-edge)) (if unification-result (let* ((new-edge (make-edge :id (next-edge) :category (indef-type-of-tdfs unification-result) :rule rule :children (list child-edge) :dag unification-result :lex-ids (edge-lex-ids child-edge) :leaves (edge-leaves child-edge) :partial-tree remaining-tree))) #+pdebug (format t " ... success.~%") (activate-context left-vertex new-edge right-vertex f) t) (progn #+pdebug (format t " ... 
~:[fail~;throw~].~%" first-failed-p) (if first-failed-p nil t)))))) ;;; ************************************************************** ;;; ;;; Parsing - Phase 4 proper ;;; ;;; ************************************************************* (defun add-words-to-chart (f) (dotimes (end *chart-max*) (dotimes (start *chart-max*) (let ((stem-edges (aref *chart* start (1+ end)))) (dolist (edge stem-edges) (add-word edge f (edge-from edge) (1+ end))))))) (defun add-word (edge f left-vertex right-vertex) (declare (ignore left-vertex)) (with-agenda (when f (lex-priority edge)) (activate-context-no-add (edge-from edge) edge right-vertex f))) (defun unify-in-word (tdfs word-string) (declare (ignore word-string)) tdfs) (defun activate-context (left-vertex edge right-vertex f) #+:pdebug (format t "~&activate-context(): edge # ~d: [~d -- ~d];~%" (edge-id edge) left-vertex right-vertex) (add-to-chart left-vertex edge right-vertex f) (activate-context-no-add left-vertex edge right-vertex f)) (defun add-to-chart (left edge right f) (push edge (aref *chart* left right)) ;; are we in quick first-only (non-exhaustive) mode and did we just find a ;; spanning edge? (when (and f (= left (car f)) (= right (cdr f)) (not (edge-partial-tree edge))) (let ((results (parses-from-spanning-edges (list edge) *first-only-p*))) (when results (setq *parse-record* (append *parse-record* results)) (decf *first-only-p* (length results)) (when (< *first-only-p* 1) (throw :best-first t)))))) (defun activate-context-no-add (left-vertex edge right-vertex f) ;; when we have a partial-tree specification on the edge, one ;; option is the corresponding morphological rule ;; When this is applied, the new edge has a record ;; that removes that element from the partial-tree (multiple-value-bind (spelling-rule orth-tdfs remaining-morph) (find-spelling-info edge) (when spelling-rule (apply-immediate-spelling-rule spelling-rule orth-tdfs remaining-morph left-vertex edge right-vertex f)) (dolist (rule (if spelling-rule (loop for lr in *parser-lexical-rules* when (check-sp-lr-feeding spelling-rule lr) collect lr) *parser-rules*)) ;; when we have a partial-tree specification, we have all ;; lexical rules here and no parser rules ;; ;; grammar rule application is attempted ;; when we've got all the bits (try-grammar-rule-left rule (reverse (rule-daughters-restricted rule)) left-vertex right-vertex (list edge) f (1- (length (the list (rule-daughters-apply-order rule))))) ;; when we don't build up the chart in strict left-to-right ;; order (as when we're doing a best-first search), we need to ;; check for rule applying to the right as well as to the left. ;; WARNING: this will only work correctly if all rules are no ;; more than binary branching!! (when (and f (cdr (rule-daughters-restricted rule))) (try-grammar-rule-right rule (rule-daughters-restricted rule) left-vertex right-vertex (list edge) f 0))))) (defun try-grammar-rule-left (rule rule-restricted-list left-vertex right-vertex child-edge-list f n) ;; Application of a grammar rule: Every time an edge is added to the chart, ;; a check is made to see whether its addition passes the rule application ;; filter. ;; If yes collect the dags associated with the ;; children, perform the unifications specified by the rule, and if the ;; unification(s) succeed, create a new edge (for the mother), record its ;; dag and associated information, add this to the chart, and invoke the ;; same process recursively. 
(declare (type fixnum n))
  #+:pdebug
  (format t "~&try-grammar-rule-left(): `~(~a~) [~d] <-- ~{~d~^ ~} [~d -- ~d]~%"
          (rule-id rule) (length rule-restricted-list)
          (loop for e in child-edge-list collect (edge-id e))
          left-vertex right-vertex)
  (if (and (check-rule-filter rule (edge-rule (car child-edge-list)) n)
           (restrictors-compatible-p
             (car rule-restricted-list)
             (edge-dag-restricted (car child-edge-list))))
      (if (cdr rule-restricted-list) ; do we need more daughters?
          (loop for l from 0 to (1- left-vertex)
                do
                (dolist (edge (aref *chart* l left-vertex))
                  (unless (edge-partial-tree edge)
                    (unless
                        ;; inner recursive call returns nil in cases when first
                        ;; unif attempt fails - if this happens there's no point
                        ;; continuing with other alternatives here
                        (try-grammar-rule-left rule
                          (cdr rule-restricted-list) (edge-from edge) right-vertex
                          (cons edge child-edge-list) f (1- n))
                      #+:pdebug
                      (format t "~&try-grammar-rule-left(): ~
                                 `~(~a~) [~d] <-- ~{~d~^ ~} [~d -- ~d] ... throw~%"
                              (rule-id rule) (length rule-restricted-list)
                              (loop for e in child-edge-list collect (edge-id e))
                              left-vertex right-vertex)
                      ;; nil returned from the inner call is a signal
                      ;; that unification of the edge we're triggering off
                      ;; failed, so success with any combination is impossible
                      #-:vanilla (return-from try-grammar-rule-left nil)
                      #+:vanilla t)))
                finally (return t)) ; return t, because we don't want an outer loop to throw
          ;; we've got all the bits
          (with-agenda (when f (rule-priority rule))
            (apply-immediate-grammar-rule rule left-vertex right-vertex
                                          child-edge-list f t)))
      (progn (incf (statistics-ftasks *statistics*)) t)))

(defun try-grammar-rule-right (rule rule-restricted-list left-vertex right-vertex
                               child-edge-list f n)
  (declare (type fixnum n))
  (if (and (check-rule-filter rule (edge-rule (car child-edge-list)) n)
           (restrictors-compatible-p
             (car rule-restricted-list)
             (edge-dag-restricted (car child-edge-list))))
      (if (cdr rule-restricted-list)
          (loop for r from (1+ right-vertex) to *chart-max*
                do
                (dolist (edge (aref *chart* right-vertex r))
                  (unless (edge-partial-tree edge)
                    (unless
                        (try-grammar-rule-right rule
                          (cdr rule-restricted-list) left-vertex (edge-to edge)
                          (cons edge child-edge-list) f (1+ n))
                      #-:vanilla (return-from try-grammar-rule-right nil)
                      #+:vanilla t)))
                finally (return t))
          ;; we've got all the bits
          (with-agenda (when f (rule-priority rule))
            (apply-immediate-grammar-rule rule left-vertex right-vertex
                                          child-edge-list f nil)))
      (progn (incf (statistics-ftasks *statistics*)) t)))

(defun apply-immediate-grammar-rule (rule left-vertex right-vertex
                                     child-edge-list f backwardp)
  ;; attempt to apply a grammar rule when we have all the parts which match
  ;; its daughter categories
  (cond ((and *brackets-list*
              (not (consistent-bracketing-p
                    (mapcar #'(lambda (edge) (cons (edge-from edge) (edge-to edge)))
                            child-edge-list)
                    *brackets-list*)))
         t) ; t because we don't want the first-failed-p effect
        (t
         #+:pdebug
         (format t "~&apply-immediate-grammar-rule(): `~(~a~) <-- ~{~d~^ ~} [~d -- ~d]"
                 (rule-id rule) (loop for e in child-edge-list collect (edge-id e))
                 left-vertex right-vertex)
         (let ((child-edge-list-reversed (reverse child-edge-list)))
           (multiple-value-bind (unification-result first-failed-p)
               (evaluate-unifications rule
                                      (mapcar #'edge-dag child-edge-list-reversed)
                                      nil child-edge-list-reversed backwardp
                                      (loop for edge in child-edge-list
                                            minimize (edge-cfrom edge))
                                      (loop for edge in child-edge-list
                                            maximize (edge-cto edge)))
             (if unification-result
                 (let* ((edge-list (if backwardp child-edge-list child-edge-list-reversed))
                        (new-edge (make-edge :id
(next-edge) :category (indef-type-of-tdfs unification-result) :rule rule :children edge-list :dag unification-result :lex-ids (mapcan #'(lambda (child) (copy-list (edge-lex-ids child))) edge-list) :leaves (mapcan #'(lambda (child) (copy-list (edge-leaves child))) edge-list) :partial-tree (edge-partial-tree (first edge-list)) ;; NB should be unary-rule if there is a partial ;; tree (worry about compounds later ...) ))) #+pdebug (format t " ... success.~%") (activate-context left-vertex new-edge right-vertex f) t) (progn #+pdebug (format t " ... ~:[fail~;throw~].~%" first-failed-p) (if first-failed-p nil t)))))))) (defun evaluate-unifications (rule child-fs-list &optional nu-orth child-edge-list backwardp cfrom cto) ;; nu-orth is the new orthography if the unifications relate to a morphological ;; process - to be inserted into the result fs (let ((current-tdfs (rule-full-fs rule)) (rule-daughter-order (if backwardp (reverse (cdr (rule-order rule))) (cdr (rule-order rule)))) (n -1) (new-orth-fs ;; shouldn't really make a new tdfs here because we won't need it if a unification ;; fails, but we must do it outside the unification context since they cannot nest (when nu-orth (if (tdfs-p nu-orth) nu-orth (make-orth-tdfs nu-orth))))) (declare (type fixnum n)) (with-unification-context (ignore) (dolist (rule-feat rule-daughter-order) (incf n) (let ((child-edge (pop child-edge-list))) (cond ((zerop n)) ((x-restrict-and-compatible-p (if (listp rule-feat) (unify-existing-dag-at-end-of (tdfs-indef current-tdfs) rule-feat) (unify-get-dag-value (deref-dag (tdfs-indef current-tdfs)) rule-feat)) (edge-dag-restricted child-edge))) (t (incf (statistics-ftasks *statistics*)) (return-from evaluate-unifications nil)))) (incf (statistics-etasks *statistics*)) (let ((child (pop child-fs-list))) ;; if two daughters are eq, avoid the possibility of the unifier's subgraph ;; sharing code creating spurious coreferences in the result (when (member child child-fs-list :test #'eq) (setq child (copy-tdfs-completely child))) (if (setq current-tdfs (yadu! current-tdfs child rule-feat)) (incf (statistics-stasks *statistics*)) (return-from evaluate-unifications (values nil (zerop n)))))) ; first attempt failed? ;; if (car (rule-order rule)) is nil then tdfs-at-end-of will return the ;; entire structure (let ((result (tdfs-at-end-of (car (rule-order rule)) current-tdfs))) (when new-orth-fs (setq result (yadu result new-orth-fs))) (when result (restrict-and-copy-tdfs result :cfrom cfrom :cto cto)))))) (defun yadu! (tdfs1 tdfs2 &optional path) ;; Unify tdfs1.path with tdfs2; if successful return tdfs1. Path may be nil. ;; In some common situations the (quasi-)destructiveness of the unification ;; algorithm allows us to invoke the unifier starting at tdfs1.path. Otherwise we ;; have to start at the top of tdfs1 and create and prepend to tdfs2 new dag ;; structure representing the path. 
That creates a lot of garbage since it's ;; done even if the unification ends up failing (let ((d1 (and *within-unification-context-p* (null (tdfs-tail tdfs1)) (null (tdfs-tail tdfs2)) (unify-existing-dag-at-end-of (tdfs-indef tdfs1) path)))) ; might not exist (if d1 (progn (incf (statistics-unifications *statistics*)) (when (unify-wffs d1 (tdfs-indef tdfs2)) tdfs1)) (yadu tdfs1 (create-temp-parsing-tdfs tdfs2 path))))) (defun create-temp-parsing-tdfs (tdfs flist) (if (null flist) tdfs (let ((indef-dag (create-dag)) (tail nil)) (unify-list-path flist indef-dag (tdfs-indef tdfs)) (when (tdfs-tail tdfs) (let ((path (create-path-from-feature-list (if (listp flist) flist (list flist))))) (loop for tail-element in (tdfs-tail tdfs) do (push (add-path-to-tail path tail-element) tail)))) (make-tdfs :indef indef-dag :tail tail)))) ;;; *********************************************************** ;;; ;;; Finishing off ;;; ;;; ********************************************************** (defun find-spanning-edges (start-vertex end-vertex) ;; return all complete edges between start and end vertices (loop for edge in (aref *chart* start-vertex end-vertex) unless (edge-partial-tree edge) collect edge)) (defun filter-root-edges (edge start-symbols &optional (full-check-p t)) ;; if any of the start-symbols unify then this may be a parse. ;; NB despite the name of the function, the 1st arg is definitely a single edge. ;; If full-check-p is nil, skip any additional root condition test. ;; Adding a quick check here is unlikely to filter out many candidates. ;; Ensure any changes here remain compatible with tsdb/lisp/derivations.lisp (and (loop with *recording-fail-paths-p* = nil ; quickcheck not used inside here for start-symbol in start-symbols for rtdfs = (get-tdfs-given-id start-symbol) ; might be a type or a root entry thereis (and rtdfs (yaduablep rtdfs (edge-dag edge)))) (if full-check-p (satisfies-additional-root-condition-p (edge-dag edge)) t) (list edge))) (defun compute-root-edges (edges start-symbols) (if (and start-symbols *substantive-roots-p*) (loop with *recording-fail-paths-p* = nil ; quickcheck not used inside here for edge in edges nconc (loop ;; by this stage we know that at least one of the roots is unifiable - use ;; the first of these to build a single new top edge for start-symbol in start-symbols for rtdfs = (get-tdfs-given-id start-symbol) ; might be a type or a root entry when rtdfs do (let ((unif (yadu rtdfs (edge-dag edge)))) (when unif (let ((new-edge (make-edge :id (next-edge) :category (indef-type-of-tdfs unif) :rule start-symbol :dag unif :dag-restricted (restrict-fs (tdfs-indef unif)) :children (list edge) :leaves (edge-leaves edge) :lex-ids (edge-lex-ids edge) :from (edge-from edge) :to (edge-to edge)))) (unless *chart-packing-p* ;; it would be misleading to add a post-unpacking edge to chart (add-to-chart (edge-from edge) new-edge (edge-to edge) nil)) ; don't (recursively) check for success (return (list new-edge))))))) edges)) ;;; *************************************************************** ;;; ;;; TTY printout of chart ;;; chart edges are ordered on: right vertex, left vertex, edge id ;;; ;;; *************************************************************** (defun print-chart (&key frozen concise (stream *standard-output*)) (format stream "~%> chart dump:~%") (when *chart* (loop for end from 1 to *chart-max* do (print-chart-entry (loop for start from 0 to (1- end) append (aref *chart* start end)) :frozen frozen :concise concise :stream stream))) (terpri stream)) (defun 
print-tchart (&key frozen concise (stream *standard-output*)) (format stream "~%> token/spelling chart dump:~%") (when *tchart* (loop for end from 1 to *tchart-max* do (print-chart-entry (aref *tchart* end 0) :frozen frozen :concise concise :stream stream))) (terpri stream)) (defun print-chart-entry (edges &key frozen concise (stream *standard-output*)) (when edges (terpri stream) (dolist (edge (sort (copy-list edges) #'(lambda (e1 e2) (cond ((= (edge-from e1) (edge-from e2)) (< (edge-id e1) (edge-id e2))) (t (< (edge-from e1) (edge-from e2))))))) (print-chart-item edge :frozen frozen :concise concise :stream stream)))) (defun print-chart-item (edge ;; JAC 7-Jan-2022 - removed superfluous optional argument 'end' &key (frozen nil frozenp) concise (stream *standard-output*)) (declare (ignore concise)) (when (or (not frozenp) (eq (edge-frozen edge) frozen)) (format stream "~&~A-~A [~A] ~A => ~A <~{~A~^ ~}>" (edge-from edge) (edge-to edge) (edge-id edge) (cond ((token-edge-p edge) (token-edge-word edge)) ((morpho-stem-edge-p edge) ;; NB assume partial-tree is a list (format nil "~A+~{~A ~}" (morpho-stem-edge-stem edge) (mapcar #'(lambda (rule-spec) (concise-rule-name (pt-node-rule rule-spec))) (morpho-stem-edge-partial-tree edge)))) (t (concise-edge-label edge))) (edge-leaves edge) ;; JAC 7-Jan-2022 - removed unused 'roots' output from here (or (mapcar #'edge-id (edge-children edge)) (mapcar #'edge-id (edge-tchildren edge)))) ;; if frozen then details (format stream "~:[~2*~; ~:[+~;~]~d~]" (edge-frozen edge) (and (edge-frozen edge) (minusp (edge-frozen edge))) (edge-frozen edge)) ;; if applicable, compact summary of packings (9-aug-99 - oe) (loop with edges = (append (edge-equivalent edge) (edge-packed edge)) for edge in edges do (format stream (if (eq edge (car edges)) " {" "; ")) (format stream "~d < ~{~d~^ ~}" (edge-id edge) (mapcar #'edge-id (edge-children edge))) finally (when edges (format stream "}"))) (when (token-edge-p edge) (with-slots (cfrom cto) edge (format stream "~:[~2*~; <~A c ~A> ~]" (and cfrom cto) cfrom cto))) (when (token-edge-p edge) (with-slots (xfrom xto) edge (format stream "~:[~2*~; <~A x ~A> ~]" (and xfrom xto) xfrom xto))) (format stream "~%"))) (defun concise-edge-label (edge) (if (rule-p (edge-rule edge)) (rule-id (edge-rule edge)) (first (edge-lex-ids edge)))) ;;; Diagnostic / stat utilities intended to be called directly by the grammarian for ;;; in-depth investigation of behaviour of a grammar (defun print-chart-counts () ;; output the number of parser edges in each chart cell (excluding packed edges) (let* ((maxv *chart-max*) (total (loop for s from 0 to (1- maxv) sum (loop for e from (1+ s) to maxv sum (length (aref *chart* s e)))))) (format t "~%Total cell counts: ~D~%" total) (when (> total 0) (loop initially (format t " end") for e from 1 to maxv do (format t " ~2D " e)) (loop initially (format t "~&start~%") for s from 0 to (1- maxv) do (loop initially (format t "~& ~2D " s) for e from 1 to maxv do (if (<= e s) (format t " ") (format t "~5D" (length (aref *chart* s e))))))))) (defun check-subsume-edges (n1 n2) ;; print paths/types causing subsumption failure between a pair of parser edges (let ((e1 (find-edge-given-id n1)) (e2 (find-edge-given-id n2)) (*subsume-debug* t)) (unless (and e1 e2) (error "Could not find one or both edges in parse chart")) (dag-subsumes-p (tdfs-indef (edge-dag e1)) (tdfs-indef (edge-dag e2)) t t))) (defun edge-count nil (let ((distinct-parse-edges nil)) (dolist (p *parse-record*) (setq distinct-parse-edges 
(local-parse-tsdb-distinct-edges p distinct-parse-edges))) (length distinct-parse-edges))) (defun local-parse-tsdb-distinct-edges (edge found) ;; same as parse-tsdb-distinct-edges in itsdb - in case not available (pushnew edge found :test #'eq) (when (and (edge-children edge) (not (lexical-rule-p (edge-rule edge)))) (dolist (c (edge-children edge)) (setq found (local-parse-tsdb-distinct-edges c found)))) found) ;;; ************************************************************* ;;; ;;; generator utilities and structures ;;; ;;; ************************************************************ ;;; extracting a list of lexical entries used in a parse ;;; used for testing the generation lexical lookup algorithm (defun retrieve-lex-from-parses nil (loop for edge in *parse-record* collect (edge-lex-ids edge))) (defun collect-parse-base (edge-rec) ;; takes a top edge, returns a list of ;; lexical identifiers, unary-rule-list pairs ;; e.g. (collect-parse-base (car *parse-record*)) (if (or (cdr (edge-lex-ids edge-rec)) (and (rule-p (edge-rule edge-rec)) (not (lexical-rule-p (edge-rule edge-rec))))) (loop for child in (edge-children edge-rec) append (collect-parse-base child)) (list (cons (car (edge-lex-ids edge-rec)) (nreverse (collect-unary-rule-names edge-rec)))))) (defun collect-unary-rule-names (edge-rec) (when (cdr (edge-children edge-rec)) (error "~%Should be unary edge ~A" edge-rec)) (when (edge-children edge-rec) (cons (rule-id (edge-rule edge-rec)) (collect-unary-rule-names (car (edge-children edge-rec)))))) ;;; morphology for the generator (defun apply-morph-rule (rule fs fs-restricted new-orth &optional daughter) #-:debug (declare (ignore daughter)) ;; ;; _fix_me_ ;; (let* ((qc (restrictors-compatible-p (car (rule-daughters-restricted rule)) fs-restricted)) (result (and qc (evaluate-unifications rule (list fs) new-orth)))) #+:debug (when qc (format t "apply-morph-rule(): ~a + ~a: ~:[nil~;t~] ~:[nil~;t~]~%" (rule-id rule) (if (rule-p daughter) (rule-id daughter) daughter) qc result)) result)) ;; ;; [bmw] some utility functions ;; (defun get-edges (&optional (tchart *tchart*)) (loop for i from 1 to (1- *chart-limit*) for edges-incident = (aref tchart i 0) append (loop for edge in edges-incident when (edge-p edge) collect edge))) (defun get-tedges (&optional (tchart *tchart*)) (loop for i from 1 to (1- *chart-limit*) for edges-incident = (aref tchart i 0) append (loop for edge in edges-incident when (token-edge-p edge) collect edge))) (defun get-medges (&optional (tchart *tchart*)) (loop for i from 1 to (1- *chart-limit*) for edges-incident = (aref tchart i 0) append (loop for edge in edges-incident when (morpho-stem-edge-p edge) collect edge))) (defun get-unanalysed-tedges (&optional (tchart *tchart*)) ;; [bmw] return tedges with no medge ancestors (let ((tedges (get-tedges tchart)) (medges (get-medges tchart))) (loop for medge in medges for tchildren = (edge-children medge) do (setf tedges (set-difference tedges tchildren))) tedges)) (defun get-unanalysed-and-unspanned-tedges nil ;; [bmw] return tedges with no accompanying medge and for which there ;; exists no alternative lattice path of medges (loop for tedge in (get-unanalysed-tedges) for source = (edge-from tedge) for target = (edge-to tedge) unless (medge-spanned-p source target) collect tedge)) (defun get-min-edge-cfrom (edges) (when edges (loop for e in edges minimize (edge-cfrom e)))) (defun get-max-edge-cto (edges) (when edges (loop for e in edges maximize (edge-cto e))))
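;;; A small diagnostic sketch in the same spirit as the utilities above. It is
;;; not part of the original utility set: the function name is invented, and it
;;; relies only on the accessors already used in this file (token-edge-word,
;;; edge-from, edge-to) and on a freshly populated *tchart*.

(defun summarise-tchart-coverage ()
  ;; print counts of token and morpho-stem edges in *tchart*, then list each
  ;; unanalysed, unspanned token together with its vertex span; return those
  ;; token edges so a caller can inspect them further
  (let ((tedges (get-tedges))
        (medges (get-medges))
        (missing (get-unanalysed-and-unspanned-tedges)))
    (format t "~&~D token edge~:P, ~D morpho-stem edge~:P~%"
            (length tedges) (length medges))
    (dolist (tedge missing)
      (format t "~&  unanalysed: ~A-~A ~A~%"
              (edge-from tedge) (edge-to tedge) (token-edge-word tedge)))
    missing))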