;;; Copyright (c) 1998-2022 John Carroll, Ann Copestake, Robert Malouf, Stephan Oepen ;;; see LICENSE for conditions (in-package :lkb) ;;; Creating and running the `quickcheck'; for motivation, algorithms and empirical ;;; results see Malouf, Carroll & Copestake (2000) ;;; *check-paths* - defined in globals.lsp - is the parameter set in user file. This is ;;; used to construct value of *check-paths-tree* which is what is used internally (defvar *check-paths-tree* nil) #+:sbcl (declaim (sb-ext:always-bound *check-paths-tree*)) (defvar *fail-paths-and-unifs*) (defvar *fail-paths*) (deftype qc-index () '(unsigned-byte 8)) (declaim (type qc-index *check-path-count*)) ; help compiler and defend against bad values (defstruct qcleaf (index 0 :type qc-index :read-only t) (spec nil :type (or list hash-table symbol) :read-only t)) #+:sbcl (declaim (sb-ext:freeze-type qcleaf)) ;;; Wrapping with-check-path-list-collection around some call to parse a sentence or ;;; a set of sentences collects stats on all feature paths that fail in unification, ;;; and computes the best set for checking values of before unifications. Writes ;;; out set to specified file ;;; ;;; NB existing set of quick check paths is left untouched #| (with-check-path-list-collection "/tmp/checkpaths-test.lsp" (parse '("Devito" "manages" "a" "programmer" "Abrams" "interviewed" "and" "Browne" "hired") nil)) (with-check-path-list-collection "/tmp/checkpaths-gen.lsp" (generate-from-mrs-file "~/Documents/grammars/erg-2018/tsdb/gold/mrs/result" t)) (with-check-path-list-collection "/tmp/checkpaths.lsp" (parse-sentences "~/Documents/grammars/erg-2018/lkb/checkpaths.items" t)) |# (defmacro with-check-path-list-collection (output-file &body forms) `(let ((*recording-fail-paths-p* t) (*fail-paths-and-unifs* nil) (.completedp. nil)) (unwind-protect (multiple-value-prog1 ;; disable existing quickcheck, compute full forest and unpack at most 1 result (let ((*check-path-count* 0) (*check-paths-tree* nil) (*first-only-p* 1) (*gen-first-only-p* 1) (*unpacking-scoring-hook* (constantly 0.0)) (*show-parse-p* nil)) ,@forms) (setq .completedp. t)) (when .completedp. (format t "~%Extracting paths...") (force-output) (let ((check-paths (check-path-convert (extract-check-paths *fail-paths-and-unifs*)))) (with-open-file (str ,output-file :direction :output :if-exists :supersede :if-does-not-exist :create) (with-standard-io-syntax (with-package (:lkb) (format str "#|~%Check paths created from execution of~% ~S~%with grammar ~A on ~A~%|#~%" '(with-check-path-list-collection ,output-file ,@forms) (get-grammar-version) (current-time :long t)) (format str "(CL:IN-PACKAGE #:LKB)~%") (format str "(DEFPARAMETER *CHECK-PATHS*~% '(~{~S~^~% ~}))~%" check-paths))))) (format t "~&Wrote file ~A~%" (truename ,output-file)))))) (defun call-with-fail-paths-recording (f dag1 dag2 &rest args) ;; called from a unification / subsumption /etc function when *recording-fail-paths-p* ;; is non-nil - calls f on dag1, dag2 and any extra args, then accumulates fail paths ;; + unification pairs after calls to record-fail-path (declare (dynamic-extent args)) (let* ((*fail-paths* nil) (res (multiple-value-list (apply f dag1 dag2 args)))) (if *fail-paths* (loop with paths = (loop for path in *fail-paths* when (and (existing-dag-at-end-of dag1 path) (existing-dag-at-end-of dag2 path)) collect path) with npaths = (length paths) for path in paths for item = (assoc path *fail-paths-and-unifs* :test #'equal) do (unless item (setq item (cons path (make-hash-table))) (setq *fail-paths-and-unifs* (nconc *fail-paths-and-unifs* (list item)))) ; found later => less frequent (setf (gethash *unify-generation* (cdr item)) npaths) finally (return nil)) ; indicate overall failure since there were fail-paths (values-list res)))) (defun record-fail-path (path) ;; we need all fresh conses when recording path since it was stack-allocated (push (reverse path) *fail-paths*)) ;;; Take a list of paths, each paired with a set whose elements are integers ;;; representing unifications and subsumptions that failed, and order the paths ;;; so that the first k paths cover as many of these failures as possible. ;;; This is an instance of the maximum coverage problem (Cormen et al. 2009 ;;; Introduction to Algorithms, 3rd Edition, 35.3). We use the standard greedy search, ;;; which seems to be the only practical approach since at this point k is not fixed ;;; (it can be changed at grammar load time via the parameter *check-path-count*). (defun extract-check-paths (fail-paths-and-unifs) (labels ((greedy-failure-cover (fail-paths-and-unifs) (when fail-paths-and-unifs (let ((max-item ;; path+set accounting for largest number of unification/subsumption failures (reduce #'(lambda (x y) (if (> (hash-table-count (cdr x)) (hash-table-count (cdr y))) x y)) fail-paths-and-unifs))) ;; erase each such failure u from all remaining paths p; each u is associated with ;; a count n of the number of paths it's in, which allows us to stop looking for u ;; as soon as we've found n paths where u is present - giving a substantial speed-up (cons (cons (car max-item) (hash-table-count (cdr max-item))) (let ((rest-items (remove max-item fail-paths-and-unifs :test #'eq))) (maphash #'(lambda (u n) (decf n) ; discount the occurrence of u in max-item (loop for (p . unifs) in rest-items while (> n 0) do (when (remhash u unifs) (decf n)))) (cdr max-item)) ;; repeat, dropping any paths whose failures have all been erased (greedy-failure-cover (remove-if #'(lambda (x) (zerop (hash-table-count (cdr x)))) rest-items)))))))) (greedy-failure-cover ;; an initial sort on set size massively speeds up erasing failures (sort fail-paths-and-unifs #'> :key #'(lambda (x) (hash-table-count (cdr x))))))) #| (defun extract-check-paths (fail-paths-and-unifs) ;; naive version which just returns paths in order of decreasing number of fails (sort (mapcar #'(lambda (item) (cons (car item) (hash-table-count (cdr item)))) fail-paths-and-unifs) #'> :key #'cdr)) |# ;;; Interactive interface (defun interactive-create-check-paths nil (let* ((test-file (ask-user-for-existing-pathname "Checkpaths sample file?")) (output-file (and test-file (ask-user-for-new-pathname "Checkpaths output file?")))) (when (and test-file output-file) (with-check-path-list-collection output-file (parse-sentences test-file t)) (format t "~%Grammar loading script should contain:~ ~%(lkb-load-lisp (this-directory) t)~%")))) ;;; Compute *check-paths-tree* - the 'recipe' for computing a quickcheck ;;; vector that's specific to this particular grammar version. ;;; Called after type hierarchy has been processed and constraints expanded. ;;; First used later on in the grammar loading process to fill the ;;; daughters-restricted field of rules. Each qc vector element corresponds to a ;;; feature path, and can contain one of 3 things: nil, a type name (a symbol), or an ;;; embedding of the type with respect to a portion of the type hierarchy (represented ;;; as a small integer). ;;; ;;; JAC 4-Dec-20: improved the latter case, so that only each leaf type descendant of ;;; the maximal type for the feature path has a bit set. Logical and-ing two such ;;; integers tells us whether there is a greatest common subtype, and we can also test ;;; for subsumption with logand or logandc1. A qc vector element is nil if that path does ;;; not exist in the FS, or if the type at the end of the path is the maximal type for ;;; that path. Integer embedding paths can be compared far more quickly than type name ;;; paths, so paths are re-sorted with all of the former preceding all of the latter. ;;; ;;; NB Encoding only descendants that are leaves gives a big saving on bits, but isn't a ;;; general-purpose type unification technique since it can't be used to actually identify ;;; greatest common subtypes; moreover it can give a false positive result when testing ;;; for subsumption in the case where there's a non-branching chain between two types in ;;; the type hierarchy - since both would have the same integer encoding. (This could be ;;; fixed by giving a bit to each type with only 1 descendant, but here it's not worth it). ;;; ;;; The embedding representation is particularly useful for LKB qc because of a fundamental ;;; implementation decision in the LKB to represent types in FSes by their names rather ;;; than as small integers, which leads to extra indirection in the obvious approach to ;;; type unification. (defun optimise-check-unif-paths (&optional verbose) (setq *check-paths-tree* nil) (when (find :vanilla *features*) (return-from optimise-check-unif-paths nil)) (let ((paths-and-freqs (loop with warned = nil for path-and-freq in *check-paths* when (and (or (and (consp path-and-freq) ; defensive since potentially user-editable (listp (car path-and-freq)) (null (cdr (last (car path-and-freq)))) (every #'symbolp (car path-and-freq)) (typep (cdr path-and-freq) '(integer 1 *))) (progn (format t "~%WARNING: Invalid item ~A in *check-paths* - ignoring it" path-and-freq) nil)) (loop for feat in (car path-and-freq) always (or (maximal-type-of feat) ; feature known? (progn (unless (member feat warned :test #'eq) (format t "~%WARNING: *check-paths* contains unknown feature ~A - ignoring path(s) concerned" feat) (push feat warned)) nil)))) collect path-and-freq))) (let ((paths-and-specs (let ((*map-cache* nil)) (declare (special *map-cache*)) (loop for (path . freq) in paths-and-freqs repeat *check-path-count* collect (optimise-check-unif-path path freq verbose))))) (setq paths-and-specs (stable-sort paths-and-specs #'(lambda (x y) (and (not (symbolp x)) (symbolp y))) ; prioritise integer embeddings :key #'cdr)) (loop for (path . spec) in paths-and-specs for n from 0 do (setq *check-paths-tree* (add-path-to-tree path (make-qcleaf :index n :spec spec) *check-paths-tree*))) t))) (defun add-path-to-tree (p v tree) (if p (let ((branch (find (car p) tree :key #'(lambda (x) (and (consp x) (car x)))))) (if branch (progn (setf (cdr branch) (add-path-to-tree (cdr p) v (cdr branch))) tree) (append tree (list (reduce #'list p :from-end t :initial-value v))))) (cons v tree))) (defun optimise-check-unif-path (path freq verbose) (declare (special *map-cache*) (ignore freq)) (cons path (if path (let* ((feat (car (last path))) (fs (constraint-of (maximal-type-of feat))) (type (type-of-fs (get-dag-value fs feat))) (subs (retrieve-descendants type)) (nsubs (length subs)) (nleaves (count-if-not #'ltype-descendants subs))) (when verbose (format t "~&~A has value of type ~A, which has ~A subtypes (~A leaves)~%" path type nsubs nleaves)) (if (and (<= nleaves (1+ (integer-length most-positive-fixnum))) ; + and -ve fixnums (not (string-type-p type))) ;; for each possible type ty at end of path, create a mapping from ty to an ;; integer with bits set corresponding to each of ty's leaf type descendants (or (getf *map-cache* type) (setf (getf *map-cache* type) (loop with leaves = (remove-if #'ltype-descendants subs) with htp = (> nsubs 40) ; hash table or plist? with map = (if htp (make-hash-table :test #'eq :size (* nsubs 2)) nil) for ty in (cons (get-type-entry type) subs) for val = nil then (let ((i 0)) (dolist (x (cons ty (ltype-descendants ty)) i) (let ((pos (position x leaves :test #'eq))) (when pos (setq i (dpb 1 (byte 1 pos) i)))))) do (if htp (setf (gethash (ltype-name ty) map) val) (setq map (nconc map (list (ltype-name ty) val)))) finally (return map)))) type)) *toptype*))) ;;; Statically compute quickcheck vector for a dag, and check two such vectors for ;;; (unification) compatibility or subsumption (declaim (inline restrict-fs-type-representation)) (defun restrict-fs-type-representation (type spec) ;; compute the qc representation of type according to spec argument - representation ;; is either the type name itself, nil (if it's the maximal type for the value of the ;; feature concerned), or an integer embedding of the type (covering only the part ;; of the hierarchy below that maximal type, looked up in a plist / hash table) (etypecase spec (cons (getf spec type)) (hash-table (gethash type spec)) ((and symbol (not null)) (if (eq type spec) ; maximal type for feature concerned? nil type)))) (defun restrict-fs (fs) (let ((vals (make-array *check-path-count* :initial-element nil))) (labels ((traverse-qctree (tree d) (declare (type dag d) (inline get-dag-value)) (dolist (branch tree) (etypecase branch (cons (let ((v (get-dag-value d (car branch)))) (when v (traverse-qctree (cdr branch) v)))) (qcleaf (setf (svref vals (qcleaf-index branch)) (restrict-fs-type-representation (type-of-fs d) (qcleaf-spec branch)))))))) (traverse-qctree *check-paths-tree* fs) vals))) (defmacro type-bit-representation-p (x) `(typep ,x 'fixnum)) (defun restrictors-compatible-p (daughter-restricted child-restricted) (declare (simple-vector daughter-restricted child-restricted)) (loop for dt across daughter-restricted for ct across child-restricted always (cond ((or (eq dt ct) (null dt) (null ct))) ((and (type-bit-representation-p dt) (type-bit-representation-p ct)) ;; fixnum (bit) encodings (not (zerop (logand dt ct)))) (t ;; type name symbol encodings (greatest-common-subtype dt ct))))) (defun restrictors-subsuming-p (restricted1 restricted2 genp) ;; check whether the corresponding pairs of types in the restrictors are related by ;; subsumption in one direction and/or the other; also if genp=t, whether one type in the ;; pair subsumes the other in either direction (so there could possibly be a dag that ;; generalises those the restrictors were derived from - in the limited sense used by the ;; parser) (declare (simple-vector restricted1 restricted2)) (loop with forwardp = t ; t1 subsumes or is equal to t2 and backwardp = t ; vice versa for t1 across restricted1 for t2 across restricted2 do (flet ((not-forward () (if backwardp (setq forwardp nil) (unless genp (return nil)))) (not-backward () (if forwardp (setq backwardp nil) (unless genp (return nil))))) (cond ((eq t1 t2)) ((null t1) (not-backward)) ; since t2 must be non-nil ((null t2) (not-forward)) ((and (type-bit-representation-p t1) (type-bit-representation-p t2)) ;; fixnum (bit) encodings (unless (= t1 t2) ; compiler can elide if fixnums satisfy not-eq => not-= (let ((gcs (logand t1 t2))) (cond ((= gcs t1) (not-forward)) ((= gcs t2) (not-backward)) (t (return nil)))))) (t ;; type name symbol encodings (let ((gcs (greatest-common-subtype t1 t2))) ; eq case checked already (cond ((eq gcs t1) (not-forward)) ((eq gcs t2) (not-backward)) (t (return nil))))))) finally (return (values forwardp backwardp genp)))) ;;; Dynamically compute quickcheck vector while inside scope of a set of unifications (defun unify-restrict-fs (fs) ;; fs assumed to have already been dereferenced (let ((vals (make-array *check-path-count* :initial-element nil))) (labels ((unify-traverse-qctree (tree d) (declare (type dag d) (inline unify-get-dag-value)) (dolist (branch tree) (etypecase branch (cons (let ((v (unify-get-dag-value d (car branch)))) (when v (unify-traverse-qctree (cdr branch) (deref-dag v))))) (qcleaf (setf (svref vals (qcleaf-index branch)) (restrict-fs-type-representation (unify-get-type d) (qcleaf-spec branch)))))))) (unify-traverse-qctree *check-paths-tree* fs) vals))) (defun x-restrict-and-compatible-p (fs child-restricted) ;; On its way to being removed. ;; Not on a critical execution path, so turned into a no-op. A complete version would ;; include most of unify-restrict-fs and restrictors-compatible-p but with ;; (svref child-restricted (qcleaf-index branch)). Also it could only sensibly check ;; types in the order they occur in *check-paths-tree* rather than in quickcheck vector (declare (ignore fs child-restricted)) t)