(in-package :lkb) ;;; ;;; identify characters that can form words; all other characters will create ;;; word boundaries and later be suppressed in tokenization. ;;; (defun alphanumeric-or-extended-p (c) (and (graphic-char-p c) (not (member c *punctuation-characters*)))) ;;; ;;; determine surface order of constituents in rule: returns list of paths into ;;; feature structure of rule, i.e. (nil (args first) (args rest first)) for a ;;; binary rule, where the first list element is the path to the mother node of ;;; the rule. ;;; (defun establish-linear-precedence (rule) (let ((daughters (loop for args = (existing-dag-at-end-of rule '(args)) then (existing-dag-at-end-of args *list-tail*) for daughter = (when args (get-value-at-end-of args *list-head*)) for path = (list 'args) then (append path *list-tail*) while (and daughter (not (eq daughter 'no-way-through))) collect (append path *list-head*)))) (if (null daughters) (cerror "Ignore it" "Rule without daughters") (cons nil daughters)))) ;;; ;;; detect rules that have orthographemic variation associated to them; those ;;; who do should only be applied within the morphology system; for the time ;;; being use value of NEEDS-AFFIX feature, though it would be nicer to rely ;;; on a type distinction of lexical rules or re-entrancy of ORTH. ;;; (defun spelling-change-rule-p (rule) (let ((affix (get-dag-value (tdfs-indef (rule-full-fs rule)) 'needs-affix))) (and affix (bool-value-true affix)))) ;;; ;;; create feature structure representation of orthography value for insertion ;;; into the output structure of inflectional rules; somewhat more complicated ;;; than one might expect because of treatment for multi-word elements. ;;; (defun make-orth-tdfs (orth) (let ((unifs nil) (tmp-orth-path *orth-path*)) (loop for orth-value in (split-into-words orth) do (let ((opath (create-path-from-feature-list (append tmp-orth-path *list-head*)))) (push (make-unification :lhs opath :rhs (make-u-value :type orth-value)) unifs) (setq tmp-orth-path (append tmp-orth-path *list-tail*)))) (let ((indef (process-unifications unifs))) (when indef (setf indef (create-wffs indef)) (make-tdfs :indef indef))))) (defun make-unknown-word-sense-unifications (word-string &optional stem) ;;; this assumes we always treat unknown words as proper names ;;; uncomment the *unknown-word-types* in globals.lsp ;;; to activate this (when word-string (list (make-unification :lhs (create-path-from-feature-list '(MORPH LIST FIRST STEM FIRST)) :rhs (make-u-value :type (or stem word-string))) (make-unification :lhs (create-path-from-feature-list '(MORPH LIST FIRST STEM REST)) :rhs (make-u-value :type 'lkb::*null*)) (make-unification :lhs (create-path-from-feature-list '(SYNSEM LKEYS KEYTAG)) :rhs (make-u-value :type (string-downcase word-string)))))) (defun instantiate-generic-lexical-entry (gle surface) (let ((tdfs (copy-tdfs-elements (lex-entry-full-fs (if (gle-p gle) (gle-le gle) gle))))) (loop with dag = (tdfs-indef tdfs) for path in '((MORPH LIST FIRST STEM FIRST) (SYNSEM LKEYS KEYTAG)) for foo = (existing-dag-at-end-of dag path) do (setf (dag-type foo) *string-type*)) (let* ((unifications (make-unknown-word-sense-unifications surface (or #+:logon (case (gle-id gle) (guess_n_gle (format nil "/~a/" surface)) (decade_gle (format nil "~as" surface))) surface))) (indef (process-unifications unifications)) (indef (and indef (create-wffs indef))) (overlay (and indef (make-tdfs :indef indef)))) (when indef (with-unification-context (ignore) (let ((foo (yadu tdfs overlay))) (when foo (copy-tdfs-elements foo)))))))) ;;;(defun make-orth-tdfs (orthography) ;;; (let* ((unifications ;;; (loop ;;; for token in (split-into-words orthography) ;;; for path = *orth-path* then (append path *list-tail*) ;;; for opath = (create-path-from-feature-list ;;; (append path *list-head*)) ;;; collect (make-unification :lhs opath ;;; :rhs (make-u-value :type token)))) ;;; (indef (process-unifications unifications))) ;;; (when indef ;;; (make-tdfs :indef (create-wffs indef))))) ;;; ;;; determine path and file names for lexicon and leaf type cache files. ;;; (defun set-temporary-lexicon-filenames nil (let* ((version (or (find-symbol "*GRAMMAR-VERSION*" :common-lisp-user) (and (find-package :lkb) (find-symbol "*GRAMMAR-VERSION*" :lkb)))) (prefix (if (and version (boundp version)) (remove-if-not #'alphanumericp (symbol-value version)) "lexicon"))) (setf *psorts-temp-file* (make-pathname :name prefix :directory (pathname-directory (lkb-tmp-dir)))) (setf *psorts-temp-index-file* (make-pathname :name (concatenate 'string prefix ".idx") :directory (pathname-directory (lkb-tmp-dir)))) (setf *leaf-temp-file* (make-pathname :name (concatenate 'string prefix ".lfs") :directory (pathname-directory (lkb-tmp-dir)))))) (defun bool-value-true (fs) (and fs (let ((fs-type (type-of-fs fs))) (eql fs-type '+)))) (defun bool-value-false (fs) (and fs (let ((fs-type (type-of-fs fs))) (eql fs-type '-)))) (defun gen-extract-surface (edge &optional (initialp t) &key stream) (if stream (let ((daughters (edge-children edge))) (if daughters (loop for daughter in daughters for foo = initialp then nil append (gen-extract-surface daughter foo :stream stream)) (let* ((entry (get-lex-entry-from-id (first (edge-lex-ids edge)))) (tdfs (and entry (lex-entry-full-fs entry))) (type (and tdfs (type-of-fs (tdfs-indef tdfs)))) (string (string-downcase (copy-seq (first (edge-leaves edge))))) (capitalizep (ignore-errors (loop for match in '(proper-noun-le proper-noun-name-le proper-noun-det-le anrede-title-noun-le title-noun-le anrede-form-noun-le count-title-noun-le deverbal-noun-le count-noun-le count-noun-mass-unit-le adj-count-noun-le count-noun-t-le inf-count-noun-le currency-noun-le unknown-currency-noun-le unit-noun-le unknown-date-noun-le unknown-percent-noun-le pp-noun-le scomp-noun-le int-clause-noun-le mass-noun-le mass-pp-noun-le relational-noun-le month-noun-le month-mod-noun-le day-noun-le time-mod-noun-le time-mod-noun-ty-le cardyear-mod-noun-ty-le special-count-noun-le special-mass-noun-le letter-noun-le ) thereis (or (eq type match) (subtype-p type match))))) (cliticp (and (> (length string) 0) (char= (char string 0) #\')))) (when capitalizep (loop with spacep = t for i from 0 to (- (length string) 1) for c = (schar string i) when (char= c #\Space) do (setf spacep t) else when (char= c #\_) do (setf spacep t) (setf (schar string i) #\Space) else do (when (and spacep (alphanumericp c)) (setf (schar string i) (char-upcase c))) (setf spacep nil))) (when (and initialp (alphanumericp (schar string 0))) (setf (schar string 0) (char-upcase (schar string 0)))) (format stream "~@[ ~*~]~a" (and (not initialp) (not cliticp)) string)))) (let ((stream (make-string-output-stream))) (gen-extract-surface edge initialp :stream stream) (get-output-stream-string stream)))) (eval-when #+:ansi-eval-when (:load-toplevel :compile-toplevel :execute) #-:ansi-eval-when (load eval compile) (setf *gen-extract-surface-hook* 'gen-extract-surface)) (defparameter *tsdb-name* "[incr tsdb()]") (defparameter *tsdb-version* "2.0 (24-apr-06; beta)") (defparameter *tsdb-application* (format nil "exec ~a" (namestring (make-pathname :directory (pathname-directory make::bin-dir) :name "tsdb")))) (defparameter *tsdb-home* (namestring (dir-append (get-sources-dir "tsdb") '(:relative "tsdb" "home")))) (defparameter *tsdb-skeleton-directory* (namestring (dir-append (get-sources-dir "tsdb") '(:relative "tsdb" "skeletons" "english")))) (defparameter *tsdb-data* "toy") (defparameter *tsdb-gold* nil) (defparameter *tsdb-encoding* nil) (defparameter *tsdb-protocol-file * nil) (defparameter *tsdb-server-mode-p* nil) (defparameter *tsdb-io* t) (defparameter *tsdb-write-run-p* t) (defparameter *tsdb-write-parse-p* t) (defparameter *tsdb-write-result-p* t) (defparameter *tsdb-write-edge-p* t) (defparameter *tsdb-write-passive-edges-p* nil) (defparameter *tsdb-write-lexicon-chart-p* nil) (defparameter *tsdb-write-syntax-chart-p* nil) (defparameter *tsdb-write-output-p* nil) (defparameter *tsdb-rule-statistics-p* nil) (defparameter *tsdb-verbose-processing-p* t) (defparameter *tsdb-cache-database-writes-p* :raw) (defparameter *tsdb-flush-cache-threshold* 5000) (defparameter *tsdb-verbose-cache-flush-p* nil) (defparameter *tsdb-preprocessing-hook* nil) (defparameter *tsdb-tagging-hook* nil) (defparameter *tsdb-result-hook* "tsdb::result-hook") (defparameter *tsdb-trees-hook* nil) (defparameter *tsdb-semantix-hook* nil) (defparameter *tsdb-gc-p* nil) (defparameter *tsdb-minimize-gcs-p* t) (defparameter *tsdb-tenure-p* nil) (defparameter *tsdb-generation-spread* 10) (defparameter *tsdb-scavenge-limit* nil) (defparameter *tsdb-tenured-bytes* 0) (defparameter *tsdb-tenured-bytes-limit* (* 32 1024 1024)) (defparameter *tsdb-gc-verbosity* nil) (defparameter *tsdb-gc-message-p* t) (defparameter *tsdb-gc-cursor-p* t) (defparameter *tsdb-edge-factor* 2.0) (defparameter *tsdb-exhaustive-p* t) (defparameter *tsdb-ignore-output-p* nil) (defparameter *tsdb-maximal-number-of-edges* 100000) (defparameter *tsdb-maximal-number-of-analyses* 0) (defparameter *tsdb-maximal-number-of-results* 5000) (defparameter *tsdb-default-skeleton* "english") (defparameter *tsdb-skeleton-index* "Index.lisp") (defparameter *tsdb-relations-skeleton* "Relations") (defparameter *tsdb-instance-template* "%g/%v/%t/%d/%s") (defparameter *tsdb-skeletons* nil) (defparameter *tsdb-initialized-p* nil) (defvar *tsdb-phenomena* (make-hash-table :test #'equal)) (defparameter *tsdb-data-hook* nil) (defparameter *tsdb-gold-hook* nil) (defparameter *tsdb-gc-statistics* nil) (defparameter *tsdb-ofs* #\@) (defparameter *tsdb-efs* #\@) (defparameter *tsdb-redwoods-files* '("tree" "decision" "preference" "update" "fold" "score")) (defparameter *tsdb-profile-files* (append '("daughter" "edge" "parse" "result" "rule" "run") *tsdb-redwoods-files*)) (defparameter *tsdb-id-attributes* '(:i-id :p-id :ip-id :s-id :run-id :parse-id :result-id)) (defparameter *tsdb-coded-attributes* '(:i-difficulty :i-wf :polarity)) (defparameter *tsdb-tokens-to-ignore* '("." "(" ")" "!" "?" "," "+" "-" "'" "[" "]" "`")) (defparameter *tsdb-slash* #\/) (defparameter *tsdb-debug-mode-p* nil) (defparameter *pvm-cpus* nil) (defparameter *pvm-clients* nil) (defparameter *process-default-task* :parse) (defparameter *process-suppress-duplicates* '(:mrs)) (defparameter *process-exhaustive-inputs-p* nil) (defparameter *process-client-retries* 0) (defparameter *process-scope-generator-input-p* nil) (defparameter *process-pretty-print-trace-p* t) (defparameter *process-raw-print-trace-p* nil) (defparameter *process-sort-profile-p* t) (defparameter *process-fan-out-log* nil) (defparameter *process-fan-out-xml* nil) (defparameter *statistics-select-condition* nil) (defparameter *redwoods-export-values* '(:derivation :tree :avm :mrs :indexed :dependencies)) (defparameter *redwoods-update-exact-p* nil) (defparameter *redwoods-update-flag-p* t) (defparameter *redwoods-agreement-exact-p* t) (defparameter *redwoods-score-similarities* nil) (defparameter *redwoods-use-item-sets-p* t) (defparameter *redwoods-thinning-export-p* nil) (defparameter *redwoods-thinning-normalize-p* nil) (defvar *reconstruct-cache* nil) (defparameter %tsdb-lexical-preterminals% nil) (defun install-gc-strategy (gc &key (verbosity *tsdb-gc-verbosity*) (tenure *tsdb-tenure-p*) burst verbose) #+:allegro (let ((environment (pairlis '(:print :stats :verbose :auto-step) (list (sys:gsgc-switch :print) (sys:gsgc-switch :stats) (sys:gsgc-switch :verbose) (sys:gsgc-parameter :auto-step)))) (statsp (member :stats verbosity :test #'eq)) (verbosep (member :verbose verbosity :test #'eq)) (*tsdb-tenured-bytes-limit* nil)) (setf (system:gsgc-switch :dump-on-error) t) (setf (sys:gsgc-switch :print) (or verbosep statsp)) (setf (sys:gsgc-switch :verbose) verbosep) (setf (sys:gsgc-switch :stats) statsp) (setf (sys:gsgc-parameter :auto-step) tenure) (setf (sys:gsgc-parameter :generation-spread) *tsdb-generation-spread*) (unless tenure (setf (system:gsgc-switch :gc-old-before-expand) t) (when verbose (format *tsdb-io* "install-gc-strategy(): ~ disabling tenure; global garbage collection ...")) #-(version>= 5 0) (busy :gc :start) (excl:gc (if burst :mark-for-tenure :tenure)) #-(version>= 5 0) (busy :gc :end) (setf *tsdb-tenured-bytes* 0) (when verbose (format *tsdb-io* " done.~%"))) (when (and (null tenure) (eq gc :global) verbose) (format *tsdb-io* "install-gc-strategy(): ~ tenure disabled; supressing preliminary gc()s.~%")) (acons :gc (if (and (null tenure) (eq gc :global)) nil gc) environment)))