;;; -*- Mode: LISP; Syntax: Common-Lisp; Package: LKB -*- ;;; Copyright (c) 2023 ;;; John Carroll, Ann Copestake, Robert Malouf, Stephan Oepen; ;;; see `LICENSE' for conditions. (in-package :lkb) ;;; Interface to DELPH-IN YY input format, integrating a preprocessing pipeline (tokenisation, ;;; tagging and morphosyntactic analysis) with subsequent LKB processing (lexical and phrasal ;;; parsing etc). ;;; ;;; To use this mode: (1) set the parameter *yy-application* to the name of a unix shell ;;; script or application that takes in sentences one-per-line, and for each sentence outputs ;;; a YY representation - which again occupies a single line; and then (2) call the function ;;; initialize-yy, which starts up this YY preprocessing application. NB the preprocessing ;;; application must stream data without buffering (i.e. each sentence must be processed as ;;; soon as it is input, with the corresponding output available immediately after). ;;; ;;; In normal use, the preprocessing pipeline and YY format will be completely transparent: ;;; plain text sentences (not YY format tuples) should be input as normal via the parse ;;; dialog, the function do-parse-tty, [incr tsdb()] item files, etc. (defvar *yy-pid* nil) (defvar *yy-stream* nil) (defparameter *yy-application* nil) ; something like "$DELPHINHOME/srg/util/srg-yy.sh" (defparameter *fl-application* nil) ; !!! old, deprecated (defun initialize-yy (&key (verbose t)) (unless (or *yy-application* *fl-application*) ; !!! (error "YY shell command or application has not been specified")) (when verbose (format t "~&Initializing YY ") (force-output)) (shutdown-yy) (multiple-value-bind (in-out stderr pid) (run-process (or *yy-application* *fl-application*) ; !!! :wait nil :input :stream :output :stream :error-output *debug-io*) ; always to terminal window (declare (ignore stderr)) (sleep 0.5) ; so any start-up error messages get interleaved reasonably with other output (setq *yy-stream* in-out *yy-pid* pid))) (eval-when (:load-toplevel :execute) (setf (symbol-function 'fl-initialize) (symbol-function 'initialize-yy))) ; !!! old, deprecated (defun shutdown-yy () (when *yy-stream* (ignore-errors (close *yy-stream*)) (setq *yy-stream* nil)) (when (and (integerp *yy-pid*) (> *yy-pid* 0)) (loop for signal in '("TERM" "QUIT" "KILL") until (zerop (run-process (format nil "kill -~a ~d" signal (- *yy-pid*)) ; -PID does all processes in group :wait t)) do ; give processes some time to exit cleanly before sending a stronger signal (sleep 0.5)) #+:allegro (sys:os-wait nil *yy-pid*)) (setq *yy-pid* nil)) (defun yy-preprocess-string (sentence) (declare (optimize (speed 0) (debug 3))) ; make debugging easier if something unexpected happens (unless (and (streamp *yy-stream*) (open-stream-p *yy-stream*) (integerp *yy-pid*) (> *yy-pid* 0)) (error "Attempt to use YY interface when it was not ready, possibly because ~ the function `initialize-yy' has not been called")) (when (listen *yy-stream*) ;; there are characters waiting to be read - this could happen if FreeLing identified ;; a sentence boundary inside the previous input (loop with buf = (make-array 80 :element-type 'character :adjustable t :fill-pointer 0) for c = (read-char-no-hang *yy-stream*) until (null c) do (vector-push-extend c buf) finally (format t "~&Warning: Ignoring extraneous input from YY interface: `~A' " buf))) (cond ((not (stringp sentence)) (error "YY interface given an invalid sentence ~S" sentence)) ((equal sentence "") "") (t (setq sentence (substitute #\space #\newline sentence)) (handler-case (progn (write-line sentence *yy-stream*) (finish-output *yy-stream*)) (error (c) (warn "~a~%Unable to send sentence to YY interface - attempting to re-initialize" c) (initialize-yy) (return-from yy-preprocess-string ""))) (handler-case (read-line *yy-stream*) (error (c) (warn "~a~%Unable to get result from YY interface - attempting to re-initialize" c) (initialize-yy) (return-from yy-preprocess-string "")))))) (defun yy (text) ;; relies on the [incr tsdb()] code being present at runtime (although at compile time it ;; might not yet be loaded) (funcall (find-symbol "YY-READ-INPUT" :tsdb) (yy-preprocess-string text) :format :raw)) (defun yy-setup-morphs (tokens) ;; each token is e.g. ;; ((:TAGS "vmm03p0" 0.0 "+pp3cn00" 1.0) (:INFLECTION "vmm03p0" "+pp3cn00") ;; (:SURFACE . "imagĂ­nense") (:FORM . "imaginar") (:TO . 10) (:FROM . 0) ;; (:END . 1) (:START . 0) (:ID . 1)) (loop for token in tokens for surface = (rest (assoc :surface token)) for start = (or (rest (assoc :start token)) -1) for end = (or (rest (assoc :end token)) -1) for from = (or (rest (assoc :from token)) -1) for to = (or (rest (assoc :to token)) -1) for stem = (string-upcase (rest (assoc :form token))) for rule = (rest (assoc :inflection token)) for partial-tree = (if (member rule '("null" ("null")) :test #'equal) nil ; activate built-in morphology mode (loop for r in (if (consp rule) rule (list rule)) ; was originally a singleton collect (list (intern (string-upcase r) :lkb) surface))) for edge = (add-token-edge surface (string-upcase surface) start end from to) do (when *token-type* ;; although we don't allow token mapping rules to be applied to YY format ;; input, we do support activation of generic lexical entries via token FS (create-token-edge-dag edge (rest (assoc :id token)) (rest (assoc :tags token))) ;; make partial-tree info available when adding generic lexical entry edges (setf (token-edge-partial-tree edge) partial-tree)) (add-morpho-stem-edge stem partial-tree start end (string-upcase surface) surface from to (token-edge-leaves edge) edge)))