;;; -*- mode: common-lisp; coding: utf-8; package: tsdb -*- (in-package :tsdb) ;;; ;;; [incr tsdb()] --- Competence and Performance Profiling Environment ;;; Copyright (c) 2013 -- 2017 Stephan Oepen (oe@ifi.uio.no) ;;; ;;; This program is free software; you can redistribute it and/or modify it ;;; under the terms of the GNU Lesser General Public License as published by ;;; the Free Software Foundation; either version 2.1 of the License, or (at ;;; your option) any later version. ;;; ;;; This program is distributed in the hope that it will be useful, but WITHOUT ;;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ;;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ;;; License for more details. ;;; (defparameter *sdp-debug-p* nil) (defun dm-construct (item active &key (grammar :erg) (format :sdp) (debugp *sdp-debug-p*)) (let* ((user (current-user)) (pid (current-pid)) (gensym (gensym "")) (id (get-field :i-id item)) (export (format nil "~a/.export.~a.~a.~a" (tmp) user pid gensym)) (dtm (format nil "~a/.dtm.~a.~a.~a" (tmp) user pid gensym)) (converter (format nil "exec ~a --tok ptb --data ~s --grammar ~s --dtm ~s" (logon-file "bin" "dtm" :string) export (case grammar (:erg (logon-directory "lingo/erg" :string)) (:terg (logon-directory "lingo/terg" :string))) dtm)) (*derivations-print-rule-type-p* nil) (*derivations-print-lexical-type-p* t) (*redwoods-export-values* '(:input :derivation :mrs :eds))) (unless (numberp id) (set-field :i-id item (setf id 42))) (mkdir export) (mkdir dtm) (let ((file (format nil "~a/~a" export id))) (with-open-file (stream file :direction :output :if-exists :supersede) (format stream ";;;~%;;; transient pseudo-export for DM conversion~%;;;~%~%~ [~d] (~a of ~d) {~d} `~a'~%" id 1 1 1 (get-field :i-input item)) (export-tree item (list (first active)) :format :text :out stream)) (run-process (format nil "exec gzip '~a'" file) :wait t)) ;; ;; _fix_me_ ;; can we trust the converter process to terminate? (7-aug-16; oe) ;; (run-process converter :wait t :output (unless debugp "/dev/null") :if-output-exists :append :error-output :output) (let* ((file (format nil "~a/.export.~a.~a.~a.ptb.dtm" dtm user pid gensym)) (items (when (probe-file file) (read-items-from-conll-file file :type :dtm :rawp t)))) (unless debugp (fad:delete-directory-and-files export :if-does-not-exist :ignore) (fad:delete-directory-and-files dtm :if-does-not-exist :ignore)) (when items (with-output-to-string (stream) (conll-output (dtm-normalize (first items)) :type format :stream stream)))))) (defun dtm-normalize (item &key generics) (let ((tokens (get-field :i-tokens item))) ;; ;; patch up those bloody contracted (negated) auxiliaries ;; (loop for token in tokens for neg = (find "NEG" (get-field :edges token) :key #'first :test #'string=) when neg do (let ((target (rest neg)) (edges (loop for edge in (get-field :edges token) unless (or (eq edge neg) (string= (first edge) "neg")) collect edge))) ;; ;; the target of "NEG" cannot have outgoing edges (the converter ;; treats the |n't| in contracted auxiliaries as the head). ;; (set-field :edges edges target) (if edges (set-field :edges (list (cons "neg" target)) token) (set-field :edges (remove neg (get-field :edges token)) token)))) (loop for token in tokens do (set-field :pred (not (null (get-field :edges token))) token)) (loop for token in tokens for lemma = (get-field :lemma token) for entry = (ignore-errors (when (stringp lemma) (lkb::get-lex-entry-from-id (intern (string-upcase lemma) :lkb)))) for edges = (loop for (role . target) in (get-field :edges token) for label = (dtm-rewrite-label role) do (set-field :content t target) collect (cons label target)) when entry do ;; ;; _fix_me_ ;; for generic lexical entries, we end up with _generic_card_ne_ and ;; the like; for most (though not all of these) we would be better ;; off with the CARG, but that is no longer accessible to us here. ;; i should reimplement the converter from scratch. (6-apr-19; oe) ;; ;; in preparing the MRP 2019 data, these pseudo-lemmas became all too ;; visible, so i ended up patching up the DM graphs after the fact: ;; falling back to PSD lemmas or the surface form. (25-may-19; oe) ;; (let ((stem (format nil "~{~a~^+~}" (lkb::lex-entry-orth entry)))) (set-field :lemma stem token) (set-field :entry entry token)) else do (set-field :lemma "_" token) (set-field :entry nil token) when generics do (let ((lemma (get-field :lemma token)) (form (get-field :form token))) (if (consp (first generics)) (when (member lemma (first generics) :test #'string=) (set-field :lemma (funcall (rest generics) form lemma) token)) (when (member lemma generics :test #'string=) (set-field :lemma form token)))) when edges do (set-field :content t token) do (set-field :edges edges token)) (loop for token in tokens for entry = (get-field :entry token) for sense = (when (and entry (get-field :content token)) (le-sense entry :format :sdp)) do (set-field :sense (or sense "_") token))) item) (defun dtm-rewrite-label (string) (cond ((string= string "L-HNDL") "ARG1") ((string= string "L-INDEX") "ARG1") ((string= string "R-HNDL") "ARG2") ((string= string "R-INDEX") "ARG2") ((string= string "compound_name") "compound") ((string= string "implicit_conj") "conj") ((ppcre:scan "^loc_" string) "loc") ((string= string "of_p") "of") ((string= string "part_of") "part") ((ppcre:scan "^temp_" string) "temp") ((string= string "unspec_manner") "manner") ((member string '("neg" "mwe") :test #'string-equal) (string-downcase string)) (t string))) (defparameter *sdp-semi* nil) (defun le-sense (entry &key (format :raw)) (let* ((id (lkb::lex-entry-id entry)) (ges (gethash id (mt::semi-ges *sdp-semi*))) (spe (and ges (first (mt::ges-spes ges)))) (ep (and spe (mt::spe-ep spe))) (pred (and ep (mt::ep-pred ep))) (pred (if mrs::*normalize-predicates-p* (mrs:normalize-predicate pred) pred)) (synopsis (when ep (loop for role in (mt::ep-roles ep) for name = (mt::role-name role) for value = (mt::role-value role) for type = (when (mt::variable-p value) (mt::variable-type value)) collect (cons (string-downcase name) type))))) (when (and pred synopsis) (case format (:raw (cons pred synopsis)) (:text (format nil "~a[~{~:@(~a~)~@[ ~a~]~^, ~}]" pred (loop for (name . value) in synopsis collect name collect value))) (:sdp (let ((pred (string-downcase pred))) (when (string= pred "string") (setf pred (cond ((eq id 'lkb::generic_mass_noun) "n") ((eq id 'lkb::generic_mass_count_noun) "n") ((eq id 'lkb::generic_pl_noun) "n") ((eq id 'lkb::generic_adj) "a") ((eq id 'lkb::generic_adj_compar) "a") ((eq id 'lkb::generic_adj_superl) "a") ((eq id 'lkb::generic_adverb) "a") ((ppcre:scan "^generic_trans_verb" (string-downcase id)) "v") (t pred)))) (when (char= (char pred 0) #\_) (let ((break (position #\_ pred :start 1))) (when break (setf pred (subseq pred (+ break 1)))))) (when (ppcre:scan "_[0-9]$" pred) (setf pred (subseq pred 0 (- (length pred) 2)))) (setf pred (cond ((ppcre:scan "_q(?:_[^_]+)?$" pred) "q") ((ppcre:scan "^c_" pred) "c") ((ppcre:scan "^comp_" pred) "comp") ((string= pred "ellipsis_ref") "ellipsis") ((string= pred "generic_entity") "x") ((string= pred "generic_verb") "v") ;; ;; _fix_me_ ;; what about ‘_a_p_per’? (7-dec-16; oe) ;; ((ppcre:scan "^p_" pred) "p") ((ppcre:scan "^x_" pred) "x") (t pred))) (format nil "~a:~{~a~^-~}" pred (loop for role in synopsis collect (or (rest role) "c"))))))))) (defun dtm-to-dm (input output &key align generics) (let ((align (loop for item in align when (consp item) collect (get-field :i-id item) else collect item))) (with-open-file (stream output :direction :output :if-exists :supersede) (format stream "#SDP 2015~%") (loop for item in (read-items-from-conll-file input :type :dtm :rawp t) when (or (null align) (member (get-field :i-id item) align :test #'=)) do (conll-output (dtm-normalize item :generics generics) :type :sdp :stream stream)))))