;;; -*- mode: common-lisp; coding: utf-8; package: tsdb -*- (in-package :tsdb) ;;; ;;; [incr tsdb()] --- Competence and Performance Profiling Environment ;;; Copyright (c) 2008 -- 2017 Stephan Oepen (oe@ifi.uio.no) ;;; ;;; This program is free software; you can redistribute it and/or modify it ;;; under the terms of the GNU Lesser General Public License as published by ;;; the Free Software Foundation; either version 2.1 of the License, or (at ;;; your option) any later version. ;;; ;;; This program is distributed in the hope that it will be useful, but WITHOUT ;;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ;;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ;;; License for more details. ;;; (defparameter *conll-type* :starsem) (defun read-items-from-conll-file (stream &key (base 1) (offset 0) shift cycle (type *conll-type*) fields (commentp t) rawp (log t)) (cond ((and (stringp stream) (string= stream "-")) (setf stream *standard-input*)) ((or (pathnamep stream) (stringp stream)) (when (probe-file stream) (with-open-file (stream stream :direction :input) (read-items-from-conll-file stream :base base :offset offset :shift shift :cycle cycle :type type :fields fields :rawp rawp :log log)))) ((streamp stream) (loop with identifier = nil with id = base with input = nil with counter = 0 for line = (read-line stream nil nil) while line when (string= line "#SDP 2015") do (format log "SDP 2015 file format detected.~%") else when (ppcre:scan "^#[0-9]+$" line) do (setf identifier (parse-integer line :start 1)) else when (string= line "") collect (let* ((i (or identifier id)) (length (length input)) (string (format nil "~{~a~^~%~}" (nreverse input))) (tokens (conll-preprocess string :type type :fields fields :format :raw)) (negations (when (smember type '(:starsem :starsem+)) (handler-case (starsem-summarize-tokens tokens :i counter) (condition (condition) (warn (format nil "~a" condition)) nil))))) (when negations (incf counter (max (length (get-field :ncues negations)) (length (get-field :nscopes negations)) (length (get-field :nevents negations))))) (when (smember type '(:dtm :sdp :sdp+)) (cond ((null tokens) (format log "read-items-from-conll-file(): empty graph in item #~a.~%" i)) (t (setf tokens (ignore-errors (conll-expand-graph tokens))) (if (null tokens) (format log "read-items-from-conll-file(): graph error in item #~a.~%" i) (when cycle (multiple-value-bind (token path) (conll-cyclic-p tokens) (when token (format log "read-items-from-conll-file(): cycle in item #~a ~ (~a: ~{~a~^.~}).~%" i (get-field :id token) path) (setf tokens nil)))))))) (setf identifier nil) (if (smember type '(:starsem :starsem+)) (labels ((segment (chapter) (cond ;; ;; based on chapter names, carve up the identifier ;; space as follows, one segment per chapter: ;; [0-1] wisteria ;; [2-17] baskervilles ;; ((search "wisteria" chapter :end2 8) (- (parse-integer chapter :start 8) 1)) ((search "baskervilles" chapter :end2 12) (+ (parse-integer chapter :start 12) 1)) ((search "cardboard" chapter :end2 9) 16) ((search "circle" chapter :end2 6) (+ (parse-integer chapter :start 6) 16))))) (let* ((token (first tokens)) (segment (or (segment (get-field :chapter token)) 0)) (sentence (get-field :sentence token))) (setf i (+ 10000 (* segment 1000) sentence)))) (when (functionp shift) (setf i (funcall shift i)))) (incf id) (when (and (consp tokens) (null rawp)) (setf tokens (write-to-string tokens :case :downcase))) (when (and (consp negations) (null rawp)) (setf negations (write-to-string negations :case :downcase))) (setf input nil) (pairlis '(:i-id :i-wf :i-length :i-input :i-tokens :i-comment) (list (+ offset i) 1 length string tokens negations))) else do (unless (and commentp (ppcre:scan "^#" line)) (push line input)))))) (defun conll-preprocess (string &key (format :string) fields gaps (mode :erg) type) (let ((tokens (loop with lines = (ppcre:create-scanner "\\n") with columns = (ppcre:create-scanner "\\t") for token in (ppcre:split lines string) for conll = (ppcre:split columns token) when (consp fields) collect ;; ;; a custom format, with a list of column labels provided ;; (loop for key in fields for values on conll for value = (first values) when (smember key '(:id :start :end :head)) collect (cons key (parse-integer value :junk-allowed t)) else when key collect (cons key value)) else when (eq type :dtm) collect ;; ;; DELPH-IN syntactic and semantic bi-lexical dependencies (DTM) ;; (loop for key in '(:id :form :lemma :pos :let nil nil nil :head :label :pred :args) for values on conll while (or (null fields) (>= (decf fields) 0)) for value = (first values) when (smember key '(:id :head)) collect (cons key (parse-integer value :junk-allowed t)) else when (eq key :args) collect (cons key values) else when key collect (cons key value)) else when (eq type :sdp) collect ;; ;; SemEval 2014 Semantic Depedency Parsing ;; (loop for key in '(:id :form :lemma :pos :top :pred :args) for values on conll while (or (null fields) (>= (decf fields) 0)) for value = (first values) when (smember key '(:id)) collect (cons key (parse-integer value :junk-allowed t)) else when (smember key '(:top :pred)) do (setf value (string= value "+")) and collect (cons key value) else when (eq key :args) collect (cons key values) else when key collect (cons key value)) else when (eq type :sdp+) collect ;; ;; SemEval 2015 Semantic Depedency Parsing ;; (loop for key in '(:id :form :lemma :pos :top :pred :sense :args) for values on conll while (or (null fields) (>= (decf fields) 0)) for value = (first values) when (smember key '(:id)) collect (cons key (parse-integer value :junk-allowed t)) else when (smember key '(:top :pred)) do (setf value (string= value "+")) and collect (cons key value) else when (eq key :args) collect (cons key values) else when key collect (cons key value)) else when (smember type '(:starsem :starsem+)) collect ;; ;; the 2012 *SEM pseudo-CoNLL format ;; (loop for key in (append '(:chapter :sentence :id) (if (eq type :starsem+) '(:start :end :form :lemma :pos :features :head :deprel) '(:form :lemma :pos :ptb))) for value = (pop conll) when (smember key '(:sentence :id :start :end :head)) collect (cons key (parse-integer value :junk-allowed t)) into token else when key collect (cons key value) into token finally (unless (eq type :starsem+) (let* ((id (get-field :id token)) (length (length (get-field :form token)))) (set-field :start (* id 100) token) (set-field :end (+ (* id 100) length) token))) (return (acons :starsem conll token))) else when (eq type :tt) collect ;; ;; a house-internal convention: tokens, lemmas, and tags ;; (loop for key in '(:id :form :lemma :pos) for value in conll when (smember key '(:id)) collect (cons key (parse-integer value :junk-allowed t)) else when key collect (cons key value)) else when (smember type '(:conllu :conllu+)) ;; ;; the newer, CoNLL-U format; in theory, its token identifiers ;; can be ranges (`3-4') or synthesized identifiers (`5.1'), but ;; we expect it might be a little while before parsers output ;; these. (16-mar-17; oe) ;; collect (loop for key in (append '(:id) (and (eq type :conllu+) '(:start :end)) '(:form :lemma :upos :xpos :feat :head :deprel :deps :misc)) for value in conll while (or (null fields) (>= (decf fields) 0)) when (smember key '(:id :start :end :head)) collect (let ((n (ignore-errors (parse-integer value)))) (cons key (or n value))) else collect (cons key value)) else when (= (length conll) 10) collect ;; ;; the earlier, CoNLL 2007 format (CoNLL-X) ;; (loop for key in '(:id :form :lemma :cpos :pos :feat :head :deprel :phead :pdeprel) for value in conll while (or (null fields) (>= (decf fields) 0)) when (smember key '(:id :head)) collect (cons key (parse-integer value :junk-allowed t)) else collect (cons key value)) else collect ;; ;; the richer, CoNLL 2009 format ;; (loop for key in '(:id :form :lemma :plemma :pos :ppos :feat :pfeat :head :phead :deprel :pdeprel :fillpred :pred :apreds) for value in conll while (or (null fields) (>= (decf fields) 0)) when (smember key '(:id :head :phead)) collect (cons key (parse-integer value :junk-allowed t)) else collect (cons key value))))) (case format (:raw tokens) (:string (let ((forms (loop for token in tokens unless (and (null gaps) (eq mode :srg) (or (string= (get-field :stem token) "_") (string= (get-field :lemma token) "_")) (string= (get-field :pos token) "p")) collect (get-field :form token)))) (format nil "~{~a~^ ~}" forms)))))) (defun conll-parse-features (string &key filter rename) (unless (string= string "_") (loop for feature in (ppcre:split "\\|" string) for (key value) = (ppcre:split "=" feature) for symbol = (let* ((symbol (intern key :keyword)) (match (rest (assoc symbol rename :test #'eq)))) (or match symbol)) unless (smember symbol filter) collect (cons symbol value)))) (defun conll-for-pet (string &optional tagger &key gold (characterize t) (mode :erg) type stream) (declare (ignore tagger)) (loop with result with i = 0 with start = 0 with end = 1 for token in (conll-preprocess string :format :raw :type type) for id = (get-field :id token) for form = (rewrite-conll-token (get-field :form token)) for plemma = (get-field :plemma token) for lemma = (if (or gold (not plemma)) (or (get-field :stem token) (get-field :lemma token)) plemma) for cpos = (get-field :cpos token) for pos = (or cpos (if gold (get-field :pos token) (get-field :ppos token))) for feat = (if (or gold (not plemma)) (get-field :feat token) (get-field :pfeat token)) for yy = (case mode (:jacy (let* ((feat (substitute #\+ #\| feat)) (feat (substitute #\- #\= feat))) (unless #+:lkb (lkb::punctuationp form) #-:lkb nil (format nil "(~d, ~d, ~d, ~:[~*~*~;<~a:~a>, ~]~ 1, ~s, 0, \"null\"~@[, \"~a+~a\" 1.0~])" id i (+ i 1) characterize start end form pos feat)))) (:srg (unless (and (string= lemma "_") (string= pos "p")) (loop for tag in (conll-to-parole pos feat) for stem = (if (ppcre:scan "^(?:[zwf]|np|ao)" tag) (cond ((and (char= (char tag 0) #\z) (member form '("un" "una" "uno") :test #'string=)) form) (t tag)) lemma) for yy = (format nil "(~d, ~d, ~d, ~:[~*~*~;<~a:~a>, ~]~ 1, ~s \"~a\", 0, \"$~a\"~@[, ~s 1.0~])" id i (+ i 1) characterize start end stem form tag pos) when tag collect yy into result finally (when (and (null result) (not (string= pos "f"))) (format t "~&conll-for-pet(): ~ ignoring token #~a (`~a' `~a' `~a')~%" id form pos feat)) (return result)))) (t (let ((start (or (get-field :start token) i)) (end (or (get-field :end token) (+ i 1)))) (format nil "(~d, ~d, ~d, ~:[~*~*~;<~a:~a>, ~]~ 1, ~s, 0, \"null\"~@[, ~s 1.0~])" id i (+ i 1) characterize start end form pos)))) when (consp yy) do (setf result (nconc yy result)) (incf i) when (stringp yy) do (push yy result) (incf i) when yy do (setf start end) (setf end (+ start 1)) else do (incf end) finally (setf result (nreverse result)) (cond ((or (streamp stream) (eq stream t)) (format stream "~{~a~^ ~}" (nreverse result))) ((stringp stream) (with-open-file (stream stream :direction :output :if-exists :supersede) (format stream "~{~a~^ ~}" result)))) (return (values (format nil "~{~a~^ ~}" result) i)))) (defun rewrite-conll-token (token &optional pos) (let ((token (rewrite-ptb-token token pos))) ;; ;; it would seem that, the CoNLL 2007 data at least, has braces instead of ;; parentheses. i dimly recall reading something about this problem in an ;; early README file (maybe for the second PTB release), but i wonder why ;; CoNLL in 2007 should have this problem? (3-nov-10; oe) ;; (cond ((string-equal pos "{") "(") ((string-equal pos "}") ")") (t token)))) (defun conll-expand-graph (tokens) (let ((n 0)) (loop for token in tokens for pred = (get-field :pred token) when (or (equal pred "_") (null pred)) do (set-field :pred nil token) else do (incf n)) (let ((predicates (make-array n :initial-element nil))) (loop for token in tokens for pred = (get-field :pred token) for args = (get-field :args token) when (stringp pred) do (set-field :opred pred token) (cond ((and (not (zerop (length pred))) (char= (char pred 0) #\^)) (set-field :top t token) (set-field :pred (subseq pred 1) token)) (t (set-field :top nil token))) do (loop for i from 0 for arg in args unless (string= arg "_") do (push (cons arg token) (aref predicates i)))) (loop with i = 0 for token in tokens for pred = (get-field :pred token) for args = (and pred (aref predicates i)) when args do (set-field :pred pred token) else do (set-field :pred nil token) do (set-field :edges args token) (setf token (delete :args token :key #'first)) (when pred (incf i))) tokens))) (defun conll-cyclic-p (tokens) (labels ((walk (token &optional history path) (if (smember token history) (nreverse path) (loop for (role . value) in (get-field :edges token) thereis (walk value (cons token history) (cons role path)))))) (loop for token in tokens for path = (walk token) when path return (values token path)))) (defun conll-analyze (tokens) (let ((mark (gensym "")) (edges 0) roots singletons roles) (labels ((walk (token &optional recursep) (let ((edges (get-field :edges token))) (unless (or (eq (get-field :mark token) mark) (and (null edges) (null recursep))) (set-field :mark mark token) (loop for (label . token) in edges do (pushnew label roles :test #'string-equal) (walk token t)))))) (loop for token in tokens do (walk token)) (loop for token in tokens do (incf edges (length (get-field :edges token))) when (get-field :top token) do (push token roots) else unless (eq (get-field :mark token) mark) do (push token singletons)) (values (conll-cyclic-p tokens) edges (nreverse roots) (nreverse singletons) (sort roles #'string<))))) (defun conll-output (item &key (type :sdp) stringp (stream t) header footer lemma pos sense (amp "\\amp")) (unless (smember type '(:sdp :latex :epe)) (error "conll-output(): unknown output format `~(~a~)'." type)) (when header (if (stringp header) (write-string header stream) (case type (:latex (format stream "\\documentclass[10pt,a1paper,landscape]{article}~%~ \\usepackage[T1]{fontenc}~%~ \\usepackage[utf8]{inputenc}~%~ \\usepackage{tikz-dependency}~%~ \\usepackage[a1paper,landscape]{geometry}~%~%~ \\begin{document}~%~%"))))) (let ((n 0) (tokens (get-field :i-tokens item))) ;; ;; _hack_ ;; see whether the tokens string looks much like an association list; if ;; so, parse that list. ;; (let* ((n (when (stringp tokens) (- (length tokens) 1)))) (when (and n (< 3 n) (char= (schar tokens 0) #\() (char= (schar tokens 1) #\() (char= (schar tokens (- n 1)) #\)) (char= (schar tokens n) #\))) (setf tokens (ignore-errors (read-from-string tokens))))) (when (and (loop for token in tokens always (null (get-field :edges token))) (loop for token in tokens thereis (get-field :deprel token))) (loop for token in tokens for head = (let* ((head (get-field :head token)) (head (if (stringp head) (ignore-errors (parse-integer head)) head))) (find head tokens :key #'(lambda (foo) (get-field :id foo)))) for label = (get-field :deprel token) ;; ;; _fix_me_ ;; per discussion with milan stranka and the EPE organizers, we make ;; all dependents of the ‘pseudo’ node #0 top nodes, no matter their ;; incoming dependency type. (18-apr-17; oe) ;; when (and (null head) #+:null (string-equal label "root")) do (set-field :top t token) when (and head label) do (let ((edge (cons label token))) (if (get-field :edges head) (push edge (get-field :edges head)) (set-field :edges (list edge) head))))) (loop for token in tokens when (get-field :edges token) do (set-field :rank n token) (incf n)) (loop for token in tokens do (set-field :predicates (make-array n) token)) (loop for token in tokens for rank = (get-field :rank token) for edges = (get-field :edges token) do (loop for (role . target) in edges do (setf (aref (get-field :predicates target) rank) role))) (case type (:sdp (format stream "#~a~%" (get-field :i-id item))) (:latex (format stream "\\begin{dependency}[edge above, edge slant=0.15ex, ~ edge unit distance=2ex]~% ~ \\begin{deptext}[column sep=1ex]~% ") (loop for token in tokens unless (eq token (first tokens)) do (format stream " ~a " amp) do (format stream "~a" (latex-escape-string (get-field :form token) :escape '(#\&)))) (format stream "\\\\~%") (when lemma (loop for token in tokens unless (eq token (first tokens)) do (format stream " ~a " amp) do (format stream "~a" (latex-escape-string (get-field :lemma token) :escape '(#\&)))) (format stream "\\\\~%")) (when pos (loop for token in tokens unless (eq token (first tokens)) do (format stream " ~a " amp) do (format stream "~a" (latex-escape-string (get-field :pos token) :escape '(#\&)))) (format stream "\\\\~%")) (when sense (loop for token in tokens unless (eq token (first tokens)) do (format stream " ~a " amp) do (format stream "~a" (latex-escape-string (get-field :sense token) :escape '(#\& #\\)))) (format stream "\\\\~%")) (format stream "\\end{deptext}~%")) (:epe (format stream "{~@[\"id\": ~a, ~]~@[\"string\": ~a, ~]\"nodes\": [" (get-field :i-id item) (and stringp (json-escape-string (get-field :i-input item)))))) (loop for token in tokens for predicates = (get-field :predicates token) for i from 1 do (case type (:sdp (format stream "~a~c~a~c~a~c~a~c~:[-~;+~]~c~:[-~;+~]~c~a" i #\tab (get-field :form token) #\tab (or (get-field :lemma token) (get-field :stem token)) #\tab (get-field :pos token) #\tab (get-field :top token) #\tab (get-field :pred token) #\tab (or (get-field :sense token) "_")) (loop for i from 0 below n do (format stream "~c~a" #\tab (or (aref predicates i) "_"))) (terpri stream)) (:latex (when (get-field :top token) (format stream "\\deproot{~a}{top}~%" (get-field :id token))) (loop for (label . target) in (get-field :edges token) do (format stream "\\depedge{~a}{~a}{~a}~%" (get-field :id token) (get-field :id target) (latex-escape-string label)))) (:epe (format stream "~:[, ~;~]{\"id\": ~a" (eq token (first tokens)) (or (get-field :id token) i)) (let ((form (get-field :form token))) (when form (format stream ", \"form\": \"~a\"" (json-escape-string form)))) (let ((start (get-field :start token)) (end (get-field :end token))) (when (and start end) (format stream ", \"start\": ~a, \"end\": ~a" start end))) (when (get-field :top token) (format stream ", \"top\": true")) (let ((properties (get-field :properties token))) (loop for key in '(:lemma :pos :upos :xpos) for value = (get-field key token) unless (or (null value) (member key properties :key #'first :test #'eq)) do (push (cons key value) properties)) (loop with features = (let ((features (or (get-field :feat token) (get-field :features token)))) (if (consp features) features (conll-parse-features features))) for feature in features for key = (first feature) unless (member key properties :key #'first :test #'eq) do (push feature properties)) (when properties (set-field :properties properties token) (format stream ", \"properties\": {") (loop for property in properties for key = (first property) do (when (smember key '(:lemma :pos :upos :xpos)) (setf key (string-downcase key))) (format stream "~:[, ~;~]\"~a\": \"~a\"" (eq property (first properties)) (json-escape-string key) (json-escape-string (rest property)))) (format stream "}"))) (loop with edges = (get-field :edges token) for edge in edges initially (when edges (format stream ", \"edges\": [")) do (format stream "~:[, ~;~]{\"label\": ~s, \"target\": ~a}" (eq edge (first edges)) (first edge) (get-field :id (rest edge))) finally (when edges (format stream "]"))) (let ((negation (get-field :negation token))) (when negation (format stream ", \"negation\": ~a" negation))) (format stream "}")))) (case type (:sdp (terpri stream)) (:latex (format stream "\\end{dependency}~%")) (:epe (format stream "]}")))) (when footer (if (stringp footer) (write-string footer stream) (case type (:latex (format stream "\\end{document}~%")))))) (defun tt-output (item &key (type :tt) (stream t)) (unless (eq type :tt) (error "tt-output(): unknown output format `~(~a~)'." type)) (format stream "#~a~%" (get-field :i-id item)) (loop for token in (get-field :i-tokens item) for i from 1 do (format stream "~a~c~a~c~a~c~a~%" i #\tab (get-field :form token) #\tab (get-field :lemma token) #\tab (get-field :pos token))) (terpri stream)) (defparameter *conll-parole-a-map* '((("postype=ordinal" . "o") ("postype=qualificative" . "q") 0) (0) (("gen=f" . "f") ("gen=m" . "m") ("gen=c" . "c") ("gen=c" . "0") 0) (("num=s" . "s") ("num=p" . "p") ("num=c" . "0") ("num=c" . "n") 0) (("posfunction=participle" . "p") 0))) (defparameter *conll-parole-c-map* '((("postype=coordinating" . "c") ("postype=subordinating" . "s") 0))) (defparameter *conll-parole-d-map* '((("postype=article" . "a") ("postype=demonstrative" . "d") ("postype=exclamative" . "e") ("postype=indefinite" . "i") ("postype=numeral" . "n") ("postype=possessive" . "p") ("postype=interrogative" . "t") 0) (("person=1" . "1") ("person=2" . "2") ("person=3" . "3") 0) (("gen=f" . "f") ("gen=m" . "m") ("gen=c" . "c") ("gen=c" . "n") ("gen=c" . "0") 0) (("num=s" . "s") ("num=p" . "p") ("num=c" . "0") ("num=c" . "n") 0) (("possessornum=s" . "s") ("possessornum=c" . "c") ("possessornum=p" . "p") 0))) (defparameter *conll-parole-f-map* '((("punct=exclamationmark" . "a") ("punct=colon" . "d") ("punct=quotation" . "e") ("punct=hyphen" . "g") ("punct=slash" . "h") ("punct=etc" . "s") ("punct=semicolon" . "x") ("punct=mathsign" . "z") 0) (("punctenclose=open" . "a") ("punctenclose=close" . "t") ("punctenclose=close" . "c") 0))) (defparameter *conll-parole-n-map* '((("postype=common" . "c") 0) (("gen=f" . "f") ("gen=m" . "m") ("gen=c" . "c") ("gen=c" . "0") 0) (("num=s" . "s") ("num=p" . "p") ("num=c" . "n") ("num=c" . "0") 0) (0) (0) (0))) (defparameter *conll-parole-p-map* '((("postype=personal" . "p") ("postype=demonstrative" . "d") ("postype=possessive" . "x") ("postype=indefinite" . "i") ("postype=interrogative" . "t") ("postype=relative" . "r") ("postype=numeral" . "n") ("postype=exclamative" . "e") 0) (("person=1" . "1") ("person=2" . "2") ("person=3" . "3") 0) (("gen=f" . "f") ("gen=m" . "m") ("gen=c" . "0") ("gen=c" . "c") ("gen=c" . "n") 0) (("num=s" . "s") ("num=p" . "p") ("num=c" . "0") ("num=c" . "n") 0) (("case=nominative" . "n") ("case=accusative" . "a") ("case=dative" . "d") ("case=oblique" . "o") 0) (("possessornum=s" . "s") ("possessornum=p" . "p") ("possessornum=c" . "c") 0) (("polite=yes" . "p") 0))) (defparameter *conll-parole-v-map* '((("postype=auxiliary" . "a") ("postype=main" . "m") ("postype=semiauxiliary" . "s") 0) (("mood=indicative" . "i") ("mood=subjunctive" . "s") ("mood=imperative" . "m") ("mood=infinitive" . "n") ("mood=gerund" . "g") ("mood=pastparticiple" . "p") 0) (("tense=present" . "p") ("tense=imperfect" . "i") ("tense=future" . "f") ("tense=past" . "s") ("tense=conditional" . "c") 0) (("person=1" . "1") ("person=2" . "2") ("person=3" . "3") 0) (("num=s" . "s") ("num=p" . "p") ("num=c" . "0") 0) (("gen=f" . "f") ("gen=m" . "m") ("gen=c" . "0") 0))) (defparameter *conll-parole-z-map* '((("postype=currency" . "m") ("postype=percentage" . "p") 0))) (defun conll-to-parole (pos features &key (filter t)) (labels ((cross-product (lists) (if (null (rest lists)) (loop for foo in (first lists) collect (list foo)) (loop with rests = (cross-product (rest lists)) for foo in (first lists) nconc (loop for bar in rests collect (cons foo bar))))) (fields (map) (let ((fields (loop for field in map collect (loop for entry in field when (and (consp entry) (search (first entry) features :test #'string=)) collect (rest entry) into matches finally (return (or matches (last field))))))) (loop for values in (cross-product fields) for tag = (format nil "~a~{~a~}" pos values) when (or (null filter) #+:lkb (let ((symbol (intern (string-upcase tag) :lkb))) (gethash symbol lkb::*lexical-rules*)) #-:lkb t) collect tag)))) (cond ((string= pos "a") (fields *conll-parole-a-map*)) ((string= pos "c") (fields *conll-parole-c-map*)) ((string= pos "d") (cond ((search "postype=numeral" features) (list "z")) (t (fields *conll-parole-d-map*)))) ((string= pos "f") (cond ((string= features "punct=period") (list "fp")) ((string= features "punct=bracket|punctenclose=open") (list "fpa")) ((string= features "punct=bracket|punctenclose=close") (list "fpt")) ((string= features "punct=questionmark|punctenclose=open") (list "fia")) ((string= features "punct=questionmark|punctenclose=close") (list "fit")) ((string= features "punct=comma") (list "fc")) ((string= features "punct=bracket|punctenclose=open") (list "fca")) ((string= features "punct=bracket|punctenclose=close") (list "fct")) (t (fields *conll-parole-f-map*)))) ((string= pos "i") (list "i")) ((string= pos "n") (cond ((string= features "postype=proper|gen=c|num=c") (list "np00000")) (t (fields *conll-parole-n-map*)))) ((string= pos "p") (fields *conll-parole-p-map*)) ((string= pos "r") (cond ((string= features "postype=negative") (list "rn")) ((string= features "_") (list "rg")))) ((string= pos "s") (cond ((string= features "postype=preposition|gen=m|num=s|contracted=yes") (list "spcms")) #+:null ((string= features "postype=preposition|gen=m|num=p|contracted=yes") (list "spcmp")) ((string= features "postype=preposition|gen=c|num=c") (list "sps00")))) ((string= pos "v") (fields *conll-parole-v-map*)) ((string= pos "w") (list "w")) ((string= pos "z") (fields *conll-parole-z-map*))))) (defun item-to-conll (item &key (stream t) gold result-id) (when (stringp stream) (return-from item-to-conll (with-open-file (stream stream :direction :output :if-exists :supersede) (item-to-conll item :stream stream :gold gold :result-id result-id)))) (when (stringp gold) (return-from item-to-conll (with-open-file (gold gold :direction :output :if-exists :supersede) (item-to-conll item :stream stream :gold gold :result-id result-id)))) (labels ((heads (node &optional heads) (if (null (node-daughters node)) (let* ((head (first heads))) (list (cons node (if (eq head node) (second heads) head)))) (loop with head = (node-head node) with heads = (if (eq head (first heads)) heads (cons head heads)) for node in (node-daughters node) append (heads node heads))))) (let* ((input (get-field :i-input item)) (conll (or (get-field :conll item) (let ((conll (conll-preprocess input :format :raw))) (nconc item (acons :conll conll nil)) conll))) (results (get-field :results item)) (result (when (numberp result-id) (loop for result in results when (eql (get-field :result-id result) result-id) return result))) (result (or result (first results))) (derivation (let ((derivation (get-field :derivation result)) (*package* (find-package :tsdb))) (when (stringp derivation) (setf derivation (read-from-string derivation)) (setf (get-field :derivation result) derivation)) derivation)) (node (let ((*derivations-ignore-tokens-p* nil)) (derivation-to-node derivation))) (heads (heads node)) (total 0) (correct 0)) (labels ((nucleus (node) (setf (node-nucleus node) (if (= (+ (node-from node) 1) (node-to node)) (node-to node) (loop with punctuation = '(#\` #\' #\" #\( #\) #\[ #\] #\{ #\} #\. #\? #\. #\, #\; #\:) for i from (node-from node) to (- (node-to node) 1) for token = (nth i conll) for form = (get-field :form token) unless (loop for c across form always (member c punctuation :test #'char=)) return (+ i 1) finally (return (node-to node))))))) (loop for (dependent . head) in heads unless (node-nucleus dependent) do (nucleus dependent) unless (or (null head) (node-nucleus head)) do (nucleus head))) (loop for token in conll for id = (get-field :id token) for nodes = (let ((node (first (first heads)))) (if (and node (> id (node-from node)) (<= id (node-to node))) (pop heads) nodes)) for head = (if (rest nodes) (node-nucleus (rest nodes)) 0) do ;; ;; the scorer, it seems, uses the HEAD column; for easy eyeballing ;; of results, put the original HEAD into the PHEAD column. ;; (when stream (format stream "~a~{ ~a~} ~a ~a _ _ ~ ~a _ _~%" id (loop for key in '(:form :stem :plemma :pos :ppos :feat :pfeat) collect (get-field key token)) head (get-field :head token) (get-field :fillpred token))) (when (streamp gold) (format gold "~a~{ ~a~} ~a ~a~{ ~a~}~%" id (loop for key in '(:form :stem :plemma :pos :ppos :feat :pfeat) collect (get-field key token)) (get-field :head token) (get-field :phead token) (loop for key in '(:deprel :pdeprel :fillpred :pred :apreds) collect (get-field key token)))) (incf total) (when (equal head (get-field :head token)) (incf correct))) (when stream (terpri stream)) (when (streamp gold) (terpri gold)) (pairlis '(:total :head) (list total correct))))) (defun conll-item-enhancer (item) (nconc item (loop with ranks for result in (get-field :results item) for id = (get-field :result-id result) for score = (let* ((score (item-to-conll item :stream nil :result-id id)) (total (get-field :total score)) (head (get-field :head score))) (when (and total head) (divide head total))) unless (numberp score) do (format t "conll-item-enhancer(): no score on item # ~a (result # ~a).~%" (get-field :i-id item) (get-field :result-id result)) and return nil else do (push (acons :asu score result) ranks) finally (return (let ((ranks (sort ranks #'> :key #'(lambda (result) (get-field :asu result))))) (acons :ranks (loop with top = (get-field :asu (first ranks)) for rank in ranks while (= (get-field :asu rank) top) collect (acons :rank 1 rank)) nil)))))) (defun conll-convert (input output &key (from :conllu) (to :epe) raw) ;; ;; some sanity checks ;; (when (eq from :conllx) (setf from :conll)) (unless (smember from '(:corenlp :conll :conllu :dsynt :bpa :sdp :sdp+)) (error "conll-convert(): unknown input format ‘~(~a~)’." from)) (unless (smember to '(:epe :conllu+ :starsem+)) (error "conll-convert(): unknown output format ‘~(~a~)’." to)) (unless (or (smember from '(:conllu+ :starsem+)) (and raw (not (string= raw "")) (probe-file raw))) (error "conll-convert(): missing or invalid ‘raw’ text file~@[ ‘~a’~]." (and (not (string= raw "")) raw))) ;; ;; make sure .input. and .output. are valid streams ;; (when (and (stringp input) (string= input "-")) (setf input *standard-input*)) (when (and (stringp output) (string= output "-")) (setf output *standard-output*)) (cond ((or (pathnamep input) (stringp input)) (if (probe-file input) (with-open-file (stream input :direction :input) (conll-convert stream output :from from :to to :raw raw)) (error "conll-convert(): invalid input file ‘~a’." (namestring input)))) ((or (pathnamep output) (stringp output)) (with-open-file (stream output :direction :output :if-exists :supersede) (conll-convert input stream :from from :to to :raw raw))) (t ;; ;; now, read a stream of input graphs, if necessary align against raw text ;; and augment each token with start and end off-sets, and write out again ;; (loop with user = (current-user) with pid = (current-pid) with tokens = (format nil "~a/.conll.tokens.~a.~a.~a" (tmp :tsdb) user pid (string-downcase (gensym ""))) with resa = (format nil "~a/.conll.resa.~a.~a.~a" (tmp :tsdb) user pid (string-downcase (gensym ""))) with resaalign = (logon-file "bin" "resaalign" :string) with command = (format nil "exec ~a -f TAB -r '~a' -i '~a' -o '~a'" resaalign (namestring raw) tokens resa) with n = 0 with items = (cond ((eq from :corenlp) (read-items-from-conll-file input :type :conll :fields '(:id :form :lemma :pos nil :head :deprel) :rawp t)) ((eq from :dsynt) (read-items-from-conll-file input :type :conll :fields '(:id :sense :lemma nil :pos nil :features nil :governors nil :dependencies)' :rawp t)) ((eq from :bpa) (read-items-from-conll-file input :type :conll :fields '(:id :vn :sense nil nil nil :features nil :governors nil :dependencies)' :rawp t)) ((smember from '(:conll :conllu :sdp :sdp+)) (read-items-from-conll-file input :type from :rawp t))) initially (unless (smember from '(:conllu+ :dsynt :bpa :starsem+ :epe)) (with-open-file (stream tokens :direction :output :if-exists :supersede) (loop for item in items do (loop for token in (get-field :i-tokens item) do (format stream "~a~%" (get-field :form token)) (incf n)) (terpri stream))) (run-process command :wait t :output nil :error-output nil) (unless (probe-file resa) (error "conll-convert(): failure invoking ‘~a’" resaalign)) (with-open-file (stream resa) (loop with items = items with tokens = (get-field :i-tokens (first items)) for line = (read-line stream nil nil) for fields = (and line (ppcre:split "\\t" line)) while line when (null fields) do (pop items) (setf tokens (get-field :i-tokens (first items))) else when (string= (third fields) "TOK") do (set-field :start (first fields) (first tokens)) (set-field :end (second fields) (first tokens)) (pop tokens) (decf n)))) (when (or (eq from :dsynt) (eq from :bpa)) (let ((ssynts (read-epe raw))) (loop for item in items for ssynt in ssynts do (conll-barcelona-expand item ssynt :from from)))) (unless (zerop n) (error "conll-convert(): mysterious mismatch in token alignment (~a)." n)) for item in items do (conll-output item :stream output :type to) (terpri output))))) #+:null (loop for i in '("00" "01" "02" "03" "04" "05" "06" "07" "08" "09" "10" "11" "12" "13" "14" "15" "16" "17" "18" "19" "20" "21" "22" "23" "24") do (do-import-items (format nil "/home/oe/src/ptb/mrg/~a" i) (format nil "test/wsj~a" i) :format :ptb)) #+:null (loop for i from 2 to 21 for shift = #'(lambda (id) (+ id 20000000 (* i 100000))) do (do-import-items (format nil "/home/oe/src/conll09/en/09~2,'0d.txt" i) (format nil "test/conll~2,'0d" i) :format :conll :shift shift)) #+:null (loop for i from 2 to 24 for shift = #'(lambda (id) (+ id 20000000 (* i 100000))) do (do-import-items (format nil "/home/oe/src/conll07/09~2,'0d.txt" i) (format nil "test/conll~2,'0d" i) :format :conll :shift shift))