(in-package :tsdb) ;;; ;;; [incr tsdb()] --- Competence and Performance Profiling Environment ;;; Copyright (c) 2008 -- 2009 Erik Velldal (erikve@ifi.uio.no) ;;; ;;; This program is free software; you can redistribute it and/or modify it ;;; under the terms of the GNU Lesser General Public License as published by ;;; the Free Software Foundation; either version 2.1 of the License, or (at ;;; your option) any later version. ;;; ;;; This program is distributed in the hope that it will be useful, but WITHOUT ;;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ;;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ;;; License for more details. ;;; (defparameter *conll-scopes* :hscopes) (defparameter *conll-cues* :hcues) ;;(defparameter *conll-scope-system* :ranker) ;; :rules (defun read-items-from-conll-file (file &key (base 1) (offset 0) shift) (when (probe-file file) (with-open-file (stream file :direction :input) (loop with id = base with input = nil for line = (read-line stream nil nil) while line when (string= line "") collect (let ((i id) (length (length input)) (string (format nil "~{~a~^~%~}" (nreverse input)))) (when (functionp shift) (setf i (funcall shift i))) (incf id) (setf input nil) (pairlis '(:i-id :i-wf :i-length :i-input) (list (+ offset i) 1 length string))) else do (push line input))))) (defun conll-preprocess (string &key (format :string) gaps (mode :erg)) (let ((tokens (loop for token in (ppcre:split "\\n" string) for conll = (ppcre:split "\\t" token) collect (loop for key in '(:id :form :lemma :plemma :pos :ppos :feat :pfeat :head :phead :deprel :pdeprel :fillpred :pred :apreds) for value in conll when (smember key '(:id :head :phead)) collect (cons key (parse-integer value :junk-allowed t)) else collect (cons key value))))) (case format (:raw tokens) (:string (let ((forms (loop for token in tokens unless (and (null gaps) (eq mode :srg) (string= (get-field :lemma token) "_") (string= (get-field :pos token) "p")) collect (get-field :form token)))) (format nil "~{~a~^ ~}" forms)))))) (defun conll-for-pet (string &optional tagger &key gold (characterize t) (mode :erg)) (declare (ignore tagger)) (loop with result with i = 0 with start = 0 with end = 1 for token in (conll-preprocess string :format :raw) for id = (get-field :id token) for form = (get-field :form token) for lemma = (if gold (get-field :lemma token) (get-field :plemma token)) for pos = (if gold (get-field :pos token) (get-field :ppos token)) for feat = (if gold (get-field :feat token) (get-field :pfeat token)) for yy = (case mode (:jacy (let* ((feat (substitute #\+ #\| feat)) (feat (substitute #\- #\= feat))) (unless #+:lkb (lkb::punctuationp form) #-:lkb nil (format nil "(~d, ~d, ~d, ~:[~*~*~;<~a:~a>, ~]~ 1, \"~a\", 0, \"null\"~@[, \"~a+~a\" 1.0~])" id i (+ i 1) characterize start end form pos feat)))) (:srg (unless (and (string= lemma "_") (string= pos "p")) (loop for tag in (conll-to-parole pos feat) for stem = (if (ppcre:scan "^(?:[zwf]|np|ao)" tag) (cond ((and (char= (char tag 0) #\z) (member form '("un" "una" "uno") :test #'string=)) form) (t tag)) lemma) for yy = (format nil "(~d, ~d, ~d, ~:[~*~*~;<~a:~a>, ~]~ 1, \"~a\" \"~a\", 0, \"$~a\"~@[, ~s 1.0~])" id i (+ i 1) characterize start end stem form tag pos) when tag collect yy into result finally (when (and (null result) (not (string= pos "f"))) (format t "~&conll-for-pet(): ~ ignoring token #~a (`~a' `~a' `~a')~%" id form pos feat)) (return result)))) (t (format nil "(~d, ~d, ~d, ~:[~*~*~;<~a:~a>, ~]~ 1, \"~a\", 0, \"null\"~@[, ~s 1.0~])" id i (+ i 1) characterize i (+ i 1) form pos))) when (consp yy) do (setf result (nconc yy result)) (incf i) when (stringp yy) do (push yy result) (incf i) when yy do (setf start end) (setf end (+ start 1)) else do (incf end) finally (return (values (format nil "~{~a~^ ~}" (nreverse result)) i)))) (defparameter *conll-parole-a-map* '((("postype=ordinal" . "o") ("postype=qualificative" . "q") 0) (0) (("gen=f" . "f") ("gen=m" . "m") ("gen=c" . "c") ("gen=c" . "0") 0) (("num=s" . "s") ("num=p" . "p") ("num=c" . "0") ("num=c" . "n") 0) (("posfunction=participle" . "p") 0))) (defparameter *conll-parole-c-map* '((("postype=coordinating" . "c") ("postype=subordinating" . "s") 0))) (defparameter *conll-parole-d-map* '((("postype=article" . "a") ("postype=demonstrative" . "d") ("postype=exclamative" . "e") ("postype=indefinite" . "i") ("postype=numeral" . "n") ("postype=possessive" . "p") ("postype=interrogative" . "t") 0) (("person=1" . "1") ("person=2" . "2") ("person=3" . "3") 0) (("gen=f" . "f") ("gen=m" . "m") ("gen=c" . "c") ("gen=c" . "n") ("gen=c" . "0") 0) (("num=s" . "s") ("num=p" . "p") ("num=c" . "0") ("num=c" . "n") 0) (("possessornum=s" . "s") ("possessornum=c" . "c") ("possessornum=p" . "p") 0))) (defparameter *conll-parole-f-map* '((("punct=exclamationmark" . "a") ("punct=colon" . "d") ("punct=quotation" . "e") ("punct=hyphen" . "g") ("punct=slash" . "h") ("punct=etc" . "s") ("punct=semicolon" . "x") ("punct=mathsign" . "z") 0) (("punctenclose=open" . "a") ("punctenclose=close" . "t") ("punctenclose=close" . "c") 0))) (defparameter *conll-parole-n-map* '((("postype=common" . "c") 0) (("gen=f" . "f") ("gen=m" . "m") ("gen=c" . "c") ("gen=c" . "0") 0) (("num=s" . "s") ("num=p" . "p") ("num=c" . "n") ("num=c" . "0") 0) (0) (0) (0))) (defparameter *conll-parole-p-map* '((("postype=personal" . "p") ("postype=demonstrative" . "d") ("postype=possessive" . "x") ("postype=indefinite" . "i") ("postype=interrogative" . "t") ("postype=relative" . "r") ("postype=numeral" . "n") ("postype=exclamative" . "e") 0) (("person=1" . "1") ("person=2" . "2") ("person=3" . "3") 0) (("gen=f" . "f") ("gen=m" . "m") ("gen=c" . "0") ("gen=c" . "c") ("gen=c" . "n") 0) (("num=s" . "s") ("num=p" . "p") ("num=c" . "0") ("num=c" . "n") 0) (("case=nominative" . "n") ("case=accusative" . "a") ("case=dative" . "d") ("case=oblique" . "o") 0) (("possessornum=s" . "s") ("possessornum=p" . "p") ("possessornum=c" . "c") 0) (("polite=yes" . "p") 0))) (defparameter *conll-parole-v-map* '((("postype=auxiliary" . "a") ("postype=main" . "m") ("postype=semiauxiliary" . "s") 0) (("mood=indicative" . "i") ("mood=subjunctive" . "s") ("mood=imperative" . "m") ("mood=infinitive" . "n") ("mood=gerund" . "g") ("mood=pastparticiple" . "p") 0) (("tense=present" . "p") ("tense=imperfect" . "i") ("tense=future" . "f") ("tense=past" . "s") ("tense=conditional" . "c") 0) (("person=1" . "1") ("person=2" . "2") ("person=3" . "3") 0) (("num=s" . "s") ("num=p" . "p") ("num=c" . "0") 0) (("gen=f" . "f") ("gen=m" . "m") ("gen=c" . "0") 0))) (defparameter *conll-parole-z-map* '((("postype=currency" . "m") ("postype=percentage" . "p") 0))) (defun conll-to-parole (pos features &key (filter t)) (labels ((cross-product (lists) (if (null (rest lists)) (loop for foo in (first lists) collect (list foo)) (loop with rests = (cross-product (rest lists)) for foo in (first lists) nconc (loop for bar in rests collect (cons foo bar))))) (fields (map) (let ((fields (loop for field in map collect (loop for entry in field when (and (consp entry) (search (first entry) features :test #'string=)) collect (rest entry) into matches finally (return (or matches (last field))))))) (loop for values in (cross-product fields) for tag = (format nil "~a~{~a~}" pos values) when (or (null filter) #+:lkb (let ((symbol (intern (string-upcase tag) :lkb))) (gethash symbol lkb::*lexical-rules*)) #-:lkb t) collect tag)))) (cond ((string= pos "a") (fields *conll-parole-a-map*)) ((string= pos "c") (fields *conll-parole-c-map*)) ((string= pos "d") (cond ((search "postype=numeral" features) (list "z")) (t (fields *conll-parole-d-map*)))) ((string= pos "f") (cond ((string= features "punct=period") (list "fp")) ((string= features "punct=bracket|punctenclose=open") (list "fpa")) ((string= features "punct=bracket|punctenclose=close") (list "fpt")) ((string= features "punct=questionmark|punctenclose=open") (list "fia")) ((string= features "punct=questionmark|punctenclose=close") (list "fit")) ((string= features "punct=comma") (list "fc")) ((string= features "punct=bracket|punctenclose=open") (list "fca")) ((string= features "punct=bracket|punctenclose=close") (list "fct")) (t (fields *conll-parole-f-map*)))) ((string= pos "i") (list "i")) ((string= pos "n") (cond ((string= features "postype=proper|gen=c|num=c") (list "np00000")) (t (fields *conll-parole-n-map*)))) ((string= pos "p") (fields *conll-parole-p-map*)) ((string= pos "r") (cond ((string= features "postype=negative") (list "rn")) ((string= features "_") (list "rg")))) ((string= pos "s") (cond ((string= features "postype=preposition|gen=m|num=s|contracted=yes") (list "spcms")) #+:null ((string= features "postype=preposition|gen=m|num=p|contracted=yes") (list "spcmp")) ((string= features "postype=preposition|gen=c|num=c") (list "sps00")))) ((string= pos "v") (fields *conll-parole-v-map*)) ((string= pos "w") (list "w")) ((string= pos "z") (fields *conll-parole-z-map*))))) (defun item-to-conll (item &key (stream t) gold result-id) (when (stringp stream) (return-from item-to-conll (with-open-file (stream stream :direction :output :if-exists :supersede) (item-to-conll item :stream stream :gold gold :result-id result-id)))) (when (stringp gold) (return-from item-to-conll (with-open-file (gold gold :direction :output :if-exists :supersede) (item-to-conll item :stream stream :gold gold :result-id result-id)))) (labels ((heads (node &optional heads) (if (null (node-daughters node)) (let* ((head (first heads))) (list (cons node (if (eq head node) (second heads) head)))) (loop with head = (node-head node) with heads = (if (eq head (first heads)) heads (cons head heads)) for node in (node-daughters node) append (heads node heads))))) (let* ((input (get-field :i-input item)) (conll (or (get-field :conll item) (let ((conll (conll-preprocess input :format :raw))) (nconc item (acons :conll conll nil)) conll))) (results (get-field :results item)) (result (when (numberp result-id) (loop for result in results when (eql (get-field :result-id result) result-id) return result))) (result (or result (first results))) (derivation (let ((derivation (get-field :derivation result)) (*package* (find-package :tsdb))) (when (stringp derivation) (setf derivation (read-from-string derivation)) (setf (get-field :derivation result) derivation)) derivation)) (node (let ((*derivations-ignore-tokens-p* nil)) (derivation-to-node derivation))) (heads (heads node)) (total 0) (correct 0)) (labels ((nucleus (node) (setf (node-nucleus node) (if (= (+ (node-from node) 1) (node-to node)) (node-to node) (loop with punctuation = '(#\` #\' #\" #\( #\) #\[ #\] #\{ #\} #\. #\? #\. #\, #\; #\:) for i from (node-from node) to (- (node-to node) 1) for token = (nth i conll) for form = (get-field :form token) unless (loop for c across form always (member c punctuation :test #'char=)) return (+ i 1) finally (return (node-to node))))))) (loop for (dependent . head) in heads unless (node-nucleus dependent) do (nucleus dependent) unless (or (null head) (node-nucleus head)) do (nucleus head))) (loop for token in conll for id = (get-field :id token) for nodes = (let ((node (first (first heads)))) (if (and node (> id (node-from node)) (<= id (node-to node))) (pop heads) nodes)) for head = (if (rest nodes) (node-nucleus (rest nodes)) 0) do ;; ;; the scorer, it seems, uses the HEAD column; for easy eyeballing ;; of results, put the original HEAD into the PHEAD column. ;; (when stream (format stream "~a~{ ~a~} ~a ~a _ _ ~ ~a _ _~%" id (loop for key in '(:form :lemma :plemma :pos :ppos :feat :pfeat) collect (get-field key token)) head (get-field :head token) (get-field :fillpred token))) (when (streamp gold) (format gold "~a~{ ~a~} ~a ~a~{ ~a~}~%" id (loop for key in '(:form :lemma :plemma :pos :ppos :feat :pfeat) collect (get-field key token)) (get-field :head token) (get-field :phead token) (loop for key in '(:deprel :pdeprel :fillpred :pred :apreds) collect (get-field key token)))) (incf total) (when (equal head (get-field :head token)) (incf correct))) (when stream (terpri stream)) (when (streamp gold) (terpri gold)) (pairlis '(:total :head) (list total correct))))) (defun conll-item-enhancer (item) (nconc item (loop with ranks for result in (get-field :results item) for id = (get-field :result-id result) for score = (let* ((score (item-to-conll item :stream nil :result-id id)) (total (get-field :total score)) (head (get-field :head score))) (when (and total head) (divide head total))) unless (numberp score) do (format t "conll-item-enhancer(): no score on item # ~a (result # ~a).~%" (get-field :i-id item) (get-field :result-id result)) and return nil else do (push (acons :asu score result) ranks) finally (return (let ((ranks (sort ranks #'> :key #'(lambda (result) (get-field :asu result))))) (acons :ranks (loop with top = (get-field :asu (first ranks)) for rank in ranks while (= (get-field :asu rank) top) collect (acons :rank 1 rank)) nil)))))) #+:null (eval-when #+:ansi-eval-when (:load-toplevel :execute) #-:ansi-eval-when (load eval) (setf (gethash :i-input *statistics-readers*) #'(lambda (string) (let ((*package* (find-package :tsdb))) (ptb-preprocess string :plainp t))))) #+:null (loop for i in '("00" "01" "02" "03" "04" "05" "06" "07" "08" "09" "10" "11" "12" "13" "14" "15" "16" "17" "18" "19" "20" "21" "22" "23" "24") do (do-import-items (format nil "/home/oe/src/ptb/mrg/~a" i) (format nil "test/wsj~a" i) :format :ptb)) #+:null (loop for i from 2 to 21 for shift = #'(lambda (id) (+ id 20000000 (* i 100000))) do (do-import-items (format nil "/home/oe/src/conll09/en/09~2,'0d.txt" i) (format nil "test/conll~2,'0d" i) :format :conll :shift shift)) (defun read-classified-items (target gold &key (ignore *redwoods-test-ignore-profiles*) enhancers);;(list #'downcase-token-stems))) (purge-profile-cache gold) (purge-profile-cache target) (let* ((*redwoods-task* :classify) (items (analyze gold :score target :scorep t :commentp t :tokensp t));;; :thorough '(:derivation))) (items (loop for item in items unless (member (get-field :source item) ignore :test #'string=) collect item))) ;;;fixme: consider letting `analyze()' do the ignoring bit above. (loop for enhancer in enhancers do (loop for item in items do (call-raw-hook enhancer item))) (loop for item in items for tokens = (get-field :i-tokens item) for (cues scopes) = (loop for score in (get-field :classes item) for type = (get-field :learner score) for start = (get-field :score-start score) for end = (get-field :score-end score) if (and (string= "token" type) tokens) do (loop with val = (read-from-string (get-field :score score) nil 0) for token in tokens when (and (= start (get-field :start token)) (= end (get-field :end token))) do (if (get-field :class token) (setf (get-field :class token) val) (nconc token (acons :class val nil)))) else if (string= "cue" type) collect (list (cons :id (get-field :score-id score)) (cons :span (list start end))) into c-tmp else if (string= "scope" type) collect (list (cons :id (get-field :score-id score)) (cons :span (list start end))) into s-tmp finally (return (list c-tmp s-tmp))) do (if (test-field *conll-cues* item) (setf (get-field *conll-cues* item) cues) (nconc item (acons *conll-cues* cues nil))) (if (test-field *conll-scopes* item) (setf (get-field *conll-scopes* item) scopes) (nconc item (acons *conll-scopes* scopes nil)))) items)) (defun apply-cue-rules (items &key (add-scope t)) (let ((new-cues) (processed) (id 0)) (labels ((ensure-class-1 (token) (unless (> (get-field :class token) 0) (setf (get-field :class token) 1))) (ensure-class-0 (token) (unless (<= (get-field :class token) 0) (setf (get-field :class token) 0))) (add-to-processed (tokens) (loop for token in tokens do (push (list (get-field :start token) (get-field :end token)) processed))) (processed-p (token) (member (list (get-field :start token) (get-field :end token)) processed :test #'equal)) (stem= (token &rest strings) (let ((stem (get-field :stem token))) (find-if #'(lambda (string) (string= stem string)) strings))) ;;; (form= (token &rest strings) ;;; (let ((form (string-downcase ;;; (get-field :form token)))) ;;; (find-if #'(lambda (string) ;;; (string= form string)) ;;; strings))) (add-cue (span tokens) (push (list (cons :id id) (cons :span span)) new-cues) (mapc #'ensure-class-1 tokens) (add-to-processed tokens) (incf id)) (update-item-and-reset (item) (if (test-field *conll-cues* item) (setf (get-field *conll-cues* item) new-cues) (nconc item (acons *conll-cues* new-cues nil))) (if add-scope (insert-default-scopes item) (when (test-field *conll-scopes* item) (setf (get-field *conll-scopes* item) nil))) ;;; reset: (setq processed nil) (setq new-cues nil) (setq id 0))) (case *conll-cues* ;;;; ------- SPECULATION ------- (:hcues (loop for item in items for tokens = (get-field :i-tokens item) for cues = (get-field *conll-cues* item) when cues;;fixme; also want to add cues in "all-negative" contexts? do (loop for i from 0 for token in tokens ;;; for prev = (and (> i 0) (nth (- i 1) tokens)) for next1 = (nth (+ i 1) tokens) for next2 = (nth (+ i 2) tokens) ;;; for cue-p = (> (get-field :class token) 0) for start = (get-field :start token) for end = (get-field :end token) with input = (get-field :i-input item) ;;; unless (or (processed-p token) (not cue-p)) unless (processed-p token) ;;; "indicate that" if (and (stem= token "indicate") (stem= next1 "that")) do (add-cue (list start (get-field :end next1)) (list token next1)) ;;; "whether or not" else if (and (stem= token "whether") (stem= next1 "or") (stem= next2 "not")) do (add-cue (list start (get-field :end next2)) (list token next1 next2)) ;;; "may(,) or may not" else if (stem= token "may") do (multiple-value-bind (from to) (ppcre:scan "[,]? or may not" input :start end) (when from (let ((tokens (spanned-tokens from to item))) (add-cue (list start to) (cons token tokens))))) ;;; " known/clear/evident/understood/exclude" else if (and (stem= token "not") (stem= next1 "know" "clear" "evident" "understand" "exclude")) do (add-cue (list start (get-field :end next1)) (list token next1)) ;;; "not ///" ;;; else if (and (stem= token "know" "clear" "evident" ;;; "understand" "exclude") ;;; (stem= prev "not") ;;; (not (processed-p prev))) ;;; do (add-cue (list (get-field :start prev) end) ;;; (list prev token)) ;;; " evidence/proof/guarantee" else if (and (stem= token "no") (stem= next1 "evidence" "proof" "guarantee")) do (add-cue (list start (get-field :end next1)) (list token next1)) ;;; "no //" ;;; else if (and (stem= prev "no") ;;; (stem= token "evidence" "proof" "guarantee") ;;; (not (processed-p prev))) ;;; do (add-cue (list (get-field :start prev) end) ;;; (list prev token)) ;;; " be exclude" else if (and (stem= token "cannot") (stem= next1 "be") (stem= next2 "exclude")) do (add-cue (list start (get-field :end next2)) (list token next1 next2)) ;;; " exclude" else if (and (stem= token "cannot") (stem= next1 "exclude")) do (add-cue (list start (get-field :end next1)) (list token next1)) ;;; "cannot " ;;; else if (and (stem= prev "cannot") (stem= token "exclude") ;;; (not (processed-p prev))) ;;; do (add-cue (list (get-field :start prev) end) ;;; (list prev token)) ;;; raises questions ;;; raising the intriguing possibility else if (stem= token "raise") ;;; only needed for non-stoplist model do (let* ((right-pos (loop ;; fixme; use find-if for token in (subseq tokens (+ i 1)) for j from (+ i 1) when (stem= token "possibility" "question" "issue" "hypothesis") return j)) (right (and right-pos (nth right-pos tokens)))) (if (and right (< (- right-pos i) 6)) ;;6 pos. limit. (progn (add-cue (list start (get-field :end right)) (subseq tokens i (+ right-pos 1)))) (progn (ensure-class-0 token) ;;(probably a false positive!) (add-to-processed (list token))))) ;;; "raise the .* possibility / question / issue / hypothesis else if (stem= token "possibility" "question" "issue" "hypothesis") do (multiple-value-bind (from to) (ppcre:scan "\\b[Rr]ais(e|es|ed|ing) (\\w+ )?(\\w+ )?(\\w+ )?" input :end start) (when from (add-cue (list from end) (spanned-tokens from end item)))) ;;; "either / or" else if (stem= token "either") do (let* ((or-token (find-if #'(lambda (token) (stem= token "or")) (subseq tokens (+ i 1))))) (when (and or-token (or (> (get-field :class token) 0) (> (get-field :class or-token) 0))) (add-cue (list start end (get-field :start or-token) (get-field :end or-token)) (list token or-token)))) ;;; any cue without a special rule: when (and (> (get-field :class token) 0) ;;check if cue again since we might have changed class (not (processed-p token))) do (add-cue (list start end) (list token)) finally (update-item-and-reset item)))) ;;;; ------- NEGATION ------- (:ncues (loop for item in items for tokens = (get-field :i-tokens item) for cues = (get-field *conll-cues* item) when cues;;fixme; also want to add cues in "all-negative" contexts? do (loop for i from 0 for token in tokens ;;; for prev = (and (> i 0) (nth (- i 1) tokens)) for next1 = (nth (+ i 1) tokens) for cue-p = (> (get-field :class token) 0) for start = (get-field :start token) for end = (get-field :end token) with input = (get-field :i-input item) unless (processed-p token) ;;; unless (or (processed-p token) (not cue-p)) ;;; "rather than" if (and (stem= token "rather") (stem= next1 "than")) do (add-cue (list start (get-field :end next1)) (list token next1)) ;;; "can/could " else if (and (stem= token "can" "could") (stem= next1 "not")) do (add-cue (list start (get-field :end next1)) (list token next1)) ;;; else if (and (stem= prev "can" "could") (stem= token "not")) ;;; do (add-cue (list (get-field :start prev) end) ;;; (list prev token)) ;;; " longer" else if (and (stem= token "no") (stem= next1 "longer")) do (add-cue (list start (get-field :end next1)) (list token next1)) ;;; " of" else if (and (stem= token "instead") (stem= next1 "of")) do (add-cue (list start (get-field :end next1)) (list token next1)) ;;; the * exception of else if (stem= token "with") do (multiple-value-bind (from to) (ppcre:scan "[Ww]ith the ([^ ]+ )?exception of" input :start start) (when from ;;; DEBUGGING: ;;; (print item)(terpri) ;;; (print (spanned-tokens from to item))(terpri) ;;; (print new-cues)(terpri) ;;; (print processed)(terpri) ;;; (print id)(terpri) ;;; (let ((tokens (spanned-tokens from to item))) (add-cue (list from to) (spanned-tokens from to item)))) ;;; ;;; with the * of ;;; else if (and (stem= token "exception") ;;; (stem= next1 "of")) ;;; do (multiple-value-bind (from to) ;;; (ppcre:scan "[Ww]ith the ([^ ]+ )?exception of" ;;; input :end (get-field :end next1)) ;;; (when from ;;; (add-cue (list from to) ;;; (spanned-tokens from to item)))) ;;; " / nor" else if (stem= token "neither") do (let* ((nor-token (find-if #'(lambda (token) (stem= token "nor")) (subseq tokens (+ i 1))))) (when nor-token (add-cue (list start end (get-field :start nor-token) (get-field :end nor-token)) (list token nor-token)))) ;;; " !neither / nor" else if (stem= token "no" "not") do (let* ((seen-neither-p) (nor-token (find-if #'(lambda (token) (when (stem= token "neither") (setq seen-neither-p t)) (and (stem= token "nor") ;;already tied to "neither"? (not (processed-p token)) (not seen-neither-p))) (subseq tokens (+ i 1))))) (when nor-token (add-cue (list start end (get-field :start nor-token) (get-field :end nor-token)) (list token nor-token)))) ;;; any cue without a special rule: when (and (> (get-field :class token) 0) ;;check if cue again as we might have changed class (not (processed-p token))) do (add-cue (list start end) (list token)) finally (update-item-and-reset item))))))) items) (defun export-to-bio-xml (&key items name pattern file source (task :task2) supress-cues-p (ignore *redwoods-test-ignore-profiles*) (dtd :bioscope) supersede) (format t "~&export-to-bio-xml(): no. items; ~a, name; ~a, source; ~a, ignore; ~a ~%" (length items) name source ignore) (loop for profile in (cond ((and items (null pattern) (null name)) (list (or (get-field :vp (first items)) (get-field :source (first items))))) ((not (null pattern)) (mapcar #'(lambda(db) (get-field :database db)) (find-tsdb-directories *tsdb-home* :pattern pattern :name name))) ((and name (listp name)) name) ((stringp name) (list name)) (t (error "export-to-bio-xml(): ~ name, pattern or items argument missing."))) for path = (or file (make-pathname :directory (find-tsdb-directory profile) :name (format nil "~(~a~).~(~a~)" dtd task) :type "xml")) for itemz = (when (or supersede (not (probe-file path))) (or items (read-classified-items profile source :ignore ignore))) when itemz do (let ( ;;; (path (or file (format nil "/tmp/bio_~a.~a.xml" ;;; (current-time :long nil) ;;; (current-time :long :short)))) (entities '(("&" . "&") ("'" . "'") ;; ("’" . "'") ("‘" . "'") ;; ("’" . "'") ("\"" . """) ("<" . "<") (">" . ">")))) (labels ((escape-string (bar) (loop with foo = bar for (char . esc) in entities do (setq foo (cl-ppcre:regex-replace-all char foo esc)) finally (return foo)))) (with-open-file (str path :direction :output :if-exists :supersede) (loop initially (format str "~&~%") (format str "~&~%") (format str "~&~%" (current-time :long t) (current-user) (current-pid)) (format str "~&~5t~%") with prev-did with prev-dpart for item in itemz for source = (get-field :source item) for ignorep = (and source (member source ignore :test #'string=)) for ids = (get-field :identifiers item) for sid = (get-field :sid ids) ;;sentence id ;;; the doc ids are not required according to the dtd, ;;; but announced to be required for evaluation nonetheless. for did = (get-field :did ids) ;;document id for did-type = (get-field :did-type ids) for dtype = (get-field :dtype ids) ;; e.g. article ;; not required by the dtd: for dpart = (get-field :dpart ids) ;; e.g text for cues = (get-field *conll-cues* item) for scopes = (get-field *conll-scopes* item) ;;; for document boundaries: when (and (not (equalp prev-did did)) (not ignorep)) do (unless (null prev-did) (format str "~&~20t~%") (format str "~&~10t~%")) (format str "~&~10t~%" dtype) (format str "~&~15t~a~%" did-type did) (setq prev-dpart nil) ;;;reset document part. ;;; for document part boundaries: when (and (not (equalp prev-dpart dpart)) (not ignorep)) do (unless (not prev-dpart) (format str "~&~20t~%")) (format str "~&~20t~%" dpart) ;;; sentence level: unless ignorep ;;fixme: test all the ignore stuff! if (eq task :task1) do (format str "~&~25t" sid cues) else if (eq task :task2) do (format str "~&~25t" sid) unless ignorep if (or (null cues) (and (eq task :task1) supress-cues-p)) do (format str "~a" (escape-string (get-field :i-input item))) else do (let* ((input (get-field :i-input item)) (annotated "") scope-open-tags scope-close-tags cue-open-tags cue-close-tags tags sorted-tags) ;;produce the scope tags: (unless (eq task :task1) (loop for scope in scopes for (start end) = (get-field :span scope) collect (pairlis '(:position :form) (list start (format nil "" (get-field :id scope)))) into open collect (pairlis '(:position :form) (list end "")) into close finally (setq scope-open-tags open scope-close-tags close))) ;;produce the cue tags: (loop for cue in cues do (loop for (start end) in ;;(to handle multiword-cues) (list-to-pairs (get-field :span cue)) do (push (pairlis '(:position :form) (list start (case task (:task1 "") (:task2 (case *conll-cues* (:hcues (format nil "" (get-field :id cue))) (:ncues (format nil "" (get-field :id cue)))))))) cue-open-tags) (push (pairlis '(:position :form) (list end (case task (:task1 "") (:task2 "")))) cue-close-tags)) finally (setq cue-open-tags (nreverse cue-open-tags) cue-close-tags (nreverse cue-close-tags))) ;;; add the open/close tags for cues and scopes to token list: ;; (order matters!) (setq tags (append scope-open-tags cue-open-tags cue-close-tags scope-close-tags)) (setq sorted-tags (stable-sort tags #'< :key #'(lambda (x) (get-field :position x)))) ;;; produce the new annotated string: (loop for tag in sorted-tags with start = 0 for end = (get-field :position tag) do (setq annotated (concatenate 'string annotated (escape-string (subseq input start end)) (get-field :form tag))) (setq start end) finally (setq annotated (concatenate 'string annotated (escape-string (subseq input start))))) (format str "~a" annotated)) unless ignorep do (format str "~%") (setq prev-did did prev-dpart dpart) finally (format str "~&~20t~%") (format str "~&~10t~%") (format str "~&~5t~%") (format str "~&~%"))))) finally (return path))) (defun conll10-score (xml-gold &key (source "conll10") (task :task2) name pattern (ignore *redwoods-test-ignore-profiles*) supersede (file "/tmp/conll10-score") apply-rules-p) (with-open-file (out file :direction :output :if-exists :supersede) (let ((*redwoods-task* :classify) (scores (format nil "~a/.scorer.~a.~a" (tmp :redwoods) (current-user) (current-pid))) (command (format nil "java -jar ~a/uio/bioscope/src/scorer_task~d.jar ~a" (getenv "LOGONROOT") (if (eq task :task1) 1 2) xml-gold))) (loop for profile in (cond ((not (null pattern)) (mapcar #'(lambda(db) (get-field :database db)) (find-tsdb-directories *tsdb-home* :pattern pattern :name name))) ((and name (listp name)) name) ((stringp name) (list name)) (t (error "conll10-score():~ name or pattern argument missing."))) for file = (make-pathname :directory (find-tsdb-directory profile) :name (format nil "bioscope.~(~a~)" task) :type "xml") for items = (when (or supersede (not (probe-file file))) (if apply-rules-p (re-classify-profile profile :source source :export-xml-p nil :tasks (list task) :ignore ignore) (read-classified-items profile source :ignore ignore))) for xml = (export-to-bio-xml :items items :task task :name (if supersede nil profile) :supersede supersede :ignore ignore :file file) for foo = (format nil "~a ~a" command xml) for scoredp = (and (zerop (run-process foo :wait t :output scores :if-output-exists :supersede)) (probe-file scores)) if (not scoredp) do (format t "~&conll10-score(): unable to score ~a~%" profile) else do (with-open-file (stream scores :direction :input) (loop for line = (read-line stream nil nil) while line do (ppcre:register-groups-bind (f-score) ;;fixme: record TP/FN/FP as well! ("^F-measure:\\s+(\\d+\.\\d+)" line) (format out "~&~a ~a~%" f-score profile)))) do (purge-profile-cache profile))))) (defun re-classify-profile (classified &key target (source "conll10") (skeleton source) export-xml-p threshold filter-fn ;;;nfoldp (tasks '(:task2)) score-p gold-p supress-cues-p ;;default-scope-p (scopes :default) ;; :dep-ranker / :ranker / :default / :rules (scopes-fall-back :default) ;; :default / rules (ignore *redwoods-test-ignore-profiles*) supersede trace) (when target (when supersede (let ((path (find-tsdb-directory target))) (when (fad:file-exists-p path) (ignore-errors (fad:delete-directory-and-files path))))) (tsdb :create target :skeleton skeleton) (purge-test-run target :action :purge)) (let* ((*redwoods-test-ignore-profiles* ignore) (*redwoods-task* :classify) (items (if gold-p (analyze classified :commentp t :tokensp t) ;;; :thorough '(:derivation)) (read-classified-items classified source))) (cache (when target (create-cache target :verbose t :protocol :raw)))) ;;; (test-fold-iids ;;; (when (and (eq scopes :ranker) ;;; nfoldp) ;;; (if (stringp nfoldp) ;;; (index-test-fold-iids nfoldp) ;;; ;; for cases where we're re-classifying a gold profile, which ;;; ;; won't have any 'fold' information, we can supply another ;;; ;; experiment profile instead. ;;; (index-test-fold-iids classified))))) (when filter-fn (setq items (loop for item in items when (funcall filter-fn item) collect item))) (when threshold ;; Check for w*f>threshold instead of w*f>0 (loop ;; when classifying cues. for item in items do (loop for token in (get-field :i-tokens item) for class = (get-field :class token) when class do (setf (get-field :class token) (- class threshold))))) ;;; find multiword cues: (unless gold-p (apply-cue-rules items :add-scope nil) ) ;;; determine scopes: (case scopes (:rules (apply-scope-rules items));; :trace trace)) ((:dep-ranker :ranker) (apply-scope-ranker items ;;:test-fold-iids test-fold-iids :fall-back scopes-fall-back :dep-features-p (eq scopes :dep-ranker))) (:default (loop for item in items when (get-field *conll-cues* item) do (insert-default-scopes item))) (t (error "re-classify-profile(): invalid scope type: ~a" scopes))) ;;; (if default-scope-p ;;; (loop for item in items ;;; when (get-field *conll-cues* item) ;;; do (insert-default-scopes item)) ;;; (apply-scope-system items)) (when (and target score-p) ;;; (let* ((fold nil) ;;; (scores ;;; (summarize-classification items source))) ;;; (nconc fold (acons :f-extras scores nil)) ;;; (write-fold target fold :cache cache)) ; (flush-cache cache :verbose t)) (loop for item in items do (write-classification-scores item target cache)) (flush-cache cache :verbose t) (purge-profile-cache target)) (purge-profile-cache source) (purge-profile-cache classified) ;; fixme: make this more general: (when export-xml-p (loop for task in tasks for file = (make-pathname :directory (find-tsdb-directory (or target classified)) :name (format nil "bioscope.~(~a~)" task) :type "xml") do (export-to-bio-xml :items items :task task :supersede t :supress-cues-p supress-cues-p :file file :ignore ignore))) items)) (defun report-scope-errors (profile source &key ignore ;; '("bsa" "_bsa" "bsr" "_bsr") (output "/tmp/scope_errors") nfoldp with-derivation-only-p commas-only-p gold-p (scopes :default)) ;;:rules/:ranker (let ((items (re-classify-profile profile :source source :scopes scopes :nfoldp nfoldp :export-xml-p nil :tasks '(:task2) :ignore ignore :gold-p gold-p)) (gitems (analyze source :commentp t :tokensp t :thorough (when with-derivation-only-p '(:derivation))))) ;; filter (when ignore (setq items (loop for item in items unless (member (get-field :source item) ignore :test #'string=) collect item)) (setq gitems (loop for item in gitems unless (member (get-field :source item) ignore :test #'string=) collect item))) ;; filtering out items without a derviation, -usefull when ;; dealing with :scopes = :ranker. (when with-derivation-only-p (setq items (loop for item in items when (get-field :derivation (first (get-field :results item))) collect item)) (setq gitems (loop for item in gitems when (get-field :derivation (first (get-field :results item))) collect item))) (with-open-file (out output :direction :output :if-exists :supersede) (loop with counts = (make-hash-table :test #'equalp) for item in items for gitem in gitems for cues = (get-field *conll-cues* item) for gcues = (get-field *conll-cues* gitem) for scopes = (get-field *conll-scopes* item) for gscopes = (get-field *conll-scopes* gitem) when (and cues gcues) do (loop for cue in cues for span = (get-field :span cue) for tokens = (spanned-tokens (first (get-field :span cue)) (second (get-field :span cue)) item) for id = (get-field :id cue) for scope = (find-if #'(lambda (x) (equalp (get-field :id x) id)) scopes) for scope-span = (get-field :span scope) for gcue = (find-if #'(lambda (x) (equalp (get-field :span x) span)) gcues) for gspan = (get-field :span gcue) for gid = (get-field :id gcue) for gscope = (find-if #'(lambda (x) (equalp (get-field :id x) gid)) gscopes) for gscope-span = (get-field :span gscope) ;; when (and gcue (not (equalp gscope-span scope-span))) when (and gcue (not (equalp gscope-span scope-span)) (or (not commas-only-p) (eq #\, (aref (get-field :i-input item) (second gscope-span)))) ;;; DEBUGGING: (filter out known errors in annotation) (not (eq :S415.16 (get-field :sid (get-field :identifiers gitem)))) (not (eq :S571.2 (get-field :sid (get-field :identifiers gitem))))) ;;; we have a correct cue but mismatched scope do ;;;debugging: ;;; (format t "~&sentence: ~a~%" (get-field :i-input item)) ;;; (format t "~&cue span: ~a~%" span) ;;; (format t "~&scope span: ~a~%" scope-span) (let* ((tagged-cues (mapcar #'(lambda (x) (format nil "~a (~a)" (get-field :stem x) (get-field :tag x))) tokens)) (input (get-field :i-input item)) (new (concatenate 'string (subseq input 0 (first scope-span)) "{" (subseq input (first scope-span) (first span)) "<" (subseq input (first span) (second span)) ">" (subseq input (second span) (second scope-span))"}" (subseq input (second scope-span)))) (new-gold (concatenate 'string (subseq input 0 (first gscope-span)) "{" (subseq input (first gscope-span) (first gspan)) "<" (subseq input (first gspan) (second gspan)) ">" (subseq input (second gspan) (second gscope-span))"}" (subseq input (second gscope-span))))) (format out "~&~a:~%gold: ~a~%pred: ~a~%" (get-field :i-id item) new-gold new) (format out "~&cues: ~{~a ~}~%" tagged-cues) (terpri out) (loop for cue in tagged-cues do (incf (gethash cue counts 0))))) finally ;; print summary of cue counts w/POS (loop for cue being each hash-key using (hash-value count) in counts collect (cons cue count) into list finally (mapcar #'(lambda (x) (format out "~&~a ~d~%" (car x) (cdr x))) (sort list #'> :key #'cdr))))))) (defun report-scope-boundary-on-comma (source &key ignore (output "/tmp/commas")) (labels ((print-sorted-hash (table stream) (loop for cue being each hash-key using (hash-value count) in table collect (cons cue count) into list finally (mapcar #'(lambda (x) (format stream "~&~a ~d~%" (car x) (cdr x))) (sort list #'> :key #'cdr))))) (let ((gitems (analyze source :commentp t :tokensp t))) (with-open-file (out output :direction :output :if-exists :supersede) (when ignore (setq gitems (loop for item in gitems unless (member (get-field :source item) ignore :test #'string=) collect item))) (loop ;;; with debug = 0 with cue-counts = (make-hash-table :test #'equalp) with cue-next-counts = (make-hash-table :test #'equalp) with cue-next-pos-counts = (make-hash-table :test #'equalp) with pos-counts = (make-hash-table :test #'equalp) with next-counts = (make-hash-table :test #'equalp) with next-pos-counts = (make-hash-table :test #'equalp) with prev-pos-counts = (make-hash-table :test #'equalp) with prev-next-pos-counts = (make-hash-table :test #'equalp) with pos-pos-counts = (make-hash-table :test #'equalp) for item in gitems for cues = (get-field *conll-cues* item) for scopes = (get-field *conll-scopes* item) do (loop for scope in scopes for span = (get-field :span scope) for id = (get-field :id scope) when (eq #\, (aref (get-field :i-input item) (second span))) do (let* ((cue (find-if #'(lambda (x) (equalp (get-field :id x) id)) cues)) (cspan (get-field :span cue)) (tokens (spanned-tokens (first cspan) (second cspan) item)) (prev (find-if #'(lambda (x) (= (get-field :end x) (second span))) (get-field :i-tokens item))) (next (find-if #'(lambda (x) (= (get-field :start x) (+ (second span) 2))) (get-field :i-tokens item)))) ;;; (incf debug) ;;; (when (< debug 2) ;;; (break) ;;; (setq %cue cue %scope scope %cspan cspan ;;; %tokens tokens %next next %item item)) (when next (incf (gethash (format nil "~a (~a)" (get-field :stem next) (get-field :tag next)) next-counts 0)) (incf (gethash (get-field :tag next) next-pos-counts 0))) (when prev (incf (gethash (get-field :tag prev) prev-pos-counts 0)) (when next (incf (gethash (format nil "~a, ~a" (get-field :tag prev) (get-field :tag next)) prev-next-pos-counts 0)))) (loop for token in tokens do (incf (gethash (format nil "~a (~a)" (get-field :stem token) (get-field :tag token)) cue-counts 0)) (incf (gethash (get-field :tag token) pos-counts 0)) (incf (gethash (format nil "<~a> , ~a" (get-field :tag token) (get-field :tag next)) pos-pos-counts 0)) (incf (gethash (format nil "<~a> , ~a" (get-field :stem token) (get-field :stem next)) cue-next-counts 0)) (incf (gethash (format nil "<~a> , ~a" (get-field :stem token) (get-field :tag next)) cue-next-pos-counts 0))))) finally ;; print summary of cue counts w/POS (format out "~&Cues (POS) preceeding a scope-closing comma:~%") (print-sorted-hash cue-counts out) ;; print summary of cue POS counts (terpri out) (format out "~&POS of cues preceeding a scope-closing comma:~%") (print-sorted-hash pos-counts out) ;; print summary of right-of-comma counts w/POS (terpri out) (format out "~&Lemmas (POS) following a scope-closing comma:~%") (print-sorted-hash next-counts out) ;; print summary of right-of-comma POS counts (terpri out) (format out "~&POS following a scope-closing comma:~%") (print-sorted-hash next-pos-counts out) ;; print summary of left-of-comma POS counts (terpri out) (format out "~&POS preceeding a scope-closing comma:~%") (print-sorted-hash prev-pos-counts out) ;; print summary of left-of- and right-of-comma POS counts (terpri out) (format out "~&POS-before-comma, POS-after-comma:~%") (print-sorted-hash prev-next-pos-counts out) ;; print summary of cue-POS + right-of-comma POS counts (terpri out) (format out "~& , POS-after-comma:~%") (print-sorted-hash pos-pos-counts out) ;; print summary of cue + right-of-comma lemma counts (terpri out) (format out "~& , lemma-after-comma:~%") (print-sorted-hash cue-next-counts out) ;; print summary of cue + right-of-comma POS counts (terpri out) (format out "~& , POS-after-comma:~%") (print-sorted-hash cue-next-pos-counts out)))))) (defun index-cues (items &key (form :stem)) "Returns a hash of cue word frequencies." (loop with cues = (make-hash-table :test #'equal) for item in items do (loop for (from to) in (list-cue-spans item) do (loop for token in (spanned-tokens from to item) do (incf (gethash (get-field form token) cues 0)))) finally (return cues))) (defun invert-index-cues (index) "Indexes cue words on frequencies, given the output of index-cues()." (loop with inverted = (make-hash-table :test #'eql) for cue being each hash-key in index using (hash-value count) do (push cue (gethash count inverted nil)) finally (return inverted))) (defun index-non-cues (items cue-index &key (form :stem)) "Counts non-cue usage of cue-words, given the output of index-cues()." (loop with non-cues = (make-hash-table :test #'equal) for item in items for spans = (list-cue-spans item) do (loop for token in (get-field :i-tokens item) for start = (get-field :start token) for end = (get-field :end token) when (and (gethash (get-field form token) cue-index) (not (some #'(lambda (span) (and (<= (first span) start) (>= (second span) end))) spans))) do (incf (gethash (get-field form token) non-cues 0))) finally (return non-cues))) (defun index-all-cue-usage (items cue-index &key (form :stem)) "Counts all usage of cue-words, given the output of index-cues()." (loop with counts = (make-hash-table :test #'equal) for item in items do (loop for token in (get-field :i-tokens item) when (gethash (get-field form token) cue-index) do (incf (gethash (get-field form token) counts 0))) finally (return counts))) ;;; (loop for count being each hash-key in counts using (hash-value cues) ;;; collect (cons count cues) into list ;;; finally (mapc #'(lambda (cue-counts) ;;; (format t "~&~a (~a): ~{~a~^, ~}~%" ;;; (first cue-counts) (length (cdr cue-counts)) (cdr cue-counts))) ;;; (sort list #'> :key #'car))) ;;; collects all tokens classified as cues: ;;;(setq items (read-scored-connll10-items ;;; "experiments/rr-svm_genia=fw0_sw0_tw0_fl2_sl2_tl2_fr1_sr1_tr1_malt=patr__path__drel__ltrip__ptrip__xle=xdep__sub__co__adv__adj__opt=lin_it5.0e+3_b1_pc100" ;;; "conll10")) ;;; ;;;(setq positives ;;; (loop for item in items ;;; append (loop for score in (get-field :classes item) ;;; for val = (or (read-from-string (get-field :score score) nil) 0) ;;; when (and (> val 0) ;;; (string= (get-field :learner score) "token")) ;;; collect (loop for token in (get-field :i-tokens item) ;;; when (and (= (get-field :start token) (get-field :score-start score)) ;;; (= (get-field :end token) (get-field :score-end score))) ;;; do (return token))))) ;;; collects all gold tokens classified as cues: ;;(setq gitems (analyze "conll10" :commentp t)) ;; ;;;(setq gpositives ;;; (loop for item in gitems ;;; for cue-spans = (list-cue-spans item) ;;; when cue-spans ;;; append (loop for (from to) in cue-spans ;;; append (spanned-tokens from to item)))) ;;;;;;;;;;; utilities (defun print-cues-and-scopes (item) (loop with i-input = (get-field :i-input item) ;; for type in '(:neg :spec) for cues = (get-field *conll-cues* item) for scopes = (get-field *conll-scopes* item) when cues do (loop for cue in cues for id = (get-field :id cue) for scope = (find-if #'(lambda (x) (equalp id (get-field :id x))) scopes) do (loop with spans = (copy-list (get-field :span cue)) for from = (pop spans) for to = (pop spans) while (and from to) do (format t "~&~a: " (subseq i-input from to)) (format t "~a (~a)~%" (subseq i-input (first (get-field :span scope)) (second (get-field :span scope))) *conll-cues*))))) ;; multiword cues can be of several types: ;; ;; -a list of spans (for non-adjacent tokens) ;; -one span including several (adjacent) tokens (defun list-multiword-cues (items &optional (form :form)) ;:stem (loop with cindex = (make-hash-table :test #'equalp);;cue level with tindex = (make-hash-table :test #'equalp);;token level with mwccount = 0 ;; multiword cue count with mwtcount = 0 ;; multiword token count with ccount = 0 ;; cue count with tcount = 0 ;; token count for item in items for cues = (get-field *conll-cues* item) when cues do (incf ccount (length cues)) (loop for cue in cues for span = (get-field :span cue) for from = (first span) for to = (second span) for tokens = (spanned-tokens from to item) when (> (length span) 2) ;; multiword (non-adjacent) do (loop with spans = (cddr (copy-list span)) for from = (pop spans) for to = (pop spans) while (and from to) do (setq tokens (append (spanned-tokens from to item) tokens))) do (setq tokens (reverse (mapcar #'(lambda (token) (get-field form token)) tokens))) (incf tcount (length tokens)) when (= 0 (length tokens)) do (format t "~&WTF?!?!~%") (print item) when (> (length tokens) 1) do (incf (gethash tokens cindex 0)) (incf mwccount) (mapcar #'(lambda (x) (incf (gethash x tindex 0))) tokens) (incf mwtcount (length tokens))) finally (format t "~&MULTIWORD CUES (~a out of ~a cues in total):~%" mwccount ccount) (loop for cue being the hash-keys in cindex using (hash-value count) collect (cons cue count) into list finally (mapcar #'(lambda (pair) (format t "~&~{~a~^ ~} (~a)~%" (car pair) (cdr pair))) (sort list #'> :key #'cdr))) (format t "~%MULTIWORD CUE TOKENS (~a out of ~a cue tokens in total):~%" mwtcount tcount) (loop for cue being the hash-keys in tindex using (hash-value count) collect (cons cue count) into list finally (mapcar #'(lambda (pair) (format t "~&~a (~a)~%" (car pair) (cdr pair))) (sort list #'> :key #'cdr))))) ;;;;(list-multiword-cues (append bsa bsp) :stem) (defparameter *conll-hcues-stop-list* (let ((stems (list "," "-" "a" "an" "as" "be" "for" "interesting" "intriguing" "of" "raise" "that" "the" "to" "with")) (table (make-hash-table :test #'equal))) (mapc #'(lambda (stem) (setf (gethash stem table) t)) stems) table)) (defparameter *conll-ncues-stop-list* (let ((stems (list "with" "the" "of" "than" "(" "can" "could" "notable")) (table (make-hash-table :test #'equal))) (mapc #'(lambda (stem) (setf (gethash stem table) t)) stems) table)) (defun index-test-fold-iids (profile) (let ((folds (reverse (select (list "f-tests") '(:string) "fold" nil profile))) (index (make-hash-table :test #'equalp))) (loop for fold in folds for i from 0 for string = (get-field :f-tests fold) for iids = (loop with start = 0 for (val end) = (multiple-value-bind (val end) (read-from-string string nil nil :start start) (list val end)) while val do (setq start end) collect val into list finally (return list)) do (mapc #'(lambda (id) (setf (gethash id index) i)) iids)) ;;; (format t "~&index-test-fold-iids(): indexed ~a items for profile '~a'~%" ;;; (hash-table-count index) profile) ;;; debugging index)) ;;;(progn ;;; (setq obsat (analyze "obsat" :commentp t :tokensp t)) ;;; (setq obspt (analyze "obspt" :commentp t :tokensp t)) ;;; (setq obsrt (analyze "obsrt" :commentp t :tokensp t)) ;;; (setq obsae (analyze "obsae" :commentp t :tokensp t)) ;;; (setq obspe (analyze "obspe" :commentp t :tokensp t)) ;;; (setq obsre (analyze "obsre" :commentp t :tokensp t)) ;;; (setq obs (append obsat obspt obsrt obsae obspe obsre)) ;;; (list-multiword-cues obs :stem)) ;;; producing iids of test folds for jonathon: ;;; ;;;(let* ((path "exp/spec/bsap/") ;;; (name "plain_svm_genia=fw0_sw0_tw__fl__sl3_tl__fr2_sr3_tr__malt=patr__path__drel__ltrip__ptrip__xle=xdep__sub__co__adv__adj__opt=lin_it5.0e+3_b1.0_t0.001_pc100") ;;; (profile (format nil "~a~a" path name)) ;;; (output (format nil "~a~a~a" *tsdb-home* path "test-fold-iids"))) ;;; (with-open-file (out output :direction :output ;;; :if-exists :supersede) ;;; (let ((folds (reverse ;;; (select (list "f-tests") ;;; '(:string) "fold" nil profile)))) ;;; (loop ;;; for fold in folds ;;; for i from 1 ;;; for string = (get-field :f-tests fold) ;;; for ids = ;;; (loop ;;; with start = 0 ;;; for (val end) = ;;; (multiple-value-bind (val end) ;;; (read-from-string string nil nil :start start) ;;; (list val end)) ;;; while val ;;; do (setq start end) ;;; collect val into ids ;;; finally (return ids)) ;;; do (write ids :stream out :pretty nil) ;;; (terpri out))))) (defun report-cue-errors (profile gold &key output) (let (stream) (unwind-protect (let* ((items (read-classified-items profile gold)) (gitems (analyze gold :commentp t :tokensp t)) (index (index-cues gitems)) (no-index (index-non-cues gitems index))) (apply-cue-rules items :add-scope nil) (setq stream (if output (open output :direction :output :if-exists :supersede) *standard-output*)) (loop for case in '(:fp :fn) do (loop with total = 0 with counts = (make-hash-table :test #'equal) with mwct = 0 with mwc = 0 for item in items for gitem in gitems for cues = (get-field *conll-cues* item) for gcues = (get-field *conll-cues* gitem) for cuez = (case case (:fp cues) (:fn gcues)) ;; for spans = (list-cue-spans item) for gspans = (list-cue-spans gitem) ;; for spanz = (case case (:fp spans) (:fn gspans)) for input = (get-field :i-input item) when cuez do (loop for mwc-p = nil for mwct-p = nil for cue in cuez for span = (list-to-pairs (get-field :span cue)) ;;; when spanz do (when (or (> (length span) 1) ;; MWC w/ several non-adjacent tokens (> (length (spanned-tokens (caar span) (cadar span) item)) 1)) ;; MWC spanning multiple adjacent tokens (setq mwc-p t mwct-p t )) (loop for (from to) in span do (loop for token in (spanned-tokens from to item) for start = (get-field :start token) for end = (get-field :end token) unless (case case (:fp (some #'(lambda (span) (and (<= (first span) start) (>= (second span) end))) gspans)) (:fn (> (get-field :class token) 0))) do (format stream "~&~a: `~a' (~a) in ~s ~:[~;[MWC]~]~%" case (get-field :stem token) (get-field :tag token) (concatenate 'string (subseq input 0 from) "<" (subseq input from to) ">" (subseq input to)) mwct-p) (when mwct-p (incf mwct)) ;;counting errorneous MWC tokens. (when mwc-p (incf mwc) ;;counting distinct MWC errors (once). (setq mwc-p nil)) (incf total) (incf (gethash (get-field :stem token) counts 0))))) finally (terpri stream) (loop for cue being each hash-key in counts using (hash-value count) collect (cons count cue) into list finally (format stream "~&~a, sorted by frequency (Total = ~a. Total MWC tokens = ~a. Total distinct MWCs = ~a.):~%" (case case (:fp "False Positives") (:fn "False Negatives")) total mwct mwc) (mapc #'(lambda (cue-counts) (format stream "~&~a (~(~a~):~a, cue: ~a, non-cue: ~a, ratio of tot. errors: ~$%)~%" (cdr cue-counts) case (car cue-counts) (gethash (cdr cue-counts) index) (gethash (cdr cue-counts) no-index) (* 100 (float (/ (car cue-counts) total))))) (sort list #'> :key #'car))) (terpri stream)))) (when (and output (streamp stream)) (close stream))))) (defun score-spans (gitem item) (loop with gcues = (get-field *conll-cues* gitem) with gscopes = (get-field *conll-scopes* gitem) with cues = (get-field *conll-cues* item) with scopes = (get-field *conll-scopes* item) for gcue in gcues for gid = (get-field :id gcue) for gcue-span = (get-field :span gcue) for gscope = (find-if #'(lambda (scope) (eq gid (get-field :id scope))) gscopes) for gscope-span = (get-field :span gscope) for cue = (find-if #'(lambda (cue) (equalp gcue-span (get-field :span cue))) cues) for id = (get-field :id cue) for scope = (and id (find-if #'(lambda (scope) (eq id (get-field :id scope))) scopes)) for scope-span = (get-field :span scope) collect (if (and cue (equalp gscope-span scope-span)) 1 0))) (defun ensure-numerical-ids (items) "Maps the BioScope id/ref strings for cues and scopes to numerical ids." (dolist (item items) (when (get-field *conll-cues* item) (let ((map-id -1)) (dolist (cue (get-field *conll-cues* item)) (when (not (integerp (get-field :id cue))) (let* ((id (get-field :id cue)) (new-id) (scope (find-if #'(lambda (x) (equalp (get-field :id x) id)) (get-field *conll-scopes* item)))) (incf map-id) ;; (setf (gethash map-id map) id) (setq new-id map-id) (when scope (setf (get-field :id scope) new-id)) (setf (get-field :id cue) new-id))))))) items)