(in-package :tsdb) ;(defparameter *redwoods-debug-cue-count* 0) ;;; ;;; [incr tsdb()] --- Competence and Performance Profiling Environment ;;; Copyright (c) 1996 -- 2006 Stephan Oepen (oe@csli.stanford.edu) ;;; Copyright (c) 2005 -- 2006 Erik Velldal (erikve@ifi.uio.no) ;;; ;;; This program is free software; you can redistribute it and/or modify it ;;; under the terms of the GNU Lesser General Public License as published by ;;; the Free Software Foundation; either version 2.1 of the License, or (at ;;; your option) any later version. ;;; ;;; This program is distributed in the hope that it will be useful, but WITHOUT ;;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ;;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ;;; License for more details. ;;; ;;; ;;; ToDo ;;; ;;; - no need to save yield (`value') for discriminants; recompute from start ;;; and end indices into full string; ;;; - protect against saving with an empty `decision' set; ;;; - fill in `preference' relation; ;;; - confidence menu; ;;; - `Reset' button: re-instantiate original, preset state; ;;; - reorder trees: active at top; ;;; - pairwise comparison of trees; ;;; - highlighting of discriminants on tree select; ;;; - highlighting of trees on discriminant select; ;;; - utilize status vector to, e.g. fast-forward to first unannotated; ;;; - record all :select decisions, valid at `Save' time; ;;; - add print button: include edge id in display and print out; ;;; (defparameter *redwoods-semantix-hook* nil) (defparameter *redwoods-trees-hook* nil) (defparameter *redwoods-shift* nil) (defparameter *redwoods-record-void-discriminants-p* nil) (defparameter *redwoods-trace* nil) (defparameter *redwoods-reconstruct-mode* :word) (defparameter *redwoods-train-percentage* 100) (defparameter *redwoods-task* :rank) (defparameter *models* nil) (defparameter %model% nil) (defparameter %redwoods-items-increment% #-:64bit 500 #+:64bit 2000) (defparameter %redwoods-items-percentile% 20) (defparameter *redwoods-test-ignore-profiles* nil) (defparameter *redwoods-split* nil) (defparameter *redwoods-token-filter* nil) (defparameter *redwoods-item-expander* nil) (defstruct fc file db (strikes 0) cache) (defun browse-trees (&optional (data *tsdb-data*) &key (condition *statistics-select-condition*) gold (shift *redwoods-shift*) strip inspect (bestp *redwoods-thinning-normalize-p*) (exactp *redwoods-update-exact-p*) (cache *tsdb-cache-database-writes-p*) (verbose t) interactive (stream *tsdb-io*) (runp t) interrupt meter) (declare (optimize (speed 3) (safety 0) (space 0))) (initialize-tsdb) (when strip (when (< (profile-granularity data) 200509) (format stream "~%browse-trees(): out-of-date profile `~a'.~%" data) (return-from browse-trees)) (unless (do-import-database (find-tsdb-directory data) strip :meter (when meter (make-meter 0 1)) :except (append '("tree" "decision" "preference") (and bestp '("result")))) (return-from browse-trees nil))) (let* (#+:null (*tsdb-connection-expiry* 200) (condition (if (and condition (not (equal condition ""))) (concatenate 'string "(readings >= 1) && " condition) "readings >= 1")) (items (if (stringp data) (analyze data :condition condition :meter meter :message meter) data)) (message (format nil "~a `~a' trees ..." (cond (strip "normalizing") (gold "updating") (t "browsing")) data)) (items (sort (copy-list items) #'< :key #'(lambda (foo) (get-field :i-id foo)))) (schema (read-database-schema data)) (cache (when cache (create-cache (or strip data) :schema schema :verbose verbose :protocol cache))) (gc-strategy (unless (or interactive (null runp)) (install-gc-strategy nil :tenure *tsdb-tenure-p* :burst t :verbose t))) (display (let ((foo (getenv "DISPLAY"))) (and (stringp foo) (not (string= foo "")) foo))) (frame (unless #-:expand strip #+:expand nil (if (and runp display) (clim:make-application-frame 'lkb::compare-frame) (clim:make-application-frame 'lkb::compare-frame :frame-manager nil)))) %client%) (declare (special %client%)) #+:debug (setf lkb::%frame% frame) (when frame (setf (lkb::compare-frame-ids frame) (loop for item in items collect (get-field :i-id item)))) (when meter (status :text message) (meter :value 0)) (when runp (loop with last = nil with increment = (and meter (/ 1 (if items (length items) 1))) with title = (format nil "[incr tsdb()] Tree ~a (`~a'~ ~:[~*~; from `~a'~])~@[ @ `~a'~]" (if gold "Update" "Annotation") data gold gold condition) with nitems = (length items) with annotated = (make-array nitems :initial-element 0) with position = 0 initially (when frame (setf (lkb::compare-frame-chart frame) nil) (setf (clim:frame-pretty-name frame) title) (setf (lkb::compare-frame-controller frame) *current-process*)) for item = (when position (nth position items)) for i-id = (get-field :i-id item) for status = (when (integerp i-id) (or #+:allegro (when *tsdb-tenure-p* (excl:tenuring (browse-tree data i-id frame :gold gold :shift shift :strip strip :bestp bestp :inspect inspect :exactp exactp :cache cache :title title :display display :verbose verbose :stream stream))) (browse-tree data i-id frame :gold gold :shift shift :strip strip :bestp bestp :inspect inspect :exactp exactp :cache cache :title title :display display :verbose verbose :stream stream))) for action = (get-field :status status) for offset = (or (get-field :offset status) 1) while (and status (not (eq action :close)) (numberp position)) do (when (and (eq action :save) increment (zerop (aref annotated position))) (meter-advance increment)) (case action (:first (setf position 0)) (:previous (decf position offset) (close-connections :data data)) ((:skip :null :flag) (if (eq last :previous) (decf position) (incf position)) (setf action last)) ((:next :save) (when (eq action :save) (incf (aref annotated position))) (incf position offset)) (:last (setf position (- nitems 1)))) (setf last action) (when (or (>= position nitems) (< position 0)) (setf position nil)) (purge-profile-cache data :expiryp nil) (when gold (purge-profile-cache gold :expiryp nil)) when (interrupt-p interrupt) do (format stream "browse-trees(): external interrupt signal~%") (force-output stream) (return)) (when frame ;; ;; according to section 9.6 of the CLIM User Guide, frame-exit() cannot ;; have an effect unless called from the process running the top-level ;; of that frame. ;; (clim:frame-exit frame) #+:null (clim:destroy-frame frame)) (when (mp:process-p %client%) (mp:process-kill %client%)) (when meter (status :text (format nil "~a done" message) :duration 10) (meter :value 1))) (when cache (flush-cache cache :verbose verbose)) (when gc-strategy (restore-gc-strategy gc-strategy)) (purge-profile-cache data :expiryp nil) (or frame t))) (defun browse-tree (data i-id frame &key gold shift strip bestp inspect exactp subset title cache verbose (runp t) display stream) (declare (special %client%)) #+:debug (setf lkb::%frame% frame) (when (or (null runp) (null %client%) (and (mp:process-p %client%) (mp:process-active-p %client%))) #+:allegro (format excl:*initial-terminal-io* "~&[~a] browse-tree(): `~a' ~@[(~a) ~]--- item # ~a~%" (current-time :long :short) data gold i-id) (let* ((lkb::*chart-packing-p* nil) (lkb::*edge-registry* nil) (*reconstruct-cache* (make-hash-table :test #'eql)) (lkb::*tree-update-match-hook* #'update-match-p) (lkb::*tree-automatic-update-p* (when gold lkb::*tree-automatic-update-p*)) (condition (format nil "i-id = ~a" i-id)) (items (let ((*package* (find-package lkb::*lkb-package*))) (analyze data :thorough '(:derivation :mrs) :commentp t :taggingp t :condition condition))) (item (and (null (rest items)) (first items))) (input (or (get-field :o-input item) (get-field :i-input item))) (tags (get-field :tags item)) (i-id (get-field :i-id item)) (i-length (get-field :i-length item)) (i-comment (get-field :i-comment item)) (readings (get-field :readings item)) (parse-id (get-field :parse-id item)) (results (get-field :results item)) (trees (when parse-id #+:allegro (format excl:*initial-terminal-io* "~&[~a] browse-tree(): ~ retrieved item # ~a (~a parse~p).~%" (current-time :long :short) i-id (length results)(length results)) (select '("parse-id" "t-version" "t-active" "t-confidence" "t-author" "t-start" "t-end" "t-comment") '(:integer :integer :integer :integer :string :date :date :string) "tree" (format nil "parse-id == ~a" parse-id) data :sort :parse-id))) (version (loop for tree in trees maximize (get-field :t-version tree))) (trees (loop for tree in trees when (eq version (get-field :t-version tree)) collect tree)) (user (get-field :t-author (first trees))) (date (get-field :t-end (first trees))) (confidence (let* ((foo (get-field :t-confidence (first trees)))) (if (and (integerp foo) (>= foo 0) (<= foo 3)) foo 3))) (history (let* ((foo (get-field :t-confidence (first trees))) (confidence (if (and (integerp foo) (>= foo 0) (<= foo 3)) (aref #("zero" "low" "fair" "high") foo) "unknown"))) (if (and (>= version 0) user date) (format nil "(~a) ~a on ~a: ~a (~a)" version user date confidence foo) ""))) (edges (unless (or #-:expand strip #+:expand (null trees)) #+:allegro (format excl:*initial-terminal-io* "~&[~a] browse-tree(): retrieved ~a tree record~p.~%" (current-time :long :short) (length trees) (length trees)) (loop with edges with mode = (if (eq (lkb::compare-frame-mode frame) :modern) t *redwoods-reconstruct-mode*) for result in results for id = (get-field :result-id result) for derivation = (get-field :derivation result) for mrs = (let ((mrs (get-field :mrs result))) (and mrs (not (equal mrs "")) mrs)) for edge = (when (or (null subset) (member id subset)) (if (and derivation (not (equal derivation ""))) (reconstruct derivation mode) (when mrs (lkb::make-edge :id id :from 0 :to i-length)))) ;; ;; _fix_me_ ;; this seems overly robust: issue a warning message ;; whenever we fail to reconstruct an edge. ;; 7-jun-04; oe) when edge do (setf (lkb::edge-foo edge) id) (setf (lkb::edge-bar edge) derivation) (setf (lkb::edge-mrs edge) mrs) (push edge edges) finally #+:allegro (format excl:*initial-terminal-io* "~&[~a] browse-tree(): reconstructed ~a edge~p.~%" (current-time :long :short) (length edges) (length edges)) (return (nreverse edges))))) (edges (sort edges #'< :key #'lkb::edge-foo)) (foo (first edges)) (start (and foo (lkb::edge-from foo))) (end (and foo (lkb::edge-to foo))) (decisions (when (and parse-id version) (select '("parse-id" "t-version" "d-state" "d-type" "d-key" "d-value" "d-start" "d-end" "d-date") '(:integer :integer :integer :integer :string :string :integer :integer :date) "decision" (format nil "parse-id == ~a && t-version == ~a" parse-id version) data))) (discriminants (unless #-:expand strip #+:expand nil #+:allegro (format excl:*initial-terminal-io* "~&[~a] browse-tree(): retrieved ~a decision~p.~%" (current-time :long :short) (length decisions) (length decisions)) (reconstruct-discriminants decisions))) (gi-id (if (functionp shift) (funcall shift i-id) i-id)) (greadings (when (and gold (null strip)) (let ((items (select '("readings") '(:integer) "parse" (format nil "i-id == ~a" gi-id) gold))) (when (= (length items) 1) (get-field :readings (first items)))))) (gtrees (when (and gold (null strip)) (select '("parse-id" "t-version" "t-active" "t-author" "t-end") '(:integer :integer :integer :string :date) "tree" (format nil "i-id == ~a" gi-id) gold :sort :parse-id))) (gversion (loop for tree in gtrees maximize (get-field :t-version tree))) (gtrees (loop for tree in gtrees when (eq gversion (get-field :t-version tree)) collect tree)) (gactive (when (= (length gtrees) 1) (let ((gactive (get-field :t-active (first gtrees)))) (unless (minus-one-p gactive) gactive)))) (gitem (when (and gactive (or exactp (= readings 1))) (first (analyze gold :thorough '(:derivation) :condition condition)))) (gpreferences (when (and gitem (= (length gtrees) 1)) (select '("parse-id" "t-version" "result-id") '(:integer :integer :integer) "preference" (format nil "i-id == ~a && t-version == ~a" gi-id gversion) gold))) (gderivation (when (= (length gpreferences) 1) (loop with gpreference = (first gpreferences) with key = (get-field :result-id gpreference) for result in (get-field :results gitem) for id = (get-field :result-id result) thereis (when (= id key) (get-field :derivation result))))) (ghistory (when (and (integerp greadings) (integerp gactive)) (let* ((guser (get-field :t-author (first gtrees))) (gdate (get-field :t-end (first gtrees)))) (format nil "(~a) ~a on ~a; [~a : ~a] active" gversion guser gdate gactive (- greadings gactive))))) (gdecisions (when (and gold gversion) #+:allegro (format excl:*initial-terminal-io* "~&[~a] browse-tree(): retrieved ~a gold tree~p.~%" (current-time :long :short) (length gtrees) (length gtrees)) (unless exactp (select '("parse-id" "t-version" "d-state" "d-type" "d-key" "d-value" "d-start" "d-end" "d-date") '(:integer :integer :integer :integer :string :string :integer :integer :date) "decision" (format nil "i-id == ~a && t-version == ~a" gi-id gversion) gold)))) (gdiscriminants (when gdecisions #+:allegro (format excl:*initial-terminal-io* "~&[~a] browse-tree(): ~ retrieved ~a gold decision~p.~%" (current-time :long :short) (length gdecisions) (length gdecisions)) (reconstruct-discriminants gdecisions))) (version (max (if version version 0) (if gversion gversion 0))) (lkb::*parse-record* edges)) (declare (ignore active)) (when strip #-:expand (loop with preferences = (select '("parse-id" "t-version" "result-id") '(:integer :integer :integer) "preference" (format nil "parse-id == ~a && t-version == ~a" parse-id version) data) for preference in preferences do (write-preference strip preference :cache cache) finally (when (and trees bestp) ;; ;; _fix_me_ ;; in :random mode, we need to also adjust the preference ;; relation accordingly. (26-apr-04; oe) ;; (let* ((ids (if (eq bestp :random) (random-sample 1 readings (length preferences)) (loop for preference in preferences collect (get-field :result-id preference)))) (condition (format nil "parse-id == ~a~ ~@[ && (~{result-id == ~a~^ ||~})~]" parse-id ids)) (schema (read-database-schema data)) (relation (loop for (relation . structure) in schema when (string= relation "result") return structure)) (fields (loop for field in relation collect (first field))) (types (loop for field in relation collect (second field))) (results (when ids (select fields types "result" condition data)))) (when (or *redwoods-semantix-hook* *redwoods-trees-hook*) (loop for result in results for derivation = (get-field :derivation result) for edge = (when derivation (reconstruct derivation)) for mrs = (when (and edge *redwoods-semantix-hook*) (call-hook *redwoods-semantix-hook* edge)) for tree = (when (and edge *redwoods-trees-hook*) (call-hook *redwoods-trees-hook* edge)) when mrs do (setf (get-field :mrs result) mrs) when tree do (setf (get-field :tree result) tree)) (setf lkb::*cached-category-abbs* nil)) (write-results parse-id results strip :cache cache)))) (if trees (write-tree strip (first trees) :cache cache) (let* ((user (current-user)) (time (current-time :long :tsdb)) (tree (pairlis '(:parse-id :t-version :t-active :t-confidence :t-author :t-start :t-end :t-comment) (list parse-id 0 -1 -1 user time time "")))) (write-tree strip tree :cache cache))) (loop for decision in decisions do (write-decision strip decision :cache cache)) #-:expand (return-from browse-tree (acons :status :save nil))) (when (null edges) (when verbose (format stream "browse-tree(): failed to reconstruct item # ~d (parse # ~d).~%" i-id parse-id)) (return-from browse-tree (acons :status :null nil))) (setf (lkb::compare-frame-edges frame) nil) (setf (lkb::compare-frame-input frame) input) (setf (lkb::compare-frame-tags frame) tags) (setf (lkb::compare-frame-item frame) i-id) (setf (lkb::compare-frame-start frame) start) (setf (lkb::compare-frame-end frame) end) (setf (lkb::compare-frame-derivations frame) (loop for result in results collect (get-field :derivation result))) (setf (lkb::compare-frame-version frame) history) (setf (lkb::compare-frame-comment frame) i-comment) (setf (lkb::compare-frame-confidence frame) confidence) (setf (lkb::compare-frame-preset frame) discriminants) (setf (lkb::compare-frame-gold frame) gdiscriminants) (setf (lkb::compare-frame-gversion frame) ghistory) (setf (lkb::compare-frame-gactive frame) gactive) (setf (lkb::compare-frame-gderivation frame) gderivation) (setf (lkb::compare-frame-inspect frame) inspect) (setf (lkb::compare-frame-update frame) (when (and gactive greadings) (pairlis '(:parse-id :u-gin :u-gout) (list parse-id gactive (- greadings gactive))))) (when exactp (loop with gderivation = (if (stringp gderivation) (ignore-errors (read-from-string gderivation nil nil)) gderivation) initially (setf (lkb::compare-frame-exact frame) nil) for edge in edges for derivation = (lkb::edge-bar edge) when (derivation-equal gderivation derivation) do (push edge (lkb::compare-frame-exact frame)))) (when (and runp display (null %client%)) (setf %client% (mp:run-function (or title "[incr tsdb()] Tree Selection") #'lkb::run-compare-frame frame))) (let ((status (lkb::set-up-compare-frame frame lkb::*parse-record* :runp runp :display display))) ;; ;; _fix_me_ ;; grey out `Save' button in compare frame, when we have a read-only ;; cache, i.e. (:protocol . :ro). (26-jan-04; oe) ;; #+:expand (lkb::record-decision (lkb::make-decision :type :save) frame) #-:expand (unless (or (eq status :skip) (null runp)) (process-add-arrest-reason *current-process* :wait))) (when runp (let* ((decisions (lkb::compare-frame-decisions frame)) (status (lkb::decision-type (first decisions))) (recent (second decisions))) (when (and (eq status :flag) (null trees) *redwoods-update-flag-p*) (let* ((user (current-user)) (time (current-time :long :tsdb)) (tree (pairlis '(:parse-id :t-version :t-active :t-confidence :t-author :t-start :t-end :t-comment) (list parse-id 0 -1 -1 user time time "")))) (write-tree strip tree :cache cache))) (when (eq status :save) (let* ((version (if version #-:expand (incf version) #+:expand version 1)) (edges (lkb::compare-frame-in frame)) (active (length edges)) (foo (lkb::compare-frame-confidence frame)) (confidence (if (and (integerp foo) (>= foo 0) (<= foo 3)) foo -1)) (t-author (current-user)) (t-start (let* ((start (first (last decisions))) (start (when (lkb::decision-p start) (lkb::decision-time start)))) (if start (decode-time start :long :tsdb) (current-time :long :tsdb)))) (t-end (let* ((end (first decisions)) (end (when (lkb::decision-p end) (lkb::decision-time end)))) (if end (decode-time end :long :tsdb) (current-time :long :tsdb))))) #-:expand (write-tree data (pairlis '(:parse-id :t-version :t-active :t-confidence :t-author :t-start :t-end :t-comment) (list parse-id version active confidence t-author t-start t-end "")) :cache cache) (loop for edge in edges for id = (when (lkb::edge-p edge) (lkb::edge-foo edge)) do (write-preference data (pairlis '(:parse-id :t-version :result-id) (list parse-id version id)) :cache cache))) #-:expand (when (and (lkb::decision-p recent) (member (lkb::decision-type recent) '(:reject :select))) (let* ((version (or version 1)) (state (encode-discriminant-state recent)) (type (encode-discriminant-type recent)) (start (lkb::compare-frame-start frame)) (end (lkb::compare-frame-end frame)) (time (let ((time (lkb::decision-time recent))) (if time (decode-time time :long :tsdb) (current-time :long :tsdb))))) (write-decision data (pairlis '(:parse-id :t-version :d-state :d-type :d-key :d-value :d-start :d-end :d-date) (list parse-id version state type nil nil start end time)) :cache cache))) #-:expand (loop with version = (or version 1) for discriminant in (lkb::compare-frame-discriminants frame) for state = (encode-discriminant-state discriminant) for type = (encode-discriminant-type discriminant) for key = (lkb::discriminant-key discriminant) for value = (lkb::discriminant-value discriminant) for start = (lkb::discriminant-start discriminant) for end = (lkb::discriminant-end discriminant) for time = (let ((time (lkb::discriminant-time discriminant))) (if time (decode-time time :long :tsdb) (current-time :long :tsdb))) unless (or (lkb::discriminant-tag discriminant) (and (null *redwoods-record-void-discriminants-p*) (= state 5))) do (write-decision data (pairlis '(:parse-id :t-version :d-state :d-type :d-key :d-value :d-start :d-end :d-date) (list parse-id version state type key value start end time)) :cache cache)) #-:expand (let* ((update (lkb::compare-frame-update frame)) (discriminants (lkb::compare-frame-discriminants frame)) (decisions (loop for foo in discriminants for bar = (lkb::discriminant-toggle foo) count (and (null (lkb::discriminant-gold foo)) (not (eq bar :unknown))))) (in (length (lkb::compare-frame-in frame))) (out (length (lkb::compare-frame-out frame)))) (when (and update (>= (profile-granularity data) 0210)) (write-update data (append (pairlis '(:t-version :u-new :u-in :u-out) (list (or version 1) decisions in out)) update) :cache cache)))) (pairlis '(:status) (list status))))))) (defun encode-discriminant-state (discriminant) (cond ((lkb::discriminant-p discriminant) (let ((toggle (lkb::discriminant-toggle discriminant)) (state (lkb::discriminant-state discriminant))) (cond ((eq toggle t) 1) ((null toggle) 2) ((eq state t) 3) ((null state) 4) (t 5)))) ((lkb::decision-p discriminant) -1) (t -1))) (defun encode-discriminant-type (discriminant) (cond ((lkb::discriminant-p discriminant) (case (lkb::discriminant-type discriminant) (:relation 1) (:type 2) (:constituent 3) (:ed 6) (t 0))) ((lkb::decision-p discriminant) (case (lkb::decision-type discriminant) (:select 4) (:reject 5) (t -1))) (t -1))) (defun reconstruct-discriminants (decisions) (loop for decision in decisions for state = (get-field :d-state decision) for type = (get-field :d-type decision) for key = (get-field :d-key decision) for value = (get-field :d-value decision) for start = (get-field :d-start decision) for end = (get-field :d-end decision) for discriminant = (and state type key value start end (not (minus-one-p type)) (reconstruct-discriminant state type key value start end)) when discriminant collect discriminant)) (defun reconstruct-discriminant (istate type key value start end) (let* ((type (cond ((eql type 1) :relation) ((eql type 2) :type) ((eql type 3) :constituent) ((eql type 4) :select) ((eql type 5) :reject) ((eql type 6) :ed) (t nil))) (toggle :unknown) (state :unknown)) (cond ((eql istate 1) (setf toggle t) (setf state t)) ((eql istate 2) (setf toggle nil) (setf state nil)) ((eql istate 3) (setf state t)) ((eql istate 4) (setf state nil))) (lkb::make-discriminant :type (intern type :keyword) :key key :value value :start start :end end :toggle toggle :state state))) (defun analyze-trees (&optional (profile *tsdb-data*) &key (condition *statistics-select-condition*) file append (format :latex) meter) (let* ((stream (create-output-stream file append)) (items (if (stringp profile) (analyze-aggregates profile :condition condition :trees t :meter meter :format format) profile)) (averages (summarize-competence-parameters items)) (averages (remove 0 averages :key #'(lambda (foo) (get-field :results (rest foo))))) (naggregates (- (length averages) 1)) (alabel (if (eq *statistics-aggregate-dimension* :phenomena) "Phenomenon" "Aggregate")) (ncolumns 17) (caption (format nil "(generated by ~a at ~a)" *tsdb-name* (current-time :long :pretty)))) (when (or (null items) (null averages)) (return-from analyze-trees 1)) (case format (:tcl (when *statistics-tcl-formats* (format stream *statistics-tcl-formats*)) (format stream "flags 2~%~ layout col def -m1 5 -r 1 -m2 5 -c black -j right~%~ layout row def -m1 5 -r 0 -m2 5 -c black -j center~%~ layout col 0 -m1 5 -r 2 -m2 5 -c black -j right~%~ layout col 1 -m1 5 -r 2 -m2 5 -c black -j left~%~ layout col 4 -m1 5 -r 2 -m2 5 -c black -j right~%~ layout col 7 -m1 5 -r 2 -m2 5 -c black -j right~%~ layout col 10 -m1 5 -r 2 -m2 5 -c black -j right~%~ layout col 13 -m1 5 -r 2 -m2 5 -c black -j right~%~ layout col 16 -m1 5 -r 2 -m2 5 -c black -j right~%~ layout row 0 -m1 5 -r 2 -m2 5 -c black -j center~%~ layout row 2 -m1 5 -r 2 -m2 5 -c black -j center~%~ layout row ~d -m1 5 -r 2 -m2 5 -c black -j center~%~ layout row ~d -m1 5 -r 2 -m2 5 -c black -j center~%~%" (+ naggregates 2) (+ naggregates 3)) (format stream "cell 1 1 -contents {~a} -format title~%~ region 1 1 2 1 -contents {~a} -format title ~ -hor_justify left -ver_justify center~%~ region 1 2 1 4 -contents {all results} ~ -format title -hor_justify center~%~ region 1 5 1 7 -contents {t-active = 0} ~ -format title -hor_justify center~%~ region 1 8 1 10 -contents {t-active = 1} ~ -format title -hor_justify center~%~ region 1 11 1 13 -contents {t-active > 1} ~ -format title -hor_justify center~%~ region 1 14 1 16 -contents {unannotated} ~ -format title -hor_justify center~%" alabel alabel) (loop for i from 2 to 14 by 3 do (loop for label in '("items\\n#" "words\\n\\330" "trees\\n\\330") for j from 0 for k = (+ i j) do (format stream "cell 2 ~d -contents \"~a\" -format title~%~ region 2 ~d 2 ~d -contents \"~a\" -format title ~ -hor_justify center~%" k label k k label)))) (:html (format stream "~%") (html-output "redwoods-annotations-header.html" :stream stream :values (list alabel)))) (loop with i = 2 for aggregate in (remove :all items :key #'first) for data = (rest (assoc (first aggregate) averages)) for name = (if (eq format :latex) (latexify-string (second aggregate)) (second aggregate)) when data do (incf i) (case format (:tcl (format stream "cell ~d 1 -contents {~a} -format aggregate~%~ cell ~d 2 -contents ~d -format data~%~ cell ~d 3 -contents ~,2f -format data~%~ cell ~d 4 -contents ~,1f -format data~%~ cell ~d 5 -contents ~d -format data~%~ cell ~d 6 -contents ~,2f -format data~%~ cell ~d 7 -contents ~,1f -format data~%~ cell ~d 8 -contents ~d -format data~%~ cell ~d 9 -contents ~,2f -format data~%~ cell ~d 10 -contents ~,1f -format data~%~ cell ~d 11 -contents ~d -format data~%~ cell ~d 12 -contents ~,2f -format data~%~ cell ~d 13 -contents ~,1f -format data~%~ cell ~d 14 -contents ~d -format data~%~ cell ~d 15 -contents ~,2f -format data~%~ cell ~d 16 -contents ~,1f -format data~%" i name i (get-field :results data) i (get-field :i-length data) i (get-field :analyses data) i (get-field :rresults data) i (get-field :rlength data) i (get-field :ranalyses data) i (get-field :uresults data) i (get-field :ulength data) i (get-field :uanalyses data) i (get-field :aresults data) i (get-field :alength data) i (get-field :aanalyses data) i (get-field :sresults data) i (get-field :slength data) i (get-field :sanalyses data))) (:html (html-output (if (= i 3) "redwoods-annotations-body-first.html" "redwoods-annotations-body.html") :stream stream :values (list name (get-field :results data) (get-field :i-length data) (get-field :analyses data) (get-field :rresults data) (get-field :rlength data) (get-field :ranalyses data) (get-field :uresults data) (get-field :ulength data) (get-field :uanalyses data) (get-field :aresults data) (get-field :alength data) (get-field :aanalyses data) (get-field :sresults data) (get-field :slength data) (get-field :sanalyses data)))) (:csv (format stream "~a,~a,~a,~ ~,2f,~,2f,~,2f,~,2f,~,2f,~,2f,~,2f,~,2f,~,2f,~,2f,~,2f,~,2f~%" profile name (get-field :items data) (get-field :results data) (get-field :i-length data) (get-field :analyses data) (get-field :rresults data) (get-field :rlength data) (get-field :ranalyses data) (get-field :uresults data) (get-field :ulength data) (get-field :uanalyses data) (get-field :aresults data) (get-field :alength data) (get-field :aanalyses data) (get-field :sresults data) (get-field :slength data) (get-field :sanalyses data))))) (let* ((total (rest (assoc :total averages))) (name "Total") (n (+ naggregates 3))) (case format (:tcl (format stream "cell ~d 1 -contents {~a} -format aggregate~%~ cell ~d 2 -contents ~d -format data~%~ cell ~d 3 -contents ~,2f -format data~%~ cell ~d 4 -contents ~,1f -format data~%~ cell ~d 5 -contents ~d -format data~%~ cell ~d 6 -contents ~,2f -format data~%~ cell ~d 7 -contents ~,1f -format data~%~ cell ~d 8 -contents ~d -format data~%~ cell ~d 9 -contents ~,2f -format data~%~ cell ~d 10 -contents ~,1f -format data~%~ cell ~d 11 -contents ~d -format data~%~ cell ~d 12 -contents ~,2f -format data~%~ cell ~d 13 -contents ~,1f -format data~%~ cell ~d 14 -contents ~d -format data~%~ cell ~d 15 -contents ~,2f -format data~%~ cell ~d 16 -contents ~,1f -format data~%" n name n (get-field :results total) n (get-field :i-length total) n (get-field :analyses total) n (get-field :rresults total) n (get-field :rlength total) n (get-field :ranalyses total) n (get-field :uresults total) n (get-field :ulength total) n (get-field :uanalyses total) n (get-field :aresults total) n (get-field :alength total) n (get-field :aanalyses total) n (get-field :sresults total) n (get-field :slength total) n (get-field :sanalyses total))) (:html (html-output "redwoods-annotations-total.html" :stream stream :values (list name (get-field :results total) (get-field :i-length total) (get-field :analyses total) (get-field :rresults total) (get-field :rlength total) (get-field :ranalyses total) (get-field :uresults total) (get-field :ulength total) (get-field :uanalyses total) (get-field :aresults total) (get-field :alength total) (get-field :aanalyses total) (get-field :sresults total) (get-field :slength total) (get-field :sanalyses total))) (format stream "~% ~ ~%~%
~% ~ ~a~%
~%" ncolumns caption)) (:csv (format stream "~a,~a,~a,~ ~,2f,~,2f,~,2f,~,2f,~,2f,~,2f,~,2f,~,2f,~,2f,~,2f,~,2f,~,2f~%" profile name (get-field :items total) (get-field :results total) (get-field :i-length total) (get-field :analyses total) (get-field :rresults total) (get-field :rlength total) (get-field :ranalyses total) (get-field :uresults total) (get-field :ulength total) (get-field :uanalyses total) (get-field :aresults total) (get-field :alength total) (get-field :aanalyses total) (get-field :sresults total) (get-field :slength total) (get-field :sanalyses total)))) #+:debug (format t "`~a'~%~% ~ ~d items; ~d results; ~ ~,2f tokens; ~,2f words (~,2f); ~,2f readings;~% ~ rejected: ~d [~,2f ~,2f (~,2f) ~,2f]~% ~ unambiguous: ~d [~,2f ~,2f (~,2f) ~,2f]~% ~ ambiguous: ~d [~,2f ~,2f (~,2f) ~,2f]~% ~ unannotated: ~d [~,2f ~,2f (~,2f) ~,2f]~%" profile (get-field :items total) (get-field :results total) (get-field :i-length total) (get-field :words total) (divide (get-field :words total) (get-field :i-length total)) (get-field :analyses total) (get-field :rresults total) (get-field :rlength total) (get-field :rwords total) (divide (get-field :rwords total) (get-field :rlength total)) (get-field :ranalyses total) (get-field :uresults total) (get-field :ulength total) (get-field :uwords total) (divide (get-field :uwords total) (get-field :ulength total)) (get-field :uanalyses total) (get-field :aresults total) (get-field :alength total) (get-field :awords total) (divide (get-field :awords total) (get-field :alength total)) (get-field :aanalyses total) (get-field :sresults total) (get-field :slength total) (get-field :swords total) (divide (get-field :swords total) (get-field :slength total)) (get-field :sanalyses total))) (when (or (stringp file) (stringp append)) (close stream)) 0)) (defun analyze-update (&optional (data *tsdb-data*) &key (condition *statistics-select-condition*) file append (format :latex) meter) (declare (ignore meter)) (let* ((stream (create-output-stream file append)) (averages (summarize-update data :condition condition :format format)) (naggregates (length averages)) (alabel "Aggregate") (ncolumns 11) #+:latex (caption (format nil "(generated by ~a at ~a)" *tsdb-name* (current-time :long :pretty)))) (when (or (null averages) (= naggregates 1)) (return-from analyze-update 1)) (case format (:tcl (when *statistics-tcl-formats* (format stream *statistics-tcl-formats*)) (format stream "layout col def -m1 5 -r 1 -m2 5 -c black -j right~%~ layout row def -m1 5 -r 0 -m2 5 -c black -j center~%~ layout col 0 -m1 5 -r 2 -m2 5 -c black -j right~%~ layout col 1 -m1 5 -r 2 -m2 5 -c black -j left~%~ layout col 2 -m1 5 -r 2 -m2 5 -c black -j right~%~ layout col 4 -m1 5 -r 2 -m2 5 -c black -j right~%~ layout col 6 -m1 5 -r 2 -m2 5 -c black -j right~%~ layout col 8 -m1 5 -r 2 -m2 5 -c black -j right~%~ layout col 9 -m1 5 -r 2 -m2 5 -c black -j right~%~ layout col ~a -m1 5 -r 2 -m2 5 -c black -j right~%~ layout row 0 -m1 5 -r 2 -m2 5 -c black -j center~%~ layout row 2 -m1 5 -r 2 -m2 5 -c black -j center~%~ layout row ~d -m1 5 -r 2 -m2 5 -c black -j center~%~ layout row ~d -m1 5 -r 2 -m2 5 -c black -j center~%~%" ncolumns (+ naggregates 1) (+ naggregates 2)) (format stream "cell 1 1 -contents {~a} -format title~%~ region 1 1 2 1 -contents {~a} -format title ~ -hor_justify left -ver_justify center~%~ region 1 3 1 4 -contents {gold} ~ -format title -hor_justify center~%~ region 1 5 1 6 -contents {matches} ~ -format title -hor_justify center~%~ region 1 7 1 8 -contents {update} ~ -format title -hor_justify center~%~ region 1 9 1 9 -contents {} ~ -format title -hor_justify center~%~ region 1 10 1 11 -contents {final} ~ -format title -hor_justify center~%" alabel alabel) (loop for i from 2 to 11 for label in '("items\\n#" "in\\n\\330" "out\\n\\330" "yes\\n\\330" "no\\n\\330" "in\\n\\330" "out\\n\\330" "new\\n\\330" "in\\n\\330" "out\\n\\330") do (format stream "cell 2 ~d -contents \"~a\" -format title~%~ region 2 ~d 2 ~d -contents \"~a\" -format title ~ -hor_justify center~%" i label i i label)))) (loop with i = 2 for aggregate in (remove :total averages :key #'first) for data = (rest (rest aggregate)) for name = (second aggregate) do (incf i) (case format (:tcl (format stream "cell ~d 1 -contents {~a} -format aggregate~%~ cell ~d 2 -contents ~d -format data~%~ cell ~d 3 -contents ~,1f -format data~%~ cell ~d 4 -contents ~,1f -format data~%~ cell ~d 5 -contents ~,1f -format data~%~ cell ~d 6 -contents ~,1f -format data~%~ cell ~d 7 -contents ~,1f -format data~%~ cell ~d 8 -contents ~,1f -format data~%~ cell ~d 9 -contents ~,1f -format data~%~ cell ~d 10 -contents ~,1f -format data~%~ cell ~d 11 -contents ~,1f -format data~%" i name i (get-field :nitems data) i (get-field :gin data) i (get-field :gout data) i (get-field :matches data) i (get-field :mismatches data) i (get-field :pin data) i (get-field :pout data) i (get-field :new data) i (get-field :in data) i (get-field :out data))))) (let* ((data (rest (rest (assoc :total averages)))) (name "Total") (i (+ naggregates 2))) (case format (:tcl (format stream "cell ~d 1 -contents {~a} -format aggregate~%~ cell ~d 2 -contents ~d -format data~%~ cell ~d 3 -contents ~,1f -format data~%~ cell ~d 4 -contents ~,1f -format data~%~ cell ~d 5 -contents ~,1f -format data~%~ cell ~d 6 -contents ~,1f -format data~%~ cell ~d 7 -contents ~,1f -format data~%~ cell ~d 8 -contents ~,1f -format data~%~ cell ~d 9 -contents ~,1f -format data~%~ cell ~d 10 -contents ~,1f -format data~%~ cell ~d 11 -contents ~,1f -format data~%" i name i (get-field :nitems data) i (get-field :gin data) i (get-field :gout data) i (get-field :matches data) i (get-field :mismatches data) i (get-field :pin data) i (get-field :pout data) i (get-field :new data) i (get-field :in data) i (get-field :out data))))) (when (or (stringp file) (stringp append)) (close stream)) 0)) (defun summarize-update (data &key condition format) (loop with result = nil with tnitems = 0 with tmatches = 0 with tmismatches = 0 with tnew = 0 with tgin = 0 with tgout = 0 with tpin = 0 with tpout = 0 with tin = 0 with tout = 0 with items = (select '("parse-id" "t-version" "u-matches" "u-mismatches" "u-new" "u-gin" "u-gout" "u-pin" "u-pout" "u-in" "u-out") '(:integer :integer :integer :integer :integer :integer :integer :integer :integer :integer :integer) "update" condition data :sort :parse-id) with aggregates = (aggregate-by-classes items '(0 1 2) :dimension :u-new :format format) for aggregate in aggregates for data = (rest (rest aggregate)) for anitems = (length data) for amatches = 0 for amismatches = 0 for anew = 0 for agin = 0 for agout = 0 for apin = 0 for apout = 0 for ain = 0 for aout = 0 do (loop for item in data do (incf amatches (get-field :u-matches item)) (incf amismatches (get-field :u-mismatches item)) (incf anew (get-field :u-new item)) (incf agin (get-field :u-gin item)) (incf agout (get-field :u-gout item)) (incf apin (get-field :u-pin item)) (incf apout (get-field :u-pout item)) (incf ain (get-field :u-in item)) (incf aout (get-field :u-out item))) (push (append (list (first aggregate) (second aggregate)) (pairlis '(:nitems :matches :mismatches :new :gin :gout :pin :pout :in :out) (list anitems (divide amatches anitems) (divide amismatches anitems) (divide anew anitems) (divide agin anitems) (divide agout anitems) (divide apin anitems) (divide apout anitems) (divide ain anitems) (divide aout anitems)))) result) (incf tnitems anitems) (incf tmatches amatches) (incf tmismatches amismatches) (incf tnew anew) (incf tgin agin) (incf tgout agout) (incf tpin apin) (incf tpout apout) (incf tin ain) (incf tout aout) finally (return (nreverse (cons (append (list :total "Total") (pairlis '(:nitems :matches :mismatches :new :gin :gout :pin :pout :in :out) (list tnitems (divide tmatches tnitems) (divide tmismatches tnitems) (divide tnew tnitems) (divide tgin tnitems) (divide tgout tnitems) (divide tpin tnitems) (divide tpout tnitems) (divide tin tnitems) (divide tout tnitems)))) result))))) (defun update-match-p (frame) ;; ;; during updates, a `save' match is indicated by the following conditions: ;; ;; - the current item has not been tree annotated already; ;; - the number of active trees in the current set equals the number of ;; active trees in the gold set; ;; - either the current item has more than one reading, or that single one ;; reading has the exact same derivation as the preferred tree from the ;; gold set. ;; - also, when in `exact-match' update mode, be content if there is at ;; least one unique result. ;; (or (and (lkb::compare-frame-exact frame) (lkb::compare-frame-in frame) #+:null (null (rest (lkb::compare-frame-in frame)))) (and (or (null (lkb::compare-frame-version frame)) (equal(lkb::compare-frame-version frame) "")) (integerp (lkb::compare-frame-gactive frame)) (= (length (lkb::compare-frame-in frame)) (lkb::compare-frame-gactive frame)) (or (not (= (length (lkb::compare-frame-edges frame)) 1)) (derivation-equal (lkb::compare-frame-gderivation frame) (loop with id = (lkb::edge-id (first (lkb::compare-frame-in frame))) for derivation in (lkb::compare-frame-derivations frame) thereis (when (= (derivation-id derivation) id) derivation))))))) (defun export-trees (data &key (condition *statistics-select-condition*) path prefix interrupt meter (compressor "gzip -c -9") (suffix "gz") (stream *tsdb-io*)) (loop with offset = (cond ((search "vm6" data) 60000) ((search "vm13" data) 130000) ((search "vm31" data) 310000) ((search "vm32" data) 320000) ((search "ecoc" data) 1000000) ((search "ecos" data) 2000000) ((search "ecpa" data) 3000000) ((search "ecpr" data) 4000000) (t 0)) with target = (format nil "~a/~a" (or path (tmp :redwoods)) (directory2file data)) with lkb::*chart-packing-p* = nil with *reconstruct-cache* = (make-hash-table :test #'eql) with items = (analyze data :thorough '(:derivation :mrs) :condition condition :commentp t) with increment = (when (and meter items) (/ (- (get-field :end meter) (get-field :start meter)) (length items) 1)) with gc-strategy = (install-gc-strategy nil :tenure nil :burst t :verbose t) initially #+:allegro (ignore-errors (mkdir target)) (when meter (meter :value (get-field :start meter))) for item in items for i-wf = (get-field :i-wf item) for input = (or (get-field :o-input item) (get-field :i-input item)) for i-comment = (get-field :i-comment item) for parse-id = (get-field :parse-id item) for results = (let ((results (get-field :results item))) (sort (copy-list results) #'< :key #'(lambda (foo) (get-field :result-id foo)))) for trees = (select '("t-active" "t-version") '(:integer :integer) "tree" (format nil "parse-id == ~a" parse-id) data) for version = (when trees (loop for tree in trees maximize (get-field :t-version tree))) for active = (if version (let ((foo (select '("result-id") '(:integer) "preference" (format nil "parse-id == ~a && t-version == ~d" parse-id version) data))) (loop for bar in foo collect (get-field :result-id bar))) (list (get-field :result-id (first results)))) for file = (format nil "~a/~@[~a.~]~d~@[.~a~]" target prefix (+ parse-id offset) suffix) when results do (format stream "[~a] export-trees(): [~a] ~a active tree~:[~;s~] (of ~d).~%" (current-time :long :short) (+ parse-id offset) (length active) (or (null version) (> (length active) 1)) (length results)) (clrhash *reconstruct-cache*) #+:allegro (multiple-value-bind (stream foo pid) (run-process compressor :wait nil :input :stream :output file :if-output-exists :supersede :error-output nil) (declare (ignore foo #-:allegro pid)) (format stream ";;;~%;;; Redwoods export of `~a';~%;;; (~a@~a; ~a).~%;;;~%~%" data (current-user) (current-host) (current-time :long :pretty)) (format stream "[~d] (~a of ~d) {~d} `~a'~@[ [~a]~]~%~a~%" (+ parse-id offset) (length active) (length results) i-wf input i-comment #\page) (export-tree item active :offset offset :stream stream) (unless *redwoods-thinning-export-p* (export-tree item active :complementp t :offset offset :stream stream)) (force-output stream) (close stream) (sys:os-wait nil pid)) (when increment (meter-advance increment)) when (interrupt-p interrupt) do (format stream "[~a] export-trees(): external interrupt signal~%" (current-time :long :short)) (force-output stream) (return) finally (when meter (meter :value (get-field :end meter))) (when gc-strategy (restore-gc-strategy gc-strategy)))) (defun export-tree (item active &key complementp (offset 0) (stream *tsdb-io*)) #+:debug (setf %item% item %active% active) (loop with *package* = (find-package :lkb) with lkb::*deleted-daughter-features* = (if (or (eq *redwoods-export-values* :all) (smember :avm *redwoods-export-values*)) nil lkb::*deleted-daughter-features*) with i-input = (get-field :i-input item) with i-id = (get-field :i-id item) with i-comment = (get-field :i-comment item) with parse-id = (get-field :parse-id item) with results = (get-field :results item) for i from 1 for result in results for result-id = (get-field :result-id result) for activep = (if complementp (not (member result-id active :test #'eql)) (member result-id active :test #'eql)) for derivation = (and activep (get-field :derivation result)) for edge = (and derivation (reconstruct derivation)) for tree = (when (and edge (or (eq *redwoods-export-values* :all) (smember :tree *redwoods-export-values*))) (let ((tree (ignore-errors (lkb::parse-tree-structure edge)))) (unless tree (format stream "[~a] export-trees(): [~a] ~ error() labeling tree # ~a.~%" (current-time :long :short) (+ parse-id offset) result-id)) tree)) for dag = (and edge (let ((tdfs (lkb::edge-dag edge))) (and (lkb::tdfs-p tdfs) (lkb::tdfs-indef tdfs)))) for mrs = (or (let ((mrs (get-field :mrs result))) (mrs::read-mrs-from-string mrs)) (and edge (mrs::extract-mrs edge))) for ident = (format nil "~a @ ~a~@[ @ ~a~]" i-id result-id i-comment) when (zerop (mod i 100)) do (clrhash *reconstruct-cache*) when (and activep (or dag mrs)) do (format stream "[~d:~d] ~:[(active)~;(inactive)~]~%~%" (+ parse-id offset) result-id complementp) (setf lkb::*cached-category-abbs* nil) (when (or (eq *redwoods-export-values* :all) (smember :derivation *redwoods-export-values*)) (let ((*package* (find-package :tsdb))) (format stream "~s~%~%~%" derivation))) (when (or (eq *redwoods-export-values* :all) (smember :tree *redwoods-export-values*)) (if tree (format stream "~a~%~%" tree) (format stream "()~%~%"))) (when (or (eq *redwoods-export-values* :all) (smember :avm *redwoods-export-values*)) (lkb::display-dag1 dag 'lkb::compact stream) (format stream "~%~%")) (when (or (eq *redwoods-export-values* :all) (smember :mrs *redwoods-export-values*)) (mrs::output-mrs1 mrs 'mrs::simple stream)) (when (and (not (eq *redwoods-export-values* :all)) (smember :indexed *redwoods-export-values*)) (mrs::output-mrs1 mrs 'mrs::indexed stream)) (when (or (eq *redwoods-export-values* :all) (smember :prolog *redwoods-export-values*)) (mrs::output-mrs1 mrs 'mrs::prolog stream) (format stream "~%")) (when (or (eq *redwoods-export-values* :all) (smember :mrx *redwoods-export-values*)) (mrs::output-mrs1 mrs 'mrs::mrs-xml stream) (format stream "~%")) (when (or (eq *redwoods-export-values* :all) (smember :rmrs *redwoods-export-values*)) (ignore-errors (mrs::output-rmrs1 (mrs::mrs-to-rmrs mrs) 'mrs::compact stream) (format stream "~%"))) (when (or (eq *redwoods-export-values* :all) (smember :xml *redwoods-export-values*)) (ignore-errors (mrs::output-rmrs1 (mrs::mrs-to-rmrs mrs) 'mrs::xml stream nil nil i-input ident) (format stream "~%"))) (when (or (eq *redwoods-export-values* :all) (smember :dependencies *redwoods-export-values*)) (ignore-errors (mrs::ed-output-psoa mrs :stream stream))) (when (or (eq *redwoods-export-values* :all) (smember :triples *redwoods-export-values*)) (ignore-errors (mrs::ed-output-psoa mrs :format :triples :cargp nil :markp nil :lnkp nil :collocationp t :abstractp t :stream stream))) (when (or (eq *redwoods-export-values* :all) (smember :mtriples *redwoods-export-values*)) (ignore-errors (mrs::ed-output-psoa mrs :format :triples :cargp nil :markp t :lnkp nil :collocationp t :abstractp t :stream stream))) (when (or (eq *redwoods-export-values* :all) (smember :ltriples *redwoods-export-values*)) (ignore-errors (mrs::ed-output-psoa mrs :format :triples :cargp t :markp nil :lnkp t :collocationp nil :abstractp nil :stream stream))) #+:cambridge (when (smember :qa *redwoods-export-values*) (mrs::output-rmrs-from-itsdb (+ parse-id offset) (or (get-field :o-input item) (get-field :i-input item)) mrs)) (format stream "~c~%" #\page))) (defun semantic-equivalence (data &key condition (file (format nil "~a/equivalences" (tmp :redwoods)))) (loop with stream = (open file :direction :output :if-exists :supersede) with lkb::*chart-packing-p* = nil with *reconstruct-cache* = (make-hash-table :test #'eql) with items = (analyze data :thorough '(:derivation) :condition condition :readerp nil) for item in items for i-id = (get-field :i-id item) for input = (or (get-field :o-input item) (get-field :i-input item)) for results = (nreverse (copy-list (get-field :results item))) do (clrhash *reconstruct-cache*) (format t "~a: [~a] `~a'~%" i-id (length results) input) (format stream "~a: [~a] `~a'~%" i-id (length results) input) (loop with *package* = (find-package :lkb) for result in results for derivation = (get-field :derivation result) for edge = (when derivation (reconstruct derivation)) for id = (when edge (lkb::edge-id edge)) for mrs = (when edge (mrs::extract-mrs edge)) do (nconc result (pairlis '(:id :mrs) (list id mrs)))) (loop for result = (pop results) for id1 = (get-field :id result) for mrs1 = (get-field :mrs result) while result do (format stream "~a:"id1) (loop for foo in results for id2 = (get-field :id foo) for mrs2 = (get-field :mrs foo) when (apply #'mrs::mrs-equalp mrs1 mrs2 '(t nil)) do (format stream " ~a" id2)) (format stream "~%")) (format stream "~a~%" #\page) finally (close stream))) (defun analyze-scores (data &optional (gold data) &key (condition *statistics-select-condition*) spartanp (scorep t) (n 1) test loosep file append (format :latex) meter) (let* (;; ;; _fix_me_ ;; we changed the format for string similarity as returned from ;; summarize-scores(), now returning an a-list with multiple scores ;; (now averaged already). until we adapt the output generation code, ;; effectively disable all of it. (25-jan-06; oe & erik) ;; (*redwoods-score-similarities* nil) (stream (create-output-stream file append)) (aggregates (summarize-scores data gold :condition condition :spartanp spartanp :scorep scorep :n n :test test :loosep loosep :format format :meter meter)) (aggregates (nreverse aggregates)) (alabel (if (eq *statistics-aggregate-dimension* :phenomena) "Phenomenon" "Aggregate")) (caption (format nil "(generated by ~a at ~a)" *tsdb-name* (current-time :long :pretty))) (n (if (and n (> n 1)) (- n 1) 0)) (ncolumns (+ n (if loosep 8 7) (if *redwoods-score-similarities* 1 0))) (i 2)) ;; ;; _fix_me_ ;; the :random values appear bogus; debug this further. (13-may-05; oe) ;; #+:null (let ((total (rest (rest (find :total aggregates :key #'first))))) (format t "~,1f of ~a = ~,2f~%" (get-field :random total) (get-field :scores total) (* 100 (divide (get-field :random total) (get-field :scores total))))) (case format (:latex (format stream "\\begin{tabular}{@{}|l|c|c|c|c|c|~:[~;c|~]~:[~;c|~]c|@{}}~% ~ \\hline~% ~ \\multicolumn{~d}{|c|}~% {\\bf `~a' ~a Profile}\\\\~% ~ \\hline\\hline~% ~ & {\\bf total} & {\\bf total} & {\\bf word} ~ & {\\bf parser}~% ~ & {\\bf exact} & {\\bf near}~:[~; & {\\bf loose}~]~ ~:[~; & {\\bf simlarity}~]~ & {\\bf overall}\\\\~% ~ {\\bf ~a} & {\\bf items} & {\\bf scores} & {\\bf string} ~ & {\\bf analyses}~% ~ & {\\bf matches} & {\\bf matches}~:[~; & {\\bf matches}~]~% ~ ~:[~; & {\\bf matches}~]~ & {\\bf accuracy}\\\\~% ~ & $\\sharp$ & $\\sharp$ & $\\phi$ & $\\phi$~% ~ & $\\sharp$ & $\\sharp$~:[~; & $\\sharp$~] & $\\%$\\\\~% ~ \\hline~% ~ \\hline~%" loosep *redwoods-score-similarities* ncolumns (if (stringp data) data "Some") "Parse Selection" loosep alabel loosep *redwoods-score-similarities* loosep *redwoods-score-similarities*)) (:tcl (format stream *statistics-tcl-formats*) (format stream "flags 2~%~ layout col def -m1 5 -r 1 -m2 5 -c black -j right~%~ layout row def -m1 5 -r 0 -m2 5 -c black -j center~%~ layout col 0 -m1 5 -r 2 -m2 5 -c black -j left~%~ layout col 1 -m1 5 -r 2 -m2 5 -c black -j left~%~ layout col ~d -m1 5 -r 2 -m2 5 -c black -j right~%~ layout row 0 -m1 5 -r 2 -m2 5 -c black -j center~%~ layout row 1 -m1 5 -r 2 -m2 5 -c black -j center~%" ncolumns) (format stream "cell 1 1 -contents {~a} -format title~%~ cell 1 2 -contents \"total\\nitems\\n#\" -format title~%~ cell 1 3 -contents \"total\\nscores\\n#\" -format title~%~ cell 1 4 -contents \"word\\nstring\\n\\330\" -format title~%~ cell 1 5 -contents \"parser\\nanalyses\\n\\330\" -format title~%~ cell 1 6 -contents \"exact\\nmatches\\n#\" -format title~%~ ~:[~*~;cell 1 ~d -contents \"loose\\nmatches\\n#\" -format title~%~]~ ~:[~*~;cell 1 ~d -contents \"similar\\nmatches\\n#\" ~ -format title~%~]~ cell 1 ~d -contents \"overall\\naccuracy\\n%\" -format title~%" alabel loosep (+ 7 n) *redwoods-score-similarities* (+ 8 n) ncolumns) (unless (zerop n) (loop for j from 0 to (- n 2) do (format stream "layout col ~d -m1 5 -r 0 -m2 5 -c black -j right~%" (+ 7 j))) (format stream "~:[~;cell 1 7 -contents \"near\\nmatches\\n#\" -format title~%~]~ region 1 7 1 ~d -contents \"near\\nmatches\\n#\" -format title ~ -hor_justify center -ver_justify center~%" (= n 1) (+ 7 (- n 1)))) (format stream "~%"))) (loop for (id foo . data) in aggregates for name = (if (eq format :latex) (latexify-string foo) foo) for items = (get-field :items data) for scores = (get-field :scores data) for length = (get-field+ :i-length data 0) for analyses = (get-field :analyses data) for exact = (get-field :exact data) for near = (get-field :near data) for successes = (get-field :successes data) for loose = (and loosep (get-field :loose data)) for similarity = 0 for accuracy = (if (zerop scores) 100 (* (divide (+ exact near) scores) 100)) unless (or (smember id '(:all :total)) (zerop scores)) do (setf id id) (case format (:latex (format stream " ~a & ~d & ~d & ~,2f & ~,2f ~ & ~,1f & ~,1f~@[ & ~d~] & ~,2f\\\\~%" name items scores length analyses exact near loose accuracy)) (:tcl (format stream "cell ~d 1 -contents {~a} -format aggregate~%~ cell ~d 2 -contents ~d -format data~%~ cell ~d 3 -contents ~d -format data~%~ cell ~d 4 -contents ~,2f -format data~%~ cell ~d 5 -contents ~,2f -format data~%~ cell ~d 6 -contents ~,1f -format data~%~ ~:[~*~*~*~;cell ~d ~d -contents ~d -format data~%~]~ ~:[~*~*~*~;cell ~d ~d -contents ~,4f -format data~%~]~ cell ~d ~d -contents ~,2f -format data~%" i name i items i scores i length i analyses i exact loosep i (+ 7 n) loose *redwoods-score-similarities* i (+ 8 n) similarity i ncolumns accuracy) (unless (zerop n) (loop for j from 0 to (- n 1) for k = (aref successes (+ j 1)) do (format stream "cell ~d ~d -contents ~,1f -format data~%" i (+ 7 j) k))) (format stream "~%"))) (incf i)) (let* ((data (rest (rest (assoc :total aggregates)))) (name "Total") (items (get-field :items data)) (scores (get-field :scores data)) (length (get-field+ :i-length data 0)) (analyses (get-field :analyses data)) (exact (get-field :exact data)) (near (get-field :near data)) (successes (get-field :successes data)) (loose (and loosep (get-field :loose data))) (similarity 0) (accuracy (if (zerop scores) 100 (* (divide (+ exact near) scores) 100)))) (case format (:latex (format stream "~:[~; \\hline~% \\hline~%~] ~ {\\bf ~a} & {\\bf ~d} & {\\bf ~d} & {\\bf ~,2f} & {\\bf ~,2f}~% ~ & {\\bf ~,1f} & {\\bf ~,1f}~@[ & {\\bf ~d}~] & {\\bf ~,2f}\\\\~% ~ \\hline~%" (= i 1) name items scores length analyses exact near loose accuracy) (format stream " \\multicolumn{~d}{r}{\\tiny ~% ~a}~%~ \\end{tabular}~%" ncolumns caption)) (:tcl (format stream "layout row ~d -m1 5 -r 2 -m2 5 -c black -j center~%~ layout row ~d -m1 5 -r 2 -m2 5 -c black -j center~%~%~ cell ~d 1 -contents {~a} -format total~%~ cell ~d 2 -contents ~d -format total~%~ cell ~d 3 -contents ~d -format total~%~ cell ~d 4 -contents ~,2f -format total~%~ cell ~d 5 -contents ~,2f -format total~%~ cell ~d 6 -contents ~,1f -format total~%~ ~:[~*~*~*~;cell ~d ~d -contents ~d -format total~%~]~ ~:[~*~*~*~;cell ~d ~d -contents ~,4f -format total~%~]~ cell ~d ~d -contents ~,2f -format total~%" (- i 1) i i name i items i scores i length i analyses i exact loosep i (+ 7 n) loose *redwoods-score-similarities* i (+ 8 n) similarity i ncolumns accuracy) (unless (zerop n) (loop for j from 0 to (- n 1) for k = (aref successes (+ j 1)) do (format stream "cell ~d ~d -contents ~,1f -format total~%" i (+ 7 j) k))) (format stream "~%")))) (when (or (stringp file) (stringp append)) (close stream)))) (defun summarize-scores (data &optional (gold data) &key (condition *statistics-select-condition*) spartanp (scorep t) (n 1) test loosep analogon (trace *redwoods-trace*) (format :latex) meter) (declare (ignore meter)) (if (eq *redwoods-task* :classify) (summarize-classification data gold);;; data :scorep scorep) ;; ;; score results in .data. against ground truth in .gold. operates in ;; several slightly distinct modes: (i) using the implicit parse ranking in ;; the order of `results' or (ii) using an explicit ranking from the `score' ;; relation; an orthogonal dimension of variation is (a) scoring by result ;; identifier (e.g. within the same profile or against one that is comprised ;; of identical results) vs. (b) scoring by derivation equivalence (e.g. ;; when comparing best-first parser output against a gold standard). ;; (let* ((thorough (when (eq test :derivation) '(:derivation))) ;; ;; _fix_me_ ;; ideally, score-item() should not have to look at anything from the ;; `result' relation but the result identifier; the following for now ;; includes the `surface' field, so we can perform a string similarity ;; comparison on generator outputs. (28-oct-04; oe) ;; (thorough (if (or trace *redwoods-score-similarities*) (cons :surface thorough) thorough)) (items (if (stringp data) (analyze (if spartanp gold data) :thorough (or thorough spartanp) :condition condition :score (if scorep data t) :scorep t :readerp (eq test :derivation)) data)) (gitems (if (stringp gold) (analyze gold :thorough thorough :condition condition :gold gold :readerp (eq test :derivation)) gold)) (aggregates (if analogon (aggregate-by-analogy items analogon) (aggregate items :format format))) (gaggregates (aggregate-by-analogy gitems aggregates :loosep t)) results) #+:debug (setf %items items %aggregates aggregates %gitems gitems %gaggregates gaggregates) (loop with nsimilarities = (length *redwoods-score-similarities*) with tnitems = 0 with tnscores = 0 with tlength = 0 with treadings = 0 with texact = 0 with tnear = 0 with tloose = 0 with ttsimilarities = (make-array nsimilarities :initial-element 0) with tnsimilarities = (make-array nsimilarities :initial-element 0) with tsuccesses = (and n (make-array n :initial-element 0)) with trandom = 0 for (id name . data) in aggregates for gaggregate = (when (equal (first (first gaggregates)) id) (pop gaggregates)) for gdata = (rest (rest gaggregate)) when gdata do (loop with anitems = 0 with anscores = 0 with alength = 0 with areadings = 0 with aexact = 0 with anear = 0 with aloose = 0 with atsimilarities = (make-array nsimilarities :initial-element 0) with ansimilarities = (make-array nsimilarities :initial-element 0) with asuccesses = (and n (make-array n :initial-element 0)) with arandom = 0 for item in data for i-id = (get-field :i-id item) for length = (get-field :i-length item) for readings = (get-field :readings item) for gitem = (loop for gitem = (first gdata) while (and gitem (< (get-field :i-id gitem) i-id)) do (pop gdata) finally (return (let ((i (get-field :i-id (first gdata)))) (when (and i (= i i-id)) (pop gdata))))) when gitem do (multiple-value-bind (i score loosep similarities) (score-item item gitem :test test :n n :loosep loosep) (when (and trace (open-stream-p trace) i) (format trace "[~a] <~,1f~@[ : ~{~,2f~^ ~}~]~@[ @ ~a~]> {~a} |~a|~%" i-id score (loop for similarity in similarities collect (second similarity)) (and (> i 0) i) (length (get-field :ranks gitem)) (get-field :i-input gitem)) (loop with granks = (get-field :ranks gitem) for i from 1 to n do (loop for grank in granks when (= (get-field :rank grank) i) do (format trace " < [~a] {~a} |~a|~%" (get-field :result-id grank) i (get-field :surface grank)))) (loop for rank in (get-field :ranks item) for score = (let ((foo (get-field :score rank))) (if (stringp foo) (read-from-string foo) foo)) do (format trace " > [~a] {~a} ~@[<~,6f>~] |~a|~%" (get-field :result-id rank) (get-field :rank rank) score (get-field :surface rank))) (format trace "~%") (force-output trace)) (incf anitems) ;; ;; _fix_me_ ;; rewrite the cond() below; it would seem most if not all of ;; its body can go into the when() below. (18-apr-06; oe) ;; (when i (incf arandom (divide 1 areadings))) (cond ((null i)) ((zerop i) (incf anscores) (incf alength length) (incf areadings readings) (loop for i from 0 for (tag top nbest) in similarities do (setf tag tag) (incf (aref atsimilarities i) top) (incf (aref ansimilarities i) nbest))) (t (incf anscores) (incf alength length) (incf areadings readings) (when (<= i n) (if (= i 1) (incf aexact score) (incf anear score))) (when loosep (incf aloose)) (loop for i from 0 for (tag top nbest) in similarities do (setf tag tag) (incf (aref atsimilarities i) top) (incf (aref ansimilarities i) nbest)) (when asuccesses (incf (aref asuccesses (- i 1)) score))))) finally (incf tnitems anitems) (incf tnscores anscores) (incf tlength alength) (incf treadings areadings) (incf texact aexact) (incf tnear anear) (incf tloose aloose) (loop for i from 0 for atsimilarity across atsimilarities do (incf (aref ttsimilarities i) atsimilarity)) (loop for i from 0 for ansimilarity across ansimilarities do (incf (aref tnsimilarities i) ansimilarity)) (loop for i from 0 for j across asuccesses do (incf (aref tsuccesses i) j)) (incf trandom arandom) (push (nconc (list id name) (pairlis '(:items :scores :i-length :analyses :exact :near :loose :successes :tsimilarities :nsimilarities :random) (list anitems anscores (divide alength anscores) (divide areadings anscores) aexact anear aloose asuccesses (loop for tag in *redwoods-score-similarities* for similarity across atsimilarities collect (cons tag (divide similarity anscores))) (loop for tag in *redwoods-score-similarities* for similarity across ansimilarities collect (cons tag (divide similarity anscores))) arandom))) results)) finally (push (nconc (list :total "Total") (pairlis '(:items :scores :i-length :analyses :exact :near :loose :successes :tsimilarities :nsimilarities :random) (list tnitems tnscores (divide tlength tnscores) (divide treadings tnscores) texact tnear tloose tsuccesses (loop for tag in *redwoods-score-similarities* for similarity across ttsimilarities collect (cons tag (divide similarity tnscores))) (loop for tag in *redwoods-score-similarities* for similarity across tnsimilarities collect (cons tag (divide similarity tnscores))) trandom))) results)) (when (eq test :derivation) (purge-profile-cache data :expiryp nil) (unless (equal data gold) (purge-profile-cache gold :expiryp nil))) results))) (defun summarize-classification (data gold &key ignore token-filter) ;; fixme: ;;take height for multiword cues, including non-adjacent cues. ;; fixme: rewrite to compare :cues / :scopes directly instead (or in ;; addition). maybe store 3 fields in scores, corresponding to ;; token-level, item-level and cue-level. and scope-level. (labels ((copy-list-by-iid (items1 items2) (loop for item2 in items2 for iid = (get-field :i-id item2) collect (loop for item1 = (car items1) if (= (get-field :i-id item1) iid) do (return item1) else do (setq items1 (cdr items1)))))) (let* ((items (if (stringp data);;fixme; scoring: (analyze gold :commentp t :tokensp t :score data) data)) (gitems (if (stringp gold) (analyze gold :commentp t :tokensp t);; :gold gold) gold)) (items (if ignore (loop for item in items unless (member (get-field :source item) ignore :test #'string=) collect item) items)) (gitems (if ignore (loop for item in gitems unless (member (get-field :source item) ignore :test #'string=) collect item) gitems))) ;; (gitems (copy-list gitems))) ;;fixme!! ask stephan about the aggregate stuff ;;; ;;;; fixme: ;;;; if we find x of the cues in a multiword cue of y words, give ourselves a credit of x/y. ;;;; count sentences for which there ;;;; ;;;; fixme: insert a :cues field in the (non-gold) items, and compute ;;;; scores by comparing these fields directly. ;;; (setq %items items %gitems gitems %data data %gold gold) ;;; (break) (loop with t_false_positives = 0 ;;Token level, Type I error with t_false_negatives = 0 ;;Token level, Type II error with t_true_positives = 0 ;;Token level positive match with t_true_negatives = 0 ;;Token level negative match with s_false_positives = 0 ;;Sentence level, Type I error with s_false_negatives = 0 ;;Sentence level, Type II error with s_true_positives = 0 ;;Sentence level positive match with s_true_negatives = 0 ;;Sentence level negative match with ncue-tokens = 0 ;; no. cue words with ncue-strings = 0 ;; no. cue strings with ncue-items = 0 ;; no. hedged items with nitems = 0 ;; no. items with ntokens = 0 ;; no. tokens for item in items for gitem in (copy-list-by-iid gitems items) ;;; for gcues = (get-field :cues gitem) ;;; for gscopes = (get-field :scopes gitem) ;;; for cues = (get-field :cues item) ;;; for scopes = (get-field :scopes item) ;;; for tokens = (get-field :i-tokens item) for tokens = (if token-filter (remove-if-not token-filter (get-field :i-tokens item)) (get-field :i-tokens item)) for cspans = (list-cue-spans gitem) for tspans = ;; = all token indicies within the spans of all cues (loop for (from to) in cspans append (spanned-tokens from to gitem :positions-only-p t)) for (positives negatives) = (loop for token in tokens for class = (get-field :class token) for start-end = (list (get-field :start token) (get-field :end token)) if (and (numberp class) (> class 0)) collect start-end into pos else collect start-end into neg finally (return (list pos neg))) ;;; for (pos_forms neg_forms) = ;;; (when error-stream ;;; (loop ;;; for token in tokens ;;; for class = (get-field :class token) ;;; for form = (get-field :form token) ;;; for start-end = (list (get-field :start token) ;;; (get-field :end token)) ;;; if (and (numberp class) (> class 0)) ;;; collect form into pos ;;; else collect form into neg ;;; finally (return (list pos neg)))) ;;; start counting when tokens do (incf nitems) (incf ntokens (length tokens)) ;;; sentence level counting: (if cspans (if positives (incf s_true_positives) (incf s_false_negatives)) (if positives (incf s_false_positives) (incf s_true_negatives))) ;;; token level counting: (loop for pos in positives ;; for i from 0 if (member pos tspans :test #'equal) do (incf t_true_positives) else do (incf t_false_positives)) ;;; (if error-stream ;;; (format error-stream .... (loop for neg in negatives if (member neg tspans :test #'equal) do (incf t_false_negatives) else do (incf t_true_negatives)) when cspans do (incf ncue-strings (length cspans)) (incf ncue-tokens (length tspans)) (incf ncue-items) finally (let* ((t_prec (* 100 (divide t_true_positives (+ t_true_positives t_false_positives)))) (t_rec (* 100 (divide t_true_positives (+ t_true_positives t_false_negatives)))) ;;; = (divide t_true_positives ncue-tokens) (t_acc (* 100 (divide (+ t_true_positives t_true_negatives) ntokens))) (t_f1 (* 2 (divide (* t_prec t_rec) (+ t_prec t_rec)))) (s_prec (* 100 (divide s_true_positives (+ s_true_positives s_false_positives)))) (s_rec (* 100 (divide s_true_positives (+ s_true_positives s_false_negatives)))) ;;; = (divide s_true_positives ncue-items) (s_acc (* 100 (divide (+ s_true_positives s_true_negatives) nitems))) (s_f1 (* 2 (divide (* s_prec s_rec) (+ s_prec s_rec))))) (return (pairlis (list :t_precision :t_recall :t_accuracy :t_f1 :s_precision :s_recall :s_accuracy :s_f1 :t_false_positives :t_false_negatives :t_true_positives :t_true_negatives :s_false_positives :s_false_negatives :s_true_positives :s_true_negatives :ncue-tokens :ncue-strings :ncue-items :nitems :ntokens) (list (float t_prec) (float t_rec) (float t_acc) (float t_f1) (float s_prec) (float s_rec) (float s_acc) (float s_f1) t_false_positives t_false_negatives t_true_positives t_true_negatives s_false_positives s_false_negatives s_true_positives s_true_negatives ncue-tokens ncue-strings ncue-items nitems ntokens)))))))) ;;;(mapcon #'(lambda (list) ;;; (unless (oddp (length list)) ;;; (list (list (first list) (second list))))) ;;; foo) ;; fixme; this needs to be done for weigh-result as well: do an ;; initial pass, augmenting cues with information about the number of ;; tokens that it spans and the correspnding span positions. this is ;; necessary because of the presence of multiword cues. ;; ;; when scoring: ;; count no hits within cues. this is necessary in order to compute the correct x/y. ;; ;;;(defun apply-cue-rules (items &key (add-scope t)) ;;; (let ((new-cues) ;;; (processed)) ;;; (labels ((ensure-class-1 (token) ;;; (unless (> (get-field :class token) 0) ;;; (setf (get-field :class token) 1))) ;;; (ensure-class-0 (token) ;;; (unless (<= (get-field :class token) 0) ;;; (setf (get-field :class token) 0))) ;;; (add-to-processed (tokens) ;;; (loop for token in tokens ;;; do (push (list (get-field :start token) ;;; (get-field :end token)) ;;; processed))) ;;; (processed-p (token) ;;; (member (list (get-field :start token) ;;; (get-field :end token)) ;;; processed :test #'equal)) ;;; (add-cue (id &rest span) ;;; (push (list (cons :id id) ;;; (cons :span span)) ;;; new-cues)) ;;; (stem= (token string) ;;; (string= (get-field :stem token) string)) ;;; (form= (token string) ;;; (string= (string-downcase (get-field :stem token)) string))) ;;; ;;; (loop ;;; for item in items ;;; for tokens = (get-field :i-tokens item) ;;; for cues = (get-field *conll-cues* item) ;;; when cues;;fixme; also want to add cues in "all-negative" contexts? ;;; do (loop ;;; for token in (get-field :i-tokens item) ;;; for i from 0 ;;; with id = 0 ;;; for cue-p = (> (get-field :class token) 0) ;;; for start = (get-field :start token) ;;; for end = (get-field :end token) ;;; ;;; "incidate that" ;;; unless (processed-p token) ;;; if (and cue-p (stem= token "indicate")) ;;; do (let ((next (nth (+ i 1) tokens))) ;;; (when (stem= next "that") ;;; (add-cue id start (get-field :end next)) ;;; (add-to-processed (list token next)) ;;; (ensure-class-1 next) ;;; (incf id))) ;;; ;;; "whether or not" ;;; else if (and cue-p (stem= token "whether")) ;;; do (let ((next1 (nth (+ i 1) tokens)) ;;; (next2 (nth (+ i 2) tokens))) ;;; (when (and (stem= next1 "or") ;;; (stem= next2 "not")) ;;; (add-cue id start (get-field :end next2)) ;;; (add-to-processed (list token next1 next2)) ;;; (ensure-class-1 next1) ;;; (ensure-class-1 next2) ;;; (incf id))) ;;; ;;; "may(,) or may not" ;;; else if (and cue-p (stem= token "may")) ;;; do (multiple-value-bind (from to) ;;; (ppcre:scan "[,]? or may not" ;;; (get-field :i-input item) :start end) ;;; (when from ;;; (let ((tokens (spanned-tokens from to item))) ;;; (mapc #'ensure-class-1 tokens) ;;; (add-cue id start to) ;;; (add-to-processed (cons token tokens)) ;;; (incf id)))) ;;;;;; ;;; " known/clear/evident/understood" ;;; else if (and cue-p (stem= token "not")) ;;; do (let ((next (nth (+ i 1) tokens))) ;;; (when (or (stem= next "know") ;;; (stem= next "clear") ;;; (stem= next "evident") ;;; (stem= next "understand") ;;; (stem= next "exclude")) ;;; (add-cue id start (get-field :end next)) ;;; (add-to-processed (list token next)) ;;; (ensure-class-1 next) ;;; (incf id))) ;;; ;;; "not ///" ;;; else if (and cue-p (or (stem= token "know") ;;; (stem= token "clear") ;;; (stem= token "evident") ;;; (stem= token "understand") ;;; (stem= token "exclude"))) ;;; do (let ((prev (and (> i 0) (nth (- i 1) tokens)))) ;;; (when (and (stem= prev "not") ;;; (not (processed-p prev))) ;;; (add-cue id (get-field :start prev) end) ;;; (add-to-processed (list prev token)) ;;; (ensure-class-1 prev) ;;; (incf id))) ;;; ;;; " evidence/proof/guarantee" ;;; else if (and cue-p (stem= token "no")) ;;; do (let ((next (nth (+ i 1) tokens))) ;;; (when (or (stem= next "evidence") ;;; (stem= next "proof") ;;; (stem= next "guarantee")) ;;; (add-cue id start (get-field :end next)) ;;; (add-to-processed (list token next)) ;;; (ensure-class-1 next) ;;; (incf id))) ;;; ;;; "no //" ;;; else if (and cue-p (or (stem= token "evidence") ;;; (stem= token "proof") ;;; (stem= token "guarantee"))) ;;; do (let ((prev (and (> i 0) (nth (- i 1) tokens)))) ;;; (when (and (stem= prev "no") ;;; (not (processed-p prev))) ;;; (add-cue id (get-field :start prev) end) ;;; (add-to-processed (list prev token)) ;;; (ensure-class-1 prev) ;;; (incf id))) ;;;;;; ;;; " exclude" ;;; else if (and cue-p (form= token "cannot")) ;;; do (let ((next (nth (+ i 1) tokens))) ;;; (when (stem= next "exclude") ;;; (add-cue id start (get-field :end next)) ;;; (add-to-processed (list token next)) ;;; (ensure-class-1 next) ;;; (incf id))) ;;; ;;; "cannot " ;;; else if (and cue-p (stem= token "exclude")) ;;; do (let ((prev (and (> i 0) (nth (- i 1) tokens)))) ;;; (when (and (form= prev "cannot") ;;; (not (processed-p prev))) ;;; (add-cue id (get-field :start prev) end) ;;; (add-to-processed (list prev token)) ;;; (ensure-class-1 prev) ;;; (incf id))) ;;; ;;; "raise the .* possibility / question" ;;; else if (and cue-p (stem= token "raise")) ;;; do (let* ((right-pos ;;; (loop ;;; for token in (subseq tokens (+ i 1)) ;;; for j from (+ i 1) ;;; when (or (stem= token "possibility") ;;; (stem= token "question") ;;; (stem= token "issue") ;;; (stem= token "hypothesis")) ;;; return j)) ;;; (right (and right-pos (nth right-pos tokens)))) ;;; (if (and right (< (- right-pos i) 6)) ;;fixme; skip this limitation? ;;; (progn ;;; (add-cue id start (get-field :end right)) ;;; (add-to-processed (subseq tokens i (+ right-pos 1))) ;;; (loop for token in (subseq tokens (+ i 1) (+ right-pos 1)) ;;; do (ensure-class-1 token)) ;;; (incf id)) ;;; (progn ;;; (ensure-class-0 token) ;;; (add-to-processed (list token))))) ;;(probably a false positive!) ;;; ;;; "either / or" ;;; else if (and cue-p (stem= token "either")) ;;; do (let* ((or-pos (loop ;;; for token in (subseq tokens (+ i 1)) ;;; for j from (+ i 1) ;;; when (stem= token "or") ;;; return j)) ;;; (or-token (and or-pos (nth or-pos tokens)))) ;;; (when or-token ;;; (add-cue id start end ;;; (get-field :start or-token) ;;; (get-field :end or-token)) ;;; (add-to-processed (list token or-token)) ;;; (ensure-class-1 or-token) ;;; (incf id))) ;;; ;;; any cue without a special rule: ;;; when (and (> (get-field :class token) 0) ;;; ;;check if cue again since we might have changed class ;;; (not (processed-p token))) ;;; do (add-cue id start end) ;;; (add-to-processed (list token)) ;;; (incf id) ;;; finally ;;; (if (test-field *conll-cues* item) ;;; (setf (get-field *conll-cues* item) new-cues) ;;; (nconc item (acons *conll-cues* new-cues nil))) ;;; (if add-scope ;;; (insert-default-scopes item) ;;; (when (test-field *conll-scopes* item) ;;; (setf (get-field *conll-scopes* item) nil))) ;;; ;;; reset: ;;; (setq processed nil) ;;; (setq new-cues nil))))) ;;; items) ;; now doing this in re-classify profile instead: ;;;(defun apply-scope-system (items &optional (system *conll-scope-system*)) ;;; (case system ;;; (:rules (apply-scope-rules items)) ;;; (:ranker (apply-scope-ranker items)) ;;; (:default ;;; (loop ;;; for item in items ;;; when (get-field *conll-cues* item) ;;; do (insert-default-scopes item)))) ;;; items) (defparameter *councill-hack* nil) (defun apply-scope-ranker (items &key ;;test-fold-iids (dep-features-p t) (fall-back :rules)) ;; or :default (ensure-numerical-ids items) ;;; (when (and dep-features-p (eq back-off :rules)) ;;; (apply-scope-rules items)) ;;; (setq with-rules-p nil) (loop for item in items for cues = (get-field *conll-cues* item) when cues do (let* ((iid (get-field :i-id item)) ;;; (derivation (get-field :derivation ;;; (first (get-field :results item)))) (cuefile (format nil "/tmp/.cues.~a.~a" (current-user) (current-pid))) (scopefile (format nil "/tmp/.scopes.~a.~a" (current-user) (current-pid))) (script (cond (*councill-hack* "~jread/workspace/bioscope/extract_scope_obsp5") ((member (get-field :source item) '("bse" "obsp" "obsr") :test #'equalp) "~jread/workspace/bioscope/extract_scope_final") ((string= "obsa" (get-field :source item)) "~jread/workspace/bioscope/extract_scope_cvat") (t "~jread/workspace/bioscope/extract_scope"))) (n-m (if *councill-hack* "-n 25 -m 15" (if (eq *conll-scopes* :nscopes) (if dep-features-p "-n 15 -m 5" "-n 10 -m 1") (if dep-features-p "-n 5 -m 20" "-n 1 -m 3")))) (options (format nil "~:[~;-t negation ~]~a -i ~a -o ~a ~a" (eq *conll-scopes* :nscopes) n-m (current-pid) scopefile cuefile)) (command (format nil "~a ~a" script options)) (rule-scoped-item (cond ((and dep-features-p (not (eq fall-back :rules))) (first (apply-scope-rules (list (copy-graph item))))) ((and dep-features-p (eq fall-back :rules)) (first (apply-scope-rules (list item)))))) (data (if dep-features-p (list (list iid 'null ;;; derivation (mapcar #'(lambda (cue) (cons (get-field :id cue) (get-field :span cue))) (get-field *conll-cues* rule-scoped-item)) (mapcar #'(lambda (scope) (cons (get-field :id scope) (get-field :span scope))) (get-field *conll-scopes* rule-scoped-item)))) (list (list iid 'null ;;; derivation (mapcar #'(lambda (cue) (cons (get-field :id cue) (get-field :span cue))) (get-field *conll-cues* item))))))) ;; fall back on default scope when we don't have a derivation. ;; otherwise, apply statistical subtree ranker: ;;; (if (null derivation) ;;; (progn ;;; (format t "~&[~a] apply-scope-ranker(): ~ ;;; no derivation for item ~a (using default scope).~%" ;;; (current-time :long :short) iid) ;;; (insert-default-scopes item)) ;;; (progn ;;; write the cue file (with-open-file (out cuefile :direction :output :if-exists :supersede) (write data :stream out :pretty nil)) ;;; apply the ranker script (format t "~&[~a] apply-scope-ranker(): ~ calling for item ~a: '~a'.~%" (current-time :long :short) iid command) ;; debugging (when (and command (cl-fad:file-exists-p cuefile)) (run-process command :wait t :output scopefile :if-output-exists :supersede)) ;;; read the new scope file, and insert scopes in items. (with-open-file (in scopefile :direction :input) (let* ((scopes) (line (first (read in nil nil))) (iid2 (pop line))) ;;do some sanity checking? ;; (derviation (pop line))) ;;; (format t "~&[~a] apply-scope-ranker(): read: ~a~%" ;;; (current-time :long :short) line) (format t "~&[~a] apply-scope-ranker(): ~ spans for item ~a: ~a~%" (current-time :long :short) iid line) (unless (= iid iid2) (error "~&apply-scope-ranker(): non-matching iids~ (in; ~a, out;~a)~%" iid iid2)) (if (eql :no-parse (first line)) ;; apply fall-back scopes if no derivation: (case fall-back (:rules (unless dep-features-p ;; allready inserted? (apply-scope-rules (list item)))) (:default (insert-default-scopes item))) ;;; (when (and (not with-rules-p) ;;; (test-field *conll-scopes* item)) ;;; (setf (get-field *conll-scopes* item) nil)) ;; insert ranker scopes: (loop with ids = nil for scope in line for id = (pop scope) unless (member id ids :test #'eql) do (push (list (cons :id id) (cons :span scope)) scopes) (push id ids) finally (if (test-field *conll-scopes* item) (setf (get-field *conll-scopes* item) scopes) (nconc item (acons *conll-scopes* scopes nil)))))))) ;;;; (format t "~%apply-scope-ranker(): about to check item ~a~%" (get-field :i-id item)) ;; debug (scope-sanity-check item)) ;;; (setq %items items) ;;; (break) items) (defun apply-scope-rules (items &key trace) ;; (ensure-numerical-ids items) (loop for item in items for cues = (get-field *conll-cues* item) when cues ;;fixme: consider writing a macro `with-open-gz-file()' do (let* ((source (get-field :source item)) (depdir (directory-namestring (profile-find-addon source :type :malt+xle))) (iid (get-field :i-id item)) (depfile (make-pathname :directory depdir :name (format nil "~a" iid))) (depfile-gz (make-pathname :directory depdir :name (format nil "~a" iid) :type "gz")) (cuefile (format nil "/tmp/.depscopes.cues.~a.~a" (current-user) (current-pid))) (scopefile (format nil "/tmp/.depscopes.out.~a.~a" (current-user) (current-pid))) (command ;;fixme: script name ;; (format nil "perl /logon/erikve/documents/speculation/coling10/merged_depScopes.perl -i ~a > ~a" ;; (format nil "perl /logon/erikve/documents/speculation/coling10/depScopes.perl -i ~a > ~a" ;;; (format nil "perl /logon/erikve/documents/speculation/conll10/depScopes.perl -i ~a > ~a" (case *conll-scopes* (:hscopes (format nil "perl /ltg/erikve/documents/speculation/conll10/depScopes.perl -i ~a > ~a" ;;; (format nil "perl /logon/erikve/documents/speculation/coling10/merged_new-brackets_depScopes.perl -i ~a > ~a" cuefile scopefile)) (:nscopes (format nil "perl /ltg/erikve/documents/speculation/cl/depScopes_negation.perl -i ~a > ~a" cuefile scopefile)))) ;; (format nil "perl ~a/uio/bioscope/src/depScopes.perl -i ~a > ~a" ;; root cuefile scopefile)) (unzipped-p nil)) (when (and (not (cl-fad:file-exists-p depfile-gz)) (not (cl-fad:file-exists-p depfile))) (error "apply-scope-rules(): no `depfile(.gz)' exists for item `~a'." item) (return-from apply-scope-rules)) (when (cl-fad:file-exists-p depfile-gz) (run-process (format nil "gunzip ~a" (namestring depfile-gz)) :wait t) (setq unzipped-p t)) ;;; index cue-ids on tokens spans: ;;; (let ((index (make-hash-table :test #'equal :size 5))) ;;; (loop ;;; for cue in cues ;;; for cid = (get-field :id cue) ;;; for cspans = (list-to-pairs (get-field :span cue)) ;;; for tspans = ;;; (loop for (from to) in cspans ;;; append (spanned-tokens from to item ;;; :positions-only-p t)) ;;; do (loop for pair in tspans ;;; do (setf (gethash pair index) cid))) (let ((index (make-hash-table :test #'equal :size 5)) ;; map can be used for retrieving the original id strings (map (make-hash-table :size 5))) (loop with map-id = -1 for cue in cues for cid = (if (integerp (get-field :id cue)) (get-field :id cue) ;; map string ids to numerical ids: (let* ((id (get-field :id cue)) (new-id) (scope (find-if #'(lambda (x) (equalp (get-field :id x) id)) (get-field *conll-scopes* item)))) (incf map-id) (setf (gethash map-id map) id) (setq new-id map-id) (when scope (setf (get-field :id scope) new-id)) (setf (get-field :id cue) new-id) new-id)) for cspans = (list-to-pairs (get-field :span cue)) for tspans = (loop for (from to) in cspans append (spanned-tokens from to item :positions-only-p t)) do (loop for pair in tspans do (setf (gethash pair index) cid))) ;;; read the dep. file, while writing the cue file (with-open-file (out cuefile :direction :output :if-exists :supersede) (with-open-file (in depfile :direction :input) (loop for line = (read-line in nil nil) with span = nil while line do (ppcre:register-groups-bind (start end) ("^(\\d+)\\s+(\\d+).*$" line) (setq span (list (parse-integer start) (parse-integer end))) (write-string line out) (when (gethash span index) (write-char #\Tab out) (write-char #\C out) (write-char #\: out) (write (gethash span index) :stream out)) (terpri out))))) (when unzipped-p (run-process (format nil "gzip -9 ~a" (namestring depfile)) :wait t)) ;;; apply the rule script (when (and command (cl-fad:file-exists-p cuefile)) (run-process command :wait t :output scopefile :if-output-exists :supersede)) ;;; read the new scope file, and insert scopes in items. (with-open-file (in scopefile :direction :input) (loop for line = (read-line in nil nil) with span = nil with id = nil with ids = nil with scopes = nil while line do (ppcre:register-groups-bind (cue-id start end) ("^Cue\\s+(\\d+):\\s+(\\d+)\\s+(\\d+)" line) (setq span (list (parse-integer start) (parse-integer end))) (setq id (parse-integer cue-id)) (unless (member id ids :test #'=) (push (list (cons :id id) (cons :span span)) scopes) (push id ids))) finally ;; fixme: should we do anything with prior scopes? (if (test-field *conll-scopes* item) (setf (get-field *conll-scopes* item) scopes) (nconc item (acons *conll-scopes* scopes nil))) (scope-sanity-check item))) ;; code to produce lists of cue and scope spans for ;; jonathon's ranker, using the original BS string ids: (when trace (write (list (cons (get-field :i-id item) (list 'null ;;; dummy / derivation (mapcar #'(lambda (cue) (cons (gethash (get-field :id cue) map) (get-field :span cue))) (get-field *conll-cues* item)) (mapcar #'(lambda (scope) (cons (gethash (get-field :id scope) map) (get-field :span scope))) (get-field *conll-scopes* item))))) :stream trace :pretty nil) (terpri trace))))) ;;; (setq %items items) ;;; (break) items) (defun scope-sanity-check (item) ;;;; (format t "~%entering scope-sanity-check()!~%") ;; debug (let* ((tokens (get-field :i-tokens item)) (scopes (copy-list (get-field *conll-scopes* item))) (n (length tokens)) (last (nth (- n 1) tokens)) (max-span-pos (if (member (get-field :form last) '("." "!" "?" ";" ":") :test #'equalp) (get-field :start last) (get-field :end last))) (new-scopes nil) (insane-p nil)) ;; first do a quick fix for cues missing scopes: (loop for cue in (get-field *conll-cues* item) for cue-id = (get-field :id cue) unless (find-if #'(lambda (scope) (equalp (get-field :id scope) cue-id)) scopes) do (push (list (cons :id cue-id) (cons :span (list (first (get-field :span cue)) max-span-pos))) scopes) (setq insane-p t) (format t "~&scope-sanity-check() [0]~%")) ;;; 0 (setq scopes (stable-sort scopes #'< :key #'(lambda (x) (first (get-field :span x))))) ;;; (if (and (get-field *conll-cues* item) ;;; (not (= (length (get-field *conll-scopes* item)) ;;; (length (get-field *conll-cues* item))))) ;; quick fix for cues missing scopes. ;;; (progn ;;; (format t "~&scope-sanity-check() [0] item ~a: #cues=~a, #scopes=~a~%" ;;; (get-field :i-id item) ;;; 0 ;;; (length (get-field *conll-cues* item)) ;;; (length (get-field *conll-scopes* item))) ;;; (insert-default-scopes item)) ;;; fixme: add check for scope-less cues... ;;; (if (and (get-field *conll-cues* item) ;;; (null scopes)) ;;; (loop for cue in (get-field *conll-cues* item) ;;; (setf (get-field *conll-scopes* item) ;;; (list (cons :id id) (cons :span (list start end))) (loop with prev-end with prev-start with cues = (get-field *conll-cues* item) for scope in scopes for id = (get-field :id scope) for cue = (find-if #'(lambda (x) (equalp (get-field :id x) id)) cues) for start = (first (get-field :span scope)) for end = (second (get-field :span scope)) ;;; for contains-cue-p = ;;; (and (<= start (first (get-field :span cue))) ;;; (>= end (second (get-field :span cue)))) for crossed-cue = (find-if #'(lambda (cue) (and (> start (first (get-field :span cue))) (< start (second (get-field :span cue))))) cues) if crossed-cue do (setq insane-p t) (format t "~&scope-sanity-check() [1]~%") ;;; 1 (setf start (1+ (second (get-field :span crossed-cue)))) if (and prev-end ;; crossing boundaries with other scope? (< start prev-end) (> end prev-end) (> start prev-start)) ;;; --> just added... check/fixme do (setq insane-p t) (format t "~&scope-sanity-check() [2]~%") ;;; 2 (format t "~&scope-sanity-check() sorted scopes: ~a.~%" scopes) (format t "~&scope-sanity-check() prev-start: ~a, prev-end: ~a, start: ~a, end: ~a.~%" prev-start prev-end start end) (setf (second (get-field :span (first new-scopes))) max-span-pos) if (not (<= start (first (get-field :span cue))));;doesn't contain cue do (setq insane-p t) (format t "~&scope-sanity-check() [3]~%") ;;; 3 (setq start (first (get-field :span cue))) ;;; if (not (>= end (second (get-field :span cue)))) ;;doesn't contain cue if (not (>= end (first (last (get-field :span cue))))) ;;doesn't contain cue do (setq insane-p t) (format t "~&scope-sanity-check() [4]~%") ;;; 4 (setq end max-span-pos) if (> end max-span-pos) ;;; end of span beyond maximum do (setq insane-p t) (format t "~&scope-sanity-check() [5]~%") ;;; 5 (setq end max-span-pos) ;;always do (push (list (cons :id id) (cons :span (list start end))) new-scopes) (setq prev-end end) (setq prev-start start) ;; fixme: should we do anything with prior scopes? finally (when insane-p (format t "~&[~a] scope-sanity-check(): ~ fired for item ~a. New scopes: ~a~%" (current-time :long :short) (get-field :i-id item) new-scopes) (setf (get-field *conll-scopes* item) new-scopes)))) item) (defun insert-default-scopes (item) "Inserts default (ie. maximal) scopes for all cues." ;;; (format t "~&insert-default-scopes(): item ~a~%" ;;; (get-field :i-id item)) ;;;debug (let* ((tokens (get-field :i-tokens item)) (spans (loop for cue in (get-field *conll-cues* item) collect (list (first (get-field :span cue)) (get-field :id cue)))) (n (length tokens)) (last (nth (- n 1) tokens)) (max-span-pos (if (member (get-field :form last) '("." "!" "?" ";" ":") :test #'equalp) (get-field :start last) (get-field :end last)))) (loop for (from id) in spans for span = (list from max-span-pos) collect (list (cons :id id) (cons :span span)) into tmp finally (if (test-field *conll-scopes* item) (setf (get-field *conll-scopes* item) tmp) (nconc item (acons *conll-scopes* tmp nil))))) nil) (defun insert-cues (item) "Inserts cues based on token classification." (let* ((tokens (get-field :i-tokens item)) (spans (loop for token in tokens when (> (get-field :class token) 0) collect (list (get-field :start token) (get-field :end token))))) (loop for span in spans for id from 0 collect (list (cons :id id) (cons :span span)) into tmp finally (if (test-field *conll-cues* item) (setf (get-field *conll-cues* item) tmp) (nconc item (acons *conll-cues* tmp nil))))) nil) ;;fixme; remove all calls to the following, swithing to the funcs above instead!! (defun insert-cues-and-scopes (item) "Inserts cues based on token classification. Also adds default scopes." (let* ((tokens (get-field :i-tokens item)) ;;; fixme; this could be the place to apply post-processing (spans (loop for token in tokens when (> (get-field :class token) 0) collect (list (get-field :start token) (get-field :end token)))) (n (length tokens)) (last (nth (- n 1) tokens)) (max-span-pos (if (member (get-field :form last) '("." "!" "?" ";" ":") :test #'equalp) (get-field :start last) (get-field :end last)))) (if spans (loop for cspan in spans for sspan = (list (first cspan) max-span-pos) ;;default scope for now for id from 0 collect (list (cons :id id) (cons :span cspan)) into c-tmp collect (list (cons :id id) (cons :span sspan)) into s-tmp finally (if (test-field *conll-cues* item) (setf (get-field *conll-cues* item) c-tmp) (nconc item (acons *conll-cues* c-tmp nil))) (if (test-field *conll-scopes* item) (setf (get-field *conll-scopes* item) s-tmp) (nconc item (acons *conll-scopes* s-tmp nil)))) ;; if we haven't found any cues, make sure we forget everything ;; we might know about cues / scopes from before: (progn (if (get-field *conll-cues* item) (setf (get-field *conll-cues* item) nil)) (if (get-field *conll-scopes* item) (setf (get-field *conll-scopes* item) nil))))) nil) (defun list-to-pairs (list) (when list (cons (list (car list) (cadr list)) (list-to-pairs (cddr list))))) (defun list-cue-spans (item &key (field *conll-cues*)) (loop for foo in (get-field field item) append (list-to-pairs (get-field :span foo)))) ;;;(defun list-cue-spans (item &key (field *conll-cues*)) ;;; (loop with spans ;;; for foo in (get-field field item) ;;; for list = (get-field :span foo) ;;; if (> (length list) 2) do ;;In case of multiword cues ;;; (loop while list ;;of non-continuous substrings. ;;; do (push (list (pop list) (pop list)) ;;; spans)) ;;; else do (push list spans) ;;; finally (return (nreverse spans)))) ;;; fixme; rewrite the above (inline) to something like (mapcar #'list-to-pairs .. ;;;; (mapcan #'(lambda (x) ;;; (mapcon #'(lambda (list) ;;; (unless (oddp (length list)) ;;; (list (list (first list) ;;; (second list))))) x)) ;;; (mapcar #'(lambda (cue) ;;; (get-field :span cue)) ;;; (get-field *conll-cues* item)))) (defun spanned-tokens (from to item &key positions-only-p) "Returns the list of start-end positions of all tokens within the given span." (loop with token-spans = nil for token in (get-field :i-tokens item) for start = (get-field :start token) for end = (get-field :end token) when (and (>= start from) (<= end to)) do (push (if positions-only-p (list start end) token) token-spans) finally (return token-spans))) (defun write-classification-scores (item target cache) (let ((p-id (or (get-field :parse-id item) (get-field :i-id item)))) (loop ;;write token scores (fixme: consider ditching) with tokens = (if (functionp *redwoods-token-filter*) (remove-if-not *redwoods-token-filter* (get-field :i-tokens item)) (get-field :i-tokens item)) for token in tokens for score = (get-field :class token) ;;fixme; adapt this to multiclass. ;;fixme": make robust gold profiles ;; (where the :class field isn't present)! for class = (if (<= score 0) -1 +1) for from = (get-field :start token) for to = (get-field :end token) do (write-score target (pairlis '(:parse-id :result-id :score-id :score-start :score-end :rank :score :learner) (list p-id 0 -1 ;dummy score+res.ids from to class score "token")) :cache cache)) (loop ;;write cue scores for cue in (get-field *conll-cues* item) for id = (get-field :id cue) for pairs = (list-to-pairs (get-field :span cue)) do (loop for (from to) in pairs do (write-score target (pairlis '(:parse-id :result-id :score-id :score-start :score-end :rank :score :learner) (list p-id 0 id ;;dummy r-id from to 1 1 "cue")) :cache cache))) (loop ;;write scope scores for scope in (get-field *conll-scopes* item) for id = (get-field :id scope) for pairs = (list-to-pairs (get-field :span scope)) do (loop for (from to) in pairs do (write-score target (pairlis '(:parse-id :result-id :score-id :score-start :score-end :rank :score :learner) (list p-id 0 id ;;dummy r-id from to 1 1 "scope")) :cache cache))) nil)) (defun score-item (item gold &key test (n 1) (loosep t) errorp (task *redwoods-task*)) #+:debug (setf %item% item %gold% gold) (case task (:classify (let* ((tokens (get-field :i-tokens item)) (cspans (list-cue-spans gold)) (tspans (loop for (from to) in cspans append (spanned-tokens from to gold :positions-only-p t))) ;; compute token-level scores (token-scores (loop for token in tokens for class = (get-field :class token) for span = (list (get-field :start token) (get-field :end token)) for positivep = (member span tspans :test #'equal) collect (if (or (and (> class 0) positivep) ;;true positive (and (<= class 0) (not positivep))) ;;true negative 1 0))) ;; compute item-level score (ipositivep (some #'(lambda (token) (> (get-field :class token) 0)) tokens)) (item-score (if (or (and cspans ipositivep) ;;true positive (and (not cspans) (not ipositivep))) ;;true negative 1 0))) (values item-score token-scores (score-spans gold item)))) (:rank (let ((ranks (get-field :ranks item)) (granks (get-field :ranks gold)) (test (cond ((functionp test) test) ((or (null test) (eq test :id)) #'(lambda (old new) (let ((foo (get-field :result-id old)) (bar (get-field :result-id new))) (and foo bar (= foo bar))))) ((eq test :derivation) #'(lambda (old new) (let ((foo (get-field :derivation old)) (bar (get-field :derivation new))) (and foo bar (derivation-equal foo bar)))))))) ;; ;; check calling context: both the test item and ground truth needs to ;; come with ranks; all ranks in the ground truth are 1 (since they ;; came from annotations in a treebank); and unless .loosep. is on, ;; there must not be more than one gold target. ;; (cond ((or (null ranks) (null granks)) nil) ((loop for grank in granks for rank = (get-field :rank grank) thereis (or (not (integerp rank)) (not (= rank 1)))) nil) ((and (rest granks) (null loosep)) nil) ;; ;; now do up to three nested searches, aiming to find the best .match. ;; for one of the gold targets, i.e. the one with the smallest rank ;; (when ranking within an n-best beam of more than 1 is enabled) or ;; otherwise the one with the smallest set of ties. for each .match. ;; find the set of results that the model put at a lower or the same ;; rank, ignoring the ones that are a gold target themselves. ;; (t (loop with result = nil with best = nil for grank in granks for match = (loop for rank in ranks for i = (get-field :rank rank) while (or (null n) errorp (<= i n)) thereis (and (funcall test rank grank) rank)) for i = (get-field :rank match) for matches = (when i (loop for rank in ranks for j = (get-field :rank rank) while (<= j i) when (and (= i j) (not (loop for grank in granks thereis (funcall test rank grank)))) collect rank)) for errors = (when errorp (if i (loop for rank in ranks for j = (get-field :rank rank) while (< j i) collect rank) ranks)) when (and (numberp i) (or (null best) (< i (get-field :rank best)) (and (= i (get-field :rank best)) (< (length matches) (length result))))) do (setf best match) (setf result matches) finally (let ((similarity (when *redwoods-score-similarities* (loop with hooks = *redwoods-score-similarities* with size = (length hooks) with tscores = (make-array size :initial-element 0) with nscores = (make-array size :initial-element 0) with ranks ;; ;; _fix_me_ ;; we would like to assume that :ranks entries ;; are actually full results, but somehow that ;; appears to not always be true? ;; (17-feb-06; oe & erik) ;; --- what we used to do here was plain wrong, ;; in the sense that we ended up aligning ;; arbitrary ranks and results. go back to the ;; original code, and when we re-discover the ;; problem of missing fields in :ranks tuples, ;; work out what goes wrong. (18-apr-06; oe) ;; = (loop for rank in ranks for i = (get-field :rank rank) while (<= i n) collect rank) with granks = (loop for rank in granks while (= (get-field :rank rank) 1) collect rank) for rank in ranks do (loop for grank in granks do (loop for i from 0 for tag in hooks for score = (string-similarity rank grank :type tag) when (numberp score) do (setf (aref nscores i) (max (aref nscores i) score)) (when (= (get-field :rank rank) 1) (setf (aref tscores i) (max (aref tscores i) score))))) finally (return (loop for tag in hooks for top across tscores for nbest across nscores collect (list tag top nbest))))))) (return (values (if best (get-field :rank best) 0) (if best (divide 1 (+ (length result) 1)) 0) (rest granks) similarity errors match (delete match result))))))))))) (defun string-similarity (rank grank &key (type :bleu) scrub) (let ((*string-similarity-punctuation-characters* nil) (string (get-field :surface rank)) (gstring (get-field :surface grank))) (first (score-strings (list string) (list gstring) :type type :scrub scrub)))) (defun analyze-errors (data &optional (gold data) &key (condition *statistics-select-condition*) spartanp (scorep t) (n 1) test loosep file append (format :tcl) meter) (declare (ignore meter)) (let* ((errors (summarize-errors data gold :condition condition :spartanp spartanp :scorep scorep :n n :test test :loosep loosep)) (stream (create-output-stream file append))) (when (listp errors) (case format (:tcl (when *statistics-tcl-formats* (format stream *statistics-tcl-formats*)) (format stream "flags 0~%~ layout col def -m1 5 -r 1 -m2 5 -c black -j right~%~ layout row def -m1 5 -r 0 -m2 5 -c black -j center~%~ layout col 0 -m1 5 -r 2 -m2 5 -c black -j right~%~ layout col 1 -m1 5 -r 2 -m2 5 -c black -j right~%~ layout col 2 -m1 5 -r 2 -m2 5 -c black -j left~%~ layout col 4 -m1 5 -r 2 -m2 5 -c black -j right~%~ layout col 7 -m1 5 -r 2 -m2 5 -c black -j right~%~ layout row 0 -m1 5 -r 2 -m2 5 -c black -j center~%~ layout row 2 -m1 5 -r 2 -m2 5 -c black -j center~%") (format stream "cell 1 1 -contents {i-id} -format title~%~ region 1 1 2 1 -contents {i-id} ~ -format title -hor_justify center~%~ cell 1 2 -contents {i-input} -format title~%~ region 1 2 2 2 -contents {i-input} ~ -format title -hor_justify center~%~ cell 1 3 -contents {xxxx} -format title~%~ cell 1 4 -contents {xxxx} -format title~%~ region 1 3 2 4 -contents {readings} ~ -format title -hor_justify center~%") (format stream "region 1 5 1 7 -contents {scores} ~ -format title -hor_justify center~%~ cell 2 5 -contents {<} -format title~%~ cell 2 6 -contents {H(p)} -format title~%~ region 2 6 2 6 -contents {H(p)} ~ -format title -hor_justify center~%~ cell 2 7 -contents {=} -format title~%"))) (loop for (item gitem rank errors match others entropy) in errors for i-id = (get-field :i-id item) for i-input = (or (get-field :o-input item) (get-field :i-input item)) for greadings = (get-field :readings gitem) for readings = (get-field :readings item) for tag = (intern (gensym "") :keyword) with i = 3 when (zerop (mod (- i 2) 10)) do (case format (:tcl (format stream "layout row ~d -m1 5 -r 2 -m2 5 -c black -j center~%" i))) do ;; ;; _fix_me_ this creates a potential memory leak: as soon as the ;; window for this table is destroyed, there will be no further ;; reference to the (tag) symbols used to store data on the lisp ;; side. yet, the values associated with the symbol properties ;; will never become unbound. (16-feb-03) ;; (setf rank rank) (setf (get :source tag) gold) (setf (get :i-id tag) i-id) (setf (get :i-input tag) i-input) (setf (get :match tag) match) (setf (get :errors tag) errors) (setf (get :others tag) others) (format stream "cell ~d 1 -contents {~a} -format data~%~ cell ~d 2 -contents {~a} -format data -key ~d -source {~a}~%~ cell ~d 3 -contents {~a} -format data~%~ cell ~d 4 -contents {~a} -format data~%~ cell ~d 5 -contents {~a} -format data~%~ cell ~d 6 -contents {~,4f} -format data ~ -action inspect -tag ~a~%~ cell ~d 7 -contents {~a} -format data~%" i i-id i i-input i-id data i greadings i readings i (length errors) i entropy tag i (length others)) (incf i) finally (when (> i 3) (case format (:tcl (format stream "layout row ~d -m1 5 -r 2 -m2 5 -c black -j center~%~ cell ~d 1 -contents {~a} -format total~%~ layout row ~d -m1 5 -r 2 -m2 5 -c black -j center~%" (- i 1) i (- i 3) i)))))) (when (or (stringp file) (stringp append)) (close stream)) (if (listp errors) 0 -1))) (defun summarize-errors (data &optional (gold data) &key (condition *statistics-select-condition*) spartanp (scorep t) (n 1) test loosep meter) (declare (ignore meter)) (let* ((thorough (when (eq test :derivation) '(:derivation))) (items (if (stringp data) (analyze (if spartanp gold data) :thorough (or thorough spartanp) :condition condition :score (if scorep data t) :readerp (eq test :derivation) :scorep t) data)) (items (sort (copy-list items) #'< :key #'(lambda (foo) (get-field :i-id foo)))) (gitems (if (stringp gold) (analyze gold :thorough thorough :condition condition :gold gold :readerp (eq test :derivation) :scorep t) gold)) (gitems (sort (copy-list gitems) #'< :key #'(lambda (foo) (get-field :i-id foo)))) result) (loop for item in items for gitem in gitems for i-id = (get-field :i-id item) for gi-id = (get-field :i-id gitem) when (or (not (numberp i-id)) (not (numberp gi-id)) (not (= i-id gi-id))) do (setf result :error) (return) else do (multiple-value-bind (rank foo bar baz errors match others) (score-item item gitem :test test :n n :loosep loosep :errorp t) (declare (ignore foo bar baz)) (when (and rank (or (zerop rank) (> rank n) others)) (let* ((ranks (when (or errors others) (get-field :ranks item))) (scores (loop for rank in ranks collect (get-field :score rank))) (probabilities (scores-to-probabilities scores)) (entropy (entropy probabilities))) (push (list item gitem rank errors match others entropy) result))))) (if (listp result) (nreverse result) result))) (defun analyze-agreement (data1 data2 &key (condition *statistics-select-condition*) (test '(:edge :discriminant)) file append (format :tcl) meter) (let* ((stream (create-output-stream file append)) (aggregates (summarize-agreement data1 data2 :condition condition :test test :format format :meter meter)) (aggregates (nreverse aggregates)) (alabel (if (eq *statistics-aggregate-dimension* :phenomena) "Phenomenon" "Aggregate")) (caption (format nil "(generated by ~a at ~a)" *tsdb-name* (current-time :long :pretty))) (ncolumns 12) (i 2)) (declare (ignore caption)) #+:debug (setf %aggregates aggregates) (case format (:tcl (format stream *statistics-tcl-formats*) (format stream "flags 0~%~ layout col def -m1 5 -r 1 -m2 5 -c black -j right~%~ layout row def -m1 5 -r 0 -m2 5 -c black -j center~%~ layout col 0 -m1 5 -r 2 -m2 5 -c black -j left~%~ layout col 1 -m1 5 -r 2 -m2 5 -c black -j left~%~ layout col ~d -m1 5 -r 2 -m2 5 -c black -j right~%~ layout row 0 -m1 5 -r 2 -m2 5 -c black -j center~%~ layout row 1 -m1 5 -r 2 -m2 5 -c black -j center~%" ncolumns) (format stream "cell 1 1 -contents {~a} -format title~%~ cell 1 2 -contents \"total\\nitems\\n#\" -format title~%~ cell 1 3 -contents \"word\\nstring\\n\\330\" -format title~%~ cell 1 4 -contents \"parser\\nanalyses\\n\\330\" -format title~%~ cell 1 5 -contents \"random\\nbase\\n%\" -format title~%~ cell 1 6 -contents \"exact\\nmatch\\n%\" -format title~%~ cell 1 7 -contents \"distinct\\nnodes\\n\\330\" -format title~%~ cell 1 8 -contents \"node\\nmatch\\n%\" -format title~%~ cell 1 9 -contents \"distinct\\nlabels\\n\\330\" -format title~%~ cell 1 10 -contents \"label\\nmatch\\n%\" -format title~%~ cell 1 11 -contents \"distinct\\nbrackets\\n\\330\" -format title~%~ cell 1 12 -contents \"bracket\\nmatch\\n%\" -format title~%" alabel))) (loop for (id foo . data) in aggregates for name = (if (eq format :latex) (latexify-string foo) foo) for items = (get-field :items data) for length = (get-field+ :i-length data 0) for analyses = (get-field :analyses data) for random = (get-field :random data) for exact = (get-field :exact data) for nodes1 = (get-field :nodes1 data) for nodes2 = (get-field :nodes2 data) for ids = (get-field :ids data) for labels = (get-field :labels data) for brackets = (get-field :brackets data) for cids = (get-field :cids data) for clabels = (get-field :clabels data) for cbrackets = (get-field :cbrackets data) unless (or (smember id '(:all :total)) (zerop items)) do (case format (:tcl (format stream "cell ~d 1 -contents {~a} -format aggregate~%~ cell ~d 2 -contents ~d -format data~%~ cell ~d 3 -contents ~,2f -format data~%~ cell ~d 4 -contents ~,2f -format data~%~ cell ~d 5 -contents ~,2f -format data~%~ cell ~d 6 -contents ~,2f -format data~%~ cell ~d 7 -contents ~,2f -format data~%~ cell ~d 8 -contents ~,2f -format data~%~ cell ~d 9 -contents ~,2f -format data~%~ cell ~d 10 -contents ~,2f -format data~%~ cell ~d 11 -contents ~,2f -format data~%~ cell ~d 12 -contents ~,2f -format data~%" i name i items i length i analyses i (* (divide random items) 100) i (* (divide exact items) 100) i ids i (* (divide cids ids) 100) i labels i (* (divide (* 2 (/ clabels nodes1) (/ clabels nodes2)) (+ (/ clabels nodes1) (/ clabels nodes2))) 100) #+:null (* (divide clabels labels) 100) i brackets i (* (divide cbrackets brackets) 100)))) (incf i)) (let* ((data (rest (rest (assoc :total aggregates)))) (name "Total") (items (get-field :items data)) (length (get-field+ :i-length data 0)) (analyses (get-field :analyses data)) (random (get-field :random data)) (exact (get-field :exact data)) (nodes1 (get-field :nodes1 data)) (nodes2 (get-field :nodes2 data)) (ids (get-field :ids data)) (labels (get-field :labels data)) (cids (get-field :cids data)) (clabels (get-field :clabels data)) (brackets (get-field :brackets data)) (cbrackets (get-field :cbrackets data))) (case format (:tcl (format stream "layout row ~d -m1 5 -r 2 -m2 5 -c black -j center~%~ layout row ~d -m1 5 -r 2 -m2 5 -c black -j center~%~%~ cell ~d 1 -contents {~a} -format total~%~ cell ~d 2 -contents ~d -format total~%~ cell ~d 3 -contents ~,2f -format total~%~ cell ~d 4 -contents ~,2f -format total~%~ cell ~d 5 -contents ~,2f -format total~%~ cell ~d 6 -contents ~,2f -format total~%~ cell ~d 7 -contents ~,2f -format total~%~ cell ~d 8 -contents ~,2f -format total~%~ cell ~d 9 -contents ~,2f -format total~%~ cell ~d 10 -contents ~,2f -format total~%~ cell ~d 11 -contents ~,2f -format total~%~ cell ~d 12 -contents ~,2f -format total~%" (- i 1) i i name i items i length i analyses i (* (divide random items) 100) i (* (divide exact items) 100) i ids i (* (divide cids ids) 100) i labels i (* (divide (* 2 (/ clabels nodes1) (/ clabels nodes2)) (+ (/ clabels nodes1) (/ clabels nodes2))) 100) #+:null (* (divide clabels labels) 100) i brackets i (* (divide cbrackets brackets) 100))))) (when (or (stringp file) (stringp append)) (close stream)) (if (listp aggregates) 0 -1))) (defun summarize-agreement (data1 data2 &key (condition *statistics-select-condition*) (test '(:edge :discriminant)) (format :latex) meter) (declare (ignore meter)) (let* ((thorough (when (smember :edge test) '(:derivation))) (items1 (if (stringp data1) (analyze data1 :thorough thorough :condition condition :readerp (smember :derivation thorough) :gold data1) data1)) (items1 (sort (copy-list items1) #'< :key #'(lambda (foo) (get-field :i-id foo)))) (items2 (if (stringp data2) (analyze data2 :thorough thorough :condition condition :readerp (smember :derivation thorough) :gold data2) data2)) (items2 (sort (copy-list items2) #'< :key #'(lambda (foo) (get-field :i-id foo)))) items results) ;; ;; now we need to do the traditional alignment of two sorted sets of data: ;; iterate through one, search for match in the other as long as id is less ;; than or equal to current position in first set. ;; (loop for item1 in items1 for id1 = (get-field :i-id item1) for input = (get-field :i-input item1) for length = (get-field :i-length item1) for pid = (get-field :parse-id item1) for readings = (get-field :readings item1) for item2 = (when (numberp id1) (loop for id2 = (get-field :i-id (first items2)) while (and (numberp id2) (<= id2 id1)) do (let ((item2 (pop items2))) (when (and (numberp id2) (= id2 id1)) (return item2))))) for id2 = (get-field :i-id item2) while (numberp id1) when (numberp id2) do (let ((item (pairlis '(:i-id :i-input :i-length :parse-id :readings) (list id1 input length pid readings))) collectp) (when (smember :edge test) (let* (;; ;; _fix_me_ ;; currently, assume we are operating on thinned profiles, ;; such that we only have active derivations available; we ;; need to fix the :random normalize first, before the ;; general code will work. (27-apr-04; oe) ;; (derivations1 #+:null (loop for result in (get-field :ranks item1) when (= (get-field :rank result) 1) collect (get-field :derivation result)) (loop for result in (get-field :results item1) collect (get-field :derivation result))) (derivations2 #+:null (loop for result in (get-field :ranks item2) when (= (get-field :rank result) 1) collect (get-field :derivation result)) (loop for result in (get-field :results item2) collect (get-field :derivation result))) ;; ;; _fix_me_ ;; generalize for not fully disambiguated items ;; (agreement (when (and derivations1 (null (rest derivations1)) derivations2 (null (rest derivations2))) (if (and *redwoods-agreement-exact-p* (= (derivation-id (first derivations1)) (derivation-id (first derivations2)))) (acons :exact t nil) (summarize-derivation-agreement (first derivations1) (first derivations2)))))) (when agreement (nconc item (acons :edge agreement nil)) (setf collectp t)))) (when collectp (push item items)))) (let* ((aggregates (aggregate items :format format))) (loop with tnitems = 0 with tlength = 0 with treadings = 0 with turandom = 0 with tuexact = 0 with tunodes1 = 0 with tunodes2 = 0 with tuids = 0 with tulabels = 0 with tubrackets = 0 with tucids = 0 with tuclabels = 0 with tucbrackets = 0 for (id name . data) in aggregates do (loop with anitems = 0 with alength = 0 with areadings = 0 with aurandom = 0 with auexact = 0 with aunodes1 = 0 with aunodes2 = 0 with auids = 0 with aulabels = 0 with aubrackets = 0 with aucids = 0 with auclabels = 0 with aucbrackets = 0 for item in data for length = (get-field :i-length item) for readings = (get-field :readings item) for edge = (get-field :edge item) when (and edge (get-field :exact edge)) do (incf anitems) (incf alength length) (incf areadings readings) (incf aurandom (divide 1 readings)) (incf auexact) else when edge do (incf anitems) (incf alength length) (incf areadings readings) (incf aurandom (divide 1 readings)) (incf aunodes1 (get-field :nodes1 edge)) (incf aunodes2 (get-field :nodes2 edge)) (incf auids (get-field :ids edge)) (incf aulabels (get-field :labels edge)) (incf aubrackets (get-field :brackets edge)) (incf aucids (get-field :cids edge)) (incf auclabels (get-field :clabels edge)) (incf aucbrackets (get-field :cbrackets edge)) finally (incf tnitems anitems) (incf tlength alength) (incf treadings areadings) (incf turandom aurandom) (incf tuexact auexact) (incf tunodes1 aunodes1) (incf tunodes2 aunodes2) (incf tuids auids) (incf tulabels aulabels) (incf tucids aucids) (incf tuclabels auclabels) (incf tubrackets aubrackets) (incf tucbrackets aucbrackets) (push (nconc (list id name) (pairlis '(:items :i-length :analyses :random :exact :nodes1 :nodes2 :ids :labels :brackets :cids :clabels :cbrackets) (let ((nedges (- anitems auexact))) (list anitems (divide alength anitems) (divide areadings anitems) aurandom auexact (divide aunodes1 nedges) (divide aunodes2 nedges) (divide auids nedges) (divide aulabels nedges) (divide aubrackets nedges) (divide aucids nedges) (divide auclabels nedges) (divide aucbrackets nedges))))) results)) finally (push (nconc (list :total "Total") (pairlis '(:items :i-length :analyses :random :exact :nodes1 :nodes2 :ids :labels :brackets :cids :clabels :cbrackets) (let ((nedges (- tnitems tuexact))) (list tnitems (divide tlength tnitems) (divide treadings tnitems) turandom tuexact (divide tunodes1 nedges) (divide tunodes2 nedges) (divide tuids nedges) (divide tulabels nedges) (divide tubrackets nedges) (divide tucids nedges) (divide tuclabels nedges) (divide tucbrackets nedges))))) results))) (when (smember :edge test) (purge-profile-cache data1 :expiryp nil) (purge-profile-cache data2 :expiryp nil)) results)) (defun summarize-derivation-agreement (derivation1 derivation2 &key (test '(:id :label :bracket))) ;; ;; for now, make sure we do not count word nodes themselves, i.e. the ones ;; that have no edge id; this should deflate the agreement measure somewhat. ;; (let* ((nodes1 (delete nil (derivation-nodes derivation1) :key #'first)) (nodes2 (delete nil (derivation-nodes derivation2) :key #'first)) (cids (when (smember :id test) (intersection nodes1 nodes2 :key #'first))) (clabels (when (smember :label test) (intersection nodes1 nodes2 :test #'(lambda (foo bar) (and (eql (fourth foo) (fourth bar)) (eql (fifth foo) (fifth bar)) (equal (second foo) (second bar))))))) (cbrackets (when (smember :label test) (intersection nodes1 nodes2 :test #'(lambda (foo bar) (and (eql (fourth foo) (fourth bar)) (eql (fifth foo) (fifth bar))))))) (ids (union nodes1 nodes2 :key #'first)) (labels (union nodes1 nodes2 :test #'(lambda (foo bar) (and (eql (fourth foo) (fourth bar)) (eql (fifth foo) (fifth bar)) (equal (second foo) (second bar)))))) (brackets (union nodes1 nodes2 :test #'(lambda (foo bar) (and (eql (fourth foo) (fourth bar)) (eql (fifth foo) (fifth bar))))))) (pairlis '(:nodes1 :nodes2 :ids :labels :brackets :cids :clabels :cbrackets) (list (length nodes1) (length nodes2) (length ids) (length labels) (length brackets) (length cids) (length clabels) (length cbrackets))))) (defun operate-on-profiles (profiles &key (condition *statistics-select-condition*) (model (make-model)) (type :mem) cached-test-p (purgep t) (recursep t) internalp (stream *tsdb-io*) (verbose t) (task :fc) target initialp finalp firstp lastp (resolvedp t) vp (increment %redwoods-items-increment%) cache interrupt meter) (declare (ignore interrupt meter) (special *feature-item-enhancers* *redwoods-item-expander* *feature-flags* *feature-grandparenting* *feature-ngram-size*)) ;; ;; invoke various memory-intensive operations successively on sub-sets of ;; items from .profiles. go through three states: (a) the top-level call ;; must always provide a list of profiles and dispatches recursively for ;; each individual profile; (b) when operating on a single profile, we work ;; out a suitable sub-division of items, construct a conditon, and pass that ;; into yet another recursive call; finally, (c) the target task is executed. ;; ;; there are four relevant boundary conditions: working on the first or last ;; profile and, within each profile, working on the first or last increment. ;; .initialp. and .finalp. are profile-level, .first. and .lastp. increment- ;; level indicators of where we are in the calling chain. ;; (unless (or internalp (consp profiles)) (error "operate-on-profiles(): non-list argument `~a'." profiles)) (cond ((consp profiles) (loop with lkb::*edge-registry* = nil with *tsdb-connection-expiry* = 200 with gc = (install-gc-strategy nil :tenure nil :verbose verbose) with condition = (case task ((:fc :rank) (if (eq *redwoods-task* :classify) condition (if resolvedp (if (and condition (not (equal condition ""))) (format nil "t-active > 0 && readings > 1 && (~a)" condition) "t-active > 0 && readings > 1") (if (and condition (not (equal condition ""))) (format nil "readings > 1 && (~a)" condition) "readings > 1")))) (t condition)) for i from 0 for remaining on profiles for active = (first remaining) for virtualp = (virtual-profile-p active) when (find-tsdb-directory active :test t) do (operate-on-profiles active :condition condition :model model :task task :stream stream :initialp (eq remaining profiles) :finalp (null (rest remaining)) :target target :internalp t :resolvedp resolvedp :vp vp :type type :cached-test-p cached-test-p) (when (and purgep (not virtualp)) (purge-profile-cache active)) else do (format stream "operate-on-profiles(): invalid `~a'.~%" active) finally (restore-gc-strategy gc)) ;; ;; _fix_me_ ;; when operating on a group of profiles, there is no clear notion of which ;; should be home to the `model' file (including the symbol table, counts, ;; and maybe other relevant information). rethink? (1-feb-06; erik & oe) ;; (when (eq task :fc) (let ((embassador (first profiles))) (print-model model :file (profile-find-model embassador) :format :freeze)) model)) ((virtual-profile-p profiles) (loop ;;fixme; consider using analyze-virtual instead (erikve 11/3-'10) with sub-profiles = (virtual-profile-components profiles) for remaining on sub-profiles for active = (first remaining) do (operate-on-profiles active :condition condition :model model :task task :stream stream :initialp (and initialp (eq remaining sub-profiles)) :finalp (and finalp (null (rest remaining))) :target target :internalp t :resolvedp resolvedp :vp profiles :type type :cached-test-p cached-test-p) (when purgep (purge-profile-cache active) #+:allegro (progn (let ((*tsdb-gc-debug* nil)) (excl:print-type-counts :new) (excl:print-type-counts :old) (excl:gc :tenure) (excl:print-type-counts :new) (excl:print-type-counts :old) (excl:gc t) (excl:print-type-counts :old) (excl:print-type-counts :holes)))))) (recursep (when (eq task :unfc) ;;fixme; add support for virtual profiles (let ((fc (profile-find-feature-cache profiles))) (ignore-errors (delete-file (fc-file fc)))) (return-from operate-on-profiles)) (when verbose (format stream "[~a] operate-on-profiles(): reading `~a'~%" (current-time :long :short) profiles)) (loop with items = (select "i-id" :integer "item" nil profiles :sort :i-id) with first = (get-field :i-id (first items)) with last = (get-field :i-id (first (last items))) with delta = (if (numberp %redwoods-items-percentile%) (or (loop with i = (ceiling (length items) %redwoods-items-percentile%) for item in items when (<= (decf i) 0) return (max increment (- (get-field :i-id item) first))) increment) increment) with n = (max (ceiling (- last first) delta) 1) with cache = (when (or (eq task :rank) (eq task :classify)) (when initialp (purge-test-run target :action :score)) (let ((target (or target profiles))) (create-cache target :verbose verbose :protocol :raw))) for i from 1 to n for low = (+ first (* (- i 1) delta)) for high = (+ first (* i delta)) for foo = (format nil "i-id >= ~d && i-id < ~d~@[ && (~a)~]" low high condition) do (when verbose (format stream "[~a] operate-on-profiles(): running `~a' [~a - ~a|.~%" (current-time :long :short) profiles low high)) (operate-on-profiles profiles :condition foo :model model :task task :stream stream :internalp t :recursep nil :firstp (= i 1) :lastp (= i n) :target target :cache cache :resolvedp resolvedp :vp vp :type type :cached-test-p cached-test-p) ;; ;; do not expire the DB yet, while running sub-sets of items from it ;; (purge-profile-cache profiles :expiryp nil) finally (when cache (flush-cache cache :verbose verbose)))) (t (let* ((thorough (case *redwoods-task* (:rank (append (when (or (>= *feature-grandparenting* 0) (> *feature-ngram-size* 0)) '(:derivation)) (when *feature-flags* '(:flags)) '(:surface))))) ;;;; (:classify '(:derivation)))) (data (analyze profiles :thorough thorough :condition condition :gold (and (eq *redwoods-task* :rank) profiles) :commentp t :tokensp (eq *redwoods-task* :classify))) (data (if (null vp) data (mapcar #'(lambda (item) (if (get-field :vp item) (setf (get-field :vp item) vp) (nconc item (acons :vp vp nil)))) data)))) (case task (:fc (loop for enhancer in *feature-item-enhancers* do (loop for item in data do (call-raw-hook enhancer item))) (when (eq *redwoods-task* :rank) (setf data (loop for item in data for readings = (get-field :readings item) for ranks = (length (get-field :ranks item)) unless (or (= readings ranks) (null ranks)) collect item))) (cache-features data model :createp firstp :stream stream :verbose verbose)) ;; ;; _fix_me_ ;; this bit is hacky and MEM-specific for now. (4-apr-06; erik & oe) ;; (:rank (loop for enhancer in *feature-item-enhancers* do (loop for item in data do (call-raw-hook enhancer item))) (loop with target = (or target profiles) for item in data for readings = (get-field :readings item) for parse-id = (get-field :parse-id item) for results = (get-field :results item) for nresults = (length results) for nranks = (length (get-field :ranks item)) unless (= readings nranks) do (format stream "~&[~a] operate-on-profiles(): ~ scored item # ~d (~d @ ~d).~%" (current-time :long :short) (get-field :i-id item) nresults nranks) (let* ((ranks (loop for result in results for rid = (get-field :result-id result) for score = (mem-score-result result (or model %model%)) collect (pairlis '(:parse-id :result-id :score) (list parse-id rid score)))) (ranks (sort ranks #'> :key #'(lambda (foo) (get-field :score foo)))) (ranks (loop with last = (get-field :score (first ranks)) with i = 1 with j = 2 for rank in ranks for score = (get-field :score rank) unless (= score last) do (setf i j) (setf last score) (incf j) collect (acons :rank i rank)))) (loop for score in ranks do (write-score target score :cache cache))))) (:classify (loop for enhancer in *feature-item-enhancers* do (loop for item in data do (call-raw-hook enhancer item))) (if cached-test-p (let ((classified) (native-model (read-model (profile-find-model profiles)))) ;;; consider using %redwoods-items-percentile% = 100 when testing, ;;; as we othewise end up spending a lot of time in read-model(). (cache-contexts data model (current-pid) :format type :native native-model) (setq classified (learner-rank-items data model :type type)) (loop for item in classified do ;; (insert-cues-and-scopes item) ;; deactivated for now (erikve 26/4/2016) (write-classification-scores item target cache))) (loop with target = (or target profiles) for item in data for results = (or (get-field :results item) (when (eq *redwoods-task* :classify) ;;hack for the case of no results ;;when doing substring classification. (list (acons :result-id 0 nil)))) for nresults = (length results) for tokens = (get-field :i-tokens item) for ntokens = (length tokens) do ;;fixme: not tested yet. (format stream "~&[~a] operate-on-profiles(): ~ scoring item # ~d (~d tokens, ~d results).~%" (current-time :long :short) (get-field :i-id item) ntokens nresults) (loop for result in results do (mem-score-tokens item result model)) (when *redwoods-item-expander* (call-raw-hook *redwoods-item-expander* item)) ;; (insert-cues-and-scopes item) ;; deactivated for now (erikve 26/4/2016) (write-classification-scores item target cache))))))))) (defun train (source file &key (condition *statistics-select-condition*) (type :mem) (fcp t) (ccp t) (identity (current-pid)) target (resolvedp t) normalizep (verbose t) (stream t) interrupt meter) (declare (ignore meter) (special *feature-item-enhancers*)) (format t "[~a] train(): reading `~a'~%" (current-time :long :short) source) ;; ;; _fix_me_ ;; it looks like we cannot call train() with :fcp nil and expect to have our ;; `model' (feature table with counts) read from .source. do we need to ;; make-model() here, actually? (5-apr-06; oe & erik) ;; (let ((*maxent-debug-p* t) (model (if fcp (make-model) (read-model (profile-find-model source))))) (declare (special *maxent-debug-p*)) (when fcp (operate-on-profiles (list source) :condition condition :task :fc :model model :resolvedp resolvedp :verbose verbose :stream stream :interrupt interrupt)) (rank-profile source target :nfold 1 :recache ccp :model model :type type :identity identity :resolvedp resolvedp :normalizep normalizep :enhancers (unless resolvedp *feature-item-enhancers*) :verbose verbose :stream stream :interrupt interrupt :condition condition) ;;; (setq %model model %source source %target target) ;;; (break) (let ((parameters (cond ((eq type :mem) (model-parameters model)) ((and (or (eq type :svm) (eq type :svmrank) (eq type :perf)) (not (= *svm-kernel* 0))) nil) ((and (or (eq type :svm) (eq type :svmrank) (eq type :perf)) (= *svm-kernel* 0)) (svm2weights (model-parameters model)))))) ;;; (case type ;;; (:mem (model-parameters model)) ;;; (:svm (svm2weights (model-parameters model)))))) (when (and (or (eq type :svm) (eq type :svmrank) (eq type :perf)) (not (= *svm-kernel* 0))) (print "ok1") (let ((sv-file (format nil "~a.~a" (namestring file) "sv"))) (cl-fad:copy-file (model-parameters model) sv-file :overwrite t))) (print "ok2") ;;; (unless (probe-file parameters);;fixme: add kernel support! ;;; (format t "train(): unable to read MLM parameters.") ;;; (return-from train)) (read-weights model parameters (and (or (eq type :svm) (eq type :svmrank) (eq type :perf)) (not (= *svm-kernel* 0))))) (print "ok3") (when file (print-model model :file file :format :export)) model)) (defun rank-profile (source &optional target &key (condition *statistics-select-condition*) data (nfold 10) (niterations nfold) (type :mem) model (percentage *redwoods-train-percentage*) (identity (current-pid)) per-fold-cache (stream *tsdb-io*) (cache :raw) (verbose t) (overwrite t) interrupt meter (resolvedp t) recache enhancers normalizep (split *redwoods-split*) (ignore *redwoods-test-ignore-profiles*) (expander *redwoods-item-expander*)) (declare (special *feature-flags*)) (format stream "~&[~a] rank-profile():~% `~a'~%~@[ --> `~a'~%~]" (current-time :long :short) source target) (when (and overwrite target) (purge-test-run target :action :score)) (let* ((lkb::*edge-registry* nil) (gc (install-gc-strategy nil :tenure *tsdb-tenure-p* :burst nil :verbose t)) (ignore (when ignore #'(lambda (item-set) (let ((source (get-field :source (first item-set)))) (and source (member source ignore :test #'string=)))))) (condition (if (eq *redwoods-task* :classify) condition (if resolvedp (if (and condition (not (equal condition ""))) (format nil "t-active > 0 && readings > 1 && (~a)" condition) "t-active > 0 && readings > 1") (if (and condition (not (equal condition ""))) (format nil "readings > 1 && (~a)" condition) "readings > 1")))) ;; ;; _fix_me_ ;; in principle, most experiments should be able to make do without ;; any data from the `result' relation (it can be huge and thus slow ;; to retrieve and join from the DB); this is not finished, though. ;; (23-nov-07; oe) (thorough (unless (eq *redwoods-task* :classify) (append (and #+:null (eq type :ngram) '(:surface)) (and *feature-flags* '(:flags))))) (gold (or data (analyze source :commentp (eq *redwoods-task* :classify) :tokensp (eq *redwoods-task* :classify) :thorough thorough :condition condition :gold source :readerp nil :burst t :purge :db :message meter))) (nsifted 0) (data (loop ;;assume *feature-item-enhancers* is NIL for classification initially (loop for enhancer in enhancers do (loop for item in gold do (call-raw-hook enhancer item))) for item in gold for readings = (get-field :readings item) for ranks = (length (get-field :ranks item)) if (or (eq *redwoods-task* :classify) (not (= readings ranks))) collect (copy-graph item) else do (incf nsifted)))) (when data (format t "~&DEBUG: 'data' is non-null~%")) (when (stringp data) (format t "~&DEBUG: 'data' is a string.~%")) (when (listp data) (format t "~&DEBUG: 'data' is a string.~%")) (format t "~&DEBUG: 'source' is ~a.~%" source) (when verbose (format stream "~&[~a] rank-profile(): using ~d items (ignoring ~d).~%" (current-time :long :short) (length data) nsifted)) (when (smember type '(:mem :perf :svm :svmrank)) ;; as of february 2006, whenever there is a feature cache there ;; also has to be a serialized partial model, recording the full ;; symbol table and frequency counts (later used in restricting ;; the context cache). ;; (unless (model-p model) (setf model (read-model (profile-find-model source)))) (unless (model-p model) (format t "~&[~a] rank-profile(): no MLM for `~a'.~%" (current-time :long :short) source) (return-from rank-profile)) (when *feature-random-indexing-p* (setq per-fold-cache t) (when (eq *feature-random-index-type* :hash) (initialize-ri-hashes model)) ;;; (initialize-random-indexing model) (when (stringp *feature-random-indexing-p*) (format stream "~&[~a] rank-profile(): importing RI vectors.~%" (current-time :long :short)) (import-random-indexing model *feature-random-indexing-p*))) (when *feature-selection-p* (select-features data model :identity identity)) ;; ;; when requested, re-build the context cache ;; (when (and recache (not per-fold-cache)) (cache-contexts data model identity :format type :normalizep normalizep))) (loop with folds with cache = (when target (create-cache target :verbose verbose :protocol cache)) with sets = (let ((sets (when *redwoods-use-item-sets-p* (select-item-sets source)))) ;; ;; when there are no item sets, manufacture a dummy list ;; of singleton sets, one per item ;; (if sets (loop for set in sets for items = (loop while data for iid in set for item = (when (= (get-field :i-id (first data)) iid) (pop data)) when item collect item) when items collect items) (loop for item in data collect (list item)))) with increment = (when meter (/ (mduration meter) nfold)) ;; ;; for some model types, never do more than one iteration; also, if ;; there are fewer item sets than folds, we can maximally do one fold ;; per set. ;; with nfold = (if (smember type '(:ngram :chance :oracle)) 1 (min (length sets) nfold)) initially #+:debug (setf %data% data) (when meter (meter :value (get-field :start meter))) for i from 1 to niterations when (interrupt-p interrupt) do (format stream "[~a] rank-profile(): external interrupt signal~%" (current-time :long :short)) (flush-cache cache :verbose verbose) (restore-gc-strategy gc) (return) do (multiple-value-bind (stest strain) (if (functionp split) (funcall split sets) (ith-nth sets i nfold ignore)) (let ((test (apply #'append stest)) (train (apply #'append strain))) ;;; DEBUGGING: (format stream "[~a] rank-profile(): no. test items: ~a, no. train items: ~a.~%" (current-time :long :short) (length test) (length train)) ;; ;; for .nfold. == 1, train and test on the same data set ;; (when (and (null train) (not (smember type '(:ngram :chance :oracle)))) (setf train test)) (when (and (numberp percentage) (> percentage 0) (< percentage 100)) (setf train (i-jth-nth train 1 percentage 100))) ;;; with random indexing we must create the cache ;;; (read-only) for the test items after caching for the ;;; train items. (else the feature vectors would contain ;;; index mappings for features that didn't occur in the ;;; training data.) erikve 16/11/2010 (when (and per-fold-cache (not (and (= 1 niterations) (not recache)))) (wipe-model-map model) (format stream "~&[~a] rank-profile(): building cc for ~d training items~%" (current-time :long :short) (length train)) (cache-contexts train model identity :format type :normalizep normalizep) (unless (eq train test) (format stream "~&[~a] rank-profile(): building cc for ~d test items~%" (current-time :long :short) (length test)) (cache-contexts test model identity :format type :normalizep normalizep :rop t))) ;;; (when (and test (or train (= nfold 1))) ::bug? (when (and train (or test (= nfold 1))) ;;erikve 25/3/2011 (format stream "~&[~a] rank-profile(): iteration # ~d (~d against ~d)~%" (current-time :long :short) i (length test) (length train)) (when meter (status :text (format nil "~a-fold cross-validation: ~ iteration # ~d (~d against ~d)" nfold i (length test) (length train)))) (multiple-value-bind (items fold) (train-and-rank train (when target test) :type type :identity identity :model model :id i) (case *redwoods-task* (:classify (when items ;; apply the MWC rules before scoring. #+:null (apply-cue-rules items :add-scope nil) (when expander (loop for item in items do (call-raw-hook expander item))) (let* ((scores (summarize-classification items gold :token-filter *redwoods-token-filter*))) (if (get-field :f-extras fold) (nconc (get-field :f-extras fold) scores) (nconc fold (acons :f-extras scores nil))) (nconc fold (acons :f-accuracy (get-field :t_accuracy scores) nil))) (push (acons :test test fold) folds)) (loop for item in items do ;;fixme, do post-processing here? #+:null (insert-cues-and-scopes item) (write-classification-scores item target cache))) (:rank (loop for item in items for parse-id = (get-field :parse-id item) for ranks = (get-field :ranks item) when ranks do (loop for foo in ranks for result-id = (get-field :result-id foo) for score = (let ((score (get-field :score foo))) (if score (format nil "~,16f" score) "")) for rank = (get-field :rank foo) do #+:debug (format stream " parse: ~a; result: ~d; rank: ~d; score: ~a~%" parse-id result-id rank score) (when target (write-score target (pairlis '(:parse-id :result-id :rank :score) (list parse-id result-id rank score)) :cache cache))) finally (let* ((scores (summarize-scores items gold :condition nil :n 5 :test :id :spartanp t :loosep t)) (total (rest (rest (assoc :total scores)))) (nscores (get-field :scores total)) (exact (get-field :exact total)) (near (get-field :near total)) (similarities (let ((top (get-field :tsimilarities total)) (nbest (get-field :nsimilarities total))) (append (and top (acons :tsimilarities top nil)) (and nbest (acons :nsimilarities nbest nil))))) (accuracy (if exact (* 100 (divide exact nscores)) 0.0)) (naccuracy (if (and exact near) (float (* 100 (divide (+ exact near) nscores))) 0.0))) (if (get-field :f-extras fold) (nconc (get-field :f-extras fold) similarities) (nconc fold (acons :f-extras similarities nil))) (nconc (get-field :f-extras fold) (acons :naccuracy naccuracy nil)) (nconc fold (acons :f-accuracy accuracy nil))) (push (acons :test test fold) folds)))))))) (when meter (meter-advance increment)) finally (when meter (meter :value (get-field :end meter)) (status :text "")) (when target (loop for fold in folds do (write-fold target fold :cache cache)) (flush-cache cache :sort t :verbose verbose)) (when (and *feature-random-indexing-p* *feature-random-indexing-export-p*) (format stream "~&[~a] rank-profile(): exporting random index vectors.~%" (current-time :long :short)) (export-random-indexing model target)) ;;; (when (and (eq *redwoods-task* :classify) ;;; *conll10-xml-export-p*) ;;; (export-items-to-bioscope-xml items :task *conll10-task*)) (restore-gc-strategy gc)))) (defun train-and-rank (train test &key (type :mem) (identity (current-pid)) model id (stream *tsdb-io*)) (declare (ignore stream)) #+:debug (setf %train train %test test) (let* ((trains (let ((stream (make-string-output-stream))) (loop for item in train do (format stream " ~a" (get-field :i-id item))) (get-output-stream-string stream))) (tests (let ((stream (make-string-output-stream))) (loop for item in test do (format stream " ~a" (get-field :i-id item))) (get-output-stream-string stream))) (f-environment (case type ((:svm :svmrank :perf :mem) (feature-environment)))) (l-environment (case type ((:svm :svmrank :perf) (svm-environment)) ((:mem) (mem-environment)))) (environment (format nil "~a ~a" f-environment l-environment)) (fold (pairlis '(:f-id :f-train :f-trains :f-test :f-tests :f-environment :f-user :f-host :f-start) (list (or id -1) (length train) trains (length test) tests environment (current-user) (current-host) (current-time :long :tsdb)))) (model (case type (:pcfg (estimate-cfg train)) ((:mem :svm :svmrank :perf) (estimate-model train :identity identity :fold fold :type type :model model)) (:ngram "string n-grams") (:chance "chance") (:oracle "oracle"))) (ranks (when test (case type ((:svm :svmrank :perf :mem) (learner-rank-items test model :identity identity :fold fold :type type)) (:chance (chance-rank-items test :fold fold)) (:oracle (oracle-rank-items test :fold fold)) #+:logon (:ngram (ngram-rank-items test :fold fold)))))) (nconc fold (pairlis '(:f-end) (list (current-time :long :tsdb)))) (values ranks fold))) #+:logon (defun ngram-rank-items (items &key fold) (declare (ignore fold)) (loop for item in items for results = (get-field :results item) for ranks = (loop for result in results for string = (get-field :surface result) collect string into strings finally (return (loop for result in (get-field :results item) for score in (mt::lm-score-strings strings) for rid = (get-field :result-id result) collect (pairlis '(:result-id :score) (list rid (- (cdr score))))))) do (let* ((ranks (sort ranks #'> :key #'(lambda (foo) (get-field :score foo)))) (ranks (loop with last = (get-field :score (first ranks)) with i = 1 with j = 2 for rank in ranks for score = (get-field :score rank) unless (= score last) do (setf i j) (setf last score) (incf j) collect (acons :rank i rank)))) (if (get-field :ranks item) (setf (get-field :ranks item) ranks) (nconc item (acons :ranks ranks nil))))) items) (defun chance-rank-items (test &key fold) (declare (ignore fold)) (loop for item in test for results = (get-field :results item) for nresults = (length results) for random = (make-array nresults) collect (acons :ranks (loop for result in (get-field :results item) for rid = (get-field :result-id result) for rank = (loop for i = (random nresults) unless (aref random i) do (setf (aref random i) i) and return (+ i 1)) collect (pairlis '(:result-id :rank :score) (list rid rank 0.0)) into ranks finally (return (sort ranks #'< :key #'(lambda (rank) (get-field :rank rank))))) item))) (defun oracle-rank-items (test &key fold) (declare (ignore fold)) (loop for item in test collect (acons :ranks (loop for result in (get-field :results item) for rid = (get-field :result-id result) with active = (loop for r in (get-field :ranks item) collect (get-field :result-id r)) for active-p = (if (member rid active :test #'=) 1 0) collect (pairlis '(:result-id :rank :score) (list rid active-p active-p))) item))) (defun kappa (actual expected) (/ (- actual expected) (- 100 expected))) (defun answer-enrich-mrs (edge &key (format :string)) #+:lkb (let ((mrs (typecase edge (lkb::edge (mrs::extract-mrs edge)) (mrs::psoa edge)))) (when (mrs::psoa-p mrs) (setf (mrs::psoa-vcs mrs) (loop for edge in (mt::transfer-mrs mrs :filter nil :task :trigger) for mtr = (mt::edge-rule edge) for id = (mt::mtr-trigger mtr) when (and id (not (smember id lkb::*duplicate-lex-ids*))) collect id)) (case format (:string (with-output-to-string (stream) (mrs::output-mrs1 mrs 'mrs::simple stream))) (:raw mrs))))) (defun desperate-split-hack (sets) (loop for set in sets when (string-equal (get-field :source (first set)) "bse") collect set into test else collect set into train finally (return (values test train)))) ;;; ;;; generate summary statistics for a set of treebanked profiles ;;; #+:null (loop with *phenomena* = nil with *statistics-aggregate-dimension* = :phenomena with *tsdb-home* = (logon-directory "lingo/terg/tsdb/gold" :string) for db in (find-tsdb-directories) for name = (get-field :database db) do (analyze-trees name :append "/tmp/redwoods.csv" :format :csv))