;;; -*- Mode: COMMON-LISP; Syntax: Common-Lisp; Package: TSDB -*- ;;; ;;; [incr tsdb()] --- Competence and Performance Profiling Environment ;;; Copyright (c) 1996 -- 2006 Stephan Oepen (oe@csli.stanford.edu) ;;; Copyright (c) 2005 -- 2006 Erik Velldal (erikve@ifi.uio.no) ;;; ;;; This program is free software; you can redistribute it and/or modify it ;;; under the terms of the GNU Lesser General Public License as published by ;;; the Free Software Foundation; either version 2.1 of the License, or (at ;;; your option) any later version. ;;; ;;; This program is distributed in the hope that it will be useful, but WITHOUT ;;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ;;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ;;; License for more details. ;;; ;;; ;;; typical TADM output: ;;; ;;; Classes = 26944 ;;; Contexts = 600 ;;; Features = 149774 / 283752 ;;; Non-zeros = 24071755 ;;; ;;; we interpret that as 26944 events in 600 contexts; 283752 total features of ;;; which 149774 have some property; and 24071755 actual (non-zero) feature ;;; counts, i.e. actually observed feature occurences. a day later, we think ;;; the relevant property of the 149774 sub-set of features might be that they ;;; are attested in active events, but this remains guesswork. ;;; (in-package :tsdb) (defparameter *maxent-method* :tao_lmvm) (defparameter *maxent-iterations* 5000) (defparameter *maxent-prior* :l2) (defparameter *maxent-relative-tolerance* 1e-10) (defparameter *maxent-absolute-tolerance* 1e-20) (defparameter *maxent-variance* 1e-2) (defparameter *maxent-uniform* nil) (defparameter *maxent-extra-options* nil) (defparameter *maxent-options* '(*maxent-method* *maxent-iterations* *maxent-relative-tolerance* *maxent-absolute-tolerance* *maxent-variance*)) (defparameter *maxent-debug-p* t) (defparameter *svm-kernel* 0) (defparameter *svm-error-to-margin* nil) ;; svm_light default is dynamic. ;; svm_rank default = 0.01. (defparameter *svm-cost-balance* 1.0) (defparameter *svm-iterations* 1e+5) (defparameter *svm-tolerance* 0.001) (defparameter *svm-poly-d* nil) (defparameter *svm-rbf-g* nil) (defparameter *svm-sig-poly-s* nil) (defparameter *svm-sig-poly-r* nil) (defparameter *svm-cache-size* 1000) ;5000.0) (defparameter *svm-options* '(*svm-kernel* *svm-rbf-g* *svm-poly-d* *svm-sig-poly-s* *svm-sig-poly-r* *svm-iterations* *svm-cost-balance* *svm-error-to-margin* *svm-tolerance*)) (defun feature-environment (&key (format :string)) (case *redwoods-task* (:classify (case format (:compact (format nil "~(fw~:[_~;~:*~d~]_sw~:[_~;~:*~d~]_tw~:[_~;~:*~d~]~ _fl~:[_~;~:*~d~]_sl~:[_~;~:*~d~]_tl~:[_~;~:*~d~]~ _fr~:[_~;~:*~d~]_sr~:[_~;~:*~d~]_tr~:[_~;~:*~d~]~ _cf~:[_~;~:*~d~]~ _ps~:[_~;~:*~d~]_pp~:[_~;~:*~d~]_pa~:[_~;~:*~d~]~ _pl~:[_~;~:*~d~]_pr~:[_~;~:*~d~]_s~:[_~;~:*~d~]~)" ;; possibly int valued parameters: *cfeature-form-win* *cfeature-stem-win* *cfeature-tag-win* *cfeature-form-l-ngram* *cfeature-stem-l-ngram* *cfeature-tag-l-ngram* *cfeature-form-r-ngram* *cfeature-stem-r-ngram* *cfeature-tag-r-ngram* *cfeature-conn-form* *cfeature-ptb-self* *cfeature-ptb-parent* *cfeature-ptb-path* *cfeature-ptb-left* *cfeature-ptb-right* *cfeature-stanford*)))) (:rank (labels ((count (bar) (and (numberp bar) (> bar 0) bar))) (case format (:string (let ((counts (when (counts-p *feature-frequency-threshold*) (list (count (counts-absolute *feature-frequency-threshold*)) (count (counts-contexts *feature-frequency-threshold*)) (count (counts-events *feature-frequency-threshold*)) (count (counts-relevant *feature-frequency-threshold*))))) (ngramp (> *feature-ngram-size* 0)) (weightp (and (numberp *feature-constituent-weight*) (> *feature-constituent-weight* 0)))) (if *feature-flags* (format nil "~{~a[~a] ~}~ FT[~{~@[~a~]~^:~}] RS[~@[~a~]]" (loop for flag in *feature-flags* collect (second flag) collect (first flag)) counts *feature-random-sample-size*) (format nil "GP[~a] ~:[-~;+~]PT ~:[-~;+~]LEX CW[~@[~a~]] ~ ~:[-~;+~]AE NS[~a] ~ NT[~@[~(~a~)~]] ~:[-~;+~]NB LM[~:[0~*~;~a~]] ~ FT[~{~@[~a~]~^:~}] RS[~@[~a~]]" *feature-grandparenting* *feature-use-preterminal-types-p* *feature-lexicalization-p* (and weightp *feature-constituent-weight*) *feature-active-edges-p* *feature-ngram-size* (and ngramp *feature-ngram-tag*) (and ngramp *feature-ngram-back-off-p*) *feature-lm-p* *feature-lm-p* counts *feature-random-sample-size*)))) (:compact (let ((ngramp (> *feature-ngram-size* 0))) (format nil "~(g~d_p~:[0~;1~]_l~:[0~;1~]_cw~d_a~:[0~;1~]_~ n~d_nt~:[0~;1~]_nb~:[0~;1~]_lm~a_c~:[0~;~:*~a~]_~ r~:[0~;~:*~a~]_rs~:[0~;~:*~d~]~)" *feature-grandparenting* *feature-use-preterminal-types-p* *feature-lexicalization-p* (or *feature-constituent-weight* 0) *feature-active-edges-p* *feature-ngram-size* (and ngramp (eq *feature-ngram-tag* :type)) (and ngramp *feature-ngram-back-off-p*) *feature-lm-p* (and (counts-p *feature-frequency-threshold*) (counts-contexts *feature-frequency-threshold*)) (and (counts-p *feature-frequency-threshold*) (counts-relevant *feature-frequency-threshold*)) *feature-random-sample-size*))) (:list (loop for key in *feature-options* collect (cons key (symbol-value key))))))))) ;;fix_me add experiment type to names (defun mem-environment (&key (format :string) full prefix) (let ((features (and full (feature-environment :format format)))) (case format (:string (format nil "~@[[~a] ~]~@[~a ~]MM[~(~a~)] MI[~@[~a~]] ~ RT[~@[~e~]] AT[~@[~e~]] VA[~@[~e~]]~ ~@[~* PC[~a]~]" prefix features *maxent-method* *maxent-iterations* *maxent-relative-tolerance* *maxent-absolute-tolerance* *maxent-variance* full (or *redwoods-train-percentage* 100))) (:compact (format nil "~(~@[~a~]mem_~@[~a_~]rt~e_at~e_v~e~@[_pc~d~]~)" prefix features *maxent-relative-tolerance* *maxent-absolute-tolerance* *maxent-variance* *redwoods-train-percentage*)) (:list (nconc features (loop for key in *maxent-options* collect (cons key (symbol-value key)))))))) (defun svm-environment (&key (format :string) full prefix) (let ((features (and full (feature-environment :format format)))) (case format (:string (format nil "~@[[~a] ~]~@[~a ~] K[~[lin~;pol~;rbf~;sig~;usr~]]~ ~@[ G[~a]~]~@[ D[~a]~]~@[ S[~a]~]~@[ R[~a]~]~ ~@[ IT[~a]~]~@[ B[~a]~]~@[ EM[~a]~]~@[ T[~a]~]" prefix features *svm-kernel* *svm-rbf-g* *svm-poly-d* *svm-sig-poly-s* *svm-sig-poly-r* *svm-iterations* *svm-cost-balance* *svm-error-to-margin* *svm-tolerance*)) (:compact (format nil "~(~@[~a~]svm_~@[~a_~]opt=~[lin~;pol~;rbf~;sig~;usr~]~ ~@[_g~a~]~@[_d~a~]~@[_s~a~]~@[_r~a~]~@[_it~e~]~ ~@[_b~a~]~@[_em~a~]~@[_t~a~]~@[_pc~d~]~)" prefix features *svm-kernel* *svm-rbf-g* *svm-poly-d* *svm-sig-poly-s* *svm-sig-poly-r* *svm-iterations* *svm-cost-balance* *svm-error-to-margin* *svm-tolerance* *redwoods-train-percentage*))))) (defun parse-environment (string) (ppcre:split "_(?:fw|sw|tw|fl|sl|tl|fr|sr|tr|cf|ps|pp|pa|pl|pr|s|opt=|it|b|t|pc)" string)) (defun print-model (model &key (file "/dev/null") stream (format :rpm)) (case format ((:mem :rpm) (with-open-file (foo file :direction :output :if-exists :supersede) (loop with stream = (or stream foo) for context in (model-contexts model) do (print-context context :stream stream :model model :format format)))) ((:freeze :export) (with-open-file (foo file :direction :output :if-exists :supersede) (let ((stream (or stream foo)) (table (model-table model)) (*print-circle* nil) (*print-pretty* nil)) (format stream ";;;~%;;; ~a~%;;; (~a@~a; ~a)~%;;;~%" model (current-user) (current-host) (current-time :long :pretty)) (format stream "~%:begin :model ~d.~%~%" (model-ncontexts model)) (when (> (model-count model) 0) (if *feature-flags* (let ((*print-case* :downcase)) (format stream "*feature-flags := [~{~a~^ ~}].~%~%" (loop for flag in *feature-flags* collect (first flag)))) (loop with *print-case* = :downcase for key in *feature-options* for value = (let ((foo (symbol-value key))) (cond ((null foo) "no") ((eq foo t) "yes") (t foo))) when (boundp key) do (format stream "~a := ~a.~%~%" key value))) (loop with *print-case* = :downcase for key in *maxent-options* when (boundp key) do (format stream "~a := ~a.~%~%" key (symbol-value key)))) (format stream ":begin :features ~d.~%~%" (symbol-table-count table)) (loop with *print-case* = :downcase with *package* = (find-package :lkb) with map = (model-map model) with i = 0 for code from 0 to (- (symbol-table-count table) 1) for symbol = (code-to-symbol code table) for weight = (aref (model-weights model) code) for counts = (or (aref (model-counts model) code) (make-counts)) for mapped = (and map (symbol-to-code code map :rop t)) for minmax = (aref (model-minmax model) code) when (or (eq format :freeze) (numberp weight)) do (case format (:freeze (format stream "(~d~@[ ~d~]) " code mapped)) (:export (format stream "(~d) " i) (incf i))) (format stream "[~{~s~^ ~}] ~:[null~*~;~f~] " symbol weight weight) (print-object counts stream) (when minmax (format stream " [~d ~d]" (first minmax) (second minmax))) (format stream "~%")) (format stream "~%:end :features.~%~%:end :model.~%")))))) (defun read-model (file &key (verbose t) id sv-p (symbols-p *feature-symbols-p*)) (labels ((|[|-reader (stream char) (declare (ignore char)) (read-delimited-list #\] stream nil)) (|{|-reader (stream char) (declare (ignore char)) (read-delimited-list #\} stream nil))) (let* ((*readtable* (copy-readtable nil)) (*package* (find-package :lkb)) (model (make-model)) (table (model-table model)) (name (format nil "~a~@[.~a~]" (pathname-name file) (pathname-type file)))) (when sv-p (let ((sv-file (format nil "~a.~a" (namestring file) "sv"))) (setf (model-parameters model) (format nil "~a/.model.~a.~a.weights" (tmp :redwoods) (current-user) (current-pid))) (cl-fad:copy-file sv-file (model-parameters model) :overwrite t) (format t "~&read-model(): copied ~a to ~a~%" sv-file (model-parameters model)))) ;; fixme: to support cahced scoring for :mem, print the weights to ;; (model-parameters model) (set-syntax-from-char #\. #\space *readtable*) (if (probe-file file) (with-open-file (stream file :direction :input) (unless (and (eq (read stream nil nil) :begin) (eq (read stream nil nil) :model) (integerp (setf (model-ncontexts model) (read stream nil nil)))) (format t "read-model(): invalid header in `~a'.~%" name) (return-from read-model)) (when verbose (format t "~&read-model(): reading file `~a'.~%" name)) (loop with bodyp = nil for form = (read stream nil :eof) while (not (eq form :eof)) when (and (eq form :begin) (eq (read stream nil nil) :features)) do (let ((n (read stream nil nil))) (unless (and (integerp n) (>= n 0)) (format t "read-model(): invalid `:begin :feature' block in `~a'.~%" name) (return-from read-model)) (setf (model-size model) n) (setf (model-minmax model) (make-array n)) (setf (model-counts model) (make-array n)) (setf (model-weights model) (make-array n)) (when (and *feature-random-indexing-p* (eq :vector *feature-random-index-type*)) (vector (setf (model-random-indexes model) (make-array n))))) (setf *readtable* (copy-readtable nil)) (set-syntax-from-char #\[ #\( *readtable*) (set-syntax-from-char #\] #\) *readtable*) (set-macro-character #\[ #'|[|-reader nil *readtable*) (set-syntax-from-char #\{ #\( *readtable*) (set-syntax-from-char #\} #\) *readtable*) (set-macro-character #\{ #'|{|-reader nil *readtable*) (setf bodyp t) else when (eq form :end) do (set-syntax-from-char #\. #\space *readtable*) (unless (and (eq (read stream nil nil) :features) (eq (read stream nil nil) :end) (eq (read stream nil nil) :model)) (format t "read-model(): invalid model prologue.~%") (return-from read-model)) (when id (push (cons id model) *models*)) (return (setf %model% model)) else when bodyp do (unless (and (consp form) (numberp (first form))) (format t "read-model(): invalid codes `~a'.~%" form) (return-from read-model)) (let* ((symbol (let ((foo (read stream nil nil))) (unless (consp foo) (format t "read-model(): invalid symbol `~a'.~%" foo) (return-from read-model)) foo)) (code (first form)) (mapped (second form)) (weight (read stream nil nil)) (counts (read-preserving-whitespace stream nil nil)) (minmax (unless (eq #\Newline (peek-char nil stream nil nil)) (read stream nil nil)))) (when symbols-p (set-symbol-and-code symbol code table)) (when (numberp mapped) (set-symbol-and-code code mapped (model-map model))) (when (>= code (model-size model)) (format t "read-model(): mysterious feature overflow (~a vs. ~a).~%" code (model-size model)) (return-from read-model)) (when (numberp weight) (setf (aref (model-weights model) code) weight) (incf (model-count model))) (setf (aref (model-counts model) code) (make-counts :absolute (first counts) :contexts (second counts) :events (third counts) :relevant (fourth counts))) (when minmax (setf (aref (model-minmax model) code) minmax))))) (format t "read-model(): unable to open `~a'.~%" name))))) ;;; mostly cut out from `estimate-model': (defun create-event-file (items &key file (identity (current-pid))) (let* ((source (get-field :source (first items))) (cache (profile-find-context-cache source identity)) (events (or file (format nil "~a/.model.~a.~a.events" (tmp :redwoods) (current-user) (current-pid))))) (format t "[~a] create-event-file(): Writing events for ~a items to ~a.~%" (current-time :long :short) (length items) events) ;; ;; in order to keep using this stream across multiple calls to cp(), its ;; :element-type needs to match the expecations of cp(), i.e. be byte-wise ;; binary data. ;; (with-open-file (out events :direction :output :if-does-not-exist :create :if-exists :supersede :element-type '(unsigned-byte 8)) (loop for item in items for cc = (let ((foo (get-field :source item))) (cond ((string= source foo) cache) (t (setf source foo) (setf cache (profile-find-context-cache source identity))))) for iid = (get-field :i-id item) for file = (merge-pathnames cc (make-pathname :name (format nil "~a~@[.~a~]" iid *feature-random-sample-size*))) for readings = (get-field :readings item) when (or (eq *redwoods-task* :classify) (> readings 1)) do ;; ;; for items that were not annotated or for some other reason have ;; no information in the feature and, thus, context cache, cp() ;; will just do nothing, when .file. does not exist. ;; (when (or (null *feature-random-sample-size*) (<= readings *feature-random-sample-size*) (null (cp file out))) ;; ;; even when random sampling is enabled, for contexts with fewer ;; events than the maximum sample size, the context cache will ;; not contain two files, but rather just the base file name; so ;; try to fall back on the non-sampled file then. ;; (let ((file (merge-pathnames cc (make-pathname :name (format nil "~a" iid))))) (cp file out))))) events)) (defun estimate-model (items &key (identity (current-pid)) fold (stream *tsdb-io*) model type) (declare (ignore stream)) (let* ((model (or model (make-model))) (events (create-event-file items :identity identity)) (trace (format nil "~a/.model.~a.~a.trace" (tmp :redwoods) (current-user) (current-pid)))) (unless (model-parameters model) (setf (model-parameters model) (format nil "~a/.model.~a.~a.weights" (tmp :redwoods) (current-user) (current-pid)))) (let* ((parameters (model-parameters model)) ;; ;;fixme; check for list of variances ;; (variances (when (numberp *maxent-variance*) ;; (let ((name (format ;; nil ;; "~a/.model.~a.~a.variances" ;; (tmp :redwoods) ;; (current-user) (current-pid)))) ;; (with-open-file (stream name ;; :direction :output ;; :if-exists :supersede) ;; (format stream "~f" *maxent-variance*)) ;; name))) (command (case type (:mem (format nil "tadm -monitor -events_in ~a -params_out ~a~ ~@[ -method ~(~a~)~]~ ~@[ -max_it ~a~]~@[ -frtol ~a~]~@[ -fatol ~a~]~ ~@[ -l2 ~a~]~ ~:[~; -uniform~]~ ~@[ ~a~]" ;;; ~@[ -variances ~a~]~ events parameters *maxent-method* *maxent-iterations* *maxent-relative-tolerance* *maxent-absolute-tolerance* *maxent-variance* ;;; variances *maxent-uniform* *maxent-extra-options*)) (:svmrank (format nil "svm_rank_learn -v 2 -n 10 -q 40 -m ~a -t ~a~ ~@[ -g ~a~]~ ~@[ -d ~a~]~@[ -s ~a~]~@[ -r ~a~]~@[ -# ~a~]~ ~@[ -e ~a~]~@[ -c ~a~] ~a ~a" *svm-cache-size* *svm-kernel* *svm-rbf-g* *svm-poly-d* *svm-sig-poly-s* *svm-sig-poly-r* *svm-iterations* *svm-tolerance* *svm-error-to-margin* events parameters)) (:perf (format nil ;;;nb trying F1 error citerion! "svm_perf_learn -v 1 -y 2 -n 10 -q 40 -t ~a -l 1 -w 3~ ~@[ -# ~a~]~@[ -e ~a~]~@[ -c ~a~] ~a ~a" *svm-kernel* *svm-iterations* *svm-tolerance* *svm-error-to-margin* events parameters)) (:svm (format nil "svm_learn -v 2 -n 10 -q 40 -m ~a -z ~a -t ~a~ ~@[ -g ~a~]~ ~@[ -d ~a~]~@[ -s ~a~]~@[ -r ~a~]~@[ -# ~a~]~ ~@[ -j ~a~]~@[ -e ~a~]~@[ -c ~a~] ~a ~a" *svm-cache-size* (case *redwoods-task* (:rank "p") (:classify "c")) *svm-kernel* *svm-rbf-g* *svm-poly-d* *svm-sig-poly-s* *svm-sig-poly-r* *svm-iterations* *svm-cost-balance* *svm-tolerance* *svm-error-to-margin* events parameters)))) (output (if *maxent-debug-p* nil "/dev/null"))) (format t "estimate-model(): running process: ~a~%" command) (when (and (zerop (run-process ;;fixme: if the command does not terminate with zero, raise an error! (format nil "~a | tee '~a'" command trace) :wait t :output output :if-output-exists :supersede)) (probe-file parameters)) (when (and (eq type :mem) (probe-file trace)) (let ((trace (read-file trace)) (iterations 0) events) ;; ;; extract number of event and features and count up the number ;; of iterations ;; ;; fixme: adapt this for SVMs as well (erikve 19/02-10) (multiple-value-bind (foo matches) (ppcre::scan-to-strings "\\nClasses = ([0-9]+)" trace) (declare (ignore foo)) (when matches (setf events (parse-integer (aref matches 0) :junk-allowed t)))) (ppcre:do-matches (start end "\\n *[0-9]+ [0-9.e+-]+ [0-9.e+-]+ [0-9.e+-]+" trace nil) (incf iterations)) (nconc fold (pairlis '(:f-events :f-iterations :f-estimation) (list events iterations trace))))) (when (or (eq type :mem) (and (or (eq type :svm) (eq type :svmrank) (eq type :perf)) (= *svm-kernel* 0))) (let ((parameters (case type (:mem parameters) ((:svm :svmrank :perf) (svm2weights parameters))))) (when (probe-file parameters) (nconc fold (acons :f-features (get-field :lines (wc parameters)) nil))))) ;;; (unless *maxent-debug-p* ;;; (ignore-errors (delete-file trace)) ;;; (ignore-errors (delete-file events)) ;;; (ignore-errors (delete-file parameters))) model)))) ;;;(defun estimate-model (items ;;; &key (identity (current-pid)) fold ;;; (stream *tsdb-io*) model type) ;;; ;;; (declare (ignore stream)) ;;; ;;; (let* ((model (or model (make-model))) ;;; (events (format ;;; nil ;;; "~a/.model.~a.~a.events" ;;; (tmp :redwoods) (current-user) (current-pid))) ;;; (trace (format ;;; nil ;;; "~a/.model.~a.~a.trace" ;;; (tmp :redwoods) (current-user) (current-pid))) ;;; (source (get-field :source (first items))) ;;; (cache (profile-find-context-cache source identity))) ;;; (unless (model-parameters model) ;;; (setf (model-parameters model) ;;; (format ;;; nil ;;; "~a/.model.~a.~a.weights" ;;; (tmp :redwoods) (current-user) (current-pid)))) ;;; (with-open-file (out events :direction :output ;;; :if-does-not-exist :create ;;; :if-exists :supersede ;;; :element-type '(unsigned-byte 8)) ;;; (loop ;;; for item in items ;;; for cc = (let ((foo (get-field :source item))) ;;; (cond ;;; ((string= source foo) cache) ;;; (t ;;; (setf source foo) ;;; (setf cache ;;; (profile-find-context-cache source identity))))) ;;; for iid = (get-field :i-id item) ;;; for file = (merge-pathnames ;;; cc ;;; (make-pathname :name (format ;;; nil ;;; "~a~@[.~a~]" ;;; iid *feature-random-sample-size*))) ;;; for readings = (get-field :readings item) ;;; when (or (eq *redwoods-task* :classify) ;;; (> readings 1)) ;;; do ;;; ;; ;;; ;; for items that were not annotated or for some other reason have ;;; ;; no information in the feature and, thus, context cache, cp() ;;; ;; will just do nothing, when .file. does not exist. ;;; ;; ;;; (when (or (null *feature-random-sample-size*) ;;; (<= readings *feature-random-sample-size*) ;;; (null (cp file out))) ;;; ;; ;;; ;; even when random sampling is enabled, for contexts with fewer ;;; ;; events than the maximum sample size, the context cache will ;;; ;; not contain two files, but rather just the base file name; so ;;; ;; try to fall back on the non-sampled file then. ;;; ;; ;;; (let ((file (merge-pathnames ;;; cc ;;; (make-pathname :name (format nil "~a" iid))))) ;;; (cp file out))))) ;;; ;;; (let* ((parameters (model-parameters model)) ;;;;; ;;fixme; check for list of variances ;;;;; (variances (when (numberp *maxent-variance*) ;;;;; (let ((name (format ;;;;; nil ;;;;; "~a/.model.~a.~a.variances" ;;;;; (tmp :redwoods) ;;;;; (current-user) (current-pid)))) ;;;;; (with-open-file (stream name ;;;;; :direction :output ;;;;; :if-exists :supersede) ;;;;; (format stream "~f" *maxent-variance*)) ;;;;; name))) ;;; (command (case type ;;; (:mem ;;; (format ;;; nil ;;; "tadm -monitor -events_in ~a -params_out ~a~ ;;; ~@[ -method ~(~a~)~]~ ;;; ~@[ -max_it ~a~]~@[ -frtol ~a~]~@[ -fatol ~a~]~ ;;; ~@[ -l2 ~a~]~ ;;; ~:[~; -uniform~]~ ;;; ~@[ ~a~]" ;;;;;; ~@[ -variances ~a~]~ ;;; events parameters ;;; *maxent-method* ;;; *maxent-iterations* ;;; *maxent-relative-tolerance* *maxent-absolute-tolerance* ;;; *maxent-variance* ;;;;;; variances ;;; *maxent-uniform* ;;; *maxent-extra-options*)) ;;; (:perf ;;; (format ;;; nil ;;; "svm_perform_learn -v 1 -y 2 -n 10 -q 40 -t 0~ ;;; ~@[ -# ~a~]~@[ -e ~a~]~@[ -c ~a~] ~a ~a" ;;; *svm-iterations* *svm-tolerance* ;;; *svm-error-to-margin* events parameters)) ;;; (:svm ;;; (format ;;; nil ;;; "svm_learn -v 2 -n 10 -q 40 -m ~a -z ~a -t ~a~ ;;; ~@[ -g ~a~]~ ;;; ~@[ -d ~a~]~@[ -s ~a~]~@[ -r ~a~]~@[ -# ~a~]~ ;;; ~@[ -j ~a~]~@[ -e ~a~]~@[ -c ~a~] ~a ~a" ;;; *svm-cache-size* ;;; (case *redwoods-task* (:rank "p") (:classify "c")) ;;; *svm-kernel* *svm-rbf-g* *svm-poly-d* ;;; *svm-sig-poly-s* *svm-sig-poly-r* *svm-iterations* ;;; *svm-cost-balance* *svm-tolerance* ;;; *svm-error-to-margin* events parameters)))) ;;; (output (if *maxent-debug-p* nil "/dev/null"))) ;;; (format t "estimate-model(): running process: ~a~%" command) ;;; (when (and (zerop (run-process ;;fixme: if the command does not terminate with zero, raise an error! ;;; (format nil "~a | tee '~a'" command trace) ;;; :wait t ;;; :output output :if-output-exists :supersede)) ;;; (probe-file parameters)) ;;; ;;; (when (and (eq type :mem) (probe-file trace)) ;;; (let ((trace (read-file trace)) ;;; (iterations 0) ;;; events) ;;; ;; ;;; ;; extract number of event and features and count up the number ;;; ;; of iterations ;;; ;; ;;; ;; fixme: adapt this for SVMs as well (erikve 19/02-10) ;;; (multiple-value-bind (foo matches) ;;; (ppcre::scan-to-strings "\\nClasses = ([0-9]+)" trace) ;;; (declare (ignore foo)) ;;; (when matches ;;; (setf events ;;; (parse-integer (aref matches 0) :junk-allowed t)))) ;;; (ppcre:do-matches ;;; (start end ;;; "\\n *[0-9]+ [0-9.e+-]+ [0-9.e+-]+ [0-9.e+-]+" ;;; trace nil) ;;; (incf iterations)) ;;; (nconc ;;; fold ;;; (pairlis '(:f-events :f-iterations :f-estimation) ;;; (list events iterations trace))))) ;;; (when (or (eq type :mem) ;;; (and (eq type :svm) ;;; (= *svm-kernel* 0))) ;;; (let ((parameters (case type ;;; (:mem parameters) ;;; (:svm (svm2weights parameters))))) ;;; (when (probe-file parameters) ;;; (nconc ;;; fold ;;; (acons :f-features (get-field :lines (wc parameters)) nil))))) ;;;;;; (unless *maxent-debug-p* ;;;;;; (ignore-errors (delete-file trace)) ;;;;;; (ignore-errors (delete-file events)) ;;;;;; (ignore-errors (delete-file parameters))) ;;; model)))) (defun svm2weights (svm-file) (let* ((output (format nil "~a/.model.~a.~a.svm_weights" (tmp :redwoods) (current-user) (current-pid))) ;; ;; _fix_me_ ;; svm2weights.pl will only work for linear kernels ;; (15-feb-07; erik) ;; ;; _fix_me_ ;; the script should probably reside in the `bin' sub-directory, ;; side-by-side with the other SVM binaries. (12-mar-07; oe) ;; (command (format nil "~a/uio/bin/svm2weights.pl ~a" (getenv "LOGONROOT") svm-file))) (unless (and (zerop (run-process command :wait t :output output :if-output-exists :supersede)) (probe-file output)) (format t "svm2weights(): unable to compute weights from SVM model: ~a" svm-file)) output)) (defun read-weights (model &optional (file (model-parameters model)) sv-p) (if (and (null file) sv-p) ;; for kernels, create a model with all zero weights (let ((count (symbol-table-count (or (model-map model) (model-table model))))) ;;; (setf (model-size model) count) ;;; (setf (model-counts model) ;;; (adjust-array (model-counts model) count)) ;;; (setf (model-weights model) ;;; (adjust-array (model-weights model) count)) ;;; (setf (model-count model) count) ;;; (loop ;;; for i from 0 to (- count 1) ;;; for code = (if (model-map model) ;;; (code-to-symbol i (model-map model)) ;;; i) ;;; do (setf (aref (model-weights model) code) 0))) (loop for i from 0 to (- count 1) for code = (if (model-map model) (code-to-symbol i (model-map model)) i) for weight = 0 ; while weight do do (when (>= code (model-size model)) (let ((n (setf (model-size model) (* (model-size model) 2)))) (setf (model-counts model) (adjust-array (model-counts model) n)) (setf (model-weights model) (adjust-array (model-weights model) n)))) (setf (aref (model-weights model) code) weight) (incf (model-count model)))) (with-open-file (stream file :direction :input :if-does-not-exist nil) (when stream (loop for i from 0 for code = (if (model-map model) (code-to-symbol i (model-map model)) i) for weight = (read stream nil nil) while weight do (when (>= code (model-size model)) (let ((n (setf (model-size model) (* (model-size model) 2)))) (setf (model-counts model) (adjust-array (model-counts model) n)) (setf (model-weights model) (adjust-array (model-weights model) n)))) (setf (aref (model-weights model) code) weight) (incf (model-count model))))))) (defun learner-rank-items (items model &key (identity (current-pid)) fold type (stream *tsdb-io*)) (format t "~&[~a] learner-rank-items(): model-parameters = ~a~%" (current-time :long :short) (model-parameters model)) (let* ((parameters (model-parameters model)) (events (format nil "~a/.model.~a.~a.events" (tmp :redwoods) (current-user) (current-pid))) (source (get-field :source (first items))) (cache (profile-find-context-cache source identity)) active) (when (null parameters) (format t "[~a] learner-rank-items(): invalid model: no parameters.~%" (current-time :long :short)) (return-from learner-rank-items)) (format stream "~&[~a] learner-rank-items(): evaluating ~d item~p ~%" (current-time :long :short) (length items) (length items)) ;; ;; in order to keep using this stream across multiple calls to cp(), its ;; :element-type needs to match the expecations of cp(), i.e. be byte-wise ;; binary data. ;; (with-open-file (out events :direction :output :if-does-not-exist :create :if-exists :supersede :element-type '(unsigned-byte 8)) (loop for item in items for cc = (let ((foo (get-field :source item))) (cond ((string= source foo) cache) (t (setf source foo) (setf cache (profile-find-context-cache source identity))))) for iid = (get-field :i-id item) for readings = (get-field :readings item) for file = (merge-pathnames cc (make-pathname :name (format nil "~a" iid))) when (eq *redwoods-task* :classify) ;;; in case there are no active features for any for an item's ;;; tokens, and thereby no context-cache is generated, make sure ;;; every tokens has a :class. fixme: maye only do this for ;;; just the items that would otherwise be skipped. do (loop for token in (get-field :i-tokens item) if (test-field :class token) do (setf (get-field :class token) 0) else do (nconc token (acons :class 0 nil))) when (and (probe-file file) (or (eq *redwoods-task* :classify) (> readings 1))) do (cp file out) (push item active) else do (format t "~&[~a] learner-rank-items(): mysteriously skipping item # ~d.~%" (current-time :long :short) (get-field :i-id item)))) (setf active (nreverse active)) (let* ((scores (format nil "~a/.model.~a.~a.scores" (tmp :redwoods) (current-user) (current-pid))) (output (format nil "~a/.model.~a.~a.output" (tmp :redwoods) (current-user) (current-pid))) (command (case type (:mem (format nil "evaluate -s '~a' '~a' '~a'" scores parameters events)) (:perf (format nil "svm_perf_classify ~a ~a ~a" events parameters scores)) (:svmrank (format nil "svm_rank_classify ~a ~a ~a" events parameters scores)) (:svm (format nil "svm_classify '~a' '~a' '~a'" events parameters scores))))) (when (and (zerop (run-process command :wait t :output output :if-output-exists :supersede)) (probe-file scores) (probe-file output)) (format t "~&[~a] learner-rank-items(): ranking ~d item~p ~%" (current-time :long :short) (length active) (length active)) (with-open-file (stream scores :direction :input) (case *redwoods-task* (:rank (loop for item in active for results = (get-field :results item) for ranks = (loop for result in results for rid = (get-field :result-id result) for score = (read stream nil nil) unless score do (error "learner-rank-items(): mysterious score deficit") collect (pairlis '(:result-id :score) (list rid score))) do (let* ((ranks (sort ranks #'> :key #'(lambda (foo) (get-field :score foo)))) (ranks (loop with last = (get-field :score (first ranks)) with i = 1 with j = 2 for rank in ranks for score = (get-field :score rank) unless (= score last) do (setf i j) (setf last score) (incf j) collect (acons :rank i rank)))) (if (get-field :ranks item) (setf (get-field :ranks item) ranks) (nconc item (acons :ranks ranks nil)))))) (:classify ;; (when *token-filter-hook* ;;; (loop ;;; for item in items ;;; do (loop ;;; for token in (get-field :i-tokens item) ;;; if (test-field :class token) ;;; do (setf (get-field :class token) 0) ;;; else do (nconc token (acons :class 0 nil)))) (loop for item in active ;; for tokens = (get-field :i-tokens item) for tokens = (if *redwoods-token-filter* (remove-if-not *redwoods-token-filter* (get-field :i-tokens item)) (get-field :i-tokens item)) do (loop for token in tokens for score = (read stream nil nil) unless score do (error "learner-rank-items(): no classifier score.") if (get-field :class token) do (setf (get-field :class token) score) else do (nconc token (acons :class score nil)) when (eq type :mem);;discard rest of line (dummy event) do (read-line stream nil nil)))))) (when fold (when (eq type :mem) (with-open-file (stream output :direction :input) (let ((line (read-line stream nil nil))) (when line (multiple-value-bind (foo matches) (ppcre:scan-to-strings "([0-9.]+) [0-9]*$" line) (declare (ignore foo)) (when matches (let ((accuracy (acons :eaccuracy (aref matches 0) nil))) (if (get-field :f-extras fold) (nconc (get-field :f-extras fold) accuracy) (nconc fold (acons :f-extras accuracy nil)))))))))))) (unless *maxent-debug-p* (ignore-errors (delete-file events)) (ignore-errors (delete-file scores)) (ignore-errors (delete-file output)))) (if (eq *redwoods-task* :classify) items active))) (defconstant e (exp 1d0)) (defun scores-to-probabilities (scores) (loop with sum = 0d0 for score in scores for foo = (if (stringp score) (read-from-string score) score) for p = (expt e (coerce foo 'long-float)) collect p into probabilities do (incf sum p) finally (return (loop for p in probabilities collect (/ p sum))))) (defun entropy (probabilities) (loop with h = 0d0 for p in probabilities do (incf h (* p (log p 2d0))) finally (return (- h)))) (defun baseline (profile &key condition (n 1) (resolvedp t) enhancers) (loop with nitems = 0 for item in (loop for item in (let* ((condition (if resolvedp (if condition (format nil "readings > 1 && t-active >= 1 && (~a)" condition) "readings > 1 && t-active >= 1") (if condition (format nil "readings > 1 && (~a)" condition) "readings > 1"))) (items (analyze profile :thorough '(:flags) :condition condition :gold profile))) (loop for enhancer in enhancers do (loop for item in items do (call-raw-hook enhancer item))) items) for readings = (length (get-field :results item)) for ranks = (length (get-field :ranks item)) unless (= readings ranks) collect item and do (incf nitems)) for readings = (length (get-field :results item)) for gold = (max n (length (get-field :ranks item))) sum gold into gsum sum readings into rsum sum (/ gold readings) into sum finally (return (list (float (/ sum nitems)) ;;average random chance (float (/ gsum nitems)) ;;average # gold (float (/ rsum nitems)) ;;average # results nitems)))) ;;# items (defun print-score-file (&key (output (format nil "~a/scores" (tmp :redwoods))) gold name pattern condition (similarities '(:bleu))) (with-open-file (stream output :direction :output :if-exists :supersede) (loop with *redwoods-score-similarities* = similarities for db in (find-tsdb-directories *tsdb-home* :pattern pattern :name name) for name = (let ((name (get-field :database db))) (unless (string= name gold) name)) when name do (let* ((scores (summarize-scores name gold :condition condition :n 1 :test :id :spartanp t :loosep t)) (total (rest (rest (find :total scores :key #'first)))) (nscores (get-field :scores total)) (exact (get-field :exact total)) (tsims (get-field :tsimilarities total)) (nsims (get-field :nsimilarities total))) ;;;; (bleu (get-field :bleu (get-field :similarities total)))) (purge-profile-cache name) (format stream "~,6f ~a ~a `~a'~%" ;;;; "~,6f ~,6f `~a'~%" (* 100 (divide exact nscores)) tsims nsims ;;;; (if bleu (divide bleu nscores) 0.0) name) (force-output stream)))) (purge-profile-cache gold)) (defun summarize-folds (&key (output (format nil "~a/folds" (tmp :redwoods))) name pattern (score :accuracy) (type :total) header-p) (if (eq *redwoods-task* :classify) (summarize-classification-folds :output output :type type :header-p header-p :name name :pattern pattern) (with-open-file (stream output :direction :output :if-exists :supersede :if-not-exists :create) (let* ((key (if (and (eq type :total) (eq score :accuracy)) :f-accuracy :f-extras)) (foo (case type (:total :tsimilarities) (:nbest :nsimilarities) (t nil)));;assume score is a field in :f-extras. (selector (cond ((eq :f-accuracy key) #'(lambda (scores) (read-from-string (get-field key scores) nil))) ((and (eq type :nbest) (eq score :accuracy)) #'(lambda (scores) (get-field :naccuracy (read-from-string (get-field key scores) nil)))) ((null foo) ;;assume score is a field in :f-extras. #'(lambda (scores) (get-field score (read-from-string (get-field key scores) nil)))) (t #'(lambda (scores) (get-field score (get-field foo (read-from-string (get-field key scores) nil)))))))) (loop for profile in (cond ((not (null pattern)) (mapcar #'(lambda(db) (get-field :database db)) (find-tsdb-directories *tsdb-home* :pattern pattern :name name))) ((and name (listp name)) name) ((stringp name) (list name)) (t (error "summarize-folds():~ name or pattern argument missing."))) for values = (select (list (format nil "~(~a~)" key)) '(:string) "fold" nil profile) for scores = (when values (map 'list selector values)) when (and scores (notany 'null scores)) do (let* ((n (length scores)) (sum (sum scores)) (mean (/ sum n)) (min (apply #'min scores)) (max (apply #'max scores)) (range (- max min)) (var (if (= n 1) 0 (/ (sum (mapcar #'(lambda (x) (expt (- x mean) 2)) scores)) (- n 1)))) (std-dev (sqrt var))) (purge-profile-cache profile) (format stream "~,6f ~,6f ~,6f `~a'~%" mean std-dev range profile) (force-output stream))))))) (defun summarize-classification-folds (&key name pattern type header-p (output (format nil "~a/folds" (tmp :redwoods)))) (let* ((types (case type (:extended '(:t_false_positives :t_false_negatives :t_precision :t_recall :t_f1 :t_false_positives :s_false_negatives :s_precision :s_recall :s_f1)) (:f1 '(:t_f1 :s_f1)) (t '(:t_precision :t_recall :t_f1 :s_precision :s_recall :s_f1)))) (selector #'(lambda (scores) (loop for type in types collect (get-field type scores)))) (out)) (unwind-protect (progn (when (stringp output) (setq out (open output :direction :output :if-exists :supersede :if-not-exists :create))) (when header-p (format (or out t) "~&~(~{~a ~}~)~%" types)) (loop for profile in (cond ((not (null pattern)) (mapcar #'(lambda(db) (get-field :database db)) (find-tsdb-directories *tsdb-home* :pattern pattern :name name))) ((and name (listp name)) name) ((stringp name) (list name)) (t (error "summarize-classification-folds():~ name or pattern argument missing."))) for f-extras = (map 'list #'(lambda (x) (read-from-string (get-field :f-extras x) nil nil)) (select (list "f-extras") '(:string) "fold" nil profile)) for nfolds = (length f-extras) for scores = (map 'list selector f-extras) for sums = (when (and scores (notany 'null scores)) (apply #'mapcar (cons #'+ scores))) for means = (mapcar #'(lambda (x) (/ x nfolds)) sums) do ;; only report min/max/range/var/std-dev for f-scores, ;; -too much information otherwise. (when means (case type (:f1 (let* ((min (apply #'mapcar (cons #'min scores))) (max (apply #'mapcar (cons #'max scores))) (range (mapcar #'(lambda (x y) (- x y)) max min)) (var (loop for i from 0 for mean in means for sum-sqr-dist = (sum (map 'list #'(lambda (x) (expt (- (nth i x) mean) 2)) scores)) for var = (if (= nfolds 1) 0 (/ sum-sqr-dist (- nfolds 1))) collect var)) (std-dev (mapcar #'sqrt var))) (purge-profile-cache profile) (format (or out t) "~&~,2f ~,2f ~,2f " (first means) (first std-dev) (first range)) (format (or out t) "~,2f ~,2f ~,2f `~a'~%" (second means) (second std-dev) (second range) profile))) (t (purge-profile-cache profile) (format (or out t) "~{~,2f ~}`~a'~%" means profile) ;; prints token scores first, then sentence-level. (when out (force-output out))))))) (when out (close out))))) (defun average-feature-count (profile) (let ((selected (select (list "f-features") '(:string) "fold" nil profile))) (float (/ (sum (map 'list #'(lambda (x) (read-from-string (get-field :f-features x) nil nil)) selected)) (length selected))))) (defun wilcoxon (list1 list2) (unless (= (length list1) (length list2)) (error "wilcoxon(): given lists of different lengths: ~a and ~a~%" list1 list2)) (let* ((signed-diffs (remove-if #'zerop (mapcar #'(lambda (x y) (- x y)) list1 list2))) (ranked-diffs (sort (copy-list signed-diffs) #'< :key #'abs)) (ranks (loop for (d1 d2) on ranked-diffs with ranks = nil with result = nil for rank from 1 with push-p = nil with push-n = 1 with avg do (cond ((and d2 (= (abs d1) (abs d2))) (incf push-n) (setq push-p nil)) (t (setq push-p t))) (push rank ranks) when push-p do (setq avg (float (/ (sum ranks) push-n))) (dotimes (n push-n) (push avg result)) (setq push-n 1) (setq ranks nil) finally (return (nreverse result)))) (signed-ranks (loop for d in ranked-diffs for r in ranks collect (if (> d 0) r (- r)))) (w-pos (loop for r in signed-ranks when (> r 0) sum r)) (w-neg (abs (loop for r in signed-ranks when (< r 0) sum r)))) (values (min w-pos (abs w-neg)) (length signed-ranks)))) (defun t-test (list1 list2) (let* ((n1 (length list1)) (n2 (length list2)) (n (if (not (= n1 n2)) (error "t-test(): given lists of different lengths: ~a and ~a~%" list1 list2) n1)) (diffs (mapcar #'(lambda (x y) (- x y)) list1 list2)) (sum (sum diffs)) (mean (/ sum n)) (std-dev (sqrt (/ (sum (mapcar #'(lambda (x) (expt (- x mean) 2)) diffs)) (- n 1)))) (result (/ (* mean (sqrt n)) std-dev))) (values ;;; t ratio: ;;; (Use this when consulting a table of Student's t-distribution ;;; confidence intervals to determine the significance level at which two ;;; distributions differ) (abs result) ;;; degrees of freedom: (- n 1)))) (defun mw-t-test (list1 list2) (let* ((n1 (length list1)) (n2 (length list2)) (n (if (not (= n1 n2)) (error "mw-t-test(): given lists of different lengths:~ ~a and ~a~%" list1 list2) n1)) (sum1 (sum list1)) (sum2 (sum list2)) (mean1 (/ sum1 n)) (mean2 (/ sum2 n)) (diffs1 (mapcar #'(lambda (x) (- x mean1)) list1)) (diffs2 (mapcar #'(lambda (x) (- x mean2)) list2)) (sum-squared-diffs (sum (mapcar #'(lambda (x y) (expt (- x y) 2)) diffs1 diffs2))) (result (* (- mean1 mean2) (sqrt (/ (* n (- n 1)) sum-squared-diffs))))) (values (abs result) (- n 1)))) (defparameter *t-dist* '((0.10 (3.078 1.886 1.638 1.533 1.476 1.44 1.415 1.397 1.383 1.372 1.363 1.356 1.35 1.345 1.341 1.337 1.333 1.33 1.328 1.325)) (0.05 (6.314 2.92 2.353 2.132 2.015 1.943 1.895 1.86 1.833 1.812 1.796 1.782 1.771 1.761 1.753 1.746 1.74 1.734 1.729 1.725)) (0.025 (12.706 4.303 3.182 2.776 2.571 2.447 2.365 2.306 2.262 2.228 2.201 2.179 2.16 2.145 2.131 2.12 2.11 2.101 2.093 2.086)) (0.01 (31.821 6.965 4.541 3.747 3.365 3.143 2.998 2.896 2.821 2.764 2.718 2.681 2.65 2.624 2.602 2.583 2.567 2.552 2.539 2.528)) (0.005 (63.657 9.925 5.841 4.604 4.032 3.707 3.499 3.355 3.25 3.169 3.106 3.055 3.012 2.977 2.947 2.921 2.898 2.878 2.861 2.845)) (0.001) (318.313 22.327 10.215 7.173 5.893 5.208 4.782 4.499 4.296 4.143 4.024 3.929 3.852 3.787 3.733 3.686 3.646 3.61 3.579 3.552)) "t-test upper critical values") (defparameter *wilcoxon-dist* '((0.05 (NIL NIL NIL NIL 1 2 4 6 8 11 14 17 21 26 30 36 41 47 54 60)) (0.025 (NIL NIL NIL NIL NIL 1 2 4 6 8 11 14 17 21 25 30 35 40 46 52)) (0.01 (NIL NIL NIL NIL NIL NIL 0 2 3 5 7 10 13 16 20 24 28 33 38 43)) (0.005 (NIL NIL NIL NIL NIL NIL NIL 0 2 3 5 7 10 13 16 19 23 28 32 37))) "wilcoxon upper critical values") (defun get-critical-value (&key (level 0.05) (df 9) (sides 2) (test :ttest)) (unless (or (= sides 1) (= sides 2)) (error "critical-value(): sides should be 1 or 2. given ~a." sides)) (nth (- df 1) (cadr (assoc (/ level sides) (case test ((:t :tee :ttest) *t-dist*) ((:w :wilcoxon) *wilcoxon-dist*)))))) ;;fixme: add a key parameter for selecting other fields than f-accuracy: (defun compare-folds (name1 name2 &key (test :t) (stream *tsdb-io*) (level 0.05) (sides 2)) (let* ((acc1 (select '("f-accuracy") '(:string) "fold" nil name1)) (acc2 (select '("f-accuracy") '(:string) "fold" nil name2)) (acc1 (map 'list #'(lambda (x) (read-from-string (cdar x) nil nil)) acc1)) (acc2 (map 'list #'(lambda (x) (read-from-string (cdar x) nil nil)) acc2)) (testfun (case test ((:t :ttest) #'t-test) ((:w :wilcoxon) #'wilcoxon)))) (multiple-value-bind (stat df) (funcall testfun acc1 acc2) (let ((cv (get-critical-value :level level :sides sides :test test :df df))) (purge-profile-cache name1) (purge-profile-cache name2) (format stream "~&~,6f ~a ~a~%`~a'~%`~a'~%" stat cv df name1 name2))))) (defun create-evaluation-file (data &optional (gold data) &key (condition "readings > 1 && t-active > 0") (n 5) (test :id) supersede (ignore *redwoods-test-ignore-profiles*) (task *redwoods-task*) (loosep t) (stream *tsdb-io*) (similarities '(:neva :wa)) scopes scopes-fall-back gold-p) (let* ((*redwoods-task* task) (compress-command "gzip -c -9") (tsdb-dir (find-tsdb-directory data)) (eval (make-pathname :directory tsdb-dir :name "eval")) (eval-gz (make-pathname :directory tsdb-dir :name "eval" :type "gz"))) ;;; clean up: (when (or (cl-fad:file-exists-p eval-gz) (cl-fad:file-exists-p eval)) (if (not supersede) (return-from create-evaluation-file) (progn (when (cl-fad:file-exists-p eval-gz) (delete-file eval-gz)) (when (cl-fad:file-exists-p eval) (delete-file eval))))) (format stream "~&[~a] create-evaluation-file(): creating `eval.gz' for ~a~%" (current-time :long :short) data) (multiple-value-bind (eval-stream foo pid) (run-process compress-command :wait nil :input :stream :output eval-gz :if-output-exists :supersede :error-output nil) (declare (ignore foo)) (let* ((*redwoods-score-similarities* similarities) (thorough (when (eq test :derivation) '(:derivation))) (thorough (if *redwoods-score-similarities* (cons :surface thorough) thorough)) (gitems-unsifted (analyze gold :commentp (eq *redwoods-task* :classify) :tokensp (eq *redwoods-task* :classify) :thorough thorough :condition condition :gold gold :readerp (eq test :derivation))) (items (case *redwoods-task* (:classify (let ((itemz (if scopes ;; determine scopes to be scored (re-classify-profile data :source gold :export-xml-p nil :scopes scopes :scopes-fall-back scopes-fall-back :tasks '(:task2) :gold-p gold-p :ignore ignore) (if gold-p ;; if gold and not scopes ;; (eg. when scoring the morante profiles.) (analyze data :commentp t :tokensp t) (read-classified-items data gold :ignore ignore))))) (when gold-p (dolist (item itemz) (dolist (token (get-field :i-tokens item)) (nconc token (acons :class 0 nil))) (dolist (cspan (list-cue-spans item)) (dolist (token (spanned-tokens (first cspan) (second cspan) item)) (setf (get-field :class token) 1))))) itemz)) (:rank (loop for item in (analyze gold :thorough thorough :condition condition :score data :scorep t :readerp (eq test :derivation)) for gitem in gitems-unsifted for readings = (length (get-field :results gitem)) for ranks = (length (get-field :ranks gitem)) unless (or (= readings ranks) (not (= (get-field :i-id gitem) (get-field :i-id item)))) collect (copy-graph item))))) (items (if ignore (loop for item in items unless (member (get-field :source item) ignore :test #'string=) collect item) items)) (gitems (if (eq *redwoods-task* :rank) (loop for item in gitems-unsifted for readings = (length (get-field :results item)) for ranks = (length (get-field :ranks item)) unless (= readings ranks) collect (copy-graph item)) gitems-unsifted)) (gitems (if ignore (loop for item in gitems unless (member (get-field :source item) ignore :test #'string=) collect item) gitems)) (nkeys (loop for sim in similarities collect ;;; eg., (:nwa :nbleu :nneva) (read-from-string (format nil ":n~a" sim)))) (nkey-alist (pairlis similarities nkeys)) (keys (case *redwoods-task* (:rank (append (list :i-id :accuracy :naccuracy);; :r-id similarities nkeys)) (:classify (list :i-id :item-match :token-match :scope-match)))) (data (make-list (length keys))) (scores (pairlis keys data)));;fixme -add keys for classification (loop for item in items for gitem in gitems for i-id = (get-field :i-id item) do (case *redwoods-task* (:rank (multiple-value-bind (i score loosep similarities) (score-item item gitem :test test :n n :loosep loosep) ;;; (= i 0) means no match ;;; (<= i n) means we have a hit ;;; (= i 1) means exact match (declare (ignore loosep)) (push (if (= i 1) (float score) 0) (get-field :accuracy scores)) (push (if (<= i n) (float score) 0) (get-field :naccuracy scores)) (push i-id (get-field :i-id scores)) (loop for (key score nscore) in similarities do (push (float score) (get-field key scores)) (push (float nscore) (get-field (get-field key nkey-alist) scores))))) (:classify (multiple-value-bind (item-score token-scores scope-scores) (score-item item gitem :task :classify) (push item-score (get-field :item-match scores)) (push token-scores (get-field :token-match scores)) (push scope-scores (get-field :scope-match scores)) (push i-id (get-field :i-id scores)))))) (loop for (key . list) in scores do (setf (get-field key scores) (nreverse list))) (loop for list in scores do (prin1 list eval-stream) (terpri eval-stream))) (force-output eval-stream) (close eval-stream) (sys:os-wait nil pid)))) (defun batch-create-evaluation-files (&key pattern supersede gold n (condition "readings > 1 && t-active > 0") (similarities *redwoods-score-similarities*)) (loop for db in (find-tsdb-directories *tsdb-home* :pattern pattern) for name = (let ((name (get-field :database db))) (unless (string= name gold) name)) when name do (create-evaluation-file name gold :condition condition :n n :supersede supersede :similarities similarities) (purge-profile-cache name)) (purge-profile-cache gold)) (defun read-evaluation-file (profile &key (score :accuracy)) (let* ((tsdb-dir (find-tsdb-directory profile)) (eval (make-pathname :directory tsdb-dir :name "eval")) (eval-gz (make-pathname :directory tsdb-dir :name "eval" :type "gz")) (did-unzip-p nil)) (when (and (not (cl-fad:file-exists-p eval-gz)) (not (cl-fad:file-exists-p eval))) (error "read-evaluation-file(): `eval(.gz)' does not exists for `~a'." profile)) (when (cl-fad:file-exists-p eval-gz) ;;;allegro specific.. (excl:run-shell-command (format nil "gunzip ~a" eval-gz) :wait t) (setq did-unzip-p t)) (if (not (cl-fad:file-exists-p eval)) (error "read-evaluation-file(): cannot find `eval' file for `~a'." profile) (with-open-file (in eval :direction :input) (loop for list = (read in nil nil) until (or (null list) (eq (first list) score)) finally (progn (when did-unzip-p (excl:run-shell-command (format nil "gzip -9 ~a" eval) :wait t)) (return (if (or (eq score :token-match) (eq score :scope-match)) (apply #'append (rest list)) (rest list))))))))) (defun summarize-evaluation-file (profile &optional (stream *tsdb-io*)) (let* ((tsdb-dir (find-tsdb-directory profile)) (eval (make-pathname :directory tsdb-dir :name "eval")) (eval-gz (make-pathname :directory tsdb-dir :name "eval" :type "gz")) (did-unzip-p nil)) (when (and (not (cl-fad:file-exists-p eval-gz)) (not (cl-fad:file-exists-p eval))) (error "summarize-evaluation-file(): `eval(.gz)' does not exists for `~a'." profile)) (when (cl-fad:file-exists-p eval-gz) ;;;allegro specific (excl:run-shell-command (format nil "gunzip ~a" eval) :wait t) (setq did-unzip-p t)) (if (not (cl-fad:file-exists-p eval)) (error "summarize-evaluation-file(): cannot find `eval' file for `~a'." profile) (with-open-file (in eval :direction :input) (loop with total-items for list = (read in nil nil) until (null list) do (case (car list) ((:accuracy :naccuracy ) (format stream "~&~a: ~,2f~%" (car list) (* (/ (sum (cdr list)) (length (cdr list))) 100))) (:i-id (setq total-items (length (cdr list)))) (t (format stream "~&~a: ~,3f~%" (car list) (/ (sum (cdr list)) (length (cdr list)))))) finally (format stream "~&# Items: ~a~%" total-items)))) (when did-unzip-p (excl:run-shell-command (format nil "gzip -9 ~a" eval) :wait t))) nil) (defun test-evaluation-scores (profile1 profile2 &key (score :accuracy) (test :signtest) (tails :both)) (let ((scores1 (read-evaluation-file profile1 :score score)) (scores2 (read-evaluation-file profile2 :score score))) (case test ((:signtest :binomial) (stats:sign-test-on-sequences scores1 scores2 :tails tails)) ((:wilcoxon :signed-rank :wilcoxon-signed-rank) (stats:wilcoxon-signed-rank-test-on-sequences scores1 scores2 :tails tails)) ((:ttest :paired-ttest) (stats:t-test-paired-on-sequences scores1 scores2 :tails tails)) (t (error "test-evaluation-scores(): unknown test `~a'." test))))) (defun profiles-average-f1 (profiles &key nfolds types format) (let* ((types (or types '(:t_f1 :s_f1))) (selector #'(lambda (scores) (loop for type in types collect (get-field type scores)))) (t_f1) (s_f1) (n (length profiles))) (loop for profile in profiles for f-extras = (map 'list #'(lambda (x) (read-from-string (get-field :f-extras x) nil nil)) (select (list "f-extras") '(:string) "fold" nil profile)) for nf = (if nfolds (progn (setf f-extras (last f-extras nfolds)) nfolds) (length f-extras)) for scores = (map 'list selector f-extras) for sums = (when (and scores (notany 'null scores)) (apply #'mapcar (cons #'+ scores))) for means = (mapcar #'(lambda (x) (/ x nf)) sums) do (push (first means) t_f1) (push (second means) s_f1)) (loop for scores in (list t_f1 s_f1) for type in types for sum = (sum scores) for mean = (/ sum n) for min = (apply #'min scores) for max = (apply #'max scores) for range = (- max min) for sum-sqr-dist = (sum (map 'list #'(lambda (x) (expt (- x mean) 2)) scores)) for var = (/ sum-sqr-dist (- n 1)) for std-dev = (sqrt var) if (eq format :list) collect (pairlis (list :mean :min :max) (list mean min max)) else do (format t "~&Avg. ~a: ~,2f (min ~,2f, max ~,2f, range ~,2f, var ~,2f, std ~,2f)~%" type mean min max range var std-dev)))) ;; (purge-profile-cache nil) ;;; ;; computing scores suitable for gnuplot data files: ;;; ;;;(loop ;;; for l-bit in '(8 9 10 11 12 13 14 15 16) ;;; do (format t "~&~a" (expt 2 l-bit)) ;;; (loop ;;; for count in '(2 4 8 16) ;;; do (loop ;;; for score in (profiles-average-f1 ;;; (loop ;;; for i from 1 to 5 ;;; collect (format nil "exp/conll/hri_x~a_e~a_l~a_svm_genia=fw0_sw0_tw__fl__sl3_tl__fr2_sr3_tr__malt=patr__path__drel__ltrip__ptrip__xle=xdep__sub__co__adv__adj__opt=lin_it5.0e+3_b1.0_t0.001_pc100" ;;; i count l-bit)) ;;; :format :list ;;; :types '(:s_f1)) ;;; do (format t " ~,2f" (get-field :max score))))) ;;;(setq base ;;; "ri/filter_ri-~a-~a-x~a_svm_genia=fw1_sw1_tw1_fl3_sl3_tl3_fr3_sr3_tr3_malt=patr+_path+_drel+_ltrip+_ptrip+_xle=xdep+_sub+_co+_adv+_adj+_opt=lin_it5.0e+3_b1.0_t0.001_pc100" ;;; ) ;;; ;;; ;;;(loop ;;; for size in '(10 20) ;;; do (loop ;;; for dim in '(625 1250 2500 5000 10000 20000 40000 80000) ;;; for sym = (intern (format nil "~a-~as" size dim)) ;;; do (loop ;;; for i from 1 to 5 ;;; collect (format nil base size dim i) into list ;;; finally (setf (symbol-value sym) list)))) ;;; ;;; ;;;(loop ;;; with dim = 10000 ;;; for size in '(2 4 8 16 32 64 128) ;;; for sym = (intern (format nil "~a-~as" size dim)) ;;; do (loop ;;; for i from 1 to 5 ;;; collect (format nil base size dim i) into list ;;; finally (setf (symbol-value sym) list))) ; (loop ; with dim = 5000 ; for size in '(2 4 8 16 32 64 128) ; for sym = (intern (format nil "~a-~as" size dim)) ; do (loop ; for i from 1 to 5 ; collect (format nil base size dim i) into list ; finally (setf (symbol-value sym) list))) ;;;(setq base2 ;;; "ri/hastie/filter_ri-~a-~a-z~a_svm_genia=fw0_sw0_tw0_fl__sl3_tl__fr2_sr3_tr__malt=patr__path__drel__ltrip__ptrip__xle=xdep__sub__co__adv__adj__opt=lin_it5.0e+3_b1.0_t0.001_pc100") ;;; ;;; ;;;(loop ;;; for count in '(2 4 8 16 32) ;;; for size in '(1250 2500 5000 10000 20000) ;;; for sym = (intern (format nil "~a-~as" count size)) ;;; do (loop ;;; for i from 1 to 5 ;;; collect (format nil base count size i) into list ;;; finally (setf (symbol-value sym) list)))