;;; -*- Mode: COMMON-LISP; Syntax: Common-Lisp; Package: TSDB -*- ;;; ;;; [incr tsdb()] --- Competence and Performance Profiling Environment ;;; Copyright (c) 1996 -- 2007 Stephan Oepen (oe@csli.stanford.edu) ;;; Copyright (c) 2005 -- 2007 Erik Velldal (erikve@ifi.uio.no) ;;; ;;; This program is free software; you can redistribute it and/or modify it ;;; under the terms of the GNU Lesser General Public License as published by ;;; the Free Software Foundation; either version 2.1 of the License, or (at ;;; your option) any later version. ;;; ;;; This program is distributed in the hope that it will be useful, but WITHOUT ;;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ;;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ;;; License for more details. ;;; (in-package :tsdb) ;;;(defmacro gridify (parameters &body body) ;;; (if (null (rest parameters)) ;;; `(dolist ,(first parameters) ,@body) ;;; `(dolist ,(first parameters) ;;; (gridify ,(rest parameters) ,@body)))) ;;; ;;; ;;;(defun listify (foo) ;;; (if (or (null foo) (atom foo)) (list foo) foo)) ;;; ;;;(defun cross-product (&rest lists) ;;; (labels ((cross-product2 (lists) ;;; (if (null (rest lists)) ;;; (loop ;;; for foo in (first lists) collect (list foo)) ;;; (loop ;;; with rests = (cross-product2 (rest lists)) ;;; for foo in (first lists) ;;; nconc (loop ;;; for bar in rests ;;; collect (cons foo bar)))))) ;;; ;;; (cross-product2 lists))) (defun batch-experiment (&key ;; ;; general parameters: `gold' input profile, output skeleton, how many ;; iterations of how many folds to run, et al. ;; (source *tsdb-gold*) (skeleton *tsdb-default-skeleton*) (nfold 10) (niterations nfold) (type :mem) (supersede nil) (prefix "") (compact nil) (identity (format nil "~a.~a" (current-user) (current-pid))) (recache t) (verbose t) (stream t) (debug nil) (evalp nil) ;; create eval.gz files with scores in the target profiles ;; ;; global Redwoods parameters ;; (use-preterminal-types-p *feature-use-preterminal-types-p*) (ngram-tag *feature-ngram-tag*) (score-similarities '(:neva :wa)) (enhancers *feature-item-enhancers*) (resolvedp t) normalizep (redwoods-task *redwoods-task*) ;; ;; feature selection parameters (classification mode) ;; (c-conn-form *cfeature-conn-form*) (c-form-win *cfeature-form-win*) (c-stem-win *cfeature-stem-win*) (c-tag-win *cfeature-tag-win*) (c-form-l-ngram *cfeature-form-l-ngram*) (c-stem-l-ngram *cfeature-stem-l-ngram*) (c-tag-l-ngram *cfeature-tag-l-ngram*) (c-form-r-ngram *cfeature-form-r-ngram*) (c-stem-r-ngram *cfeature-stem-r-ngram*) (c-tag-r-ngram *cfeature-tag-r-ngram*) (c-malt-deppattern *cfeature-malt-deppattern*) (c-malt-deppath *cfeature-malt-deppath*) (c-malt-deprel *cfeature-malt-deprel*) (c-malt-lextriple *cfeature-malt-lextriple*) (c-malt-postriple *cfeature-malt-postriple*) (c-xle-xdep *cfeature-xle-xdep*) (c-xle-subcat *cfeature-xle-subcat*) (c-xle-coord *cfeature-xle-coord*) (c-xle-advtype *cfeature-xle-advtype*) (c-xle-adjtype *cfeature-xle-adjtype*) (c-ptb-self *cfeature-ptb-self*) (c-ptb-parent *cfeature-ptb-parent*) (c-ptb-path *cfeature-ptb-path*) (c-ptb-left *cfeature-ptb-left*) (c-ptb-right *cfeature-ptb-right*) (c-stanford *cfeature-stanford*) ;; ;; feature selection parameters (ranking mode) ;; (grandparenting *feature-grandparenting*) (active-edges-p *feature-active-edges-p*) (lexicalization-p *feature-lexicalization-p*) (constituent-weight *feature-constituent-weight*) (ngram-size *feature-ngram-size*) (ngram-back-off-p *feature-ngram-back-off-p*) (lm-p *feature-lm-p*) (flags *feature-flags*) (random-sample-size *feature-random-sample-size*) (counts-absolute 0) (counts-contexts 0) (counts-events 0) (counts-relevant 0) ;; ;; for learning curve computation: only use percentage of training data ;; (train-percentage '(100)) ;; ;; estimation parameters: MaxEnt ;; (method *maxent-method*) (variance *maxent-variance*) (relative-tolerance *maxent-relative-tolerance*) (absolute-tolerance *maxent-absolute-tolerance*) (iterations *maxent-iterations*) ;; ;; estimation parameters: SVM ;; (kernel *svm-kernel*) (rbf-g *svm-rbf-g*) (poly-d *svm-poly-d*) (sig-poly-s *svm-sig-poly-s*) (sig-poly-r *svm-sig-poly-r*) #+:null (iterations *svm-iterations*) (balance *svm-cost-balance*) (error-to-margin *svm-error-to-margin*) (tolerance *svm-tolerance*)) (macrolet ((gridify (parameters &body body) (if (null (rest parameters)) `(dolist ,(first parameters) ,@body) `(dolist ,(first parameters) (gridify ,(rest parameters) ,@body))))) (labels ((listify (foo) (if (or (null foo) (atom foo)) (list foo) foo)) (cross-product (&rest lists) (cross-product2 lists)) (cross-product2 (lists) (if (null (rest lists)) (loop for foo in (first lists) collect (list foo)) (loop with rests = (cross-product2 (rest lists)) for foo in (first lists) nconc (loop for bar in rests collect (cons foo bar))))) (report (experiment run skipped total recache) (format stream "~&[~a] batch-experiment(): ~ experiment # ~d; grid # ~d (~d skipped); ~ (~d~@[; cc~]).~%" (current-time :long :short) experiment (+ run skipped -1) skipped total recache)) (execute-experiment (&key target recache) (let ((executep (test-experiment target :supersede supersede))) ;;;DEBUG; (format t "~&DEBUG: source: ~a, target: ~a, type: ~a, nfold: ~a, niterations: ~a, recache: ~a, identity: ~a, enhancers: ~a, resolvedp: ~a, normalizep: ~a~%" source target type nfold niterations recache identity enhancers resolvedp normalizep) (when executep (tsdb :create target :skeleton skeleton) (#-:debug handler-case #+:debug progn (rank-profile source target :type type :nfold nfold :niterations niterations :recache recache :identity identity :enhancers enhancers :resolvedp resolvedp :normalizep normalizep) #-:debug (condition (condition) (format stream "~&[~a] batch-experiment(): ~ error: `~a'.~%" (current-time :long :short) condition))) (when evalp (create-evaluation-file target source :similarities score-similarities)) (purge-profile-cache target)) executep)) (batch-experiment-hyperparameters (experiment total) (let ((run 0) (skipped 0)) (case type (:mem (gridify ((*redwoods-train-percentage* (listify train-percentage)) (*maxent-method* (listify method)) (*maxent-variance* (listify variance)) (*maxent-relative-tolerance* (listify relative-tolerance)) (*maxent-absolute-tolerance* (listify absolute-tolerance)) (*maxent-iterations* (listify iterations))) (when debug (excl:print-type-counts) (excl:gc) (excl:print-type-counts) (room)) (let ((recache (and (zerop run) recache))) (if (execute-experiment :target (mem-environment :full t :prefix prefix :format (if compact :compact :string)) :recache recache) (incf run) (incf skipped)) (incf total) (when verbose (report experiment run skipped total recache)) (incf experiment)))) ((:svm :svmrank :perf) (gridify ((*redwoods-train-percentage* (listify train-percentage)) (*svm-kernel* (listify kernel)) (*svm-rbf-g* (listify rbf-g)) (*svm-poly-d* (listify poly-d)) (*svm-sig-poly-s* (listify sig-poly-s)) (*svm-sig-poly-r* (listify sig-poly-r)) (*svm-iterations* (listify iterations)) (*svm-error-to-margin* (listify error-to-margin)) (*svm-cost-balance* (listify balance)) (*svm-tolerance* (listify tolerance))) (let ((recache (and (zerop run) recache))) (if (execute-experiment :target (svm-environment :full t :prefix prefix :format (if compact :compact :string)) :recache recache) (incf run) (incf skipped)) (incf total) (when verbose (report experiment run skipped total recache)) (incf experiment)))))))) (let ((total 0) (experiment 0) (thresholds (loop for (absolute contexts events relevant) in (cross-product (listify counts-absolute) (listify counts-contexts) (listify counts-events) (listify counts-relevant)) collect (make-counts :absolute absolute :contexts contexts :events events :relevant relevant))) (*feature-use-preterminal-types-p* use-preterminal-types-p) (*feature-ngram-tag* ngram-tag) (*redwoods-score-similarities* score-similarities) (*redwoods-task* redwoods-task)) ;;; (learner-parameters ;;; (list :source source :type type :prefix prefix :compact compact ;;; :recache recache :verbose verbose :debug debug ;;; :train-percentage train-percentage :method method ;;; :variance variance :relative-tolerance relative-tolerance ;;; :absolute-tolerance absolute-tolerance :iterations iterations ;;; :kernel kernel :rbf-g rbf-g :poly-d poly-d ;;; :sig-poly-s sig-poly-s :sig-poly-r sig-poly-r ;;; :balance balance :error-to-margin error-to-margin ;;; :tolerance tolerance))) (case *redwoods-task* (:rank ;; from here on, essentially do two nested groups of ;; loops (constructed by virtue of an ingenious macro ;; provided by erik :-): the outermost gridify() will ;; cross-multiply over feature selection parameters, so ;; that for each iteration at this level we need a fresh ;; context cache. conversely, in the inner gridify() ;; scope, we are going through the combinatorics of ;; (learner-specific) estimation parameters. ;; (gridify ((*feature-grandparenting* (listify grandparenting)) (*feature-active-edges-p* (listify active-edges-p)) (*feature-lexicalization-p* (listify lexicalization-p)) (*feature-constituent-weight* (listify constituent-weight)) (*feature-ngram-size* (listify ngram-size)) (*feature-ngram-back-off-p* (listify ngram-back-off-p)) (*feature-lm-p* (listify lm-p)) (*feature-flags* (listify flags)) (*feature-random-sample-size* (listify random-sample-size)) (*feature-frequency-threshold* thresholds)) (batch-experiment-hyperparameters experiment total))) ;;; (apply #'batch-experiment-hyperparameters learner-parameters))) (:classify (gridify ((*cfeature-conn-form* (listify c-conn-form)) (*cfeature-form-win* (listify c-form-win)) (*cfeature-stem-win* (listify c-stem-win)) (*cfeature-tag-win* (listify c-tag-win)) (*cfeature-form-l-ngram* (listify c-form-l-ngram)) (*cfeature-stem-l-ngram* (listify c-stem-l-ngram)) (*cfeature-tag-l-ngram* (listify c-tag-l-ngram)) (*cfeature-form-r-ngram* (listify c-form-r-ngram)) (*cfeature-stem-r-ngram* (listify c-stem-r-ngram)) (*cfeature-tag-r-ngram* (listify c-tag-r-ngram)) (*cfeature-malt-deppattern* (listify c-malt-deppattern)) (*cfeature-malt-deppath* (listify c-malt-deppath)) (*cfeature-malt-deprel* (listify c-malt-deprel)) (*cfeature-malt-lextriple* (listify c-malt-lextriple)) (*cfeature-malt-postriple* (listify c-malt-postriple)) (*cfeature-xle-xdep* (listify c-xle-xdep)) (*cfeature-xle-subcat* (listify c-xle-subcat)) (*cfeature-xle-coord* (listify c-xle-coord)) (*cfeature-xle-advtype* (listify c-xle-advtype)) (*cfeature-xle-adjtype* (listify c-xle-adjtype)) (*cfeature-ptb-self* (listify c-ptb-self)) (*cfeature-ptb-parent* (listify c-ptb-parent)) (*cfeature-ptb-path* (listify c-ptb-path)) (*cfeature-ptb-left* (listify c-ptb-left)) (*cfeature-ptb-right* (listify c-ptb-right)) (*cfeature-stanford* (listify c-stanford))) (batch-experiment-hyperparameters experiment total))))) ;;; (apply #'batch-experiment-hyperparameters learner-parameters))))) (purge-profile-cache source)))) ;;;(defun batch-experiment-hyperparameters ;;; (&key ;;; ;; ;;; ;; general parameters: `gold' input profile, output skeleton, how many ;;; ;; iterations of how many folds to run, et al. ;;; ;; ;;; (source *tsdb-gold*) ;;; (type :mem) ;;; (prefix "") ;;; (compact nil) ;;; (recache t) ;;; (verbose t) (debug nil) ;;; ;; ;;; ;; for learning curve computation: only use percentage of training data ;;; ;; ;;; (train-percentage '(100)) ;;; ;; ;;; ;; estimation parameters: MaxEnt ;;; ;; ;;; (method *maxent-method*) ;;; (variance *maxent-variance*) ;;; (relative-tolerance *maxent-relative-tolerance*) ;;; (absolute-tolerance *maxent-absolute-tolerance*) ;;; (iterations *maxent-iterations*) ;;; ;; ;;; ;; estimation parameters: SVM ;;; ;; ;;; (kernel *svm-kernel*) ;;; (rbf-g *svm-rbf-g*) ;;; (poly-d *svm-poly-d*) ;;; (sig-poly-s *svm-sig-poly-s*) ;;; (sig-poly-r *svm-sig-poly-r*) ;;; #+:null ;;; (iterations *svm-iterations*) ;;; (balance *svm-cost-balance*) ;;; (error-to-margin *svm-error-to-margin*) ;;; (tolerance *svm-tolerance*)) ;;; ;;; (labels ((report (experiment run skipped total recache) ;;; (format ;;; stream ;;; "~&[~a] batch-experiment(): ~ ;;; experiment # ~d; grid # ~d (~d skipped); ~ ;;; (~d~@[; cc~]).~%" ;;; (current-time :long :short) experiment ;;; (+ run skipped -1) skipped total recache)) ;;; (execute-experiment (&key target recache) ;;; (let ((executep (test-experiment target :supersede supersede))) ;;; (when executep ;;; (tsdb :create target :skeleton skeleton) ;;; (#-:debug handler-case #+:debug progn ;;; (rank-profile ;;; source target :type type ;;; :nfold nfold :niterations niterations ;;; :recache recache :identity identity ;;; :enhancers enhancers ;;; :resolvedp resolvedp ;;; :normalizep normalizep) ;;; #-:debug ;;; (condition (condition) ;;; (format ;;; stream ;;; "~&[~a] batch-experiment(): ~ ;;; error: `~a'.~%" ;;; (current-time :long :short) condition))) ;;; (when evalp ;;; (create-evaluation-file ;;; target source :similarities score-similarities)) ;;; (purge-profile-cache target)) ;;; executep))) ;;; ;;; (let ((run 0) ;;; (skipped 0)) ;;; (case type ;;; (:mem ;;; (gridify ((*redwoods-train-percentage* (listify train-percentage)) ;;; (*maxent-method* (listify method)) ;;; (*maxent-variance* (listify variance)) ;;; (*maxent-relative-tolerance* (listify relative-tolerance)) ;;; (*maxent-absolute-tolerance* (listify absolute-tolerance)) ;;; (*maxent-iterations* (listify iterations))) ;;; ;;; (when debug ;;; (excl:print-type-counts) (excl:gc) ;;; (excl:print-type-counts) (room)) ;;; ;;; (let ((recache (and (zerop run) recache))) ;;; (if (execute-experiment ;;; :target (mem-environment ;;; :full t :prefix prefix ;;; :format (if compact :compact :string)) ;;; :recache recache) ;;; (incf run) ;;; (incf skipped)) ;;; (incf total) ;;; (when verbose ;;; (report experiment run skipped total recache)) ;;; (incf experiment)))) ;;; ;;; (:svm ;;; (gridify ((*redwoods-train-percentage* (listify train-percentage)) ;;; (*svm-kernel* (listify kernel)) ;;; (*svm-rbf-g* (listify rbf-g)) ;;; (*svm-poly-d* (listify poly-d)) ;;; (*svm-sig-poly-s* (listify sig-poly-s)) ;;; (*svm-sig-poly-r* (listify sig-poly-r)) ;;; (*svm-iterations* (listify iterations)) ;;; (*svm-error-to-margin* (listify error-to-margin)) ;;; (*svm-cost-balance* (listify balance)) ;;; (*svm-tolerance* (listify tolerance))) ;;; ;;; (let ((recache (and (zerop run) recache))) ;;; (if (execute-experiment ;;; :target (svm-environment ;;; :full t :prefix prefix ;;; :format (if compact :compact :string)) ;;; :recache recache) ;;; (incf run) ;;; (incf skipped)) ;;; (incf total) ;;; (when verbose ;;; (report experiment run skipped total recache)) ;;; (incf experiment)))))))) (defun test-experiment (target &key supersede (verbose t) (stream t)) ;; ;; _fix_me_ ;; make this function more intelligent, e.g. move old directory out of the ;; way when superseding and check for appropriate number of folds et al. when ;; skipping an experiment. ;; (let ((path (find-tsdb-directory target))) (when (fad:file-exists-p path) (cond (supersede (when verbose (format stream "test-experiment(): purging `~a'.~%" target)) (ignore-errors (fad:delete-directory-and-files path)) (when (fad:file-exists-p path) (when verbose (format stream "test-experiment(): purge failed; skipping `~a'.~%" target)) (return-from test-experiment))) (t (format stream "test-experiment(): skipping `~a'.~%" target) (return-from test-experiment))))) t) (defun summarize-experiments (&key (stream t) profiles pattern (score :accuracy) (format :ascii)) (when (stringp stream) (with-open-file (stream stream :direction :output :if-exists :supersede) (return-from summarize-experiments (summarize-experiments :stream stream :profiles profiles :pattern pattern :score score :format format)))) (when (and (stringp profiles) (probe-file profiles)) (setf profiles (with-open-file (stream profiles) (loop for line = (read-line stream nil nil) while line collect line)))) (loop with key = (if (eq score :accuracy) :f-accuracy :f-extras) with profiles = (or profiles (loop for profile in (find-tsdb-directories *tsdb-home* :pattern pattern) collect (get-field :database profile))) for profile in profiles for values = (select (list (format nil "~(~a~)" key) "f-events" "f-features" "f-iterations") nil "fold" nil profile) for iterations = 0 for events = 0 for features = 0 for scores = (loop for value in values for field = (get-field key value) for i = (get-field :f-iterations value) for j = (get-field :f-events value) for k = (get-field :f-features value) when (numberp i) do (incf iterations i) when (numberp j) do (incf events j) when (numberp k) do (incf features k) when (eq score :accuracy) collect (read-from-string field) else collect (let* ((value (read-from-string field)) (field (get-field score value))) (if (numberp field) field (read-from-string field)))) when (and scores (loop for score in scores always score)) do (let* ((n (length scores)) (sum (sum scores)) (mean (/ sum n)) (min (loop for score in scores minimize score)) (max (loop for score in scores maximize score)) (range (- max min)) (variance (if (= n 1) 0 (/ (loop for score in scores sum (expt (- score mean) 2)) (- n 1)))) (iterations (divide iterations n)) (events (divide events n)) (features (divide features n))) (case format (:ascii (format stream "~,6f ~,6f ~,6f ~,1f ~,1f ~,1f `~a'~%" mean (sqrt variance) range events features iterations profile)) (:tsv (let* ((slash (position #\/ profile :from-end t)) (string (if slash (subseq profile (+ slash 1)) profile)) (fields (parse-environment string))) (format stream "~a ~,6f ~,6f ~,6f ~,1f ~,1f ~,1f ~ ~{~a~^ ~}~%" profile mean (sqrt variance) range events features iterations fields)))) (force-output stream)))) ;;;(defparameter *foo* '((:key1 . :val1) (:key2 . :val2))) ;;;*FOO* ;;;TSNLP(70): (loop initially (print (get-field :key2 *foo*)) ;;; for values in (cross-product '(a1 a2) '(1 10 100)) ;;; for *foo* = ;;; (pairlis '(:key1 :key2) values) ;;; do (print *foo*) (bar)) ; NIL ; ((:KEY2 . 1) (:KEY1 . A1)) ; 1 ; ((:KEY2 . 10) (:KEY1 . A1)) ; 10 ; ((:KEY2 . 100) (:KEY1 . A1)) ; 100 ; ((:KEY2 . 1) (:KEY1 . A2)) ; 1 ; ((:KEY2 . 10) (:KEY1 . A2)) ; 10 ; ((:KEY2 . 100) (:KEY1 . A2)) ; 100 ; NIL