;;; -*- Mode: COMMON-LISP; Syntax: Common-Lisp; Package: TSDB -*- ;;; ;;; [incr tsdb()] --- Competence and Performance Profiling Environment ;;; Copyright (c) 1996 -- 2006 Stephan Oepen (oe@csli.stanford.edu) ;;; Copyright (c) 2005 -- 2006 Erik Velldal (erikve@ifi.uio.no) ;;; ;;; This program is free software; you can redistribute it and/or modify it ;;; under the terms of the GNU Lesser General Public License as published by ;;; the Free Software Foundation; either version 2.1 of the License, or (at ;;; your option) any later version. ;;; ;;; This program is distributed in the hope that it will be useful, but WITHOUT ;;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ;;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ;;; License for more details. ;;; ;;; ;;; some notes on the encoding of features: to make sure we operate in separate ;;; namespaces for the various feature templates, each feature is prefixed with ;;; an integer identifying the feature type (aka template), viz. ;;; ;;; 1: local derivational configuration; a subtree of depth one taken from ;;; the derivation; e.g. [1 (0) hspec det_poss_my_le noptcomp]; the second ;;; integer indicates the amount of grandparenting used, where parent ;;; labels will precede the root of the local configuration. ;;; 2: `active' local derivational configuration; similar to type 1, but with ;;; only one of the daughters in each feature; thus, for a ternary rule, ;;; feature type 2 will add three extra features, ;;; e.g. [2 (0) hspec noptcomp]. ;;; 3: lexicalized local derivational configuration; much like type 1, but ;;; the first symbol is the lexical head of the root of the configuration; ;;; if we decided to lexicalize _all_ nodes in the feature, that would be ;;; either a different template or additional parameter then. ;;; 4: lexicalized `active' local derivational configuration; one parameter, ;;; viz. degree of grand-parenting. ;;; 10: n-gram features; e.g. [10 (3 1) "saw" ^ n_proper_le v_np_trans_le]; ;;; the first parameter is the n-gram size, the second an integer coding ;;; as to what is used in the n-grams: 0 -- lexical identifier, 1 -- le ;;; type. ;;; 11: preterminal-only n-gram features; like type 10, but without a surface ;;; form; e.g. [11 (3 1) ^ n_proper_le v_np_trans_le]. ;;; 42: language model score; the second integer is the number of bins used ;;; (if any), and the third the divisor used in scaling (*maxent-lm-p*), ;;; if any, e.g. [42 (0) -12.34]. ;;; 43: experimental features in combining LOGON scores across components. ;;; ;;; also, we are using some pseudo-features to record additional information in ;;; the feature cache, viz. ;;; ;;; -1: event frequencies; takes one parameter encoding the frequency ;;; assignment function: 0 -- binary, i.e. 0 or 1 according to what comes ;;; out of the annotations in the treebank; 1 and upwards -- various ;;; weighted frequency functions, according to the global value of ;;; *maxent-preference-weightings*. ;;; ;;; Additional substring-oriented features for classification (ids > 100). ;;; ;;; 101 Features extracted from dependency parses. (in-package :tsdb) (defstruct (hashed-random-index) hash constants key) ;; implicit vector object. ;;; Feature Template Ids: ;;;(defparameter *c-feature-template-ids* ;;; '((:dependencies . 101) ;;; (:token-tag-ngrams . 120) ;;; (:token-stem-ngrams . 121) ;;; (:token-form-ngrams . 122))) ;;; ;;;(defparameter *c-feature-parameters* ;;; '((:dependencies . 3) ;;; (:token-tag-ngrams . 2) ;;; (:token-stem-ngrams . 2) ;;; (:token-form-ngrams . 2))) (defparameter *cfeature-malt-xle* t);; nil overrides the following 10 features (defparameter *cfeature-malt-deppattern* t) (defparameter *cfeature-malt-deppath* t) (defparameter *cfeature-malt-deprel* t) (defparameter *cfeature-malt-lextriple* t) (defparameter *cfeature-malt-postriple* t) (defparameter *cfeature-xle-xdep* t) (defparameter *cfeature-xle-subcat* t) (defparameter *cfeature-xle-coord* t) (defparameter *cfeature-xle-advtype* t) (defparameter *cfeature-xle-adjtype* t) (defparameter *cfeature-form-win* 2) (defparameter *cfeature-stem-win* 2) (defparameter *cfeature-tag-win* 2) (defparameter *cfeature-form-l-ngram* 2) ;; x = order, ie. include x tokens (defparameter *cfeature-stem-l-ngram* 2) ;; to the left. x = 1 = bigrams. (defparameter *cfeature-tag-l-ngram* 2) (defparameter *cfeature-form-r-ngram* 2) (defparameter *cfeature-stem-r-ngram* 2) (defparameter *cfeature-tag-r-ngram* 2) (defparameter *cfeature-conn-form* t) (defparameter *cfeature-ptb-self* nil) (defparameter *cfeature-ptb-parent* nil) (defparameter *cfeature-ptb-left* nil) (defparameter *cfeature-ptb-right* nil) (defparameter *cfeature-ptb-path* nil) (defparameter *cfeature-stanford* nil) (defparameter *cfeature-lexicalized-only-p* nil) (defmacro set-fid (var val) "Set the feature id of a variable." `(setf (get (quote ,var) :tid) ,val)) (defmacro get-fid (var) "Retrieve the feature id of a variable." `(get (quote ,var) :tid)) ;;; Template ids: (set-fid *cfeature-malt-deppattern* 100) (set-fid *cfeature-malt-deppath* 101) (set-fid *cfeature-malt-deprel* 102) (set-fid *cfeature-malt-lextriple* 103) (set-fid *cfeature-malt-postriple* 104) (set-fid *cfeature-xle-xdep* 105) (set-fid *cfeature-xle-subcat* 106) (set-fid *cfeature-xle-coord* 107) (set-fid *cfeature-xle-advtype* 108) (set-fid *cfeature-xle-adjtype* 109) (set-fid *cfeature-form-win* 120) (set-fid *cfeature-stem-win* 121) (set-fid *cfeature-tag-win* 122) (set-fid *cfeature-form-l-ngram* 123) (set-fid *cfeature-stem-l-ngram* 124) (set-fid *cfeature-tag-l-ngram* 125) (set-fid *cfeature-form-r-ngram* 126) (set-fid *cfeature-stem-r-ngram* 127) (set-fid *cfeature-tag-r-ngram* 128) (set-fid *cfeature-conn-form* 130) (set-fid *cfeature-ptb-self* 150) (set-fid *cfeature-ptb-parent* 151) (set-fid *cfeature-ptb-left* 152) (set-fid *cfeature-ptb-right* 153) (set-fid *cfeature-ptb-path* 154) (set-fid *cfeature-stanford* 155) ;; fixme: insert get-fid's at appropriate places in the code. ;;; Template ids: ;;;(setf (get '*cfeature-malt-deppattern* :tid) 100) ;;;(setf (get '*cfeature-malt-deppath* :tid) 101) ;;;(setf (get '*cfeature-malt-deprel* :tid) 102) ;;;(setf (get '*cfeature-malt-lextriple* :tid) 103) ;;;(setf (get '*cfeature-malt-postriple* :tid) 104) ;;;(setf (get '*cfeature-xle-xdep* :tid) 105) ;;;(setf (get '*cfeature-xle-subcat* :tid) 106) ;;;(setf (get '*cfeature-xle-coord* :tid) 107) ;;;(setf (get '*cfeature-xle-advtype* :tid) 108) ;;;(setf (get '*cfeature-xle-adjtype* :tid) 109) ;;; ;;;(setf (get '*cfeature-form-win* :tid) 120) ;;;(setf (get '*cfeature-stem-win* :tid) 121) ;;;(setf (get '*cfeature-tag-win* :tid) 122) ;;;(setf (get '*cfeature-form-l-ngram* :tid) 123) ;;;(setf (get '*cfeature-stem-l-ngram* :tid) 124) ;;;(setf (get '*cfeature-tag-l-ngram* :tid) 125) ;;;(setf (get '*cfeature-form-r-ngram* :tid) 126) ;;;(setf (get '*cfeature-stem-r-ngram* :tid) 127) ;;;(setf (get '*cfeature-tag-r-ngram* :tid) 128) ;;;fixme: add parameters for template ids for other features as well. (defparameter *feature-selection-p* nil) (defparameter *feature-covariance-threshold* nil) (defparameter *feature-grandparenting* 4) (defparameter *feature-use-preterminal-types-p* t) (defparameter *feature-lexicalization-p* t) (defparameter *feature-constituent-weight* 2) (defparameter *feature-active-edges-p* t) (defparameter *feature-ngram-size* 4) (defparameter *feature-ngram-tag* :type) (defparameter *feature-ngram-back-off-p* t) (defparameter *feature-lm-p* #-:logon nil #+:logon 10) #+:null (defparameter *feature-lm-normalize* '(:minmax 0 2)) (defparameter *feature-dependencies* nil) (defparameter *feature-flags* (nconc #+:null '((0 :ascore) (1 :tscore) (2 :rscore)) #+:null '((3 :lm) (4 :perplexity) (5 :lfn) (6 :lnf)) #+:null '((10 :distortion) (11 :distance)) #+:null '((12 :nmtrs) (13 :tratio) (14 :ratio)))) (defparameter *feature-preference-weightings* '((0 :binary) (1 :bleu) (2 :wa) (3 :waft))) (defparameter *feature-frequency-threshold* nil) (defparameter *feature-random-sample-size* nil) ;;(defparameter *feature-random-sample-test* nil) ;; use random sampling also when testing. (defparameter *feature-random-indexing-p* nil) ;; nil, t or string (import path) (defparameter *feature-random-index-type* :hash) ;; or :vector (defparameter *feature-random-indexing-export-p* nil) (defparameter *feature-random-index-scale* nil) (defparameter *feature-random-index-size* 10000) ;; length of the index vectors (defparameter *feature-random-index-count* 10) ;;; number of 1's and -1's (defparameter *feature-random-index-constants* nil) (defparameter *feature-universal-hash-keybit* 24) ;;(defparameter *feature-universal-hash-codebit* 12) ;;;(defparameter *feature-random-index-hashes* nil) ;; list of hash functions (defparameter *feature-item-enhancers* (list 'lm-item-enhancer)) (defparameter *feature-symbols-p* t) (defstruct counts (absolute 0) (contexts 0) (events 0) (relevant 0)) (defmethod print-object ((object counts) stream) (format stream "{~d ~d ~d ~d}" (counts-absolute object) (counts-contexts object) (counts-events object) (counts-relevant object))) (defmacro counts>= (counts1 counts2) `(and (>= (counts-absolute ,counts1) (counts-absolute ,counts2)) (>= (counts-contexts ,counts1) (counts-contexts ,counts2)) (>= (counts-events ,counts1) (counts-events ,counts2)) (>= (counts-relevant ,counts1) (counts-relevant ,counts2)))) (defmacro counts= (counts1 counts2) `(and (= (counts-absolute ,counts1) (counts-absolute ,counts2)) (= (counts-contexts ,counts1) (counts-contexts ,counts2)) (= (counts-events ,counts1) (counts-events ,counts2)) (= (counts-relevant ,counts1) (counts-relevant ,counts2)))) (defparameter *feature-options* '(*feature-grandparenting* *feature-use-preterminal-types-p* *feature-lexicalization-p* *feature-constituent-weight* *feature-active-edges-p* *feature-ngram-size* *feature-ngram-tag* *feature-ngram-back-off-p* *feature-lm-p* *feature-frequency-threshold* *feature-random-sample-size*)) (defparameter *feature-float-valued-tids* '(42 43 -1)) (defconstant %feature-frequency-tid% -1) (defstruct (model) (table (make-symbol-table :test #'equal)) (map (make-symbol-table :test #'eql)) contexts (ncontexts 0) (counts (make-array 512)) (weights (make-array 512)) (count 0) (size 512) parameters stream id task (minmax (make-array 512)) (random-indexes (make-array 512))) ;;(index-vectors (make-array 512))) (defmethod print-object ((object model) stream) (format stream "#[MODEL (~d context~p; ~d weight~p)]" (model-ncontexts object) (model-ncontexts object) (model-count object) (model-count object))) ;;; ;;; the `counts' vector in the model is a quadruple, containing per feature: ;;; ;;; - the absolute frequency of occurence for this feature ;;; - the total number of contexts that exhibit this feature ;;; - the total number of events that exhibit this feature ;;; - the number of times the feature is `relevant' in a context, i.e. there ;;; are (at least) two events for which the count (aka `frequency' or value, ;;; even) of this feature differs; see (van Noord & Malouf, 2004). ;;; (defstruct (feature) code mapped tid parameters symbol (count 1)) (defstruct (event) id frequency features (size 0) index-sum) (defstruct (context) id (size 0) events tail) (defun print-context (context &key (stream t) (format :rpm) model sample) (declare (ignore model)) (when (and (not (eq *redwoods-task* :classify)) (member format '(:mem :rpm))) (format stream "~d~%" (if sample (length sample) (context-size context)))) (case format ((:mem :svm :svmrank :perf :rpm) (loop with iid = (context-id context) for event in (context-events context) when (or (not sample) (member (event-id event) sample :test #'equal)) do (print-event event :stream stream :format format :iid iid))))) (defun print-event (event &key (stream t) (format :rpm) iid) (when (and *feature-random-indexing-p* (event-index-sum event)) (setf (event-size event) (spv-non-zeros (event-index-sum event)))) (macrolet ((fwrite (object stream) `(typecase ,object #+:allegro (fixnum (if (minusp ,object) (excl::print-negative-fixnum ,stream 10 ,object) (excl::print-fixnum ,stream 10 ,object))) (t (write ,object :stream ,stream))))) (case format ((:mem :rpm) ;;; MaxEnt (case *redwoods-task* (:rank (fwrite (event-frequency event) stream) (write-char #\Space stream) (fwrite (event-size event) stream)) (:classify (write 2 :stream stream) ;; = "context size" (terpri stream) (write (if (<= (event-frequency event) 0) 0 1) :stream stream) (write-char #\Space stream) (fwrite (event-size event) stream))) (if *feature-random-indexing-p* (when (event-index-sum event) (write-char #\Space stream) (print-random-vector (event-index-sum event) :stream stream :format format)) (loop for feature in (event-features event) for code = (or (feature-mapped feature) (feature-code feature)) do (write-char #\Space stream) (fwrite code stream) (write-char #\Space stream) (fwrite (feature-count feature) stream))) (case *redwoods-task* (:classify ;; add "dummy event" with label reversed, (terpri stream) ;; and zero features. (write (if (<= (event-frequency event) 0) 1 0) :stream stream) (write-char #\Space stream) (write 0 :stream stream)))) ((:svm :svmrank :perf) ;;; SVM (case *redwoods-task* (:rank (format stream "~,1f qid:~a" (cond ((or (eq format :svm) (eq format :svmrank)) (event-frequency event)) ((and (eq format :perf) (zerop (event-frequency event))) -1) (t 1)) iid)) (:classify ;;;fixme: adapt format for multiclass case as well: (format stream "~a" (if (<= (event-frequency event) 0) "-1" "+1")))) (if *feature-random-indexing-p* (when (event-index-sum event) (write-char #\Space stream) (print-random-vector (event-index-sum event) :stream stream :format format)) (loop for feature in (event-features event) ;;;feature numbers run from 1 in svm_light (from 0 in tadm) for code = (+ 1 (or (feature-mapped feature) (feature-code feature))) do (write-char #\Space stream) (fwrite code stream) (write-char #\Colon stream) (fwrite (feature-count feature) stream) finally ;; include event-id as a comment-field; (write-char #\Space stream) (write-char #\# stream) (loop with ids = (if (listp (event-id event)) (event-id event) (list (event-id event))) for id in ids do (write-char #\Space stream) (write id :stream stream)))))) (terpri stream))) (defun print-feature (feature &key (stream t) (format :compact)) (case format (:compact (format stream "[~a ~a {~{~a~^ ~}} ~a]" (feature-code feature) (feature-tid feature) (feature-parameters feature) (feature-count feature))))) (defun read-feature (stream &key (format :compact)) (case format (:compact (let* ((*readtable* (copy-readtable nil)) (c (read-char stream nil nil))) (set-syntax-from-char #\[ #\( *readtable*) (set-syntax-from-char #\] #\) *readtable*) (set-syntax-from-char #\{ #\( *readtable*) (set-syntax-from-char #\} #\) *readtable*) (when (and c (char= c #\[)) (let* ((code (read stream nil nil)) (tid (and code (read stream nil nil))) (c (peek-char #\{ stream nil nil))) (when (and tid c) (read-char stream nil nil) (let ((parameters (read-delimited-list #\} stream)) (count (read stream nil nil)) (c (peek-char #\] stream nil nil))) (when (and count c) (read-char stream nil nil) (make-feature :code code :tid tid :parameters parameters :count count)))))))))) (declaim (inline record-feature)) (defun record-feature (feature event &optional model &key rop native) (declare (optimize (speed 3) (safety 0) (space 0) (debug 0))) #+:null (setf %feature feature %event event) ;; ;; there are two distinct contexts in which record-feature() gets called: ;; (a) when building the feature cache, features have not yet been assigned ;; a unique code, and we want to determine their usage counts. conversely, ;; (b) when filling the context cache, features have been encoded (i.e. have ;; their unique code) but for increased efficiency, we want to map the codes ;; of features that actually get used (i.e. the active sub-set of features ;; from the complete feature cache) into a consecutive integer range. for ;; the latter, the model provides a `map'ping symbol table. ;; ;; (b2) when filling the context cache after feature selection ;; ;; for features that are encoded already and have a frequency count below the ;; current cut-off threshhold, return immediately, i.e. ignore this .feature. ;; (let ((code (feature-code feature))) (unless (or (null model) (null code) (null *feature-frequency-threshold*) (counts>= (aref (model-counts model) code) *feature-frequency-threshold* )) (return-from record-feature))) ;; ;; when using a foreign + native model (eg. for held-out testing, where `model' ;; would be the model learned for another data set, and `native' holds the local ;; mappings of symbols and codes), first find the native symbol and then map this ;; to the foreign id. ;; (when (and model native) (let ((mapped (symbol-to-code (code-to-symbol (feature-code feature) (model-table native)) (model-table model) :rop t))) (if mapped (setf (feature-mapped feature) mapped) (return-from record-feature)))) ;; ;; for features that are encoded already, map their code into a new range ;; (when (and model (not native) ;;(not rop) ;;fixme: may be breaking rop usage here? (symbol-table-p (model-map model)) (feature-code feature)) (setf (feature-mapped feature) (symbol-to-code (feature-code feature) (model-map model) :rop (or rop *feature-selection-p*)))) ;; ;; using feature selection: ;; ;;; (when (and model *feature-selection-p* (feature-code feature)) ;;; (unless (symbol-table-p (model-map model)) ;;; (error "record-feature(): feature selection models require model-map.~%")) ;;; (setf (feature-mapped feature) ;;; (symbol-to-code (feature-code feature) (model-map model) :rop t))) ;; ;; encode this feature, unless it has been assigned a unique identifier (with ;; respect to .model. already). ;; (unless (or (null model) (feature-code feature)) (setf (feature-code feature) (symbol-to-code (list* (feature-tid feature) (feature-parameters feature) (feature-symbol feature)) (model-table model))) ;; ;; at this point, ditch the symbolic representation of .feature. (which is ;; now recoverable from the symbol table in the .model.) ;; (setf (feature-symbol feature) nil) ;; ;; also, make sure the vectors in .model. indexed by codes grow in size as ;; we add features. ;; (when (and (not rop) (>= (feature-code feature) (model-size model))) (let ((n (setf (model-size model) (max (+ (feature-code feature) 1) (* (model-size model) 2))))) (#+:allegro excl:tenuring #-:allegro progn (setf (model-minmax model) (adjust-array (model-minmax model) n)) (setf (model-counts model) (adjust-array (model-counts model) n)) (unless (eq *feature-random-index-type* :hash) (setf (model-random-indexes model) (adjust-array (model-random-indexes model) n))) (setf (model-weights model) (adjust-array (model-weights model) n)))))) ;;(unless (and *feature-selection-p* (null (feature-mapped feature))) (unless (and *feature-selection-p* (null (feature-mapped feature))) (when *feature-random-indexing-p* ;;RIs always indexed on the fc code (not the mapped id). (let ((code (feature-code feature))) (unless (event-index-sum event) (setf (event-index-sum event) (make-vector *feature-random-index-size* :type :sparse))) ;;; (when (> (counts-events (aref (model-counts model) ;;; (feature-code feature))) ;;; 1) ;;; (setf (aref (model-random-indexes model) code) index)) (let ((index (cond ((eq *feature-random-index-type* :vector) (or (aref (model-random-indexes model) code) (setf (aref (model-random-indexes model) code) (create-random-index :size *feature-random-index-size* :count *feature-random-index-count*)))) ((eq *feature-random-index-type* :hash) (setf (hashed-random-index-key (model-random-indexes model)) code) (model-random-indexes model))))) ;;; (create-random-index ;;; :count *feature-random-index-count* ;;; :hashes (get-field :hashes (model-random-indexes model)) ;;; :key code))))) ;;; fixme: for hashing to represent an improvement we shouldn't really ;;; be creating the actual random vectors... optmize! (loop repeat (feature-count feature) ;; typically one. do (add-vector-into (event-index-sum event) index))))) ;; ;; from here on, use either the original .code. or the .mapped. value as the ;; identifier for inserting .feature. into .event. ;; (let* ((id (if (feature-mapped feature) #'feature-mapped #'feature-code)) (code (funcall id feature))) ;; ;; the .features. storage in events, naturally, is organized as a simple, ;; ordered list, sorted by feature codes. thus, the code below searches ;; linearly through the list and either increments the counter for ;; features that are present already, or inserts appropriately. ;; (cond ((or (zerop (event-size event)) (< code (funcall id (first (event-features event))))) (push feature (event-features event)) (incf (event-size event))) (t (loop for features on (event-features event) for this = (first features) for next = (first (rest features)) when (= (funcall id this) code) do (incf (feature-count this) (feature-count feature)) (return) else when (or (null next) (< code (funcall id next))) do (setf (rest features) (cons feature (rest features))) (incf (event-size event)) (return))))))) (defun record-features (features event &optional model &key rop native) (loop for feature in features do (record-feature feature event model :rop rop :native native))) (defun record-context (context model &key (format :rpm)) (incf (model-ncontexts model)) (if (and (streamp (model-stream model)) (open-stream-p (model-stream model))) (print-context context :stream (model-stream model) :model model :format format) (push context (model-contexts model)))) (defun normalize-features-n (event &key model (type :minmax)) (case type ((:euclidean :length) (loop with sum = (loop for feature in (event-features event) sum (expt (feature-count feature) 2)) for feature in (event-features event) do (setf (feature-count feature) (sqrt (/ (expt (feature-count feature) 2) sum))))) (:minmax (loop for feature in (event-features event) for code = (feature-code feature) for minmax = (and (< code (model-size model)) (aref (model-minmax model) code)) for min = (first minmax) for max = (second minmax) for normalized = (and minmax (/ (- (feature-count feature) min) (- max min))) when normalized do (setf (feature-count feature) (if (typep normalized 'ratio) (float normalized) normalized))))) event) (defun euclidean-length-features (event) (sqrt (loop with features = (event-features event) for feature in features sum (expt (feature-count feature) 2)))) (defun record-event (event context &key model normalizep) (when normalizep (normalize-features-n event :type normalizep :model model)) (let ((cons (cons event nil))) (if (null (context-events context)) (setf (context-events context) cons) (setf (rest (context-tail context)) cons)) (setf (context-tail context) cons)) (incf (context-size context))) ;;; ;;; an extension to the original model: populate a feature cache per profile to ;;; later be able to retrieve features without having to handle the original ;;; (heavy-duty) data. ;;; (defun cache-features (items model &key (stream *tsdb-io*) verbose createp) ;;; (unit :result)) ;;; fixme (declare (special *redwoods-task*)) #+:debug (setf %items items %model model) (unless items (return-from cache-features)) (loop with *feature-frequency-threshold* = nil with source = (get-field :source (first items)) with vp = (get-field :vp (first items)) with cache = (profile-find-feature-cache source :write :createp createp :vp vp) for item in items for counts = (make-hash-table :test #'eql) for db = (let ((foo (get-field :source item))) (cond ((string= source foo) cache) (t (setf source foo) (setf vp (get-field :vp item)) (close-fc cache) (setf cache (profile-find-feature-cache source :write :createp createp :vp vp))))) for iid = (get-field :i-id item) for readings = (get-field :readings item) when (or (eq *redwoods-task* :classify) ;;fixme; this check is different in cache-contexts (> readings 1): (and (integerp readings) (> readings 0))) do (when *feature-dependencies* ;; ;; when MRS-derived features are active, then reconstruct() needs to ;; actually re-build complete AVMs, in which case a lot of garbage ;; is generated, so we need to deliberately release quasi-destructive ;; intermediate structures; this was hard to debug. (21-nov-07; oe) ;; #+:debug (excl:print-type-counts) (lkb::release-temporary-storage) #+:debug (let ((*tsdb-gc-debug* nil)) (excl:print-type-counts) (excl:gc) (excl:print-type-counts))) (loop with i = 0 with *reconstruct-cache* = (make-hash-table :test #'eql) for result in (or (get-field :results item) (when (eq *redwoods-task* :classify) ;;hack for the case of no results ;;when doing substring classification. (list (acons :result-id 0 nil)))) for rid = (get-field :result-id result) for event = (when (or (null *redwoods-task*) (eq *redwoods-task* :rank)) (result-to-event result model)) for spanning-events = (when (eq *redwoods-task* :classify) (result-to-spanning-events item result model)) unless (or event spanning-events) do (format stream "~&[~a] cache-features(): ~ ignoring item # ~d (no edge for ~d)~%" (current-time :long :short) iid rid) (return) else do (loop for (span event) in (or spanning-events ;;FIXME; ugly hack (list (list nil event))) do (loop for feature in (event-features event) for code = (feature-code feature) for count = (or (gethash code counts) (setf (gethash code counts) (list 0 0))) do (incf (first count) (feature-count feature)) (incf (second count)) (pushnew (feature-count feature) (rest (rest count)))) (loop for (i . type) in *feature-preference-weightings* for count = (float (weigh-result item result type span)) for feature = (make-feature :tid %feature-frequency-tid% :parameters (list i) :symbol type :count count) do (record-feature feature event model)) (loop for feature in (event-features event) do (store-feature db iid rid feature span)) ;; span is optional (incf i) ;;hm, are relevance counts messed up for :substrings? erikve 3/2-10 ;; ;; keep track of min/max feature values for later normalization. ;; (loop with minmaxes = (model-minmax model) for feature in (event-features event) for code = (feature-code feature) for count = (feature-count feature) for minmax = (aref minmaxes code) when (null minmax) do (setf minmax (setf (aref minmaxes code) (list 0 0))) when (< count (first minmax)) do (setf (first (aref minmaxes code)) count) when (> count (second minmax)) do (setf (second (aref minmaxes code)) count))) finally (loop for code being each hash-key using (hash-value count) in counts for match = (or (aref (model-counts model) code) (setf (aref (model-counts model) code) (make-counts))) do (incf (counts-absolute match) (abs (first count))) (incf (counts-events match) (second count)) (when (or (null *redwoods-task*) (eq *redwoods-task* :rank)) (incf (counts-contexts match)) ;; ;; we call a feature `relevant' in this context, when there ;; are at least two events for which either (a) the feature ;; has different values or (b) only one of the two events ;; actually has the feature. ;; ;; reconsider this part wrt classification. ;; erikve 3/2-10 (when (or (rest (rest (rest count))) (< (second count) i)) (incf (counts-relevant match))))) (incf (model-ncontexts model)) (when verbose (format stream "~&[~a] cache-features(): item # ~d: ~d event~p;~%" (current-time :long :short) iid i i))) finally (close-fc db))) (defun get-token-spans (item &key (filter *redwoods-token-filter*)) "Helper for cache-contexts(). Lists all span indicies." (loop for foo in (get-field :i-tokens item) for token = (if (or (null filter) (funcall filter foo)) foo) when token collect (list (get-field :start token) (get-field :end token)))) (defun cache-contexts (items model &optional (identity -1) &key (format :rpm) (stream *tsdb-io*) normalizep native (mode :print) rop) ;; ;; _fix_me_ ;; most of the record-xyz() functions used to assume we had a model; maybe ;; rethink that part, or always carry around a model here too? ;; (5-jul-05; erik & oe) (loop with source = (get-field :source (first items)) with vp = (get-field :vp (first items)) with fcs = (list source) with fc = (profile-find-feature-cache source :read :vp vp) with cc = (profile-find-context-cache source identity :createp t :supersedep (not rop)) for item in items for iid = (get-field :i-id item) for readings = (get-field :readings item) for spans = (when (eq *redwoods-task* :classify) ;;fixme : move this down (get-token-spans item)) when (or (eq *redwoods-task* :classify) (and (integerp readings) (> readings 1))) do (let ((foo (get-field :source item))) (unless (string= source foo) (setf source foo) (setf vp (get-field :vp item)) (close-fc fc) (let ((createp (not (member source fcs :test #'string=)))) (setf fc (profile-find-feature-cache source :read :vp vp)) (pushnew source fcs :test #'string=) (setf cc (profile-find-context-cache source identity :createp createp :supersedep (not rop)))))) (loop with results = (or (get-field :results item) (when (eq *redwoods-task* :classify) ;;hack for the case of no results ;;when doing substring classification. (list (acons :result-id 0 nil)))) with n = (length results) ;;;FIXME: adapt random sampling for substrings/classification: with low = (when *feature-random-sample-size* ;; ;; determine whether result ids are 0- or 1-based. ;; (loop for result in (get-field :results item) minimize (get-field :result-id result))) with active = (when *feature-random-sample-size* ;; ;; find active (aka preferred) results, in order to ;; make sure all of them are in the random sample. ;; (loop for result in results for rid = (get-field :result-id result) for feature = (first (retrieve-features fc iid rid %feature-frequency-tid% (list 0))) when (and feature (= (feature-count feature) 1)) collect rid)) with sample = (when (and *feature-random-sample-size* (< *feature-random-sample-size* (- n (length active)))) ;; ;; when random sampling is requested, and the total ;; number of results (minus active ones) exceeds the ;; requested sample size, make a random selection. ;; (random-sample low (if (zerop low) (- n 1) n) (min (- n (length active)) *feature-random-sample-size*) active)) with context = (make-context :id iid) for result in results for rid = (get-field :result-id result) ;;;FIXME: adapt for substrings /classification: ;;; for event = (result-to-event-from-cache iid rid model fc) for event = (when (or (null *redwoods-task*) (eq *redwoods-task* :rank)) (result-to-event-from-cache iid rid model fc :rop rop)) for spanning-events = (when (eq *redwoods-task* :classify) (result-to-spanning-events-from-cache iid rid spans model fc :native native :rop rop)) if event do (record-event event context :normalizep normalizep :model model) else if spanning-events do (loop for event in spanning-events do (record-event event context :normalizep normalizep :model model)) finally (let ((n (context-size context))) (when (> n 0) (case mode (:print (let* ((file (merge-pathnames cc (make-pathname :name (format nil "~a" iid))))) (with-open-file (stream file :direction :output :if-exists :supersede :if-not-exists :create) (print-context context :stream stream :format format)) (when sample (let ((file (merge-pathnames cc (make-pathname :name (format nil "~a.~a" iid *feature-random-sample-size*))))) (with-open-file (stream file :direction :output :if-exists :supersede :if-not-exists :create) (print-context context :stream stream :format format :sample sample)))))) (:record (record-context context model))) (format stream "~&[~a] cache-contexts(): ~ item # ~d: ~a~@[ [~a]~] event~p;~%" (current-time :long :short) iid n (when sample (length sample)) n)))) finally (close-fc fc))) (defun result-to-event (result model &key rop (fcp t)) (let* ((derivationp (unless *feature-flags* (or (>= *feature-grandparenting* 0) (> *feature-ngram-size* 0) *feature-dependencies*))) (derivation (and derivationp (get-field :derivation result))) (edge (or (get-field :edge result) (when derivation ;; ;; when MRS-derived features are requested, we need to ;; actually re-build complete feature structures, i.e. ;; ask reconstruct() to perform all the unifications. ;; (reconstruct derivation *feature-dependencies*)))) (mrs (and *feature-dependencies* edge (mrs::extract-mrs edge))) (event (make-event))) ;; ;; _fix_me_ ;; why abort here? we could still score the LM feature, say for fragmented ;; realizations in the LOGON pipeline. (13-may-07; oe) ;; #+:null (when (and derivationp (null edge)) (return-from result-to-event)) ;; ;; in the LOGON universe, at least, .edge. can correspond to a fragmented ;; generator output, where there is no internal structure available besides ;; the concatenated surface string of the component fragments. ;; (when (and edge (lkb::edge-children edge)) ;; ;; first, extract the configurational features. ;; (loop for feature in (edge-to-configurations edge) do (record-feature feature event model :rop rop)) ;; ;; then the n-gram features over leaves of this edge. ;; (loop for feature in (edge-to-ngrams edge) do (record-feature feature event model :rop rop)) ;; ;; semantic features extracted off the MRS of this edge. ;; (when mrs (loop for feature in (mrs-to-dependencies mrs) do (record-feature feature event model :rop rop)))) ;; ;; often in a different universe, use whatever :flags properties off each ;; result. ;; (loop for feature in (result-to-flags result) do (record-feature feature event model :rop rop)) ;; ;; finally, the feature(s) corresponding to LM score(s); for now, we record ;; the raw LM score in the feature cache. however, when creating an actual ;; model, we want the scaling factor being part of the feature definition, ;; as its second parameter, so as to make sure that a serialized model can ;; be applied exactly the way it was trained. ;; (let ((lm (get-field :lm result))) (when (and (numberp lm) (numberp *feature-lm-p*) (not (= *feature-lm-p* 0))) (record-feature (make-feature :tid 42 :symbol (list 42) :parameters (if fcp (list 0) (list 0 *feature-lm-p*)) :count lm) event model :rop rop))) event)) ;;return list of pairs such as (span event), ;;where event is an event struct and span is a list '(to from) (defun result-to-spanning-events (item result model &key rop (fcp t)) (declare (ignore result fcp));;FIXME. reconsider parameter status? (let* ((events (make-hash-table :test #'equal))) ;;; (derivation (get-field :derivation result)) ;;; (edge (or (get-field :edge result) ;;; (reconstruct derivation *feature-dependencies*))) ;; (mrs (and edge (mrs::extract-mrs edge)))) ;; the recording of features in events is handled by the ;; individual extractor functions. (when *cfeature-malt-xle* (extract-dep-features item :model model :events events :rop rop)) (extract-token-features item :model model :events events :rop rop) (when (eq *conll-cues* :connectives) (extract-connective-features item :model model :events events :rop rop)) (loop for key being each hash-key of events using (hash-value val) collect (list key val) into list finally (return list)))) (defun edge-to-configurations (edge &key (parents '(lkb::^))) ;; ;; in order to support head lexicalization, perform feature extraction in a ;; pre-order tree transform: call ourselves recursively on all children (and ;; adjusting the .parents. list appropriately, i.e. inserting ourself as the ;; last parent on each recursive call); each recursive call will eventually ;; invoke edge-to-configurations() and accumulate the new features. ;; (unless (< *feature-grandparenting* 0) (nconc (loop with parents = (when (> *feature-grandparenting* 0) (append (last parents (- *feature-grandparenting* 1)) (list edge))) for edge in (lkb::edge-children edge) nconc (edge-to-configurations edge :parents parents)) ;; ;; finally, operate on the local .edge. ;; (edge-to-configurations1 edge parents)))) (defun edge-to-configurations1 (edge parents) (let* ((root (edge-root edge)) (parents (loop for parent in parents collect (if (lkb::edge-p parent) (edge-root parent) parent))) (daughters (lkb::edge-children edge))) ;; ;; _fix_me_ ;; generator edges, sadly, do not show their morphological history in the ;; `children' slot, hence (much like compute-derivation-tree()), we would ;; have to interpret the idiosyncratic `found-lex-rule-list' here too :-{. ;; (13-may-07; oe) (cond ;; ;; at the terminal yield of a derivation, things are relatively simple: ;; create one feature for the local configuration at the yield, plus as ;; many as are possible to derive from grandparenting, i.e. prefixing the ;; tuple corresponding to the local feature with all suffixes of .parents. ;; ((null daughters) (let* ((symbol (cons root (lkb::edge-leaves edge))) (features (list (make-feature :tid 1 :parameters '(0) :symbol symbol)))) (loop for i from 1 to (min (length parents) *feature-grandparenting*) for iparents = (last parents i) for feature = (make-feature :tid 1 :parameters (list i) :symbol (append iparents symbol)) do (push feature features)) (when *feature-lexicalization-p* (setf (lkb::edge-head edge) root)) features)) (t (when *feature-lexicalization-p* ;; ;; decorate local edge with head lexicalization information: find the ;; head (or key) daughter in the local rule and project its head up ;; to the current edge. this is the reason edge-to-configurations() ;; does a pre-order transform, i.e. makes sure it has been called ;; recursively on each edge _prior_ to actual feature extraction. ;; (let* ((rule (lkb::edge-rule edge)) (key (lkb::rule-head rule))) (setf (lkb::edge-head edge) (lkb::edge-head (nth key daughters))))) (let* ((roots (loop for edge in daughters collect (edge-root edge))) (weights (loop for edge in daughters for from = (lkb::edge-from edge) for to = (lkb::edge-to edge) when (and (numberp from) (numberp to)) collect (- to from) else return nil)) (skew (when (rest weights) (loop with n = (length weights) with average = (/ (sum weights) n) for foo in weights sum (expt (- foo average) 2) into bar finally (return (let ((bar (sqrt bar))) (cond ((= bar 0) 0) ((< bar 2) 1) ((>= 2) 2))))))) (weights (loop for weight in weights collect (cond ((= weight 1) 1) ((and (> weight 1) (<= weight 4)) 2) ((and (> weight 4) (<= weight 8)) 3) ((> weight 8) 4)))) (head (lkb::edge-head edge)) (symbol (cons root roots)) (lsymbol (cons head symbol)) (features (loop with weightp = (and (numberp *feature-constituent-weight*) (> *feature-constituent-weight* 0) *feature-constituent-weight*) for i from 0 to (min (length parents) *feature-grandparenting*) for iparents = (last parents i) collect (make-feature :tid 1 :parameters (list i) :symbol (append iparents symbol)) when *feature-lexicalization-p* collect (make-feature :tid 3 :parameters (list i) :symbol (append iparents lsymbol)) when (and skew weightp) collect (make-feature :tid 5 :parameters (list 1 i) :symbol (append iparents symbol (list skew))) when (and weights weightp (> weightp 1)) collect (make-feature :tid 5 :parameters (list 2 i) :symbol (append iparents symbol weights))))) ;; ;; include (back-off, in a sense) features for partially instantiated ;; constituents (corresponding to active edges in the parser): for ;; each daughter, perform head lexicalization if necessary, and add ;; the resulting features to .codes. ;; (when (and *feature-active-edges-p* (rest daughters)) (loop for edge in daughters for label = (edge-root edge) for symbol = (list root label) for lsymbol = (cons head symbol) do (loop for i from 0 to (min (length parents) *feature-grandparenting*) for iparents = (last parents i) do (push (make-feature :tid 2 :parameters (list i) :symbol (append iparents symbol)) features) when *feature-lexicalization-p* do (push (make-feature :tid 4 :parameters (list i) :symbol (append iparents lsymbol)) features)))) features))))) (defun edge-to-ngrams (edge) (loop with features = nil with forms = (lkb::edge-leaves edge) with ids = (lkb::edge-lex-ids edge) with type = (if (eq *feature-ngram-tag* :type) 1 0) with tags = (if (zerop type) ids (loop for id in ids collect (type-of-lexical-entry id))) initially (when (or (zerop *feature-ngram-size*) (not (= (length forms) (length tags)))) (return)) for forms on (append (cons 'lkb::^ forms) '(lkb::$)) for tags on (append (cons 'lkb::^ tags) '(lkb::$)) while (first forms) do (loop for i from (if *feature-ngram-back-off-p* 1 *feature-ngram-size*) to *feature-ngram-size* for form = (nth (- i 1) forms) for itags = (ith-n tags 1 i) when (and form (not (and (= i 1) (smember form '(lkb::^ lkb::$))))) do (push (make-feature :tid 10 :parameters (list i type) :symbol (cons form itags)) features) (push (make-feature :tid 11 :parameters (list i type) :symbol itags) features)) finally (return features))) (defun mrs-to-dependencies (mrs) (let* ((eds (mrs::ed-convert-psoa mrs)) (relations (and eds (mrs::eds-relations eds)))) (loop for rel in relations for pred = (intern (mrs::ed-predicate rel) 'lkb) for args = (mrs::ed-arguments rel) for arg-list = (loop for arg in args collect (list (intern (car arg) 'lkb) (intern (if (stringp (cdr arg)) (cdr arg) (mrs::ed-predicate (cdr arg))) 'lkb))) collect (make-feature :tid 20 :parameters (list 0) :symbol `(0 ,pred ,@(loop for arg in arg-list append (list (first arg) (second arg))))) append (loop for arg in arg-list collect (make-feature :tid 21 :parameters (list 0) :symbol `(0 ,pred ,(first arg) ,(second arg)))) collect (make-feature :tid 22 :parameters (list 0) :symbol `(0 ,pred ,@(loop for arg in arg-list collect (second arg)))) append (loop for arg in arg-list collect (make-feature :tid 23 :parameters (list 0) :symbol `(0 ,pred ,(second arg))))))) (defun result-to-flags (result) (loop with flags = (let ((flags (get-field :flags result))) (if (stringp flags) (setf (get-field :flags result) (ignore-errors (read-from-string flags))) flags)) for (i key) in *feature-flags* for value = (get-field key flags) for count = (and (numberp value) (coerce value 'single-float)) for features = (cond (count (list (make-feature :tid 43 :symbol (list key) :parameters (list i) :count count))) ((eq key :mtrs) (loop with map = (make-hash-table) for key in value when (gethash key map) do (incf (gethash key map)) else do (setf (gethash key map) 1) finally (return (loop for key being each hash-key using (hash-value count) in map collect (make-feature :tid 43 :symbol (list key) :parameters (list i) :count count)))))) nconc features)) (defun result-to-event-from-cache (iid rid model fc &key rop) (let* ((type (first *feature-preference-weightings*)) (features (retrieve-features fc iid rid %feature-frequency-tid% (list (first type)))) (frequency (and features (feature-count (first features)))) (event (and frequency (make-event :id rid :frequency frequency)))) (unless event (return-from result-to-event-from-cache)) ;; ;; at this point, we need to do something similiar to what happens during ;; construction of the feature cache, viz. interpret the various global ;; variables that determine which range of features to use. alas. ;; (loop for i from 0 to *feature-grandparenting* for features = (retrieve-features fc iid rid 1 (list i)) do (record-features features event model :rop rop)) (when *feature-active-edges-p* (loop for i from 0 to *feature-grandparenting* for features = (retrieve-features fc iid rid 2 (list i)) do (record-features features event model :rop rop))) (when (and (numberp *feature-constituent-weight*) (> *feature-constituent-weight* 0)) (loop for i from 0 to *feature-grandparenting* do (loop for j from 1 to *feature-constituent-weight* for features = (retrieve-features fc iid rid 5 (list j i)) do (record-features features event model :rop rop)))) (when *feature-lexicalization-p* (loop for i from 0 to *feature-grandparenting* for features = (retrieve-features fc iid rid 3 (list i)) do (record-features features event model :rop rop)) (when *feature-active-edges-p* (loop for i from 0 to *feature-grandparenting* for features = (retrieve-features fc iid rid 4 (list i)) do (record-features features event model :rop rop)))) (loop with type = (if (eq *feature-ngram-tag* :type) 1 0) for i from (if *feature-ngram-back-off-p* 1 *feature-ngram-size*) to *feature-ngram-size* for lfeatures = (retrieve-features fc iid rid 10 (list i type)) for features = (retrieve-features fc iid rid 11 (list i type)) do (record-features lfeatures event model :rop rop) (record-features features event model :rop rop)) (when *feature-dependencies* (loop for tid from 20 to 23 for features = (retrieve-features fc iid rid tid (list 0)) do (record-features features event model :rop rop))) (when (numberp *feature-lm-p*) ;; ;; the feature cache does /not/ record the scaling factor as a parameter ;; to the LM feature template (currently, template #42). ;; (let ((features (retrieve-features fc iid rid 42 (list 0)))) #+:null (when *feature-lm-normalize* ;; ;; _fix_me_ ;; incomplete code for normalization of LM values into a fixed range. ;; (loop for feature in features for minmax = (aref (model-minmax model) (feature-code feature)))) ;; ;; at this point, apply the LM scaling factor and record the actual ;; value used as part of the LM feature(s), so as to ensure that the ;; final model preserves this bit of information. ;; (loop for feature in features for count = (feature-count feature) do (setf (feature-count feature) (divide count *feature-lm-p*)) (nconc (feature-parameters feature) (list *feature-lm-p*))) (record-features features event model :rop rop))) (when *feature-flags* (loop for foo in *feature-flags* for features = (retrieve-features fc iid rid 43 (list (first foo))) do (record-features features event model :rop rop))) event)) (defun result-to-spanning-events-from-cache (iid rid spans model fc &key native rop) "Returns a list of events indexed on iid, rid, and span." (loop for span in spans for spanned-event = (let* ((type (first *feature-preference-weightings*)) ;;; e.g. type = '(:binary) (ffeatures (retrieve-features fc iid rid %feature-frequency-tid% (list (first type)) span)) (frequency (and ffeatures (feature-count (first ffeatures)))) (event (and frequency (make-event :id span :frequency frequency)))) ;; Retrive Maltpraser / XLE features: (when *cfeature-malt-xle* (loop for name in ;;note the quote: '(*cfeature-malt-deppattern* *cfeature-malt-deppath* *cfeature-malt-deprel* *cfeature-malt-lextriple* *cfeature-malt-postriple* *cfeature-xle-xdep* *cfeature-xle-subcat* *cfeature-xle-coord* *cfeature-xle-advtype* *cfeature-xle-adjtype*) for tid = (get name :tid) for parameter = (symbol-value name) when parameter append (retrieve-features fc iid rid tid (list 0) span) into features finally (when features (record-features features event model :rop (or native rop) :native native)))) ;; Retrive GENIA / TNT / token features: (loop for name in '(*cfeature-form-win* *cfeature-stem-win* *cfeature-tag-win* *cfeature-tag-l-ngram* *cfeature-stem-l-ngram* *cfeature-form-l-ngram* *cfeature-tag-r-ngram* *cfeature-stem-r-ngram* *cfeature-form-r-ngram*) ;;; for tag-p = (member name '(*cfeature-tag-win* ;;; *cfeature-tag-l-ngram* ;;; *cfeature-tag-r-ngram*) ;;; :test #'string=) for tid = (get name :tid) for parameter = (symbol-value name) ;; = span when parameter do (loop for win from 0 to parameter ;;; lexicalized; append (retrieve-features fc iid rid tid (list win 1) span) into features ;;; non-lexicalized; unless *cfeature-lexicalized-only-p* append (retrieve-features fc iid rid tid (list win 0) span) into features ;;; when (and *cfeature-lexicalized-only-p* tag-p) ;;; append (retrieve-features fc iid rid tid ;;; (list win 0) span) ;;; into features finally (when features (record-features features event model :rop (or native rop) :native native)))) ;; Retrive connective features: (when (eq *conll-cues* :connectives) (loop for name in '(*cfeature-conn-form* *cfeature-ptb-self* *cfeature-ptb-parent* *cfeature-ptb-left* *cfeature-ptb-right* *cfeature-ptb-path* *cfeature-stanford*) for tid = (get name :tid) for value = (let ((foo (symbol-value name))) (if (eq foo t) 0 foo)) for parameters = (when value (cons 0 (loop for mask in '(1 2 4 8 16) when (= (logand value mask) mask) collect mask))) when (or (eq name *cfeature-ptb-path*) (eq name *cfeature-stanford*)) append (loop for parameter in parameters append (loop for i from 0 to 4 append (retrieve-features fc iid rid tid (list parameter i) span))) else append (loop for parameter in parameters append (retrieve-features fc iid rid tid (list parameter) span)) into features finally (when features (record-features features event model :rop (or native rop) :native native)))) event) ;; when (> (event-size spanned-event) 0) collect spanned-event)) (defun edge-root (edge &optional (preterminals *feature-use-preterminal-types-p*)) (typecase (lkb::edge-rule edge) (lkb::rule (lkb::rule-id (lkb::edge-rule edge))) (string (let ((instance (first (lkb::edge-lex-ids edge)))) (if preterminals (type-of-lexical-entry instance) instance))) (t (error "edge-root(): unknown rule in edge ~a~%" edge)))) (defun lm-item-enhancer (item) #+:logon (when (and (numberp *feature-lm-p*) (not (= *feature-lm-p* 0))) (loop with foo with results = (get-field :results item) with strings = (loop for result in results for string = (get-field :surface result) when string collect string and do (push result foo)) with scores = #+:lm (mt::lm-score-strings strings :measure :logprob) #-:lm 0 for result in (nreverse foo) for score = (rest (pop scores)) do (nconc result (acons :lm score nil)))) item) (defun downcase-token-stems (item) "Normalizes token stems (eg as an item enhancer prior to classification)." (loop for token in (get-field :i-tokens item) unless (or (string= "NNP" (get-field :tag token)) (string= "NNPS" (get-field :tag token))) do (setf (get-field :stem token) (string-downcase (get-field :stem token)))) item) (defparameter %flags-ignore-fragments-p% t) (defun flags-item-enhancer (item &key (key :neva)) (nconc item (loop with *package* = (find-package :lkb) with length = (get-field :i-length item) with ranks for result in (get-field :results item) for olength = (let ((surface (get-field :surface result))) (and surface (+ 1 (count #\space surface)))) for flags = (let ((flags (get-field :flags result))) (if (stringp flags) (setf (get-field :flags result) (ignore-errors (read-from-string flags))) flags)) for fragmentp = (let ((fragments (get-field :fragments flags))) (and (numberp fragments) (> fragments 0))) for value = (get-field key flags) unless (numberp value) do (format t "flags-item-enhancer(): no ~a score on item # ~a (result # ~a).~%" key (get-field :i-id item) (get-field :result-id result)) and return nil else do ;; ;; _fix_me_ ;; it appears that the :ratio computation in translate-item() was not ;; correct, hence the final HandOn profiles (and likely also the ones ;; we used for the 2007 TMI paper) always have zero :ratio values. ;; not hard to (re-)compute at this point, so possibly we should even ;; ditch the corresponding (and flawed) code in translate-item(). ;; (30-jun-08; oe) (when olength (let ((ratio (divide olength length))) (if (get-field :ratio flags) (setf (get-field :ratio flags) ratio) (nconc flags (acons :ratio ratio nil))))) (push (acons key value result) ranks) finally (return (if (and fragmentp %flags-ignore-fragments-p%) (acons :ranks (loop for rank in ranks collect (acons :rank 1 rank)) nil) (let ((ranks (sort ranks #'> :key #'(lambda (result) (get-field key result))))) (acons :ranks (loop with top = (get-field key (first ranks)) for rank in ranks while (= (get-field key rank) top) collect (acons :rank 1 rank)) nil))))))) (defun weigh-result (item result type &optional span) (if span ;;FIXME: adapt this for scopes + multiclass. (let* ((cue-spans (list-cue-spans item)) ;;fixme; wastefull to extract this for every token. ;;; (and cues ;;; (loop with spans ;;; for cue in cues ;;; for list = (get-field :span cue) ;;; if (> (length list) 2) do ;;In case of multiword cues ;;; (loop while list ;;of non-continuous substrings. ;;; do (push (list (pop list) (pop list)) ;;; spans)) ;;; else do (push list spans) ;;; finally (return spans)))) (in-cue-spans-p (and cue-spans (loop with (t-start t-end) = span for (c-start c-end) in cue-spans when (and (>= t-start c-start) (<= t-end c-end)) do (return t)))) ;;; checks if the current substring is within the span of a cue. (frequency (if in-cue-spans-p 1 -1))) (case (first type) (:binary frequency))) ;;; (:bio fixme)));; begin-inside-outside fixme , check for span being at the beginning, inside or outside wrt to the cue-spans. (let* ((active (loop for rank in (get-field :ranks item) for n = (get-field :rank rank) for id = (get-field :result-id rank) when (= n 1) collect id)) (rid (get-field :result-id result)) (frequency (if (member rid active :test #'=) 1 0))) (case (first type) (:binary frequency) ((:bleu :wa :waft) (let ((gold (get-field :i-input item)) (surface (get-field :surface result))) (if surface (first (score-strings (list surface) (list gold) :type (first type))) 0))) (:conll (let* ((id (get-field :result-id result)) (score (item-to-conll item :result-id id :stream nil)) (total (get-field :total score)) (head (get-field :head score)) (power (or (second type) 1)) (factor (or (third type) 1))) (if (and total head) (* (expt (divide head total) power) factor) 0))) (:flags (let* ((flags (get-field :flags result)) (flags (if (stringp flags) (setf (get-field :flags result) (ignore-errors (read-from-string flags))) flags))) ;; ;; _fix_me_ ;; maybe this should really be NEVA, nowadays? (5-apr-08; oe) ;; (or (get-field :bleu flags) 0))) (t 0))))) (defun random-sample (low high size &optional sample) ;; ;; returns a set of .size. unique random integers from [.low. -- .high.], not ;; including elements of the initial .sample., when supplied. ;; (loop with n = (+ (- high low) 1) for i = (+ low (random n)) unless (smember i sample) do (push i sample) (decf size) while (> size 0) finally (return sample))) (defun profile-find-feature-cache (profile &optional mode &key createp vp) ;; ;; _fix_me_ ;; my impression is the support for naming differently the feature caches ;; and models that are part of a virtual profile is not yet fully in place. ;; while that facility seems attractive in principle, i opt to disable it ;; for just now, so as to move forward with experimentation using virtual ;; profiles and be confident i end up using the (one) right set of files. ;; the CoNLL 2016 set-up only has one virtual profile with two componenets ;; (train and dev) anyway ... (16-apr-16; oe) ;; (declare (ignore vp)) (let ((file (make-pathname :directory (find-tsdb-directory profile) :name "fc" #+:null (format nil "~@[~a_~]fc" vp)))) (if mode (open-fc file mode :createp createp) file))) (defun profile-find-model (profile &key testp) (let ((file (make-pathname :directory (find-tsdb-directory profile) :name "fc" :type "mlm"))) (if testp (probe-file file) file))) (defun profile-find-context-cache (profile &optional (identity -1) &key createp (supersedep t)) (let* ((path (dir-append (find-tsdb-directory profile) (list :relative (format nil "cc.~a" identity)))) (existsp (probe-file path))) (when (or (and existsp supersedep createp) (and (not existsp) createp)) #+:fad (ignore-errors (fad:delete-directory-and-files path :if-does-not-exist :ignore) (mkdir path))) path)) ;;; ;;; from here on, functions to score various types of structures (full results ;;; or intermediate constituents) according to a model. ;;; (defmacro score-feature (code model) ;; ;; _fix_me_ ;; unless exported and read back in once, it is quite possible for models to ;; have meaningful feature codes beyond the `count' value (which appears to ;; be the number of actual weights read after training); possibly we could ;; simply use the model `size' as the boundary instead, but then we would ;; have had to make sure that unpopulated indices are initialized to zeros. ;; (7-apr-06; oe & erik) `(or (when (< ,code (model-count ,model)) (aref (model-weights ,model) ,code)) 0.0)) (defun score-event (event model) (loop for feature in (event-features event) for count = (let ((count (or (feature-count feature) 0))) ;; ;; because the feature cache does /not/ include the scaling ;; of LM scores, we need to apply the scaling factor every ;; time we compute the contribution of the LM feature. ;; (if (= (feature-tid feature) 42) (divide count *feature-lm-p*) count)) for score = (score-feature (feature-code feature) model) sum (* count score))) (defun mem-score-result (result &optional (model %model%) &key normalizep) (unless (model-p model) (setf model (rest (assoc model *models*)))) (if model (let ((event (result-to-event result model :rop t :fcp nil))) (when normalizep (normalize-features-n event :type normalizep :model model)) (score-event event model)) 0.0)) ;;;(defun mem-score-result (result &optional (model %model%) &key normalizep item) ;;; (unless (model-p model) ;;; (setf model (rest (assoc model *models*)))) ;;; (if model ;;; (let ((event ;;; (when (eq *redwoods-task* :rank) ;;; (result-to-event result model :rop t :fcp nil))) ;;; (spanning-events ;;; (when (and item (eq *redwoods-task* :classify)) ;;; (result-to-spanning-events item result model :rop t)))) ;;; (loop for (span event) in (or spanning-events ;;; (list (list nil event))) ;;; when normalizep ;;; (normalize-features-n event :type normalizep :model model) ;;; collect (score-event event model))) ;;; (list 0.0))) ;;;(defun mem-score-tokens (item result model &key normalizep) ;;; (let ((spanning-events ;;; (result-to-spanning-events item result model :rop t))) ;;; (loop for (span event) in spanning-events ;;; when normalizep ;;; do (normalize-features-n event :type normalizep :model model) ;;; collect (list span (score-event event model))))) (defun mem-score-tokens (item result model &key normalizep) (let ((spanning-events (result-to-spanning-events item result model :rop t)) (tokens (get-field :i-tokens item))) ;;; DEBUGGING: (format t "~&mem-score-tokens(): found ~a events for ~a token(s) in item ~a~%" (length spanning-events) (length tokens) (get-field :i-id item)) (loop for (span event) in spanning-events when normalizep do (normalize-features-n event :type normalizep :model model) collect (list span (score-event event model)) into scores finally (loop for ((from to) score) in scores for token = (loop for token in tokens when (and (= from (get-field :start token)) (= to (get-field :end token))) do (return token)) if (get-field :class token) do (setf (get-field :class token) score) else do (nconc token (acons :class score nil))) (return scores)))) (defun mem-score-configuration (edge daughters &optional (model %model%)) (unless (model-p model) (setf model (rest (assoc model *models*)))) (if (model-p model) (let* ((event (make-event)) (roots (if daughters (loop for edge in daughters collect (edge-root edge)) (lkb::edge-leaves edge))) (feature (make-feature :tid 1 :parameters (list 0) :symbol (cons (edge-root edge) roots)))) (record-feature feature event model :rop t) (score-event event model)) 0.0)) (defun profile-find-addon (profile &key createp (type :malt+xle) identity) (let ((path (dir-append (find-tsdb-directory profile) (list :relative (format nil "~(~a~)~@[.~a~]" type identity))))) (when createp #+:fad (ignore-errors (fad:delete-directory-and-files path :if-does-not-exist :ignore) (mkdir path))) path)) ;; to get the default behaviour of `profile-find-context-cache', do e.g.: ;; (profile-find-addon source :type :cc :identity -1) ;;; windowing (defun collect-left-context (n pos list &key key) (loop for i from (min n pos) downto 1 if key collect (funcall key (nth (- pos i) list)) else collect (nth (- pos i) list))) (defun collect-right-context (n pos list &key key) (loop for i from 1 to (min n (- (- (length list) pos) 1)) if key collect (funcall key (nth (+ pos i) list)) else collect (nth (+ pos i) list))) (defun collect-window-context (n pos list &key key left-only-p right-only-p (target-p t)) (append (unless right-only-p (collect-left-context n pos list :key key)) (unless (not target-p) (list (if key (funcall key (nth pos list)) (nth pos list)))) (unless left-only-p (collect-right-context n pos list :key key)))) (defun extract-connective-features (item &key model events rop) "Extract token features specific to discourse connectives." (let* ((events (or events (make-hash-table :test #'equal))) (tokens (get-field :i-tokens item)) (ntokens (length tokens))) (loop for i below ntokens for token in tokens for span = (list (get-field :start token) (get-field :end token)) when (and *redwoods-token-filter* (funcall *redwoods-token-filter* token)) do (loop with event = (or (gethash span events) (setf (gethash span events) (make-event :id span))) for name in '(*cfeature-conn-form* *cfeature-ptb-self* *cfeature-ptb-parent* *cfeature-ptb-left* *cfeature-ptb-right* *cfeature-ptb-path* *cfeature-stanford*) for tid = (get name :tid) for parameter = (let ((foo (symbol-value name))) (if (eq foo t) 0 foo)) for candidates = (get-field :candidates token) for singletonp = (loop for candidate in candidates thereis (null (rest (get-field :heads candidate)))) for features = nil when parameter do (loop for candidate in candidates for head = (loop with surface = (remove "..." (get-field :surface candidate) :test #'string=) for head in (get-field :heads candidate) collect (nth head surface)) for self = (get-field :self candidate) for (parent left right) = (loop for node = self then parent for parent = (get-field :parent self) for children = (get-field :children parent) for i = (position node children) when (or (null parent) (> (length children) 1)) return (list parent (when (and i (> i 0)) (nth (- i 1) children)) (when i (nth (+ i 1) children)))) for (scc pcc lcc rcc bcc) = (labels ((label (node) (get-field :label node)) (cc (node) (let* ((parent (get-field :parent node)) (siblings (get-field :children parent))) (list* "#" (label node) (label parent) (loop for sibling in (remove node siblings) collect (label sibling)))))) (let ((lcc (and left (cc left))) (rcc (and right (cc right)))) (list (and self (cc self)) (and parent (cc parent)) lcc rcc (and lcc rcc (append lcc rcc))))) for vp = (labels ((vp (tree) (or (search "VP" (get-field :label tree)) (loop for child in (get-field :children tree) thereis (vp child))))) (vp right)) when (eq name '*cfeature-conn-form*) do (push (make-feature :tid tid :parameters (list 0) :symbol (get-field :surface candidate)) features) else when (eq name '*cfeature-stanford*) do (labels ((path (node &optional (n 4)) (unless (zerop n) (let ((head (first (get-field :in node)))) (cons (first head) (path (rest head) (- n 1))))))) (let ((path (path token))) (push (make-feature :tid tid :parameters (list 0 1) :symbol (butlast path 3)) features) (push (make-feature :tid tid :parameters (list 0 2) :symbol (butlast path 2)) features) (push (make-feature :tid tid :parameters (list 0 3) :symbol (butlast path 1)) features) (push (make-feature :tid tid :parameters (list 0 4) :symbol path) features) (unless (zerop (logand parameter 1)) (push (make-feature :tid tid :parameters (list 1 1) :symbol (append (butlast path 3) head)) features) (push (make-feature :tid tid :parameters (list 1 2) :symbol (append (butlast path 2) head)) features) (push (make-feature :tid tid :parameters (list 1 3) :symbol (append (butlast path 1) head)) features) (push (make-feature :tid tid :parameters (list 1 4) :symbol (append path head)) features)))) else unless (and singletonp (null self)) do ;; ;; for the PTB-style syntax features, we assume bits: ;; =0: just the feature, by itself ;; &1: the feature & the connective head ;; &2: pairwise conjunctions of PTB features ;; &4: the &2 features, each conjoined with the head ;; &8: the ‘connected context’ of wang et al. (2015) ;; &16: presence of VP node (for right sibling only) ;; (let ((self (get-field :label self)) (parent (get-field :label parent)) (left (or (get-field :label left) "null")) (right (or (get-field :label right) "null")) (path (get-field :path candidate))) (case name (*cfeature-ptb-self* (push (make-feature :tid tid :parameters (list 0) :symbol (list self)) features) (unless (zerop (logand parameter 1)) (push (make-feature :tid tid :parameters (list 1) :symbol (cons self head)) features)) (unless (zerop (logand parameter 2)) (push (make-feature :tid tid :parameters (list 2) :symbol (list "=" self "^" parent)) features) (push (make-feature :tid tid :parameters (list 2) :symbol (list "=" self "<" left)) features) (push (make-feature :tid tid :parameters (list 2) :symbol (list "=" self ">" right)) features)) ;; ;; no point in conjoining .self. and .path. ;; (unless (zerop (logand parameter 4)) (push (make-feature :tid tid :parameters (list 4) :symbol (list* "=" self "^" parent head)) features) (push (make-feature :tid tid :parameters (list 4) :symbol (list* "=" self "<" left head)) features) (push (make-feature :tid tid :parameters (list 4) :symbol (list* "=" self ">" right head)) features)) (unless (zerop (logand parameter 8)) (push (make-feature :tid tid :parameters (list 8) :symbol scc) features) (push (make-feature :tid tid :parameters (list 8) :symbol (append scc head)) features))) (*cfeature-ptb-parent* (push (make-feature :tid tid :parameters (list 0) :symbol (list parent)) features) (unless (zerop (logand parameter 1)) (push (make-feature :tid tid :parameters (list 1) :symbol (cons parent head)) features)) (unless (zerop (logand parameter 2)) (push (make-feature :tid tid :parameters (list 2) :symbol (list "^" parent "<" left)) features) (push (make-feature :tid tid :parameters (list 2) :symbol (list "^" parent ">" right)) features) (push (make-feature :tid tid :parameters (list 2) :symbol (list* "^" parent "|" path)) features)) (unless (zerop (logand parameter 4)) (push (make-feature :tid tid :parameters (list 4) :symbol (list* "^" parent "<" left head)) features) (push (make-feature :tid tid :parameters (list 4) :symbol (list* "^" parent ">" right head)) features) (push (make-feature :tid tid :parameters (list 4) :symbol (append (list* "^" parent "|" path) head)) features)) (unless (zerop (logand parameter 8)) (push (make-feature :tid tid :parameters (list 8) :symbol pcc) features) (push (make-feature :tid tid :parameters (list 8) :symbol (append pcc head)) features))) (*cfeature-ptb-left* (push (make-feature :tid tid :parameters (list 0) :symbol (list left)) features) (unless (zerop (logand parameter 1)) (push (make-feature :tid tid :parameters (list 1) :symbol (cons left head)) features)) (unless (zerop (logand parameter 2)) (push (make-feature :tid tid :parameters (list 2) :symbol (list "<" left ">" right)) features) (push (make-feature :tid tid :parameters (list 2) :symbol (list* "<" left "|" path)) features)) (unless (zerop (logand parameter 4)) (push (make-feature :tid tid :parameters (list 4) :symbol (list* "<" left ">" right head)) features) (push (make-feature :tid tid :parameters (list 4) :symbol (append (list* "<" left "|" path) head)) features)) (unless (zerop (logand parameter 8)) (push (make-feature :tid tid :parameters (list 8) :symbol lcc) features) (push (make-feature :tid tid :parameters (list 8) :symbol (append lcc head)) features))) (*cfeature-ptb-right* (push (make-feature :tid tid :parameters (list 0) :symbol (list right)) features) (unless (zerop (logand parameter 1)) (push (make-feature :tid tid :parameters (list 1) :symbol (cons right head)) features)) (unless (zerop (logand parameter 2)) (push (make-feature :tid tid :parameters (list 2) :symbol (list* ">" right "|" path)) features)) (unless (zerop (logand parameter 4)) (push (make-feature :tid tid :parameters (list 4) :symbol (append (list* ">" right "|" path) head)) features)) (unless (zerop (logand parameter 8)) (push (make-feature :tid tid :parameters (list 8) :symbol rcc) features) (push (make-feature :tid tid :parameters (list 8) :symbol (append rcc head)) features) (push (make-feature :tid tid :parameters (list 8) :symbol bcc) features) (push (make-feature :tid tid :parameters (list 8) :symbol (append bcc head)) features)) (unless (zerop (logand parameter 16)) (push (make-feature :tid tid :parameters (list 16) :symbol (list (if vp "yes" "no"))) features) (push (make-feature :tid tid :parameters (list 16) :symbol (cons (if vp "yes" "no") head)) features))) (*cfeature-ptb-path* (let* ((compressed (loop for previous = nil then label for label in path unless (string= label previous) collect label)) (n (length path)) (two (and (>= n 2) (subseq path 0 2))) (three (and (>= n 4)(subseq path 0 3))) (four (and (>= n 4) (subseq path 0 4)))) (push (make-feature :tid tid :parameters (list 0 0) :symbol path) features) (push (make-feature :tid tid :parameters (list 0 1) :symbol compressed) features) (when two (push (make-feature :tid tid :parameters (list 0 2) :symbol two) features)) (when three (push (make-feature :tid tid :parameters (list 0 3) :symbol three) features)) (when four (push (make-feature :tid tid :parameters (list 0 4) :symbol four) features)) (unless (zerop (logand parameter 1)) (push (make-feature :tid tid :parameters (list 1 0) :symbol (append path head)) features) (push (make-feature :tid tid :parameters (list 1 1) :symbol (append compressed head)) features) (when two (push (make-feature :tid tid :parameters (list 1 2) :symbol (append two head)) features)) (when three (push (make-feature :tid tid :parameters (list 1 3) :symbol (append three head)) features)) (when four (push (make-feature :tid tid :parameters (list 1 4) :symbol (append four head)) features))))))) finally (record-features features event model :rop rop)))) events)) (defun extract-token-features (item &key model events rop) "Extracts ngram + win features (left/right/both) over tags, stems and forms." (let* ((events (or events (make-hash-table :test #'equal))) (tokens (get-field :i-tokens item)) (ntokens (length tokens))) (loop for i below ntokens for token in tokens for stem = (get-field :stem token) for span = (list (get-field :start token) (get-field :end token)) ;; added 14/4/2016: filter tokens before extracting. ;; (the tokens list must be kept to preserve context) when (and *redwoods-token-filter* (funcall *redwoods-token-filter* token)) do (loop with event = (or (gethash span events) (setf (gethash span events) (make-event :id span))) for name in '(*cfeature-stem-l-ngram* *cfeature-stem-r-ngram* *cfeature-form-l-ngram* *cfeature-form-r-ngram* *cfeature-tag-l-ngram* *cfeature-tag-r-ngram* *cfeature-form-win* *cfeature-stem-win* *cfeature-tag-win*) for l-o-p = (member name '(*cfeature-stem-l-ngram* *cfeature-form-l-ngram* *cfeature-tag-l-ngram*)) for r-o-p = (member name '(*cfeature-stem-r-ngram* *cfeature-form-r-ngram* *cfeature-tag-r-ngram*)) for tid = (get name :tid) for parameter = (symbol-value name) for field = (case name ((*cfeature-form-l-ngram* *cfeature-form-r-ngram* *cfeature-form-win*) :form) ((*cfeature-stem-l-ngram* *cfeature-stem-r-ngram* *cfeature-stem-win*) :stem) ((*cfeature-tag-l-ngram* *cfeature-tag-r-ngram* *cfeature-tag-win*) :tag)) for min-size = (if (member name '(*cfeature-form-win* *cfeature-stem-win* *cfeature-tag-win*)) 0 1);; only win features can be the focus token alone. ;; for lexicalize-p = (eq field :tag) when parameter do (loop for lex-p in (list nil t) do (loop for win from min-size to parameter ;; 1 means including 1 token to for context = ;; the right (or left). (collect-window-context win i tokens :key #'(lambda (x) (get-field field x)) :left-only-p l-o-p :right-only-p r-o-p :target-p lex-p) when context collect (make-feature :tid tid :parameters (list win (if (eq field :tag) 0 (if lex-p 1 0))) :symbol context) into features when (and context (eq field :tag)) ;; only for tag-features collect (make-feature :tid tid :parameters (list win 1) :symbol (cons stem context)) into features finally (record-features features event model :rop rop))))) events)) (defun extract-dep-features (item &key model events rop) (let* ((events (or events (make-hash-table :test #'equal))) (source (get-field :source item)) (depdir (directory-namestring (profile-find-addon source :type :malt+xle))) (iid (get-field :i-id item)) (depfile (make-pathname :directory depdir :name (format nil "~a" iid))) (depfile-gz (make-pathname :directory depdir :name (format nil "~a" iid) :type "gz")) ;; (win (or window 3)) ;;fixme: make this part of parameters (root (namestring (parse-namestring (system:getenv "LOGONROOT")))) (ofile (format nil "/tmp/.depfeats.out.~a.~a" (current-user) (current-pid))) (command (format nil "perl ~a/uio/bioscope/src/depFeats.perl -i ~a -x -w 0 > ~a" root depfile ofile)) (unzipped-p nil)) (when (and (not (cl-fad:file-exists-p depfile-gz)) (not (cl-fad:file-exists-p depfile))) ;;;; (error "extract-dep-features(): no `depfile(.gz)' exists for item `~a'." ;;; item) (return-from extract-dep-features)) (when (cl-fad:file-exists-p depfile-gz) (run-process (format nil "gunzip ~a" (namestring depfile-gz)) :wait t) (setq unzipped-p t)) ;;; extract features: (when (and command (cl-fad:file-exists-p depfile)) (run-process command :wait t :output ofile :if-output-exists :supersede)) (when unzipped-p (run-process (format nil "gzip -9 ~a" (namestring depfile)) :wait t)) ;;; read the extracted features: (with-open-file (stream ofile :direction :input) (loop for line = (read-line stream nil nil) while line do (ppcre:register-groups-bind (start end features) ("^(\\d+)\\s+(\\d+).*\\s+==\\s+(.*)\\s*$" line) (setq start (parse-integer start) end (parse-integer end)) (loop with span = (list start end) with event = (or (gethash span events) (setf (gethash span events) (make-event :id span))) with k = 0 for (i j template value) = (multiple-value-bind (i j reg-i reg-j) (ppcre:scan "([^:\\s]+):([^\\s]+)" features :start k) (when i (list i j (subseq features (aref reg-i 0) (aref reg-j 0)) (subseq features (aref reg-i 1) (aref reg-j 1))))) while i do (setq k (+ 1 j)) ;;;; (print (list span (cons template value))) (let ((templ-name (case (read-from-string template nil nil) (deppattern '*cfeature-malt-deppattern*) (deppath '*cfeature-malt-deppath*) (deprel '*cfeature-malt-deprel*) (lextriple '*cfeature-malt-lextriple*) (postriple '*cfeature-malt-postriple*) (xdep '*cfeature-xle-xdep*) (subcat '*cfeature-xle-subcat*) ((coordLevel coord) '*cfeature-xle-coord*) (advtype '*cfeature-xle-advtype*) (adjtype '*cfeature-xle-adjtype*)))) (if (null templ-name) (format t "~&extract-dep-features(): ~ unknown feature prefix: ~a~%" template) ;; parameters here are simply boolean (unless (or (null templ-name) (null (symbol-value templ-name))) (record-feature (make-feature :tid (get templ-name :tid) :parameters (list 0) :symbol (list template value)) event model :rop rop)))))))) events)) ;;; ps: to recover symbol for cached feature: ;;; (code-to-symbol (feature-code feature) (model-table model)) ;;;(setq *cue-stems* (index-cues items)) ;;;(setq *token-filter-hook* ;;; #'(lambda (token) ;;; (gethash token *cue-stems*))) ;; (setq model (read-model (profile-find-model "conll10"))) ;; (setq cc (profile-find-context-cache "bse" (current-pid) :createp t)) (defun select-features (items model &key (identity (current-pid)) (stream *tsdb-io*) (type :erik)) (case type (:erik (let ((correlations (estimate-correlations items model :identity identity :stream stream))) (format stream "[~a] select-features(): updating model tables.~%" (current-time :long :short)) #+:debug (setf %model model %correlations correlations %items items %identity identity) (loop with new-map = (make-symbol-table :test #'eql) with old-map = (model-map model) for old-mapped-code from 0 to (1- (length correlations)) for c across correlations when (>= (expt c 2) *feature-covariance-threshold*) do (let* ((code (code-to-symbol old-mapped-code old-map))) ;;; re-map: (symbol-to-code code new-map)) finally (setf (model-map model) new-map)))) (:woodley ;; first, create a full context cache _without_ feature selection. ;; this will then provide the basis for the event file that we ;; supply as input for woodley's feature-selection code. (let ((*feature-selection-p* nil)) (cache-contexts items model identity :format :mem)) (let* ((prefix (format nil "~a/.model.~a.~a" (tmp :redwoods) (current-user) (current-pid))) (in-events (format nil "~a.input.evt" prefix)) (out-events (format nil "~a.output.evt" prefix)) (in-map (format nil "~a.input.def" prefix)) (out-map (format nil "~a.output.def" prefix)) (command (format nil "/logon/erikve/src/woodley/feature-selector ~a ~a ~a" in-events out-events *feature-covariance-threshold*))) (create-event-file items :file in-events :identity identity) (with-open-file (out in-map :direction :output :if-does-not-exist :create :if-exists :supersede) (loop for i from 1 to (symbol-table-count (or (model-map model) (model-table model))) do (write i :stream out) (write-char #\Tab out) (write (- i 1) :stream out) (terpri out))) (format stream "[~a] select-features(): starting feature selection. command: ~a~%" (current-time :long :short) command) (unless (and (zerop (run-process command :output nil :wait t :if-output-exists :supersede)) (probe-file out-events) (probe-file out-map)) (error "[~a] select-features(): feature selection failed.~%" (current-time :long :short))) (with-open-file (in out-map) (format stream "[~a] select-features(): selection complete. updating model.~%" (current-time :long :short)) (loop with new-map = (make-symbol-table :test #'eql) with old-map = (model-map model) for line = (read-line in nil nil) while line do (multiple-value-bind (code pos) (read-from-string line) (declare (ignore code)) (let* ((old-mapped-code (read-from-string line nil nil :start pos)) (code (code-to-symbol old-mapped-code old-map))) ;;; (symbol (code-to-symbol ;;; code ;;; (model-table model)))) ;;; we don't need the actual symbol at this point. (symbol-to-code code new-map))) ;;; as `symbol-to-code' will incrementally assign codes, we ;;; can ditch the code read from the feature-selection file. finally (setf (model-map model) new-map)))))) ;;; fixme: print the number of selected features (ie. map size) (format stream "[~a] select-features(): ~a features selected.~%" (current-time :long :short) (symbol-table-count (model-map model))) model) (defun estimate-correlations (items model &key (identity (current-pid)) (stream *tsdb-io*)) (format stream "[~a] estimate-correlations(): initial context caching.~%" (current-time :long :short)) ;;; first, create a full context cache _without_ feature selection. ;;; we don't really care about the actual event file, only the ;;; events and features that get recorded in the model. (setf (model-ncontexts model) 0) (let ((*feature-selection-p* nil)) (cache-contexts items model identity :format :mem :mode :record)) (let* ((nfeats (symbol-table-count (model-map model))) (nevents 0) (xsums (make-array nfeats :initial-element 0.0)) (xxsums (make-array nfeats :initial-element 0.0)) (xysums (make-array nfeats :initial-element 0.0)) (ysum 0.0) (yysum 0.0) (corr xysums)) ;;;(make-array nfeats) (format stream "[~a] estimate-correlations(): collecting counts.~%" (current-time :long :short)) ;;; collect necessary counts: (loop for context in (model-contexts model) do (loop for event in (context-events context) for freq = (event-frequency event) do (incf nevents) (incf ysum freq) (incf yysum (* freq freq)) (loop for feat in (event-features event) for count = (feature-count feat) do (incf (aref xsums (feature-mapped feat)) count) (incf (aref xxsums (feature-mapped feat)) (* count count)) (incf (aref xysums (feature-mapped feat)) (* freq count))))) (format stream "[~a] estimate-correlations(): ~ computing coefficients for ~a features.~%" (current-time :long :short) (length corr)) ;;; (setq %model model %items items ;;; %nfeats nfeats %nevents nevents ;;; %corr corr %xsums xsums %xxsums xxsums ;;; %xysums xysums %ysum ysum %yysum yysum) (loop for i from 0 to (- nfeats 1) for xsum = (aref xsums i) for xxsum = (aref xxsums i) for xysum = (aref xysums i) for r = (divide (- xysum (divide (* xsum ysum) nevents)) (sqrt (* (- xxsum (divide (expt xsum 2) nevents)) (- yysum (divide (expt ysum 2) nevents))))) do (setf (aref corr i) r));;nb:overwriting xysums[i] corr)) ;; http://davidmlane.com/hyperstat/A51911.html ;;;(loop for i from 0 to (- (length foo) 1) ;;; do (setf (aref foo i) (random 100))) ;;; ;;;;; var = (sum (x-m)^2) / N = (sum(x^2))/N - m^2 ;;; ;;;;;; (sum (x-m)^2) / N ;;;(loop ;;; for i from 0 to (- (length foo) 1) ;;; with mean = 0.0 ;;; do (incf mean (aref foo i)) ;;; finally ;;; (setf mean (/ mean (length foo))) (print (float mean)) ;;; (loop ;;; for i from 0 to (- (length foo) 1) ;;; sum (expt (- (aref foo i) mean) 2) into sum-diff2 ;;; finally (print (float (/ sum-diff2 (length foo)))))) ;;; ;;; ;;; ;;;;;; (sum(x^2))/N - m^2 ;;;(loop ;;; for i from 0 to (- (length foo) 1) ;;; with sum = 0 ;;; with sum2 = 0 ;;; do (incf sum (aref foo i)) ;;; (incf sum2 (expt (aref foo i) 2)) ;;; finally (print (float (/ sum (length foo)))) ;;; (print (float (- (/ sum2 (length foo)) ;;; (expt (/ sum (length foo)) 2))))) ;;; ;;; ;;; --------------------------------------------------- ;;; ;;; ADT for Sparse Vectors ;;; ;;; --------------------------------------------------- ;;; (defun insert-sorted (i list) (if (null list) (list i) (loop for elms on list for this = (first elms) for next = (first (rest elms)) if (= this i) do (return list) else if (< i this) do (return (cons i list)) else if (or (null next) (< i next)) do (setf (rest elms) (cons i (rest elms))) (return list)))) (defclass sparse-vector () ((entries :initform (make-hash-table :test #'eql) :initarg :entries ;;;hash w index/coefficient pairs :accessor spv-entries :type hash-table) (range :initarg :range :accessor spv-range :type integer) ;; dimensionality (non-zeros :accessor spv-non-zeros :type integer :initform 0) (initial-element :initform 0 :initarg :initial-element :accessor spv-initial-element))) (defclass list-indexed-sparse-vector (sparse-vector) ((index :initform nil :type list :accessor spv-index))) (defclass alist-sparse-vector (sparse-vector) ((entries :initform nil :type list))) (defstruct random-index (pluss nil) (minus nil)) ;;;(defun create-random-index (&key (size 50000) (count 10)) ;;; (let* ((range size) ;; (- size 1)) ;;; (neg (floor count 2)) ;;; (pos (- count neg)) ;;; (pluss) ;;; (minus)) ;;; (loop ;;; repeat pos ;;; for i = (random range) ;; i \in [0,range] ;;; do (push i pluss)) ;;; (loop ;;; with k = 0 ;;; while (< k neg) ;;; for i = (random range) ;;; unless (member i pluss :test #'=) ;;; do (push i minus) ;;; (incf k)) ;;; (make-random-index ;;; :pluss pluss :minus minus))) (defun create-random-index (&key (size 50000) (count 10) hashes key) (let* ((range size) ;; (- size 1)) (n (floor count 2)) (r (rem count 2)) (indexes (if hashes (mapcar #'(lambda (fn) (funcall fn key)) hashes) (loop repeat count collect (random range)))) (pluss (when (> count 1) (subseq indexes r (+ n r)))) (minus (when (> count 1) (subseq indexes (+ n r) (+ (* n 2) r)))) (r-sign (when (oddp count) (if (evenp (or (and hashes (funcall (first hashes) (first indexes))) (random 2))) 1 -1)))) (when r-sign (if (> r-sign 0) (push (first indexes) pluss) (push (first indexes) minus))) ;;; (push (first indexes) ;;; (if (> r-sign 0) pluss minus))) (make-random-index :pluss pluss :minus minus))) ;;;(defclass random-index () ;;; ((pluss :initform nil ;;; :initarg :pluss ;;; :accessor random-index-pluss ;;; :type list) ;;; (minus :initform nil ;;; :initarg :minus ;;; :accessor random-index-minus ;;; :type list))) (defmethod vsize ((vec array)) (length vec)) (defmethod vsize ((vec sparse-vector)) (+ (spv-range vec) 1)) (defmethod spv-list-index ((spv list-indexed-sparse-vector)) (spv-index spv)) (defmethod spv-list-index ((spv alist-sparse-vector)) (sort (mapcar #'car (spv-entries spv)) #'<)) (defmethod spv-list-index ((spv sparse-vector)) (loop for i being each hash-key in (spv-entries spv) collect i into list finally (return (sort list #'<)))) (defmethod print-object ((object sparse-vector) stream) (print-unreadable-object (object stream :type t :identity t) (format stream "#[non-zeros: ~d]" (spv-non-zeros object)))) (defmethod vref ((vec array) (i integer)) "Generalized aref() for vectors" (aref vec i)) (defmethod vref ((vec sparse-vector) (i integer)) "Like aref() for objects of type sparse-vector." (if (> i (spv-range vec)) (error "vref(): Index out of range (given ~a, size ~a).~%" i (vsize vec)) (gethash i (spv-entries vec) (spv-initial-element vec)))) (defmethod vref ((vec alist-sparse-vector) (i integer)) "Like aref() for objects of type sparse-vector." (if (> i (spv-range vec)) (error "vref(): Index out of range (given ~a, size ~a).~%" i (vsize vec)) (or (cdr (assoc i (spv-entries vec) :test #'=)) (spv-initial-element vec)))) (defmethod set-vref ((spv sparse-vector) i c) "SETF-function for VREF(). Handled correctly by INCF() / DECF()." (multiple-value-bind (val activep) (gethash i (spv-entries spv)) (declare (ignore val)) (cond ((= c (spv-initial-element spv)) (when activep (remhash i (spv-entries spv)) (decf (spv-non-zeros spv)))) (t (setf (gethash i (spv-entries spv)) c) (unless activep (incf (spv-non-zeros spv)))))) c) (defmethod set-vref ((spv list-indexed-sparse-vector) i c) (multiple-value-bind (val activep) (gethash i (spv-entries spv)) (declare (ignore val)) (cond ((= c (spv-initial-element spv)) (when activep (remhash i (spv-entries spv)) (setf (spv-index spv) (delete i (spv-index spv) :count 1)) (decf (spv-non-zeros spv)))) (t (setf (gethash i (spv-entries spv)) c) (unless activep (setf (spv-index spv) (insert-sorted i (spv-index spv))) (incf (spv-non-zeros spv)))))) c) (defmethod set-vref ((spv alist-sparse-vector) i c) (let ((activep (assoc i (spv-entries spv) :test #'=))) (cond ((= c (spv-initial-element spv)) (when activep (setf (spv-entries spv) (delete-if #'(lambda (pair) (= i (cdr pair))) (spv-entries spv))) (decf (spv-non-zeros spv)))) (activep (setf (get-field i (spv-entries spv)) c)) (t (setf (spv-entries spv) (acons i c (spv-entries spv))) (incf (spv-non-zeros spv))))) c) (defmethod set-vref ((vec array) i c) (setf (aref vec i) c)) (defsetf vref set-vref) (defmethod add-vector-into ((v1 sparse-vector) (v2 random-index)) (loop for i in (random-index-pluss v2) do (incf (vref v1 i))) (loop for i in (random-index-minus v2) do (decf (vref v1 i))) v1) (defmethod add-vector-into ((v1 sparse-vector) (v2 hashed-random-index)) (loop for code in (funcall (hashed-random-index-hash v2) (hashed-random-index-key v2)) for i from 1 if (evenp i) do (incf (vref v1 code)) else do (decf (vref v1 code))) v1) (defmethod add-vector-into ((v1 sparse-vector) (v2 sparse-vector)) (maphash #'(lambda (i2 val2) ;;(maphash seems faster than loop) (incf (vref v1 i2) val2)) (spv-entries v2))) (defmethod add-vector-into ((v1 sparse-vector) (v2 alist-sparse-vector)) (loop for (i2 . val2) in (spv-entries v2) do (incf (vref v1 i2) val2)) v1) (defmethod add-vectors-into ((v1 sparse-vector) &rest vn) (loop for v in vn do (add-vector-into v1 v)) v1) (defmethod scale-vector ((v array) c) (unless (= c 1) (loop for i from 0 to (- (length v) 1) for val across v unless (zerop val) do (setf (aref v i) (* c val)))) v) (defmethod scale-vector ((v sparse-vector) c) (unless (= c 1) (maphash #'(lambda (i val) (setf (vref v i) (* c val))) (spv-entries v))) v) (defmethod scale-vector ((v alist-sparse-vector) c) (unless (= c 1) (loop for (i . val) in (spv-entries v) do (setf (vref v i) (* c val)))) v) (defun make-vector (size &key (type :sparse) (initial-element 0)) (case type (:sparse (make-instance 'sparse-vector :range (- size 1) :initial-element initial-element)) (:a-sparse (make-instance 'alist-sparse-vector :range (- size 1) :initial-element initial-element)) (:list-indexed-sparse (make-instance 'list-indexed-sparse-vector :range (- size 1) :initial-element initial-element)) (:full (make-array size :initial-element initial-element)))) (defmethod copy-vector ((v sparse-vector)) (let ((new (make-vector (vsize v) :type :sparse :initial-element (spv-initial-element v)))) (maphash #'(lambda (i val) (setf (vref new i) val)) (spv-entries v))) v) (defmethod copy-vector ((v alist-sparse-vector)) (loop with new = (make-vector (vsize v) :type :a-sparse :initial-element (spv-initial-element v)) for (i . val) in (spv-entries v) do (setf (vref new i) val)) v) ;; fixme: plot the distributinal behaviour of random(). (defun randomize-vector (vec &optional (n 10)) (let* ((range (vsize vec)) ;;;(- (vsize vec) 1)) (neg (floor n 2)) (pos (- n neg))) (loop repeat pos for i = (random range) do (setf (vref vec i) 1)) (loop with k = 0 while (< k neg) for i = (random range) unless (= (vref vec i) 1) do (setf (vref vec i) -1) (incf k))) vec) ;;;(defun make-random-index (&key (size 50000) (count 10)) ;;; (randomize-vector ;;; (make-vector size :type (if (> count 10) :sparse :a-sparse)) ;;; count)) (defmethod print-random-vector ((v sparse-vector) &key (stream t) (format :mem) (scale *feature-random-index-scale*)) (let ((printer (case format ((:mem :rpm) #'(lambda (i) (write i :stream stream) (write-char #\Space stream) (if scale (write (* scale (vref v i)) :stream stream) (write (vref v i) :stream stream)) (write-char #\Space stream))) ((:svm :svmrank :perf) #'(lambda (i) (write (+ i 1) :stream stream) (write-char #\Colon stream) (if scale (write (* scale (vref v i)) :stream stream) (write (vref v i) :stream stream)) (write-char #\Space stream)))))) (if printer (mapc printer (spv-list-index v)) (when (eq format :s-exp) (write-char #\( stream) (loop for i in (spv-list-index v) for c = (vref v i) if (= c 1) collect i into pos else if (= c -1) collect i into neg finally (write (cons 1 pos) :stream stream) (write (cons -1 neg) :stream stream)) (write-char #\) stream)))) nil) (defmacro with-open-gzfile ((stream gzfilespec &rest args) &rest body) "Read / write to gzip'ed file." (let ((foo (gensym)) (gzfile (gensym)) (file (gensym)) ) `(let ((,foo ,gzfilespec)) (let* ((,gzfile (namestring (if (string= (pathname-type ,foo) "gz") ,foo (make-pathname :name (namestring ,foo) :type "gz")))) (,file (namestring (make-pathname :directory (pathname-directory ,gzfile) :name (pathname-name ,gzfile))))) (when (cl-fad:file-exists-p ,gzfile) (run-process (format nil "gunzip ~a" ,gzfile) :wait t)) (with-open-file (,stream ,file ,@args) ,@body) (when (cl-fad:file-exists-p ,file) (run-process (format nil "gzip -9 ~a" ,file) :wait t)))))) ;;; ;;;(with-open-gzfile (out "/path/foo" :direction :output :if-exists :supersede) ;;; (format out "~&hello world!~%")) ;;; ;;;(with-open-gzfile (stream "/path/foo.gz") ;;; (loop for line = (read-line stream nil nil) ;;; while line do (print line))) (defun export-random-indexing (model target &key file) (let ((path (or file (make-pathname :name "projection" :directory (find-tsdb-directory target))))) (with-open-gzfile (out path :direction :output :if-exists :supersede) (loop for ri across (model-random-indexes model) for code from 0 when ri do (write-char #\( out) (write code :stream out) (write (random-index-pluss ri) :stream out) (write (random-index-minus ri) :stream out) (write-char #\) out) (terpri out))))) (defun import-random-indexing (model file) ;;; &key (size *feature-random-index-size*) ;;; (count *feature-random-index-count*)) (with-open-gzfile (in file :direction :input) (loop with vecs = (model-random-indexes model) for i from 0 for nz = (read in nil nil) ;; alist of code-id + non-zeros for code = (first nz) for pos = (second nz) for neg = (third nz) ;;; for pos = (get-field 1 nz) ;;; for neg = (get-field -1 nz) ;;; for ri = (make-vector *feature-random-index-size* ;;; :type :sparse) for ri = (make-random-index :pluss pos :minus neg) ;;; for ri = (create-random-index :size size :count count) while nz ;;; do (mapc #'(lambda (j) (setf (vref ri j) 1)) pos) ;;; (mapc #'(lambda (j) (setf (vref ri j) -1)) neg) do (setf (aref vecs code) ri) finally (return vecs)))) (defmacro gridify (parameters &body body) (if (null (rest parameters)) `(dolist ,(first parameters) ,@body) `(dolist ,(first parameters) (gridify ,(rest parameters) ,@body)))) (defun initialize-random-indexing (model) (loop with indexes = (model-random-indexes model) for i from 0 below (model-size model) do (setf (aref indexes i) (create-random-index :size *feature-random-index-size* :count *feature-random-index-count*)) finally (return indexes))) (defun wipe-model-map (model) (setf (model-map model) (make-symbol-table :test #'eql))) ;;;(defmethod print-random-vector ((v sparse-vector) &key (stream t) (format :mem)) ;;; (loop ;;; fixme: add svm format support. (write-char #\Colon stream) ;;; with nz = (spv-index v) ;;; with nnz = (spv-non-zeros v) ;;; with c = 0 ;;; for i in nz ;;; for feat = (case format ;;; ((:mem :rpm) i) ;;; ((:svm :perf) (+ i 1))) ;;; do (write feat :stream stream) ;;; (case format ;;; ((:mem :rpm) (write-char #\Space stream)) ;;; ((:svm :perf) (write-char #\Colon stream))) ;;; (write (vref v i) :stream stream) ;;; (incf c) ;;; (unless (= c nnz) ;;; (write-char #\Space stream)))) ;;; fixme: need a list of non-zeros to avoid the sorting when printing! ;;(defparameter *universal-hash-k-bit* 32) ;; size of the set of keys ;;(defparameter *universal-hash-l-bit* 10) ;; size of the set of indexes ;;; returns a randomly initialized function from a family of universal ;;; (and multiplicative) hash functions: (defun orig-generate-hash-fn (&key (k-bit 32) (l-bit 8) verbosep constant) ;;; (expt 2 k-bit) = u = size of the universe (some large power of two, typically 2^k = 2^32. ;;; (expt 2 l-bit) = m = size of the set of indexes we're mapping "u" into, m = 2^l0. (labels ((random-constant () (let ((a (+ (random (- (expt 2 k-bit) 1)) 1))) (logior a 1)))) ;; inclusive OR ensures odd number. (let ((a (or constant (random-constant))) ;; a = an odd number where 0 < a < u. ;;; (pdiff (- (round (log (/ u m) 2)))) ;; = k-l (pdiff (- (- k-bit l-bit))) ;; negative sign to do a rightshift, see ash(). (sub1 (- (expt 2 k-bit) 1))) (when verbosep (format t "~&generate-hash-fn(): using random constant: ~a~%" a)) (values #'(lambda (x) ;; add 1 to x, otherwise always f(0)=0. (ash (logand (* (1+ x) a) sub1) pdiff)) a)))) ;; simplifying the original formulation: ;; ;; (truncate (mod (* a x) (expt 2 k-bit)) (expt 2 (- k-bit l-bit))) ;; ;; to ;; ;; (ash (logand (* x a) sub1) pdiff)) ;; ;; when dealing with modulo of powers of two, the modulo operation can ;; be implemented as x mod 2^n == x AND (2^n - 1) see ;; http://en.wikipedia.org/wiki/Modulo_operation#Performance_issues ;; ;; again assuming for u = 2^k and m = 2^l, then for an integer x, we ;; can find (truncate x (/ u m)) by using ash to shift the bits of x ;; by k-l positions to the right. ;; ;; one of the rationales is that odd numbers are relative primes to ;; powers of two. (defun orig-initialize-ri-hashes (model &optional constants);;'(1707063001 1844511583 1111529369 1240853219))) (unless (zerop (rem (log *feature-random-index-size* 2) 1)) (error "initialize-random-index-hashes(): size (~a) not a power of 2!~%" *feature-random-index-size*)) (let ((hashes) (constantz)) (dotimes (i *feature-random-index-count*) (multiple-value-bind (fn c) (generate-hash-fn :l-bit (round (log *feature-random-index-size* 2)) :verbosep t :constant (nth i constants)) (push fn hashes) (push c constantz))) (setf (model-random-indexes model) (pairlis (list :constants :hashes) (list constantz hashes))))) ;;;(defun initialize-ri-hashes (model &optional constants);;'(1707063001 1844511583 1111529369 1240853219))) ;;; (unless (zerop (rem (log *feature-random-index-size* 2) 1)) ;;; (error "initialize-random-index-hashes(): size (~a) not a power of 2!~%" ;;; *feature-random-index-size*)) ;;; ;;; (let ((hashes) ;;; (constantz)) ;;; (dotimes (i *feature-random-index-count*) ;;; (multiple-value-bind (fn c) ;;; (generate-hash-fn :l-bit (round (log *feature-random-index-size* 2)) ;;; :verbosep t :constant (nth i constants)) ;;; (push fn hashes) ;;; (push c constantz))) ;;; ;;; (setf (model-random-indexes model) ;;; (pairlis (list :constants :hashes) ;;; (list constantz hashes))))) ;;; NEW VERSION ;;; ----------------------- (defun generate-hash-fn (&key (k-bit *feature-universal-hash-keybit*) ;;32) (l-bit 8) verbosep constants (count 4)) ;;; (expt 2 k-bit) = u = size of the universe (typically 2^k = 2^32). ;;; (expt 2 l-bit) = m = size of indexes we're mapping "u" into, eg m = 2^l0. (labels ((random-constant () (let ((a (+ (random (- (expt 2 k-bit) 1)) 1))) (logior a 1)))) ;; inclusive OR ensures odd number. (let ((pdiff (- (- k-bit l-bit)));; neg. sign to do a rightshift, see ash() ;;;(pdiff (- (round (log (/ u m) 2)))) ;; = k-l (sub1 (- (expt 2 k-bit) 1)) (constants (copy-list constants))) (unless constants (loop ;; a = odd number a where 0 < a < u. until (= count (length constants)) do (pushnew (random-constant) constants))) (when verbosep (format t "~&generate-hash-fn(): using random constants: ~a~%" constants)) (values #'(lambda (x) (loop for a in constants ;;; add 1 to x, otherwise always f(0)=0. collect (ash (logand (* (1+ x) a) sub1) pdiff))) constants)))) (defun initialize-ri-hashes (model &optional (constants *feature-random-index-constants*)) (unless (zerop (rem (log *feature-random-index-size* 2) 1)) (error "initialize-random-index-hashes(): size (~a) not a power of 2!~%" *feature-random-index-size*)) (print model) (multiple-value-bind (hashes constants) (generate-hash-fn :verbosep t :constants constants :count *feature-random-index-count* :l-bit (round (log *feature-random-index-size* 2))) (setf (model-random-indexes model) (make-hashed-random-index :constants constants :hash hashes))))