(in-package :tsdb) (defparameter *connective-keys* (make-hash-table :test #'equal)) (defparameter *connectives* (let ((table (make-hash-table :test #'equal)) (file (logon-file "conll16/lisp" "train.connectives.lisp"))) (clrhash *connective-keys*) (with-open-file (stream file) (loop for connective = (read stream nil nil) while connective do (let* ((index (get-field :key connective)) (surface (get-field :surface connective)) (key (nth index (remove "..." surface :test #'string=)))) (push connective (gethash key *connective-keys*)) (setf (gethash surface table) connective)))) table)) (defun pdtb-import-and-score (input &optional output) (let* ((file (format nil "~a/.io.~a.~a" (tmp) (current-user) (current-pid))) (profile (format nil "test.~a.~a" (current-user) (current-pid))) (defaults (logon-file "models" "defaults.lisp")) (features (logon-file "models" "features.lisp")) (model (logon-file "models" "train.svm")) (output (if (stringp output) (parse-namestring output) "/tmp/ecc.json"))) (unless (probe-file model) (error "pdtb-import-and-score(): missing ‘~a’." (namestring model))) (setf model (read-model model)) (unless (probe-file defaults) (error "pdtb-import-and-score(): unable to load ‘~a’." (namestring defaults))) (load defaults) (unless (probe-file features) (error "pdtb-import-and-score(): unable to load ‘~a’." (namestring features))) (load features) (funcall (intern "CREATE-PROFILE" :sdp) (funcall (intern "READ-DOCUMENTS" :sdp) input) :output file) (unless (consp (do-import-items file profile :format :lisp)) (error "pdtb-import-and-score(): failed to import ‘~a’." (namestring file))) (delete-file file) (purge-test-run profile :action :purge) (operate-on-profiles (list profile) :model model :type :svm :target profile :task :classify :cached-test-p nil) (purge-profile-cache profile) (pdtb-postprocess-profile profile :stream output))) (defun pdtb-item-enhancer (item) (let ((enhancers (get-field :enhancers item))) (when (member :pdtb enhancers :test #'eq) (return-from pdtb-item-enhancer)) (set-field :enhancers (cons :pdtb enhancers) item)) (let ((tokens (get-field :i-tokens item)) (tree (get-field :tree item)) (dependencies (get-field :dependencies item))) (labels ((link (tree) (let ((start (nth (get-field :start tree) tokens)) (end (nth (- (get-field :end tree) 1) tokens))) (unless (get-field :from start) (nconc start (acons :from nil nil))) (unless (get-field :to end) (nconc end (acons :to nil nil))) (push tree (get-field :from start)) (push tree (get-field :to end))) tree) (transform (tree &optional (start 0)) (if (atom tree) (pairlis '(:label :start :end) (list tree start (+ start 1))) (let* ((children (loop for child in (rest tree) for i = start then (get-field :end node) for node = (transform child i) collect node)) (label (first tree)) (end (get-field :end (first (last children)))) (node (pairlis '(:label :start :end :children) (list label start end children)))) (loop for child in children do (nconc child (acons :parent node nil))) (link node))))) (when (stringp tree) (let* ((tree (read-ptb-from-string tree)) (tree (transform (if (listp (first tree)) (first tree) tree)))) (set-field :tree tree item)))) (when dependencies (loop for node in dependencies for id = (get-field :id node) for out = (loop for (symbol . target) in (get-field :out node) for label = (intern symbol :keyword) for token = (nth (- target 1) tokens) when (zerop id) do (set-field :in (list (cons label nil)) token) else collect (cons label token)) when out do (set-field :out out (nth (- id 1) tokens)) finally (loop for token in tokens do (loop for (symbol . target) in (get-field :out token) for label = (intern symbol :keyword) unless (get-field :in target) do (nconc target (acons :in nil nil)) do (push (cons label token) (get-field :in target))))) (set-field :dependencies nil item)) ;; ;; we need to enrich all tokens prior to matching against connectives, ;; as we may need to look both backwards and forwards ;; (loop for token in tokens for form = (get-field :form token) for tag = (get-field :tag token) for stem = (if (or (string= tag "NNP") (string= tag "NNPS")) form (string-downcase form)) for base = (if (or (string= tag "NNP") (string= tag "NNPS")) (string-downcase form) stem) do (nconc token (pairlis '(:stem :base) (list stem base)))) (loop for i from 0 for token in tokens do (let* ((candidates (match-connectives tokens i)) (candidates (sort candidates #'> :key (lambda (foo) (get-field :n foo))))) (when candidates ;; ;; while post-classification deterministically picks the first ;; candidate anyway, it (arguably) seems cleaner to not generate ;; features for other candidates, i.e. present to the classifier ;; only the information pertaining to what would be the eventual ;; outcome at the connective level. ;; (setf (rest candidates) nil) (nconc token (acons :candidates candidates nil)) (labels ((path (token) (let ((in (first (get-field :in token)))) (when in (cons (first in) (path (rest in))))))) (let* ((dependencies (path token))) (when dependencies (nconc token (acons :dependencies dependencies nil))))))))) item) (defun match-connectives (tokens index) (labels ((match (affix strings offsets) (cond ((null affix) offsets) ((null strings) t) ((string= (first affix) "...") (let* ((string (second affix)) (next (and string (position string strings :test #'string=)))) (if next (match (rest (rest affix)) (subseq strings (+ next 1)) (cons (+ (or (first offsets) 0) next 1) offsets)) t))) (t (if (string= (first affix) (first strings)) (match (rest affix) (rest strings) (cons (+ (or (first offsets) 0) 1) offsets)) t))))) (loop with token = (nth index tokens) with form = (get-field :base token) ;; ;; extract the prefix (preceding the key), in reverse order, and the ;; suffix following the key, for matching against the remainder of the ;; sequence of surface tokens for each connective ;; with prefix = (loop for j from (- index 1) downto 0 for form = (get-field :base (nth j tokens)) collect form) with suffix = (loop for j from (+ index 1) to (- (length tokens) 1) for form = (get-field :base (nth j tokens)) collect form) for connective in (gethash form *connective-keys*) for key = (get-field :key connective) for surface = (get-field :surface connective) for left = (match (reverse (subseq surface 0 key)) prefix nil) for right = (and (listp left) (match (subseq surface (+ key 1)) suffix nil)) when (and (listp left) (listp right)) collect (let* ((left (loop for foo in left collect (- foo))) (heads (get-field :heads connective)) (start (nth (- index (- key (first heads))) tokens)) (end (nth (- index (- key (first (last heads)))) tokens)) (self (first (intersect (get-field :from start) (get-field :to end)))) (path (labels ((path (node) (let ((label (get-field :label node)) (parent (get-field :parent node))) (if parent (cons label (path parent)) (list label))))) (path self)))) (nconc (acons :offsets (cons left (nreverse right)) nil) (when self (acons :self self nil)) (when path (acons :path path nil)) connective))))) (defun pdtb-item-expander (item) (loop with connectives with tokens = (get-field :i-tokens item) with id = 0 for token in tokens for class = (get-field :class token) for candidates = (get-field :candidates token) when (and candidates (numberp class) (> class 0)) do (let* ((candidate (first candidates)) (key (get-field :key candidate)) (heads (get-field :heads candidate)) (offsets (get-field :offsets candidate)) (prefix (first offsets)) (suffix (rest offsets)) (indices (loop with base = (get-field :id token) for i in heads collect (+ base (cond ((= i key) 0) ((< i key) (nth i prefix)) ((> i key) (nth (- i 1) suffix)))))) (tokens (loop for index in indices collect (nth index tokens))) (span (pdtb-tokens-span tokens)) (connective (pairlis '(:id :span :indices) (list id span indices)))) (push connective connectives) (incf id)) finally ;; ;; given our token-based classification set-up, we need to protect ;; against overlapping connectives after expansion; for now, prefer ;; longer connective token spans over shorter ones. ;; (let* ((connectives (sort connectives #'< :key #'(lambda (foo) (length (get-field :indices foo))))) (connectives (loop for connectives on connectives for connective = (first connectives) unless (member connective (rest connectives) :test #'(lambda (foo bar) (intersect (get-field :indices foo) (get-field :indices bar)))) collect connective))) (set-field :connectives connectives item)))) (defun pdtb-postprocess-profile (data &key (stream t) (idp t) padp fillp) (if (or (stringp stream) (pathnamep stream)) (with-open-file (stream stream :direction :output :if-exists :supersede) (pdtb-postprocess-profile data :stream stream :idp idp :padp padp :fillp fillp)) (loop with items = (if (consp data) data (analyze data :tokensp t :commentp t :score data)) with id = 0 with i = 0 with n = 0 with origin = (get-field :i-origin (first items)) for item in items for tokens = (get-field :i-tokens item) for scores = (pdtb-postprocess-scores item) do (unless (string= origin (get-field :i-origin item)) (setf origin (get-field :i-origin item)) (setf i 0) (setf n 0)) (loop ;; ;; _fix_me_ ;; we still need to work out our strategy for discontinuous spans ;; for score in scores for indices = (get-field :indices score) do (format stream "{\"DocID\": ~s~ ~@[, \"ID\": ~d~]~@[, \"SentenceID\": ~d~],~:[~;~%~] ~ \"Connective\": ~ {\"TokenList\": [~{~a~^, ~}], \"RawText\": ~s},~:[~;~%~] ~ \"Type\": \"Explicit\"~ ~:[~;,~:[~;~%~] \"Arg1\": {\"TokenList\": []}, ~ \"Arg2\": {\"TokenList\": []},~:[~;~%~] ~ \"Sense\": [\"Expansion.Restatement\"]~]}~%" (string-downcase (get-field :i-origin item)) (and idp id) (and idp i) fillp (loop for index in indices collect (+ index n)) (format nil "~{~a~^ ~}" (loop for index in indices collect (get-field :form (nth index tokens)))) fillp padp fillp fillp) (incf id)) (incf i) (incf n (length tokens))))) (defun pdtb-pprint (data &key (stream t) goldp systemp allp) (if (stringp stream) (with-open-file (stream stream :direction :output :if-exists :supersede) (pdtb-pprint data :stream stream :goldp goldp :systemp systemp :allp allp)) (loop with items = (if (consp data) data (analyze data :tokensp t :commentp t :score data)) for item in items for tokens = (get-field :i-tokens item) for connectives = (get-field :connectives item) for scores = (pdtb-postprocess-scores item) for starts = (make-hash-table :test #'eql) for ends = (make-hash-table :test #'eql) when goldp do (loop for connective in connectives for indices = (loop with indices = (get-field :indices connective) for head in (get-field :heads connective) collect (nth head indices)) do (loop for indices on indices for index = (first indices) for previous = nil then index for next = (first (rest indices)) for token = (nth index tokens) when (or (null previous) (> index (+ previous 1))) do (push :gold (gethash (get-field :start token) starts)) when (or (null next) (> next (+ index 1))) do (push :gold (gethash (get-field :end token) ends)))) when systemp do (loop for score in scores for indices = (get-field :indices score) do (loop for indices on indices for index = (first indices) for previous = nil then index for next = (first (rest indices)) for token = (nth index tokens) when (or (null previous) (> index (+ previous 1))) do (push :system (gethash (get-field :start token) starts)) when (or (null next) (> next (+ index 1))) do (push :system (gethash (get-field :end token) ends)))) (when (or allp (or (loop for start being each hash-value in starts thereis (not (= (count :gold start) (count :system start)))) (loop for end being each hash-value in ends thereis (not (= (count :gold end) (count :system end)))))) (format stream "[~a]" (get-field :i-id item)) (loop with left = (if goldp #\[ #\{) with right = (if goldp #\] #\}) for token in tokens for start = (gethash (get-field :start token) starts) for end = (gethash (get-field :end token) ends) do (format stream " ~:[~;{~]~@[~c~]~a~@[~c~]~:[~;}~]" (member :gold start :test #'eq) (and (member :system start :test #'eq) left) (get-field :form token) (and (member :system end :test #'eq) right) (member :gold end :test #'eq))) (terpri stream))))) (defun pdtb-postprocess-scores (item) (let ((index (make-hash-table :test #'eql))) (loop for score in (get-field :classes item) when (string= (get-field :learner score) "cue") do (push score (gethash (get-field :score-id score) index))) (sort (loop for scores being each hash-value in index for id = (get-field :score-id (first scores)) for span = (loop for score in scores append (list (get-field :score-start score) (get-field :score-end score))) for indices = (loop for token in (pdtb-span-tokens span (get-field :i-tokens item)) collect (get-field :id token)) collect (pairlis '(:id :span :indices) (list id span indices))) #'< :key #'(lambda (foo) (first (get-field :span foo)))))) (defun pdtb-span-tokens (span tokens) (when span (append (loop for start = (first span) for end = (second span) for token in tokens when (and (<= start (get-field :start token)) (>= end (get-field :end token))) collect token) (pdtb-span-tokens (rest (rest span)) tokens)))) (defun pdtb-tokens-span (tokens) (append (list (get-field :start (first tokens))) (loop for previous = (first tokens) then token for token in (rest tokens) unless (= (get-field :id token) (+ (get-field :id previous) 1)) collect (get-field :end previous) and collect (get-field :start token)) (list (get-field :end (first (last tokens)))))) (defun intersect (set1 set2 &key (test #'eql) key) ;; ;; much like intersection(), except guarantee that all elements returned are ;; taken from .set1. and preserve their original order. ;; (loop for foo in set1 when (member (if key (funcall key foo) foo) set2 :key key :test test) collect foo))