;;; Copyright (c) 1991-2021 ;;; John Carroll, Ann Copestake, Robert Malouf, Stephan Oepen ;;; see `LICENSE' for conditions. (in-package :lkb) (defun clear-temporary-data () ;; Clear global data structures, temporary dags and caches used in parsing and ;; generation - in both packing and non-packing modes. Do them all in case we've ;; switched processing types/modes and the other's data is still around. ;; ;; NB [incr tsdb()] defines a similar (although less thorough) function ;; release-temporary-storage ;; (clear-chart) (clear-gen-chart) #+:pooling (reset-pools :compressp t) (with-unification-context (ignore) (compress-lexicon *lexicon*) (maphash #'(lambda (name type) (declare (ignore name)) (let ((c (ltype-constraint type))) (when (and c ; in case this is a leaf type that's been unexpanded (or (ltype-constraint-mark type) (dag-x-new-type/forward c) (dag-x-comp-arcs c) (dag-x-copy c) (dag-x-visit-slot c))) (compress-dag c))) (setf (ltype-constraint-mark type) nil)) *types*) (loop for rule-set in (list (get-indexed-lrules nil) (get-indexed-rules nil)) do (loop for rule in rule-set do (compress-dag (tdfs-indef (rule-full-fs rule))) (compress-dag (tdfs-indef (rule-rtdfs rule))))) (loop for root in *root-entries* for tdfs = (psort-full-fs (cdr root)) do (compress-dag (tdfs-indef tdfs)))) (setf (symbol-value (intern "*NAMED-NODES*" :mrs)) nil)) ;;; Parsing sentences from file (when the fine system is just too much ...) (defun parse-sentences (&optional input-file (output-file nil output-file-p) &rest rest) ;; if output-file is nil or t, then output is suppressed or goes to standard-output, ;; respectively (unless input-file (setq input-file (or (ask-user-for-existing-pathname "Sentence file?") (return-from parse-sentences)))) (cond ((not (probe-file input-file)) (show-message-window (format nil "Input file `~a' does not exist" input-file)) (return-from parse-sentences)) ((file-xml-p input-file) ; if xml input, assume SAF XML (return-from parse-sentences (apply #'process-saf-file-sentences input-file rest))) (t (setq input-file (truename input-file)))) ;; (unless output-file-p (setq output-file (or (ask-user-for-new-pathname "Output file?" input-file) (return-from parse-sentences)))) (setq output-file (cond ((member output-file '(nil t)) output-file) ((probe-file output-file)) (t (merge-pathnames output-file)))) (when (equal output-file input-file) (show-message-window (format nil "Not permitted to overwrite the input file `~a'" input-file)) (return-from parse-sentences)) ;; (with-open-file (istream input-file :direction :input) (if (member output-file '(nil t)) (batch-parse-sentences istream output-file #'extract-fine-system-sentence) (with-open-file (ostream output-file :direction :output :if-exists :supersede :if-does-not-exist :create) (batch-parse-sentences istream ostream #'extract-fine-system-sentence))))) (defparameter *do-something-with-parse* nil) (defparameter *lex-ids-used* nil) (defparameter *parse-input* nil) (defparameter *ostream* nil) (defun batch-parse-sentences (istream ostream &optional access-fn &aux #+(and :sbcl :64-bit) (gc-bytes (sb-ext:bytes-consed-between-gcs))) ;; if *do-something-with-parse* is bound to a function, then this is called after ;; parsing each sentence; otherwise the value of ostream determines what is output: ;; either nil (only final summary, to standard output), t (progress and summary to ;; standard output), or a stream (ditto to the stream) ;; (setq *unanalysed-tokens* nil) (setq *lex-ids-used* nil) (unexpand-leaf-types) ; also does clear-expanded-lex, clear-type-cache (clear-temporary-data) (let ((nsent 0) (edge-total 0) (parse-total 0) (*print-right-margin* 300) ; in case *print-pretty* is true raw-sentence (start-time (get-internal-run-time))) (format t "~%;;; Parsing test file~%") (force-output) (unwind-protect (progn #+(and :sbcl :64-bit) (progn (setf (sb-ext:bytes-consed-between-gcs) (* 8 (expt 2 30))) ; 8GB allocation before gc triggered automatically (sb-ext:gc :full t)) ; tenure everything so generation 0 starts empty ;; start timing here to omit initial GC which we shouldn't be accountable for (setq start-time (get-internal-run-time)) (loop (setq raw-sentence (read-line istream nil 'eof)) (when (eq raw-sentence 'eof) (return)) #+(and :sbcl :64-bit) ; >2GB allocated since last GC? (when (> (sb-ext:generation-bytes-allocated 0) (* 2 (expt 2 30))) (clear-temporary-data) (sb-ext:gc) (when (> (sb-ext:generation-bytes-allocated 0) (* 1/2 (expt 2 30))) ; in trouble? (sb-ext:gc :full t))) (let ((sentence (string-trim '(#\Space #\Tab #\Newline) (if access-fn (funcall access-fn raw-sentence) raw-sentence)))) (unless (or (zerop (length sentence)) (char= (char sentence 0) #\;)) (incf nsent) (unless (fboundp *do-something-with-parse*) ;; if we're doing something else, let that function control output (when ostream (format ostream "~&~A ~A " nsent sentence) (force-output (if (eq ostream t) nil ostream)))) (#-:gdebug handler-case #+:gdebug progn (let ((parse-input (split-into-words (if (fboundp 'preprocess-sentence-string) (funcall (symbol-function 'preprocess-sentence-string) sentence) sentence))) ;; ask for recycling of safe dags from the pool ;; NB lexical entries must not contain safe dags - so expand-psort-entry ;; and friends must rebind *safe-not-to-copy-p* to nil #+:pooling (*dag-recycling-p* t)) #+:pooling (reset-pools #+:gdebug :forcep #+:gdebug t) (parse parse-input nil) (setf *sentence-number* nsent) (setf *sentence* sentence) (setf *parse-input* parse-input) (setf *ostream* ostream) (if (fboundp *do-something-with-parse*) (funcall *do-something-with-parse*) (when ostream (format ostream "~A ~A~%" (length *parse-record*) *edge-id*)))) #-:gdebug (storage-condition (condition) (format t "~&Memory allocation problem: ~A caused by ~A~%" condition raw-sentence)) #+(and (not :gdebug) (or :allegro :sbcl :ccl)) (#+:allegro excl:interrupt-signal #+:sbcl sb-sys:interactive-interrupt #+:ccl ccl:interrupt-signal-condition () (error "Interrupt signalled")) #-:gdebug (error (condition) (format t "~&Error: ~A caused by ~A~%" condition raw-sentence))) (incf parse-total (length *parse-record*)) (incf edge-total *edge-id*))))) (setq *lex-ids-used* (collect-expanded-lex-ids *lexicon*)) (unexpand-leaf-types) (clear-temporary-data) #+(and :sbcl :64-bit) (progn (setf (sb-ext:bytes-consed-between-gcs) gc-bytes) (sb-ext:gc :full t))) ; release unused memory, shrinking process size (unless (or (fboundp *do-something-with-parse*) (zerop nsent)) (let ((out (or ostream t))) (format out "~&~%;;; Total CPU time: ~A msecs~%" (round (* (- (get-internal-run-time) start-time) 1000) internal-time-units-per-second)) (format out ";;; Mean edges: ~,2F~%" (/ edge-total nsent)) (format out ";;; Mean parses: ~,2F~%" (/ parse-total nsent)))) (format t "~%;;; Finished test file~%") (finish-output) (lkb-beep) nil))