;;; Common user-fns file for all ESSLLI grammars

(in-package :lkb)

;;;
;;; determine surface order of constituents in rule: returns list of paths into
;;; feature structure of rule, i.e. (nil (args first) (args rest first)) for a
;;; binary rule, where the first list element is the path to the mother node of
;;; the rule.
;;;
(defun establish-linear-precedence (rule)
  (let ((daughters
         (loop
             for args = (existing-dag-at-end-of rule '(args))
             then (existing-dag-at-end-of args *list-tail*)
             for daughter = (when args 
                              (get-value-at-end-of args *list-head*))
             for path = (list 'args) then (append path *list-tail*)
             while (and daughter (not (eq daughter 'no-way-through)))
             collect (append path *list-head*))))
    (if (null daughters)
      (cerror "Ignore it" "Rule without daughters")
      (cons nil daughters))))

;;;
;;; detect rules that have orthographemic variation associated to them; those
;;; who do should only be applied within the morphology system; this version is
;;; a little complicated because we change from a full-form set-up to one with
;;; on-line morphology during the course.
;;;
(defun spelling-change-rule-p (rule)
  (rule-orthographemicp rule))

(defun redundancy-rule-p (rule)
;;; a function which is used to prevent the parser 
;;; trying to apply a rule which is only used
;;; as a redundancy rule 
  (declare (ignore rule))
  nil)
             

;;; return true for types that shouldn't be displayed in type hierarchy
;;; window. None of their descendents (if any) will be displayed either

(defun hide-in-type-hierarchy-p (type-name)
  (and (symbolp type-name)
       (search "GLBTYPE" (symbol-name type-name))))

;;;
;;; create feature structure representation of orthography value for insertion
;;; into the output structure of inflectional rules; somewhat more complicated
;;; than one might expect because of treatment for multi-word elements.
;;;
(defun make-orth-tdfs (orthography)
  (let* ((unifications
          (loop 
              for token in (split-into-words orthography)
              for path = *orth-path* then (append path *list-tail*)
              for opath = (create-path-from-feature-list 
                           (append path *list-head*))
              collect (make-unification :lhs opath                    
                                        :rhs (make-u-value :type token))))
         (indef (process-unifications unifications)))
    (when indef
      (make-tdfs :indef (create-wffs indef)))))


;;;
;;; characters dropped in preprocessing
;;; can be found out by typing in Lisp after #\!
;;;

;;; (ERB 2001-12-20) Colons are meaningful and we
;;; want to keep them!

(defparameter *punctuation-characters*
  '(#\space #\! #\" #\& #\' #\(
    #\) #\* #\+ #\, #\- #\. #\/ #\;
    #\< #\= #\> #\? #\@ #\[ #\\ #\] #\^
    #\_ #\` #\{ #\| #\} #\~
    #\ideographic_full_stop #\fullwidth_question_mark 
    #\horizontal_ellipsis #\fullwidth_full_stop
    #\fullwidth_exclamation_mark 
    #\ideographic_space #\minus_sign
    #\white_circle #\black_circle
    #\white_star #\black_star
    #\white_diamond #\black_diamond))
;    #\katakana_middle_dot 


(defun punctuationp (thing)
  (let ((string (string thing)))
    (loop
        for c across string
        always (member c *punctuation-characters*))))
  
(defun alphanumeric-or-extended-p (c)
  (and (graphic-char-p c) (not (member c *punctuation-characters*))))

;;;
;;; to generate `punctuation-characters' value for `japanese.set'
;;;
#+:null
(loop for c in *punctuation-characters* do (format t "~a" c))

(defun normalize-sentence-string (string)
  (loop
      with padding = 128
      with length = (+ (length string) padding)
      with result = (make-array length
                                :element-type 'character
                                :adjustable nil :fill-pointer 0)
      with space = t
      for c across string
      when (or (member c '(#\Space #\Newline #\Tab))
               (not (alphanumeric-or-extended-p c))) do
        (when space (incf padding))
        (unless space
          (vector-push #\Space result)
          (setf space :space))
      else do
        (vector-push c result)
        (setf space nil)
      finally
        (when (and (eq space :space) (not (zerop (fill-pointer result))))
          (decf (fill-pointer result)))
        (return result)))

;;;
;;; Call chasen and format the input appropriately
;;;
(setf *preprocessor* nil)

(defparameter *chasen-application* "chasen")

(defparameter *chasen-debug-p* t)

(defparameter *chasen-readings* nil)

(defun chasen-preprocess-sentence-string (string
					  &key (verbose *chasen-debug-p*) posp)
  
  (let* ((string (string-trim '(#\space #\tab) string))
         #+:logon
         (logon (system:getenv "LOGONROOT"))
         ;;
         ;; in the self-contained LOGON philosophy, we assume people want to
         ;; use the ChaSen installation included in the tree (using UTF-8).
         (command (format 
                   nil 
                   "~a~@[ -r '~a'~] -i ~a -F ~
                    '(\"%m\" \"%M\" \"%P-+%Tn-%Fn\" \"%y\")\\n'" 
                   *chasen-application*
                   #+:logon (and logon (format nil "~a/.chasenrc" logon))
                   #-:logon nil
                   #+:logon "w" #-:logon "e")))
    (setf *chasen-readings* nil)
    (multiple-value-bind (stream foo pid)
        (run-process
         command :wait nil
         :output :stream :if-output-exists :append 
         :input :stream :error-output nil)
      (declare (ignore foo #-:allegro pid))
      ;;
      ;; while we assert ChaSen to operate in EUC-JP mode, enforce the encoding
      ;; on the stream talking to the sub-process.  in the LOGON universe, on
      ;; the other hand, we control which dictionaries are in use, so there we
      ;; opt for modern UTF-8.
      ;;
      #+(and :allegro-version>= (version>= 6 0))
      (setf (stream-external-format stream)
        (excl:find-external-format #+:logon :utf-8 #-:logon :euc))
    
      (format stream "~a~%" string)
      (let* ((analyses (loop
                           for form = (read stream nil :eof)
                           until (or (eq form :eof) 
                                     (and (symbolp form)
                                          (eq (intern form :keyword) :eos)))
                           collect form))
	     (length 0)
             full)
        (close stream)
        #+:allegro 
        (loop for i from 0 while (< i 500) until (sys:os-wait nil pid))
        (loop
            initially (when verbose (format t "~&~%ChaSen output:~%~%"))
            with i = 0
            with id = 0
            for analysis in analyses
            for form = (first analysis)
	    for yy = (when *preprocessor*
		       (lkb::preprocess
			form :globalp nil :format :list :verbose nil))
	    ;; FCB change with preprocessor ---  (lkb::preprocess form))
            for pos = (third analysis)
            when verbose do
              (format 
               t 
               "  form: `~a'; stem: `~a'; analysis: `~a' ; reading : `~a'~%"
               form (second analysis) pos (fourth analysis))
            unless (punctuationp form) do
	      (incf length)
	      (if yy
		(loop
		    with start = i with end = (incf i)
		    initially (setf (first analysis) (fourth (first yy)))
		    for foo in yy
		    do
		      (push 
		       (format 
			nil 
			"(~d, ~d, ~d, 1, \"~a\" \"~a\", ~
                         0, \"null\", \"~a\" 1.0)" 
			(incf id) start end
			(fourth foo) (fifth foo) pos)
		       full))
		(push
		 (format 
		  nil 
		  "(~d, ~d, ~d, 1, \"~a\" \"~a\", 0, \"null\", \"~a\" 1.0)" 
		  (incf id)
		  i (incf i) form (second analysis) pos)
		 full))
	    ;; collects readings
	    unless (punctuationp form) do
	      (push (fourth analysis) *chasen-readings*)
            finally
	      (setf *chasen-readings* (reverse *chasen-readings*))
	      (when verbose (format t "~%")))
        (values
         (if posp
           (format nil "~{~a~^ ~}" (reverse full))
           (normalize-sentence-string
            (format 
             nil 
             "~{~a~^ ~}" 
             (loop for analysis in analyses collect (first analysis)))))
         length)))))

;;;
;;; Possibly use chasen to process the string
;;;

(defun preprocess-sentence-string (string &key (verbose *chasen-debug-p*) posp)
  (if (find :chasen *features*)
    (chasen-preprocess-sentence-string string :verbose verbose :posp posp)
    (if *preprocessor*
      (preprocess string :format :lkb :verbose nil)
      ;; FCB change with new preprocessor (preprocess string)
      (normalize-sentence-string (string-trim '(#\space #\tab) string)))))


;;;
;;; hook for [incr tsdb()] to call when preprocessing input (going to the PET
;;; parser or when counting `words' while importing test items from a file).
;;;
(defun chasen-preprocess-for-pet (input &optional tagger)
  (declare (ignore tagger))
  (chasen-preprocess-sentence-string input :verbose nil :posp t))

#+:null
(setf tsdb::*tsdb-preprocessing-hook* "lkb::preprocess-sentence-string")


;;;
;;; used in lexicon compilation for systems like PET and CHiC: when we compile
;;; out the morphology, there is no point in outputting uninflected entries for
;;; systems that have no on-line morphology.  also used in [incr tsdb()] for
;;; counting of `words'.
;;;
(defun dag-inflected-p (dag)           
  (let* ((key (existing-dag-at-end-of dag '(infl)))
         (type (and (dag-p key) (dag-type key))))
    (when type
      (or (eq type 'plus) (and (consp type) (eq (first type) 'plus))))))


;;
;; called in /lkb/src/io-paths/lexinput.lsp;lkb/src/io-paths/typeinput.lsp
;;
(defun set-temporary-lexicon-filenames nil
  (let* ((version (or (find-symbol "*GRAMMAR-VERSION*" :common-lisp-user)
                      (and (find-package :lkb)
                           (find-symbol "*GRAMMAR-VERSION*" :lkb))))
         (prefix
          (if (and version (boundp version))
            (remove-if-not #'alphanumericp (symbol-value version))
            "biglex")))
    (setf *psorts-temp-file* 
      (make-pathname :name (concatenate 'string prefix ".lex")
                     :directory (pathname-directory (lkb-tmp-dir))))
    (setf *psorts-temp-index-file* 
      (make-pathname :name (concatenate 'string prefix ".idx") 
                     :directory (pathname-directory (lkb-tmp-dir))))
    (setf *leaf-temp-file* 
      (make-pathname :name (concatenate 'string prefix ".lts")
                     :directory (pathname-directory (lkb-tmp-dir))))
    (setf *predicates-temp-file* 
      (make-pathname :name (concatenate 'string prefix ".ric")
		     :host (pathname-host (lkb-tmp-dir))
		     :device (pathname-device (lkb-tmp-dir))
                     :directory (pathname-directory (lkb-tmp-dir))))
    (setf *semantics-temp-file* 
      (make-pathname :name (concatenate 'string prefix ".stc")
		     :host (pathname-host (lkb-tmp-dir))
		     :device (pathname-device (lkb-tmp-dir))
                     :directory (pathname-directory (lkb-tmp-dir))))))


(defun gen-extract-surface (edge)
  (format nil "~{~a~^ ~}" (g-edge-leaves edge)))

(eval-when #+:ansi-eval-when (:load-toplevel :compile-toplevel :execute)
	   #-:ansi-eval-when (load eval compile)
  (setf *gen-extract-surface-hook* 'gen-extract-surface))


(defun bool-value-true (fs)
  (and fs
       (let ((fs-type (type-of-fs fs)))
         (eql fs-type '+))))
  
(defun bool-value-false (fs)
  (and fs
       (let ((fs-type (type-of-fs fs)))
         (eql fs-type '-))))


;;;
;;; Generate unknown words from the CARG
;;;
(defun instantiate-generic-lexical-entry (gle surface &optional (carg surface))
  (let ((tdfs (copy-tdfs-elements
               (lex-entry-full-fs (if (gle-p gle) (gle-le gle) gle)))))
    (loop
        with dag = (tdfs-indef tdfs)
        for path in '((STEM FIRST) (SYNSEM LKEYS KEYREL CARG))
        for foo = (existing-dag-at-end-of dag path)
        do (setf (dag-type foo) *string-type*))
    (let* ((surface(or
                    #+:logon
                    (case (gle-id gle)
                      (guess_n_gle 
                       (format nil "/~a/" surface))
                      (decade_gle
                       (format nil "~as" surface)))
                    surface))
           (unifications
            (list 
             (make-unification
              :lhs (create-path-from-feature-list
                    (append *orth-path* *list-head*))
              :rhs (make-u-value :type surface))
             (make-unification
              :lhs (create-path-from-feature-list
                    (append *orth-path* *list-tail*))
              :rhs (make-u-value :type *empty-list-type*))
             (make-unification
              :lhs (create-path-from-feature-list '(SYNSEM LKEYS KEYREL CARG))
              :rhs (make-u-value :type carg))))
           (indef (process-unifications unifications))
           (indef (and indef (create-wffs indef)))
           (overlay (and indef (make-tdfs :indef indef))))
      (values
       (when overlay
        (with-unification-context (ignore)
          (let ((foo (yadu tdfs overlay)))
            (when foo (copy-tdfs-elements foo)))))
       surface))))


;;;
;;; Idiom Implementation (CH 060804)
;;;
;;; try a new approach to post-parsing filtering of idioms, building on the new
;;; MRS transfer machinery.  essentially, the idiom phrases have been recast as
;;; MRS transfer rules (MTRs), each of them matching an idiom configuration 
;;; and replacing the idiomatic parts of the MRS with a synthesized relation 
;;; (or nothing, for the time being).  post-transfer, the filter can then just
;;; require that no idiomatic relation remain.  (20-feb-05; dan & oe phx - sfo)
;;;
(defun idiom-complete-p (tdfs)
  (let* ((mrs (and (tdfs-p tdfs)
                   (mrs::extract-mrs-from-fs (tdfs-indef tdfs))))
         (transfers (and (mrs::psoa-p mrs)
                         (mt:transfer-mrs mrs :task :idiom))))
    (loop
        for transfer in transfers
        for mrs = (mt::edge-mrs transfer)
        thereis (loop
                    for ep in (mrs:psoa-liszt mrs)
                    when (idiom-rel-p ep) return nil
                    finally (return t)))))

(eval-when #+:ansi-eval-when (:load-toplevel :execute)
	   #-:ansi-eval-when (load eval)
  (setf *additional-root-condition* #'idiom-complete-p))