;;; -*- Mode: tdl; Coding: utf-8; -*-


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; at this point, we multiply out PoS values on all tokens, where for each
;;; original token as many additional tokens are created (in the same chart
;;; cell) as there are PoS readings.  at this point, we start distinguishing
;;; between tokens that activate native lexical entries (LEs), vs. those that
;;; activate generic LEs.  in the token universe, this distinction is made by
;;; virtue of +TRAIT, with generic_trait targeting generic LEs.  the two sets
;;; do not overlap, i.e. for a single original token with two PoS readings, we
;;; end up with a total of three new tokens.  the pair of rules below resembles
;;; a recursive function, terminating once the PoS list has been reduced to 
;;; a singleton element.  form-based named entities identified earlier avoid
;;; this kind of PoS multiplication because they have already emptied out their
;;; PoS list.
;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;
;; even though we originally made sure all tokens had a fully specified +TNT
;; value, intervening rules could have `leaked' PoS information.  if so, once
;; again, fully annul the +TNT value.
;;
;; matches a token whose first +TNT tag is anti_string and rewrites it with
;; +TNT set to null_tnt; +FORM, +TRAIT, +CLASS, +PRED, +CARG, and +STAG are
;; carried over unchanged through coreferences.
tnt_default_tmr := one_one_tmt &
[ +INPUT < [ +TNT.+TAGS < anti_string, ... >,
             +FORM #orth, +TRAIT #kind, +CLASS #cls,
             +PRED #rel, +CARG #arg, +STAG #supertags ] >,
  +OUTPUT < [ +TNT null_tnt,
              +FORM #orth, +TRAIT #kind, +CLASS #cls,
              +PRED #rel, +CARG #arg, +STAG #supertags ] > ].


;; recursive step: the input still carries at least two PoS readings (the
;; rest of both +TAGS and +PRBS is a *cons*).  the first output is a
;; generic_trait token holding only the first tag and its probability; the
;; second output keeps the remaining readings and states no +TRAIT value.
;; both outputs are positioned at the input token.
tnt_recurse_tmr := token_mapping_rule &
[ +INPUT < [ +TRAIT anti_trait,
             +TNT [ +TAGS < #first_tag . #rest_tags & *cons* >,
                    +PRBS < #first_prb . #rest_prbs & *cons* > ],
             +STAG null_tnt,
             +FORM #orth, +CLASS #cls, +PRED #rel, +CARG #arg,
             +ID #ident, +FROM #left, +TO #right ] >,
  +OUTPUT < [ +TRAIT generic_trait,
              +TNT [ +TAGS < #first_tag >, +PRBS < #first_prb > ],
              +STAG null_tnt,
              +FORM #orth, +CLASS #cls, +PRED #rel, +CARG #arg,
              +ID #ident, +FROM #left, +TO #right ],
            [ +TNT [ +TAGS #rest_tags, +PRBS #rest_prbs ],
              +STAG null_tnt,
              +FORM #orth, +CLASS #cls, +PRED #rel, +CARG #arg,
              +ID #ident, +FROM #left, +TO #right ] >,
  +POSITION "O1@I1, O2@I1" ].


;; base case: exactly one PoS reading is left on the input.  emit one
;; generic_trait token that keeps the tag and its probability, plus one
;; native_trait token whose +TNT is null_tnt.  both outputs are positioned at
;; the input token.
tnt_terminate_tmr := token_mapping_rule &
[ +INPUT < [ +TRAIT anti_trait,
             +TNT [ +TAGS < #last_tag >, +PRBS < #last_prb > ],
             +STAG null_tnt,
             +FORM #orth, +CLASS #cls, +PRED #rel, +CARG #arg,
             +ID #ident, +FROM #left, +TO #right ] >,
  +OUTPUT < [ +TRAIT generic_trait,
              +TNT [ +TAGS < #last_tag >, +PRBS < #last_prb > ],
              +STAG null_tnt,
              +FORM #orth, +CLASS #cls, +PRED #rel, +CARG #arg,
              +ID #ident, +FROM #left, +TO #right ],
            [ +TRAIT native_trait,
              +TNT null_tnt,
              +STAG null_tnt,
              +FORM #orth, +CLASS #cls, +PRED #rel, +CARG #arg,
              +ID #ident, +FROM #left, +TO #right ] >,
  +POSITION "O1@I1, O2@I1" ].

#|
tnt_ptkvz_tmr := token_mapping_rule &
[ +CONTEXT < [ +FORM #form, +TRAIT generic_trait,
             +CLASS #class, +PRED #pred, +CARG #carg,
             +ID #id, +FROM #from, +TO #to, 
	     +STAG null_tnt,
             +TNT [ +TAGS < #tag & "PTKVZ" >, +PRBS < #prb > ] ] > ,
  +OUTPUT < [ +FORM #form, +TRAIT native_trait,
              +CLASS #class, +PRED #pred, +CARG #carg,
              +ID #id, +FROM #from, +TO #to,
	      +STAG null_tnt,
              +TNT [ +TAGS < #tag & "PTKVZ" >, +PRBS < #prb > ] ] >,
  +POSITION "O1@C1" ].
|#

;;; Now recurse over the STAG feature (the rules below are currently disabled).
#|
stag_recurse_tmr := token_mapping_rule &
[ +INPUT < [ +FORM #form, +TRAIT anti_trait,
             +CLASS #class, +PRED #pred, +CARG #carg,
             +ID #id, +FROM #from, +TO #to,
	     +TNT null_tnt,
             +STAG [ +TAGS < #tag . #tags & *cons* >,
		     +PRBS < #prb . #prbs & *cons* > ] ] > ,
  +OUTPUT < [ +FORM #form, +TRAIT native_trait,
              +CLASS #class, +PRED #pred, +CARG #carg,
              +ID #id, +FROM #from, +TO #to,
	      +TNT null_tnt,
	      +STAG [ +TAGS < #tag >, +PRBS < #prb > ] ],
            [ +FORM #form, 
              +CLASS #class, +PRED #pred, +CARG #carg,
              +ID #id, +FROM #from, +TO #to,
	      +TNT null_tnt, 
              +STAG [ +TAGS #tags, +PRBS #prbs ] ] > ,
  +POSITION "O1@I1, O2@I1" ].


stag_terminate_tmr := token_mapping_rule &
[ +INPUT < [ +FORM #form, +TRAIT anti_trait,
             +CLASS #class, +PRED #pred, +CARG #carg,
             +ID #id, +FROM #from, +TO #to,
	     +TNT null_tnt,
             +STAG [ +TAGS < #tag >, +PRBS < #prb > ] ] > ,
  +OUTPUT < [ +FORM #form, +TRAIT native_trait,
              +CLASS #class, +PRED #pred, +CARG #carg,
              +ID #id, +FROM #from, +TO #to,
	      +TNT null_tnt,
              +STAG [ +TAGS < #tag >, +PRBS < #prb > ] ] >,
  +POSITION "O1@I1" ].

stag_ditch_stts_tmr := token_mapping_rule &
[ +INPUT < [ +STAG.+TAGS < ^[[:upper:]]+$ > ] >,
  +OUTPUT < > ].


tnt_ditch_stag_tmr := token_mapping_rule &
[ +INPUT < [ +TNT.+TAGS < ^[[:lower:]-]+$ > ] >,
  +OUTPUT < > ].


stag_ditch_punctuation_tmr := one_one_tmt &
			      [ +INPUT < [ +FORM ^[[:punct:]]+$, 
					   +TRAIT #trait, +CLASS #class,
					   +PRED #pred, +CARG #carg, +TNT #tnt,
					   +STAG.+TAGS < "--", ... > ] >,
				+OUTPUT < [ +TRAIT #trait, +CLASS #class,
					    +PRED #pred, 
					    +CARG #carg, 
					    +TNT #tnt, 
					    +STAG null_tnt ] > ].


;;; Eliminate duplicate  native entries 

stag_ditch_untagged_tmr := token_mapping_rule &
[ +CONTEXT <[ +TRAIT native_trait,
	     +CLASS non_ne,
	     +TNT null_tnt, 
	     +STAG.+TAGS < [] > ] >,
  +INPUT < [ +TRAIT native_trait,
	     +CLASS non_ne,
	     +TNT null_tnt, 
	     +STAG null_tnt ] >,
  +OUTPUT < >,
  +POSITION "I1@C1" ].
|#

;;;
;;; with singleton PoS readings multiplied out in each chart cell, we can prune
;;; undesirable alternatives, e.g. a foreign word reading when there also is a
;;; common noun.  also, ditch PoS readings with very low probability, and ones
;;; for which no PoS-activated generic entries exist anyway (function words).
;;; this final step eases debugging, reducing the size of the token chart.
;;;

;; delete a token whose single PoS probability is written as an optional `0',
;; then `.0', then a digit between 0 and 5 (i.e. values below 0.06).
tnt_ditch_unlikely_tmr := token_mapping_rule &
[ +OUTPUT < >,
  +INPUT < [ +TNT [ +PRBS < ^0?\.0[0-5].*$ > ] ] > ].

;; delete a token whose single PoS tag is `P', followed by one of I, D, ID,
;; REL, W, or POS, followed by S or AT (e.g. PIS, PDAT, PRELS, PPOSAT).
tnt_ditch_function_1_tmr := token_mapping_rule &
[ +OUTPUT < >,
  +INPUT < [ +TNT [ +TAGS < ^P(I|D|ID|REL|W|POS)(S|AT)$ > ] ] > ].

;; delete a token whose single PoS tag is one of the listed function-word
;; tags.  the alternation is wrapped in a group so that `^' and `$' anchor
;; every alternative rather than only the first and the last one; the set of
;; tags matched no longer depends on whether the engine requires a
;; whole-string match or searches for a substring.
;; NOTE(review): `PREF' is kept as found; presumably it is meant to name the
;; reflexive pronoun tag -- confirm against the tag set the tagger emits.
tnt_ditch_function_2_tmr := token_mapping_rule &
[ +INPUT < [ +TNT.+TAGS < ^(PPER|PREF|ITJ|APZR|APP[RO]|APPRART|ART|PROAV|KON|KOUS|KOUI|KOKOM|PTKANT|XY|PTKNEG|V[AM](FIN|IMP|INF|PP))$ > ] >,
  +OUTPUT < > ].

;; delete a token whose single PoS tag is a punctuation or symbol tag.  the
;; alternation is wrapped in a group so that `^' and `$' anchor every
;; alternative rather than only the first and the last one.  the dollar branch
;; spells out the optional second character (`,', `.', or `(') so that
;; two-character tags beginning with `$' are matched explicitly instead of
;; relying on the unanchored `^\$' prefix of the earlier pattern.
tnt_ditch_function_3_tmr := token_mapping_rule &
[ +INPUT < [ +TNT.+TAGS < ^(\$[,.(]?|#|``|''|\(|\)|,|\.|:)$ > ] >,
  +OUTPUT < > ].

;;
;; _fix_me_
;; experimentally, also ditch PoS information on punctuation-only tokens.  we
;; appear to get noun and adjective readings for n- and m-dashes, which hardly
;; can do us any good.                                         (24-sep-08; oe)
;;
;; delete a token whose +FORM consists solely of punctuation characters and
;; whose +TNT tag list is non-empty (a *cons*).
tnt_ditch_punctuation_tmr := token_mapping_rule &
[ +OUTPUT < >,
  +INPUT < [ +TNT [ +TAGS *cons* ],
             +FORM ^[[:punct:]]+$ ] > ].

;;;
;;; _fix_me_
;;; should we eventually want to include the PoS probabilities as a feature in
;;; parse selection, this kind of pruning should disappear: a high-probability
;;; FW, say, should not be elbowed out by an unlikely NN.      (31-aug-08; oe)
;;;
;; delete a token whose single PoS tag is "FW" when the context token it is
;; positioned at (I1@C1) carries a single tag matching NN or NE.
;; the context tag used to be the quoted string "N[NE]", i.e. a string literal
;; rather than a pattern, so it could only equal a tag spelled exactly that
;; way; it is now an anchored regular expression like the other tag patterns
;; in this file.
;; NOTE(review): most tag patterns in this file use STTS names; presumably the
;; foreign-material tag should be checked against what the tagger actually
;; emits -- confirm that "FW" is the intended spelling.
tnt_filter_dup_fw_tmr := token_mapping_rule &
[ +CONTEXT  < [ +TNT.+TAGS < ^N[NE]$ > ] >,
  +INPUT    < [ +TNT.+TAGS < "FW" > ] >,
  +OUTPUT   < >,
  +POSITION "I1@C1" ].

;; 
;; [DPF 23-apr-09] words ending in "-ing" can get tagged both as noun and as
;; verb, but since the grammar has gerund rules, drop the noun and keep the 
;; verb.
;; _fix_me_
;; is there a reason to prefer the gerund over the vanilla noun?  it means a
;; little extra ambiguity when followed by a PP[of], which the generic gerund
;; optionally picks up as a complement.                         (24-may-09; oe)
;;
;; delete a token whose single PoS tag is "NN" when the context token it is
;; positioned at (I1@C1) carries the single tag "VBG".
tnt_filter_dup_vbg_tmr := token_mapping_rule &
[ +POSITION "I1@C1",
  +CONTEXT < [ +TNT [ +TAGS < "VBG" > ] ] >,
  +INPUT < [ +TNT [ +TAGS < "NN" > ] ] >,
  +OUTPUT < > ].


;;
;; _fix_me_
;; the old `posmapping' setting in PET contained the following comment by dan
;; (from sep-07):
;; 
;;   Tried doing without the adjective, since TNT appears to mostly guess both
;;   an adjective and a noun, and our generic mass-count noun can almost always
;;   do the work of the adjective.  This would avoid large amounts of spurious
;;   ambiguity for most occurrences of these pairs.  But unfortunately TNT 
;;   doesn't always guess both, so we need JJ when it's the only guess.  Maybe
;;   we can effect this with the new token-mapping machinery ...
;;
;; the following rule should have that effect.                 (21-jan-09; oe)
;;
;; [DPF 24-mar-09]  Unfortunately, this simple rule goes wrong sometimes.  For 
;; "the tallest and most unk-word cat" the |unk-word| has to be an adjective,
;; so we can't just throw it away.  We'll try using the probabilities from the
;; tagger for a more sensitive rule.
;;
;; delete a token whose single PoS tag is "JJ" when the context token it is
;; positioned at (I1@C1) carries the single tag "NN" with a probability
;; written as an optional `0', then `.', then a digit between 2 and 9.
tnt_filter_dup_jj_tmr := token_mapping_rule &
[ +POSITION "I1@C1",
  +CONTEXT < [ +TNT.+TAGS < "NN" >,
               +TNT.+PRBS < ^0?\.[2-9].*$ > ] >,
  +INPUT < [ +TNT [ +TAGS < "JJ" > ] ] >,
  +OUTPUT < > ].

;;
;; on all tokens that we expect to activate generic entries, make the +PRED
;; value reflect the orthography and PoS tag.
;;
;; applies to generic_trait, non_ne tokens whose +PRED is still anti_string,
;; whose +CARG is a non-empty string, and which carry exactly one PoS tag.
;; the output +PRED is built from the first capture group of +CARG (passed
;; through lc()) and the first capture group of the tag; every other listed
;; feature is copied through coreferences.
;; NOTE(review): +STAG is not mentioned on either side here, unlike in the
;; tnt_* rewriting rules -- confirm that leaving it unconstrained on the
;; output token is intended.
generic_pred_tmr := token_mapping_rule &
[ +INPUT < [ +PRED anti_string,
             +CARG #arg & ^(.+)$,
             +TNT #pos & [ +TAGS < ^(.*)$ > ],
             +TRAIT #kind & generic_trait,
             +CLASS #cls & non_ne,
             +FORM #orth,
             +ID #ident, +FROM #left, +TO #right ] >,
  +OUTPUT < [ +PRED
              "_${lc(I1:+CARG:1)}/${I1:+TNT.+TAGS.FIRST:1}_u_unknown_rel",
              +CARG #arg,
              +TNT #pos,
              +TRAIT #kind,
              +CLASS #cls,
              +FORM #orth,
              +ID #ident, +FROM #left, +TO #right ] >,
  +POSITION "O1@I1" ].