;;; -*- Mode: tdl; Coding: utf-8; -*-

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; PoS multiplication.  every original token is expanded, inside its own
;;; chart cell, into one additional token per PoS reading.  from here on we
;;; distinguish tokens that activate native lexical entries (LEs) from tokens
;;; that activate generic LEs; in the token universe the distinction lives in
;;; +TRAIT, where generic_trait targets generic LEs.  the two sets are
;;; disjoint: one original token with two PoS readings yields three new tokens
;;; in total (two generic ones, one per reading, plus a single native one).
;;;
;;; the recurse/terminate rule pair below works like a recursive function: the
;;; recursive case peels the first reading off the PoS list, and the base case
;;; applies once the list has shrunk to a singleton.  form-based named entities
;;; identified earlier are not multiplied out, because they have already
;;; emptied their PoS list.
;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;
;; all tokens were originally given a fully specified +TNT value, but rules
;; applied in between may have `leaked' PoS information.  whenever the first
;; +TAGS element is an anti_string, reset +TNT to null_tnt again; every other
;; property named here is carried over unchanged.
;;
tnt_default_tmr := one_one_tmt &
[ +INPUT  < [ +FORM  #surface,
              +TRAIT #origin,
              +CLASS #cls,
              +PRED  #rel,
              +CARG  #const,
              +STAG  #super,
              +TNT.+TAGS < anti_string, ... > ] >,
  +OUTPUT < [ +FORM  #surface,
              +TRAIT #origin,
              +CLASS #cls,
              +PRED  #rel,
              +CARG  #const,
              +STAG  #super,
              +TNT   null_tnt ] > ].

;;
;; recursive case: the input carries at least two readings (the rest of both
;; +TAGS and +PRBS is constrained to *cons*).  the first output is a
;; generic_trait token holding only the first tag and its probability; the
;; second output keeps the remaining readings and leaves +TRAIT unspecified,
;; so that it can be picked up again by this rule or by the base case below.
;;
tnt_recurse_tmr := token_mapping_rule &
[ +INPUT  < [ +FORM  #surface,
              +TRAIT anti_trait,
              +CLASS #cls,
              +PRED  #rel,
              +CARG  #const,
              +ID    #ident,
              +FROM  #start,
              +TO    #end,
              +STAG  null_tnt,
              +TNT   [ +TAGS < #head . #tail & *cons* >,
                       +PRBS < #hprb . #tprbs & *cons* > ] ] >,
  +OUTPUT < [ +FORM  #surface,
              +TRAIT generic_trait,
              +CLASS #cls,
              +PRED  #rel,
              +CARG  #const,
              +ID    #ident,
              +FROM  #start,
              +TO    #end,
              +STAG  null_tnt,
              +TNT   [ +TAGS < #head >,
                       +PRBS < #hprb > ] ],
            [ +FORM  #surface,
              +CLASS #cls,
              +PRED  #rel,
              +CARG  #const,
              +ID    #ident,
              +FROM  #start,
              +TO    #end,
              +STAG  null_tnt,
              +TNT   [ +TAGS #tail,
                       +PRBS #tprbs ] ] >,
  +POSITION "O1@I1, O2@I1" ].

;;
;; base case: exactly one reading is left.  emit a generic_trait token that
;; keeps this last reading, plus one native_trait token whose +TNT is emptied
;; (null_tnt).
;;
tnt_terminate_tmr := token_mapping_rule &
[ +INPUT  < [ +FORM  #surface,
              +TRAIT anti_trait,
              +CLASS #cls,
              +PRED  #rel,
              +CARG  #const,
              +ID    #ident,
              +FROM  #start,
              +TO    #end,
              +STAG  null_tnt,
              +TNT   [ +TAGS < #last >,
                       +PRBS < #lprb > ] ] >,
  +OUTPUT < [ +FORM  #surface,
              +TRAIT generic_trait,
              +CLASS #cls,
              +PRED  #rel,
              +CARG  #const,
              +ID    #ident,
              +FROM  #start,
              +TO    #end,
              +STAG  null_tnt,
              +TNT   [ +TAGS < #last >,
                       +PRBS < #lprb > ] ],
            [ +FORM  #surface,
              +TRAIT native_trait,
              +CLASS #cls,
              +PRED  #rel,
              +CARG  #const,
              +ID    #ident,
              +FROM  #start,
              +TO    #end,
              +STAG  null_tnt,
              +TNT   null_tnt ] >,
  +POSITION "O1@I1, O2@I1" ].

#|
tnt_ptkvz_tmr := token_mapping_rule &
[ +CONTEXT < [ +FORM #form, +TRAIT generic_trait, +CLASS #class,
               +PRED #pred, +CARG #carg,
               +ID #id, +FROM #from, +TO #to,
               +STAG null_tnt,
               +TNT [ +TAGS < #tag & "PTKVZ" >, +PRBS < #prb > ] ] > ,
  +OUTPUT < [ +FORM #form, +TRAIT native_trait, +CLASS #class,
              +PRED #pred, +CARG #carg,
              +ID #id, +FROM #from, +TO #to,
              +STAG null_tnt,
              +TNT [ +TAGS < #tag & "PTKVZ" >, +PRBS < #prb > ] ] >,
  +POSITION "O1@C1" ].
|#

;;; Now recurse over STAG feature

#|
stag_recurse_tmr := token_mapping_rule &
[ +INPUT < [ +FORM #form, +TRAIT anti_trait, +CLASS #class,
             +PRED #pred, +CARG #carg,
             +ID #id, +FROM #from, +TO #to,
             +TNT null_tnt,
             +STAG [ +TAGS < #tag . #tags & *cons* >,
                     +PRBS < #prb . #prbs & *cons* > ] ] > ,
  +OUTPUT < [ +FORM #form, +TRAIT native_trait, +CLASS #class,
              +PRED #pred, +CARG #carg,
              +ID #id, +FROM #from, +TO #to,
              +TNT null_tnt,
              +STAG [ +TAGS < #tag >, +PRBS < #prb > ] ],
            [ +FORM #form, +CLASS #class,
              +PRED #pred, +CARG #carg,
              +ID #id, +FROM #from, +TO #to,
              +TNT null_tnt,
              +STAG [ +TAGS #tags, +PRBS #prbs ] ] > ,
  +POSITION "O1@I1, O2@I1" ].

stag_terminate_tmr := token_mapping_rule &
[ +INPUT < [ +FORM #form, +TRAIT anti_trait, +CLASS #class,
             +PRED #pred, +CARG #carg,
             +ID #id, +FROM #from, +TO #to,
             +TNT null_tnt,
             +STAG [ +TAGS < #tag >, +PRBS < #prb > ] ] > ,
  +OUTPUT < [ +FORM #form, +TRAIT native_trait, +CLASS #class,
              +PRED #pred, +CARG #carg,
              +ID #id, +FROM #from, +TO #to,
              +TNT null_tnt,
              +STAG [ +TAGS < #tag >, +PRBS < #prb > ] ] >,
  +POSITION "O1@I1" ].

stag_ditch_stts_tmr := token_mapping_rule &
[ +INPUT < [ +STAG.+TAGS < ^[[:upper:]]+$ > ] >,
  +OUTPUT < > ].

tnt_ditch_stag_tmr := token_mapping_rule &
[ +INPUT < [ +TNT.+TAGS < ^[[:lower:]-]+$ > ] >,
  +OUTPUT < > ].

stag_ditch_punctuation_tmr := one_one_tmt &
[ +INPUT < [ +FORM ^[[:punct:]]+$,
             +TRAIT #trait, +CLASS #class,
             +PRED #pred, +CARG #carg,
             +TNT #tnt,
             +STAG.+TAGS < "--", ... > ] >,
  +OUTPUT < [ +TRAIT #trait, +CLASS #class,
              +PRED #pred, +CARG #carg,
              +TNT #tnt,
              +STAG null_tnt ] > ].

;;; Eliminate duplicate native entries

stag_ditch_untagged_tmr := token_mapping_rule &
[ +CONTEXT <[ +TRAIT native_trait, +CLASS non_ne,
              +TNT null_tnt, +STAG.+TAGS < [] > ] >,
  +INPUT < [ +TRAIT native_trait, +CLASS non_ne,
             +TNT null_tnt, +STAG null_tnt ] >,
  +OUTPUT < >,
  +POSITION "I1@C1" ].
|#

;;;
;;; with singleton PoS readings multiplied out in each chart cell, we can prune
;;; undesirable alternatives, e.g. a foreign word reading when there also is a
;;; common noun.  also, ditch PoS readings with very low probability, and ones
;;; for which no PoS-activated generic entries exist anyway (function words).
;;; this final step eases debugging, reducing the size of the token chart.
;;;

;;
;; drop readings whose probability string starts with an optional `0', then
;; `.0' and a digit in 0-5, i.e. values below 0.06.
;;
tnt_ditch_unlikely_tmr := token_mapping_rule &
[ +INPUT < [ +TNT.+PRBS < ^0?\.0[0-5].*$ > ] >,
  +OUTPUT < > ].

;;
;; drop pronominal readings: tags of the shape P + {I,D,ID,REL,W,POS} + {S,AT}
;; (e.g. PDS, PRELAT, PPOSAT).
;;
tnt_ditch_function_1_tmr := token_mapping_rule &
[ +INPUT < [ +TNT.+TAGS < ^P(I|D|ID|REL|W|POS)(S|AT)$ > ] >,
  +OUTPUT < > ].

;;
;; drop the remaining function-word readings enumerated in the pattern
;; (personal pronouns, adpositions, articles, conjunctions, particles,
;; auxiliary and modal verb forms, ...).
;;
tnt_ditch_function_2_tmr := token_mapping_rule &
[ +INPUT < [ +TNT.+TAGS < ^PPER|PREF|ITJ|APZR|APP[RO]|APPRART|ART|PROAV|KON|KOUS|KOUI|KOKOM|PTKANT|XY|PTKNEG|V[AM](FIN|IMP|INF|PP)$ > ] >,
  +OUTPUT < > ].

;;
;; drop readings whose tag is itself a punctuation or symbol tag.
;;
tnt_ditch_function_3_tmr := token_mapping_rule &
[ +INPUT < [ +TNT.+TAGS < ^\$|#|``|''|\(|\)|,|\.|:$ > ] >,
  +OUTPUT < > ].

;;
;; _fix_me_
;; experimentally, also ditch PoS information on punctuation-only tokens.  we
;; appear to get noun and adjective readings for n- and m-dashes, which hardly
;; can do us any good.                                         (24-sep-08; oe)
;;
tnt_ditch_punctuation_tmr := token_mapping_rule &
[ +INPUT < [ +FORM ^[[:punct:]]+$, +TNT.+TAGS *cons* ] >,
  +OUTPUT < > ].

;;;
;;; _fix_me_
;;; should we eventually want to include the PoS probabilities as a feature in
;;; parse selection, this kind of pruning should disappear: a high-probability
;;; FW, say, should not be elbowed out by an unlikely NN.      (31-aug-08; oe)
;;;
;;; the context tag is a regular expression covering both NN and NE.  written
;;; as a quoted string ("N[NE]") it would be compared literally and could
;;; never match a real tag, leaving this rule without effect.
;;;
;;; NOTE(review): the input tag "FW" looks like a Penn Treebank tag, while the
;;; function-word rules above use STTS-style tags; confirm which tag the
;;; tagger in use actually emits for foreign material.
;;;
tnt_filter_dup_fw_tmr := token_mapping_rule &
[ +CONTEXT < [ +TNT.+TAGS < ^N[NE]$ > ] >,
  +INPUT < [ +TNT.+TAGS < "FW" > ] >,
  +OUTPUT < >,
  +POSITION "I1@C1" ].

;;
;; [DPF 23-apr-09] words ending in "-ing" can get tagged both as noun and as
;; verb, but since the grammar has gerund rules, drop the noun and keep the
;; verb.
;; _fix_me_
;; is there a reason to prefer the gerund over the vanilla noun?  it means a
;; little extra ambiguity when followed by a PP[of], which the generic gerund
;; optionally picks up as a complement.                        (24-may-09; oe)
;;
tnt_filter_dup_vbg_tmr := token_mapping_rule &
[ +CONTEXT < [ +TNT.+TAGS < "VBG" > ] >,
  +INPUT < [ +TNT.+TAGS < "NN" > ] >,
  +OUTPUT < >,
  +POSITION "I1@C1" ].
;;
;; _fix_me_
;; the old `posmapping' setting in PET contained the following comment by dan
;; (from sep-07):
;;
;;   Tried doing without the adjective, since TNT appears to mostly guess both
;;   an adjective and a noun, and our generic mass-count noun can almost always
;;   do the work of the adjective.  This would avoid large amounts of spurious
;;   ambiguity for most occurrences of these pairs.  But unfortunately TNT
;;   doesn't always guess both, so we need JJ when it's the only guess.  Maybe
;;   we can effect this with the new token-mapping machinery ...
;;
;; the following rule should have that effect.                 (21-jan-09; oe)
;;
;; [DPF 24-mar-09] Unfortunately, this simple rule goes wrong sometimes.  For
;; "the tallest and most unk-word cat" the |unk-word| has to be an adjective,
;; so we can't just throw it away.  We'll try using the probabilities from the
;; tagger for a more sensitive rule.
;;
;; as written: a "JJ" reading is removed only when the context supplies an
;; "NN" reading whose probability string starts with an optional `0', a `.',
;; and a digit in 2-9.
;;
tnt_filter_dup_jj_tmr := token_mapping_rule &
[ +INPUT    < [ +TNT.+TAGS < "JJ" > ] >,
  +CONTEXT  < [ +TNT [ +PRBS < ^0?\.[2-9].*$ >,
                       +TAGS < "NN" > ] ] >,
  +OUTPUT   < >,
  +POSITION "I1@C1" ].

;;
;; for every token expected to activate a generic entry (generic_trait,
;; non_ne, +PRED still anti_string), build the +PRED value from orthography
;; and PoS tag: the lower-cased +CARG, a slash, and the first +TNT tag, wrapped
;; as "_<carg>/<tag>_u_unknown_rel".  the capture groups in the +CARG and
;; +TAGS patterns feed the ${...} references inside the +PRED string.
;;
;; NOTE(review): +STAG is mentioned neither on the input nor on the output
;; here, whereas tnt_default_tmr copies it explicitly -- confirm that leaving
;; it unconstrained on the rewritten token is intended.
;;
generic_pred_tmr := token_mapping_rule &
[ +INPUT  < [ +FORM  #surface,
              +TRAIT generic_trait & #origin,
              +CLASS non_ne & #cls,
              +PRED  anti_string,
              +CARG  #const & ^(.+)$,
              +ID    #ident,
              +FROM  #start,
              +TO    #end,
              +TNT   #pos & [ +TAGS < ^(.*)$ > ] ] >,
  +OUTPUT < [ +FORM  #surface,
              +TRAIT #origin,
              +CLASS #cls,
              +PRED  "_${lc(I1:+CARG:1)}/${I1:+TNT.+TAGS.FIRST:1}_u_unknown_rel",
              +CARG  #const,
              +ID    #ident,
              +FROM  #start,
              +TO    #end,
              +TNT   #pos ] >,
  +POSITION "O1@I1" ].