;;; -*- Mode: tdl; Coding: utf-8; -*-


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; lightweight NEs: form-driven generic entries (formerly `ersatz' entries)
;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; email addresses
;;;

;;
;; email addresses: a local part built from alphanumerics, |.|, |_|, and |-|,
;; an |@|, and a host name of at least two dot-separated labels; the whole
;; string may carry an opening and/or closing angle bracket.
;;
email_ne_tmr := ne_tmt &
  [ +INPUT
      < [ +FORM ^<?[[:alnum:]._-]+@[[:alnum:]_-]+(\.[[:alnum:]_-]+)+>?$ ] >,
    +OUTPUT
      < [ +CLASS email_ne ] > ].

;;;
;;; uniform resource locators (URLs)
;;;

;;
;; any valid DNS string, prefixed by `http://' or `https://', with optional
;; angle brackets.  the host name needs at least two dot-separated labels; an
;; optional path (everything from the first |/| after the host) is accepted
;; verbatim.  the `s?' makes secure URLs match too, which were previously left
;; unrecognised by this rule.
;;
url_ne_1_tmr := ne_tmt &
[ +INPUT < [ +FORM ^<?https?://[[:alnum:]_-]+(\.[[:alnum:]_-]+)+(/.*)?>?$ ] >,
  +OUTPUT < [ +CLASS url_ne ] > ].

;;
;; scheme-less URLs: the literal |www| followed by one or more dot-prefixed
;; labels and an optional path; angle brackets on either side are optional.
;;
url_ne_2_tmr := ne_tmt &
  [ +INPUT
      < [ +FORM ^<?www(\.[[:alnum:]_-]+)+(/.*)?>?$ ] >,
    +OUTPUT
      < [ +CLASS url_ne ] > ].

;;
;; a bare host name (two or more dot-separated labels) plus optional path; as
;; there is neither scheme nor |www| prefix, both angle brackets are required.
;;
url_ne_3_tmr := ne_tmt &
  [ +INPUT
      < [ +FORM ^<[[:alnum:]_-]+(\.[[:alnum:]_-]+)+(/.*)?>$ ] >,
    +OUTPUT
      < [ +CLASS url_ne ] > ].

;;;
;;; file names
;;;

;;
;; fully-qualified Un*x style, starting with a slash, e.g. |/etc/config|.  
;;
;; _fix_me_
;; we require a minimum of two components, such that |/etc| by itself will not
;; match.  maybe we should allow these too but create an ambiguity here, i.e.
;; output two tokens, one [ CLASS file_ne ], the other [ CLASS non_ne ]?
;;                                                              (19-sep-08; oe)
;;
;; the `{2,}' quantifier is what enforces the two-component minimum; a single
;; trailing slash after the last component is tolerated.
file_ne_tmr := ne_tmt &
  [ +INPUT
      < [ +FORM ^(/[[:alnum:]._-]+){2,}/?$ ] >,
    +OUTPUT
      < [ +CLASS file_ne ] > ].


;;;
;;; time-of-day expressions: |9am|, |11:45pm|, |20:15|
;;;

;;
;; an |am| or |pm| suffix unambiguously indicates a time expression.  we also
;; grab all tokens of the form `H:M' where `H' and `M' are numbers in the right
;; ranges.
;;
;; _fix_me_
;; i wonder about `mix in a ratio of 1:15', which the second rule below would
;; consider a time-of-day expression.  should we approach such cases with more
;; `optional' NE rules, i.e. ones outputting two tokens?  or should we rather
;; introduce an abstraction over `time_ne' and `ratio_ne', such that a single
;; token can activate multiple lexical entries?  once we get regular expression
;; matching for lexical instantiation (peter is working on that), in principle,
;; we could just drop `time_ne_2_tmr', make `time_ne' a sub-type of `ratio_ne',
;; and put the `H:M' regular expression into the generic lexical entry.  with
;; great power comes great responsibility :-).                 (19-sep-08; oe)
;; DPF 27-jun-09 - Let's try the option of abstracting over time_ne and meas_ne
;; (what the ratio rules introduce).
;;                       
;; hour 0 to 12 (single digits may be zero-padded), optional minutes 00 to 59
;; after |:| or |.|, and an obligatory am/pm suffix in any letter case.
time_ne_1_tmr := ne_tmt &
  [ +INPUT
      < [ +FORM ^(0?[0-9]|1[0-2])([:.][0-5][0-9])?([aApP][mM])$ ] >,
    +OUTPUT
      < [ +CLASS time_ne ] > ].

;; `H:M' without suffix: hour 0 to 24 (single digits may be zero-padded), a
;; colon, minutes 00 to 59.  the output class is underspecified between a
;; measure (ratio) and a time reading.
time_ne_2_tmr := ne_tmt &
  [ +INPUT
      < [ +FORM ^(0?[0-9]|1[0-9]|2[0-4]):[0-5][0-9]$ ] >,
    +OUTPUT
      < [ +CLASS meas_or_time_ne ] > ].

;;
;; to also match |10.30| as a candidate time, there is an underspecification of
;; times and floating point numbers; this is a separate rule to avoid matching
;; |15:30| as a candidate cardinal.
;;
;; same hour and minute ranges as the colon variant, but with a period as the
;; separator; the output class leaves cardinal vs. time open.
time_ne_3_tmr := ne_tmt &
  [ +INPUT
      < [ +FORM ^(0?[0-9]|1[0-9]|2[0-4])\.[0-5][0-9]$ ] >,
    +OUTPUT
      < [ +CLASS card_or_time_ne ] > ].


;;;
;;; dates
;;;

;;
;; _fix_me_
;; things are getting a little murky here: some of the current date formats
;; overlap with some of the identifiers and the fractions and ranges.  to do
;; justice to these ambiguities, we would have to introduce multiple NE tokens.
;; or find ways of underspecification in the lexicon, maybe?    (23-sep-08; oe)
;;

;;
;; numeric dates in US or European order, e.g. |11-01-1957|, |11/01/1957|,
;; |11-01-57|, or |57/01/11|: two groups of one or two digits, each followed
;; by |-| or |/|, and a final group of two to four digits.  the pattern does
;; not force both separators to be the same character.
;;
date_ne_tmr := ne_tmt &
  [ +INPUT
      < [ +FORM ^([0-9]{1,2}[-/]){2}[0-9]{2,4}$ ] >,
    +OUTPUT
      < [ +CLASS date_ne ] > ].

;;
;; _fix_me_
;; i am leaving out |12-2005| |12/05| |12-05| |12/05|, and |'06| for now.  they
;; overlap too much with other patterns, so maybe should be optional rules?
;;                                                              (23-sep-08; oe)
;;


;;;
;;; ratios: |1:1000|, |1:100,000|, et al.
;;;

;;
;; we make the conservative assumption that the first element not exceed three
;; digits and not have leading zeros.
;;
;; plain variant: one to three digits, a colon, and a second number of any
;; length; neither side may start with a zero.
ratio_ne_1_tmr := ne_tmt &
  [ +INPUT
      < [ +FORM ^[1-9][0-9]{0,2}:[1-9][0-9]*$ ] >,
    +OUTPUT
      < [ +CLASS meas_ne ] > ].

;; variant with thousands separators in the second element: one to three
;; digits followed by any number of comma-prefixed three-digit groups, e.g.
;; |1:100,000|.
ratio_ne_2_tmr := ne_tmt &
  [ +INPUT
      < [ +FORM ^[1-9][0-9]{0,2}:[1-9][0-9]{0,2}(,[0-9]{3})*$ ] >,
    +OUTPUT
      < [ +CLASS meas_ne ] > ].


;;;
;;; fractions: |1/4|, |-1/3|, |51/100|, et al.
;;;

;;
;; numerator: digits with an optional decimal part; denominator: digits with
;; no leading zero, optionally followed by |th| (e.g. |1/4th|).
;; NOTE(review): the pattern has no provision for a leading sign, so |-1/3|
;; presumably only matches once the sign has been split off -- confirm.
;;
fraction_ne_tmr := ne_tmt &
  [ +INPUT
      < [ +FORM ^[0-9]+(\.[0-9]*)?/[1-9][0-9]*(th)?$ ] >,
    +OUTPUT
      < [ +CLASS frct_ne ] > ].


;;;
;;; measure noun phrases (taking precedence over alphanumeric identifiers)
;;;

;;
;; single token: |25cm| or |+37.3ºC|
;;
;; an optional sign (|-|, |+|, |±|, or |~|), digits with an optional decimal
;; part after |,| or |.|, and a unit suffix.  the hyphen is listed first in
;; the sign class so that it is a literal character: written as `+-±' it
;; denotes the range from |+| to |±|, which covers ASCII digits and letters
;; and would let a token like |A4m| pass as a measure phrase.
;;

measure_ne_tmr := ne_tmt &
[ +INPUT < [ +FORM 
          ^[-+±~]?[0-9]+([,.][0-9]*)?([kKmMgGµ"']+|[ckmn]m|σ|Å|[º°][CF]?)$ ] >,
  +OUTPUT < [ +CLASS meas_ne ] > ].

;;
;; a ratio immediately followed by a unit suffix, e.g. |1:1000m|: one to three
;; digits, a colon, a second number of any length (neither with a leading
;; zero), and then the unit.
;;
ratio_measure_ne_tmr := ne_tmt &
  [ +INPUT
      < [ +FORM
          ^[1-9][0-9]{0,2}:[1-9][0-9]*([kKmMgG"']+|[ckmn]m|σ|[º°][CF]?)$ ] >,
    +OUTPUT
      < [ +CLASS meas_ne ] > ].

;;
;; |US$20| and the like: break into two tokens, to get a bracketing compatible
;; with [US$ [20 billion]] for a string like |US$20 billion|.
;;
;; _fix_me_
;; dan and i concluded that we should introduce a `currency_ne' instead, with
;; a corresponding generic entry, i.e. drop |US$| et al. from the lexicon (at
;; present, some of the entries allowed by this rule are missing, e.g. |C$|).
;; to maintain some parallelism with currency names that can be ordinary words
;; (e.g. `pound'), the rule below should synthesize a PRED value (rather than
;; put a CARG on currency NEs), e.g. `_us$_n_currency_rel'.  another rule will
;; be needed for just |US$| (and in principle |€| and other currency symbols),
;; to also match them as currency NEs, and at that point the `currency+name'
;; lexical filtering rule could be discarded.            (dan & oe, 29-jul-09)
;;
;;
;; capture group 1 is the currency prefix (an optional country code from the
;; alternation, plus |$|); capture group 2 is the amount.  the first output
;; token takes its form from group 1 and the second from group 2; both stay
;; [ +CLASS non_ne ], only the first one is marked `native_trait', while the
;; second inherits the +TRAIT value of the input token.
;;
currency_measure_ne_tmr := one_two_tmt &
  [ +INPUT
      < [ +FORM ^((?:AU?|CA?|HK|NZ|US)?\$)([0-9]+([\.,][0-9]*)?)$,
          +TRAIT #trait,
          +CLASS non_ne,
          +PRED #pred,
          +CARG #carg ] >,
    +OUTPUT
      < [ +FORM "${I1:+FORM:1}",
          +TRAIT native_trait,
          +CLASS non_ne,
          +PRED #pred,
          +CARG #carg,
          +TNT null_tnt ],
        [ +FORM "${I1:+FORM:2}",
          +TRAIT #trait,
          +CLASS non_ne,
          +PRED #pred,
          +CARG #carg,
          +TNT null_tnt ] > ].

;;;
;;; various kinds of identifiers (accumulated throughout the years); some of
;;; these are allowed to include punctuation marks that can (later) lead to
;;; additional token boundaries (i.e. |-| and |/|), hence we need to be fairly
;;; restrictive about our identifier patterns.  in principle, i guess, this is
;;; something that should be adapted specifically for a target domain and type
;;; of text.
;;;
;;; _fix_me_
;;; some of these patterns conflict with others, e.g. `56K data line' should
;;; be a measure NP, not an identifier.  in all-caps strings, `PRE-1980' would
;;; be mis-analyzed too.  i am not really sure what to do about these.
;;;                                                            (24-sep-08; oe)

;;
;; mixed letter-digit strings such as |ABC123DEF|: optional leading digits, at
;; least one letter, at least one digit, and then any alphanumeric remainder.
;;
alphanumeric_identifier_ne_1_tmr := ne_tmt &
  [ +INPUT
      < [ +FORM ^[0-9]*[[:alpha:]]+[0-9]+[[:alnum:]]*$ ] >,
    +OUTPUT
      < [ +CLASS proper_ne ] > ].

;;
;; special case |22-b|: digits, one hyphen, and exactly one letter.
;;
alphanumeric_identifier_ne_2_tmr := ne_tmt &
  [ +INPUT
      < [ +FORM ^[0-9]+-[[:alpha:]]$ ] >,
    +OUTPUT
      < [ +CLASS proper_ne ] > ].

;;
;; a number with a parenthesized letter-initial suffix, e.g. |22(B)|.  the
;; closing parenthesis will have been tokenized off, PTB-style, so this rule
;; consumes two tokens (|22(B| and |)|) and glues them back together: the
;; output form is capture group 1 of the first input with |)| appended.
;;
alphanumeric_identifier_ne_3_tmr := two_one_tmt &
  [ +INPUT
      < [ +FORM ^([0-9]+\([[:alpha:]]+[[:alnum:]]*)$,
          +TRAIT #trait,
          +CLASS non_ne,
          +PRED #pred,
          +CARG #carg ],
        [ +FORM ")",
          +CLASS non_ne ] >,
    +OUTPUT
      < [ +FORM "${I1:+FORM:1})",
          +TRAIT #trait,
          +CLASS proper_ne,
          +PRED #pred,
          +CARG #carg,
          +TNT null_tnt ] > ].

;;
;; digits directly followed by exactly one letter, e.g. |22a|.
;;
alphanumeric_identifier_ne_4_tmr := ne_tmt &
  [ +INPUT
      < [ +FORM ^[0-9]+[[:alpha:]]$ ] >,
    +OUTPUT
      < [ +CLASS proper_ne ] > ].

;;
;; identifiers possibly originating in chemistry, e.g. |B.25|, |IL-10|, or
;; |IL/10|: optional digits, upper-case letters, one of |-|, |.|, or |/|,
;; optional further upper-case letters, and a final run of digits.
;;
chemistry_identifier_ne_tmr := ne_tmt &
  [ +INPUT
      < [ +FORM ^[0-9]*[[:upper:]]+[-./][[:upper:]]*[0-9]+$ ] >,
    +OUTPUT
      < [ +CLASS proper_ne ] > ].

;;
;; strongly alphanumeric strings (i.e. including digits), with at least two
;; hyphens, e.g. |123-45-6789|.
;;
;; _fix_me_
;; dan relaxed this rule to not require at least one numeral, with the effect
;; of matching, say, |up-to-date| as a proper name (only).  i now reverted the
;; rule to its original form, but we should look at the cases that called for
;; relaxation, to see whether we can safely match them as identifiers.
;;                                                              (3-jun-09; oe)
;;
;; the first segment may start with letters but has to contain a digit; it is
;; followed by two or more hyphen-prefixed alphanumeric segments.
;;
hyphenated_identifier_ne_tmr := ne_tmt &
  [ +INPUT
      < [ +FORM ^[[:alpha:]]*[0-9]+[[:alpha:]0-9]*(-[[:alnum:]]+){2,}$ ] >,
    +OUTPUT
      < [ +CLASS proper_ne ] > ].

;;
;; section numbers and similar: three or more digit groups joined by periods,
;; i.e. at least two decimal points, e.g. |1.2.3|.
;;
section_number_ne_tmr := ne_tmt &
  [ +INPUT
      < [ +FORM ^[0-9]+\.[0-9]+(\.[0-9]+)+$ ] >,
    +OUTPUT
      < [ +CLASS proper_ne ] > ].