#!/usr/bin/perl -w

use strict;

#################################
# History:
#
# neva.pl -- changed to NEVA instead of BLEU (Eva Forsbom)
#         -- added locale handling (Eva Forsbom) 
#
# version 09c -- bug fix (During the calculation of ngram information,
#    each ngram was being counted only once for each segment.  This has
#    been fixed so that each ngram is counted correctly in each segment.)
#
# version 09b -- text normalization modified:
#    * option flag added to preserve upper case
#    * non-ASCII characters left in place.
#
# version 09a -- text normalization modified:
#    * &quot; and &amp; converted to " and &, respectively
#    * non-ASCII characters kept together (bug fix)
#
# version 09 -- modified to accommodate sgml tag and attribute
#    names revised to conform to default SGML conventions.
#
# version 08 -- modifies the NIST metric in accordance with the
#    findings on the 2001 Chinese-English dry run corpus.  Also
#    incorporates the BLEU metric as an option and supports the
#    output of ngram detail.
#
# version 07 -- in response to the MT meeting on 28 Jan 2002 at ISI
#    Keep strings of non-ASCII characters together as one word
#    (rather than splitting them into one-character words).
#    Change length penalty so that translations that are longer than
#    the average reference translation are not penalized.
#
# version 06
#    Prevent divide-by-zero when a segment has no evaluation N-grams.
#    Correct segment index for level 3 debug output.
#
# version 05
#    improve diagnostic error messages
#
# version 04
#    tag segments
#
# version 03
#    add detailed output option (intermediate document and segment scores)
#
# version 02
#    accommodation of modified sgml tags and attributes
#
# version 01
#    same as bleu version 15, but modified to provide formal score output.
#
# original IBM version
#    Author: Kishore Papineni
#    Date: 06/10/2001
#################################

# EF added POSIX, locale, require
use POSIX qw(locale_h);
use locale;

require 5.004;
                                                                                
######
# Intro
my ($date, $time) = date_time_stamp();
print "MT evaluation run on $date at $time\n";
print "command line:  ", $0, " ", join(" ", @ARGV), "\n";
# EF added l
my $usage = "\n\nUsage: $0 [-h] -r <ref_file> -s src_file -t <tst_file> [-l <trg_locale>]\n\n".
    "Description:  This Perl script evaluates MT system performance.\n".
    "\n".
    "Required arguments:\n".
    "  -r <ref_file> is a file containing the reference translations for\n".
    "      the documents to be evaluated.\n".
    "  -s <src_file> is a file containing the source documents for which\n".
    "      translations are to be evaluated\n".
    "  -t <tst_file> is a file containing the translations to be evaluated\n".
    "\n".
    "Optional arguments:\n".
    "  -b substitutes the NEVA score for the NIST score\n".
    "  -c preserves upper-case alphabetic characters\n".
    "  -d detailed output flag:\n".
    "         0 (default) for system-level score only\n".
    "         1 to include document-level scores\n".
    "         2 to include segment-level scores\n".
    "         3 to include ngram-level scores\n".
    "  -h prints this help message to STDOUT\n".
# EF added l
    "  -l <trg_locale> to use language specific tokenisation (default: en)\n".
    "\n";

# EF added opt_l
use vars qw ($opt_r $opt_s $opt_t $opt_n $opt_h $opt_d $opt_b $opt_c $opt_x $opt_l);
use Getopt::Std;
# EF added l
getopts ('r:s:t:n:d:hbcx:l:');
die $usage if defined($opt_h);
die "Error in command line:  ref_file not defined$usage" unless defined $opt_r;
die "Error in command line:  src_file not defined$usage" unless defined $opt_s;
die "Error in command line:  tst_file not defined$usage" unless defined $opt_t;
my $max_Ngram = defined($opt_n) ? $opt_n : 3;
my $ngram_bleu = 4;
my $ngram_nist = 5;
$max_Ngram = max($max_Ngram, $opt_b ? $ngram_bleu : $ngram_nist);
my $detail = defined $opt_d ? $opt_d : 0;
my $preserve_case = defined $opt_c ? 1 : 0;
my $method = defined $opt_b ? "NEVA" : "NIST";
my ($ref_file) = $opt_r;
my ($src_file) = $opt_s;
my ($tst_file) = $opt_t;
# EF added locale, old_locale: default en
my $locale = defined($opt_l) ? $opt_l : "en";
my $old_locale = setlocale(LC_CTYPE);

######
# Global variables
my ($src_lang, $tgt_lang, @tst_sys, @ref_sys); # evaluation parameters
my (%tst_data, %ref_data); # the data -- with structure:  {system}{document}[segments]
my ($src_id, $ref_id, $tst_id); # unique identifiers for ref and tst translation sets
my %eval_docs;     # document information for the evaluation data set
my %ngram_info;    # the information obtained from (the last word in) the ngram

######
# Get source document ID's
($src_id) = get_source_info ($src_file);

######
# Get reference translations
($ref_id) = get_MT_data (\%ref_data, "RefSet", $ref_file);

compute_ngram_info ();

######
# Get translations to evaluate
($tst_id) = get_MT_data (\%tst_data, "TstSet", $tst_file);

######
# Check data for completeness and correctness
check_MT_data ();

######
# Evaluate
print "  Evaluation of $src_lang-to-$tgt_lang translation using:\n";
my $cum_seg = 0;
foreach my $doc (sort keys %eval_docs) {
    $cum_seg += @{$eval_docs{$doc}{SEGS}};
}
print "    src set \"$src_id\" (", scalar keys %eval_docs, " docs, $cum_seg segs)\n";
print "    ref set \"$ref_id\" (", scalar keys %ref_data, " refs)\n";
print "    tst set \"$tst_id\" (", scalar keys %tst_data, " systems)\n\n";

foreach my $sys (sort @tst_sys) {
    score_system ($sys);
}

exit 0;

#################################

sub get_source_info {

    my ($file) = @_;
    my ($name, $id, $src, $doc);
    my ($data, $tag, $span);
    

#read data from file
    open (FILE, $file) or die "\nUnable to open translation data file '$file'", $usage;
    $data .= $_ while <FILE>;
    close (FILE);

#get source set info
    die "\n\nFATAL INPUT ERROR:  no 'src_set' tag in src_file '$file'\n\n"
	unless ($tag, $span, $data) = extract_sgml_tag_and_span ("SrcSet", $data);

    die "\n\nFATAL INPUT ERROR:  no tag attribute '$name' in file '$file'\n\n"
	unless ($id) = extract_sgml_tag_attribute ($name="SetID", $tag);

    die "\n\nFATAL INPUT ERROR:  no tag attribute '$name' in file '$file'\n\n"
	unless ($src) = extract_sgml_tag_attribute ($name="SrcLang", $tag);
    die "\n\nFATAL INPUT ERROR:  $name ('$src') in file '$file' inconsistent\n"
	."                    with $name in previous input data ('$src_lang')\n\n"
	    unless (not defined $src_lang or $src eq $src_lang);
    $src_lang = $src;

#get doc info -- ID and # of segs
    $data = $span;
    while (($tag, $span, $data) = extract_sgml_tag_and_span ("Doc", $data)) {
	die "\n\nFATAL INPUT ERROR:  no tag attribute '$name' in file '$file'\n\n"
	    unless ($doc) = extract_sgml_tag_attribute ($name="DocID", $tag);
	die "\n\nFATAL INPUT ERROR:  duplicate '$name' in file '$file'\n\n"
	    if defined $eval_docs{$doc};
	$span =~ s/[\s\n\r]+/ /g;  # concatenate records
	my $jseg=0, my $seg_data = $span;
	while (($tag, $span, $seg_data) = extract_sgml_tag_and_span ("Seg", $seg_data)) {
	    ($eval_docs{$doc}{SEGS}[$jseg++]) = NormalizeText ($span);
	}
	die "\n\nFATAL INPUT ERROR:  no segments in document '$doc' in file '$file'\n\n"
	    if $jseg == 0;
    }
    die "\n\nFATAL INPUT ERROR:  no documents in file '$file'\n\n"
	unless keys %eval_docs > 0;
    return $id;
}

#################################

sub get_MT_data {

    my ($docs, $set_tag, $file) = @_;
    my ($name, $id, $src, $tgt, $sys, $doc);
    my ($tag, $span, $data);

#read data from file
    open (FILE, $file) or die "\nUnable to open translation data file '$file'", $usage;
    $data .= $_ while <FILE>;
    close (FILE);

#get tag info
    while (($tag, $span, $data) = extract_sgml_tag_and_span ($set_tag, $data)) {
	die "\n\nFATAL INPUT ERROR:  no tag attribute '$name' in file '$file'\n\n" unless
	    ($id) = extract_sgml_tag_attribute ($name="SetID", $tag);

	die "\n\nFATAL INPUT ERROR:  no tag attribute '$name' in file '$file'\n\n" unless
	    ($src) = extract_sgml_tag_attribute ($name="SrcLang", $tag);
	die "\n\nFATAL INPUT ERROR:  $name ('$src') in file '$file' inconsistent\n"
	    ."                    with $name of source ('$src_lang')\n\n"
		unless $src eq $src_lang;
	
	die "\n\nFATAL INPUT ERROR:  no tag attribute '$name' in file '$file'\n\n" unless
	    ($tgt) = extract_sgml_tag_attribute ($name="TrgLang", $tag);
	die "\n\nFATAL INPUT ERROR:  $name ('$tgt') in file '$file' inconsistent\n"
	    ."                    with $name of the evaluation ('$tgt_lang')\n\n"
		unless (not defined $tgt_lang or $tgt eq $tgt_lang);
	$tgt_lang = $tgt;

	my $mtdata = $span;
	while (($tag, $span, $mtdata) = extract_sgml_tag_and_span ("Doc", $mtdata)) {
	    die "\n\nFATAL INPUT ERROR:  no tag attribute '$name' in file '$file'\n\n" unless
		(my $sys) = extract_sgml_tag_attribute ($name="SysID", $tag);
	    
	    die "\n\nFATAL INPUT ERROR:  no tag attribute '$name' in file '$file'\n\n" unless
		$doc = extract_sgml_tag_attribute ($name="DocID", $tag);
	    
	    die "\n\nFATAL INPUT ERROR:  document '$doc' for system '$sys' in file '$file'\n"
		."                    previously loaded from file '$docs->{$sys}{$doc}{FILE}'\n\n"
		    unless (not defined $docs->{$sys}{$doc});

	    $span =~ s/[\s\n\r]+/ /g;  # concatenate records
	    my $jseg=0, my $seg_data = $span;
	    while (($tag, $span, $seg_data) = extract_sgml_tag_and_span ("Seg", $seg_data)) {
		($docs->{$sys}{$doc}{SEGS}[$jseg++]) = NormalizeText ($span);
	    }
	    die "\n\nFATAL INPUT ERROR:  no segments in document '$doc' in file '$file'\n\n"
		if $jseg == 0;
	    $docs->{$sys}{$doc}{FILE} = $file;
	}
    }
    return $id;
}

#################################

sub check_MT_data {

    @tst_sys = sort keys %tst_data;
    @ref_sys = sort keys %ref_data;

#every evaluation document must be represented for every system and every reference
    foreach my $doc (sort keys %eval_docs) {
	my $nseg_source = @{$eval_docs{$doc}{SEGS}};
	foreach my $sys (@tst_sys) {
	    die "\n\nFATAL ERROR:  no document '$doc' for system '$sys'\n\n"
		unless defined $tst_data{$sys}{$doc};
	    my $nseg = @{$tst_data{$sys}{$doc}{SEGS}};
	    die "\n\nFATAL ERROR:  translated documents must contain the same # of segments as the source, but\n"
		."              document '$doc' for system '$sys' contains $nseg segments, while\n"
                ."              the source document contains $nseg_source segments.\n\n"
		    unless $nseg == $nseg_source;
	}

	foreach my $sys (@ref_sys) {
	    die "\n\nFATAL ERROR:  no document '$doc' for reference '$sys'\n\n"
		unless defined $ref_data{$sys}{$doc};
	    my $nseg = @{$ref_data{$sys}{$doc}{SEGS}};
	    die "\n\nFATAL ERROR:  translated documents must contain the same # of segments as the source, but\n"
		."              document '$doc' for system '$sys' contains $nseg segments, while\n"
                ."              the source document contains $nseg_source segments.\n\n"
		    unless $nseg == $nseg_source;
	}
    }
}

#################################

sub compute_ngram_info {

    my ($ref, $doc, $seg);
    my (@wrds, $tot_wrds, %ngrams, $ngram, $mgram);
    my (%ngram_count, @tot_ngrams);

    foreach $ref (keys %ref_data) {
	foreach $doc (keys %{$ref_data{$ref}}) {
	    foreach $seg (@{$ref_data{$ref}{$doc}{SEGS}}) {
		@wrds = split /\s+/, $seg;
		$tot_wrds += @wrds;
		%ngrams = %{Words2Ngrams (@wrds)};
		foreach $ngram (keys %ngrams) {
		    $ngram_count{$ngram} += $ngrams{$ngram};
		}
	    }
	}
    }
    
    foreach $ngram (keys %ngram_count) {
	@wrds = split / /, $ngram;
	pop @wrds, $mgram = join " ", @wrds;
	$ngram_info{$ngram} = - log
	    ($mgram ? $ngram_count{$ngram}/$ngram_count{$mgram}
	            : $ngram_count{$ngram}/$tot_wrds) / log 2;
	if (defined $opt_x and $opt_x eq "ngram info") {
	    @wrds = split / /, $ngram;
	    printf "ngram info:%9.4f%6d%6d%8d%3d %s\n", $ngram_info{$ngram}, $ngram_count{$ngram},
	        $mgram ? $ngram_count{$mgram} : $tot_wrds, $tot_wrds, scalar @wrds, $ngram;
	}
    }
}

#################################

sub score_system {

    my ($sys, $ref, $doc);
    ($sys) = @_;
    my ($shortest_ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info);
    my ($cum_ref_length, @cum_match, @cum_tst_cnt, @cum_ref_cnt, @cum_tst_info, @cum_ref_info);

    $cum_ref_length = 0;
    for (my $j=1; $j<=$max_Ngram; $j++) {
	$cum_match[$j] = $cum_tst_cnt[$j] = $cum_ref_cnt[$j] = $cum_tst_info[$j] = $cum_ref_info[$j] = 0;
    }
	
    foreach $doc (sort keys %eval_docs) {
	($shortest_ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info) = score_document ($sys, $doc);

#output document summary score	    
	printf "$method score = %.4f for system \"$sys\" on document \"$doc\" (%d segments, %d words)\n",
            $method eq "NEVA" ?  bleu_score($shortest_ref_length, $match_cnt, $tst_cnt) :
		nist_score (scalar @ref_sys, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info),
		scalar @{$tst_data{$sys}{$doc}{SEGS}}, $tst_cnt->[1]
		    if $detail >= 1;

	$cum_ref_length += $shortest_ref_length;
	for (my $j=1; $j<=$max_Ngram; $j++) {
	    $cum_match[$j] += $match_cnt->[$j];
	    $cum_tst_cnt[$j] += $tst_cnt->[$j];
	    $cum_ref_cnt[$j] += $ref_cnt->[$j];
	    $cum_tst_info[$j] += $tst_info->[$j];
	    $cum_ref_info[$j] += $ref_info->[$j];
	    printf "document info: $sys $doc %d-gram %d %d %d %9.4f %9.4f\n", $j, $match_cnt->[$j],
	        $tst_cnt->[$j], $ref_cnt->[$j], $tst_info->[$j], $ref_info->[$j]
		    if (defined $opt_x and $opt_x eq "document info");
	}
    }

#output system summary score
    printf "$method score = %.4f for system \"$sys\"\n",
        $method eq "NEVA" ?  bleu_score($cum_ref_length, \@cum_match, \@cum_tst_cnt) :
	    nist_score (scalar @ref_sys, \@cum_match, \@cum_tst_cnt, \@cum_ref_cnt, \@cum_tst_info, \@cum_ref_info);
}

#################################

sub score_document {

    my ($sys, $ref, $doc);
    ($sys, $doc) = @_;
    my ($shortest_ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info);
    my ($cum_ref_length, @cum_match, @cum_tst_cnt, @cum_ref_cnt, @cum_tst_info, @cum_ref_info);

    $cum_ref_length = 0;
    for (my $j=1; $j<=$max_Ngram; $j++) {
	$cum_match[$j] = $cum_tst_cnt[$j] = $cum_ref_cnt[$j] = $cum_tst_info[$j] = $cum_ref_info[$j] = 0;
    }
	
#score each segment
    for (my $jseg=0; $jseg<@{$tst_data{$sys}{$doc}{SEGS}}; $jseg++) {
	my @ref_segments = ();
	foreach $ref (@ref_sys) {
	    push @ref_segments, $ref_data{$ref}{$doc}{SEGS}[$jseg];
	    printf "ref '$ref', seg %d: %s\n", $jseg+1, $ref_data{$ref}{$doc}{SEGS}[$jseg]
		if $detail >= 3;
	}
	printf "sys '$sys', seg %d: %s\n", $jseg+1, $tst_data{$sys}{$doc}{SEGS}[$jseg]
	    if $detail >= 3;
	($shortest_ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info) =
	    score_segment ($tst_data{$sys}{$doc}{SEGS}[$jseg], @ref_segments);

#output segment summary score	    
	printf "$method score = %.4f for system \"$sys\" on segment %d of document \"$doc\" (%d words)\n",
            $method eq "NEVA" ?  bleu_score($shortest_ref_length, $match_cnt, $tst_cnt) :
		nist_score (scalar @ref_sys, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info),
		$jseg+1, $tst_cnt->[1]
		    if $detail >= 2;

	$cum_ref_length += $shortest_ref_length;
	for (my $j=1; $j<=$max_Ngram; $j++) {
	    $cum_match[$j] += $match_cnt->[$j];
	    $cum_tst_cnt[$j] += $tst_cnt->[$j];
	    $cum_ref_cnt[$j] += $ref_cnt->[$j];
	    $cum_tst_info[$j] += $tst_info->[$j];
	    $cum_ref_info[$j] += $ref_info->[$j];
	}
    }
    return ($cum_ref_length, [@cum_match], [@cum_tst_cnt], [@cum_ref_cnt], [@cum_tst_info], [@cum_ref_info]);
}

#################################

sub score_segment {

    my ($tst_seg, @ref_segs) = @_;
    my (@tst_wrds, %tst_ngrams, @match_count, @tst_count, @tst_info);
    my (@ref_wrds, $ref_seg, %ref_ngrams, %ref_ngrams_max, @ref_count, @ref_info);
    my ($ngram);
    my (@nwrds_ref);
    my $shortest_ref_length;

    for (my $j=1; $j<= $max_Ngram; $j++) {
	$match_count[$j] = $tst_count[$j] = $ref_count[$j] = $tst_info[$j] = $ref_info[$j] = 0;
    }

# get the ngram counts for the test segment
    @tst_wrds = split /\s+/, $tst_seg;
    %tst_ngrams = %{Words2Ngrams (@tst_wrds)};
    for (my $j=1; $j<=$max_Ngram; $j++) { # compute ngram counts
	$tst_count[$j]  = $j<=@tst_wrds ? (@tst_wrds - $j + 1) : 0;
    }

# get the ngram counts for the reference segments
    foreach $ref_seg (@ref_segs) {
	@ref_wrds = split /\s+/, $ref_seg;
	%ref_ngrams = %{Words2Ngrams (@ref_wrds)};
	foreach $ngram (keys %ref_ngrams) { # find the maximum # of occurrences
	    my @wrds = split / /, $ngram;
	    $ref_info[@wrds] += $ngram_info{$ngram};
	    $ref_ngrams_max{$ngram} = defined $ref_ngrams_max{$ngram} ?
		max ($ref_ngrams_max{$ngram}, $ref_ngrams{$ngram}) :
		    $ref_ngrams{$ngram};
	}
	for (my $j=1; $j<=$max_Ngram; $j++) { # update ngram counts
	    $ref_count[$j] += $j<=@ref_wrds ? (@ref_wrds - $j + 1) : 0;
	}
	$shortest_ref_length = scalar @ref_wrds # find the shortest reference segment
	    if (not defined $shortest_ref_length) or @ref_wrds < $shortest_ref_length;
    }

# accumulate scoring stats for tst_seg ngrams that match ref_seg ngrams
    foreach $ngram (keys %tst_ngrams) {
	next unless defined $ref_ngrams_max{$ngram};
	my @wrds = split / /, $ngram;
	$tst_info[@wrds] += $ngram_info{$ngram} * min($tst_ngrams{$ngram},$ref_ngrams_max{$ngram});
	$match_count[@wrds] += my $count = min($tst_ngrams{$ngram},$ref_ngrams_max{$ngram});
	printf "%.2f info for each of $count %d-grams = '%s'\n", $ngram_info{$ngram}, scalar @wrds, $ngram
	    if $detail >= 3;
    }

    return ($shortest_ref_length, [@match_count], [@tst_count], [@ref_count], [@tst_info], [@ref_info]);
}

#################################

sub bleu_score {

    my ($shortest_ref_length, $matching_ngrams, $tst_ngrams) = @_;

    my $beta = 0.01;
    my $score = 0;
#EF<    take care of segments with less than N words
    my $N = $max_Ngram;
    if ($max_Ngram > $tst_ngrams->[1]) {
        $N = $tst_ngrams->[1];
    }
#EF<    only return with 0 if there are no matching 1-grams
        return 0 if $matching_ngrams->[1] == 0;
    for (my $j=1; $j<=$N; $j++) {
#EF>    return 0 if $matching_ngrams->[$j] == 0;
#EF<    log(0) is not defined, add -1 instead
#       $score += log ($matching_ngrams->[$j]/$tst_ngrams->[$j]);
        if ($matching_ngrams->[$j] > 0) {
            $score += $matching_ngrams->[$j]/$tst_ngrams->[$j];
        }
#EF>
    }
#EF<    log(0) is not defined, add -1 instead
    my $len_score = 1;
    if ($tst_ngrams->[1] < $shortest_ref_length) {
        $len_score = exp(1 - $shortest_ref_length/$tst_ngrams->[1]);
    }
#    else {
#        $len_score = exp(1 - $tst_ngrams->[1]/$shortest_ref_length);
#    }
#EF>
    return $score/$N * $len_score;
}

#################################

sub nist_score {

    my ($nsys, $matching_ngrams, $tst_ngrams, $ref_ngrams, $tst_info, $ref_info) = @_;

    my $score = 0;
    for (my $n=1; $n<=$ngram_nist; $n++) {
	$score += $tst_info->[$n]/max($tst_ngrams->[$n],1);
    }
    $score *= nist_length_penalty($tst_ngrams->[1]/($ref_ngrams->[1]/$nsys));

    return $score;
}

#################################

sub Words2Ngrams { #convert a string of words to an Ngram count hash

    my %count = ();

    for (; @_; shift) {
	my ($j, $ngram, $word);
	for ($j=0; $j<$max_Ngram and defined($word=$_[$j]); $j++) {
	    $ngram .= defined $ngram ? " $word" : $word;
	    $count{$ngram}++;
	}
    }
    return {%count};
}

#################################

sub NormalizeText {
    my ($norm_text) = @_;

# language-independent part:
    $norm_text =~ s/<skipped>//g; # strip "skipped" tags
    $norm_text =~ s/-\n//g; # strip end-of-line hyphenation and join lines
    $norm_text =~ s/\n/ /g; # join lines
    $norm_text =~ s/(\d)\s+(\d)/$1$2/g; #join digits
    $norm_text =~ s/&quot;/"/g;  # convert SGML tag for quote to "
    $norm_text =~ s/&amp;/&/g;   # convert SGML tag for ampersand to &
    $norm_text =~ s/&lt;/</g;    # convert SGML tag for less-than to >
    $norm_text =~ s/&gt;/>/g;    # convert SGML tag for greater-than to <

# language-dependent part (assuming Western languages):
#EF added setlocale
    setlocale(LC_CTYPE, $locale);
    $norm_text = " $norm_text ";
#EF    $norm_text =~ tr/[A-Z]/[a-z]/ unless $preserve_case;
#EF changed to
    $norm_text = lc $norm_text unless $preserve_case;
    $norm_text =~ s/([\{-\~\[-\` -\&\(-\+\:-\@\/])/ $1 /g;   # tokenize punctuation
    $norm_text =~ s/([^0-9])([\.,])/$1 $2 /g; # tokenize period and comma unless preceded by a digit
    $norm_text =~ s/([\.,])([^0-9])/ $1 $2/g; # tokenize period and comma unless followed by a digit
    $norm_text =~ s/([0-9])(-)/$1 $2 /g; # tokenize dash when preceded by a digit
    $norm_text =~ s/\s+/ /g; # one space only between words
    $norm_text =~ s/^\s+//;  # no leading space
    $norm_text =~ s/\s+$//;  # no trailing space
#EF added setlocale
    setlocale(LC_CTYPE, $old_locale);

    return $norm_text;
}

#################################

sub nist_length_penalty {

    my ($ratio) = @_;
    return 1 if $ratio >= 1;
    return 0 if $ratio <= 0;
    my $ratio_x = 1.5;
    my $score_x = 0.5;
    my $beta = -log($score_x)/log($ratio_x)/log($ratio_x);
    return exp (-$beta*log($ratio)*log($ratio));
}

#################################

sub date_time_stamp {

    my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime();
    my @months = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
    my ($date, $time);

    $time = sprintf "%2.2d:%2.2d:%2.2d", $hour, $min, $sec;
    $date = sprintf "%4.4s %3.3s %s", 1900+$year, $months[$mon], $mday;
    return ($date, $time);
}

#################################

sub extract_sgml_tag_and_span {
    
    my ($name, $data) = @_;
    
    ($data =~ m|<$name\s*([^>]*)>(.*?)</$name\s*>(.*)|si) ? ($1, $2, $3) : ();
}

#################################

sub extract_sgml_tag_attribute {

    my ($name, $data) = @_;

    ($data =~ m|$name\s*=\s*\"([^\"]*)\"|si) ? ($1) : ();
}

#################################

sub max {

    my ($max, $next);

    return unless defined ($max=pop);
    while (defined ($next=pop)) {
	$max = $next if $next > $max;
    }
    return $max;
}

#################################

sub min {

    my ($min, $next);

    return unless defined ($min=pop);
    while (defined ($next=pop)) {
	$min = $next if $next < $min;
    }
    return $min;
}

#################################