//////////////////////////////////////////////////////////////////
//
//    FreeLing - Open Source Language Analyzers
//
//    Copyright (C) 2004   TALP Research Center
//                         Universitat Politecnica de Catalunya
//
//    This library is free software; you can redistribute it and/or
//    modify it under the terms of the GNU Lesser General Public
//    License as published by the Free Software Foundation; either
//    version 2.1 of the License, or (at your option) any later version.
//
//    This library is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
//    Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public
//    License along with this library; if not, write to the Free Software
//    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
//    contact: Lluis Padro (padro@lsi.upc.es)
//             TALP Research Center
//             despatx C6.212 - Campus Nord UPC
//             08034 Barcelona.  SPAIN
//
////////////////////////////////////////////////////////////////

//------------------------------------------------------------------//
//
//                    IMPORTANT NOTICE
//
//  This file contains a simple main program to illustrate
//  usage of the FreeLing analyzers library.
//
//  This sample main program may be used straightforwardly as
//  a basic front-end for the analyzers (e.g. to analyze corpora).
//
//  Nevertheless, if you want to embed the FreeLing libraries inside
//  a larger application, or you want to deal with other
//  input/output formats (e.g. XML), the efficient and elegant
//  way to do so is to consider this file as a mere example, and call
//  the library from your own main code.
//
//------------------------------------------------------------------//

// Standard and boost headers (list reconstructed from the code that uses them).
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <list>
#include <map>
#include <set>
#include <algorithm>
#include <iterator>
#include <boost/regex/icu.hpp>

#include "freeling.h"
#include "freeling/morfo/util.h"
#include "config.h"

using namespace std;

#undef MOD_TRACENAME
#define MOD_TRACENAME L"LKB_ANALYZER"

const wchar_t FORMFEED=0x0C;

// we use pointers to the analyzers, so we
// can create only those strictly necessary.
tokenizer *tk=NULL;
splitter *sp=NULL;
maco *morfo=NULL;
nec *neclass=NULL;
POS_tagger *tagger=NULL;

// FL configuration options
config *cfg=NULL;

// Variables and classes to hold transformation rules FL->SPPP
class SPPP_rule {
 public:
  boost::u32regex form1;
  boost::u32regex lemma;
  boost::u32regex tag;
  bool any_form, pn_form;
  bool any_lemma, pn_lemma;
  bool any_tag, pn_tag;
  wstring stem;
  wstring rule_id;
  wstring form2;

  SPPP_rule() {
    form1=boost::make_u32regex(L"");
    lemma=boost::make_u32regex(L"");
    tag=boost::make_u32regex(L"");
    any_form=true; any_lemma=true; any_tag=true;
    pn_form=true; pn_lemma=true; pn_tag=true;
  }
  ~SPPP_rule() {};
};

// Rule containers filled by read_SPPP_rules()
map<wstring,wstring> noTag;
list<SPPP_rule> rules;
map<wstring, list<analysis> > replaces;
list<list<wstring> > fusion;

//---------------------------------------------
// output a string both in cout and cerr
//---------------------------------------------
void say(const wstring &s) {
  wcout<<s<<endl;
  wcerr<<s<<endl;
}

//---------------------------------------------
// encode special chars to XML
// (body reconstructed as the inverse of fromXML below)
//---------------------------------------------
void toXML(wstring &s){
  util::find_and_replace(s, L"&", L"&amp;");
  util::find_and_replace(s, L"\"", L"&quot;");
  util::find_and_replace(s, L"<", L"&lt;");
  util::find_and_replace(s, L">", L"&gt;");
  util::find_and_replace(s, L"'", L"&apos;");
}

//---------------------------------------------
// decode special chars from XML
//---------------------------------------------
void fromXML(wstring &s){
  util::find_and_replace(s, L"&quot;", L"\"");
  util::find_and_replace(s, L"&lt;", L"<");
  util::find_and_replace(s, L"&gt;", L">");
  util::find_and_replace(s, L"&apos;", L"'");
  util::find_and_replace(s, L"&amp;", L"&");
}
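// Illustrative example (hypothetical rule and values): print_analysis() below
// matches each analysis of a word against the rules in 'rules', loaded later
// by read_SPPP_rules().  Assuming a rule read from a line such as
//
//      * * ^VMG => L T F
//
// (regex patterns on form, lemma and tag, then stem / rule id / output form),
// an analysis with lemma "cantar", tag "VMG0000" and form "cantando" would be
// output with stem "cantar" (code L = lemma), rule id "VMG0000" (code T = tag)
// and form "cantando" (code F = form), since its tag starts with VMG.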
//---------------------------------------------
// print one word analysis.
//---------------------------------------------
wstring print_analysis(const word::const_iterator wb, const word::const_iterator we,
                       const wstring &form, int pos, int posf, int start, int finish) {

  // NOTE: the XML element names emitted below are reconstructions; the
  // original markup was stripped from this copy of the file.  Adjust them
  // to whatever the consuming LKB/SPPP side expects.

  wstring result;
  wstring lcform = util::lowercase(form);

  word::const_iterator a;
  for (a=wb; a!=we; a++) {

    wstring tag = a->get_tag();
    wstring alemma = a->get_lemma();
    wstring clitics;
    toXML(alemma);

    if (a->is_retokenizable()) {
      // clitics
      list<word> rtk=a->get_retokenizable();
      // verb tag
      list<word>::iterator r=rtk.begin();
      tag = r->get_tag();
      r++;
      // clitics tags
      while (r!=rtk.end()) {
        clitics += L"+"+r->get_tag();
        r++;
      }
    }

    bool trobat = false;
    wstring stem=L"NO-RULE-FOUND";
    wstring rid=L"NO-RULE-FOUND";
    wstring frm=L"NO-RULE-FOUND";
    for (list<SPPP_rule>::iterator r=rules.begin(); r!=rules.end() and not trobat; r++) {

      if ( (r->any_form or boost::u32regex_search(lcform,r->form1)==r->pn_form) and
           (r->any_lemma or boost::u32regex_search(alemma,r->lemma)==r->pn_lemma) and
           (r->any_tag or boost::u32regex_search(tag,r->tag)==r->pn_tag) ) {

        // matching rule found, apply right hand side.
        trobat = true;

        // compute stem
        if (r->stem==L"L") stem=alemma;
        else if (r->stem==L"T") stem=tag;
        else if (r->stem==L"F") stem=lcform;
        else stem=r->stem;

        // compute rule_id
        if (r->rule_id==L"L") rid=alemma;
        else if (r->rule_id==L"T") rid=tag;
        else if (r->rule_id==L"F") rid=lcform;

        // compute form
        wstring f=L"";
        for (size_t i=0; i<r->form2.size(); i++) {
          if (r->form2[i]==L'L') f=f+L"#"+alemma;
          else if (r->form2[i]==L'T') f=f+L"#"+tag;
          else if (r->form2[i]==L'F') f=f+L"#"+form;
        }
        if (not f.empty()) frm=f.substr(1);
      }
    }

    result += L"  <token>\n";
    result += L"   <analysis>\n";
    result += L"    <form>"+form+L"</form>\n";
    result += L"    <from>"+util::int2wstring(start)+L"</from>\n";
    result += L"    <to>"+util::int2wstring(finish)+L"</to>\n";
    result += L"    <stem>"+stem+L"</stem>\n";
    result += L"    <rule_id>"+rid+L"</rule_id>\n";
    if (not clitics.empty()) result += L"    <clitics>"+clitics+L"</clitics>\n";
    result += L"   </analysis>\n";
    result += L"  </token>\n";
  }

  return result;
}

//---------------------------------------------
// check if the word matches some Fusion rule,
// and apply it if so.
//---------------------------------------------
void CheckFusion(word &w, bool tagged) {
  word::iterator wb,we,a;
  set<wstring> common;

  if (not tagged) {
    wb = w.analysis_begin();
    we = w.analysis_end();
  }
  else {
    wb = w.selected_begin();
    we = w.selected_end();
  }

  // check all fusion rules.
  list<list<wstring> >::iterator r;
  for (r=fusion.begin(); r!=fusion.end(); r++) {

    common.clear();  // clear set of common lemmas.

    // check rule
    bool ok=true;
    list<wstring>::iterator tagout=r->begin();          // first tag is the output
    list<wstring>::iterator tag1=r->begin(); tag1++;    // second tag is first condition
    list<wstring>::iterator tr;
    for (tr=tag1; tr!=r->end() and ok; tr++) {
      // build a set with lemmas for current rule tag
      set<wstring> lems;
      for (a=wb; a!=we; a++)
        if ((*tr)==a->get_tag()) lems.insert(a->get_lemma());

      if (tr==tag1)
        // first iteration, intersection so far is lems.
        common=lems;
      else {
        // further iterations, accumulate intersection.
        set<wstring> is;
        set_intersection(common.begin(), common.end(),
                         lems.begin(), lems.end(),
                         inserter(is,is.begin()));
        common=is;
      }
      // if accumulated intersection is empty, rule won't match.
      ok = not common.empty();
    }

    if (ok) {
      // rule matched. Apply it.
      // for each lemma matching rule tags
      for (set<wstring>::iterator lem=common.begin(); lem!=common.end(); lem++) {
        // Locate matching analyses, replacing the first
        // with the new tag and erasing the rest.
        bool done=false;
        for (a=wb; a!=we; a++) {
          bool found=false;
          for (tr=r->begin(), tr++; tr!=r->end() and not found; tr++)
            found = ((*tr)==a->get_tag() and (*lem)==a->get_lemma());

          if (found) {
            // tag and lemma match: retag the first match, delete the rest.
            if (not done) {
              // first matching analysis. just replace tag.
              a->set_tag(*tagout);
              done=true;
            }
            else {
              // not the first, delete.
              word::iterator a2=a; a2++;
              w.erase(a);
              a2--; a=a2;   // fix iteration control
            }
          }
        }
      }
    }
  }
}

//---------------------------------------------
// print obtained analysis for a word
//---------------------------------------------
wstring print_word (word &w, int pos, int posf, const wstring &currpos) {

  wstring wform=w.get_form();
  toXML(wform);
  wstring lcform=util::lowercase(wform);

  // if the word is in the 'replace' list, replace all its analyses.
  map<wstring, list<analysis> >::iterator p=replaces.find(lcform);
  if (p!=replaces.end()) w.set_analysis(p->second);

  // Assume OutputFormat=TAGGED. Output only selected analysis.
  bool tagged=true;
  word::iterator wb = w.selected_begin();
  word::iterator we = w.selected_end();

  map<wstring,wstring>::iterator nd=noTag.find(lcform);   // find form in noTag list
  // if not found, try searching any selected PoS tag.
  for (word::iterator a=wb; nd==noTag.end() and a!=we; a++)
    nd=noTag.find(a->get_tag());

  // if output is MORFO, or word/tag was in NoDisambiguate list
  // (and position matches), output all analyses.
  if (cfg->OutputFormat==MORFO or
      (nd!=noTag.end() and (nd->second==L"@any" or nd->second==currpos))) {
    wb = w.analysis_begin();
    we = w.analysis_end();
    tagged=false;
  }

  CheckFusion(w,tagged);

  return print_analysis(wb,we,wform,pos,posf,w.get_span_start(),w.get_span_finish());
}

//---------------------------------------------
// print obtained analysis.
//---------------------------------------------
void PrintResults(list<sentence> &ls, const wstring &text, int &nsent) {
  word::const_iterator ait;
  sentence::iterator w;
  sentence::iterator nxt;
  list<sentence>::iterator is;
  parse_tree tr;
  dep_tree dep;
  bool prevde=false;
  wstring output_analysis;

  for (is=ls.begin(); is!=ls.end(); is++,nsent++) {

    output_analysis.clear();   // reset per-sentence buffer

    size_t b=is->front().get_span_start();
    size_t e=is->back().get_span_finish();
    wstring txtsent= text.substr(b,e-b+1);
    toXML(txtsent);

    // NOTE: the XML wrapper tags originally emitted by the say() calls
    // below were lost from this copy of the file.
    say(L"");
    say(L" ");
    say(L" "+txtsent+L"");

    wstring currpos=L"@begin";
    int pos=0;
    int posf;
    for (w=is->begin(); w!=is->end(); w++) {

      if (w->is_ambiguous_mw()) {
        // output mw components
        list<word> mw=w->get_words_mw();
        posf=pos;
        for (list<word>::iterator iw=mw.begin(); iw!=mw.end(); iw++) {
          output_analysis += print_word(*iw,posf,posf+1,currpos);
          posf++;
          currpos=L"@any";
        }
      }
      else
        posf = pos+1;

      // output [multi]word normally
      output_analysis += print_word(*w,pos,posf,currpos);
      pos = posf;
      currpos=L"@any";
    }

    say(L" ");
    say(output_analysis);
    say(L" ");
    say(L" ");
    say(L"");
  }
}

//---------------------------------------------
// Plain text, start with tokenizer.
//---------------------------------------------
void ProcessPlain() {
  wstring text,line;
  list<word> av;
  list<word>::const_iterator i;
  list<sentence> ls;
  int nsent=1;
  bool head=false;

  text.clear();
  unsigned long offs=0;

  while (getline(wcin,line)) {

    wcerr<<line<<endl;   // trace the input line (original trace text was lost)

    if (not head) {
      // first input line must carry the XML declaration.
      // (literal reconstructed: it is 38 characters long, matching the erase() below)
      size_t p=line.find(L"<?xml version='1.0' encoding='utf-8'?>");
      if (p!=wstring::npos) {
        line.erase(p,38);
        say(L"<?xml version='1.0' encoding='utf-8'?>");
        head=true;
      }
      else
        ERROR_CRASH(L"ERROR - header expected");
    }

    if (line[0]==FORMFEED) {
      // process last sentence in buffer (if any)
      ls=sp->split(av, true);   //flush splitter buffer
      morfo->analyze(ls);
      if (cfg->OutputFormat>=TAGGED) tagger->analyze(ls);
      if (cfg->OutputFormat>=TAGGED && cfg->NEC_NEClassification) neclass->analyze(ls);

      PrintResults(ls,text,nsent);

      wcout<<FORMFEED<<endl;    // signal end of text (reconstructed)
      // reset buffers for the next text (reconstructed; original reset code was lost)
      av.clear(); ls.clear(); text.clear(); offs=0;
    }
    else {
      // remove markup from the input line.
      // (the tag literals removed here were lost from this copy, so the
      //  calls are left disabled)
      // util::find_and_replace(line, L"<...>", L"");
      // util::find_and_replace(line, L"</...>", L"");

      // translate XML special chars to regular ascii
      fromXML(line);

      text = text+line+L"\n";

      av=tk->tokenize(line,offs);
      ls=sp->split(av, cfg->AlwaysFlush);
      morfo->analyze(ls);
      if (cfg->OutputFormat>=TAGGED) tagger->analyze(ls);
      if (cfg->OutputFormat>=TAGGED && cfg->NEC_NEClassification) neclass->analyze(ls);

      PrintResults(ls,text,nsent);

      av.clear();   // clear list of words for next use
      ls.clear();   // clear list of sentences for next use
    }
  }
}

//---------------------------------------------
// Locate the rule file pointed to by the FREELINGSPPP
// environment variable (sppp.dat) and load the
// FreeLing->SPPP transformation rules.
//---------------------------------------------
void read_SPPP_rules() {
  wstring name;

  char* flsppp=getenv("FREELINGSPPP");
  if (flsppp==NULL){
    ERROR_CRASH(L"FREELINGSPPP is not defined. It should point to a file with rules tuning FreeLing output to meet the LKB grammar needs.");
  }
  wstring exp=util::string2wstring(flsppp);

  wifstream fitx;
  util::open_utf8_file(fitx,exp);
  if (fitx.fail()) ERROR_CRASH(L"Error opening rule file "+exp);

  wstring line;
  int reading=0;
  int read=0;
  while (getline(fitx,line)) {

    // Section markers.  NOTE: the literal section names were lost from this
    // copy; the names below are reconstructed from the handlers that follow.
    if (line == L"<NoDisambiguate>") reading=1;
    else if (line == L"</NoDisambiguate>") reading=0;
    else if (line == L"<ReplaceAll>") reading=2;
    else if (line == L"</ReplaceAll>") reading=0;
    else if (line == L"<Fusion>") reading=3;
    else if (line == L"</Fusion>") reading=0;
    else if (line == L"<Output>") reading=4;
    else if (line == L"</Output>") reading=0;

    else if (reading==1) {
      // reading NoDisambiguate section
      wistringstream sin(line);
      wstring form,at;
      sin>>form>>at;
      noTag.insert(make_pair(form,at));
    }
    else if (reading==2) {
      // whole analysis list replacements
      wistringstream sin(line);
      wstring form,al,at;
      sin>>form;
      list<analysis> la;
      while (sin>>al>>at) la.push_back(analysis(al,at));
      replaces.insert(make_pair(form,la));
    }
    else if (reading==3) {
      // fusion rules: condition tags, then "=>", then output tag
      wistringstream sin(line);
      list<wstring> rul;
      wstring tag;
      sin>>tag;
      while (tag!=L"=>") {
        rul.push_back(tag);
        sin>>tag;
      }
      // store last tag at the first place.
      sin>>tag;
      rul.push_front(tag);
      fusion.push_back(rul);
    }
    else if (reading==4) {
      // Read output field arrangements
      wistringstream sin(line);
      SPPP_rule r;   // new rule.
      wstring x;

      sin>>x;   /// get form
      if (x!=L"*") {
        r.any_form=false;
        if (x[0]==L'!') { r.pn_form=false; x = x.substr(1); }
        r.form1 = boost::make_u32regex(L"^"+x+L"$");
      }
      sin>>x;   /// get lemma
      if (x!=L"*") {
        r.any_lemma=false;
        if (x[0]==L'!') { r.pn_lemma=false; x = x.substr(1); }
        r.lemma = boost::make_u32regex(L"^"+x+L"$");
      }
      sin>>x;   /// get tag
      if (x!=L"*") {
        r.any_tag=false;
        if (x[0]==L'!') { r.pn_tag=false; x = x.substr(1); }
        r.tag = boost::make_u32regex(L"^"+x);
      }

      sin>>x;
      if (x!=L"=>") ERROR_CRASH(L"Expecting '=>' in rule read from sppp.dat");

      sin>>r.stem;
      sin>>r.rule_id;
      sin>>r.form2;
      // Rest of the line (if any) is ignored (comments).

      // add rule to rule list.
      rules.push_back(r);
    }

    if (reading!=0) read++;
  }

  if (read==0) ERROR_CRASH(L"Rule file "+exp+L" contains no rules.");
}

//---------------------------------------------
// Sample main program
//---------------------------------------------
int main(int argc, char **argv) {

  util::init_locale(L"default");

  /// load the FreeLing->SPPP transformation rules
  read_SPPP_rules();

  // read configuration file and command-line options
  cfg = new config(argc, argv);

  // create required analyzers
  tk = new tokenizer(cfg->TOK_TokenizerFile);
  sp = new splitter(cfg->SPLIT_SplitterFile);

  // the morfo class requires several options at creation time.
  // they are passed packed in a maco_options object.
  maco_options opt(cfg->Lang);

  // boolean options to activate/deactivate modules
  // default: all modules activated (options set to "true")
  opt.set_active_modules (cfg->MACO_UserMap,
                          cfg->MACO_AffixAnalysis,
                          cfg->MACO_MultiwordsDetection,
                          cfg->MACO_NumbersDetection,
                          cfg->MACO_PunctuationDetection,
                          cfg->MACO_DatesDetection,
                          cfg->MACO_QuantitiesDetection,
                          cfg->MACO_DictionarySearch,
                          cfg->MACO_ProbabilityAssignment,
                          cfg->MACO_NER_which,
                          false);

  // decimal/thousand separators used by number detection
  opt.set_nummerical_points(cfg->MACO_Decimal, cfg->MACO_Thousand);
  // Minimum probability for a tag of an unknown word
  opt.set_threshold(cfg->MACO_ProbabilityThreshold);

  // Data files for morphological submodules. By default set to "".
  // Only files for active modules have to be specified.
  opt.set_data_files (cfg->MACO_UserMapFile,
                      cfg->MACO_LocutionsFile,
                      cfg->MACO_QuantitiesFile,
                      cfg->MACO_AffixFile,
                      cfg->MACO_ProbabilityFile,
                      cfg->MACO_DictionaryFile,
                      cfg->MACO_NPDataFile,
                      cfg->MACO_PunctuationFile,
                      L"");

  // create analyzer with desired options
  morfo = new maco(opt);

  if (cfg->OutputFormat>=TAGGED) {
    if (cfg->TAGGER_which == HMM)
      tagger = new hmm_tagger(cfg->Lang,
                              cfg->TAGGER_HMMFile,
                              cfg->TAGGER_Retokenize,
                              cfg->TAGGER_ForceSelect);
    else if (cfg->TAGGER_which == RELAX)
      tagger = new relax_tagger(cfg->TAGGER_RelaxFile,
                                cfg->TAGGER_RelaxMaxIter,
                                cfg->TAGGER_RelaxScaleFactor,
                                cfg->TAGGER_RelaxEpsilon,
                                cfg->TAGGER_Retokenize,
                                cfg->TAGGER_ForceSelect);
  }

  if (cfg->OutputFormat>=TAGGED && cfg->NEC_NEClassification) {
    neclass = new nec(cfg->NEC_NECFile);
  }

  // Input is plain text.
  ProcessPlain();

  // clean up. Note that deleting a null pointer is a safe (yet useless) operation
  delete cfg;
  delete tk;
  delete sp;
  delete morfo;
  delete tagger;
  delete neclass;
}
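// Illustrative example (hypothetical entries): a minimal rule file of the kind
// read_SPPP_rules() above expects.  The section names are reconstructions (the
// original markers were lost from this copy) and every entry is made up; the
// line syntax follows the handlers for reading==1..4.
//
//   <NoDisambiguate>
//   que @any
//   NP00000 @begin
//   </NoDisambiguate>
//
//   <ReplaceAll>
//   del de SPS00
//   </ReplaceAll>
//
//   <Fusion>
//   TAGA TAGB => TAGOUT
//   </Fusion>
//
//   <Output>
//   * ser ^VS => L T F
//   * * * => L T F
//   </Output>
//
// NoDisambiguate entries list a form or tag plus a position ("@any", or
// "@begin" for sentence-initial words) for which all analyses are kept.
// ReplaceAll entries give a form followed by (lemma, tag) pairs that replace
// its analyses.  A Fusion rule collapses analyses of the condition tags that
// share a lemma into a single analysis with the output tag.  In an Output rule
// the three fields before "=>" are regex patterns on the lowercased form, the
// lemma and the tag ('*' matches anything, a leading '!' negates the pattern);
// the three fields after "=>" give the stem, the rule id and the output form,
// where the codes L, T and F expand to the analysis lemma, tag and form.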