//////////////////////////////////////////////////////////////////
//
//    FreeLing - Open Source Language Analyzers
//
//    Copyright (C) 2004   TALP Research Center
//                         Universitat Politecnica de Catalunya
//
//    This library is free software; you can redistribute it and/or
//    modify it under the terms of the GNU Lesser General Public
//    License as published by the Free Software Foundation; either
//    version 2.1 of the License, or (at your option) any later version.
//
//    This library is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
//    Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public
//    License along with this library; if not, write to the Free Software
//    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
//    contact: Lluis Padro (padro@lsi.upc.es)
//             TALP Research Center
//             despatx C6.212 - Campus Nord UPC
//             08034 Barcelona.  SPAIN
//
////////////////////////////////////////////////////////////////

//------------------------------------------------------------------//
//
//                    IMPORTANT NOTICE
//
//  This file contains a simple main program to illustrate
//  usage of the FreeLing analyzers library.
//
//  This sample main program may be used straightforwardly as
//  a basic front-end for the analyzers (e.g. to analyze corpora).
//
//  Nevertheless, if you want to embed the FreeLing libraries inside
//  a larger application, or you want to deal with other
//  input/output formats (e.g. XML), the efficient and elegant
//  way to do so is to consider this file as a mere example, and call
//  the library from your own main code.
//
//------------------------------------------------------------------//

// Standard and boost headers (list reconstructed from the code that uses them).
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <list>
#include <map>
#include <set>
#include <algorithm>
#include <iterator>
#include <boost/regex/icu.hpp>

#include "freeling.h"
#include "freeling/morfo/util.h"
#include "config.h"

using namespace std;

#undef MOD_TRACENAME
#define MOD_TRACENAME L"LKB_ANALYZER"

const wchar_t FORMFEED=0x0C;

// we use pointers to the analyzers, so we
// can create only those strictly necessary.
tokenizer *tk=NULL;
splitter *sp=NULL;
maco *morfo=NULL;
nec *neclass=NULL;
POS_tagger *tagger=NULL;

// FL configuration options
config *cfg=NULL;

// Variables and classes to hold transformation rules FL->SPPP
class SPPP_rule {
 public:
  boost::u32regex form1;
  boost::u32regex lemma;
  boost::u32regex tag;
  bool any_form, pn_form;
  bool any_lemma, pn_lemma;
  bool any_tag, pn_tag;
  wstring stem;
  wstring rule_id;
  wstring form2;

  SPPP_rule() {
    form1=boost::make_u32regex(L"");
    lemma=boost::make_u32regex(L"");
    tag=boost::make_u32regex(L"");
    any_form=true; any_lemma=true; any_tag=true;
    pn_form=true; pn_lemma=true; pn_tag=true;
  }
  ~SPPP_rule() {};
};

// Rule containers filled by read_SPPP_rules()
map<wstring,wstring> noTag;
list<SPPP_rule> rules;
map<wstring, list<analysis> > replaces;
list<list<wstring> > fusion;

//---------------------------------------------
// output a string both in cout and cerr
//---------------------------------------------
void say(const wstring &s) {
  wcout<<s<<endl;
  wcerr<<s<<endl;
}

//---------------------------------------------
// encode special chars to XML
// (body reconstructed as the inverse of fromXML below)
//---------------------------------------------
void toXML(wstring &s){
  util::find_and_replace(s, L"&", L"&amp;");
  util::find_and_replace(s, L"\"", L"&quot;");
  util::find_and_replace(s, L"<", L"&lt;");
  util::find_and_replace(s, L">", L"&gt;");
  util::find_and_replace(s, L"'", L"&apos;");
}

//---------------------------------------------
// decode special chars from XML
//---------------------------------------------
void fromXML(wstring &s){
  util::find_and_replace(s, L"&quot;", L"\"");
  util::find_and_replace(s, L"&lt;", L"<");
  util::find_and_replace(s, L"&gt;", L">");
  util::find_and_replace(s, L"&apos;", L"'");
  util::find_and_replace(s, L"&amp;", L"&");
}
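// Illustrative example (hypothetical rule and values): print_analysis() below
// matches each analysis of a word against the rules in 'rules', loaded later
// by read_SPPP_rules().  Assuming a rule read from a line such as
//
//      * * ^VMG => L T F
//
// (regex patterns on form, lemma and tag, then stem / rule id / output form),
// an analysis with lemma "cantar", tag "VMG0000" and form "cantando" would be
// output with stem "cantar" (code L = lemma), rule id "VMG0000" (code T = tag)
// and form "cantando" (code F = form), since its tag starts with VMG.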
//---------------------------------------------
// print one word analysis.
//---------------------------------------------
wstring print_analysis(const word::const_iterator wb, const word::const_iterator we,
                       const wstring &form, int pos, int posf, int start, int finish) {

  // NOTE: the XML element names emitted below are reconstructions; the
  // original markup was stripped from this copy of the file.  Adjust them
  // to whatever the consuming LKB/SPPP side expects.

  wstring result;
  wstring lcform = util::lowercase(form);

  word::const_iterator a;
  for (a=wb; a!=we; a++) {

    wstring tag = a->get_tag();
    wstring alemma = a->get_lemma();
    wstring clitics;
    toXML(alemma);

    if (a->is_retokenizable()) {
      // clitics
      list<word> rtk=a->get_retokenizable();
      // verb tag
      list<word>::iterator r=rtk.begin();
      tag = r->get_tag();
      r++;
      // clitics tags
      while (r!=rtk.end()) {
        clitics += L"+"+r->get_tag();
        r++;
      }
    }

    bool trobat = false;
    wstring stem=L"NO-RULE-FOUND";
    wstring rid=L"NO-RULE-FOUND";
    wstring frm=L"NO-RULE-FOUND";
    for (list<SPPP_rule>::iterator r=rules.begin(); r!=rules.end() and not trobat; r++) {

      if ( (r->any_form or boost::u32regex_search(lcform,r->form1)==r->pn_form) and
           (r->any_lemma or boost::u32regex_search(alemma,r->lemma)==r->pn_lemma) and
           (r->any_tag or boost::u32regex_search(tag,r->tag)==r->pn_tag) ) {

        // matching rule found, apply right hand side.
        trobat = true;

        // compute stem
        if (r->stem==L"L") stem=alemma;
        else if (r->stem==L"T") stem=tag;
        else if (r->stem==L"F") stem=lcform;
        else stem=r->stem;

        // compute rule_id
        if (r->rule_id==L"L") rid=alemma;
        else if (r->rule_id==L"T") rid=tag;
        else if (r->rule_id==L"F") rid=lcform;

        // compute form
        wstring f=L"";
        for (size_t i=0; i<r->form2.size(); i++) {
          if (r->form2[i]==L'L') f=f+L"#"+alemma;
          else if (r->form2[i]==L'T') f=f+L"#"+tag;
          else if (r->form2[i]==L'F') f=f+L"#"+form;
        }
        if (not f.empty()) frm=f.substr(1);
      }
    }

    result += L"  <token>\n";
    result += L"   <analysis>\n";
    result += L"    <form>"+form+L"</form>\n";
    result += L"    <from>"+util::int2wstring(start)+L"</from>\n";
    result += L"    <to>"+util::int2wstring(finish)+L"</to>\n";
    result += L"    <stem>"+stem+L"</stem>\n";
    result += L"    <rule_id>"+rid+L"</rule_id>\n";
    if (not clitics.empty()) result += L"    <clitics>"+clitics+L"</clitics>\n";
    result += L"   </analysis>\n";
    result += L"  </token>\n";
  }

  return result;
}

//---------------------------------------------
// check if the word matches some Fusion rule,
// and apply it if so.
//---------------------------------------------
void CheckFusion(word &w, bool tagged) {
  word::iterator wb,we,a;
  set<wstring> common;

  if (not tagged) {
    wb = w.analysis_begin();
    we = w.analysis_end();
  }
  else {
    wb = w.selected_begin();
    we = w.selected_end();
  }

  // check all fusion rules.
  list<list<wstring> >::iterator r;
  for (r=fusion.begin(); r!=fusion.end(); r++) {

    common.clear();  // clear set of common lemmas.

    // check rule
    bool ok=true;
    list<wstring>::iterator tagout=r->begin();          // first tag is the output
    list<wstring>::iterator tag1=r->begin(); tag1++;    // second tag is first condition
    list<wstring>::iterator tr;
    for (tr=tag1; tr!=r->end() and ok; tr++) {
      // build a set with lemmas for current rule tag
      set<wstring> lems;
      for (a=wb; a!=we; a++)
        if ((*tr)==a->get_tag()) lems.insert(a->get_lemma());

      if (tr==tag1)
        // first iteration, intersection so far is lems.
        common=lems;
      else {
        // further iterations, accumulate intersection.
        set<wstring> is;
        set_intersection(common.begin(), common.end(),
                         lems.begin(), lems.end(),
                         inserter(is,is.begin()));
        common=is;
      }
      // if accumulated intersection is empty, rule won't match.
      ok = not common.empty();
    }

    if (ok) {
      // rule matched. Apply it.
      // for each lemma matching rule tags
      for (set<wstring>::iterator lem=common.begin(); lem!=common.end(); lem++) {
        // Locate matching analyses, replacing the first
        // with the new tag and erasing the rest.
        bool done=false;
        for (a=wb; a!=we; a++) {
          bool found=false;
          for (tr=r->begin(), tr++; tr!=r->end() and not found; tr++)
            found = ((*tr)==a->get_tag() and (*lem)==a->get_lemma());

          if (found) {
            // tag and lemma match: retag the first match, delete the rest.
            if (not done) {
              // first matching analysis. just replace tag.
              a->set_tag(*tagout);
              done=true;
            }
            else {
              // not the first, delete.
              word::iterator a2=a; a2++;
              w.erase(a);
              a2--; a=a2;   // fix iteration control
            }
          }
        }
      }
    }
  }
}

//---------------------------------------------
// print obtained analysis for a word
//---------------------------------------------
wstring print_word (word &w, int pos, int posf, const wstring &currpos) {

  wstring wform=w.get_form();
  toXML(wform);
  wstring lcform=util::lowercase(wform);

  // if the word is in the 'replace' list, replace all its analyses.
  map<wstring, list<analysis> >::iterator p=replaces.find(lcform);
  if (p!=replaces.end()) w.set_analysis(p->second);

  // Assume OutputFormat=TAGGED. Output only selected analysis.
  bool tagged=true;
  word::iterator wb = w.selected_begin();
  word::iterator we = w.selected_end();

  map<wstring,wstring>::iterator nd=noTag.find(lcform);   // find form in noTag list
  // if not found, try searching any selected PoS tag.
  for (word::iterator a=wb; nd==noTag.end() and a!=we; a++)
    nd=noTag.find(a->get_tag());

  // if output is MORFO, or word/tag was in NoDisambiguate list
  // (and position matches), output all analyses.
  if (cfg->OutputFormat==MORFO or
      (nd!=noTag.end() and (nd->second==L"@any" or nd->second==currpos))) {
    wb = w.analysis_begin();
    we = w.analysis_end();
    tagged=false;
  }

  CheckFusion(w,tagged);

  return print_analysis(wb,we,wform,pos,posf,w.get_span_start(),w.get_span_finish());
}

//---------------------------------------------
// print obtained analysis.
//---------------------------------------------
void PrintResults(list<sentence> &ls, const wstring &text, int &nsent) {
  word::const_iterator ait;
  sentence::iterator w;
  sentence::iterator nxt;
  list<sentence>::iterator is;
  parse_tree tr;
  dep_tree dep;
  bool prevde=false;
  wstring output_analysis;

  for (is=ls.begin(); is!=ls.end(); is++,nsent++) {

    output_analysis.clear();   // reset per-sentence buffer

    size_t b=is->front().get_span_start();
    size_t e=is->back().get_span_finish();
    wstring txtsent= text.substr(b,e-b+1);
    toXML(txtsent);

    // NOTE: the XML wrapper tags originally emitted by the say() calls
    // below were lost from this copy of the file.
    say(L"");
    say(L" ");
    say(L" "+txtsent+L"");

    wstring currpos=L"@begin";
    int pos=0;
    int posf;
    for (w=is->begin(); w!=is->end(); w++) {

      if (w->is_ambiguous_mw()) {
        // output mw components
        list<word> mw=w->get_words_mw();
        posf=pos;
        for (list<word>::iterator iw=mw.begin(); iw!=mw.end(); iw++) {
          output_analysis += print_word(*iw,posf,posf+1,currpos);
          posf++;
          currpos=L"@any";
        }
      }
      else
        posf = pos+1;

      // output [multi]word normally
      output_analysis += print_word(*w,pos,posf,currpos);
      pos = posf;
      currpos=L"@any";
    }

    say(L" ");
    say(output_analysis);
    say(L" ");
    say(L" ");
    say(L"");
  }
}

//---------------------------------------------
// Plain text, start with tokenizer.
//---------------------------------------------
void ProcessPlain() {
  wstring text,line;
  list<word> av;
  list<word>::const_iterator i;
  list<sentence> ls;
  int nsent=1;
  bool head=false;

  text.clear();
  unsigned long offs=0;

  while (getline(wcin,line)) {

    wcerr<<line<<endl;   // trace the input line (original trace text was lost)

    if (not head) {
      // first input line must carry the XML declaration.
      // (literal reconstructed: it is 38 characters long, matching the erase() below)
      size_t p=line.find(L"<?xml version='1.0' encoding='utf-8'?>");
      if (p!=wstring::npos) {
        line.erase(p,38);
        say(L"<?xml version='1.0' encoding='utf-8'?>");
        head=true;
      }
      else
        ERROR_CRASH(L"ERROR - header expected");
    }

    if (line[0]==FORMFEED) {
      // process last sentence in buffer (if any)
      ls=sp->split(av, true);   //flush splitter buffer
      morfo->analyze(ls);
      if (cfg->OutputFormat>=TAGGED) tagger->analyze(ls);
      if (cfg->OutputFormat>=TAGGED && cfg->NEC_NEClassification) neclass->analyze(ls);

      PrintResults(ls,text,nsent);

      wcout<<FORMFEED<<endl;    // signal end of text (reconstructed)
      // reset buffers for the next text (reconstructed; original reset code was lost)
      av.clear(); ls.clear(); text.clear(); offs=0;
    }
    else {
      // remove markup from the input line.
      // (the tag literals removed here were lost from this copy, so the
      //  calls are left disabled)
      // util::find_and_replace(line, L"<...>", L"");
      // util::find_and_replace(line, L"</...>", L"");

      // translate XML special chars to regular ascii
      fromXML(line);

      text = text+line+L"\n";

      av=tk->tokenize(line,offs);
      ls=sp->split(av, cfg->AlwaysFlush);
      morfo->analyze(ls);
      if (cfg->OutputFormat>=TAGGED) tagger->analyze(ls);
      if (cfg->OutputFormat>=TAGGED && cfg->NEC_NEClassification) neclass->analyze(ls);

      PrintResults(ls,text,nsent);

      av.clear();   // clear list of words for next use
      ls.clear();   // clear list of sentences for next use
    }
  }
}

//---------------------------------------------
// Locate the rule file pointed to by the FREELINGSPPP
// environment variable (sppp.dat) and load the
// FreeLing->SPPP transformation rules.
//---------------------------------------------
void read_SPPP_rules() {
  wstring name;

  char* flsppp=getenv("FREELINGSPPP");
  if (flsppp==NULL){
    ERROR_CRASH(L"FREELINGSPPP is not defined. It should point to a file with rules tuning FreeLing output to meet the LKB grammar needs.");
  }
  wstring exp=util::string2wstring(flsppp);

  wifstream fitx;
  util::open_utf8_file(fitx,exp);
  if (fitx.fail()) ERROR_CRASH(L"Error opening rule file "+exp);

  wstring line;
  int reading=0;
  int read=0;
  while (getline(fitx,line)) {

    // Section markers.  NOTE: the literal section names were lost from this
    // copy; the names below are reconstructed from the handlers that follow.
    if (line == L"<NoDisambiguate>") reading=1;
    else if (line == L"</NoDisambiguate>") reading=0;
    else if (line == L"<ReplaceAll>") reading=2;
    else if (line == L"</ReplaceAll>") reading=0;
    else if (line == L"<Fusion>") reading=3;
    else if (line == L"</Fusion>") reading=0;
    else if (line == L"<Output>") reading=4;
    else if (line == L"</Output>") reading=0;

    else if (reading==1) {
      // reading NoDisambiguate section
      wistringstream sin(line);
      wstring form,at;
      sin>>form>>at;
      noTag.insert(make_pair(form,at));
    }
    else if (reading==2) {
      // whole analysis list replacements
      wistringstream sin(line);
      wstring form,al,at;
      sin>>form;
      list<analysis> la;
      while (sin>>al>>at) la.push_back(analysis(al,at));
      replaces.insert(make_pair(form,la));
    }
    else if (reading==3) {
      // fusion rules: condition tags, then "=>", then output tag
      wistringstream sin(line);
      list<wstring> rul;
      wstring tag;
      sin>>tag;
      while (tag!=L"=>") {
        rul.push_back(tag);
        sin>>tag;
      }
      // store last tag at the first place.
      sin>>tag;
      rul.push_front(tag);
      fusion.push_back(rul);
    }
    else if (reading==4) {
      // Read output field arrangements
      wistringstream sin(line);
      SPPP_rule r;   // new rule.
      wstring x;

      sin>>x;   /// get form
      if (x!=L"*") {
        r.any_form=false;
        if (x[0]==L'!') { r.pn_form=false; x = x.substr(1); }
        r.form1 = boost::make_u32regex(L"^"+x+L"$");
      }
      sin>>x;   /// get lemma
      if (x!=L"*") {
        r.any_lemma=false;
        if (x[0]==L'!') { r.pn_lemma=false; x = x.substr(1); }
        r.lemma = boost::make_u32regex(L"^"+x+L"$");
      }
      sin>>x;   /// get tag
      if (x!=L"*") {
        r.any_tag=false;
        if (x[0]==L'!') { r.pn_tag=false; x = x.substr(1); }
        r.tag = boost::make_u32regex(L"^"+x);
      }

      sin>>x;
      if (x!=L"=>") ERROR_CRASH(L"Expecting '=>' in rule read from sppp.dat");

      sin>>r.stem;
      sin>>r.rule_id;
      sin>>r.form2;
      // Rest of the line (if any) is ignored (comments).

      // add rule to rule list.
      rules.push_back(r);
    }

    if (reading!=0) read++;
  }

  if (read==0) ERROR_CRASH(L"Rule file "+exp+L" contains no rules.");
}

//---------------------------------------------
// Sample main program
//---------------------------------------------
int main(int argc, char **argv) {

  util::init_locale(L"default");

  /// load the FreeLing->SPPP transformation rules
  read_SPPP_rules();

  // read configuration file and command-line options
  cfg = new config(argc, argv);

  // create required analyzers
  tk = new tokenizer(cfg->TOK_TokenizerFile);
  sp = new splitter(cfg->SPLIT_SplitterFile);

  // the morfo class requires several options at creation time.
  // they are passed packed in a maco_options object.
  maco_options opt(cfg->Lang);

  // boolean options to activate/deactivate modules
  // default: all modules activated (options set to "true")
  opt.set_active_modules (cfg->MACO_UserMap,
                          cfg->MACO_AffixAnalysis,
                          cfg->MACO_MultiwordsDetection,
                          cfg->MACO_NumbersDetection,
                          cfg->MACO_PunctuationDetection,
                          cfg->MACO_DatesDetection,
                          cfg->MACO_QuantitiesDetection,
                          cfg->MACO_DictionarySearch,
                          cfg->MACO_ProbabilityAssignment,
                          cfg->MACO_NER_which,
                          false);

  // decimal/thousand separators used by number detection
  opt.set_nummerical_points(cfg->MACO_Decimal, cfg->MACO_Thousand);
  // Minimum probability for a tag of an unknown word
  opt.set_threshold(cfg->MACO_ProbabilityThreshold);

  // Data files for morphological submodules. By default set to "".
  // Only files for active modules have to be specified.
  opt.set_data_files (cfg->MACO_UserMapFile,
                      cfg->MACO_LocutionsFile,
                      cfg->MACO_QuantitiesFile,
                      cfg->MACO_AffixFile,
                      cfg->MACO_ProbabilityFile,
                      cfg->MACO_DictionaryFile,
                      cfg->MACO_NPDataFile,
                      cfg->MACO_PunctuationFile,
                      L"");

  // create analyzer with desired options
  morfo = new maco(opt);

  if (cfg->OutputFormat>=TAGGED) {
    if (cfg->TAGGER_which == HMM)
      tagger = new hmm_tagger(cfg->Lang,
                              cfg->TAGGER_HMMFile,
                              cfg->TAGGER_Retokenize,
                              cfg->TAGGER_ForceSelect);
    else if (cfg->TAGGER_which == RELAX)
      tagger = new relax_tagger(cfg->TAGGER_RelaxFile,
                                cfg->TAGGER_RelaxMaxIter,
                                cfg->TAGGER_RelaxScaleFactor,
                                cfg->TAGGER_RelaxEpsilon,
                                cfg->TAGGER_Retokenize,
                                cfg->TAGGER_ForceSelect);
  }

  if (cfg->OutputFormat>=TAGGED && cfg->NEC_NEClassification) {
    neclass = new nec(cfg->NEC_NECFile);
  }

  // Input is plain text.
  ProcessPlain();

  // clean up. Note that deleting a null pointer is a safe (yet useless) operation
  delete cfg;
  delete tk;
  delete sp;
  delete morfo;
  delete tagger;
  delete neclass;
}
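// Illustrative example (hypothetical entries): a minimal rule file of the kind
// read_SPPP_rules() above expects.  The section names are reconstructions (the
// original markers were lost from this copy) and every entry is made up; the
// line syntax follows the handlers for reading==1..4.
//
//   <NoDisambiguate>
//   que @any
//   NP00000 @begin
//   </NoDisambiguate>
//
//   <ReplaceAll>
//   del de SPS00
//   </ReplaceAll>
//
//   <Fusion>
//   TAGA TAGB => TAGOUT
//   </Fusion>
//
//   <Output>
//   * ser ^VS => L T F
//   * * * => L T F
//   </Output>
//
// NoDisambiguate entries list a form or tag plus a position ("@any", or
// "@begin" for sentence-initial words) for which all analyses are kept.
// ReplaceAll entries give a form followed by (lemma, tag) pairs that replace
// its analyses.  A Fusion rule collapses analyses of the condition tags that
// share a lemma into a single analysis with the output tag.  In an Output rule
// the three fields before "=>" are regex patterns on the lowercased form, the
// lemma and the tag ('*' matches anything, a leading '!' negates the pattern);
// the three fields after "=>" give the stem, the rule id and the output form,
// where the codes L, T and F expand to the analysis lemma, tag and form.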