#ifdef HAVE_CONFIG_H #include #endif #include #include #include #include #include #include #include "tRepp.h" #include "postag.h" #include "unicode.h" #include "grammar.h" #include "profile.h" #include "derivReader.h" #include "derivReader.cpp" using namespace std; enum Tokeniser {NONE, REPP, CHASEN, YY, PTB}; enum TagType {LETYPE, LETYPEMORPH}; TagType tagtype; void collectleaves(Grammar &g, tToken s, vector &ancestors, vector &leaves); void parse_options(int argc, char **argv, string *gfname, string *pname, Tokeniser *tok, string *tokfname, vector *calllist, bool *tnt, string *tntmodel, TagType *tagtype, tTokenFormat *format, bool *numopt, int *lim, bool *chasen, string *chasenrc, bool *goldonly, int *parsenum, int *itemno); vector splitOnSpace(string item); bool is_infl(string rule); vector extractFromPTB(string item, bool goldtags=false); bool is_num(string str); void align(vector &itok, vector &rtok, vector *tmap); double alignscore(UChar a, UChar b); int main (int argc, char **argv) { // setting option variables string gfname, pname, tokfname, tntmodel, chasenrc; Tokeniser tok; vector calllist; tTokenFormat format; bool tnt, numopt, chasen, goldonly; int lim, parsenum, itemno; parse_options(argc, argv, &gfname, &pname, &tok, &tokfname, &calllist, &tnt, &tntmodel, &tagtype, &format, &numopt, &lim, &chasen, &chasenrc, &goldonly, &parsenum, &itemno); // UTF-8 encoder initialize_encoding_converter("utf-8"); Grammar g(gfname); Profile p(pname); vector leaves; //used in collectleaves() DerivReader > reader(g, leaves, NULL, NULL, &collectleaves); pair,string> result = p.getResult(); int context = -1; // context is item id int event = -1; // event is parse id vector itokens; while (result.first.first >= 0) {//new item/context if (result.first.first != context) {//new item/context if (context != -1) { //finish last context }//finished last context } context = result.first.first; event = result.first.second; if (lim > 0 && p.getReadings(context) >= lim) { //skip items with result = p.getResult(); // greater than lim continue; // readings } if (itemno >= 0 && context != itemno) { //select single item result = p.getResult(); continue; } if (goldonly && (p.numGold(context) > 1 || !p.isGold(context, event))) { result = p.getResult(); continue; } if (parsenum >= 0 && event != parsenum) { result = p.getResult(); continue; } //do stuff with result leaves.clear(); cout << "ITEM: " << context << endl; reader.readDeriv(result.second); if (numopt) cout << context << " " << event << " "; for (int idx=0; idx < leaves.size(); ++idx) { //string tag("NULL\tNULL\tNULL"); string tag("NULL"); tag = leaves[idx].tag(); UnicodeString us = Conv->convert(leaves[idx].surface()); if (us.length() == 1 && u_ispunct(us.charAt(0))) { if ((u_charType(us.charAt(0)) == U_START_PUNCTUATION || us.compare(Conv->convert(string("“"))) == 0 || us.compare(Conv->convert(string("‘"))) == 0) ) { tag = "O:PUNCT"; } } //leaves[idx].print(format, pair(tag,1)); leaves[idx].print(format); if (idx != (leaves.size()-1)) cout << leaves[idx].separator(format); } cout << endl; result = p.getResult(); } //finish last context return 0; } void parse_options(int argc, char **argv, string *gfname, string *pname, Tokeniser *tok, string *tokfname, vector *calllist, bool *tnt, string *tntmodel, TagType *tagtype, tTokenFormat *format, bool *numopt, int *lim, bool *chasen, string *chasenrc, bool *goldonly, int *parsenum, int *itemno) { namespace po = boost::program_options; po::options_description visible("Options"); visible.add_options() ("help,h", "This usage information.") ("tok,t", po::value()->default_value("none"), "tokeniser: none, repp, chasen, yy, ptb (default: none)" " Tokeniser will be applied to the item string in the item file." " (YY and ChaSen unimplemented as yet. Let Rebecca know if you want them.)") ("rpp,r", po::value(tokfname), "tokenizer .rpp file") ("call,c", po::value< vector >(calllist)->composing(), "rpp calls, multiple options valid.") ("pos,p", "Use TnT POS tags.") ("model,m", po::value(tntmodel), "TnT model, defaults to WSJ model in LOGON tree.") ("jpos", "Use ChaSen POS tags.") ("jposrc", po::value(chasenrc), "Specify chasenrc file, defaults to LOGONROOT/.chasenrc") ("infl,i", "Tags include morphological inflection rules.") ("format", po::value()->default_value("TNT"), "token format:TNT,CANDC,CONLL,FSC (default: TNT)") ("num,n", "Output item and parse number.") ("goldonly,g", "Only extract tags from 'gold' trees") ("analysis,a", po::value(parsenum)->default_value(-1), "Select a specific analysis, default (-1): all.") ("single,s", po::value(itemno)->default_value(-1), "Select a specific item, default (-1): all.") ("limit,l", po::value(lim)->default_value(0), "Number of readings at which a context is ignored. " "Set to nbest to negate the effect of using a model during parsing.") ; po::options_description hidden("Hidden options"); hidden.add_options() ("grammar-file", po::value(gfname), "grammar .tdl file") ("profile", po::value(pname), "profile") ; po::options_description cmd_line ("Command line options"); cmd_line.add(visible).add(hidden); po::positional_options_description p; p.add("grammar-file",1).add("profile",1); po::variables_map vm; po::store(po::command_line_parser(argc, argv). options(cmd_line).positional(p).run(), vm); notify(vm); if (vm.count("help")) { cout << "Usage: " << argv[0] << " [options] " << "grammar-file profile" << endl; cout << visible << endl; exit(0); } if (!vm.count("grammar-file") || !vm.count("profile")) { cerr << "Insufficient arguments given." << endl; cerr << "Usage: " << argv[0] << " [options] " << "grammar-file profile" << endl; cerr << visible << endl; exit(1); } string t(vm["tok"].as()); if (t.compare("none") == 0) *tok= NONE; else if (t.compare("repp") == 0) *tok= REPP; else if (t.compare("chasen") == 0) *tok= CHASEN; else if (t.compare("yy") == 0) *tok= YY; else if (t.compare("ptb") == 0) *tok= PTB; else { cerr << "Warning: invalid tokeniser (--tok) " << t << " given. " << "Setting tokeniser to none" << endl; *tok= NONE; } if (*tok == REPP && !vm.count("rpp")) { cerr << "A repp file must be specified when using REPP tokenisation" << endl; cerr << "Usage: " << argv[0] << " [options] " << "grammar-file profile" << endl; cerr << visible << endl; exit(1); } if (vm.count("infl")) *tagtype = LETYPEMORPH; else *tagtype = LETYPE; string f(vm["format"].as()); if (f.compare("FSC") == 0) *format = FSC; else if (f.compare("CANDC") == 0) *format = CANDC; else if (f.compare("CONLL") == 0) *format = CONLL; else if (f.compare("TNT") == 0) *format = TNT; else { cerr << "Warning: Invalid format " << f << " given. " << "Setting format to TNT" << endl; *format = TNT; } if (vm.count("pos")) { *tnt = true; if (*format == TNT) cerr << "Warning: TnT output format and POS tag option set. " << endl << " When using TnT as a supertagger, do not use POS tags " << "in the training input." << endl; } else { *tnt = false; if (vm.count("model")) cerr << "Warning: TnT model given, but TnT option not set." << endl; } if (!vm.count("model")) *tntmodel = ""; if (vm.count("jpos")) { *chasen = true; } else { *chasen = false; if (vm.count("jposrc")) cerr << "Warning: chasenrc file given, but ChaSen option not set." << endl; } if (!vm.count("jposrc")) *chasenrc = ""; if (vm.count("num")) *numopt = true; else *numopt = false; if (vm.count("goldonly")) *goldonly = true; else *goldonly = false; } vector splitOnSpace(string item) { vector toks; int start=0; int end=0; end = item.find(' ', start); while (end != string::npos) { toks.push_back(tToken(item.substr(start, end-start))); start=end+1; end = item.find(' ', start); } toks.push_back(tToken(item.substr(start))); return toks; } bool is_infl(string rule) { if ( (rule.size() > 4 && rule.compare(rule.size()-4,4,"odlr") == 0) || (rule.size() > 3 && (rule.compare(rule.size()-3,3,"olr") == 0 || rule.compare(rule.size()-3,3,"ilr") == 0))) return true; if (rule.size() > 9 && rule.compare(rule.size()-9,9,"infl-rule") == 0) return true; return false; } void collectleaves(Grammar &g, tToken s, vector &ancestors, vector &leaves) { leaves.push_back(s); string tag(ancestors.back().surface()); if (tagtype == LETYPEMORPH) { for (int x=ancestors.size()-2; x >= 0 && g.is_lexrule(ancestors[x].surface()); x--) { //&& is_infl(ancestors[x].surface()); x--) { string infl = ancestors[x].surface(); if (infl.compare(infl.length()-3, 3,"plr") != 0 && (infl.length() < 5 || (infl.length() >=5 && infl.compare(0,5,"punct") != 0))) { tag+=":"; tag+=infl; } } } leaves.back().tag(tag); } class AlignNode { public: int data; double score; AlignNode *back; AlignNode(int d, double s, AlignNode *b):data(d),score(s),back(b){}; }; void align(vector &itok, vector &rtok, vector *tmap) { typedef pair > Slot; //create char map from rtok vector cmap; vector ctmap; cmap.push_back('^'); ctmap.push_back(-1); int tokcount = 0; for (vector::iterator ti = rtok.begin(); ti != rtok.end(); ++ti) { UnicodeString tmptok = Conv->convert((*ti).surface()); for (int x=0; x < tmptok.length(); ++x) { cmap.push_back(tmptok.charAt(x)); ctmap.push_back(tokcount); } tokcount++; } //foreach char in itok, find possible aligns, find possible pre-aligns, score tokcount = 0; vector slots; for (vector::iterator ti = itok.begin(); ti != itok.end(); ++ti) { UnicodeString tmptok = Conv->convert((*ti).surface()); for (int x=0; x < tmptok.length(); ++x) { Slot nextslot(tokcount,vector()); for (int y=1; y < cmap.size(); ++y) { double score = alignscore(tmptok.charAt(x), cmap[y]); if (score > 0) { AlignNode *alignment = new AlignNode(y,score,NULL); if (slots.size() == 0) { nextslot.second.push_back(alignment); } else { //work out path to here and adjust score for (vector::iterator si = slots.back().second.begin(); si != slots.back().second.end(); ++si) { if (abs((*si)->data) < y) { //possible predecessor if ( ((*si)->data >= 0 && ((*si)->score)+score >= alignment->score) || (*si)->score + score > alignment->score) { //best path seen so far alignment->score = (*si)->score + score; alignment->back = *si; } } } nextslot.second.push_back(alignment); } } } if (slots.size() == 0) { //align first char to null AlignNode *anode = new AlignNode(0,0, NULL); nextslot.second.push_back(anode); } else { //add null matches that record highest char in path so far map nullNodes; for (vector::iterator si = slots.back().second.begin(); si != slots.back().second.end(); ++si) { if ((*si)->data > 0) { AlignNode * anode = new AlignNode(-1*(*si)->data,(*si)->score,*si); nextslot.second.push_back(anode); nullNodes[-1*(*si)->data] = *si; } else { if (nullNodes.count((*si)->data)) { //already seen this null if ((*si)->score > nullNodes[(*si)->data]->score ) { nullNodes[(*si)->data]->score = (*si)->score; nullNodes[(*si)->data]->back = *si; } } else { AlignNode *anode = new AlignNode((*si)->data,(*si)->score,*si); nextslot.second.push_back(anode); } } } } slots.push_back(nextslot); } tokcount++; } //we have the best char to char mapping, now calculate tok to tok int currentitok; AlignNode *maxnode, *curr; double maxscore = 0; map rtoks; for (vector::reverse_iterator ri = slots.rbegin(); ri != slots.rend(); ++ri) { currentitok = ri->first; if (ri == slots.rbegin()) { for (vector::iterator si = (*ri).second.begin(); si != (*ri).second.end(); ++si) { if ((*si)->score > maxscore) { maxscore = (*si)->score; maxnode = *si; } curr = maxnode; } //found end of best path } if (curr->data > 0) { if (rtoks.count(ctmap[curr->data])) { rtoks[ctmap[curr->data]]++; } else rtoks[ctmap[curr->data]] = 1; } if (ri+1 == slots.rend() || (ri+1)->first != currentitok) {//end of input tok if (rtoks.size() == 1) { //only one matching rtok found (*tmap)[currentitok] = (*rtoks.begin()).first; } else { if (rtoks.size() > 1) { for (map::reverse_iterator mi=rtoks.rbegin(); mi != rtoks.rend(); ++mi) { UnicodeString tok = Conv->convert(rtok[mi->first].surface()); if (!(tok.length() == 1 && u_ispunct(tok.charAt(0)))) { (*tmap)[currentitok] = mi->first; break; } } } else { cerr << "couldn't match " << itok[currentitok].surface() << endl; (*tmap)[currentitok] = -1; } } rtoks.clear(); } curr = curr->back; } } double alignscore(UChar a, UChar b) { if (u_foldCase(a,U_FOLD_CASE_DEFAULT) == u_foldCase(b,U_FOLD_CASE_DEFAULT)) return 1; else { if (!u_isalpha(a) && u_charType(a) == u_charType(b)) return 0.5; else { if (u_ispunct(a) && u_ispunct(b)) return 0.1; else return 0; } } } vector extractFromPTB(string item, bool goldtags) { vector tokens; string rest; rest = item; while (!rest.empty()) { while (!rest.empty() && rest.at(0) == '(') rest.erase(0,1); string nonterm; while (!rest.empty() && !isspace(rest.at(0)) && rest.at(0) != '(') { nonterm += rest.at(0); rest.erase(0,1); } while (!rest.empty() && isspace(rest.at(0))) rest.erase(0,1); if (!rest.empty() && rest.at(0) != '(') { //leaf string surface; while (!rest.empty() && !isspace(rest.at(0)) && rest.at(0) != ')') { surface += rest.at(0); rest.erase(0,1); } if (nonterm.compare("-NONE-") != 0) { if (nonterm.compare("-LRB-") == 0) { surface = string("("); } if (nonterm.compare("-RRB-") == 0) { surface = string(")"); } if (surface.compare("``") == 0) surface = string("“"); if (surface.compare("''") == 0) surface = string("”"); if (surface.compare("`") == 0) surface = string("‘"); if (surface.compare("'") == 0) surface = string("’"); //replicating token mapping rules that split input //tokens, so as to have maximally split input vector surfaceparts; if (surface.length() >= 9 && (surface.compare("everytime") == 0 || surface.compare("Everytime") == 0)) { surfaceparts.push_back(surface.substr(0,5)); surfaceparts.push_back(surface.substr(5)); } else { int hyphenidx; if ((hyphenidx = surface.find('-')) != string::npos) { double num1, num2; string part1 = surface.substr(0, hyphenidx); string part2 = surface.substr(hyphenidx+1); if(part1.empty() || part2.empty()) surfaceparts.push_back(surface); else { if (is_num(part1) && is_num(part2)) { surfaceparts.push_back(part1); surfaceparts.push_back(string("–")); surfaceparts.push_back(part2); } else { surfaceparts.push_back(string(part1+"-")); if ((hyphenidx = part2.find('-')) == string::npos || hyphenidx == part2.length()-1) surfaceparts.push_back(part2); else { //multiple hyphens surfaceparts.push_back(part2.substr(0, hyphenidx+1)); part2.erase(0, hyphenidx+1); hyphenidx = part2.find('-', 0); while (hyphenidx != string::npos && hyphenidx != part2.length()-1) { surfaceparts.push_back(part2.substr(0, hyphenidx+1)); part2.erase(0, hyphenidx+1); hyphenidx = part2.find('-'); } surfaceparts.push_back(part2); } } } } } if (surfaceparts.empty()) surfaceparts.push_back(surface); for(vector::iterator it = surfaceparts.begin(); it != surfaceparts.end(); ++it) { tokens.push_back(tToken(*it)); tokens.back().id(tokens.size()-1); if (goldtags) { tokens.back().tag(nonterm); tokens.back().prob("1.00"); } tokens.back().start(tokens.size()-1); tokens.back().end(tokens.size()); } } while (!rest.empty() && rest.at(0) == ')') rest.erase(0,1); while (!rest.empty() && isspace(rest.at(0))) rest.erase(0,1); } } return tokens; } bool is_num(string str) { if (str.empty()) return false; if (str.at(0) == '+' || str.at(0) == '-') str.erase(0,1); if (str.empty() || !isdigit(str.at(0))) return false; else str.erase(0,1); while (!str.empty() && isdigit(str.at(0))) str.erase(0,1); if (str.empty()) return true; if (str.at(0) == '.') { str.erase(0,1); if (str.empty()) return true; if (!isdigit(str.at(0))) return false; else str.erase(0,1); while (!str.empty() && isdigit(str.at(0))) str.erase(0,1); } else { return false; } if (str.empty()) return true; else return false; }