#ifdef HAVE_CONFIG_H #include #endif #include #include #include #include #include #include #include "tRepp.h" #include "postag.h" #include "unicode.h" #include "grammar.h" #include "profile.h" #include "derivReader.h" #include "derivReader.cpp" using namespace std; enum Tokeniser {NONE, REPP, CHASEN, YY, PTB, INPUT}; enum TagType {LETYPE, LETYPEMORPH}; TagType tagtype; void removeWS(string &rest); void collectleaves(Grammar &g, tToken s, vector &ancestors, vector &leaves); void parse_options(int argc, char **argv, string *gfname, string *pname, Tokeniser *tok, string *tokfname, vector *calllist, bool *tnt, string *tntmodel, TagType *tagtype, tTokenFormat *format, bool *numopt, int *lim, bool *chasen, string *chasenrc, bool *goldonly, int *parsenum); vector splitOnSpace(string item); bool is_infl(string rule); vector extractFromPTB(string item, bool goldtags=false); bool is_num(string str); void align(vector &itok, vector &rtok, vector *tmap); double alignscore(UChar a, UChar b); void alignbyid(vector &itok, vector &rtok, vector *tmap); void parseChar(char x, string &rest); void parseDummyField(string &rest); int parseNum(string &rest); vector readYY(string item); tToken readYYtok(string &rest); string parseString(string &tok); int main (int argc, char **argv) { // setting option variables string gfname, pname, tokfname, tntmodel, chasenrc; Tokeniser tok; vector calllist; tTokenFormat format; bool tnt, numopt, chasen, goldonly; int lim, parsenum; parse_options(argc, argv, &gfname, &pname, &tok, &tokfname, &calllist, &tnt, &tntmodel, &tagtype, &format, &numopt, &lim, &chasen, &chasenrc, &goldonly, &parsenum); // UTF-8 encoder initialize_encoding_converter("utf-8"); tRepp *tokrepp; if (tok == REPP) { boost::filesystem::path tokenizer(tokfname); #if BOOST_FILESYSTEM_VERSION >= 3 tokrepp = new tRepp(tokenizer.parent_path().string(), tokenizer.stem().string()); #else tokrepp = new tRepp(tokenizer.parent_path().string(), tokenizer.stem()); #endif } Grammar g(gfname); Profile p(pname); vector leaves; //used in collectleaves() DerivReader > reader(g, leaves, NULL, NULL, &collectleaves); pair,string> result = p.getResult(); int context = -1; // context is item id int event = -1; // event is parse id vector itokens; while (result.first.first >= 0) {//new item/context if (result.first.first != context) {//new item/context if (context != -1) { //finish last context }//finished last context switch (tok) { case REPP: itokens = tokrepp->tokenize(p.getItem(result.first.first), calllist); break; case CHASEN: break; case YY: // itokens = readYY(p.getItem(result.first.first)); break; case PTB: itokens = extractFromPTB(p.getItem(result.first.first)); break; case INPUT: itokens = readYY(p.getInput(result.first.first)); break; case NONE: itokens = splitOnSpace(p.getItem(result.first.first)); break; } if(tnt) { tnttag(&itokens,tntmodel); } if(chasen) { chasentag(&itokens,chasenrc); } } context = result.first.first; event = result.first.second; if (lim > 0 && p.getReadings(context) >= lim) { //skip items with result = p.getResult(); // greater than lim continue; // readings } if (goldonly && (p.numGold(context) > 1 || !p.isGold(context, event))) { result = p.getResult(); continue; } if (parsenum >= 0 && event != parsenum) { result = p.getResult(); continue; } //do stuff with result leaves.clear(); reader.readDeriv(result.second); vectortokmap(itokens.size()); if (tok == INPUT) { alignbyid(itokens, leaves, &tokmap); } else { align(itokens, leaves, &tokmap); } if (numopt) cout << context << " " << event << itokens[0].separator(format); if (format == FSC) { stringstream ss; ss << context; string contextstring = ss.str(); string itemstring = p.getItem(context); print_header(format, contextstring, itemstring, itokens.size()); } for (int idx=0; idx < itokens.size(); ++idx) { //string tag("NULL\tNULL\tNULL"); string tag("NULL"); if (tokmap[idx] != -1) { tag = leaves[tokmap[idx]].tag(); UnicodeString us = Conv->convert(itokens[idx].surface()); if (us.length() == 1 && u_ispunct(us.charAt(0))) { if ((u_charType(us.charAt(0)) == U_START_PUNCTUATION || us.compare(Conv->convert(string("“"))) == 0 || us.compare(Conv->convert(string("‘"))) == 0) && idx+1 < itokens.size() && tokmap[idx] == tokmap[idx+1]) { tag = "O:PUNCT"; } else { if (idx > 0 && tokmap[idx-1] != -1 && leaves[tokmap[idx-1]].tag().compare(tag) == 0) tag = "O:PUNCT"; } } UnicodeString lasttok; if (idx > 0) lasttok = Conv->convert(itokens[idx-1].surface()); if (format == CONLL && tag.compare("O:PUNCT") != 0) { if (idx > 0 && tokmap[idx] == tokmap[idx-1] && (lasttok.length() > 1 || !u_ispunct(lasttok.charAt(0))) ) //multi-token le tag = string("I:") + tag; else tag = string("B:") + tag; } } itokens[idx].print(format, pair(tag,1)); if (idx != (itokens.size()-1)) cout << itokens[idx].separator(format); } print_footer(format); result = p.getResult(); } //finish last context return 0; } void parse_options(int argc, char **argv, string *gfname, string *pname, Tokeniser *tok, string *tokfname, vector *calllist, bool *tnt, string *tntmodel, TagType *tagtype, tTokenFormat *format, bool *numopt, int *lim, bool *chasen, string *chasenrc, bool *goldonly, int *parsenum) { namespace po = boost::program_options; po::options_description visible("Options"); visible.add_options() ("help,h", "This usage information.") ("tok,t", po::value()->default_value("none"), "tokeniser: none, repp, chasen, yy, ptb, input (default: none)" " Tokeniser will be applied to the item string in the item file." " (YY and ChaSen unimplemented as yet. Let Rebecca know if you want them.)") ("rpp,r", po::value(tokfname), "tokenizer .rpp file") ("call,c", po::value< vector >(calllist)->composing(), "rpp calls, multiple options valid.") ("pos,p", "Use TnT POS tags.") ("model,m", po::value(tntmodel), "TnT model, defaults to WSJ model in LOGON tree.") ("jpos", "Use ChaSen POS tags.") ("jposrc", po::value(chasenrc), "Specify chasenrc file, defaults to LOGONROOT/.chasenrc") ("infl,i", "Tags include morphological inflection rules.") ("format", po::value()->default_value("TNT"), "token format:TNT,CANDC,CONLL,FSC (default: TNT)") ("num,n", "Output item and parse number.") ("goldonly,g", "Only extract tags from 'gold' trees") ("analysis,a", po::value(parsenum)->default_value(-1), "Select a specific analysis, default (-1): all.") ("limit,l", po::value(lim)->default_value(0), "Number of readings at which a context is ignored. " "Set to nbest to negate the effect of using a model during parsing.") ; po::options_description hidden("Hidden options"); hidden.add_options() ("grammar-file", po::value(gfname), "grammar .tdl file") ("profile", po::value(pname), "profile") ; po::options_description cmd_line ("Command line options"); cmd_line.add(visible).add(hidden); po::positional_options_description p; p.add("grammar-file",1).add("profile",1); po::variables_map vm; po::store(po::command_line_parser(argc, argv). options(cmd_line).positional(p).run(), vm); notify(vm); if (vm.count("help")) { cout << "Usage: " << argv[0] << " [options] " << "grammar-file profile" << endl; cout << visible << endl; exit(0); } if (!vm.count("grammar-file") || !vm.count("profile")) { cerr << "Insufficient arguments given." << endl; cerr << "Usage: " << argv[0] << " [options] " << "grammar-file profile" << endl; cerr << visible << endl; exit(1); } string t(vm["tok"].as()); if (t.compare("none") == 0) *tok= NONE; else if (t.compare("repp") == 0) *tok= REPP; else if (t.compare("chasen") == 0) *tok= CHASEN; else if (t.compare("yy") == 0) *tok= YY; else if (t.compare("ptb") == 0) *tok= PTB; else if (t.compare("input") == 0) *tok = INPUT; else { cerr << "Warning: invalid tokeniser (--tok) " << t << " given. " << "Setting tokeniser to none" << endl; *tok= NONE; } if (*tok == REPP && !vm.count("rpp")) { cerr << "A repp file must be specified when using REPP tokenisation" << endl; cerr << "Usage: " << argv[0] << " [options] " << "grammar-file profile" << endl; cerr << visible << endl; exit(1); } if (vm.count("infl")) *tagtype = LETYPEMORPH; else *tagtype = LETYPE; string f(vm["format"].as()); if (f.compare("FSC") == 0) *format = FSC; else if (f.compare("CANDC") == 0) *format = CANDC; else if (f.compare("CONLL") == 0) *format = CONLL; else if (f.compare("TNT") == 0) *format = TNT; else { cerr << "Warning: Invalid format " << f << " given. " << "Setting format to TNT" << endl; *format = TNT; } if (vm.count("pos")) { *tnt = true; if (*format == TNT) cerr << "Warning: TnT output format and POS tag option set. " << endl << " When using TnT as a supertagger, do not use POS tags " << "in the training input." << endl; } else { *tnt = false; if (vm.count("model")) cerr << "Warning: TnT model given, but TnT option not set." << endl; } if (!vm.count("model")) *tntmodel = ""; if (vm.count("jpos")) { *chasen = true; } else { *chasen = false; if (vm.count("jposrc")) cerr << "Warning: chasenrc file given, but ChaSen option not set." << endl; } if (!vm.count("jposrc")) *chasenrc = ""; if (vm.count("num")) *numopt = true; else *numopt = false; if (vm.count("goldonly")) *goldonly = true; else *goldonly = false; } vector splitOnSpace(string item) { vector toks; int start=0; int end=0; end = item.find(' ', start); while (end != string::npos) { toks.push_back(tToken(item.substr(start, end-start))); start=end+1; end = item.find(' ', start); } toks.push_back(tToken(item.substr(start))); return toks; } vector readYY(string item) { string rest = item; vector toks; while (!rest.empty()) { tToken newtok = readYYtok(rest); toks.push_back(newtok); } return toks; } tToken readYYtok(string &rest) { removeWS(rest); if (rest.at(0) != '(') { cerr << "not a YY token: " << rest << endl; exit(1); } rest.erase(0,1); int id = parseNum(rest); //id parseChar(',', rest); int start = parseNum(rest); //start parseChar(',', rest); int end = parseNum(rest); //end parseChar(',', rest); int sstart = -1; int send = -1; if (rest.at(0) == '<') { //link rest.erase(0,1); sstart = parseNum(rest); parseChar(':', rest); send = parseNum(rest); parseChar('>', rest); parseChar(',', rest); } parseDummyField(rest); //path parseChar(',', rest); string surface = parseString(rest); //form removeWS(rest); tToken newtok(surface); if (sstart != -1 && send != -1) { newtok.start(sstart); newtok.end(send); } else { newtok.start(start); newtok.end(end); } newtok.id(id); if (rest.at(0) == '"') { parseDummyField(rest); //surface } parseChar(',', rest); parseDummyField(rest); //ipos parseChar(',', rest); parseDummyField(rest); //lrule if (rest.at(0) == ',') { //pos tags parseChar(',', rest); while (!rest.empty() && rest.at(0) != ')') { string prob; string tag = parseString(rest); removeWS(rest); while (!rest.empty() && !isspace(rest.at(0)) && rest.at(0) != ')') { prob += rest.at(0); rest.erase(0,1); } newtok.tag(tag); newtok.prob(prob); } } parseChar(')', rest); removeWS(rest); return newtok; } int parseNum(string &rest) { int i; string numstring; removeWS(rest); if (rest.at(0) == '-') { numstring += rest.at(0); rest.erase(0,1); } while (!rest.empty() && isdigit(rest.at(0))) { numstring += rest.at(0); rest.erase(0,1); } istringstream numstream(numstring); numstream >> i; removeWS(rest); return i; } void parseDummyField(string &rest) { while (!rest.empty() && rest.at(0) != ',' && rest.at(0) != ')') rest.erase(0,1); } bool is_infl(string rule) { if ( (rule.size() > 4 && rule.compare(rule.size()-4,4,"odlr") == 0) || (rule.size() > 3 && (rule.compare(rule.size()-3,3,"olr") == 0 || rule.compare(rule.size()-3,3,"ilr") == 0))) return true; if (rule.size() > 9 && rule.compare(rule.size()-9,9,"infl-rule") == 0) return true; return false; } void collectleaves(Grammar &g, tToken s, vector &ancestors, vector &leaves) { leaves.push_back(s); string tag(ancestors.back().surface()); if (tagtype == LETYPEMORPH) { for (int x=ancestors.size()-2; x >= 0 && g.is_lexrule(ancestors[x].surface()); x--) { //&& is_infl(ancestors[x].surface()); x--) { string infl = ancestors[x].surface(); if (infl.compare(infl.length()-3, 3,"plr") != 0 && (infl.length() < 5 || (infl.length() >=5 && infl.compare(0,5,"punct") != 0))) { tag+=":"; tag+=infl; } } } leaves.back().tag(tag); } void alignbyid(vector &itok, vector &rtok, vector *tmap) { for (int i = 0; i < itok.size(); ++i) { for (int j = 0; j < rtok.size(); ++j) { vector tids = rtok[j].tids(); for (vector::iterator it = tids.begin(); it != tids.end(); ++it){ if (itok[i].id() == *it) (*tmap)[i] = j; } } } } class AlignNode { public: int data; double score; AlignNode *back; AlignNode(int d, double s, AlignNode *b):data(d),score(s),back(b){}; }; void align(vector &itok, vector &rtok, vector *tmap) { typedef pair > Slot; //create char map from rtok vector cmap; vector ctmap; cmap.push_back('^'); ctmap.push_back(-1); int tokcount = 0; for (vector::iterator ti = rtok.begin(); ti != rtok.end(); ++ti) { UnicodeString tmptok = Conv->convert((*ti).surface()); for (int x=0; x < tmptok.length(); ++x) { cmap.push_back(tmptok.charAt(x)); ctmap.push_back(tokcount); } tokcount++; } //foreach char in itok, find possible aligns, find possible pre-aligns, score tokcount = 0; vector slots; for (vector::iterator ti = itok.begin(); ti != itok.end(); ++ti) { UnicodeString tmptok = Conv->convert((*ti).surface()); for (int x=0; x < tmptok.length(); ++x) { Slot nextslot(tokcount,vector()); for (int y=1; y < cmap.size(); ++y) { double score = alignscore(tmptok.charAt(x), cmap[y]); if (score > 0) { AlignNode *alignment = new AlignNode(y,score,NULL); if (slots.size() == 0) { nextslot.second.push_back(alignment); } else { //work out path to here and adjust score for (vector::iterator si = slots.back().second.begin(); si != slots.back().second.end(); ++si) { if (abs((*si)->data) < y) { //possible predecessor if ( ((*si)->data >= 0 && ((*si)->score)+score >= alignment->score) || (*si)->score + score > alignment->score) { //best path seen so far alignment->score = (*si)->score + score; alignment->back = *si; } } } nextslot.second.push_back(alignment); } } } if (slots.size() == 0) { //align first char to null AlignNode *anode = new AlignNode(0,0, NULL); nextslot.second.push_back(anode); } else { //add null matches that record highest char in path so far map nullNodes; for (vector::iterator si = slots.back().second.begin(); si != slots.back().second.end(); ++si) { if ((*si)->data > 0) { AlignNode * anode = new AlignNode(-1*(*si)->data,(*si)->score,*si); nextslot.second.push_back(anode); nullNodes[-1*(*si)->data] = *si; } else { if (nullNodes.count((*si)->data)) { //already seen this null if ((*si)->score > nullNodes[(*si)->data]->score ) { nullNodes[(*si)->data]->score = (*si)->score; nullNodes[(*si)->data]->back = *si; } } else { AlignNode *anode = new AlignNode((*si)->data,(*si)->score,*si); nextslot.second.push_back(anode); } } } } slots.push_back(nextslot); } tokcount++; } //we have the best char to char mapping, now calculate tok to tok int currentitok; AlignNode *maxnode, *curr; double maxscore = 0; map rtoks; for (vector::reverse_iterator ri = slots.rbegin(); ri != slots.rend(); ++ri) { currentitok = ri->first; if (ri == slots.rbegin()) { for (vector::iterator si = (*ri).second.begin(); si != (*ri).second.end(); ++si) { if ((*si)->score > maxscore) { maxscore = (*si)->score; maxnode = *si; } curr = maxnode; } //found end of best path } if (curr->data > 0) { if (rtoks.count(ctmap[curr->data])) { rtoks[ctmap[curr->data]]++; } else rtoks[ctmap[curr->data]] = 1; } if (ri+1 == slots.rend() || (ri+1)->first != currentitok) {//end of input tok if (rtoks.size() == 1) { //only one matching rtok found (*tmap)[currentitok] = (*rtoks.begin()).first; } else { if (rtoks.size() > 1) { for (map::reverse_iterator mi=rtoks.rbegin(); mi != rtoks.rend(); ++mi) { UnicodeString tok = Conv->convert(rtok[mi->first].surface()); if (!(tok.length() == 1 && u_ispunct(tok.charAt(0)))) { (*tmap)[currentitok] = mi->first; break; } } } else { cerr << "couldn't match " << itok[currentitok].surface() << endl; (*tmap)[currentitok] = -1; } } rtoks.clear(); } curr = curr->back; } } double alignscore(UChar a, UChar b) { if (u_foldCase(a,U_FOLD_CASE_DEFAULT) == u_foldCase(b,U_FOLD_CASE_DEFAULT)) return 1; else { if (!u_isalpha(a) && u_charType(a) == u_charType(b)) return 0.5; else { if (u_ispunct(a) && u_ispunct(b)) return 0.1; else return 0; } } } vector extractFromPTB(string item, bool goldtags) { vector tokens; string rest; rest = item; while (!rest.empty()) { while (!rest.empty() && rest.at(0) == '(') rest.erase(0,1); string nonterm; while (!rest.empty() && !isspace(rest.at(0)) && rest.at(0) != '(') { nonterm += rest.at(0); rest.erase(0,1); } while (!rest.empty() && isspace(rest.at(0))) rest.erase(0,1); if (!rest.empty() && rest.at(0) != '(') { //leaf string surface; while (!rest.empty() && !isspace(rest.at(0)) && rest.at(0) != ')') { surface += rest.at(0); rest.erase(0,1); } if (nonterm.compare("-NONE-") != 0) { if (nonterm.compare("-LRB-") == 0) { surface = string("("); } if (nonterm.compare("-RRB-") == 0) { surface = string(")"); } if (surface.compare("``") == 0) surface = string("“"); if (surface.compare("''") == 0) surface = string("”"); if (surface.compare("`") == 0) surface = string("‘"); if (surface.compare("'") == 0) surface = string("’"); // //replicating token mapping rules that split input // //tokens, so as to have maximally split input vector surfaceparts; // if (surface.length() >= 9 && // (surface.compare("everytime") == 0 // || surface.compare("Everytime") == 0)) { // surfaceparts.push_back(surface.substr(0,5)); // surfaceparts.push_back(surface.substr(5)); // } else { // int hyphenidx; // if ((hyphenidx = surface.find('-')) != string::npos) { // double num1, num2; // string part1 = surface.substr(0, hyphenidx); // string part2 = surface.substr(hyphenidx+1); // if(part1.empty() || part2.empty()) // surfaceparts.push_back(surface); // else { // if (is_num(part1) && is_num(part2)) { // surfaceparts.push_back(part1); // surfaceparts.push_back(string("–")); // surfaceparts.push_back(part2); // } else { // surfaceparts.push_back(string(part1+"-")); // if ((hyphenidx = part2.find('-')) // == string::npos || hyphenidx == part2.length()-1) // surfaceparts.push_back(part2); // else { //multiple hyphens // surfaceparts.push_back(part2.substr(0, hyphenidx+1)); // part2.erase(0, hyphenidx+1); // hyphenidx = part2.find('-', 0); // while (hyphenidx != string::npos // && hyphenidx != part2.length()-1) { // surfaceparts.push_back(part2.substr(0, hyphenidx+1)); // part2.erase(0, hyphenidx+1); // hyphenidx = part2.find('-'); // } // surfaceparts.push_back(part2); // } // } // } // } // } if (surfaceparts.empty()) surfaceparts.push_back(surface); for(vector::iterator it = surfaceparts.begin(); it != surfaceparts.end(); ++it) { tokens.push_back(tToken(*it)); tokens.back().id(tokens.size()-1); if (goldtags) { tokens.back().tag(nonterm); tokens.back().prob("1.00"); } tokens.back().start(tokens.size()-1); tokens.back().end(tokens.size()); } } while (!rest.empty() && rest.at(0) == ')') rest.erase(0,1); while (!rest.empty() && isspace(rest.at(0))) rest.erase(0,1); } } return tokens; } bool is_num(string str) { if (str.empty()) return false; if (str.at(0) == '+' || str.at(0) == '-') str.erase(0,1); if (str.empty() || !isdigit(str.at(0))) return false; else str.erase(0,1); while (!str.empty() && isdigit(str.at(0))) str.erase(0,1); if (str.empty()) return true; if (str.at(0) == '.') { str.erase(0,1); if (str.empty()) return true; if (!isdigit(str.at(0))) return false; else str.erase(0,1); while (!str.empty() && isdigit(str.at(0))) str.erase(0,1); } else { return false; } if (str.empty()) return true; else return false; } void parseChar(char x, string &rest) { removeWS(rest); if (!rest.empty() && rest.at(0) == x) { rest.erase(0,1); removeWS(rest); } else { if (rest.empty()) { cerr << "Reached end of string while looking for " << x << endl; exit(1); } else { cerr << "Ill-formed YY tok. Got \"" << rest << "\", looking for " << x << endl; exit(1); } } } string parseString(string &tok) { string val; tok.erase(0,1); while (!tok.empty()) { if (tok.length() >= 2 && tok.at(1) == '"') { if (tok.at(0) != '\\') { val += tok.at(0); tok.erase(0,1); break; } } val += tok.at(0); tok.erase(0,1); } if (!tok.empty() && tok.at(0) == '"') { tok.erase(0,1); } else { cerr << "Unterminated quoted string at: " << tok << endl; exit(1); } return val; } void removeWS(std::string &rest) { while (!rest.empty() && isspace(rest.at(0))) rest.erase(0,1); }