#ifdef HAVE_CONFIG_H #include #endif #include #include #include #include #include #include #include "unicode.h" #include "grammar.h" #include "profile.h" #include "derivReader.h" #include "derivReader.cpp" using namespace std; enum TagType {LETYPE, LETYPEMORPH}; TagType tagtype; void removeWS(string &rest); void collectleaves(Grammar &g, tToken s, vector &ancestors, vector &leaves); void parse_options(int argc, char **argv, string *gfname, string *pname, TagType *tagtype, tTokenFormat *format, string *mapfile, bool *numopt, int *lim, bool *goldonly, int *parsenum); bool is_infl(string rule); void read_mapfile(string mapfile, map *ltmap); string map_tag(string tag, map *ltmap); void alignbyid(vector &itok, vector &rtok, vector *tmap); void parseChar(char x, string &rest); void parseDummyField(string &rest); int parseNum(string &rest); vector readYY(string item); tToken readYYtok(string &rest); string parseString(string &tok); int main (int argc, char **argv) { // setting option variables string gfname, pname, mapfile; tTokenFormat format; bool numopt, goldonly; int lim, parsenum; parse_options(argc, argv, &gfname, &pname, &tagtype, &format, &mapfile, &numopt, &lim, &goldonly, &parsenum); // UTF-8 encoder initialize_encoding_converter("utf-8"); Grammar g(gfname); Profile p(pname); vector leaves; //used in collectleaves() DerivReader > reader(g, leaves, NULL, NULL, &collectleaves); map glemap; read_mapfile(mapfile, &glemap); pair,string> result = p.getResult(); tIid context = -1; // context is item id int event = -1; // event is parse id vector itokens; while (result.first.first >= 0) {//new item/context if (result.first.first != context) {//new item/context if (context != -1) { //finish last context }//finished last context itokens = readYY(p.getInput(result.first.first)); } context = result.first.first; event = result.first.second; if (lim > 0 && p.getReadings(context) >= lim) { //skip items with result = p.getResult(); // greater than lim continue; // readings } if (goldonly && (p.numGold(context) > 1 || !p.isGold(context, event))) { result = p.getResult(); continue; } if (parsenum >= 0 && event != parsenum) { result = p.getResult(); continue; } //do stuff with result leaves.clear(); reader.readDeriv(result.second); vectortokmap(itokens.size()); alignbyid(itokens, leaves, &tokmap); if (numopt) cout << context << " " << event << itokens[0].separator(format); if (format == FSC) { stringstream ss; ss << context; string contextstring = ss.str(); string itemstring = p.getItem(context); print_header(format, contextstring, itemstring, itokens.size()); } for (int idx=0; idx < itokens.size(); ++idx) { //string tag("NULL\tNULL\tNULL"); string tag("NULL"); if (tokmap[idx] != -1) { tag = leaves[tokmap[idx]].tag(); UnicodeString us = Conv->convert(itokens[idx].surface()); if (us.length() == 1 && u_ispunct(us.charAt(0))) { if ((u_charType(us.charAt(0)) == U_START_PUNCTUATION || us.compare(Conv->convert(string("“"))) == 0 || us.compare(Conv->convert(string("‘"))) == 0) && idx+1 < itokens.size() && tokmap[idx] == tokmap[idx+1]) { tag = "O:PUNCT"; } else { if (idx > 0 && tokmap[idx-1] != -1 && leaves[tokmap[idx-1]].tag().compare(tag) == 0) tag = "O:PUNCT"; } } if (us.length() == 2 && ( us.compare(Conv->convert(string("¦i"))) == 0 || us.compare(Conv->convert(string("i¦"))) == 0)) { tag = "O:PUNCT"; } tag = map_tag(tag, &glemap); UnicodeString lasttok; if (idx > 0) lasttok = Conv->convert(itokens[idx-1].surface()); if (format == CONLL && tag.compare("O:PUNCT") != 0) { if (idx > 0 && tokmap[idx] == tokmap[idx-1] && (lasttok.length() > 1 || !u_ispunct(lasttok.charAt(0))) ) //multi-token le tag = string("I:") + tag; else tag = string("B:") + tag; } } itokens[idx].print(format, pair(tag,1)); if (idx != (itokens.size()-1)) cout << itokens[idx].separator(format); } print_footer(format); result = p.getResult(); } //finish last context return 0; } void parse_options(int argc, char **argv, string *gfname, string *pname, TagType *tagtype, tTokenFormat *format, string *mapfile, bool *numopt, int *lim, bool *goldonly, int *parsenum) { namespace po = boost::program_options; po::options_description visible("Options"); visible.add_options() ("help,h", "This usage information.") ("infl,i", "Tags include morphological inflection rules.") ("format", po::value()->default_value("TNT"), "token format:TNT,CANDC,CONLL,FSC (default: TNT)") ("num,n", "Output item and parse number.") ("goldonly,g", "Only extract tags from 'gold' trees") ("analysis,a", po::value(parsenum)->default_value(-1), "Select a specific analysis, default (-1): all.") ("map,m", po::value(mapfile), "File containing mappings between lexical types (for gles).") ("limit,l", po::value(lim)->default_value(0), "Number of readings at which a context is ignored. " "Set to nbest to negate the effect of using a model during parsing.") ; po::options_description hidden("Hidden options"); hidden.add_options() ("grammar-file", po::value(gfname), "grammar .tdl file") ("profile", po::value(pname), "profile") ; po::options_description cmd_line ("Command line options"); cmd_line.add(visible).add(hidden); po::positional_options_description p; p.add("grammar-file",1).add("profile",1); po::variables_map vm; po::store(po::command_line_parser(argc, argv). options(cmd_line).positional(p).run(), vm); notify(vm); if (vm.count("help")) { cout << "Usage: " << argv[0] << " [options] " << "grammar-file profile" << endl; cout << visible << endl; exit(0); } if (!vm.count("grammar-file") || !vm.count("profile")) { cerr << "Insufficient arguments given." << endl; cerr << "Usage: " << argv[0] << " [options] " << "grammar-file profile" << endl; cerr << visible << endl; exit(1); } if (vm.count("infl")) *tagtype = LETYPEMORPH; else *tagtype = LETYPE; string f(vm["format"].as()); if (f.compare("FSC") == 0) *format = FSC; else if (f.compare("CANDC") == 0) *format = CANDC; else if (f.compare("CONLL") == 0) *format = CONLL; else if (f.compare("TNT") == 0) *format = TNT; else { cerr << "Warning: Invalid format " << f << " given. " << "Setting format to TNT" << endl; *format = TNT; } if (vm.count("num")) *numopt = true; else *numopt = false; if (vm.count("goldonly")) *goldonly = true; else *goldonly = false; } vector readYY(string item) { string rest = item; vector toks; while (!rest.empty()) { tToken newtok = readYYtok(rest); toks.push_back(newtok); } return toks; } tToken readYYtok(string &rest) { removeWS(rest); if (rest.at(0) != '(') { cerr << "not a YY token: " << rest << endl; exit(1); } rest.erase(0,1); int id = parseNum(rest); //id parseChar(',', rest); int start = parseNum(rest); //start parseChar(',', rest); int end = parseNum(rest); //end parseChar(',', rest); int sstart = -1; int send = -1; if (rest.at(0) == '<') { //link rest.erase(0,1); sstart = parseNum(rest); parseChar(':', rest); send = parseNum(rest); parseChar('>', rest); parseChar(',', rest); } parseDummyField(rest); //path parseChar(',', rest); string surface = parseString(rest); //form removeWS(rest); tToken newtok(surface); if (sstart != -1 && send != -1) { newtok.start(sstart); newtok.end(send); } else { newtok.start(start); newtok.end(end); } newtok.id(id); if (rest.at(0) == '"') { parseDummyField(rest); //surface } parseChar(',', rest); parseDummyField(rest); //ipos parseChar(',', rest); parseDummyField(rest); //lrule if (rest.at(0) == ',') { //pos tags parseChar(',', rest); while (!rest.empty() && rest.at(0) != ')') { string prob; string tag = parseString(rest); removeWS(rest); while (!rest.empty() && !isspace(rest.at(0)) && rest.at(0) != ')') { prob += rest.at(0); rest.erase(0,1); } newtok.tag(tag); newtok.prob(prob); } } parseChar(')', rest); removeWS(rest); return newtok; } int parseNum(string &rest) { int i; string numstring; removeWS(rest); if (rest.at(0) == '-') { numstring += rest.at(0); rest.erase(0,1); } while (!rest.empty() && isdigit(rest.at(0))) { numstring += rest.at(0); rest.erase(0,1); } istringstream numstream(numstring); numstream >> i; removeWS(rest); return i; } void parseDummyField(string &rest) { while (!rest.empty() && rest.at(0) != ',' && rest.at(0) != ')') rest.erase(0,1); } bool is_infl(string rule) { if ( (rule.size() > 4 && rule.compare(rule.size()-4,4,"odlr") == 0) || (rule.size() > 3 && (rule.compare(rule.size()-3,3,"olr") == 0 || rule.compare(rule.size()-3,3,"ilr") == 0))) return true; if (rule.size() > 9 && rule.compare(rule.size()-9,9,"infl-rule") == 0) return true; return false; } void collectleaves(Grammar &g, tToken s, vector &ancestors, vector &leaves) { leaves.push_back(s); string tag(ancestors.back().surface()); if (tagtype == LETYPEMORPH) { for (int x=ancestors.size()-2; x >= 0 && g.is_lexrule(ancestors[x].surface()); x--) { string infl = ancestors[x].surface(); if (infl.compare(infl.length()-3, 3,"plr") != 0 && (infl.length() < 5 || (infl.length() >=5 && infl.compare(0,5,"punct") != 0))) { tag+=":"; tag+=infl; } } } leaves.back().tag(tag); } string map_tag(string tag, map *ltmap) { string letype, newletype; unsigned int colon = tag.find(':'); if (colon == string::npos) letype = tag; else letype = tag.substr(0,colon); if (ltmap->count(letype) == 1) { newletype = (*ltmap)[letype]; tag.replace(0,letype.length(), newletype); } return tag; } void alignbyid(vector &itok, vector &rtok, vector *tmap) { for (int i = 0; i < itok.size(); ++i) { for (int j = 0; j < rtok.size(); ++j) { vector tids = rtok[j].tids(); for (vector::iterator it = tids.begin(); it != tids.end(); ++it){ if (itok[i].id() == *it) (*tmap)[i] = j; } } } } void parseChar(char x, string &rest) { removeWS(rest); if (!rest.empty() && rest.at(0) == x) { rest.erase(0,1); removeWS(rest); } else { if (rest.empty()) { cerr << "Reached end of string while looking for " << x << endl; exit(1); } else { cerr << "Ill-formed YY tok. Got \"" << rest << "\", looking for " << x << endl; exit(1); } } } string parseString(string &tok) { string val; tok.erase(0,1); while (!tok.empty()) { if (tok.length() >= 2 && tok.at(1) == '"') { if (tok.at(0) != '\\') { val += tok.at(0); tok.erase(0,1); break; } } val += tok.at(0); tok.erase(0,1); } if (!tok.empty() && tok.at(0) == '"') { tok.erase(0,1); } else { cerr << "Unterminated quoted string at: " << tok << endl; exit(1); } return val; } void removeWS(std::string &rest) { while (!rest.empty() && isspace(rest.at(0))) rest.erase(0,1); } void read_mapfile(string mapfile, map *ltmap) { if (!mapfile.empty()) { ifstream mf(mapfile.c_str()); if (mf.is_open()) { string mline; getline(mf, mline); while (!mf.eof()) { string oldle, newle, lrule; istringstream linestream(mline); linestream >> oldle >> newle >> lrule; if (!oldle.empty() && !newle.empty()) { if (tagtype == LETYPEMORPH && !lrule.empty()) newle = newle + ":" + lrule; ltmap->insert(pair(oldle, newle)); } else cerr << "Warning: ill-formed line in map file: " << mline << ", skipping." << endl; getline(mf, mline); } mf.close(); } else { cerr << "Warning: couldn't open map file " << mapfile << ". No mapping will occur." << endl; } } }