#ifdef HAVE_CONFIG_H #include #endif #include #include #include #include #include #include #include "unicode.h" #include "grammar.h" #include "profile.h" #include "derivReader.h" #include "derivReader.cpp" using namespace std; enum TagType {LETYPE, LETYPEMORPH}; TagType tagtype; void collectleaves(Grammar &g, tToken s, vector &ancestors, vector &leaves); void parse_options(int argc, char **argv, string *gfname, string *pname, TagType *tagtype, tTokenFormat *format, bool *numopt, int *lim, bool *goldonly, int *parsenum, int *itemno); bool is_infl(string rule); int main (int argc, char **argv) { // setting option variables string gfname, pname; tTokenFormat format; bool numopt, goldonly; int lim, parsenum, itemno; parse_options(argc, argv, &gfname, &pname, &tagtype, &format, &numopt, &lim, &goldonly, &parsenum, &itemno); // UTF-8 encoder initialize_encoding_converter("utf-8"); Grammar g(gfname); Profile p(pname); vector leaves; //used in collectleaves() DerivReader > reader(g, leaves, NULL, NULL, &collectleaves); pair,string> result = p.getResult(); tIid context = -1; // context is item id int event = -1; // event is parse id vector itokens; while (result.first.first >= 0) {//new item/context if (result.first.first != context) {//new item/context if (context != -1) { //finish last context }//finished last context } context = result.first.first; event = result.first.second; if (lim > 0 && p.getReadings(context) >= lim) { //skip items with result = p.getResult(); // greater than lim continue; // readings } if (itemno >= 0 && context != itemno) { //select single item result = p.getResult(); continue; } if (goldonly && (p.numGold(context) > 1 || !p.isGold(context, event))) { result = p.getResult(); continue; } if (parsenum >= 0 && event != parsenum) { result = p.getResult(); continue; } //do stuff with result leaves.clear(); cout << "ITEM: " << context << endl; reader.readDeriv(result.second); if (numopt) cout << context << " " << event << " "; for (int idx=0; idx < leaves.size(); ++idx) { //string tag("NULL\tNULL\tNULL"); string tag("NULL"); tag = leaves[idx].tag(); UnicodeString us = Conv->convert(leaves[idx].surface()); if (us.length() == 1 && u_ispunct(us.charAt(0))) { if ((u_charType(us.charAt(0)) == U_START_PUNCTUATION || us.compare(Conv->convert(string("“"))) == 0 || us.compare(Conv->convert(string("‘"))) == 0) ) { tag = "O:PUNCT"; } } //leaves[idx].print(format, pair(tag,1)); leaves[idx].print(format); if (idx != (leaves.size()-1)) cout << leaves[idx].separator(format); } cout << endl; result = p.getResult(); } //finish last context return 0; } void parse_options(int argc, char **argv, string *gfname, string *pname, TagType *tagtype, tTokenFormat *format, bool *numopt, int *lim, bool *goldonly, int *parsenum, int *itemno) { namespace po = boost::program_options; po::options_description visible("Options"); visible.add_options() ("help,h", "This usage information.") ("infl,i", "Tags include morphological inflection rules.") ("format", po::value()->default_value("TNT"), "token format:TNT,CANDC,CONLL,FSC (default: TNT)") ("num,n", "Output item and parse number.") ("goldonly,g", "Only extract tags from 'gold' trees") ("analysis,a", po::value(parsenum)->default_value(-1), "Select a specific analysis, default (-1): all.") ("single,s", po::value(itemno)->default_value(-1), "Select a specific item, default (-1): all.") ("limit,l", po::value(lim)->default_value(0), "Number of readings at which a context is ignored. " "Set to nbest to negate the effect of using a model during parsing.") ; po::options_description hidden("Hidden options"); hidden.add_options() ("grammar-file", po::value(gfname), "grammar .tdl file") ("profile", po::value(pname), "profile") ; po::options_description cmd_line ("Command line options"); cmd_line.add(visible).add(hidden); po::positional_options_description p; p.add("grammar-file",1).add("profile",1); po::variables_map vm; po::store(po::command_line_parser(argc, argv). options(cmd_line).positional(p).run(), vm); notify(vm); if (vm.count("help")) { cout << "Usage: " << argv[0] << " [options] " << "grammar-file profile" << endl; cout << visible << endl; exit(0); } if (!vm.count("grammar-file") || !vm.count("profile")) { cerr << "Insufficient arguments given." << endl; cerr << "Usage: " << argv[0] << " [options] " << "grammar-file profile" << endl; cerr << visible << endl; exit(1); } if (vm.count("infl")) *tagtype = LETYPEMORPH; else *tagtype = LETYPE; string f(vm["format"].as()); if (f.compare("FSC") == 0) *format = FSC; else if (f.compare("CANDC") == 0) *format = CANDC; else if (f.compare("CONLL") == 0) *format = CONLL; else if (f.compare("TNT") == 0) *format = TNT; else { cerr << "Warning: Invalid format " << f << " given. " << "Setting format to TNT" << endl; *format = TNT; } if (vm.count("num")) *numopt = true; else *numopt = false; if (vm.count("goldonly")) *goldonly = true; else *goldonly = false; } bool is_infl(string rule) { if ( (rule.size() > 4 && rule.compare(rule.size()-4,4,"odlr") == 0) || (rule.size() > 3 && (rule.compare(rule.size()-3,3,"olr") == 0 || rule.compare(rule.size()-3,3,"ilr") == 0))) return true; if (rule.size() > 9 && rule.compare(rule.size()-9,9,"infl-rule") == 0) return true; return false; } void collectleaves(Grammar &g, tToken s, vector &ancestors, vector &leaves) { leaves.push_back(s); string tag(ancestors.back().surface()); if (tagtype == LETYPEMORPH) { for (int x=ancestors.size()-2; x >= 0 && g.is_lexrule(ancestors[x].surface()); x--) { //&& is_infl(ancestors[x].surface()); x--) { string infl = ancestors[x].surface(); if (infl.compare(infl.length()-3, 3,"plr") != 0 && (infl.length() < 5 || (infl.length() >=5 && infl.compare(0,5,"punct") != 0))) { tag+=":"; tag+=infl; } } } leaves.back().tag(tag); }