// NOTE(review): in this copy of the file the targets of four #include
// directives and every template argument list (map, set, vector, pair,
// eTuple) are missing. They are reproduced exactly as found, because the real
// types cannot be recovered from what is visible here -- restore them from
// process.h / version control before building.
#include "process.h"
#include
#include
#include
#include
#include "align.h"

using namespace std;

// Reads the raw text file `rfn` and appends its contents to `rmap`.
// Every line is converted with Conv->convert() and appended one code unit at
// a time, followed by '\n'; a newline is therefore appended after the last
// line even when the file does not end with one.
// Prints an error and exits with status 1 if the file cannot be opened.
void readRaw(string &rfn, UnicodeString &rmap)
{
  ifstream rf(rfn.c_str());
  if (rf.is_open()) {
    string rfline;
    while (getline(rf, rfline)) {
      UnicodeString uline = Conv->convert(rfline);
      for (int x = 0; x < uline.length(); ++x) {
        rmap += uline.charAt(x);
      }
      //if (!rf.eof()) rmap += '\n';
      rmap += '\n';
    }
    rf.close();
  } else {
    cerr << "Couldn't open raw file " << rfn << endl;
    exit(1);
  }
}

// reads evalb-style prm files, only using the DELETE_LABEL, EQ_LABEL and
// LABEL_DELIM options
//
//   DELETE_LABEL x   -> x is inserted into `deletelabels` (mapped to 1)
//   EQ_LABEL x y     -> y is added to equivlabels[x] and x to equivlabels[y]
//   LABEL_DELIM s    -> `delimchars` is set to s
//
// Empty lines, lines starting with '#', and lines starting with whitespace
// are skipped; any other option is ignored. If the file cannot be opened a
// message is printed and the outputs are left untouched (no exit).
void readParam(string paramfn, map &deletelabels, map > &equivlabels, string &delimchars)
{
  ifstream pf(paramfn.c_str());
  if (pf.is_open()) {
    string pfline;
    // Loop on getline() itself rather than on eof(): an eof()-controlled loop
    // silently drops a final line that has no trailing newline.
    while (getline(pf, pfline)) {
      // isspace() requires a value representable as unsigned char.
      if (pfline.empty() || pfline.compare(0,1,"#") == 0
          || isspace(static_cast<unsigned char>(pfline.at(0)))) {
        continue;
      }
      string option, value1, value2;
      istringstream optstr(pfline);
      optstr >> option >> value1;
      if (option == "DELETE_LABEL") {
        deletelabels.insert(pair(value1, 1));
      } else if (option == "EQ_LABEL") {
        optstr >> value2;
        // equivalence is recorded in both directions
        if (equivlabels.count(value1) == 0)
          equivlabels.insert( pair >(value1, set()));
        equivlabels[value1].insert(value2);
        if (equivlabels.count(value2) == 0)
          equivlabels.insert( pair >(value2, set()));
        equivlabels[value2].insert(value1);
      } else if (option == "LABEL_DELIM") {
        delimchars = value1;
      }
    }
    pf.close();
  } else {
    cerr << "ERROR: Couldn't open parameter file " << paramfn << ". Skipping." << endl;
  }
}

// Reads an already characterised (CHAR) file from `charf` into `elts`, keyed
// by the start value of each tuple.
//
// Each non-empty line is tab separated:
//   col 1: start, optionally followed by ",f1,f2,..." fuzzy start values
//   col 2: end, optionally followed by ",f1,f2,..." fuzzy end values
//   col 3: label
//   col 4: optional; either "#comment" or a numeric fourth value (with
//          optional fuzzy values), in which case
//   col 5: a numeric fifth value (with optional fuzzy values) is required.
// An empty line increments the sentence counter stored on later tuples.
// When `boundary` is set, tuples labelled "SENT" get their first value set to
// their second value. Malformed lines print an error and exit with status 1.
void processCHAR(istream &charf, map > > &elts, bool boundary)
{
  int sentcnt = 1;
  int lineno = 0;  // was uninitialised, so reported line numbers were garbage
  string charfline;
  // getline()-controlled loop: also handles a last line without a newline,
  // and keeps lineno in step with every line read.
  while (getline(charf, charfline)) {
    lineno++;
    if (charfline.empty()) {
      sentcnt++;
      continue;
    }
    string first, second;
    vector ffs;
    vector fss;
    IndexType start, end;
    string label, comment;
    istringstream cline(charfline);

    // column 1: start[,fuzzy...]
    if (getline(cline, first, '\t')) {
      istringstream startstr(first);
      startstr >> start;
      if (startstr.get() == ',') {
        string ffirst;
        while (getline(startstr, ffirst, ',')) {
          istringstream ffstr(ffirst);
          IndexType ff;
          ffstr >> ff;
          ffs.push_back(ff);
        }
      }
    } else {
      cerr << "Error parsing CHAR file at line " << lineno << ", no column 1: " << charfline << endl;
      cerr << " Not a CHAR file?" << endl;
      exit(1);
    }

    // column 2: end[,fuzzy...]
    if (getline(cline, second, '\t')) {
      istringstream endstr(second);
      endstr >> end;
      if (endstr.get() == ',') {
        string fsecond;
        while (getline(endstr, fsecond, ',')) {
          istringstream fsstr(fsecond);
          IndexType fs;
          fsstr >> fs;
          fss.push_back(fs);
        }
      }
    } else {
      cerr << "Error parsing CHAR file at line " << lineno << ", no column 2: " << charfline << endl;
      cerr << " Not a CHAR file?" << endl;
      exit(1);
    }

    // column 3: label
    if (getline(cline, label, '\t')) {
      if (elts.count(start) == 0)
        elts.insert(pair > >(start, vector >()));
      elts[start].push_back(eTuple(start, end, label, -1, -1));
      elts[start].back().setSent(sentcnt);
      if (boundary && label == "SENT")
        elts[start].back().setFirst(elts[start].back().getSecond());
      if (!ffs.empty()) {
        for (vector::iterator fit = ffs.begin(); fit != ffs.end(); ++fit) {
          elts[start].back().addFuzzyFirst(*fit);
        }
      }
      if (!fss.empty()) {
        for (vector::iterator fit = fss.begin(); fit != fss.end(); ++fit) {
          elts[start].back().addFuzzySecond(*fit);
        }
      }

      // column 4 (optional): "#comment" or numeric fourth value
      if (getline(cline, comment, '\t')) {
        if (comment.size() > 1 && comment.at(0) == '#') {
          comment.erase(0,1);
          elts[start].back().setComment(comment);
        } else {
          IndexType hstart, hend;
          vector hffs;
          vector hfss;
          istringstream hstartstr(comment);
          hstartstr >> hstart;
          if (hstartstr.fail()) {
            cerr << "Error parsing CHAR file at line " << lineno << ", column 4 not a comment or number: " << charfline << endl;
            cerr << " Not a CHAR file?" << endl;
            exit(1);
          }
          elts[start].back().setFourth(hstart);
          if (hstartstr.get() == ',') {
            string hffirst;
            while (getline(hstartstr, hffirst, ',')) {
              istringstream hffstr(hffirst);
              IndexType hff;
              hffstr >> hff;
              hffs.push_back(hff);
            }
          }
          if (!hffs.empty()) {
            for (vector::iterator fit = hffs.begin(); fit != hffs.end(); ++fit) {
              elts[start].back().addFuzzyFourth(*fit);
            }
          }

          // column 5: mandatory once column 4 is numeric
          string hsecond;
          if (getline(cline, hsecond, '\t')) {
            istringstream hendstr(hsecond);
            hendstr >> hend;
            if (hendstr.fail()) {
              cerr << "Error parsing CHAR file at line " << lineno << ", column 5 present but not a number: " << charfline << endl;
              cerr << " Not a CHAR file?" << endl;
              exit(1);
            }
            elts[start].back().setFifth(hend);
            if (hendstr.get() == ',') {
              string hfsecond;
              while (getline(hendstr, hfsecond, ',')) {
                istringstream hfsstr(hfsecond);
                IndexType hfs;
                hfsstr >> hfs;
                hfss.push_back(hfs);
              }
            }
            if (!hfss.empty()) {
              for (vector::iterator fit = hfss.begin(); fit != hfss.end(); ++fit) {
                elts[start].back().addFuzzyFifth(*fit);
              }
            }
          } else {
            cerr << "Error parsing CHAR file at line " << lineno << ", no column 5, but numeric column 4: " << charfline << endl;
            cerr << " Not a CHAR file?" << endl;
            exit(1);
          }
        }
      }
    } else {
      cerr << "Error parsing CHAR file at line " << lineno << ", column 3: " << charfline << endl;
      cerr << " Not a CHAR file?" << endl;
      exit(1);
    }
  }
}

// Reads a LINE file (one sentence per non-empty line, whitespace separated
// tokens) from `linef`, aligns the tokens against the raw text `rmap`, and
// adds one "TOK" tuple per token and one "SENT" tuple per line to `elts`.
//
//   fuzzy       - punctuation (u_ispunct) as well as whitespace (u_isspace)
//                 next to a boundary yields fuzzy alternatives
//   boundary    - "SENT" tuples become zero-length points at the sentence end
//   rawFromGold - rebuild `rmap` from the tokens (createRawFromGold) instead
//                 of aligning against the supplied raw text (lcsAlign)
// Exits with status 1 if no sentence was read.
void processLINE(UnicodeString &rmap, istream &linef, map > > &elts, bool fuzzy, bool boundary, bool rawFromGold)
{
  int sentcnt = 1;
  int sentstart = -1; //token the last sentence started with
  vector toks; //tokens for alignment
  vector > brackets; //temporary store, pre-alignment
  int lineno = 0;  // was incremented while uninitialised
  string linefline;
  while (getline(linef, linefline)) {
    lineno++;
    if (linefline.empty())
      continue;
    sentstart = toks.size();
    string tok;
    istringstream sentline(linefline);
    while (sentline >> tok) {
      UnicodeString utok = Conv->convert(tok);
      toks.push_back(utok);
      brackets.push_back(eTuple(toks.size()-1, toks.size()-1, string("TOK"), -1, -1));
      brackets.back().setSent(sentcnt);
      //record tok string as comment for debugging
      brackets.back().setComment(tok);
    }
    brackets.push_back(eTuple(sentstart, toks.size()-1, "SENT", -1, -1));
    brackets.back().setSent(sentcnt);
    sentcnt++;
  }
  if (brackets.empty()) {
    cerr << "No sentences found in LINE file." << " Not an LINE file?" << endl;
    exit(1);
  }

  //align tokens to raw character positions
  vector tokstart(toks.size(), -1);
  vector tokend(toks.size(), -1);
  // alignRaw (rmap, toks, tokstart, tokend);
  if (rawFromGold) {
    createRawFromGold(rmap, toks, tokstart, tokend);
  } else {
    lcsAlign (rmap, toks, tokstart, tokend);
    //positions, not chars
    for (int x = 0; x < toks.size(); ++x) {
      if (!toks[x].isEmpty() && tokend[x] != -1) {
        tokend[x]++;
      }
    }
  }

  // turn token-index brackets into character-position tuples
  for (vector >::iterator bitr = brackets.begin(); bitr != brackets.end(); ++bitr) {
    int start = -1;
    int end = -1;
    if (bitr->getFirst() < tokstart.size())
      start = tokstart[bitr->getFirst()];
    if (bitr->getSecond() < tokend.size())
      end = tokend[bitr->getSecond()];
    //give unaligned tuples zero-length span, at end of last aligned token
    if (start == -1) {
      if (end != -1) {
        start = end;
      } else {
        for (int x = bitr->getFirst(); x >= 0; --x) {
          if (tokend[x] != -1) {
            start = tokend[x];
            end = tokend[x];
            break;
          }
        }
        if (start == -1) {
          start = 0;
          end = 0;
        }
      }
    }
    if (end == -1) {
      end = start;
    }
    if (boundary && bitr->getThird() == "SENT") //boundary point, not span
      start = end;
    if (elts.count(start) == 0)
      elts.insert(pair > >(start, vector >()));
    elts[start].push_back(eTuple(start, end, bitr->getThird(), -1, -1));
    elts[start].back().setSent(bitr->getSent());
    elts[start].back().setComment(bitr->getComment());
  }

  // add fuzzy start/end alternatives over adjacent whitespace (and
  // punctuation when `fuzzy`); this runs over every tuple in elts
  for (map > >::iterator litr = elts.begin(); litr != elts.end(); ++litr) {
    for (vector >::iterator titr = litr->second.begin(); titr != litr->second.end(); ++titr) {
      IndexType x = titr->getSecond() - 2; //char not pos
      while (x >=0 && (u_isspace(rmap.charAt(x)) || (fuzzy && u_ispunct(rmap.charAt(x))))) {
        titr->addFuzzySecond(x+1); //pos not char
        x--;
      }
      x = titr->getSecond(); //char not pos
      while (x < rmap.length() && (u_isspace(rmap.charAt(x)) || (fuzzy && u_ispunct(rmap.charAt(x))))) {
        titr->addFuzzySecond(x+1); //pos not char
        x++;
      }
      if (titr->getFirst() < titr->getSecond()) {//non-zero span
        x = titr->getFirst()-1;
        while (x >=0 && (u_isspace(rmap.charAt(x)) || (fuzzy && u_ispunct(rmap.charAt(x))))) {
          titr->addFuzzyFirst(x);
          x--;
        }
        x = titr->getFirst() + 1;
        while (x < rmap.length() && (u_isspace(rmap.charAt(x)) || (fuzzy && u_ispunct(rmap.charAt(x))))) {
          titr->addFuzzyFirst(x);
          x++;
        }
      } else { //zero span
        for (set::iterator sit = titr->getFuzzySecond().begin(); sit != titr->getFuzzySecond().end(); ++sit) {
          titr->addFuzzyFirst(*sit);
        }
      }
    }
  }
}

// Reads a TAB file (one token per line, tab separated: token, then an
// optional POS tag, then optionally further tags; an empty line ends a
// sentence) from `linef`, aligns the tokens against `rmap`, and adds "TOK",
// "POS:<tag>" and "SENT" tuples to `elts`.
//
//   fuzzy       - punctuation as well as whitespace yields fuzzy boundaries
//   boundary    - "SENT" tuples become zero-length points at the sentence end
//   multi       - every tag on the line (including the first) is added to the
//                 POS tuple's fuzzy third set
//   rawFromGold - rebuild `rmap` from the tokens instead of aligning
// Exits with status 1 if no token was read.
void processTAB(UnicodeString &rmap, istream &linef, map > > &elts, bool fuzzy, bool boundary, bool multi, bool rawFromGold)
{
  int sentcnt = 1;
  int sentstart = -1; //token the last sentence started with
  vector toks; //tokens for alignment
  vector > brackets; //temporary store, pre-alignment
  int lineno = 0;  // was incremented while uninitialised
  string linefline;
  while (getline(linef, linefline)) {
    lineno++;
    //if (linefline.empty() || linefline == "---") {//C&J POS sent marker
    if (linefline.empty()) {//C&J POS sent marker
      if (toks.size() > 0 && sentstart > -1) {
        brackets.push_back(eTuple(sentstart, toks.size()-1, "SENT", -1, -1));
        brackets.back().setSent(sentcnt);
        sentcnt++;
        sentstart = -1;
      }
      continue;
    }
    string tok, pos;
    istringstream sentline(linefline);
    if (getline(sentline, tok, '\t')) {
      UnicodeString utok = Conv->convert(tok);
      toks.push_back(utok);
      brackets.push_back(eTuple(toks.size()-1, toks.size()-1, string("TOK"), -1, -1));
      brackets.back().setSent(sentcnt);
      //record tok string as comment for debugging
      brackets.back().setComment(tok);
      if (sentstart == -1)
        sentstart = toks.size()-1;
    } else {
      cerr << "Not empty, but we can't get something from it?" << endl;
      exit(1);
    }
    if (getline(sentline, pos, '\t')) {
      brackets.push_back(eTuple(toks.size()-1, toks.size()-1, string("POS:"+pos), -1, -1));
      brackets.back().setSent(sentcnt);
      if (multi) { //add all given POS tags to fuzzy set
        brackets.back().addFuzzyThird(pos);
        while (getline(sentline, pos, '\t')) {
          brackets.back().addFuzzyThird(pos);
        }
      }
    }
  }
  // Close a final sentence that was not followed by an empty line. The test
  // must be "> -1" (as inside the loop): with "> 0" a sentence starting at
  // token 0 -- e.g. a single-sentence file -- lost its SENT tuple.
  if (toks.size() > 0 && sentstart > -1) {
    brackets.push_back(eTuple(sentstart, toks.size()-1, "SENT", -1, -1));
    brackets.back().setSent(sentcnt);
  }
  //finished reading TAB file
  if (brackets.empty()) {
    cerr << "No tokens found in TAB file." << " Not an TAB file?" << endl;
    exit(1);
  }

  //align tokens to raw character positions
  vector tokstart(toks.size(), -1);
  vector tokend(toks.size(), -1);
  // alignRaw (rmap, toks, tokstart, tokend);
  if (rawFromGold) {
    createRawFromGold(rmap, toks, tokstart, tokend);
  } else {
    lcsAlign (rmap, toks, tokstart, tokend);
    //positions, not chars
    for (int x = 0; x < toks.size(); ++x) {
      if (!toks[x].isEmpty() && tokend[x] != -1) {
        tokend[x]++;
      }
    }
  }

  // turn token-index brackets into character-position tuples
  for (vector >::iterator bitr = brackets.begin(); bitr != brackets.end(); ++bitr) {
    int start = -1;
    int end = -1;
    if (bitr->getFirst() < tokstart.size())
      start = tokstart[bitr->getFirst()];
    if (bitr->getSecond() < tokend.size())
      end = tokend[bitr->getSecond()];
    //check token span, to force aligned start and end, if possible
    for (int check = bitr->getFirst(); check <= bitr->getSecond(); ++check) {
      if (start == -1 && tokstart[check] > -1) {
        start = tokstart[check];
      }
      if (tokend[check] > -1 && end < tokend[check]) {
        end = tokend[check];
      }
    }
    //give unaligned tuples zero-length span, at end of last aligned token
    if (start == -1) {
      if (end != -1) {
        start = end;
      } else {
        for (int x = bitr->getFirst(); x >= 0; --x) {
          if (tokend[x] != -1) {
            start = tokend[x];
            end = tokend[x];
            break;
          }
        }
        if (start == -1) {
          start = 0;
          end = 0;
        }
      }
    }
    if (end == -1) {
      end = start;
    }
    if (boundary && bitr->getThird() == "SENT")
      start = end;
    if (elts.count(start) == 0)
      elts.insert(pair > >(start, vector >()));
    elts[start].push_back(eTuple(start, end, bitr->getThird(), -1, -1));
    elts[start].back().setSent(bitr->getSent());
    elts[start].back().setComment(bitr->getComment());
  }

  // add fuzzy start/end alternatives over adjacent whitespace (and
  // punctuation when `fuzzy`); this runs over every tuple in elts
  for (map > >::iterator litr = elts.begin(); litr != elts.end(); ++litr) {
    for (vector >::iterator titr = litr->second.begin(); titr != litr->second.end(); ++titr) {
      IndexType x = titr->getSecond() - 2; //char not pos
      while (x >=0 && (u_isspace(rmap.charAt(x)) || (fuzzy && u_ispunct(rmap.charAt(x))))) {
        titr->addFuzzySecond(x+1); //pos not char
        x--;
      }
      x = titr->getSecond(); //char not pos
      while (x < rmap.length() && (u_isspace(rmap.charAt(x)) || (fuzzy && u_ispunct(rmap.charAt(x))))) {
        titr->addFuzzySecond(x+1); //pos not char
        x++;
      }
      if (titr->getFirst() < titr->getSecond()) {//non-zero span
        x = titr->getFirst()-1;
        while (x >=0 && (u_isspace(rmap.charAt(x)) || (fuzzy && u_ispunct(rmap.charAt(x))))) {
          titr->addFuzzyFirst(x);
          x--;
        }
        x = titr->getFirst() + 1;
        while (x < rmap.length() && (u_isspace(rmap.charAt(x)) || (fuzzy && u_ispunct(rmap.charAt(x))))) {
          titr->addFuzzyFirst(x);
          x++;
        }
      } else { //zero span
        for (set::iterator sit = titr->getFuzzySecond().begin(); sit != titr->getFuzzySecond().end(); ++sit) {
          titr->addFuzzyFirst(*sit);
        }
      }
    }
  }
}

// Reads a CoNLL-X style file (tab separated, one token per line) from
// `linef`, aligns the tokens against `rmap`, and adds "TOK", "POS:<tag>",
// "DEP:<rel>" and "SENT" tuples to `elts`.
//
// Columns used: 1 = numeric token id (validated only), 2 = token,
// 5 = POS tag ("_" means none), 7 = head id ("_" skips the rest of the line),
// 8 = dependency label. Columns 3, 4 and 6 must exist but are ignored.
// An empty line or "---" ends a sentence; lines starting with '#' are
// skipped. DEP tuples carry the head's character span as fourth/fifth value;
// a head id of 0 (root) yields the dependent's own span.
//
//   fuzzy       - punctuation as well as whitespace yields fuzzy boundaries
//   boundary    - "SENT" tuples become zero-length points at the sentence end
//   rawFromGold - rebuild `rmap` from the tokens instead of aligning
// Malformed input prints an error and exits with status 1.
void processCONLLX(UnicodeString &rmap, istream &linef, map > > &elts, bool fuzzy, bool boundary, bool rawFromGold)
{
  int sentcnt = 1;
  int sentstart = -1; //token the last sentence started with
  vector toks; //tokens for alignment
  vector > brackets; //temporary store, pre-alignment
  int lineno = 0;  // was uninitialised, so reported line numbers were garbage
  string linefline;
  // getline()-controlled loop: an eof()-controlled loop dropped a final line
  // that had no trailing newline.
  while (getline(linef, linefline)) {
    lineno++;
    if (linefline.empty() || linefline == "---") {//C&J POS sent marker
      if (toks.size() > 0 && sentstart > -1) {
        brackets.push_back(eTuple(sentstart, toks.size()-1, "SENT", -1, -1));
        brackets.back().setSent(sentcnt);
        sentcnt++;
        sentstart = -1;
      }
      continue;
    }
    if (linefline.at(0) == '#') {//comment
      continue;
    }

    IndexType tokid;
    string tok, pos;
    istringstream sentline(linefline);
    if (getline(sentline, tok, '\t')) {
      istringstream tokidstr(tok);
      tokidstr >> tokid;
      if (tokidstr.fail()) {
        cerr << "Non-numeric token id in line " << lineno << ": " << linefline << endl;
        exit(1);
      }
    } else {
      cerr << "Not empty, but we can't get something from it?" << endl;
      exit(1);
    }
    if (getline(sentline, tok, '\t')) {
      UnicodeString utok = Conv->convert(tok);
      toks.push_back(utok);
      brackets.push_back(eTuple(toks.size()-1, toks.size()-1, string("TOK"), -1, -1));
      brackets.back().setSent(sentcnt);
      //record tok string as comment for debugging
      brackets.back().setComment(tok);
      if (sentstart == -1)
        sentstart = toks.size()-1;
    } else {
      cerr << "No column 2 in line " << lineno << ": " << linefline << endl;
      exit(1);
    }
    if (! getline(sentline, pos, '\t')) {
      cerr << "No column 3 in line " << lineno << ": " << linefline << endl;
      exit(1);
    }
    if (! getline(sentline, pos, '\t')) {
      cerr << "No column 4 in line " << lineno << ": " << linefline << endl;
      exit(1);
    }
    if (getline(sentline, pos, '\t')) {
      if (pos.compare("_") != 0) {
        brackets.push_back(eTuple(toks.size()-1, toks.size()-1, string("POS:"+pos), -1, -1));
        brackets.back().setSent(sentcnt);
      }
    } else {
      cerr << "No column 5 in line " << lineno << ": " << linefline << endl;
      exit(1);
    }
    if (! getline(sentline, pos, '\t')) {
      cerr << "No column 6 in line " << lineno << ": " << linefline << endl;
      exit(1);
    }
    IndexType head;
    string dep;
    if (getline(sentline, dep, '\t')) {
      if (dep.compare("_") != 0) {
        istringstream headstr(dep);
        headstr >> head;
        if (headstr.fail()) {
          cerr << "Non-numeric head in line " << lineno << ": " << linefline << endl;
          exit(1);
        }
      } else { //dummy placeholder
        continue;
      }
    } else {
      cerr << "No column 7 in line " << lineno << ": " << linefline << endl;
      exit(1);
    }
    if (getline(sentline, dep, '\t')) {
      // Heads are stored as 1-based indices into toks so that 0 can only mean
      // "root". The previous 0-based encoding (head + sentstart - 1) collided
      // with the root marker whenever the head was the very first token of
      // the file, and such dependents were then treated as roots.
      int headid = head;
      if (headid != 0)
        headid = head + sentstart;
      brackets.push_back(eTuple(toks.size()-1, toks.size()-1, string("DEP:"+dep), headid, -1));
      brackets.back().setSent(sentcnt);
    } else {
      cerr << "No column 8 in line " << lineno << ": " << linefline << endl;
      exit(1);
    }
  }
  // Close a final sentence that was not followed by an empty line. The test
  // must be "> -1" (as inside the loop): with "> 0" a sentence starting at
  // token 0 -- e.g. a single-sentence file -- lost its SENT tuple.
  if (toks.size() > 0 && sentstart > -1) {
    brackets.push_back(eTuple(sentstart, toks.size()-1, "SENT", -1, -1));
    brackets.back().setSent(sentcnt);
  }
  //finished reading CONLLX file
  if (brackets.empty()) {
    cerr << "No tokens found in CONLLX file." << " Not an CONLLX file?" << endl;
    exit(1);
  }

  //align tokens to raw character positions
  vector tokstart(toks.size(), -1);
  vector tokend(toks.size(), -1);
  if (rawFromGold) {
    createRawFromGold(rmap, toks, tokstart, tokend);
  } else {
    lcsAlign (rmap, toks, tokstart, tokend);
    // alignRaw (rmap, toks, tokstart, tokend);
    //positions, not chars
    for (int x = 0; x < toks.size(); ++x) {
      if (!toks[x].isEmpty() && tokend[x] != -1) {
        tokend[x]++;
      }
    }
  }

  // turn token-index brackets into character-position tuples
  for (vector >::iterator bitr = brackets.begin(); bitr != brackets.end(); ++bitr) {
    int start = -1;
    int end = -1;
    if (bitr->getFirst() < tokstart.size())
      start = tokstart[bitr->getFirst()];
    if (bitr->getSecond() < tokend.size())
      end = tokend[bitr->getSecond()];
    //check token span, to force aligned start and end, if possible
    for (int check = bitr->getFirst(); check <= bitr->getSecond(); ++check) {
      if (start == -1 && tokstart[check] > -1) {
        start = tokstart[check];
      }
      if (tokend[check] > -1 && end < tokend[check]) {
        end = tokend[check];
      }
    }
    //give unaligned tuples zero-length span, at end of last aligned token
    if (start == -1) {
      if (end != -1) {
        start = end;
      } else {
        for (int x = bitr->getFirst(); x >= 0; --x) {
          if (tokend[x] != -1) {
            start = tokend[x];
            end = tokend[x];
            break;
          }
        }
        if (start == -1) {
          start = 0;
          end = 0;
        }
      }
    }
    if (end == -1) {
      end = start;
    }
    if (boundary && bitr->getThird() == "SENT")
      start = end;
    if (elts.count(start) == 0)
      elts.insert(pair > >(start, vector >()));
    if (bitr->getFourth() == -1) {
      // not a dependency tuple
      elts[start].push_back(eTuple(start, end, bitr->getThird(), -1, -1));
    } else {
      if (bitr->getFourth() == 0) {
        // root: head span is the dependent's own span
        elts[start].push_back(eTuple(start, end, bitr->getThird(), start, end));
      } else {
        // back from the 1-based stored value to an index into toks; a head
        // id pointing outside the file used to read out of bounds
        int headtok = bitr->getFourth() - 1;
        if (headtok < 0 || headtok >= static_cast<int>(tokstart.size())) {
          cerr << "Head token out of range in sentence " << bitr->getSent() << " of CONLLX file." << endl;
          exit(1);
        }
        elts[start].push_back(eTuple(start, end, bitr->getThird(), tokstart[headtok], tokend[headtok]));
      }
    }
    elts[start].back().setSent(bitr->getSent());
    elts[start].back().setComment(bitr->getComment());
  }

  // add fuzzy start/end alternatives over adjacent whitespace (and
  // punctuation when `fuzzy`); this runs over every tuple in elts
  for (map > >::iterator litr = elts.begin(); litr != elts.end(); ++litr) {
    for (vector >::iterator titr = litr->second.begin(); titr != litr->second.end(); ++titr) {
      IndexType x = titr->getSecond() - 2; //char not pos
      while (x >=0 && (u_isspace(rmap.charAt(x)) || (fuzzy && u_ispunct(rmap.charAt(x))))) {
        titr->addFuzzySecond(x+1); //pos not char
        x--;
      }
      x = titr->getSecond(); //char not pos
      while (x < rmap.length() && (u_isspace(rmap.charAt(x)) || (fuzzy && u_ispunct(rmap.charAt(x))))) {
        titr->addFuzzySecond(x+1); //pos not char
        x++;
      }
      if (titr->getFirst() < titr->getSecond()) {//non-zero span
        x = titr->getFirst()-1;
        while (x >=0 && (u_isspace(rmap.charAt(x)) || (fuzzy && u_ispunct(rmap.charAt(x))))) {
          titr->addFuzzyFirst(x);
          x--;
        }
        x = titr->getFirst() + 1;
        while (x < rmap.length() && (u_isspace(rmap.charAt(x)) || (fuzzy && u_ispunct(rmap.charAt(x))))) {
          titr->addFuzzyFirst(x);
          x++;
        }
      } else { //zero span
        for (set::iterator sit = titr->getFuzzySecond().begin(); sit != titr->getFuzzySecond().end(); ++sit) {
          titr->addFuzzyFirst(*sit);
        }
      }
      //and for the head of dependent
      if (titr->getFourth() != -1 && titr->getFifth() != -1) {
        x = titr->getFifth() - 2; //char not pos
        while (x >=0 && (u_isspace(rmap.charAt(x)) || (fuzzy && u_ispunct(rmap.charAt(x))))) {
          titr->addFuzzyFifth(x+1); //pos not char
          x--;
        }
        x = titr->getFifth(); //char not pos
        while (x < rmap.length() && (u_isspace(rmap.charAt(x)) || (fuzzy && u_ispunct(rmap.charAt(x))))) {
          titr->addFuzzyFifth(x+1); //pos not char
          x++;
        }
        if (titr->getFourth() < titr->getFifth()) {//non-zero span
          x = titr->getFourth()-1;
          while (x >=0 && (u_isspace(rmap.charAt(x)) || (fuzzy && u_ispunct(rmap.charAt(x))))) {
            titr->addFuzzyFourth(x);
            x--;
          }
          x = titr->getFourth() + 1;
          while (x < rmap.length() && (u_isspace(rmap.charAt(x)) || (fuzzy && u_ispunct(rmap.charAt(x))))) {
            titr->addFuzzyFourth(x);
            x++;
          }
        } else { //zero span
          for (set::iterator sit = titr->getFuzzyFifth().begin(); sit != titr->getFuzzyFifth().end(); ++sit) {
            titr->addFuzzyFourth(*sit);
          }
        }
      }
    }
  }
}

// Rebuilds the raw text from the tokens: `rmap` is overwritten with the
// tokens joined by single spaces, with -LRB-/-RRB-/-LCB-/-RCB- replaced by
// ( ) { }. For each token t, tokstart[t] is the length of rmap before the
// token is appended and tokend[t] the length after it.
// `tokstart` and `tokend` must already have at least toks.size() elements.
void createRawFromGold(UnicodeString &rmap, vector &toks, vector &tokstart, vector &tokend)
{
  rmap = ""; //in case we accidentally also give a raw file,
             //default to the extracted version
  for (int t = 0; t < toks.size(); ++t) {
    if (rmap != "")
      rmap += " ";
    tokstart[t] = rmap.length();
    if (toks[t] == "-LRB-") {
      rmap += "(";
    } else if (toks[t] == "-RRB-") {
      rmap += ")";
    } else if (toks[t] == "-LCB-") {
      rmap += "{";
    } else if (toks[t] == "-RCB-") {
      rmap += "}";
    } else {
      rmap += toks[t];
    }
    tokend[t] = rmap.length();
  }
}

//write out interim characterised files to avoid parsing and re-aligning the
//same files continuously
//
// One line per tuple whose status is not DEL, tab separated:
//   first[,fuzzy...]  second[,fuzzy...]  third  then either "#comment" or,
//   when there is no comment and the fourth value is not -1,
//   fourth[,fuzzy...]  fifth[,fuzzy...]
// Fuzzy first/second values are written only when `printfuzzy` is set; fuzzy
// fourth/fifth values are always written. An empty line is written whenever
// the sentence number changes from one tuple to the next (deleted tuples
// included).
void writeInterim(map > > &elts, ostream &out, bool printfuzzy)
{
  int sentcnt = 0;  // overwritten by the first tuple before it is compared
  for (map > >::iterator itr = elts.begin(); itr != elts.end(); ++itr) {
    for (vector >::iterator titr = itr->second.begin(); titr != itr->second.end(); ++titr) {
      if (itr == elts.begin() && titr == itr->second.begin()) {
        sentcnt = titr->getSent();
      } else {
        if (titr->getSent() != sentcnt) {
          out << endl;
          sentcnt = titr->getSent();
        }
      }
      if (titr->getStatus() != DEL) {
        out << titr->getFirst();
        if (printfuzzy && !titr->getFuzzyFirst().empty()) {
          for (set::iterator fit = titr->getFuzzyFirst().begin(); fit != titr->getFuzzyFirst().end(); ++fit) {
            out << "," << *fit;
          }
        }
        out << "\t" << titr->getSecond();
        if (printfuzzy && !titr->getFuzzySecond().empty()) {
          for (set::iterator fit = titr->getFuzzySecond().begin(); fit != titr->getFuzzySecond().end(); ++fit) {
            out << "," << *fit;
          }
        }
        out << "\t" << titr->getThird();
        if (!titr->getComment().empty()) {
          out << "\t#" << titr->getComment();
        } else {
          if (titr->getFourth() != -1) {
            out << "\t" << titr->getFourth();
            if (!titr->getFuzzyFourth().empty()) {
              for (set::iterator fit = titr->getFuzzyFourth().begin(); fit != titr->getFuzzyFourth().end(); ++fit) {
                out << "," << *fit;
              }
            }
            out << "\t" << titr->getFifth();
            if (!titr->getFuzzyFifth().empty()) {
              for (set::iterator fit = titr->getFuzzyFifth().begin(); fit != titr->getFuzzyFifth().end(); ++fit) {
                out << "," << *fit;
              }
            }
          }
        }
        out << endl;
      }
    }
  }
}