#include "eval.h"

using namespace std;

// NOTE(review): the template arguments of the element containers are written
// here as Triple<IndexType> elements grouped in a map keyed by IndexType.
// Confirm against the declarations in eval.h; if the element class or key
// type differs, these typedefs are the only place that needs to change.
typedef Triple<IndexType> EvalElt;
typedef vector<EvalElt> EvalEltList;
typedef map<IndexType, EvalEltList> EvalEltMap;

namespace {

// Running totals for one label category (or for all categories together).
struct ScoreTotals
{
  int correct;     // gold elements that found a matching test element
  int gold;        // gold elements seen
  int test;        // test elements seen
  int exactSents;  // sentences whose exact flag is set for this category
  double prec;
  double rec;
  double f;
  double exact;    // exactSents divided by the number of sentences

  ScoreTotals()
    : correct(0), gold(0), test(0), exactSents(0),
      prec(0), rec(0), f(0), exact(0) {}

  // Derives precision/recall/F and the exact-sentence ratio from the counts.
  // The ratio is 0 when there are no sentences (mirrors calcScores, which
  // also reports 0 rather than dividing by zero).
  void finalise(int sentCount)
  {
    calcScores(test, gold, correct, &prec, &rec, &f);
    exact = sentCount > 0 ? static_cast<double>(exactSents)/sentCount : 0.0;
  }
};

// True when label is longer than the four-character prefix and starts with it.
bool hasTypePrefix(const string &label, const char *prefix)
{
  return label.length() > 4 && label.compare(0, 4, prefix) == 0;
}

// Maps a label to its category: "SENT", "TOK", "POS:..." and "DEP:..." are
// recognised explicitly, every other label is counted as PS.
TripleType labelType(const string &label)
{
  if (label == "SENT") return SENT;
  if (label == "TOK") return TOK;
  if (hasTypePrefix(label, "POS:")) return POS;
  if (hasTypePrefix(label, "DEP:")) return DEP;
  return PS;
}

// Counts one gold element of the given category against a sentence.
void countGold(Sentence &s, TripleType type)
{
  s.gold++;
  switch (type) {
    case SENT: s.ssgold++; break;
    case TOK: s.tsgold++; break;
    case POS: s.ptgold++; break;
    case DEP: s.depgold++; break;
    default: s.psgold++; break;
  }
}

// Counts one gold/test match: the matched test element is added to the
// sentence's test counts at the same time as the correct counts.
void countMatch(Sentence &s, TripleType type)
{
  s.correct++;
  s.test++;
  switch (type) {
    case SENT: s.sscorrect++; s.sstest++; break;
    case TOK: s.tscorrect++; s.tstest++; break;
    case POS: s.ptcorrect++; s.pttest++; break;
    case DEP: s.depcorrect++; s.deptest++; break;
    default: s.pscorrect++; s.pstest++; break;
  }
}

// Counts one unmatched test element against a sentence; the sentence can no
// longer be exact, overall or for the element's category.
void countSpurious(Sentence &s, TripleType type)
{
  s.exact = false;
  s.test++;
  switch (type) {
    case SENT: s.ssexact = false; s.sstest++; break;
    case TOK: s.tsexact = false; s.tstest++; break;
    case POS: s.ptexact = false; s.pttest++; break;
    case DEP: s.depexact = false; s.deptest++; break;
    default: s.psexact = false; s.pstest++; break;
  }
}

// True when the gold span accepts the test span: each end point is either
// identical or listed among the gold element's fuzzy alternatives.
bool spansAgree(EvalElt &gold, EvalElt &test)
{
  return (test.getFirst() == gold.getFirst()
      || gold.getFuzzyFirst().count(test.getFirst()) > 0)
    && (test.getSecond() == gold.getSecond()
      || gold.getFuzzySecond().count(test.getSecond()) > 0);
}

// True when equivlabels lists b as an equivalent of a. Never inserts.
bool listedEquivalent(const map<string, set<string> > &equivlabels,
  const string &a, const string &b)
{
  map<string, set<string> >::const_iterator entry = equivlabels.find(a);
  return entry != equivlabels.end() && entry->second.count(b) > 0;
}

// Writes the eight tab-prefixed columns of one category for the stats row.
void printStatsColumns(const ScoreTotals &t)
{
  cout << "\t" << t.correct << "\t" << t.gold << "\t" << t.test
    << "\t" << t.prec << "\t" << t.rec << "\t" << t.f
    << "\t" << t.exactSents << "\t" << t.exact;
}

// Writes one line of the human-readable summary. prefix carries the label
// and the opening of the recall fraction, e.g. "SENT: R: (".
void printSummaryLine(const char *prefix, const ScoreTotals &t,
  bool showExact, int sentCount)
{
  cout << prefix << t.correct << "/" << t.gold << ") = " << t.rec
    << ", P: (" << t.correct << "/" << t.test << ") = " << t.prec
    << ", F: " << t.f;
  if (showExact)
    cout << ", EX: (" << t.exactSents << "/" << sentCount << ") " << t.exact;
  cout << endl;
}

// Adds a record tagged with source for every element still in INIT state.
void collectUnmatched(char source, EvalEltMap &elts,
  set<UnmatchedRecord> &unmatched)
{
  for (EvalEltMap::iterator itr = elts.begin(); itr != elts.end(); ++itr) {
    for (EvalEltList::iterator titr = itr->second.begin();
        titr != itr->second.end(); ++titr) {
      if (titr->getStatus() == INIT) {
        unmatched.insert(UnmatchedRecord(source, titr->getSent(),
          titr->getFirst(), titr->getSecond(), titr->getThird(),
          titr->getComment(), titr->getFourth(), titr->getFifth()));
      }
    }
  }
}

} // namespace

// Matches gold elements (gelts) against test elements (telts), marks matched
// elements on both sides with status MATCH, and prints scores to cout.
//
//   equivlabels  label -> set of labels accepted as equivalent
//   stats        print one tab-separated row of all totals
//   multi        in the fuzzy-start pass, also accept overlapping fuzzy tags
//   unlabelled   passed to equiv(): ignore DEP and PS label differences
//   delimchars   passed to equiv(): characters at which labels are cut
//   sent_counts  print a header and one tab-separated row per sentence
//
// When neither stats nor sent_counts is set, a human-readable summary is
// printed with one line per category that has gold elements, plus ALL.
void compareTuples(EvalEltMap &gelts, EvalEltMap &telts,
  map<string, set<string> > &equivlabels, bool stats, bool multi,
  bool unlabelled, string delimchars, bool sent_counts)
{
  map<int, Sentence> sentences;

  // Pass 1: record sentence extents and look for a test match for every
  // gold element that is still in INIT state.
  for (EvalEltMap::iterator litr = gelts.begin(); litr != gelts.end();
      ++litr) {
    for (EvalEltList::iterator titr = litr->second.begin();
        titr != litr->second.end(); ++titr) {
      const int currsent = titr->getSent();
      Sentence &sent = sentences[currsent];
      // -1 marks an extent that has not been set yet
      if (sent.start == -1 || sent.start > titr->getFirst())
        sent.start = titr->getFirst();
      if (sent.end == -1 || sent.end < titr->getSecond())
        sent.end = titr->getSecond();

      if (titr->getStatus() != INIT)
        continue;

      const TripleType ttype = labelType(titr->getThird());
      countGold(sent, ttype);

      // First choice: test elements filed under the same key as the gold
      // element; span, label and the fourth/fifth fields must all agree.
      EvalEltMap::iterator same = telts.find(litr->first);
      if (same != telts.end()) {
        for (EvalEltList::iterator ttitr = same->second.begin();
            ttitr != same->second.end(); ++ttitr) {
          if (ttitr->getStatus() == INIT
              && spansAgree(*titr, *ttitr)
              && equiv(ttitr->getThird(), titr->getThird(), equivlabels,
                delimchars, unlabelled)
              && (ttitr->getFourth() == titr->getFourth()
                || titr->getFuzzyFourth().count(ttitr->getFourth()) > 0)
              && (ttitr->getFifth() == titr->getFifth()
                || titr->getFuzzyFifth().count(ttitr->getFifth()) > 0)) {
            countMatch(sent, ttype);
            ttitr->setStatus(MATCH);
            titr->setStatus(MATCH);
            break;
          }
        }
      }

      // Second choice: test elements filed under one of the gold element's
      // fuzzy start points. The scan stops as soon as the gold element is
      // matched, so that one gold element is never counted correct twice.
      // NOTE(review): this pass does not compare the fourth/fifth fields;
      // confirm that is intended.
      for (set<IndexType>::iterator sit = titr->getFuzzyFirst().begin();
          sit != titr->getFuzzyFirst().end() && titr->getStatus() != MATCH;
          ++sit) {
        EvalEltMap::iterator cand = telts.find(*sit);
        if (cand == telts.end())
          continue;
        for (EvalEltList::iterator ttitr = cand->second.begin();
            ttitr != cand->second.end(); ++ttitr) {
          if (ttitr->getStatus() == INIT
              && spansAgree(*titr, *ttitr)
              && (equiv(ttitr->getThird(), titr->getThird(), equivlabels,
                  delimchars, unlabelled)
                || (multi && tagoverlap(titr->getFuzzyThird(),
                  ttitr->getFuzzyThird(), equivlabels)))) {
            countMatch(sent, ttype);
            ttitr->setStatus(MATCH);
            titr->setStatus(MATCH);
            break;
          }
        }
      }
    }
  }

  const bool getexact = true;
  ScoreTotals all, ss, ts, pt, dep, ps;

  // Pass 2: add up gold/correct counts, flag sentences with full recall as
  // exact (unmatched test elements revoke this below), and index sentences
  // by end position.
  map<IndexType, int> s_by_end;
  for (map<int, Sentence>::iterator mit = sentences.begin();
      mit != sentences.end(); ++mit) {
    Sentence &s = mit->second;
    if (s.start != -1 && s.end != -1)
      s_by_end[s.end] = mit->first;

    all.correct += s.correct;
    all.gold += s.gold;
    if (s.correct == s.gold) s.exact = true;

    ss.correct += s.sscorrect;
    ss.gold += s.ssgold;
    if (s.sscorrect == s.ssgold) s.ssexact = true;

    ts.correct += s.tscorrect;
    ts.gold += s.tsgold;
    if (s.tscorrect == s.tsgold) s.tsexact = true;

    pt.correct += s.ptcorrect;
    pt.gold += s.ptgold;
    if (s.ptcorrect == s.ptgold) s.ptexact = true;

    dep.correct += s.depcorrect;
    dep.gold += s.depgold;
    if (s.depcorrect == s.depgold) s.depexact = true;

    ps.correct += s.pscorrect;
    ps.gold += s.psgold;
    if (s.pscorrect == s.psgold) s.psexact = true;
  }

  // Pass 3: count test elements; every unmatched one is charged to the
  // sentences it touches.
  for (EvalEltMap::iterator litr = telts.begin(); litr != telts.end();
      ++litr) {
    for (EvalEltList::iterator titr = litr->second.begin();
        titr != litr->second.end(); ++titr) {
      if (titr->getStatus() != INIT && titr->getStatus() != MATCH)
        continue;

      const TripleType ttype = labelType(titr->getThird());
      all.test++;

      if (titr->getStatus() == INIT && getexact
          && titr->getFirst() > -1 && titr->getSecond() > -1) {
        // first sentence that ends after the element starts
        map<IndexType, int>::iterator fgs
          = s_by_end.upper_bound(titr->getFirst());
        if (fgs != s_by_end.end()) {
          // Walk forward through consecutive sentence ids while the element
          // ends inside the sentence or reaches past its end.
          // NOTE(review): the bound and operator[] (which creates missing
          // entries) presume sentence ids run 1..N without gaps -- confirm.
          for (int x = fgs->second;
              static_cast<map<int, Sentence>::size_type>(x)
                <= sentences.size();
              ++x) {
            Sentence &s = sentences[x];
            const bool endsInside = titr->getSecond() >= s.start
              && titr->getSecond() <= s.end;
            if (!(endsInside || s.end <= titr->getSecond()))
              break;
            countSpurious(s, ttype);
          }
        }
      }

      switch (ttype) {
        case SENT: ss.test++; break;
        case TOK: ts.test++; break;
        case POS: pt.test++; break;
        case DEP: dep.test++; break;
        default: ps.test++; break;
      }
    }
  }

  // Pass 4: per-sentence rows (optional) and exact-sentence counts.
  if (sent_counts) {
    cout << "Exact"
      << "\tCorrect\tGold\tTest"
      << "\tSent Correct\tSent Gold\tSent Test"
      << "\tTok Correct\tTok Gold\tTok Test"
      << "\tPOS Correct\tPOS Gold\tPOS Test"
      << "\tDep Correct\tDep Gold\tDep Test"
      << "\tPS Correct\tPS Gold\tPS Test" << endl;
  }
  int sent_gold = 0;
  for (map<int, Sentence>::iterator mit = sentences.begin();
      mit != sentences.end(); ++mit) {
    const Sentence &s = mit->second;
    sent_gold++;
    if (s.exact) all.exactSents++;
    if (s.ssexact) ss.exactSents++;
    if (s.tsexact) ts.exactSents++;
    if (s.ptexact) pt.exactSents++;
    if (s.depexact) dep.exactSents++;
    if (s.psexact) ps.exactSents++;
    if (sent_counts) {
      cout << (s.exact?1:0)
        << "\t" << s.correct << "\t" << s.gold << "\t" << s.test
        << "\t" << s.sscorrect << "\t" << s.ssgold << "\t" << s.sstest
        << "\t" << s.tscorrect << "\t" << s.tsgold << "\t" << s.tstest
        << "\t" << s.ptcorrect << "\t" << s.ptgold << "\t" << s.pttest
        << "\t" << s.depcorrect << "\t" << s.depgold << "\t" << s.deptest
        << "\t" << s.pscorrect << "\t" << s.psgold << "\t" << s.pstest
        << endl;
    }
  }

  ss.finalise(sent_gold);
  ts.finalise(sent_gold);
  pt.finalise(sent_gold);
  dep.finalise(sent_gold);
  ps.finalise(sent_gold);
  all.finalise(sent_gold);

  if (stats) {
    cout << sent_gold;
    printStatsColumns(ss);
    printStatsColumns(ts);
    printStatsColumns(pt);
    printStatsColumns(dep);
    printStatsColumns(ps);
    printStatsColumns(all);
    cout << endl;
  } else if (!sent_counts) {
    if (ss.gold > 0) printSummaryLine("SENT: R: (", ss, getexact, sent_gold);
    if (ts.gold > 0) printSummaryLine(" TOK: R: (", ts, getexact, sent_gold);
    if (pt.gold > 0) printSummaryLine(" POS: R: (", pt, getexact, sent_gold);
    if (dep.gold > 0) printSummaryLine(" DEP: R: (", dep, getexact, sent_gold);
    if (ps.gold > 0) printSummaryLine(" PS: R: (", ps, getexact, sent_gold);
    printSummaryLine(" ALL: R: (", all, getexact, sent_gold);
  }
}

// Decides whether labels a and b count as the same label. They do when:
//   - unlabelled is set and both are DEP labels, or both are PS labels;
//   - they are identical;
//   - equivlabels lists b as an equivalent of a;
//   - both carry the same "POS:" or "DEP:" prefix and the remainders are
//     listed as equivalent;
//   - delimchars is non-empty and, after cutting each label at its first
//     delimiter character, the remainders are identical or listed as
//     equivalent.
bool equiv (string a, string b, map<string, set<string> > &equivlabels,
  string delimchars, bool unlabelled)
{
  if (unlabelled) {
    const TripleType ta = labelType(a);
    const TripleType tb = labelType(b);
    if (ta == DEP && tb == DEP)
      return true;
    if (ta == PS && tb == PS)
      return true;
  }

  if (a == b)
    return true;
  if (listedEquivalent(equivlabels, a, b))
    return true;

  string tmpa = a;
  string tmpb = b;
  if (hasTypePrefix(tmpa, "POS:") && hasTypePrefix(tmpb, "POS:")) {
    tmpa = tmpa.substr(4);
    tmpb = tmpb.substr(4);
    if (listedEquivalent(equivlabels, tmpa, tmpb))
      return true;
  }
  if (hasTypePrefix(tmpa, "DEP:") && hasTypePrefix(tmpb, "DEP:")) {
    tmpa = tmpa.substr(4);
    tmpb = tmpb.substr(4);
    if (listedEquivalent(equivlabels, tmpa, tmpb))
      return true;
  }

  if (!delimchars.empty()) {
    // Both labels are cut with the caller's delimiters. substr(0, npos)
    // yields the whole label when no delimiter is present.
    tmpa = a.substr(0, a.find_first_of(delimchars));
    tmpb = b.substr(0, b.find_first_of(delimchars));
    if (tmpa == tmpb)
      return true;
    if (listedEquivalent(equivlabels, tmpa, tmpb))
      return true;
  }
  return false;
}

// True when some tag in gtags also occurs in ttags, or is listed in
// equivlabels as equivalent to some tag in ttags.
bool tagoverlap (set<string> &gtags, set<string> &ttags,
  map<string, set<string> > &equivlabels)
{
  for (set<string>::iterator gtitr = gtags.begin(); gtitr != gtags.end();
      ++gtitr) {
    if (ttags.count(*gtitr) > 0)
      return true;
    map<string, set<string> >::iterator entry = equivlabels.find(*gtitr);
    if (entry == equivlabels.end())
      continue;
    for (set<string>::iterator ttitr = ttags.begin(); ttitr != ttags.end();
        ++ttitr) {
      if (entry->second.count(*ttitr))
        return true;
    }
  }
  return false;
}

// Calculates precision (correct/tcount), recall (correct/gcount) and their
// harmonic mean, storing them through p, r and f. A score whose denominator
// would be zero is reported as 0. Nothing is printed.
void calcScores(int tcount, int gcount, int correct, double *p, double *r,
  double *f)
{
  *p = 0;
  if (tcount > 0)
    *p = static_cast<double>(correct)/tcount;
  *r = 0;
  if (gcount > 0)
    *r = static_cast<double>(correct)/gcount;
  *f = 0;
  if (correct > 0)
    *f = (2*(*p)*(*r))/((*p) + (*r));
}

// Writes the record as one tab-separated line: source, sentence id in
// parentheses, first, second and third; then either "#comment" when a
// comment is present, or fourth and fifth when both are non-negative.
void UnmatchedRecord::printRecord(ostream &out) const
{
  out << source << " (" << sent << ")\t" << first << "\t" << second
    << "\t" << third;
  if (!comment.empty()) {
    out << "\t#" << comment;
  } else if (fourth >= 0 && fifth >= 0) {
    out << "\t" << fourth << "\t" << fifth;
  }
  out << endl;
}

// Prints every gold ('G') and test ('T') element that is still in INIT
// state, in the order defined by UnmatchedRecord's ordering within a set.
void printSortedUnmatched(EvalEltMap &gelts, EvalEltMap &telts, ostream &out)
{
  set<UnmatchedRecord> unmatched;
  collectUnmatched('G', gelts, unmatched);
  collectUnmatched('T', telts, unmatched);
  for (set<UnmatchedRecord>::iterator sit = unmatched.begin();
      sit != unmatched.end(); ++sit) {
    sit->printRecord(out);
  }
}