#include "oldalign.h"

//aligns between vector of chars from raw string, and tokens from annotated
//files.  Includes a lot of fuzziness seen in various treebanks including:
// * bracket substitutions as in Penn Treebank
// * backslash escapes as in PTB
// * quote substitutions ``, '', ", “, ”, ', `
// * Unicode substitutions from the English Web Treebank
// * double punctuation handling from Brown Corpus
// * special handling for C&J parser's mangling (won't -> will n't etc)
// * html entity equivalence (&gt;, &lt;, &quot;, &amp;)
// * EWTB errors: &apos -> ', … to single . or ♥ to <
// * case insensitive compare
// * allows skipping MAXIGNORE char in a row, currently set to 1
//
//NOTE(review): this file was recovered from a copy in which template
//arguments and HTML entity literals had been stripped.  The element types
//(vector<UnicodeString>, vector<int>, match_results<const UChar*>) and the
//entity strings ("&amp;", "&gt;", "&lt;", "&quot;") are reconstructed from
//how the values are used and from the compare lengths (5, 4, 4, 6) --
//confirm against oldalign.h.

//Widen the recorded raw span [tokstart[tok], tokend[tok]] (both inclusive,
//-1 meaning "not set yet") so that it covers [first, last].
static void markSpan(vector<int> &tokstart, vector<int> &tokend,
                     int tok, int first, int last) {
  if (tokstart[tok] == -1 || first < tokstart[tok])
    tokstart[tok] = first;
  if (tokend[tok] == -1 || last > tokend[tok])
    tokend[tok] = last;
}

//True for characters that are skipped as spacing: Unicode white space,
//no-break space (nbsp) and soft hyphen (shy).
static bool isSkippableSpace(UChar c, const UnicodeString &nbsp,
                             const UnicodeString &shy) {
  return u_isUWhiteSpace(c) || nbsp.compare(c) == 0 || shy.compare(c) == 0;
}

//True for the straight double quote and the two curly double quotes.
static bool isDoubleQuote(UChar c, const UnicodeString &ldquo,
                          const UnicodeString &rdquo) {
  return c == '"' || ldquo.compare(c) == 0 || rdquo.compare(c) == 0;
}

//True when the 5-char PTB bracket code starting at text[pos] may stand for
//the bracket character c.  compare() only succeeds when all five chars are
//present, so no separate length check is needed for correctness.
static bool bracketCodeMatches(const UnicodeString &text, int pos, UChar c) {
  if (text.compare(pos, 5, "-LRB-") == 0)
    return c == '(' || c == '[' || c == '<';
  if (text.compare(pos, 5, "-RRB-") == 0)
    return c == ')' || c == ']' || c == '>';
  if (text.compare(pos, 5, "-LSB-") == 0)
    return c == '[';
  if (text.compare(pos, 5, "-RSB-") == 0)
    return c == ']';
  if (text.compare(pos, 5, "-LCB-") == 0)
    return c == '{';
  if (text.compare(pos, 5, "-RCB-") == 0)
    return c == '}';
  return false;
}

//Walks the raw text and the token list in parallel and records, for every
//token, the inclusive index range of the raw characters it was aligned to.
//
// rmap     - raw text, indexed by UTF-16 code unit
// toks     - tokens from the annotated file, in order
// tokstart - out: first raw index of each token; entries equal to -1 are
//            treated as unset
// tokend   - out: last raw index of each token (inclusive); -1 == unset
//
//tokstart/tokend are indexed by token number and are never resized here, so
//the caller must size them to toks.size() (presumably pre-filled with -1 --
//confirm against the caller).
//
//If no rule can reconcile the current raw char with the current token char
//the function prints a diagnostic to cerr and calls exit(1).
void alignRaw (UnicodeString &rmap, vector<UnicodeString> &toks,
               vector<int> &tokstart, vector<int> &tokend) {
  int ridx = 0;    //position in the raw text
  int tokidx = 0;  //current token
  int ignore = 0;  //consecutive chars skipped without a match

  const boost::u32regex ellipsisperiod =
    boost::make_u32regex("^((?:\\.\\s*)+\\.)\\s*(\\.)");
  const boost::u32regex ellipsis = boost::make_u32regex("^((?:\\.\\s*)+\\.)");
  const boost::u32regex dashes = boost::make_u32regex("^((?:-\\s*)+-)");

  //Loop-invariant conversions, done once instead of once per character.
  //This assumes Conv->convert() has no side effects.
  const UnicodeString NBSP = Conv->convert(string("\u00A0"));
  const UnicodeString SHY = Conv->convert(string("\u00AD"));
  const UnicodeString HELLIP = Conv->convert(string("\u2026"));
  const UnicodeString ENDASH = Conv->convert(string("\u2013"));
  const UnicodeString EMDASH = Conv->convert(string("\u2014"));
  const UnicodeString HEART = Conv->convert(string("\u2665"));
  const UnicodeString LDQUO = Conv->convert(string("“"));
  const UnicodeString RDQUO = Conv->convert(string("”"));
  const UnicodeString LSQUO = Conv->convert(string("‘"));
  const UnicodeString NT_PLAIN = Conv->convert(string("n't"));
  const UnicodeString NT_CURLY = Conv->convert(string("n’t"));
  const UnicodeString WILL = Conv->convert(string("will"));
  const UnicodeString WO = Conv->convert(string("wo"));
  const UnicodeString CAN = Conv->convert(string("can"));
  const UnicodeString CA = Conv->convert(string("ca"));
  const UnicodeString AI = Conv->convert(string("ai"));

  const int rawlen = rmap.length();
  const int ntoks = static_cast<int>(toks.size());

  while (ridx < rawlen && tokidx < ntoks) {
    UChar rchar = rmap.charAt(ridx);
    UnicodeString utok = toks[tokidx];
    const int toklen = utok.length();

    //DEBUG
    // cerr << "aligning " << Conv->convert(utok) << " at ";
    // int debugidx = ridx;
    // while (debugidx < rmap.length() && debugidx < ridx+30) {
    //   cerr << Conv->convert(rmap.charAt(debugidx));
    //   debugidx++;
    // }
    // cerr << endl;

    //start of token
    //skip spaces, since a token won't have a leading space
    if (isSkippableSpace(rchar, NBSP, SHY)) {
      ridx++;
      continue;
    }

    //ellipses handling
    if (utok == "..." || HELLIP == utok) {
      if (HELLIP.compare(rchar) == 0) {
        markSpan(tokstart, tokend, tokidx, ridx, ridx);
        ridx++;
        tokidx++;
        ignore = 0;
        continue;
      }
      //next token is "." or starts with a single '.': leave the last dot
      //of the raw run for it
      const bool periodFollows =
        tokidx+1 < ntoks &&
        (toks[tokidx+1] == "." ||
         (toks[tokidx+1].charAt(0) == '.' &&
          toks[tokidx+1].length() > 1 &&
          toks[tokidx+1].charAt(1) != '.'));
      const bool noDotFollows =
        tokidx+1 == ntoks || toks[tokidx+1].charAt(0) != '.';
      if (periodFollows || noDotFollows) {
        //The match results hold pointers into the searched buffer, so the
        //substring must be a named object that outlives them.
        //match_continuous pins the match to ridx; without it '^' can also
        //match after a later newline in the raw text.
        const UnicodeString rest(rmap, ridx);
        boost::match_results<const UChar*> res;
        if (periodFollows &&
            boost::u32regex_search(rest, res, ellipsisperiod,
                                   boost::match_continuous)) {
          const int ellipsisend = ridx + res[1].length() - 1;
          const int periodstart =
            ridx + static_cast<int>(res[2].first - res[0].first);
          markSpan(tokstart, tokend, tokidx, ridx, ellipsisend);
          ridx = periodstart;
          tokidx++;
          ignore = 0;
          continue;
        }
        else if (noDotFollows &&
                 boost::u32regex_search(rest, res, ellipsis,
                                        boost::match_continuous)) {
          const int ellipsisend = ridx + res[1].length() - 1;
          markSpan(tokstart, tokend, tokidx, ridx, ellipsisend);
          ridx = ellipsisend+1;
          tokidx++;
          ignore = 0;
          continue;
        }
      }
    }

    //dash handling
    if (utok == "--" || ENDASH == utok || utok == "---" || EMDASH == utok) {
      if (ENDASH.compare(rchar) == 0 || EMDASH.compare(rchar) == 0) {
        markSpan(tokstart, tokend, tokidx, ridx, ridx);
        ridx++;
        tokidx++;
        ignore = 0;
        continue;
      }
      //swallow a whole run of raw dashes unless the next token is itself
      //made of dashes (PTB bracket codes start with '-' but do not count)
      if (tokidx+1 == ntoks ||
          toks[tokidx+1].charAt(0) != '-' ||
          toks[tokidx+1].compare(0,5,"-LRB-") == 0 ||
          toks[tokidx+1].compare(0,5,"-RRB-") == 0 ||
          toks[tokidx+1].compare(0,5,"-LCB-") == 0 ||
          toks[tokidx+1].compare(0,5,"-RCB-") == 0 ||
          toks[tokidx+1].compare(0,5,"-LSB-") == 0 ||
          toks[tokidx+1].compare(0,5,"-RSB-") == 0) {
        const UnicodeString rest(rmap, ridx);
        boost::match_results<const UChar*> res;
        if (boost::u32regex_search(rest, res, dashes,
                                   boost::match_continuous)) {
          const int dashend = ridx + res[1].length() - 1;
          markSpan(tokstart, tokend, tokidx, ridx, dashend);
          ridx = dashend+1;
          tokidx++;
          ignore = 0;
          continue;
        }
      }
    }

    //parser-mangled contractions: raw "won't"/"can't"/"ain't" against
    //tokens "will"/"can"/"IS" followed by "n't".  The current token takes
    //the first two raw chars; the n't token is aligned on the next round.
    if ((tokidx+1 < ntoks &&
         (NT_PLAIN.caseCompare(0, 3, toks[tokidx+1], 0, 3, 0) == 0 ||
          NT_CURLY.caseCompare(0, 3, toks[tokidx+1], 0, 3, 0) == 0) &&
         ridx+4 < rawlen &&
         (NT_PLAIN.caseCompare(0, 3, rmap, ridx+2, 3, 0) == 0 ||
          NT_CURLY.caseCompare(0, 3, rmap, ridx+2, 3, 0) == 0)) &&
        ((WILL.caseCompare(0,4,utok,0,4,0) == 0 &&
          WO.caseCompare(0,2,rmap,ridx,2,0) == 0) ||
         (CAN.caseCompare(0,3,utok,0,3,0) == 0 &&
          CA.caseCompare(0,2,rmap,ridx,2,0) == 0) ||
         (utok == "IS" &&
          AI.caseCompare(0,2,rmap,ridx,2,0) == 0))) {
      tokstart[tokidx] = ridx;
      tokend[tokidx] = ridx+1;
      tokidx++;
      ridx += 2;
      ignore = 0;
      continue;
    }

    //when PTB-style quotes are input and split apart: two single-quote
    //tokens share one raw double-quote character
    if ((utok == "'" || utok == "`") &&
        tokidx+1 < ntoks &&
        (toks[tokidx+1] == "'" || toks[tokidx+1] == "`") &&
        isDoubleQuote(rchar, LDQUO, RDQUO)) {
      markSpan(tokstart, tokend, tokidx, ridx, ridx);
      markSpan(tokstart, tokend, tokidx+1, ridx, ridx);
      ridx++;
      tokidx += 2;
      ignore = 0;
      continue;
    }

    //matching char by char
    int x = 0;  //position inside the current token
    while (x < toklen && ridx < rawlen) {
      rchar = rmap.charAt(ridx);
      const UChar tchar = utok.charAt(x);

      //DEBUG
      // cerr << "RCHAR(" << ridx << "): " << Conv->convert(rchar)
      //      << ", TCHAR(" << x << "): "
      //      << Conv->convert(tchar) << endl;

      //checking for "&amp;" matching '&'.  Needs to be before the
      //equality check to avoid unmatched "amp;"
      if (rchar == '&' && tchar == '&') {
        const bool rawEntity =
          ridx+4 < rawlen && rmap.compare(ridx, 5, "&amp;") == 0;
        const bool tokEntity =
          x+4 < toklen && utok.compare(x, 5, "&amp;") == 0;
        const int rawspan = rawEntity ? 5 : 1;
        //record the span before ridx moves on, so that the start is the
        //'&' itself and the end is the last char consumed
        markSpan(tokstart, tokend, tokidx, ridx, ridx + rawspan - 1);
        x += tokEntity ? 5 : 1;
        ridx += rawspan;
        ignore = 0;
        continue;
      }

      //one unicode dash in raw against "--" or "---" in the token
      if (tchar == '-' &&
          (ENDASH.compare(rchar) == 0 || EMDASH.compare(rchar) == 0)) {
        const bool triple = x+2 < toklen && utok.compare(x, 3, "---") == 0;
        const bool twin = x+1 < toklen && utok.compare(x, 2, "--") == 0;
        if (triple || twin) {
          markSpan(tokstart, tokend, tokidx, ridx, ridx);
          ridx++;
          x += triple ? 3 : 2;
          ignore = 0;
          continue;
        }
      }

      //single char exact or unicode fuzzy match
      if (rchar == tchar ||
          unUnicodeMatch(rchar, tchar) ||
          unUnicodeMatch(tchar, rchar) ||
          (tchar == '`' &&
           (rchar == '\'' || LSQUO.compare(rchar) == 0)) ||
          (rchar == '`' &&
           (tchar == '\'' || LSQUO.compare(tchar) == 0))) {
        markSpan(tokstart, tokend, tokidx, ridx, ridx);
        ridx++;
        x++;
        ignore = 0;
        continue;
      }

      //escaped char: drop the backslash, the next round matches the rest
      if (tchar == '\\' && x+1 < toklen && rchar == utok.charAt(x+1)) {
        x++;
        continue;
      }

      //spacing inside the raw text or inside the token is skipped
      if (isSkippableSpace(rchar, NBSP, SHY)) {
        ridx++;
        continue;
      }
      if (isSkippableSpace(tchar, NBSP, SHY)) {
        x++;
        continue;
      }

      //token has "…." while the raw text spells the dots out
      if (HELLIP.compare(tchar) == 0 &&
          x+1 < toklen && utok.charAt(x+1) == '.') {
        const UnicodeString rest(rmap, ridx);  //must outlive res
        boost::match_results<const UChar*> res;
        if (boost::u32regex_search(rest, res, ellipsisperiod,
                                   boost::match_continuous)) {
          const int ellipsisend = ridx + res[1].length() - 1;
          const int periodstart =
            ridx + static_cast<int>(res[2].first - res[0].first);
          markSpan(tokstart, tokend, tokidx, ridx, ellipsisend);
          ridx = periodstart;
          x++;
          ignore = 0;
          continue;
        }
      }

      //berkeley parser special
      if (rchar == '@' && x+2 < toklen && utok.compare(x, 3, "SYM") == 0) {
        markSpan(tokstart, tokend, tokidx, ridx, ridx);
        ridx++;
        x += 3;
        ignore = 0;
        continue;
      }

      //PTB bracket code in the token, bracket character in raw
      if (tchar == '-' && x+4 < toklen &&
          bracketCodeMatches(utok, x, rchar)) {
        markSpan(tokstart, tokend, tokidx, ridx, ridx);
        ridx++;
        x += 5;
        ignore = 0;
        continue;
      }

      //PTB bracket code in raw, bracket character in the token.  The code
      //is looked up at the raw position and all five raw chars belong to
      //the token.
      if (rchar == '-' && ridx+4 < rawlen &&
          bracketCodeMatches(rmap, ridx, tchar)) {
        markSpan(tokstart, tokend, tokidx, ridx, ridx+4);
        ridx += 5;
        x++;
        ignore = 0;
        continue;
      }

      //html entity in raw
      if (rchar == '&' && ridx+3 < rawlen) {
        const bool rawGt = rmap.compare(ridx, 4, "&gt;") == 0;
        const bool rawLt = rmap.compare(ridx, 4, "&lt;") == 0;
        const bool rawQuot =
          ridx+5 < rawlen && rmap.compare(ridx, 6, "&quot;") == 0;

        if ((rawGt && tchar == '>') || (rawLt && tchar == '<')) {
          markSpan(tokstart, tokend, tokidx, ridx, ridx+3);
          ridx += 4;
          x++;
          ignore = 0;
          continue;
        }
        if ((rawGt && x+4 < toklen && utok.compare(x,5,"-RRB-") == 0) ||
            (rawLt && x+4 < toklen && utok.compare(x,5,"-LRB-") == 0)) {
          markSpan(tokstart, tokend, tokidx, ridx, ridx+3);
          ridx += 4;
          x += 5;
          ignore = 0;
          continue;
        }
        if (rawQuot && isDoubleQuote(tchar, LDQUO, RDQUO)) {
          markSpan(tokstart, tokend, tokidx, ridx, ridx+5);
          ridx += 6;
          x++;
          ignore = 0;
          continue;
        }
        if (rawQuot && x+1 < toklen &&
            (utok.compare(x,2,"``") == 0 || utok.compare(x,2,"''") == 0)) {
          markSpan(tokstart, tokend, tokidx, ridx, ridx+5);
          ridx += 6;
          x += 2;
          ignore = 0;
          continue;
        }
      }

      //html entity in the token
      if (tchar == '&' && x+3 < toklen) {
        const bool tokQuot =
          x+5 < toklen && utok.compare(x, 6, "&quot;") == 0;

        if ((utok.compare(x, 4, "&gt;") == 0 && rchar == '>') ||
            (utok.compare(x, 4, "&lt;") == 0 && rchar == '<')) {
          markSpan(tokstart, tokend, tokidx, ridx, ridx);
          ridx++;
          x += 4;
          ignore = 0;
          continue;
        }
        if (tokQuot && isDoubleQuote(rchar, LDQUO, RDQUO)) {
          markSpan(tokstart, tokend, tokidx, ridx, ridx);
          ridx++;
          x += 6;
          ignore = 0;
          continue;
        }
        //raw has a doubled PTB quote: looked up at the raw position, and
        //exactly two raw chars are consumed
        if (tokQuot && ridx+1 < rawlen &&
            (rmap.compare(ridx,2,"``") == 0 ||
             rmap.compare(ridx,2,"''") == 0)) {
          markSpan(tokstart, tokend, tokidx, ridx, ridx+1);
          ridx += 2;
          x += 6;
          ignore = 0;
          continue;
        }
      }

      //doubled PTB quote in the token, one double quote in raw
      if (x+1 < toklen &&
          (utok.compare(x,2,"``") == 0 || utok.compare(x,2,"''") == 0) &&
          isDoubleQuote(rchar, LDQUO, RDQUO)) {
        markSpan(tokstart, tokend, tokidx, ridx, ridx);
        x += 2;
        ridx++;
        ignore = 0;
        continue;
      }

      //doubled PTB quote in raw, one double quote in the token
      if (ridx+1 < rawlen &&
          (rmap.compare(ridx,2,"``") == 0 ||
           rmap.compare(ridx,2,"''") == 0) &&
          isDoubleQuote(tchar, LDQUO, RDQUO)) {
        markSpan(tokstart, tokend, tokidx, ridx, ridx+1);
        x++;
        ridx += 2;
        ignore = 0;
        continue;
      }

      //unicode ellipsis in raw, three dots inside the token
      if (HELLIP.compare(rchar) == 0 && utok.compare(x,3,"...") == 0) {
        markSpan(tokstart, tokend, tokidx, ridx, ridx);
        ridx++;
        x += 3;
        ignore = 0;
        continue;
      }

      //allow case-insensitive match
      if (u_foldCase(rchar, U_FOLD_CASE_DEFAULT) ==
          u_foldCase(tchar, U_FOLD_CASE_DEFAULT)) {
        markSpan(tokstart, tokend, tokidx, ridx, ridx);
        ridx++;
        x++;
        ignore = 0;
        continue;
      }

      //skip doubled punctuation, seen in Brown corpus
      //NOTE(review): "tokidx > 1" leaves the second token out of this
      //rule; kept as found -- confirm whether "> 0" was meant.
      if ((tchar == ';' || //tchar == '.' ||
           tchar == '!' || tchar == '?' || tchar == '-') &&
          tokidx > 1 &&
          toks[tokidx-1].charAt(toks[tokidx-1].length()-1) == tchar) {
        // cerr << "skipping doubled punct " << Conv->convert(tchar)
        //      << endl;
        x++;
        continue;
      }

      //partial matches seen in EWTB: … to single . or ♥ to <
      if ((HELLIP.compare(rchar) == 0 && tchar == '.' &&
           ((x+1 < toklen && utok.charAt(x+1) != '.') ||
            x+1 == toklen)) ||
          (HEART.compare(rchar) == 0 && tchar == '<')) {
        markSpan(tokstart, tokend, tokidx, ridx, ridx);
        ridx++;
        x++;
        ignore = 0;
        continue;
      }

      //ignore char in raw in very particular circumstances, since adding
      //text in the treebanks is so much more common than leaving it out
      // * the next char matches the current treebank char and either
      // ** it's a single char token and the start of the next token
      //    doesn't match the current raw char OR
      // ** we've already matched part of the token and the next TWO raw
      //    chars match the next token chars
      if (ignore < MAXIGNORE &&
          ridx+1 < rawlen && rmap.charAt(ridx+1) == tchar &&
          ((x == 0 && toklen == 1 &&
            tokidx+1 < ntoks &&
            toks[tokidx+1].charAt(0) != rmap.charAt(ridx)) ||
           (x > 0 && x+1 < toklen &&
            ridx+2 < rawlen &&
            rmap.charAt(ridx+2) == utok.charAt(x+1)))) {
        ridx++;
        ignore++;
        // cerr << "ignoring " << Conv->convert(rchar) << " in raw" << endl;
        continue;
      }

      //a stray '.' in raw may be skipped unless the next token starts
      //with '.'.  On the last token there is no next token to look at, so
      //nothing can claim the dot and it is skipped.
      if (ignore < MAXIGNORE &&
          rmap.charAt(ridx) == '.' &&
          (tokidx+1 == ntoks ||
           toks[tokidx+1].charAt(0) != rmap.charAt(ridx))) {
        ridx++;
        ignore++;
        // cerr << "ignoring " << Conv->convert(rchar) << " in raw" << endl;
        continue;
      }

      //allow skipping MAXIGNORE chars in a row in treebank, usually
      // . or -
      if (ignore < MAXIGNORE) {
        x++;
        ignore++;
        // cerr << "ignoring " << Conv->convert(tchar) << " in mrg" << endl;
        continue;
      }

      //alignment debugging.  Shouldn't happen, and will stop the evaluation,
      //hopefully with enough information to correct either the alignment
      //code or the raw file or the treebank file
      cerr << "Lost alignment at token " << tokidx << "(" << x << ") "
           << Conv->convert(utok) << " and ridx " << ridx << " (";
      //print up to 41 raw chars starting 20 before ridx, with the failing
      //position wrapped in '|'
      int debugidx = 0;
      int debugcount = 0;
      if (ridx > 20)
        debugidx = ridx - 20;
      while (debugidx < rmap.length() && debugcount <=40) {
        if (debugidx == ridx)
          cerr << "|";
        cerr << Conv->convert(rmap.charAt(debugidx));
        if (debugidx == ridx)
          cerr << "|";
        debugidx++;
        debugcount++;
      }
      cerr << ") with ignore " << ignore << endl;
      exit(1);
    }
    tokidx++;
  }
}

//from EWTB edit.list
//Returns true when a is a character that the listed substitutions replace
//with b.  The relation is directional, so callers test both argument orders.
bool unUnicodeMatch(UChar a, UChar b) {
  //non-ASCII source character (UTF-8 literal) -> ASCII stand-in
  struct Substitution {
    const char *from;
    UChar to;
  };
  static const Substitution substitutions[] = {
    {"\u00A0", ' '},
    {"\u00A3", 'L'},
    {"\u00A3", '#'}, //corenlp
    {"\u00AD", ' '},
    {"\u00B0", 'o'},
    {"\u00B3", '3'},
    {"\u00B4", '\''},
    {"\u00B7", '*'},
    {"\u00C1", 'A'},
    {"\u00C3", 'A'},
    {"\u00C7", 'C'},
    {"\u00CD", 'I'},
    {"\u00E0", 'a'},
    {"\u00E1", 'a'},
    {"\u00E3", 'a'},
    {"\u00E4", 'a'},
    {"\u00E7", 'c'},
    {"\u00E9", 'e'},
    {"\u00EA", 'e'},
    {"\u00EF", 'i'},
    {"\u00F3", 'o'},
    {"\u00F4", 'o'},
    {"\u00F6", 'o'},
    {"\u00FC", 'u'},
    {"\u03A5", 'Y'},
    {"\u2000", ' '},
    {"\u2013", '-'},
    {"\u2014", '-'},
    {"\u2018", '\''},
    {"\u2019", '\''},
    {"\u201C", '"'},
    {"\u201D", '"'}
  };
  const int count =
    static_cast<int>(sizeof(substitutions) / sizeof(substitutions[0]));

  for (int i = 0; i < count; i++) {
    //cheap test first, so the conversion only runs for candidate rows
    if (b == substitutions[i].to &&
        Conv->convert(string(substitutions[i].from)) == a)
      return true;
  }

  //guillemets and curly double quotes stand in for each other
  if ((Conv->convert(string("«")) == a && Conv->convert(string("“")) == b) ||
      (Conv->convert(string("“")) == a && Conv->convert(string("«")) == b) ||
      (Conv->convert(string("»")) == a && Conv->convert(string("”")) == b) ||
      (Conv->convert(string("”")) == a && Conv->convert(string("»")) == b)) {
    return true;
  }
  return false;
}