#include "align.h" #include #include #include using namespace std; void lcsAlign (UnicodeString &rmap, vector &toks, vector &tokstart, vector &tokend) { if (rmap == "" && toks.size() > 0) { cerr << "ERROR: no raw for mapping tokens to." << endl; exit(1); } vector tmap; vector mappings; mappings.push_back(UU(Conv->convert(string("-LRB-")), Conv->convert(string("(")))); mappings.push_back(UU(Conv->convert(string("(")), Conv->convert(string("-LRB-")))); mappings.push_back(UU(Conv->convert(string("-RRB-")), Conv->convert(string(")")))); mappings.push_back(UU(Conv->convert(string(")")), Conv->convert(string("-RRB-")))); mappings.push_back(UU(Conv->convert(string("-LCB-")), Conv->convert(string("{")))); mappings.push_back(UU(Conv->convert(string("-RCB-")), Conv->convert(string("}")))); mappings.push_back(UU(Conv->convert(string("``")), Conv->convert(string("\"")))); mappings.push_back(UU(Conv->convert(string("''")), Conv->convert(string("\"")))); mappings.push_back(UU(Conv->convert(string("“")), Conv->convert(string("\"")))); mappings.push_back(UU(Conv->convert(string("“")), Conv->convert(string("``")))); mappings.push_back(UU(Conv->convert(string("``")), Conv->convert(string("“")))); mappings.push_back(UU(Conv->convert(string("”")), Conv->convert(string("\"")))); mappings.push_back(UU(Conv->convert(string("”")), Conv->convert(string("''")))); mappings.push_back(UU(Conv->convert(string("''")), Conv->convert(string("”")))); mappings.push_back(UU(Conv->convert(string("’")), Conv->convert(string("'")))); mappings.push_back(UU(Conv->convert(string("'")), Conv->convert(string("’")))); mappings.push_back(UU(Conv->convert(string("‘")), Conv->convert(string("`")))); mappings.push_back(UU(Conv->convert(string("`")), Conv->convert(string("‘")))); mappings.push_back(UU(Conv->convert(string("‘")), Conv->convert(string("'")))); mappings.push_back(UU(Conv->convert(string("`")), Conv->convert(string("'")))); mappings.push_back(UU(Conv->convert(string("–")), Conv->convert(string("--")))); mappings.push_back(UU(Conv->convert(string("–")), Conv->convert(string("-")))); mappings.push_back(UU(Conv->convert(string("--")), Conv->convert(string("–")))); mappings.push_back(UU(Conv->convert(string("—")), Conv->convert(string("---")))); mappings.push_back(UU(Conv->convert(string("---")), Conv->convert(string("—")))); mappings.push_back(UU(Conv->convert(string("—")), Conv->convert(string("--")))); mappings.push_back(UU(Conv->convert(string("--")), Conv->convert(string("—")))); mappings.push_back(UU(Conv->convert(string("…")), Conv->convert(string("...")))); mappings.push_back(UU(Conv->convert(string("…")), Conv->convert(string(". . .")))); mappings.push_back(UU(Conv->convert(string("+")), Conv->convert(string(" ")))); for (int tix=0; tix < toks.size(); ++tix) { if (toks[tix].length() > 0) { for (int cix=0; cix < toks[tix].length(); ++cix) { if (toks[tix].charAt(cix) == UChar('-') && cix+4 < toks[tix].length()) {//maybe an escaped ( if (UnicodeString("-LRB-").compare( UnicodeString(toks[tix],cix, 5)) == 0) { tmap.push_back(new Tchar(tix, UChar('('))); cix += 4; } else if (UnicodeString("-RRB-").compare( UnicodeString(toks[tix],cix, 5)) == 0) { tmap.push_back(new Tchar(tix, UChar(')'))); cix += 4; } else if (UnicodeString("-LCB-").compare( UnicodeString(toks[tix],cix, 5)) == 0) { tmap.push_back(new Tchar(tix, UChar('{'))); cix += 4; } else if (UnicodeString("-RCB-").compare( UnicodeString(toks[tix],cix, 5)) == 0) { tmap.push_back(new Tchar(tix, UChar('}'))); cix += 4; } else { tmap.push_back(new Tchar(tix, toks[tix].charAt(cix))); } } else { tmap.push_back(new Tchar(tix, toks[tix].charAt(cix))); } } tmap.push_back(new Tchar(-1, UChar(' '))); } } IndexType n = rmap.length(); IndexType m = tmap.size(); LCS(rmap, 0, n, tmap, 0, m); IndexType lastseen = -1; for (IndexType i=0; i < m; ++i) { // cerr << Conv->convert(tmap[i]->c) << ":" << tmap[i]->cix << " (" // << tmap[i]->tid << ")" << endl; if (tmap[i]->cix > -1) { tmap[i]->ceix = tmap[i]->cix; lastseen = tmap[i]->cix; } else { if (Conv->convert(string(" ")).compare(tmap[i]->c) != 0 && Conv->convert(string("\n")).compare(tmap[i]->c) != 0) { IndexType te; for (te=i+1; te < m && tmap[te]->cix == -1; ++te); IndexType re; if (te < m) { re = tmap[te]->cix; } else { re = n-1; } // cerr << "unmatched " << Conv->convert(tmap[i]->c) << " after " // << lastseen << endl; if (re > -1 && re-lastseen > 1) {//unmatched raw UnicodeString uraw(rmap, lastseen+1, re-lastseen-1); // cerr << "trying fuzzy with " << Conv->convert(uraw) << endl; i = fuzzymatch(tmap, i, te, uraw, lastseen+1, mappings); if (tmap[i]->cix > -1) { lastseen = tmap[i]->ceix; // cerr << "and matched " << Conv->convert(tmap[i]->c) // << ":" << tmap[i]->cix << " (" << tmap[i]->tid // << ")" << endl; } } } } } int t = -1; lastseen = -1; for (IndexType i=0; i < m; ++i) { if (tmap[i]->tid > t) { //first of token if (t > -1 && lastseen > -1) { tokend[t] = lastseen; // check if the token alignment spans a much larger range of raw // than might be expected, suggesting incorrect alignment. If so // reset to allow new alignment if (tokstart[t] != -1 && tokstart[t] < tokend[t] && tokend[t] - tokstart[t] > toks[t].length()+8) { cerr << "LONG: " << t << ":" << Conv->convert(toks[t]) << " (" << tokstart[t] << "," << tokend[t] << ")" << endl; tokend[t] = -1; tokstart[t] = -1; } } t = tmap[i]->tid; if (tmap[i]->cix > -1) { tokstart[t] = tmap[i]->cix; } } if (Conv->convert(string(" ")).compare(tmap[i]->c) != 0 && Conv->convert(string("\n")).compare(tmap[i]->c) != 0) { lastseen = tmap[i]->ceix; // if (tokstart[t] == -1) { // tokstart[t] = tmap[i]->cix; // } } } if (t > -1 && lastseen > -1) { tokend[t] = lastseen; } bool changed = true; while (changed) { changed = false; for (int x = 0; x < toks.size(); ++x) { // cerr << x << ":" << Conv->convert(toks[x]) << " (" // << tokstart[x] << "," << tokend[x] << ")" << endl; if (tokend[x] == -1 || tokstart[x] == -1) { int b, a; for (b = x-1; b >= 0 && tokend[b] == -1; b--); for (a = x+1; a < toks.size() && tokstart[a] == -1; ++a); IndexType s = 0; if (b >= 0) s = tokend[b]+1; while (rmap.charAt(s) == UChar(' ') || rmap.charAt(s) == UChar('\n')) s++; IndexType e = n; if (a < toks.size()) e = tokstart[a]-1; while (rmap.charAt(e) == UChar(' ') || rmap.charAt(e) == UChar('\n')) e--; UnicodeString nraw(rmap, s, e-s+1); int rlen = nraw.length(); int tlen = toks[x].length(); if (nraw.length() >= toks[x].length() && nraw.compare(0, toks[x].length(), toks[x]) == 0) { tokstart[x] = s; tokend[x] = s+toks[x].length()-1; changed = true; // cerr << x << ":" << Conv->convert(toks[x]) << " (" // << tokstart[x] << "," << tokend[x] << ")" << endl; } else { if (nraw.length() >= toks[x].length() && nraw.compare(nraw.length() - toks[x].length(), toks[x].length(), toks[x]) == 0){ tokstart[x] = e - toks[x].length()+1; tokend[x] = e; changed = true; // cerr << x << ":" << Conv->convert(toks[x]) << " (" // << tokstart[x] << "," << tokend[x] << ")" << endl; } else { int ti, ri; bool cmatch = true; for (ti = 0, ri = 0; ti < tlen && ri < rlen && cmatch; ++ti, ++ri) { if (toks[x].charAt(ti) == nraw.charAt(ri)) continue; bool mapped = false; for (vector::iterator m = mappings.begin(); m != mappings.end(); ++m) { if (m->first.length() <= tlen-ti && toks[x].compare(ti, m->first.length(), m->first) == 0 && m->second.length() <= rlen-ri && nraw.compare(ri, m->second.length(), m->second) == 0) { ti += m->first.length()-1; ri += m->second.length()-1; mapped = true; break; } } if (mapped) continue; else cmatch = false; } if (ti == tlen && cmatch == true) { tokstart[x] = s; tokend[x] = s + ri -1; changed = true; // cerr << x << ":" << Conv->convert(toks[x]) << " (" // << tokstart[x] << "," << tokend[x] << ")" << endl; } else { cmatch = true; for (ti = tlen-1, ri = rlen -1; ti >= 0 && ri >= 0 && cmatch; --ti, --ri) { if (toks[x].charAt(ti) == nraw.charAt(ri)) continue; bool mapped = false; for (vector::iterator m = mappings.begin(); m != mappings.end(); ++m) { if (m->first.length() <= ti+1 && toks[x].compare(ti - m->first.length() +1, m->first.length(), m->first) == 0 && m->second.length() <= ri+1 && nraw.compare(ri - m->second.length() + 1, m->second.length(), m->second) == 0) { ti -= m->first.length()-1; ri -= m->second.length()-1; mapped = true; break; } } if (mapped) continue; else cmatch = false; } if (ti < 0 && cmatch == true) { tokstart[x] = s + ri + 1; tokend[x] = e; changed = true; // cerr << x << ":" << Conv->convert(toks[x]) << " (" // << tokstart[x] << "," << tokend[x] << ")" << endl; } else { // cerr << x << ":" << Conv->convert(toks[x]) << " (" // << tokstart[x] << "," << tokend[x] // << ") should match between " // << "raw " << s << " and " << e << endl; // << Conv->convert(nraw)<< endl; } } } } } } } for (int tx = 0; tx < tmap.size(); ++tx) { delete tmap[tx]; } } void LCS(UnicodeString &A, IndexType As, IndexType N, vector &B, IndexType Bs, IndexType M) { if (N > 0 && M > 0) { //find middle snake, find (x,y), (u, v) and D IndexType delta = N-M; bool oddDelta = true; if (delta % 2 == 0) oddDelta = false; IndexType mid = (M+N+2-1)/2; //ceiling (m+n)/2 vector vf(M+N+1, -1); vector vr(M+N+1, -1); IndexType d, x, x2, y, y2, D, u, u2, v, v2, xfin, yfin, ufin, vfin; bool finished = false; assert(mid+1 <= N+M && mid+1 >= 0); vf[mid+1] = 0; assert(mid+1 <= N+M && mid+1 >= 0); vr[mid+1] = N + 1; for (d=0; d <= mid && !finished; ++d) { for (IndexType k = -1 * d; k <= d; k += 2) { if (k == (-1*d) || (k != d && vf[(k+mid)-1] < vf[(k+mid)+1])) { assert(k+mid+1 <= N+M && k+mid+1 >= 0); x = vf[(k+mid)+1]; } else { assert((k+mid)-1 <= N+M && (k+mid)-1 >= 0); x = vf[(k+mid)-1] + 1; } y = x - k; IndexType startx = x; while (x < N && y < M && A.charAt(x+As+1-1) == B[y+Bs+1-1]->c) { x = x+1; y = y+1; } assert(k+mid <= N+M && k+mid >= 0); vf[(k+mid)] = x; if (oddDelta && k >= (delta - (d-1)) && k <= (delta + (d-1)) && ((x >= N && y >= M) || (x >= vr[k+mid-delta]))) { D = 2*d - 1; assert(k+mid-delta <= N+M && k+mid-delta >= 0); x2 = startx; y2 = x2 - k; u = vr[(k+mid)-delta]; if (((vr[k+mid-delta+1]-1) - vr[(k+mid)-delta]) == x-x2) { u2 = vr[k+mid-delta+1]-1; } else { u2 = vr[k+mid-delta-1]; } v = u - k; v2 = u2 - k; // if ((x+y) >= ((N-u)+(M-v))) { xfin = x; yfin = y; ufin = x2; vfin = y2; // } else { // xfin = u2; // yfin = v2; // ufin = u; // vfin = v; // } finished = true; break; } } if (!finished) { for (IndexType k = -1 * d; k <= d; k += 2) { if (k == (-1*d) || (k != d && vr[(k+mid)-1] > vr[(k+mid)+1])) { assert(k+mid+1 <= N+M && k+mid+1 >= 0); u = vr[(k+mid)+1] - 1; } else { assert(k+mid-1 <= N+M && k+mid-1 >= 0); u = vr[(k+mid)-1]; } v = u - (k+delta); IndexType startu = u; while (u > 0 && v > 0 && A.charAt(u+As-1) == B[v+Bs-1]->c) { u = u - 1; v = v - 1; } assert(k+mid <= N+M && k+mid >= 0); vr[(k+mid)] = u; if (!oddDelta && k+delta >= (-1*d) && k + delta <= d && ((u <= 0 && v <= 0) || (u <= vf[k+delta+mid]))) { D = 2*d; u2 = startu; v2 = u2 - (k+delta); xfin = u2; yfin = v2; ufin = u; vfin = v; finished = true; break; } } } } if (xfin < ufin) { IndexType i, j; for (i=xfin+1, j=yfin+1; i <=ufin && j <= vfin; ++i, ++j) { B[Bs+j-1]->cix = As+i-1; } } else { if (ufin < xfin) { IndexType i, j; for (i=ufin+1, j=vfin+1; i <=xfin && j <= yfin; ++i, ++j) { if (B[Bs+j-1]->c != UChar(' ') && B[Bs+j-1]->c != UChar('\n') && B[Bs+j-1]->c != UChar('.') && B[Bs+j-1]->c == A.charAt(As+i-1)) { B[Bs+j-1]->cix = As+i-1; } } } } if (D > 1) { LCS(A, As, ufin, B, Bs, vfin); LCS(A, xfin+As, N-xfin, B, yfin+Bs, M-yfin); } else { if (M > N) { IndexType i, j; for (i=1, j=1; i <= N && j <= N; ++i, ++j) { if (B[Bs+j-1]->c != UChar(' ') && B[Bs+j-1]->c != UChar('\n') && B[Bs+j-1]->c != UChar('.') && B[Bs+j-1]->c == A.charAt(As+i-1)) { B[Bs+j-1]->cix = As+i-1; } } } else { IndexType i, j; for (i=1, j=1; i <= M && j <= M; ++i, ++j) { if (B[Bs+j-1]->c != UChar(' ') && B[Bs+j-1]->c != UChar('\n') && B[Bs+j-1]->c != UChar('.') && B[Bs+j-1]->c == A.charAt(As+i-1)) { B[Bs+j-1]->cix = As+i-1; } } } } } } IndexType fuzzymatch(vector &tmap, IndexType i, IndexType e, UnicodeString uraw, IndexType rix, vector &mappings) { while (uraw.charAt(0) == UChar(' ') || uraw.charAt(0) == UChar('\n')) { uraw.remove(0,1); rix++; } int rlen = uraw.length(); UnicodeString tstr; for (IndexType x=i; x < e; ++x) { tstr += tmap[x]->c; } int tlen = tstr.length(); if (rlen == 0 || tlen == 0) { return i; } if (uraw.charAt(0) == tstr.charAt(0)) { tmap[i]->cix = rix; tmap[i]->ceix = rix; return i; } if (uraw.caseCompare(0,1,tstr,0,1, U_FOLD_CASE_DEFAULT) == 0) { tmap[i]->cix = rix; tmap[i]->ceix = rix; return i; } // cerr << "comparing `" << Conv->convert(tstr) << "' and `" // << Conv->convert(uraw) << "' near " << rix << endl; for (vector::iterator m = mappings.begin(); m != mappings.end(); ++m) { if (m->first.length() <= tlen && tstr.compare(0, m->first.length(), m->first) == 0 && m->second.length() <= rlen && uraw.compare(0, m->second.length(), m->second) == 0) { IndexType x; for (x = i; x < e && x < i+m->first.length(); ++x) { tmap[x]->cix = rix; tmap[x]->ceix = rix + (m->second.length()-1); } // cerr << "matched" << endl; return x-1; } } return i; }