package lkformat2;
import java.io.*;
import java.util.*;
//import java.util.regex.*;
//import mpqareader.SubjectivityLexicon;
/**
 * Converts POS tagger output into a tab-separated token table and one
 * {@code .pos.xml} file per input text.
 *
 * <p>The tagger output is a single stream holding one section per text. Each
 * section starts with a {@code ___BEGIN___|<token file>} line, contains
 * {@code token<TAB>tag} lines with blank lines between sentences, and ends
 * with a {@code ___END___|<outDir>/<token file>} line followed by an empty
 * line. Sections are consumed in the sorted order of the {@code *lktext.xml}
 * files found in the input directory.
 *
 * <p>NOTE(review): several string literals written to the {@code .pos.xml}
 * file are empty or nearly empty and look as if markup was lost from them;
 * they are reproduced unchanged here and should be restored from the
 * original source before the output is relied upon.
 */
public class POSTagsToLK {

    private static final String ENCODING = "UTF-8";

    /** Tags for which the lower-cased word form is NOT accepted as a fallback lemma. */
    private static final Set<String> INFLECTED_TAGS = new HashSet<String>(Arrays.asList(
            "JJR", "JJS", "NNS", "VBD", "VBG", "VBN", "VBP", "VBZ"));

    /**
     * Lemma dictionary keyed by {@code lowercasedWord + "\t" + tag}. Starts
     * out empty so that {@link #processFile} can be used without a dictionary.
     */
    private static Map<String, String> dict = new HashMap<String, String>();

    /**
     * Command-line entry point.
     *
     * @param argv {@code lkDir taggerOutputFile conll2008OutFile outDir dictFile}
     */
    public static void main(String[] argv) {
        if (argv.length < 5) {
            System.err.println("Usage: POSTagsToLK <lkDir> <taggerOutputFile> "
                    + "<conll2008OutFile> <outDir> <dictFile>");
            System.exit(1);
        }
        String lkDir = argv[0];
        String sstOutputFile = argv[1];
        String conll2008OutFile = argv[2];
        String outDir = argv[3];
        String dictFile = argv[4];
        try {
            readDict(dictFile);

            // File.list() returns null (not an empty array) for a non-directory.
            String[] files = new File(lkDir).list();
            if (files == null)
                throw new IOException("Cannot list directory: " + lkDir);
            // Sorted so that the files line up with the order of the tagger sections.
            Arrays.sort(files);

            BufferedReader taggerInput = new BufferedReader(
                    new InputStreamReader(new FileInputStream(sstOutputFile), ENCODING));
            try {
                PrintWriter conll2008Out = new PrintWriter(
                        new OutputStreamWriter(new FileOutputStream(conll2008OutFile), ENCODING));
                try {
                    for (String file : files) {
                        if (file.endsWith("lktext.xml"))
                            processFile(lkDir + File.separatorChar + file, taggerInput,
                                    conll2008Out, outDir);
                    }
                    // PrintWriter never throws on write failures; ask explicitly.
                    if (conll2008Out.checkError())
                        throw new IOException("Failed writing " + conll2008OutFile);
                } finally {
                    conll2008Out.close();
                }
            } finally {
                taggerInput.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Consumes one section of tagger output and writes the corresponding rows
     * to {@code tabularCoNLL08Out} plus a {@code <basename>.pos.xml} file in
     * {@code outDir}. Any failure is printed and terminates the JVM with
     * exit status 1.
     *
     * @param textFile          path of the base text; only the part after the last '/' is used
     * @param taggerInput       tagger output, positioned at a {@code ___BEGIN___} line
     * @param tabularCoNLL08Out shared tabular output; not closed by this method
     * @param outDir            directory for the {@code .pos.xml} file; also the
     *                          prefix expected in the {@code ___END___} line
     */
    public static void processFile(String textFile, BufferedReader taggerInput,
            PrintWriter tabularCoNLL08Out, String outDir) {
        try {
            textFile = textFile.replaceAll("[^/]*/", "");
            System.out.println("Reading tags for base text |" + textFile + "|");

            String line = taggerInput.readLine();
            if (line == null)
                throw new RuntimeException("Expected beginning of file, but came to end");
            if (!line.startsWith("___BEGIN___"))
                throw new RuntimeException("Expected beginning of file: line = |" + line + "|");
            if (line.length() < "___BEGIN___|".length())
                throw new RuntimeException("Begin marker without token file: line = |" + line + "|");
            String tokenFile = line.substring("___BEGIN___|".length());
            tokenFile = tokenFile.replaceAll("\t.*", "");
            System.out.println("Token file is |" + tokenFile + "|");

            String beginField = "___BEGIN___|" + tokenFile;
            printRow(tabularCoNLL08Out, "1", beginField, "_", "_", "_", "_",
                    beginField, "_", "0", "ROOT");

            // From here on only the file name of the token file is needed.
            tokenFile = tokenFile.replaceAll("[^/\\\\]*[/\\\\]", "");

            String basename = textFile.replaceFirst("\\.lktext\\.xml", "");
            basename = basename.replaceAll("[^/\\\\]*[/\\\\]", "");
            basename = outDir + File.separatorChar + basename;
            String outPOSFile = basename + ".pos.xml";

            PrintWriter posOut = new PrintWriter(
                    new OutputStreamWriter(new FileOutputStream(outPOSFile), ENCODING));
            try {
                // NOTE(review): header literals look stripped of markup — TODO restore.
                posOut.println("");
                posOut.println("");
                posOut.println("");
                posOut.println(" " + textFile + "");
                posOut.println(" LTHPOSTagger");
                posOut.println("");
                posOut.println("");

                List<String> lemmas = new ArrayList<String>();
                int tokenIdCounter = 0;   // running over the whole text
                int sentenceTokenId = 0;  // restarts after every blank line

                line = taggerInput.readLine();
                while (line != null && !line.contains("___END___")) {
                    line = line.trim();
                    if (!line.equals("")) {
                        String[] ss = splitTaggedToken(line);
                        String token = ss[0];
                        String pos = ss[1];
                        String lemma = lemmatize(token, pos);
                        lemmas.add(lemma);

                        sentenceTokenId++;
                        // NOTE(review): a missing lemma is written as the text "null",
                        // exactly as before; confirm whether "_" was intended.
                        printRow(tabularCoNLL08Out, String.valueOf(sentenceTokenId),
                                token, lemma, "_", pos, token, lemma, pos, "0", "ROOT");

                        int id = ++tokenIdCounter;
                        printEntity(pos, id, -1, id, posOut);
                    } else {
                        sentenceTokenId = 0;
                        tabularCoNLL08Out.println();
                    }
                    line = taggerInput.readLine();
                }

                // The loop also stops at end of input; that is a truncated section.
                if (line == null)
                    throw new RuntimeException("Expected ___END___ marker for |" + tokenFile
                            + "|, but came to end");
                System.out.println("End of file: " + line);
                String expectedEnd = "___END___|" + outDir + File.separatorChar + tokenFile;
                if (!line.startsWith(expectedEnd)) {
                    System.err.println("|" + line + "|");
                    System.err.println("|" + expectedEnd);
                    throw new RuntimeException("Wrong end tag!");
                }

                // The end marker must be followed by exactly one empty line.
                line = taggerInput.readLine();
                if (line == null || !line.equals(""))
                    throw new RuntimeException("Expected empty line after end tag, but got: "
                            + (line == null ? "end of input" : "|" + line + "|"));

                int nTokens = tokenIdCounter;
                posOut.println("");
                posOut.println("");
                // Lemma entities get ids following those of the POS entities.
                for (int i = 0; i < lemmas.size(); i++) {
                    String lemma = lemmas.get(i);
                    int tid = i + 1;
                    int id = tid + nTokens;
                    if (lemma != null && !lemma.equals("_"))
                        printEntity(lemma, tid, -1, id, posOut);
                }
                posOut.println("");
                posOut.println("");

                if (posOut.checkError())
                    throw new IOException("Failed writing " + outPOSFile);
            } finally {
                posOut.close();
            }

            printRow(tabularCoNLL08Out, "0", "___END___", "_", "_", "_",
                    "___END___", "_", "_", "0", "ROOT");
            tabularCoNLL08Out.println();
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }

    /** Writes the given fields as one tab-separated line; a null field is written as "null". */
    private static void printRow(PrintWriter out, String... fields) {
        StringBuilder row = new StringBuilder();
        for (int i = 0; i < fields.length; i++) {
            if (i > 0)
                row.append('\t');
            row.append(fields[i]);
        }
        out.println(row);
    }

    /**
     * Splits a non-empty tagger line into {token, tag}. Lines that do not have
     * exactly two tab-separated fields are reported on stderr and repaired:
     * "cannot" becomes VBP, "rebuilt" becomes VBN, any other single field
     * becomes NN. Lines with more than two fields are rejected.
     */
    private static String[] splitTaggedToken(String line) {
        String[] ss = line.split("\t");
        if (ss.length == 2)
            return ss;
        System.err.println(Arrays.toString(ss));
        if (line.equals("cannot")) {
            System.err.println("WARNING: bad tokenization: cannot");
            return new String[] { "cannot", "VBP" };
        }
        if (line.equals("rebuilt")) {
            System.err.println("WARNING: changed tagger output: rebuilt");
            return new String[] { "rebuilt", "VBN" };
        }
        if (ss.length == 1) {
            System.err.println("WARNING: changed tagger output: " + line);
            return new String[] { line, "NN" };
        }
        throw new RuntimeException("this line: |" + line + "|");
    }

    /**
     * Writes one entity line for label {@code l} anchored at token {@code start}.
     *
     * <p>NOTE(review): this method previously contained an {@code else} with no
     * matching {@code if}, so an alternative branch (presumably selected by
     * {@code end}) and the surrounding markup appear to have been lost. Only
     * the surviving branch is kept; every caller in this class passes
     * {@code end = -1}. {@code end} and {@code id} are currently unused —
     * TODO restore the original element text.
     *
     * @param l     label to write (a POS tag or a lemma)
     * @param start token id the entity refers to
     * @param end   unused, see note above
     * @param id    unused, see note above
     * @param out   destination writer
     */
    static void printEntity(String l, int start, int end,
            int id, PrintWriter out) {
        StringBuilder sb = new StringBuilder(" ");
        sb.append("\" on=\"#" + start + "\">");
        sb.append(l);
        sb.append("");
        out.println(sb);
    }

    /**
     * Loads the lemma dictionary from a tab-separated file with the columns
     * word, tag and lemma. Blank lines are skipped; a line with fewer than
     * three columns is reported and terminates the JVM with exit status 1,
     * as does any I/O failure.
     */
    private static void readDict(String file) {
        try {
            Map<String, String> entries = new HashMap<String, String>();
            BufferedReader dictInput = new BufferedReader(
                    new InputStreamReader(new FileInputStream(file), ENCODING));
            try {
                int lineNo = 0;
                String line;
                while ((line = dictInput.readLine()) != null) {
                    lineNo++;
                    if (line.length() == 0)
                        continue;
                    String[] ss = line.split("\t");
                    if (ss.length < 3)
                        throw new IOException("Malformed dictionary entry at " + file + ":"
                                + lineNo + ": |" + line + "|");
                    entries.put(ss[0] + "\t" + ss[1], ss[2]);
                }
            } finally {
                dictInput.close();
            }
            dict = entries;
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Looks up the lemma of word {@code w} with tag {@code t}.
     *
     * @return the dictionary lemma if present; otherwise the lower-cased word,
     *         unless the tag is one of {@link #INFLECTED_TAGS}, in which case
     *         {@code null}
     */
    private static String lemmatize(String w, String t) {
        // Fixed locale: the default-locale overload can change case mapping.
        w = w.toLowerCase(Locale.ENGLISH);
        String l = dict.get(w + "\t" + t);
        if (l != null)
            return l;
        if (!INFLECTED_TAGS.contains(t))
            return w;
        return null;
    }
}