package lkformat2;

import java.io.*;
import java.util.*;

/**
 * Converts stand-off sentence/token annotation files into a plain text
 * format with one token per line and a blank line after every sentence.
 *
 * <p>For each input file that has a section whose header line contains
 * {@code provides="SENTENCES"}, the sentence spans (token id ranges given by
 * {@code start="#id"} / {@code end="#id"} attributes) are read first. The
 * tokens are then taken from the {@code provides="TOKENS"} section of the
 * file named by the header's {@code scope} attribute, or of the same file
 * when there is no {@code scope}. The output for one file is framed by
 * {@code ___BEGIN___|file} and {@code ___END___|file} marker lines, each
 * followed by a blank line.
 *
 * <p>NOTE(review): the end of a section is recognised as any line whose first
 * non-blank characters are {@code </}, i.e. a closing tag standing on its own
 * line. Confirm this against the specification of the annotation format.
 */
public class PreprocessSSTLight {

    /** Character encoding used for all files read and written. */
    private static final String ENCODING = "UTF-8";

    /**
     * Returns the value of {@code attr="..."} in the given line.
     *
     * @param line the line of markup to search
     * @param attr the attribute name
     * @return the text between the quotes, or {@code null} if the attribute
     *         is absent or its closing quote is missing
     */
    private static String extractAttribute(String line, String attr) {
        String prefix = attr + "=\"";
        int valueStart = line.indexOf(prefix);
        if (valueStart == -1) {
            return null;
        }
        valueStart += prefix.length();
        int valueEnd = line.indexOf('"', valueStart);
        if (valueEnd == -1) {
            return null;
        }
        return line.substring(valueStart, valueEnd);
    }

    /**
     * Returns the text between the end of the first tag on the line and the
     * start of the last closing tag, e.g. {@code tok} for
     * {@code <e id="1">tok</e>}.
     *
     * @param line the line of markup holding one element
     * @return the element content, or {@code null} if the line does not have
     *         an opening tag followed by a closing tag
     */
    private static String extractEntityData(String line) {
        int open = line.indexOf('<');
        if (open == -1) {
            return null;
        }
        int openEnd = line.indexOf('>', open);
        if (openEnd == -1) {
            return null;
        }
        int close = line.lastIndexOf("</");
        // The closing tag must come after the opening tag; otherwise there is
        // no content to extract (e.g. a lone closing tag).
        if (close == -1 || close <= openEnd) {
            return null;
        }
        return line.substring(openEnd + 1, close);
    }

    /** Tells whether a trimmed line is a closing tag that ends a section. */
    private static boolean isSectionEnd(String trimmedLine) {
        return trimmedLine.startsWith("</");
    }

    /** Opens a file for buffered, UTF-8 decoded reading. */
    private static BufferedReader openReader(String fileName) throws IOException {
        return new BufferedReader(new InputStreamReader(new FileInputStream(fileName), ENCODING));
    }

    /**
     * Advances the reader to the first line containing the marker.
     *
     * @return that line, or {@code null} if the end of the file was reached
     */
    private static String findSection(BufferedReader br, String marker) throws IOException {
        String line = br.readLine();
        while (line != null && !line.contains(marker)) {
            line = br.readLine();
        }
        return line;
    }

    /**
     * Reads the sentence spans that follow a SENTENCES header, up to the
     * closing tag of the section.
     *
     * @return one {@code {startId, endId}} pair per sentence, with the
     *         leading {@code #} of the relative URIs removed
     */
    private static List<String[]> readSentenceSpans(BufferedReader br) throws IOException {
        List<String[]> spans = new ArrayList<String[]>();
        while (true) {
            String line = br.readLine();
            if (line == null) {
                throw new RuntimeException("Unexpected end of file inside SENTENCES section");
            }
            String trimmed = line.trim();
            if (isSectionEnd(trimmed)) {
                return spans;
            }
            if (trimmed.length() == 0) {
                continue;
            }
            String start = extractAttribute(line, "start");
            String end = extractAttribute(line, "end");
            if (start == null || end == null) {
                throw new RuntimeException("Only start-end annotation supported for sentences");
            }
            if (!start.startsWith("#") || !end.startsWith("#")) {
                throw new RuntimeException("Only relative URIs supported for sentences: line = "
                        + line + " start = " + start + " end = " + end);
            }
            spans.add(new String[] { start.substring(1), end.substring(1) });
        }
    }

    /**
     * Copies the tokens of the TOKENS section to {@code out}, printing only
     * tokens that lie inside a sentence span and a blank line after the last
     * token of each sentence. Stops after the last sentence or at the closing
     * tag of the section, whichever comes first.
     *
     * @param spans non-empty list of {@code {startId, endId}} pairs, in
     *              document order
     */
    private static void copySentenceTokens(BufferedReader br, List<String[]> spans,
                                           PrintWriter out) throws IOException {
        int senPos = 0;
        String[] senSpan = spans.get(senPos);
        boolean inside = false;
        int prev = 0;
        String line;
        while ((line = br.readLine()) != null) {
            line = line.trim();
            if (isSectionEnd(line)) {
                return;
            }
            if (line.length() == 0) {
                continue;
            }
            String token = extractEntityData(line);
            if (token == null) {
                throw new RuntimeException("Could not extract token");
            }
            String id = extractAttribute(line, "id");
            if (id == null) {
                throw new RuntimeException("Could not extract id");
            }
            // Token ids must be 1, 2, 3, ... without gaps.
            int idNumber = Integer.parseInt(id);
            if (idNumber != prev + 1) {
                throw new RuntimeException("I have assumed contiguous ids...");
            }
            prev = idNumber;

            if (id.equals(senSpan[0])) {
                inside = true;
            }
            if (inside) {
                out.println(token);
            }
            if (id.equals(senSpan[1])) {
                out.println();
                senPos++;
                if (senPos == spans.size()) {
                    return;
                }
                senSpan = spans.get(senPos);
                inside = false;
            }
        }
        throw new RuntimeException("Unexpected end of file inside TOKENS section");
    }

    /**
     * Writes the BEGIN marker, the sentences of the token file and the END
     * marker to {@code out}. The markers are written even when there are no
     * sentence spans.
     */
    private static void writeSentences(String tokenFile, List<String[]> spans,
                                       PrintWriter out) throws IOException {
        BufferedReader br = openReader(tokenFile);
        try {
            out.println("___BEGIN___|" + tokenFile);
            out.println();
            out.flush();
            if (!spans.isEmpty()) {
                if (findSection(br, "provides=\"TOKENS\"") == null) {
                    throw new RuntimeException("No TOKENS section found in " + tokenFile);
                }
                copySentenceTokens(br, spans, out);
            }
        } finally {
            br.close();
        }
        out.println("___END___|" + tokenFile);
        out.println();
        out.flush();
    }

    /**
     * Processes one annotation file and appends its sentences to {@code out}.
     *
     * <p>Directories and files without a SENTENCES section are skipped
     * without producing output. A line reporting the sentence and token file
     * names is printed to standard output for every processed file. On any
     * error the stack trace is printed and the JVM exits with status 1.
     *
     * @param fileName path of the file holding the SENTENCES section
     * @param out      destination of the converted text
     */
    public static void processFile(String fileName, PrintWriter out) {
        try {
            if (new File(fileName).isDirectory()) {
                return;
            }

            String tokenFile;
            List<String[]> spans;
            BufferedReader br = openReader(fileName);
            try {
                String header = findSection(br, "provides=\"SENTENCES\"");
                if (header == null) {
                    return;
                }
                tokenFile = extractAttribute(header, "scope");
                if (tokenFile == null) {
                    tokenFile = fileName;
                }
                System.out.println("Sentences from " + fileName + ", tokens from " + tokenFile);
                spans = readSentenceSpans(br);
            } finally {
                br.close();
            }

            writeSentences(tokenFile, spans, out);
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Processes every entry of a directory, in sorted name order, and writes
     * the combined result to one UTF-8 encoded output file. On any error the
     * stack trace is printed and the JVM exits with status 1.
     *
     * @param dirName     directory holding the annotation files
     * @param outFileName file to create (or overwrite) with the result
     */
    public static void processDirectory(String dirName, String outFileName) {
        try {
            String[] files = new File(dirName).list();
            if (files == null) {
                throw new IOException("Cannot list directory: " + dirName);
            }
            Arrays.sort(files);

            // Write with the same encoding the input is read with, instead of
            // the platform default.
            PrintWriter out = new PrintWriter(new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream(outFileName), ENCODING)));
            try {
                for (String file : files) {
                    processFile(dirName + File.separatorChar + file, out);
                }
                // PrintWriter never throws on write errors; ask it explicitly.
                if (out.checkError()) {
                    throw new IOException("Error while writing " + outFileName);
                }
            } finally {
                out.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Command line entry point.
     *
     * @param argv input directory followed by output file name
     */
    public static void main(String[] argv) {
        if (argv.length < 2) {
            System.err.println("Usage: PreprocessSSTLight <input directory> <output file>");
            System.exit(1);
        }
        processDirectory(argv[0], argv[1]);
    }
}