package lkformat2;

import lkformat.reader.*;
import mpqareader.SubjectivityLexicon;

import java.io.*;
import java.util.*;

import se.lth.cs.nlp.nlputils.core.*;
import se.lth.cs.nlp.nlputils.depgraph.*;

/**
 * Extracts IOB-labelled token sequences and opinion-source statistics from
 * MPQA-annotated documents in LK format.
 */
public class MPQASeqExtractor {

    static int countEntities;
    static int countOverlaps;

    /* Maps the first token of each entity to the entity itself; when two
       entities start at the same token, the longer one wins. */
    public static HashMap<LKAnnotationEntity, LKAnnotationEntity> createStartMap(ArrayList<LKAnnotationEntity> tokens,
                                                                                 ArrayList<LKAnnotationEntity> entities) {
        HashMap<LKAnnotationEntity, LKAnnotationEntity> out = new HashMap<LKAnnotationEntity, LKAnnotationEntity>();

        if(false) {
            /* Check for overlaps. (Disabled diagnostic.) */
            for(LKAnnotationEntity e: entities) {
                if(e.referred == null)
                    continue;
                countEntities++;
                boolean after = false;
                for(LKAnnotationEntity e2: entities) {
                    if(e2.referred == null)
                        continue;
                    if(after) {
                        if(overlap(e, e2))
                            countOverlaps++;
                    } else if(e2 == e)
                        after = true;
                }
            }
        }

        for(LKAnnotationEntity e: entities) {
            if(e.referred == null)
                continue;
            LKAnnotationEntity t = e.referred.get(0);
            if(out.containsKey(t)) {
                LKAnnotationEntity other = out.get(t);
                if(e.referred.size() > other.referred.size())
                    out.put(t, e);
            } else
                out.put(t, e);
        }
        return out;
    }

    private static boolean overlap(LKAnnotationEntity e1, LKAnnotationEntity e2) {
        HashSet<LKAnnotationEntity> s1 = new HashSet<LKAnnotationEntity>();
        for(LKAnnotationEntity t1: e1.referred)
            s1.add(t1);
        for(LKAnnotationEntity t2: e2.referred)
            if(s1.contains(t2))
                return true;
        return false;
    }

    /* Maps the last token of each entity to the list of entities ending there. */
    public static ListMap<LKAnnotationEntity, LKAnnotationEntity> createEndMap(ArrayList<LKAnnotationEntity> tokens,
                                                                               ArrayList<LKAnnotationEntity> entities) {
        ListMap<LKAnnotationEntity, LKAnnotationEntity> out = new ListMap<LKAnnotationEntity, LKAnnotationEntity>();
        for(LKAnnotationEntity e: entities) {
            if(e.referred == null)
                continue;
            LKAnnotationEntity t = e.referred.get(e.referred.size() - 1);
            out.put(t, e);
        }
        return out;
    }

    private static String createIOBLabel(boolean starts, LKAnnotationEntity e) {
        if(e == null)
            return "O";
        DataElementNode n = (DataElementNode) e.data;
        DataNode n2 = n.children.get(0);
        DataElementNode dn2 = (DataElementNode) n2;
        return (starts? "B-": "I-") + dn2.name; // + "/" + dn2.attributes;
    }

    private static String getEntityLabel(LKAnnotationEntity e) {
        DataElementNode n = (DataElementNode) e.data;
        DataNode n2 = n.children.get(0);
        DataElementNode dn2 = (DataElementNode) n2;
        return dn2.name;
    }

    /* Builds one dependency graph per sentence (sentence -> depgraph) and a
       map from token entities to their dependency nodes. */
    private static Pair<HashMap<LKAnnotationEntity, DepGraph>, HashMap<LKAnnotationEntity, DepNode>>
            createDGMap(LKAnnotationLayer senLayer, LKAnnotationLayer dgLayer,
                        HashMap<LKAnnotationEntity, LKAnnotationEntity> posMap,
                        HashMap<LKAnnotationEntity, LKAnnotationEntity> lemmaMap) {

        HashMap<LKAnnotationEntity, LKAnnotationEntity> tokenToLink = new HashMap<LKAnnotationEntity, LKAnnotationEntity>();
        for(LKAnnotationEntity le: dgLayer.entityList)
            if(le.to != null)
                tokenToLink.put(le.to, le);
            else if(le.referred.size() == 1)
                tokenToLink.put(le.referred.get(0), le);
            else
                throw new RuntimeException("No link!");

        HashMap<LKAnnotationEntity, DepGraph> sentenceToGraph = new HashMap<LKAnnotationEntity, DepGraph>();
        HashMap<LKAnnotationEntity, DepNode> tokenToNode = new HashMap<LKAnnotationEntity, DepNode>();

        for(LKAnnotationEntity se: senLayer.entityList) {
            ArrayList<LKAnnotationEntity> tokens = new ArrayList<LKAnnotationEntity>();
            ArrayList<LKAnnotationEntity> poss = new ArrayList<LKAnnotationEntity>();
            ArrayList<LKAnnotationEntity> lemmas = new ArrayList<LKAnnotationEntity>();
            for(LKAnnotationEntity te: se.referred) {
                LKAnnotationEntity pose = posMap.get(te);
                if(pose == null)
                    continue;
                LKAnnotationEntity leme = lemmaMap.get(te);
                tokens.add(te);
                poss.add(pose);
                lemmas.add(leme);
            }

            DepGraph dg = new DepGraph(tokens.size());
            dg.nodes[0] = new DepNode();
            for(int i = 0; i < tokens.size(); i++) {
                dg.nodes[i+1] = new DepNode();
                dg.nodes[i+1].position = i+1;
                tokenToNode.put(tokens.get(i), dg.nodes[i+1]);
            }

            for(int i = 0; i < tokens.size(); i++) {
                LKAnnotationEntity te = tokens.get(i);
                DepNode n = tokenToNode.get(te);
                n.word = te.data.getText();
                n.pos = poss.get(i).data.getText();
                LKAnnotationEntity leme = lemmas.get(i);
                if(leme != null)
                    n.lemma = leme.data.getText();
                LKAnnotationEntity le = tokenToLink.get(te);
                if(le.to != null) {
                    DepNode p = tokenToNode.get(le.from);
                    if(p == null)
                        throw new RuntimeException("parent not found!");
                    p.addChild(n, le.data.getText());
                } else if(le.referred.size() == 1)
                    dg.nodes[0].addChild(n, le.data.getText());
                else
                    throw new RuntimeException("No link!");
            }

            dg.collectChildren();
            sentenceToGraph.put(se, dg);
        }
        return new Pair<HashMap<LKAnnotationEntity, DepGraph>, HashMap<LKAnnotationEntity, DepNode>>(sentenceToGraph, tokenToNode);
    }

    private static boolean nodeInSubjLex(DepNode n, SubjectivityLexicon lex) {
        String subjClue = lex.lookup(n.word, n.pos, n.lemma);
        return subjClue != null;
    }

    private static boolean isDominatedByNodeInSubjLex(DepNode n, SubjectivityLexicon lex) {
        DepNode p = n.parents[0];
        while(p.position > 0) {
            if(nodeInSubjLex(p, lex))
                return true;
            p = p.parents[0];
        }
        return false;
    }

    private static boolean dominatesNodeInSubjLex(DepNode n, SubjectivityLexicon lex) {
        for(DepNode c: n.children) {
            if(nodeInSubjLex(c, lex))
                return true;
            if(dominatesNodeInSubjLex(c, lex))
                return true;
        }
        return false;
    }

    /* Maps "#" + local URI to the corresponding agent entity. */
    private static HashMap<String, LKAnnotationEntity> createAgentMap(LKAnnotationLayer agentLayer) {
        HashMap<String, LKAnnotationEntity> out = new HashMap<String, LKAnnotationEntity>();
        for(LKAnnotationEntity e: agentLayer.entityList) {
            String id = e.localURI;
            out.put("#" + id, e);
        }
        return out;
    }

    /* Groups agent annotations into coreference clusters by following the last
       id of each agent's nested-source ("ns") attribute. */
    private static HashMap<LKAnnotationEntity, HashSet<LKAnnotationEntity>> partitionAgents(HashMap<String, LKAnnotationEntity> agentMap,
                                                                                            LKAnnotationLayer agentLayer) {
        HashMap<LKAnnotationEntity, HashSet<LKAnnotationEntity>> out = new HashMap<LKAnnotationEntity, HashSet<LKAnnotationEntity>>();
        for(LKAnnotationEntity agent: agentLayer.entityList) {
            String ns = agent.data.getTopAttribute("ns");
            if(ns != null)
                ns = ns.replaceAll("\\s", "");
            if(ns == null || ns.isEmpty()) {
                HashSet<LKAnnotationEntity> s = new HashSet<LKAnnotationEntity>();
                s.add(agent);
                out.put(agent, s);
                continue;
            }
            ns = ns.replaceAll(".*,", "");
            LKAnnotationEntity referent = agentMap.get(ns);
            if(referent == null)
                throw new RuntimeException("referent not found: " + agent);
            HashSet<LKAnnotationEntity> s = out.get(referent);
            if(s == null) {
                s = new HashSet<LKAnnotationEntity>();
                s.add(referent);
                out.put(referent, s);
            }
            s.add(agent);
            out.put(agent, s);
        }

        /* Sanity check: every agent must belong to its own cluster. */
        for(LKAnnotationEntity e: agentLayer.entityList) {
            if(!out.containsKey(e))
                throw new RuntimeException("!!!");
            HashSet<LKAnnotationEntity> s = out.get(e);
            if(s == null)
                throw new RuntimeException("!!!");
            if(!s.contains(e))
                throw new RuntimeException("!!!");
        }
        return out;
    }

    /* Categorizes the sentence distance between an opinion expression and its
       innermost source: "0" / "0-coref" for same-sentence sources, "-" / "+"
       for preceding / following sentences, and special labels for writer,
       implicit, or missing sources. */
    private static String getSentenceDistance(LKAnnotationEntity expr,
                                              HashMap<String, LKAnnotationEntity> agentMap,
                                              HashMap<LKAnnotationEntity, HashSet<LKAnnotationEntity>> agentClusterMap,
                                              HashMap<LKAnnotationEntity, Integer> stIndexMap) {
        if(expr.data.getTopAttribute("imp") != null)
            return "expr_imp";
        if(expr.referred == null || expr.referred.isEmpty())
            return "expr_empty";
        int eix = stIndexMap.get(expr.referred.get(0));

        String ns = expr.data.getTopAttribute("ns");
        if(ns == null)
            return "ns_null";
        String[] ts = ns.split(",");
        String lastSrcId = ts[ts.length-1];
        LKAnnotationEntity src = agentMap.get(lastSrcId);
        if(src == null)
            return "src_not_found";

        int aix = -1;
        if(src.referred != null && !src.referred.isEmpty()) {
            aix = stIndexMap.get(src.referred.get(0));
            if(aix == eix)
                return "0";
        }

        for(LKAnnotationEntity src2: agentClusterMap.get(src)) {
            if(src2.data.getTopAttribute("w") != null)
                continue;
            if(src2.data.getTopAttribute("imp") != null)
                continue;
            if(src2.referred == null || src2.referred.isEmpty())
                continue;
            int aix2 = stIndexMap.get(src2.referred.get(0));
            if(aix2 == eix)
                return "0-coref";
        }

        /*int nSrcInSent = 0;
        for(LKAnnotationEntity src2: agentClusterMap.get(src)) {
            if(src2.data.getTopAttribute("w") != null) continue;
            if(src2.data.getTopAttribute("imp") != null) continue;
            if(src2.referred == null || src2.referred.isEmpty()) continue;
            int aix2 = stIndexMap.get(src2.referred.get(0));
            if(aix2 == eix) nSrcInSent++;
        }
        if(nSrcInSent > 0)
            return "local-" + nSrcInSent;*/

        if(src.data.getTopAttribute("w") != null)
            return "w";
        if(src.data.getTopAttribute("imp") != null)
            return "src_imp";
        if(src.referred == null || src.referred.isEmpty())
            return "agent_empty";
        if(aix < eix)
            return "-";
        else
            return "+";
    }

    /* Maps every token to the index of the sentence containing it. */
    private static HashMap<LKAnnotationEntity, Integer> createTokenSentenceIndexMap(LKAnnotationLayer senLayer) {
        HashMap<LKAnnotationEntity, Integer> out = new HashMap<LKAnnotationEntity, Integer>();
        int index = 0;
        for(LKAnnotationEntity sen: senLayer.entityList) {
            for(LKAnnotationEntity token: sen.referred)
                out.put(token, index);
            index++;
        }
        return out;
    }

    /* Work in progress: collects same-sentence source instances and candidate
       nominals but does not yet emit any training examples, and is not called
       from this class. */
    private static void createLinkTrainingExamples(LKAnnotationEntity expr,
                                                   HashMap<String, LKAnnotationEntity> agentMap,
                                                   HashMap<LKAnnotationEntity, HashSet<LKAnnotationEntity>> agentClusterMap,
                                                   HashMap<LKAnnotationEntity, Integer> stIndexMap,
                                                   PrintWriter pw, DepGraph dg) {
        /* If the expression is empty: ignore. */
        if(expr.data.getTopAttribute("imp") != null)
            return;
        if(expr.referred == null || expr.referred.isEmpty())
            return;
        int eix = stIndexMap.get(expr.referred.get(0));

        /* Noisy training data, no nested-source given. Ignore. */
        String ns = expr.data.getTopAttribute("ns");
        if(ns == null)
            return;
        String[] ts = ns.split(",");
        String lastSrcId = ts[ts.length-1];
        LKAnnotationEntity src = agentMap.get(lastSrcId);

        /* Noisy training data, source not found. Ignore. */
        if(src == null)
            return;

        ArrayList<LKAnnotationEntity> localSrcInstances = new ArrayList<LKAnnotationEntity>();
        for(LKAnnotationEntity src2: agentClusterMap.get(src)) {
            if(src2.data.getTopAttribute("w") != null)
                continue;
            if(src2.data.getTopAttribute("imp") != null)
                continue;
            if(src2.referred == null || src2.referred.isEmpty())
                continue;
            int aix2 = stIndexMap.get(src2.referred.get(0));
            if(aix2 == eix)
                localSrcInstances.add(src2);
        }

        ArrayList<DepNode> candidateNominals = findNominals(dg);

        /*
        if(src.data.getTopAttribute("w") != null)
            return "w";
        if(src.data.getTopAttribute("imp") != null)
            return "src_imp";
        if(src.referred == null || src.referred.isEmpty()) {
            return "agent_empty";
        }
        if(aix < eix)
            return "-";
        else
            return "+";
        */
    }

    // private static LKAnnotationEntity getSource(LKAnnotationEntity e, HashMap<String, LKAnnotationEntity> agentMap) {
    //     String ns = e.data.getTopAttribute("ns");
    //     if(ns == null)
    //         return null;
    //     String[] ts = ns.split(",");
    //     return agentMap.get(ts[ts.length-1]);
    // }

    private static final boolean isNominal(DepNode n) {
        String pos = n.pos;
        if(pos.startsWith("NN"))
            return true;
        if(pos.equals("PRP"))
            return true;
        if(pos.equals("DT"))
            return true;
        return false;
    }

    private static ArrayList<DepNode> findNominals(DepGraph dg) {
        ArrayList<DepNode> out = new ArrayList<DepNode>();
        for(int i = 1; i < dg.nodes.length; i++) {
            DepNode n = dg.nodes[i];
            if(isNominal(n))
                out.add(n);
        }
        return out;
    }

    /* Writes one line per token: token, POS, lemma, subjectivity clue,
       dominated-by-clue flag, dominates-clue flag, and IOB label; sentences
       are separated by blank lines. Also updates the per-label histograms of
       expression-to-source sentence distances. */
    private static void extractSequences(LKAnnotatedText text, PrintWriter pw,
                                         SubjectivityLexicon subjLex,
                                         HashMap<String, Histogram<String>> agentDistHist) throws IOException {
        System.out.println(text.metaInfo.get("source"));

        LKAnnotationLayer senLayer = text.getLayer("SENTENCES");
        LKAnnotationLayer tokenLayer = text.getLayer("TOKENS");
        LKAnnotationLayer posLayer = text.getLayer("POS");
        LKAnnotationLayer lemmaLayer = text.getLayer("LEMMA");

        HashMap<LKAnnotationEntity, Integer> tokenSenIndexMap = createTokenSentenceIndexMap(senLayer);

        LKAnnotationLayer dgLayer = text.getLayer("DEP-SYNTAX");

        LKAnnotationLayer exprSubjLayer = text.getLayer("MPQA-expressive-subjectivity");
        LKAnnotationLayer objSpeechLayer = text.getLayer("MPQA-objective-speech-event");
        LKAnnotationLayer dirSubjLayer = text.getLayer("MPQA-direct-subjective");
        LKAnnotationLayer agentLayer = text.getLayer("MPQA-agents");

        HashMap<String, LKAnnotationEntity> agentMap = createAgentMap(agentLayer);
        HashMap<LKAnnotationEntity, HashSet<LKAnnotationEntity>> agentClusterMap = partitionAgents(agentMap, agentLayer);

        ArrayList<LKAnnotationEntity> all = new ArrayList<LKAnnotationEntity>();
        all.addAll(dirSubjLayer.entityList);
        all.addAll(exprSubjLayer.entityList);
        all.addAll(objSpeechLayer.entityList);

        HashMap<LKAnnotationEntity, LKAnnotationEntity> posMap = createStartMap(tokenLayer.entityList, posLayer.entityList);
        HashMap<LKAnnotationEntity, LKAnnotationEntity> lemmaMap = createStartMap(tokenLayer.entityList, lemmaLayer.entityList);
        HashMap<LKAnnotationEntity, LKAnnotationEntity> startMap = createStartMap(tokenLayer.entityList, all);
        ListMap<LKAnnotationEntity, LKAnnotationEntity> endMap = createEndMap(tokenLayer.entityList, all);

        Pair<HashMap<LKAnnotationEntity, DepGraph>, HashMap<LKAnnotationEntity, DepNode>> p
            = createDGMap(senLayer, dgLayer, posMap, lemmaMap);
        //HashMap<LKAnnotationEntity, DepGraph> dgMap = p.left;
        HashMap<LKAnnotationEntity, DepNode> nodeMap = p.right;

        LKAnnotationEntity currentEntity = null;
        for(LKAnnotationEntity sen: senLayer.entityList) {
            //DepGraph dg = dgMap.get(sen);
            //pw.println(dg);
            for(LKAnnotationEntity token: sen.referred) {
                String tokenText = ((DataElementNode) token.data).getText();
                LKAnnotationEntity pos = posMap.get(token);
                if(pos == null) {
                    if(!tokenText.startsWith("<"))
                        throw new RuntimeException("pos = null, tokenText = " + tokenText);
                    continue;
                }
                String posText = ((DataElementNode) pos.data).getText();
                LKAnnotationEntity lemma = lemmaMap.get(token);
                String lemmaText = lemma == null? "_": ((DataElementNode) lemma.data).getText();

                String subjClue = subjLex.lookup(tokenText, posText, lemmaText);
                if(subjClue == null)
                    subjClue = "_";

                boolean starts = false;
                if(currentEntity == null) {
                    LKAnnotationEntity e = startMap.get(token);
                    if(e != null) {
                        currentEntity = e;
                        starts = true;
                    }
                }
                String label = createIOBLabel(starts, currentEntity);
                if(currentEntity != null) {
                    ArrayList<LKAnnotationEntity> es = endMap.get(token);
                    if(es != null && es.contains(currentEntity))
                        currentEntity = null;
                }

                DepNode n = nodeMap.get(token);
                if(n == null)
                    throw new RuntimeException("No dep node");
                String isDom = isDominatedByNodeInSubjLex(n, subjLex)? "T": "F";
                String dom = dominatesNodeInSubjLex(n, subjLex)? "T": "F";

                pw.println(tokenText + "\t" + posText + "\t" + lemmaText + "\t"
                           + subjClue + "\t" + isDom + "\t" + dom + "\t" + label);
                pw.flush();
            }
            pw.println();
        }
        pw.flush();

        //System.out.println(agentMap);
        //System.out.println(all);
        //System.exit(0);

        for(LKAnnotationEntity e: all) {
            //LKAnnotationEntity src = getSource(e, agentMap);
            Histogram<String> hist = agentDistHist.get(getEntityLabel(e));
            hist.add(getSentenceDistance(e, agentMap, agentClusterMap, tokenSenIndexMap));
        }
    }

    /* Arguments: LK-format document directory, subjectivity lexicon file,
       output file, and a file listing the documents to process. */
    public static void main(String[] argv) {
        String dir = argv[0];
        String subjLexFile = argv[1];
        String out = argv[2];
        String selectedDocList = argv[3];

        try {
            SubjectivityLexicon subjLex = new SubjectivityLexicon(subjLexFile);
            LKCollectionReader r = new LKCollectionReader(dir);
            PrintWriter pw = new PrintWriter(new FileWriter(out));

            HashMap<String, Histogram<String>> agentDistHist = new HashMap<String, Histogram<String>>();
            agentDistHist.put("es", new Histogram<String>());
            agentDistHist.put("ds", new Histogram<String>());
            agentDistHist.put("os", new Histogram<String>());

            HashSet<String> selectedFiles = new HashSet<String>();
            //Ax.readLines(selectedDocList, selectedFiles);
            Util.readLines(selectedDocList, selectedFiles);

            //int i = 0;
            while(r.hasNext()) {
                //i++;
                //System.out.println(i);
                LKAnnotatedText text = r.next();
                String docName = text.metaInfo.get("source");
                if(docName.startsWith("database.mpqa.2.0/docs/"))
                    docName = docName.substring("database.mpqa.2.0/docs/".length());
                if(selectedFiles.contains(docName)) {
                    //System.out.println(docName);
                    extractSequences(text, pw, subjLex, agentDistHist);
                }
            }
            pw.close();

            //System.out.println("entities: " + countEntities);
            //System.out.println("overlaps: " + countOverlaps);

            for(String k: agentDistHist.keySet()) {
                Histogram<String> hist = agentDistHist.get(k);
                System.out.println(k + ": " + hist.asSortedList());
            }
        } catch(Exception e) {
            e.printStackTrace();
        }
    }

}
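
// Example invocation (a sketch; the file and directory names below are
// hypothetical and only illustrate the four arguments read by main: the
// LK-format document directory, the subjectivity lexicon file, the output
// sequence file, and the selected-document list):
//
//   java lkformat2.MPQASeqExtractor lk-docs/ subjclues.tff sequences.txt doclist.txt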