# Natural Language Toolkit: Glue Semantics # # Author: Dan Garrette # # Copyright (C) 2001-2012 NLTK Project # URL: # For license information, see LICENSE.TXT import os import nltk from nltk.internals import Counter from nltk.corpus import brown from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger, RegexpTagger from nltk.sem.logic import (LogicParser, Expression, Variable, VariableExpression, LambdaExpression, AbstractVariableExpression) import drt import linearlogic SPEC_SEMTYPES = {'a' : 'ex_quant', 'an' : 'ex_quant', 'every' : 'univ_quant', 'the' : 'def_art', 'no' : 'no_quant', 'default' : 'ex_quant'} OPTIONAL_RELATIONSHIPS = ['nmod', 'vmod', 'punct'] class GlueFormula(object): def __init__(self, meaning, glue, indices=None): if not indices: indices = set() if isinstance(meaning, str): self.meaning = LogicParser().parse(meaning) elif isinstance(meaning, Expression): self.meaning = meaning else: raise RuntimeError, 'Meaning term neither string or expression: %s, %s' % (meaning, meaning.__class__) if isinstance(glue, str): self.glue = linearlogic.LinearLogicParser().parse(glue) elif isinstance(glue, linearlogic.Expression): self.glue = glue else: raise RuntimeError, 'Glue term neither string or expression: %s, %s' % (glue, glue.__class__) self.indices = indices def applyto(self, arg): """ self = (\\x.(walk x), (subj -o f)) arg = (john , subj) returns ((walk john), f) """ if self.indices & arg.indices: # if the sets are NOT disjoint raise linearlogic.LinearLogicApplicationException, "'%s' applied to '%s'. Indices are not disjoint." % (self, arg) else: # if the sets ARE disjoint return_indices = (self.indices | arg.indices) try: return_glue = linearlogic.ApplicationExpression(self.glue, arg.glue, arg.indices) except linearlogic.LinearLogicApplicationException: raise linearlogic.LinearLogicApplicationException, "'%s' applied to '%s'" % (self.simplify(), arg.simplify()) arg_meaning_abstracted = arg.meaning if return_indices: for dep in self.glue.simplify().antecedent.dependencies[::-1]: # if self.glue is (A -o B), dep is in A.dependencies arg_meaning_abstracted = self.make_LambdaExpression(Variable('v%s' % dep), arg_meaning_abstracted) return_meaning = self.meaning.applyto(arg_meaning_abstracted) return self.__class__(return_meaning, return_glue, return_indices) def make_VariableExpression(self, name): return VariableExpression(name) def make_LambdaExpression(self, variable, term): return LambdaExpression(variable, term) def lambda_abstract(self, other): assert isinstance(other, GlueFormula) assert isinstance(other.meaning, AbstractVariableExpression) return self.__class__(self.make_LambdaExpression(other.meaning.variable, self.meaning), linearlogic.ImpExpression(other.glue, self.glue)) def compile(self, counter=None): """From Iddo Lev's PhD Dissertation p108-109""" if not counter: counter = Counter() (compiled_glue, new_forms) = self.glue.simplify().compile_pos(counter, self.__class__) return new_forms + [self.__class__(self.meaning, compiled_glue, set([counter.get()]))] def simplify(self): return self.__class__(self.meaning.simplify(), self.glue.simplify(), self.indices) def __eq__(self, other): return self.__class__ == other.__class__ and self.meaning == other.meaning and self.glue == other.glue def __str__(self): assert isinstance(self.indices, set) accum = '%s : %s' % (self.meaning, self.glue) if self.indices: accum += ' : {' + ', '.join([str(index) for index in self.indices]) + '}' return accum def __repr__(self): return str(self) class GlueDict(dict): def __init__(self, filename): self.filename = filename self.read_file() def read_file(self, empty_first=True): if empty_first: self.clear() try: f = nltk.data.find( os.path.join('grammars', 'sample_grammars', self.filename)) # if f is a ZipFilePathPointer or a FileSystemPathPointer # then we need a little extra massaging if hasattr(f, 'open'): f = f.open() except LookupError, e: try: f = open(self.filename) except LookupError: raise e lines = f.readlines() f.close() for line in lines: # example: 'n : (\\x.( x), (v-or))' # lambdacalc -^ linear logic -^ line = line.strip() # remove trailing newline if not len(line): continue # skip empty lines if line[0] == '#': continue # skip commented out lines parts = line.split(' : ', 2) # ['verb', '(\\x.( x), ( subj -o f ))', '[subj]'] glue_formulas = [] parenCount = 0 tuple_start = 0 tuple_comma = 0 relationships = None if len(parts) > 1: for (i, c) in enumerate(parts[1]): if c == '(': if parenCount == 0: # if it's the first '(' of a tuple tuple_start = i+1 # then save the index parenCount += 1 elif c == ')': parenCount -= 1 if parenCount == 0: # if it's the last ')' of a tuple meaning_term = parts[1][tuple_start:tuple_comma] # '\\x.( x)' glue_term = parts[1][tuple_comma+1:i] # '(v-r)' glue_formulas.append([meaning_term, glue_term]) # add the GlueFormula to the list elif c == ',': if parenCount == 1: # if it's a comma separating the parts of the tuple tuple_comma = i # then save the index elif c == '#': # skip comments at the ends of lines if parenCount != 0: # if the line hasn't parsed correctly so far raise RuntimeError, 'Formula syntax is incorrect for entry ' + line break # break to the next line if len(parts) > 2: #if there is a relationship entry at the end relStart = parts[2].index('[')+1 relEnd = parts[2].index(']') if relStart == relEnd: relationships = frozenset() else: relationships = frozenset([r.strip() for r in parts[2][relStart:relEnd].split(',')]) try: startInheritance = parts[0].index('(') endInheritance = parts[0].index(')') sem = parts[0][:startInheritance].strip() supertype = parts[0][startInheritance+1:endInheritance] except: sem = parts[0].strip() supertype = None if sem not in self: self[sem] = {} if relationships is None: #if not specified for a specific relationship set #add all relationship entries for parents if supertype: for rels, glue in self[supertype].iteritems(): if rels not in self[sem]: self[sem][rels] = [] self[sem][rels].extend(glue) self[sem][rels].extend(glue_formulas) # add the glue formulas to every rel entry else: if None not in self[sem]: self[sem][None] = [] self[sem][None].extend(glue_formulas) # add the glue formulas to every rel entry else: if relationships not in self[sem]: self[sem][relationships] = [] if supertype: self[sem][relationships].extend(self[supertype][relationships]) self[sem][relationships].extend(glue_formulas) # add the glue entry to the dictionary def __str__(self): accum = '' for pos in self: for relset in self[pos]: i = 1 for gf in self[pos][relset]: if i==1: accum += str(pos) + ': ' else: accum += ' '*(len(str(pos))+2) accum += str(gf) if relset and i==len(self[pos][relset]): accum += ' : ' + str(relset) accum += '\n' i += 1 return accum def to_glueformula_list(self, depgraph, node=None, counter=None, verbose=False): if node is None: top = depgraph.nodelist[0] root = depgraph.nodelist[top['deps'][0]] return self.to_glueformula_list(depgraph, root, Counter(), verbose) glueformulas = self.lookup(node, depgraph, counter) for dep_idx in node['deps']: dep = depgraph.nodelist[dep_idx] glueformulas.extend(self.to_glueformula_list(depgraph, dep, counter, verbose)) return glueformulas def lookup(self, node, depgraph, counter): semtype_names = self.get_semtypes(node) semtype = None for name in semtype_names: if name in self: semtype = self[name] break if semtype is None: # raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word) return [] self.add_missing_dependencies(node, depgraph) lookup = self._lookup_semtype_option(semtype, node, depgraph) if not len(lookup): raise KeyError, "There is no GlueDict entry for sem type of '%s'"\ " with tag '%s', and rel '%s'" %\ (node['word'], node['tag'], node['rel']) return self.get_glueformulas_from_semtype_entry(lookup, node['word'], node, depgraph, counter) def add_missing_dependencies(self, node, depgraph): rel = node['rel'].lower() if rel == 'main': headnode = depgraph.nodelist[node['head']] subj = self.lookup_unique('subj', headnode, depgraph) node['deps'].append(subj['address']) def _lookup_semtype_option(self, semtype, node, depgraph): relationships = frozenset([depgraph.nodelist[dep]['rel'].lower() for dep in node['deps'] if depgraph.nodelist[dep]['rel'].lower() not in OPTIONAL_RELATIONSHIPS]) try: lookup = semtype[relationships] except KeyError: # An exact match is not found, so find the best match where # 'best' is defined as the glue entry whose relationship set has the # most relations of any possible relationship set that is a subset # of the actual depgraph best_match = frozenset() for relset_option in set(semtype)-set([None]): if len(relset_option) > len(best_match) and \ relset_option < relationships: best_match = relset_option if not best_match: if None in semtype: best_match = None else: return None lookup = semtype[best_match] return lookup def get_semtypes(self, node): """ Based on the node, return a list of plausible semtypes in order of plausibility. """ semtype_name = None rel = node['rel'].lower() word = node['word'].lower() if rel == 'spec': if word in SPEC_SEMTYPES: return [SPEC_SEMTYPES[word]] else: return [SPEC_SEMTYPES['default']] elif rel in ['nmod', 'vmod']: return [node['tag'], rel] else: return [node['tag']] def get_glueformulas_from_semtype_entry(self, lookup, word, node, depgraph, counter): glueformulas = [] glueFormulaFactory = self.get_GlueFormula_factory() for meaning, glue in lookup: gf = glueFormulaFactory(self.get_meaning_formula(meaning, word), glue) if not len(glueformulas): gf.word = word else: gf.word = '%s%s' % (word, len(glueformulas)+1) gf.glue = self.initialize_labels(gf.glue, node, depgraph, counter.get()) glueformulas.append(gf) return glueformulas def get_meaning_formula(self, generic, word): """ :param generic: A meaning formula string containing the parameter "" :param word: The actual word to be replace "" """ word = word.replace('.', '') return generic.replace('', word) def initialize_labels(self, expr, node, depgraph, unique_index): if isinstance(expr, linearlogic.AtomicExpression): name = self.find_label_name(expr.name.lower(), node, depgraph, unique_index) if name[0].isupper(): return linearlogic.VariableExpression(name) else: return linearlogic.ConstantExpression(name) else: return linearlogic.ImpExpression( self.initialize_labels(expr.antecedent, node, depgraph, unique_index), self.initialize_labels(expr.consequent, node, depgraph, unique_index)) def find_label_name(self, name, node, depgraph, unique_index): try: dot = name.index('.') before_dot = name[:dot] after_dot = name[dot+1:] if before_dot == 'super': return self.find_label_name(after_dot, depgraph.nodelist[node['head']], depgraph, unique_index) else: return self.find_label_name(after_dot, self.lookup_unique(before_dot, node, depgraph), depgraph, unique_index) except ValueError: lbl = self.get_label(node) if name=='f': return lbl elif name=='v': return '%sv' % lbl elif name=='r': return '%sr' % lbl elif name=='super': return self.get_label(depgraph.nodelist[node['head']]) elif name=='var': return '%s%s' % (lbl.upper(), unique_index) elif name=='a': return self.get_label(self.lookup_unique('conja', node, depgraph)) elif name=='b': return self.get_label(self.lookup_unique('conjb', node, depgraph)) else: return self.get_label(self.lookup_unique(name, node, depgraph)) def get_label(self, node): """ Pick an alphabetic character as identifier for an entity in the model. :param value: where to index into the list of characters :type value: int """ value = node['address'] letter = ['f','g','h','i','j','k','l','m','n','o','p','q','r','s', 't','u','v','w','x','y','z','a','b','c','d','e'][value-1] num = int(value) / 26 if num > 0: return letter + str(num) else: return letter def lookup_unique(self, rel, node, depgraph): """ Lookup 'key'. There should be exactly one item in the associated relation. """ deps = [depgraph.nodelist[dep] for dep in node['deps'] if depgraph.nodelist[dep]['rel'].lower() == rel.lower()] if len(deps) == 0: raise KeyError, "'%s' doesn't contain a feature '%s'" % (node['word'], rel) elif len(deps) > 1: raise KeyError, "'%s' should only have one feature '%s'" % (node['word'], rel) else: return deps[0] def get_GlueFormula_factory(self): return GlueFormula class Glue(object): def __init__(self, semtype_file=None, remove_duplicates=False, depparser=None, verbose=False): self.verbose = verbose self.remove_duplicates = remove_duplicates self.depparser = depparser from nltk import Prover9 self.prover = Prover9() if semtype_file: self.semtype_file = semtype_file else: self.semtype_file = 'glue.semtype' def train_depparser(self, depgraphs=None): if depgraphs: self.depparser.train(depgraphs) else: self.depparser.train_from_file(nltk.data.find( os.path.join('grammars', 'sample_grammars', 'glue_train.conll'))) def parse_to_meaning(self, sentence): readings = [] for agenda in self.parse_to_compiled(sentence): readings.extend(self.get_readings(agenda)) return readings def get_readings(self, agenda): readings = [] agenda_length = len(agenda) atomics = dict() nonatomics = dict() while agenda: # is not empty cur = agenda.pop() glue_simp = cur.glue.simplify() if isinstance(glue_simp, linearlogic.ImpExpression): # if cur.glue is non-atomic for key in atomics: try: if isinstance(cur.glue, linearlogic.ApplicationExpression): bindings = cur.glue.bindings else: bindings = linearlogic.BindingDict() glue_simp.antecedent.unify(key, bindings) for atomic in atomics[key]: if not (cur.indices & atomic.indices): # if the sets of indices are disjoint try: agenda.append(cur.applyto(atomic)) except linearlogic.LinearLogicApplicationException: pass except linearlogic.UnificationException: pass try: nonatomics[glue_simp.antecedent].append(cur) except KeyError: nonatomics[glue_simp.antecedent] = [cur] else: # else cur.glue is atomic for key in nonatomics: for nonatomic in nonatomics[key]: try: if isinstance(nonatomic.glue, linearlogic.ApplicationExpression): bindings = nonatomic.glue.bindings else: bindings = linearlogic.BindingDict() glue_simp.unify(key, bindings) if not (cur.indices & nonatomic.indices): # if the sets of indices are disjoint try: agenda.append(nonatomic.applyto(cur)) except linearlogic.LinearLogicApplicationException: pass except linearlogic.UnificationException: pass try: atomics[glue_simp].append(cur) except KeyError: atomics[glue_simp] = [cur] for entry in atomics: for gf in atomics[entry]: if len(gf.indices) == agenda_length: self._add_to_reading_list(gf, readings) for entry in nonatomics: for gf in nonatomics[entry]: if len(gf.indices) == agenda_length: self._add_to_reading_list(gf, readings) return readings def _add_to_reading_list(self, glueformula, reading_list): add_reading = True if self.remove_duplicates: for reading in reading_list: try: if reading.equiv(glueformula.meaning, self.prover): add_reading = False break; except Exception, e: #if there is an exception, the syntax of the formula #may not be understandable by the prover, so don't #throw out the reading. print 'Error when checking logical equality of statements', e pass if add_reading: reading_list.append(glueformula.meaning) def parse_to_compiled(self, sentence='a man sees Mary'.split()): gfls = [self.depgraph_to_glue(dg) for dg in self.dep_parse(sentence)] return [self.gfl_to_compiled(gfl) for gfl in gfls] def dep_parse(self, sentence='every cat leaves'.split()): #Lazy-initialize the depparser if self.depparser is None: from nltk.parse import MaltParser self.depparser = MaltParser(tagger=self.get_pos_tagger()) if not self.depparser._trained: self.train_depparser() return [self.depparser.parse(sentence, verbose=self.verbose)] def depgraph_to_glue(self, depgraph): return self.get_glue_dict().to_glueformula_list(depgraph) def get_glue_dict(self): return GlueDict(self.semtype_file) def gfl_to_compiled(self, gfl): index_counter = Counter() return_list = [] for gf in gfl: return_list.extend(gf.compile(index_counter)) if self.verbose: print 'Compiled Glue Premises:' for cgf in return_list: print cgf return return_list def get_pos_tagger(self): regexp_tagger = RegexpTagger( [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers (r'(The|the|A|a|An|an)$', 'AT'), # articles (r'.*able$', 'JJ'), # adjectives (r'.*ness$', 'NN'), # nouns formed from adjectives (r'.*ly$', 'RB'), # adverbs (r'.*s$', 'NNS'), # plural nouns (r'.*ing$', 'VBG'), # gerunds (r'.*ed$', 'VBD'), # past tense verbs (r'.*', 'NN') # nouns (default) ]) brown_train = brown.tagged_sents(categories='news') unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger) bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger) trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger) #Override particular words main_tagger = RegexpTagger( [(r'(A|a|An|an)$', 'ex_quant'), (r'(Every|every|All|all)$', 'univ_quant') ], backoff=trigram_tagger) return main_tagger class DrtGlueFormula(GlueFormula): def __init__(self, meaning, glue, indices=None): if not indices: indices = set() if isinstance(meaning, str): self.meaning = drt.DrtParser().parse(meaning) elif isinstance(meaning, drt.AbstractDrs): self.meaning = meaning else: raise RuntimeError, 'Meaning term neither string or expression: %s, %s' % (meaning, meaning.__class__) if isinstance(glue, str): self.glue = linearlogic.LinearLogicParser().parse(glue) elif isinstance(glue, linearlogic.Expression): self.glue = glue else: raise RuntimeError, 'Glue term neither string or expression: %s, %s' % (glue, glue.__class__) self.indices = indices def make_VariableExpression(self, name): return drt.DrtVariableExpression(name) def make_LambdaExpression(self, variable, term): return drt.DrtLambdaExpression(variable, term) class DrtGlueDict(GlueDict): def get_GlueFormula_factory(self): return DrtGlueFormula class DrtGlue(Glue): def __init__(self, semtype_file=None, remove_duplicates=False, depparser=None, verbose=False): if not semtype_file: semtype_file = 'drt_glue.semtype' Glue.__init__(self, semtype_file, remove_duplicates, depparser, verbose) def get_glue_dict(self): return DrtGlueDict(self.semtype_file) def demo(show_example=-1): from nltk.parse import MaltParser examples = ['David sees Mary', 'David eats a sandwich', 'every man chases a dog', 'every man believes a dog sleeps', 'John gives David a sandwich', 'John chases himself'] # 'John persuades David to order a pizza', # 'John tries to go', # 'John tries to find a unicorn', # 'John seems to vanish', # 'a unicorn seems to approach', # 'every big cat leaves', # 'every gray cat leaves', # 'every big gray cat leaves', # 'a former senator leaves', print '============== DEMO ==============' tagger = RegexpTagger( [('^(David|Mary|John)$', 'NNP'), ('^(sees|eats|chases|believes|gives|sleeps|chases|persuades|tries|seems|leaves)$', 'VB'), ('^(go|order|vanish|find|approach)$', 'VB'), ('^(a)$', 'ex_quant'), ('^(every)$', 'univ_quant'), ('^(sandwich|man|dog|pizza|unicorn|cat|senator)$', 'NN'), ('^(big|gray|former)$', 'JJ'), ('^(him|himself)$', 'PRP') ]) depparser = MaltParser(tagger=tagger) glue = Glue(depparser=depparser, verbose=False) for (i, sentence) in enumerate(examples): if i==show_example or show_example==-1: print '[[[Example %s]]] %s' % (i, sentence) for reading in glue.parse_to_meaning(sentence.split()): print reading.simplify() print '' if __name__ == '__main__': demo()