# Natural Language Toolkit: Senseval 2 Corpus Reader
#
# Copyright (C) 2001-2012 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
#         Steven Bird <sb@csse.unimelb.edu.au> (modifications)
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Read from the Senseval 2 Corpus.

SENSEVAL [http://www.senseval.org/]
Evaluation exercises for Word Sense Disambiguation.
Organized by ACL-SIGLEX [http://www.siglex.org/]

Prepared by Ted Pedersen <tpederse@umn.edu>, University of Minnesota,
http://www.d.umn.edu/~tpederse/data.html
Distributed with permission.

The NLTK version of the Senseval 2 files uses well-formed XML.
Each instance of the ambiguous words "hard", "interest", "line", and "serve"
is tagged with a sense identifier, and supplied with context.
"""

import bisect  # used by SensevalCorpusView.read_block; was missing
import os
import re
import xml.sax

from xmldocs import XMLCorpusReader
from nltk.tokenize import *
from xml.etree import ElementTree

from util import *
from api import *


class SensevalInstance(object):
    """
    A single tagged occurrence of an ambiguous word.

    Attributes:
      word     -- the lexical element name (e.g. ``'hard-a'``)
      position -- index of the head word within ``context``, or None
      senses   -- tuple of sense identifiers assigned to this occurrence
      context  -- list of context tokens; each is a ``(word, pos)`` pair
                  or a plain string for untagged material
    """

    def __init__(self, word, position, context, senses):
        self.word = word
        self.senses = tuple(senses)
        self.position = position
        self.context = context

    def __repr__(self):
        return ('SensevalInstance(word=%r, position=%r, '
                'context=%r, senses=%r)' %
                (self.word, self.position, self.context, self.senses))


class SensevalCorpusReader(CorpusReader):
    """Corpus reader for the NLTK (well-formed XML) Senseval 2 files."""

    def instances(self, fileids=None):
        """
        :return: a corpus view of SensevalInstance objects, one per
            tagged occurrence in the given fileids.
        """
        return concat([SensevalCorpusView(fileid, enc)
                       for (fileid, enc) in self.abspaths(fileids, True)])

    def raw(self, fileids=None):
        """
        :return: the text contents of the given fileids, as a single string.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, basestring):
            fileids = [fileids]
        return concat([self.open(f).read() for f in fileids])

    def _entry(self, tree):
        # Flatten a parsed document tree into a list of (sense, context)
        # pairs; context is a list of (word, pos) tuples.
        elts = []
        for lexelt in tree.findall('lexelt'):
            for inst in lexelt.findall('instance'):
                sense = inst[0].attrib['senseid']
                context = [(w.text, w.attrib['pos']) for w in inst[1]]
                elts.append((sense, context))
        return elts


class SensevalCorpusView(StreamBackedCorpusView):
    """
    Stream-backed corpus view that yields one SensevalInstance per
    ``<instance>`` element, repairing the corpus' pseudo-XML on the fly.
    """

    def __init__(self, fileid, encoding):
        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)
        self._word_tokenizer = WhitespaceTokenizer()
        self._lexelt_starts = [0]  # list of streampos
        self._lexelts = [None]     # list of lexelt names

    def read_block(self, stream):
        """Read one ``<instance>`` block from *stream* and parse it."""
        # Decide which lexical element we're in.
        lexelt_num = bisect.bisect_right(self._lexelt_starts,
                                         stream.tell()) - 1
        lexelt = self._lexelts[lexelt_num]

        instance_lines = []
        in_instance = False
        while True:
            line = stream.readline()
            if line == '':
                assert instance_lines == []
                return []

            # Start of a lexical element?
            if line.lstrip().startswith('<lexelt'):
                lexelt_num += 1
                m = re.search('item=("[^"]+"|\'[^\']+\')', line)
                assert m is not None  # <lexelt> has no 'item=...'
                lexelt = m.group(1)[1:-1]
                if lexelt_num < len(self._lexelts):
                    assert lexelt == self._lexelts[lexelt_num]
                else:
                    self._lexelts.append(lexelt)
                    self._lexelt_starts.append(stream.tell())

            # Start of an instance?
            if line.lstrip().startswith('<instance'):
                assert instance_lines == []
                in_instance = True

            # Body of an instance?
            if in_instance:
                instance_lines.append(line)

            # End of an instance?
            if line.lstrip().startswith('</instance'):
                xml_block = '\n'.join(instance_lines)
                xml_block = _fixXML(xml_block)
                inst = ElementTree.fromstring(xml_block)
                return [self._parse_instance(inst, lexelt)]

    def _parse_instance(self, instance, lexelt):
        """Convert a parsed ``<instance>`` element into a SensevalInstance."""
        senses = []
        context = []
        position = None
        for child in instance:
            if child.tag == 'answer':
                senses.append(child.attrib['senseid'])
            elif child.tag == 'context':
                context += self._word_tokenizer.tokenize(child.text)
                for cword in child:
                    if cword.tag == 'compound':
                        cword = cword[0]  # is this ok to do?

                    if cword.tag == 'head':
                        # Some sanity checks:
                        assert position is None, 'head specified twice'
                        assert cword.text.strip() or len(cword) == 1
                        assert not (cword.text.strip() and len(cword) == 1)
                        # Record the position of the head:
                        position = len(context)
                        # Add on the head word itself:
                        if cword.text.strip():
                            context.append(cword.text.strip())
                        elif cword[0].tag == 'wf':
                            context.append((cword[0].text,
                                            cword[0].attrib['pos']))
                            if cword[0].tail:
                                context += self._word_tokenizer.tokenize(
                                    cword[0].tail)
                        else:
                            assert False, 'expected CDATA or wf in <head>'
                    elif cword.tag == 'wf':
                        context.append((cword.text, cword.attrib['pos']))
                    elif cword.tag == 's':
                        pass  # Sentence boundary marker.
                    else:
                        # Unknown tag: dump it for debugging, then fail.
                        print('ACK', cword.tag)
                        assert False, 'expected CDATA or <wf> or <s>'
                    if cword.tail:
                        context += self._word_tokenizer.tokenize(cword.tail)
            else:
                assert False, 'unexpected tag %s' % child.tag
        return SensevalInstance(lexelt, position, context, senses)


def _fixXML(text):
    """
    Fix the various issues with Senseval pseudo-XML.

    :param text: the raw text of one instance block.
    :return: the text rewritten as well-formed XML.
    """
    # <~> or <^> => ~ or ^
    text = re.sub(r'<([~\^])>', r'\1', text)
    # fix lone & (escape it so the XML parser accepts it)
    text = re.sub(r'(\s+)\&(\s+)', r'\1&amp;\2', text)
    # fix """
    text = re.sub(r'"""', '\'"\'', text)
    # fix <s snum=dd> => <s snum="dd"/>
    text = re.sub(r'(<[^<]*snum=)([^">]+)>', r'\1"\2"/>', text)
    # fix foreign word tag
    text = re.sub(r'<\&frasl>\s*<p[^>]*>', 'FRASL', text)
    # remove <&I .>
    text = re.sub(r'<\&I[^>]*>', '', text)
    # fix <{word}>
    text = re.sub(r'<{([^}]+)}>', r'\1', text)
    # remove <@>, <p>, </p>
    text = re.sub(r'<(@|/?p)>', r'', text)
    # remove <&M .> and <&T .> and <&Ms .>
    text = re.sub(r'<&\w+ \.>', r'', text)
    # remove <!DOCTYPE ... > lines
    text = re.sub(r'<!DOCTYPE[^>]*>', r'', text)
    # remove <[hi]> and <[/p]> etc
    text = re.sub(r'<\[\/?[^>]+\]*>', r'', text)
    # take the thing out of the brackets: <&hellip;>
    text = re.sub(r'<(\&\w+;)>', r'\1', text)
    # and remove the & for those patterns that aren't regular XML
    text = re.sub(r'&(?!amp|gt|lt|apos|quot)', r'', text)
    # fix 'abc <p="foo"/>' style tags - now <wf pos="foo">abc</wf>
    text = re.sub(r'[ \t]*([^<>\s]+?)[ \t]*<p="([^"]*"?)"/>',
                  r' <wf pos="\2">\1</wf>', text)
    text = re.sub(r'\s*"\s*<p=\'"\'/>', " \"<wf pos='\"'>\"</wf>", text)
    return text