# Natural Language Toolkit: SemCor Corpus Reader # # Copyright (C) 2001-2012 NLTK Project # Author: Nathan Schneider # URL: # For license information, see LICENSE.TXT """ Corpus reader for the SemCor Corpus. """ __docformat__ = 'epytext en' import re import xml.etree.ElementTree as ET from api import * from util import * from xmldocs import * from nltk.tree import Tree class SemcorCorpusReader(XMLCorpusReader): """ Corpus reader for the SemCor Corpus. For access to the complete XML data structure, use the ``xml()`` method. For access to simple word lists and tagged word lists, use ``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``. """ def __init__(self, root, fileids, lazy=True): XMLCorpusReader.__init__(self, root, fileids) self._lazy = lazy def words(self, fileids=None): """ :return: the given file(s) as a list of words and punctuation symbols. :rtype: list(str) """ return self._items(fileids, 'word', False, False, False) def chunks(self, fileids=None): """ :return: the given file(s) as a list of chunks, each of which is a list of words and punctuation symbols that form a unit. :rtype: list(list(str)) """ return self._items(fileids, 'chunk', False, False, False) def tagged_chunks(self, fileids=None, tag=('pos' or 'sem' or 'both')): """ :return: the given file(s) as a list of tagged chunks, represented in tree form. :rtype: list(Tree) :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'` to indicate the kind of tags to include. Semantic tags consist of WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity without a specific entry in WordNet. (Named entities of type 'other' have no lemma. Other chunks not in WordNet have no semantic tag. Punctuation tokens have `None` for their part of speech tag.) """ return self._items(fileids, 'chunk', False, tag!='sem', tag!='pos') def sents(self, fileids=None): """ :return: the given file(s) as a list of sentences, each encoded as a list of word strings. :rtype: list(list(str)) """ return self._items(fileids, 'word', True, False, False) def chunk_sents(self, fileids=None): """ :return: the given file(s) as a list of sentences, each encoded as a list of chunks. :rtype: list(list(list(str))) """ return self._items(fileids, 'chunk', True, False, False) def tagged_sents(self, fileids=None, tag=('pos' or 'sem' or 'both')): """ :return: the given file(s) as a list of sentences. Each sentence is represented as a list of tagged chunks (in tree form). :rtype: list(list(Tree)) :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'` to indicate the kind of tags to include. Semantic tags consist of WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity without a specific entry in WordNet. (Named entities of type 'other' have no lemma. Other chunks not in WordNet have no semantic tag. Punctuation tokens have `None` for their part of speech tag.) """ return self._items(fileids, 'chunk', True, tag!='sem', tag!='pos') def _items(self, fileids, unit, bracket_sent, pos_tag, sem_tag): if unit=='word' and not bracket_sent: # the result of the SemcorWordView may be a multiword unit, so the # LazyConcatenation will make sure the sentence is flattened _ = lambda *args: LazyConcatenation((SemcorWordView if self._lazy else self._words)(*args)) else: _ = SemcorWordView if self._lazy else self._words return concat([_(fileid, unit, bracket_sent, pos_tag, sem_tag) for fileid in self.abspaths(fileids)]) def _words(self, fileid, unit, bracket_sent, pos_tag, sem_tag): """ Helper used to implement the view methods -- returns a list of tokens, (segmented) words, chunks, or sentences. The tokens and chunks may optionally be tagged (with POS and sense information). :param fileid: The name of the underlying file. :param unit: One of `'token'`, `'word'`, or `'chunk'`. :param bracket_sent: If true, include sentence bracketing. :param pos_tag: Whether to include part-of-speech tags. :param sem_tag: Whether to include semantic tags, namely WordNet lemma and OOV named entity status. """ assert unit in ('token', 'word', 'chunk') result = [] xmldoc = ElementTree.parse(fileid).getroot() for xmlsent in xmldoc.findall('.//s'): sent = [] for xmlword in _all_xmlwords_in(xmlsent): itm = SemcorCorpusReader._word(xmlword, unit, pos_tag, sem_tag) if unit=='word': sent.extend(itm) else: sent.append(itm) if bracket_sent: result.append(SemcorSentence(xmlsent.attrib['snum'], sent)) else: result.extend(sent) assert None not in result return result @staticmethod def _word(xmlword, unit, pos_tag, sem_tag): tkn = xmlword.text if not tkn: tkn = "" # fixes issue 337? lemma = xmlword.get('lemma', tkn) # lemma or NE class redef = xmlword.get('rdf', tkn) # redefinition--this indicates the lookup string # does not exactly match the enclosed string, e.g. due to typographical adjustments # or discontinuity of a multiword expression. If a redefinition has occurred, # the "rdf" attribute holds its inflected form and "lemma" holds its lemma. # For NEs, "rdf", "lemma", and "pn" all hold the same value (the NE class). sensenum = xmlword.get('wnsn') # WordNet sense number isOOVEntity = 'pn' in xmlword.keys() # a "personal name" (NE) not in WordNet pos = xmlword.get('pos') # part of speech for the whole chunk (None for punctuation) if unit=='token': if not pos_tag and not sem_tag: itm = tkn else: itm = (tkn,) + ((pos,) if pos_tag else ()) + ((lemma, sensenum, isOOVEntity) if sem_tag else ()) return itm else: ww = tkn.split('_') # TODO: case where punctuation intervenes in MWE if unit=='word': return ww else: if sensenum is not None: try: sense = '%s.%02d' % (lemma, int(sensenum)) except ValueError: sense = lemma+'.'+sensenum # e.g. the sense number may be "2;1" bottom = [Tree(pos, ww)] if pos_tag else ww if sem_tag and isOOVEntity: if sensenum is not None: return Tree(sense, [Tree('NE', bottom)]) else: # 'other' NE return Tree('NE', bottom) elif sem_tag and sensenum is not None: return Tree(sense, bottom) elif pos_tag: return bottom[0] else: return bottom # chunk as a list def _all_xmlwords_in(elt, result=None): if result is None: result = [] for child in elt: if child.tag in ('wf', 'punc'): result.append(child) else: _all_xmlwords_in(child, result) return result class SemcorSentence(list): """ A list of words, augmented by an attribute ``num`` used to record the sentence identifier (the ``n`` attribute from the XML). """ def __init__(self, num, items): self.num = num list.__init__(self, items) class SemcorWordView(XMLCorpusView): """ A stream backed corpus view specialized for use with the BNC corpus. """ def __init__(self, fileid, unit, bracket_sent, pos_tag, sem_tag): """ :param fileid: The name of the underlying file. :param unit: One of `'token'`, `'word'`, or `'chunk'`. :param bracket_sent: If true, include sentence bracketing. :param pos_tag: Whether to include part-of-speech tags. :param sem_tag: Whether to include semantic tags, namely WordNet lemma and OOV named entity status. """ if bracket_sent: tagspec = '.*/s' else: tagspec = '.*/s/(punc|wf)' self._unit = unit self._sent = bracket_sent self._pos_tag = pos_tag self._sem_tag = sem_tag XMLCorpusView.__init__(self, fileid, tagspec) def handle_elt(self, elt, context): if self._sent: return self.handle_sent(elt) else: return self.handle_word(elt) def handle_word(self, elt): return SemcorCorpusReader._word(elt, self._unit, self._pos_tag, self._sem_tag) def handle_sent(self, elt): sent = [] for child in elt: if child.tag in ('wf','punc'): itm = self.handle_word(child) if self._unit=='word': sent.extend(itm) else: sent.append(itm) else: raise ValueError('Unexpected element %s' % child.tag) return SemcorSentence(elt.attrib['snum'], sent)