# Natural Language Toolkit: SemCor Corpus Reader
#
# Copyright (C) 2001-2012 NLTK Project
# Author: Nathan Schneider <nschneid@cs.cmu.edu>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Corpus reader for the SemCor Corpus.
"""
__docformat__ = 'epytext en'

import re

import xml.etree.ElementTree as ET

from api import *
from util import *
from xmldocs import *
from nltk.tree import Tree

class SemcorCorpusReader(XMLCorpusReader):
    """
    Corpus reader for the SemCor Corpus.
    For access to the complete XML data structure, use the ``xml()``
    method.  For access to simple word lists and tagged word lists, use
    ``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.
    """
    def __init__(self, root, fileids, lazy=True):
        XMLCorpusReader.__init__(self, root, fileids)
        self._lazy = lazy

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return self._items(fileids, 'word', False, False, False)

    def chunks(self, fileids=None):
        """
        :return: the given file(s) as a list of chunks, 
            each of which is a list of words and punctuation symbols 
            that form a unit.
        :rtype: list(list(str))
        """
        return self._items(fileids, 'chunk', False, False, False)

    def tagged_chunks(self, fileids=None, tag=('pos' or 'sem' or 'both')):
        """
        :return: the given file(s) as a list of tagged chunks, represented 
            in tree form.
        :rtype: list(Tree)

        :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'` 
            to indicate the kind of tags to include.  Semantic tags consist of 
            WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity 
            without a specific entry in WordNet.  (Named entities of type 'other' 
            have no lemma.  Other chunks not in WordNet have no semantic tag. 
            Punctuation tokens have `None` for their part of speech tag.)
        """
        return self._items(fileids, 'chunk', False, tag!='sem', tag!='pos')

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of sentences, each encoded 
            as a list of word strings.
        :rtype: list(list(str))
        """
        return self._items(fileids, 'word', True, False, False)

    def chunk_sents(self, fileids=None):
        """
        :return: the given file(s) as a list of sentences, each encoded 
            as a list of chunks.
        :rtype: list(list(list(str)))
        """
        return self._items(fileids, 'chunk', True, False, False)

    def tagged_sents(self, fileids=None, tag=('pos' or 'sem' or 'both')):
        """
        :return: the given file(s) as a list of sentences. Each sentence 
            is represented as a list of tagged chunks (in tree form).
        :rtype: list(list(Tree))

        :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'` 
            to indicate the kind of tags to include.  Semantic tags consist of 
            WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity 
            without a specific entry in WordNet.  (Named entities of type 'other' 
            have no lemma.  Other chunks not in WordNet have no semantic tag.
            Punctuation tokens have `None` for their part of speech tag.)
        """
        return self._items(fileids, 'chunk', True, tag!='sem', tag!='pos')

    def _items(self, fileids, unit, bracket_sent, pos_tag, sem_tag):
        if unit=='word' and not bracket_sent:
            # the result of the SemcorWordView may be a multiword unit, so the 
            # LazyConcatenation will make sure the sentence is flattened
            _ = lambda *args: LazyConcatenation((SemcorWordView if self._lazy else self._words)(*args))
        else:
            _ = SemcorWordView if self._lazy else self._words
        return concat([_(fileid, unit, bracket_sent, pos_tag, sem_tag)
                       for fileid in self.abspaths(fileids)])

    def _words(self, fileid, unit, bracket_sent, pos_tag, sem_tag):
        """
        Helper used to implement the view methods -- returns a list of
        tokens, (segmented) words, chunks, or sentences. The tokens 
        and chunks may optionally be tagged (with POS and sense 
        information).

        :param fileid: The name of the underlying file.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags, namely WordNet lemma 
            and OOV named entity status.
        """
        assert unit in ('token', 'word', 'chunk')
        result = []

        xmldoc = ElementTree.parse(fileid).getroot()
        for xmlsent in xmldoc.findall('.//s'):
            sent = []
            for xmlword in _all_xmlwords_in(xmlsent):
                itm = SemcorCorpusReader._word(xmlword, unit, pos_tag, sem_tag)
                if unit=='word':
                    sent.extend(itm)
                else:
                    sent.append(itm)

            if bracket_sent:
                result.append(SemcorSentence(xmlsent.attrib['snum'], sent))
            else:
                result.extend(sent)

        assert None not in result
        return result

    @staticmethod
    def _word(xmlword, unit, pos_tag, sem_tag):
        tkn = xmlword.text
        if not tkn:
            tkn = "" # fixes issue 337?

        lemma = xmlword.get('lemma', tkn) # lemma or NE class
        redef = xmlword.get('rdf', tkn)	# redefinition--this indicates the lookup string 
        # does not exactly match the enclosed string, e.g. due to typographical adjustments 
        # or discontinuity of a multiword expression. If a redefinition has occurred, 
        # the "rdf" attribute holds its inflected form and "lemma" holds its lemma. 
        # For NEs, "rdf", "lemma", and "pn" all hold the same value (the NE class).
        sensenum = xmlword.get('wnsn')  # WordNet sense number
        isOOVEntity = 'pn' in xmlword.keys()   # a "personal name" (NE) not in WordNet
        pos = xmlword.get('pos')    # part of speech for the whole chunk (None for punctuation)

        if unit=='token':
            if not pos_tag and not sem_tag:
                itm = tkn
            else:
                itm = (tkn,) + ((pos,) if pos_tag else ()) + ((lemma, sensenum, isOOVEntity) if sem_tag else ())
            return itm
        else:
            ww = tkn.split('_') # TODO: case where punctuation intervenes in MWE
            if unit=='word':
                return ww
            else:
                if sensenum is not None:
                    try:
                        sense = '%s.%02d' % (lemma, int(sensenum))
                    except ValueError:
                        sense = lemma+'.'+sensenum  # e.g. the sense number may be "2;1"

                bottom = [Tree(pos, ww)] if pos_tag else ww

                if sem_tag and isOOVEntity:
                    if sensenum is not None:
                        return Tree(sense, [Tree('NE', bottom)])
                    else:	# 'other' NE
                        return Tree('NE', bottom)
                elif sem_tag and sensenum is not None:
                    return Tree(sense, bottom)
                elif pos_tag:
                    return bottom[0]
                else:
                    return bottom # chunk as a list

def _all_xmlwords_in(elt, result=None):
    if result is None: result = []
    for child in elt:
        if child.tag in ('wf', 'punc'): result.append(child)
        else: _all_xmlwords_in(child, result)
    return result

class SemcorSentence(list):
    """
    A list of words, augmented by an attribute ``num`` used to record
    the sentence identifier (the ``n`` attribute from the XML).
    """
    def __init__(self, num, items):
        self.num = num
        list.__init__(self, items)

class SemcorWordView(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with the BNC corpus.
    """
    def __init__(self, fileid, unit, bracket_sent, pos_tag, sem_tag):
        """
        :param fileid: The name of the underlying file.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags, namely WordNet lemma 
            and OOV named entity status.
        """
        if bracket_sent: tagspec = '.*/s'
        else: tagspec = '.*/s/(punc|wf)'

        self._unit = unit
        self._sent = bracket_sent
        self._pos_tag = pos_tag
        self._sem_tag = sem_tag

        XMLCorpusView.__init__(self, fileid, tagspec)

    def handle_elt(self, elt, context):
        if self._sent: return self.handle_sent(elt)
        else: return self.handle_word(elt)

    def handle_word(self, elt):
        return SemcorCorpusReader._word(elt, self._unit, self._pos_tag, self._sem_tag)

    def handle_sent(self, elt):
        sent = []
        for child in elt:
            if child.tag in ('wf','punc'):
                itm = self.handle_word(child)
                if self._unit=='word':
                    sent.extend(itm)
                else:
                    sent.append(itm)
            else:
                raise ValueError('Unexpected element %s' % child.tag)
        return SemcorSentence(elt.attrib['snum'], sent)