# -*- coding: utf-8 -*-
""" simple html event parser """

from htmlentitydefs import name2codepoint
import unittest

# Names of the HTML 4.01 elements that are written with a closing tag
# (``<p>...</p>``); kept in the order they were originally listed.
CLOSING = '''
    a abbr acronym address applet b bdo
    big blockquote button caption center cite
    code del dfn dir div dl em fieldset
    font form frameset h1 h2 h3 h4 h5
    h6 i iframe ins kbd label legend map
    menu noframes noscript object ol optgroup
    pre q s samp script select small span
    strike strong style sub sup table
    textarea title tt u ul var body
    colgroup dd dt head html li p tbody
    option td tfoot th thead tr
'''.split()

# Names of the HTML 4.01 elements that never take a closing tag
# (``<br>``, ``<img>``, ...).
NON_CLOSING = '''
    area base basefont br col frame hr
    img input isindex link meta param
'''.split()


class _Event(object):
    """Base class for one contiguous run of characters in the markup.

    Attributes:
        offset: index in the source string of the first (leftmost)
            character collected so far.
        backwards: True when characters are fed in right-to-left order.
        content: the characters collected so far, in document order.
        finalised: True once ``_finalise`` has run; no more characters
            may be appended after that.

    Subclasses that rely on the inherited ``start_at`` / ``end_at`` must
    define single-character ``START`` and ``END`` class attributes.
    """

    def __init__(self, offset, backwards=False):
        self.offset = offset
        self.backwards = backwards
        self.content = u''
        self.finalised = False

    @classmethod
    def start_at(cls, html, i):
        """Return True if the character at index i equals ``cls.START``."""
        return html[i] == cls.START

    @classmethod
    def end_at(cls, html, i):
        """Return True if the character at index i equals ``cls.END``."""
        return html[i] == cls.END

    def append(self, char):
        """Add the next scanned character to this event.

        Scanning forwards the character goes on the end.  Scanning
        backwards it goes on the front, and ``offset`` moves one place to
        the left for every character after the first so that it keeps
        pointing at the leftmost character.
        """
        assert not self.finalised
        if not self.backwards:
            self.content += char
            return
        if not self.content:
            # first character of a backwards event: offset already
            # points at it
            self.content = char
            return
        self.content = char + self.content
        self.offset -= 1

    def _finalise(self):
        """Mark the event as complete."""
        self.finalised = True

    def __len__(self):
        """Return the number of characters collected so far."""
        return len(self.content)

    def __str__(self):
        """Return ``TypeName:"content"@offset``."""
        kind = type(self).__name__
        return '{0}:"{1}"@{2}'.format(kind, self.content, self.offset)


class Comment(_Event):
    """An HTML comment: everything from ``<!--`` through ``-->``."""

    START = '<!--'
    END = '-->'

    @classmethod
    def start_at(cls, html, i):
        """Return True if the comment opener begins at index i."""
        opener = Comment.START
        return html[i:i + len(opener)] == opener

    @classmethod
    def end_at(cls, html, i):
        """Return True if the comment terminator ends at index i.

        Index i is the position of the final ``>``, so the slice looked
        at is the terminator-length window that finishes on i.
        """
        closer = Comment.END
        stop = i + 1
        return html[stop - len(closer):stop] == closer


class Entity(_Event):
    """A character reference such as ``&amp;``, ``&#38;`` or ``&#x26;``.

    After ``_finalise`` the instance additionally carries:

    * ``name`` -- the text between the ``&`` and the ``;``
    * ``char`` -- the character the reference stands for, or ``'?'``
      when it cannot be resolved
    """

    START = '&'
    END = ';'
    # used by the parser to give up on an '&' that never reaches a ';'
    LENGTH_OF_LONGEST = len('&thetasym;')

    def _finalise(self):
        """Derive ``name`` and ``char`` from the collected content."""
        name = self.content
        # Strip the delimiters only when they are really there: an entity
        # cut short by the end of the input lacks one of them, and blindly
        # dropping the first and last characters would mangle its name.
        if name.startswith(Entity.START):
            name = name[1:]
        if name.endswith(Entity.END):
            name = name[:-1]
        self.name = name
        self.char = Entity.get_char(self.name)
        super(Entity, self)._finalise()

    @staticmethod
    def _get_codepoint(entity_name):
        """Return the integer code point denoted by *entity_name*.

        Raises:
            ValueError: if the name is not a known named entity or the
                numeric part cannot be parsed.
        """
        if Entity._is_character_reference(entity_name):
            if Entity._is_hexadecimal(entity_name):
                return int(entity_name[2:], 16)
            return int(entity_name[1:])
        if entity_name in name2codepoint:
            return name2codepoint[entity_name]
        message = 'unknown named entity "{}"'.format(entity_name)
        raise ValueError(message)

    @staticmethod
    def _is_character_reference(entity_name):
        """Return True for numeric references (name starts with ``#``)."""
        return entity_name.startswith('#')

    @staticmethod
    def _is_hexadecimal(entity_name):
        """Return True for hexadecimal references (``#x`` or ``#X``)."""
        return entity_name[:2] in ('#x', '#X')

    @staticmethod
    def get_char(entity_name):
        """Return the character for *entity_name*, or ``'?'`` if unknown.

        *entity_name* is the reference without its ``&`` and ``;``, for
        example ``'amp'``, ``'#38'`` or ``'#x26'``.
        """
        try:
            codepoint = Entity._get_codepoint(entity_name)
            return unichr(codepoint)
        except (ValueError, OverflowError):
            # ValueError: unknown name, unparsable number, or a code
            # point unichr() rejects; OverflowError: absurdly large number
            return '?'


class Tag(_Event):
    """An HTML start, end or self-closing tag, e.g. ``<a href="x">``.

    After ``_finalise`` the instance additionally carries:

    * ``name`` -- the element name exactly as written in the markup
    * ``is_closer`` -- True when the second character is ``/`` (``</a>``)
    * ``is_empty`` -- True when the second-to-last character is ``/``
      (``<br/>``, ``<br />``)
    * ``has_attributes`` -- True when the tag text contains ``=``
    """

    START = '<'
    END = '>'
    CLOSE = '/'

    def _finalise(self):
        """Derive ``name`` and the boolean flags from the collected content."""
        content = self.content
        # one-character slices, unlike plain indexing, cannot raise
        # IndexError when the tag was cut short by the end of the input
        self.is_closer = content[1:2] == Tag.CLOSE
        self.is_empty = content[-2:-1] == Tag.CLOSE
        self.has_attributes = '=' in content

        # Peel off the angle brackets and the closing / self-closing
        # slashes (each only if present); the name is then the first
        # whitespace-delimited token.  This also gives the right answer
        # for self-closing tags that carry attributes, for attributes
        # without a value, and for separators other than a plain space.
        inner = content
        if inner.startswith(Tag.START):
            inner = inner[1:]
        if inner.endswith(Tag.END):
            inner = inner[:-1]
        if inner.startswith(Tag.CLOSE):
            inner = inner[1:]
        if inner.endswith(Tag.CLOSE):
            inner = inner[:-1]
        tokens = inner.split(None, 1)
        self.name = tokens[0] if tokens else inner[:0]

        super(Tag, self)._finalise()

    @classmethod
    def start_at(cls, html, i):
        """Return True if a start tag or an end tag begins at index i.

        The ``<`` must be followed by a known element name, optionally
        preceded by ``/``; a bare ``<`` in running text is not a tag.
        """
        if html[i] != Tag.START:
            return False
        if Tag._valid_name_at(html, i):
            return True
        after = i + 1
        return (after < len(html) and html[after] == Tag.CLOSE and
                Tag._valid_name_at(html, after))

    @staticmethod
    def _valid_name_at(html, i):
        """Return True if a known element name starts right after index i.

        The comparison is a case-sensitive prefix match against CLOSING
        and NON_CLOSING, so any name that merely begins with a known
        element name is accepted as well.
        NOTE(review): upper-case tags such as ``<DIV>`` are therefore not
        recognised -- confirm whether that is intended.
        """
        # TODO: doesn't handle namespaces :(
        # special hack for wdc: skip over a leading "o:" prefix
        if i < len(html) - 3 and html[i + 1] == 'o' and html[i + 2] == ':':
            i = i + 2
        begin = i + 1
        return any(html[begin:begin + len(tag)] == tag
                   for tagset in (CLOSING, NON_CLOSING)
                   for tag in tagset)


class Text(_Event):
    """A run of plain characters, i.e. not a comment, entity or tag."""


# characters allowed between the '&' and the ';' of a character reference
_ENTITY_CHARS = frozenset('#0123456789'
                          'abcdefghijklmnopqrstuvwxyz'
                          'ABCDEFGHIJKLMNOPQRSTUVWXYZ')


def _as_text(event, backwards):
    """Return a Text event that takes over *event*'s offset and content."""
    text = Text(event.offset, backwards)
    text.content = event.content
    return text


def parse(html, start=0, backwards=False):
    """Split *html* into a stream of Comment, Entity, Tag and Text events.

    Args:
        html: the markup to scan.
        start: index of the first character to look at.
        backwards: scan from *start* towards index 0 instead of towards
            the end of the string.  Events still hold their content in
            document order.

    Yields:
        Finalised event objects, in the order they are completed.  An
        event still open when the scan runs out of input is finalised
        and yielded as it is.
    """
    # get configuration for stepping through the html: scanning backwards
    # an event is entered through its end marker and left through its
    # start marker
    if backwards:
        step = -1
        enter, leave = 'end_at', 'start_at'
    else:
        step = +1
        enter, leave = 'start_at', 'end_at'

    def in_bounds(index):
        return index >= 0 if backwards else index < len(html)

    def initiates(event_type, index):
        return getattr(event_type, enter)(html, index)

    def terminates(event_type, index):
        return getattr(event_type, leave)(html, index)

    i = start
    current = None
    while in_bounds(i):

        # A stray '&' (or ';' when scanning backwards) opens an Entity
        # that can never complete.  As soon as a character shows up that
        # cannot be part of a character reference, downgrade the
        # candidate to Text so that this very character is examined below
        # as the possible start of a tag, comment or real entity instead
        # of being swallowed.
        if type(current) is Entity \
                and html[i] not in _ENTITY_CHARS \
                and not terminates(Entity, i):
            current = _as_text(current, backwards)

        if current is None or type(current) is Text:
            # look for the start of a comment, entity or tag. If present
            # then the current event is complete: yield it and create a
            # new one of the appropriate type
            for event_type in (Comment, Entity, Tag):
                if initiates(event_type, i):
                    if current is not None:
                        current._finalise()
                        yield current
                    current = event_type(i, backwards)
                    break

            # if there is still no event, then default to text
            if current is None:
                current = Text(i, backwards)

            current.append(html[i])

        else:

            current.append(html[i])
            # if i is now at the end of the current event, yield it
            if terminates(type(current), i):
                current._finalise()
                yield current
                current = None
            elif type(current) is Entity and \
                    len(current) > Entity.LENGTH_OF_LONGEST:
                # longer than any known entity, so this wasn't an entity
                # after all: convert it to text
                current = _as_text(current, backwards)

        # step to the next character
        i += step

    # read through the entire bounds---yield the current event, if any
    if current is not None:
        current._finalise()
        yield current


class _TestEventParsing(unittest.TestCase):
    """Parse a small document forwards and compare against a fixture.

    Each fixture row is ``(event type, content, ...)``; Tag rows add the
    expected name and is_closer flag, Entity rows the expected character.
    """

    def setUp(self):
        self.expectations = (
            (Tag, '<div>', 'div', False),
            (Text, 'The '),
            (Tag, '<a href="quick.html">', 'a', False),
            (Text, 'quick'),
            (Tag, '</a>', 'a', True),
            (Text, ' brown fox'),
            (Comment, '<!--GOOGLEAD-->'),
            (Text, ' jumps over the lazy dog '),
            (Entity, '&amp;', '&'),
            (Text, ' flies away.'),
            (Tag, '</div>', 'div', True),
        )
        # the document under test is the concatenation of all contents
        self.test = ''.join(row[1] for row in self.expectations)
        self.events = list(parse(self.test))

    def test_contents(self):
        for index, event in enumerate(self.events):
            row = self.expectations[index]
            self.assertEqual(event.content, row[1])

    def test_entities(self):
        for index, event in enumerate(self.events):
            if type(event) is not Entity:
                continue
            row = self.expectations[index]
            self.assertEqual(event.char, row[2])

    def test_event_types(self):
        self.assertEqual(len(self.events), len(self.expectations))
        for index, event in enumerate(self.events):
            row = self.expectations[index]
            self.assertEqual(type(event), row[0])

    def test_tags(self):
        for index, event in enumerate(self.events):
            if type(event) is not Tag:
                continue
            _, _, name, closer = self.expectations[index]
            self.assertEqual(event.name, name)
            self.assertEqual(event.is_closer, closer)


if __name__ == '__main__':

    # command-line driver: list the events found in each given file
    from argparse import ArgumentParser
    from common import read

    cli = ArgumentParser(description=__doc__)
    cli.add_argument('files', nargs='*', help='a file to segment')
    cli.add_argument('--start', type=int, default='0',
                     help='offset to start processing from')
    cli.add_argument('--backward', action='store_true',
                     help='step backwards, rather than forwards')
    options = cli.parse_args()

    rule = '=' * 79
    for path in options.files:
        # single-argument parenthesised prints give the same output as
        # the print statement
        print(rule)
        print(path)
        markup = read(path)
        for event in parse(markup, options.start, options.backward):
            print('  ' + str(event))
        print('')