# -*- coding: utf-8 -*-
"""
simple html event parser
"""
# The parser walks a string one character at a time (forwards or backwards)
# and groups the characters into events: Comment, Entity, Tag and Text.
# The module runs on both Python 2 and Python 3.
try:
    from htmlentitydefs import name2codepoint  # Python 2
except ImportError:
    from html.entities import name2codepoint  # Python 3
import unittest

try:
    _unichr = unichr  # Python 2
except NameError:
    _unichr = chr  # Python 3

# a list of html closing tags, from the HTML 4.01 specification
CLOSING = ['a', 'abbr', 'acronym', 'address', 'applet', 'b', 'bdo', 'big',
           'blockquote', 'button', 'caption', 'center', 'cite', 'code',
           'del', 'dfn', 'dir', 'div', 'dl', 'em', 'fieldset', 'font',
           'form', 'frameset', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'i',
           'iframe', 'ins', 'kbd', 'label', 'legend', 'map', 'menu',
           'noframes', 'noscript', 'object', 'ol', 'optgroup', 'pre', 'q',
           's', 'samp', 'script', 'select', 'small', 'span', 'strike',
           'strong', 'style', 'sub', 'sup', 'table', 'textarea', 'title',
           'tt', 'u', 'ul', 'var', 'body', 'colgroup', 'dd', 'dt', 'head',
           'html', 'li', 'p', 'tbody', 'option', 'td', 'tfoot', 'th',
           'thead', 'tr']

# a list of html non-closing tags, from the HTML 4.01 specification
NON_CLOSING = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr', 'img',
               'input', 'isindex', 'link', 'meta', 'param']

# every known tag name, in the tuple form accepted by str.startswith
_TAG_NAMES = tuple(CLOSING + NON_CLOSING)


class _Event(object):
    """A contiguous run of characters of a single kind.

    Subclasses define ``START`` and ``END`` markers, which the default
    ``start_at`` / ``end_at`` compare against a single character.

    Attributes:
        offset: index in the source of the first character of the event.
        backwards: whether characters are being collected right-to-left.
        content: the characters collected so far, in source order.
        finalised: set once the event is complete; no more appends allowed.
    """

    def __init__(self, offset, backwards=False):
        self.offset = offset
        self.backwards = backwards
        self.content = u''
        self.finalised = False

    def __len__(self):
        return len(self.content)

    def __str__(self):
        return '{0}:"{1}"@{2}'.format(type(self).__name__, self.content,
                                      self.offset)

    def append(self, char):
        """Add ``char`` to the event, keeping ``content`` in source order."""
        assert not self.finalised
        if self.backwards:
            if self.content:
                # prepending moves the start of the event one to the left
                self.content = char + self.content
                self.offset -= 1
            else:
                # the first character sits at the offset given at creation
                self.content = char
        else:
            self.content += char

    @classmethod
    def end_at(cls, html, i):
        """Return whether the last character of an event is at ``html[i]``."""
        return html[i] == cls.END

    def _finalise(self):
        """Mark the event complete; subclasses derive their fields here."""
        self.finalised = True

    @classmethod
    def start_at(cls, html, i):
        """Return whether the first character of an event is at ``html[i]``."""
        return html[i] == cls.START


class Comment(_Event):
    """An html comment, delimiters included."""

    START = '<!--'
    END = '-->'

    @classmethod
    def start_at(cls, html, i):
        """Return whether a comment opener begins at ``html[i]``."""
        return html[i:i + 4] == Comment.START

    @classmethod
    def end_at(cls, html, i):
        """Return whether a comment closer ends at ``html[i]``."""
        return html[i - 2:i + 1] == Comment.END


class Entity(_Event):
    """A named or numeric character reference, e.g. ``&amp;`` or ``&#x41;``.

    After finalisation ``name`` holds the text between ``&`` and ``;`` and
    ``char`` holds the referenced character ('?' if it cannot be resolved).
    """

    START = '&'
    END = ';'
    # a candidate growing past this length is not an entity; parse() then
    # reclassifies it as text
    LENGTH_OF_LONGEST = len('&thetasym;')

    def _finalise(self):
        self.name = self.content[1:-1]
        self.char = Entity.get_char(self.name)
        super(Entity, self)._finalise()

    @staticmethod
    def _get_codepoint(entity_name):
        """Return the code point for ``entity_name``.

        Raises:
            ValueError: for an unknown name or a malformed number.
        """
        if Entity._is_character_reference(entity_name):
            if Entity._is_hexadecimal(entity_name):
                return int(entity_name[2:], 16)
            return int(entity_name[1:])
        if entity_name in name2codepoint:
            return name2codepoint[entity_name]
        message = 'unknown named entity "{}"'.format(entity_name)
        raise ValueError(message)

    @staticmethod
    def _is_character_reference(entity_name):
        return entity_name.startswith('#')

    @staticmethod
    def _is_hexadecimal(entity_name):
        return entity_name.startswith('#x')

    @staticmethod
    def get_char(entity_name):
        """Return the character for ``entity_name``, or '?' if unresolvable."""
        try:
            return _unichr(Entity._get_codepoint(entity_name))
        except (ValueError, OverflowError):
            # unknown name, malformed number or code point out of range
            return '?'


class Tag(_Event):
    """An opening, closing or empty html tag, angle brackets included.

    After finalisation:
        is_closer: the tag has the form ``</name>``.
        is_empty: the tag has the form ``<name/>`` or ``<name />``.
        has_attributes: the tag text contains an ``=``.
        name: the first whitespace-delimited token inside the brackets.
    """

    START = '<'
    END = '>'
    CLOSE = '/'

    def _finalise(self):
        content = self.content
        # Slices rather than indexing: a tag cut short by the bounds of the
        # input may be only one or two characters long.
        self.is_closer = content[1:2] == Tag.CLOSE
        self.is_empty = content[-2:-1] == Tag.CLOSE
        self.has_attributes = '=' in content
        if self.is_closer:
            inner = content[2:-1]
        elif self.is_empty:
            inner = content[1:-2]
        else:
            inner = content[1:-1]
        # the name is whatever precedes the first whitespace, so attributes
        # and a trailing space before "/>" never leak into it
        tokens = inner.split(None, 1)
        self.name = tokens[0] if tokens else inner[:0]
        super(Tag, self)._finalise()

    @classmethod
    def start_at(cls, html, i):
        """Return whether ``<name`` or ``</name`` begins at ``html[i]``."""
        if html[i] != Tag.START:
            return False
        if Tag._valid_name_at(html, i):
            return True
        return (i + 1 < len(html) and html[i + 1] == Tag.CLOSE and
                Tag._valid_name_at(html, i + 1))

    @staticmethod
    def _valid_name_at(html, i):
        """Return whether a known tag name begins at ``html[i + 1]``.

        This is a prefix match, so any name that merely starts with a known
        tag name is accepted as well.
        """
        # TODO: doesn't handle namespaces :(
        # special hack for wdc
        if i < len(html) - 3 and html[i + 1] == 'o' and html[i + 2] == ':':
            i = i + 2
        return html.startswith(_TAG_NAMES, i + 1)


class Text(_Event):
    """Any characters that are not part of a comment, entity or tag."""
    pass


def parse(html, start=0, backwards=False):
    """Yield the events found in ``html``, beginning at index ``start``.

    Args:
        html: the text to scan.
        start: index of the first character to examine.
        backwards: scan right-to-left from ``start`` instead of
            left-to-right; events are then yielded in reverse order, but
            each event's ``content`` is still in source order.

    Yields:
        Finalised Comment, Entity, Tag and Text events. An event still open
        when the scan leaves the input is finalised and yielded as it
        stands. Nothing is yielded if ``start`` lies outside ``html``.
    """
    # When stepping backwards an event is entered through its end marker
    # and left through its start marker.
    if backwards:
        step, initiator, terminator = -1, 'end_at', 'start_at'
    else:
        step, initiator, terminator = 1, 'start_at', 'end_at'

    length = len(html)
    i = start
    current = None
    while 0 <= i < length:
        if current is None or type(current) is Text:
            # look for the start of a comment, entity or tag. If present
            # then the current event is complete, yield it and create a new
            # one of the appropriate type
            for event_type in (Comment, Entity, Tag):
                if getattr(event_type, initiator)(html, i):
                    if current is not None:
                        current._finalise()
                        yield current
                    current = event_type(i, backwards)
                    break
            # if there is still no event, then default to text
            if current is None:
                current = Text(i, backwards)
            current.append(html[i])
        else:
            current.append(html[i])
            # If i is now at the end of the current event, yield it.
            if getattr(current, terminator)(html, i):
                current._finalise()
                yield current
                current = None
            elif (type(current) is Entity and
                  len(current) > Entity.LENGTH_OF_LONGEST):
                # looks like this wasn't an entity after all,
                # so convert it to text
                text = Text(current.offset, backwards)
                text.content = current.content
                current = text
        # step to the next character
        i += step

    # read through the entire bounds---yield the current event, if any
    if current is not None:
        current._finalise()
        yield current


class _TestEventParsing(unittest.TestCase):
    """Check parse() against a small document with every event type."""

    def setUp(self):
        # (type, content[, entity char | tag name, tag is_closer])
        self.expectations = ((Tag, '<div>', 'div', False),
                             (Text, 'The '),
                             (Tag, '<a>', 'a', False),
                             (Text, 'quick'),
                             (Tag, '</a>', 'a', True),
                             (Text, ' brown fox'),
                             (Comment, '<!-- a comment -->'),
                             (Text, ' jumps over the lazy dog '),
                             (Entity, '&amp;', '&'),
                             (Text, ' flies away.'),
                             (Tag, '</div>', 'div', True))
        self.test = ''.join(fields[1] for fields in self.expectations)
        self.events = list(parse(self.test))

    def test_contents(self):
        for event, fields in zip(self.events, self.expectations):
            self.assertEqual(event.content, fields[1])

    def test_entities(self):
        for event, fields in zip(self.events, self.expectations):
            if type(event) is Entity:
                self.assertEqual(event.char, fields[2])

    def test_event_types(self):
        self.assertEqual(len(self.events), len(self.expectations))
        for event, fields in zip(self.events, self.expectations):
            self.assertEqual(type(event), fields[0])

    def test_tags(self):
        for event, fields in zip(self.events, self.expectations):
            if type(event) is Tag:
                expected_name, expected_is_closer = fields[2:]
                self.assertEqual(event.name, expected_name)
                self.assertEqual(event.is_closer, expected_is_closer)


if __name__ == '__main__':
    from argparse import ArgumentParser
    from common import read

    argparser = ArgumentParser(description=__doc__)
    argparser.add_argument('files', nargs='*', help='a file to segment')
    argparser.add_argument('--start', type=int, default=0,
                           help='offset to start processing from')
    argparser.add_argument('--backward', action='store_true',
                           help='step backwards, rather than forwards')
    args = argparser.parse_args()
    # single-argument print(...) behaves identically on Python 2 and 3
    for path in args.files:
        print('=' * 79)
        print(path)
        html = read(path)
        for event in parse(html, args.start, args.backward):
            print('  ' + str(event))
        print('')