#!/usr/bin/env python
"""Reduce HTML files to text while remembering where each character came from.

Usage: every positional argument is a path to a UTF-8 HTML file; the reduced
text is written next to it as ``<path>.reduction``.

Options:
    -e, --entities PATH    tab-separated entity table (see ``EntityDict``)
    -n, --newlines TAGS    comma-separated tag types replaced by one newline
    -p, --paragraphs TAGS  comma-separated tag types replaced by two newlines
    -r, --retain TAGS      comma-separated tag types copied through verbatim
"""
from codecs import open
from getopt import getopt
import pickle
from sys import argv


class EntityDict(dict):
    """Mapping from entity names to the characters they stand for.

    The table is read from a UTF-8 file with four tab-separated columns per
    line: character, code, mnemonic, description.  The first and last
    characters of the code and of the mnemonic are dropped (presumably the
    surrounding ``&`` and ``;`` -- confirm against the table file) and both
    remainders are stored as keys mapping to the character.

    Empty lines and lines whose first character is ``#`` are skipped.
    NOTE: a row whose character column is ``#`` itself is therefore skipped
    as well.

    Raises:
        ValueError: if a data line does not have exactly four columns.
    """

    def __init__(self, path):
        super(EntityDict, self).__init__()
        with open(path, 'r', 'utf-8') as istream:
            for line in istream:
                # Strip only the line terminator; slicing off the last
                # character would eat real data on a final line that has no
                # trailing newline.
                line = line.rstrip('\r\n')
                if line and line[0] != '#':
                    character, code, mnemonic, description = line.split('\t')
                    self[code[1:-1]] = character
                    self[mnemonic[1:-1]] = character


class Reduction(object):
    """A reduced copy of a text plus a back-pointer for every kept character.

    Attributes (documented here because they are set in ``__init__``):
        stock:     the original, unreduced text.
        reduction: the reduced text built up by ``_append``.
        pointers:  ``pointers[k]`` is the index into ``stock`` recorded for
                   ``reduction[k]``.
    """

    def __init__(self, stock):
        self.stock = stock
        self.reduction = u''
        self.pointers = []

    def _append(self, char, stock_index):
        """Append one character unless it would create redundant whitespace.

        A space directly after a space is dropped, and so is a newline that
        would follow two consecutive newlines.

        Raises:
            ValueError: if ``char`` is not exactly one character long.
        """
        if len(char) != 1:
            raise ValueError('can only append one character at a time')
        if char == ' ' and self.reduction.endswith(' '):
            return
        if char == '\n' and self.reduction.endswith('\n\n'):
            return
        self.reduction += char
        self.pointers.append(stock_index)


class HtmlReduction(Reduction):
    """Reduction of an HTML string: tags removed, entities optionally decoded.

    Args:
        html:       the HTML text to reduce.
        entities:   mapping from entity name (without ``&`` and ``;``) to a
                    single character; when empty or ``None``, ``&`` is
                    treated as an ordinary character.
        newlines:   tag types that produce one newline.
        paragraphs: tag types that produce two newlines.
        retain:     tag types that are copied into the reduction verbatim.

    NOTE: an entity name missing from ``entities`` raises ``KeyError``, and
    a replacement longer than one character raises ``ValueError``.
    """

    def __init__(self, html, entities=None, newlines=None,
                 paragraphs=None, retain=None):
        super(HtmlReduction, self).__init__(html)
        # None instead of shared mutable list defaults; only membership
        # tests are performed on these collections.
        newlines = () if newlines is None else newlines
        paragraphs = () if paragraphs is None else paragraphs
        retain = () if retain is None else retain

        tag = None      # text collected since '<', or None outside a tag
        entity = None   # text collected since '&', or None outside an entity
        for i, char in enumerate(html):
            if char == '<':
                tag = ''
            elif char == '>' and tag:
                self._handle_tag(tag, i, newlines, paragraphs, retain)
                tag = None
            elif char == '&' and not tag and entities:
                entity = ''
            elif char == ';' and entity:
                self._append(entities[entity], i)
                entity = None
            elif tag is not None:
                tag += char
            elif entity is not None:
                entity += char
            elif not (char == ' ' and self.reduction.endswith('\n')):
                # Spaces right after a newline are dropped.  endswith() is
                # safe on an empty reduction, where indexing [-1] would
                # raise IndexError for input that starts with a space.
                self._append(char, i)

    def _handle_tag(self, tag, html_index, newlines, paragraphs, retain):
        """Emit whatever the completed tag contributes to the reduction.

        Args:
            tag:        the text between ``<`` and ``>``.
            html_index: index of the closing ``>`` in the HTML.
            newlines, paragraphs, retain: tag-type collections as passed to
                the constructor.
        """
        tag_type = HtmlReduction._type_of(tag)
        if tag_type in newlines or tag_type in paragraphs:
            self._append('\n', html_index)
        if tag_type in paragraphs:
            self._append('\n', html_index)
        if tag_type in retain:
            # The opening '<' sits len(tag) + 1 characters before the
            # closing '>', so that is where the pointers have to start.
            start = html_index - len(tag) - 1
            for offset, char in enumerate('<' + tag + '>'):
                self._append(char, start + offset)

    @staticmethod
    def _type_of(tag):
        """Return the tag name: slashes removed, cut at the first space."""
        tag_type = tag.replace('/', '')
        if ' ' in tag_type:
            tag_type = tag_type[:tag_type.find(' ')]
        return tag_type


def get_tag_list(string):
    """Split a comma-separated string into a list of stripped tag names."""
    return [tag.strip() for tag in string.split(',')]


def main(argv):
    """Reduce every file named in ``argv`` and write ``<path>.reduction``.

    Args:
        argv: command-line arguments without the program name.
    """
    opts, args = getopt(argv, 'e:n:p:r:',
                        ['entities=', 'newlines=', 'paragraphs=', 'retain='])
    entities = None
    newlines = []
    paragraphs = []
    retain = []
    for opt, arg in opts:
        if opt in ('-e', '--entities'):
            entities = EntityDict(arg)
        elif opt in ('-n', '--newlines'):
            newlines = get_tag_list(arg)
        elif opt in ('-p', '--paragraphs'):
            paragraphs = get_tag_list(arg)
        elif opt in ('-r', '--retain'):
            retain = get_tag_list(arg)
        else:
            # Single-argument print() emits the same text under Python 2
            # and Python 3.
            print(' '.join(('unrecognised option: ', opt)))
            raise SystemExit(-1)
    for path in args:
        with open(path, 'r', 'utf-8') as istream:
            html = istream.read()
        reduction = HtmlReduction(html, entities, newlines, paragraphs, retain)
        with open(path + '.reduction', 'w', 'utf-8') as ostream:
            ostream.write(reduction.reduction)


if __name__ == '__main__':
    main(argv[1:])