#!/usr/bin/env python

from codecs import open
from getopt import getopt
import pickle
from sys import argv

class EntityDict(dict):
    """Lookup table from entity codes and mnemonics to characters.

    The table is read from a UTF-8 text file holding one entry per line,
    made of four tab-separated fields::

        character <TAB> code <TAB> mnemonic <TAB> description

    The first and last characters of ``code`` and ``mnemonic`` are dropped
    (presumably the enclosing ``&`` and ``;``) and each remainder becomes a
    key that maps to ``character``.  The description is ignored.  Empty
    lines and lines whose first character is ``#`` are skipped.

    NOTE(review): because of the ``#`` comment rule, an entry whose
    character field is itself ``#`` can never be loaded.
    """

    def __init__(self, path):
        """Load the entity table stored at ``path``.

        Raises ValueError if a data line has fewer than four fields, and
        whatever ``open`` raises if the file cannot be read or decoded.
        """
        super(EntityDict, self).__init__()
        # ``with`` closes the stream even when a malformed line raises.
        with open(path, 'r', 'utf-8') as istream:
            for line in istream:
                # Strip the line terminator only.  Slicing off the last
                # character unconditionally broke CRLF files (a blank line
                # became '\r') and a final line without a newline.
                line = line.rstrip('\r\n')
                if not line or line[0] == '#':
                    continue
                # maxsplit=3 lets the free-text description contain tabs.
                character, code, mnemonic, _ = line.split('\t', 3)
                self[code[1:-1]] = character
                self[mnemonic[1:-1]] = character

class Reduction(object):
    """A reduced copy of a text together with pointers into the original.

    Attributes:
        stock: the original object the reduction was derived from.
        reduction: the reduced text accumulated so far.
        pointers: one entry per character of ``reduction``, holding the
            stock index that was supplied when the character was appended.
    """

    def __init__(self, stock):
        """Start an empty reduction of ``stock``."""
        self.stock = stock
        self.reduction = u''
        self.pointers = []

    def _append(self, char, stock_index):
        """Append one character and remember where it came from.

        A space directly after a space, and a newline directly after two
        newlines, are silently dropped, so the reduction never contains
        runs of spaces or more than one blank line in a row.

        Raises ValueError if ``char`` is not exactly one character long.
        """
        if len(char) != 1:
            raise ValueError('can only append one character at a time')

        text = self.reduction
        # Collapse runs of spaces into a single space.
        if char == ' ' and text.endswith(' '):
            return
        # Allow at most two consecutive newlines.
        if char == '\n' and text.endswith('\n\n'):
            return

        self.reduction = text + char
        self.pointers.append(stock_index)


class HtmlReduction(Reduction):
    """Reduction of an HTML string to text.

    Everything between ``<`` and ``>`` is removed from the text.  Depending
    on the tag name, a tag may additionally produce one line break, produce
    two line breaks, or be copied into the reduction verbatim.  Entity
    references are decoded when an entity table is supplied.  Every
    character written to ``self.reduction`` gets a matching entry in
    ``self.pointers`` holding an index into the HTML string.
    """

    def __init__(self, html, entities=None, newlines=None,
                 paragraphs=None, retain=None):
        """Build the reduction of ``html``.

        Args:
            html: the HTML text to reduce.
            entities: optional mapping from a reference name (the text
                between ``&`` and ``;``) to its single replacement
                character.  When it is None or empty, ``&`` is treated as
                ordinary text.
            newlines: names of tags that produce one line break.
            paragraphs: names of tags that produce two line breaks.
            retain: names of tags that are copied into the reduction.

        Tag names are matched for opening and closing tags alike.  A
        reference that is not in ``entities``, or an ``&`` that does not
        start a reference at all, is copied through unchanged.  A ``<``
        that is never closed hides the remainder of the input.

        Raises ValueError if an entity maps to a replacement that is not
        exactly one character long.
        """
        super(HtmlReduction, self).__init__(html)

        # None stands in for the former mutable ``[]`` defaults; an empty
        # tuple behaves identically for the membership tests.
        newlines = newlines or ()
        paragraphs = paragraphs or ()
        retain = retain or ()

        tag = None        # text collected since the last '<', else None
        entity = None     # text collected since the last '&', else None
        entity_start = 0  # index in html of the '&' that opened ``entity``
        for i, char in enumerate(html):
            if entity is not None:
                if char == ';' and entity in entities:
                    self._append(entities[entity], i)
                    entity = None
                    continue
                # Reference names are alphanumeric with an optional
                # leading '#'.
                if char.isalnum() or (char == '#' and not entity):
                    entity += char
                    continue
                # Not a reference we can decode: emit what was collected
                # as plain text, then treat ``char`` like any other
                # character.  Previously this either raised KeyError or
                # swallowed text up to the next ';'.
                self._append_verbatim('&' + entity, entity_start)
                entity = None

            if char == '<':
                # A '<' always starts a fresh tag, even inside a tag.
                tag = ''
            elif tag is not None:
                # ``is not None`` so that the empty tag '<>' is closed too.
                if char == '>':
                    self._handle_tag(tag, i, newlines, paragraphs, retain)
                    tag = None
                else:
                    tag += char
            elif char == '&' and entities:
                entity = ''
                entity_start = i
            else:
                self._append_text(char, i)

        if entity is not None:
            # The input ended in the middle of a reference.
            self._append_verbatim('&' + entity, entity_start)

    def _append_text(self, char, html_index):
        """Append a character of running text, dropping a space that
        directly follows a line break."""
        # endswith() is safe on an empty reduction, unlike indexing [-1].
        if not (char == ' ' and self.reduction.endswith('\n')):
            self._append(char, html_index)

    def _append_verbatim(self, text, html_index):
        """Append ``text``, which starts at ``html_index`` in the HTML."""
        for offset, char in enumerate(text):
            self._append(char, html_index + offset)

    def _handle_tag(self, tag, html_index, newlines, paragraphs, retain):
        """Apply the line-break and retain rules to one complete tag.

        ``tag`` is the text between ``<`` and ``>``; ``html_index`` is the
        index of the closing ``>`` in the HTML.
        """
        tag_type = HtmlReduction._type_of(tag)

        if tag_type in newlines or tag_type in paragraphs:
            self._append('\n', html_index)
            if tag_type in paragraphs:
                self._append('\n', html_index)

        if tag_type in retain:
            # '>' is at html_index, so '<' is len(tag) + 1 characters
            # earlier.
            self._append_verbatim('<' + tag + '>',
                                  html_index - len(tag) - 1)

    @staticmethod
    def _type_of(tag):
        """Return the name of ``tag``: its text with every ``/`` removed,
        cut off at the first whitespace character."""
        name = tag.replace('/', '')
        for index, char in enumerate(name):
            # Any whitespace ends the name; attributes often start on a
            # new line or after a tab.
            if char.isspace():
                return name[:index]
        return name


def get_tag_list(string):
    """Split a comma-separated string into a list of items, each with
    surrounding whitespace removed."""
    tags = []
    for piece in string.split(','):
        tags.append(piece.strip())
    return tags
                
def main(argv):
    """Reduce every HTML file named on the command line.

    Args:
        argv: command-line arguments without the program name.

    Options:
        -e, --entities PATH    entity table to load with EntityDict
        -n, --newlines TAGS    comma-separated tags producing a line break
        -p, --paragraphs TAGS  comma-separated tags producing two breaks
        -r, --retain TAGS      comma-separated tags kept in the output

    Each remaining argument is read as a UTF-8 HTML file, and its
    reduction is written next to it as ``<path>.reduction``.

    Raises getopt.GetoptError for malformed or unknown options.
    """
    # Local import: the module itself only imports ``argv`` from sys.
    import sys

    opts, args = getopt(argv, 'e:n:p:r:', ['entities=', 'newlines=',
                                           'paragraphs=', 'retain='])

    entities = None
    newlines = []
    paragraphs = []
    retain = []

    for opt, arg in opts:
        if opt in ('-e', '--entities'):
            entities = EntityDict(arg)
        elif opt in ('-n', '--newlines'):
            newlines = get_tag_list(arg)
        elif opt in ('-p', '--paragraphs'):
            paragraphs = get_tag_list(arg)
        elif opt in ('-r', '--retain'):
            retain = get_tag_list(arg)
        else:
            # getopt rejects unknown options itself, so this is only a
            # safeguard.  The text matches what the former print statement
            # wrote.
            sys.stdout.write('unrecognised option:  %s\n' % opt)
            sys.exit(-1)

    for path in args:
        # ``with`` closes both files even if reading, reducing or writing
        # raises.
        with open(path, 'r', 'utf-8') as istream:
            html = istream.read()

        reduction = HtmlReduction(html, entities, newlines, paragraphs, retain)

        with open(path + '.reduction', 'w', 'utf-8') as ostream:
            ostream.write(reduction.reduction)
                

# Script entry point: pass the command-line arguments, without the program
# name, to main().
if __name__ == '__main__':
    main(argv[1:])