#!/usr/bin/env python
# -*- coding: utf-8 -*-

from accounting import AccountedString, Action
from argparse import ArgumentParser
from common import extract_element, is_whitespace, open, read
from config import Config
import re
import shlex
from subprocess import Popen, PIPE


def _align(sentence, unit, start_in_unit):
    """Locate `sentence` inside `unit`, tolerating whitespace differences.

    Scanning of the unit begins at `start_in_unit` (or at 0 when it is
    None) and first skips any whitespace found there.  Sentence and unit
    are then walked in parallel: equal characters advance both cursors,
    while whitespace present on only one side is skipped on that side.

    Returns a pair `(begin, stop)` of unit indices: `begin` is where the
    scan started after skipping leading whitespace, `stop` is the unit
    position reached when the sentence or the unit ran out.

    Raises Exception('lost alignment') when two differing characters meet
    and neither of them is whitespace.
    """
    unit_length = len(unit)
    sentence_length = len(sentence)

    # find the first non-whitespace unit character at/after the start point
    begin = 0 if start_in_unit is None else start_in_unit
    while begin < unit_length and is_whitespace(unit[begin]):
        begin += 1

    in_sentence = 0     # cursor into the sentence
    in_unit = begin     # cursor into the unit

    while in_sentence < sentence_length and in_unit < unit_length:
        sentence_char = sentence[in_sentence]
        unit_char = unit[in_unit]

        if sentence_char == unit_char:
            # both sides agree: advance together
            in_sentence += 1
            in_unit += 1
            continue

        if is_whitespace(sentence_char):
            # extra whitespace on the sentence side
            in_sentence += 1
            continue

        if is_whitespace(unit_char):
            # extra whitespace on the unit side
            in_unit += 1
            continue

        raise Exception('lost alignment')

    return begin, in_unit


def _generate_units(reduction, config):
    """Split `reduction` into the units handed to the segmenter.

    Yields `(offset, unit)` pairs, where `offset` is the index in
    `reduction` at which `unit` starts.

    When `config.paragraph_mode` is false the whole reduction is yielded
    as a single unit at offset 0.  Otherwise a unit ends at every pair of
    consecutive newlines and at the last character of the text; whitespace
    at the start of a unit is skipped by advancing the offset, and units
    that are empty after stripping are not yielded.
    """
    if not config.paragraph_mode:
        yield 0, reduction
        return

    last = len(reduction) - 1
    offset = 0
    previous = None
    for i, char in enumerate(reduction):
        if i == offset and is_whitespace(char):
            # still in front of the unit: move its start past the whitespace
            offset += 1
        elif (previous == '\n' and char == '\n') or i == last:
            # Slice up to and including the current character.  At a
            # paragraph break the two newlines are removed again by
            # strip(); at the end of the text this keeps the final
            # characters, which a slice ending at i - 1 would cut off.
            unit = reduction[offset:i + 1].strip()
            if unit:
                yield offset, unit
            offset = i + 1
        previous = char


def _invoke(unit, config):
    """Run the external segmenter on `unit` and return its output lines.

    `config.segmenter` is a command line; it is split with shlex and run
    without a shell.  The unit is sent to the process as UTF-8 on stdin and
    its stdout is decoded as UTF-8 and split on newlines (so output ending
    in a newline produces a final empty string).

    Raises Exception when the segmenter exits with a non-zero status; the
    message carries whatever the segmenter wrote to stderr.
    """
    import sys  # local import: only used to relay segmenter diagnostics

    command = shlex.split(config.segmenter)
    # stderr has to be piped explicitly, otherwise communicate() always
    # returns None for it and failures cannot be reported
    process = Popen(command, stdin=PIPE, stdout=PIPE, stderr=PIPE)
    out, err = process.communicate(unit.encode('utf-8'))

    if process.returncode != 0:
        raise Exception('segmenter invocation error:\n' +
                        err.decode('utf-8', 'replace'))

    if err:
        # the run succeeded: keep warnings visible instead of dropping them
        getattr(sys.stderr, 'buffer', sys.stderr).write(err)

    return out.decode('utf-8').split('\n')


def _read_account(istream):
    """Read the next line from `istream` and return it stripped of
    surrounding whitespace."""
    return istream.readline().strip()


def _read_string(istream):
    """Read the next line from `istream` and return it stripped of
    surrounding whitespace."""
    raw_line = istream.readline()
    return raw_line.strip()


def process(path, pattern, config):
    """Segment the file at `path` and write the result to `path + '.deler'`.

    The file content is read in full; when `pattern` is not None it is
    first narrowed down with `extract_element(html, pattern)`.  Every item
    produced by `segment` is written on a line of its own.  When
    `config.accounting` is set, each item is followed by a second,
    tab-separated line holding its start, its end and the string form of
    each of its actions.
    """
    # read the input before creating the output, so that an unreadable
    # input does not leave an empty .deler file behind
    html = read(path)
    if pattern is None:
        post = html
    else:
        post = extract_element(html, pattern)

    ostream = open(path + '.deler', 'w')
    try:
        for start, end, account in segment(post, config):
            item = unicode(account)
            ostream.write(item)
            ostream.write('\n')
            if config.accounting:
                ostream.write(str(start))
                ostream.write('\t')
                ostream.write(str(end))
                for action in account.actions:
                    ostream.write('\t')
                    ostream.write(action.tostring())
                ostream.write('\n')
    finally:
        # close the output even when segmentation fails part-way through
        ostream.close()


def segment(raw, config):
    """Generate `(start, end, account)` triples for the sentences in `raw`.

    `raw` is wrapped with `config.reduction_type`; the text form of that
    reduction is cut into units, each unit is passed to the external
    segmenter, and every returned sentence is aligned with its unit.  The
    aligned range is converted with the reduction's `get_source_range`,
    the matching source text is fetched with `get_source` and wrapped with
    `config.account_type`.  Items whose text form is empty after stripping
    are not yielded.
    """
    reduction = config.reduction_type(raw, config)
    reduced_text = unicode(reduction)

    for unit_offset, unit in _generate_units(reduced_text, config):
        # end of the previous sentence in this unit; None before the first
        cursor = None
        for line in _invoke(unit, config):
            # position of the sentence within the unit
            begin, cursor = _align(line.strip(), unit, cursor)

            # the same range expressed in terms of the source
            source_start, source_end = reduction.get_source_range(
                unit_offset + begin, unit_offset + cursor)

            account = config.account_type(
                reduction.get_source(source_start, source_end), config)

            if unicode(account).strip():
                yield source_start, source_end, account


def validate(path):
    """Check an account file against the source it was produced from.

    The first line of the file at `path` names the source file, which is
    read in full.  The remaining lines are consumed in pairs: a
    tab-separated line with a start index, an inclusive end index and any
    number of serialized actions, followed by the recorded string.  The
    recorded string is reverted with those actions and compared with the
    corresponding slice of the source; every mismatch is reported on
    stdout, and a one-line verdict is printed at the end.
    """
    valid = True
    source = None
    source_path = None  # stays None when the account file is empty
    pre_string = None
    actions = None
    istream = open(path)
    try:
        for line in istream:
            if source is None:
                source_path = line.strip()
                source = read(source_path)

            elif actions is None:
                fields = line.strip().split('\t')
                start = int(fields[0])
                # the recorded end is inclusive, slicing needs an exclusive one
                end = int(fields[1]) + 1
                pre_string = source[start:end]
                actions = [Action.fromstring(field) for field in fields[2:]]
            else:
                post_string = line.strip()
                reverted = AccountedString.revert(post_string, actions)
                if reverted != pre_string:
                    valid = False
                    # single-argument print calls give the same output
                    # whether print is a statement or a function
                    print('INVALID ITEM')
                    print('[ORIGINAL]\t%s' % (pre_string,))
                    print('[RECORDED]\t%s' % (post_string,))
                    print('[REVERTED]\t%s' % (reverted,))
                    print('')
                actions = None
    finally:
        # release the file even when parsing or reverting raises
        istream.close()
    if valid:
        print('{0} account of {1} is valid'.format(path, source_path))
    else:
        print('{0} account of {1} is invalid'.format(path, source_path))


def main():
    """Command-line entry point: parse the arguments, build the
    configuration and segment every file that was named."""

    # (name, keyword arguments) for add_argument, in registration order
    argument_table = (
        ('files', dict(
            nargs='*',
            help='a list of files to segment')),
        ('--accounting', dict(
            action='store_true',
            help='output an account of modifications made to the original segment')),
        ('--config', dict(
            help='configuration xml')),
        ('--gml-mode', dict(
            action='store_true',
            help='output gml instead of the input markup')),
        ('--paragraph-mode', dict(
            action='store_true',
            help='force segmentation at double newlines')),
        ('--post_start', dict(
            help='regex to extract the tag that indicates the start of the post')),
        ('--segmenter', dict(
            help='path to segmenter executable')),
    )

    parser = ArgumentParser(description=__doc__)
    for name, options in argument_table:
        parser.add_argument(name, **options)
    arguments = parser.parse_args()

    config = Config(arguments.config, arguments.accounting,
                    arguments.gml_mode, arguments.paragraph_mode,
                    arguments.segmenter)

    # without --post_start the whole file content is segmented
    if arguments.post_start is None:
        pattern = None
    else:
        pattern = re.compile(arguments.post_start)

    for path in arguments.files:
        process(path, pattern, config)

# run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module
if __name__ == '__main__':
    main()