#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Split input files into sentences using an external segmenter.

Each input file is reduced, handed unit by unit to the configured segmenter
command, and the resulting sentences are written to "<path>.deler",
optionally followed by an account of the modifications made to each one.
"""

from argparse import ArgumentParser
import re
import shlex
from subprocess import Popen, PIPE
import sys

from accounting import AccountedString, Action
from common import extract_element, is_whitespace, open, read
from config import Config


def _align(sentence, unit, start_in_unit):
    """Locate `sentence` inside `unit`, tolerating whitespace differences.

    Args:
        sentence: the segmenter output to locate.
        unit: the text that was handed to the segmenter.
        start_in_unit: index in `unit` at which to start looking, or None to
            start at the beginning.

    Returns:
        A tuple (x, y): x is the index of the first non-whitespace character
        of `unit` at or after the start position, and y is the index just
        past the last character consumed while matching.

    Raises:
        Exception: if two non-whitespace characters differ ('lost alignment').
    """
    x = 0 if start_in_unit is None else start_in_unit
    # consume whitespace at the start in unit
    while x < len(unit) and is_whitespace(unit[x]):
        x += 1
    i = 0  # i tracks the current position in the sentence
    y = x  # y tracks the current position in the unit
    while i < len(sentence) and y < len(unit):
        if sentence[i] == unit[y]:
            # character is aligned---move on in sentence and unit
            i += 1
            y += 1
        elif is_whitespace(sentence[i]):
            # skip over whitespace in the sentence
            i += 1
        elif is_whitespace(unit[y]):
            # skip over whitespace in the unit
            y += 1
        else:
            raise Exception('lost alignment')
    # x and y now hold the start/end of the sentence in the unit
    return x, y


def _generate_units(reduction, config):
    """Yield (offset, unit) pairs to be passed to the segmenter.

    When `config.paragraph_mode` is false the whole of `reduction` is yielded
    as a single unit at offset 0.  Otherwise `reduction` is split at double
    newlines; each unit is stripped of surrounding whitespace and `offset` is
    the index in `reduction` of the unit's first character.  Empty units are
    not yielded.
    """
    if not config.paragraph_mode:
        yield 0, reduction
        return
    offset = 0
    previous = None
    last = len(reduction) - 1
    for i, char in enumerate(reduction):
        at_break = previous == '\n' and char == '\n'
        if i == offset and is_whitespace(char):
            # leading whitespace: move the start of the pending unit forward
            offset += 1
        elif at_break or i == last:
            # At a double newline the unit ends before the first of the two
            # newlines (index i - 1).  At the end of the input there is no
            # separator to drop, so the unit runs through the final
            # character; slicing to i - 1 there would cut off the tail.
            end = i - 1 if at_break else i + 1
            unit = reduction[offset:end].strip()
            if unit:
                yield offset, unit
            offset = i + 1
        previous = char


def _invoke(unit, config):
    """Run the configured segmenter on `unit` and return its output lines.

    `config.segmenter` is split with shlex and executed; `unit` is sent to
    its standard input encoded as UTF-8, and its standard output is decoded
    as UTF-8 and split on newlines.

    Raises:
        Exception: if the segmenter exits with a non-zero status.
    """
    shseq = shlex.split(config.segmenter)
    # stderr must be piped explicitly, otherwise communicate() always
    # returns None for it and failures go undetected
    process = Popen(shseq, stdin=PIPE, stdout=PIPE, stderr=PIPE)
    out, err = process.communicate(unit.encode('utf-8'))
    if process.returncode != 0:
        detail = err or 'exit status {0}'.format(process.returncode)
        raise Exception('segmenter invocation error:\n' + detail)
    if err:
        # keep diagnostics of a successful run visible to the user
        sys.stderr.write(err)
    return unicode(out, 'utf-8').split('\n')


# NOTE(review): _read_account and _read_string are not called anywhere in
# the visible code.
def _read_account(istream):
    """Read one line from `istream` and return it stripped."""
    line = istream.readline().strip()
    return line


def _read_string(istream):
    """Read one line from `istream` and return it stripped."""
    return istream.readline().strip()


def process(path, pattern, config):
    """Segment the file at `path` and write the result to `path` + '.deler'.

    If `pattern` is not None, only the element extracted from the file with
    `extract_element` is segmented.  Each segment is written on its own line;
    when `config.accounting` is set it is followed by a tab-separated line
    holding the start, the end and the recorded actions.
    """
    ostream = open(path + '.deler', 'w')
    try:
        html = read(path)
        if pattern is None:
            post = html
        else:
            post = extract_element(html, pattern)
        for start, end, account in segment(post, config):
            item = unicode(account)
            ostream.write(item)
            ostream.write('\n')
            if config.accounting:
                ostream.write(str(start))
                ostream.write('\t')
                ostream.write(str(end))
                for action in account.actions:
                    ostream.write('\t')
                    ostream.write(action.tostring())
                ostream.write('\n')
    finally:
        # close the output even if reading or segmentation fails
        ostream.close()


def segment(raw, config):
    """Yield (start, end, account) for every non-empty segment of `raw`.

    `raw` is wrapped with `config.reduction_type`; each unit of the reduced
    text is passed to the segmenter, every returned sentence is aligned back
    to the unit and mapped to a source range, and the source text of that
    range is wrapped with `config.account_type`.  Segments whose account is
    empty after stripping are skipped.
    """
    reduction = config.reduction_type(raw, config)
    for offset, unit in _generate_units(unicode(reduction), config):
        ry = None
        for sentence in _invoke(unit, config):
            sentence = sentence.strip()
            # get start/end in reduction
            rx, ry = _align(sentence, unit, ry)
            # get start/end in source
            sx, sy = reduction.get_source_range(offset + rx, offset + ry)
            # get source string
            source = reduction.get_source(sx, sy)
            # yield an accounted string
            account = config.account_type(source, config)
            if unicode(account).strip():
                yield sx, sy, account


def validate(path):
    """Check that every recorded item in `path` reverts to its source text.

    The first line of the file names the source file.  After that, lines are
    read in pairs: a tab-separated line with start, end and actions, then the
    recorded string.  A report is printed for each item whose reverted form
    differs from the source slice, followed by an overall verdict.

    NOTE(review): `process` writes the item text before the range line and
    writes no leading source path, so this layout does not match its output
    as visible here -- confirm which format is intended.
    """
    valid = True
    source = None
    source_path = None  # stays None if the file is empty
    actions = None
    pre_string = None
    istream = open(path)
    try:
        for line in istream:
            if source is None:
                source_path = line.strip()
                source = read(source_path)
            elif actions is None:
                fields = line.strip().split('\t')
                start = int(fields[0])
                # the recorded end is treated as inclusive here
                end = int(fields[1]) + 1
                pre_string = source[start:end]
                actions = [Action.fromstring(field) for field in fields[2:]]
            else:
                post_string = line.strip()
                reverted = AccountedString.revert(post_string, actions)
                if reverted != pre_string:
                    valid = False
                    print('INVALID ITEM')
                    print('[ORIGINAL]\t%s' % (pre_string,))
                    print('[RECORDED]\t%s' % (post_string,))
                    print('[REVERTED]\t%s' % (reverted,))
                    print('')
                actions = None
    finally:
        istream.close()
    if valid:
        print('{0} account of {1} is valid'.format(path, source_path))
    else:
        print('{0} account of {1} is invalid'.format(path, source_path))


def main():
    """Parse the command line and segment every file named on it."""
    argparser = ArgumentParser(description=__doc__)
    argparser.add_argument('files', nargs='*',
                           help='a list of files to segment')
    argparser.add_argument('--accounting', action='store_true',
                           help='output an account of modifications made to the original segment')
    argparser.add_argument('--config', help='configuration xml')
    argparser.add_argument('--gml-mode', action='store_true',
                           help='output gml instead of the input markup')
    argparser.add_argument('--paragraph-mode', action='store_true',
                           help='force segmentation at double newlines')
    # --post-start is an alias matching the other hyphenated options; the
    # original --post_start spelling keeps working
    argparser.add_argument('--post_start', '--post-start', dest='post_start',
                           help='regex to extract the tag that indicates the start of the post')
    argparser.add_argument('--segmenter', help='path to segmenter executable')
    args = argparser.parse_args()
    config = Config(args.config, args.accounting, args.gml_mode,
                    args.paragraph_mode, args.segmenter)
    pattern = None
    if args.post_start is not None:
        pattern = re.compile(args.post_start)
    for path in args.files:
        process(path, pattern, config)


if __name__ == '__main__':
    main()