"""Sanity-check parsed files against the EPE tokenization.

For every task and split, the token count of each EPE ``.tt`` file is
compared with the token count of the ``.conllu`` file of the same name
under the parsed directory, and files whose counts diverge are reported.
"""
import sys
from argparse import ArgumentParser
import pickle
from os import walk, path, makedirs

SPLITS = ['development', 'evaluation', 'training']
TASKS = ['events', 'opinion', 'negation']


def _collect_stats(base_path, task, extension, n_columns):
    """Count sentences and tokens per file below ``base_path/task/<split>``.

    Args:
        base_path: Root directory containing one sub-directory per task.
        task: Name of the task sub-directory to scan.
        extension: Only files whose name ends with this suffix are read.
        n_columns: A line counts as a token when it has exactly this many
            tab-separated fields.

    Returns:
        A dict mapping ``'<split>/<file name without its last extension>'``
        to ``{'sents': int, 'tokens': int}``.
    """
    stats = {}
    for split in SPLITS:
        # os.walk silently yields nothing for a directory that does not
        # exist, so a missing split simply contributes no entries.
        for root, _dirs, files in walk(path.join(base_path, task, split)):
            for filename in files:
                if not filename.endswith(extension):
                    continue
                f_name = split + '/' + '.'.join(filename.split('.')[:-1])
                counts = {'sents': 0, 'tokens': 0}
                with open(path.join(root, filename), 'r') as infile:
                    for line in infile:
                        if len(line) < 2:
                            # An (almost) empty line ends a sentence.
                            counts['sents'] += 1
                        elif len(line.split('\t')) == n_columns:
                            counts['tokens'] += 1
                stats[f_name] = counts
    return stats


def validate(epe_path, parsed_path, task):
    """Report files of ``task`` whose parsed token count looks wrong.

    The reference point is the EPE tokenized files (``.tt``, 5 tab-separated
    fields per token line); they are compared with the parsed ``.conllu``
    files (10 tab-separated fields per token line). Sentence counts are
    collected as well but only token counts are compared.

    A line is printed for every EPE file that has no parsed counterpart and
    for every file whose parsed token count is not strictly within 70%-130%
    of the EPE token count. Identical counts are never reported.

    Args:
        epe_path: Path to the EPE directory.
        parsed_path: Path to the directory holding the parsed files.
        task: Task sub-directory to validate.
    """
    epe_stats = _collect_stats(epe_path, task, ".tt", 5)
    parsed_stats = _collect_stats(parsed_path, task, ".conllu", 10)

    for filename, epe_counts in epe_stats.items():
        parsed_counts = parsed_stats.get(filename)
        if parsed_counts is None:
            # Previously this case crashed with a KeyError.
            print("Missing parsed file for %s/%s" % (task, filename))
            continue

        expected = epe_counts['tokens']
        actual = parsed_counts['tokens']
        if actual == expected:
            # Covers 0 == 0, which the strict range check below would
            # otherwise flag as a difference.
            continue
        if not (0.7 * expected < actual < 1.3 * expected):
            print("Different token count in %s/%s, Original: %d Parsed: %d"
                  % (task, filename, expected, actual))


def main():
    """Parse the command line and validate one task or all of them."""
    argparser = ArgumentParser(description=__doc__)
    argparser.add_argument('-epe', default="../",
                           help="Path to EPE directory. Default value "
                                "assumes running in EPE's bin directory")
    # Without a value, path.join would fail on None deep inside validate;
    # let argparse reject the invocation with a usage message instead.
    argparser.add_argument('-parsed', default=None, required=True,
                           help="Path to the directory with the parsed "
                                "files.")
    argparser.add_argument('--task', choices=TASKS, default=None,
                           help="Specifies the task for which to run the "
                                "script. If no value is passed, the files "
                                "of all the tasks are validated.")
    args = argparser.parse_args(sys.argv[1:])

    tasks = [args.task] if args.task else TASKS
    for task in tasks:
        validate(args.epe, args.parsed, task)


if __name__ == '__main__':
    main()