"""Generate filename <-> key lookup dictionaries for the EPE task data.

For every task, the ``.txt`` files found under each split directory are
assigned a prefixed key made of:

    task ID + split ID + file counter      (for example: 003001213)

The two resulting mappings (file name -> key and key -> file name) are
saved as pickle files in the current working directory.
"""

import sys
from argparse import ArgumentParser
import pickle
from os import walk, path

SPLITS = ['development', 'evaluation', 'training']
TASK_IDS = {'opinion': '001', 'negation': '002', 'events': '003'}
SPLIT_IDS = {'training': '001', 'development': '002', 'evaluation': '003'}


def _dump_pickle(obj, filename):
    """Pickle *obj* into *filename*, closing the file handle afterwards."""
    with open(filename, "wb") as handle:
        pickle.dump(obj, handle)


def create_dict(epe_path, task):
    """Build and save the filename/key dictionaries for one task.

    Walks ``<epe_path>/<task>/<split>`` for every split in ``SPLITS`` and
    assigns each ``.txt`` file a key of the form
    ``TASK_IDS[task] + SPLIT_IDS[split] + counter``.

    If at least one ``.txt`` file was found, two pickle files are written
    to the current working directory:

    * ``<task>_fk.p`` -- maps ``"<split>/<filename>"`` to its key
      (used for packing);
    * ``<task>_kf.p`` -- maps each key back to ``"<split>/<filename>"``
      (used for unpacking).

    Nothing is written when no ``.txt`` file is found (this includes the
    case where the task directory does not exist, since ``os.walk`` then
    yields nothing).

    Args:
        epe_path: Path to the EPE directory that contains the task folders.
        task: Task name; must be a key of ``TASK_IDS``.

    Raises:
        KeyError: If a ``.txt`` file is found for a *task* that is not a
            key of ``TASK_IDS``.
    """
    # map file names to keys (used for packing)
    filename_key = {}
    # map keys to file names (used for unpacking)
    key_filename = {}

    for split in SPLITS:
        # Running file counter for this split.  os.walk() yields one
        # ``files`` list per directory, so restarting the counter for every
        # directory would hand the same key to files living in different
        # sub-directories of the same split.  Carrying the offset across
        # directories keeps every key unique, while a flat split directory
        # still gets exactly the per-position counters it always had.
        offset = 0
        for _root, _dirs, files in walk(path.join(epe_path, task, split)):
            for ix, filename in enumerate(files, start=offset):
                if not filename.endswith(".txt"):
                    continue
                # must concatenate filenames with splits because some
                # filenames are repeated across splits, e.g. raw.txt can be
                # found under training and evaluation
                # NOTE(review): files with the same name in different
                # sub-directories of one split still share this name, so the
                # later one wins in filename_key -- confirm whether nested
                # directories are expected in the EPE layout.
                filename_s = split + "/" + filename
                # create prefixed key: task ID + split ID + file counter
                prefixed_key = TASK_IDS[task] + SPLIT_IDS[split] + str(ix)
                filename_key[filename_s] = prefixed_key
                key_filename[prefixed_key] = filename_s
            offset += len(files)

    if filename_key:
        _dump_pickle(filename_key, task + "_fk.p")
        _dump_pickle(key_filename, task + "_kf.p")


def main():
    """Parse the command line and create the dictionaries for the task(s)."""
    tasks = ['events', 'opinion', 'negation']

    argparser = ArgumentParser(description=__doc__)
    argparser.add_argument(
        '--path', default="../",
        help="Path to EPE directory. Default value assumes running in "
             "EPE's bin directory")
    argparser.add_argument(
        '--task', default=None, choices=tasks,
        help="Specifies the task for which to generate key dict. If no "
             "value is passed, dicts are created for all tasks")
    args = argparser.parse_args(sys.argv[1:])

    if args.task:
        create_dict(args.path, args.task)
    else:
        for task in tasks:
            create_dict(args.path, task)


if __name__ == '__main__':
    main()