'''
The script performs both packing and unpacking.

pack:   concatenate every ``.txt`` document of a task (for all splits) into
        one ``<task>.txt`` file in the working directory, terminating each
        document with a "Document <id> ends." separator.
unpack: split a parsed, CoNLL-U formatted version of such a packed file back
        into one file per original document, using the separators to find
        the document boundaries.
'''
import sys
from argparse import ArgumentParser
import pickle
from os import walk, path, makedirs
import re

SPLITS = ['development', 'evaluation', 'training']
TASKS = ['events', 'opinion', 'negation']

# Directory searched for the key pickle files when they are not found in the
# working directory.
FALLBACK_DIR = '/projects/ltg/epe/2018/real/'

# CoNLL-U column names
ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC = range(10)

# Number of documents each task is expected to contain, per split.
EXPECTED_FILE_COUNTS = {
    'events': {'training': 800, 'development': 150, 'evaluation': 260},
    'opinion': {'training': 449, 'development': 90, 'evaluation': 148},
    'negation': {'training': 1, 'development': 1, 'evaluation': 1},
}

# A token that is a (possibly truncated) prefix of "Document".
_DOC_REGEX = re.compile(r'^D(?:o(c(u(m(e(n(t)?)?)?)?)?)?)?$')
# The concatenated tokens of a complete "Document <id> ends." separator.
_DELIMITER_REGEX = re.compile(r"^(Document)(\d+)(ends\.)$")
# A plain token ID.
_NUM_REGEX = re.compile(r'^\d+$')
# A multiword token range ("1-2") or empty node ("1.1") ID.
_ID_RANGE_REGEX = re.compile(r'^\d+[\.|-]+\d+$')


def _load_key_pickle(pickle_name, label, fallback_dir):
    '''
    Load a key pickle from the working directory or from ``fallback_dir``.

    pickle_name:  file name of the pickle, e.g. "events_fk.p"
    label:        description used in the fallback message
                  ("filename-key" or "key-filename")
    fallback_dir: directory tried when the working directory has no such file

    Returns the unpickled object. Prints a message and exits with status 1
    when the file can be opened in neither location.

    NOTE: pickle can execute arbitrary code while loading; only point this
    at key files that come from a trusted source.
    '''
    try:
        with open(pickle_name, "rb") as handle:
            return pickle.load(handle)
    except IOError:
        pf_path = path.join(fallback_dir, pickle_name)
        print("Did not find %s %s pickle files in the working dir, falling back to %s"
              % (pickle_name, label, pf_path))
    try:
        with open(pf_path, "rb") as handle:
            return pickle.load(handle)
    except IOError:
        print("Did not find %s neither in the working directory nor under %s"
              % (pickle_name, pf_path))
        sys.exit(1)


def check_file_count(task, file_counts, fallback_dir=FALLBACK_DIR):
    '''
    Report, per split, whether the number of unpacked files is as expected.

    task:         one of the keys of EXPECTED_FILE_COUNTS; any other value
                  makes the function return without printing anything
    file_counts:  dict mapping split name to the list of unpacked file names
    fallback_dir: where to look for the filename-key pickle when it is not in
                  the working directory (only needed for the opinion task)

    For the opinion task a mismatch additionally lists the names of the files
    that are present in the filename-key pickle but were not unpacked.
    '''
    expected = EXPECTED_FILE_COUNTS.get(task)
    if expected is None:
        return

    filename_key = None
    for split, names in file_counts.items():
        if len(names) == expected[split]:
            print('''Successfully unpacked %d files in the %s split of %s'''
                  % (len(names), split, task))
            continue

        if task != "opinion":
            print('''The total number of parsed files in %s doesn't match the original number of files''' % (split))
            continue

        print('''The total number of parsed files %d in %s doesn't match the original number of files %d'''
              % (len(names), split, expected[split]))
        # Load the key file lazily and only once, however many splits differ.
        if filename_key is None:
            filename_key = _load_key_pickle(task + "_fk.p", "filename-key",
                                            fallback_dir)
        unpacked = set(names)
        for filename in filename_key:
            parts = filename.split('/')
            if parts[0] == split and parts[-1] not in unpacked:
                print(parts[-1])


def pack(epe_path, task, fallback_dir=FALLBACK_DIR):
    '''
    Concatenate all ``.txt`` files of ``task`` into ``<task>.txt``.

    epe_path:     path to the EPE directory; documents are read from
                  <epe_path>/<task>/<split>/ for every split in SPLITS
    task:         name of the task to pack
    fallback_dir: where to look for "<task>_fk.p" when it is not in the
                  working directory

    The output file is written to the working directory. Each document is
    followed by a "Document <id> ends." separator, where <id> is looked up in
    the filename-key pickle under "<split>/<file name>".
    '''
    filename_key = _load_key_pickle(task + "_fk.p", "filename-key",
                                    fallback_dir)

    with open(task + '.txt', 'w') as outfile:
        for split in SPLITS:
            for root, dirs, files in walk(path.join(epe_path, task, split)):
                # Sort in place so that the packed file is reproducible.
                dirs.sort()
                for filename in sorted(files):
                    if not filename.endswith(".txt"):
                        continue
                    p_filename = split + "/" + filename
                    separator = "\n\n\nDocument " + filename_key[p_filename] + " ends.\n\n\n"
                    with open(path.join(root, filename)) as infile:
                        for line in infile:
                            outfile.write(line)
                    outfile.write(separator)
    print("Finished packing files for %s" % (task))


def unpack(infile, task, outpath, fallback_dir=FALLBACK_DIR):
    '''
    Split a parsed (CoNLL-U) packed file into one file per document.

    infile:       path to the parsed file; its extension is reused for the
                  files that are written
    task:         name of the task the file belongs to
    outpath:      output directory; files go to <outpath>/<task>/<split>/
    fallback_dir: where to look for "<task>_kf.p" when it is not in the
                  working directory

    Document boundaries are found by concatenating the FORM column of
    consecutive tokens, starting at a token that is a prefix of "Document",
    until the result reads "Document<id>ends.". Exits with status 1 when a
    document ID is not in the key-filename pickle. Finishes by calling
    check_file_count() on the unpacked files.
    '''
    file_counts = {}
    extension = path.splitext(infile)[1]
    for split in SPLITS:
        split_dir = path.join(outpath, task, split)
        if not path.exists(split_dir):
            makedirs(split_dir)
        file_counts[split] = []

    key_filename = _load_key_pickle(task + "_kf.p", "key-filename",
                                    fallback_dir)

    delimiter_line_num = 0   # lines consumed by the delimiter being built
    delimiter_str = ""       # concatenated FORMs of the delimiter tokens
    tmp_buffer = []          # lines read since the last document was written
    warned_about_ranges = False
    with open(infile, 'r') as parsed:
        print("Validating the %s file: %s" % (task, infile.split('/')[-1]))
        for line in parsed:
            # add line to temporary buffer
            tmp_buffer.append(line)
            columns = line.split('\t')

            if not warned_about_ranges and _ID_RANGE_REGEX.search(columns[ID]):
                print("It looks like the parser output includes multiword token ID ranges and/or empty nodes")
                warned_about_ranges = True

            is_token = (len(line) >= 2
                        and len(columns) > FORM
                        and _NUM_REGEX.search(columns[ID]) is not None)
            if not is_token:
                # blank, comment, range or malformed lines inside a delimiter
                # under construction still belong to the delimiter
                if delimiter_str:
                    delimiter_line_num += 1
                continue

            form = columns[FORM]
            if _DOC_REGEX.search(form):
                # the current token starts with D[ocument]: restart
                delimiter_str = form
                delimiter_line_num = 1
            elif delimiter_str:
                delimiter_str += form
                delimiter_line_num += 1

            match = _DELIMITER_REGEX.search(delimiter_str)
            if match is None:
                continue

            num_id = match.group(2)
            if num_id not in key_filename:
                # This should NOT happen!
                print("File ID %s was not found" % (num_id))
                sys.exit(1)

            p_filename = key_filename[num_id]
            key_parts = p_filename.split('/')
            split = key_parts[0]
            stem = path.splitext(key_parts[-1])[0]
            file_counts[split].append(stem + ".txt")

            # drop the blank line left over from the previous delimiter and
            # the lines of the delimiter that has just been completed
            start = 1 if len(tmp_buffer[0]) < 2 else 0
            lines = "".join(tmp_buffer[start:-delimiter_line_num])

            # write out the lines to a file
            with open(path.join(outpath, task, split, stem + extension), 'w') as outfile:
                outfile.write(lines)

            delimiter_line_num = 0
            tmp_buffer = []
            delimiter_str = ""

    check_file_count(task, file_counts, fallback_dir)


def main():
    '''
    Command line entry point: parse the arguments and run pack or unpack.
    '''
    argparser = ArgumentParser(description=__doc__)
    argparser.add_argument('-mode', choices=['pack', 'unpack'], required=True,
                           help='''Script mode: pack multiple files into one,
                           or unpack one file to multiple ones''')
    argparser.add_argument('--path', default="../",
                           help='''Path to EPE directory. Default value
                           assumes running in EPE's bin directory''')
    argparser.add_argument('--task', choices=TASKS, default=None,
                           help='''Specifies the task for which to run the
                           script. If no value is passed, the files of all
                           the tasks are packed. Required in unpack mode''')
    argparser.add_argument('--outpath', default=None,
                           help='''Path to unpack files. Required in unpack
                           mode''')
    argparser.add_argument('--infile', default=None,
                           help='''Full path to file to unpack. Required in
                           unpack mode''')
    argparser.add_argument('--keydir', default=FALLBACK_DIR,
                           help='''Directory searched for the key pickle
                           files when they are not in the working
                           directory''')
    args = argparser.parse_args(sys.argv[1:])

    if args.mode == 'pack':
        tasks = [args.task] if args.task else TASKS
        for task in tasks:
            pack(args.path, task, args.keydir)
        return

    if not (args.infile and args.task and args.outpath):
        argparser.error('''unpack mode requires --infile, --outpath and --task. See pack.py -h for more''')
    unpack(args.infile, args.task, args.outpath, args.keydir)


if __name__ == '__main__':
    main()