#!/usr/local/bin/python # -*- coding: utf-8 -*- import glob, os, gzip, re, sys, string, math, argparse, itertools import codecs from collections import defaultdict #from collections import OrderedDict from nltk.tree import * from nltk.util import * reload(sys) sys.setdefaultencoding('utf8') ################################## # USAGE #python converter_v1.py --data --grammar --dt --dm --tex --log # The difference from converter.py is that we do not change derivation tree (do not cut on first underscore etc.) when converting to CFG. # These lines make the difference: # # cfg_tree = cfg_tree + rule_line_full # ################################## #========================================# #FUNCTIONS COMMON FOR PTB AND ERG TOKENIZATION# #========================================# class Converter: #this global variable was used to count the number of sentences that have ". ..." or ". '" at the end. def __init__(self): self.t_cfg_traversed = "" self.cfg_no_unary_rules = 0 def run(self): latex_doc_start = '%%!TEX encoding = UTF-8 Unicode\n\ \documentclass[8pt,landscape]{a0poster}\n\ \setlength\oddsidemargin{15mm}\n\ \setlength\evensidemargin{15mm}\n\ \setlength\\topmargin{15mm}\n\ \setlength\headsep{15mm}\n\ \setlength\headheight{0mm}\n\ \let\openright=\clearpage\n\ \usepackage[english]{babel}\n\ \usepackage[T1]{fontenc}\n\ \usepackage[T2A]{fontenc}\n\ \usepackage{indentfirst}\n\ \usepackage[utf8x]{inputenc}\n\ \usepackage{textcomp}\n\ \usepackage[titletoc]{appendix}\n\ \usepackage{graphicx}\n\ \usepackage{amsthm}\n\ \usepackage{multirow}\n\ \\renewcommand{\multirowsetup}{\centering}\n\ \usepackage{longtable}\n\ \usepackage[hyphens]{url}\n\ \usepackage{titletoc}\n\ \usepackage[unicode]{hyperref}\n\ \usepackage{subfigure}\n\ \usepackage{array}\n\ \usepackage[intoc]{nomencl}\n\ \usepackage{paralist}\n\ \usepackage{amsmath}\n\ \usepackage[round]{natbib}\n\ \\renewcommand*\\thesection{\\arabic{section}}\n\ \\renewcommand{\\nomname}{List of Abbreviations}\n\ \makenomenclature\n\ \hypersetup{ colorlinks, citecolor=black, filecolor=black, linkcolor=black, urlcolor=black}\n\ \\renewcommand*\\thesubsection{\\arabic{subsection}}\n\ \\renewcomand{\\figurename}{Figure}\n\ \hypersetup{pdfauthor=Angelina Ivanova}\n\ \makeatletter\n\ \def\@makechapterhead#1{\n\ {\parindent \z@ \\raggedright \\normalfont\n\ \Huge\\bfseries \\thechapter. #1\n\ \par\\nobreak\n\ \\vskip 20\p@\n\ }}\n\ \def\@makeschapterhead#1{\n\ {\parindent \z@ \\raggedright \\normalfont\n\ \Huge\\bfseries #1\n\ \par\\nobreak\n\ \\vskip 20\p@\n\ }}\n\ \makeatother\n\ %macros for dependency graphs\n\ \usepackage{tikz}\n\ \usepackage{tikz-dependency}\n\ \\begin{document}\n\ \lefthyphenmin=2\n\ \\righthyphenmin=2\n\ \pagenumbering{arabic}\n\ \setcounter{page}{1}\n\ \\newpage\n' parser = argparse.ArgumentParser() parser.add_argument('--data', required = True, help = 'Existing folder where the original files that should be parsed are located') parser.add_argument('--grammar', required = True, help = 'Existing folder where the grammar is located. 
                            For the conversion only dm.cfg and rules.hds are required')
        parser.add_argument('--dt', help='Existing folder where the converted files will be printed out in the dependency format DELPH-IN Syntactic Derivation Tree (DT)')
        parser.add_argument('--dm', help='Existing folder where the converted files will be printed out in the dependency format DELPH-IN MRS-Derived Dependencies (DM)')
        parser.add_argument('--dtm', help='Existing folder where the converted files will be printed out in the DT and DM formats combined in a CoNLL08 file.')
        parser.add_argument('--cfg', help='Existing folder where the converted files will be printed out in the form of phrase-structured trees (context-free grammar).')
        parser.add_argument('--cfg_no_unary_rules', choices=['true', 'false'], default='false', help='Whether to collapse unary rules in the CFG tree or not.')
        parser.add_argument('--dt_yy', help='Existing folder where the converted files will be printed out in the form of DT annotations in yy output format (available only for PTB tokenization).')
        parser.add_argument('--sent_tok', help='Existing folder where the file with tokenized sentences (one sentence per line) will be created.')
        parser.add_argument('--tex', help='Existing folder where the TEX file illustrating the result of conversion will be printed out')
        parser.add_argument('--tok', choices=['ptb', 'erg'], default='erg', help='Tokenization style: PTB or ERG')
        parser.add_argument('--log', help='Information about files that were not processed will be written to the log file')
        args = parser.parse_args()
        args_dict = vars(args)
        # flag whether to print out the format specified by the key
        output_flags = {}
        # It is not possible to produce CFG trees with PTB tokenization
        if args_dict['tok'] == 'ptb' and args_dict['cfg'] is not None:
            print('Error! It is not possible to extract phrase-structured trees with PTB-style tokenization. Either choose ERG-style tokenization, or do not specify "--cfg" option.')
            sys.exit(1)
        if args_dict['cfg'] is None and args_dict['cfg_no_unary_rules'] == "true":
            print('Error! Option --cfg_no_unary_rules can only be used together with option --cfg')
            sys.exit(1)
        # We currently only provide yy output for DT with PTB tokenization
        if args_dict['tok'] == 'erg' and args_dict['dt_yy'] is not None:
            print('Error! The yy output format is not supported for ERG tokenization. 
Either choose PTB-style tokenization, or do not specify "--dt_yy" option.') sys.exit(1) #check if the paths of mandatory paths exists and that at least one output is provided if os.path.exists(args_dict['data']) and os.path.exists(args_dict['grammar']): #check if we have at least one output specified for key in ['dt', 'dm', 'dtm', 'cfg', 'sent_tok','tex', 'dt_yy']: #by default we think the format was not specified by the user output_flags[key] = 0 #However if it was specified if args_dict[key] is not None: # and the path exists if os.path.exists(args_dict[key]): #then flag turns 1 output_flags[key] = 1 #if at least one output format was specified if not (output_flags['dt']==0 and output_flags['dm']==0 and output_flags['dtm']==0 and output_flags['cfg']==0 and output_flags['sent_tok']==0 and output_flags['tex']==0 and output_flags['dt_yy'] == 0): #Add slash to the end of the folder name if is missing for key in ['data', 'grammar', 'dt', 'dm', 'dtm', 'cfg', 'sent_tok','tex', 'dt_yy']: if args_dict[key] is not None: args_dict[key] = self.addSlashToFolderPath(args_dict[key]) rules_file = args_dict["grammar"] + "etc/rules.hds" rule_head_dict = self.read_rule_head_file(rules_file) relations_file = args_dict["grammar"] + "etc/dm.cfg" relations_collection = self.read_relations_file(relations_file) #print "relations collection" #pprint(relations_collection) #By default our output file handles for each format are None objects. We have to initiate and #pass as parameters file handles (not closed files) because we loop over input files and print out results to the #output files on the fly (otherwise we need to keep too much info in memory) dt_output_fh = None dm_output_fh = None dtm_output_fh = None cfg_output_fh = None sent_tok_output_fh = None latex_output_fh = None dt_yy_output_fh = None log_fh = None #These are counters for output latex files. If we write everything to one Latex file, we exceed TEX memory capacity current_latex_file_index = 1 file_index = 0 fhdl_dict = {} if args_dict['log'] is not None: try: fhdl_dict['log'] = codecs.open(args_dict['log'], 'w') except IOError: print('Error! Cannot open the log file for writing!') sys.exit(1) if output_flags['dt']: #DT output file handle fhdl_dict['dt'] = codecs.open(args_dict['dt'] + os.path.basename(os.path.normpath(args_dict['data'])) + "." + args_dict['tok'] + '.dt', 'w') if output_flags['dm']: #DM output file handle fhdl_dict['dm'] = codecs.open(args_dict['dm'] + os.path.basename(os.path.normpath(args_dict['data'])) + "." + args_dict['tok'] + '.dm', 'w') if output_flags['dtm']: #DT&DM output file handle fhdl_dict['dtm'] = codecs.open(args_dict['dtm'] + os.path.basename(os.path.normpath(args_dict['data'])) + "." + args_dict['tok'] + '.dtm', 'w', encoding = 'utf-8') if output_flags['cfg']: #phrase-structured trees extracted from the derivation tree fhdl_dict['cfg'] = codecs.open(args_dict['cfg'] + "cfg_" + os.path.basename(os.path.normpath(args_dict['data'])) + '.txt', 'w') if output_flags['sent_tok']: #tokenized normalized sentence (normalized means words in multiword expressions are joined with "+", some quotes are normalized, brackets are replaced with -LRB- and -RRB-) fhdl_dict['sent_tok'] = codecs.open(args_dict['sent_tok'] + "sent_" +os.path.basename(os.path.normpath(args_dict['data'])) + "_" + args_dict['tok'] + "_tok" + '.txt', 'w') if output_flags['tex']: fhdl_dict['tex'] = codecs.open(args_dict['tex'] + os.path.basename(os.path.normpath(args_dict['data'])) + "." 
+ args_dict['tok'] + "_tok_" + str(current_latex_file_index) + '.tex', 'w') fhdl_dict['tex'].write(latex_doc_start) if output_flags['dt_yy']: #tokenized normalized sentence (normalized means words in multiword expressions are joined with "+", some quotes are normalized, brackets are replaced with -LRB- and -RRB-) fhdl_dict['dt_yy'] = codecs.open(args_dict['dt_yy'] + os.path.basename(os.path.normpath(args_dict['data'])) + "_" + args_dict['tok'] + "_tok" + '.yy', 'w', encoding = 'utf-8') file_list_sorted = [] for fi in os.listdir(args_dict['data']): if fi.endswith(".gz"): file_list_sorted.append(int(re.sub(r'\.gz', '', fi))) file_list_sorted.sort() if args_dict['cfg_no_unary_rules'] is not None: if args_dict['cfg_no_unary_rules'] == "true": self.cfg_no_unary_rules = 1 #THIS IS THE MAIN LOOP: WE LOOP OVER ALL THE FILES THAT HAVE TO BE CONVERTED for fi in file_list_sorted: self.t_cfg_traversed = "" if args_dict['tok'] == 'ptb': try: (file_index, current_latex_file_index, fhdl_dict) = \ self.analyze_input_file_ptb_tok(fi, file_index, current_latex_file_index, latex_doc_start, fhdl_dict, args_dict, output_flags,rule_head_dict, relations_collection) except ValueError, e: print(str(e)) if fhdl_dict['log'] is not None: fhdl_dict['log'].write(str(fi) + "\t" + str(e)+"\n") continue elif args_dict['tok'] == 'erg': try: (file_index, current_latex_file_index, fhdl_dict, eds_dep_labels_dict, words_correct_case_dict) = \ self.analyze_input_file_erg_tok(fi, file_index, current_latex_file_index, latex_doc_start, fhdl_dict, args_dict, output_flags, rule_head_dict, relations_collection, {}) except ValueError, e: print(str(e)) if fhdl_dict['log'] is not None: fhdl_dict['log'].write(str(fi) + "\t" + str(e)+"\n") continue if output_flags['tex']: fhdl_dict['tex'].write('\end{document}') fhdl_dict['tex'].close() else: print "Error! Please specify correct existing path to at least one output format. See usage below!" parser.print_help() sys.exit(1) else: print "Error! Incorrect command line arguments provided. See usage below!" parser.print_help() sys.exit(1) def addSlashToFolderPath(self, somepath): if not somepath.endswith("/"): somepath += "/" return somepath def remove_fragment_symbol_eps(self, eps): m = re.match("^\s*\|(.*)", eps) if m: eps = m.group(1) return eps # FUNCTION READ_EDS_INTO_DICT(self, eds, relations_collection) def read_eds_into_dict(self, eds, relations_collection): eds = eds.strip() eds_dict = {} dict_id_pos = {} transparent_dict = {} eds_relation_dict = {} eds_root = '' eds = eds[1:] #cut { eds = eds[:-1] #cut } #print '\n\n EDS\n' + eds + '\n\n' eps = eds.split('\n') #Now split the first line by ':' # e.g. eps[0] = "e3: (fragmented)" if eps[0].find(':')!=-1: # Removes initial "|" if for example we have: #|_34:udef_q<228:244>[BV e193] eps[0] = self.remove_fragment_symbol_eps(eps[0]) eps_first_line_as_list = eps[0].split(':') #root is the first label (e.g. 
'e3' in our example) eds_root = eps_first_line_as_list[0] for i in range(1,len(eps)): if eps[i]!='': # Removes initial "|" if for example we have: #|_34:udef_q<228:244>[BV e193] eps[i] = self.remove_fragment_symbol_eps(eps[i]) #Example: eps[i] = "e2:unknown<9:23>[ARG x4]" eps_parts = eps[i].split(':',1) #Example: dict_key = "e2" dict_key = eps_parts[0] tokid = '' dep_line = eps_parts[1] rel = '' if '<' in eps_parts[1]: #Example: eps_parts[1] = "unknown<9:23>[ARG x4]" eps_parts2 = eps_parts[1].split('<', 1) #Example: rel = "unknown" rel = eps_parts2[0] #Example: eps_parts2[1] = "9:23>[ARG x4]" eps_parts3 = eps_parts2[1].split('>', 1) #Example: tokid = "<9:23>" tokid = '<' + eps_parts3[0] + '>' #Example: dep_line = "[L-INDEX x54, R-INDEX x59]" or dep_line = "[]" or dep_line = '("Lane")[]' dep_line = eps_parts3[1].strip() elif '(' in eps_parts[1]: #suppose we have smth like x25:yofc("2004")[] (not sure, it exists) eps_parts2 = eps_parts[1].split('(', 1) #Example: rel = "unknown" rel = eps_parts2[0] elif '[' in eps_parts[1]: #e.g. e34:parg_d[ARG1 e2] eps_parts2 = eps_parts[1].split('[', 1) #Example: rel = "unknown" rel = eps_parts2[0] eds_relation_dict[dict_key] = rel labels_dep_dict = {} #a line with arguments, such as [ARG1 e3, ARG2 e99] p_arg = re.compile('\[[^\]]+\]') m_arg = p_arg.search(dep_line) if m_arg: dep_line = m_arg.group() #cut '[' dep_line = dep_line[1:] #cut ']' dep_line = dep_line[:-1] dep_list = [dep_line] if ',' in dep_line: #e.g. "ARG1 e3, ARG2 e99" dep_list = dep_line.split(', ') for dep in dep_list: dep_parts = dep.split(' ') label = dep_parts[0] #e.g. ARG1 dependent = dep_parts[1] #e.g. e3 labels_dep_dict[label] = dependent dict_value = [] # Example: # relations_collection['relational'][2] = "(_c$|^appos|^compound|^implicit_conj|^loc_nonsp|^loc_sp|^measure|^nonsp|^of_p|^part_of|^poss|^subord)" # rel = "implicit_conj" m_redundant = re.search(relations_collection['redundant'][2], rel) m_transparent = re.search(relations_collection['transparent'][2], rel) m_relational = re.search(relations_collection['relational'][2], rel) m_lexical = re.search(relations_collection['lexical'][2], rel) if m_redundant: #print '\n' + eps[i] + ' is a redundant relation type\n' labels_from_redundant_dict = self.get_labels_from_typed_dict('redundant', relations_collection, rel) for labels_list in labels_from_redundant_dict: if labels_list[0] in labels_dep_dict and labels_list[1] in labels_dep_dict: #if labels_dep_dict[labels_list[0]] == labels_dep_dict[labels_list[1]]: del labels_dep_dict[labels_list[1]] if m_transparent: #Example: dict_id_pos["e2"] = "<9:23>" dict_id_pos[dict_key] = tokid #print '\n' + eps[i] + ' is a transparent relation type\n' labels_from_transparent_dict = self.get_labels_from_typed_dict('transparent', relations_collection, rel) for lab in labels_from_transparent_dict: #TO avoid "key" equal to "value" in transparent dict (e.g. 
transparent_dict['i82']='i82') that occur in cyclic graphs # and cause infinite loops, we check that dict_key != labels_dep_dict[lab] if lab in labels_dep_dict and dict_key != labels_dep_dict[lab]: transparent_dict[dict_key] = labels_dep_dict[lab] #print 'transparent_dict' #pprint(transparent_dict) break if m_relational: #Example: dict_id_pos["e2"] = "<9:23>" dict_id_pos[dict_key] = tokid #print '\n' + eps[i] + ' is a relational relation type\n' labels_from_relational_dict = self.get_labels_from_typed_dict('relational', relations_collection, rel) for labels_list in labels_from_relational_dict: lbl1 = labels_list[0] lbl2 = labels_list[1] # if we want to rename the dependency relation, there is a label for a new name for it: # comp ARG0 ARG2 ref if len(labels_list) == 3: new_rel_name = labels_list[2] # with "rel = new_rel_name" I get encoding error: #UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 4: ordinal not in range(128) #rel = new_rel_name rel = new_rel_name.encode('utf-8') if lbl1 in labels_dep_dict and lbl2 in labels_dep_dict: dpdt1 = labels_dep_dict[lbl1] dpdt2 = labels_dep_dict[lbl2] dict_value.append([rel, dpdt2]) if dpdt1 in eds_dict: dict_value_old = eds_dict[dpdt1] dict_value_new = dict_value_old + dict_value eds_dict[dpdt1] = dict_value_new else: eds_dict[dpdt1] = dict_value #all relational predicates are by default transparent as well #TO avoid "key" equal to "value" in transparent dict (e.g. transparent_dict['i82']='i82') that occur in cyclic graphs # and cause infinite loops, we check that dict_key != labels_dep_dict[lbl1] if dict_key != labels_dep_dict[lbl1]: transparent_dict[dict_key] = labels_dep_dict[lbl1] del labels_dep_dict[lbl1] del labels_dep_dict[lbl2] elif lbl1 == 'ARG0': # For the case part_of ARG0 ARG1 # x5:part_of<0:3>[ARG1 x9] #labels_dep_dict = {'ARG1': 'x9'} # ARG0 is not in labels_dep_dict # dict_value gets additional value [part_of, x9] # and we change eds_dict: eds_dict[x5] = dict_value if lbl2 in labels_dep_dict: # we need to check for lbl2 because there are cases when e.g. 
# part_of does not have ARG1: #|x23:part_of<33:36>[] # file 26360 in Conan Doyle development set # (PET export ssa) dict_value.append([rel, labels_dep_dict[lbl2]]) eds_dict[dict_key] = dict_value # Now we do not need to process it as a lexical item, because otherwise there will be two labels assigned # from x5 to x9: part_of and ARG1 m_lexical = False elif lbl2 == 'ARG0' and lbl1 in labels_dep_dict: dict_value.append([rel, dict_key]) eds_dict[labels_dep_dict[lbl1]] = dict_value m_lexical = False #print dpdt1 + ' is the head of ' + dpdt2 if m_lexical: #Example: dict_id_pos["e2"] = "<9:23>" dict_id_pos[dict_key] = tokid #print '\n' + eps[i] + ' is a lexical relation type\n' for lbl in labels_dep_dict.keys(): dpdt = labels_dep_dict[lbl] dict_value.append([lbl, dpdt]) if dict_key in eds_dict: dict_value_old = eds_dict[dict_key] dict_value_new = dict_value_old + dict_value eds_dict[dict_key] = dict_value_new else: eds_dict[dict_key] = dict_value #p3 = re.compile('(^compound|^poss|^measure|^appos|^loc\_nonsp|^loc\_sp|^nonsp|^subord|^of\_p)') #m3 = p3.match(eps_parts[1]) #p4 = re.compile('(^_|^abstr\_deg|^card|^dofm|^dofw|^generic\_entity|^mofy|^much-many\_a|^named|^named\_n|^numbered\_hour|^ord|^part\_of|^person|^person\_n|^pron|^thing|^time|^time\_n|^yofc|^neg)') #m4 = p4.match(eps_parts[1]) ''' print eps_parts[1] + ' is a redundant relation type\n' if re.match(relations_collection['transparent'][2], eps_parts[1]): print eps_parts[1] + ' is a transparent relation type\n' if re.match(relations_collection['relational'][2], eps_parts[1]): print eps_parts[1] + ' is a relational relation type\n' if re.match(relations_collection['lexical'][2], eps_parts[1]): print eps_parts[1] + ' is a lexical relation type\n' ''' ''' #NOMINALIZATION if re.match('^nominalization', eps_parts[1]): #print 'NOMINALIZATION' + '\n' dict_value = [] if m1: dict_id_pos[dict_key] = m1.group() if m2 is not None: dep_line = m2.group() dep_line = dep_line[1:] dep_line = dep_line[:-1] dep_list = [dep_line] if ',' in dep_line: dep_list = dep_line.split(', ') for dep in dep_list: dep_parts = dep.split(' ') label = dep_parts[0] dependent = dep_parts[1] if label == 'ARG1': transparent_dict[dict_key] = dependent else: dict_value.append([label, dependent]) if dict_key in eds_dict: dict_value_old = eds_dict[dict_key] dict_value_new = dict_value_old + dict_value eds_dict[dict_key] = dict_value_new else: eds_dict[dict_key] = dict_value #Conjunction elif re.search('_c[<\[]', eps_parts[1]) or re.match('^implicit_conj', eps_parts[1]): if m1: dict_id_pos[dict_key] = m1.group() dict_value = [] if m2 is not None: dep_line = m2.group() dep_line = dep_line[1:] dep_line = dep_line[:-1] dep_list = [dep_line] if ',' in dep_line: dep_list = dep_line.split(', ') # we create a dictionary of dependencies tmp_dep_dict = {} #if we have L-INDEX, it will substitute *_c and take its argument for dep in dep_list: dep_parts = dep.split(' ') label = dep_parts[0] dependent = dep_parts[1] tmp_dep_dict[label] = dependent #print '\n Temporary dep dict\n\n' #pprint(tmp_dep_dict) #if conjuction had other dependants dict_value_old = [] if dict_key in eds_dict: dict_value_old = eds_dict[dict_key] if 'L-INDEX' in tmp_dep_dict and 'R-INDEX' in tmp_dep_dict: dict_value.append(['conj',tmp_dep_dict['R-INDEX']]) dict_value_new = dict_value_old + dict_value eds_dict[dict_key] = dict_value_new transparent_dict[dict_key] = tmp_dep_dict['L-INDEX'] elif 'L-HNDL' in tmp_dep_dict and 'R-HNDL' in tmp_dep_dict: dict_value.append(['conj',tmp_dep_dict['R-HNDL']]) dict_value_new = 
dict_value_old + dict_value eds_dict[dict_key] = dict_value_new transparent_dict[dict_key] = tmp_dep_dict['L-HNDL'] #relational elif m3: label = m3.group(1) label = re.sub(r'\_', r'\\_', label) #print 'COMPOUND ET AL. ', label dict_value = [] if m1: dict_id_pos[dict_key] = m1.group() if m2 is not None: dep_line = m2.group() dep_line = dep_line[1:] dep_line = dep_line[:-1] dep_list = [dep_line] if ',' in dep_line: dep_list = dep_line.split(', ') if len(dep_list) == 2: dep_parts1 = dep_list[0].split(' ') dep_parts2 = dep_list[1].split(' ') dict_key = dep_parts2[1] dependent = dep_parts1[1] #print 'dependent = ', dependent , '\n\n' dict_value.append([label, dependent]) if dict_key in eds_dict: dict_value_old = eds_dict[dict_key] dict_value_new = dict_value_old + dict_value eds_dict[dict_key] = dict_value_new else: eds_dict[dict_key] = dict_value #so-called `lexical' predicate symbols elif m4: dict_value = [] dict_id_pos[dict_key] = m1.group() if m2 is not None: dep_line = m2.group() dep_line = dep_line[1:] dep_line = dep_line[:-1] dep_list = [dep_line] if ',' in dep_line: dep_list = dep_line.split(', ') for dep in dep_list: dep_parts = dep.split(' ') label = dep_parts[0] dependent = dep_parts[1] dict_value.append([label, dependent]) if dict_key in eds_dict: dict_value_old = eds_dict[dict_key] dict_value_new = dict_value_old + dict_value eds_dict[dict_key] = dict_value_new else: eds_dict[dict_key] = dict_value ''' while eds_root in transparent_dict: eds_root = transparent_dict[eds_root] #print "EDS RELATION DICT" #pprint(eds_relation_dict) #Example of eds_relation_dict: #'x73': '_piano_n_1', #'x78': 'implicit_conj', #'x83': '_bass_n_1', #'x93': '_show_n_of', #'x96': '_slide_n_1'} #Example of eds_dict: # {'x49': [['appos', 'x47']], #'x55': [['compound_name', 'x47']], #'x61': [['compound_name', 'x55']], #'x72': [['compound', 'x49']], #'x89': [['loc_nonsp', 'e3']], #'x9': [['_and_c', 'x47']]} return [eds_root,eds_dict, dict_id_pos, transparent_dict, eds_relation_dict] #FUNCTION MY_TRAVERSE def my_traverse(self, t, first_dep, words_list, pos_tag_dict, lemma_dict, dep_dict, words_pos_dict, words_correct_case_dict,erg_tok_dict,tok_type): #this function goes through the derivation tree and creates different dictionaries and lists try: t.node except AttributeError: print '' else: parent = t.node #we would need grand parent to extract POS tag grand_parent = None if '///////' in t.node: parent_pieces= t.node.split('///////') parent = parent_pieces.pop() if len(parent_pieces) > 0: grand_parent = parent_pieces.pop() parent = parent.replace('<<<<<<<', '(') parent = parent.replace('>>>>>>>', ')') if re.match("^\".*", parent) and len(t) == 1: start = '' end = '' case_notes = '' words_list.append(parent) #print 'PARENT: ' + parent #print 'LEAF:' + '\n' + t[0] + '\n\n\n' m = re.search("\+FROM\|\|\|\|\|\|\|(#\d+=)*\\\\\"(\d+)\\\\\"", str(t[0])) if m: start = m.group(2) m = re.search("\+TO\|\|\|\|\|\|\|\\\\\"(\d+)\\\\\"", str(t[0])) if m: end = m.group(1) #print 'START '+ start + '; END ' + end + '\n\n' if start != '' and end != '': if '<' + start + ':' + end + '>' not in words_pos_dict: words_pos_dict['<' + start + ':' + end + '>'] = [len(words_list)] else: words_pos_dict['<' + start + ':' + end + '>'].append(len(words_list)) #pprint(words_pos_dict) # This is a token from the derivation tree cleaned from the markup clean_word_lowercase = self.extract_lowercased_token_erg_tok(parent) #When len(t) == 1, word in the derivation tree is represented as one token only # So for this situation we have the case_dict with 
one element in it. By default we # put no information about the case (empty string) case_list = [] #Search for the information about the case m_case = re.search("\+CASE\|\|\|\|\|\|\|([^\|]+)\|\|\|\|\|\|\|", str(t[0])) if m_case: case_list.append(['<' + start + ':' + end + '>', clean_word_lowercase,m_case.group(1)]) else: case_list.append(['<' + start + ':' + end + '>', clean_word_lowercase, ""]) case_list = self.analyze_case_descr_erg_tok(erg_tok_dict, case_list) #print("CASE_LIST") #pprint(case_list) # IN this case we have only one element in case_list and it is a list with one element # (because the word is not a multi-word expression) words_correct_case_dict[parent]= case_list[0][1] if tok_type == 'ptb': (pos_tag_dict, lemma_dict) = self.extract_pos_tag_and_lemma_ptb_tok(grand_parent, pos_tag_dict, lemma_dict, '<' + start + ':' + end + '>') else: (pos_tag_dict, lemma_dict) = self.extract_pos_tag_and_lemma_erg_tok(grand_parent, pos_tag_dict, lemma_dict, '<' + start + ':' + end + '>') elif re.match("^\".*", parent) and len(t) > 1: parent = parent.replace('_______', ' ') words_list.append(parent) #print 'PARENT: ' + parent #if parent == '"more+than"|||||||299': # print(str(t)) #print 'LEAF:' + '\n' + str(t[len(t)-1]) + '\n\n\n' start = '' end = '' #m_case = re.search("\+CASE ([^ ]+) ", str(t[0])) #if m_case: # case_notes = m_case.group(1) m = re.search("\+FROM\|\|\|\|\|\|\|(#\d+=)*\\\\\"(\d+)\\\\\"", str(t[0])) if m: start = m.group(2) m = re.search('\+TO\|\|\|\|\|\|\|\\\\\"(\d+)\\\\\"', str(t[len(t)-1])) if m: end = m.group(1) #print start + ' ' + end + '\n\n' if start != '' and end != '': if '<' + start + ':' + end + '>' not in words_pos_dict: words_pos_dict['<' + start + ':' + end + '>'] = [len(words_list)] else: words_pos_dict['<' + start + ':' + end + '>'].append(len(words_list)) #pprint(words_pos_dict) clean_word_lowercase = self.extract_lowercased_token_erg_tok(parent) if "+" in clean_word_lowercase: multiword_parts = clean_word_lowercase.split("+") case_list = self.find_case_info_for_multiword_expr_erg_tok(t, multiword_parts) case_list = self.analyze_case_descr_erg_tok(erg_tok_dict, case_list) #print("CASE_LIST") #pprint(case_list) multiword_expr_tokens = [] for elem_pos in range(len(case_list)): tok = case_list[elem_pos][1] multiword_expr_tokens.append(tok) words_correct_case_dict[parent]="+".join(multiword_expr_tokens) if tok_type == 'ptb': (pos_tag_dict, lemma_dict) = self.extract_pos_tag_and_lemma_ptb_tok(grand_parent, pos_tag_dict, lemma_dict,'<' + start + ':' + end + '>') else: (pos_tag_dict, lemma_dict) = self.extract_pos_tag_and_lemma_erg_tok(grand_parent, pos_tag_dict, lemma_dict,'<' + start + ':' + end + '>') if len(t) > 1: #print t.node parent = t.node if '///////' in t.node: parent_pieces= t.node.split('///////') parent = parent_pieces.pop() child_nodes = [] for child in t: try: child.node except AttributeError: print '' else: child_node = child.node if '///////' in child.node: child_pieces= child.node.split('///////') child_node = child_pieces.pop() child_node = child_node.replace('_______', ' ') child_node = child_node.replace('<<<<<<<', '(') child_node = child_node.replace('>>>>>>>', ')') child_nodes.append(child_node) #print '\n\n\n', child.node #print '\n\n\n========================================================\n\n\n' parent = parent.replace('_______', ' ') parent = parent.replace('<<<<<<<', '(') parent = parent.replace('>>>>>>>', ')') dep_dict[parent] = child_nodes if len(first_dep) == 0: first_dep.append(parent) for child in t: self.my_traverse(child, 
                                  first_dep, words_list, pos_tag_dict, lemma_dict, dep_dict, words_pos_dict, words_correct_case_dict, erg_tok_dict, tok_type)
        # NOTE: we do not need "case_dict" for PTB tokenization, since we take tokens from PTB items and the
        # lower/upper case is correct for them. Unfortunately, it is corrupted for ERG tokenization.
        return (first_dep, words_list, dep_dict, words_pos_dict, pos_tag_dict, lemma_dict, words_correct_case_dict, tok_type)

    # FUNCTION: READ_RULE_HEAD_FILE
    def read_rule_head_file(self, rules_file):
        f = codecs.open(rules_file, 'r', encoding='utf-8')
        rule_head_dict = {}
        for line in f.readlines():
            line = line.strip()
            rule_pieces = line.split(' ')
            rule_head_dict[rule_pieces[0]] = [rule_pieces[1], rule_pieces[2]]
        return rule_head_dict

    # FUNCTION: READ_RELATIONS_FILE
    def read_relations_file(self, relations_file):
        # We create two dictionaries for each type of relation: one dictionary contains those relations that are
        # expressed in regular-expression form in the configuration file, the other contains those relations that
        # are not expressed as regular expressions. We do not put everything in one dictionary and do not use
        # regular expressions for everything in order to speed up the search process (regexes are very slow to process).
        read_redundant = 0
        read_transparent = 0
        read_lexical = 0
        read_relational = 0
        redundant_relname_dict = {}
        redundant_regex_dict = {}
        transparent_relname_dict = {}
        transparent_regex_dict = {}
        relational_relname_dict = {}
        relational_regex_dict = {}
        lexical_relname_dict = {}
        lexical_regex_dict = {}
        for line in codecs.open(relations_file, 'r', encoding='utf-8').readlines():
            line = line.strip()
            if line == '[redundant]':
                read_redundant = 1
                read_transparent = 0
                read_lexical = 0
                read_relational = 0
            elif line == '[transparent]':
                read_redundant = 0
                read_transparent = 1
                read_lexical = 0
                read_relational = 0
            elif line == '[relational]':
                read_redundant = 0
                read_transparent = 0
                read_lexical = 0
                read_relational = 1
            elif line == '[lexical]':
                read_redundant = 0
                read_transparent = 0
                read_lexical = 1
                read_relational = 0
            elif len(line) == 0:
                read_redundant = 0
                read_transparent = 0
                read_lexical = 0
                read_relational = 0
            elif read_redundant:
                (redundant_relname_dict, redundant_regex_dict) = self.read_line_into_rel_dict(line, redundant_relname_dict, redundant_regex_dict)
            elif read_transparent:
                (transparent_relname_dict, transparent_regex_dict) = self.read_line_into_rel_dict(line, transparent_relname_dict, transparent_regex_dict)
            elif read_relational:
                (relational_relname_dict, relational_regex_dict) = self.read_line_into_rel_dict(line, relational_relname_dict, relational_regex_dict)
            elif read_lexical:
                (lexical_relname_dict, lexical_regex_dict) = self.read_line_into_rel_dict(line, lexical_relname_dict, lexical_regex_dict)
        redundant_general_regex = self.create_general_regex(redundant_relname_dict, redundant_regex_dict)
        transparent_general_regex = self.create_general_regex(transparent_relname_dict, transparent_regex_dict)
        relational_general_regex = self.create_general_regex(relational_relname_dict, relational_regex_dict)
        lexical_general_regex = self.create_general_regex(lexical_relname_dict, lexical_regex_dict)
        dictionaries = {}
        dictionaries['redundant'] = [redundant_relname_dict, redundant_regex_dict, redundant_general_regex]
        dictionaries['transparent'] = [transparent_relname_dict, transparent_regex_dict, transparent_general_regex]
        dictionaries['relational'] = [relational_relname_dict, relational_regex_dict, relational_general_regex]
        dictionaries['lexical'] = [lexical_relname_dict, 
lexical_regex_dict, lexical_general_regex] return dictionaries def read_line_into_rel_dict(self, line, relname_dict, regex_dict): #arguments (can be zero, one or two depending whether it is lexical relation or transparent relation or relational/redundant correspondingly arguments = [] #line is already stripped!!! # Check if line is a comment (comments start with ";" sign) # If it is a comment, just ignore it m_comment = re.match(r"^;", line) if m_comment: results = [relname_dict, regex_dict] return results if " " in line.strip(): line_pieces = line.split(" ") #the string before the first tabulation mark is the name of the relation relname = line_pieces[0] #everything after the first tabulation in the line are the arguments for i in range(1, len(line_pieces)): arguments.append(line_pieces[i]) # if there is no tabulation in the string, then it is lexical relation (it cannot be an empty string, because we checked for that in the function read_relations_file else: relname = line #if the relation name is expressed as regex if '/' in relname: #and we will write the relation into the general regular expression that corresponds to this type relname = relname.replace('/', '') regex_dict = self.write_relation_into_dict(relname, arguments, regex_dict) #if the relation name is NOT expressed as regex, we put it in a separate dictionary to speed up the search in future else: relname_dict = self.write_relation_into_dict(relname, arguments, relname_dict) results = [relname_dict, regex_dict] return results def create_general_regex(self, relname_dict, regex_dict): general_regex = "(" for relname in relname_dict.keys(): if general_regex == "(": general_regex = general_regex + "^" + relname else: general_regex = general_regex + "|^" + relname for relname in regex_dict.keys(): if general_regex == "(": general_regex = general_regex + relname else: general_regex = general_regex + "|" + relname general_regex = general_regex + ")" return general_regex def write_relation_into_dict(self, relname, arguments, relations_dict): #if the relation name is already in regex dict if relname in relations_dict: #if we do not have arguments and the relname is already in the dictionary, we do not have to do anything # if we have, we should add them if len(arguments) == 1: #to the existing record with the key of the relation name (expressed in the form of regex) # attach the new list of arguments relations_dict[relname].append(arguments[0]) elif len(arguments) == 2: relations_dict[relname].append(arguments) # this is only for relational relations: in case a relational relation has the third argument: a new name of the relation, e.g.: # comp ARG0 ARG2 ref # The dependency from ARG0 to ARG2 will be called "ref" instead of "comp" #to the existing record with the key of the relation name (expressed in the form of regex) # attach the new list of arguments elif len(arguments) == 3: relations_dict[relname].append(arguments) #if the relation name hasn't been in the dictionary before else: #if we do not have arguments, just write the relation name into the dictionary with the value 1 if len(arguments) == 0: relations_dict[relname] = 1 #if we have just one argument, write relname as a key and this argument as the value. elif len(arguments) == 1: relations_dict[relname] = [arguments[0]] # We need to have a list of values because the same relation name can occur more than once, e.g. 
# [relational] # /_c$/ L-HNDL R-HNDL # /_c$/ L-INDEX R-INDEX elif len(arguments) == 2: relations_dict[relname] = [arguments] # this is only for relational relations: in case a relational relation has the third argument: a new name of the relation, e.g.: # comp ARG0 ARG2 ref # The dependency from ARG0 to ARG2 will be called "ref" instead of "comp" elif len(arguments) == 3: relations_dict[relname] = [arguments] return relations_dict def get_labels_from_typed_dict(self, dict_type, relations_collection, rel): labels_from_dict = [] if rel in relations_collection[dict_type][0]: labels_from_dict = relations_collection[dict_type][0][rel] else: for regex_key in relations_collection[dict_type][1].keys(): if re.search(regex_key, rel): labels_from_dict = relations_collection[dict_type][1][regex_key] return labels_from_dict #========================================# # FUNCTIONS FOR PTB TOKENIZATION # #========================================# #FUNCTION1: extract_pos_tag_and_lemma_ptb_tok def extract_pos_tag_and_lemma_ptb_tok(self, grand_parent, pos_tag_dict, lemma_dict, index): pos_tag = '' lemma = '' if grand_parent is not None: grand_parent_pieces = grand_parent.split('|||||||') pos_tag_incorporated = grand_parent_pieces[1] if '/' in pos_tag_incorporated: pos_tag_incorporated_pieces = pos_tag_incorporated.split('/') lemma = pos_tag_incorporated_pieces[0] pos_tag = pos_tag_incorporated_pieces[1] elif '@' in pos_tag_incorporated: pos_tag_incorporated_pieces = pos_tag_incorporated.split('@') lemma = pos_tag_incorporated_pieces[0] pos_tag = pos_tag_incorporated_pieces[1] #now cut pos tag up so that we take only the part before first "_" #sub_parts = pos_tag.split('_') #pos_tag = sub_parts[0] #pos_tag = '_'.join([sub_parts[0],sub_parts[1]]) #pos_tag = '_'.join([sub_parts[0],sub_parts[1], sub_parts[2]]) pos_tag_dict[index] = pos_tag lemma_dict[index] = lemma return [pos_tag_dict, lemma_dict] #FUNCTION2: find_key_index_ptb_tok def find_key_index_ptb_tok(self, key, transparent_dict, dict_id_pos, tokens_pos_dict, split_dict, contracted_neg_split_dict_mrs, group_dict): key_id = None #DANGER OF AN INFINITE LOOP, IF THE SAME KEY IS THE "KEY" and THE "VALUE" in transparent dict while key in transparent_dict: key = transparent_dict[key] if (key in dict_id_pos): elem_id = dict_id_pos[key] if elem_id in tokens_pos_dict: key_id = tokens_pos_dict[elem_id] elif elem_id in split_dict and elem_id in contracted_neg_split_dict_mrs: key_id = tokens_pos_dict[contracted_neg_split_dict_mrs[elem_id][0]] elif elem_id in split_dict: key_id = tokens_pos_dict[split_dict[elem_id][0]] elif elem_id in group_dict: key_id = tokens_pos_dict[group_dict[elem_id]] return key_id #FUNCTION2.2: find_key_index_pred_ptb_tok def find_key_index_pred_ptb_tok(self, key, transparent_dict, dict_id_pos, tokens_pos_dict, split_dict, contracted_neg_split_dict_mrs, group_dict): key_id = None if key not in transparent_dict: if (key in dict_id_pos): elem_id = dict_id_pos[key] if elem_id in tokens_pos_dict: key_id = tokens_pos_dict[elem_id] elif elem_id in split_dict and elem_id in contracted_neg_split_dict_mrs: key_id = tokens_pos_dict[contracted_neg_split_dict_mrs[elem_id][0]] elif elem_id in split_dict: key_id = tokens_pos_dict[split_dict[elem_id][0]] elif elem_id in group_dict: key_id = tokens_pos_dict[group_dict[elem_id]] return key_id #FUNCTION3: eds_expansion_ptb_tok def eds_expansion_ptb_tok(self, eds_dep_indexes, eds_dict, tokens_pos_dict, dict_id_pos, transparent_dict, split_dict, contracted_neg_split_dict_mrs, group_dict, eds_relation_dict): 
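        # --------------------------------------------------------------------------
        # Illustrative sketch (comments only, not executed): how the relation-type
        # configuration built by read_relations_file() / read_line_into_rel_dict() is
        # looked up by get_labels_from_typed_dict().  The miniature dm.cfg fragment
        # below is hypothetical, except for the entries already quoted in the comments
        # of write_relation_into_dict() ("comp ARG0 ARG2 ref", "/_c$/ L-HNDL R-HNDL",
        # "/_c$/ L-INDEX R-INDEX").
        #
        #   [relational]
        #   ; plain names go into *_relname_dict, /regex/ names into *_regex_dict
        #   comp ARG0 ARG2 ref
        #   /_c$/ L-HNDL R-HNDL
        #   /_c$/ L-INDEX R-INDEX
        #
        # Tracing the code above, this fragment should yield (assumed shapes):
        #
        #   relations_collection['relational'][0] == {'comp': [['ARG0', 'ARG2', 'ref']]}
        #   relations_collection['relational'][1] == {'_c$': [['L-HNDL', 'R-HNDL'], ['L-INDEX', 'R-INDEX']]}
        #   relations_collection['relational'][2] == '(^comp|_c$)'   # built by create_general_regex()
        #
        # so that, for a conjunction predicate such as '_and_c':
        #
        #   self.get_labels_from_typed_dict('relational', relations_collection, '_and_c')
        #   == [['L-HNDL', 'R-HNDL'], ['L-INDEX', 'R-INDEX']]
        # --------------------------------------------------------------------------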
#print 'EDS DICT' #pprint(eds_dict) #print '\n' eds_relation_pos_dict = {} for key, value in eds_dict.iteritems(): head = self.find_key_index_ptb_tok(key, transparent_dict, dict_id_pos, tokens_pos_dict, split_dict, contracted_neg_split_dict_mrs, group_dict) #head can be 'None' for the cases when the head is not a word token if not(head is None): for dep in value: label = dep[0] dependent = self.find_key_index_ptb_tok(dep[1], transparent_dict, dict_id_pos, tokens_pos_dict, split_dict, contracted_neg_split_dict_mrs, group_dict) if head != dependent and not(dependent is None): eds_dep_indexes.append([head, dependent, label]) #print 'EDS DEPENDENCY INDEXES:' #pprint(eds_dep_indexes) #NOW EXPAND EDS_RELATION_DICT # THIS DICTIONARY IS FOR ALL PREDICATES (NOT ONLY FOR THOSE THAT ARE HEADS FOR SOME DEPENDENTS) # we had a dictionary "eds_relation_dict" for relations that mapped 'e30' -> _leading_a_1, # 'x9' -> _and_c # now we find that the position of 'e30' in the sentence is 5 and #the position of 'x9' in the sentence is 1, #and build a new dictionary eds_relation_pos_dict that maps 5 -> _leading_a_1, 1 -> _and_c for key in eds_relation_dict.keys(): key_pos_in_sent = self.find_key_index_pred_ptb_tok(key, transparent_dict, dict_id_pos, tokens_pos_dict, split_dict, contracted_neg_split_dict_mrs, group_dict) if not (key_pos_in_sent is None): eds_relation_pos_dict[key_pos_in_sent] = eds_relation_dict[key] #print 'EDS DEPENDENCY INDEXES:' #pprint(eds_dep_indexes) #print '\n\n' #print '\n\n' return [eds_dep_indexes, eds_relation_pos_dict] #FUNCTION4: dep_expansion_ptb_tok def dep_expansion_ptb_tok(self, fi, rule_head_dict, dep_rel, words_id_dict, dep_dict, dep_rel_id_list): # dep_rel[0] can be 454|||||||SP-HD_N_C|||||||1.49349|||||||0|||||||2 #We could have dep_dict like this: {'"burkina faso"|||||||22': []} which means that len(dep_dict[dep_rel[0]]) = 0 if dep_rel[0] in dep_dict and len(dep_dict[dep_rel[0]]) > 0: for i in range(0,len(dep_dict[dep_rel[0]])): if dep_dict[dep_rel[0]][i] in words_id_dict: dep_dict[dep_rel[0]][i] = words_id_dict[dep_dict[dep_rel[0]][i]] else: new_dep_rel = [] new_dep_rel.append(dep_dict[dep_rel[0]][i]) self.dep_expansion_ptb_tok(fi, rule_head_dict, new_dep_rel, words_id_dict, dep_dict, dep_rel_id_list) #here we can potentially fail dep_dict[dep_rel[0]][i] = words_id_dict[dep_dict[dep_rel[0]][i]] dep_rel_parts = dep_rel[0].split('|||||||') if re.match('^\^', dep_rel_parts[1], re.IGNORECASE): dep_rel_parts[1] = dep_rel_parts[1][1:] if dep_rel_parts[1].lower() in rule_head_dict and len(dep_dict[dep_rel[0]]) == int(rule_head_dict[ dep_rel_parts[1].lower()][0]): head_index = int(rule_head_dict[ dep_rel_parts[1].lower()][1]) #DEPENDENCY ARC LABEL (WE CAN TAKE THE WHOLE ERG LABEL, OR BEFORE FIRST "_" OR BEFORE SECOND "_" label = dep_rel_parts[1] dep_rel_subparts = dep_rel_parts[1].split('_') label = dep_rel_subparts[0] #label = "_".join([dep_rel_subparts[0], dep_rel_subparts[1]]) head = dep_dict[dep_rel[0]][head_index] for i in range(0,len(dep_dict[dep_rel[0]])): if dep_dict[dep_rel[0]][i] != head: dependent = dep_dict[dep_rel[0]][i] dep_rel_id_list.append([head, dependent,label]) if not dep_rel[0] in words_id_dict: words_id_dict[dep_rel[0]] = head #if head_index == 0: #head = dep_dict[dep_rel[0]][0] #dependent = dep_dict[dep_rel[0]][1] #elif head_index == 1: #head = dep_dict[dep_rel[0]][1] #dependent = dep_dict[dep_rel[0]][0] #else: #print >> sys.stderr, "Unexpected head index of the rule " + dep_rel_parts[1].lower() + " (head index is neither 0, nor 1; head index is " + 
str(head_index) + ")." #sys.exit(1) else: print >> sys.stderr, "Error! Sentence id " + str(fi) + ". Rule " + dep_rel_parts[1].lower() + ". Unknown rule (rule is not in the list of rules in the file erg.hds) or incorrect number of daughters in the tree." sys.exit(1) return dep_rel_id_list def convert_dt_ptb_tok(self, fi, tokens_list, lemma_tokens_dict, pos_tag_tokens_dict, ptb_pos_list,sent_root, dep_rel_tokens_list, args_dict, output_flags, fhdl_dict): #we have a list dep_indexes that contains lists in it. Each of the inner lists represents head, dependent, label. # but when we write a sentence in CoNLL format, for each word we need its head and label. #so we convert our list into a dictionary where keys are dependents and values are lists (in the Derivation Tree one dependent can have only one head) that represent head and label #print 'SENTENCE ROOT ', sent_root, '\n\n' if output_flags['dt']: fhdl_dict['dt'].write('#' + str(fi) + '\n') dep_indexes_dict = create_dep_indexes_dict(dep_rel_tokens_list) for i in range(1, len(tokens_list) + 1): head = '_' label = '_' if i in dep_indexes_dict.keys(): head = dep_indexes_dict[i][0] label = dep_indexes_dict[i][1] #"if" instead of "elif" because in file 10100030.gz we have a dependency to root node due to errors in PTB tokenization. Root should not be dependent #on anything therefore this "if" will re-write the dependency of the previous "if". If we use "elif" we would get sentence without root in CoNLL format. if i == sent_root: head = 0 label = 'ROOT' if output_flags['tex']: fhdl_dict['tex'].write('\deproot[edge above, edge style={red}]{' + str(i) + '}' + '{root}\n') word = tokens_list[i-1] #print 'WORD: ', word, '\n' if self.is_punctuation_ptb_tok(word): lemma = word pos_tag = word else: lemma = lemma_tokens_dict[i] pos_tag = pos_tag_tokens_dict[i] #We have several sentences that end with e.g. # says . `` # and `` is lost in the derivation tree. # PTB tokenization # (64, 22, 23, <22:23>, 1, "says" "says", 0, "null", "VBZ" 1.0) # (65, 23, 24, <23:24>, 1, "." ".", 0, "null", "." 1.0) # (66, 24, 25, <24:25>, 1, "“" "``", 0, "null", "``" 1.0) # Derivation tree: # (224, 20, 21, <22:24>, 1, "says.", 0, "null", "VBZ" 1.0) # Therefore `` did not get a head during our analysis. # We have to fix that by attaching `` to the nearest word on the left which is not a punctuation symbol. if head == '_': for j in range(i-2,-1,-1): if not self.is_punctuation_ptb_tok(tokens_list[j]): head = j+1 if self.is_punctuation_ptb_tok(word): label = 'PUNCT' break if self.is_punctuation_ptb_tok(word): lemma = word pos_tag = word else: lemma = lemma_tokens_dict[i] pos_tag = pos_tag_tokens_dict[i] if output_flags['dt']: fhdl_dict['dt'].write(str(i) + '\t' + word + '\t' + "_" + '\t' + ptb_pos_list[i-1] + '\t' + pos_tag + '\t' + '_' + '\t' + str(head) + '\t' + label + '\t' + '_' + '\t' + '_' +'\n') if output_flags['tex']: if i!= sent_root: #Latex requires that underscore symbol is escaped. 
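            # ------------------------------------------------------------------
            # Illustrative sketch (comments only, not executed): the shape of the
            # DT output written above.  Reusing the "Ad hoc oil arrived." example
            # from the comments of convert_dt_yy_ptb_tok() further below, the
            # CoNLL-style rows written to fhdl_dict['dt'] would look roughly like
            # this (PTB PoS column left as "_" since that profile has no PTB tags):
            #
            #   1   Ad        _   _   aj_-_i_le   _   2   MWE      _   _
            #   2   hoc       _   _   aj_-_i_le   _   3   AJ-HDN   _   _
            #   3   oil       _   _   n_-_mc_le   _   4   SB-HD    _   _
            #   4   arrived   _   _   v_-_le      _   0   ROOT     _   _
            #   5   .         _   _   .           _   4   PUNCT    _   _
            #
            # and the corresponding tikz-dependency lines written to fhdl_dict['tex']
            # would have the form (with label underscores escaped, as noted above):
            #
            #   \deproot[edge above, edge style={red}]{4}{root}
            #   \depedge[edge above, edge style={red}]{2}{1}{MWE}
            # ------------------------------------------------------------------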
label = label.replace('_', '\_') fhdl_dict['tex'].write('\depedge[edge above, edge style={red}]' + '{' + str(head) + '}' + '{' + str(i) + '}' + '{' + label + '}' + '\n') if output_flags['dt']: fhdl_dict['dt'].write('\n') return def convert_dm_ptb_tok(self, fi, tokens_list, lemma_tokens_dict ,pos_tag_tokens_dict, ptb_pos_list, sent_root_mrs_derived, eds_dep_indexes, args_dict, output_flags, fhdl_dict): #Remove duplates from eds_dep_indexes eds_dep_indexes.sort() eds_dep_indexes_no_dup = list(eds_dep_indexes for eds_dep_indexes, _ in itertools.groupby(eds_dep_indexes)) #print 'EDS DEP INDEXES NO DUP' #pprint(eds_dep_indexes_no_dup) #print '\n' #print 'EDS DEP INDEXES' #pprint(eds_dep_indexes) #print '\n' #if there is no root, then our variable sent_root_mrs_derived = '' #if there is a root, its index is in an integer format and is calculated starting from 1: #Example: #'This is the most common conception...' # sent_root_mrs_derived = '2' (the copula 'is' is the root in this case) if output_flags['dm']: fhdl_dict['dm'].write('#' + str(fi) + '\n') if sent_root_mrs_derived != '': #If a word is a punctuation, it means it was cut from the word, therefore the lemma and pos tag saved in the dictionary concern the word, not the punctuation. We replace them just repeating the punctuation if self.is_punctuation_ptb_tok(tokens_list[sent_root_mrs_derived-1]): lemma = tokens_list[sent_root_mrs_derived-1] pos_tag = tokens_list[sent_root_mrs_derived-1] else: #if it is not a puntuation, use the lemma and the pos from the dictionaries lemma = lemma_tokens_dict[sent_root_mrs_derived] pos_tag = pos_tag_tokens_dict[sent_root_mrs_derived] #in English MRS Test Suite we are not given the PTB pos tags therefore ptb_pos_tag[sent_root_mrs_derived - 1] = '_' #we should just print out ERG PoS tags twice #print('ptb_pos_list') #pprint(ptb_pos_list) if ptb_pos_list[sent_root_mrs_derived - 1] != '_': #in the case of WeScience and PEST ptb_pos_tag = ptb_pos_list[sent_root_mrs_derived - 1] else: #in the case of English MRS Test Suite, use ERG PoS tag ptb_pos_tag = pos_tag_tokens_dict[sent_root_mrs_derived] if output_flags['dm']: fhdl_dict['dm'].write('ROOT' + '\t' + '_' + '\t' + 'ROOT' + '\t' + 'ROOT' + '\t' + '-1' + '\t' + 'ROOT' + '\t' + 'ROOT' + '\t' + tokens_list[sent_root_mrs_derived-1] + '\t' + "_" + '\t' + pos_tag + '\t' + ptb_pos_tag + '\t' + str(sent_root_mrs_derived - 1) + '\n') if output_flags['tex']: fhdl_dict['tex'].write('\deproot[edge style={blue}]{' + str(sent_root_mrs_derived) + '}' + '{root}\n') # Example: # eds_dep_indexes_no_dup: # [[13, 15, 'conj'], # [15, 16, 'ARG1'], # [17, 16, 'ARG1'], # [17, 20, 'ARG2'], # # The indexes are counted from 1, while in PAS we count indexes from 0 for i in range(0, len(eds_dep_indexes_no_dup)): head_word_index = eds_dep_indexes_no_dup[i][0] - 1 head_word = tokens_list[head_word_index] #If a head word is a punctuation, it means it was cut from the word, therefore the lemma and pos tag saved in the dictionary concern the word, not the punctuation. We replace them just repeating the punctuation if self.is_punctuation_ptb_tok(head_word): #print head_word head_lemma = head_word head_word_pos = head_word else: #if it is not a puntuation, use the lemma and the pos from the dictionaries head_lemma = lemma_tokens_dict[head_word_index + 1] head_word_pos = pos_tag_tokens_dict[head_word_index + 1] #we want to have a column with PTB PoS tag. However, for English MRS Test Suite we do not have them available. 
#we will have ptb_pos_list[head_word_index]!= '_' in this case and therefore we will use ERG PoS tag instead. if ptb_pos_list[head_word_index]!= '_': head_word_ptb_pos = ptb_pos_list[head_word_index] else: head_word_ptb_pos = head_word_pos dep_word_index = eds_dep_indexes_no_dup[i][1] - 1 dep_word = tokens_list[dep_word_index] #If a dependency word is a punctuation, it means it was cut from the word, therefore the lemma and pos tag saved in the dictionary concern the word, not the punctuation. We replace them just repeating the punctuation if self.is_punctuation_ptb_tok(dep_word): dep_lemma = dep_word dep_word_pos = dep_word else: #if it is not a puntuation, use the lemma and the pos from the dictionaries dep_lemma = lemma_tokens_dict[dep_word_index + 1] dep_word_pos = pos_tag_tokens_dict[dep_word_index + 1] if ptb_pos_list[dep_word_index] != '_': dep_word_ptb_pos = ptb_pos_list[dep_word_index] else: dep_word_ptb_pos = dep_word_pos label = eds_dep_indexes_no_dup[i][2] if output_flags['dm']: fhdl_dict['dm'].write(head_word + '\t' + "_" + '\t' + head_word_pos + '\t' + head_word_ptb_pos + '\t' + str(head_word_index) + '\t' + label + '\t' + label + '\t' + dep_word + '\t' + "_" + '\t' + dep_word_pos + '\t' + dep_word_ptb_pos + '\t' + str(dep_word_index) + '\n') if output_flags['tex']: eds_dep_indexes_no_dup[i][2] = eds_dep_indexes_no_dup[i][2].replace('_', '\_') fhdl_dict['tex'].write('\depedge[edge style={blue}]' + '{' + str(eds_dep_indexes_no_dup[i][0]) + '}' + '{' + str(eds_dep_indexes_no_dup[i][1]) + '}' + '{' + eds_dep_indexes_no_dup[i][2] + '}' + '\n') if output_flags['dm']: fhdl_dict['dm'].write('\n') return def convert_dt_yy_ptb_tok(self, fi, all_ptb_pos_with_probabilities_list, tokens_pos_dict,tokens_list, lemma_tokens_dict, pos_tag_tokens_dict, ptb_pos_list,dt_sent_root, dt_dep_rel_tokens_list, split_dict, args_dict, output_flags, fhdl_dict): # WORK WITH DT dt_dep_indexes_dict = create_dep_indexes_dict(dt_dep_rel_tokens_list) if dt_sent_root != '': dt_dep_indexes_dict[dt_sent_root] = [0, u'ROOT'] #Re-arrange tokens_pos_dict to fix the cases like #22100021 # Current tokens_pos_dict: #{'<56:61>': 12, #'<61:62>': 13, #'<62:64>': 15, ...} # Desired tokens_pos_dict: # Current tokens_pos_dict: #{'<56:61>': 12, #'<61:62>': 13, #<62:63> : 14, #'<63:64>': 15, ...} # This comes from the export: # 22100021 #(12, 11, 12, <56:61>, 1, "click", 0, "null", "NN" 1.0) #(13, 12, 13, <61:62>, 1, ".", 0, "null", "." 
1.0) #(14, 13, 14, <62:64>, 1, "”", 0, "null", "''" 1.0) #(15, 14, 15, <62:64>, 1, ")", 0, "null", ")" 1.0) updated_tokens_pos_dict = {} token_pos_start_char_id_dict = {} tokens_pos_dict_sorted_by_values = sorted(tokens_pos_dict.items(), key=lambda (key,value): value) for i in range(0, len(tokens_pos_dict_sorted_by_values)): char_based_id = tokens_pos_dict_sorted_by_values[i][0] token_position = tokens_pos_dict_sorted_by_values[i][1] char_based_id_parts = char_based_id[1:-1].split(":") char_based_id_start = int(char_based_id_parts[0]) char_based_id_end = int(char_based_id_parts[1]) if token_position == i+2 and char_based_id_end - char_based_id_start == 2 and self.is_punctuation_ptb_tok(tokens_list[i]) and self.is_punctuation_ptb_tok(tokens_list[i+1]): updated_tokens_pos_dict['<' + str(char_based_id_start) + ':' + str(char_based_id_start+1) + '>' ] = i +1 updated_tokens_pos_dict['<' + str(char_based_id_start +1 ) + ':' + str(char_based_id_start+2) + '>' ] = i +2 # and the dictionary that contains opposite: by token position we find start of character id token_pos_start_char_id_dict[i +1] = char_based_id_start token_pos_start_char_id_dict[i +2] = char_based_id_start +1 else: updated_tokens_pos_dict[char_based_id] = token_position token_pos_start_char_id_dict[token_position] = char_based_id_start #print("dt_dep_indexes_dict") #pprint(dt_dep_indexes_dict) #pprint(tokens_list) #pprint(tokens_pos_dict) #print "Updated tokens pos dict" #pprint(updated_tokens_pos_dict) # IF the ERG token was split in PTB tokenization, we will provide the dependency for the head word before the # first non-punctuation elment of the the split ERG token # ERG TOK in profile # (41, 0, 1, <0:2>, 1, "ad", 0, "null") #(45, 0, 1, <0:2>, 1, "ad", 0, "null") #(38, 1, 2, <3:6>, 1, "hoc", 0, "null") #(42, 1, 2, <3:6>, 1, "hoc", 0, "null") #(39, 2, 3, <7:10>, 1, "oil", 0, "null") #(43, 2, 3, <7:10>, 1, "oil", 0, "null") #(40, 3, 4, <11:19>, 1, "arrived.", 0, "null") #(44, 3, 4, <11:19>, 1, "arrived.", 0, "null") # # PTB TOK in profile #(1, 0, 1, <0:2>, 1, "Ad", 0, "null") #(2, 1, 2, <3:6>, 1, "hoc", 0, "null") #(3, 2, 3, <7:10>, 1, "oil", 0, "null") #(4, 3, 4, <11:18>, 1, "arrived", 0, "null") #(5, 4, 5, <18:19>, 1, ".", 0, "null") # # DTM #1 Ad _ _ aj_-_i_le _ 2 MWE _ _ _ MWE _ _ #2 hoc _ _ aj_-_i_le _ 3 AJ-HDN _ _ _ad+hoc_a_1 _ _ _ #3 oil _ _ n_-_mc_le _ 4 SB-HD _ _ _oil_n_1 ARG1 _ ARG1 #4 arrived _ _ v_-_le _ 0 ROOT _ _ ^_arrive_v_1 _ _ _ #5 . _ _ . 
_ 4 PUNCT _ _ _ _ _ _ # # YY # [2] | #(0, 0, 1, 1, "⌊→¦aj-hdn¦7⌋", 0, "null") #(1, 1, 2, <0:2>, 1, "Ad", 0, "null" ) #(2, 2, 3, <3:6>, 1, "hoc", 0, "null" ) #(3, 3, 4, 1, "⌊→¦sb-hd¦11⌋", 0, "null") #(4, 4, 5, <7:10>, 1, "oil", 0, "null" ) #(5, 5, 6, 1, "⌊→¦root⌋", 0, "null") #(6, 6, 7, <11:18>, 1, "arrived", 0, "null") #(7, 7, 8, <18:19>, 1, ".", 0, "null") # First collect MWE expressions and change dependencies for them: # ignore_dep_list - a list of character ids of tokens for which dependencies should be ignored ignore_dep_list = [] # change_dep_dict - a dictionary of character ids of tokens for which dependencies should be changed change_dep_dict = {} mwe_dict = defaultdict(list) #print("dt_dep_indexes_dict") #pprint(dt_dep_indexes_dict) change_outgoing_dep_for_mwe = {} for tok_index in range(1, len(tokens_list) + 1): # COLLECT INFO ABOUT DT dt_head = u'_' dt_label = u'_' if tok_index in dt_dep_indexes_dict.keys(): dt_head = dt_dep_indexes_dict[tok_index][0] dt_label = dt_dep_indexes_dict[tok_index][1] if dt_label == u'MWE': mwe_dict[int(dt_head)].append(int(tok_index)) #print("mwe_dict") #pprint(mwe_dict) for el in mwe_dict: if el in dt_dep_indexes_dict.keys(): el_head = dt_dep_indexes_dict[el][0] el_label = dt_dep_indexes_dict[el][1] mwe_dict[el].sort() ignore_dep_list.append(el) # Before outgoing dependencies from MWE were from the head of MWE # The head of MWE used to be "el" # Now we put all the annotations on the first token of MWE: "mwe_dict[el][0]" # So the outgoing dependencies from MWE should be also now started from "mwe_dict[el][0]" instead of "el" change_outgoing_dep_for_mwe[el] = mwe_dict[el][0] change_dep_dict[mwe_dict[el][0]] = [el_head, el_label] #print("ignore_dep_list") #pprint(ignore_dep_list) #print("change_dep_dict") #pprint(change_dep_dict) #print("change_outgoing_dep_for_mwe") #pprint(change_outgoing_dep_for_mwe) # NOW COLLECT ALL DT INFO FOR YY FORMAT # first four variables for yy out_path = u'1' out_ipos = u'0' out_lrule = u'"null"' final_tokens_list = [] out_id = 0 out_start = 0 # Loop over sentence tokens for i in range(1, len(tokens_list) + 1): # COLLECT INFO ABOUT DT dt_head = u'_' dt_label = u'_' if i in dt_dep_indexes_dict.keys(): dt_head = dt_dep_indexes_dict[i][0] dt_label = dt_dep_indexes_dict[i][1] #"if" instead of "elif" because in file 10100030.gz we have a dependency to root node due to errors in PTB tokenization. Root should not be dependent #on anything therefore this "if" will re-write the dependency of the previous "if". If we use "elif" we would get sentence without root in CoNLL format. if i == dt_sent_root: dt_head = 0 dt_label = u'ROOT' word = tokens_list[i-1] #.replace("“", "``").replace("”", "''").replace("‘", "`").replace("’", "'") #print 'WORD: ', word, '\n' character_based_id_start = u'_' character_based_id_end = u'_' # COLLECT CHARACTER-BASED ID OF THE WORD, e.g. token 20 corresponds to a word "of" and has a character-based id <99:101> for char_based_id, token_pos in updated_tokens_pos_dict.iteritems(): if token_pos == i: char_based_id_parts = char_based_id[1:-1].split(':') character_based_id_start = char_based_id_parts[0] character_based_id_end = char_based_id_parts[1] out_link = '<' + character_based_id_start + ':' + character_based_id_end + '>' word = word.decode('utf-8') #We have several sentences that end with e.g. # says . `` # and `` is lost in the derivation tree. # PTB tokenization # (64, 22, 23, <22:23>, 1, "says" "says", 0, "null", "VBZ" 1.0) # (65, 23, 24, <23:24>, 1, "." ".", 0, "null", "." 
1.0) # (66, 24, 25, <24:25>, 1, "“" "``", 0, "null", "``" 1.0) # Derivation tree: # (224, 20, 21, <22:24>, 1, "says.", 0, "null", "VBZ" 1.0) # Therefore `` did not get a head during our analysis. # We have to fix that by attaching `` to the nearest word on the left which is not a punctuation symbol. if dt_head == u'_': for j in range(i-2,-1,-1): if not self.is_punctuation_ptb_tok(tokens_list[j]): dt_head = j+1 if self.is_punctuation_ptb_tok(word): dt_label = u'PUNCT' break if i in change_dep_dict: dt_head = change_dep_dict[i][0] dt_label = change_dep_dict[i][1] #dependent_start_char_pos = token_pos_start_char_id_dict[dt_head] #out_form = u'"⌊→¦' + dt_label.lower() + u'¦' + str(dependent_start_char_pos) + u'⌋"' # This is dependency #final_tokens_list.append("(" + ", ".join([str(out_id), str(out_start), str(out_start + 1), out_path, out_form, out_ipos, out_lrule]) + ")") #out_id += 1 #out_start += 1 if dt_head in change_outgoing_dep_for_mwe: dt_head = change_outgoing_dep_for_mwe[dt_head] if dt_label == u'ROOT' and u'-' not in word and u'/' not in word and (i not in ignore_dep_list): out_form = u'"⌊→¦root⌋"' # This is dependency final_tokens_list.append("(" + ", ".join([str(out_id), str(out_start), str(out_start + 1), out_path, out_form, out_ipos, out_lrule]) + ")") out_id += 1 out_start += 1 # We still check for dt_label != u'PUNCT' despite having excluded punct that were obtained from split dictionary, because there are special cases when some punctuation was absent from the ERG tree but was in the PTB tokenization if dt_label != u'PUNCT' and dt_label != u'ROOT' and dt_label != u'MWE' and dt_label != u'NEG' and u'-' not in word and u'/' not in word and (i not in ignore_dep_list): dependent_start_char_pos = token_pos_start_char_id_dict[dt_head] out_form = u'"⌊→¦' + dt_label.lower() + u'¦' + str(dependent_start_char_pos) + u'⌋"' # This is dependency final_tokens_list.append("(" + ", ".join([str(out_id), str(out_start), str(out_start + 1), out_path, out_form, out_ipos, out_lrule]) + ")") out_id += 1 out_start += 1 # This is token #word = '"' + word.decode('utf-8') + '"' word = '"' + word + '"' # If there are no PoS tags with probabilities, do not include them at the end if all_ptb_pos_with_probabilities_list[i-1]== "": final_tokens_list.append("(" + ", ".join([str(out_id), str(out_start), str(out_start + 1), out_link, out_path, word, out_ipos, out_lrule]) + ")") else: final_tokens_list.append("(" + ", ".join([str(out_id), str(out_start), str(out_start + 1), out_link, out_path, word, out_ipos, out_lrule, all_ptb_pos_with_probabilities_list[i-1]]) + ")") out_id += 1 out_start += 1 if output_flags['dt_yy']: fhdl_dict['dt_yy'].write('[' + str(fi) + '] |' + " ".join(final_tokens_list) + "\n") #print('[' + str(fi) + '] |' + " ".join(final_tokens_list) + "\n") def convert_dtm_ptb_tok(self, fi, tokens_list, lemma_tokens_dict, pos_tag_tokens_dict, ptb_pos_list,dt_sent_root, dt_dep_rel_tokens_list, args_dict, output_flags, fhdl_dict, dm_sent_root, dm_eds_dep_indexes, eds_relation_pos_dict): fhdl_dict['dtm'].write('#' + str(fi) + '\n') #FIRST WORK ON DM REPRESENTATION #Sort relations by position in the sentence # Now they are unsorted in the dictionary eds_relation_pos_dict sorted_eds_relation_pos_list = sorted(eds_relation_pos_dict.keys()) #print("eds_relation_pos_dict") #pprint(eds_relation_pos_dict) #print "Sorted EDS RELATION POS LIST" #pprint(sorted_eds_relation_pos_list) #Rearrange the array for DM dm_dependencies = collect_arguments_for_dm_predicates(dm_eds_dep_indexes, eds_relation_pos_dict, 
sorted_eds_relation_pos_list) # WORK WITH DT #print("dt_dep_rel_tokens_list") #pprint(dt_dep_rel_tokens_list) dt_dep_indexes_dict = create_dep_indexes_dict (dt_dep_rel_tokens_list) #print("dt_dep_indexes_dict") #pprint(dt_dep_indexes_dict) # NOW COLLECT ALL INFO FOR DT AND DM AND WRITE OUTPUT for i in range(1, len(tokens_list) + 1): # COLLECT INFO ABOUT DT dt_head = '_' dt_label = '_' if i in dt_dep_indexes_dict.keys(): dt_head = dt_dep_indexes_dict[i][0] dt_label = dt_dep_indexes_dict[i][1] #"if" instead of "elif" because in file 10100030.gz we have a dependency to root node due to errors in PTB tokenization. Root should not be dependent #on anything therefore this "if" will re-write the dependency of the previous "if". If we use "elif" we would get sentence without root in CoNLL format. if i == dt_sent_root: dt_head = 0 dt_label = 'ROOT' word = tokens_list[i-1] #print 'WORD: ', word, '\n' if self.is_punctuation_ptb_tok(word): lemma = word pos_tag = word else: lemma = lemma_tokens_dict[i] pos_tag = pos_tag_tokens_dict[i] #We have several sentences that end with e.g. # says . `` # and `` is lost in the derivation tree. # PTB tokenization # (64, 22, 23, <22:23>, 1, "says" "says", 0, "null", "VBZ" 1.0) # (65, 23, 24, <23:24>, 1, "." ".", 0, "null", "." 1.0) # (66, 24, 25, <24:25>, 1, "“" "``", 0, "null", "``" 1.0) # Derivation tree: # (224, 20, 21, <22:24>, 1, "says.", 0, "null", "VBZ" 1.0) # Therefore `` did not get a head during our analysis. # We have to fix that by attaching `` to the nearest word on the left which is not a punctuation symbol. if dt_head == '_': for j in range(i-2,-1,-1): if not self.is_punctuation_ptb_tok(tokens_list[j]): dt_head = j+1 if self.is_punctuation_ptb_tok(word): dt_label = 'PUNCT' break if self.is_punctuation_ptb_tok(word): lemma = word pos_tag = word else: lemma = lemma_tokens_dict[i] pos_tag = pos_tag_tokens_dict[i] #NOW COLLECT information about DM #by default dm_predicate is "_" # and for all predicate current word is not an argument by default ("_") dm_predicate = "_" if i in eds_relation_pos_dict: #if current word is a predicate dm_predicate = eds_relation_pos_dict[i] #if current word is the root, we add "^" symbol to the predicate to mark that it is a root, e.g. #^_ (empty predicate, ) if not(dm_sent_root == ''): if i == int(dm_sent_root): dm_predicate = "^" + dm_predicate dm_is_argument = "\t".join(["_"] * len(sorted_eds_relation_pos_list)) if i in dm_dependencies: dm_is_argument = "\t".join(dm_dependencies[i]) fhdl_dict['dtm'].write(str(i) + '\t' + word + '\t' + lemma + '\t' + ptb_pos_list[i-1] + '\t' + pos_tag + '\t' + '_' + '\t' + '_' + '\t' + '_' + '\t' + str(dt_head) + '\t' + dt_label + '\t' + dm_predicate + '\t' + dm_is_argument +'\n') fhdl_dict['dtm'].write('\n') return #FUNCTION8: dep_expansion_to_tokens_ptb_tok def dep_expansion_to_tokens_ptb_tok(self, dep_rel_id_list, tokens_id_dict, tokens_pos_dict, words_pos_dict, tokens_list, pos_tag_dict, lemma_dict): #here we transform the list dep_rel_id_list #[['<24:29>', '<20:23>', 'SP-HD'], #['<17:19>', '<24:29>', 'HD-CMP'], #['<5:16>', '<17:19>', 'HD-CMP'],...] #into dep_rel_tokes_list #[[5,4,'SP-HD'], #[3,5,'HD-CMP'], #[2, 3, 'HD-CMP'],...] 
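#(Each character span such as '<24:29>' is resolved to a 1-based PTB token position below:
# first via tokens_pos_dict, and otherwise via split_dict or group_dict when the
# derivation-tree word had to be split up or grouped to match the PTB tokens.)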
#so we base tokenization on PennTreebank tokenization, not on ERG tokenization #first we have to fill in split_dict and group_dict #since we created list_dep_rel_id_list from words_pos_dict, we will use tokens_pos_dict and words_pos_dict to fill in #split_dict and group_dict and partially fill in dep_rel_id_list (we add relations between the splitted items to it) fill_split_group_dictionaries_results = self.fill_split_group_dictionaries_ptb_tok(words_pos_dict, tokens_pos_dict, tokens_list, pos_tag_dict, lemma_dict) dep_rel_tokens_list = fill_split_group_dictionaries_results[0] eds_dep_indexes = fill_split_group_dictionaries_results[1] group_dict = fill_split_group_dictionaries_results[2] split_dict = fill_split_group_dictionaries_results[3] contracted_neg_split_dict_mrs = fill_split_group_dictionaries_results[4] pos_tag_dict = fill_split_group_dictionaries_results[5] lemma_dict = fill_split_group_dictionaries_results[6] for dep_rel in dep_rel_id_list: #dep_rel looks like ['<24:29>', '<30:32>', 'SP-HD'] #new_dep_rel is an empty list of size three. It will be a re-written version of dep_rel in a form [5,4,'SP-HD'] new_dep_rel = [None]*3 #the label in new_dep_rel will be the same as in dep_rel, e.g. 'SP-HD' new_dep_rel[2] = dep_rel[2] for i in range(0,2): #dep_rel[i] looks like '<24:29>' when i=0 and '<30:32>' when i=1 if dep_rel[i] in tokens_pos_dict: new_dep_rel[i]= tokens_pos_dict[dep_rel[i]] elif dep_rel[i] in split_dict: id_of_main_token = split_dict[dep_rel[i]][0] new_dep_rel[i] = tokens_pos_dict[id_of_main_token] elif dep_rel[i] in group_dict: id_of_main_token = group_dict[dep_rel[i]][0] new_dep_rel[i] = tokens_pos_dict[id_of_main_token] else: print >> sys.stderr, "Id from dep_rel_id_list is not found neither in tokens_pos_dict, nor in split_dict, nor in group_dict" print >> sys.stderr, "Id " + dep_rel[i] + " from the relation " + ', '.join(dep_rel) + " in dep_rel_id_list" #we do not want loops in the tree, so we should make sure that first and second elements in ['<24:29>', '<30:32>', 'SP-HD'] are different if new_dep_rel[0] != new_dep_rel[1]: dep_rel_tokens_list.append(new_dep_rel) dep_expansion_to_tokens_results= [dep_rel_tokens_list, eds_dep_indexes, group_dict, split_dict, contracted_neg_split_dict_mrs, pos_tag_dict, lemma_dict] return dep_expansion_to_tokens_results #FUNCTION9: fill_split_group_dictionaries_ptb_tok def fill_split_group_dictionaries_ptb_tok(self,words_pos_dict, tokens_pos_dict, tokens_list, pos_tag_dict, lemma_dict): dep_rel_tokens_list = [] eds_dep_indexes = [] group_dict = {} split_dict = {} contracted_neg_split_dict_mrs = {} ''' print 'WORDS POS DICTIONARY' pprint(words_pos_dict) print 'TOKENS POS DICTIONARY' pprint(tokens_pos_dict) print 'TOKENS LIST' pprint(tokens_list) print 'POS TAG DICT' pprint(pos_tag_dict) print 'LEMMA DICT' pprint(lemma_dict) ''' tokens_pos_dict_tmp = {} for token_key in tokens_pos_dict.keys(): #tree_key = '<24:29>'. But when we do tree_key[1:-1] we get tree_key = '24:29' token_key_cut = token_key[1:-1] #tree_key_parts = [24, 29] token_key_parts = token_key_cut.split(':') #start_index = 24 token_start_index = int(token_key_parts[0]) #end_index = 29 token_end_index = int(token_key_parts[1]) tokens_pos_dict_tmp[token_key] = [token_start_index, token_end_index] for tree_key in words_pos_dict.keys(): #if tree_key in tokens_pos_dict and len(words_pos_dict[tree_key]) > 1: if tree_key not in tokens_pos_dict: #tree_key = '<24:29>'. 
But when we do tree_key[1:-1] we get tree_key = '24:29' tree_key_cut = tree_key[1:-1] #tree_key_parts = [24, 29] tree_key_parts = tree_key_cut.split(':') #start_index = 24 start_index = int(tree_key_parts[0]) #end_index = 29 end_index = int(tree_key_parts[1]) #now we have to figure out if to split or join words from the derivation tree to match PennTreebank tokenization parts_of_splitted_key = [] for tok_key in tokens_pos_dict_tmp.keys(): if start_index >= tokens_pos_dict_tmp[tok_key][0] and end_index <= tokens_pos_dict_tmp[tok_key][1]: group_dict[tree_key] = tok_key elif tokens_pos_dict_tmp[tok_key][0] >= start_index and tokens_pos_dict_tmp[tok_key][1] <= end_index: parts_of_splitted_key.append(tok_key) #It might be that the part of the splitted word has already been in the words dictionary in the case of "end- state." Here "end-" has id <237:246> and "state." has id #<237:247>. We split "state." into "state" with id <237:246> and "." with id <246:247>. Since id <237:246> has already been in the words_pos_dict because of "end", it means the words "end-" and "state" will be grouped together. This means, POS tag for "end-state" will be the same as for "state", not the same as for "end-". It cannot be punctuation, because punctuation does not function as independent tokens in ERG approach. if tok_key in words_pos_dict: if int(words_pos_dict[tok_key][0]) < int(words_pos_dict[tree_key][0]): #print 'Substitute POS tag ', pos_tag_dict[tok_key], ' with POS tag ', pos_tag_dict[tree_key], '\n\n' pos_tag_dict[tok_key] = pos_tag_dict[tree_key] lemma_dict[tok_key] = lemma_dict[tree_key] #if the word was splitted, we need to define the head. By default, it is the last token in the expression. However, we do not want a punctuation symbol #to be the head so we will check that the head is not a punctuation. #The problem is that we have the keys in parts_of_splitted_key in the random order because we received them in the line #>for tok_key in tokens_pos_dict_tmp.keys(): # since keys are not ordered, we got them in random order. # To set up the last word in the multi-word expression as head we have to order the multi-word expression first. if len(parts_of_splitted_key) > 0: #sort elements in parts_of_splitted_key parts_of_splitted_key = sorted(parts_of_splitted_key, self.compare_ptb_tok) #the head will be the last element of the array head = parts_of_splitted_key[len(parts_of_splitted_key) - 1] #print 'INITIAL HEAD OF COMPOUND: ', head # A flag that shows whether there is a non-punctuation head for the # split token found_non_punctuation_head_flag = 0 # Loop from the last element of the splitted token and # make the last non-punctuation symbol head for counter in range(len(parts_of_splitted_key)-1, -1, -1): #print 'COMPOUND PARTS: ' + parts_of_splitted_key[counter] token_id = parts_of_splitted_key[counter] token_pos = tokens_pos_dict[token_id] #print 'TOKEN POS ' + str(token_pos) #print 'TOKENS LIST' #pprint(tokens_list) token_value = tokens_list[token_pos - 1] if not (self.is_punctuation_ptb_tok(token_value) or (token_value == "n't") or (token_value == "n’t")): #print token_value + ' has no punct' head = token_id found_non_punctuation_head_flag = 1 #print 'HEAD OF COMPOUND INSIDE THE LOOP ', token_value break #print found_non_punctuation_head_flag # Special case! 
If all the elements of the split token are # punctuation symbols, than some symbols should be preferred # over the other if not found_non_punctuation_head_flag: for counter in range(len(parts_of_splitted_key)-1, -1, -1): #print 'COMPOUND PARTS: ' + parts_of_splitted_key[counter] token_id = parts_of_splitted_key[counter] token_pos = tokens_pos_dict[token_id] #print 'TOKEN POS ' + str(token_pos) #print 'TOKENS LIST' #pprint(tokens_list) token_value = tokens_list[token_pos - 1] if self.is_end_of_phrase_punctuation_ptb_tok(token_value): #print token_value + ' has no punct' head = token_id #print 'HEAD OF COMPOUND INSIDE THE LOOP ', token_value break split_dict[tree_key] = [head, parts_of_splitted_key] #NOW WE HAVE TO LOOP AGAIN AND find out if we have split a contracted negation #FOR the Derivation Tree it is not important because the # contracted negation contains apostrofi therefore it contains puntuatin and it won't become a head # For the derivation tree we want "doesn't" to be split into #"does" and "n't" with "does" as the head. So we can use "split_dict" for it #However for the MRS-derived dependencies we want "n't" to become a head therefore we have to create and additional dictionary contracted_neg_split_dict_mrs for counter in range(len(parts_of_splitted_key)-1, -1, -1): #print 'COMPOUND PARTS: ' + parts_of_splitted_key[counter] token_id = parts_of_splitted_key[counter] token_pos = tokens_pos_dict[token_id] #print 'TOKEN POS ' + str(token_pos) #print 'TOKENS LIST' #pprint(tokens_list) token_value = tokens_list[token_pos - 1] if token_value == "n't" or token_value == "n’t": #print token_value + ' contracted negation' contr_neg_head = token_id contracted_neg_split_dict_mrs[tree_key] = [contr_neg_head, parts_of_splitted_key] #print 'HEAD OF COMPOUND INSIDE THE LOOP ', head break #print 'HEAD OF COMPOUND AFTER THE LOOP ', head for counter in range(len(parts_of_splitted_key)-1, -1, -1): #we do not want loops in the tree if parts_of_splitted_key[counter] != head: token_id = parts_of_splitted_key[counter] token_pos = tokens_pos_dict[token_id] #print 'TOKEN POS ' + str(token_pos) #print 'TOKENS LIST' #pprint(tokens_list) token_value = tokens_list[token_pos - 1] head_value = tokens_list[tokens_pos_dict[head]-1] #print 'head_value: ' + head_value if (token_value == "n’t" or token_value == "n't") and not(self.is_punctuation_ptb_tok(token_value)): dep_rel_tokens_list.append([tokens_pos_dict[head],token_pos, 'NEG']) eds_dep_indexes.append([ token_pos, tokens_pos_dict[head],'NEG']) #print "NEGATION: " + head_value elif self.is_punctuation_ptb_tok(token_value) == 0: dep_rel_tokens_list.append([tokens_pos_dict[head], token_pos, 'MWE']) eds_dep_indexes.append([tokens_pos_dict[head], token_pos, 'MWE']) else: dep_rel_tokens_list.append([tokens_pos_dict[head], token_pos, 'PUNCT']) #print 'SPLIT DICT IN THE FUNCTION ' #pprint(split_dict) #print 'Contracted Negation dictionary for MRS-derived dep. 
in the function' #pprint(contracted_neg_split_dict_mrs) #print 'GROUP DICT IN THE FUNCTION ' #pprint(group_dict) fill_split_group_dictionaries_results = [dep_rel_tokens_list, eds_dep_indexes, group_dict, split_dict, contracted_neg_split_dict_mrs, pos_tag_dict, lemma_dict] return fill_split_group_dictionaries_results #FUNCTION10: has_punctuation_ptb_tok # NOT USED, PUNCTUATION LIST IS INCOMPLETE def has_punctuation_ptb_tok(self, string_to_check): has_punct = 0 check = ['!', ',', '.', ':', '?', ';', "'", '"', "“", "”", '(', ')' , '[', ']', '{', '}', '¦'] for p in check: if p in string_to_check: has_punct = 1 break return has_punct #FUNCTION11: compare_ptb_tok def compare_ptb_tok(self, a, b): #a = '<24:29>'. But when we do a[1:-1] we get token1_key_cut = '24:29' token1_key_cut = a[1:-1] #token1_key_parts = [24, 29] token1_key_parts = token1_key_cut.split(':') #token1_start_index = 24 token1_start_index = int(token1_key_parts[0]) #b = '<30:33>'. But when we do b[1:-1] we get token2_key_cut = '30:33' token2_key_cut = b[1:-1] #token2_key_parts = [30, 33] token2_key_parts = token2_key_cut.split(':') #token2_start_index = 30 token2_start_index = int(token2_key_parts[0]) return cmp(int(token1_start_index), int(token2_start_index)) # compare as integers #FUNCTION12: create_pos_tag_token_dict_ptb_tok def create_pos_tag_token_dict_ptb_tok(self, pos_tag_dict, tokens_pos_dict, group_dict, split_dict): pos_tag_tokens_dict = {} for key in pos_tag_dict.keys(): if key in tokens_pos_dict: pos_tag_tokens_dict[tokens_pos_dict[key]] = pos_tag_dict[key] elif key in group_dict: pos_tag_tokens_dict[tokens_pos_dict[group_dict[key]]] = pos_tag_dict[key] elif key in split_dict: for tokens_id in split_dict[key][1]: pos_tag_tokens_dict[tokens_pos_dict[tokens_id]] = pos_tag_dict[key] return pos_tag_tokens_dict #FUNCTION13: create_lemma_token_dict_ptb_tok def create_lemma_token_dict_ptb_tok(self, lemma_dict, tokens_pos_dict, group_dict, split_dict, words_id_dict): lemma_tokens_dict = {} for key in lemma_dict.keys(): if key in tokens_pos_dict: lemma_tokens_dict[tokens_pos_dict[key]] = lemma_dict[key] elif key in group_dict: lemma_tokens_dict[tokens_pos_dict[group_dict[key]]] = lemma_dict[key] elif key in split_dict: for tokens_id in split_dict[key][1]: lemma_tokens_dict[tokens_pos_dict[tokens_id]] = lemma_dict[key] #print "LEMMA TOKENS DICT" #pprint(lemma_tokens_dict) return lemma_tokens_dict #FUNCTION14: def analyze_input_file_ptb_tok(self, fi, file_index, current_latex_file_index, latex_doc_start, fhdl_dict, args_dict, output_flags, rule_head_dict, relations_collection): if output_flags['tex']: file_index = file_index + 1 if file_index % 50 == 0: fhdl_dict['tex'].write('\end{document}') fhdl_dict['tex'].close() current_latex_file_index = current_latex_file_index + 1 fhdl_dict['tex'] = open(args_dict['tex'] + os.path.basename(os.path.normpath(args_dict['data'])) + "_ptb_tok_" + str(current_latex_file_index) + '.tex', 'w') fhdl_dict['tex'].write(latex_doc_start) print >> sys.stderr, fi current_file = gzip.open(args_dict['data'] + '/' + str(fi) + '.gz', 'rb') derivation_tree = '' read_tree = 0 after_derivation_tree = 0 previous_line_empty = 0 read_eds = 0 eds = '' search_original_sentence = 1 #original_sentence = '' search_tokenization = 0 read_tokenization = 0 read_done = 0 partial_line = '' #sometimes we have #("the" # 246 #instead of ("the" 246 #but we need this index to create unique identifiers for words token_index_missing = 0 tokens_id_dict = {} tokens_pos_dict = {} tokens_list = [] ptb_pos_list = [] 
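# A minimal illustration of what these containers end up holding (toy token line in the
# format shown in the tokenization comments below; the concrete values are assumed, not
# taken from a specific corpus file):
#   input line:              (1, 0, 1, <0:3>, 1, "The", 0, "null")
#   tokens_id_dict['<0:3>']  = 'The'   (character span -> surface token)
#   tokens_pos_dict['<0:3>'] = 1       (character span -> 1-based token position)
#   tokens_list              = ['The', ...]
#   ptb_pos_list             = ['_', ...]  (only 8 columns, so no PTB PoS tag is available)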
all_ptb_pos_with_probabilities_list = [] # to count space punct regex = re.compile(' [!|\,|\.|\:|\?|\;|\'|\"|“|”|‘|’|\(|\)|\[|\]|\{|\}|¦|¦i|«|»]+$') for line in current_file.readlines(): if read_done == 0: line = line.strip() #m = re.search('[^`]+`(.*)\'(?: \[(.*)\])?$', line) m = re.search('[^`]+`(.*)\'(?: \[(.*)\])?$', line) if m and search_original_sentence ==1: #Normally the line with the sentence looks like this: #[10320040] (1 of 1) {1} `As of [[June 30]] [[2008]] the company has 19,604 full-time employees.' #But for PEST corpus it is different: #[20201001] (1 of 1) {1} `((S (NP-SBJ (NNP Rolls-Royce) (NNP Motor) (NNPS Cars) (NNP Inc.)) (VP (VBD said) (SBAR (-NONE- 0) (S (NP-SBJ (PRP it)) (VP (VBZ expects) (S (NP-SBJ (PRP$ its) (NNP U.S.) (NNS sales)) (VP (TO to) (VP (VB remain) (ADJP-PRD (JJ steady)) (PP-LOC-CLR (IN at) (NP (QP (IN about) (CD 1,200)) (NNS cars))) (PP-TMP (IN in) (NP (CD 1990)))))))))) (. .)))' [Rolls-Royce Motor Cars Inc. said it expects its U.S. sales to remain steady at about 1,200 cars in 1990.] #if corpus == 'pest': #original_sentence = m.group(2).lower() # original_sentence = m.group(2) #else: #original_sentence = m.group(1).lower() # original_sentence = m.group(1) #if regex.search(m.group(1)): # self.space_punct_count = self.space_punct_count + 1 #This script was used to collect the names of the files that contained cases like # ". ..." and ". '" at the end of the sentence #here we write the file numbers for those files that contain cases like #". ..." at the end #file_handle = open('/Users/angelina/Documents/WeSearch/2012_Contrastive_parsing_experiments/03_First_15_sections/test_converter/error_log.txt', 'a') #file_handle.write(str(fi) + "\n") #file_handle.write(m.group(1) + "\n") #file_handle.write("\n") #file_handle.close() #print m.group(1) ## end of the script that collected ". 
..."-type cases search_original_sentence = 0 search_tokenization = 1 elif previous_line_empty == 1 and search_tokenization == 1 and line == '<': read_tokenization = 1 search_tokenization = 0 elif read_tokenization == 1 and line == '>': read_tokenization = 0 search_derivation_tree = 1 elif read_tokenization == 1 and line != '>': line_parts = line.split(', ') #we remove quotes from the string value that is why we use [1:-1] #we take <0:3> as the key and The as the value token_part = line_parts[5][1:-1] if '" "' in line_parts[5][1:-1]: token_parts = line_parts[5][1:-1].split('" "') token_part = token_parts[0] #WeScience, English MRS Test Suite: #(1, 0, 1, <0:3>, 1, "The", 0, "null") #PEST AND deepbank: #(42, 0, 1, <0:1>, 1, "The" "The", 0, "null", "DT" 1.0) #we decided that the tokens' positions are in the third column (tokens positions start from 1) tokens_id_dict[line_parts[3]] = token_part #tokens_id_dict[line_parts[3]] = token_part.lower() tokens_pos_dict[line_parts[3]] = int(string.replace(line_parts[2], '(', '')) tokens_list.append(token_part) #in WeScience corpus we have 9 columns, and the 9th contains PTB POS #In the English MRS Test Suite we have only 8 columns, so we are not given PTB POS tags char_based_id = line_parts[3] if len(line_parts) >= 9 : ptb_pos_aux_list = line_parts[8].split(' ') ptb_pos = ptb_pos_aux_list[0] ptb_pos = ptb_pos[1:-1] ptb_pos_list.append(ptb_pos) # [0:-1] is necessary to remove closing bracket all_ptb_pos_with_prob = line_parts[8][0:-1] #print "all_ptb_pos_with_prob",line_parts[8] all_ptb_pos_with_probabilities_list.append(all_ptb_pos_with_prob) else: all_ptb_pos_with_probabilities_list.append('') ptb_pos_list.append('_') elif line.startswith('(ROOT'): read_tree = 1 derivation_tree = line + '\n' previous_line_empty = 0 read_eds = 0 elif len(line) == 0: previous_line_empty = 1 if read_tree == 1: read_tree = 0 after_derivation_tree = 1 elif read_eds == 1: read_eds = 0 read_done = 1 elif read_tree == 1: line = line.strip() if token_index_missing == 1 and re.match('^\d+$', line): line = partial_line + '|||||||' + line derivation_tree = derivation_tree + line + '\n' token_index_missing = 0 partial_line = '' else: #this is a pattern for lines like ("understood" 264 match_pattern = re.match('(^\(")([^"]+)("\s+\d+$)', line) #this is a pattern when we have #("the" # 246 #and currently are analyzing line ("the" match_pattern2 = re.match('(^\(")([^"]+)("$)', line) if match_pattern and match_pattern.group(1) and match_pattern.group(2) and match_pattern.group(3): line_begin = match_pattern.group(1) line_content = match_pattern.group(2) line_end = match_pattern.group(3) #print 'LINE ORIGINAL ', line #print 'LINE BEGIN ', line_begin #print 'LINE CONTENT ', line_content #print 'LINE END ', line_end #substitute brackets with <<<<<<<, otherwise we won't be able to read a tree line_content = line_content.replace('(', '<<<<<<<') line_content = line_content.replace(')', '>>>>>>>') #multiword expression, we will consider them as one word at+least if ' ' in line_content: line_content = '+'.join(line_content.split(' ')) line = line_begin + line_content + line_end #now the line #(at least( 25 #will look #(at+least<<<<<<<|||||||25 line = '|||||||'.join(line.split(' ')) #print 'LINE FINAL ', line, '\n\n' derivation_tree = derivation_tree + line + '\n' #this is the case when the line is incomplete because the indificator is on the next line elif match_pattern2 and match_pattern2.group(1) and match_pattern2.group(2) and match_pattern2.group(3): line_begin = match_pattern2.group(1) 
line_content = match_pattern2.group(2) line_end = match_pattern2.group(3) #print 'LINE ORIGINAL ', line #print 'LINE BEGIN ', line_begin #print 'LINE CONTENT ', line_content #print 'LINE END ', line_end line_content = line_content.replace('(', '<<<<<<<') line_content = line_content.replace(')', '>>>>>>>') #multiword expression if ' ' in line_content: line_content = '+'.join(line_content.split(' ')) line = line_begin + line_content + line_end partial_line = line token_index_missing = 1 else: #here we take care about the lines like "token [ +CARG #1=\"the\" +CLASS alphabetic [ +CASE non_capitalized+lower +INITIAL - ] +FORM #1 +FROM \"175\" +ID *diff-list* [ LAST #2=*top* LIST *cons* [ FIRST \"25\" REST #2 ] ] +PRED predsort +TNT null_tnt [ +MAIN tnt_main [ +PRB \"1\" +TAG \"DT\" ] +PRBS *null* +TAGS *null* ] +TO \"178\" +TRAIT native_trait ]")))) #they should not contain brackets in the middle for match_pattern1 in re.finditer('(\\\\\")([^"]+)(\\\\\")',line): #print 'MATCH_PATTERN1.group ', match_pattern1.group() if '(' in match_pattern1.group(0) or ')' in match_pattern1.group(0): line_begin = match_pattern1.group(1) line_content = match_pattern1.group(2) line_end = match_pattern1.group(3) #print 'LINE ORIGINAL ', line #print 'LINE BEGIN ', line_begin #print 'LINE CONTENT ', line_content #print 'LINE END ', line_end line_content = line_content.replace('(', '<<<<<<<') line_content = line_content.replace(')', '>>>>>>>') #print 'substitute ', match_pattern1.group(0), ' with ', line_begin + line_content + line_end line = re.sub(re.escape(match_pattern1.group(0)), line_begin + line_content + line_end, line, 1) line = '|||||||'.join(line.split(' ')) #print 'LINE FINAL ', line, '\n\n' derivation_tree = derivation_tree + line + '\n' previous_line_empty = 0 read_eds = 0 elif after_derivation_tree == 1 and previous_line_empty == 1 and re.match('^\{.+',line) and (line[-1] == ':' or re.search('\:\s+\(fragmented\)', line) or re.search('\:\s+\(cyclic\)', line) or re.search('\:\s+\(cyclic fragmented\)', line)): eds = eds + line + '\n' read_eds = 1 previous_line_empty = 0 after_derivation_tree = 0 elif read_eds == 1: eds = eds + line + '\n' previous_line_empty =0 current_file.close() try: t = Tree.parse(derivation_tree) except ValueError: raise ValueError("Missing or incorrect derivation tree!") t.collapse_unary(True, True, '///////') #t.draw() traverse_results = self.my_traverse(t, [], [], {}, {}, {},{}, {}, {},'ptb') first_dep = '' if len(traverse_results[0]) > 0: first_dep = traverse_results[0][0] words_list = traverse_results[1] dep_dict = traverse_results[2] words_pos_dict = traverse_results[3] #print 'WORDS POS DICT' #pprint(words_pos_dict) pos_tag_dict = traverse_results[4] lemma_dict = traverse_results[5] indexes = range(1,len(words_list)+1) #print('\n\n WORDS LIST\n') #pprint(words_list) #print '====================\n\n\n DEPENDENCY DICTIONARY\n\n' #pprint(dep_dict) #print '\n\n First dependency: ' + first_dep + '\n\n' sentence = '' for i in range(0,len(tokens_list)): sentence = sentence + tokens_list[i] + ' ' sentence = sentence.strip() words_dict = dict(zip(words_list, indexes)) #print '\n\n WORDS DICTIONARY\n' #pprint(words_dict) #print '\n\n WORDS POSITION DICTIONARY\n' #pprint(words_pos_dict) #merge words_pos_dict and words_list into a new dictionary words_id_dict # words_pos_dict looks like #{'<0:2>': 1, #'<100:105>': 17, #'<106:110>': 18, #'<111:113>': 19, # ... #} #words_list looks like #['"it"|||||||430', # ... #'"large"|||||||329', #'"body"|||||||331', #'"of"|||||||333', # ... 
#] #So the new words_id_dict will look like: #{'"it"|||||||430': '<0:2>' #'"large"|||||||329': '<100:105>' #'"body"|||||||331': '<106:110>' #'"of"|||||||333': '<111:113>' # ... #} words_id_dict = {} #pprint(words_pos_dict) for word_id in words_pos_dict.keys(): for k in range(0, len(words_pos_dict[word_id])): words_id_dict[words_list[words_pos_dict[word_id][k]-1]] = word_id #pprint(words_id_dict) dep_rel = [] dep_rel_id_list = [] dep_rel_tokens_list = [] group_dict = {} split_dict = {} contracted_neg_split_dict_mrs = {} sent_root = '' eds_dep_indexes = [] if first_dep != '': dep_rel.append(first_dep) #print 'WORDS ID DICT' #pprint(words_id_dict) dep_rel_id_list = self.dep_expansion_ptb_tok(fi, rule_head_dict, dep_rel, words_id_dict, dep_dict, []) #print 'DEPENDENCY RELATION ID LIST' #pprint(dep_rel_id_list) #print 'WORDS ID DICT AFTER DEP EXPANSION' #pprint(words_id_dict) #print '\n\n' #now we need to compute real relationships between separate tokens dep_expansion_to_tokens_results = self.dep_expansion_to_tokens_ptb_tok(dep_rel_id_list, tokens_id_dict, tokens_pos_dict, words_pos_dict, tokens_list, pos_tag_dict, lemma_dict) dep_rel_tokens_list = dep_expansion_to_tokens_results[0] eds_dep_indexes = dep_expansion_to_tokens_results[1] group_dict = dep_expansion_to_tokens_results[2] split_dict = dep_expansion_to_tokens_results[3] contracted_neg_split_dict_mrs = dep_expansion_to_tokens_results[4] pos_tag_dict = dep_expansion_to_tokens_results[5] lemma_dict = dep_expansion_to_tokens_results[6] #print 'SPLIT DICTIONARY' #pprint(split_dict) #print 'DEP REL TOKENS LIST' #pprint(dep_rel_tokens_list) sent_root_id = words_id_dict[first_dep] if sent_root_id in tokens_pos_dict: sent_root = tokens_pos_dict[sent_root_id] elif sent_root_id in group_dict: sent_root = tokens_pos_dict[group_dict[sent_root_id]] elif sent_root_id in split_dict: sent_root = tokens_pos_dict[split_dict[sent_root_id][0]] #one-word sentence: the word should be the root elif first_dep == '' and len(tokens_pos_dict) == 1: sent_root = 1 #This is a complex case such as file ws01/10011680.gz where # PTB tokenization is: # (1, 0, 1, <30:39>, 1, "Reduction", 0, "null", "NNP" 0.537 "NN" 0.463) #(2, 1, 2, <44:45>, 1, ".", 0, "null", "." 1.0) # while ERG approach looks different: # (22 reduction_n1/n_pp_mc-of_le 0 0 1 # ("reduction." 19 #+FROM \"30\" TO \"45\" # In this case we have one-word sentence for ERG and two-word sentence for PTB elif first_dep == '' and len(words_pos_dict) == 1 and len(tokens_pos_dict) > 1: fill_split_group_dictionaries_results = self.fill_split_group_dictionaries_ptb_tok(words_pos_dict, tokens_pos_dict, tokens_list, pos_tag_dict, lemma_dict) dep_rel_tokens_list = fill_split_group_dictionaries_results[0] eds_dep_indexes = fill_split_group_dictionaries_results[1] group_dict = fill_split_group_dictionaries_results[2] split_dict = fill_split_group_dictionaries_results[3] contracted_neg_split_dict_mrs = fill_split_group_dictionaries_results[4] pos_tag_dict = fill_split_group_dictionaries_results[5] lemma_dict = fill_split_group_dictionaries_results[6] #dep_rel_tokens_list will contain the dependency relations. As we are breaking out a compound with/without punctuation or a word with punctuation, # it is likely that the head is the same in each of the new dependency relation. 
So we will take the first head of the first relation in the list # as a sentence root sent_root = dep_rel_tokens_list[0][0] if output_flags['sent_tok']: fhdl_dict['sent_tok'].write('#' + str(fi) + "\t" + sentence + "\n") sentence = escape_sent(sentence) words_in_sent = sentence.split(' ') #print 'EDS' #print eds + '\n\n' read_eds_results = self.read_eds_into_dict(eds, relations_collection) eds_root = read_eds_results[0] eds_dict = read_eds_results[1] dict_id_pos = read_eds_results[2] transparent_dict = read_eds_results[3] eds_relation_dict = read_eds_results[4] #print 'EDS DICT' #pprint(eds_dict) #print 'EDS RELATION DICT' #pprint(eds_relation_dict) #print 'TRANSPARENT DICTIONARY' #pprint(transparent_dict) #print '\n\n' (eds_dep_indexes, eds_relation_pos_dict) = self.eds_expansion_ptb_tok(eds_dep_indexes, eds_dict, tokens_pos_dict, dict_id_pos, transparent_dict, split_dict, contracted_neg_split_dict_mrs, group_dict, eds_relation_dict) #print 'EDS DEP INDEXES' #pprint(eds_dep_indexes) sent_root_mrs_derived = '' #if eds_root in eds_dict and eds_root in dict_id_pos: if eds_root in dict_id_pos: #EXAMPLE: #WeScience corpus, sentence 10010020 #eds_root = 'e2' #dict_id_pos['e2'] = '<95:97>' #It is incorrect to set a root as words_pos_dict['<95:97>'] = 10 because we #split two commas and the root is no more the word number 10 in the sentence #Therefore we use tokens_pos_dict['<95:97>'] = 13 which is a new position #of the root in the sentence (after the punctuation was cut off from the words #it was attached to. sent_root_mrs_derived_id = dict_id_pos[eds_root] if sent_root_mrs_derived_id in tokens_pos_dict: sent_root_mrs_derived = tokens_pos_dict[sent_root_mrs_derived_id] elif sent_root_mrs_derived_id in group_dict: sent_root_mrs_derived = tokens_pos_dict[group_dict[sent_root_mrs_derived_id]] elif sent_root_mrs_derived_id in split_dict and sent_root_mrs_derived_id in contracted_neg_split_dict_mrs: sent_root_mrs_derived = tokens_pos_dict[contracted_neg_split_dict_mrs[sent_root_mrs_derived_id][0]] elif sent_root_mrs_derived_id in split_dict: sent_root_mrs_derived = tokens_pos_dict[split_dict[sent_root_mrs_derived_id][0]] if output_flags['tex']: fhdl_dict['tex'].write('\\begin{center}\n' + '\\begin{dependency}[edge below]\n' + '\\begin{deptext}[column sep=.05cm]\n') fhdl_dict['tex'].write(words_in_sent[0]) for i in range(1, len(words_in_sent)): fhdl_dict['tex'].write(' \& ' + words_in_sent[i]) fhdl_dict['tex'].write('\\\\\n' + '\end{deptext}\n') pos_tag_tokens_dict = self.create_pos_tag_token_dict_ptb_tok(pos_tag_dict, tokens_pos_dict, group_dict, split_dict) lemma_tokens_dict = self.create_lemma_token_dict_ptb_tok(lemma_dict, tokens_pos_dict, group_dict, split_dict, words_id_dict) if output_flags['dt'] or output_flags['tex']: #if we have to print out tex file, we anyway have to extract everything for DT self.convert_dt_ptb_tok(fi, tokens_list, lemma_tokens_dict, pos_tag_tokens_dict, ptb_pos_list, sent_root, dep_rel_tokens_list, args_dict, output_flags, fhdl_dict) if output_flags['dm'] or output_flags['tex']: #if we have to print out tex file, we anyway have to extract everything for DM self.convert_dm_ptb_tok(fi, tokens_list, lemma_tokens_dict, pos_tag_tokens_dict, ptb_pos_list, sent_root_mrs_derived, eds_dep_indexes, args_dict, output_flags,fhdl_dict) if output_flags['dtm']: self.convert_dtm_ptb_tok(fi, tokens_list, lemma_tokens_dict, pos_tag_tokens_dict, ptb_pos_list, sent_root, dep_rel_tokens_list, args_dict, output_flags, fhdl_dict, sent_root_mrs_derived, eds_dep_indexes, eds_relation_pos_dict) 
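# For reference, convert_dt_yy_ptb_tok (called below when --dt_yy is requested) emits two
# kinds of yy records, mirroring the example at the top of that function:
#   dependency pseudo-token: (id, start, start+1, 1, "⌊→¦label¦char_pos⌋", 0, "null")
#   word token:              (id, start, start+1, <from:to>, 1, "word", 0, "null"[, PTB tags with probabilities])
# (illustrative shapes only; the exact fields are assembled inside convert_dt_yy_ptb_tok)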
#pprint(split_dict) if output_flags['dt_yy']: self.convert_dt_yy_ptb_tok(fi, all_ptb_pos_with_probabilities_list, tokens_pos_dict, tokens_list, lemma_tokens_dict, pos_tag_tokens_dict, ptb_pos_list, sent_root, dep_rel_tokens_list, split_dict, args_dict, output_flags, fhdl_dict) if output_flags['tex']: fhdl_dict['tex'].write('\end{dependency}\n' + '\\\\\n' + str(fi) + '.gz' + '\end{center}' + '\n\\newpage') return [file_index, current_latex_file_index, fhdl_dict] #FUNCTION def is_punctuation_ptb_tok(self, string_to_check): is_punct = 0 matchObj = re.match( r'^[!|\,|\.|\:|\?|\;|\'|\"|“|”|‘|’|`|\(|\)|\[|\]|\{|\}|¦|¦i|¦|«|»]+$', string_to_check) if matchObj: is_punct = 1 return is_punct #FUNCTION def is_end_of_phrase_punctuation_ptb_tok(self, string_to_check): is_end_of_phrase_punct = 0 matchObj = re.match( r'^[!|\,|\.|\:|\?|\;]+$', string_to_check) if matchObj: is_end_of_phrase_punct = 1 return is_end_of_phrase_punct #FUNCTION is_punctuation_erg_tok # The only difference with is_punctuation_ptb_tok is that we do not check for "¦i" because # when we call this function, we intend to send as string_to_check only one letter def is_punctuation_erg_tok(self, string_to_check): is_punct = 0 matchObj = re.match( r'^[!|\,|\.|\:|\?|\;|\'|\"|“|”|‘|’|`|\(|\)|\[|\]|\{|\}|¦|¦|«|»]+$', string_to_check) if matchObj: is_punct = 1 return is_punct #========================================# # FUNCTIONS FOR ERG TOKENIZATION # #========================================# #FUNCTION1: extract_pos_tag_and_lemma_erg_tok def extract_pos_tag_and_lemma_erg_tok(self, grand_parent, pos_tag_dict, lemma_dict, index): pos_tag = '' if grand_parent is not None: #example of grand_parent value: #224|||||||be_id_is/v_np_is_le|||||||0.776934|||||||1|||||||2 #the pos_tag will be 'v_np_is_le' #the lemma will be 'be' grand_parent_pieces = grand_parent.split('|||||||') pos_tag_incorporated = grand_parent_pieces[1] if '/' in pos_tag_incorporated: pos_tag_incorporated_pieces = pos_tag_incorporated.split('/') lemma = pos_tag_incorporated_pieces[0] pos_tag = pos_tag_incorporated_pieces[1] elif '@' in pos_tag_incorporated: pos_tag_incorporated_pieces = pos_tag_incorporated.split('@') lemma = pos_tag_incorporated_pieces[0] pos_tag = pos_tag_incorporated_pieces[1] #now cut pos tag up so that we take only the part before first "_" #sub_parts = pos_tag.split('_') #pos_tag = sub_parts[0] #pos_tag = '_'.join([sub_parts[0],sub_parts[1]]) #pos_tag = '_'.join([sub_parts[0],sub_parts[1], sub_parts[2]]) pos_tag_dict[len(pos_tag_dict.keys())] = pos_tag lemma_dict[len(lemma_dict.keys())] = lemma return [pos_tag_dict, lemma_dict] #FUNCTION2: find_key_index_erg_tok def find_key_index_erg_tok(self, key, transparent_dict, dict_id_pos, words_pos_dict): key_id = None while key in transparent_dict: key = transparent_dict[key] if (key in dict_id_pos) and (dict_id_pos[key] in words_pos_dict): key_id = words_pos_dict[dict_id_pos[key]][0] return key_id #FUNCTION2.1: find_key_index_pred_erg_tok (it is different, because for predicate names we do # not want to substitute the predicate with a value in the transparent dict, e.g.) # # Abrams arrived and Browne sang. 
# {e3: #_1:proper_q<0:6>[BV x6] #x6:named<0:6>("Abrams")[] #e10:_arrive_v_1<7:14>[ARG1 x6] #e3:_and_c<15:18>[L-INDEX e10, R-INDEX e13, L-HNDL e10, R-HNDL e13] #_2:proper_q<19:25>[BV x16] #x16:named<19:25>("Browne")[] #e13:_sing_v_1<26:31>[ARG1 x16] #} # # e3 is transparent and is equated with e10 # Function find_key_index_erg_tok is auxilliary to determine dependency relation # and in that case we want e10 to be the head of e13 with the label _and_c # But when we identify the predicate name we still want e10 to have predicate "_arrive_v_1" (not "_and_c"): #1 Abrams _ n_-_pn_le n_-_pn_le _ _ _ 2 SB-HD named _ ARG1 _ _ #2 arrived _ v_-_le v_-_le _ _ _ 3 CL-CL ^_arrive_v_1 _ _ _ _ #3 and _ c_xp_and_le c_xp_and_le _ _ _ 0 ROOT _ _ _ _ _ #4 Browne _ n_-_pn_le n_-_pn_le _ _ _ 5 SB-HD named _ _ _ ARG1 #5 sang. _ v_np*_le v_np*_le _ _ _ 3 MRK-NH _sing_v_1 _ _and_c _ _ # def find_key_index_pred_erg_tok(self, key, transparent_dict,dict_id_pos, words_pos_dict): key_id = None if key not in transparent_dict: # e.g. "and" should not get a predicate because it is transparent if (key in dict_id_pos) and (dict_id_pos[key] in words_pos_dict): key_id = words_pos_dict[dict_id_pos[key]][0] return key_id #FUNCTION3: eds_expansion_erg_tok def eds_expansion_erg_tok(self, eds_dict, words_pos_dict, dict_id_pos, transparent_dict, eds_relation_dict): eds_dep_indexes = [] eds_relation_pos_dict = {} for key, value in eds_dict.iteritems(): head = self.find_key_index_erg_tok(key, transparent_dict, dict_id_pos, words_pos_dict) #head can be 'None' for the cases when the head is not a word token if not(head is None): for dep in value: label = dep[0] dependent = self.find_key_index_erg_tok(dep[1], transparent_dict, dict_id_pos, words_pos_dict) #if key == '|_31': # print dependent if head != dependent and not(dependent is None): eds_dep_indexes.append([head, dependent, label]) #NOW EXPAND EDS_RELATION_DICT # THIS DICTIONARY IS FOR ALL PREDICATES (NOT ONLY FOR THOSE THAT ARE HEADS FOR SOME DEPENDENTS) # we had a dictionary "eds_relation_dict" for relations that mapped 'e30' -> _leading_a_1, # 'x9' -> _and_c # now we find that the position of 'e30' in the sentence is 5 and #the position of 'x9' in the sentence is 1, #and build a new dictionary eds_relation_pos_dict that maps 5 -> _leading_a_1, 1 -> _and_c for key in eds_relation_dict.keys(): key_pos_in_sent = self.find_key_index_pred_erg_tok(key, transparent_dict, dict_id_pos, words_pos_dict) if not (key_pos_in_sent is None): eds_relation_pos_dict[key_pos_in_sent] = eds_relation_dict[key] #print 'EDS DEPENDENCY INDEXES:' #pprint(eds_dep_indexes) #print '\n\n' return [eds_dep_indexes, eds_relation_pos_dict] #FUNCTION4: dep_expansion_erg_tok def dep_expansion_erg_tok(self, fi, rule_head_dict, dep_rel, words_dict, dep_dict, dep_rel_indexes): # dep_rel[0] can be 454|||||||SP-HD_N_C|||||||1.49349|||||||0|||||||2 #We could have dep_dict like this: {'"burkina faso"|||||||22': []} which means that len(dep_dict[dep_rel[0]]) = 0 if len(dep_dict[dep_rel[0]]) > 0: for i in range(0,len(dep_dict[dep_rel[0]])): if dep_dict[dep_rel[0]][i] in words_dict: dep_dict[dep_rel[0]][i] = words_dict[dep_dict[dep_rel[0]][i]] else: new_dep_rel = [] new_dep_rel.append(dep_dict[dep_rel[0]][i]) self.dep_expansion_erg_tok(fi, rule_head_dict, new_dep_rel, words_dict, dep_dict, dep_rel_indexes) #here we can potentially fail dep_dict[dep_rel[0]][i] = words_dict[dep_dict[dep_rel[0]][i]] dep_rel_parts = dep_rel[0].split('|||||||') #print dep_rel_parts[1] if re.match('^\^', dep_rel_parts[1], re.IGNORECASE): 
dep_rel_parts[1] = dep_rel_parts[1][1:] #print dep_rel_parts[1] #print '\n' if dep_rel_parts[1].lower() in rule_head_dict and len(dep_dict[dep_rel[0]]) == int(rule_head_dict[ dep_rel_parts[1].lower()][0]): head_index = int(rule_head_dict[ dep_rel_parts[1].lower()][1]) dep_rel_subparts = dep_rel_parts[1].split('_') #label = '_'.join([dep_rel_subparts[0],dep_rel_subparts[1]]) #label = dep_rel_parts[1] label = dep_rel_subparts[0] head = dep_dict[dep_rel[0]][head_index] for i in range(0,len(dep_dict[dep_rel[0]])): if dep_dict[dep_rel[0]][i] != head: dependent = dep_dict[dep_rel[0]][i] dep_rel_indexes.append([head, dependent,label]) if not dep_rel[0] in words_dict: words_dict[dep_rel[0]] = head #if head_index == 0: #head = dep_dict[dep_rel[0]][0] #dependent = dep_dict[dep_rel[0]][1] #elif head_index == 1: #head = dep_dict[dep_rel[0]][1] #dependent = dep_dict[dep_rel[0]][0] #else: #print >> sys.stderr, "Unexpected head index of the rule " + dep_rel_parts[1].lower() + " (head index is neither 0, nor 1; head index is " + str(head_index) + ")." #sys.exit(1) else: print >> sys.stderr, "Error! Sentence id " + str(fi) + ". Rule " + dep_rel_parts[1].lower() + ". Unknown rule (rule is not in the list of rules in the file erg.hds) or incorrect number of daughters in the tree." sys.exit(1) return dep_rel_indexes def convert_dt_erg_tok(self, fi, words_list, words_correct_case_dict, pos_tag_dict, lemma_dict, sent_root, dep_rel_indexes, args_dict, output_flags, fhdl_dict): if output_flags['dt']: fhdl_dict['dt'].write('#' + str(fi) + '\n') #we have a list dep_indexes that contains lists in it. Each of the inner lists represents head, dependent, label. # but when we write a sentence in CoNLL format, for each word we need its head and label. #so we convert our list into a dictionary where keys are dependents and values are lists (in the Derivation Tree one dependent can have only one head) that represent head and label dep_indexes_dict = create_dep_indexes_dict(dep_rel_indexes) for i in range(1, len(words_list) + 1): head = '_' label = '_' if i in dep_indexes_dict.keys(): head = dep_indexes_dict[i][0] label = dep_indexes_dict[i][1] elif i == sent_root: head = 0 label = 'ROOT' if output_flags['tex']: if sent_root != '': fhdl_dict['tex'].write('\deproot[edge above, edge style={red}]{' + str(sent_root) + '}' + '{root}\n') #print '\n\n ELEMENT OF WORDS_LIST: ', words_list[i-1] word = words_correct_case_dict[words_list[i-1]] #print 'WORD: ', word, '\n' if output_flags['dt']: fhdl_dict['dt'].write(str(i) + '\t' + word + '\t' + "_" + '\t' + pos_tag_dict[i-1] + '\t' + pos_tag_dict[i-1] + '\t' + '_' + '\t' + str(head) + '\t' + label + '\t' + '_' + '\t' + '_' +'\n') if 'dt' in fhdl_dict: fhdl_dict['dt'].write('\n') if output_flags['tex']: for i in range(0,len(dep_rel_indexes)): #escape underscore symbol in the latex dep_rel_indexes[i][2] = dep_rel_indexes[i][2].replace('_', '\_') fhdl_dict['tex'].write('\depedge[edge above, edge style={red}]' + '{' + str(dep_rel_indexes[i][0]) + '}' + '{' + str(dep_rel_indexes[i][1]) + '}' + '{' + dep_rel_indexes[i][2] + '}' + '\n') return def convert_dm_erg_tok(self, fi, words_list, words_correct_case_dict,pos_tag_dict, lemma_dict, eds_dep_indexes, mrs_derived_sent_root, args_dict, output_flags, fhdl_dict): #print 'WORDS LIST' #pprint(words_list) #print 'POS TAG DICT' #pprint(pos_tag_dict) #print 'LEMMA DICT' #pprint(lemma_dict) #print 'MRS DERIVED SENT ROOT' #pprint(mrs_derived_sent_root) #print 'EDS DEP INDEXES' #pprint(eds_dep_indexes) #if there is no root, then our variable 
mrs_derived_sent_root = 0 #if there is a root, its index is in a string format and is calculated starting from 1: #Example: #'This is the most common conception...' # mrs_derived_sent_root = '2' (the copula 'is' is the root in this case) #words_list has a format: #['"this"|||||||192', # '"is"|||||||149', # '"the"|||||||151', # '"most"|||||||175', # '"common"|||||||153', # '"conception,"|||||||169', if output_flags['dm']: fhdl_dict['dm'].write('#' + str(fi) + '\n') if mrs_derived_sent_root != '': word = words_correct_case_dict[words_list[int(mrs_derived_sent_root) - 1]] if output_flags['dm']: fhdl_dict['dm'].write('ROOT' + '\t' + '_' + '\t' + 'ROOT' + '\t' + 'ROOT' + '\t' + '-1' + '\t' + 'ROOT' + '\t' + 'ROOT' + '\t' + word + '\t' + '_' + '\t' + pos_tag_dict[int(mrs_derived_sent_root) - 1] + '\t' + pos_tag_dict[int(mrs_derived_sent_root) - 1] + '\t' + str(int(mrs_derived_sent_root) - 1) + '\n') if output_flags['tex']: fhdl_dict['tex'].write('\deproot[edge style={blue}]{' + mrs_derived_sent_root + '}' + '{root}\n') for i in range(0, len(eds_dep_indexes)): head_word_index = eds_dep_indexes[i][0] - 1 head_word = words_correct_case_dict[words_list[head_word_index]] head_word_pos = pos_tag_dict[head_word_index] head_lemma = lemma_dict[head_word_index] dep_word_index = eds_dep_indexes[i][1] - 1 dep_word = words_correct_case_dict[words_list[dep_word_index]] dep_word_pos = pos_tag_dict[dep_word_index] dep_lemma = lemma_dict[dep_word_index] label = eds_dep_indexes[i][2] if output_flags['dm']: fhdl_dict['dm'].write(head_word + '\t' + "_" + '\t' + head_word_pos + '\t' + head_word_pos + '\t' + str(head_word_index) + '\t' + label + '\t' + label + '\t' + dep_word + '\t' + "_" + '\t' + dep_word_pos + '\t' + dep_word_pos + '\t' + str(dep_word_index) + '\n') if output_flags['tex']: #escape underscore in latex eds_dep_indexes[i][2] = eds_dep_indexes[i][2].replace('_', '\_') fhdl_dict['tex'].write('\depedge[edge style={blue}]' + '{' + str(eds_dep_indexes[i][0]) + '}' + '{' + str(eds_dep_indexes[i][1]) + '}' + '{' + eds_dep_indexes[i][2] + '}' + '\n') if output_flags['dm']: fhdl_dict['dm'].write('\n') return #FUNCTION6: print both DT and DM in one file in CoNLL08 formatted file # fi - The name of the original file (e.g. "20201001") # words_list - Word tokens of the sentence # pos_tag_dict - Part-of-speech tags for each word token # lemma_dict - Lemma for each word token # dt_sent_root - Sentence root in DT representation # dt_dep_rel_indexes - Dependencies in DT representation # output_hdl - Output file, where we write DT and DM in ConLL08 form # dm_sent_root_mrs_derived - Sentence root in DM representation # dm_eds_dep_indexes - Dependencies in DM representation # eds_relation_pos_dict - Dictionary of predicates that maps predicate position in the sentence to its name, # e.g. 
16 -> 'named', 17 -> '_firm_n_1' def convert_dtm_erg_tok(self, fi, words_list, words_correct_case_dict,pos_tag_dict, lemma_dict, dt_sent_root, dt_dep_rel_indexes, dm_sent_root, dm_eds_dep_indexes, eds_relation_pos_dict, args_dict, output_flags, fhdl_dict): # We keep the name of the original file so that we could find source files for sentences with interesting fenomena fhdl_dict['dtm'].write('#' + str(fi) + '\n') #FIRST WORK ON DM REPRESENTATION #Sort relations by position in the sentence # Now they are unsorted in the dictionary eds_relation_pos_dict sorted_eds_relation_pos_list = sorted(eds_relation_pos_dict.keys()) #print("eds_relation_pos_dict") #pprint(eds_relation_pos_dict) #print "Sorted EDS RELATION POS LIST" #pprint(sorted_eds_relation_pos_list) #Rearrange the array for DM dm_dependencies = collect_arguments_for_dm_predicates(dm_eds_dep_indexes, eds_relation_pos_dict, sorted_eds_relation_pos_list) #print 'DM DEPENDENCIES' #pprint(dm_dependencies) # NOW WORK WITH DT REPRESENTATION #we have a list dt_dep_rel_indexes that contains lists in it. Each of the inner lists represents head, dependent, label. # but when we write a sentence in CoNLL format, for each word we need its head and label. #so we convert our list into a dictionary where keys are dependents and values are lists (in the Derivation Tree one dependent can have only one head) that represent head and label dt_dep_indexes_dict = create_dep_indexes_dict(dt_dep_rel_indexes) #NOW LOOP OVER ALL WORDS IN THE SENTENCE # COLLECT DT AND DM INFORMATION AND PRINT IT out for i in range(1, len(words_list) + 1): dt_head = '_' dt_label = '_' if i in dt_dep_indexes_dict.keys(): dt_head = dt_dep_indexes_dict[i][0] dt_label = dt_dep_indexes_dict[i][1] elif i == dt_sent_root: dt_head = 0 dt_label = 'ROOT' #print '\n\n ELEMENT OF WORDS_LIST: ', words_list[i-1] word = words_correct_case_dict[words_list[i-1]] #print 'WORD: ', word, '\n' #NOW COLLECT information about DM #by default dm_predicate is "_" # and for all predicate current word is not an argument by default ("_") dm_predicate = "_" if i in eds_relation_pos_dict: #if current word is a predicate dm_predicate = eds_relation_pos_dict[i] #if current word is the root, we add "^" symbol to the predicate to mark that it is a root, e.g. 
#^_ (empty predicate, ) if not(dm_sent_root == ''): if i == int(dm_sent_root): dm_predicate = "^" + dm_predicate dm_is_argument = "\t".join(["_"] * len(sorted_eds_relation_pos_list)) if i in dm_dependencies: dm_is_argument = "\t".join(dm_dependencies[i]) fhdl_dict['dtm'].write(str(i) + '\t' + word + '\t' + lemma_dict[i-1] + '\t' + pos_tag_dict[i-1] + '\t' + pos_tag_dict[i-1] + '\t' + '_' + '\t' + str(dt_head) + '\t' + dt_label + '\t' + '_' + '\t' + '_' + '\t' + dm_predicate + '\t' + dm_is_argument +'\n') fhdl_dict['dtm'].write('\n') return def analyze_input_file_erg_tok(self, fi, file_index, current_latex_file_index, latex_doc_start, fhdl_dict, args_dict, output_flags, rule_head_dict, relations_collection, eds_dep_labels_dict): if output_flags['tex']: file_index = file_index + 1 if file_index % 50 == 0: fhdl_dict['tex'].write('\end{document}') fhdl_dict['tex'].close() current_latex_file_index = current_latex_file_index + 1 fhdl_dict['tex'] = open(args_dict['tex'] + os.path.basename(os.path.normpath(args_dict['data'])) + "_erg_tok_" + str(current_latex_file_index) + '.tex', 'w') fhdl_dict['tex'].write(latex_doc_start) print >> sys.stderr, fi current_file = gzip.open(args_dict['data'] + '/' + str(fi) + '.gz', 'rb') derivation_tree = '' cfg_tree = '' read_tree = 0 after_derivation_tree = 0 previous_line_empty = 0 read_eds = 0 eds = '' search_original_sentence = 1 search_ptb_tokenization = 1 search_erg_tokenization = 0 read_ptb_tokenization = 0 read_erg_tokenization = 0 original_sentence = '' read_done = 0 partial_line = '' #sometimes we have #("the" # 246 #instead of ("the" 246 #but we need this index to create unique identifiers for words token_index_missing = 0 # ERG tokenization. We collect only ID and token, e.g. ''' (235, 23, 24, <123:129>, 1, "Tunick", 0, "null") (302, 23, 24, <123:129>, 1, "Tunick", 0, "null", "NNP" 1.0) (315, 23, 24, <123:129>, 1, "tunick", 0, "null") ''' #erg_tok_dict['<123:129>'] = ["Tunick", "Tunick", "tunick"] erg_tok_dict = defaultdict(list) #reader = codecs.getreader("utf-8") #contents = reader( current_file ) #for line in contents: for line in current_file.readlines(): if read_done == 0: #line = line_from_gzip.decode('utf-8') line = line.strip() m = re.search('[^`]+`(.*)\'(?: \[.*\])?$', line) if m and search_original_sentence ==1: #original_sentence = m.group(1).lower() original_sentence = m.group(1) search_original_sentence = 0 search_tokenization = 1 elif previous_line_empty == 1 and search_ptb_tokenization == 1 and line == '<': read_ptb_tokenization = 1 search_ptb_tokenization = 0 elif read_ptb_tokenization == 1 and line == '>': read_ptb_tokenization = 0 search_erg_tokenization = 1 elif read_erg_tokenization == 1 and line == '>': read_erg_tokenization = 0 search_derivation_tree = 1 elif previous_line_empty == 1 and search_erg_tokenization == 1 and line == '<': read_erg_tokenization = 1 search_erg_tokenization = 0 elif read_erg_tokenization == 1 and line != '>': line_parts = line.split(', ') #we remove quotes from the string value that is why we use [1:-1] #we take <0:3> as the key and The as the value token_part = line_parts[5][1:-1] if '" "' in line_parts[5][1:-1]: token_parts = line_parts[5][1:-1].split('" "') token_part = token_parts[0] #WeScience, English MRS Test Suite: #(1, 0, 1, <0:3>, 1, "The", 0, "null") #PEST AND deepbank: #(42, 0, 1, <0:1>, 1, "The" "The", 0, "null", "DT" 1.0) #we decided that the tokens' positions are in the third column (tokens positions start from 1) erg_tok_dict[line_parts[3]].append(token_part) elif line == '(ROOT_STRICT' or 
line == '(ROOT_INFORMAL' or line == '(ROOT_FRAG' or line == '(ROOT_INFFRAG' or line == '(ROOT_SPOKEN_FRAG' or line == '(ROOT_SPOKEN': read_tree = 1 derivation_tree = line + '\n' cfg_tree = cfg_tree + line + ' '; previous_line_empty = 0 read_eds = 0 elif len(line) == 0: previous_line_empty = 1 if read_tree == 1: cfg_tree = cfg_tree + "\n" read_tree = 0 after_derivation_tree = 1 elif read_eds == 1: read_eds = 0 read_done = 1 elif read_tree == 1: line = line.strip() if token_index_missing == 1 and re.match('^\d+$', line): line = partial_line + '|||||||' + line derivation_tree = derivation_tree + line + '\n' # For the CFG tree we cut the opening bracket before the word (line[0] = "(" always) cfg_tree = cfg_tree + line[1:] #print("Line restored: " + line) token_index_missing = 0 partial_line = '' else: #this is a pattern for lines like ("understood" 264 match_pattern = re.match('(^\(")([^"]+)("\s+\d+$)', line) #this is a pattern when we have #("the" # 246 #and currently are analyzing line ("the" match_pattern2 = re.match('(^\(")([^"]+)("$)', line) #this one is to detect rule or supertag match_pattern3 = re.match("^\(", line) #this one is to detect the end of "token" string match_pattern4 = re.search("\"(\))+\)$", line) if match_pattern and match_pattern.group(1) and match_pattern.group(2) and match_pattern.group(3): line_begin = match_pattern.group(1) line_content = match_pattern.group(2) line_end = match_pattern.group(3) #print 'LINE ORIGINAL ', line #print 'LINE BEGIN ', line_begin #print 'LINE CONTENT ', line_content #print 'LINE END ', line_end line_content = line_content.replace('(', '<<<<<<<') line_content = line_content.replace(')', '>>>>>>>') #multiword expression if ' ' in line_content: line_content = '+'.join(line_content.split(' ')) line = line_begin + line_content + line_end line = '|||||||'.join(line.split(' ')) derivation_tree = derivation_tree + line + '\n' # For the CFG tree we cat the opening bracket before the word (line[0] = "(" always) cfg_tree = cfg_tree + line[1:] + '\n' #this is the case when the line is incomplete because the indificator is on the next line elif match_pattern2 and match_pattern2.group(1) and match_pattern2.group(2) and match_pattern2.group(3): line_begin = match_pattern2.group(1) line_content = match_pattern2.group(2) line_end = match_pattern2.group(3) #print 'LINE ORIGINAL ', line #print 'LINE BEGIN ', line_begin #print 'LINE CONTENT ', line_content #print 'LINE END ', line_end line_content = line_content.replace('(', '<<<<<<<') line_content = line_content.replace(')', '>>>>>>>') #multiword expression if ' ' in line_content: line_content = '+'.join(line_content.split(' ')) line = line_begin + line_content + line_end partial_line = line token_index_missing = 1 else: original_line = line #rule or supertag if match_pattern3: rule_line = line cfg_rule = rule_line if re.search("\s", rule_line): line_parts = re.split("\s+", rule_line) rule_line = line_parts[1] rule_line = rule_line.replace("-", ":") cfg_rule = rule_line #if it is a supertag if rule_line.find("/") != -1: line_parts = rule_line.split("/") rule_line = line_parts[1] cfg_rule = rule_line #cfg_tree = cfg_tree + "(" + rule_line + " " #in the new format supertags are separated with @ elif rule_line.find("@")!=-1: line_parts = rule_line.split("@") rule_line = line_parts[1] cfg_rule = rule_line #cfg_tree = cfg_tree + "(" + rule_line + " " #if it is a rule else: # If the rule is marked with "^", we remove "^" if re.match("^\^", rule_line): rule_line = rule_line[1:] cfg_rule = rule_line #cfg_tree = 
cfg_tree + "(" + rule_line_full + " " if rule_line.find("_")!=-1: line_parts = rule_line.split("_") rule_line = line_parts[0] #print("rule or supertag: " + rule_line) rule_line = "(" + rule_line + " "; cfg_tree = cfg_tree + "(" + cfg_rule + " " #here we take care about the lines like "token [ +CARG #1=\"the\" +CLASS alphabetic [ +CASE non_capitalized+lower +INITIAL - ] +FORM #1 +FROM \"175\" +ID *diff-list* [ LAST #2=*top* LIST *cons* [ FIRST \"25\" REST #2 ] ] +PRED predsort +TNT null_tnt [ +MAIN tnt_main [ +PRB \"1\" +TAG \"DT\" ] +PRBS *null* +TAGS *null* ] +TO \"178\" +TRAIT native_trait ]")))) #they should not contain brackets in the middle for match_pattern1 in re.finditer('(\\\\\")([^"]+)(\\\\\")',line): #print 'MATCH_PATTERN1.group ', match_pattern1.group() if '(' in match_pattern1.group(0) or ')' in match_pattern1.group(0): line_begin = match_pattern1.group(1) line_content = match_pattern1.group(2) line_end = match_pattern1.group(3) #print 'LINE ORIGINAL ', line #print 'LINE BEGIN ', line_begin #print 'LINE CONTENT ', line_content #print 'LINE END ', line_end line_content = line_content.replace('(', '<<<<<<<') line_content = line_content.replace(')', '>>>>>>>') #print 'substitute ', match_pattern1.group(0), ' with ', line_begin + line_content + line_end line = re.sub(re.escape(match_pattern1.group(0)), line_begin + line_content + line_end, line, 1) line = '|||||||'.join(line.split(' ')) #print 'LINE FINAL ', line, '\n\n' derivation_tree = derivation_tree + line + '\n' # Here we have smth like ")))) at the end of the string that contains "token " if match_pattern4: # since the regex match_pattern4 worked, we know there is quote in the string line_parts = original_line.split("\""); closing_line = line_parts[len(line_parts) - 1] #cut off the last bracket because it is for the word #EXAMPLE #(335 of_poss/p_np_i-nm-poss_le -0.0241179 8 9 #("of" 101 #"token [ ...]")) #becomes: #(p_np_i-nm-poss_le of) cfg_tree = cfg_tree + closing_line[:-1] + " " previous_line_empty = 0 read_eds = 0 elif after_derivation_tree == 1 and previous_line_empty == 1 and re.match('^\{.+',line) and (line[-1] == ':' or re.search('\:\s+\(fragmented\)', line) or re.search('\:\s+\(cyclic\)', line) or re.search('\:\s+\(cyclic fragmented\)', line)): eds = eds + line + '\n' read_eds = 1 previous_line_empty = 0 after_derivation_tree = 0 elif read_eds == 1: eds = eds + line + '\n' previous_line_empty =0 current_file.close() #print("CFG Tree") try: t_cfg = Tree.parse(cfg_tree) except ValueError: raise ValueError("Missing or incorrect derivation tree!") #if fhdl_dict['log'] is not None: # fhdl_dict['log'].write(str(fi) + "\t" + "Missing or incorrect derivation tree\n") #return if self.cfg_no_unary_rules == 1: t_cfg.collapse_unary(True, False, '///////') self.traverse_cfg(t_cfg) t_cfg = Tree.parse(self.t_cfg_traversed) #print(t_cfg) #t_cfg.draw() #t_cfg.draw() #print("ERG TOK DICTIONARY") #print(erg_tok_dict) #print('\n\n') #print 'DERIVATION TREE:\n\n' #print derivation_tree + '\n\n' try: t = Tree.parse(derivation_tree) except ValueError: print "Missing or incorrect derivation tree!" 
#if fhdl_dict['log'] is not None: # fhdl_dict['log'].write(str(fi) + "\t" + "Missing or incorrect derivation tree\n") return t.collapse_unary(True, True, '///////') #t.draw() traverse_results = self.my_traverse(t, [], [], {}, {}, {},{}, {}, erg_tok_dict, 'erg') first_dep = '' sent_root = '' if len(traverse_results[0]) > 0: first_dep = traverse_results[0][0] words_list = traverse_results[1] dep_dict = traverse_results[2] words_pos_dict = traverse_results[3] #print 'WORDS POS DICT' #pprint(words_pos_dict) #print 'WORDS LIST' #pprint(words_list) pos_tag_dict = traverse_results[4] #print('POS TAG DICT') #pprint(pos_tag_dict) #print('\n') lemma_dict = traverse_results[5] words_correct_case_dict = traverse_results[6] #print('CASE_DICT') #pprint(case_dict) ''' #Search for cases where id (e.g. <13:25>) corresponds to more than one token and those tokens are not #hyphen-separated (e.g. #(344, 7, 8, <44:60>, 1, "Macmillan", 0, "null", "NNP" 0.9881) #(372, 8, 9, <44:60>, 1, "/", 0, "null") #(264, 9, 10, <44:61>, 1, "McGraw,", 0, "null") for key in case_dict: #print(words_list[words_pos_dict[key]-1]) if len(case_dict[key]) > 1 and not re.search("-", words_list[words_pos_dict[key][0]-1]): print(key + "\t" + str(case_dict[key])) print 'WORDS POS DICT' pprint(words_pos_dict) print 'WORDS LIST' pprint(words_list) ''' indexes = range(1,len(words_list)+1) #print('\n\n WORDS LIST\n') #pprint(words_list) #print '====================\n\n\n DEPENDENCY DICTIONARY\n\n' #pprint(dep_dict) #print '\n\n First dependency: ' + first_dep + '\n\n' sentence = '' # erg_tok_dict contains all tokens in ERG tokenization (some of them could have correct upper/lower case) # case_dict contains information about the case from the derivation tree (e.g. +CASE capitalized+lower ) #words_correct_case_dict = self.correct_case_erg_tok(words_pos_dict, words_list, erg_tok_dict, case_dict) for i in range(1,len(words_list)+1): word = words_correct_case_dict[words_list[i-1]] sentence = sentence + word + ' ' sentence = sentence.strip() #sentence = sentence.lower() words_dict = dict(zip(words_list, indexes)) #word_pos_dict = dict(zip(word_pos_list, indexes)) #print '\n\n WORDS DICTIONARY\n' #pprint(words_dict) #print '\n\n WORDS POSITION DICTIONARY\n' #pprint(word_pos_dict) #print original_sentence + '\n' #print sentence + '\n' #if original_sentence != sentence: # continue dep_rel = [] dep_rel_indexes = [] sent_root = '' if first_dep != '': dep_rel.append(first_dep) #print 'WORDS DICT' #pprint(words_dict) dep_rel_indexes = self.dep_expansion_erg_tok(fi, rule_head_dict, dep_rel, words_dict, dep_dict, []) #print("Dependency relations dictionary") #pprint(dep_rel_indexes) sent_root = words_dict[first_dep] #one-word sentence: the word should be the root elif first_dep == '' and len(words_dict.keys()) == 1: sent_root = words_dict[words_dict.keys()[0]] sentence = escape_sent(sentence) words_in_sent = sentence.split(' ') #print eds + '\n' read_eds_results = self.read_eds_into_dict(eds, relations_collection) eds_root = read_eds_results[0] eds_dict = read_eds_results[1] dict_id_pos = read_eds_results[2] transparent_dict = read_eds_results[3] eds_relation_dict = read_eds_results[4] #print '\n\n ROOT:' + eds_root + '\n' #print 'EDS_DICT' #pprint(eds_dict) #print '\n\n' #print 'TRANSPARENT DICTIONARY' #pprint(transparent_dict) #print '\n\n' (eds_dep_indexes, eds_relation_pos_dict) = self.eds_expansion_erg_tok(eds_dict, words_pos_dict, dict_id_pos, transparent_dict, eds_relation_dict) #print 'EDS DEP INDEXES' #pprint(eds_dep_indexes) #print '\n\n' 
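# ----------------------------------------------------------------------
# Illustrative sketch only: how the tokenized surface sentence and the
# word -> 1-based index mapping are rebuilt from the traversal results
# above (words_list keeps the '"word"|||||||id' leaf identifiers,
# words_correct_case_dict maps them to their case-corrected surface
# forms).  The helper name and the toy inputs are hypothetical.
def _rebuild_sentence_sketch(words_list, words_correct_case_dict):
    sentence = ' '.join(words_correct_case_dict[w] for w in words_list)
    words_dict = dict(zip(words_list, range(1, len(words_list) + 1)))
    return sentence, words_dict
# _rebuild_sentence_sketch(['"he"|||||||429', '"left"|||||||430'],
#                          {'"he"|||||||429': 'He', '"left"|||||||430': 'left'})
# ->  ('He left', {'"he"|||||||429': 1, '"left"|||||||430': 2})
# ----------------------------------------------------------------------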
#DEBUGGING 13/03/2013 #print 'EDS RELATION POS DICT' #pprint(eds_relation_pos_dict) #print('\n\n') for dependency_relation in eds_dep_indexes: eds_dep_labels_dict[dependency_relation[2]] = 1 #print('eds_dep_labels_dict') #pprint(eds_dep_labels_dict) #print('\n\n') if output_flags['tex']: fhdl_dict['tex'].write('\\begin{center}\n' + '\\begin{dependency}[edge below]\n' + '\\begin{deptext}[column sep=.05cm]\n') fhdl_dict['tex'].write(words_in_sent[0]) for i in range(1, len(words_in_sent)): fhdl_dict['tex'].write(' \& ' + words_in_sent[i]) fhdl_dict['tex'].write('\\\\\n' + '\end{deptext}\n') #print '\n\n SENT ROOT ' + str(sent_root) + '\n\n' #if sent_root != '': #latex_output_fh.write('\deproot[edge above, edge style={red}]{' + str(sent_root) + '}' + '{root}\n') mrs_derived_sent_root = '' if eds_root in eds_dict and eds_root in dict_id_pos: if dict_id_pos[eds_root] in words_pos_dict: #we have to choose intermediate dictionaries as well # {e2: # e2:implicit_conj<5:173>[L-INDEX e7, R-INDEX e6, L-HNDL e7, R-HNDL e36] # e7:unknown<5:50>[] #Another problematic example #{e2: #e2:implicit_conj<78:218>[L-INDEX e21, R-INDEX e57, L-HNDL e21, R-HNDL e57] # e21:loc_nonsp<16:77>[ARG1 x5, ARG2 x22] # We have no root in the end mrs_derived_sent_root = str(words_pos_dict[dict_id_pos[eds_root]][0]) # latex_output_fh.write('\deproot[edge style={blue}]{' + mrs_derived_sent_root + '}' + '{root}\n') #print 'WORDS POS DICT' #pprint(words_pos_dict) #print("Output flags") #pprint(output_flags) if output_flags['dt'] or output_flags['tex']: self.convert_dt_erg_tok(fi, words_list, words_correct_case_dict,pos_tag_dict, lemma_dict, sent_root, dep_rel_indexes, args_dict, output_flags, fhdl_dict) if output_flags['dm'] or output_flags['tex']: self.convert_dm_erg_tok(fi, words_list, words_correct_case_dict, pos_tag_dict, lemma_dict, eds_dep_indexes, mrs_derived_sent_root, args_dict, output_flags, fhdl_dict) if output_flags['dtm']: self.convert_dtm_erg_tok(fi, words_list, words_correct_case_dict, pos_tag_dict, lemma_dict, sent_root, dep_rel_indexes, mrs_derived_sent_root, eds_dep_indexes, eds_relation_pos_dict, args_dict, output_flags, fhdl_dict) #sent_tok MUST preceed 'cfg' because in 'cfg' the tree is changed if output_flags['sent_tok']: self.extract_tok_sent_erg_tok(fi, t_cfg, words_correct_case_dict, fhdl_dict) if output_flags['cfg']: self.convert_cfg(fi, t_cfg, words_correct_case_dict, fhdl_dict) if output_flags['tex']: fhdl_dict['tex'].write('\end{dependency}\n' + '\\\\\n' + str(fi) + '.gz' + '\end{center}' + '\n\\newpage') return [file_index, current_latex_file_index, fhdl_dict, eds_dep_labels_dict,words_correct_case_dict] def traverse_cfg(self,t_cfg): try: t_cfg.node except AttributeError: self.t_cfg_traversed = self.t_cfg_traversed + t_cfg else: #print("Node: " + t_cfg.node) #print(self.t_cfg_traversed) # Now we know that t.node is defined node = t_cfg.node if '///////' in node: node_parts = t_cfg.node.split('///////') node = node_parts[len(node_parts)-1] self.t_cfg_traversed = self.t_cfg_traversed + '(' + node + " " #print(t_cfg_traversed) for child in t_cfg: self.traverse_cfg(child) self.t_cfg_traversed = self.t_cfg_traversed + ')' def extract_lowercased_token_erg_tok(self, parent): # e.g. 
parent = '"because+of"|||||||399' # ex1) elem_of_words_list = ['"he"', 429] # ex2) elem_of_words_list = ['"because+of"',399] elem_of_words_list = parent.split('|||||||') # ex1) word_lowercase = '"he"' # ex2) word_lowercase = '"because+of"' word_lowercase = elem_of_words_list[0] # ex1) clean_word_lowercase = 'he' # ex2) clean_word_lowercase = 'because+of' clean_word_lowercase = word_lowercase[1:len(word_lowercase)-1] #print(clean_word_lowercase) return clean_word_lowercase def find_case_info_for_multiword_expr_erg_tok(self, t, multiword_parts): # t is sort of array that contains information under leaves #e.g. t= #(1104|||||||more_than_adv1/av_-_i-vp-pr_le|||||||0.257395|||||||17|||||||19///////"more+than"|||||||299 #"token|||||||[|||||||+CARG|||||||#1=\"more\"|||||||+CLASS|||||||alphabetic|||||||[|||||||+CASE|||||||non_capitalized+lower|||||||+INITIAL|||||||-|||||||]|||||||+FORM|||||||#1|||||||+FROM|||||||\"116\"|||||||+ID|||||||*diff-list*|||||||[|||||||LAST|||||||#2=*top*|||||||LIST|||||||*cons*|||||||[|||||||FIRST|||||||\"19\"|||||||REST|||||||#2|||||||]|||||||]|||||||+PRED|||||||predsort|||||||+TICK|||||||bool|||||||+TNT|||||||null_tnt|||||||[|||||||+MAIN|||||||tnt_main|||||||[|||||||+PRB|||||||\"0.52686469999999996\"|||||||+TAG|||||||\"RBR\"|||||||]|||||||+PRBS|||||||*null*|||||||+TAGS|||||||*null*|||||||]|||||||+TO|||||||\"120\"|||||||+TRAIT|||||||token_trait|||||||[|||||||+HD|||||||token_head|||||||+IT|||||||italics|||||||+LB|||||||bracket_null|||||||+RB|||||||bracket_null|||||||+UW|||||||-|||||||]|||||||]" #286 #"token|||||||[|||||||+CARG|||||||#1=\"than\"|||||||+CLASS|||||||alphabetic|||||||[|||||||+CASE|||||||non_capitalized+lower|||||||+INITIAL|||||||-|||||||]|||||||+FORM|||||||#1|||||||+FROM|||||||\"121\"|||||||+ID|||||||*diff-list*|||||||[|||||||LAST|||||||#2=*top*|||||||LIST|||||||*cons*|||||||[|||||||FIRST|||||||\"20\"|||||||REST|||||||#2|||||||]|||||||]|||||||+PRED|||||||predsort|||||||+TICK|||||||bool|||||||+TNT|||||||null_tnt|||||||[|||||||+MAIN|||||||tnt_main|||||||[|||||||+PRB|||||||\"1\"|||||||+TAG|||||||\"IN\"|||||||]|||||||+PRBS|||||||*null*|||||||+TAGS|||||||*null*|||||||]|||||||+TO|||||||\"125\"|||||||+TRAIT|||||||token_trait|||||||[|||||||+HD|||||||token_head|||||||+IT|||||||italics|||||||+LB|||||||bracket_null|||||||+RB|||||||bracket_null|||||||+UW|||||||-|||||||]|||||||]") #output list case_list = [] j = 0 for i in range(len(t)): m_start = re.search("\+FROM\|\|\|\|\|\|\|(#\d+=)*\\\\\"(\d+)\\\\\"", str(t[i])) m_end = re.search("\+TO\|\|\|\|\|\|\|\\\\\"(\d+)\\\\\"", str(t[i])) m_case = re.search("\+CASE\|\|\|\|\|\|\|([^\|]+)\|\|\|\|\|\|\|", str(t[i])) if m_start and m_end: start = m_start.group(2) end = m_end.group(1) #By default the case for the found token will be not specified (empty string) if m_case: case_list.append(["<" + start+ ":" + end + ">", multiword_parts[j], m_case.group(1)]) else: case_list.append(["<" + start+ ":" + end + ">", multiword_parts[j], ""]) j+= 1 return case_list def analyze_case_descr_erg_tok(self, erg_tok_dict, case_list): for i in range(len(case_list)): token_from_dertree = case_list[i][1] case_descr = case_list[i][2] if case_descr == "non_capitalized+lower": case_list[i][1] = token_from_dertree.lower() elif case_descr == "capitalized+lower": # Difficult cases: token starts with punctuation # Example: “david # In this case we capitalize not the first letter, # but the first letter after all the punctuation symbols letter_pos = 0 while letter_pos < len(token_from_dertree) and self.is_punctuation_erg_tok(token_from_dertree[letter_pos]) 
== 1: letter_pos +=1 #This bit contains punctuation that preceeds capitalized letter, e.g. in the # case of token “david. # We keep punctuation string empty by default punctuation_str = "" if letter_pos > 0: punctuation_str = token_from_dertree[:letter_pos] #if there are letters after the one that should be capitalized if len(token_from_dertree) -1 > letter_pos: case_list[i][1] = punctuation_str + token_from_dertree[letter_pos].upper() + token_from_dertree[letter_pos+1:].lower() # if there are no more letters after the one that should be capitalized elif len(token_from_dertree) -1 == letter_pos: case_list[i][1] = punctuation_str + token_from_dertree[letter_pos].upper() #print(word_lowercase_unique_id + ": " + words_correct_case_dict[word_lowercase_unique_id]) elif case_descr == "capitalized+upper": case_list[i][1] = token_from_dertree.upper() #print(word_lowercase_unique_id + ": " + words_correct_case_dict[word_lowercase_unique_id]) else: # If we have capitalization of the first letter, we should change default # However, theses cases are not clear about the other letters in the word, # so we still have to look at the ERG tokenization if re.match("^capitalized", case_descr): # Difficult cases: token starts with punctuation # Example: “david # In this case we capitalize not the first letter, # but the first letter after all the punctuation symbols letter_pos = 0 while self.is_punctuation_erg_tok(token_from_dertree[letter_pos]) == 1: letter_pos +=1 #This bit contains punctuation that preceeds capitalized letter, e.g. in the # case of token “david. # We keep punctuation string empty by default punctuation_str = "" if letter_pos > 0: punctuation_str = token_from_dertree[:letter_pos] #if there are letters after the one that should be capitalized if len(token_from_dertree) -1 > letter_pos: case_list[i][1] = punctuation_str + token_from_dertree[letter_pos].upper() + token_from_dertree[letter_pos+1:].lower() # if there are no more letters after the one that should be capitalized elif len(token_from_dertree) -1 == letter_pos: case_list[i][1] = punctuation_str + token_from_dertree[letter_pos].upper() #print(case_descr + ";" + word_lowercase_unique_id + ": " + words_correct_case_dict[word_lowercase_unique_id]) #If there are no clear records (e.g. the case is mixed) about the case of the token in the derivation tree, # look at ERG tokenization. #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! #TODO: CORRECT FINUCTION find_correct_case_in_erg_tok !!!!!!!!!! #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! case_list[i][1] = self.find_correct_case_in_erg_tok(erg_tok_dict, case_list[i][0], case_list[i][1]) return case_list def find_correct_case_in_erg_tok(self, erg_tok_dict, character_based_id, token_from_dertree): ''' ERG TOK DICTIONARY {'<45:53>': ['American', 'American', 'american'], '<16:20>': ['U.S.', 'U.S.', 'u.s.'], '<176:179>': ['the'], '<74:76>': ['on'], '<180:183>': ['U.', 'U.', 'u.'], ...} ''' number_of_cap_letters_in_token_from_dertree = sum(x.isupper() for x in token_from_dertree) if character_based_id in erg_tok_dict: # e.g. 
if token_id = "<0:2>", then token_variants = ['he'] token_variants = erg_tok_dict[character_based_id] # If we find a variant of the word from erg_tok_dict with capitalization, we choose # capitalization, rather than lowercase version # (so we return this variant) for variant in token_variants: number_of_cap_letters_in_variant = sum(x.isupper() for x in variant) if len(variant) == len(token_from_dertree) and variant != token_from_dertree and variant.lower() == token_from_dertree.lower() and number_of_cap_letters_in_variant > number_of_cap_letters_in_token_from_dertree: return variant return token_from_dertree def convert_cfg(self, fi, t_cfg, words_correct_case_dict, fhdl_dict): for pos in t_cfg.treepositions('leaves'): t_cfg[pos] = t_cfg[pos].replace('_______', ' ').replace('<<<<<<<', '(').replace('>>>>>>>', ')') t_cfg[pos] = words_correct_case_dict[t_cfg[pos]].replace("/", "\/").replace("*", "\*").replace("(", "-LRB-").replace(")", "-RRB-").replace("“", "``").replace("”", "''").replace("‘", "`").replace("’", "'") fhdl_dict['cfg'].write('#' + str(fi) + "\t" + "( " + t_cfg._pprint_flat(nodesep='', parens='()', quotes=False) + " )" +"\n") #print("( " + t_cfg._pprint_flat(nodesep='', parens='()', quotes=False) + " )" +"\n") return def extract_tok_sent_erg_tok(self, fi, t_cfg, words_correct_case_dict, fhdl_dict): tree_leaves = t_cfg.leaves() sent = "" for token in tree_leaves: token = token.replace('_______', ' ').replace('<<<<<<<', '(').replace('>>>>>>>', ')') token = words_correct_case_dict[token].replace("/", "\/").replace("*", "\*").replace("(", "-LRB-").replace(")", "-RRB-").replace("“", "``").replace("”", "''").replace("‘", "`").replace("’", "'") sent += token + " " sent = sent.strip() fhdl_dict['sent_tok'].write('#' + str(fi) + "\t" + sent + "\n") return def break_key_into_start_end(key): # key is "<0:2>" # key_parts = ["<0", "2>"] key_parts = key.split(':') # now we extract "0" from "<0" start = int(key_parts[0][1:]) # now we extract "2" from "2>" end = int(key_parts[1][:-1]) return [start, end] def create_dep_indexes_dict (dt_dep_rel_indexes): dep_indexes_dict = {} for i in range(0,len(dt_dep_rel_indexes)): head = dt_dep_rel_indexes[i][0] dependent = dt_dep_rel_indexes[i][1] label = dt_dep_rel_indexes[i][2] dep_indexes_dict[dependent] =[head,label] return dep_indexes_dict def escape_sent (sentence): sentence = sentence.replace('%', '\%') sentence = sentence.replace('$', '\$') sentence = sentence.replace('&', 'and') sentence = sentence.replace('#', '\#') sentence = sentence.replace('{', '\{') sentence = sentence.replace('}', '\}') sentence = sentence.replace('[', '$[$') sentence = sentence.replace(']', '$]$') return sentence def collect_arguments_for_dm_predicates(dm_eds_dep_indexes, eds_relation_pos_dict, sorted_eds_relation_pos_list): dm_dependencies = {} for i in range(0, len(dm_eds_dep_indexes)): dm_head_word_index = dm_eds_dep_indexes[i][0] dm_dep_word_index = dm_eds_dep_indexes[i][1] dm_label = dm_eds_dep_indexes[i][2] #For Latex we escape "_" with a backslash in the label name, but now we are printing out CoNLL08 file, so we do not need to escape dm_label = dm_label.replace("\\", "") #Unless we already saw this dependent word before if not (dm_dep_word_index in dm_dependencies): # and by default there are no predicates for which current word is an argument dm_dependencies[dm_dep_word_index] = ["_"] * len(sorted_eds_relation_pos_list) #if the head is a predicate if dm_head_word_index in eds_relation_pos_dict: 
dm_dependencies[dm_dep_word_index][sorted_eds_relation_pos_list.index(dm_head_word_index)] = dm_label return dm_dependencies #=============== THE END ==================# if __name__=="__main__": Converter().run()
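# ----------------------------------------------------------------------
# Illustrative sketch only: when unary chains are collapsed with
# collapse_unary(..., '///////'), a node label packs the whole chain into
# one string joined by '///////'.  traverse_cfg() above keeps only the
# last (lowest) label of such a chain; the helper below isolates that
# step.  The helper name and the toy label are hypothetical.
def _lowest_label_sketch(node_label):
    if '///////' in node_label:
        return node_label.split('///////')[-1]
    return node_label
# _lowest_label_sketch('RULE_A///////RULE_B')  ->  'RULE_B'
# ----------------------------------------------------------------------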
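# ----------------------------------------------------------------------
# Illustrative sketch only: a simplified version of the 'capitalized+lower'
# branch of analyze_case_descr_erg_tok() above.  Leading punctuation
# (e.g. the opening quote in '“david') is skipped before the first
# letter is upper-cased and the rest is lower-cased.  The helper name and
# the is_punct predicate are hypothetical stand-ins for
# self.is_punctuation_erg_tok().
def _capitalize_after_punct_sketch(token, is_punct):
    pos = 0
    while pos < len(token) and is_punct(token[pos]):
        pos += 1
    if pos >= len(token):
        # the token consists of punctuation only
        return token
    return token[:pos] + token[pos].upper() + token[pos + 1:].lower()
# _capitalize_after_punct_sketch('“david', lambda ch: not ch.isalnum())  ->  '“David'
# ----------------------------------------------------------------------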
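# ----------------------------------------------------------------------
# Illustrative usage sketch only (never called): how
# collect_arguments_for_dm_predicates() lays out the per-token argument
# columns of the CoNLL08-style output.  Every dependent gets one '_'
# cell per predicate; the cell in the column of its head predicate is
# overwritten with the dependency label.  All toy values below are
# hypothetical (the values stored in eds_relation_pos_dict do not matter
# here, only membership of the head positions).
def _dm_argument_columns_demo():
    toy_deps = [[2, 1, 'ARG1'], [2, 3, 'ARG2']]   # [head, dependent, label]
    toy_pred_pos_dict = {2: None}                 # word 2 is the only predicate
    # returns {1: ['ARG1'], 3: ['ARG2']}: tokens 1 and 3 each fill the
    # single argument column belonging to the predicate at position 2
    return collect_arguments_for_dm_predicates(toy_deps, toy_pred_pos_dict, [2])
# ----------------------------------------------------------------------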