#!/usr/local/bin/python # -*- coding: utf-8 -*- import glob, os, gzip, re, sys, string, math, argparse, itertools import codecs from collections import defaultdict #from collections import OrderedDict from nltk.tree import * from nltk.util import * reload(sys) sys.setdefaultencoding('utf8') ################################## # USAGE #python converter_v1.py --data --grammar --dt --dm --tex --log # The difference from converter.py is that we do not change derivation tree (do not cut on first underscore etc.) when converting to CFG. # These lines make the difference: # # cfg_tree = cfg_tree + rule_line_full # ################################## #========================================# #FUNCTIONS COMMON FOR PTB AND ERG TOKENIZATION# #========================================# class Converter: #this global variable was used to count the number of sentences that have ". ..." or ". '" at the end. def __init__(self): self.t_cfg_traversed = "" self.cfg_no_unary_rules = 0 def run(self): latex_doc_start = '%%!TEX encoding = UTF-8 Unicode\n\ \documentclass[8pt,landscape]{a0poster}\n\ \setlength\oddsidemargin{15mm}\n\ \setlength\evensidemargin{15mm}\n\ \setlength\\topmargin{15mm}\n\ \setlength\headsep{15mm}\n\ \setlength\headheight{0mm}\n\ \let\openright=\clearpage\n\ \usepackage[english]{babel}\n\ \usepackage[T1]{fontenc}\n\ \usepackage[T2A]{fontenc}\n\ \usepackage{indentfirst}\n\ \usepackage[utf8x]{inputenc}\n\ \usepackage{textcomp}\n\ \usepackage[titletoc]{appendix}\n\ \usepackage{graphicx}\n\ \usepackage{amsthm}\n\ \usepackage{multirow}\n\ \\renewcommand{\multirowsetup}{\centering}\n\ \usepackage{longtable}\n\ \usepackage[hyphens]{url}\n\ \usepackage{titletoc}\n\ \usepackage[unicode]{hyperref}\n\ \usepackage{subfigure}\n\ \usepackage{array}\n\ \usepackage[intoc]{nomencl}\n\ \usepackage{paralist}\n\ \usepackage{amsmath}\n\ \usepackage[round]{natbib}\n\ \\renewcommand*\\thesection{\\arabic{section}}\n\ \\renewcommand{\\nomname}{List of Abbreviations}\n\ \makenomenclature\n\ \hypersetup{ colorlinks, citecolor=black, filecolor=black, linkcolor=black, urlcolor=black}\n\ \\renewcommand*\\thesubsection{\\arabic{subsection}}\n\ \\renewcomand{\\figurename}{Figure}\n\ \hypersetup{pdfauthor=Angelina Ivanova}\n\ \makeatletter\n\ \def\@makechapterhead#1{\n\ {\parindent \z@ \\raggedright \\normalfont\n\ \Huge\\bfseries \\thechapter. #1\n\ \par\\nobreak\n\ \\vskip 20\p@\n\ }}\n\ \def\@makeschapterhead#1{\n\ {\parindent \z@ \\raggedright \\normalfont\n\ \Huge\\bfseries #1\n\ \par\\nobreak\n\ \\vskip 20\p@\n\ }}\n\ \makeatother\n\ %macros for dependency graphs\n\ \usepackage{tikz}\n\ \usepackage{tikz-dependency}\n\ \\begin{document}\n\ \lefthyphenmin=2\n\ \\righthyphenmin=2\n\ \pagenumbering{arabic}\n\ \setcounter{page}{1}\n\ \\newpage\n' parser = argparse.ArgumentParser() parser.add_argument('--data', required = True, help = 'Existing folder where the original files that should be parsed are located') parser.add_argument('--grammar', required = True, help = 'Existing folder where the grammar is located. 
                            For the conversion only dm.cfg and rules.hds are required')
        parser.add_argument('--dt', help='Existing folder where the converted files will be printed out in the dependency format DELPH-IN Syntactic Derivation Tree (DT)')
        parser.add_argument('--dm', help='Existing folder where the converted files will be printed out in the dependency format DELPH-IN MRS-Derived Dependencies (DM)')
        parser.add_argument('--dtm', help='Existing folder where the converted files will be printed out in the DT and DM formats combined in a CoNLL08 file.')
        parser.add_argument('--cfg', help='Existing folder where the converted files will be printed out in the form of phrase-structured trees (context-free grammar).')
        parser.add_argument('--cfg_no_unary_rules', choices=['true', 'false'], default='false', help='Whether to collapse unary rules in the CFG tree or not.')
        parser.add_argument('--dt_yy', help='Existing folder where the converted files will be printed out in the form of DT annotations in yy output format (available only for PTB tokenization).')
        parser.add_argument('--sent_tok', help='Existing folder where the file with tokenized sentences (one sentence per line) will be created.')
        parser.add_argument('--tex', help='Existing folder where the TEX file illustrating the result of conversion will be printed out')
        parser.add_argument('--tok', choices=['ptb', 'erg'], default='erg', help='Tokenization style: PTB or ERG')
        parser.add_argument('--log', help='Information about files that were not processed will be written to the log file')
        args = parser.parse_args()
        args_dict = vars(args)
        # flag whether to print out the format specified by the key
        output_flags = {}
        # It is not possible to produce CFG trees with PTB tokenization
        if args_dict['tok'] == 'ptb' and args_dict['cfg'] is not None:
            print('Error! It is not possible to extract phrase-structured trees with PTB-style tokenization. Either choose ERG-style tokenization, or do not specify "--cfg" option.')
            sys.exit(1)
        if args_dict['cfg'] is None and args_dict['cfg_no_unary_rules'] == "true":
            print('Error! Option --cfg_no_unary_rules can only be used together with option --cfg')
            sys.exit(1)
        # We currently only provide yy output for DT with PTB tokenization
        if args_dict['tok'] == 'erg' and args_dict['dt_yy'] is not None:
            print('Error! The yy output format is not supported for ERG tokenization. 
Either choose PTB-style tokenization, or do not specify "--dt_yy" option.') sys.exit(1) #check if the paths of mandatory paths exists and that at least one output is provided if os.path.exists(args_dict['data']) and os.path.exists(args_dict['grammar']): #check if we have at least one output specified for key in ['dt', 'dm', 'dtm', 'cfg', 'sent_tok','tex', 'dt_yy']: #by default we think the format was not specified by the user output_flags[key] = 0 #However if it was specified if args_dict[key] is not None: # and the path exists if os.path.exists(args_dict[key]): #then flag turns 1 output_flags[key] = 1 #if at least one output format was specified if not (output_flags['dt']==0 and output_flags['dm']==0 and output_flags['dtm']==0 and output_flags['cfg']==0 and output_flags['sent_tok']==0 and output_flags['tex']==0 and output_flags['dt_yy'] == 0): #Add slash to the end of the folder name if is missing for key in ['data', 'grammar', 'dt', 'dm', 'dtm', 'cfg', 'sent_tok','tex', 'dt_yy']: if args_dict[key] is not None: args_dict[key] = self.addSlashToFolderPath(args_dict[key]) rules_file = args_dict["grammar"] + "etc/rules.hds" rule_head_dict = self.read_rule_head_file(rules_file) relations_file = args_dict["grammar"] + "etc/dm.cfg" relations_collection = self.read_relations_file(relations_file) #print "relations collection" #pprint(relations_collection) #By default our output file handles for each format are None objects. We have to initiate and #pass as parameters file handles (not closed files) because we loop over input files and print out results to the #output files on the fly (otherwise we need to keep too much info in memory) dt_output_fh = None dm_output_fh = None dtm_output_fh = None cfg_output_fh = None sent_tok_output_fh = None latex_output_fh = None dt_yy_output_fh = None log_fh = None #These are counters for output latex files. If we write everything to one Latex file, we exceed TEX memory capacity current_latex_file_index = 1 file_index = 0 fhdl_dict = {} if args_dict['log'] is not None: try: fhdl_dict['log'] = codecs.open(args_dict['log'], 'w') except IOError: print('Error! Cannot open the log file for writing!') sys.exit(1) if output_flags['dt']: #DT output file handle fhdl_dict['dt'] = codecs.open(args_dict['dt'] + os.path.basename(os.path.normpath(args_dict['data'])) + "." + args_dict['tok'] + '.dt', 'w') if output_flags['dm']: #DM output file handle fhdl_dict['dm'] = codecs.open(args_dict['dm'] + os.path.basename(os.path.normpath(args_dict['data'])) + "." + args_dict['tok'] + '.dm', 'w') if output_flags['dtm']: #DT&DM output file handle fhdl_dict['dtm'] = codecs.open(args_dict['dtm'] + os.path.basename(os.path.normpath(args_dict['data'])) + "." + args_dict['tok'] + '.dtm', 'w', encoding = 'utf-8') if output_flags['cfg']: #phrase-structured trees extracted from the derivation tree fhdl_dict['cfg'] = codecs.open(args_dict['cfg'] + "cfg_" + os.path.basename(os.path.normpath(args_dict['data'])) + '.txt', 'w') if output_flags['sent_tok']: #tokenized normalized sentence (normalized means words in multiword expressions are joined with "+", some quotes are normalized, brackets are replaced with -LRB- and -RRB-) fhdl_dict['sent_tok'] = codecs.open(args_dict['sent_tok'] + "sent_" +os.path.basename(os.path.normpath(args_dict['data'])) + "_" + args_dict['tok'] + "_tok" + '.txt', 'w') if output_flags['tex']: fhdl_dict['tex'] = codecs.open(args_dict['tex'] + os.path.basename(os.path.normpath(args_dict['data'])) + "." 
+ args_dict['tok'] + "_tok_" + str(current_latex_file_index) + '.tex', 'w') fhdl_dict['tex'].write(latex_doc_start) if output_flags['dt_yy']: #tokenized normalized sentence (normalized means words in multiword expressions are joined with "+", some quotes are normalized, brackets are replaced with -LRB- and -RRB-) fhdl_dict['dt_yy'] = codecs.open(args_dict['dt_yy'] + os.path.basename(os.path.normpath(args_dict['data'])) + "_" + args_dict['tok'] + "_tok" + '.yy', 'w', encoding = 'utf-8') file_list_sorted = [] for fi in os.listdir(args_dict['data']): if fi.endswith(".gz"): file_list_sorted.append(int(re.sub(r'\.gz', '', fi))) file_list_sorted.sort() if args_dict['cfg_no_unary_rules'] is not None: if args_dict['cfg_no_unary_rules'] == "true": self.cfg_no_unary_rules = 1 #THIS IS THE MAIN LOOP: WE LOOP OVER ALL THE FILES THAT HAVE TO BE CONVERTED for fi in file_list_sorted: self.t_cfg_traversed = "" if args_dict['tok'] == 'ptb': try: (file_index, current_latex_file_index, fhdl_dict) = \ self.analyze_input_file_ptb_tok(fi, file_index, current_latex_file_index, latex_doc_start, fhdl_dict, args_dict, output_flags,rule_head_dict, relations_collection) except ValueError, e: print(str(e)) if fhdl_dict['log'] is not None: fhdl_dict['log'].write(str(fi) + "\t" + str(e)+"\n") continue elif args_dict['tok'] == 'erg': try: (file_index, current_latex_file_index, fhdl_dict, eds_dep_labels_dict, words_correct_case_dict) = \ self.analyze_input_file_erg_tok(fi, file_index, current_latex_file_index, latex_doc_start, fhdl_dict, args_dict, output_flags, rule_head_dict, relations_collection, {}) except ValueError, e: print(str(e)) if fhdl_dict['log'] is not None: fhdl_dict['log'].write(str(fi) + "\t" + str(e)+"\n") continue if output_flags['tex']: fhdl_dict['tex'].write('\end{document}') fhdl_dict['tex'].close() else: print "Error! Please specify correct existing path to at least one output format. See usage below!" parser.print_help() sys.exit(1) else: print "Error! Incorrect command line arguments provided. See usage below!" parser.print_help() sys.exit(1) def addSlashToFolderPath(self, somepath): if not somepath.endswith("/"): somepath += "/" return somepath def remove_fragment_symbol_eps(self, eps): m = re.match("^\s*\|(.*)", eps) if m: eps = m.group(1) return eps # FUNCTION READ_EDS_INTO_DICT(self, eds, relations_collection) def read_eds_into_dict(self, eds, relations_collection): eds = eds.strip() eds_dict = {} dict_id_pos = {} transparent_dict = {} eds_relation_dict = {} eds_root = '' eds = eds[1:] #cut { eds = eds[:-1] #cut } #print '\n\n EDS\n' + eds + '\n\n' eps = eds.split('\n') #Now split the first line by ':' # e.g. eps[0] = "e3: (fragmented)" if eps[0].find(':')!=-1: # Removes initial "|" if for example we have: #|_34:udef_q<228:244>[BV e193] eps[0] = self.remove_fragment_symbol_eps(eps[0]) eps_first_line_as_list = eps[0].split(':') #root is the first label (e.g. 
'e3' in our example) eds_root = eps_first_line_as_list[0] for i in range(1,len(eps)): if eps[i]!='': # Removes initial "|" if for example we have: #|_34:udef_q<228:244>[BV e193] eps[i] = self.remove_fragment_symbol_eps(eps[i]) #Example: eps[i] = "e2:unknown<9:23>[ARG x4]" eps_parts = eps[i].split(':',1) #Example: dict_key = "e2" dict_key = eps_parts[0] tokid = '' dep_line = eps_parts[1] rel = '' if '<' in eps_parts[1]: #Example: eps_parts[1] = "unknown<9:23>[ARG x4]" eps_parts2 = eps_parts[1].split('<', 1) #Example: rel = "unknown" rel = eps_parts2[0] #Example: eps_parts2[1] = "9:23>[ARG x4]" eps_parts3 = eps_parts2[1].split('>', 1) #Example: tokid = "<9:23>" tokid = '<' + eps_parts3[0] + '>' #Example: dep_line = "[L-INDEX x54, R-INDEX x59]" or dep_line = "[]" or dep_line = '("Lane")[]' dep_line = eps_parts3[1].strip() elif '(' in eps_parts[1]: #suppose we have smth like x25:yofc("2004")[] (not sure, it exists) eps_parts2 = eps_parts[1].split('(', 1) #Example: rel = "unknown" rel = eps_parts2[0] elif '[' in eps_parts[1]: #e.g. e34:parg_d[ARG1 e2] eps_parts2 = eps_parts[1].split('[', 1) #Example: rel = "unknown" rel = eps_parts2[0] eds_relation_dict[dict_key] = rel labels_dep_dict = {} #a line with arguments, such as [ARG1 e3, ARG2 e99] p_arg = re.compile('\[[^\]]+\]') m_arg = p_arg.search(dep_line) if m_arg: dep_line = m_arg.group() #cut '[' dep_line = dep_line[1:] #cut ']' dep_line = dep_line[:-1] dep_list = [dep_line] if ',' in dep_line: #e.g. "ARG1 e3, ARG2 e99" dep_list = dep_line.split(', ') for dep in dep_list: dep_parts = dep.split(' ') label = dep_parts[0] #e.g. ARG1 dependent = dep_parts[1] #e.g. e3 labels_dep_dict[label] = dependent dict_value = [] # Example: # relations_collection['relational'][2] = "(_c$|^appos|^compound|^implicit_conj|^loc_nonsp|^loc_sp|^measure|^nonsp|^of_p|^part_of|^poss|^subord)" # rel = "implicit_conj" m_redundant = re.search(relations_collection['redundant'][2], rel) m_transparent = re.search(relations_collection['transparent'][2], rel) m_relational = re.search(relations_collection['relational'][2], rel) m_lexical = re.search(relations_collection['lexical'][2], rel) if m_redundant: #print '\n' + eps[i] + ' is a redundant relation type\n' labels_from_redundant_dict = self.get_labels_from_typed_dict('redundant', relations_collection, rel) for labels_list in labels_from_redundant_dict: if labels_list[0] in labels_dep_dict and labels_list[1] in labels_dep_dict: #if labels_dep_dict[labels_list[0]] == labels_dep_dict[labels_list[1]]: del labels_dep_dict[labels_list[1]] if m_transparent: #Example: dict_id_pos["e2"] = "<9:23>" dict_id_pos[dict_key] = tokid #print '\n' + eps[i] + ' is a transparent relation type\n' labels_from_transparent_dict = self.get_labels_from_typed_dict('transparent', relations_collection, rel) for lab in labels_from_transparent_dict: #TO avoid "key" equal to "value" in transparent dict (e.g. 
transparent_dict['i82']='i82') that occur in cyclic graphs # and cause infinite loops, we check that dict_key != labels_dep_dict[lab] if lab in labels_dep_dict and dict_key != labels_dep_dict[lab]: transparent_dict[dict_key] = labels_dep_dict[lab] #print 'transparent_dict' #pprint(transparent_dict) break if m_relational: #Example: dict_id_pos["e2"] = "<9:23>" dict_id_pos[dict_key] = tokid #print '\n' + eps[i] + ' is a relational relation type\n' labels_from_relational_dict = self.get_labels_from_typed_dict('relational', relations_collection, rel) for labels_list in labels_from_relational_dict: lbl1 = labels_list[0] lbl2 = labels_list[1] # if we want to rename the dependency relation, there is a label for a new name for it: # comp ARG0 ARG2 ref if len(labels_list) == 3: new_rel_name = labels_list[2] # with "rel = new_rel_name" I get encoding error: #UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 4: ordinal not in range(128) #rel = new_rel_name rel = new_rel_name.encode('utf-8') if lbl1 in labels_dep_dict and lbl2 in labels_dep_dict: dpdt1 = labels_dep_dict[lbl1] dpdt2 = labels_dep_dict[lbl2] dict_value.append([rel, dpdt2]) if dpdt1 in eds_dict: dict_value_old = eds_dict[dpdt1] dict_value_new = dict_value_old + dict_value eds_dict[dpdt1] = dict_value_new else: eds_dict[dpdt1] = dict_value #all relational predicates are by default transparent as well #TO avoid "key" equal to "value" in transparent dict (e.g. transparent_dict['i82']='i82') that occur in cyclic graphs # and cause infinite loops, we check that dict_key != labels_dep_dict[lbl1] if dict_key != labels_dep_dict[lbl1]: transparent_dict[dict_key] = labels_dep_dict[lbl1] del labels_dep_dict[lbl1] del labels_dep_dict[lbl2] elif lbl1 == 'ARG0': # For the case part_of ARG0 ARG1 # x5:part_of<0:3>[ARG1 x9] #labels_dep_dict = {'ARG1': 'x9'} # ARG0 is not in labels_dep_dict # dict_value gets additional value [part_of, x9] # and we change eds_dict: eds_dict[x5] = dict_value if lbl2 in labels_dep_dict: # we need to check for lbl2 because there are cases when e.g. 
# part_of does not have ARG1: #|x23:part_of<33:36>[] # file 26360 in Conan Doyle development set # (PET export ssa) dict_value.append([rel, labels_dep_dict[lbl2]]) eds_dict[dict_key] = dict_value # Now we do not need to process it as a lexical item, because otherwise there will be two labels assigned # from x5 to x9: part_of and ARG1 m_lexical = False elif lbl2 == 'ARG0' and lbl1 in labels_dep_dict: dict_value.append([rel, dict_key]) eds_dict[labels_dep_dict[lbl1]] = dict_value m_lexical = False #print dpdt1 + ' is the head of ' + dpdt2 if m_lexical: #Example: dict_id_pos["e2"] = "<9:23>" dict_id_pos[dict_key] = tokid #print '\n' + eps[i] + ' is a lexical relation type\n' for lbl in labels_dep_dict.keys(): dpdt = labels_dep_dict[lbl] dict_value.append([lbl, dpdt]) if dict_key in eds_dict: dict_value_old = eds_dict[dict_key] dict_value_new = dict_value_old + dict_value eds_dict[dict_key] = dict_value_new else: eds_dict[dict_key] = dict_value #p3 = re.compile('(^compound|^poss|^measure|^appos|^loc\_nonsp|^loc\_sp|^nonsp|^subord|^of\_p)') #m3 = p3.match(eps_parts[1]) #p4 = re.compile('(^_|^abstr\_deg|^card|^dofm|^dofw|^generic\_entity|^mofy|^much-many\_a|^named|^named\_n|^numbered\_hour|^ord|^part\_of|^person|^person\_n|^pron|^thing|^time|^time\_n|^yofc|^neg)') #m4 = p4.match(eps_parts[1]) ''' print eps_parts[1] + ' is a redundant relation type\n' if re.match(relations_collection['transparent'][2], eps_parts[1]): print eps_parts[1] + ' is a transparent relation type\n' if re.match(relations_collection['relational'][2], eps_parts[1]): print eps_parts[1] + ' is a relational relation type\n' if re.match(relations_collection['lexical'][2], eps_parts[1]): print eps_parts[1] + ' is a lexical relation type\n' ''' ''' #NOMINALIZATION if re.match('^nominalization', eps_parts[1]): #print 'NOMINALIZATION' + '\n' dict_value = [] if m1: dict_id_pos[dict_key] = m1.group() if m2 is not None: dep_line = m2.group() dep_line = dep_line[1:] dep_line = dep_line[:-1] dep_list = [dep_line] if ',' in dep_line: dep_list = dep_line.split(', ') for dep in dep_list: dep_parts = dep.split(' ') label = dep_parts[0] dependent = dep_parts[1] if label == 'ARG1': transparent_dict[dict_key] = dependent else: dict_value.append([label, dependent]) if dict_key in eds_dict: dict_value_old = eds_dict[dict_key] dict_value_new = dict_value_old + dict_value eds_dict[dict_key] = dict_value_new else: eds_dict[dict_key] = dict_value #Conjunction elif re.search('_c[<\[]', eps_parts[1]) or re.match('^implicit_conj', eps_parts[1]): if m1: dict_id_pos[dict_key] = m1.group() dict_value = [] if m2 is not None: dep_line = m2.group() dep_line = dep_line[1:] dep_line = dep_line[:-1] dep_list = [dep_line] if ',' in dep_line: dep_list = dep_line.split(', ') # we create a dictionary of dependencies tmp_dep_dict = {} #if we have L-INDEX, it will substitute *_c and take its argument for dep in dep_list: dep_parts = dep.split(' ') label = dep_parts[0] dependent = dep_parts[1] tmp_dep_dict[label] = dependent #print '\n Temporary dep dict\n\n' #pprint(tmp_dep_dict) #if conjuction had other dependants dict_value_old = [] if dict_key in eds_dict: dict_value_old = eds_dict[dict_key] if 'L-INDEX' in tmp_dep_dict and 'R-INDEX' in tmp_dep_dict: dict_value.append(['conj',tmp_dep_dict['R-INDEX']]) dict_value_new = dict_value_old + dict_value eds_dict[dict_key] = dict_value_new transparent_dict[dict_key] = tmp_dep_dict['L-INDEX'] elif 'L-HNDL' in tmp_dep_dict and 'R-HNDL' in tmp_dep_dict: dict_value.append(['conj',tmp_dep_dict['R-HNDL']]) dict_value_new = 
dict_value_old + dict_value eds_dict[dict_key] = dict_value_new transparent_dict[dict_key] = tmp_dep_dict['L-HNDL'] #relational elif m3: label = m3.group(1) label = re.sub(r'\_', r'\\_', label) #print 'COMPOUND ET AL. ', label dict_value = [] if m1: dict_id_pos[dict_key] = m1.group() if m2 is not None: dep_line = m2.group() dep_line = dep_line[1:] dep_line = dep_line[:-1] dep_list = [dep_line] if ',' in dep_line: dep_list = dep_line.split(', ') if len(dep_list) == 2: dep_parts1 = dep_list[0].split(' ') dep_parts2 = dep_list[1].split(' ') dict_key = dep_parts2[1] dependent = dep_parts1[1] #print 'dependent = ', dependent , '\n\n' dict_value.append([label, dependent]) if dict_key in eds_dict: dict_value_old = eds_dict[dict_key] dict_value_new = dict_value_old + dict_value eds_dict[dict_key] = dict_value_new else: eds_dict[dict_key] = dict_value #so-called `lexical' predicate symbols elif m4: dict_value = [] dict_id_pos[dict_key] = m1.group() if m2 is not None: dep_line = m2.group() dep_line = dep_line[1:] dep_line = dep_line[:-1] dep_list = [dep_line] if ',' in dep_line: dep_list = dep_line.split(', ') for dep in dep_list: dep_parts = dep.split(' ') label = dep_parts[0] dependent = dep_parts[1] dict_value.append([label, dependent]) if dict_key in eds_dict: dict_value_old = eds_dict[dict_key] dict_value_new = dict_value_old + dict_value eds_dict[dict_key] = dict_value_new else: eds_dict[dict_key] = dict_value ''' while eds_root in transparent_dict: eds_root = transparent_dict[eds_root] #print "EDS RELATION DICT" #pprint(eds_relation_dict) #Example of eds_relation_dict: #'x73': '_piano_n_1', #'x78': 'implicit_conj', #'x83': '_bass_n_1', #'x93': '_show_n_of', #'x96': '_slide_n_1'} #Example of eds_dict: # {'x49': [['appos', 'x47']], #'x55': [['compound_name', 'x47']], #'x61': [['compound_name', 'x55']], #'x72': [['compound', 'x49']], #'x89': [['loc_nonsp', 'e3']], #'x9': [['_and_c', 'x47']]} return [eds_root,eds_dict, dict_id_pos, transparent_dict, eds_relation_dict] #FUNCTION MY_TRAVERSE def my_traverse(self, t, first_dep, words_list, pos_tag_dict, lemma_dict, dep_dict, words_pos_dict, words_correct_case_dict,erg_tok_dict,tok_type): #this function goes through the derivation tree and creates different dictionaries and lists try: t.node except AttributeError: print '' else: parent = t.node #we would need grand parent to extract POS tag grand_parent = None if '///////' in t.node: parent_pieces= t.node.split('///////') parent = parent_pieces.pop() if len(parent_pieces) > 0: grand_parent = parent_pieces.pop() parent = parent.replace('<<<<<<<', '(') parent = parent.replace('>>>>>>>', ')') if re.match("^\".*", parent) and len(t) == 1: start = '' end = '' case_notes = '' words_list.append(parent) #print 'PARENT: ' + parent #print 'LEAF:' + '\n' + t[0] + '\n\n\n' m = re.search("\+FROM\|\|\|\|\|\|\|(#\d+=)*\\\\\"(\d+)\\\\\"", str(t[0])) if m: start = m.group(2) m = re.search("\+TO\|\|\|\|\|\|\|\\\\\"(\d+)\\\\\"", str(t[0])) if m: end = m.group(1) #print 'START '+ start + '; END ' + end + '\n\n' if start != '' and end != '': if '<' + start + ':' + end + '>' not in words_pos_dict: words_pos_dict['<' + start + ':' + end + '>'] = [len(words_list)] else: words_pos_dict['<' + start + ':' + end + '>'].append(len(words_list)) #pprint(words_pos_dict) # This is a token from the derivation tree cleaned from the markup clean_word_lowercase = self.extract_lowercased_token_erg_tok(parent) #When len(t) == 1, word in the derivation tree is represented as one token only # So for this situation we have the case_dict with 
one element in it. By default we # put no information about the case (empty string) case_list = [] #Search for the information about the case m_case = re.search("\+CASE\|\|\|\|\|\|\|([^\|]+)\|\|\|\|\|\|\|", str(t[0])) if m_case: case_list.append(['<' + start + ':' + end + '>', clean_word_lowercase,m_case.group(1)]) else: case_list.append(['<' + start + ':' + end + '>', clean_word_lowercase, ""]) case_list = self.analyze_case_descr_erg_tok(erg_tok_dict, case_list) #print("CASE_LIST") #pprint(case_list) # IN this case we have only one element in case_list and it is a list with one element # (because the word is not a multi-word expression) words_correct_case_dict[parent]= case_list[0][1] if tok_type == 'ptb': (pos_tag_dict, lemma_dict) = self.extract_pos_tag_and_lemma_ptb_tok(grand_parent, pos_tag_dict, lemma_dict, '<' + start + ':' + end + '>') else: (pos_tag_dict, lemma_dict) = self.extract_pos_tag_and_lemma_erg_tok(grand_parent, pos_tag_dict, lemma_dict, '<' + start + ':' + end + '>') elif re.match("^\".*", parent) and len(t) > 1: parent = parent.replace('_______', ' ') words_list.append(parent) #print 'PARENT: ' + parent #if parent == '"more+than"|||||||299': # print(str(t)) #print 'LEAF:' + '\n' + str(t[len(t)-1]) + '\n\n\n' start = '' end = '' #m_case = re.search("\+CASE ([^ ]+) ", str(t[0])) #if m_case: # case_notes = m_case.group(1) m = re.search("\+FROM\|\|\|\|\|\|\|(#\d+=)*\\\\\"(\d+)\\\\\"", str(t[0])) if m: start = m.group(2) m = re.search('\+TO\|\|\|\|\|\|\|\\\\\"(\d+)\\\\\"', str(t[len(t)-1])) if m: end = m.group(1) #print start + ' ' + end + '\n\n' if start != '' and end != '': if '<' + start + ':' + end + '>' not in words_pos_dict: words_pos_dict['<' + start + ':' + end + '>'] = [len(words_list)] else: words_pos_dict['<' + start + ':' + end + '>'].append(len(words_list)) #pprint(words_pos_dict) clean_word_lowercase = self.extract_lowercased_token_erg_tok(parent) if "+" in clean_word_lowercase: multiword_parts = clean_word_lowercase.split("+") case_list = self.find_case_info_for_multiword_expr_erg_tok(t, multiword_parts) case_list = self.analyze_case_descr_erg_tok(erg_tok_dict, case_list) #print("CASE_LIST") #pprint(case_list) multiword_expr_tokens = [] for elem_pos in range(len(case_list)): tok = case_list[elem_pos][1] multiword_expr_tokens.append(tok) words_correct_case_dict[parent]="+".join(multiword_expr_tokens) if tok_type == 'ptb': (pos_tag_dict, lemma_dict) = self.extract_pos_tag_and_lemma_ptb_tok(grand_parent, pos_tag_dict, lemma_dict,'<' + start + ':' + end + '>') else: (pos_tag_dict, lemma_dict) = self.extract_pos_tag_and_lemma_erg_tok(grand_parent, pos_tag_dict, lemma_dict,'<' + start + ':' + end + '>') if len(t) > 1: #print t.node parent = t.node if '///////' in t.node: parent_pieces= t.node.split('///////') parent = parent_pieces.pop() child_nodes = [] for child in t: try: child.node except AttributeError: print '' else: child_node = child.node if '///////' in child.node: child_pieces= child.node.split('///////') child_node = child_pieces.pop() child_node = child_node.replace('_______', ' ') child_node = child_node.replace('<<<<<<<', '(') child_node = child_node.replace('>>>>>>>', ')') child_nodes.append(child_node) #print '\n\n\n', child.node #print '\n\n\n========================================================\n\n\n' parent = parent.replace('_______', ' ') parent = parent.replace('<<<<<<<', '(') parent = parent.replace('>>>>>>>', ')') dep_dict[parent] = child_nodes if len(first_dep) == 0: first_dep.append(parent) for child in t: self.my_traverse(child, 
                                  first_dep, words_list, pos_tag_dict, lemma_dict, dep_dict, words_pos_dict, words_correct_case_dict, erg_tok_dict, tok_type)
        # NOTE: we do not need "case_dict" for PTB tokenization, since we take tokens from PTB items and the
        # lower/upper case is correct for them. Unfortunately, it is corrupted for ERG tokenization.
        return (first_dep, words_list, dep_dict, words_pos_dict, pos_tag_dict, lemma_dict, words_correct_case_dict, tok_type)

    # FUNCTION: READ_RULE_HEAD_FILE
    def read_rule_head_file(self, rules_file):
        f = codecs.open(rules_file, 'r', encoding='utf-8')
        rule_head_dict = {}
        for line in f.readlines():
            line = line.strip()
            rule_pieces = line.split(' ')
            rule_head_dict[rule_pieces[0]] = [rule_pieces[1], rule_pieces[2]]
        return rule_head_dict

    # FUNCTION: READ_RELATIONS_FILE
    def read_relations_file(self, relations_file):
        # We create two dictionaries for each type of relation: one dictionary contains those relations that are
        # expressed in regular-expression form in the configuration file, the other contains those relations that
        # are not expressed as regular expressions. We do not put everything in one dictionary and do not use
        # regular expressions for everything in order to speed up the search process (regexes are very slow to process).
        read_redundant = 0
        read_transparent = 0
        read_lexical = 0
        read_relational = 0
        redundant_relname_dict = {}
        redundant_regex_dict = {}
        transparent_relname_dict = {}
        transparent_regex_dict = {}
        relational_relname_dict = {}
        relational_regex_dict = {}
        lexical_relname_dict = {}
        lexical_regex_dict = {}
        for line in codecs.open(relations_file, 'r', encoding='utf-8').readlines():
            line = line.strip()
            if line == '[redundant]':
                read_redundant = 1
                read_transparent = 0
                read_lexical = 0
                read_relational = 0
            elif line == '[transparent]':
                read_redundant = 0
                read_transparent = 1
                read_lexical = 0
                read_relational = 0
            elif line == '[relational]':
                read_redundant = 0
                read_transparent = 0
                read_lexical = 0
                read_relational = 1
            elif line == '[lexical]':
                read_redundant = 0
                read_transparent = 0
                read_lexical = 1
                read_relational = 0
            elif len(line) == 0:
                read_redundant = 0
                read_transparent = 0
                read_lexical = 0
                read_relational = 0
            elif read_redundant:
                (redundant_relname_dict, redundant_regex_dict) = self.read_line_into_rel_dict(line, redundant_relname_dict, redundant_regex_dict)
            elif read_transparent:
                (transparent_relname_dict, transparent_regex_dict) = self.read_line_into_rel_dict(line, transparent_relname_dict, transparent_regex_dict)
            elif read_relational:
                (relational_relname_dict, relational_regex_dict) = self.read_line_into_rel_dict(line, relational_relname_dict, relational_regex_dict)
            elif read_lexical:
                (lexical_relname_dict, lexical_regex_dict) = self.read_line_into_rel_dict(line, lexical_relname_dict, lexical_regex_dict)
        redundant_general_regex = self.create_general_regex(redundant_relname_dict, redundant_regex_dict)
        transparent_general_regex = self.create_general_regex(transparent_relname_dict, transparent_regex_dict)
        relational_general_regex = self.create_general_regex(relational_relname_dict, relational_regex_dict)
        lexical_general_regex = self.create_general_regex(lexical_relname_dict, lexical_regex_dict)
        dictionaries = {}
        dictionaries['redundant'] = [redundant_relname_dict, redundant_regex_dict, redundant_general_regex]
        dictionaries['transparent'] = [transparent_relname_dict, transparent_regex_dict, transparent_general_regex]
        dictionaries['relational'] = [relational_relname_dict, relational_regex_dict, relational_general_regex]
        dictionaries['lexical'] = [lexical_relname_dict, 
lexical_regex_dict, lexical_general_regex] return dictionaries def read_line_into_rel_dict(self, line, relname_dict, regex_dict): #arguments (can be zero, one or two depending whether it is lexical relation or transparent relation or relational/redundant correspondingly arguments = [] #line is already stripped!!! # Check if line is a comment (comments start with ";" sign) # If it is a comment, just ignore it m_comment = re.match(r"^;", line) if m_comment: results = [relname_dict, regex_dict] return results if " " in line.strip(): line_pieces = line.split(" ") #the string before the first tabulation mark is the name of the relation relname = line_pieces[0] #everything after the first tabulation in the line are the arguments for i in range(1, len(line_pieces)): arguments.append(line_pieces[i]) # if there is no tabulation in the string, then it is lexical relation (it cannot be an empty string, because we checked for that in the function read_relations_file else: relname = line #if the relation name is expressed as regex if '/' in relname: #and we will write the relation into the general regular expression that corresponds to this type relname = relname.replace('/', '') regex_dict = self.write_relation_into_dict(relname, arguments, regex_dict) #if the relation name is NOT expressed as regex, we put it in a separate dictionary to speed up the search in future else: relname_dict = self.write_relation_into_dict(relname, arguments, relname_dict) results = [relname_dict, regex_dict] return results def create_general_regex(self, relname_dict, regex_dict): general_regex = "(" for relname in relname_dict.keys(): if general_regex == "(": general_regex = general_regex + "^" + relname else: general_regex = general_regex + "|^" + relname for relname in regex_dict.keys(): if general_regex == "(": general_regex = general_regex + relname else: general_regex = general_regex + "|" + relname general_regex = general_regex + ")" return general_regex def write_relation_into_dict(self, relname, arguments, relations_dict): #if the relation name is already in regex dict if relname in relations_dict: #if we do not have arguments and the relname is already in the dictionary, we do not have to do anything # if we have, we should add them if len(arguments) == 1: #to the existing record with the key of the relation name (expressed in the form of regex) # attach the new list of arguments relations_dict[relname].append(arguments[0]) elif len(arguments) == 2: relations_dict[relname].append(arguments) # this is only for relational relations: in case a relational relation has the third argument: a new name of the relation, e.g.: # comp ARG0 ARG2 ref # The dependency from ARG0 to ARG2 will be called "ref" instead of "comp" #to the existing record with the key of the relation name (expressed in the form of regex) # attach the new list of arguments elif len(arguments) == 3: relations_dict[relname].append(arguments) #if the relation name hasn't been in the dictionary before else: #if we do not have arguments, just write the relation name into the dictionary with the value 1 if len(arguments) == 0: relations_dict[relname] = 1 #if we have just one argument, write relname as a key and this argument as the value. elif len(arguments) == 1: relations_dict[relname] = [arguments[0]] # We need to have a list of values because the same relation name can occur more than once, e.g. 
# [relational] # /_c$/ L-HNDL R-HNDL # /_c$/ L-INDEX R-INDEX elif len(arguments) == 2: relations_dict[relname] = [arguments] # this is only for relational relations: in case a relational relation has the third argument: a new name of the relation, e.g.: # comp ARG0 ARG2 ref # The dependency from ARG0 to ARG2 will be called "ref" instead of "comp" elif len(arguments) == 3: relations_dict[relname] = [arguments] return relations_dict def get_labels_from_typed_dict(self, dict_type, relations_collection, rel): labels_from_dict = [] if rel in relations_collection[dict_type][0]: labels_from_dict = relations_collection[dict_type][0][rel] else: for regex_key in relations_collection[dict_type][1].keys(): if re.search(regex_key, rel): labels_from_dict = relations_collection[dict_type][1][regex_key] return labels_from_dict #========================================# # FUNCTIONS FOR PTB TOKENIZATION # #========================================# #FUNCTION1: extract_pos_tag_and_lemma_ptb_tok def extract_pos_tag_and_lemma_ptb_tok(self, grand_parent, pos_tag_dict, lemma_dict, index): pos_tag = '' lemma = '' if grand_parent is not None: grand_parent_pieces = grand_parent.split('|||||||') pos_tag_incorporated = grand_parent_pieces[1] if '/' in pos_tag_incorporated: pos_tag_incorporated_pieces = pos_tag_incorporated.split('/') lemma = pos_tag_incorporated_pieces[0] pos_tag = pos_tag_incorporated_pieces[1] elif '@' in pos_tag_incorporated: pos_tag_incorporated_pieces = pos_tag_incorporated.split('@') lemma = pos_tag_incorporated_pieces[0] pos_tag = pos_tag_incorporated_pieces[1] #now cut pos tag up so that we take only the part before first "_" #sub_parts = pos_tag.split('_') #pos_tag = sub_parts[0] #pos_tag = '_'.join([sub_parts[0],sub_parts[1]]) #pos_tag = '_'.join([sub_parts[0],sub_parts[1], sub_parts[2]]) pos_tag_dict[index] = pos_tag lemma_dict[index] = lemma return [pos_tag_dict, lemma_dict] #FUNCTION2: find_key_index_ptb_tok def find_key_index_ptb_tok(self, key, transparent_dict, dict_id_pos, tokens_pos_dict, split_dict, contracted_neg_split_dict_mrs, group_dict): key_id = None #DANGER OF AN INFINITE LOOP, IF THE SAME KEY IS THE "KEY" and THE "VALUE" in transparent dict while key in transparent_dict: key = transparent_dict[key] if (key in dict_id_pos): elem_id = dict_id_pos[key] if elem_id in tokens_pos_dict: key_id = tokens_pos_dict[elem_id] elif elem_id in split_dict and elem_id in contracted_neg_split_dict_mrs: key_id = tokens_pos_dict[contracted_neg_split_dict_mrs[elem_id][0]] elif elem_id in split_dict: key_id = tokens_pos_dict[split_dict[elem_id][0]] elif elem_id in group_dict: key_id = tokens_pos_dict[group_dict[elem_id]] return key_id #FUNCTION2.2: find_key_index_pred_ptb_tok def find_key_index_pred_ptb_tok(self, key, transparent_dict, dict_id_pos, tokens_pos_dict, split_dict, contracted_neg_split_dict_mrs, group_dict): key_id = None if key not in transparent_dict: if (key in dict_id_pos): elem_id = dict_id_pos[key] if elem_id in tokens_pos_dict: key_id = tokens_pos_dict[elem_id] elif elem_id in split_dict and elem_id in contracted_neg_split_dict_mrs: key_id = tokens_pos_dict[contracted_neg_split_dict_mrs[elem_id][0]] elif elem_id in split_dict: key_id = tokens_pos_dict[split_dict[elem_id][0]] elif elem_id in group_dict: key_id = tokens_pos_dict[group_dict[elem_id]] return key_id #FUNCTION3: eds_expansion_ptb_tok def eds_expansion_ptb_tok(self, eds_dep_indexes, eds_dict, tokens_pos_dict, dict_id_pos, transparent_dict, split_dict, contracted_neg_split_dict_mrs, group_dict, eds_relation_dict): 
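        # --------------------------------------------------------------------------
        # Illustrative sketch (comments only, not executed): how the relation-type
        # configuration built by read_relations_file() / read_line_into_rel_dict() is
        # looked up by get_labels_from_typed_dict().  The miniature dm.cfg fragment
        # below is hypothetical, except for the entries already quoted in the comments
        # of write_relation_into_dict() ("comp ARG0 ARG2 ref", "/_c$/ L-HNDL R-HNDL",
        # "/_c$/ L-INDEX R-INDEX").
        #
        #   [relational]
        #   ; plain names go into *_relname_dict, /regex/ names into *_regex_dict
        #   comp ARG0 ARG2 ref
        #   /_c$/ L-HNDL R-HNDL
        #   /_c$/ L-INDEX R-INDEX
        #
        # Tracing the code above, this fragment should yield (assumed shapes):
        #
        #   relations_collection['relational'][0] == {'comp': [['ARG0', 'ARG2', 'ref']]}
        #   relations_collection['relational'][1] == {'_c$': [['L-HNDL', 'R-HNDL'], ['L-INDEX', 'R-INDEX']]}
        #   relations_collection['relational'][2] == '(^comp|_c$)'   # built by create_general_regex()
        #
        # so that, for a conjunction predicate such as '_and_c':
        #
        #   self.get_labels_from_typed_dict('relational', relations_collection, '_and_c')
        #   == [['L-HNDL', 'R-HNDL'], ['L-INDEX', 'R-INDEX']]
        # --------------------------------------------------------------------------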
#print 'EDS DICT' #pprint(eds_dict) #print '\n' eds_relation_pos_dict = {} for key, value in eds_dict.iteritems(): head = self.find_key_index_ptb_tok(key, transparent_dict, dict_id_pos, tokens_pos_dict, split_dict, contracted_neg_split_dict_mrs, group_dict) #head can be 'None' for the cases when the head is not a word token if not(head is None): for dep in value: label = dep[0] dependent = self.find_key_index_ptb_tok(dep[1], transparent_dict, dict_id_pos, tokens_pos_dict, split_dict, contracted_neg_split_dict_mrs, group_dict) if head != dependent and not(dependent is None): eds_dep_indexes.append([head, dependent, label]) #print 'EDS DEPENDENCY INDEXES:' #pprint(eds_dep_indexes) #NOW EXPAND EDS_RELATION_DICT # THIS DICTIONARY IS FOR ALL PREDICATES (NOT ONLY FOR THOSE THAT ARE HEADS FOR SOME DEPENDENTS) # we had a dictionary "eds_relation_dict" for relations that mapped 'e30' -> _leading_a_1, # 'x9' -> _and_c # now we find that the position of 'e30' in the sentence is 5 and #the position of 'x9' in the sentence is 1, #and build a new dictionary eds_relation_pos_dict that maps 5 -> _leading_a_1, 1 -> _and_c for key in eds_relation_dict.keys(): key_pos_in_sent = self.find_key_index_pred_ptb_tok(key, transparent_dict, dict_id_pos, tokens_pos_dict, split_dict, contracted_neg_split_dict_mrs, group_dict) if not (key_pos_in_sent is None): eds_relation_pos_dict[key_pos_in_sent] = eds_relation_dict[key] #print 'EDS DEPENDENCY INDEXES:' #pprint(eds_dep_indexes) #print '\n\n' #print '\n\n' return [eds_dep_indexes, eds_relation_pos_dict] #FUNCTION4: dep_expansion_ptb_tok def dep_expansion_ptb_tok(self, fi, rule_head_dict, dep_rel, words_id_dict, dep_dict, dep_rel_id_list): # dep_rel[0] can be 454|||||||SP-HD_N_C|||||||1.49349|||||||0|||||||2 #We could have dep_dict like this: {'"burkina faso"|||||||22': []} which means that len(dep_dict[dep_rel[0]]) = 0 if dep_rel[0] in dep_dict and len(dep_dict[dep_rel[0]]) > 0: for i in range(0,len(dep_dict[dep_rel[0]])): if dep_dict[dep_rel[0]][i] in words_id_dict: dep_dict[dep_rel[0]][i] = words_id_dict[dep_dict[dep_rel[0]][i]] else: new_dep_rel = [] new_dep_rel.append(dep_dict[dep_rel[0]][i]) self.dep_expansion_ptb_tok(fi, rule_head_dict, new_dep_rel, words_id_dict, dep_dict, dep_rel_id_list) #here we can potentially fail dep_dict[dep_rel[0]][i] = words_id_dict[dep_dict[dep_rel[0]][i]] dep_rel_parts = dep_rel[0].split('|||||||') if re.match('^\^', dep_rel_parts[1], re.IGNORECASE): dep_rel_parts[1] = dep_rel_parts[1][1:] if dep_rel_parts[1].lower() in rule_head_dict and len(dep_dict[dep_rel[0]]) == int(rule_head_dict[ dep_rel_parts[1].lower()][0]): head_index = int(rule_head_dict[ dep_rel_parts[1].lower()][1]) #DEPENDENCY ARC LABEL (WE CAN TAKE THE WHOLE ERG LABEL, OR BEFORE FIRST "_" OR BEFORE SECOND "_" label = dep_rel_parts[1] dep_rel_subparts = dep_rel_parts[1].split('_') label = dep_rel_subparts[0] #label = "_".join([dep_rel_subparts[0], dep_rel_subparts[1]]) head = dep_dict[dep_rel[0]][head_index] for i in range(0,len(dep_dict[dep_rel[0]])): if dep_dict[dep_rel[0]][i] != head: dependent = dep_dict[dep_rel[0]][i] dep_rel_id_list.append([head, dependent,label]) if not dep_rel[0] in words_id_dict: words_id_dict[dep_rel[0]] = head #if head_index == 0: #head = dep_dict[dep_rel[0]][0] #dependent = dep_dict[dep_rel[0]][1] #elif head_index == 1: #head = dep_dict[dep_rel[0]][1] #dependent = dep_dict[dep_rel[0]][0] #else: #print >> sys.stderr, "Unexpected head index of the rule " + dep_rel_parts[1].lower() + " (head index is neither 0, nor 1; head index is " + 
str(head_index) + ")." #sys.exit(1) else: print >> sys.stderr, "Error! Sentence id " + str(fi) + ". Rule " + dep_rel_parts[1].lower() + ". Unknown rule (rule is not in the list of rules in the file erg.hds) or incorrect number of daughters in the tree." sys.exit(1) return dep_rel_id_list def convert_dt_ptb_tok(self, fi, tokens_list, lemma_tokens_dict, pos_tag_tokens_dict, ptb_pos_list,sent_root, dep_rel_tokens_list, args_dict, output_flags, fhdl_dict): #we have a list dep_indexes that contains lists in it. Each of the inner lists represents head, dependent, label. # but when we write a sentence in CoNLL format, for each word we need its head and label. #so we convert our list into a dictionary where keys are dependents and values are lists (in the Derivation Tree one dependent can have only one head) that represent head and label #print 'SENTENCE ROOT ', sent_root, '\n\n' if output_flags['dt']: fhdl_dict['dt'].write('#' + str(fi) + '\n') dep_indexes_dict = create_dep_indexes_dict(dep_rel_tokens_list) for i in range(1, len(tokens_list) + 1): head = '_' label = '_' if i in dep_indexes_dict.keys(): head = dep_indexes_dict[i][0] label = dep_indexes_dict[i][1] #"if" instead of "elif" because in file 10100030.gz we have a dependency to root node due to errors in PTB tokenization. Root should not be dependent #on anything therefore this "if" will re-write the dependency of the previous "if". If we use "elif" we would get sentence without root in CoNLL format. if i == sent_root: head = 0 label = 'ROOT' if output_flags['tex']: fhdl_dict['tex'].write('\deproot[edge above, edge style={red}]{' + str(i) + '}' + '{root}\n') word = tokens_list[i-1] #print 'WORD: ', word, '\n' if self.is_punctuation_ptb_tok(word): lemma = word pos_tag = word else: lemma = lemma_tokens_dict[i] pos_tag = pos_tag_tokens_dict[i] #We have several sentences that end with e.g. # says . `` # and `` is lost in the derivation tree. # PTB tokenization # (64, 22, 23, <22:23>, 1, "says" "says", 0, "null", "VBZ" 1.0) # (65, 23, 24, <23:24>, 1, "." ".", 0, "null", "." 1.0) # (66, 24, 25, <24:25>, 1, "“" "``", 0, "null", "``" 1.0) # Derivation tree: # (224, 20, 21, <22:24>, 1, "says.", 0, "null", "VBZ" 1.0) # Therefore `` did not get a head during our analysis. # We have to fix that by attaching `` to the nearest word on the left which is not a punctuation symbol. if head == '_': for j in range(i-2,-1,-1): if not self.is_punctuation_ptb_tok(tokens_list[j]): head = j+1 if self.is_punctuation_ptb_tok(word): label = 'PUNCT' break if self.is_punctuation_ptb_tok(word): lemma = word pos_tag = word else: lemma = lemma_tokens_dict[i] pos_tag = pos_tag_tokens_dict[i] if output_flags['dt']: fhdl_dict['dt'].write(str(i) + '\t' + word + '\t' + "_" + '\t' + ptb_pos_list[i-1] + '\t' + pos_tag + '\t' + '_' + '\t' + str(head) + '\t' + label + '\t' + '_' + '\t' + '_' +'\n') if output_flags['tex']: if i!= sent_root: #Latex requires that underscore symbol is escaped. 
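            # ------------------------------------------------------------------
            # Illustrative sketch (comments only, not executed): the shape of the
            # DT output written above.  Reusing the "Ad hoc oil arrived." example
            # from the comments of convert_dt_yy_ptb_tok() further below, the
            # CoNLL-style rows written to fhdl_dict['dt'] would look roughly like
            # this (PTB PoS column left as "_" since that profile has no PTB tags):
            #
            #   1   Ad        _   _   aj_-_i_le   _   2   MWE      _   _
            #   2   hoc       _   _   aj_-_i_le   _   3   AJ-HDN   _   _
            #   3   oil       _   _   n_-_mc_le   _   4   SB-HD    _   _
            #   4   arrived   _   _   v_-_le      _   0   ROOT     _   _
            #   5   .         _   _   .           _   4   PUNCT    _   _
            #
            # and the corresponding tikz-dependency lines written to fhdl_dict['tex']
            # would have the form (with label underscores escaped, as noted above):
            #
            #   \deproot[edge above, edge style={red}]{4}{root}
            #   \depedge[edge above, edge style={red}]{2}{1}{MWE}
            # ------------------------------------------------------------------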
label = label.replace('_', '\_') fhdl_dict['tex'].write('\depedge[edge above, edge style={red}]' + '{' + str(head) + '}' + '{' + str(i) + '}' + '{' + label + '}' + '\n') if output_flags['dt']: fhdl_dict['dt'].write('\n') return def convert_dm_ptb_tok(self, fi, tokens_list, lemma_tokens_dict ,pos_tag_tokens_dict, ptb_pos_list, sent_root_mrs_derived, eds_dep_indexes, args_dict, output_flags, fhdl_dict): #Remove duplates from eds_dep_indexes eds_dep_indexes.sort() eds_dep_indexes_no_dup = list(eds_dep_indexes for eds_dep_indexes, _ in itertools.groupby(eds_dep_indexes)) #print 'EDS DEP INDEXES NO DUP' #pprint(eds_dep_indexes_no_dup) #print '\n' #print 'EDS DEP INDEXES' #pprint(eds_dep_indexes) #print '\n' #if there is no root, then our variable sent_root_mrs_derived = '' #if there is a root, its index is in an integer format and is calculated starting from 1: #Example: #'This is the most common conception...' # sent_root_mrs_derived = '2' (the copula 'is' is the root in this case) if output_flags['dm']: fhdl_dict['dm'].write('#' + str(fi) + '\n') if sent_root_mrs_derived != '': #If a word is a punctuation, it means it was cut from the word, therefore the lemma and pos tag saved in the dictionary concern the word, not the punctuation. We replace them just repeating the punctuation if self.is_punctuation_ptb_tok(tokens_list[sent_root_mrs_derived-1]): lemma = tokens_list[sent_root_mrs_derived-1] pos_tag = tokens_list[sent_root_mrs_derived-1] else: #if it is not a puntuation, use the lemma and the pos from the dictionaries lemma = lemma_tokens_dict[sent_root_mrs_derived] pos_tag = pos_tag_tokens_dict[sent_root_mrs_derived] #in English MRS Test Suite we are not given the PTB pos tags therefore ptb_pos_tag[sent_root_mrs_derived - 1] = '_' #we should just print out ERG PoS tags twice #print('ptb_pos_list') #pprint(ptb_pos_list) if ptb_pos_list[sent_root_mrs_derived - 1] != '_': #in the case of WeScience and PEST ptb_pos_tag = ptb_pos_list[sent_root_mrs_derived - 1] else: #in the case of English MRS Test Suite, use ERG PoS tag ptb_pos_tag = pos_tag_tokens_dict[sent_root_mrs_derived] if output_flags['dm']: fhdl_dict['dm'].write('ROOT' + '\t' + '_' + '\t' + 'ROOT' + '\t' + 'ROOT' + '\t' + '-1' + '\t' + 'ROOT' + '\t' + 'ROOT' + '\t' + tokens_list[sent_root_mrs_derived-1] + '\t' + "_" + '\t' + pos_tag + '\t' + ptb_pos_tag + '\t' + str(sent_root_mrs_derived - 1) + '\n') if output_flags['tex']: fhdl_dict['tex'].write('\deproot[edge style={blue}]{' + str(sent_root_mrs_derived) + '}' + '{root}\n') # Example: # eds_dep_indexes_no_dup: # [[13, 15, 'conj'], # [15, 16, 'ARG1'], # [17, 16, 'ARG1'], # [17, 20, 'ARG2'], # # The indexes are counted from 1, while in PAS we count indexes from 0 for i in range(0, len(eds_dep_indexes_no_dup)): head_word_index = eds_dep_indexes_no_dup[i][0] - 1 head_word = tokens_list[head_word_index] #If a head word is a punctuation, it means it was cut from the word, therefore the lemma and pos tag saved in the dictionary concern the word, not the punctuation. We replace them just repeating the punctuation if self.is_punctuation_ptb_tok(head_word): #print head_word head_lemma = head_word head_word_pos = head_word else: #if it is not a puntuation, use the lemma and the pos from the dictionaries head_lemma = lemma_tokens_dict[head_word_index + 1] head_word_pos = pos_tag_tokens_dict[head_word_index + 1] #we want to have a column with PTB PoS tag. However, for English MRS Test Suite we do not have them available. 
#we will have ptb_pos_list[head_word_index]!= '_' in this case and therefore we will use ERG PoS tag instead. if ptb_pos_list[head_word_index]!= '_': head_word_ptb_pos = ptb_pos_list[head_word_index] else: head_word_ptb_pos = head_word_pos dep_word_index = eds_dep_indexes_no_dup[i][1] - 1 dep_word = tokens_list[dep_word_index] #If a dependency word is a punctuation, it means it was cut from the word, therefore the lemma and pos tag saved in the dictionary concern the word, not the punctuation. We replace them just repeating the punctuation if self.is_punctuation_ptb_tok(dep_word): dep_lemma = dep_word dep_word_pos = dep_word else: #if it is not a puntuation, use the lemma and the pos from the dictionaries dep_lemma = lemma_tokens_dict[dep_word_index + 1] dep_word_pos = pos_tag_tokens_dict[dep_word_index + 1] if ptb_pos_list[dep_word_index] != '_': dep_word_ptb_pos = ptb_pos_list[dep_word_index] else: dep_word_ptb_pos = dep_word_pos label = eds_dep_indexes_no_dup[i][2] if output_flags['dm']: fhdl_dict['dm'].write(head_word + '\t' + "_" + '\t' + head_word_pos + '\t' + head_word_ptb_pos + '\t' + str(head_word_index) + '\t' + label + '\t' + label + '\t' + dep_word + '\t' + "_" + '\t' + dep_word_pos + '\t' + dep_word_ptb_pos + '\t' + str(dep_word_index) + '\n') if output_flags['tex']: eds_dep_indexes_no_dup[i][2] = eds_dep_indexes_no_dup[i][2].replace('_', '\_') fhdl_dict['tex'].write('\depedge[edge style={blue}]' + '{' + str(eds_dep_indexes_no_dup[i][0]) + '}' + '{' + str(eds_dep_indexes_no_dup[i][1]) + '}' + '{' + eds_dep_indexes_no_dup[i][2] + '}' + '\n') if output_flags['dm']: fhdl_dict['dm'].write('\n') return def convert_dt_yy_ptb_tok(self, fi, all_ptb_pos_with_probabilities_list, tokens_pos_dict,tokens_list, lemma_tokens_dict, pos_tag_tokens_dict, ptb_pos_list,dt_sent_root, dt_dep_rel_tokens_list, split_dict, args_dict, output_flags, fhdl_dict): # WORK WITH DT dt_dep_indexes_dict = create_dep_indexes_dict(dt_dep_rel_tokens_list) if dt_sent_root != '': dt_dep_indexes_dict[dt_sent_root] = [0, u'ROOT'] #Re-arrange tokens_pos_dict to fix the cases like #22100021 # Current tokens_pos_dict: #{'<56:61>': 12, #'<61:62>': 13, #'<62:64>': 15, ...} # Desired tokens_pos_dict: # Current tokens_pos_dict: #{'<56:61>': 12, #'<61:62>': 13, #<62:63> : 14, #'<63:64>': 15, ...} # This comes from the export: # 22100021 #(12, 11, 12, <56:61>, 1, "click", 0, "null", "NN" 1.0) #(13, 12, 13, <61:62>, 1, ".", 0, "null", "." 
1.0) #(14, 13, 14, <62:64>, 1, "”", 0, "null", "''" 1.0) #(15, 14, 15, <62:64>, 1, ")", 0, "null", ")" 1.0) updated_tokens_pos_dict = {} token_pos_start_char_id_dict = {} tokens_pos_dict_sorted_by_values = sorted(tokens_pos_dict.items(), key=lambda (key,value): value) for i in range(0, len(tokens_pos_dict_sorted_by_values)): char_based_id = tokens_pos_dict_sorted_by_values[i][0] token_position = tokens_pos_dict_sorted_by_values[i][1] char_based_id_parts = char_based_id[1:-1].split(":") char_based_id_start = int(char_based_id_parts[0]) char_based_id_end = int(char_based_id_parts[1]) if token_position == i+2 and char_based_id_end - char_based_id_start == 2 and self.is_punctuation_ptb_tok(tokens_list[i]) and self.is_punctuation_ptb_tok(tokens_list[i+1]): updated_tokens_pos_dict['<' + str(char_based_id_start) + ':' + str(char_based_id_start+1) + '>' ] = i +1 updated_tokens_pos_dict['<' + str(char_based_id_start +1 ) + ':' + str(char_based_id_start+2) + '>' ] = i +2 # and the dictionary that contains opposite: by token position we find start of character id token_pos_start_char_id_dict[i +1] = char_based_id_start token_pos_start_char_id_dict[i +2] = char_based_id_start +1 else: updated_tokens_pos_dict[char_based_id] = token_position token_pos_start_char_id_dict[token_position] = char_based_id_start #print("dt_dep_indexes_dict") #pprint(dt_dep_indexes_dict) #pprint(tokens_list) #pprint(tokens_pos_dict) #print "Updated tokens pos dict" #pprint(updated_tokens_pos_dict) # IF the ERG token was split in PTB tokenization, we will provide the dependency for the head word before the # first non-punctuation elment of the the split ERG token # ERG TOK in profile # (41, 0, 1, <0:2>, 1, "ad", 0, "null") #(45, 0, 1, <0:2>, 1, "ad", 0, "null") #(38, 1, 2, <3:6>, 1, "hoc", 0, "null") #(42, 1, 2, <3:6>, 1, "hoc", 0, "null") #(39, 2, 3, <7:10>, 1, "oil", 0, "null") #(43, 2, 3, <7:10>, 1, "oil", 0, "null") #(40, 3, 4, <11:19>, 1, "arrived.", 0, "null") #(44, 3, 4, <11:19>, 1, "arrived.", 0, "null") # # PTB TOK in profile #(1, 0, 1, <0:2>, 1, "Ad", 0, "null") #(2, 1, 2, <3:6>, 1, "hoc", 0, "null") #(3, 2, 3, <7:10>, 1, "oil", 0, "null") #(4, 3, 4, <11:18>, 1, "arrived", 0, "null") #(5, 4, 5, <18:19>, 1, ".", 0, "null") # # DTM #1 Ad _ _ aj_-_i_le _ 2 MWE _ _ _ MWE _ _ #2 hoc _ _ aj_-_i_le _ 3 AJ-HDN _ _ _ad+hoc_a_1 _ _ _ #3 oil _ _ n_-_mc_le _ 4 SB-HD _ _ _oil_n_1 ARG1 _ ARG1 #4 arrived _ _ v_-_le _ 0 ROOT _ _ ^_arrive_v_1 _ _ _ #5 . _ _ . 
_ 4 PUNCT _ _ _ _ _ _ # # YY # [2] | #(0, 0, 1, 1, "⌊→¦aj-hdn¦7⌋", 0, "null") #(1, 1, 2, <0:2>, 1, "Ad", 0, "null" ) #(2, 2, 3, <3:6>, 1, "hoc", 0, "null" ) #(3, 3, 4, 1, "⌊→¦sb-hd¦11⌋", 0, "null") #(4, 4, 5, <7:10>, 1, "oil", 0, "null" ) #(5, 5, 6, 1, "⌊→¦root⌋", 0, "null") #(6, 6, 7, <11:18>, 1, "arrived", 0, "null") #(7, 7, 8, <18:19>, 1, ".", 0, "null") # First collect MWE expressions and change dependencies for them: # ignore_dep_list - a list of character ids of tokens for which dependencies should be ignored ignore_dep_list = [] # change_dep_dict - a dictionary of character ids of tokens for which dependencies should be changed change_dep_dict = {} mwe_dict = defaultdict(list) #print("dt_dep_indexes_dict") #pprint(dt_dep_indexes_dict) change_outgoing_dep_for_mwe = {} for tok_index in range(1, len(tokens_list) + 1): # COLLECT INFO ABOUT DT dt_head = u'_' dt_label = u'_' if tok_index in dt_dep_indexes_dict.keys(): dt_head = dt_dep_indexes_dict[tok_index][0] dt_label = dt_dep_indexes_dict[tok_index][1] if dt_label == u'MWE': mwe_dict[int(dt_head)].append(int(tok_index)) #print("mwe_dict") #pprint(mwe_dict) for el in mwe_dict: if el in dt_dep_indexes_dict.keys(): el_head = dt_dep_indexes_dict[el][0] el_label = dt_dep_indexes_dict[el][1] mwe_dict[el].sort() ignore_dep_list.append(el) # Before outgoing dependencies from MWE were from the head of MWE # The head of MWE used to be "el" # Now we put all the annotations on the first token of MWE: "mwe_dict[el][0]" # So the outgoing dependencies from MWE should be also now started from "mwe_dict[el][0]" instead of "el" change_outgoing_dep_for_mwe[el] = mwe_dict[el][0] change_dep_dict[mwe_dict[el][0]] = [el_head, el_label] #print("ignore_dep_list") #pprint(ignore_dep_list) #print("change_dep_dict") #pprint(change_dep_dict) #print("change_outgoing_dep_for_mwe") #pprint(change_outgoing_dep_for_mwe) # NOW COLLECT ALL DT INFO FOR YY FORMAT # first four variables for yy out_path = u'1' out_ipos = u'0' out_lrule = u'"null"' final_tokens_list = [] out_id = 0 out_start = 0 # Loop over sentence tokens for i in range(1, len(tokens_list) + 1): # COLLECT INFO ABOUT DT dt_head = u'_' dt_label = u'_' if i in dt_dep_indexes_dict.keys(): dt_head = dt_dep_indexes_dict[i][0] dt_label = dt_dep_indexes_dict[i][1] #"if" instead of "elif" because in file 10100030.gz we have a dependency to root node due to errors in PTB tokenization. Root should not be dependent #on anything therefore this "if" will re-write the dependency of the previous "if". If we use "elif" we would get sentence without root in CoNLL format. if i == dt_sent_root: dt_head = 0 dt_label = u'ROOT' word = tokens_list[i-1] #.replace("“", "``").replace("”", "''").replace("‘", "`").replace("’", "'") #print 'WORD: ', word, '\n' character_based_id_start = u'_' character_based_id_end = u'_' # COLLECT CHARACTER-BASED ID OF THE WORD, e.g. token 20 corresponds to a word "of" and has a character-based id <99:101> for char_based_id, token_pos in updated_tokens_pos_dict.iteritems(): if token_pos == i: char_based_id_parts = char_based_id[1:-1].split(':') character_based_id_start = char_based_id_parts[0] character_based_id_end = char_based_id_parts[1] out_link = '<' + character_based_id_start + ':' + character_based_id_end + '>' word = word.decode('utf-8') #We have several sentences that end with e.g. # says . `` # and `` is lost in the derivation tree. # PTB tokenization # (64, 22, 23, <22:23>, 1, "says" "says", 0, "null", "VBZ" 1.0) # (65, 23, 24, <23:24>, 1, "." ".", 0, "null", "." 
1.0) # (66, 24, 25, <24:25>, 1, "“" "``", 0, "null", "``" 1.0) # Derivation tree: # (224, 20, 21, <22:24>, 1, "says.", 0, "null", "VBZ" 1.0) # Therefore `` did not get a head during our analysis. # We have to fix that by attaching `` to the nearest word on the left which is not a punctuation symbol. if dt_head == u'_': for j in range(i-2,-1,-1): if not self.is_punctuation_ptb_tok(tokens_list[j]): dt_head = j+1 if self.is_punctuation_ptb_tok(word): dt_label = u'PUNCT' break if i in change_dep_dict: dt_head = change_dep_dict[i][0] dt_label = change_dep_dict[i][1] #dependent_start_char_pos = token_pos_start_char_id_dict[dt_head] #out_form = u'"⌊→¦' + dt_label.lower() + u'¦' + str(dependent_start_char_pos) + u'⌋"' # This is dependency #final_tokens_list.append("(" + ", ".join([str(out_id), str(out_start), str(out_start + 1), out_path, out_form, out_ipos, out_lrule]) + ")") #out_id += 1 #out_start += 1 if dt_head in change_outgoing_dep_for_mwe: dt_head = change_outgoing_dep_for_mwe[dt_head] if dt_label == u'ROOT' and u'-' not in word and u'/' not in word and (i not in ignore_dep_list): out_form = u'"⌊→¦root⌋"' # This is dependency final_tokens_list.append("(" + ", ".join([str(out_id), str(out_start), str(out_start + 1), out_path, out_form, out_ipos, out_lrule]) + ")") out_id += 1 out_start += 1 # We still check for dt_label != u'PUNCT' despite having excluded punct that were obtained from split dictionary, because there are special cases when some punctuation was absent from the ERG tree but was in the PTB tokenization if dt_label != u'PUNCT' and dt_label != u'ROOT' and dt_label != u'MWE' and dt_label != u'NEG' and u'-' not in word and u'/' not in word and (i not in ignore_dep_list): dependent_start_char_pos = token_pos_start_char_id_dict[dt_head] out_form = u'"⌊→¦' + dt_label.lower() + u'¦' + str(dependent_start_char_pos) + u'⌋"' # This is dependency final_tokens_list.append("(" + ", ".join([str(out_id), str(out_start), str(out_start + 1), out_path, out_form, out_ipos, out_lrule]) + ")") out_id += 1 out_start += 1 # This is token #word = '"' + word.decode('utf-8') + '"' word = '"' + word + '"' # If there are no PoS tags with probabilities, do not include them at the end if all_ptb_pos_with_probabilities_list[i-1]== "": final_tokens_list.append("(" + ", ".join([str(out_id), str(out_start), str(out_start + 1), out_link, out_path, word, out_ipos, out_lrule]) + ")") else: final_tokens_list.append("(" + ", ".join([str(out_id), str(out_start), str(out_start + 1), out_link, out_path, word, out_ipos, out_lrule, all_ptb_pos_with_probabilities_list[i-1]]) + ")") out_id += 1 out_start += 1 if output_flags['dt_yy']: fhdl_dict['dt_yy'].write('[' + str(fi) + '] |' + " ".join(final_tokens_list) + "\n") #print('[' + str(fi) + '] |' + " ".join(final_tokens_list) + "\n") def convert_dtm_ptb_tok(self, fi, tokens_list, lemma_tokens_dict, pos_tag_tokens_dict, ptb_pos_list,dt_sent_root, dt_dep_rel_tokens_list, args_dict, output_flags, fhdl_dict, dm_sent_root, dm_eds_dep_indexes, eds_relation_pos_dict): fhdl_dict['dtm'].write('#' + str(fi) + '\n') #FIRST WORK ON DM REPRESENTATION #Sort relations by position in the sentence # Now they are unsorted in the dictionary eds_relation_pos_dict sorted_eds_relation_pos_list = sorted(eds_relation_pos_dict.keys()) #print("eds_relation_pos_dict") #pprint(eds_relation_pos_dict) #print "Sorted EDS RELATION POS LIST" #pprint(sorted_eds_relation_pos_list) #Rearrange the array for DM dm_dependencies = collect_arguments_for_dm_predicates(dm_eds_dep_indexes, eds_relation_pos_dict, 
sorted_eds_relation_pos_list) # WORK WITH DT #print("dt_dep_rel_tokens_list") #pprint(dt_dep_rel_tokens_list) dt_dep_indexes_dict = create_dep_indexes_dict (dt_dep_rel_tokens_list) #print("dt_dep_indexes_dict") #pprint(dt_dep_indexes_dict) # NOW COLLECT ALL INFO FOR DT AND DM AND WRITE OUTPUT for i in range(1, len(tokens_list) + 1): # COLLECT INFO ABOUT DT dt_head = '_' dt_label = '_' if i in dt_dep_indexes_dict.keys(): dt_head = dt_dep_indexes_dict[i][0] dt_label = dt_dep_indexes_dict[i][1] #"if" instead of "elif" because in file 10100030.gz we have a dependency to root node due to errors in PTB tokenization. Root should not be dependent #on anything therefore this "if" will re-write the dependency of the previous "if". If we use "elif" we would get sentence without root in CoNLL format. if i == dt_sent_root: dt_head = 0 dt_label = 'ROOT' word = tokens_list[i-1] #print 'WORD: ', word, '\n' if self.is_punctuation_ptb_tok(word): lemma = word pos_tag = word else: lemma = lemma_tokens_dict[i] pos_tag = pos_tag_tokens_dict[i] #We have several sentences that end with e.g. # says . `` # and `` is lost in the derivation tree. # PTB tokenization # (64, 22, 23, <22:23>, 1, "says" "says", 0, "null", "VBZ" 1.0) # (65, 23, 24, <23:24>, 1, "." ".", 0, "null", "." 1.0) # (66, 24, 25, <24:25>, 1, "“" "``", 0, "null", "``" 1.0) # Derivation tree: # (224, 20, 21, <22:24>, 1, "says.", 0, "null", "VBZ" 1.0) # Therefore `` did not get a head during our analysis. # We have to fix that by attaching `` to the nearest word on the left which is not a punctuation symbol. if dt_head == '_': for j in range(i-2,-1,-1): if not self.is_punctuation_ptb_tok(tokens_list[j]): dt_head = j+1 if self.is_punctuation_ptb_tok(word): dt_label = 'PUNCT' break if self.is_punctuation_ptb_tok(word): lemma = word pos_tag = word else: lemma = lemma_tokens_dict[i] pos_tag = pos_tag_tokens_dict[i] #NOW COLLECT information about DM #by default dm_predicate is "_" # and for all predicate current word is not an argument by default ("_") dm_predicate = "_" if i in eds_relation_pos_dict: #if current word is a predicate dm_predicate = eds_relation_pos_dict[i] #if current word is the root, we add "^" symbol to the predicate to mark that it is a root, e.g. #^_ (empty predicate, ) if not(dm_sent_root == ''): if i == int(dm_sent_root): dm_predicate = "^" + dm_predicate dm_is_argument = "\t".join(["_"] * len(sorted_eds_relation_pos_list)) if i in dm_dependencies: dm_is_argument = "\t".join(dm_dependencies[i]) fhdl_dict['dtm'].write(str(i) + '\t' + word + '\t' + lemma + '\t' + ptb_pos_list[i-1] + '\t' + pos_tag + '\t' + '_' + '\t' + '_' + '\t' + '_' + '\t' + str(dt_head) + '\t' + dt_label + '\t' + dm_predicate + '\t' + dm_is_argument +'\n') fhdl_dict['dtm'].write('\n') return #FUNCTION8: dep_expansion_to_tokens_ptb_tok def dep_expansion_to_tokens_ptb_tok(self, dep_rel_id_list, tokens_id_dict, tokens_pos_dict, words_pos_dict, tokens_list, pos_tag_dict, lemma_dict): #here we transform the list dep_rel_id_list #[['<24:29>', '<20:23>', 'SP-HD'], #['<17:19>', '<24:29>', 'HD-CMP'], #['<5:16>', '<17:19>', 'HD-CMP'],...] #into dep_rel_tokes_list #[[5,4,'SP-HD'], #[3,5,'HD-CMP'], #[2, 3, 'HD-CMP'],...] 
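#(Each character span such as '<24:29>' is resolved to a 1-based PTB token position below:
# first via tokens_pos_dict, and otherwise via split_dict or group_dict when the
# derivation-tree word had to be split up or grouped to match the PTB tokens.)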
#so we base tokenization on PennTreebank tokenization, not on ERG tokenization #first we have to fill in split_dict and group_dict #since we created list_dep_rel_id_list from words_pos_dict, we will use tokens_pos_dict and words_pos_dict to fill in #split_dict and group_dict and partially fill in dep_rel_id_list (we add relations between the splitted items to it) fill_split_group_dictionaries_results = self.fill_split_group_dictionaries_ptb_tok(words_pos_dict, tokens_pos_dict, tokens_list, pos_tag_dict, lemma_dict) dep_rel_tokens_list = fill_split_group_dictionaries_results[0] eds_dep_indexes = fill_split_group_dictionaries_results[1] group_dict = fill_split_group_dictionaries_results[2] split_dict = fill_split_group_dictionaries_results[3] contracted_neg_split_dict_mrs = fill_split_group_dictionaries_results[4] pos_tag_dict = fill_split_group_dictionaries_results[5] lemma_dict = fill_split_group_dictionaries_results[6] for dep_rel in dep_rel_id_list: #dep_rel looks like ['<24:29>', '<30:32>', 'SP-HD'] #new_dep_rel is an empty list of size three. It will be a re-written version of dep_rel in a form [5,4,'SP-HD'] new_dep_rel = [None]*3 #the label in new_dep_rel will be the same as in dep_rel, e.g. 'SP-HD' new_dep_rel[2] = dep_rel[2] for i in range(0,2): #dep_rel[i] looks like '<24:29>' when i=0 and '<30:32>' when i=1 if dep_rel[i] in tokens_pos_dict: new_dep_rel[i]= tokens_pos_dict[dep_rel[i]] elif dep_rel[i] in split_dict: id_of_main_token = split_dict[dep_rel[i]][0] new_dep_rel[i] = tokens_pos_dict[id_of_main_token] elif dep_rel[i] in group_dict: id_of_main_token = group_dict[dep_rel[i]][0] new_dep_rel[i] = tokens_pos_dict[id_of_main_token] else: print >> sys.stderr, "Id from dep_rel_id_list is not found neither in tokens_pos_dict, nor in split_dict, nor in group_dict" print >> sys.stderr, "Id " + dep_rel[i] + " from the relation " + ', '.join(dep_rel) + " in dep_rel_id_list" #we do not want loops in the tree, so we should make sure that first and second elements in ['<24:29>', '<30:32>', 'SP-HD'] are different if new_dep_rel[0] != new_dep_rel[1]: dep_rel_tokens_list.append(new_dep_rel) dep_expansion_to_tokens_results= [dep_rel_tokens_list, eds_dep_indexes, group_dict, split_dict, contracted_neg_split_dict_mrs, pos_tag_dict, lemma_dict] return dep_expansion_to_tokens_results #FUNCTION9: fill_split_group_dictionaries_ptb_tok def fill_split_group_dictionaries_ptb_tok(self,words_pos_dict, tokens_pos_dict, tokens_list, pos_tag_dict, lemma_dict): dep_rel_tokens_list = [] eds_dep_indexes = [] group_dict = {} split_dict = {} contracted_neg_split_dict_mrs = {} ''' print 'WORDS POS DICTIONARY' pprint(words_pos_dict) print 'TOKENS POS DICTIONARY' pprint(tokens_pos_dict) print 'TOKENS LIST' pprint(tokens_list) print 'POS TAG DICT' pprint(pos_tag_dict) print 'LEMMA DICT' pprint(lemma_dict) ''' tokens_pos_dict_tmp = {} for token_key in tokens_pos_dict.keys(): #tree_key = '<24:29>'. But when we do tree_key[1:-1] we get tree_key = '24:29' token_key_cut = token_key[1:-1] #tree_key_parts = [24, 29] token_key_parts = token_key_cut.split(':') #start_index = 24 token_start_index = int(token_key_parts[0]) #end_index = 29 token_end_index = int(token_key_parts[1]) tokens_pos_dict_tmp[token_key] = [token_start_index, token_end_index] for tree_key in words_pos_dict.keys(): #if tree_key in tokens_pos_dict and len(words_pos_dict[tree_key]) > 1: if tree_key not in tokens_pos_dict: #tree_key = '<24:29>'. 
But when we do tree_key[1:-1] we get tree_key = '24:29' tree_key_cut = tree_key[1:-1] #tree_key_parts = [24, 29] tree_key_parts = tree_key_cut.split(':') #start_index = 24 start_index = int(tree_key_parts[0]) #end_index = 29 end_index = int(tree_key_parts[1]) #now we have to figure out if to split or join words from the derivation tree to match PennTreebank tokenization parts_of_splitted_key = [] for tok_key in tokens_pos_dict_tmp.keys(): if start_index >= tokens_pos_dict_tmp[tok_key][0] and end_index <= tokens_pos_dict_tmp[tok_key][1]: group_dict[tree_key] = tok_key elif tokens_pos_dict_tmp[tok_key][0] >= start_index and tokens_pos_dict_tmp[tok_key][1] <= end_index: parts_of_splitted_key.append(tok_key) #It might be that the part of the splitted word has already been in the words dictionary in the case of "end- state." Here "end-" has id <237:246> and "state." has id #<237:247>. We split "state." into "state" with id <237:246> and "." with id <246:247>. Since id <237:246> has already been in the words_pos_dict because of "end", it means the words "end-" and "state" will be grouped together. This means, POS tag for "end-state" will be the same as for "state", not the same as for "end-". It cannot be punctuation, because punctuation does not function as independent tokens in ERG approach. if tok_key in words_pos_dict: if int(words_pos_dict[tok_key][0]) < int(words_pos_dict[tree_key][0]): #print 'Substitute POS tag ', pos_tag_dict[tok_key], ' with POS tag ', pos_tag_dict[tree_key], '\n\n' pos_tag_dict[tok_key] = pos_tag_dict[tree_key] lemma_dict[tok_key] = lemma_dict[tree_key] #if the word was splitted, we need to define the head. By default, it is the last token in the expression. However, we do not want a punctuation symbol #to be the head so we will check that the head is not a punctuation. #The problem is that we have the keys in parts_of_splitted_key in the random order because we received them in the line #>for tok_key in tokens_pos_dict_tmp.keys(): # since keys are not ordered, we got them in random order. # To set up the last word in the multi-word expression as head we have to order the multi-word expression first. if len(parts_of_splitted_key) > 0: #sort elements in parts_of_splitted_key parts_of_splitted_key = sorted(parts_of_splitted_key, self.compare_ptb_tok) #the head will be the last element of the array head = parts_of_splitted_key[len(parts_of_splitted_key) - 1] #print 'INITIAL HEAD OF COMPOUND: ', head # A flag that shows whether there is a non-punctuation head for the # split token found_non_punctuation_head_flag = 0 # Loop from the last element of the splitted token and # make the last non-punctuation symbol head for counter in range(len(parts_of_splitted_key)-1, -1, -1): #print 'COMPOUND PARTS: ' + parts_of_splitted_key[counter] token_id = parts_of_splitted_key[counter] token_pos = tokens_pos_dict[token_id] #print 'TOKEN POS ' + str(token_pos) #print 'TOKENS LIST' #pprint(tokens_list) token_value = tokens_list[token_pos - 1] if not (self.is_punctuation_ptb_tok(token_value) or (token_value == "n't") or (token_value == "n’t")): #print token_value + ' has no punct' head = token_id found_non_punctuation_head_flag = 1 #print 'HEAD OF COMPOUND INSIDE THE LOOP ', token_value break #print found_non_punctuation_head_flag # Special case! 
If all the elements of the split token are # punctuation symbols, than some symbols should be preferred # over the other if not found_non_punctuation_head_flag: for counter in range(len(parts_of_splitted_key)-1, -1, -1): #print 'COMPOUND PARTS: ' + parts_of_splitted_key[counter] token_id = parts_of_splitted_key[counter] token_pos = tokens_pos_dict[token_id] #print 'TOKEN POS ' + str(token_pos) #print 'TOKENS LIST' #pprint(tokens_list) token_value = tokens_list[token_pos - 1] if self.is_end_of_phrase_punctuation_ptb_tok(token_value): #print token_value + ' has no punct' head = token_id #print 'HEAD OF COMPOUND INSIDE THE LOOP ', token_value break split_dict[tree_key] = [head, parts_of_splitted_key] #NOW WE HAVE TO LOOP AGAIN AND find out if we have split a contracted negation #FOR the Derivation Tree it is not important because the # contracted negation contains apostrofi therefore it contains puntuatin and it won't become a head # For the derivation tree we want "doesn't" to be split into #"does" and "n't" with "does" as the head. So we can use "split_dict" for it #However for the MRS-derived dependencies we want "n't" to become a head therefore we have to create and additional dictionary contracted_neg_split_dict_mrs for counter in range(len(parts_of_splitted_key)-1, -1, -1): #print 'COMPOUND PARTS: ' + parts_of_splitted_key[counter] token_id = parts_of_splitted_key[counter] token_pos = tokens_pos_dict[token_id] #print 'TOKEN POS ' + str(token_pos) #print 'TOKENS LIST' #pprint(tokens_list) token_value = tokens_list[token_pos - 1] if token_value == "n't" or token_value == "n’t": #print token_value + ' contracted negation' contr_neg_head = token_id contracted_neg_split_dict_mrs[tree_key] = [contr_neg_head, parts_of_splitted_key] #print 'HEAD OF COMPOUND INSIDE THE LOOP ', head break #print 'HEAD OF COMPOUND AFTER THE LOOP ', head for counter in range(len(parts_of_splitted_key)-1, -1, -1): #we do not want loops in the tree if parts_of_splitted_key[counter] != head: token_id = parts_of_splitted_key[counter] token_pos = tokens_pos_dict[token_id] #print 'TOKEN POS ' + str(token_pos) #print 'TOKENS LIST' #pprint(tokens_list) token_value = tokens_list[token_pos - 1] head_value = tokens_list[tokens_pos_dict[head]-1] #print 'head_value: ' + head_value if (token_value == "n’t" or token_value == "n't") and not(self.is_punctuation_ptb_tok(token_value)): dep_rel_tokens_list.append([tokens_pos_dict[head],token_pos, 'NEG']) eds_dep_indexes.append([ token_pos, tokens_pos_dict[head],'NEG']) #print "NEGATION: " + head_value elif self.is_punctuation_ptb_tok(token_value) == 0: dep_rel_tokens_list.append([tokens_pos_dict[head], token_pos, 'MWE']) eds_dep_indexes.append([tokens_pos_dict[head], token_pos, 'MWE']) else: dep_rel_tokens_list.append([tokens_pos_dict[head], token_pos, 'PUNCT']) #print 'SPLIT DICT IN THE FUNCTION ' #pprint(split_dict) #print 'Contracted Negation dictionary for MRS-derived dep. 
in the function' #pprint(contracted_neg_split_dict_mrs) #print 'GROUP DICT IN THE FUNCTION ' #pprint(group_dict) fill_split_group_dictionaries_results = [dep_rel_tokens_list, eds_dep_indexes, group_dict, split_dict, contracted_neg_split_dict_mrs, pos_tag_dict, lemma_dict] return fill_split_group_dictionaries_results #FUNCTION10: has_punctuation_ptb_tok # NOT USED, PUNCTUATION LIST IS INCOMPLETE def has_punctuation_ptb_tok(self, string_to_check): has_punct = 0 check = ['!', ',', '.', ':', '?', ';', "'", '"', "“", "”", '(', ')' , '[', ']', '{', '}', '¦'] for p in check: if p in string_to_check: has_punct = 1 break return has_punct #FUNCTION11: compare_ptb_tok def compare_ptb_tok(self, a, b): #a = '<24:29>'. But when we do a[1:-1] we get token1_key_cut = '24:29' token1_key_cut = a[1:-1] #token1_key_parts = [24, 29] token1_key_parts = token1_key_cut.split(':') #token1_start_index = 24 token1_start_index = int(token1_key_parts[0]) #b = '<30:33>'. But when we do b[1:-1] we get token2_key_cut = '30:33' token2_key_cut = b[1:-1] #token2_key_parts = [30, 33] token2_key_parts = token2_key_cut.split(':') #token2_start_index = 30 token2_start_index = int(token2_key_parts[0]) return cmp(int(token1_start_index), int(token2_start_index)) # compare as integers #FUNCTION12: create_pos_tag_token_dict_ptb_tok def create_pos_tag_token_dict_ptb_tok(self, pos_tag_dict, tokens_pos_dict, group_dict, split_dict): pos_tag_tokens_dict = {} for key in pos_tag_dict.keys(): if key in tokens_pos_dict: pos_tag_tokens_dict[tokens_pos_dict[key]] = pos_tag_dict[key] elif key in group_dict: pos_tag_tokens_dict[tokens_pos_dict[group_dict[key]]] = pos_tag_dict[key] elif key in split_dict: for tokens_id in split_dict[key][1]: pos_tag_tokens_dict[tokens_pos_dict[tokens_id]] = pos_tag_dict[key] return pos_tag_tokens_dict #FUNCTION13: create_lemma_token_dict_ptb_tok def create_lemma_token_dict_ptb_tok(self, lemma_dict, tokens_pos_dict, group_dict, split_dict, words_id_dict): lemma_tokens_dict = {} for key in lemma_dict.keys(): if key in tokens_pos_dict: lemma_tokens_dict[tokens_pos_dict[key]] = lemma_dict[key] elif key in group_dict: lemma_tokens_dict[tokens_pos_dict[group_dict[key]]] = lemma_dict[key] elif key in split_dict: for tokens_id in split_dict[key][1]: lemma_tokens_dict[tokens_pos_dict[tokens_id]] = lemma_dict[key] #print "LEMMA TOKENS DICT" #pprint(lemma_tokens_dict) return lemma_tokens_dict #FUNCTION14: def analyze_input_file_ptb_tok(self, fi, file_index, current_latex_file_index, latex_doc_start, fhdl_dict, args_dict, output_flags, rule_head_dict, relations_collection): if output_flags['tex']: file_index = file_index + 1 if file_index % 50 == 0: fhdl_dict['tex'].write('\end{document}') fhdl_dict['tex'].close() current_latex_file_index = current_latex_file_index + 1 fhdl_dict['tex'] = open(args_dict['tex'] + os.path.basename(os.path.normpath(args_dict['data'])) + "_ptb_tok_" + str(current_latex_file_index) + '.tex', 'w') fhdl_dict['tex'].write(latex_doc_start) print >> sys.stderr, fi current_file = gzip.open(args_dict['data'] + '/' + str(fi) + '.gz', 'rb') derivation_tree = '' read_tree = 0 after_derivation_tree = 0 previous_line_empty = 0 read_eds = 0 eds = '' search_original_sentence = 1 #original_sentence = '' search_tokenization = 0 read_tokenization = 0 read_done = 0 partial_line = '' #sometimes we have #("the" # 246 #instead of ("the" 246 #but we need this index to create unique identifiers for words token_index_missing = 0 tokens_id_dict = {} tokens_pos_dict = {} tokens_list = [] ptb_pos_list = [] 
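# A minimal illustration of what these containers end up holding (toy token line in the
# format shown in the tokenization comments below; the concrete values are assumed, not
# taken from a specific corpus file):
#   input line:              (1, 0, 1, <0:3>, 1, "The", 0, "null")
#   tokens_id_dict['<0:3>']  = 'The'   (character span -> surface token)
#   tokens_pos_dict['<0:3>'] = 1       (character span -> 1-based token position)
#   tokens_list              = ['The', ...]
#   ptb_pos_list             = ['_', ...]  (only 8 columns, so no PTB PoS tag is available)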
all_ptb_pos_with_probabilities_list = [] # to count space punct regex = re.compile(' [!|\,|\.|\:|\?|\;|\'|\"|“|”|‘|’|\(|\)|\[|\]|\{|\}|¦|¦i|«|»]+$') for line in current_file.readlines(): if read_done == 0: line = line.strip() #m = re.search('[^`]+`(.*)\'(?: \[(.*)\])?$', line) m = re.search('[^`]+`(.*)\'(?: \[(.*)\])?$', line) if m and search_original_sentence ==1: #Normally the line with the sentence looks like this: #[10320040] (1 of 1) {1} `As of [[June 30]] [[2008]] the company has 19,604 full-time employees.' #But for PEST corpus it is different: #[20201001] (1 of 1) {1} `((S (NP-SBJ (NNP Rolls-Royce) (NNP Motor) (NNPS Cars) (NNP Inc.)) (VP (VBD said) (SBAR (-NONE- 0) (S (NP-SBJ (PRP it)) (VP (VBZ expects) (S (NP-SBJ (PRP$ its) (NNP U.S.) (NNS sales)) (VP (TO to) (VP (VB remain) (ADJP-PRD (JJ steady)) (PP-LOC-CLR (IN at) (NP (QP (IN about) (CD 1,200)) (NNS cars))) (PP-TMP (IN in) (NP (CD 1990)))))))))) (. .)))' [Rolls-Royce Motor Cars Inc. said it expects its U.S. sales to remain steady at about 1,200 cars in 1990.] #if corpus == 'pest': #original_sentence = m.group(2).lower() # original_sentence = m.group(2) #else: #original_sentence = m.group(1).lower() # original_sentence = m.group(1) #if regex.search(m.group(1)): # self.space_punct_count = self.space_punct_count + 1 #This script was used to collect the names of the files that contained cases like # ". ..." and ". '" at the end of the sentence #here we write the file numbers for those files that contain cases like #". ..." at the end #file_handle = open('/Users/angelina/Documents/WeSearch/2012_Contrastive_parsing_experiments/03_First_15_sections/test_converter/error_log.txt', 'a') #file_handle.write(str(fi) + "\n") #file_handle.write(m.group(1) + "\n") #file_handle.write("\n") #file_handle.close() #print m.group(1) ## end of the script that collected ". 
..."-type cases search_original_sentence = 0 search_tokenization = 1 elif previous_line_empty == 1 and search_tokenization == 1 and line == '<': read_tokenization = 1 search_tokenization = 0 elif read_tokenization == 1 and line == '>': read_tokenization = 0 search_derivation_tree = 1 elif read_tokenization == 1 and line != '>': line_parts = line.split(', ') #we remove quotes from the string value that is why we use [1:-1] #we take <0:3> as the key and The as the value token_part = line_parts[5][1:-1] if '" "' in line_parts[5][1:-1]: token_parts = line_parts[5][1:-1].split('" "') token_part = token_parts[0] #WeScience, English MRS Test Suite: #(1, 0, 1, <0:3>, 1, "The", 0, "null") #PEST AND deepbank: #(42, 0, 1, <0:1>, 1, "The" "The", 0, "null", "DT" 1.0) #we decided that the tokens' positions are in the third column (tokens positions start from 1) tokens_id_dict[line_parts[3]] = token_part #tokens_id_dict[line_parts[3]] = token_part.lower() tokens_pos_dict[line_parts[3]] = int(string.replace(line_parts[2], '(', '')) tokens_list.append(token_part) #in WeScience corpus we have 9 columns, and the 9th contains PTB POS #In the English MRS Test Suite we have only 8 columns, so we are not given PTB POS tags char_based_id = line_parts[3] if len(line_parts) >= 9 : ptb_pos_aux_list = line_parts[8].split(' ') ptb_pos = ptb_pos_aux_list[0] ptb_pos = ptb_pos[1:-1] ptb_pos_list.append(ptb_pos) # [0:-1] is necessary to remove closing bracket all_ptb_pos_with_prob = line_parts[8][0:-1] #print "all_ptb_pos_with_prob",line_parts[8] all_ptb_pos_with_probabilities_list.append(all_ptb_pos_with_prob) else: all_ptb_pos_with_probabilities_list.append('') ptb_pos_list.append('_') elif line.startswith('(ROOT'): read_tree = 1 derivation_tree = line + '\n' previous_line_empty = 0 read_eds = 0 elif len(line) == 0: previous_line_empty = 1 if read_tree == 1: read_tree = 0 after_derivation_tree = 1 elif read_eds == 1: read_eds = 0 read_done = 1 elif read_tree == 1: line = line.strip() if token_index_missing == 1 and re.match('^\d+$', line): line = partial_line + '|||||||' + line derivation_tree = derivation_tree + line + '\n' token_index_missing = 0 partial_line = '' else: #this is a pattern for lines like ("understood" 264 match_pattern = re.match('(^\(")([^"]+)("\s+\d+$)', line) #this is a pattern when we have #("the" # 246 #and currently are analyzing line ("the" match_pattern2 = re.match('(^\(")([^"]+)("$)', line) if match_pattern and match_pattern.group(1) and match_pattern.group(2) and match_pattern.group(3): line_begin = match_pattern.group(1) line_content = match_pattern.group(2) line_end = match_pattern.group(3) #print 'LINE ORIGINAL ', line #print 'LINE BEGIN ', line_begin #print 'LINE CONTENT ', line_content #print 'LINE END ', line_end #substitute brackets with <<<<<<<, otherwise we won't be able to read a tree line_content = line_content.replace('(', '<<<<<<<') line_content = line_content.replace(')', '>>>>>>>') #multiword expression, we will consider them as one word at+least if ' ' in line_content: line_content = '+'.join(line_content.split(' ')) line = line_begin + line_content + line_end #now the line #(at least( 25 #will look #(at+least<<<<<<<|||||||25 line = '|||||||'.join(line.split(' ')) #print 'LINE FINAL ', line, '\n\n' derivation_tree = derivation_tree + line + '\n' #this is the case when the line is incomplete because the indificator is on the next line elif match_pattern2 and match_pattern2.group(1) and match_pattern2.group(2) and match_pattern2.group(3): line_begin = match_pattern2.group(1) 
line_content = match_pattern2.group(2) line_end = match_pattern2.group(3) #print 'LINE ORIGINAL ', line #print 'LINE BEGIN ', line_begin #print 'LINE CONTENT ', line_content #print 'LINE END ', line_end line_content = line_content.replace('(', '<<<<<<<') line_content = line_content.replace(')', '>>>>>>>') #multiword expression if ' ' in line_content: line_content = '+'.join(line_content.split(' ')) line = line_begin + line_content + line_end partial_line = line token_index_missing = 1 else: #here we take care about the lines like "token [ +CARG #1=\"the\" +CLASS alphabetic [ +CASE non_capitalized+lower +INITIAL - ] +FORM #1 +FROM \"175\" +ID *diff-list* [ LAST #2=*top* LIST *cons* [ FIRST \"25\" REST #2 ] ] +PRED predsort +TNT null_tnt [ +MAIN tnt_main [ +PRB \"1\" +TAG \"DT\" ] +PRBS *null* +TAGS *null* ] +TO \"178\" +TRAIT native_trait ]")))) #they should not contain brackets in the middle for match_pattern1 in re.finditer('(\\\\\")([^"]+)(\\\\\")',line): #print 'MATCH_PATTERN1.group ', match_pattern1.group() if '(' in match_pattern1.group(0) or ')' in match_pattern1.group(0): line_begin = match_pattern1.group(1) line_content = match_pattern1.group(2) line_end = match_pattern1.group(3) #print 'LINE ORIGINAL ', line #print 'LINE BEGIN ', line_begin #print 'LINE CONTENT ', line_content #print 'LINE END ', line_end line_content = line_content.replace('(', '<<<<<<<') line_content = line_content.replace(')', '>>>>>>>') #print 'substitute ', match_pattern1.group(0), ' with ', line_begin + line_content + line_end line = re.sub(re.escape(match_pattern1.group(0)), line_begin + line_content + line_end, line, 1) line = '|||||||'.join(line.split(' ')) #print 'LINE FINAL ', line, '\n\n' derivation_tree = derivation_tree + line + '\n' previous_line_empty = 0 read_eds = 0 elif after_derivation_tree == 1 and previous_line_empty == 1 and re.match('^\{.+',line) and (line[-1] == ':' or re.search('\:\s+\(fragmented\)', line) or re.search('\:\s+\(cyclic\)', line) or re.search('\:\s+\(cyclic fragmented\)', line)): eds = eds + line + '\n' read_eds = 1 previous_line_empty = 0 after_derivation_tree = 0 elif read_eds == 1: eds = eds + line + '\n' previous_line_empty =0 current_file.close() try: t = Tree.parse(derivation_tree) except ValueError: raise ValueError("Missing or incorrect derivation tree!") t.collapse_unary(True, True, '///////') #t.draw() traverse_results = self.my_traverse(t, [], [], {}, {}, {},{}, {}, {},'ptb') first_dep = '' if len(traverse_results[0]) > 0: first_dep = traverse_results[0][0] words_list = traverse_results[1] dep_dict = traverse_results[2] words_pos_dict = traverse_results[3] #print 'WORDS POS DICT' #pprint(words_pos_dict) pos_tag_dict = traverse_results[4] lemma_dict = traverse_results[5] indexes = range(1,len(words_list)+1) #print('\n\n WORDS LIST\n') #pprint(words_list) #print '====================\n\n\n DEPENDENCY DICTIONARY\n\n' #pprint(dep_dict) #print '\n\n First dependency: ' + first_dep + '\n\n' sentence = '' for i in range(0,len(tokens_list)): sentence = sentence + tokens_list[i] + ' ' sentence = sentence.strip() words_dict = dict(zip(words_list, indexes)) #print '\n\n WORDS DICTIONARY\n' #pprint(words_dict) #print '\n\n WORDS POSITION DICTIONARY\n' #pprint(words_pos_dict) #merge words_pos_dict and words_list into a new dictionary words_id_dict # words_pos_dict looks like #{'<0:2>': 1, #'<100:105>': 17, #'<106:110>': 18, #'<111:113>': 19, # ... #} #words_list looks like #['"it"|||||||430', # ... #'"large"|||||||329', #'"body"|||||||331', #'"of"|||||||333', # ... 
#] #So the new words_id_dict will look like: #{'"it"|||||||430': '<0:2>' #'"large"|||||||329': '<100:105>' #'"body"|||||||331': '<106:110>' #'"of"|||||||333': '<111:113>' # ... #} words_id_dict = {} #pprint(words_pos_dict) for word_id in words_pos_dict.keys(): for k in range(0, len(words_pos_dict[word_id])): words_id_dict[words_list[words_pos_dict[word_id][k]-1]] = word_id #pprint(words_id_dict) dep_rel = [] dep_rel_id_list = [] dep_rel_tokens_list = [] group_dict = {} split_dict = {} contracted_neg_split_dict_mrs = {} sent_root = '' eds_dep_indexes = [] if first_dep != '': dep_rel.append(first_dep) #print 'WORDS ID DICT' #pprint(words_id_dict) dep_rel_id_list = self.dep_expansion_ptb_tok(fi, rule_head_dict, dep_rel, words_id_dict, dep_dict, []) #print 'DEPENDENCY RELATION ID LIST' #pprint(dep_rel_id_list) #print 'WORDS ID DICT AFTER DEP EXPANSION' #pprint(words_id_dict) #print '\n\n' #now we need to compute real relationships between separate tokens dep_expansion_to_tokens_results = self.dep_expansion_to_tokens_ptb_tok(dep_rel_id_list, tokens_id_dict, tokens_pos_dict, words_pos_dict, tokens_list, pos_tag_dict, lemma_dict) dep_rel_tokens_list = dep_expansion_to_tokens_results[0] eds_dep_indexes = dep_expansion_to_tokens_results[1] group_dict = dep_expansion_to_tokens_results[2] split_dict = dep_expansion_to_tokens_results[3] contracted_neg_split_dict_mrs = dep_expansion_to_tokens_results[4] pos_tag_dict = dep_expansion_to_tokens_results[5] lemma_dict = dep_expansion_to_tokens_results[6] #print 'SPLIT DICTIONARY' #pprint(split_dict) #print 'DEP REL TOKENS LIST' #pprint(dep_rel_tokens_list) sent_root_id = words_id_dict[first_dep] if sent_root_id in tokens_pos_dict: sent_root = tokens_pos_dict[sent_root_id] elif sent_root_id in group_dict: sent_root = tokens_pos_dict[group_dict[sent_root_id]] elif sent_root_id in split_dict: sent_root = tokens_pos_dict[split_dict[sent_root_id][0]] #one-word sentence: the word should be the root elif first_dep == '' and len(tokens_pos_dict) == 1: sent_root = 1 #This is a complex case such as file ws01/10011680.gz where # PTB tokenization is: # (1, 0, 1, <30:39>, 1, "Reduction", 0, "null", "NNP" 0.537 "NN" 0.463) #(2, 1, 2, <44:45>, 1, ".", 0, "null", "." 1.0) # while ERG approach looks different: # (22 reduction_n1/n_pp_mc-of_le 0 0 1 # ("reduction." 19 #+FROM \"30\" TO \"45\" # In this case we have one-word sentence for ERG and two-word sentence for PTB elif first_dep == '' and len(words_pos_dict) == 1 and len(tokens_pos_dict) > 1: fill_split_group_dictionaries_results = self.fill_split_group_dictionaries_ptb_tok(words_pos_dict, tokens_pos_dict, tokens_list, pos_tag_dict, lemma_dict) dep_rel_tokens_list = fill_split_group_dictionaries_results[0] eds_dep_indexes = fill_split_group_dictionaries_results[1] group_dict = fill_split_group_dictionaries_results[2] split_dict = fill_split_group_dictionaries_results[3] contracted_neg_split_dict_mrs = fill_split_group_dictionaries_results[4] pos_tag_dict = fill_split_group_dictionaries_results[5] lemma_dict = fill_split_group_dictionaries_results[6] #dep_rel_tokens_list will contain the dependency relations. As we are breaking out a compound with/without punctuation or a word with punctuation, # it is likely that the head is the same in each of the new dependency relation. 
So we will take the first head of the first relation in the list # as a sentence root sent_root = dep_rel_tokens_list[0][0] if output_flags['sent_tok']: fhdl_dict['sent_tok'].write('#' + str(fi) + "\t" + sentence + "\n") sentence = escape_sent(sentence) words_in_sent = sentence.split(' ') #print 'EDS' #print eds + '\n\n' read_eds_results = self.read_eds_into_dict(eds, relations_collection) eds_root = read_eds_results[0] eds_dict = read_eds_results[1] dict_id_pos = read_eds_results[2] transparent_dict = read_eds_results[3] eds_relation_dict = read_eds_results[4] #print 'EDS DICT' #pprint(eds_dict) #print 'EDS RELATION DICT' #pprint(eds_relation_dict) #print 'TRANSPARENT DICTIONARY' #pprint(transparent_dict) #print '\n\n' (eds_dep_indexes, eds_relation_pos_dict) = self.eds_expansion_ptb_tok(eds_dep_indexes, eds_dict, tokens_pos_dict, dict_id_pos, transparent_dict, split_dict, contracted_neg_split_dict_mrs, group_dict, eds_relation_dict) #print 'EDS DEP INDEXES' #pprint(eds_dep_indexes) sent_root_mrs_derived = '' #if eds_root in eds_dict and eds_root in dict_id_pos: if eds_root in dict_id_pos: #EXAMPLE: #WeScience corpus, sentence 10010020 #eds_root = 'e2' #dict_id_pos['e2'] = '<95:97>' #It is incorrect to set a root as words_pos_dict['<95:97>'] = 10 because we #split two commas and the root is no more the word number 10 in the sentence #Therefore we use tokens_pos_dict['<95:97>'] = 13 which is a new position #of the root in the sentence (after the punctuation was cut off from the words #it was attached to. sent_root_mrs_derived_id = dict_id_pos[eds_root] if sent_root_mrs_derived_id in tokens_pos_dict: sent_root_mrs_derived = tokens_pos_dict[sent_root_mrs_derived_id] elif sent_root_mrs_derived_id in group_dict: sent_root_mrs_derived = tokens_pos_dict[group_dict[sent_root_mrs_derived_id]] elif sent_root_mrs_derived_id in split_dict and sent_root_mrs_derived_id in contracted_neg_split_dict_mrs: sent_root_mrs_derived = tokens_pos_dict[contracted_neg_split_dict_mrs[sent_root_mrs_derived_id][0]] elif sent_root_mrs_derived_id in split_dict: sent_root_mrs_derived = tokens_pos_dict[split_dict[sent_root_mrs_derived_id][0]] if output_flags['tex']: fhdl_dict['tex'].write('\\begin{center}\n' + '\\begin{dependency}[edge below]\n' + '\\begin{deptext}[column sep=.05cm]\n') fhdl_dict['tex'].write(words_in_sent[0]) for i in range(1, len(words_in_sent)): fhdl_dict['tex'].write(' \& ' + words_in_sent[i]) fhdl_dict['tex'].write('\\\\\n' + '\end{deptext}\n') pos_tag_tokens_dict = self.create_pos_tag_token_dict_ptb_tok(pos_tag_dict, tokens_pos_dict, group_dict, split_dict) lemma_tokens_dict = self.create_lemma_token_dict_ptb_tok(lemma_dict, tokens_pos_dict, group_dict, split_dict, words_id_dict) if output_flags['dt'] or output_flags['tex']: #if we have to print out tex file, we anyway have to extract everything for DT self.convert_dt_ptb_tok(fi, tokens_list, lemma_tokens_dict, pos_tag_tokens_dict, ptb_pos_list, sent_root, dep_rel_tokens_list, args_dict, output_flags, fhdl_dict) if output_flags['dm'] or output_flags['tex']: #if we have to print out tex file, we anyway have to extract everything for DM self.convert_dm_ptb_tok(fi, tokens_list, lemma_tokens_dict, pos_tag_tokens_dict, ptb_pos_list, sent_root_mrs_derived, eds_dep_indexes, args_dict, output_flags,fhdl_dict) if output_flags['dtm']: self.convert_dtm_ptb_tok(fi, tokens_list, lemma_tokens_dict, pos_tag_tokens_dict, ptb_pos_list, sent_root, dep_rel_tokens_list, args_dict, output_flags, fhdl_dict, sent_root_mrs_derived, eds_dep_indexes, eds_relation_pos_dict) 
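# For reference, convert_dt_yy_ptb_tok (called below when --dt_yy is requested) emits two
# kinds of yy records, mirroring the example at the top of that function:
#   dependency pseudo-token: (id, start, start+1, 1, "⌊→¦label¦char_pos⌋", 0, "null")
#   word token:              (id, start, start+1, <from:to>, 1, "word", 0, "null"[, PTB tags with probabilities])
# (illustrative shapes only; the exact fields are assembled inside convert_dt_yy_ptb_tok)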
#pprint(split_dict) if output_flags['dt_yy']: self.convert_dt_yy_ptb_tok(fi, all_ptb_pos_with_probabilities_list, tokens_pos_dict, tokens_list, lemma_tokens_dict, pos_tag_tokens_dict, ptb_pos_list, sent_root, dep_rel_tokens_list, split_dict, args_dict, output_flags, fhdl_dict) if output_flags['tex']: fhdl_dict['tex'].write('\end{dependency}\n' + '\\\\\n' + str(fi) + '.gz' + '\end{center}' + '\n\\newpage') return [file_index, current_latex_file_index, fhdl_dict] #FUNCTION def is_punctuation_ptb_tok(self, string_to_check): is_punct = 0 matchObj = re.match( r'^[!|\,|\.|\:|\?|\;|\'|\"|“|”|‘|’|`|\(|\)|\[|\]|\{|\}|¦|¦i|¦|«|»]+$', string_to_check) if matchObj: is_punct = 1 return is_punct #FUNCTION def is_end_of_phrase_punctuation_ptb_tok(self, string_to_check): is_end_of_phrase_punct = 0 matchObj = re.match( r'^[!|\,|\.|\:|\?|\;]+$', string_to_check) if matchObj: is_end_of_phrase_punct = 1 return is_end_of_phrase_punct #FUNCTION is_punctuation_erg_tok # The only difference with is_punctuation_ptb_tok is that we do not check for "¦i" because # when we call this function, we intend to send as string_to_check only one letter def is_punctuation_erg_tok(self, string_to_check): is_punct = 0 matchObj = re.match( r'^[!|\,|\.|\:|\?|\;|\'|\"|“|”|‘|’|`|\(|\)|\[|\]|\{|\}|¦|¦|«|»]+$', string_to_check) if matchObj: is_punct = 1 return is_punct #========================================# # FUNCTIONS FOR ERG TOKENIZATION # #========================================# #FUNCTION1: extract_pos_tag_and_lemma_erg_tok def extract_pos_tag_and_lemma_erg_tok(self, grand_parent, pos_tag_dict, lemma_dict, index): pos_tag = '' if grand_parent is not None: #example of grand_parent value: #224|||||||be_id_is/v_np_is_le|||||||0.776934|||||||1|||||||2 #the pos_tag will be 'v_np_is_le' #the lemma will be 'be' grand_parent_pieces = grand_parent.split('|||||||') pos_tag_incorporated = grand_parent_pieces[1] if '/' in pos_tag_incorporated: pos_tag_incorporated_pieces = pos_tag_incorporated.split('/') lemma = pos_tag_incorporated_pieces[0] pos_tag = pos_tag_incorporated_pieces[1] elif '@' in pos_tag_incorporated: pos_tag_incorporated_pieces = pos_tag_incorporated.split('@') lemma = pos_tag_incorporated_pieces[0] pos_tag = pos_tag_incorporated_pieces[1] #now cut pos tag up so that we take only the part before first "_" #sub_parts = pos_tag.split('_') #pos_tag = sub_parts[0] #pos_tag = '_'.join([sub_parts[0],sub_parts[1]]) #pos_tag = '_'.join([sub_parts[0],sub_parts[1], sub_parts[2]]) pos_tag_dict[len(pos_tag_dict.keys())] = pos_tag lemma_dict[len(lemma_dict.keys())] = lemma return [pos_tag_dict, lemma_dict] #FUNCTION2: find_key_index_erg_tok def find_key_index_erg_tok(self, key, transparent_dict, dict_id_pos, words_pos_dict): key_id = None while key in transparent_dict: key = transparent_dict[key] if (key in dict_id_pos) and (dict_id_pos[key] in words_pos_dict): key_id = words_pos_dict[dict_id_pos[key]][0] return key_id #FUNCTION2.1: find_key_index_pred_erg_tok (it is different, because for predicate names we do # not want to substitute the predicate with a value in the transparent dict, e.g.) # # Abrams arrived and Browne sang. 
# {e3: #_1:proper_q<0:6>[BV x6] #x6:named<0:6>("Abrams")[] #e10:_arrive_v_1<7:14>[ARG1 x6] #e3:_and_c<15:18>[L-INDEX e10, R-INDEX e13, L-HNDL e10, R-HNDL e13] #_2:proper_q<19:25>[BV x16] #x16:named<19:25>("Browne")[] #e13:_sing_v_1<26:31>[ARG1 x16] #} # # e3 is transparent and is equated with e10 # Function find_key_index_erg_tok is auxilliary to determine dependency relation # and in that case we want e10 to be the head of e13 with the label _and_c # But when we identify the predicate name we still want e10 to have predicate "_arrive_v_1" (not "_and_c"): #1 Abrams _ n_-_pn_le n_-_pn_le _ _ _ 2 SB-HD named _ ARG1 _ _ #2 arrived _ v_-_le v_-_le _ _ _ 3 CL-CL ^_arrive_v_1 _ _ _ _ #3 and _ c_xp_and_le c_xp_and_le _ _ _ 0 ROOT _ _ _ _ _ #4 Browne _ n_-_pn_le n_-_pn_le _ _ _ 5 SB-HD named _ _ _ ARG1 #5 sang. _ v_np*_le v_np*_le _ _ _ 3 MRK-NH _sing_v_1 _ _and_c _ _ # def find_key_index_pred_erg_tok(self, key, transparent_dict,dict_id_pos, words_pos_dict): key_id = None if key not in transparent_dict: # e.g. "and" should not get a predicate because it is transparent if (key in dict_id_pos) and (dict_id_pos[key] in words_pos_dict): key_id = words_pos_dict[dict_id_pos[key]][0] return key_id #FUNCTION3: eds_expansion_erg_tok def eds_expansion_erg_tok(self, eds_dict, words_pos_dict, dict_id_pos, transparent_dict, eds_relation_dict): eds_dep_indexes = [] eds_relation_pos_dict = {} for key, value in eds_dict.iteritems(): head = self.find_key_index_erg_tok(key, transparent_dict, dict_id_pos, words_pos_dict) #head can be 'None' for the cases when the head is not a word token if not(head is None): for dep in value: label = dep[0] dependent = self.find_key_index_erg_tok(dep[1], transparent_dict, dict_id_pos, words_pos_dict) #if key == '|_31': # print dependent if head != dependent and not(dependent is None): eds_dep_indexes.append([head, dependent, label]) #NOW EXPAND EDS_RELATION_DICT # THIS DICTIONARY IS FOR ALL PREDICATES (NOT ONLY FOR THOSE THAT ARE HEADS FOR SOME DEPENDENTS) # we had a dictionary "eds_relation_dict" for relations that mapped 'e30' -> _leading_a_1, # 'x9' -> _and_c # now we find that the position of 'e30' in the sentence is 5 and #the position of 'x9' in the sentence is 1, #and build a new dictionary eds_relation_pos_dict that maps 5 -> _leading_a_1, 1 -> _and_c for key in eds_relation_dict.keys(): key_pos_in_sent = self.find_key_index_pred_erg_tok(key, transparent_dict, dict_id_pos, words_pos_dict) if not (key_pos_in_sent is None): eds_relation_pos_dict[key_pos_in_sent] = eds_relation_dict[key] #print 'EDS DEPENDENCY INDEXES:' #pprint(eds_dep_indexes) #print '\n\n' return [eds_dep_indexes, eds_relation_pos_dict] #FUNCTION4: dep_expansion_erg_tok def dep_expansion_erg_tok(self, fi, rule_head_dict, dep_rel, words_dict, dep_dict, dep_rel_indexes): # dep_rel[0] can be 454|||||||SP-HD_N_C|||||||1.49349|||||||0|||||||2 #We could have dep_dict like this: {'"burkina faso"|||||||22': []} which means that len(dep_dict[dep_rel[0]]) = 0 if len(dep_dict[dep_rel[0]]) > 0: for i in range(0,len(dep_dict[dep_rel[0]])): if dep_dict[dep_rel[0]][i] in words_dict: dep_dict[dep_rel[0]][i] = words_dict[dep_dict[dep_rel[0]][i]] else: new_dep_rel = [] new_dep_rel.append(dep_dict[dep_rel[0]][i]) self.dep_expansion_erg_tok(fi, rule_head_dict, new_dep_rel, words_dict, dep_dict, dep_rel_indexes) #here we can potentially fail dep_dict[dep_rel[0]][i] = words_dict[dep_dict[dep_rel[0]][i]] dep_rel_parts = dep_rel[0].split('|||||||') #print dep_rel_parts[1] if re.match('^\^', dep_rel_parts[1], re.IGNORECASE): 
dep_rel_parts[1] = dep_rel_parts[1][1:] #print dep_rel_parts[1] #print '\n' if dep_rel_parts[1].lower() in rule_head_dict and len(dep_dict[dep_rel[0]]) == int(rule_head_dict[ dep_rel_parts[1].lower()][0]): head_index = int(rule_head_dict[ dep_rel_parts[1].lower()][1]) dep_rel_subparts = dep_rel_parts[1].split('_') #label = '_'.join([dep_rel_subparts[0],dep_rel_subparts[1]]) #label = dep_rel_parts[1] label = dep_rel_subparts[0] head = dep_dict[dep_rel[0]][head_index] for i in range(0,len(dep_dict[dep_rel[0]])): if dep_dict[dep_rel[0]][i] != head: dependent = dep_dict[dep_rel[0]][i] dep_rel_indexes.append([head, dependent,label]) if not dep_rel[0] in words_dict: words_dict[dep_rel[0]] = head #if head_index == 0: #head = dep_dict[dep_rel[0]][0] #dependent = dep_dict[dep_rel[0]][1] #elif head_index == 1: #head = dep_dict[dep_rel[0]][1] #dependent = dep_dict[dep_rel[0]][0] #else: #print >> sys.stderr, "Unexpected head index of the rule " + dep_rel_parts[1].lower() + " (head index is neither 0, nor 1; head index is " + str(head_index) + ")." #sys.exit(1) else: print >> sys.stderr, "Error! Sentence id " + str(fi) + ". Rule " + dep_rel_parts[1].lower() + ". Unknown rule (rule is not in the list of rules in the file erg.hds) or incorrect number of daughters in the tree." sys.exit(1) return dep_rel_indexes def convert_dt_erg_tok(self, fi, words_list, words_correct_case_dict, pos_tag_dict, lemma_dict, sent_root, dep_rel_indexes, args_dict, output_flags, fhdl_dict): if output_flags['dt']: fhdl_dict['dt'].write('#' + str(fi) + '\n') #we have a list dep_indexes that contains lists in it. Each of the inner lists represents head, dependent, label. # but when we write a sentence in CoNLL format, for each word we need its head and label. #so we convert our list into a dictionary where keys are dependents and values are lists (in the Derivation Tree one dependent can have only one head) that represent head and label dep_indexes_dict = create_dep_indexes_dict(dep_rel_indexes) for i in range(1, len(words_list) + 1): head = '_' label = '_' if i in dep_indexes_dict.keys(): head = dep_indexes_dict[i][0] label = dep_indexes_dict[i][1] elif i == sent_root: head = 0 label = 'ROOT' if output_flags['tex']: if sent_root != '': fhdl_dict['tex'].write('\deproot[edge above, edge style={red}]{' + str(sent_root) + '}' + '{root}\n') #print '\n\n ELEMENT OF WORDS_LIST: ', words_list[i-1] word = words_correct_case_dict[words_list[i-1]] #print 'WORD: ', word, '\n' if output_flags['dt']: fhdl_dict['dt'].write(str(i) + '\t' + word + '\t' + "_" + '\t' + pos_tag_dict[i-1] + '\t' + pos_tag_dict[i-1] + '\t' + '_' + '\t' + str(head) + '\t' + label + '\t' + '_' + '\t' + '_' +'\n') if 'dt' in fhdl_dict: fhdl_dict['dt'].write('\n') if output_flags['tex']: for i in range(0,len(dep_rel_indexes)): #escape underscore symbol in the latex dep_rel_indexes[i][2] = dep_rel_indexes[i][2].replace('_', '\_') fhdl_dict['tex'].write('\depedge[edge above, edge style={red}]' + '{' + str(dep_rel_indexes[i][0]) + '}' + '{' + str(dep_rel_indexes[i][1]) + '}' + '{' + dep_rel_indexes[i][2] + '}' + '\n') return def convert_dm_erg_tok(self, fi, words_list, words_correct_case_dict,pos_tag_dict, lemma_dict, eds_dep_indexes, mrs_derived_sent_root, args_dict, output_flags, fhdl_dict): #print 'WORDS LIST' #pprint(words_list) #print 'POS TAG DICT' #pprint(pos_tag_dict) #print 'LEMMA DICT' #pprint(lemma_dict) #print 'MRS DERIVED SENT ROOT' #pprint(mrs_derived_sent_root) #print 'EDS DEP INDEXES' #pprint(eds_dep_indexes) #if there is no root, then our variable 
mrs_derived_sent_root = 0 #if there is a root, its index is in a string format and is calculated starting from 1: #Example: #'This is the most common conception...' # mrs_derived_sent_root = '2' (the copula 'is' is the root in this case) #words_list has a format: #['"this"|||||||192', # '"is"|||||||149', # '"the"|||||||151', # '"most"|||||||175', # '"common"|||||||153', # '"conception,"|||||||169', if output_flags['dm']: fhdl_dict['dm'].write('#' + str(fi) + '\n') if mrs_derived_sent_root != '': word = words_correct_case_dict[words_list[int(mrs_derived_sent_root) - 1]] if output_flags['dm']: fhdl_dict['dm'].write('ROOT' + '\t' + '_' + '\t' + 'ROOT' + '\t' + 'ROOT' + '\t' + '-1' + '\t' + 'ROOT' + '\t' + 'ROOT' + '\t' + word + '\t' + '_' + '\t' + pos_tag_dict[int(mrs_derived_sent_root) - 1] + '\t' + pos_tag_dict[int(mrs_derived_sent_root) - 1] + '\t' + str(int(mrs_derived_sent_root) - 1) + '\n') if output_flags['tex']: fhdl_dict['tex'].write('\deproot[edge style={blue}]{' + mrs_derived_sent_root + '}' + '{root}\n') for i in range(0, len(eds_dep_indexes)): head_word_index = eds_dep_indexes[i][0] - 1 head_word = words_correct_case_dict[words_list[head_word_index]] head_word_pos = pos_tag_dict[head_word_index] head_lemma = lemma_dict[head_word_index] dep_word_index = eds_dep_indexes[i][1] - 1 dep_word = words_correct_case_dict[words_list[dep_word_index]] dep_word_pos = pos_tag_dict[dep_word_index] dep_lemma = lemma_dict[dep_word_index] label = eds_dep_indexes[i][2] if output_flags['dm']: fhdl_dict['dm'].write(head_word + '\t' + "_" + '\t' + head_word_pos + '\t' + head_word_pos + '\t' + str(head_word_index) + '\t' + label + '\t' + label + '\t' + dep_word + '\t' + "_" + '\t' + dep_word_pos + '\t' + dep_word_pos + '\t' + str(dep_word_index) + '\n') if output_flags['tex']: #escape underscore in latex eds_dep_indexes[i][2] = eds_dep_indexes[i][2].replace('_', '\_') fhdl_dict['tex'].write('\depedge[edge style={blue}]' + '{' + str(eds_dep_indexes[i][0]) + '}' + '{' + str(eds_dep_indexes[i][1]) + '}' + '{' + eds_dep_indexes[i][2] + '}' + '\n') if output_flags['dm']: fhdl_dict['dm'].write('\n') return #FUNCTION6: print both DT and DM in one file in CoNLL08 formatted file # fi - The name of the original file (e.g. "20201001") # words_list - Word tokens of the sentence # pos_tag_dict - Part-of-speech tags for each word token # lemma_dict - Lemma for each word token # dt_sent_root - Sentence root in DT representation # dt_dep_rel_indexes - Dependencies in DT representation # output_hdl - Output file, where we write DT and DM in ConLL08 form # dm_sent_root_mrs_derived - Sentence root in DM representation # dm_eds_dep_indexes - Dependencies in DM representation # eds_relation_pos_dict - Dictionary of predicates that maps predicate position in the sentence to its name, # e.g. 
16 -> 'named', 17 -> '_firm_n_1' def convert_dtm_erg_tok(self, fi, words_list, words_correct_case_dict,pos_tag_dict, lemma_dict, dt_sent_root, dt_dep_rel_indexes, dm_sent_root, dm_eds_dep_indexes, eds_relation_pos_dict, args_dict, output_flags, fhdl_dict): # We keep the name of the original file so that we could find source files for sentences with interesting fenomena fhdl_dict['dtm'].write('#' + str(fi) + '\n') #FIRST WORK ON DM REPRESENTATION #Sort relations by position in the sentence # Now they are unsorted in the dictionary eds_relation_pos_dict sorted_eds_relation_pos_list = sorted(eds_relation_pos_dict.keys()) #print("eds_relation_pos_dict") #pprint(eds_relation_pos_dict) #print "Sorted EDS RELATION POS LIST" #pprint(sorted_eds_relation_pos_list) #Rearrange the array for DM dm_dependencies = collect_arguments_for_dm_predicates(dm_eds_dep_indexes, eds_relation_pos_dict, sorted_eds_relation_pos_list) #print 'DM DEPENDENCIES' #pprint(dm_dependencies) # NOW WORK WITH DT REPRESENTATION #we have a list dt_dep_rel_indexes that contains lists in it. Each of the inner lists represents head, dependent, label. # but when we write a sentence in CoNLL format, for each word we need its head and label. #so we convert our list into a dictionary where keys are dependents and values are lists (in the Derivation Tree one dependent can have only one head) that represent head and label dt_dep_indexes_dict = create_dep_indexes_dict(dt_dep_rel_indexes) #NOW LOOP OVER ALL WORDS IN THE SENTENCE # COLLECT DT AND DM INFORMATION AND PRINT IT out for i in range(1, len(words_list) + 1): dt_head = '_' dt_label = '_' if i in dt_dep_indexes_dict.keys(): dt_head = dt_dep_indexes_dict[i][0] dt_label = dt_dep_indexes_dict[i][1] elif i == dt_sent_root: dt_head = 0 dt_label = 'ROOT' #print '\n\n ELEMENT OF WORDS_LIST: ', words_list[i-1] word = words_correct_case_dict[words_list[i-1]] #print 'WORD: ', word, '\n' #NOW COLLECT information about DM #by default dm_predicate is "_" # and for all predicate current word is not an argument by default ("_") dm_predicate = "_" if i in eds_relation_pos_dict: #if current word is a predicate dm_predicate = eds_relation_pos_dict[i] #if current word is the root, we add "^" symbol to the predicate to mark that it is a root, e.g. 
#^_ (empty predicate, ) if not(dm_sent_root == ''): if i == int(dm_sent_root): dm_predicate = "^" + dm_predicate dm_is_argument = "\t".join(["_"] * len(sorted_eds_relation_pos_list)) if i in dm_dependencies: dm_is_argument = "\t".join(dm_dependencies[i]) fhdl_dict['dtm'].write(str(i) + '\t' + word + '\t' + lemma_dict[i-1] + '\t' + pos_tag_dict[i-1] + '\t' + pos_tag_dict[i-1] + '\t' + '_' + '\t' + str(dt_head) + '\t' + dt_label + '\t' + '_' + '\t' + '_' + '\t' + dm_predicate + '\t' + dm_is_argument +'\n') fhdl_dict['dtm'].write('\n') return def analyze_input_file_erg_tok(self, fi, file_index, current_latex_file_index, latex_doc_start, fhdl_dict, args_dict, output_flags, rule_head_dict, relations_collection, eds_dep_labels_dict): if output_flags['tex']: file_index = file_index + 1 if file_index % 50 == 0: fhdl_dict['tex'].write('\end{document}') fhdl_dict['tex'].close() current_latex_file_index = current_latex_file_index + 1 fhdl_dict['tex'] = open(args_dict['tex'] + os.path.basename(os.path.normpath(args_dict['data'])) + "_erg_tok_" + str(current_latex_file_index) + '.tex', 'w') fhdl_dict['tex'].write(latex_doc_start) print >> sys.stderr, fi current_file = gzip.open(args_dict['data'] + '/' + str(fi) + '.gz', 'rb') derivation_tree = '' cfg_tree = '' read_tree = 0 after_derivation_tree = 0 previous_line_empty = 0 read_eds = 0 eds = '' search_original_sentence = 1 search_ptb_tokenization = 1 search_erg_tokenization = 0 read_ptb_tokenization = 0 read_erg_tokenization = 0 original_sentence = '' read_done = 0 partial_line = '' #sometimes we have #("the" # 246 #instead of ("the" 246 #but we need this index to create unique identifiers for words token_index_missing = 0 # ERG tokenization. We collect only ID and token, e.g. ''' (235, 23, 24, <123:129>, 1, "Tunick", 0, "null") (302, 23, 24, <123:129>, 1, "Tunick", 0, "null", "NNP" 1.0) (315, 23, 24, <123:129>, 1, "tunick", 0, "null") ''' #erg_tok_dict['<123:129>'] = ["Tunick", "Tunick", "tunick"] erg_tok_dict = defaultdict(list) #reader = codecs.getreader("utf-8") #contents = reader( current_file ) #for line in contents: for line in current_file.readlines(): if read_done == 0: #line = line_from_gzip.decode('utf-8') line = line.strip() m = re.search('[^`]+`(.*)\'(?: \[.*\])?$', line) if m and search_original_sentence ==1: #original_sentence = m.group(1).lower() original_sentence = m.group(1) search_original_sentence = 0 search_tokenization = 1 elif previous_line_empty == 1 and search_ptb_tokenization == 1 and line == '<': read_ptb_tokenization = 1 search_ptb_tokenization = 0 elif read_ptb_tokenization == 1 and line == '>': read_ptb_tokenization = 0 search_erg_tokenization = 1 elif read_erg_tokenization == 1 and line == '>': read_erg_tokenization = 0 search_derivation_tree = 1 elif previous_line_empty == 1 and search_erg_tokenization == 1 and line == '<': read_erg_tokenization = 1 search_erg_tokenization = 0 elif read_erg_tokenization == 1 and line != '>': line_parts = line.split(', ') #we remove quotes from the string value that is why we use [1:-1] #we take <0:3> as the key and The as the value token_part = line_parts[5][1:-1] if '" "' in line_parts[5][1:-1]: token_parts = line_parts[5][1:-1].split('" "') token_part = token_parts[0] #WeScience, English MRS Test Suite: #(1, 0, 1, <0:3>, 1, "The", 0, "null") #PEST AND deepbank: #(42, 0, 1, <0:1>, 1, "The" "The", 0, "null", "DT" 1.0) #we decided that the tokens' positions are in the third column (tokens positions start from 1) erg_tok_dict[line_parts[3]].append(token_part) elif line == '(ROOT_STRICT' or 
line == '(ROOT_INFORMAL' or line == '(ROOT_FRAG' or line == '(ROOT_INFFRAG' or line == '(ROOT_SPOKEN_FRAG' or line == '(ROOT_SPOKEN': read_tree = 1 derivation_tree = line + '\n' cfg_tree = cfg_tree + line + ' '; previous_line_empty = 0 read_eds = 0 elif len(line) == 0: previous_line_empty = 1 if read_tree == 1: cfg_tree = cfg_tree + "\n" read_tree = 0 after_derivation_tree = 1 elif read_eds == 1: read_eds = 0 read_done = 1 elif read_tree == 1: line = line.strip() if token_index_missing == 1 and re.match('^\d+$', line): line = partial_line + '|||||||' + line derivation_tree = derivation_tree + line + '\n' # For the CFG tree we cut the opening bracket before the word (line[0] = "(" always) cfg_tree = cfg_tree + line[1:] #print("Line restored: " + line) token_index_missing = 0 partial_line = '' else: #this is a pattern for lines like ("understood" 264 match_pattern = re.match('(^\(")([^"]+)("\s+\d+$)', line) #this is a pattern when we have #("the" # 246 #and currently are analyzing line ("the" match_pattern2 = re.match('(^\(")([^"]+)("$)', line) #this one is to detect rule or supertag match_pattern3 = re.match("^\(", line) #this one is to detect the end of "token" string match_pattern4 = re.search("\"(\))+\)$", line) if match_pattern and match_pattern.group(1) and match_pattern.group(2) and match_pattern.group(3): line_begin = match_pattern.group(1) line_content = match_pattern.group(2) line_end = match_pattern.group(3) #print 'LINE ORIGINAL ', line #print 'LINE BEGIN ', line_begin #print 'LINE CONTENT ', line_content #print 'LINE END ', line_end line_content = line_content.replace('(', '<<<<<<<') line_content = line_content.replace(')', '>>>>>>>') #multiword expression if ' ' in line_content: line_content = '+'.join(line_content.split(' ')) line = line_begin + line_content + line_end line = '|||||||'.join(line.split(' ')) derivation_tree = derivation_tree + line + '\n' # For the CFG tree we cat the opening bracket before the word (line[0] = "(" always) cfg_tree = cfg_tree + line[1:] + '\n' #this is the case when the line is incomplete because the indificator is on the next line elif match_pattern2 and match_pattern2.group(1) and match_pattern2.group(2) and match_pattern2.group(3): line_begin = match_pattern2.group(1) line_content = match_pattern2.group(2) line_end = match_pattern2.group(3) #print 'LINE ORIGINAL ', line #print 'LINE BEGIN ', line_begin #print 'LINE CONTENT ', line_content #print 'LINE END ', line_end line_content = line_content.replace('(', '<<<<<<<') line_content = line_content.replace(')', '>>>>>>>') #multiword expression if ' ' in line_content: line_content = '+'.join(line_content.split(' ')) line = line_begin + line_content + line_end partial_line = line token_index_missing = 1 else: original_line = line #rule or supertag if match_pattern3: rule_line = line cfg_rule = rule_line if re.search("\s", rule_line): line_parts = re.split("\s+", rule_line) rule_line = line_parts[1] rule_line = rule_line.replace("-", ":") cfg_rule = rule_line #if it is a supertag if rule_line.find("/") != -1: line_parts = rule_line.split("/") rule_line = line_parts[1] cfg_rule = rule_line #cfg_tree = cfg_tree + "(" + rule_line + " " #in the new format supertags are separated with @ elif rule_line.find("@")!=-1: line_parts = rule_line.split("@") rule_line = line_parts[1] cfg_rule = rule_line #cfg_tree = cfg_tree + "(" + rule_line + " " #if it is a rule else: # If the rule is marked with "^", we remove "^" if re.match("^\^", rule_line): rule_line = rule_line[1:] cfg_rule = rule_line #cfg_tree = 
cfg_tree + "(" + rule_line_full + " " if rule_line.find("_")!=-1: line_parts = rule_line.split("_") rule_line = line_parts[0] #print("rule or supertag: " + rule_line) rule_line = "(" + rule_line + " "; cfg_tree = cfg_tree + "(" + cfg_rule + " " #here we take care about the lines like "token [ +CARG #1=\"the\" +CLASS alphabetic [ +CASE non_capitalized+lower +INITIAL - ] +FORM #1 +FROM \"175\" +ID *diff-list* [ LAST #2=*top* LIST *cons* [ FIRST \"25\" REST #2 ] ] +PRED predsort +TNT null_tnt [ +MAIN tnt_main [ +PRB \"1\" +TAG \"DT\" ] +PRBS *null* +TAGS *null* ] +TO \"178\" +TRAIT native_trait ]")))) #they should not contain brackets in the middle for match_pattern1 in re.finditer('(\\\\\")([^"]+)(\\\\\")',line): #print 'MATCH_PATTERN1.group ', match_pattern1.group() if '(' in match_pattern1.group(0) or ')' in match_pattern1.group(0): line_begin = match_pattern1.group(1) line_content = match_pattern1.group(2) line_end = match_pattern1.group(3) #print 'LINE ORIGINAL ', line #print 'LINE BEGIN ', line_begin #print 'LINE CONTENT ', line_content #print 'LINE END ', line_end line_content = line_content.replace('(', '<<<<<<<') line_content = line_content.replace(')', '>>>>>>>') #print 'substitute ', match_pattern1.group(0), ' with ', line_begin + line_content + line_end line = re.sub(re.escape(match_pattern1.group(0)), line_begin + line_content + line_end, line, 1) line = '|||||||'.join(line.split(' ')) #print 'LINE FINAL ', line, '\n\n' derivation_tree = derivation_tree + line + '\n' # Here we have smth like ")))) at the end of the string that contains "token " if match_pattern4: # since the regex match_pattern4 worked, we know there is quote in the string line_parts = original_line.split("\""); closing_line = line_parts[len(line_parts) - 1] #cut off the last bracket because it is for the word #EXAMPLE #(335 of_poss/p_np_i-nm-poss_le -0.0241179 8 9 #("of" 101 #"token [ ...]")) #becomes: #(p_np_i-nm-poss_le of) cfg_tree = cfg_tree + closing_line[:-1] + " " previous_line_empty = 0 read_eds = 0 elif after_derivation_tree == 1 and previous_line_empty == 1 and re.match('^\{.+',line) and (line[-1] == ':' or re.search('\:\s+\(fragmented\)', line) or re.search('\:\s+\(cyclic\)', line) or re.search('\:\s+\(cyclic fragmented\)', line)): eds = eds + line + '\n' read_eds = 1 previous_line_empty = 0 after_derivation_tree = 0 elif read_eds == 1: eds = eds + line + '\n' previous_line_empty =0 current_file.close() #print("CFG Tree") try: t_cfg = Tree.parse(cfg_tree) except ValueError: raise ValueError("Missing or incorrect derivation tree!") #if fhdl_dict['log'] is not None: # fhdl_dict['log'].write(str(fi) + "\t" + "Missing or incorrect derivation tree\n") #return if self.cfg_no_unary_rules == 1: t_cfg.collapse_unary(True, False, '///////') self.traverse_cfg(t_cfg) t_cfg = Tree.parse(self.t_cfg_traversed) #print(t_cfg) #t_cfg.draw() #t_cfg.draw() #print("ERG TOK DICTIONARY") #print(erg_tok_dict) #print('\n\n') #print 'DERIVATION TREE:\n\n' #print derivation_tree + '\n\n' try: t = Tree.parse(derivation_tree) except ValueError: print "Missing or incorrect derivation tree!" 
#if fhdl_dict['log'] is not None: # fhdl_dict['log'].write(str(fi) + "\t" + "Missing or incorrect derivation tree\n") return t.collapse_unary(True, True, '///////') #t.draw() traverse_results = self.my_traverse(t, [], [], {}, {}, {},{}, {}, erg_tok_dict, 'erg') first_dep = '' sent_root = '' if len(traverse_results[0]) > 0: first_dep = traverse_results[0][0] words_list = traverse_results[1] dep_dict = traverse_results[2] words_pos_dict = traverse_results[3] #print 'WORDS POS DICT' #pprint(words_pos_dict) #print 'WORDS LIST' #pprint(words_list) pos_tag_dict = traverse_results[4] #print('POS TAG DICT') #pprint(pos_tag_dict) #print('\n') lemma_dict = traverse_results[5] words_correct_case_dict = traverse_results[6] #print('CASE_DICT') #pprint(case_dict) ''' #Search for cases where id (e.g. <13:25>) corresponds to more than one token and those tokens are not #hyphen-separated (e.g. #(344, 7, 8, <44:60>, 1, "Macmillan", 0, "null", "NNP" 0.9881) #(372, 8, 9, <44:60>, 1, "/", 0, "null") #(264, 9, 10, <44:61>, 1, "McGraw,", 0, "null") for key in case_dict: #print(words_list[words_pos_dict[key]-1]) if len(case_dict[key]) > 1 and not re.search("-", words_list[words_pos_dict[key][0]-1]): print(key + "\t" + str(case_dict[key])) print 'WORDS POS DICT' pprint(words_pos_dict) print 'WORDS LIST' pprint(words_list) ''' indexes = range(1,len(words_list)+1) #print('\n\n WORDS LIST\n') #pprint(words_list) #print '====================\n\n\n DEPENDENCY DICTIONARY\n\n' #pprint(dep_dict) #print '\n\n First dependency: ' + first_dep + '\n\n' sentence = '' # erg_tok_dict contains all tokens in ERG tokenization (some of them could have correct upper/lower case) # case_dict contains information about the case from the derivation tree (e.g. +CASE capitalized+lower ) #words_correct_case_dict = self.correct_case_erg_tok(words_pos_dict, words_list, erg_tok_dict, case_dict) for i in range(1,len(words_list)+1): word = words_correct_case_dict[words_list[i-1]] sentence = sentence + word + ' ' sentence = sentence.strip() #sentence = sentence.lower() words_dict = dict(zip(words_list, indexes)) #word_pos_dict = dict(zip(word_pos_list, indexes)) #print '\n\n WORDS DICTIONARY\n' #pprint(words_dict) #print '\n\n WORDS POSITION DICTIONARY\n' #pprint(word_pos_dict) #print original_sentence + '\n' #print sentence + '\n' #if original_sentence != sentence: # continue dep_rel = [] dep_rel_indexes = [] sent_root = '' if first_dep != '': dep_rel.append(first_dep) #print 'WORDS DICT' #pprint(words_dict) dep_rel_indexes = self.dep_expansion_erg_tok(fi, rule_head_dict, dep_rel, words_dict, dep_dict, []) #print("Dependency relations dictionary") #pprint(dep_rel_indexes) sent_root = words_dict[first_dep] #one-word sentence: the word should be the root elif first_dep == '' and len(words_dict.keys()) == 1: sent_root = words_dict[words_dict.keys()[0]] sentence = escape_sent(sentence) words_in_sent = sentence.split(' ') #print eds + '\n' read_eds_results = self.read_eds_into_dict(eds, relations_collection) eds_root = read_eds_results[0] eds_dict = read_eds_results[1] dict_id_pos = read_eds_results[2] transparent_dict = read_eds_results[3] eds_relation_dict = read_eds_results[4] #print '\n\n ROOT:' + eds_root + '\n' #print 'EDS_DICT' #pprint(eds_dict) #print '\n\n' #print 'TRANSPARENT DICTIONARY' #pprint(transparent_dict) #print '\n\n' (eds_dep_indexes, eds_relation_pos_dict) = self.eds_expansion_erg_tok(eds_dict, words_pos_dict, dict_id_pos, transparent_dict, eds_relation_dict) #print 'EDS DEP INDEXES' #pprint(eds_dep_indexes) #print '\n\n' 
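# ----------------------------------------------------------------------
# Illustrative sketch only: how the tokenized surface sentence and the
# word -> 1-based index mapping are rebuilt from the traversal results
# above (words_list keeps the '"word"|||||||id' leaf identifiers,
# words_correct_case_dict maps them to their case-corrected surface
# forms).  The helper name and the toy inputs are hypothetical.
def _rebuild_sentence_sketch(words_list, words_correct_case_dict):
    sentence = ' '.join(words_correct_case_dict[w] for w in words_list)
    words_dict = dict(zip(words_list, range(1, len(words_list) + 1)))
    return sentence, words_dict
# _rebuild_sentence_sketch(['"he"|||||||429', '"left"|||||||430'],
#                          {'"he"|||||||429': 'He', '"left"|||||||430': 'left'})
# ->  ('He left', {'"he"|||||||429': 1, '"left"|||||||430': 2})
# ----------------------------------------------------------------------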
#DEBUGGING 13/03/2013 #print 'EDS RELATION POS DICT' #pprint(eds_relation_pos_dict) #print('\n\n') for dependency_relation in eds_dep_indexes: eds_dep_labels_dict[dependency_relation[2]] = 1 #print('eds_dep_labels_dict') #pprint(eds_dep_labels_dict) #print('\n\n') if output_flags['tex']: fhdl_dict['tex'].write('\\begin{center}\n' + '\\begin{dependency}[edge below]\n' + '\\begin{deptext}[column sep=.05cm]\n') fhdl_dict['tex'].write(words_in_sent[0]) for i in range(1, len(words_in_sent)): fhdl_dict['tex'].write(' \& ' + words_in_sent[i]) fhdl_dict['tex'].write('\\\\\n' + '\end{deptext}\n') #print '\n\n SENT ROOT ' + str(sent_root) + '\n\n' #if sent_root != '': #latex_output_fh.write('\deproot[edge above, edge style={red}]{' + str(sent_root) + '}' + '{root}\n') mrs_derived_sent_root = '' if eds_root in eds_dict and eds_root in dict_id_pos: if dict_id_pos[eds_root] in words_pos_dict: #we have to choose intermediate dictionaries as well # {e2: # e2:implicit_conj<5:173>[L-INDEX e7, R-INDEX e6, L-HNDL e7, R-HNDL e36] # e7:unknown<5:50>[] #Another problematic example #{e2: #e2:implicit_conj<78:218>[L-INDEX e21, R-INDEX e57, L-HNDL e21, R-HNDL e57] # e21:loc_nonsp<16:77>[ARG1 x5, ARG2 x22] # We have no root in the end mrs_derived_sent_root = str(words_pos_dict[dict_id_pos[eds_root]][0]) # latex_output_fh.write('\deproot[edge style={blue}]{' + mrs_derived_sent_root + '}' + '{root}\n') #print 'WORDS POS DICT' #pprint(words_pos_dict) #print("Output flags") #pprint(output_flags) if output_flags['dt'] or output_flags['tex']: self.convert_dt_erg_tok(fi, words_list, words_correct_case_dict,pos_tag_dict, lemma_dict, sent_root, dep_rel_indexes, args_dict, output_flags, fhdl_dict) if output_flags['dm'] or output_flags['tex']: self.convert_dm_erg_tok(fi, words_list, words_correct_case_dict, pos_tag_dict, lemma_dict, eds_dep_indexes, mrs_derived_sent_root, args_dict, output_flags, fhdl_dict) if output_flags['dtm']: self.convert_dtm_erg_tok(fi, words_list, words_correct_case_dict, pos_tag_dict, lemma_dict, sent_root, dep_rel_indexes, mrs_derived_sent_root, eds_dep_indexes, eds_relation_pos_dict, args_dict, output_flags, fhdl_dict) #sent_tok MUST preceed 'cfg' because in 'cfg' the tree is changed if output_flags['sent_tok']: self.extract_tok_sent_erg_tok(fi, t_cfg, words_correct_case_dict, fhdl_dict) if output_flags['cfg']: self.convert_cfg(fi, t_cfg, words_correct_case_dict, fhdl_dict) if output_flags['tex']: fhdl_dict['tex'].write('\end{dependency}\n' + '\\\\\n' + str(fi) + '.gz' + '\end{center}' + '\n\\newpage') return [file_index, current_latex_file_index, fhdl_dict, eds_dep_labels_dict,words_correct_case_dict] def traverse_cfg(self,t_cfg): try: t_cfg.node except AttributeError: self.t_cfg_traversed = self.t_cfg_traversed + t_cfg else: #print("Node: " + t_cfg.node) #print(self.t_cfg_traversed) # Now we know that t.node is defined node = t_cfg.node if '///////' in node: node_parts = t_cfg.node.split('///////') node = node_parts[len(node_parts)-1] self.t_cfg_traversed = self.t_cfg_traversed + '(' + node + " " #print(t_cfg_traversed) for child in t_cfg: self.traverse_cfg(child) self.t_cfg_traversed = self.t_cfg_traversed + ')' def extract_lowercased_token_erg_tok(self, parent): # e.g. 
parent = '"because+of"|||||||399' # ex1) elem_of_words_list = ['"he"', 429] # ex2) elem_of_words_list = ['"because+of"',399] elem_of_words_list = parent.split('|||||||') # ex1) word_lowercase = '"he"' # ex2) word_lowercase = '"because+of"' word_lowercase = elem_of_words_list[0] # ex1) clean_word_lowercase = 'he' # ex2) clean_word_lowercase = 'because+of' clean_word_lowercase = word_lowercase[1:len(word_lowercase)-1] #print(clean_word_lowercase) return clean_word_lowercase def find_case_info_for_multiword_expr_erg_tok(self, t, multiword_parts): # t is sort of array that contains information under leaves #e.g. t= #(1104|||||||more_than_adv1/av_-_i-vp-pr_le|||||||0.257395|||||||17|||||||19///////"more+than"|||||||299 #"token|||||||[|||||||+CARG|||||||#1=\"more\"|||||||+CLASS|||||||alphabetic|||||||[|||||||+CASE|||||||non_capitalized+lower|||||||+INITIAL|||||||-|||||||]|||||||+FORM|||||||#1|||||||+FROM|||||||\"116\"|||||||+ID|||||||*diff-list*|||||||[|||||||LAST|||||||#2=*top*|||||||LIST|||||||*cons*|||||||[|||||||FIRST|||||||\"19\"|||||||REST|||||||#2|||||||]|||||||]|||||||+PRED|||||||predsort|||||||+TICK|||||||bool|||||||+TNT|||||||null_tnt|||||||[|||||||+MAIN|||||||tnt_main|||||||[|||||||+PRB|||||||\"0.52686469999999996\"|||||||+TAG|||||||\"RBR\"|||||||]|||||||+PRBS|||||||*null*|||||||+TAGS|||||||*null*|||||||]|||||||+TO|||||||\"120\"|||||||+TRAIT|||||||token_trait|||||||[|||||||+HD|||||||token_head|||||||+IT|||||||italics|||||||+LB|||||||bracket_null|||||||+RB|||||||bracket_null|||||||+UW|||||||-|||||||]|||||||]" #286 #"token|||||||[|||||||+CARG|||||||#1=\"than\"|||||||+CLASS|||||||alphabetic|||||||[|||||||+CASE|||||||non_capitalized+lower|||||||+INITIAL|||||||-|||||||]|||||||+FORM|||||||#1|||||||+FROM|||||||\"121\"|||||||+ID|||||||*diff-list*|||||||[|||||||LAST|||||||#2=*top*|||||||LIST|||||||*cons*|||||||[|||||||FIRST|||||||\"20\"|||||||REST|||||||#2|||||||]|||||||]|||||||+PRED|||||||predsort|||||||+TICK|||||||bool|||||||+TNT|||||||null_tnt|||||||[|||||||+MAIN|||||||tnt_main|||||||[|||||||+PRB|||||||\"1\"|||||||+TAG|||||||\"IN\"|||||||]|||||||+PRBS|||||||*null*|||||||+TAGS|||||||*null*|||||||]|||||||+TO|||||||\"125\"|||||||+TRAIT|||||||token_trait|||||||[|||||||+HD|||||||token_head|||||||+IT|||||||italics|||||||+LB|||||||bracket_null|||||||+RB|||||||bracket_null|||||||+UW|||||||-|||||||]|||||||]") #output list case_list = [] j = 0 for i in range(len(t)): m_start = re.search("\+FROM\|\|\|\|\|\|\|(#\d+=)*\\\\\"(\d+)\\\\\"", str(t[i])) m_end = re.search("\+TO\|\|\|\|\|\|\|\\\\\"(\d+)\\\\\"", str(t[i])) m_case = re.search("\+CASE\|\|\|\|\|\|\|([^\|]+)\|\|\|\|\|\|\|", str(t[i])) if m_start and m_end: start = m_start.group(2) end = m_end.group(1) #By default the case for the found token will be not specified (empty string) if m_case: case_list.append(["<" + start+ ":" + end + ">", multiword_parts[j], m_case.group(1)]) else: case_list.append(["<" + start+ ":" + end + ">", multiword_parts[j], ""]) j+= 1 return case_list def analyze_case_descr_erg_tok(self, erg_tok_dict, case_list): for i in range(len(case_list)): token_from_dertree = case_list[i][1] case_descr = case_list[i][2] if case_descr == "non_capitalized+lower": case_list[i][1] = token_from_dertree.lower() elif case_descr == "capitalized+lower": # Difficult cases: token starts with punctuation # Example: “david # In this case we capitalize not the first letter, # but the first letter after all the punctuation symbols letter_pos = 0 while letter_pos < len(token_from_dertree) and self.is_punctuation_erg_tok(token_from_dertree[letter_pos]) 
== 1: letter_pos +=1 #This bit contains punctuation that preceeds capitalized letter, e.g. in the # case of token “david. # We keep punctuation string empty by default punctuation_str = "" if letter_pos > 0: punctuation_str = token_from_dertree[:letter_pos] #if there are letters after the one that should be capitalized if len(token_from_dertree) -1 > letter_pos: case_list[i][1] = punctuation_str + token_from_dertree[letter_pos].upper() + token_from_dertree[letter_pos+1:].lower() # if there are no more letters after the one that should be capitalized elif len(token_from_dertree) -1 == letter_pos: case_list[i][1] = punctuation_str + token_from_dertree[letter_pos].upper() #print(word_lowercase_unique_id + ": " + words_correct_case_dict[word_lowercase_unique_id]) elif case_descr == "capitalized+upper": case_list[i][1] = token_from_dertree.upper() #print(word_lowercase_unique_id + ": " + words_correct_case_dict[word_lowercase_unique_id]) else: # If we have capitalization of the first letter, we should change default # However, theses cases are not clear about the other letters in the word, # so we still have to look at the ERG tokenization if re.match("^capitalized", case_descr): # Difficult cases: token starts with punctuation # Example: “david # In this case we capitalize not the first letter, # but the first letter after all the punctuation symbols letter_pos = 0 while self.is_punctuation_erg_tok(token_from_dertree[letter_pos]) == 1: letter_pos +=1 #This bit contains punctuation that preceeds capitalized letter, e.g. in the # case of token “david. # We keep punctuation string empty by default punctuation_str = "" if letter_pos > 0: punctuation_str = token_from_dertree[:letter_pos] #if there are letters after the one that should be capitalized if len(token_from_dertree) -1 > letter_pos: case_list[i][1] = punctuation_str + token_from_dertree[letter_pos].upper() + token_from_dertree[letter_pos+1:].lower() # if there are no more letters after the one that should be capitalized elif len(token_from_dertree) -1 == letter_pos: case_list[i][1] = punctuation_str + token_from_dertree[letter_pos].upper() #print(case_descr + ";" + word_lowercase_unique_id + ": " + words_correct_case_dict[word_lowercase_unique_id]) #If there are no clear records (e.g. the case is mixed) about the case of the token in the derivation tree, # look at ERG tokenization. #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! #TODO: CORRECT FINUCTION find_correct_case_in_erg_tok !!!!!!!!!! #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! case_list[i][1] = self.find_correct_case_in_erg_tok(erg_tok_dict, case_list[i][0], case_list[i][1]) return case_list def find_correct_case_in_erg_tok(self, erg_tok_dict, character_based_id, token_from_dertree): ''' ERG TOK DICTIONARY {'<45:53>': ['American', 'American', 'american'], '<16:20>': ['U.S.', 'U.S.', 'u.s.'], '<176:179>': ['the'], '<74:76>': ['on'], '<180:183>': ['U.', 'U.', 'u.'], ...} ''' number_of_cap_letters_in_token_from_dertree = sum(x.isupper() for x in token_from_dertree) if character_based_id in erg_tok_dict: # e.g. 
if token_id = "<0:2>", then token_variants = ['he'] token_variants = erg_tok_dict[character_based_id] # If we find a variant of the word from erg_tok_dict with capitalization, we choose # capitalization, rather than lowercase version # (so we return this variant) for variant in token_variants: number_of_cap_letters_in_variant = sum(x.isupper() for x in variant) if len(variant) == len(token_from_dertree) and variant != token_from_dertree and variant.lower() == token_from_dertree.lower() and number_of_cap_letters_in_variant > number_of_cap_letters_in_token_from_dertree: return variant return token_from_dertree def convert_cfg(self, fi, t_cfg, words_correct_case_dict, fhdl_dict): for pos in t_cfg.treepositions('leaves'): t_cfg[pos] = t_cfg[pos].replace('_______', ' ').replace('<<<<<<<', '(').replace('>>>>>>>', ')') t_cfg[pos] = words_correct_case_dict[t_cfg[pos]].replace("/", "\/").replace("*", "\*").replace("(", "-LRB-").replace(")", "-RRB-").replace("“", "``").replace("”", "''").replace("‘", "`").replace("’", "'") fhdl_dict['cfg'].write('#' + str(fi) + "\t" + "( " + t_cfg._pprint_flat(nodesep='', parens='()', quotes=False) + " )" +"\n") #print("( " + t_cfg._pprint_flat(nodesep='', parens='()', quotes=False) + " )" +"\n") return def extract_tok_sent_erg_tok(self, fi, t_cfg, words_correct_case_dict, fhdl_dict): tree_leaves = t_cfg.leaves() sent = "" for token in tree_leaves: token = token.replace('_______', ' ').replace('<<<<<<<', '(').replace('>>>>>>>', ')') token = words_correct_case_dict[token].replace("/", "\/").replace("*", "\*").replace("(", "-LRB-").replace(")", "-RRB-").replace("“", "``").replace("”", "''").replace("‘", "`").replace("’", "'") sent += token + " " sent = sent.strip() fhdl_dict['sent_tok'].write('#' + str(fi) + "\t" + sent + "\n") return def break_key_into_start_end(key): # key is "<0:2>" # key_parts = ["<0", "2>"] key_parts = key.split(':') # now we extract "0" from "<0" start = int(key_parts[0][1:]) # now we extract "2" from "2>" end = int(key_parts[1][:-1]) return [start, end] def create_dep_indexes_dict (dt_dep_rel_indexes): dep_indexes_dict = {} for i in range(0,len(dt_dep_rel_indexes)): head = dt_dep_rel_indexes[i][0] dependent = dt_dep_rel_indexes[i][1] label = dt_dep_rel_indexes[i][2] dep_indexes_dict[dependent] =[head,label] return dep_indexes_dict def escape_sent (sentence): sentence = sentence.replace('%', '\%') sentence = sentence.replace('$', '\$') sentence = sentence.replace('&', 'and') sentence = sentence.replace('#', '\#') sentence = sentence.replace('{', '\{') sentence = sentence.replace('}', '\}') sentence = sentence.replace('[', '$[$') sentence = sentence.replace(']', '$]$') return sentence def collect_arguments_for_dm_predicates(dm_eds_dep_indexes, eds_relation_pos_dict, sorted_eds_relation_pos_list): dm_dependencies = {} for i in range(0, len(dm_eds_dep_indexes)): dm_head_word_index = dm_eds_dep_indexes[i][0] dm_dep_word_index = dm_eds_dep_indexes[i][1] dm_label = dm_eds_dep_indexes[i][2] #For Latex we escape "_" with a backslash in the label name, but now we are printing out CoNLL08 file, so we do not need to escape dm_label = dm_label.replace("\\", "") #Unless we already saw this dependent word before if not (dm_dep_word_index in dm_dependencies): # and by default there are no predicates for which current word is an argument dm_dependencies[dm_dep_word_index] = ["_"] * len(sorted_eds_relation_pos_list) #if the head is a predicate if dm_head_word_index in eds_relation_pos_dict: 
dm_dependencies[dm_dep_word_index][sorted_eds_relation_pos_list.index(dm_head_word_index)] = dm_label return dm_dependencies #=============== THE END ==================# if __name__=="__main__": Converter().run()
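# ----------------------------------------------------------------------
# Illustrative sketch only: when unary chains are collapsed with
# collapse_unary(..., '///////'), a node label packs the whole chain into
# one string joined by '///////'.  traverse_cfg() above keeps only the
# last (lowest) label of such a chain; the helper below isolates that
# step.  The helper name and the toy label are hypothetical.
def _lowest_label_sketch(node_label):
    if '///////' in node_label:
        return node_label.split('///////')[-1]
    return node_label
# _lowest_label_sketch('RULE_A///////RULE_B')  ->  'RULE_B'
# ----------------------------------------------------------------------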
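# ----------------------------------------------------------------------
# Illustrative sketch only: a simplified version of the 'capitalized+lower'
# branch of analyze_case_descr_erg_tok() above.  Leading punctuation
# (e.g. the opening quote in '“david') is skipped before the first
# letter is upper-cased and the rest is lower-cased.  The helper name and
# the is_punct predicate are hypothetical stand-ins for
# self.is_punctuation_erg_tok().
def _capitalize_after_punct_sketch(token, is_punct):
    pos = 0
    while pos < len(token) and is_punct(token[pos]):
        pos += 1
    if pos >= len(token):
        # the token consists of punctuation only
        return token
    return token[:pos] + token[pos].upper() + token[pos + 1:].lower()
# _capitalize_after_punct_sketch('“david', lambda ch: not ch.isalnum())  ->  '“David'
# ----------------------------------------------------------------------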
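# ----------------------------------------------------------------------
# Illustrative usage sketch only (never called): how
# collect_arguments_for_dm_predicates() lays out the per-token argument
# columns of the CoNLL08-style output.  Every dependent gets one '_'
# cell per predicate; the cell in the column of its head predicate is
# overwritten with the dependency label.  All toy values below are
# hypothetical (the values stored in eds_relation_pos_dict do not matter
# here, only membership of the head positions).
def _dm_argument_columns_demo():
    toy_deps = [[2, 1, 'ARG1'], [2, 3, 'ARG2']]   # [head, dependent, label]
    toy_pred_pos_dict = {2: None}                 # word 2 is the only predicate
    # returns {1: ['ARG1'], 3: ['ARG2']}: tokens 1 and 3 each fill the
    # single argument column belonging to the predicate at position 2
    return collect_arguments_for_dm_predicates(toy_deps, toy_pred_pos_dict, [2])
# ----------------------------------------------------------------------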