""" This module stores the deler configuration, and specified in the xml configuration files. See examples/html-wdc.xml for an example with documenting comments. """ from html import HtmlAccountedString, HtmlReduction import os from xml.etree import ElementTree class Config(object): def __init__(self, config, accounting, gml_mode, paragraph_mode, segmenter): super(Config, self).__init__() if os.path.exists(config): self.root = ElementTree.parse(config).getroot() # JR - removed for simplicity until other input possible # get the input format (only html in this version) # input_format = self.root.attrib['input'] # if input_format is None or input_format == 'html': # self.reduction_type = HtmlReduction # self.account_type = HtmlAccountedString # else: # raise ValueError('unrecognised input format ' + input_format) self.reduction_type = HtmlReduction self.account_type = HtmlAccountedString # check segmenter path references an executable if not os.path.exists(segmenter): raise ValueError(segmenter + ' does not exist') elif not os.access(segmenter, os.X_OK): raise ValueError(segmenter + ' is not executable') self.segmenter = segmenter self.accounting = accounting self.gml_mode = gml_mode self.paragraph_mode = paragraph_mode self.purge = _ElementSet(self.root, 'purge') self.map = _MapDict(self.root, 'map') self.mask = _ElementSet(self.root, 'mask') self.newline = _ElementSet(self.root, 'newline') self.non_closing = _ElementSet(self.root, 'non-closing') self.paragraph_like = _ElementSet(self.root, 'paragraph-like') self.unsegmentable = _ElementSet(self.root, 'unsegmentable') if self.gml_mode: for name, gml in self.map.items(): if gml is None: raise ValueError('gml-mode activated but not all map elements have a gml equivalent') else: raise ValueError(config + ' does not exist') class _ElementSet(set): """ A set of child elements with the specified name """ def __init__(self, root, name): super(_ElementSet, self).__init__() for element in root.findall(name): for child in element.findall('element'): self.add(child.attrib['name']) class _MapDict(dict): """ A dictionary of maps between element names and gml """ def __init__(self, root, name): super(_MapDict, self).__init__() for element in root.findall(name): for child in element.findall('element'): name = child.attrib['name'] gml = child.attrib['gml'] if 'gml' in child.attrib else None self[name] = gml