# Natural Language Toolkit: Dependency Corpus Reader
#
# Copyright (C) 2001-2012 NLTK Project
# Author: Kepa Sarasola
#         Iker Manterola
#
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

from nltk.parse import DependencyGraph
from nltk.tokenize import *

from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *


class DependencyCorpusReader(SyntaxCorpusReader):

    def __init__(self, root, fileids, encoding=None,
                 word_tokenizer=TabTokenizer(),
                 sent_tokenizer=RegexpTokenizer('\n', gaps=True),
                 para_block_reader=read_blankline_block):
        # The tokenizer and block-reader arguments are accepted for API
        # compatibility with other corpus readers, but are not used here:
        # DependencyCorpusView does its own tab/newline splitting.
        CorpusReader.__init__(self, root, fileids, encoding)

    #########################################################

    def raw(self, fileids=None):
        """
        :return: the given file(s) as a single string.
        :rtype: str
        """
        return concat([open(fileid).read()
                       for fileid in self.abspaths(fileids)])

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words.
        :rtype: list(str)
        """
        return concat([DependencyCorpusView(fileid, False, False, False)
                       for fileid in self.abspaths(fileids)])

    def tagged_words(self, fileids=None):
        """
        :return: the given file(s) as a list of (word, tag) tuples.
        :rtype: list(tuple(str, str))
        """
        return concat([DependencyCorpusView(fileid, True, False, False)
                       for fileid in self.abspaths(fileids)])

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of sentences, each encoded
            as a list of words.
        :rtype: list(list(str))
        """
        return concat([DependencyCorpusView(fileid, False, True, False)
                       for fileid in self.abspaths(fileids)])

    def tagged_sents(self, fileids=None):
        """
        :return: the given file(s) as a list of sentences, each encoded
            as a list of (word, tag) tuples.
        :rtype: list(list(tuple(str, str)))
        """
        return concat([DependencyCorpusView(fileid, True, True, False)
                       for fileid in self.abspaths(fileids)])

    def parsed_sents(self, fileids=None):
        """
        :return: the given file(s) as a list of DependencyGraph objects.
        :rtype: list(DependencyGraph)
        """
        sents = concat([DependencyCorpusView(fileid, False, True, True)
                        for fileid in self.abspaths(fileids)])
        return [DependencyGraph(sent) for sent in sents]


class DependencyCorpusView(StreamBackedCorpusView):
    _DOCSTART = '-DOCSTART- -DOCSTART- O\n'  # marks the start of a document

    def __init__(self, corpus_file, tagged, group_by_sent, dependencies,
                 chunk_types=None):
        self._tagged = tagged
        self._dependencies = dependencies
        self._group_by_sent = group_by_sent
        self._chunk_types = chunk_types
        StreamBackedCorpusView.__init__(self, corpus_file)

    def read_block(self, stream):
        # Read the next sentence.
        sent = read_blankline_block(stream)[0].strip()

        # Strip off the docstart marker, if present.
        if sent.startswith(self._DOCSTART):
            sent = sent[len(self._DOCSTART):].lstrip()

        # Extract the word and tag from any of the supported formats:
        # 3- and 4-column files put the word first and the tag second,
        # while 10-column CoNLL files put the word in column 2 and the
        # tag in column 5.  (When dependencies are requested, the
        # sentence is left as a raw string for DependencyGraph to parse.)
        if not self._dependencies:
            lines = [line.split('\t') for line in sent.split('\n')]
            if len(lines[0]) == 3 or len(lines[0]) == 4:
                sent = [(line[0], line[1]) for line in lines]
            elif len(lines[0]) == 10:
                sent = [(line[1], line[4]) for line in lines]
            else:
                raise ValueError('Unexpected number of fields in '
                                 'dependency tree file')

            # Discard the tags if they weren't requested.
            if not self._tagged:
                sent = [word for (word, tag) in sent]

        # Return the result.
        if self._group_by_sent:
            return [sent]
        else:
            return list(sent)
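

# ---------------------------------------------------------------------
# Minimal usage sketch (illustrative only): the corpus root and the
# file name 'sample.conll' below are hypothetical placeholders for any
# directory of tab-separated (3/4-column) or 10-column CoNLL files in
# which blank lines separate sentences.

if __name__ == '__main__':
    reader = DependencyCorpusReader('/path/to/corpus', ['sample.conll'])

    print(reader.words()[:10])         # flat list of tokens
    print(reader.tagged_words()[:10])  # flat list of (word, tag) pairs
    print(reader.sents()[0])           # first sentence, tokens only
    print(reader.tagged_sents()[0])    # first sentence, (word, tag) pairs

    # parsed_sents() wraps each sentence in a DependencyGraph.
    graph = reader.parsed_sents()[0]
    print(graph.tree())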