# # Copyright (C) 2001-2012 NLTK Project # Author: Masato Hagiwara # URL: # For license information, see LICENSE.TXT # For more information, see http://lilyx.net/pages/nltkjapanesecorpus.html import sys import util from nltk.corpus.reader.util import * from nltk.corpus.reader.api import * class ChasenCorpusReader(CorpusReader): def __init__(self, root, fileids, encoding=None, sent_splitter=None): self._sent_splitter = sent_splitter CorpusReader.__init__(self, root, fileids, encoding) def raw(self, fileids=None): if fileids is None: fileids = self._fileids elif isinstance(fileids, basestring): fileids = [fileids] return concat([self.open(f).read() for f in fileids]) def words(self, fileids=None): return concat([ChasenCorpusView(fileid, enc, False, False, False, self._sent_splitter) for (fileid, enc) in self.abspaths(fileids, True)]) def tagged_words(self, fileids=None): return concat([ChasenCorpusView(fileid, enc, True, False, False, self._sent_splitter) for (fileid, enc) in self.abspaths(fileids, True)]) def sents(self, fileids=None): return concat([ChasenCorpusView(fileid, enc, False, True, False, self._sent_splitter) for (fileid, enc) in self.abspaths(fileids, True)]) def tagged_sents(self, fileids=None): return concat([ChasenCorpusView(fileid, enc, True, True, False, self._sent_splitter) for (fileid, enc) in self.abspaths(fileids, True)]) def paras(self, fileids=None): return concat([ChasenCorpusView(fileid, enc, False, True, True, self._sent_splitter) for (fileid, enc) in self.abspaths(fileids, True)]) def tagged_paras(self, fileids=None): return concat([ChasenCorpusView(fileid, enc, True, True, True, self._sent_splitter) for (fileid, enc) in self.abspaths(fileids, True)]) class ChasenCorpusView(StreamBackedCorpusView): """ A specialized corpus view for ChasenReader. Similar to ``TaggedCorpusView``, but this'll use fixed sets of word and sentence tokenizer. """ def __init__(self, corpus_file, encoding, tagged, group_by_sent, group_by_para, sent_splitter=None): self._tagged = tagged self._group_by_sent = group_by_sent self._group_by_para = group_by_para self._sent_splitter = sent_splitter StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding) def read_block(self, stream): """Reads one paragraph at a time.""" block = [] for para_str in read_regexp_block(stream, r".", r"^EOS\n"): para = [] sent = [] for line in para_str.splitlines(): _eos = line.strip() == 'EOS' _cells = line.split('\t') w = (_cells[0], '\t'.join(_cells[1:])) if not _eos: sent.append(w) if _eos or (self._sent_splitter and self._sent_splitter(w)): if not self._tagged: sent = [w for (w,t) in sent] if self._group_by_sent: para.append(sent) else: para.extend(sent) sent = [] if len(sent)>0: if not self._tagged: sent = [w for (w,t) in sent] if self._group_by_sent: para.append(sent) else: para.extend(sent) if self._group_by_para: block.append(para) else: block.extend(para) return block def demo(): import nltk from nltk.corpus.util import LazyCorpusLoader jeita = LazyCorpusLoader( 'jeita', ChasenCorpusReader, r'.*chasen', encoding='utf-8') print '/'.join( jeita.words()[22100:22140] ) print '\nEOS\n'.join(['\n'.join("%s/%s" % (w[0],w[1].split('\t')[2]) for w in sent) for sent in jeita.tagged_sents()[2170:2173]]) def test(): from nltk.corpus.util import LazyCorpusLoader jeita = LazyCorpusLoader( 'jeita', ChasenCorpusReader, r'.*chasen', encoding='utf-8') assert isinstance(jeita.tagged_words()[0][1], basestring) if __name__ == '__main__': demo() test()