.. Copyright (C) 2001-2012 NLTK Project .. For license information, see LICENSE.TXT =========== Probability =========== >>> import nltk >>> from nltk.probability import * FreqDist -------- >>> text1 = ['no', 'good', 'fish', 'goes', 'anywhere', 'without', 'a', 'porpoise', '!'] >>> text2 = ['no', 'good', 'porpoise', 'likes', 'to', 'fish', 'fish', 'anywhere', '.'] >>> fd1 = nltk.FreqDist(text1) >>> fd1.items() [('!', 1), ('a', 1), ('anywhere', 1), ('fish', 1), ('goes', 1), ('good', 1), ('no', 1), ('porpoise', 1), ('without', 1)] >>> fd1 == nltk.FreqDist(text1) True Note that items are sorted in order of decreasing frequency; two items of the same frequency are sorted alphabetically by their key. >>> both = nltk.FreqDist(text1 + text2) >>> both.items() [('fish', 3), ('anywhere', 2), ('good', 2), ('no', 2), ('porpoise', 2), ('!', 1), ('.', 1), ('a', 1), ('goes', 1), ('likes', 1), ('to', 1), ('without', 1)] >>> both == fd1 + nltk.FreqDist(text2) True >>> fd1 == nltk.FreqDist(text1) # But fd1 is unchanged True >>> fd2 = nltk.FreqDist(text2) >>> fd1.update(fd2) >>> fd1 == both True >>> fd1 = nltk.FreqDist(text1) >>> fd1.update(text2) >>> fd1 == both True >>> fd1 = nltk.FreqDist(text1) >>> fd2 = nltk.FreqDist(fd1) >>> fd2 == fd1 True Testing some HMM estimators --------------------------- We extract a small part (500 sentences) of the Brown corpus >>> corpus = nltk.corpus.brown.tagged_sents(categories='adventure')[:500] >>> print len(corpus) 500 We create a HMM trainer - note that we need the tags and symbols from the whole corpus, not just the training corpus >>> tag_set = list(set([tag for sent in corpus for (word,tag) in sent])) >>> print len(tag_set) 92 >>> symbols = list(set([word for sent in corpus for (word,tag) in sent])) >>> print len(symbols) 1464 >>> trainer = nltk.tag.HiddenMarkovModelTrainer(tag_set, symbols) We divide the corpus into 90% training and 10% testing >>> train_corpus = [] >>> test_corpus = [] >>> for i in range(len(corpus)): ... if i % 10: ... train_corpus += [corpus[i]] ... else: ... test_corpus += [corpus[i]] >>> print len(train_corpus) 450 >>> print len(test_corpus) 50 And now we can test the estimators >>> def train_and_test(est): ... hmm = trainer.train_supervised(train_corpus, estimator=est) ... print '%.2f%%' % (100 * hmm.evaluate(test_corpus)) Maximum Likelihood Estimation - this resulted in an initialization error before r7209 >>> mle = lambda fd, bins: MLEProbDist(fd) >>> train_and_test(mle) 14.43% Laplace (= Lidstone with gamma==1) >>> train_and_test(LaplaceProbDist) 66.04% Expected Likelihood Estimation (= Lidstone with gamma==0.5) >>> train_and_test(ELEProbDist) 73.01% Lidstone Estimation, for gamma==0.1, 0.5 and 1 (the later two should be exactly equal to MLE and ELE above) >>> def lidstone(gamma): ... return lambda fd, bins: LidstoneProbDist(fd, gamma, bins) >>> train_and_test(lidstone(0.1)) 82.51% >>> train_and_test(lidstone(0.5)) 73.01% >>> train_and_test(lidstone(1.0)) 66.04% Witten Bell Estimation - This resulted in ZeroDivisionError before r7209 >>> train_and_test(WittenBellProbDist) 88.12% Good Turing Estimation - The accuracy is only 17%, see issues #26 and #133 >>> gt = lambda fd, bins: GoodTuringProbDist(fd) >>> train_and_test(gt) 0.34% >>> gt = lambda fd, bins: SimpleGoodTuringProbDist(fd) >>> train_and_test(gt) 0.17% Remains to be added: - Tests for HeldoutProbDist, CrossValidationProbDist and MutableProbDist Squashed bugs ------------- Issue 511: override pop and popitem to invalidate the cache >>> fd = nltk.FreqDist('a') >>> fd.keys() ['a'] >>> fd.pop('a') 1 >>> fd.keys() [] Issue 533: access cumulative frequencies with no arguments >>> fd = nltk.FreqDist('aab') >>> list(fd._cumulative_frequencies(['a'])) [2.0] >>> list(fd._cumulative_frequencies()) [2.0, 3.0] Issue 579: override clear to reset some variables >>> fd = FreqDist('aab') >>> fd.clear() >>> fd.N() 0 Issue 351: fix fileids method of CategorizedCorpusReader to inadvertently add errant categories >>> from nltk.corpus import brown >>> brown.fileids('blah') Traceback (most recent call last): ... ValueError: Category blah not found >>> brown.categories() ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction'] Issue 175: add the unseen bin to SimpleGoodTuringProbDist by default otherwise any unseen events get a probability of zero, i.e., they don't get smoothed >>> from nltk import SimpleGoodTuringProbDist, FreqDist >>> fd = FreqDist({'a':1, 'b':1, 'c': 2, 'd': 3, 'e': 4, 'f': 4, 'g': 4, 'h': 5, 'i': 5, 'j': 6, 'k': 6, 'l': 6, 'm': 7, 'n': 7, 'o': 8, 'p': 9, 'q': 10}) >>> p = SimpleGoodTuringProbDist(fd) >>> p.prob('a') 0.017649766667026317 >>> p.prob('o') 0.08433050215340411 >>> p.prob('z') 0.022727272727272728 >>> p.prob('foobar') 0.022727272727272728