# Natural Language Toolkit: Some texts for exploration in chapter 1 of the book
#
# Copyright (C) 2001-2012 NLTK Project
# Author: Steven Bird
#
# URL:
# For license information, see LICENSE.TXT

# Use the print() function so this module parses and produces the same output
# under both Python 2.6+ and Python 3 (the print statement is Python 2 only).
from __future__ import print_function

# NOTE(review): wordnet, FreqDist, bigrams and babelize_shell are not used in
# this module; presumably they are imported so that a star-import of this
# module exposes them to the reader's session -- confirm before removing.
from nltk.corpus import (gutenberg, genesis, inaugural,
                         nps_chat, webtext, treebank, wordnet)
from nltk.text import Text
from nltk.probability import FreqDist
from nltk.util import bigrams
from nltk.misc import babelize_shell

print("*** Introductory Examples for the NLTK Book ***")
print("Loading text1, ..., text9 and sent1, ..., sent9")
print("Type the name of the text or sentence to view it.")
print("Type: 'texts()' or 'sents()' to list the materials.")

# Each text's name is printed immediately after that text is built, so the
# output appears incrementally while the module is being imported.
# text1, text2 and text9 pass no explicit name; whatever Text assigns is shown.
text1 = Text(gutenberg.words('melville-moby_dick.txt'))
print("text1:", text1.name)

text2 = Text(gutenberg.words('austen-sense.txt'))
print("text2:", text2.name)

# The Genesis tokens are passed through str() before being handed to Text.
text3 = Text([str(w) for w in genesis.words('english-kjv.txt')],
             name="The Book of Genesis")
print("text3:", text3.name)

text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
print("text4:", text4.name)

text5 = Text(nps_chat.words(), name="Chat Corpus")
print("text5:", text5.name)

text6 = Text(webtext.words('grail.txt'),
             name="Monty Python and the Holy Grail")
print("text6:", text6.name)

text7 = Text(treebank.words(), name="Wall Street Journal")
print("text7:", text7.name)

text8 = Text(webtext.words('singles.txt'), name="Personals Corpus")
print("text8:", text8.name)

text9 = Text(gutenberg.words('chesterton-thursday.txt'))
print("text9:", text9.name)


def texts():
    """Print one ``textN: <name>`` line for each of text1 through text9."""
    # Built at call time so the current module-level bindings are used.
    catalogue = (
        ("text1:", text1),
        ("text2:", text2),
        ("text3:", text3),
        ("text4:", text4),
        ("text5:", text5),
        ("text6:", text6),
        ("text7:", text7),
        ("text8:", text8),
        ("text9:", text9),
    )
    for label, text in catalogue:
        print(label, text.name)


# Example sentences, stored as lists of token strings.
sent1 = ["Call", "me", "Ishmael", "."]
sent2 = ["The", "family", "of", "Dashwood", "had", "long",
         "been", "settled", "in", "Sussex", "."]
sent3 = ["In", "the", "beginning", "God", "created", "the",
         "heaven", "and", "the", "earth", "."]
sent4 = ["Fellow", "-", "Citizens", "of", "the", "Senate",
         "and", "of", "the", "House", "of", "Representatives", ":"]
sent5 = ["I", "have", "a", "problem", "with", "people",
         "PMing", "me", "to", "lol", "JOIN"]
sent6 = ['SCENE', '1', ':', '[', 'wind', ']', '[', 'clop', 'clop',
         'clop', ']', 'KING', 'ARTHUR', ':', 'Whoa', 'there', '!']
sent7 = ["Pierre", "Vinken", ",", "61", "years", "old", ",",
         "will", "join", "the", "board", "as", "a", "nonexecutive",
         "director", "Nov.", "29", "."]
sent8 = ['25', 'SEXY', 'MALE', ',', 'seeks', 'attrac', 'older',
         'single', 'lady', ',', 'for', 'discreet', 'encounters', '.']
sent9 = ["THE", "suburb", "of", "Saffron", "Park", "lay", "on", "the",
         "sunset", "side", "of", "London", ",", "as", "red", "and",
         "ragged", "as", "a", "cloud", "of", "sunset", "."]


def sents():
    """Print one ``sentN: <tokens joined by spaces>`` line for sent1..sent9."""
    # Built at call time so the current module-level bindings are used.
    catalogue = (
        ("sent1:", sent1),
        ("sent2:", sent2),
        ("sent3:", sent3),
        ("sent4:", sent4),
        ("sent5:", sent5),
        ("sent6:", sent6),
        ("sent7:", sent7),
        ("sent8:", sent8),
        ("sent9:", sent9),
    )
    for label, tokens in catalogue:
        print(label, " ".join(tokens))