Python 3.6.1 (default, Dec 2015, 13:05:11)
[GCC 4.8.2] on linux
import nltk
nltk.download()
NLTK Downloader
---------------------------------------------------------------------------
d) Download l) List u) Update c) Config h) Help q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
Identifier> book
Downloading collection 'book'
|
... Lots of lines ...
|
Done downloading collection book

---------------------------------------------------------------------------
d) Download l) List u) Update c) Config h) Help q) Quit
---------------------------------------------------------------------------
Downloader> q
=> True
nltk.corpus.gutenberg.fileids()
=> ['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']
from nltk.corpus import gutenberg
gutenberg.raw('austen-emma.txt')[0:100]
=> '[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse, handsome, clever, and rich, with a'
gutenberg.raw('austen-emma.txt')[0:500]
=> "[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse, handsome, clever, and rich, with a comfortable home\nand happy disposition, seemed to unite some of the best blessings\nof existence; and had lived nearly twenty-one years in the world\nwith very little to distress or vex her.\n\nShe was the youngest of the two daughters of a most affectionate,\nindulgent father; and had, in consequence of her sister's marriage,\nbeen mistress of his house from a very early period. Her mother\nhad died t"
from nltk.tokenize import sent_tokenize
sent_tokenize("Hello! What's your name?")
=> ['Hello!', "What's your name?"]
emma = gutenberg.raw('austen-emma.txt')
emma[0:100]
=> '[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse, handsome, clever, and rich, with a'
sent_tokenize(emma)[0:10]
=> ['[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse, handsome, clever, and rich, with a comfortable home\nand happy disposition, seemed to unite some of the best blessings\nof existence; and had lived nearly twenty-one years in the world\nwith very little to distress or vex her.', "She was the youngest of the two daughters of a most affectionate,\nindulgent father; and had, in consequence of her sister's marriage,\nbeen mistress of his house from a very early period.", 'Her mother\nhad died too long ago for her to have more than an indistinct\nremembrance of her caresses; and her place had been supplied\nby an excellent woman as governess, who had fallen little short\nof a mother in affection.', "Sixteen years had Miss Taylor been in Mr. Woodhouse's family,\nless as a governess than a friend, very fond of both daughters,\nbut particularly of Emma.", 'Between _them_ it was more the intimacy\nof sisters.', "Even before Miss Taylor had ceased to hold the nominal\noffice of governess, the mildness of her temper had hardly allowed\nher to impose any restraint; and the shadow of authority being\nnow long passed away, they had been living together as friend and\nfriend very mutually attached, and Emma doing just what she liked;\nhighly esteeming Miss Taylor's judgment, but directed chiefly by\nher own.", "The real evils, indeed, of Emma's situation were the power of having\nrather too much her own way, and a disposition to think a little\ntoo well of herself; these were the disadvantages which threatened\nalloy to her many enjoyments.", 'The danger, however, was at present\nso unperceived, that they did not by any means rank as misfortunes\nwith her.', 'Sorrow came--a gentle sorrow--but not at all in the shape of any\ndisagreeable consciousness.--Miss Taylor married.', "It was Miss\nTaylor's loss which first brought grief."]
from nltk.tokenize import word_tokenize
word_tokenize("What's your name?")
=> ['What', "'s", 'your', 'name', '?']
sent_tokenize(emma)[1]
=> "She was the youngest of the two daughters of a most affectionate,\nindulgent father; and had, in consequence of her sister's marriage,\nbeen mistress of his house from a very early period."
word_tokenize(sent_tokenize(emma)[1])
=> ['She', 'was', 'the', 'youngest', 'of', 'the', 'two', 'daughters', 'of', 'a', 'most', 'affectionate', ',', 'indulgent', 'father', ';', 'and', 'had', ',', 'in', 'consequence', 'of', 'her', 'sister', "'s", 'marriage', ',', 'been', 'mistress', 'of', 'his', 'house', 'from', 'a', 'very', 'early', 'period', '.']
from nltk.tag import pos_tag
emmawords = word_tokenize(sent_tokenize(emma)[1])
pos_tag(emmawords)
=> [('She', 'PRP'), ('was', 'VBD'), ('the', 'DT'), ('youngest', 'JJS'), ('of', 'IN'), ('the', 'DT'), ('two', 'CD'), ('daughters', 'NNS'), ('of', 'IN'), ('a', 'DT'), ('most', 'RBS'), ('affectionate', 'JJ'), (',', ','), ('indulgent', 'JJ'), ('father', 'NN'), (';', ':'), ('and', 'CC'), ('had', 'VBD'), (',', ','), ('in', 'IN'), ('consequence', 'NN'), ('of', 'IN'), ('her', 'PRP$'), ('sister', 'NN'), ("'s", 'POS'), ('marriage', 'NN'), (',', ','), ('been', 'VBN'), ('mistress', 'NN'), ('of', 'IN'), ('his', 'PRP$'), ('house', 'NN'), ('from', 'IN'), ('a', 'DT'), ('very', 'RB'), ('early', 'JJ'), ('period', 'NN'), ('.', '.')]