当前位置:   article > 正文

《Python自然语言处理》第二章练习题答案_python 自然语言处理试题

python 自然语言处理试题

第二章
最近要学nltk,这本书的练习题出的很好,自己写下来锻炼一下。
这一章主要涉及词频统计和画图。
2

from nltk.corpus import gutenberg

# Exercise 2: token count vs. vocabulary size of Austen's "Persuasion".
persuasion = gutenberg.words('austen-persuasion.txt')
len(persuasion)        # total word tokens
len(set(persuasion))   # distinct word types
  • 1
  • 2
  • 3

3

from nltk.corpus import brown
# Exercise 3: words from two Brown Corpus categories at once.
brown.words(categories=['news','editorial'])
  • 1
  • 2

4

import nltk
from nltk.corpus import state_union

# Exercise 4: count how often 'men', 'women', 'people' occur in each
# State of the Union address; condition on the year, which is the first
# four characters of the fileid (e.g. '1945-Truman.txt' -> '1945').
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in state_union.fileids()
    for w in state_union.words(fileid)
    for target in ['men', 'women', 'people']
    if w.lower().startswith(target)
)
# BUG FIX: the original assigned to `cdf` but called `cfd.plot()`,
# which raises NameError (or plots a stale object from an earlier cell).
cfd.plot()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9

8

# Exercise 8: for each gender file in the Names corpus, the frequency
# distribution of first letters of the names it contains.
names = nltk.corpus.names
initial_pairs = [
    (gender_file, person[0])
    for gender_file in names.fileids()
    for person in names.words(gender_file)
]
cfd = nltk.ConditionalFreqDist(initial_pairs)
cfd.plot()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7

15

from nltk.corpus import brown

# Exercise 15: all Brown Corpus words occurring more than 3 times.
fd = nltk.FreqDist(brown.words())
frequent = [word for word, count in fd.most_common() if count > 3]
print(frequent)

  • 1
  • 2
  • 3
  • 4

16

# Exercise 16: lexical diversity (tokens per type) for each Brown category.
for category in brown.categories():
    tokens = brown.words(categories=category)
    token_count = len(tokens)
    type_count = len(set(tokens))
    print(category, ":", token_count / type_count)
  • 1
  • 2
  • 3
  • 4

17

import nltk
from nltk.corpus import stopwords
from nltk.corpus import brown

# Exercise 17: the 50 most frequent Brown Corpus words, stopwords excluded.
# BUG FIX: NLTK stopword fileids are lowercase; stopwords.words('English')
# raises an error — the correct fileid is 'english'.
sw = stopwords.words('english')
fd = nltk.FreqDist(w for w in brown.words() if w.lower() not in sw)
[w for (w, _) in fd.most_common()[:50]]   # words only
li = list(fd.most_common())
li[:50]                                   # (word, count) pairs
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6

18

# Exercise 18: the 50 most frequent bigrams with no stopword member.
# (`sw` is the stopword list built in exercise 17.)
fd = nltk.FreqDist(
    pair for pair in nltk.bigrams(brown.words())
    if pair[0] not in sw and pair[1] not in sw
)
[pair for pair, _ in fd.most_common()[:50]]
fd.most_common()[1]
  • 1
  • 2
  • 3

19

import re

def cate_count_word(text):
    """For every category of *text*, print a high- and a low-frequency word.

    Counts only tokens containing at least one ASCII letter, skipping
    stopwords (the `sw` list from exercise 17).
    """
    for category in text.categories():
        alpha_words = [
            w for w in text.words(categories=category)
            if re.search(r'[A-Za-z]+', w) and w.lower() not in sw
        ]
        ranked = nltk.FreqDist(alpha_words).most_common()
        # NOTE(review): index 1 is the *second* most common word; if the
        # intent was "most frequent", index 0 would be used — preserved as-is.
        print(category, ":", ranked[1], ranked[-1])

cate_count_word(brown)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7

20

from nltk.corpus import gutenberg
from nltk.text import Text

def word_freq(text, word):
    """Return the relative frequency of *word* in the Gutenberg file *text*.

    text: a Gutenberg fileid, e.g. 'austen-emma.txt'.
    word: the token to count (case-sensitive, as Text.count counts it).
    """
    tokens = gutenberg.words(text)
    # BUG FIX: the original returned count/len(text) — dividing by the
    # length of the *filename string* — instead of the token count.
    return Text(tokens).count(word) / len(tokens)

word_freq('austen-emma.txt', 'will')
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6

23

%matplotlib inline
# Exercise 23a: test Zipf's law on a real corpus — plot the frequency of
# the 150 most common words against their rank.
from nltk.corpus import PlaintextCorpusReader
# NOTE(review): 'root' is a placeholder — point it at a real directory
# containing nkw_all.txt before running.
corpus_root = r'root'
wordlists = PlaintextCorpusReader(corpus_root,'.*.txt')
fdist = nltk.FreqDist(wordlists.words('nkw_all.txt'))
common = fdist.most_common()  # (word, count) pairs, most frequent first
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei'] 
plt.xticks(rotation=270)
# x = rank 1..150, y = frequency of the word at that rank
x = [i for i in range(1,151)]
y = [w[1] for w in common][:150]
plt.plot(x,y)
plt.show()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
import random

# Exercise 23b: test Zipf's law on "random text": draw 100,000 characters
# uniformly from 'abcdefg ' and split on spaces into pseudo-words.
# FIXES vs. original: build the string with ''.join instead of the
# quadratic `+=` loop, and drop the pointless `i += 1` (rebinding the
# loop variable has no effect on a `for` loop).
word_li = ''.join(random.choice('abcdefg ') for _ in range(100000))
words = word_li.split(' ')
fdist = nltk.FreqDist(words)
common = fdist.most_common()  # (word, count) pairs, most frequent first
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']  # render CJK axis labels
plt.xticks(rotation=270)
# x = rank 1..150, y = frequency of the pseudo-word at that rank
x = list(range(1, 151))
y = [w[1] for w in common][:150]
plt.plot(x, y)
plt.show()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/盐析白兔/article/detail/521724
推荐阅读
相关标签
  

闽ICP备14008679号