赞
踩
第二章
最近要学 NLTK,这本书的练习题出得很好,自己写下来锻炼一下。
这一章主要涉及词频统计和画图。
2
from nltk.corpus import gutenberg

# Exercise 2: count word tokens and distinct word types in "Persuasion".
# The corpus view is bound once so the file is not re-read for each count,
# and both results are printed (a bare expression is silently discarded
# outside an interactive session).
persuasion_words = gutenberg.words('austen-persuasion.txt')
print(len(persuasion_words))       # number of word tokens
print(len(set(persuasion_words)))  # number of distinct word types
3
from nltk.corpus import brown

# Exercise 3: access the Brown corpus text for two genres at once.
selected_genres = ['news', 'editorial']
brown.words(categories=selected_genres)
4
import nltk
from nltk.corpus import state_union

# Exercise 4: track how often words beginning with "men", "women" and
# "people" appear in each State of the Union address.
#
# Condition = target prefix; sample = first four characters of the fileid
# (presumably the year of the address -- confirm against
# state_union.fileids()).
#
# NOTE(review): the match is a case-insensitive *prefix* test, so e.g.
# "mention" is counted under "men". Use equality instead if only the exact
# words should be counted.
targets = ['men', 'women', 'people']
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in state_union.fileids()
    for w in state_union.words(fileid)
    for target in targets
    if w.lower().startswith(target)
)
# The distribution must be plotted under the same name it was assigned to
# (the original assigned `cdf` but plotted `cfd`).
cfd.plot()
8
# Exercise 8: conditional frequency distribution over the Names corpus,
# conditioned on the fileid, counting the first letter of every name.
names = nltk.corpus.names
initial_pairs = [
    (fileid, name[0])
    for fileid in names.fileids()
    for name in names.words(fileid)
]
cfd = nltk.ConditionalFreqDist(initial_pairs)
cfd.plot()
15
from nltk.corpus import brown

# Exercise 15: list every word that occurs at least three times in the
# Brown corpus, most frequent first.
fd = nltk.FreqDist(brown.words())
# "At least three times" means count >= 3; the previous `> 3` test dropped
# words that occur exactly three times.
print([w for w, count in fd.most_common() if count >= 3])
16
# Exercise 16: print the tokens-per-type ratio for every Brown category.
for genre in brown.categories():
    genre_words = brown.words(categories=genre)
    token_total = len(genre_words)
    type_total = len(set(genre_words))
    print(genre, ":", token_total / type_total)
17
from nltk.corpus import stopwords

# Exercise 17: the 50 most frequent Brown corpus words that are not
# English stopwords.
#
# The stopwords fileid is lowercase; 'English' only resolves on
# case-insensitive file systems. The list is converted to a set so each of
# the ~1M membership tests below is O(1) instead of a linear scan.
sw = set(stopwords.words('english'))
fd = nltk.FreqDist(w for w in brown.words() if w.lower() not in sw)

# Full ranking of (word, count) pairs, most frequent first.
li = fd.most_common()
top_50 = li[:50]
print([w for w, _ in top_50])  # words only
print(top_50)                  # words with their counts
18
# Exercise 18: the 50 most frequent Brown corpus bigrams in which neither
# word is a stopword.
#
# Words are lowercased before the stopword test so that capitalised
# stopwords ("The", "Of", ...) are excluded too, matching the
# case-insensitive filtering used for the single-word ranking.
fd = nltk.FreqDist(
    (w1, w2)
    for w1, w2 in nltk.bigrams(brown.words())
    if w1.lower() not in sw and w2.lower() not in sw
)
top_bigrams = fd.most_common(50)
print([bigram for bigram, _ in top_bigrams])
# Second-ranked bigram together with its count.
print(top_bigrams[1])
19
import re
def cate_count_word(text):
    """Print one frequency summary line per category of *text*.

    For every category, build a frequency distribution over the words that
    contain at least one ASCII letter and whose lowercase form is not in
    ``sw``. Then print the category name followed by the second-ranked and
    the last-ranked ``(word, count)`` pairs of that distribution.
    """
    for genre in text.categories():
        content_words = [
            token
            for token in text.words(categories=genre)
            if re.search(r'[A-Za-z]+', token) and token.lower() not in sw
        ]
        ranked = nltk.FreqDist(content_words).most_common()
        print(genre, ":", ranked[1], ranked[-1])


cate_count_word(brown)
20
from nltk.corpus import gutenberg
from nltk.text import Text
def word_freq(text, word):
    """Return the relative frequency of *word* in a Gutenberg text.

    Args:
        text: Gutenberg corpus fileid, e.g. ``'austen-emma.txt'``.
        word: Token to count (exact, case-sensitive match).

    Returns:
        Occurrences of *word* divided by the total number of tokens in the
        text, or ``0.0`` when the text has no tokens.
    """
    tokens = gutenberg.words(text)
    if not tokens:
        return 0.0
    # Divide by the number of tokens, not by len(text): *text* is the
    # fileid string, so its length is just the length of the file name.
    return Text(tokens).count(word) / len(tokens)


print(word_freq('austen-emma.txt', 'will'))
23
%matplotlib inline
from nltk.corpus import PlaintextCorpusReader
corpus_root = r'root'
wordlists = PlaintextCorpusReader(corpus_root,'.*.txt')
fdist = nltk.FreqDist(wordlists.words('nkw_all.txt'))
common = fdist.most_common()
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.xticks(rotation=270)
x = [i for i in range(1,151)]
y = [w[1] for w in common][:150]
plt.plot(x,y)
plt.show()
import random

import matplotlib.pyplot as plt

# Exercise 23 (part 2): generate random text from the characters
# "abcdefg" plus space, then plot word frequency against frequency rank.
#
# Build the text with a single join instead of 100000 string
# concatenations.
word_li = ''.join(random.choice('abcdefg ') for _ in range(100000))
# split() with no argument collapses runs of spaces; split(' ') would emit
# an empty-string "word" for every pair of adjacent spaces and those would
# be counted in the distribution.
words = word_li.split()
fdist = nltk.FreqDist(words)
common = fdist.most_common()

plt.rcParams['font.sans-serif'] = ['SimHei']
plt.xticks(rotation=270)
# Frequencies of the (up to) 150 most common words; ranks are sized to
# match so x and y always have the same length.
y = [count for _, count in common[:150]]
x = list(range(1, len(y) + 1))
plt.plot(x, y)
plt.show()
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。