import nltk

print(nltk.corpus.gutenberg.fileids())  # list the file IDs of the documents in the Gutenberg corpus
emma = nltk.corpus.gutenberg.words('austen-emma.txt')  # extract the words of one document
print(emma[:50])
print(len(emma))
emma = nltk.Text(nltk.corpus.gutenberg.words('austen-emma.txt'))
print(emma)
emma.concordance("surprize")  # show the contexts in which the given word appears
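# concordance() prints its results rather than returning them; the optional width
# and lines arguments limit the output. A minimal sketch (argument values chosen arbitrarily):
emma.concordance("surprize", width=80, lines=5)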
from nltk.corpus import gutenberg
for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))    # count in characters
    num_words = len(gutenberg.words(fileid))  # count in word tokens
    num_sents = len(gutenberg.sents(fileid))  # count in sentences
    num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))  # lowercase and deduplicate (otherwise the same word may be counted twice because of capitalization)
    # average word length, average sentence length, lexical diversity, file name
    print(int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid)
macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')  # a list of sentences, each a list of words
longest_len = max([len(s) for s in macbeth_sentences])  # length of the longest sentence
print([s for s in macbeth_sentences if len(s) == longest_len])  # what the longest sentence actually says
# Web and chat text
from nltk.corpus import webtext
webtext.fileids()
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65], '...')
from nltk.corpus import brown
print(brown.categories())  # print the categories of the Brown corpus
print(brown.words(categories='news'))
print(brown.words(fileids=['cg22']))
print(brown.sents(categories=['news', 'editorial', 'reviews']))
import nltk
from nltk.corpus import brown
news_text = brown.words(categories='news')
fdist = nltk.FreqDist([w.lower() for w in news_text])
modals = ['can', 'could', 'may', 'might', 'must', 'will']  # the modal verbs to look up
for m in modals:
    print(m + ':', fdist[m])  # frequency of each modal verb
import nltk
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist(  # build a conditional frequency distribution
    (genre, word)                # condition = genre, sample = word
    for genre in brown.categories()
    for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
cfd.tabulate(conditions=genres, samples=modals)  # print a cross-tabulation of genres × modal verbs
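# A ConditionalFreqDist acts like a dictionary of FreqDists: indexing by a condition
# gives that genre's frequency distribution, and indexing again gives a count.
# A minimal sketch of querying the object built above:
print(cfd['news']['can'])  # how many times 'can' occurs in the 'news' genre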
from nltk.corpus import reuters
print(len(reuters.fileids()))     # number of documents
print(len(reuters.categories()))  # number of categories
reuters.categories('training/9865')
reuters.categories(['training/9865', 'training/9880'])
reuters.fileids('barley')  # the documents that belong to the 'barley' category
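# Reuters documents can belong to several categories at once, so words() accepts
# either file IDs or category names. A minimal sketch reusing the IDs above:
print(reuters.words('training/9865')[:14])
print(reuters.words(categories=['barley', 'corn']))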
from nltk.corpus import inaugural
import nltk
inaugural.fileids()
print([fileid[:4] for fileid in inaugural.fileids()])  # the year of each address, taken from the file name

cfd = nltk.ConditionalFreqDist(  # how often particular words occur across the inaugural addresses over time
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))
cfd.plot()
from nltk.corpus import PlaintextCorpusReader
corpus_root = './training'  # directory that holds the corpus files
wordlists = PlaintextCorpusReader(corpus_root, ['pku_training.utf8', 'cityu_training.utf8', 'msr_training.utf8'])  # the file list can also be built with a regular expression
print(wordlists.fileids())
print(wordlists.raw('pku_training.utf8'))
print(len(wordlists.words('pku_training.utf8')))
print(len(wordlists.sents('pku_training.utf8')))
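# Instead of an explicit list, the second argument can be a regular expression
# matched against file names. A minimal sketch under the same './training' layout:
wordlists = PlaintextCorpusReader(corpus_root, r'.*\.utf8')
print(wordlists.fileids())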
from nltk.corpus import brown
import nltk
genre_word = [(genre, word)
              for genre in ['news', 'romance']
              for word in brown.words(categories=genre)]
cfd = nltk.ConditionalFreqDist(genre_word)
print(cfd.items())
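# cfd.items() dumps every (condition, FreqDist) pair at once, which is hard to read.
# A minimal sketch of more targeted queries on the same object (not in the original post):
print(cfd.conditions())                # the two genres used above
print(len(cfd['news']))                # size of the news vocabulary
print(cfd['romance'].most_common(10))  # the ten most frequent words in the romance genre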
from nltk.corpus import inaugural
import nltk
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))
cfd.plot()
from nltk.corpus import udhr
import nltk
languages = ['Chickasaw', 'English', 'German_Deutsch',
'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
udhr.fileids()
cfd = nltk.ConditionalFreqDist(
    (lang, len(word))
    for lang in languages
    for word in udhr.words(lang + '-Latin1'))
cfd.tabulate(conditions=['English', 'German_Deutsch'],  # if omitted, every condition is shown
             samples=range(10), cumulative=True)        # restrict which samples (word lengths) are displayed
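# The same distribution can be plotted instead of tabulated; plot() takes the same
# keyword arguments (matplotlib is required). A minimal sketch:
cfd.plot(conditions=['English', 'German_Deutsch'], samples=range(10), cumulative=True)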
import nltk

sent = ['In', 'the', 'beginning', 'God', 'created', 'the', 'heaven', 'and', 'the', 'earth', '.']
print([i for i in nltk.bigrams(sent)])  # nltk.bigrams produces a generator of adjacent word pairs

def generate_model(cfdist, word, num=15):
    for i in range(num):
        print(word)
        word = cfdist[word].max()  # follow the most likely next word

text = nltk.corpus.genesis.words('english-kjv.txt')
bigrams = nltk.bigrams(text)  # all bigrams of the text
# print([i for i in bigrams])
cfd = nltk.ConditionalFreqDist(bigrams)  # condition = first word, sample = the word that follows it
generate_model(cfd, 'living')  # generate text by repeatedly emitting the most likely next word
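# Always taking cfdist[word].max() tends to get stuck once it reaches a frequent cycle.
# A minimal sketch (my own variation, not from the original post) that instead samples
# the next word in proportion to its bigram frequency:
import random

def generate_model_sampled(cfdist, word, num=15):
    for i in range(num):
        print(word)
        fd = cfdist[word]
        word = random.choices(list(fd.keys()), weights=list(fd.values()))[0]

generate_model_sampled(cfd, 'living')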