- from nltk.book import * # load the example texts (text1..text9)
- text1.concordance('monstrous') # show occurrences of a word in context
- text1.similar('monstrous') # words that share contexts with 'monstrous'
- text2.common_contexts(['monstrous','very']) # contexts shared by both words
- text4.dispersion_plot(['America','citizen','democracy','freedom']) # where the words occur across the text
- text3.generate() # generate text in the same style (may be ungrammatical)
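Taken together, and assuming the example texts have been fetched once with nltk.download('book'), the calls above run as a single sketch (dispersion_plot additionally needs matplotlib):

```python
import nltk
# nltk.download('book')  # one-time download of the example texts

from nltk.book import text1, text2, text4  # importing nltk.book loads all texts

text1.concordance('monstrous')                  # occurrences with context
text1.similar('monstrous')                      # distributionally similar words
text2.common_contexts(['monstrous', 'very'])    # contexts the two words share
text4.dispersion_plot(['America', 'citizen', 'democracy', 'freedom'])
```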
- fdist = FreqDist(text1) # count word frequencies
- fdist.plot(50,cumulative=True) # cumulative plot of the 50 most frequent words
- fdist = FreqDist([len(w) for w in text1]) # count something else, e.g. word lengths
- fdist.freq(3) # proportion of samples with value 3
- fdist.max() # the most frequent sample
- sorted([w for w in set(text1) if w.endswith('ableness')]) # select words by a property
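A runnable sketch of the FreqDist calls; the sample numbers are the ones the NLTK book reports for Moby Dick:

```python
from nltk import FreqDist
from nltk.book import text1

fdist = FreqDist(text1)                       # counts every token
print(fdist.most_common(10))                  # top (word, count) pairs
print(fdist['whale'])                         # count of a single word

fdist_len = FreqDist(len(w) for w in text1)   # distribution over word lengths
print(fdist_len.max())                        # 3: the most frequent word length
print(fdist_len.freq(3))                      # ~0.19: share of tokens with length 3
```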
- from nltk.corpus import gutenberg # import a corpus
- gutenberg.fileids() # list the text identifiers
- for fileid in gutenberg.fileids():
- num_chars = len(gutenberg.raw(fileid)) # raw text as a single string, whitespace included
- num_words = len(gutenberg.words(fileid)) # list of words []
- num_sents = len(gutenberg.sents(fileid)) # list of sentences, each itself a word list [[],[]]
- num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)])) # deduplicated vocabulary
- print(int(num_chars/num_words),int(num_words/num_sents),int(num_words/num_vocab),fileid) # average word length, average sentence length, lexical diversity (average uses per word)
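The loop above is flattened by the list format; with its indentation restored it runs as-is once the gutenberg corpus is downloaded:

```python
from nltk.corpus import gutenberg  # requires nltk.download('gutenberg')

for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
    # average word length, average sentence length, average uses per vocabulary item
    print(int(num_chars / num_words), int(num_words / num_sents),
          int(num_words / num_vocab), fileid)
```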
- from nltk.corpus import brown
- cfd = nltk.ConditionalFreqDist((genre,word) for genre in brown.categories() for word in brown.words(categories=genre)) # conditional frequency distribution: one FreqDist per genre
- cfd.tabulate(conditions=['adventure','fiction'],samples=['the','of','and','a','in'],cumulative=True) # the keyword is samples; the samples here are words, not numbers
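tabulate prints one row per condition and one column per sample; since the samples in this distribution are words, a sketch with hand-picked sample words (any words would do):

```python
import nltk
from nltk.corpus import brown

cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))

# Rows are the two genres, columns the chosen words; counts accumulate left to right.
cfd.tabulate(conditions=['adventure', 'fiction'],
             samples=['the', 'of', 'and', 'a', 'in'],
             cumulative=True)
```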
- text = nltk.corpus.genesis.words('english-kjv.txt')
- bigrams = nltk.bigrams(text) # produce the bigrams (a generator in NLTK 3)
- cfd = nltk.ConditionalFreqDist(bigrams)
- print(cfd['living']) # distribution of the words that follow 'living'
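A classic use of this bigram distribution, adapted from the NLTK book's generate_model example: greedily chain the most likely next word. The output starts looping quickly because max() is deterministic.

```python
import nltk

text = nltk.corpus.genesis.words('english-kjv.txt')
cfd = nltk.ConditionalFreqDist(nltk.bigrams(text))

def generate_model(cfd, word, num=15):
    for _ in range(num):
        print(word, end=' ')
        word = cfd[word].max()   # always pick the most frequent successor

generate_model(cfd, 'living')    # "living creature that he said , and the land of the land ..."
```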
- from nltk.corpus import wordnet as wn
- wn.synsets('motorcar') # look up the synsets (synonym sets) containing a word
- wn.synset('car.n.01').lemma_names() # 'car.n.01' is a synset; list its member lemmas
- wn.synset('whale.n.02').min_depth() # depth of the synset in the hypernym hierarchy
- wn.synset('whale.n.02').path_similarity(wn.synset('right_whale.n.01')) # similarity between two synsets
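A small WordNet sketch tying the calls together; the similarity values are the ones reported in the NLTK book:

```python
from nltk.corpus import wordnet as wn   # requires nltk.download('wordnet')

motorcar = wn.synsets('motorcar')[0]    # Synset('car.n.01')
print(motorcar.lemma_names())           # ['car', 'auto', 'automobile', 'machine', 'motorcar']
print(motorcar.hypernyms())             # more general synsets

right = wn.synset('right_whale.n.01')
print(right.path_similarity(wn.synset('orca.n.01')))      # ~0.17, close in the hierarchy
print(right.path_similarity(wn.synset('tortoise.n.01')))  # ~0.08, farther apart
```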
- token = nltk.word_tokenize(str(text1)) # tokenize; note str(text1) yields only the Text repr, pass a real raw string in practice
- text = nltk.Text(token) # wrap tokens as an nltk.Text to get its exploration methods
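A minimal sketch of the tokenize-then-wrap idiom, with a made-up raw string (the punkt tokenizer model must be downloaded):

```python
import nltk
# nltk.download('punkt')  # tokenizer model, one-time

raw = "NLTK wraps a plain token list in a Text object for exploration."
tokens = nltk.word_tokenize(raw)    # ['NLTK', 'wraps', 'a', 'plain', ...]
text = nltk.Text(tokens)
text.concordance('Text')            # Text exposes concordance, similar, etc.
```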
- import re
- wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]
- [w for w in wordlist if re.search('ed$',w)] # keep words ending in 'ed'
- re.findall(r'^(.*?)(ing|ly|ious|ies|es|s|ment)$','processes') # understand greedy vs non-greedy matching with findall
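The non-greedy (.*?) is what keeps the stem short; with a greedy (.*) the engine hands the suffix group as little as possible:

```python
import re

suffix_re = r'(ing|ly|ious|ies|es|s|ment)$'

print(re.findall(r'^(.*?)' + suffix_re, 'processes'))
# [('process', 'es')] -- non-greedy stem, longest suffix that still matches
print(re.findall(r'^(.*)' + suffix_re, 'processes'))
# [('processe', 's')] -- greedy stem swallows the 'e', leaving only 's'
```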
- porter = nltk.PorterStemmer() # stemming: lying -> lie, but leaves 'women' unchanged
- [porter.stem(t) for t in text]
- wnl = nltk.WordNetLemmatizer() # lemmatization: women -> woman
- [wnl.lemmatize(w) for w in text]
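Stemmer and lemmatizer side by side; note that WordNetLemmatizer defaults to treating words as nouns, so 'lying' passes through unchanged:

```python
import nltk   # lemmatizer needs nltk.download('wordnet')

tokens = ['women', 'lying', 'cats']
porter = nltk.PorterStemmer()
wnl = nltk.WordNetLemmatizer()

print([porter.stem(t) for t in tokens])    # ['women', 'lie', 'cat']: rule-based suffix stripping
print([wnl.lemmatize(t) for t in tokens])  # ['woman', 'lying', 'cat']: dictionary lookup
```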
- def is_content(word):
-     return word.lower() not in ['a','text'] # lower is a method call, not an attribute
- list(filter(is_content,text)) # filter out stopwords (filter returns an iterator in Python 3)
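The hard-coded list above is just a toy; NLTK ships a real stopword list. A sketch, assuming the stopwords corpus is downloaded:

```python
import nltk
from nltk.corpus import stopwords   # requires nltk.download('stopwords')

stop = set(stopwords.words('english'))
tokens = nltk.word_tokenize("This is a short text about filtering stopwords")
print([w for w in tokens if w.lower() not in stop])
# ['short', 'text', 'filtering', 'stopwords']
```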
- nltk.pos_tag(nltk.word_tokenize('I love doing homework .')) # part-of-speech tagging
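The tagger behind pos_tag is a separate download; typical output for the sentence above:

```python
import nltk
# nltk.download('punkt'); nltk.download('averaged_perceptron_tagger')

print(nltk.pos_tag(nltk.word_tokenize('I love doing homework .')))
# [('I', 'PRP'), ('love', 'VBP'), ('doing', 'VBG'), ('homework', 'NN'), ('.', '.')]
```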
- freq = nltk.defaultdict(int)
- freq['b'] # missing keys default to 0
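nltk.defaultdict was an NLTK 2 alias for the standard-library class; today you would use collections directly:

```python
from collections import defaultdict

freq = defaultdict(int)
for w in ['a', 'b', 'a']:
    freq[w] += 1              # missing keys start at int() == 0, no KeyError
print(freq['a'], freq['z'])   # 2 0 (reading 'z' also inserts it with value 0)
```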
- from nltk.corpus import brown
- brown_tagged_sents = brown.tagged_sents(categories='news') # sentences already tagged
- brown_sents = brown.sents(categories='news') # untagged sentences
- size = int(len(brown_tagged_sents)*0.9)
- train_sents = brown_tagged_sents[:size]
- test_sents = brown_tagged_sents[size:]
- bigram_tagger = nltk.BigramTagger(train_sents) # 2-gram tagger
- bigram_tagger.tag(brown_sents[2007]) # tag sentence 2007 (tag is a method, not a subscript)
- bigram_tagger.evaluate(test_sents) # evaluate against the held-out tags
- bigram_tagger = nltk.BigramTagger(train_sents,backoff=nltk.DefaultTagger('NN')) # 2-gram tagger with a backoff for unseen contexts
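A bigram tagger alone scores poorly, because any unseen bigram context yields None for the rest of the sentence; the usual fix, per the NLTK book, is a backoff chain:

```python
import nltk
from nltk.corpus import brown

tagged = brown.tagged_sents(categories='news')
size = int(len(tagged) * 0.9)
train_sents, test_sents = tagged[:size], tagged[size:]

# Backoff chain: bigram -> unigram -> constant 'NN'
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)

print(t2.evaluate(test_sents))   # ~0.84 per the NLTK book (newer NLTK: t2.accuracy(test_sents))
```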
- from nltk.corpus import names
- import random
- names = ([(name,'male') for name in names.words('male.txt')]+[(name,'female') for name in names.words('female.txt')])
- random.shuffle(names)
- def gen_feature(word):
- return {'last_letter':word[-1]}
- featuresets = [(gen_feature(n),g) for (n,g) in names]
- trainset,testset = featuresets[500:],featuresets[:500]
- import nltk
- classifier = nltk.NaiveBayesClassifier.train(trainset)
- classifier.classify(gen_feature('Neo'))
- classifier.show_most_informative_features(5) # the 5 most informative features
- nltk.classify.accuracy(classifier,testset) # evaluate accuracy
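End to end, the gender classifier above runs as one sketch; the NLTK book reports accuracy around 0.75 for the last-letter feature alone:

```python
import random
import nltk
from nltk.corpus import names   # requires nltk.download('names')

def gen_feature(word):
    return {'last_letter': word[-1]}

labeled = ([(n, 'male') for n in names.words('male.txt')] +
           [(n, 'female') for n in names.words('female.txt')])
random.shuffle(labeled)
featuresets = [(gen_feature(n), g) for (n, g) in labeled]
trainset, testset = featuresets[500:], featuresets[:500]

classifier = nltk.NaiveBayesClassifier.train(trainset)
print(classifier.classify(gen_feature('Neo')))      # typically 'male'
print(nltk.classify.accuracy(classifier, testset))  # ~0.75
classifier.show_most_informative_features(5)
```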