
Python Natural Language Processing: A Hands-On Tour of Common NLTK Operations

  from nltk.book import *                       # load the sample texts (text1..text9)
  text1.concordance('monstrous')                # show every occurrence of a word in context
  text1.similar('monstrous')                    # words that share contexts with 'monstrous'
  text2.common_contexts(['monstrous', 'very'])  # contexts shared by the two words
  text4.dispersion_plot(['America', 'citizen', 'democracy', 'freedom'])  # positions of each word in the text
  text3.generate()                              # generate text in the same style (may be ungrammatical)
  fdist = FreqDist(text1)                       # count word frequencies
  fdist.plot(50, cumulative=True)               # plot a cumulative curve for the 50 most frequent words
  fdist = FreqDist([len(w) for w in text1])     # the same tool counts other things, e.g. word lengths
  fdist.freq(3)                                 # proportion of samples equal to 3
  fdist.max()                                   # the most frequent sample
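
A FreqDist behaves like a counting dictionary, which is easy to check on a hand-made token list. This small sketch is mine, not part of the original walkthrough; all the methods it calls are standard FreqDist API:

  from nltk import FreqDist

  toy = FreqDist(['the', 'cat', 'sat', 'on', 'the', 'mat', 'the', 'end'])
  print(toy['the'])           # 3 -- raw count for one sample
  print(toy.most_common(2))   # top samples with their counts, [('the', 3), ...]
  print(toy.freq('the'))      # 0.375 -- count / total, i.e. 3 / 8
  print(toy.max())            # 'the' -- the most frequent sample
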
  sorted([w for w in set(text1) if w.endswith('ableness')])  # select words by a suffix test
  from nltk.corpus import gutenberg             # load the Gutenberg corpus
  gutenberg.fileids()                           # list the available text identifiers
  for fileid in gutenberg.fileids():
      num_chars = len(gutenberg.raw(fileid))    # raw() returns one string, whitespace included
      num_words = len(gutenberg.words(fileid))  # words() returns a flat list of tokens
      num_sents = len(gutenberg.sents(fileid))  # sents() returns a list of sentences, each a token list
      num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))  # distinct lowercased words
      # average word length, average sentence length, and lexical-diversity score
      # (the average number of times each word is used)
      print(int(num_chars / num_words), int(num_words / num_sents), int(num_words / num_vocab), fileid)
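
The same arithmetic packs naturally into a helper when you only need one text; this is just the loop above restated as a function (the name text_stats is my own):

  from nltk.corpus import gutenberg

  def text_stats(fileid):
      """Average word length, average sentence length, lexical-diversity score."""
      num_chars = len(gutenberg.raw(fileid))
      num_words = len(gutenberg.words(fileid))
      num_sents = len(gutenberg.sents(fileid))
      num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
      return num_chars // num_words, num_words // num_sents, num_words // num_vocab

  print(text_stats('austen-emma.txt'))  # three small integers; exact values depend on the corpus version
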
  import nltk
  from nltk.corpus import brown
  cfd = nltk.ConditionalFreqDist((genre, word) for genre in brown.categories()
                                 for word in brown.words(categories=genre))  # word counts per genre
  cfd.tabulate(conditions=['adventure', 'fiction'], cumulative=True,
               samples=['can', 'could', 'may', 'might', 'must', 'will'])  # samples must be words from the distribution
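
Each condition inside a ConditionalFreqDist is itself an ordinary FreqDist, so individual genres can be inspected directly; a couple of quick probes:

  cfd.conditions()[:5]             # the first few conditions (genres)
  cfd['adventure'].most_common(5)  # the five most frequent words in the adventure genre
  cfd['adventure']['said']         # how often 'said' occurs in adventure texts
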
  text = nltk.corpus.genesis.words('english-kjv.txt')
  bigrams = nltk.bigrams(text)                  # generate the word pairs (bigrams)
  cfd = nltk.ConditionalFreqDist(bigrams)       # condition = first word of each pair
  print(cfd['living'])                          # distribution of words that follow 'living'
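
A bigram distribution like this supports a crude text generator: start from a word and repeatedly emit the most likely successor. The helper below follows the generate_model example from the NLTK book; because it always takes the single most likely word, the output is deterministic and soon falls into a loop:

  def generate_model(cfdist, word, num=15):
      for _ in range(num):
          print(word, end=' ')
          word = cfdist[word].max()   # jump to the most likely next word

  generate_model(cfd, 'living')
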
  from nltk.corpus import wordnet as wn
  wn.synsets('motorcar')                        # look up the synsets (synonym sets) containing a word
  wn.synset('car.n.01').lemma_names()           # members of the synset ('car.n.01' names the set)
  wn.synset('whale.n.02').min_depth()           # depth of the synset in the hypernym hierarchy
  wn.synset('whale.n.02').path_similarity(wn.synset('right_whale.n.01'))  # similarity between two synsets
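
Synsets also expose the hypernym/hyponym hierarchy that min_depth() is measured against; a few more standard wordnet calls:

  motorcar = wn.synset('car.n.01')
  motorcar.definition()                          # the dictionary gloss for the synset
  motorcar.hypernyms()                           # more general synsets
  motorcar.hyponyms()[:3]                        # a few more specific synsets
  [lemma.name() for lemma in motorcar.lemmas()]  # 'car', 'auto', 'automobile', ...
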
  raw = gutenberg.raw('melville-moby_dick.txt')  # word_tokenize expects a raw string, not a Text object
  tokens = nltk.word_tokenize(raw)               # tokenize
  text = nltk.Text(tokens)                       # wrap as nltk.Text to get the convenience methods
  import re
  wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]
  [w for w in wordlist if re.search('ed$', w)]   # keep words that end in 'ed'
  re.findall(r'^(.*?)(ing|ly|ious|ies|es|s|ment)$', 'processes')  # non-greedy matching with findall
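
To see why the ? in .*? matters, compare the greedy form of the same pattern; only the split between the two groups changes:

  re.findall(r'^(.*?)(ing|ly|ious|ies|es|s|ment)$', 'processes')  # [('process', 'es')] -- non-greedy keeps the stem short
  re.findall(r'^(.*)(ing|ly|ious|ies|es|s|ment)$', 'processes')   # [('processe', 's')] -- greedy .* grabs as much as it can
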
  porter = nltk.PorterStemmer()                  # stemming: lying -> lie, but leaves 'women' unchanged
  [porter.stem(t) for t in text]
  wnl = nltk.WordNetLemmatizer()                 # lemmatization: women -> woman
  [wnl.lemmatize(w) for w in text]
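
Running the two tools side by side on a few hand-picked words makes the lying/women contrast concrete (a small sketch of mine; the expected outputs are noted as comments):

  porter = nltk.PorterStemmer()
  wnl = nltk.WordNetLemmatizer()
  for w in ['lying', 'women', 'cars']:
      print(w, porter.stem(w), wnl.lemmatize(w))
  # lying  -> lie   / lying  (the stemmer has a special case; 'lying' is also a valid noun lemma)
  # women  -> women / woman  (only the lemmatizer handles the irregular plural)
  # cars   -> car   / car    (both strip a regular plural)
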
  def is_content(word):
      return word.lower() not in ['a', 'text']   # toy stop list; lower() is a method call
  list(filter(is_content, text))                 # filter out stopwords
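
In real use you would filter against NLTK's own stopword list instead of a hand-written one; the stopwords corpus ships with NLTK (it may require nltk.download('stopwords') first):

  from nltk.corpus import stopwords

  stops = set(stopwords.words('english'))        # common English function words
  content = [w for w in text if w.lower() not in stops]
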
  nltk.pos_tag(nltk.word_tokenize('I love doing homework.'))  # part-of-speech tagging
  from collections import defaultdict            # the standard-library default dictionary
  freq = defaultdict(int)
  freq['b']                                      # missing keys default to 0
  from nltk.corpus import brown
  brown_tagged_sents = brown.tagged_sents(categories='news')  # sentences that are already tagged
  brown_sents = brown.sents(categories='news')                # untagged sentences
  size = int(len(brown_tagged_sents) * 0.9)
  train_sents = brown_tagged_sents[:size]
  test_sents = brown_tagged_sents[size:]
  bigram_tagger = nltk.BigramTagger(train_sents)              # a 2-gram tagger
  bigram_tagger.tag(brown_sents[2007])                        # tag sentence 2007 (tag() is a method)
  bigram_tagger.evaluate(test_sents)                          # score the tagger on held-out data
  bigram_tagger = nltk.BigramTagger(train_sents, backoff=nltk.DefaultTagger('NN'))  # 2-gram with a backoff tagger
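
The backoff argument generalizes to a whole chain: consult the bigram tagger first, fall back to a unigram tagger for unseen contexts, and finally to a default tag. This is the combined-tagger pattern from the NLTK book, and it scores noticeably better than a bare BigramTagger, which suffers badly from sparse data:

  t0 = nltk.DefaultTagger('NN')                     # last resort: call everything a noun
  t1 = nltk.UnigramTagger(train_sents, backoff=t0)  # most likely tag per word
  t2 = nltk.BigramTagger(train_sents, backoff=t1)   # context-aware, falls back when the context is unseen
  t2.evaluate(test_sents)                           # around 0.84 on this split in the NLTK book's run
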
  from nltk.corpus import names
  import random
  labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                   [(name, 'female') for name in names.words('female.txt')])  # renamed so the corpus is not shadowed
  random.shuffle(labeled_names)
  def gen_feature(word):
      return {'last_letter': word[-1]}           # a single feature: the final letter of the name
  featuresets = [(gen_feature(n), g) for (n, g) in labeled_names]
  trainset, testset = featuresets[500:], featuresets[:500]
  classifier = nltk.NaiveBayesClassifier.train(trainset)
  classifier.classify(gen_feature('Neo'))
  classifier.show_most_informative_features(5)   # the five most informative features
  nltk.classify.accuracy(classifier, testset)    # evaluate on the held-out set
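
A natural follow-up experiment is a richer feature extractor; the variant below is my own sketch (not from the original) that adds the first letter and repeats the same train/evaluate cycle for comparison:

  def gen_feature2(word):
      return {'first_letter': word[0].lower(),
              'last_letter': word[-1].lower()}

  featuresets2 = [(gen_feature2(n), g) for (n, g) in labeled_names]
  train2, test2 = featuresets2[500:], featuresets2[:500]
  classifier2 = nltk.NaiveBayesClassifier.train(train2)
  print(nltk.classify.accuracy(classifier2, test2))  # compare against the single-feature accuracy above
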

 
