
NLTK tokenization


First, read in the data.

    import pandas as pd

    # read the review data from an Excel file
    data = pd.read_excel(r'D:\python\zxzy\amazon_asin\review.xlsx')
    title = data['review_revs']
    data.head(1)

Split each review into sentences.

    # sentence segmentation
    import nltk
    from nltk.tokenize import sent_tokenize

    sent = []
    for i in title:
        sent.append(sent_tokenize(str(i)))
    sent[0:3]
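Note that sent_tokenize (and word_tokenize below) rely on NLTK's punkt tokenizer models, and later steps use the stopwords corpus and the default POS tagger. If these resources are not already present locally, a one-time download along the following lines should be enough (a minimal sketch; resource names are the standard ones in NLTK 3.x):

    # one-time download of the NLTK resources used in this post
    import nltk
    nltk.download('punkt')                       # models for sent_tokenize / word_tokenize
    nltk.download('stopwords')                   # English stopword list
    nltk.download('averaged_perceptron_tagger')  # default POS tagger
    nltk.download('universal_tagset')            # mapping used by pos_tag(tagset="universal")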

Tokenize the sentence-split result sent into words.

    # word tokenization
    from nltk.tokenize import word_tokenize

    words = []
    for i in sent:
        for j in i:
            words.extend(word_tokenize(j))
    words[0:5]

Lowercase the tokenized result words.

    # lowercase all tokens
    words_lower = [i.lower() for i in words]

Remove punctuation and stopwords from the lowercased result words_lower.

    # remove punctuation and stopwords
    from nltk.corpus import stopwords

    english_stopwords = stopwords.words("english")
    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '!', '@', '#', '%', '$', '*', '...', "'s"]
    words_clear = []
    for i in words_lower:
        if i not in english_stopwords:
            if i not in english_punctuations:
                words_clear.append(i)
    print("/".join(words_clear[0:10]))
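Membership tests against lists are linear scans, so for a large number of reviews a set-based filter is a little faster, and string.punctuation from the standard library covers most ASCII punctuation. A minimal equivalent sketch, reusing the names above:

    # equivalent filter using sets (constant-time lookups) and the stdlib punctuation list
    import string
    from nltk.corpus import stopwords

    stop_set = set(stopwords.words("english"))
    punct_set = set(string.punctuation) | {"...", "'s"}
    words_clear = [w for w in words_lower if w not in stop_set and w not in punct_set]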

Stem the cleaned result words_clear.

    # stemming with the Porter stemmer
    from nltk.stem.porter import PorterStemmer

    st = PorterStemmer()
    words_stem = [st.stem(word) for word in words_clear]
    print("/".join(words_stem[0:10]))

Convert the stemmed result into an NLTK Text object.

    # wrap the stemmed tokens in an NLTK Text object
    from nltk.text import Text

    word_text = Text(words_stem)

Identify common fixed word collocations in the review text.

    # find frequently co-occurring word pairs in the reviews
    word_text.collocations(num=50, window_size=2)
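Text.collocations() only prints its results. If the word pairs are needed for further processing, newer NLTK releases (around 3.4.5 and later, as far as I recall) also offer collocation_list(), which returns them instead; a small sketch:

    # returns a list of (word1, word2) tuples instead of printing them
    pairs = word_text.collocation_list(num=50, window_size=2)
    pairs[:5]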

Use a Counter to find the 20 most frequent words.

    # count word frequencies and show the top 20
    from collections import Counter

    words_counter = Counter(word_text)
    words_counter.most_common(20)

    # optionally save the words_counter result to a file
    #fo = open(r'D:\python\zxzy\amazon_asin\tshirt.txt', "w")
    #for i in words_counter:
    #    fo.write(i + ":" + str(words_counter[i]) + "\n")
    #fo.close()
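If the counts are to be saved, a with block closes the file automatically even when an error occurs. A minimal sketch of the commented-out save step (the output path is just the example from above, and encoding='utf-8' is an assumption about the environment):

    # write "word:count" lines, most frequent first
    with open(r'D:\python\zxzy\amazon_asin\tshirt.txt', "w", encoding="utf-8") as fo:
        for word, count in words_counter.most_common():
            fo.write(f"{word}:{count}\n")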

Look at the contexts of the high-frequency word "dress".

    # viewing a high-frequency word in context shows what the reviews actually discuss
    # to read the original review text directly, index into the raw data instead
    word_text.concordance("dress", lines=10)

Part-of-speech tagging.

    # POS tagging: collect adjectives (ADJ) and nouns (NOUN)
    from nltk.tag import pos_tag

    ADJ = []
    NOUN = []
    for a, b in pos_tag(words_stem, tagset="universal"):
        if b == "ADJ":
            ADJ.append(a)
        elif b == "NOUN":
            NOUN.append(a)
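The two lists can then be summarized with the same Counter approach as above, for example to see which adjectives reviewers use most often (a small usage sketch; note that tagging stemmed tokens is somewhat less accurate than tagging the original words, since stemming removes suffixes the tagger relies on):

    # most frequent adjectives and nouns in the reviews
    from collections import Counter

    Counter(ADJ).most_common(10)
    Counter(NOUN).most_common(10)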
