
NLP Sentiment Analysis and Visualization | Cleaning comment text, segmenting the corpus, removing stop words, building a TF-IDF matrix, and extracting topic words and topic clusters in Python

1 Preparing the text data

First, prepare the text data: the comments were scraped from Li Jiaqi's (李佳琦) livestream and saved to 评论数据.xlsx.

2 Extracting the text data and obtaining the comment content

  # Read the comments from the spreadsheet
  import xlrd

  # note: xlrd >= 2.0 dropped .xlsx support, so this needs xlrd 1.2.x
  wb = xlrd.open_workbook("评论数据.xlsx")
  sh = wb.sheet_by_index(0)
  row = sh.nrows
  Text = []
  for i in range(row):
      # the comment text sits in the second column (index 1)
      Text_Context = sh.row_values(i, 1, 2)[0]
      Text.append(Text_Context)
  del Text[0]  # drop the header row
  print(Text)
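Since xlrd 2.0 no longer opens .xlsx files, the same read can also be done with pandas; a minimal sketch, assuming the comments sit in the second column under a header row:

  import pandas as pd

  # needs the openpyxl engine for .xlsx files (pip install openpyxl)
  df = pd.read_excel("评论数据.xlsx", engine="openpyxl")
  Text = df.iloc[:, 1].astype(str).tolist()  # second column; pandas already skips the header row
  print(Text[:5])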

3 Jieba word segmentation and stop-word removal to obtain the corpus

  # Jieba word segmentation
  import jieba
  # Stop-word handling: spaCy ships a built-in Chinese stop-word list
  from spacy.lang.zh.stop_words import STOP_WORDS

  sent_words = []
  for sent0 in Text:
      try:
          l = list(jieba.cut(sent0))
          filtered_sentence = []
          for word in l:
              if word not in STOP_WORDS:
                  filtered_sentence.append(word)
          sent_words.append(filtered_sentence)
      except Exception:
          pass
  print(sent_words)
  # TfidfVectorizer expects strings, so rejoin each token list with spaces
  document = [" ".join(sent) for sent in sent_words]
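As a quick sanity check of the segmentation and filtering steps, the same pipeline can be run on a single made-up comment (the sample sentence is hypothetical, not taken from the dataset):

  import jieba
  from spacy.lang.zh.stop_words import STOP_WORDS

  sample = "李佳琦推荐的这个口红真的很好看"
  tokens = list(jieba.cut(sample))
  print(tokens)  # e.g. ['李佳琦', '推荐', '的', '这个', '口红', '真的', '很', '好看']
  print([w for w in tokens if w not in STOP_WORDS])  # particles such as '的' should be filtered out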

4 Building the TF-IDF matrix: extracting high-weight terms
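For reference, scikit-learn's TfidfVectorizer with its default smoothing computes the weight of term t in document d as below (n is the number of documents, df(t) the number of documents containing t), and then L2-normalizes each document row:

  tf-idf(t, d) = tf(t, d) × idf(t),  where idf(t) = ln((1 + n) / (1 + df(t))) + 1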

  from sklearn.feature_extraction.text import TfidfVectorizer

  tfidf_model = TfidfVectorizer().fit(document)
  # All distinct terms in the corpus
  # (on scikit-learn >= 1.2, use get_feature_names_out() instead)
  feature = tfidf_model.get_feature_names()
  print(feature)
  # The id of each feature, i.e. its index in the array above
  print(tfidf_model.vocabulary_)
  # TF-IDF values of the features in each document
  sparse_result = tfidf_model.transform(document)
  # Each row is one document, each column holds that term's TF-IDF value in the
  # document (zero if the term does not appear)
  weight = sparse_result.toarray()
  # Map each term to its highest TF-IDF value across all documents
  feature_TFIDF = {}
  for i in range(len(weight)):
      for j in range(len(feature)):
          if feature[j] not in feature_TFIDF:
              feature_TFIDF[feature[j]] = weight[i][j]
          else:
              feature_TFIDF[feature[j]] = max(feature_TFIDF[feature[j]], weight[i][j])
  # Sort by value, descending
  featureList = sorted(feature_TFIDF.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
  print('Top ten terms by TF-IDF (TF-IDF >= 1):')
  for i in range(10):
      print(featureList[i][0], featureList[i][1])
  print('Top ten terms by TF-IDF (TF-IDF < 1):')
  k = 0
  m = 0
  while k < 10 and m < len(featureList):
      if featureList[m][1] < 1:
          k += 1
          print(featureList[m][0], featureList[m][1])
      m += 1
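To see these structures on something small, here is a toy run on two made-up whitespace-tokenized "documents" (note that the default token_pattern silently drops single-character tokens, which also affects single-character Chinese words in the real corpus):

  from sklearn.feature_extraction.text import TfidfVectorizer

  toy = ["口红 好看 好看", "口红 便宜"]  # two tiny pre-tokenized documents
  model = TfidfVectorizer().fit(toy)
  print(model.vocabulary_)               # e.g. {'便宜': 0, '口红': 1, '好看': 2}
  print(model.transform(toy).toarray())  # one row per document, one column per term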

5 Results:

6 LDA topic modeling and visualization

  #!/usr/bin/python
  # -*- coding:utf-8 -*-
  # Read the comments
  import xlrd
  # Jieba word segmentation
  import jieba
  # Stop-word handling
  from spacy.lang.zh.stop_words import STOP_WORDS
  from gensim import corpora
  from gensim.models import LdaModel

  wb = xlrd.open_workbook("评论数据.xlsx")
  sh = wb.sheet_by_index(0)
  row = sh.nrows
  Text = []
  for i in range(row):
      Text_Context = sh.row_values(i, 1, 2)[0]
      Text.append(Text_Context)
  del Text[0]  # drop the header row
  print(Text)

  # Write the raw segmentation result to a file, one token per line
  file1 = open('结巴分词结果.txt', 'w')
  sent_word = []
  for sent0 in Text:
      try:
          l = list(jieba.cut(sent0))
          sent_word.append(l)
      except Exception:
          pass
  for s in sent_word:
      for w in s:
          file1.write(str(w))
          file1.write('\n')
  file1.close()

  # Remove stop words
  sent_words = []
  for l in sent_word:
      filtered_sentence = []
      for word in l:
          if word not in STOP_WORDS:
              filtered_sentence.append(word)
      sent_words.append(filtered_sentence)
  # Write the filtered tokens to a second file
  file2 = open('去除停用词后的结果.txt', 'w')
  for s in sent_words:
      for w in s:
          file2.write(w)
          file2.write('\n')
  file2.close()

  # Build the dictionary and the bag-of-words corpus, then train the LDA model
  dictionary = corpora.Dictionary(sent_words)
  corpus = [dictionary.doc2bow(text) for text in sent_words]
  lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=20, passes=60)
  # num_topics: number of topics
  # passes: number of training passes
  # num_words: number of terms printed per topic below

  # Write each topic's terms and weights to a file
  file3 = open("tf-idf值.txt", 'w')
  for topic in lda.print_topics(num_words=20):
      try:
          print(topic[0], ':', sep='')
          file3.write(str(topic[0]) + ':' + '\n')
          # topic[1] looks like '0.05*"词A" + 0.03*"词B" + ...'
          listOfTerms = topic[1].split('+')
          for term in listOfTerms:
              listItems = term.split('*')
              print(' ', listItems[1], '(', listItems[0], ')', sep='')
              file3.write(' ' + str(listItems[1]) + '(' + str(listItems[0]) + ')' + '\n')
      except Exception:
          pass
  file3.close()

  import pyLDAvis.gensim
  d = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
  '''
  lda: the trained topic model
  corpus: the bag-of-words document corpus
  dictionary: the term space
  '''
  pyLDAvis.save_html(d, 'lda_pass10.html')
  # pyLDAvis.display(d)  # renders inside a notebook output cell
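pyLDAvis 3.x renamed its gensim helper module; if the pyLDAvis.gensim import above fails, the equivalent (assuming pyLDAvis >= 3.0) is:

  import pyLDAvis
  import pyLDAvis.gensim_models

  d = pyLDAvis.gensim_models.prepare(lda, corpus, dictionary)
  pyLDAvis.save_html(d, 'lda_pass10.html')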

7 Result display

The interactive topic visualization is written to lda_pass10.html and can be opened in a browser.
