
LDA topic model: a code implementation (the measured perplexity comes out positive)

LDA topic model code

Computing the perplexity and extracting topic words requires three files: the unsegmented dataset, a user dictionary, and a stopword list. In file paths, prefer / over \.
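For orientation, here is a minimal, hypothetical sketch of what the three input files are assumed to look like. The file names match the script below; the example rows are placeholders (the "牛马 n / 摸鱼 v" entries come from the dictionary example mentioned in the code), not data from the article.

    # Hypothetical sample inputs (placeholder rows) matching the file names used below.
    # 数据分析师测试.txt : raw documents, first line is the header "content"
    # dict.txt           : one "word POS" entry per line for jieba.load_userdict
    # stopword.txt       : one stopword per line
    with open("数据分析师测试.txt", "w", encoding="utf-8") as f:
        f.write("content\n负责数据分析报表的制作与维护\n熟悉SQL和Python者优先\n")
    with open("dict.txt", "w", encoding="utf-8") as f:
        f.write("数据分析 n\n牛马 n\n摸鱼 v\n")
    with open("stopword.txt", "w", encoding="utf-8") as f:
        f.write("的\n了\n和\n")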

    import numpy as np
    import pandas as pd
    import re
    import jieba
    import jieba.posseg as psg
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
    from sklearn.decomposition import LatentDirichletAllocation
    import pyLDAvis
    import pyLDAvis.sklearn
    import matplotlib.pyplot as plt

    data = pd.read_table("数据分析师测试.txt")  # the first line of the file is the header "content"
    dic_file = "dict.txt"       # POS dictionary, one "word POS" per line, e.g. "牛马 n", "摸鱼 v"
    stop_file = "stopword.txt"  # one stopword per line

    def chinese_word_cut(mytext):
        jieba.load_userdict(dic_file)
        jieba.initialize()
        try:
            stopword_list = open(stop_file, encoding='utf-8')
        except:
            stopword_list = []
            print("error in stop_file")
        stop_list = []
        flag_list = ['n', 'nz', 'vn']  # keep only these POS tags
        for line in stopword_list:
            line = re.sub(u'\n|\\r', '', line)
            stop_list.append(line)
        word_list = []
        # jieba segmentation with POS tags
        seg_list = psg.cut(mytext)
        for seg_word in seg_list:
            word = re.sub(u'[^\u4e00-\u9fa5]', '', seg_word.word)  # keep Chinese characters only
            # word = seg_word.word  # to analyse English text, comment out the line above and enable this one
            find = 0
            for stop_word in stop_list:
                if stop_word == word or len(word) < 2:  # this word is a stopword (or too short)
                    find = 1
                    break
            if find == 0 and seg_word.flag in flag_list:
                word_list.append(word)
        return (" ").join(word_list)

    data["content_cutted"] = data.content.apply(chinese_word_cut)

    def print_top_words(model, feature_names, n_top_words):
        tword = []
        for topic_idx, topic in enumerate(model.components_):
            print("Topic #%d:" % topic_idx)
            topic_w = " ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
            tword.append(topic_w)
            print(topic_w)
        return tword

    n_features = 1000  # keep the 1000 most frequent terms
    tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                    max_features=n_features,
                                    stop_words='english',
                                    max_df=0.5,
                                    min_df=10)
    tf = tf_vectorizer.fit_transform(data.content_cutted)

    n_topics = 8
    lda = LatentDirichletAllocation(n_components=n_topics, max_iter=50,
                                    learning_method='batch',
                                    learning_offset=50,
                                    # doc_topic_prior=0.1,
                                    # topic_word_prior=0.01,
                                    random_state=0)
    lda.fit(tf)

    n_top_words = 25
    tf_feature_names = tf_vectorizer.get_feature_names_out()
    topic_word = print_top_words(lda, tf_feature_names, n_top_words)

    topics = lda.transform(tf)
    topic = []
    for t in topics:
        topic.append("Topic #" + str(list(t).index(np.max(t))))
    data['概率最大的主题序号'] = topic        # index of the most probable topic per document
    data['每个主题对应概率'] = list(topics)   # full topic distribution per document
    data.to_excel("data_topic.xlsx", index=False)

    # Perplexity over different numbers of topics
    plexs = []
    scores = []
    n_max_topics = 16
    for i in range(1, n_max_topics):
        print(i)
        lda = LatentDirichletAllocation(n_components=i, max_iter=50,
                                        learning_method='batch',
                                        learning_offset=50, random_state=0)
        lda.fit(tf)
        plexs.append(lda.perplexity(tf))
        scores.append(lda.score(tf))

    n_t = 15  # right edge of the plotted range; must not exceed n_max_topics - 1
    x = list(range(1, n_t + 1))
    plt.plot(x, plexs[0:n_t])
    plt.xlabel("number of topics")
    plt.ylabel("perplexity")
    plt.savefig('困惑度' + '.png')  # perplexity curve
    print("perplexity plot saved")
    plt.show()
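pyLDAvis and pyLDAvis.sklearn are imported at the top of this script but never called. A minimal sketch of how the fitted model is typically visualised with them, assuming a pyLDAvis release older than 3.4.0 (newer releases expose the same prepare() as pyLDAvis.lda_model instead of pyLDAvis.sklearn):

    # Sketch: interactive visualisation of the sklearn LDA model fitted above.
    # Assumes pyLDAvis < 3.4.0; on 3.4+ use pyLDAvis.lda_model.prepare instead.
    vis = pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)
    pyLDAvis.save_html(vis, 'lda_topics.html')  # open the generated HTML file in a browser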

Code source: adapted from the original uploader's tutorial.

Text segmentation + word cloud code. The word cloud makes it easy to spot meaningless high-frequency words, which you can then add to the stopword list (see the sketch after the word cloud script).

    # Note: the input file must be UTF-8 encoded!
    # Text segmentation
    import jieba as jb
    # import docx
    import re

    # stopWordsFile is the stopword list, fileName the text to process,
    # writeFile the new file that receives the preprocessed text

    # Load the stopwords and return them as a list
    def loadStopWords(fileName):
        with open(fileName, encoding='utf-8', errors='ignore') as f:
            stopwords = f.read()
        stopwords_list = stopwords.split('\n')
        custom_stopwords_list = [i for i in stopwords_list]
        custom_stopwords_list.extend(['二'])
        return custom_stopwords_list

    # Segment the text, drop stopwords and return the tokens as a list
    def wordsCut(words, stopWordsFile):
        result = jb.cut(words)
        newWords = []
        stopWords = loadStopWords(stopWordsFile)
        for s in result:
            if s not in stopWords:
                newWords.append(s)
        return newWords

    # Drop empty / whitespace-only tokens
    def not_empty(s):
        return s and s.strip()

    # Segment the sample file line by line and write the result to writeFile
    def fileCut(fileName, writeFile, stopWordsFile):
        dataMat = []
        fr = open(fileName, encoding='utf-8', errors='ignore')  # also works with gbk / gb18030
        frW = open(writeFile, 'w', encoding='utf-8', errors='ignore')
        for line in fr.readlines():       # read the file line by line
            curLine = line.strip()        # strip surrounding whitespace
            curLine1 = curLine.upper()    # uppercase all English letters
            cutWords = wordsCut(curLine1, stopWordsFile)  # segment and remove stopwords; one line as a list
            cutWords = list(filter(not_empty, cutWords))
            for i in range(len(cutWords)):
                if re.match(r'^\d.*', cutWords[i]):  # drop tokens that start with a digit
                    cutWords[i] = ''
                else:
                    frW.write(cutWords[i])
                    frW.write(' ')
            frW.write('\n')
            dataMat.append(cutWords)
        fr.close()
        frW.close()

    # stopWordsFile: stopword list; fileName: text to process; writeFile: output file
    fileName = r'D:/Project/database/数据分析师测试.txt'
    writeFile = r'D:/Project/database/数据分析师测试已分词.txt'
    stopWordsFile = r'D:/Project/stopword.txt'
    fileCut(fileName, writeFile, stopWordsFile)
    # Full word-cloud script
    import numpy as np
    import wordcloud as wc
    import matplotlib.pyplot as plt
    from PIL import Image
    import jieba

    # 1. Read the segmented text into a single string
    with open("D:/Project/岗位职责已分词.txt", mode="r", encoding='utf-8') as fp:
        text = fp.read()

    # 2. Load the mask image that shapes the word cloud
    mask = np.array(Image.open('D:/Project/小猫咪3.png'))

    # 3. Specify the stopwords
    stopwords = ['的','了','熟悉','负责','产品','供应商','新','工程师','岁','关键字','岗位职责','以上学历','专业','年龄','年','熟练','流程','质量','需求','客户','系统','具备','软件','管理','开发','公司','相关','项目','数据','设计','经验','技术','工作','合作','就是','上班','地址','查看','地图','优先','AND','职能','类别','任职']

    # 4. Create the WordCloud object; the stopwords passed here are filtered out
    word_cloud = wc.WordCloud(mask=mask, font_path="C:/Windows/Fonts/simhei.ttf",
                              stopwords=stopwords, background_color='white')

    # 5. Load the text and generate the word cloud
    word_cloud.generate(text)

    # 6. Display and save the word cloud
    plt.imshow(word_cloud)
    plt.show()
    word_cloud.to_file('小猫咪' + '.png')
    print('小猫咪.png word cloud saved')
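As noted above, the point of the word cloud is to spot noise words. One way to feed them back into the pipeline is to append them to the stopword file used by the segmentation and LDA scripts; a minimal sketch, where the word list is a placeholder for whatever you spotted in your own cloud and the path is the stopword.txt used earlier:

    # Sketch: append words identified as noise in the word cloud to the stopword list,
    # so they are filtered out on the next preprocessing / LDA run.
    noise_words = ['优先', '任职', '岗位职责']  # placeholder: words you spotted in the cloud
    with open('D:/Project/stopword.txt', 'a', encoding='utf-8') as f:
        for w in noise_words:
            f.write(w + '\n')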

LDA visualization with gensim. Note that the dataset here is already segmented. When the script finishes, it prints a local link; open it in a browser to see the interactive LDA visualization page. num_topics is the number of topics.

    from gensim.corpora import Dictionary
    from gensim.models import LdaModel
    import codecs
    import pyLDAvis.gensim  # in newer pyLDAvis releases this module is pyLDAvis.gensim_models

    if __name__ == '__main__':
        doc1 = codecs.open('D:/Project/database/华东已分词.txt', mode='r', encoding='utf-8')
        lines = doc1.readlines()
        all_set = []
        # Extra words to drop on top of the earlier stopword filtering
        listword = ['数据', '数据分析', '工作', '年', '强', '良好', '相关', '企业', '公司', '要求', '优先', '各类',
                    '良好', '具备', '熟练', '熟悉', ' - ', '类', '提供', '优先', '具有', '进行', '要求', '能力',
                    '分析', '负责', '经验', '任职', '完成', '专业', '活动', '问题']
        for line in lines:
            line = line.replace(' \n', '')
            line = line.replace('\r\n', '')
            line = line.split()
            all_set.append([w for w in line if w not in listword])
        # print(all_set)  # one nested list per line: [['xx','xx'], ['xx','xx'], ..., ['xx','xx']]

        # Build the training corpus and visualise the model
        dictionary = Dictionary(all_set)
        corpus = [dictionary.doc2bow(text) for text in all_set]
        lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=4)
        vis_data = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
        pyLDAvis.show(vis_data, open_browser=False)  # prints a local URL; open it in a browser
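pyLDAvis.show() blocks the script while it serves the page. If you would rather keep the result as a file, the same prepared data can be written to standalone HTML with pyLDAvis.save_html; a sketch (the output file name is arbitrary), placed inside the __main__ block right after prepare():

    # Sketch: save the visualisation to a standalone HTML file instead of serving it.
    pyLDAvis.save_html(vis_data, 'lda_vis.html')  # open lda_vis.html in any browser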
