Dataset and materials:
Link: LDA主题模型
Extraction code: rlns
Data overview (data.xlsx has two columns: content and type)
Code:
import os
import re
import pandas as pd
import jieba
import jieba.posseg as psg

####### Preprocessing
output_path = 'D:/lda/result'
file_path = 'D:/lda/data'
os.chdir(file_path)
data = pd.read_excel("data.xlsx")  # expects columns: content, type
os.chdir(output_path)
dic_file = "D:/lda/stop_dic/dict.txt"
stop_file = "D:/lda/stop_dic/stopwords.txt"

def chinese_word_cut(mytext):
    jieba.load_userdict(dic_file)  # custom dictionary for domain terms
    jieba.initialize()
    stop_list = []
    try:
        with open(stop_file, encoding='utf-8') as f:
            stop_list = [line.strip() for line in f]
    except OSError:
        print("error in stop_file")
    flag_list = ['n', 'nz', 'vn']  # keep nouns, specialized nouns, verbal nouns
    word_list = []
    # POS-tagged segmentation with jieba
    for seg_word in psg.cut(mytext):
        # word = re.sub(u'[^\u4e00-\u9fa5]', '', seg_word.word)  # optionally keep Chinese characters only
        word = seg_word.word
        # keep words of length >= 2 that are not stopwords and have an allowed POS tag
        if len(word) >= 2 and word not in stop_list and seg_word.flag in flag_list:
            word_list.append(word)
    return " ".join(word_list)

data["content_cutted"] = data.content.apply(chinese_word_cut)

####### LDA analysis
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

def print_top_words(model, feature_names, n_top_words):
    tword = []
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        topic_w = " ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
        tword.append(topic_w)
        print(topic_w)
    return tword

n_features = 1000  # keep the 1000 most frequent terms
tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                max_features=n_features,
                                stop_words='english',  # no effect on Chinese tokens; Chinese stopwords were removed during segmentation
                                max_df=0.5,
                                min_df=10)
tf = tf_vectorizer.fit_transform(data.content_cutted)

n_topics = 8
lda = LatentDirichletAllocation(n_components=n_topics,
                                max_iter=50,
                                learning_method='batch',
                                learning_offset=50,
                                # doc_topic_prior=0.1,
                                # topic_word_prior=0.01,
                                random_state=0)
lda.fit(tf)

########### Top words of each topic
n_top_words = 25
tf_feature_names = tf_vectorizer.get_feature_names_out()  # get_feature_names() in sklearn < 1.0
topic_word = print_top_words(lda, tf_feature_names, n_top_words)

########### Dominant topic of each document
import numpy as np

topics = lda.transform(tf)  # document-topic distribution matrix
data['topic'] = np.argmax(topics, axis=1)  # index of each document's strongest topic
data.to_excel("data_topic.xlsx", index=False)
topics[0]  # topic distribution of document 0

########### Visualization
import pyLDAvis
import pyLDAvis.sklearn  # renamed to pyLDAvis.lda_model in pyLDAvis >= 3.4 (see below)

pyLDAvis.enable_notebook()
pic = pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)
pyLDAvis.display(pic)
pyLDAvis.save_html(pic, 'lda_pass' + str(n_topics) + '.html')  # the HTML file is saved in the working directory
pyLDAvis.show(pic)

########### Perplexity
import matplotlib.pyplot as plt

plexs = []
n_max_topics = 16
for i in range(1, n_max_topics):  # note: this loop overwrites the lda variable fitted above
    print(i)
    lda = LatentDirichletAllocation(n_components=i,
                                    max_iter=50,
                                    learning_method='batch',
                                    learning_offset=50,
                                    random_state=0)
    lda.fit(tf)
    plexs.append(lda.perplexity(tf))

n_t = 15  # right end of the plotted range; must be smaller than n_max_topics
x = list(range(1, n_t + 1))
plt.plot(x, plexs[0:n_t])  # plexs[i] is the perplexity for i + 1 topics
plt.xlabel("number of topics")
plt.ylabel("perplexity")
plt.show()

Visualization result:
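On newer installs the pyLDAvis.sklearn import above fails: pyLDAvis 3.4 renamed its scikit-learn adapter to pyLDAvis.lda_model. A minimal sketch of the equivalent call, assuming the lda, tf, and tf_vectorizer objects built above:

import pyLDAvis
import pyLDAvis.lda_model  # replaces pyLDAvis.sklearn in pyLDAvis >= 3.4

pic = pyLDAvis.lda_model.prepare(lda, tf, tf_vectorizer)
pyLDAvis.save_html(pic, 'lda_pass' + str(n_topics) + '.html')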
As the perplexity curve above shows, the model performs well when the number of topics is set to 8.
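To read the same choice off the numbers rather than the figure, a quick check of the curve's minimum, assuming the plexs list filled by the loop above:

import numpy as np

best_k = int(np.argmin(plexs)) + 1  # plexs[0] holds the perplexity for 1 topic
print("lowest perplexity at", best_k, "topics")

Keep in mind that perplexity often keeps falling as topics are added, so the minimum (or the elbow of the curve) is a heuristic, not a guarantee of interpretable topics.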
Processing result (data_topic.xlsx, with each document's dominant topic in the topic column)
Reference: uploader 五连单排一班