LDA (Latent Dirichlet Allocation) is a widely used topic model that can uncover hidden topic structure in large collections of text.
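As a quick illustration of what LDA produces before the full pipeline below, here is a minimal, self-contained gensim sketch; the four toy documents and the choice of num_topics=2 are invented for demonstration:

from gensim.corpora import Dictionary
from gensim.models import LdaModel

# Hypothetical toy corpus: each document is a list of tokens
docs = [["scenery", "mountain", "view", "beautiful"],
        ["ticket", "queue", "price", "expensive"],
        ["mountain", "hike", "view", "sunrise"],
        ["queue", "ticket", "crowded", "price"]]

dictionary = Dictionary(docs)                       # map tokens to integer ids
corpus = [dictionary.doc2bow(doc) for doc in docs]  # bag-of-words vectors

lda = LdaModel(corpus=corpus, num_topics=2, id2word=dictionary, passes=10)
for topic_id, words in lda.print_topics():
    print(topic_id, words)  # each topic is a weighted mixture of words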
Required libraries
import pandas as pd
import matplotlib.pyplot as plt
import jieba
import jieba.posseg as pseg
from gensim.corpora import Dictionary
from gensim.models import LdaModel
import os
import re
import pyLDAvis.gensim_models
The ScenicSpotReviewAnalysis class
class ScenicSpotReviewAnalysis:
    def __init__(self, data_path, stopwords_path):
        self.data_path = data_path                              # root directory of the txt review files
        self.stopwords = self._load_stopwords(stopwords_path)   # stopword list
        self.all_texts = self._load_data()                      # load and preprocess the review data
        self.dictionary = None                                  # gensim dictionary (built during training)
        self.corpus = None                                      # bag-of-words corpus (built during training)

    def _load_stopwords(self, path):
        # Load stopwords, one word per line
        with open(path, encoding="utf8") as f:
            stopwords = f.read().split("\n")
        stopwords.append("\n")
        # Additional domain-specific words to exclude
        extra_stopwords = ['地方', '总体', '真的', '建议', '小时', '一座']
        stopwords.extend(extra_stopwords)
        return stopwords

    def _load_data(self):
        # Load the review data and preprocess it, one entry per subdirectory
        all_texts = {}
        for pathdir in os.listdir(self.data_path):
            if not pathdir.endswith(".DS_Store"):
                subfilepath = os.path.join(self.data_path, pathdir)
                month_text = []
                if os.path.isdir(subfilepath):
                    for file in os.listdir(subfilepath):
                        filepath = os.path.join(subfilepath, file)
                        with open(filepath, encoding="utf-8") as f:
                            text = f.read()
                        text = "".join(re.findall(r'[\u4e00-\u9fa5]+', text))  # keep Chinese characters only
                        words = jieba.lcut(text)  # tokenize with jieba
                        valid_words = [word for word in words if word not in self.stopwords]
                        month_text.append(" ".join(valid_words))
                all_texts[pathdir] = month_text
        return all_texts

    def filter_pos(self, target_texts):
        # Keep only nouns and adjectives longer than one character
        filtered_texts = []
        for sentence in target_texts:
            filtered_words = []
            words = pseg.cut(sentence)
            for word, flag in words:
                if (flag.startswith('n') or flag.startswith('a')) and len(word) > 1:
                    filtered_words.append(word)
            filtered_texts.append(' '.join(filtered_words))
        return filtered_texts

    def train_lda_models(self, texts, num_topics_range):
        self.dictionary = Dictionary(texts)
        self.corpus = [self.dictionary.doc2bow(tmp) for tmp in texts]
        for num_topics in num_topics_range:
            lda_model = LdaModel(corpus=self.corpus, num_topics=num_topics,
                                 id2word=self.dictionary, passes=20)
            lda_model.save(f'./lda_{num_topics}_{len(texts)}.model')

    # Plot the perplexity curve over candidate topic counts
    def plot_perplexity(self, num_topics_range):
        x_list, y_list = [], []
        for num_topics in num_topics_range:
            try:
                lda_model = LdaModel.load(f'./lda_{num_topics}_{len(self.corpus)}.model')
                perplexity = lda_model.log_perplexity(self.corpus)
                x_list.append(num_topics)
                y_list.append(perplexity)
            except Exception as e:
                print('Error:', e)
        plt.plot(x_list, y_list)
        plt.xlabel('Num Topics')
        plt.ylabel('Perplexity Score')
        plt.show()
        # elbow_point = x_list[y_list.index(min(y_list))]
        # print("Suggested number of topics:", elbow_point)

    def visualize_topics(self, num_topics):
        lda_model = LdaModel.load(f'./lda_{num_topics}_{len(self.corpus)}.model')
        top_topics = lda_model.top_topics(self.corpus)
        # Average coherence of the top topics (computed but not used further here)
        avg_topic_coherence = sum(t[1] for t in top_topics) / num_topics
        vis = pyLDAvis.gensim_models.prepare(lda_model, self.corpus, self.dictionary,
                                             mds='tsne', sort_topics=False)
        pyLDAvis.save_html(vis, "LDA.html")
        return vis
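Perplexity alone can be hard to read, since it often keeps improving as the topic count grows; a common cross-check is topic coherence, which the class already touches via top_topics. A minimal sketch using gensim's CoherenceModel, assuming lda_model, texts, and dictionary are the objects produced by train_lda_models:

from gensim.models import CoherenceModel

def coherence_for(lda_model, texts, dictionary):
    # c_v coherence over the tokenized documents; higher is better
    cm = CoherenceModel(model=lda_model, texts=texts,
                        dictionary=dictionary, coherence='c_v')
    return cm.get_coherence()

Looping this over num_topics_range alongside the perplexity plot and picking the count where coherence peaks is a common way to confirm the elbow choice.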
Running the analysis
if __name__ == '__main__':
    data_path = "./景点评论数据/景点评论数据/txt"
    stopwords_path = "./stopwords.txt"
    analysis = ScenicSpotReviewAnalysis(data_path, stopwords_path)
    phase_1 = analysis.all_texts['P1']
    num_topics_range = range(1, 16)
    # Apply filter_pos to each document
    filtered_phase_1 = [analysis.filter_pos([doc])[0] for doc in phase_1]
    analysis.train_lda_models([a.split() for a in filtered_phase_1], num_topics_range)
    analysis.plot_perplexity(num_topics_range)
    vis = analysis.visualize_topics(3)  # number of topics to visualize
Running the code produces a perplexity curve that helps determine the best number of topics: look for the point where the rate of change starts to fall off noticeably, commonly called the elbow point.
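The commented-out heuristic in plot_perplexity simply picks the minimum of the curve, which is not the same as the elbow. One rough alternative, sketched here as an assumption rather than part of the original pipeline, is to take the point of maximum curvature via second differences of the x_list/y_list values built inside plot_perplexity:

def find_elbow(x_list, y_list):
    # Approximate curvature with second differences; assumes at least
    # three points and evenly spaced x values. The elbow is where the
    # change in slope is largest.
    second_diffs = [abs(y_list[i - 1] - 2 * y_list[i] + y_list[i + 1])
                    for i in range(1, len(y_list) - 1)]
    return x_list[second_diffs.index(max(second_diffs)) + 1]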
The interactive visualization is saved to LDA.html.