赞
踩
# -*- coding: utf-8 -*-
"""LDA topic modelling of Chinese review comments.

Pipeline: read the comment column from a CSV file, clean each comment,
segment it with jieba while dropping stop words, build a gensim dictionary
and bag-of-words corpus, train LDA models, and plot topic coherence against
the number of topics.
"""
import re
import warnings

import jieba
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel

# Silence every warning raised from here on to keep the console output readable.
warnings.filterwarnings('ignore')

PATH = "data_lda.csv"

# Stop words: one word per line in the file below.
stop_words_path = "stop_words.txt"
stop_words = []

# Path of the reserved-word file handed to jieba as a user dictionary.
# Line format: word frequency(optional) part-of-speech(optional)
# One word per line.
reserved_words_path = "reserved_words.txt"

with open(stop_words_path, encoding='utf-8') as f:
    # The `with` block closes the file; no explicit close() is needed.
    stop_words.extend(line.strip() for line in f)

# Patterns used by processing(), compiled once instead of on every call.
_BRACKETED_RE = re.compile("【.+?】")
# \W already covers newlines and all other whitespace, so a single pass over
# "non-word character or digit" removes the same characters as stripping
# newlines, punctuation and digits one after another.
_NON_WORD_OR_DIGIT_RE = re.compile(r'[\W\d]')

# Path of the user dictionary most recently loaded into jieba (None = none yet).
_loaded_userdict_path = None


def _ensure_userdict_loaded():
    """Load the reserved-word dictionary into jieba once per distinct path."""
    global _loaded_userdict_path
    if _loaded_userdict_path != reserved_words_path:
        jieba.load_userdict(reserved_words_path)
        _loaded_userdict_path = reserved_words_path


# Data cleaning; adapt it to your own needs.
def processing(text):
    """Return *text* with bracketed tags, punctuation, whitespace and digits removed.

    Args:
        text: Raw comment string.

    Returns:
        The cleaned string. Underscores survive because the regex class
        ``\\w`` includes them.
    """
    # Drop "【xx】" segments (their content is usually not written by the user).
    text = _BRACKETED_RE.sub("", text)
    return _NON_WORD_OR_DIGIT_RE.sub("", text)


# Chinese word segmentation of a single sentence.
def seg_depart(sentence):
    """Segment *sentence* with jieba and drop stop words.

    Args:
        sentence: The (already cleaned) sentence to segment.

    Returns:
        A string in which every kept token is followed by a single space
        (so a non-empty result ends with a trailing space).
    """
    # The user dictionary is loop-invariant: load it once rather than
    # re-reading the file for every sentence.
    _ensure_userdict_loaded()
    # Snapshot the stop words into a set for O(1) membership tests; built per
    # call so later changes to the module-level list are still honoured.
    blocked = set(stop_words)
    return "".join(
        word + " "
        for word in jieba.cut(sentence.strip())
        if word not in blocked
    )


def get_data_set(path):
    """Read the "审稿意见" column of the CSV at *path*.

    Args:
        path: Path of the CSV file.

    Returns:
        A list with one whitespace-stripped string per non-empty cell.
        The printed row count includes empty cells; they are skipped in the
        returned list because a missing value has no text to strip.
    """
    data = pd.read_csv(path)
    column = data["审稿意见"]
    print("一共有{}行数据".format(len(column)))
    return [text.strip() for text in column.dropna().astype(str)]


def get_fen_ci_data(data):
    """Clean and segment every document in *data*.

    Args:
        data: Iterable of raw comment strings.

    Returns:
        A list of token lists, one per input document.
    """
    output = [seg_depart(processing(line)).split() for line in data]
    print("分词成功!!!")
    return output


# Metrics can be used both to judge a model and to choose the number of topics.
# LDA topic models are usually evaluated with perplexity and topic coherence:
# lower perplexity or higher coherence indicates a better model. Some studies
# suggest that perplexity is not a good indicator, so coherence is normally
# used here to evaluate the model and pick the number of topics.


# Perplexity-based score.
def perplexity(topics_num):
    """Train an LDA model with *topics_num* topics and score it on the corpus.

    Reads the module-level ``corpus`` and ``dictionary`` created in the
    script's main section.

    Args:
        topics_num: Number of topics to train.

    Returns:
        The value of ``LdaModel.log_perplexity(corpus)``. Per the gensim
        documentation this is the per-word likelihood bound, with
        perplexity = 2 ** (-bound), so a HIGHER return value corresponds to a
        lower perplexity.
    """
    print("\n#######number of topics is {}#######\n".format(topics_num))
    # random_state is fixed, like in the other model builders in this file,
    # so that repeated runs are comparable.
    lda_model = LdaModel(corpus, num_topics=topics_num, id2word=dictionary,
                         passes=30, random_state=1)
    print(lda_model.print_topics(num_topics=topics_num, num_words=15))
    # Evaluate once so the printed value is exactly the returned one.
    bound = lda_model.log_perplexity(corpus)
    print(bound)
    return bound


# Coherence score.
def coherence(topics_num):
    """Train an LDA model with *topics_num* topics and return its c_v coherence.

    Reads the module-level ``corpus``, ``dictionary`` and ``fen_ci_data``
    created in the script's main section.

    Args:
        topics_num: Number of topics to train.

    Returns:
        The coherence value reported by gensim's ``CoherenceModel``.
    """
    print("\n####### number of topics is {} #######\n".format(topics_num))
    lda_model = LdaModel(corpus, num_topics=topics_num, id2word=dictionary,
                         passes=30, random_state=1)
    print(lda_model.print_topics(num_topics=topics_num, num_words=10))
    lda_cm = CoherenceModel(model=lda_model, texts=fen_ci_data,
                            dictionary=dictionary, coherence='c_v')
    # Compute the score once and reuse it for printing and returning.
    score = lda_cm.get_coherence()
    print(score)
    return score


# Print the result of an LDA model.
def show_lda_result(data, topics_num, words_num):
    """Train an LDA model on *data* and print its topics.

    Args:
        data: List of token lists (one per document).
        topics_num: Number of topics to train.
        words_num: Number of words printed for each topic.
    """
    print("\n============== 主题数:{} 每个主题单词数: {} ==============".format(topics_num, words_num))
    # Build the dictionary and the bag-of-words corpus from the data passed
    # in, not from module-level globals, so the function works on any input.
    dictionary_ = corpora.Dictionary(data)
    corpus_ = [dictionary_.doc2bow(text) for text in data]
    lda_model = LdaModel(corpus_, num_topics=topics_num, id2word=dictionary_,
                         passes=30, random_state=1)
    out_put = lda_model.print_topics(num_topics=topics_num, num_words=words_num)
    for topic in out_put:
        print(topic)
    print("\n\n")


if __name__ == "__main__":
    print("Hello world!")
    print("当前停用词为: ", stop_words)

    # Load the raw documents.
    input_data = get_data_set(PATH)
    print("\n============ 读取数据 ==========\n")
    for i in input_data[:5]:
        print(i)
    print("\n######################\n")

    # Segment the documents.
    fen_ci_data = get_fen_ci_data(input_data)
    print("\n============ 分词结果 ==========\n")
    for i in fen_ci_data[:5]:
        print(i)
    print("\n######################\n")

    print("\n============ LDA模型 ==========\n")
    # These stay module-level names because perplexity() and coherence()
    # read them as globals.
    dictionary = corpora.Dictionary(fen_ci_data)  # build the dictionary
    # Bag-of-words: (token id, occurrence count) pairs per document.
    corpus = [dictionary.doc2bow(text) for text in fen_ci_data]
    for i in corpus[:5]:
        print(i)
    print("\n######################\n")

    num_words = 15  # number of words printed per topic
    num_topics = 5  # number of topics
    show_lda_result(fen_ci_data, num_topics, num_words)
    num_topics = 13  # number of topics
    show_lda_result(fen_ci_data, num_topics, num_words)

    # Configure a font that can render the Chinese labels before any text is
    # created on the figure.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False

    # Plot coherence for topic counts 1 through 14 (range's end is exclusive).
    x = range(1, 15)
    # To use perplexity instead, compute: z = [perplexity(i) for i in x]
    y = [coherence(i) for i in x]
    plt.plot(x, y)
    plt.xlabel('主题数目')
    plt.ylabel('coherence大小')
    plt.title('主题-coherence变化情况')
    plt.show()
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。