First, install the gensim library:
pip install gensim
import gensim  # pip install gensim
from gensim import corpora


def train_lda_model(all_contents, dictionary, num_topic=10):
    """Core method for training the LDA model."""
    corpus = [dictionary.doc2bow(sentence) for sentence in all_contents]
    lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topic)  # core call
    return lda


if __name__ == '__main__':
    data = list(iter(open('data.txt')))
    data = [content.split() for content in data]
    try:
        dictionary = corpora.Dictionary(data)
        num_topic = 3  # number of topics
        lda_model = train_lda_model(data, dictionary, num_topic=num_topic)  # train the LDA model
        lda_model.save('lda_' + str(num_topic) + '.model')  # save the LDA model
    except Exception as e:
        print(e)
The data.txt file read by this script contains:
in conjunction with the release of the the allen institute for ai partnered with
the recent outbreak of the deadly and highly infectious covid disease caused by
coronaviruses is related illness that vary from a common cold more severe
it is shown that the evaporation rate of a liquid sample containing the
covid illness an on going epidemic started in wuhan city china in december
in the beginning of december covid virus that slipped from animals humans in
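After training, it helps to glance at what each topic has learned. A minimal sketch using gensim's print_topics method, assuming the lda_model and num_topic variables from the script above:

# Show the top words of every learned topic.
for topic_id, topic_repr in lda_model.print_topics(num_topics=num_topic, num_words=5):
    print(topic_id, topic_repr)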
Create a new file named perplexity_cal.py with the following code:
import math

import gensim


def perplexity(ldamodel: gensim.models.LdaModel, data, dictionary: gensim.corpora.Dictionary):
    """
    Compute the perplexity of an LDA model.
    :param ldamodel: the trained LDA model
    :param data: the documents on which to compute perplexity, tokenized the same way as the training data
    :param dictionary: the Dictionary obtained from the training data via corpora.Dictionary(my_data)
    :return: the perplexity value
    """
    size_dictionary = len(dictionary.keys())
    testset = []
    for i in data:
        testset.append(dictionary.doc2bow(i))
    num_topics = ldamodel.num_topics
    prob_doc_sum = 0.0
    topic_word_list = []  # one dict per topic mapping word -> probability, e.g. {u'business': 0.0100..., u'family': 0.0088..., ...}
    for topic_id in range(num_topics):
        topic_word = ldamodel.show_topic(topic_id, size_dictionary)
        dic = {}
        for word, probability in topic_word:
            dic[word] = probability
        topic_word_list.append(dic)
    doc_topics_list = []  # per document, the doc-topic tuples: [(0, 0.00062...), (1, 0.00062...), ...]
    for doc in testset:
        doc_topics_list.append(ldamodel.get_document_topics(doc, minimum_probability=0))
    testset_word_num = 0
    for i in range(len(testset)):
        prob_doc = 0.0  # the log probability of the doc
        doc = testset[i]
        doc_word_num = 0  # the number of tokens in the doc
        for word_id, num in doc:
            prob_word = 0.0  # the probability of the word
            doc_word_num += num
            word = dictionary[word_id]
            for topic_id in range(num_topics):  # p(w) = sum_z(p(z) * p(w|z))
                prob_topic = doc_topics_list[i][topic_id][1]
                prob_topic_word = topic_word_list[topic_id][word]
                prob_word += prob_topic * prob_topic_word
            prob_doc += num * math.log(prob_word)  # log p(d) = sum over tokens of log p(w); weighted by the word's count
        prob_doc_sum += prob_doc
        testset_word_num += doc_word_num
    prep = math.exp(-prob_doc_sum / testset_word_num)  # perplexity = exp(-sum(log p(d)) / sum(Nd))
    # print("LDA model perplexity: %s" % prep)
    return prep
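For a sanity check, gensim's LdaModel also provides a built-in log_perplexity method that returns a per-word likelihood bound; gensim's own log output converts it to a perplexity estimate as 2 ** (-bound). The values will not match the hand-rolled function exactly, since the bound is a variational estimate, but they should move in the same direction. A minimal sketch, assuming the lda_model, dictionary and data variables from above:

# Build the bag-of-words corpus the same way as during training.
corpus = [dictionary.doc2bow(doc) for doc in data]
bound = lda_model.log_perplexity(corpus)   # per-word likelihood bound
print("per-word bound:", bound)
print("perplexity estimate:", 2 ** (-bound))  # gensim's own conversion convention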
Call the perplexity function in your main function:
perp = perplexity_cal.perplexity(lda_model, data, dictionary)
Here lda_model is the trained LDA model, data is the set of documents on which to compute perplexity, and dictionary is the Dictionary obtained via corpora.Dictionary(my_data) from the data used to train the LDA model.
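A typical use of this perplexity value is choosing the number of topics: train a model for several candidate topic counts and compare. A minimal sketch built only on the functions already defined above (train_lda_model and perplexity_cal.perplexity); the candidate counts are just examples, and lower perplexity generally indicates a better fit, though results on such a tiny corpus are noisy:

# Compare perplexity across several candidate topic counts.
for k in (2, 3, 5, 10):
    model_k = train_lda_model(data, dictionary, num_topic=k)
    perp_k = perplexity_cal.perplexity(model_k, data, dictionary)
    print("num_topics=%d  perplexity=%.3f" % (k, perp_k))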
To infer the topic of a new piece of text, first make sure its words appeared in the LDA training set, then write a function like this:
def get_topic_from_model(lda_model: gensim.models.ldamodel.LdaModel, text: str = "related illness that"):
    """Infer the topic distribution of a piece of text with the trained LDA model."""
    words = text.lower().split()
    # Use the dictionary stored in the model (lda_model.id2word) so that word ids
    # match the vocabulary the model was trained on; building a fresh Dictionary
    # here would produce ids the model does not understand.
    bow = lda_model.id2word.doc2bow(words)
    return lda_model.get_document_topics(bow)
Use it in the main function:
topic = get_topic_from_model(lda_model, text="related illness that")
print(topic)  # [(0, 0.08674477), (1, 0.084886044), (2, 0.8283692)]  each tuple is (topic id, probability)
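To make the output easier to read, the most probable topic id can be mapped back to its top words with show_topic. A small sketch, assuming the topic list returned above:

# Pick the most probable topic and show its highest-weighted words.
best_topic_id, best_prob = max(topic, key=lambda t: t[1])
print("best topic:", best_topic_id, "probability:", best_prob)
print(lda_model.show_topic(best_topic_id, topn=5))  # [(word, probability), ...]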
For completeness, the full project files are listed below. The data.txt data file:
in conjunction with the release of the the allen institute for ai partnered with
the recent outbreak of the deadly and highly infectious covid disease caused by
coronaviruses is related illness that vary from a common cold more severe
it is shown that the evaporation rate of a liquid sample containing the
covid illness an on going epidemic started in wuhan city china in december
in the beginning of december covid virus that slipped from animals humans in
The Main.py file:
import gensim  # pip install gensim
from gensim import corpora

import perplexity_cal


def train_lda_model(all_contents, dictionary, num_topic=10):
    """Core method for training the LDA model."""
    corpus = [dictionary.doc2bow(sentence) for sentence in all_contents]
    lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topic)  # core call
    return lda


def get_topic_from_model(lda_model: gensim.models.ldamodel.LdaModel, text: str = "related illness that"):
    """Infer the topic distribution of a piece of text with the trained LDA model."""
    words = text.lower().split()
    # Use the dictionary stored in the model (lda_model.id2word) so the word ids
    # match the vocabulary the model was trained on.
    bow = lda_model.id2word.doc2bow(words)
    return lda_model.get_document_topics(bow)


if __name__ == '__main__':
    data = list(iter(open('data.txt')))
    data = [content.split() for content in data]
    try:
        dictionary = corpora.Dictionary(data)
        num_topic = 3  # number of topics
        lda_model = train_lda_model(data, dictionary, num_topic=num_topic)  # train the LDA model
        # lda_model.save('lda_' + str(num_topic) + '.model')  # save the LDA model

        # compute perplexity
        perp = perplexity_cal.perplexity(lda_model, data, dictionary)
        print("LDA perplexity: topic:", str(num_topic) + " value: " + str(perp))

        # infer the topic of a piece of text
        topic = get_topic_from_model(lda_model, text="related illness that")
        print(topic)
    except Exception as e:
        print(e)
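If you uncomment the save call above, the model can be reloaded later without retraining. A minimal sketch, assuming the saved file is named 'lda_3.model':

# LdaModel.load is the counterpart of save.
loaded_model = gensim.models.ldamodel.LdaModel.load('lda_3.model')
print(loaded_model.num_topics)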
The perplexity_cal.py file:
import math

import gensim


def perplexity(ldamodel: gensim.models.LdaModel, data, dictionary: gensim.corpora.Dictionary):
    """
    Compute the perplexity of an LDA model.
    :param ldamodel: the trained LDA model
    :param data: the documents on which to compute perplexity, tokenized the same way as the training data
    :param dictionary: the Dictionary obtained from the training data via corpora.Dictionary(my_data)
    :return: the perplexity value
    """
    size_dictionary = len(dictionary.keys())
    testset = []
    for i in data:
        testset.append(dictionary.doc2bow(i))
    num_topics = ldamodel.num_topics
    prob_doc_sum = 0.0
    topic_word_list = []  # one dict per topic mapping word -> probability, e.g. {u'business': 0.0100..., u'family': 0.0088..., ...}
    for topic_id in range(num_topics):
        topic_word = ldamodel.show_topic(topic_id, size_dictionary)
        dic = {}
        for word, probability in topic_word:
            dic[word] = probability
        topic_word_list.append(dic)
    doc_topics_list = []  # per document, the doc-topic tuples: [(0, 0.00062...), (1, 0.00062...), ...]
    for doc in testset:
        doc_topics_list.append(ldamodel.get_document_topics(doc, minimum_probability=0))
    testset_word_num = 0
    for i in range(len(testset)):
        prob_doc = 0.0  # the log probability of the doc
        doc = testset[i]
        doc_word_num = 0  # the number of tokens in the doc
        for word_id, num in doc:
            prob_word = 0.0  # the probability of the word
            doc_word_num += num
            word = dictionary[word_id]
            for topic_id in range(num_topics):  # p(w) = sum_z(p(z) * p(w|z))
                prob_topic = doc_topics_list[i][topic_id][1]
                prob_topic_word = topic_word_list[topic_id][word]
                prob_word += prob_topic * prob_topic_word
            prob_doc += num * math.log(prob_word)  # log p(d) = sum over tokens of log p(w); weighted by the word's count
        prob_doc_sum += prob_doc
        testset_word_num += doc_word_num
    prep = math.exp(-prob_doc_sum / testset_word_num)  # perplexity = exp(-sum(log p(d)) / sum(Nd))
    # print("LDA model perplexity: %s" % prep)
    return prep