pip install gensim
import gensim  # pip install gensim
from gensim import corpora


def train_lda_model(all_contents, dictionary, num_topic=10):
    """Train an LDA topic model on a tokenized corpus.

    Args:
        all_contents: Iterable of documents, each a list of string tokens.
        dictionary: ``corpora.Dictionary`` used to convert the documents to
            bag-of-words vectors; it is also passed to the model as its
            ``id2word`` mapping.
        num_topic: Number of topics to fit.

    Returns:
        The trained ``gensim.models.ldamodel.LdaModel``.
    """
    corpus = [dictionary.doc2bow(sentence) for sentence in all_contents]
    return gensim.models.ldamodel.LdaModel(
        corpus=corpus, id2word=dictionary, num_topics=num_topic
    )


if __name__ == '__main__':
    # Each line of data.txt is one document; tokens are whitespace-separated.
    # The context manager guarantees the file handle is closed after reading.
    with open('data.txt') as data_file:
        data = [content.split() for content in data_file]
    try:
        dictionary = corpora.Dictionary(data)
        num_topic = 3  # number of topics
        lda_model = train_lda_model(data, dictionary, num_topic=num_topic)  # train the LDA model
        lda_model.save('lda_' + str(num_topic) + '.model')  # persist the LDA model
    except Exception as e:
        # Top-level script boundary: report the failure instead of a traceback.
        print(e)
in conjunction with the release of the the allen institute for ai partnered with
the recent outbreak of the deadly and highly infectious covid disease caused by
coronaviruses is related illness that vary from a common cold more severe
it is shown that the evaporation rate of a liquid sample containing the
covid illness an on going epidemic started in wuhan city china in december
in the beginning of december covid virus that slipped from animals humans in
import math

import gensim


def perplexity(ldamodel: gensim.models.LdaModel, data, dictionary: gensim.corpora.Dictionary):
    """
    Compute the perplexity of an LDA model on a tokenized corpus.

    Each word probability is p(w|d) = sum_z p(z|d) * p(w|z), and the result is
    exp(-sum_d log p(d) / sum_d N_d), where N_d is the number of tokens in
    document d and log p(d) sums log p(w|d) over every token of d.

    :param ldamodel: trained LDA model
    :param data: tokenized documents (a list of token lists) to evaluate on
    :param dictionary: the Dictionary obtained with corpora.Dictionary(my_data)
        from the data my_data that was used to train the gensim model
    :return: the perplexity
    :raises ZeroDivisionError: if ``data`` yields no tokens known to ``dictionary``
    """
    size_dictionary = len(dictionary.keys())
    testset = [dictionary.doc2bow(tokens) for tokens in data]
    num_topics = ldamodel.num_topics

    # One {word: p(word | topic)} mapping per topic. Asking show_topic for as
    # many words as the dictionary holds covers the whole vocabulary.
    topic_word_list = [
        dict(ldamodel.show_topic(topic_id, size_dictionary))
        for topic_id in range(num_topics)
    ]

    log_prob_sum = 0.0
    testset_word_num = 0
    for doc in testset:
        # Key the doc-topic distribution by topic id rather than by list
        # position: the returned list is sparse, so positional indexing would
        # misalign (or raise IndexError) whenever a topic is left out.
        doc_topics = dict(ldamodel.get_document_topics(doc, minimum_probability=0))
        for word_id, num in doc:
            word = dictionary[word_id]
            # p(w) = sum_z p(z) * p(w|z)
            prob_word = sum(
                doc_topics.get(topic_id, 0.0) * topic_word_list[topic_id][word]
                for topic_id in range(num_topics)
            )
            # Weight by the occurrence count so the log-likelihood covers the
            # same tokens that are counted in the denominator.
            log_prob_sum += num * math.log(prob_word)
            testset_word_num += num

    # perplexity = exp(-sum(log p(d)) / sum(N_d))
    return math.exp(-log_prob_sum / testset_word_num)
# Evaluate lda_model's perplexity on data using the perplexity_cal module.
perp = perplexity_cal.perplexity(lda_model, data, dictionary)
def get_topic_from_model(lda_model: gensim.models.ldamodel.LdaModel, text: str = "related illness that"):
    """Infer the topic distribution of a text with a trained LDA model.

    Args:
        lda_model: Trained LDA model. Its ``id2word`` mapping must provide
            ``doc2bow`` (e.g. the ``corpora.Dictionary`` given at training).
        text: Raw text; it is lower-cased and split on whitespace.

    Returns:
        The result of ``lda_model.get_document_topics`` for the text: a list
        of ``(topic_id, probability)`` pairs.
    """
    tokens = text.lower().split()
    # Encode with the model's own vocabulary. A Dictionary built from the query
    # text alone would assign fresh ids (0, 1, 2, ...) that point at unrelated
    # words in the model, making the inferred topics meaningless.
    bow = lda_model.id2word.doc2bow(tokens)
    return lda_model.get_document_topics(bow)
# Infer the topic distribution of a short sample text.
topic = get_topic_from_model(lda_model, text="related illness that")
print(topic) # [(0, 0.08674477), (1, 0.084886044), (2, 0.8283692)] each pair reads as (topic: probability)
in conjunction with the release of the the allen institute for ai partnered with
the recent outbreak of the deadly and highly infectious covid disease caused by
coronaviruses is related illness that vary from a common cold more severe
it is shown that the evaporation rate of a liquid sample containing the
covid illness an on going epidemic started in wuhan city china in december
in the beginning of december covid virus that slipped from animals humans in
import gensim  # pip install gensim
from gensim import corpora
import perplexity_cal


def train_lda_model(all_contents, dictionary, num_topic=10):
    """Train an LDA topic model on a tokenized corpus.

    Args:
        all_contents: Iterable of documents, each a list of string tokens.
        dictionary: ``corpora.Dictionary`` used to convert the documents to
            bag-of-words vectors; it is also passed to the model as its
            ``id2word`` mapping.
        num_topic: Number of topics to fit.

    Returns:
        The trained ``gensim.models.ldamodel.LdaModel``.
    """
    corpus = [dictionary.doc2bow(sentence) for sentence in all_contents]
    return gensim.models.ldamodel.LdaModel(
        corpus=corpus, id2word=dictionary, num_topics=num_topic
    )


def get_topic_from_model(lda_model: gensim.models.ldamodel.LdaModel, text: str = "related illness that"):
    """Infer the topic distribution of a text with a trained LDA model.

    Args:
        lda_model: Trained LDA model. Its ``id2word`` mapping must provide
            ``doc2bow`` (e.g. the ``corpora.Dictionary`` given at training).
        text: Raw text; it is lower-cased and split on whitespace.

    Returns:
        The result of ``lda_model.get_document_topics`` for the text: a list
        of ``(topic_id, probability)`` pairs.
    """
    tokens = text.lower().split()
    # Encode with the model's own vocabulary. A Dictionary built from the query
    # text alone would assign fresh ids (0, 1, 2, ...) that point at unrelated
    # words in the model, making the inferred topics meaningless.
    bow = lda_model.id2word.doc2bow(tokens)
    return lda_model.get_document_topics(bow)


if __name__ == '__main__':
    # Each line of data.txt is one document; tokens are whitespace-separated.
    # The context manager guarantees the file handle is closed after reading.
    with open('data.txt') as data_file:
        data = [content.split() for content in data_file]
    try:
        dictionary = corpora.Dictionary(data)
        num_topic = 3  # number of topics
        lda_model = train_lda_model(data, dictionary, num_topic=num_topic)  # train the LDA model
        # lda_model.save('lda_' + str(num_topic) + '.model')  # persist the LDA model

        # Compute the perplexity of the model.
        perp = perplexity_cal.perplexity(lda_model, data, dictionary)
        print("LDA困惑度: topic:", str(num_topic) + " value: " + str(perp))

        # Infer the topics of a sample text.
        topic = get_topic_from_model(lda_model, text="related illness that")
        print(topic)
    except Exception as e:
        # Top-level script boundary: report the failure instead of a traceback.
        print(e)
import math

import gensim


def perplexity(ldamodel: gensim.models.LdaModel, data, dictionary: gensim.corpora.Dictionary):
    """
    Compute the perplexity of an LDA model on a tokenized corpus.

    Each word probability is p(w|d) = sum_z p(z|d) * p(w|z), and the result is
    exp(-sum_d log p(d) / sum_d N_d), where N_d is the number of tokens in
    document d and log p(d) sums log p(w|d) over every token of d.

    :param ldamodel: trained LDA model
    :param data: tokenized documents (a list of token lists) to evaluate on
    :param dictionary: the Dictionary obtained with corpora.Dictionary(my_data)
        from the data my_data that was used to train the gensim model
    :return: the perplexity
    :raises ZeroDivisionError: if ``data`` yields no tokens known to ``dictionary``
    """
    size_dictionary = len(dictionary.keys())
    testset = [dictionary.doc2bow(tokens) for tokens in data]
    num_topics = ldamodel.num_topics

    # One {word: p(word | topic)} mapping per topic. Asking show_topic for as
    # many words as the dictionary holds covers the whole vocabulary.
    topic_word_list = [
        dict(ldamodel.show_topic(topic_id, size_dictionary))
        for topic_id in range(num_topics)
    ]

    log_prob_sum = 0.0
    testset_word_num = 0
    for doc in testset:
        # Key the doc-topic distribution by topic id rather than by list
        # position: the returned list is sparse, so positional indexing would
        # misalign (or raise IndexError) whenever a topic is left out.
        doc_topics = dict(ldamodel.get_document_topics(doc, minimum_probability=0))
        for word_id, num in doc:
            word = dictionary[word_id]
            # p(w) = sum_z p(z) * p(w|z)
            prob_word = sum(
                doc_topics.get(topic_id, 0.0) * topic_word_list[topic_id][word]
                for topic_id in range(num_topics)
            )
            # Weight by the occurrence count so the log-likelihood covers the
            # same tokens that are counted in the denominator.
            log_prob_sum += num * math.log(prob_word)
            testset_word_num += num

    # perplexity = exp(-sum(log p(d)) / sum(N_d))
    return math.exp(-log_prob_sum / testset_word_num)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。