Without further ado, here is the code: an LDA topic model over Chinese review text, using jieba for tokenization, scikit-learn's CountVectorizer and LatentDirichletAllocation for modeling, and pyLDAvis for visualization.
'''LDA model implementation and visualization'''
import numpy as np
import jieba
import jieba.posseg as pseg
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis.sklearn  # renamed to pyLDAvis.lda_model in pyLDAvis >= 3.4
import time

# Load a txt file, one entry per line
def readtxt(filepath, encoding='utf-8'):
    with open(filepath, mode='r', encoding=encoding) as f:
        words = [line.strip() for line in f.readlines()]
    return words

# Tokenize a document: keep nouns, drop stopwords and single characters
def cut_word(text):
    # Load a user-defined dictionary if needed
    # jieba.load_userdict('user_dict.txt')
    # Load the stopword list
    stopwords = readtxt(r'...\stopwords_cn.txt', encoding='utf-8')
    sentence = ""
    checkarr = ['n']  # keep only nouns
    for word, flag in pseg.lcut(text):
        if (flag in checkarr) and (word not in stopwords) and (len(word) > 1):
            sentence = sentence + word + " "
    return sentence

# Text vectorization
def word_vectorizer(n_features, max_df=0.5, min_df=3):
    cv = CountVectorizer(strip_accents='unicode',  # strip accents via unicode normalization in preprocessing
                         max_features=n_features,
                         max_df=max_df,   # drop words whose document frequency exceeds max_df
                         min_df=min_df)   # drop words whose document frequency is below min_df
    return cv

def lda_model(k, max_iter=50, method='online', learning_offset=50., random_state=0):
    lda = LatentDirichletAllocation(n_components=k, max_iter=max_iter,
                                    learning_method=method,
                                    learning_offset=learning_offset,
                                    random_state=random_state)
    return lda

def print_keywords(lda, cv, threshold, p):
    weight_matrix = lda.components_
    tf_feature_names = cv.get_feature_names()  # use get_feature_names_out() on scikit-learn >= 1.0
    for topic_id, weights in enumerate(weight_matrix):
        dicts = [(name, weight) for name, weight in zip(tf_feature_names, weights)]
        dicts = sorted(dicts, key=lambda x: x[1], reverse=True)  # sort feature words by weight, descending
        dicts = [word for word in dicts if word[1] > threshold]  # keep only words whose weight exceeds the threshold
        dicts = dicts[:p]  # keep the top p topic words per topic
        print('Topic %d:' % topic_id, dicts)

if __name__ == '__main__':
    # Load the corpus
    text = readtxt(r'...\data\reviews.txt')
    # Tokenize
    segged_words = [cut_word(x) for x in text]
    print(segged_words[0])
    # Vectorize
    n_features = 1000  # cap on the number of extracted feature words
    cv = word_vectorizer(n_features)
    tf = cv.fit_transform(segged_words)  # turn the tokenized reviews into a document-term count matrix
    # Build the LDA model
    time_start = time.time()
    lda = lda_model(4)
    ldamodel = lda.fit_transform(tf)  # document-topic distribution matrix
    time_end = time.time()
    print('time cost', time_end - time_start, 's')

    '''Some inspection operations on the fitted vocabulary'''
    # # Inspect the vocabulary
    # print(cv.vocabulary_)
    # # Vocabulary size
    # print(len(cv.vocabulary_))
    # print(cv.get_feature_names())
    # # Number of extracted feature words
    # print(len(cv.get_feature_names()))
    # # Per-document term frequencies (sparse)
    # print(tf)
    # # Dense representation of all documents
    # print(tf.toarray())
    # # Cumulative frequency of each word over all documents
    # print(tf.toarray().sum(axis=0))
    # # Extract high-frequency words from the cumulative counts
    # # (1) collect the column indices of high-frequency words
    # fre = tf.toarray().sum(axis=0)
    # index_lst = []
    # for i in range(len(fre)):
    #     if fre[i] > 10:
    #         index_lst.append(i)
    # # (2) sort the vocabulary by column index, ascending
    # voca = list(cv.vocabulary_.items())
    # sorted_voca = sorted(voca, key=lambda x: x[1], reverse=False)
    # # (3) extract the high-frequency words
    # high_fre_voca = []
    # for i in sorted_voca:
    #     if i[1] in index_lst:
    #         high_fre_voca.append(i[0])
    # print(high_fre_voca)

    '''Inspect the topic probabilities of the existing corpus under the fitted LDA model'''
    # # Probability of each document belonging to each topic
    # proba = np.array(ldamodel)
    # print('Probability of each document belonging to each topic:\n', proba)
    # # Pick the most likely topic per document: argmax along axis=1 (rows) returns the index of the largest probability
    # max_proba = np.argmax(proba, axis=1)
    # print('Topic assigned to each document:', max_proba)
    # # Weight of each feature word under each topic
    # # (note: components_ lives on the estimator lda, not on the output of fit_transform)
    # weight_matrix = lda.components_
    # print(weight_matrix)
    # print(len(weight_matrix))

    # Print the top 5 keywords per topic (each with weight above 0.6)
    print_keywords(lda, cv, 0.6, 5)

    # Visualization
    d = pyLDAvis.sklearn.prepare(lda, tf, cv)
    pyLDAvis.show(d)
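The script above hardcodes the number of topics to 4. If you want to choose k more systematically, here is a minimal sketch, reusing the tf matrix and the lda_model helper from the script; it relies on scikit-learn's LatentDirichletAllocation.perplexity() method, and the candidate range 2..8 is an arbitrary choice for illustration:

# A minimal sketch, assuming tf and lda_model are defined as above.
# Lower perplexity on the training matrix loosely suggests a better k.
best_k, best_perp = None, float('inf')
for k in range(2, 9):
    candidate = lda_model(k)
    candidate.fit(tf)
    perp = candidate.perplexity(tf)
    print('k=%d  perplexity=%.2f' % (k, perp))
    if perp < best_perp:
        best_k, best_perp = k, perp
print('best k by perplexity:', best_k)

As a usage note, if you prefer a standalone page over the local server that pyLDAvis.show() starts, pyLDAvis.save_html(d, 'lda_vis.html') writes the same interactive view to a file.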