赞
踩
本文介绍了如何使用python中的sklearn机器学习库实现自然语言处理中的LDA主题建模。
本文所使用的数据来源于web of sci 上的论文摘要数据。
import pandas as pd
import numpy as np
import mglearn
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
#from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
abstract = pd.read_csv('final_data.csv')
#train_text = abstract.loc[0:999,]
#train_text = train_text[(['TI', 'JI', 'PY', 'C1', 'AB', 'AU'])]
input_data = abstract['AB']
input_data = list(input_data)
input_data[0]
[output]:
Out[48]:
'This Work Proposes A Real Time Estimator For Needle Tip Deflection And Needle Shape During Needle Insertion Into Soft Tissue The Estimator Is Based On An Adaptive Quasi Static Mechanics Based Model For Needle Tissue Interactions The Model Uses Euler Bernoulli Beam Theory To Model The Needle As A Cantilever Beam That Experiences Loads Imposed By The Tissue The Modeled Needle Tissue Interactions Consist Of A Distributed Load Along The Inserted Needle Portion And Tissue Cutting Related Point Load At The Needle Tip We Propose A Closed Form Solution To Quantify The Magnitude Of These Needle Tissue Interaction Loads Based On Force And Torque Measured At The Needle Base The Model Adaptively Adjusts The Shape Of The Distributed Load As The Needle Is Inserted Experiments Are Carried Out Into Gelatin Phantom And Porcine Tissue To Validate The Deflection Estimate s Performance The Newly Proposed Model s Performance Is Compared Against A Previously Proposed Quasi Static Model For Needle Deflection Estimation It Is Shown That The Novel Model Outperforms The Previously Proposed Model '

定义数据预处理函数,将原始数据全部变为小写,分词,去除停用词
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
def textPreprocessing(text):
#小写化
text = [s.lower() for s in text]
#去除特殊标点
'''
for c in string.punctuation:
text = text.replace(c, ' ')
'''
#分词
wordLst = [nltk.word_tokenize(txt) for txt in text]
#去除停用词
filtered = [w for w in wordLst if w not in stopwords.words('english')]
return filtered
input_text = textPreprocessing(input_data)
input_text[0]
[output]:
['this',
'work',
'proposes',
'a',
'real',
'time',
'estimator',
'for',
'needle',
'tip',
'deflection',
'and',
'needle',
'shape',
'during',
'needle',
'insertion',
'into',
'soft',
'tissue',
'the',
'estimator',
'is',
'based',
'on',
'an',
'adaptive',
'quasi',
'static',
'mechanics',
'based',
'model',
'for',
'needle',
'tissue',
'interactions',
'the',
'model',
'uses',
'euler',
'bernoulli',
'beam',
'theory',
'to',
'model',
'the',
'needle',
'as',
'a',
'cantilever',
'beam',
'that',
'experiences',
'loads',
'imposed',
'by',
'the',
'tissue',
'the',
'modeled',
'needle',
'tissue',
'interactions',
'consist',
'of',
'a',
'distributed',
'load',
'along',
'the',
'inserted',
'needle',
'portion',
'and',
'tissue',
'cutting',
'related',
'point',
'load',
'at',
'the',
'needle',
'tip',
'we',
'propose',
'a',
'closed',
'form',
'solution',
'to',
'quantify',
'the',
'magnitude',
'of',
'these',
'needle',
'tissue',
'interaction',
'loads',
'based',
'on',
'force',
'and',
'torque',
'measured',
'at',
'the',
'needle',
'base',
'the',
'model',
'adaptively',
'adjusts',
'the',
'shape',
'of',
'the',
'distributed',
'load',
'as',
'the',
'needle',
'is',
'inserted',
'experiments',
'are',
'carried',
'out',
'into',
'gelatin',
'phantom',
'and',
'porcine',
'tissue',
'to',
'validate',
'the',
'deflection',
'estimate',
's',
'performance',
'the',
'newly',
'proposed',
'model',
's',
'performance',
'is',
'compared',
'against',
'a',
'previously',
'proposed',
'quasi',
'static',
'model',
'for',
'needle',
'deflection',
'estimation',
'it',
'is',
'shown',
'that',
'the',
'novel',
'model',
'outperforms',
'the',
'previously',
'proposed',
'model']

将文本数据向量化,作为LDA模型的输入,本文用TFIDF作为向量化的指标。
vect = TfidfVectorizer(max_features=10000, min_df=10, max_df=0.95,
stop_words='english')
X = vect.fit_transform(input_text)
TfidfVectorizer.fit_transform 返回的是Tf-idf-weighted document-term matrix,可通过如下方式访问。
feature_names=np.array(vect.get_feature_names())
sorted_by_tfidf = np.argsort(X.max(axis=0).toarray().ravel())
#输出TFIDF排序最小的20与最大的20个term
print(feature_names[sorted_by_tfidf[20:]])
print(feature_names[sorted_by_tfidf[-20:]])
同时可以输出按照逆文档矩阵idf输出在所有文档中出现频率都很高的term。
sorted_by_idf = np.argsort(vect.idf_)
print(feature_names[sorted_by_idf[:20]])
设置主题数量,学习方式,超参数α,β取默认值。
lda = LatentDirichletAllocation(n_topics=10, learning_method='batch',
max_iter=25, random_state=0)
lda_topics = lda.fit_transform(X)
lda.fit_transform 返回的是document-topic matrix,lda.components_返回的是topic-term matrix
sorting = np.argsort(lda.components_, axis=1)[:, ::-1]
feature_names = np.array(vect.get_feature_names())
mglearn.tools.print_topics(topics=range(10), feature_names=feature_names,
sorting=sorting, topics_per_chunk=5, n_words=20)
输出每个topic的top words
def print_top_words(model, feature_names, n_top_words):
#打印每个主题下权重较高的term
for topic_idx, topic in enumerate(model.components_):
print "Topic #%d:" % topic_idx
print " ".join([feature_names[i]
for i in topic.argsort()[:-n_top_words - 1:-1]])
#打印主题-词语分布矩阵
print ("#主题-词语分布矩阵: \n" model.components_)
n_top_words=20
feature_names = vect.get_feature_names()
print_top_words(lda, feature_names, n_top_words)
输出Doc-Topic矩阵
lda_topics
收敛效果(perplexity)
lda.perplexity(X)
可以调整的参数
两种可行的调参方案
一、以n_topics为例,按照perplexity的大小选择最佳模型。当然,topic数目的不同势必会导致perplexity计算的不同,因此perplexity仅能作为参考,topic数目还需要根据实际需求主观指定。n_topics调参代码如下:
n_topics = range(20, 75, 5)
perplexityLst = [1.0]*len(n_topics)
#训练LDA并打印训练时间
lda_models = []
for idx, n_topic in enumerate(n_topics):
lda = LatentDirichletAllocation(n_topics=n_topic,
max_iter=20,
learning_method='batch',
evaluate_every=200,
# perp_tol=0.1, #default
# doc_topic_prior=1/n_topic, #default
# topic_word_prior=1/n_topic, #default
verbose=0)
t0 = time()
lda.fit(tf)
perplexityLst[idx] = lda.perplexity(tf)
lda_models.append(lda)
print "# of Topic: %d, " % n_topics[idx],
print "done in %0.3fs, N_iter %d, " % ((time() - t0), lda.n_iter_),
print "Perplexity Score %0.3f" % perplexityLst[idx]
#打印最佳模型
best_index = perplexityLst.index(min(perplexityLst))
best_n_topic = n_topics[best_index]
best_model = lda_models[best_index]
print "Best # of Topic: ", best_n_topic
#绘制不同主题数perplexity的不同
fig = plt.figure()
ax = fig.add_subplot(1,1,1)
ax.plot(n_topics, perplexityLst)
ax.set_xlabel("# of topics")
ax.set_ylabel("Approximate Perplexity")
plt.grid(True)
plt.savefig(os.path.join('lda_result', 'perplexityTrend'+CODE+'.png'))
plt.show()
Output:
Best # of Topic: 25

二、如果想一次性调整所有参数也可以直接利用sklearn作cv,但是这样做的结果一定是,耗时十分长。以下代码仅供参考,可以根据自身的需求进行增减。
from sklearn.model_selection import GridSearchCV
parameters = {'learning_method':('batch', 'online'),
'n_topics':range(20, 75, 5),
'perp_tol': (0.001, 0.01, 0.1),
'doc_topic_prior':(0.001, 0.01, 0.05, 0.1, 0.2),
'topic_word_prior':(0.001, 0.01, 0.05, 0.1, 0.2)
'max_iter':1000}
lda = LatentDirichletAllocation()
model = GridSearch(lda, parameters)
model.fit(tf)
sorted(model.cv_results_.keys())
注:调参部分内容参考如下文章:https://blog.csdn.net/TiffanyRabbit/article/details/76445909
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。