from gensim import corpora
from gensim import models


def get_corpus_dictionary():
    documents = ["Human machine interface for lab abc computer applications",
                 "A survey of user opinion of computer system response time",
                 "The EPS user interface management system",
                 "System and human system engineering testing of EPS",
                 "Relation of user perceived response time to error measurement",
                 "The generation of random binary unordered trees",
                 "The intersection graph of paths in trees",
                 "Graph minors IV Widths of trees and well quasi ordering",
                 "Graph minors A survey"]
    # Lowercase every document and drop stopwords.
    stoplist = set('for a of the and to in'.split())
    texts = [[word for word in document.lower().split() if word not in stoplist]
             for document in documents]
    # Count token frequencies across the whole corpus.
    from collections import defaultdict
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    # Keep only tokens that appear more than once.
    texts = [[token for token in text if frequency[token] > 1]
             for text in texts]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    print('Original texts:')
    for text in texts:
        print(text)
    return corpus, dictionary


corpus, dictionary = get_corpus_dictionary()
print('=================dictionary=============')
print('Mapping from token id to the number of documents containing it (dfs):', dictionary.dfs)
print('Mapping from token to id (token2id):', dictionary.token2id)
print('Mapping from id to token (id2token):', dictionary.id2token)
print('Number of documents processed (num_docs):', dictionary.num_docs)
print('Total token count, without deduplication (num_pos):', dictionary.num_pos)
print('Token count deduplicated within each document but not across documents, '
      'i.e. the number of non-zero entries in the BOW matrix (num_nnz):', dictionary.num_nnz)
print('=================dictionary=============')
print('Original bag-of-words representation:')
for c in corpus:
    print(c)
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
print('Transforming the whole corpus:')
for doc in corpus_tfidf:
    print(doc)
Run output:
Original texts:
['human', 'interface', 'computer']
['survey', 'user', 'computer', 'system', 'response', 'time']
['eps', 'user', 'interface', 'system']
['system', 'human', 'system', 'eps']
['user', 'response', 'time']
['trees']
['graph', 'trees']
['graph', 'minors', 'trees']
['graph', 'minors', 'survey']
=================dictionary=============
Mapping from token id to the number of documents containing it (dfs): {0: 2, 1: 2, 2: 2, 3: 2, 4: 3, 5: 2, 6: 2, 7: 3, 8: 2, 9: 3, 10: 3, 11: 2}
Mapping from token to id (token2id): {'minors': 11, 'computer': 0, 'user': 4, 'trees': 9, 'response': 6, 'time': 5, 'graph': 10, 'human': 1, 'interface': 2, 'survey': 3, 'eps': 8, 'system': 7}
Mapping from id to token (id2token): {}
Number of documents processed (num_docs): 9
Total token count, without deduplication (num_pos): 29
Token count deduplicated within each document but not across documents, i.e. the number of non-zero entries in the BOW matrix (num_nnz): 28
=================dictionary=============
Original bag-of-words representation:
[(0, 1), (1, 1), (2, 1)]
[(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]
[(2, 1), (4, 1), (7, 1), (8, 1)]
[(1, 1), (7, 2), (8, 1)]
[(4, 1), (5, 1), (6, 1)]
[(9, 1)]
[(9, 1), (10, 1)]
[(9, 1), (10, 1), (11, 1)]
[(3, 1), (10, 1), (11, 1)]
Transforming the whole corpus:
[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]
[(0, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.3244870206138555), (5, 0.44424552527467476), (6, 0.44424552527467476), (7, 0.3244870206138555)]
[(2, 0.5710059809418182), (4, 0.4170757362022777), (7, 0.4170757362022777), (8, 0.5710059809418182)]
[(1, 0.49182558987264147), (7, 0.7184811607083769), (8, 0.49182558987264147)]
[(4, 0.45889394536615247), (5, 0.6282580468670046), (6, 0.6282580468670046)]
[(9, 1.0)]
[(9, 0.7071067811865475), (10, 0.7071067811865475)]
[(9, 0.5080429008916749), (10, 0.5080429008916749), (11, 0.695546419520037)]
[(3, 0.6282580468670046), (10, 0.45889394536615247), (11, 0.6282580468670046)]
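As a sanity check, the weights of the fourth document (['system', 'human', 'system', 'eps']) can be reproduced by hand under the defaults (raw term frequency as the local weight, log base 2 idf, L2 normalization); the variable names below are just for the illustration:

import math

# Default global weight: idf = log2(num_docs / doc_freq), with num_docs = 9.
idf_human = math.log(9 / 2, 2)   # 'human' appears in 2 documents
idf_system = math.log(9 / 3, 2)  # 'system' appears in 3 documents
idf_eps = math.log(9 / 2, 2)     # 'eps' appears in 2 documents

# Local weight is the raw term frequency: 'system' occurs twice here.
raw = [1 * idf_human, 2 * idf_system, 1 * idf_eps]

# L2-normalize, as normalize=True does via matutils.unitvec().
norm = math.sqrt(sum(w * w for w in raw))
print([w / norm for w in raw])
# -> [0.4918..., 0.7184..., 0.4918...], matching the output above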
def __init__(self, corpus=None, id2word=None, dictionary=None,
             wlocal=utils.identity, wglobal=df2idf, normalize=True):
    # corpus: the corpus to train on
    # id2word: mapping from ids to tokens
    # dictionary: a gensim Dictionary
    # wlocal: local weighting function, applied to the term frequency when
    #   transforming a document:
    #     vector = [(termid, self.wlocal(tf) * self.idfs.get(termid))
    #               for termid, tf in bow if self.idfs.get(termid, 0.0) != 0.0]
    # wglobal: global weighting function, applied to the document frequencies
    #   when precomputing the idfs:
    #     dict((termid, wglobal(df, total_docs))
    #          for termid, df in iteritems(dfs))
    # normalize: normalization; either a boolean or a custom function

wlocal and wglobal let you plug in your own tf-idf weighting. The defaults are wlocal=utils.identity (the identity function, which leaves the raw term frequency unchanged) and wglobal=df2idf, which computes idf = add + log(totaldocs / doc_freq), as implemented below:
def df2idf(docfreq, totaldocs, log_base=2.0, add=0.0):
    """Compute the inverse document frequency from a document frequency."""
    return add + math.log(1.0 * totaldocs / docfreq, log_base)

# The core TF-IDF formula:
# weight_{i,j} = wlocal(frequency_{i,j}) * wglobal(document_freq_i, D)
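Both hooks can be swapped out. A minimal sketch with a sublinear local weight and a smoothed idf (these weighting choices are illustrative, not gensim defaults):

import math
from gensim import models

# Sublinear local weight: 1 + log2(tf) instead of the raw term frequency.
def sublinear_tf(tf):
    return 1.0 + math.log(tf, 2)

# Smoothed global weight: log2(1 + total_docs / doc_freq).
def smooth_idf(docfreq, totaldocs):
    return math.log(1.0 + 1.0 * totaldocs / docfreq, 2)

tfidf_custom = models.TfidfModel(corpus, wlocal=sublinear_tf, wglobal=smooth_idf)
print(tfidf_custom[corpus[3]])  # the fourth document, reweighted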
# TfidfModel can be built from a dictionary or from a corpus.
# If both are given, the dictionary takes precedence.
if dictionary is not None:
    if corpus is not None:
        pass  # the corpus is ignored in favor of the dictionary's statistics
    self.num_docs, self.num_nnz = dictionary.num_docs, dictionary.num_nnz
    self.dfs = dictionary.dfs.copy()
    self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)
    if id2word is None:
        self.id2word = dictionary
elif corpus is not None:
    self.initialize(corpus)
else:
    pass
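With the toy corpus from above, both construction paths should end up with the same idf table (a small check; tfidf_from_corpus and tfidf_from_dict are just illustrative names):

from gensim import models

# Built from the corpus: initialize() scans it to collect dfs and num_docs.
tfidf_from_corpus = models.TfidfModel(corpus)

# Built from the dictionary: its precomputed statistics are reused directly.
tfidf_from_dict = models.TfidfModel(dictionary=dictionary)

print(tfidf_from_corpus.idfs == tfidf_from_dict.idfs)  # True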
Dictionary attributes, for reference:
token2id: mapping from token to id;
id2token: mapping from id to token (gensim fills this lazily, which is why it printed as {} above);
dfs: mapping from token id to the number of documents containing it;
num_docs: number of documents processed;
num_pos: total token count, without deduplication (29 here, because 'system' occurs twice in document 4);
num_nnz: token count deduplicated within each document but not across documents, i.e. the number of non-zero entries in the BOW matrix (28 here).

Whichever you pass, if a dictionary has already been computed it is better to pass the dictionary: passing only a corpus forces the model to recompute the dictionary statistics, as can be seen in initialize(). Either way, precompute_idfs() is eventually called; it precomputes the idf values up front to speed up later transformations, returning a mapping from each token id to its idf value, which is what turns the integer BOW vectors into tf-idf vectors:
def precompute_idfs(wglobal, dfs, total_docs):
    # Iterate over each (token id, document frequency) pair and apply the
    # wglobal formula (df2idf by default) to compute that token's idf.
    return dict((termid, wglobal(df, total_docs)) for termid, df in iteritems(dfs))
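On the toy corpus, this mapping can be inspected directly on the trained model:

# idfs maps token id -> idf; e.g. 'system' (id 7, df 3): log2(9/3) ~= 1.585.
print(tfidf.idfs)
# -> {0: 2.1699..., 1: 2.1699..., ..., 7: 1.5849..., ...}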
As in the example above, the model is applied with:
corpus_tfidf = tfidf[corpus]
which is implemented as:
def __getitem__(self, bow, eps=1e-12):
    # Check whether the input is a whole corpus rather than a single document.
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(bow)
    # Weight each term of the document, dropping terms whose idf is 0.
    vector = [
        (termid, self.wlocal(tf) * self.idfs.get(termid))
        for termid, tf in bow if self.idfs.get(termid, 0.0) != 0.0
    ]
    # Normalize; normalize may be a boolean or a custom function.
    if self.normalize is True:
        vector = matutils.unitvec(vector)
    elif self.normalize:
        vector = self.normalize(vector)
    # Keep only weights above the eps threshold; this filtering is also
    # handy for keyword extraction.
    vector = [(termid, weight) for termid, weight in vector if abs(weight) > eps]
    return vector
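Since normalize accepts a function, normalization can be replaced or skipped. A sketch (top_terms is an illustrative helper, not part of gensim):

from gensim import models

# Skip L2 normalization by passing an identity function as normalize.
tfidf_raw = models.TfidfModel(corpus, normalize=lambda vec: vec)

# Illustrative helper: pick the highest-weighted terms of one document.
def top_terms(model, bow, n=2):
    return sorted(model[bow], key=lambda pair: -pair[1])[:n]

print(top_terms(tfidf_raw, corpus[3]))  # 'system' ranks first in document 4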
Aside: in Python, an expression like tfidf[corpus] works because indexing an instance automatically calls the __getitem__ method defined on its class; this is one of Python's special ("dunder") methods, a language-level feature. __len__, __setitem__, __delitem__ and friends behave analogously.
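A minimal illustration of the mechanism (Squares is just a toy class for the demo):

class Squares:
    def __getitem__(self, n):
        # s[n] is syntactic sugar for s.__getitem__(n).
        return n * n

s = Squares()
print(s[4])  # -> 16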
More on the TF-IDF algorithm:
http://www.cnblogs.com/biyeymyhjob/archive/2012/07/17/2595249.html
[Author: happyprince, http://blog.csdn.net/ld326/article/details/78441773]