当前位置:   article > 正文

python中文相似度_python文本相似度计算

similarities.MatrixSimilarity

话不多说,直接上源码

import jieba

from gensim import corpora, models, similarities

import codecs

def cut_words(file, encoding=None):
    """Read a text file and segment its whole content into words with jieba.

    Args:
        file: Path of the text file to read.
        encoding: Text encoding used to decode the file. ``None`` (the
            default) falls back to the platform default encoding, which is
            what this function always did before the parameter existed.

    Returns:
        The list of tokens returned by ``jieba.lcut`` for the file content.
    """
    with open(file, 'r', encoding=encoding) as f:
        text = f.read()
    return jieba.lcut(text)

def drop_Disable_Words(cut_res, stopwords):
    """Remove stop words and layout-only tokens from a segmented word list.

    Args:
        cut_res: Iterable of word tokens.
        stopwords: Collection of words to discard.

    Returns:
        A new list holding the surviving tokens in their original order.
        Newline tokens and the full-width space (U+3000) are always dropped.
    """
    # Build the lookup set once: testing against a list costs a full scan
    # per token, which adds up on long documents with large stop-word lists.
    blocked = set(stopwords) | {"\n", "\u3000"}
    return [word for word in cut_res if word not in blocked]

def read_stop_word(file_path, encoding='GBK'):
    """Load a stop-word list from a text file with one entry per line.

    Args:
        file_path: Path of the stop-word file.
        encoding: Encoding of the file; defaults to ``'GBK'``, the value
            that used to be hard-coded.

    Returns:
        A list with one string per line, stripped of surrounding
        whitespace. Blank lines yield empty strings.
    """
    # The context manager closes the handle; it was previously left open.
    with codecs.open(file_path, 'r', encoding=encoding) as f:
        return [line.strip() for line in f.readlines()]

# Load the raw corpus files and the stop-word list.
files = ['F:/fenxi/测试之美食.txt',
         'F:/fenxi/测试之名人自述.txt',
         'F:/fenxi/吃货大师的欲望清单.txt',
         'F:/fenxi/名人自述1.txt',
         'F:/fenxi/名人自述2.txt',
         'F:/fenxi/世界上什么下酒菜最美.txt',
         'F:/fenxi/鱼藏剑和酿菜.txt',
         'F:/fenxi/臭味食物.txt'
         ]
stopwords = read_stop_word("F:/fenxi/stop_word.txt")

# Segment every document, then strip stop words from the token list.
corpus = []
for file in files:
    # Word segmentation.
    cut_res = cut_words(file)
    # Stop-word removal.
    res = drop_Disable_Words(cut_res, stopwords)
    corpus.append(res)

# Build the bag-of-words representation: one (token_id, count) list per document.
dictionary = corpora.Dictionary(corpus)
doc_vectors = [dictionary.doc2bow(text) for text in corpus]

# Fit the TF-IDF model on the bag-of-words corpus and re-weight the corpus with it.
tfidf = models.TfidfModel(doc_vectors)
tfidf_vectors = tfidf[doc_vectors]

# Quick inspection: number of documents, then size and content of the first vector.
print(len(tfidf_vectors))
print(len(tfidf_vectors[0]))
print(tfidf_vectors[0])

# Similarity query in TF-IDF space
def TF_IDF(tfidf_vectors, doc_vectors):
    """Print the similarity of the first document to every document (TF-IDF).

    Args:
        tfidf_vectors: TF-IDF weighted corpus; must support indexing.
        doc_vectors: Bag-of-words corpus. Retained so existing callers keep
            working; it is no longer used as the query (see below).

    Returns:
        None. The ``(document_index, similarity)`` pairs are printed.
    """
    index = similarities.MatrixSimilarity(tfidf_vectors)
    # The query has to live in the same vector space as the index. The
    # previous code queried with the raw bag-of-words counts of document 0,
    # which compares count weights against TF-IDF weights.
    sims = index[tfidf_vectors[0]]
    print(list(enumerate(sims)))

# Similarity query in LSI space
def LSI(tfidf_vectors, dictionary, doc_vectors, theme_num):
    """Print the similarity of the first document to every document (LSI).

    Args:
        tfidf_vectors: TF-IDF weighted corpus used to train the LSI model;
            must support indexing.
        dictionary: Token-id mapping handed to the model as ``id2word``.
        doc_vectors: Bag-of-words corpus. Retained so existing callers keep
            working; it is no longer used as the query (see below).
        theme_num: Number of latent topics requested from the model.

    Returns:
        None. The ``(document_index, similarity)`` pairs are printed.
    """
    lsi = models.LsiModel(tfidf_vectors, id2word=dictionary, num_topics=theme_num)
    lsi_vector = lsi[tfidf_vectors]
    # The model is trained on TF-IDF input, so the query must be TF-IDF
    # weighted as well before projection. The previous code projected the
    # raw bag-of-words counts of document 0.
    query_lsi = lsi[tfidf_vectors[0]]
    index = similarities.MatrixSimilarity(lsi_vector)
    sims = index[query_lsi]
    print(list(enumerate(sims)))

# Similarity of the first document to the corpus via a 2-topic LSI model.
LSI(
    tfidf_vectors=tfidf_vectors,
    dictionary=dictionary,
    doc_vectors=doc_vectors,
    theme_num=2,
)

# Similarity of the first document to the corpus via TF-IDF.
TF_IDF(tfidf_vectors=tfidf_vectors, doc_vectors=doc_vectors)

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/运维做开发/article/detail/878001
推荐阅读
相关标签
  

闽ICP备14008679号