赞
踩
TF-IDF(term frequency–inverse document frequency,词频-逆文档频率)是一种用于信息检索(information retrieval)与文本挖掘(text mining)的常用加权技术。
TF-IDF是一种统计方法,用以评估一字词对于一个文件集或一个语料库中的其中一份文件的重要程度。字词的重要性随着它在文件中出现的次数成正比增加,但同时会随着它在语料库中出现的频率成反比下降。
TF-IDF的主要思想是:如果某个单词在一篇文章中出现的频率高,并且在其他文章中很少出现,则认为此词或者短语具有很好的类别区分能力,适合用来分类。
参考:https://www.capitalone.com/tech/machine-learning/understanding-tf-idf/
import jieba

# Toy corpus: three short Chinese sentences used for the TF-IDF demos below.
corpus = ['PHP是最好的语言,我用PHP','Java才是最好的语言,我用Java','人生苦短我用Python,Python']

# Tokenize each document with jieba; lcut() returns a list directly
# (equivalent to list(jieba.cut(doc)), and consistent with the pandas
# version further down which also uses jieba.lcut).
words_list = [jieba.lcut(doc) for doc in corpus]
words_list

# Vocabulary: the set of distinct tokens across all documents.
# A set comprehension deduplicates in one pass (no temporary list).
vocab = {word for words in words_list for word in words}
vocab
import math
from collections import Counter

# Global term statistics over the flattened corpus.
total_words = sum(len(doc) for doc in words_list)   # total token count
words = [w for doc in words_list for w in doc]      # flatten 2-D -> 1-D
count = Counter(words)                              # term -> occurrence count

# NOTE: the original had a dead `tf = {}` here that was immediately
# shadowed by the float below; removed.
for w in vocab:
    # tf: occurrences of w / total tokens in the whole corpus
    # (corpus-level tf, matching the pandas version below).
    tf = count[w] / total_words
    # idf: log(N / (1 + df)), where df = number of documents containing w;
    # the +1 smoothing avoids division by zero for unseen terms.
    idf = math.log(len(corpus) / (1 + sum(w in doc for doc in words_list)))
    print(f'{w}: tf={tf}, idf={idf}, tf-idf={tf*idf}')
import math


def count_words(word, document):
    """Return how many times *word* occurs in *document* (a list of tokens)."""
    return document.count(word)


def count_documents(word, documents):
    """Return the number of documents in *documents* that contain *word*."""
    return sum(1 for document in documents if word in document)


def calculate_tf_idf(word, document, documents):
    """Return the TF-IDF weight of *word* in *document* w.r.t. *documents*.

    tf  = occurrences of word in document / document length
    idf = log(N / (1 + df)), df = documents containing word (+1 smoothing).
    """
    tf = count_words(word, document) / len(document)
    idf = math.log(len(documents) / (1 + count_documents(word, documents)))
    return tf * idf


def calculate_tf_idf_vector(document, documents):
    """Return {word: tf-idf} for every distinct word in *document*."""
    tf_idf_vector = {}
    for word in set(document):
        tf_idf_vector[word] = calculate_tf_idf(word, document, documents)
    return tf_idf_vector


# Example usage, guarded so importing this module stays side-effect-free.
# '''documents = [
#     ["apple", "banana", "orange"],
#     ["orange", "peach", "grape"],
#     ["banana", "pear", "peach"],
# ]'''
if __name__ == "__main__":
    documents = words_list  # tokenized corpus from the cell above
    tf_idf_vectors = []
    for document in documents:
        tf_idf_vector = calculate_tf_idf_vector(document, documents)
        tf_idf_vectors.append(tf_idf_vector)
    print(documents)
    for result in tf_idf_vectors:
        print(result)
import numpy as np
import pandas as pd

# One row per document, then one row per (document, token) pair.
df = pd.DataFrame(corpus, columns=['doc'])
df['words'] = df['doc'].apply(jieba.lcut)
df1 = df.explode('words')
df1

# Per-token statistics: 'count' = total occurrences across the corpus,
# 'nunique' = number of distinct documents containing the token (df).
df2 = df1.groupby('words')['doc'].agg(['count', 'nunique']).reset_index()
print()

# Corpus-level tf, smoothed idf (log(N / (1 + df))), and their product.
df2['tf'] = df2['count'] / len(df1)
df2['idf'] = np.log(len(df) / (1 + df2['nunique']))
df2['tfidf'] = df2['tf'] * df2['idf']
df2

# Attach the per-token scores back onto the unique (doc, token) rows.
df1.drop_duplicates().merge(df2, on=['words'])
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。