赞
踩
比如专业术语的TF-IDF较高,“的”、“这”、“表明”一类的词语TF-IDF较低。
import pandas as pd
import numpy as np
import math
# Two sample documents; words are separated by whitespace.
wordA = "xxx"
wordB = "yyy "
# Tokenize each document and build its set of distinct words.
# split() without an argument splits on any whitespace run and drops the
# empty token that split(" ") produces for a trailing space (as in wordB).
sepA = wordA.split()
sepB = wordB.split()
setA = set(sepA)
setB = set(sepB)
# Shared vocabulary: every word that occurs in either document.
union = setA.union(setB)
# Both count dicts are keyed by the full vocabulary so they line up as
# DataFrame columns; sorting makes the column order reproducible, since
# set iteration order can change from run to run.
dictA = dict.fromkeys(sorted(union), 0)
dictB = dict.fromkeys(sorted(union), 0)
# Count occurrences by walking each document's tokens (not the dict keys,
# which would set every count to 1).
for word in sepA:
    dictA[word] += 1
for word in sepB:
    dictB[word] += 1
pd.DataFrame([dictA, dictB])
def computeTF(dict, set):
    """Compute the term frequency (TF) of every word in one document.

    Args:
        dict: Mapping of word -> occurrence count for the document.
        set: Collection of the document's words. Only its length is used,
            as the denominator of every frequency.

    Returns:
        A new mapping word -> count / len(set). When ``set`` is empty,
        every word maps to 0.0 instead of raising ZeroDivisionError.
    """
    # NOTE: the parameter names shadow the ``dict``/``set`` builtins; they are
    # kept as-is so callers passing them by keyword keep working.
    setcount = len(set)
    if setcount == 0:
        # An empty document contains no occurrences of any word.
        return {word: 0.0 for word in dict}
    return {word: count / setcount for word, count in dict.items()}
# Term frequency is normalised by each document's total token count, so pass
# the token lists rather than the de-duplicated sets.
tfA = computeTF(dictA, sepA)
tfB = computeTF(dictB, sepB)
# Compute the inverse document frequency (IDF).
def computeIDF(dictlist):
    """Compute a smoothed inverse document frequency for every word.

    Args:
        dictlist: Sequence of per-document mappings word -> occurrence count.

    Returns:
        A dict mapping each word found in any document to
        log10((N + 1) / (ni + 1)), where N is the number of documents and
        ni is the number of documents in which the word's count is > 0.
        An empty ``dictlist`` yields an empty dict.
    """
    N = len(dictlist)
    # Collect the vocabulary from every document, not only the first one, so
    # a word that is absent from dictlist[0] no longer raises KeyError.
    idfDict = {}
    for Dict in dictlist:
        for word, count in Dict.items():
            idfDict.setdefault(word, 0)
            if count > 0:
                idfDict[word] += 1
    # Turn each document count ni into its IDF; the +1 terms keep the ratio
    # finite for words that occur in no document.
    return {word: math.log10((N + 1) / (ni + 1)) for word, ni in idfDict.items()}
# IDF of every word, computed over the two per-document count dicts.
idfs = computeIDF(dictlist=[dictA, dictB])
# Compute TF-IDF.
def computeTFIDF(tf, idfs):
    """Weight each term frequency by the word's inverse document frequency.

    Args:
        tf: Mapping of word -> term frequency for one document.
        idfs: Mapping of word -> IDF; must contain every word in ``tf``.

    Returns:
        A new dict mapping each word in ``tf`` to tf[word] * idfs[word].

    Raises:
        KeyError: If a word in ``tf`` has no entry in ``idfs``.
    """
    return {term: freq * idfs[term] for term, freq in tf.items()}
# TF-IDF scores of both documents, then shown side by side as a table
# (one row per document).
tfidfA, tfidfB = (computeTFIDF(doc_tf, idfs) for doc_tf in (tfA, tfB))
pd.DataFrame([tfidfA, tfidfB])
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。