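# 赞 / 踩  ("like" / "dislike" — apparently button labels left over from the web page this script was copied from)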
import numpy as np
import pandas as pd
import math
# Two toy documents that form the corpus for this TF-IDF walkthrough.
docA = "The cat sat on my bed"
docB = "This dog sat in the trees"

# Tokenize on single spaces to obtain each document's bag of words.
bowA = docA.split(" ")
bowB = docB.split(" ")

# Build the vocabulary: the de-duplicated union of the words of both documents.
wordSet = set(bowA).union(set(bowB))
wordSet  # notebook-style echo of the de-duplicated words

# Count dictionaries: one entry per vocabulary word, initialised to 0.
# These lines were previously swallowed by a single long comment, which left
# wordDictA / wordDictB undefined for the rest of the script.
wordDictA = dict.fromkeys(wordSet, 0)
wordDictB = dict.fromkeys(wordSet, 0)
# At this point every count is zero, e.g.
# {'The': 0, 'dog': 0, 'trees': 0, 'cat': 0, 'bed': 0, 'on': 0,
#  'my': 0, 'in': 0, 'the': 0, 'sat': 0, 'This': 0}

print('---------------------------------------------------\n')

# Walk each document and count how often every word occurs in it.
for word in bowA:
    wordDictA[word] += 1
for word in bowB:
    wordDictB[word] += 1

print('---------------------------------------------------\n')
# wordDictA is now:
# {'The': 1, 'dog': 0, 'trees': 0, 'cat': 1, 'bed': 1, 'on': 1,
#  'my': 1, 'in': 0, 'the': 0, 'sat': 1, 'This': 0}

# One row per document, one column per vocabulary word (notebook-style echo).
pd.DataFrame([wordDictA, wordDictB])
def computeTF(wordDict, bow):
    """Return the term frequency (TF) of every word in ``wordDict``.

    Args:
        wordDict: mapping of word -> number of occurrences in the document.
        bow: the document's bag of words (list of tokens); only its length
            is used, as the normalising total.

    Returns:
        A new dict mapping each word of ``wordDict`` to
        ``count / len(bow)``. If ``bow`` is empty, every frequency is 0.0.
    """
    nbowCount = len(bow)
    if nbowCount == 0:
        # An empty document contains no terms; avoid ZeroDivisionError.
        return {word: 0.0 for word in wordDict}
    return {word: count / nbowCount for word, count in wordDict.items()}
# Term-frequency table of each document, computed from its word counts.
tfA, tfB = (
    computeTF(wordDictA, bowA),
    computeTF(wordDictB, bowB),
)
# Original author's note (translated): anything placed before this statement
# did not seem to be executed/shown, which they found puzzling.
# NOTE(review): presumably a notebook cell only echoes its final bare
# expression — confirm.
print('---------------------------------------------------\n')
tfA  # notebook-style echo of document A's term frequencies
def computeIDF(wordDictList):
    """Return the inverse document frequency (IDF) of every word.

    Args:
        wordDictList: list with one word -> count dict per document.

    Returns:
        A dict mapping each word that appears as a key in any of the
        document dicts to ``log10((N + 1) / (Ni + 1))``, where ``N`` is the
        number of documents and ``Ni`` the number of documents in which the
        word's count is greater than 0. An empty list yields ``{}``.
    """
    N = len(wordDictList)

    # First pass: Ni = number of documents that actually contain each word.
    # Keys are collected from every document (not only the first one) so a
    # word missing from the first dict cannot raise KeyError.
    idfDict = {}
    for wordDict in wordDictList:
        for word, count in wordDict.items():
            idfDict.setdefault(word, 0)
            if count > 0:
                idfDict[word] += 1

    # Second pass: turn each document count Ni into the IDF value.
    return {
        word: math.log10((N + 1) / (Ni + 1))
        for word, Ni in idfDict.items()
    }
# IDF of every vocabulary word, computed over the two-document corpus.
idfs = computeIDF(
    [wordDictA, wordDictB]
)
idfs  # notebook-style echo of the IDF table
def computeTF_IDF(tf, idfs):
    """Return the TF-IDF weight of every word of one document.

    Args:
        tf: mapping of word -> term frequency for the document.
        idfs: mapping of word -> inverse document frequency for the corpus.

    Returns:
        A new dict mapping each word of ``tf`` to ``tf[word] * idfs[word]``.

    Raises:
        KeyError: if a word of ``tf`` has no entry in ``idfs``.
    """
    return {word: tfval * idfs[word] for word, tfval in tf.items()}
# TF-IDF weights of each document: its term frequencies scaled by the IDFs.
tfidfA = computeTF_IDF(tfA, idfs)
tfidfB = computeTF_IDF(tfB, idfs)
# Alias kept for the original, misspelled name so existing references still work.
tfdifA = tfidfA
# One row per document, one column per vocabulary word (notebook-style echo).
pd.DataFrame([tfidfA, tfidfB])
# Copyright © 2003-2013 www.wpsshop.cn. All rights reserved.