赞
踩
**
**
TF-IDF(词频-逆文档频率)是信息检索中衡量一个词语重要程度的统计指标。公式为:$\mathrm{TFIDF}(t, d) = \mathrm{TF}(t, d) \times \mathrm{IDF}(t)$
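TF是词频,代表一个词在文中出现的次数(通常再除以文章的总词数做归一化),公式为:$\mathrm{TF}(t, d) = \dfrac{n_{t,d}}{\sum_{k} n_{k,d}}$,其中 $n_{t,d}$ 是词 $t$ 在文章 $d$ 中出现的次数,分母是文章 $d$ 的总词数。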
DF是文档频率,代表有多少篇文章包含该词,DF的倒数(inverse)称为IDF,公式为:$\mathrm{IDF}(t) = \log\dfrac{N}{\mathrm{DF}(t) + 1}$,其中 $N$ 是语料库中的文档总数,分母加1是为了避免分母为0。
log函数的好处:当x的值非常大时,log(x)的值也不大。由于log函数是单调函数,优化目标保持一致,因此不影响结果的计算。
效果图:
代码(使用pyhanlp中的词典):
# -*- coding:utf-8 -*-
"""Extract keywords from short Chinese texts with TF-IDF.

Pipeline:
1. Start from a corpus made of the ``*.txt`` files in the current directory.
2. Segment the input text and store the text in the corpus as a new file.
3. Compute TF and IDF for every segmented word.
4. Sort by TF-IDF and output the keywords; a number controls how many.
"""
import glob
import math
import time
from collections import Counter


class InitCorpusAndDictionary:
    """Access the corpus files and the HanLP core dictionary.

    Documents in the corpus are assumed to be ``*.txt`` files.
    """

    # Dictionary loaded by the first instance and reused by later ones, so
    # the word list is not reloaded on every instantiation.
    _shared_dic = None

    def __init__(self):
        # The dictionary is loaded automatically on instantiation.
        if InitCorpusAndDictionary._shared_dic is None:
            InitCorpusAndDictionary._shared_dic = self.dictionary()
        self.dic = InitCorpusAndDictionary._shared_dic

    def getFlieList(self):
        """Return the list of ``*.txt`` files in the current directory."""
        return glob.glob("*.txt")

    def dictionary(self):
        """Load the HanLP core dictionary and return its words as a set.

        Raises:
            ImportError: if ``pyhanlp`` is not installed.
        """
        # Imported here rather than at module level so that the rest of the
        # module can be imported and used without pyhanlp being available.
        from pyhanlp import HanLP, JClass

        IOUtil = JClass('com.hankcs.hanlp.corpus.io.IOUtil')
        path = HanLP.Config.CoreDictionaryPath  # path of the core dictionary
        dic = IOUtil.loadDictionary([path])  # a map keyed by word
        return set(dic.keySet())

    def AddCorpus(self, text):
        """Store *text* in the corpus as a new timestamp-named ``.txt`` file.

        A write failure is reported on stdout and otherwise ignored.
        """
        stamp = int(time.time())
        attempt = 0
        try:
            while True:
                # Mode "x" refuses to overwrite, so two documents added in
                # the same second get distinct files instead of clobbering
                # each other.
                if attempt == 0:
                    filename = f"{stamp}.txt"
                else:
                    filename = f"{stamp}_{attempt}.txt"
                try:
                    with open(filename, "x", encoding="utf-8") as f:
                        f.write(text)
                    return
                except FileExistsError:
                    attempt += 1
        except (OSError, UnicodeError) as e:
            print("文件写入失败", e)


class ParseWord:
    """Split a sentence into words using the dictionary.

    Instantiating this class also adds the text to the corpus.
    """

    def __init__(self, text):
        self.a = InitCorpusAndDictionary()
        self.dic = self.a.dic
        self.text = text
        self.a.AddCorpus(self.text)

    def backward_segment(self):
        """Segment ``self.text`` by backward longest matching.

        Scanning from the end of the text, the longest dictionary word that
        ends at the current position is taken; when none matches, the single
        character at that position becomes a word.

        Returns:
            The words in their original left-to-right order.
        """
        text = self.text
        words = []
        end = len(text)  # exclusive end of the part not yet segmented
        while end > 0:
            word = text[end - 1:end]
            # Starts are tried left to right, so the first hit is the
            # longest candidate ending at ``end``.
            for start in range(end - 1):
                candidate = text[start:end]
                if candidate in self.dic:
                    word = candidate
                    break
            words.append(word)
            end -= len(word)
        words.reverse()
        return words


class CountTFIDF:
    """Compute TF, IDF and TF-IDF for the words of one text.

    tf    = occurrences of the word in the text / total words in the text
    idf   = log(number of corpus documents / (documents containing the word + 1));
            the +1 avoids a zero denominator
    tfidf = tf * idf
    """

    def __init__(self, text):
        self.text = text
        parser = ParseWord(self.text)
        self.wordlist = parser.backward_segment()
        # Reuse the parser's corpus helper instead of building another one.
        self.flielist = parser.a.getFlieList()
        self.tf()
        self.idf()

    def tf(self):
        """Compute the term frequency of every word into ``self.worddict``."""
        total = len(self.wordlist)
        counts = Counter(self.wordlist)
        self.worddict = {word: count / total for word, count in counts.items()}

    def idf(self):
        """Compute the inverse document frequency into ``self.idf_dict``.

        A document counts as containing a word when the word occurs in the
        file's text as a substring. With an empty file list every IDF is
        0.0, because the logarithm would otherwise be undefined.
        """
        doc_freq = dict.fromkeys(self.worddict, 0)
        for filename in self.flielist:
            with open(filename, encoding="utf-8") as f:
                content = f.read()
            for word in doc_freq:
                if word in content:
                    doc_freq[word] += 1

        total_docs = len(self.flielist)
        self.idf_dict = {
            word: math.log(total_docs / (freq + 1)) if total_docs else 0.0
            for word, freq in doc_freq.items()
        }

    def if_idf(self):
        """Return a dict mapping each word to its TF-IDF, rounded to 4 places.

        The result is not stored on the instance under this method's own
        name, so the method can safely be called more than once.
        """
        return {
            word: round(self.worddict[word] * idf_value, 4)
            for word, idf_value in self.idf_dict.items()
        }


class ControlKeywordOutput:
    """Sort the words by TF-IDF and control how many keywords are output."""

    def __init__(self, text, num):
        # One CountTFIDF instance is enough; a second one would segment the
        # text and add it to the corpus a second time.
        counter = CountTFIDF(text)
        self.if_idf = counter.if_idf()
        self.wordlist = counter.wordlist
        self.num = num

    def extractionKeyword(self):
        """Print the top ``self.num`` keywords and return them as a list.

        ``self.num`` is lowered to the number of distinct words when it is
        larger, so asking for too many keywords cannot raise an index error.
        """
        ranked = sorted(self.if_idf.items(), key=lambda item: item[1], reverse=True)
        self.num = min(self.num, len(ranked))
        # A negative count yields no keywords.
        extract_word = [word for word, _ in ranked[:max(self.num, 0)]]
        print(f'文档的关键字为:{",".join(extract_word)}')
        return extract_word


def main():
    """Run the three demo sentences."""
    text1 = "当下雨天地面积水"
    text2 = "东方巨龙正在觉醒"
    text3 = "下雨天的积水"
    ControlKeywordOutput(text1, 3).extractionKeyword()
    ControlKeywordOutput(text2, 8).extractionKeyword()
    ControlKeywordOutput(text3, 2).extractionKeyword()


if __name__ == "__main__":
    main()
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。