The data used is the classic 20 Newsgroups dataset.
Dataset link: http://qwone.com/~jason/20Newsgroups/ (the download can be slow; a proxy/VPN or another mirror may help)
Here is the complete code:
# -*- coding: utf-8 -*-
import os
import math
import pandas as pd

def TF(wordSet, split):
    # Term frequency: count how often each vocabulary word occurs in one document
    tf = dict.fromkeys(wordSet, 0)
    for word in split:
        tf[word] += 1
    return tf

def IDF(tfList):
    idfDict = dict.fromkeys(tfList[0], 0)   # words as keys, initial value 0
    N = len(tfList)                         # total number of documents
    for tf in tfList:                       # iterate over every document's tf dict
        for word, count in tf.items():      # iterate over every word of the current document
            if count > 0:                   # the word occurs in the current document
                idfDict[word] += 1          # increment df, the number of documents containing term t_j
    for word, Ni in idfDict.items():        # replace df with the inverse document frequency idf
        idfDict[word] = math.log10(N / Ni)  # neither N nor Ni can be 0
    return idfDict                          # return the IDF dictionary

def TFIDF(tf, idfs):                        # tf: term frequencies, idfs: inverse document frequencies
    tfidf = {}
    for word, tfval in tf.items():
        tfidf[word] = tfval * idfs[word]
    return tfidf

if __name__ == "__main__":
    # 1 Read the files
    text = []
    folder = "20news-bydate-train/alt.atheism/"
    name_all = os.listdir(folder)
    for name in name_all:
        with open(folder + name, "rb") as f:
            text.append(f.read().decode("latin-1"))  # decode the bytes instead of calling str() on them
    # 2 Tokenize every document
    wordSet = set()
    split_list = []
    for doc in text:
        split = doc.split(' ')
        split_list.append(split)
        wordSet = wordSet.union(split)       # build the vocabulary via set deduplication
    # 3 Count the term frequencies of every document
    tf = []
    for split in split_list:
        tf.append(TF(wordSet, split))
    # 4 Compute the inverse document frequencies over the whole collection
    idfs = IDF(tf)
    # 5 tf * idf = tf-idf
    tfidf = []
    for t in tf:
        tfidf.append(TFIDF(t, idfs))
    print(pd.DataFrame(tfidf))               # can be converted to a DataFrame for later processing
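For comparison, the same document-term matrix can also be built with scikit-learn's TfidfVectorizer. Its defaults differ from the hand-rolled version above (smoothed idf with a natural log, l2-normalized rows, and its own tokenizer), so the values will not match exactly. A minimal sketch, assuming the same 20news-bydate-train/alt.atheism/ directory:

# -*- coding: utf-8 -*-
# Sketch: the same vectorization using scikit-learn's TfidfVectorizer.
# sklearn's default idf is ln((1+N)/(1+df)) + 1 and each row is l2-normalized,
# so the numbers differ from the manual log10(N/df) implementation above.
import os
from sklearn.feature_extraction.text import TfidfVectorizer

folder = "20news-bydate-train/alt.atheism/"
docs = []
for fname in os.listdir(folder):
    with open(os.path.join(folder, fname), encoding="latin-1") as f:
        docs.append(f.read())

vectorizer = TfidfVectorizer()      # default tokenizer, not split(' ')
X = vectorizer.fit_transform(docs)  # scipy.sparse.csr_matrix of shape (n_docs, n_terms)
print(X.shape)
print(vectorizer.get_feature_names_out()[:10])  # on older scikit-learn versions: get_feature_names()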
This example reads 480 English documents and vectorizes them.
The final result is a 480 x 31412 DataFrame, which can be converted to an ndarray, a sparse matrix (scipy.sparse.csr_matrix), or other types as required by subsequent PCA dimensionality reduction and classification algorithms.
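For reference, a minimal sketch of these conversions, assuming df is the DataFrame printed above (the variable name is illustrative):

from scipy.sparse import csr_matrix

arr = df.to_numpy()       # numpy.ndarray of shape (480, 31412)
sparse = csr_matrix(arr)  # scipy.sparse.csr_matrix; saves memory since most entries are 0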