赞
踩
下面这段代码用来计算文本的词频和 TF-IDF 值:
"""Compute term counts and TF-IDF weights for four review CSV files.

Each input file becomes one document: the comment texts found in one
column are filtered by length and joined together.  The script then
writes ``tfidf.csv`` with one row per (document, vocabulary word) pair
and the columns ``type`` (0-based document index), ``name`` (the word),
``number`` (raw count) and ``tfidf`` (TF-IDF weight).
"""
import numpy as np
import pandas as pd

# Input files; each one is treated as a single document of the corpus.
INPUT_FILES = (r"good_1.csv", r"good_2.csv", r"good_3.csv", r"good_4.csv")
OUTPUT_FILE = r"tfidf.csv"

# Positional index of the column that holds the comment text.
COMMENT_COLUMN = 8
# A comment is kept only when its text is strictly longer than this.
MIN_COMMENT_LENGTH = 5


def load_comments(path):
    """Read one review CSV file (first row is the header) into a DataFrame."""
    return pd.read_csv(
        path, header=0, index_col=False, engine="python", encoding="utf-8"
    )


def build_document(frame, column=COMMENT_COLUMN, min_length=MIN_COMMENT_LENGTH):
    """Join the sufficiently long comments of *frame* into one document string.

    Args:
        frame: DataFrame holding the comments.
        column: Positional index of the comment column.
        min_length: Cells whose text is not longer than this are skipped.
            Missing cells stringify to ``"nan"`` and are therefore skipped
            for any ``min_length`` of 3 or more.

    Returns:
        The kept comments joined by single spaces.
    """
    # Stringify once and use that same value for both the length check and
    # the concatenation, so a non-string cell cannot raise TypeError.
    texts = (str(value) for value in frame.iloc[:, column])
    # A separator keeps the last token of one comment from fusing with the
    # first token of the next; str.join also avoids quadratic `+=` growth.
    return " ".join(text for text in texts if len(text) > min_length)


def vectorize(corpus):
    """Fit count and TF-IDF models on *corpus*.

    Args:
        corpus: Sequence of document strings.

    Returns:
        A tuple ``(words, counts, tfidf)``: the vocabulary as a 1-D array
        and two dense ``(n_documents, n_words)`` arrays.
    """
    # scikit-learn is imported here rather than at module top so the pure
    # helpers in this file stay importable without it.
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn.feature_extraction.text import CountVectorizer

    counter = CountVectorizer()
    count_matrix = counter.fit_transform(corpus)
    tfidf_matrix = TfidfTransformer().fit_transform(count_matrix)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    if hasattr(counter, "get_feature_names_out"):
        words = counter.get_feature_names_out()
    else:
        words = counter.get_feature_names()
    return (
        np.asarray(words, dtype=object),
        count_matrix.toarray(),
        tfidf_matrix.toarray(),
    )


def build_table(words, counts, tfidf):
    """Flatten per-document matrices into a long table.

    Args:
        words: Vocabulary, length ``n_words``.
        counts: ``(n_documents, n_words)`` raw term counts.
        tfidf: ``(n_documents, n_words)`` TF-IDF weights.

    Returns:
        DataFrame with columns ``type``, ``name``, ``number``, ``tfidf``.
        Rows are ordered document by document, and within a document in
        vocabulary order.
    """
    counts = np.asarray(counts)
    tfidf = np.asarray(tfidf)
    n_docs, n_words = counts.shape
    # Build every column in one shot instead of concatenating one-row frames
    # inside a nested loop, which is quadratic in the number of rows.
    return pd.DataFrame(
        {
            "type": np.repeat(np.arange(n_docs), n_words),
            "name": np.tile(np.asarray(words, dtype=object), n_docs),
            "number": counts.ravel(),
            "tfidf": tfidf.ravel(),
        }
    )


def main():
    """Load the input files, compute counts and TF-IDF, write the CSV."""
    # Build the corpus: one document per input file.
    corpus = []
    for path in INPUT_FILES:
        frame = load_comments(path)
        print(frame.shape)
        corpus.append(build_document(frame))

    words, counts, tfidf = vectorize(corpus)
    table = build_table(words, counts, tfidf)

    # Write the result file.
    table.to_csv(OUTPUT_FILE, header=True, index=False, encoding="utf-8")


if __name__ == "__main__":
    main()
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。