import jieba
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
# Dump every JOB_TITLE to a text file; it doubles as a jieba user dictionary below.
with open('D:\\proj\\sodic_2021\\job_title.txt', 'w', encoding='utf-8') as f:
    for i in range(len(df_all_info)):
        job_title = df_all_info.loc[i, 'JOB_TITLE']
        f.write('%s\n' % job_title)  # one title per line
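A quick aside on what that file buys us (a sketch with a hypothetical job title, not from the original): jieba.load_userdict expects one entry per line, optionally followed by a frequency and a part-of-speech tag, so a bare list of titles is already a valid dictionary, and dictionary entries are then favoured as single tokens during segmentation.

import jieba

print('/'.join(jieba.cut('数据挖掘工程师')))           # may split into 数据/挖掘/工程师
jieba.load_userdict('D:\\proj\\sodic_2021\\job_title.txt')
print('/'.join(jieba.cut('数据挖掘工程师')))           # kept whole if the title is in the file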
# Segment the SPECILTY column with jieba, loading the job titles as a custom
# dictionary so that known titles are kept as single tokens.
df_all_info['SPECILTY_JIEBA'] = 0
jieba.load_userdict('D:\\proj\\sodic_2021\\job_title.txt')
for i in range(len(df_all_info)):
    if pd.isna(df_all_info.loc[i, 'SPECILTY']):  # skip missing specialties
        continue
    word = jieba.cut(df_all_info.loc[i, 'SPECILTY'])
    word_cut = ''
    for j in word:
        word_cut = word_cut + ' ' + j.upper()  # space-join the upper-cased tokens
    df_all_info.loc[i, 'SPECILTY_JIEBA'] = word_cut
df_all_info['SPECILTY_JIEBA'] = df_all_info['SPECILTY_JIEBA'].replace(0, 'None')  # rows that were NaN
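For reference, the same segmentation can be written as a vectorised pandas expression (a sketch assuming the df_all_info frame above; apart from a leading space per cell, the result matches the loop):

mask = df_all_info['SPECILTY'].notna()
df_all_info.loc[mask, 'SPECILTY_JIEBA'] = df_all_info.loc[mask, 'SPECILTY'].apply(
    lambda s: ' '.join(w.upper() for w in jieba.cut(s)))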
# Vectorise the segmented text: raw term counts first, then TF-IDF weights.
df_all_info_specilty_jieba_list = df_all_info['SPECILTY_JIEBA'].tolist()
vector = CountVectorizer()
count = vector.fit_transform(df_all_info_specilty_jieba_list)
word = vector.get_feature_names_out()  # get_feature_names() in scikit-learn < 1.2
transform = TfidfTransformer()
tfidf = transform.fit_transform(count)
weight = tfidf.toarray()

# Print the TF-IDF word weights for each document: the outer loop walks all
# documents, the inner loop walks the vocabulary of one document.
for i in range(len(weight)):
    print('%d' % i)  # document index
    for j in range(len(word)):
        print(word[j], weight[i][j])
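scikit-learn documents TfidfVectorizer as equivalent to CountVectorizer followed by TfidfTransformer, so the two-step pipeline above can be collapsed into one object (a sketch over the same df_all_info_specilty_jieba_list). One caveat worth knowing for segmented Chinese: the default token_pattern drops single-character tokens, so the pattern below is passed as an assumption to keep them; omit it to match the defaults used above.

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vec = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b')  # keep 1-char tokens
tfidf = tfidf_vec.fit_transform(df_all_info_specilty_jieba_list)
word = tfidf_vec.get_feature_names_out()
weight = tfidf.toarray()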