赞
踩
学习华为云的自然语言处理课程,了解并实现TF-IDF的代码
"""Compute TF-IDF weights for a Chinese text corpus.

Pipeline: segment dataset.txt with jieba (precise mode), strip punctuation,
write the space-joined tokens to dataset_cut.txt, then vectorize the lines
with CountVectorizer (minus stop words from stopwords.txt) and weight the
term counts with TfidfTransformer.
"""
import jieba
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Punctuation/symbols to strip after segmentation.  The original chained
# ~25 .replace() calls; a single str.translate pass does one C-level scan.
# NOTE: the original removed only '···' (exactly three dots) and '——'
# (exactly two dashes) as multi-char sequences; deleting every single
# '·'/'—' is the intended punctuation strip and strictly more thorough.
_PUNCT = ',。?!《》‘“”:;【】、()···——-()/\n[]"'
_STRIP_TABLE = str.maketrans('', '', _PUNCT)

with open("dataset.txt", 'r', encoding='utf-8') as fenci:
    text = fenci.read()

# Precise-mode segmentation; join tokens with spaces, then drop punctuation.
str_out = ' '.join(jieba.cut(text, cut_all=False)).translate(_STRIP_TABLE)

# Context manager so the file is closed even if the write raises
# (the original used a bare open()/close() pair).
with open("dataset_cut.txt", 'w', encoding='utf-8') as fo:
    fo.write(str_out)

# Re-read the segmented corpus; each line becomes one "document".
with open("dataset_cut.txt", 'r', encoding='utf-8') as f:
    dataset = f.readlines()
print(len(dataset))
print(dataset)

# Stop words: whitespace-separated tokens (split() already handles newlines,
# so the original's replace('\n', ' ') was redundant).
with open("stopwords.txt", 'r', encoding='utf-8') as f:
    stopwords = f.read().split()
print(stopwords)

vectorizer = CountVectorizer(stop_words=stopwords, min_df=0)
transformer = TfidfTransformer()
# Term counts -> TF-IDF; rows are documents, columns are vocabulary terms.
tfidf = transformer.fit_transform(vectorizer.fit_transform(dataset))

word = vectorizer.get_feature_names_out()
print("word:", word)
print(vectorizer.vocabulary_)

weight = tfidf.toarray()
print("weight:", weight)

# BUG FIX: the original looped over the vocabulary while its message claimed
# "document %d", and it only ever read weight[0][i] — documents after the
# first were never printed.  Iterate documents in the outer loop and
# vocabulary terms in the inner loop.  (Also dropped the unused
# `word_weight = list()`.)
for doc_idx in range(weight.shape[0]):
    print("————这里输出第%d类文本的词语tf-idf权重" % doc_idx)
    for term_idx, term in enumerate(word):
        print(" ", term, weight[doc_idx][term_idx])
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。