赞
踩
1.Imdb影评的数据集介绍与下载
2.贝叶斯原理介绍
3.TF-IDF是什么
def preprocess_data():
    """Vectorize the IMDB reviews and persist the vectorizers and matrices.

    Reads the train and test review folders through ``get_data``, merges them
    into a single corpus, fits a ``CountVectorizer`` and a ``TfidfVectorizer``
    (both limited to ``vocab_size`` features) on that corpus, and writes four
    files into the ``data`` directory:

    * ``CountVectorizer.joblib`` / ``TfidfVectorizer.joblib`` - fitted vectorizers
    * ``CountVectorizer_trainData.npz`` / ``TfidfVectorizer_trainData.npz`` -
      dense document-term matrices (key ``x``) with their labels (key ``y``)
    """
    import os

    X_orig, Y_orig = get_data(aclImdb_train_dir)
    X_orig_test, Y_orig_test = get_data(aclImdb_test_dir)
    X_orig = np.concatenate([X_orig, X_orig_test])
    Y_orig = np.concatenate([Y_orig, Y_orig_test])

    cv = CountVectorizer(max_features=vocab_size)
    tfidf = TfidfVectorizer(max_features=vocab_size)
    cv.fit(X_orig)
    tfidf.fit(X_orig)
    print(cv.vocabulary_)

    # Turn the corpus into document-term matrices (sparse) with transform().
    train_data = cv.transform(X_orig)
    tfidf_train_data = tfidf.transform(X_orig)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    if hasattr(cv, "get_feature_names_out"):
        print(cv.get_feature_names_out())
    else:
        print(cv.get_feature_names())

    # Printing the sparse matrix lists (document index, vocabulary index)
    # followed by the count of that term in that document.
    print(train_data)

    # NOTE(review): toarray() densifies the whole documents x vocab_size
    # matrix, which can need a lot of memory - confirm it fits on the target
    # machine.
    train_data = train_data.toarray()
    tfidf_train_data = tfidf_train_data.toarray()
    print(train_data)

    # Make sure the output directory exists before anything is written to it.
    os.makedirs("data", exist_ok=True)
    joblib.dump(cv, "data/CountVectorizer.joblib")
    joblib.dump(tfidf, "data/TfidfVectorizer.joblib")
    # Forward slashes so the files land where train_my_module() loads them
    # from on every platform (a backslash is not a path separator on POSIX).
    np.savez('data/CountVectorizer_trainData', x=train_data, y=Y_orig)
    np.savez('data/TfidfVectorizer_trainData', x=tfidf_train_data, y=Y_orig)
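运行后会在 data 目录下生成下面四个文件:CountVectorizer.joblib、TfidfVectorizer.joblib、CountVectorizer_trainData.npz、TfidfVectorizer_trainData.npz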
def train_my_module(is_tfidf):
    """Train a multinomial Naive Bayes classifier on the saved matrices.

    Args:
        is_tfidf: When true, load ``data/TfidfVectorizer_trainData.npz`` and
            save the model to ``data/Tfidf_bayes_module.joblib``; otherwise
            load ``data/CountVectorizer_trainData.npz`` and save the model to
            ``data/bayes_module.joblib``.

    The data is split 70/30 into training and evaluation parts. No random
    seed is passed to ``train_test_split``, so the reported accuracy can
    differ between runs.
    """
    if is_tfidf:
        data_path = r'data/TfidfVectorizer_trainData.npz'
    else:
        data_path = 'data/CountVectorizer_trainData.npz'

    # np.load() on an .npz keeps the archive open; the context manager closes
    # it once both arrays have been read into memory.
    with np.load(data_path) as trainDataNew:
        x = trainDataNew['x']
        y = trainDataNew['y']

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
    x_train, x_test, y_train, y_test = (
        np.array(x_train), np.array(x_test), np.array(y_train), np.array(y_test)
    )
    print(x_train.shape, x_test.shape)

    # Fit the model on the training part.
    module = MultinomialNB()
    module.fit(x_train, y_train)

    # Predict the held-out part.
    y_pred = module.predict(x_test)

    if is_tfidf:
        joblib.dump(module, r'data/Tfidf_bayes_module.joblib')
    else:
        joblib.dump(module, r'data/bayes_module.joblib')

    # Report true labels, predictions and accuracy.
    print("正确值:{0}".format(y_test))
    print("预测值:{0}".format(y_pred))
    print("准确率:%f%%" % (accuracy_score(y_test, y_pred) * 100))
def predict_my_module(is_tfidf):
    """Classify a few hard-coded sample reviews with a saved model.

    Args:
        is_tfidf: When true, use the TF-IDF model and vectorizer saved under
            ``data/``; otherwise use the count-based pair.

    Prints the array of predicted labels (0 = negative, 1 = positive).
    """
    model_path = r'data/Tfidf_bayes_module.joblib' if is_tfidf else r'data/bayes_module.joblib'
    vectorizer_path = r'data/TfidfVectorizer.joblib' if is_tfidf else r'data/CountVectorizer.joblib'

    classifier = joblib.load(model_path)

    # Label convention: neg -> 0, positive -> 1
    samples = [
        "the character is so poorly written.",
        "this is bad movie ",
        "I'm not very disappoint for this movie",
        "I'm very happy for this movie",
    ]

    vectorizer = joblib.load(vectorizer_path)
    # Vectorize the samples with the fitted vocabulary, then densify the
    # matrix before handing it to the classifier.
    features = vectorizer.transform(samples).toarray()
    print(classifier.predict(features))
如果is_tfidf为False,那么就是用CountVectorizer生成的词向量训练的模型
如果is_tfidf为True,那么就是用TfidfVectorizer生成的词向量训练的模型
if __name__ == '__main__':
    # Step 1: build and save the vectorizers and document-term matrices.
    preprocess_data()

    # Step 2: choose the feature type (True -> TF-IDF, False -> raw counts)
    # and train the matching model.
    is_tfidf = True
    train_my_module(is_tfidf)

    # Optional step 3: classify the sample reviews with the saved model.
    #predict_my_module(is_tfidf)
运行结果如下,可知TfidfVectorizer 生成的词向量准确率更高。
Count Vectorizer | Tfidf Vectorizer |
---|---|
准确率:84.326667% | 准确率: 85.893333% |
"""IMDB sentiment classification with multinomial Naive Bayes.

Reviews are vectorized with either raw term counts or TF-IDF weights; the
fitted vectorizers, the dense matrices and the trained models are stored in
the ``data`` directory.
"""
import os as os
import re

import joblib
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

vocab_size = 30000
aclImdb_train_dir = r'D:\train_data\aclImdb\aclImdb\train'
aclImdb_test_dir = r'D:\train_data\aclImdb\aclImdb\test'

# Matches any HTML tag such as '<br />'; compiled once instead of per call.
_TAG_RE = re.compile(r'<[^>]+>')


def rm_tags(text):
    """Replace every HTML tag in *text* (e.g. ``<br /><br />``) with a space."""
    return _TAG_RE.sub(' ', text)


def clean_str(string):
    """Strip surrounding whitespace from *string* and lower-case it."""
    return string.strip().lower()


def process(text):
    """Normalize one raw review: strip and lower-case, then remove HTML tags."""
    text = clean_str(text)
    text = rm_tags(text)
    return text


def _read_reviews(folder, file_names):
    """Return the processed text of each named file in *folder*, in order."""
    reviews = []
    for file_name in file_names:
        with open(folder + '/' + file_name, encoding='utf-8') as f:
            reviews.append(process(f.read()))
    return reviews


def get_data(datapath):
    """Load the positive and negative reviews stored under *datapath*.

    Args:
        datapath: Directory containing a ``pos`` and a ``neg`` sub-folder
            with one UTF-8 text file per review.

    Returns:
        A tuple ``(X_orig, Y_orig)`` of NumPy arrays: the processed review
        texts (all positives first, then all negatives) and the matching
        labels (1 for positive, 0 for negative).
    """
    pos_dir = datapath + '/pos'
    neg_dir = datapath + '/neg'
    pos_files = os.listdir(pos_dir)
    neg_files = os.listdir(neg_dir)
    print(len(pos_files))
    print(len(neg_files))

    # Read each folder on its own: pairing the two listings with zip() would
    # silently drop files whenever the folders hold different numbers of
    # reviews.
    pos_all = _read_reviews(pos_dir, pos_files)
    neg_all = _read_reviews(neg_dir, neg_files)

    X_orig = np.array(pos_all + neg_all)
    Y_orig = np.array([1] * len(pos_all) + [0] * len(neg_all))
    return X_orig, Y_orig


def preprocess_data():
    """Vectorize the IMDB reviews and persist the vectorizers and matrices.

    Merges the train and test folders into one corpus, fits a
    ``CountVectorizer`` and a ``TfidfVectorizer`` (both limited to
    ``vocab_size`` features) on it, and writes the fitted vectorizers
    (``.joblib``) and the dense matrices with labels (``.npz``, keys ``x``
    and ``y``) into the ``data`` directory.
    """
    X_orig, Y_orig = get_data(aclImdb_train_dir)
    X_orig_test, Y_orig_test = get_data(aclImdb_test_dir)
    X_orig = np.concatenate([X_orig, X_orig_test])
    Y_orig = np.concatenate([Y_orig, Y_orig_test])

    cv = CountVectorizer(max_features=vocab_size)
    tfidf = TfidfVectorizer(max_features=vocab_size)
    cv.fit(X_orig)
    tfidf.fit(X_orig)
    print(cv.vocabulary_)

    # Turn the corpus into document-term matrices (sparse) with transform().
    train_data = cv.transform(X_orig)
    tfidf_train_data = tfidf.transform(X_orig)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    if hasattr(cv, "get_feature_names_out"):
        print(cv.get_feature_names_out())
    else:
        print(cv.get_feature_names())

    # Printing the sparse matrix lists (document index, vocabulary index)
    # followed by the count of that term in that document.
    print(train_data)

    # NOTE(review): toarray() densifies the whole documents x vocab_size
    # matrix, which can need a lot of memory - confirm it fits on the target
    # machine.
    train_data = train_data.toarray()
    tfidf_train_data = tfidf_train_data.toarray()
    print(train_data)

    # Make sure the output directory exists before anything is written to it.
    os.makedirs("data", exist_ok=True)
    joblib.dump(cv, "data/CountVectorizer.joblib")
    joblib.dump(tfidf, "data/TfidfVectorizer.joblib")
    # Forward slashes so the files land where train_my_module() loads them
    # from on every platform (a backslash is not a path separator on POSIX).
    np.savez('data/CountVectorizer_trainData', x=train_data, y=Y_orig)
    np.savez('data/TfidfVectorizer_trainData', x=tfidf_train_data, y=Y_orig)


def train_my_module(is_tfidf):
    """Train a multinomial Naive Bayes classifier on the saved matrices.

    Args:
        is_tfidf: When true, train on the TF-IDF matrix and save the model to
            ``data/Tfidf_bayes_module.joblib``; otherwise train on the count
            matrix and save to ``data/bayes_module.joblib``.

    The data is split 70/30 into training and evaluation parts. No random
    seed is passed to ``train_test_split``, so the reported accuracy can
    differ between runs.
    """
    if is_tfidf:
        data_path = r'data/TfidfVectorizer_trainData.npz'
    else:
        data_path = 'data/CountVectorizer_trainData.npz'

    # np.load() on an .npz keeps the archive open; the context manager closes
    # it once both arrays have been read into memory.
    with np.load(data_path) as trainDataNew:
        x = trainDataNew['x']
        y = trainDataNew['y']

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
    x_train, x_test, y_train, y_test = (
        np.array(x_train), np.array(x_test), np.array(y_train), np.array(y_test)
    )
    print(x_train.shape, x_test.shape)

    # Fit the model on the training part.
    module = MultinomialNB()
    module.fit(x_train, y_train)

    # Predict the held-out part.
    y_pred = module.predict(x_test)

    if is_tfidf:
        joblib.dump(module, r'data/Tfidf_bayes_module.joblib')
    else:
        joblib.dump(module, r'data/bayes_module.joblib')

    # Report true labels, predictions and accuracy.
    print("正确值:{0}".format(y_test))
    print("预测值:{0}".format(y_pred))
    print("准确率:%f%%" % (accuracy_score(y_test, y_pred) * 100))


def predict_my_module(is_tfidf):
    """Classify a few hard-coded sample reviews with a saved model.

    Args:
        is_tfidf: When true, use the TF-IDF model and vectorizer saved under
            ``data/``; otherwise use the count-based pair.

    Prints the array of predicted labels (0 = negative, 1 = positive).
    """
    if is_tfidf:
        model = joblib.load(r'data/Tfidf_bayes_module.joblib')
    else:
        model = joblib.load(r'data/bayes_module.joblib')

    # Label convention: neg -> 0, positive -> 1
    review = [
        "the character is so poorly written.",
        "this is bad movie ",
        "I'm not very disappoint for this movie",
        "I'm very happy for this movie",
    ]

    if is_tfidf:
        cv = joblib.load(r'data/TfidfVectorizer.joblib')
    else:
        cv = joblib.load(r'data/CountVectorizer.joblib')

    # Vectorize the samples with the fitted vocabulary, then densify the
    # matrix before handing it to the classifier.
    review_features = cv.transform(review).toarray()
    s = model.predict(review_features)
    print(s)


if __name__ == '__main__':
    preprocess_data()
    # True -> TF-IDF features, False -> raw term counts.
    is_tfidf = True
    train_my_module(is_tfidf)
    #predict_my_module(is_tfidf)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。