赞
踩
# -*- coding: UTF-8 -*-
import re

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

# Every character that is not an ASCII letter is treated as a separator.
_NON_LETTERS = re.compile(r"[^a-zA-Z]")


def review_to_wordlist(review):
    """Convert one raw review into a list of lowercase words.

    Adapted from a recipe originally written for IMDB reviews:
    http://blog.csdn.net/longxinchen_ml/article/details/50629613

    Args:
        review: Raw review text, possibly containing HTML markup. ``None``
            and float NaN (what pandas yields for an empty CSV cell) are
            treated as an empty review.

    Returns:
        list[str]: The lowercase, purely alphabetic tokens of the review,
        in their original order. Empty when the review has no letters.
    """
    # A missing cell has no text to parse; BeautifulSoup would raise on it.
    if review is None or (isinstance(review, float) and review != review):
        return []
    # Strip HTML tags and keep only the text content.
    review_text = BeautifulSoup(review, "html.parser").get_text()
    # Replace every non-letter character with a space.
    review_text = _NON_LETTERS.sub(" ", review_text)
    # Lowercase everything and split on whitespace.
    return review_text.lower().split()
# Load the training and test sets; row 0 of each CSV holds the column names.
train, test = (
    pd.read_csv(csv_path, header=0)
    for csv_path in ('data/new_train.csv', 'data/new_test.csv')
)
# Preview the first rows of each frame.
for frame in (train, test):
    print(frame.head())
ID sentiment review
0 1 1 Jo bhi ap se tou behtar hoon
1 2 0 ya Allah meri sister Affia ki madad farma
2 3 1 Yeh khud chahta a is umar main shadi krna ha...
3 4 1 Tc Apky mun xe exe alfax achy nae lgty
4 5 0 Good
id review
0 1 Jis ke aiteraf mien inhe behtareen muaawin ac...
1 2 Thank you same to you
2 3 ALLAH ki marzi hai Beshak wohi ata karne wala ...
3 4 Asal masla yehi hei k wo iss umar mein bhi sha...
4 5 Chaudhry Rehmat Ali ne January ko Ab...
# Preprocess: clean every review and re-join its words with single spaces.
label = train['sentiment']
# Iterate over the column values directly instead of looking rows up by
# label (train['review'][i]), which fails when the index is not 0..n-1.
train_data = [' '.join(review_to_wordlist(text)) for text in train['review']]
test_data = [' '.join(review_to_wordlist(text)) for text in test['review']]

# Preview the first cleaned sample from each set.
print(train_data[0], '\n')
print(test_data[0])
jo bhi ap se tou behtar hoon
jis ke aiteraf mien inhe behtareen muaawin actor ke national film award se nawaza gaya
# TF-IDF vectorization
from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF

# Reference: http://blog.csdn.net/longxinchen_ml/article/details/50629613
tfidf = TFIDF(
    min_df=2,  # ignore terms that appear in fewer than 2 documents
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),  # unigrams, bigrams and trigrams
    # Real booleans: newer scikit-learn releases validate parameter types
    # and deprecate/reject the integer 1 for these options.
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    stop_words='english',  # drop English stop words
)

# Vectorize train and test together so both share one vocabulary and one
# set of IDF weights.
data_all = train_data + test_data
len_train = len(train_data)
data_all = tfidf.fit_transform(data_all)

# Split the matrix back into its training and test parts.
train_x = data_all[:len_train]
test_x = data_all[len_train:]
print('TF-IDF处理结束.')
TF-IDF处理结束.
D:\anaconda\lib\site-packages\sklearn\feature_extraction\text.py:1059: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
# Train a multinomial naive Bayes classifier on the TF-IDF features.
from sklearn.naive_bayes import MultinomialNB as MNB

# fit() returns the estimator itself, so construction and training chain.
model_NB = MNB().fit(train_x, label)
MNB(alpha=1.0, class_prior=None, fit_prior=True)
# sklearn.cross_validation was removed in scikit-learn 0.20;
# cross_val_score now lives in sklearn.model_selection.
from sklearn.model_selection import cross_val_score
import numpy as np

# Mean ROC AUC over 10 cross-validation folds of the training set.
cv_scores = cross_val_score(model_NB, train_x, label, cv=10, scoring='roc_auc')
print("多项式贝叶斯分类器10折交叉验证得分: ", np.mean(cv_scores))
多项式贝叶斯分类器10折交叉验证得分: 0.8631634970590059
# Class-probability estimates for the test set. predict_proba already
# returns a NumPy array, so no extra np.array() copy is needed.
test_predicted = model_NB.predict_proba(test_x)

# Optional export of the predictions (left disabled, as in the original).
# NOTE: test_predicted has one column per class, so the single 'sentiment'
# column has to be built from one of them. Column 1 should be the
# probability of label 1 for 0/1 labels -- confirm via model_NB.classes_.
# print ('保存结果...')
# nb_output = pd.DataFrame(data=test_predicted[:, 1], columns=['sentiment'])
# nb_output['id'] = test['id']
# nb_output = nb_output[['id', 'sentiment']]
# nb_output.to_csv('nb_output.csv', index=False)
# print ('结束.')
test_predicted
array([[0.88318156, 0.11681844],
[0.87972973, 0.12027027],
[0.68929881, 0.31070119],
...,
[0.5871227 , 0.4128773 ],
[0.38977763, 0.61022237],
[0.46662657, 0.53337343]])
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。