赞
踩
Python_001_旅游评论情感倾向性分析_002_基于glove词向量训练
关于词向量训练的方法,可参考以下文章:
https://blog.csdn.net/weixin_37947156/article/details/83145778
https://blog.csdn.net/weixin_40952784/article/details/100729036
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import jieba as jb
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
import warnings
warnings.filterwarnings("ignore") # 忽略版本问题
def loadGLoveModel(filename):
    """Load a whitespace-separated GloVe text file into a word -> vector dict.

    Each non-empty line is split on whitespace; the first field is used as
    the word and the remaining fields are parsed as float32 coefficients.

    Args:
        filename: Path of the UTF-8 encoded vector file.

    Returns:
        dict mapping each word to a 1-D ``numpy.ndarray`` of dtype float32.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a coefficient field cannot be parsed as a float.
    """
    embeddings_index = {}
    # The context manager guarantees the handle is closed even when a
    # malformed line makes the float conversion raise.
    with open(filename, encoding='UTF-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. trailing newline at end of file): nothing to parse.
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings_index
def suc_train(train_vecs, y_train, test_vecs, y_test):
    """Fit an RBF-kernel SVC, save it to disk and return its test score.

    Args:
        train_vecs: Feature matrix the classifier is fitted on.
        y_train: Labels matching ``train_vecs``.
        test_vecs: Feature matrix the fitted classifier is scored on.
        y_test: Labels matching ``test_vecs``.

    Returns:
        The result of ``cls.score(test_vecs, y_test)``.

    Side effects:
        Prints progress and the score, and writes the fitted model to
        ``../model/svcmodel.pkl``.
    """
    print("#创建SVC模型")
    # ``shrinking`` is documented as a bool; scikit-learn releases that
    # validate parameter types reject the integer 0, so pass False.
    cls = SVC(kernel="rbf", verbose=True, shrinking=False)
    # First argument is the training data, second the training labels.
    cls.fit(train_vecs, y_train)
    joblib.dump(cls, "../model/svcmodel.pkl")
    # Score once and reuse the value: predicting over the test set twice
    # doubles the evaluation cost for no benefit.
    score = cls.score(test_vecs, y_test)
    print("SVC评分:", score)
    return score
def logistic_train(train_vecs, y_train, test_vecs, y_test):
    """Fit a default LogisticRegression, save it and return its test score.

    Args:
        train_vecs: Feature matrix the classifier is fitted on.
        y_train: Labels matching ``train_vecs``.
        test_vecs: Feature matrix the fitted classifier is scored on.
        y_test: Labels matching ``test_vecs``.

    Returns:
        The result of ``regr.score(test_vecs, y_test)``.

    Side effects:
        Prints progress and the score, and writes the fitted model to
        ``../model/logisticmodel.pkl``.
    """
    print("#创建逻辑回归模型")
    regr = LogisticRegression()
    regr.fit(train_vecs, y_train)
    joblib.dump(regr, "../model/logisticmodel.pkl")
    # Evaluate a single time and reuse the result for both print and return.
    # (The label text below is kept exactly as the script has always printed it.)
    score = regr.score(test_vecs, y_test)
    print("Logisitic评分:", score)
    return score
def naivenayesian_train(train_vecs, y_train, test_vecs, y_test):
    """Fit a GaussianNB classifier, save it and return its test score.

    Args:
        train_vecs: Feature matrix the classifier is fitted on.
        y_train: Labels matching ``train_vecs``.
        test_vecs: Feature matrix the fitted classifier is scored on.
        y_test: Labels matching ``test_vecs``.

    Returns:
        The result of ``clf.score(test_vecs, y_test)``.

    Side effects:
        Prints progress and the score, and writes the fitted model to
        ``../model/naivenayesianmodel.pkl``.
    """
    print("#创建高斯朴素贝叶斯模型")
    clf = GaussianNB()
    clf.fit(train_vecs, y_train)
    joblib.dump(clf, "../model/naivenayesianmodel.pkl")
    # Evaluate a single time and reuse the result for both print and return.
    score = clf.score(test_vecs, y_test)
    print("高斯朴素贝叶斯评分:", score)
    return score
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    if not denominator:
        return float("nan")
    return numerator / denominator


def _print_prf(model_path):
    """Print precision, recall and F1 for both classes of a saved model.

    Loads the classifier stored at ``model_path`` with joblib, predicts on
    ``../model/train_vecs.npy`` and compares against
    ``../model/y_train.npy``. Label 1 is treated as the positive class and
    label 0 as the negative class.

    Each metric is printed twice: first with its name, then as a bare value
    (the output format the script has always produced).

    NOTE(review): the metrics are computed on the saved *training* split,
    not the test split — confirm this is intended.
    """
    train_vecs = np.load("../model/train_vecs.npy")
    model = joblib.load(model_path)
    # ``np.int`` was removed in NumPy 1.24; the builtin ``int`` is the
    # drop-in replacement and works on every NumPy version.
    y_pred = model.predict(train_vecs).astype(int)
    y_true = np.load("../model/y_train.npy").astype(int)

    # Confusion-matrix cells.
    tp = int(np.count_nonzero((y_true == 1) & (y_pred == 1)))
    fp = int(np.count_nonzero((y_true == 0) & (y_pred == 1)))
    tn = int(np.count_nonzero((y_true == 0) & (y_pred == 0)))
    fn = int(np.count_nonzero((y_true == 1) & (y_pred == 0)))

    # A zero denominator (e.g. a class that is never predicted) yields NaN
    # instead of raising ZeroDivisionError.
    pos_p = _safe_ratio(tp, tp + fp)
    pos_r = _safe_ratio(tp, tp + fn)
    pos_f = _safe_ratio(2 * pos_r * pos_p, pos_r + pos_p)
    neg_p = _safe_ratio(tn, tn + fn)
    neg_r = _safe_ratio(tn, tn + fp)
    neg_f = _safe_ratio(2 * neg_r * neg_p, neg_r + neg_p)

    metrics = (
        ("POS_P", pos_p),
        ("POS_R", pos_r),
        ("POS_F", pos_f),
        ("NEG_P", neg_p),
        ("NEG_R", neg_r),
        ("NEG_F", neg_f),
    )
    for label, value in metrics:
        print(label, value)
    for _, value in metrics:
        print(value)


def SVM_PRF():
    """Print precision/recall/F1 of the model saved at ``../model/svcmodel.pkl``."""
    _print_prf("../model/svcmodel.pkl")


def logistic_PRF():
    """Print precision/recall/F1 of the model saved at ``../model/logisticmodel.pkl``."""
    _print_prf("../model/logisticmodel.pkl")


def naivenayesian_PRF():
    """Print precision/recall/F1 of the model saved at ``../model/naivenayesianmodel.pkl``."""
    _print_prf("../model/naivenayesianmodel.pkl")
def build_vector(text, size, wv):
    """Average the word vectors of ``text`` into a single (1, size) array.

    Args:
        text: Iterable of tokens to look up in ``wv``.
        size: Dimensionality of the vectors stored in ``wv``.
        wv: Mapping from token to a NumPy vector (supports ``wv[token]``).

    Returns:
        A ``(1, size)`` float array holding the mean of the vectors of all
        tokens found in ``wv``; all zeros when no token is found.
    """
    vec = np.zeros((1, size))
    # Number of tokens that actually contributed a vector.
    count = 0
    for w in text:
        try:
            word_vec = wv[w].reshape((1, size))
        except (KeyError, ValueError):
            # KeyError: token is not in the vocabulary.
            # ValueError: stored vector does not have ``size`` elements.
            # Either way the token is skipped; anything else (including
            # KeyboardInterrupt) is no longer silently swallowed.
            continue
        vec += word_vec
        count += 1
    # Turn the sum into a mean; guard against dividing by zero when no
    # token was found.
    if count:
        vec /= count
    return vec
# ---------------------------------------------------------------------------
# Training script (executed at module top level).
#
# 1. Load the negative / positive review spreadsheets and tokenize them.
# 2. Split into train / test sets and persist the splits under ../model/.
# 3. For every GloVe dimensionality, build averaged sentence vectors, train
#    the three classifiers and print their precision / recall / F1.
# ---------------------------------------------------------------------------

# Steps 1-2 do not depend on the vector dimensionality, so they run once
# instead of being repeated on every loop iteration.
neg = pd.read_excel("../originalData/yn_neg.xlsx", header=None)  # negative reviews
pos = pd.read_excel("../originalData/yn_pos.xlsx", header=None)  # positive reviews

# Column 0 is the text passed to jieba; store each row's token list.
pos['words'] = pos[0].apply(lambda x: list(jb.cut(x)))
neg['words'] = neg[0].apply(lambda x: list(jb.cut(x)))

# Labels: 1 for every positive row, 0 for every negative row, in the same
# order as the concatenated token lists below.
y = np.concatenate((np.ones(len(pos)), np.zeros(len(neg))))
X = np.concatenate((pos['words'], neg['words']))
print("X-size:", len(X))

# 70 / 30 train-test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=3)
np.save("../model/y_train.npy", y_train)
np.save("../model/y_test.npy", y_test)
np.save("../model/x_train.npy", X_train)
np.save("../model/x_test.npy", X_test)

# Vector dimensionalities to evaluate: start 50, stop 50 (inclusive), step 50.
# Raise the stop value to sweep more of the ../gloveWordVector/yn_<dim>.txt files.
for i in range(50, 50 + 1, 50):
    print("开始启动", i)
    gloveModel = loadGLoveModel('../gloveWordVector/yn_' + str(i) + '.txt')

    # One averaged (1, i) vector per document, stacked into a 2-D matrix.
    train_vecs = np.concatenate([build_vector(z, i, gloveModel) for z in X_train])
    np.save('../model/train_vecs.npy', train_vecs)
    test_vecs = np.concatenate([build_vector(z, i, gloveModel) for z in X_test])
    np.save('../model/test_vecs.npy', test_vecs)

    # Each trainer prints its own score and saves its model to ../model/.
    suc_train(train_vecs, y_train, test_vecs, y_test)            # SVC
    logistic_train(train_vecs, y_train, test_vecs, y_test)       # logistic regression
    naivenayesian_train(train_vecs, y_train, test_vecs, y_test)  # Gaussian naive Bayes

    # The *_PRF functions reload the artifacts saved above from disk.
    SVM_PRF()
    logistic_PRF()
    naivenayesian_PRF()
数据下载:
https://www.aliyundrive.com/s/rPNV3YXWjEy
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。