Python_001_旅游评论情感倾向性分析_002_基于glove词向量训练_基于python景区评论情感分析毕设

作者：很楠不爱3 | 2024-06-14 22:21:11

踩

基于python景区评论情感分析毕设

Python_001_旅游评论情感倾向性分析_002_基于glove词向量训练

一、训练词向量

关于词向量的训练参考文章:
https://blog.csdn.net/weixin_37947156/article/details/83145778
https://blog.csdn.net/weixin_40952784/article/details/100729036

二、跑分

from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import jieba as jb
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
import warnings

warnings.filterwarnings("ignore")  # Silence all warnings (original note: ignore version-related issues)
def loadGLoveModel(filename):
    """Load a GloVe text file into a ``{word: vector}`` dictionary.

    Each non-blank line is split on whitespace: the first field is the
    word and the remaining fields are parsed as its float32 coefficients.

    Args:
        filename: Path to a UTF-8 encoded vector file.

    Returns:
        dict mapping each word to a 1-D ``float32`` NumPy array. If a word
        occurs more than once, the last occurrence wins.
    """
    embeddings_index = {}
    # The context manager closes the file even if a line fails to parse.
    with open(filename, encoding='UTF-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. trailing newline): nothing to parse.
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings_index

def suc_train(train_vecs, y_train, test_vecs, y_test):
    """Train an RBF-kernel SVC, save it, and report its test-set score.

    Args:
        train_vecs: Training feature matrix.
        y_train: Training labels.
        test_vecs: Test feature matrix.
        y_test: Test labels.

    Returns:
        The value of ``cls.score(test_vecs, y_test)``.

    Side effects:
        Writes the fitted model to ``../model/svcmodel.pkl`` and prints
        the score.
    """
    print("#创建SVC模型")
    # `shrinking` is a boolean option; pass False rather than the integer 0,
    # which scikit-learn's parameter validation does not accept as a bool.
    cls = SVC(kernel="rbf", verbose=True, shrinking=False)
    cls.fit(train_vecs, y_train)
    # Persist the fitted model for the later PRF evaluation step.
    joblib.dump(cls, "../model/svcmodel.pkl")
    # Score once and reuse the value: scoring an SVC means predicting the
    # whole test set, so doing it twice doubles the cost.
    score = cls.score(test_vecs, y_test)
    print("SVC评分:", score)
    return score

def logistic_train(train_vecs, y_train, test_vecs, y_test):
    """Train a logistic-regression classifier, save it, and report its score.

    Args:
        train_vecs: Training feature matrix.
        y_train: Training labels.
        test_vecs: Test feature matrix.
        y_test: Test labels.

    Returns:
        The value of ``regr.score(test_vecs, y_test)``.

    Side effects:
        Writes the fitted model to ``../model/logisticmodel.pkl`` and
        prints the score.
    """
    print("#创建逻辑回归模型")
    regr = LogisticRegression()
    regr.fit(train_vecs, y_train)
    # Persist the fitted model for the later PRF evaluation step.
    joblib.dump(regr, "../model/logisticmodel.pkl")
    # Score once and reuse the value instead of predicting the test set twice.
    score = regr.score(test_vecs, y_test)
    print("Logisitic评分:", score)
    return score
def naivenayesian_train(train_vecs, y_train, test_vecs, y_test):
    """Train a Gaussian naive Bayes classifier, save it, and report its score.

    Args:
        train_vecs: Training feature matrix.
        y_train: Training labels.
        test_vecs: Test feature matrix.
        y_test: Test labels.

    Returns:
        The value of ``clf.score(test_vecs, y_test)``.

    Side effects:
        Writes the fitted model to ``../model/naivenayesianmodel.pkl`` and
        prints the score.
    """
    print("#创建高斯朴素贝叶斯模型")
    clf = GaussianNB()
    clf.fit(train_vecs, y_train)
    # Persist the fitted model for the later PRF evaluation step.
    joblib.dump(clf, "../model/naivenayesianmodel.pkl")
    # Score once and reuse the value instead of predicting the test set twice.
    score = clf.score(test_vecs, y_test)
    print("高斯朴素贝叶斯评分:", score)
    return score
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is 0."""
    return numerator / denominator if denominator else 0.0


def _report_prf(model_path):
    """Print precision, recall and F1 for both classes of a saved model.

    Loads the saved feature matrix and labels from ``../model/``, runs the
    model stored at ``model_path`` on them, and prints the six metrics
    twice: first labelled, then as bare values (one per line).

    A metric whose denominator is zero (for example, the model never
    predicts a class) is reported as 0.0.

    Args:
        model_path: Path of a model file written with ``joblib.dump``.
    """
    # NOTE(review): the metrics are computed on the saved *training* split
    # (train_vecs.npy / y_train.npy), not the test split. Confirm this is
    # intended before quoting these numbers as generalisation performance.
    train_vecs = np.load("../model/train_vecs.npy")
    # joblib.load unpickles the file; only load model files this project wrote.
    model = joblib.load(model_path)
    # Builtin `int` replaces the `np.int` alias, which NumPy 1.24 removed.
    y_pred = model.predict(train_vecs).astype(int)
    y_true = np.load("../model/y_train.npy").astype(int)

    # Confusion-matrix cells; label 1 is the positive class, 0 the negative.
    tp = int(np.count_nonzero((y_true == 1) & (y_pred == 1)))
    fp = int(np.count_nonzero((y_true == 0) & (y_pred == 1)))
    tn = int(np.count_nonzero((y_true == 0) & (y_pred == 0)))
    fn = int(np.count_nonzero((y_true == 1) & (y_pred == 0)))

    POS_P = _safe_ratio(tp, tp + fp)
    POS_R = _safe_ratio(tp, tp + fn)
    POS_F = _safe_ratio(2 * POS_R * POS_P, POS_R + POS_P)
    NEG_P = _safe_ratio(tn, tn + fn)
    NEG_R = _safe_ratio(tn, tn + fp)
    NEG_F = _safe_ratio(2 * NEG_R * NEG_P, NEG_R + NEG_P)

    metrics = (
        ("POS_P", POS_P),
        ("POS_R", POS_R),
        ("POS_F", POS_F),
        ("NEG_P", NEG_P),
        ("NEG_R", NEG_R),
        ("NEG_F", NEG_F),
    )
    for label, value in metrics:
        print(label, value)
    # Second pass without labels, kept from the original output format.
    for _, value in metrics:
        print(value)


def SVM_PRF():
    """Print per-class precision/recall/F1 for the saved SVC model."""
    _report_prf("../model/svcmodel.pkl")


def logistic_PRF():
    """Print per-class precision/recall/F1 for the saved logistic model."""
    _report_prf("../model/logisticmodel.pkl")


def naivenayesian_PRF():
    """Print per-class precision/recall/F1 for the saved naive Bayes model."""
    _report_prf("../model/naivenayesianmodel.pkl")

def build_vector(text, size, wv):
    """Average the word vectors of a tokenised text into one row vector.

    Args:
        text: Iterable of tokens.
        size: Dimensionality of the word vectors.
        wv: Mapping from token to a NumPy vector, indexed as ``wv[token]``.

    Returns:
        A ``(1, size)`` array holding the mean of the vectors of all tokens
        that could be used. It is all zeros when no token could be used.
    """
    vec = np.zeros((1, size))
    # Number of tokens that actually contributed a vector.
    count = 0
    for w in text:
        try:
            vec += wv[w].reshape((1, size))
        except (KeyError, ValueError):
            # KeyError: token is not in the vocabulary.
            # ValueError: stored vector cannot be reshaped to (1, size).
            # Both are skipped; unlike a bare `except`, KeyboardInterrupt
            # and genuine programming errors still propagate.
            continue
        count += 1

    # Leave the zero vector untouched when nothing was summed.
    if count:
        vec /= count
    return vec
# --- Corpus preparation -----------------------------------------------------
# Nothing in this section depends on the vector size, so it runs once
# instead of being repeated for every size in the sweep below.
neg = pd.read_excel("../originalData/yn_neg.xlsx", header=None)  # negative reviews
pos = pd.read_excel("../originalData/yn_pos.xlsx", header=None)  # positive reviews
# Segment every review (column 0) into a list of tokens with jieba.
pos['words'] = pos[0].apply(lambda x: list(jb.cut(x)))
neg['words'] = neg[0].apply(lambda x: list(jb.cut(x)))
# Labels follow the order of X: 1 = positive, 0 = negative.
y = np.concatenate((np.ones(len(pos)), np.zeros(len(neg))))
X = np.concatenate((pos['words'], neg['words']))
print("X-size:", len(X))
# Fixed random_state keeps the 70/30 split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=3)
np.save("../model/y_train.npy", y_train)
np.save("../model/y_test.npy", y_test)
np.save("../model/x_train.npy", X_train)
np.save("../model/x_test.npy", X_test)

# --- Train and evaluate for each GloVe vector size --------------------------
# Currently only size 50 is swept; raise the stop value to include
# 100, 150, ... (a matching ../gloveWordVector/yn_<size>.txt must exist).
for i in range(50, 51, 50):
    print("开始启动", i)
    gloveModel = loadGLoveModel('../gloveWordVector/yn_' + str(i) + '.txt')

    # One averaged (1, i) vector per review, stacked into a 2-D matrix.
    train_vecs = np.concatenate([build_vector(z, i, gloveModel) for z in X_train])
    np.save('../model/train_vecs.npy', train_vecs)
    test_vecs = np.concatenate([build_vector(z, i, gloveModel) for z in X_test])
    np.save('../model/test_vecs.npy', test_vecs)

    # Each trainer saves its model to ../model/ and prints its own score.
    suc_train(train_vecs, y_train, test_vecs, y_test)  # SVC
    logistic_train(train_vecs, y_train, test_vecs, y_test)  # logistic regression
    naivenayesian_train(train_vecs, y_train, test_vecs, y_test)  # Gaussian naive Bayes

    # The PRF reports read back the models and arrays saved above.
    SVM_PRF()
    logistic_PRF()
    naivenayesian_PRF()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217

数据下载:

https://www.aliyundrive.com/s/rPNV3YXWjEy

声明：本文内容由网友自发贡献，不代表【wpsshop博客】立场，版权归原作者所有，本站不承担相应法律责任。如您发现有侵权的内容，请联系我们。转载请注明出处：https://www.wpsshop.cn/w/很楠不爱3/article/detail/719814

Python_001_旅游评论情感倾向性分析_002_基于glove词向量训练_基于python景区评论情感分析毕设

一、训练词向量

二、跑分

Python_001_旅游评论情感倾向性分析_002_基于glove词向量训练_基于python景区评论情感分析毕设