当前位置:   article > 正文

NLP学习笔记——情感分析实战(情感分类)

NLP学习笔记——情感分析实战(情感分类)

相关知识请自行了解,下面直接上代码。代码是很久之前在大学时写的,可能有点乱,工作之后也没有时间优化,但各模块的功能绝对没问题。如果运行报错,多半是库版本不兼容导致的——原来使用的库版本我已经不记得了,只记得 Python 用的是 3.8。

all_param.py

  1. word2vec_size = 768 # word-vector dimensionality
  2. max_len = 250 # maximum sentence length
  3. batch_size = 16 # number of samples per training batch
  4. head_num = 8 # number of attention heads; must be smaller than the word-vector dimensionality (head_dim = word2vec_size // head_num)
  5. transformer_layer = 1 # number of encoder (decoder) layers
  6. class_num = 2 # number of target classes
  7. learning_rate = 1e-5 # learning rate
  8. steps = 10 # number of training passes
  9. Train = True # mode switch: True = training mode, False = prediction mode
  10. cnn_layer = 3 # number of CNN layers
  11. kernel_num = 32 # number of convolution kernels

一、Transformer-textCNN

data2vector.py(这部分比较乱:不同的数据集需要不同的处理方法,建议大家根据自己的数据自行编写)

  1. import numpy as np
  2. from all_param import *
  3. def word2vec_index(file_path):
  4. """
  5. :param file_path: 词向量文件路径
  6. :return word2vector: 字到向量的字典
  7. :return word2index: 字到词袋表示的字典
  8. :return index2word: 词袋表示到字的字典
  9. """
  10. word2vector = {}
  11. word2index = {}
  12. index2word = {}
  13. with open(file_path, 'r', encoding='utf-8') as file:
  14. index = 1
  15. data = file.readlines()[1:]
  16. for line in data:
  17. line = line.replace('\n', '')
  18. line = line.split(' ')
  19. word = line[0]
  20. vector = np.array(line[1:], dtype=float)
  21. #建立索引
  22. word2vector[word] = vector
  23. word2index[word] = index
  24. index2word[index] = word
  25. index +=1
  26. # 加入填充符
  27. word2vector['<pad>'] = np.zeros(shape=(word2vec_size))
  28. word2index['<pad>'] = 0
  29. index2word[0] = '<pad>'
  30. return word2vector, word2index, index2word
  31. def data_processing(path, data_len, word2vector, word2index, data_batch, data_start_site):
  32. """
  33. :param path: 数据集路径
  34. :param data_len: 数据数
  35. :param word2vector: 转词向量字典
  36. :param word2index: 转词词袋表示字典
  37. :param data_batch: 一次取的数据数
  38. :param data_start_site: 开始取的数据位置
  39. :return comment2vector: 评论向量表示
  40. :return comment2index: 评论词袋表示
  41. :return labels: 标签(独热编码)
  42. """
  43. with open(path, 'r', encoding='utf-8') as file1:
  44. data = file1.readlines()
  45. if data_start_site + data_batch > data_len: # 选取数据下标超出列表的长度但小于所取的数据批数时
  46. end_site = data_start_site + data_batch - data_len # 应取数据的末尾位置
  47. data = data[data_start_site:] + data[:end_site]
  48. else:
  49. end_site = data_start_site + data_batch # 应取数据的末尾位置
  50. data = data[data_start_site:end_site]
  51. file1.close()
  52. #初始化向量空间和词袋空间
  53. comment2vector = np.zeros(shape=(len(data), max_len, word2vec_size))
  54. comment2index = np.zeros(shape=(len(data), max_len))
  55. labels = np.zeros(shape=(len(data), class_num), dtype=float)
  56. #遍历每一条评论
  57. for i in range(len(data)):
  58. comment = data[i][2:] # 获取评论
  59. comment = comment.replace('\n', '')
  60. comment = comment.split(' ')
  61. comment = [i for i in comment if i !=''] # 去除列表里所有空元素
  62. for word in range(max_len): #对评论进行数值转换
  63. if word > len(comment) - 1: #评论长度短需要填充时
  64. continue
  65. else: #正常数值转换时
  66. comment2vector[i][word] = word2vector[comment[word]] #向量转换
  67. comment2index[i][word] = word2index[comment[word]] #词袋转换
  68. label = int(data[i][:1]) # 获取标签
  69. # 独热编码
  70. labels[i][label] = 1
  71. # 标签平滑
  72. for zero in range(len(labels[i])):
  73. if labels[i][zero] == 0:
  74. labels[i][zero] = 0.0000001
  75. else:
  76. labels[i][zero] = 0.9999999
  77. return comment2vector, comment2index, labels
  78. if __name__ == '__main__':
  79. word2vector, word2index, index2word = word2vec_index(
  80. 'word2vec/douban_comment/fen_ci128/balanced/balanced_data.vector') # 加载词向量
  81. # 获取数据集个数
  82. with open('data_set/douban_comment/balanced/balanced_train.txt', 'r', encoding='utf-8') as file1:
  83. datas_len = len(file1.readlines())
  84. file1.close()
  85. print('一共有{}条数据'.format(datas_len))
  86. # 分批次输入数据集
  87. #batch_num = datas_len // batch_size # 可分的批次数
  88. batch_num = 1
  89. for i in range(batch_num+1):
  90. comment_vector, comment_index, labels = data_processing(
  91. 'data_set/douban_comment/balanced/balanced_train.txt', datas_len,word2vector, word2index, batch_size, i * batch_size)
  92. print(labels)

block_transformer.py

  1. """
  2. 定义transformer模块
  3. """
  4. from all_param import *
  5. from tensorflow import keras
  6. import numpy as np
  7. import tensorflow as tf
  8. class transformer(keras.Model):
  9. def __init__(self, max_len, word_dim, head_num, class_num, learning_rate, Train):
  10. super(transformer, self).__init__()
  11. self.Train = Train
  12. self.pe = self.positional_encoding(word_dim,max_len) # 位置编码
  13. self.head_dim = word_dim // head_num # 分头后的维度
  14. # Q、K、V矩阵 kernel_initializer='RandomUniform'
  15. self.Wq = keras.layers.Dense(self.head_dim * head_num,kernel_initializer='RandomUniform')
  16. self.Wk = keras.layers.Dense(self.head_dim * head_num,kernel_initializer='RandomUniform')
  17. self.Wv = keras.layers.Dense(self.head_dim * head_num,kernel_initializer='RandomUniform')
  18. # 前馈神经网络
  19. self.feed_forward_network = keras.layers.Dense(word_dim * head_num,kernel_initializer='RandomUniform',
  20. activation=keras.activations.relu)
  21. self.adjust_shape = [keras.layers.Dense(word_dim,kernel_initializer='RandomUniform') for _ in range(2)] # 调整多头注意力输出张量形状
  22. self.drop = [keras.layers.Dropout(rate=learning_rate) for _ in range(2)] # 防止过拟合,让神经元以rate的概率停止工作
  23. self.layer_norm = [keras.layers.LayerNormalization(axis=-1) for _ in range(2)] # Norm
  24. self.linear = keras.layers.Dense(class_num,kernel_initializer='RandomUniform') # 初始化全连接层(linear层)
  25. # 位置编码
  26. def positional_encoding(self,word_dim,max_len):
  27. """
  28. :return pe: 位置编码
  29. """
  30. # 初始化变量pos和i
  31. pos = np.array([[i for i in range(max_len)]]).T
  32. I = np.array([[i if i%2==0 else (i-1) for i in range(word_dim)]])
  33. # 公式计算
  34. pe = pos / np.power(10000, I/word_dim)
  35. pe[:, 0::2] = np.sin(pe[:, 0::2])
  36. pe[:, 1::2] = np.cos(pe[:, 1::2])
  37. return pe
  38. # 多头注意力机制
  39. def multi_head_attention(self,x_embedding, x_index, this_layer):
  40. """
  41. :param x_embedding: 词向量表示
  42. :return output: 含注意力信息的词向量
  43. """
  44. # 公式计算
  45. q,k,v = self.Wq(x_embedding), self.Wk(x_embedding), self.Wv(x_embedding)
  46. h_q = tf.reshape(q, (q.shape[0], head_num, q.shape[1], self.head_dim)) # 分头
  47. h_k = tf.reshape(k, (k.shape[0], head_num, k.shape[1], self.head_dim))
  48. h_v = tf.reshape(v, (v.shape[0], head_num, v.shape[1], self.head_dim))
  49. dk = h_q.shape[-1]
  50. attention = tf.matmul(h_q, h_k, transpose_b=True) / np.sqrt(dk) # 未加掩码的注意力
  51. attention_mask = self.mask(x_index)
  52. # 加入掩码
  53. attention += attention_mask * -1e10 # 使要遮掩的位置的注意力为负无穷大
  54. self.attention = tf.nn.softmax(attention, axis=-1) # 经过softmax后需要遮掩位置的注意力为无限接近0
  55. att_massage = tf.matmul(self.attention, h_v) # 获得通过注意力表示的词向量
  56. # 输出数据形状调整
  57. att_massage = tf.transpose(att_massage, perm=[0, 2, 1, 3]) # 为了方便下一步降维,将head_num和head_dim整合成word_dim
  58. att_massage = tf.reshape(att_massage, (att_massage.shape[0], att_massage.shape[1], -1))
  59. output = self.adjust_shape[0](att_massage) # 词向量形状规范化,head_num * head_dim不一定等于word_dim
  60. output = self.drop[0](output, training=self.Train)
  61. return output
  62. # 多头注意力机制里的掩码
  63. def mask(self, x_index):
  64. """
  65. :param x_index: 词袋表示
  66. :return word_mask: 填充符向量掩码
  67. :return attention_mask: 注意力掩码
  68. """
  69. mask = tf.math.equal(x_index, np.zeros(shape=x_index.shape)) # 找到需要遮掩的元素位置,值为True
  70. attention_mask = mask[:, np.newaxis, np.newaxis, :]
  71. attention_mask = tf.cast(attention_mask, dtype=tf.float32) # 获得词向量填充符掩码
  72. return attention_mask
  73. # 前馈神经网络
  74. def feed_forward(self,attention, this_layer):
  75. """
  76. :param attention: 含注意力信息的词向量
  77. :return output: 调整后的词向量
  78. """
  79. # 数据输入计算
  80. output = self.feed_forward_network(attention)
  81. output = self.adjust_shape[1](output)
  82. output = self.drop[1](output, training=self.Train)
  83. return output
  84. # 编码器层
  85. def encoder_layer(self, x_embedding, x_index, this_layer):
  86. """
  87. :param x_embedding: 含位置编码的词向量表示
  88. :param x_index: 词袋表示
  89. :param this_layer: 编码器层
  90. :return:
  91. """
  92. x_attention = self.layer_norm[0](x_embedding) # Norm (layerNorm)
  93. x_attention = self.multi_head_attention(x_attention, x_index, this_layer) # 多头注意力机制
  94. x_attention += x_embedding # Add
  95. x_message = self.layer_norm[1](x_attention) # Norm (layerNorm)
  96. x_message = self.feed_forward(x_message, this_layer) # 前馈神经网络
  97. x_message += x_attention # Add
  98. return x_message
  99. # 整个编码器模块
  100. def encoder(self, x_embedding, x_index, layer_num):
  101. """
  102. :param x_embedding: 含位置编码的词向量表示
  103. :param x_index: 词袋表示
  104. :return x_message: 编码器提取到的信息
  105. """
  106. # 各个模块组成编码器
  107. x_message = x_embedding
  108. for i in range(layer_num): # encoder的个数
  109. x_message = self.encoder_layer(x_message, x_index, i)
  110. return x_message
  111. # 整个transformer模型
  112. def calls(self, x_vector, x_index, layer_num):
  113. """
  114. :param x_vector: 词向量表示
  115. :param x_index: 词袋表示
  116. :return: 预测类别的概率
  117. """
  118. # 各个模块拼接成transformer
  119. x_embedding = x_vector + self.pe # 位置编码嵌入
  120. scores = self.encoder(x_embedding, x_index, layer_num) # 编码器
  121. #scores = tf.reduce_mean(scores, axis=1) # 降维形成句向量,去掉max_len维度
  122. #scores = self.linear(scores) # 全链接,实现类别数值的计算[batch_size,class_num]
  123. #scores = tf.math.softmax(scores, axis=-1) # 获得类别概率
  124. return scores
  125. if __name__ == '__main__':
  126. """
  127. 测试transformer能不能正常使用
  128. """
  129. import os
  130. import data2vector
  131. import pickle
  132. import time
  133. # 训练
  134. def train(model, data_path, batch_size, steps, word2vector, word2index, class_num, layer_num,
  135. cross_entropy,optimizer,save_path,writing_mode):
  136. """
  137. :param data_path: 训练集路径
  138. :param batch_size: 批数
  139. :param steps: 训练次数
  140. """
  141. # 获取数据总数
  142. with open(data_path, 'r', encoding='utf-8') as file1:
  143. datas_len = len(file1.readlines())
  144. print('共有{}条数据'.format(datas_len))
  145. file1.close()
  146. # 训练轮数
  147. all_time = 0 # 记录训练总耗时
  148. for step in range(steps):
  149. start_time = time.time()
  150. # 遍历数据集,分批次输入数据集
  151. data_copies = datas_len // batch_size # 可分的批次数
  152. #data_copies = 10
  153. # 用来记录每一批数据的训练结果
  154. all_loss = []
  155. all_scores = np.zeros(shape=((data_copies + 1) * batch_size, class_num))
  156. all_labels = np.zeros(shape=((data_copies + 1) * batch_size, class_num))
  157. for i in range(data_copies):
  158. x_vector, x_index, labels = data2vector.data_processing(
  159. data_path, datas_len, word2vector, word2index,
  160. batch_size, i * batch_size)
  161. # 开始训练并计算损失
  162. with tf.GradientTape() as tape:
  163. scores = model.calls(x_vector, x_index,layer_num) # 获取模型预测值
  164. loss = cross_entropy(labels, scores) # 计算交叉熵损失
  165. derivative = tape.gradient(loss, model.trainable_variables) # 自动求导
  166. optimizer.apply_gradients(zip(derivative, model.trainable_variables)) # 更新参数
  167. # 记录遍历一遍数据的总结果
  168. all_loss.append(loss)
  169. all_scores[i * batch_size: (i + 1) * batch_size, :] = scores
  170. all_labels[i * batch_size: (i + 1) * batch_size, :] = labels
  171. print('\r共有{}批数据,第 {:3} 批数据,当前损失: {:4f} '.format(data_copies,i, loss), end='')
  172. # 打印并保存本次训练结果
  173. if step % 1 == 0:
  174. this_time = time.time() - start_time # 本次耗时
  175. all_time += this_time # 总耗时
  176. predict_value = np.argmax(all_scores, axis=-1)[:, None] # 预测标签(0或1)
  177. actual_value = np.argmax(all_labels, axis=-1)[:, None] # 实际标签
  178. result = np.concatenate((predict_value, actual_value), axis=1) # 标签拼接对比[预测,实际]
  179. # 保存和打印
  180. look_and_save_data(model,result, this_time, save_path,writing_mode,word2vector=word2vector,
  181. word2index=word2index, index2word=index2word, step=step,
  182. loss=np.array(all_loss).mean(), all_time=all_time)
  183. writing_mode = 'a'
  184. # 测试
  185. def test(model, data_path, batch_size,layer_num,class_num,save_path,writing_mode):
  186. """
  187. :param data_path: 测试集路径
  188. :param batch_size: 批数
  189. """
  190. # 加载训练好的模型
  191. with open(save_path+"/tmp/transformer_word2idx_idx2word.pkl", "rb") as f:
  192. dic = pickle.load(f)
  193. f.close()
  194. word2vector = dic['word2vector']
  195. word2index = dic['word2idx']
  196. model.load_weights(save_path+"/model.ckpt")
  197. # 获取数据集长度
  198. with open(data_path, 'r', encoding='utf-8') as file1:
  199. datas_len = len(file1.readlines())
  200. print('共有{}条数据'.format(datas_len))
  201. file1.close()
  202. # 测试
  203. start_time = time.time()
  204. batch_num = datas_len // batch_size # 需要处理的次数
  205. # 记录全部预测结果
  206. results = np.zeros(shape=((batch_num) * batch_size, class_num))
  207. for i in range(batch_num):
  208. x_vector, x_index, labels = data2vector.data_processing(
  209. data_path, datas_len, word2vector, word2index,
  210. batch_size, i * batch_size)
  211. scores = model.calls(x_vector, x_index,layer_num) # 获取模型预测值
  212. predict_value = np.argmax(scores, axis=-1)[:, None] # 预测标签(0或1)
  213. actual_value = np.argmax(labels, axis=-1)[:, None] # 实际标签
  214. result = np.concatenate((predict_value, actual_value), axis=1) # 标签拼接对比[预测,实际]
  215. results[i * batch_size: (i + 1) * batch_size, :] = result # 将该批结果存入总结果
  216. print('\r第 {:3} 批数据,共有{}批数据'.format(i+1, batch_num+1), end='')
  217. times = time.time() - start_time
  218. look_and_save_data(model,results, times,save_path,writing_mode)
  219. # 打印和保存训练过程或预测结果
  220. def look_and_save_data(model, result, this_time, save_path,writing_mode,word2vector=None, word2index=None, index2word=None,
  221. step=None, loss=None, all_time=None):
  222. """
  223. :param result: 预测和标签 [预测,标签]
  224. :param this_time: 本次耗时
  225. :param step: 训练次数
  226. :param loss: 损失值
  227. :param all_time: 总耗时
  228. """
  229. # 计算P、R、F1、Accuracy
  230. TP = len([i for i in result if i.sum() == 2])
  231. TN = len([i for i in result if i.sum() == 0])
  232. FP = len([i for i in result if (i[0] - i[1]) == 1])
  233. FN = len([i for i in result if (i[0] - i[1]) == -1])
  234. P = (TP + 0.0001) / (TP + FP + 0.0001)
  235. R = (TP + 0.0001) / (TP + FN + 0.0001)
  236. F1 = (2 * P * R + 0.00001) / (P + R + 0.00001)
  237. Accuracy = (TP + TN) / len(result)
  238. os.makedirs(save_path, exist_ok=True) # 创建文件目录
  239. # 输出并保存结果
  240. if Train == True: # 训练模式
  241. # 打印并保存训练过程
  242. print("\tstep: {:3} | mean_loss: {:3f} | time: {:3f}m | Accuracy: {:3f} |".format(
  243. step, loss, this_time / 60, Accuracy))
  244. # 保存训练过程的数据
  245. with open(save_path+'/train_process.txt', writing_mode, encoding='utf-8') as file:
  246. file.write(
  247. "step: {:3} | mean_loss: {:3f} | time: {:3f} | P: {:3f} | R: {:3f} | F1: {:3f} | Accuracy: {:3f} |\n".format(
  248. step, loss, all_time, P, R, F1, Accuracy))
  249. file.close()
  250. # 保存模型
  251. model.save_weights(save_path+"/model.ckpt")
  252. os.makedirs(save_path+"/tmp", exist_ok=True)
  253. with open(save_path+"/tmp/transformer_word2idx_idx2word.pkl", "wb") as f:
  254. pickle.dump({"word2vector": word2vector, "word2idx": word2index, "idx2word": index2word}, f)
  255. else: # 预测模式
  256. print("P: {:3f} | R: {:3f} | F1: {:3f} | Accuracy: {:3f} | time: {:3f} |\n".format(
  257. P, R, F1, Accuracy, this_time))
  258. with open(save_path+'/test_result.txt', writing_mode, encoding='utf-8') as file:
  259. file.write("P: {:3f} | R: {:3f} | F1: {:3f} | Accuracy: {:3f} | time: {:3f} |\n".format(
  260. P, R, F1, Accuracy, this_time))
  261. file.close()
  262. # 初始化交叉熵和优化器
  263. cross_entropy = keras.losses.CategoricalCrossentropy(from_logits=False)
  264. optimizer = keras.optimizers.Adam()
  265. writing_mode = 'w' # 初始写入模式为覆盖
  266. save_path = './model_data/cg'
  267. # 模型参数初始化
  268. model = transformer(max_len, word2vec_size, head_num, class_num, learning_rate, Train)
  269. if Train == True: # 模型训练
  270. word2vector, word2index, index2word = data2vector.word2vec_index(
  271. 'word2vec/douban_comment/fen_ci128/balanced/balanced_data.vector')
  272. train(model,'data_set/douban_comment/balanced/balanced_train.txt', batch_size, steps,
  273. word2vector,word2index, class_num, transformer_layer,cross_entropy,optimizer,save_path,writing_mode)
  274. else: # 测试模型
  275. # 模型参数初始化
  276. test(model,'data_set/douban_comment/balanced/balanced_test.txt', batch_size, transformer_layer,
  277. class_num,save_path,writing_mode)

block_CNN.py

  1. import tensorflow as tf
  2. from all_param import *
  3. import numpy as np
  4. class TextCNN(tf.keras.Model):
  5. def __init__(self, word2vec_size, kernel_num, cnn_layer, learning_rate, class_num, Train):
  6. super(TextCNN, self).__init__()
  7. self.Train = Train
  8. # 初始化第一层卷积核大小分别为(2,embed_dim),(3,embed_dim),(4,embed_dim)的卷积层
  9. self.conv = [tf.keras.layers.Conv2D(kernel_num, (i,word2vec_size), strides=(1,1), padding='valid',
  10. kernel_initializer='RandomUniform', activation='relu') for i in range(2,5)]
  11. self.max_pool = tf.keras.layers.MaxPool1D(pool_size=2,padding='same')
  12. self.drop = tf.keras.layers.Dropout(rate=learning_rate)
  13. # self.line = tf.keras.layers.Dense(512, kernel_initializer='RandomUniform') # 初始化全连接层
  14. self.line0 = tf.keras.layers.Dense(word2vec_size, kernel_initializer='RandomUniform') # 初始化全连接层
  15. self.line1 = tf.keras.layers.Dense(class_num,kernel_initializer='RandomUniform') # 初始化全连接层
  16. # 后续的深层卷积层
  17. if cnn_layer>1:
  18. self.conv_add = [tf.keras.layers.Conv1D(tf.math.pow(2, i + 1) * kernel_num, 2, strides=1,
  19. kernel_initializer='RandomUniform', activation='relu',padding='valid') for i in range(cnn_layer - 1)]
  20. # 一个cnn结构
  21. def conv_and_pool(self,input,conv):
  22. """
  23. :param input: 输入数据
  24. :param conv: 卷积层
  25. :return:
  26. """
  27. data = conv(input) # 卷积 [batch, max_len-1, 1, kernel_num]
  28. data = tf.reshape(data, (data.shape[0], data.shape[1], -1)) # 降维[batch, max_len-1, kernel_num]
  29. data = self.max_pool(data) # 池化 [batch, (max_len-1)/2, kernel_num]
  30. if cnn_layer > 1: # 进入深度卷积层
  31. for this_layer in range(cnn_layer-1): # 例如第二层卷积数据形状
  32. data = self.conv_add[this_layer](data) # 卷积 [batch, (max_len-1)/2-1, kernel_num]
  33. data = self.max_pool(data) # 池化 [batch, ((max_len-1)/2-1)/2, kernel_num]
  34. data = tf.reshape(data, (data.shape[0], -1)) # 展开最后一维进行降维
  35. return data
  36. # 用上2,3,4这三个cnn
  37. def calls(self, input):
  38. """
  39. :param input: 输入数据
  40. :return:
  41. """
  42. datas = []
  43. # 获取三个cnn的结果
  44. for i in range(len(self.conv)):
  45. data = self.conv_and_pool(input,self.conv[i])
  46. datas.append(data)
  47. # 将结果进行拼接
  48. for i in range(1,len(datas)):
  49. datas[0] = tf.concat((datas[0],datas[i]),1)
  50. output = self.drop(datas[0],training=self.Train) # 防止过拟合
  51. output = self.line0(output)
  52. #output = self.line1(output) # 全连接
  53. #output = tf.math.softmax(output, axis=-1) # 获得类别概率
  54. return output
  55. if __name__=='__main__':
  56. """
  57. 测试CNN能不能正常使用
  58. """
  59. import os
  60. import data2vector
  61. import pickle
  62. import time
  63. # 训练
  64. def train(model, data_path, batch_size, steps, word2vector, word2index, index2word, class_num,
  65. cross_entropy,optimizer,save_path,writing_mode):
  66. """
  67. :param data_path: 训练集路径
  68. :param batch_size: 批数
  69. :param steps: 训练次数
  70. """
  71. # 获取数据总数
  72. with open(data_path, 'r', encoding='utf-8') as file1:
  73. datas_len = len(file1.readlines())
  74. print('共有{}条数据'.format(datas_len))
  75. file1.close()
  76. # 训练轮数
  77. all_time = 0 # 记录训练总耗时
  78. for step in range(steps):
  79. start_time = time.time()
  80. # 遍历数据集,分批次输入数据集
  81. data_copies = datas_len // batch_size # 可分的批次数
  82. #data_copies = 80
  83. # 用来记录每一批数据的训练结果
  84. all_loss = []
  85. all_scores = np.zeros(shape=(data_copies * batch_size, class_num))
  86. all_labels = np.zeros(shape=(data_copies * batch_size, class_num))
  87. for i in range(data_copies):
  88. x_vector, x_index, labels = data2vector.data_processing(
  89. data_path, datas_len, word2vector, word2index,
  90. batch_size, i * batch_size)
  91. # 开始训练并计算损失
  92. with tf.GradientTape() as tape:
  93. x_vector = x_vector[:,:,:,np.newaxis]
  94. scores = model.calls(x_vector) # 获取模型预测值
  95. loss = cross_entropy(labels, scores) # 计算交叉熵损失
  96. derivative = tape.gradient(loss, model.trainable_variables) # 自动求导
  97. optimizer.apply_gradients(zip(derivative, model.trainable_variables)) # 更新参数
  98. # 记录遍历一遍数据的总结果
  99. all_loss.append(loss)
  100. all_scores[i * batch_size: (i + 1) * batch_size, :] = scores
  101. all_labels[i * batch_size: (i + 1) * batch_size, :] = labels
  102. print('\r共有{}批数据,第 {:3} 批数据,当前损失: {:4f} '.format(data_copies,i+1, loss), end='')
  103. # 打印并保存本次训练结果
  104. if step % 1 == 0:
  105. this_time = time.time() - start_time # 本次耗时
  106. all_time += this_time # 总耗时
  107. predict_value = np.argmax(all_scores, axis=-1)[:, None] # 预测标签(0或1)
  108. actual_value = np.argmax(all_labels, axis=-1)[:, None] # 实际标签
  109. result = np.concatenate((predict_value, actual_value), axis=1) # 标签拼接对比[预测,实际]
  110. mean_loss = np.array(all_loss).mean() # 平均损失
  111. look_and_save_data(model,result, this_time, save_path,writing_mode,word2vector=word2vector,
  112. word2index=word2index, index2word=index2word, step=step,
  113. loss=mean_loss, all_time=all_time) # 保存和打印
  114. writing_mode = 'a'
  115. # 测试
  116. def test(model, data_path, batch_size,class_num,save_path,writing_mode):
  117. """
  118. :param data_path: 测试集路径
  119. :param batch_size: 批数
  120. """
  121. # 加载训练好的模型
  122. with open(save_path+"/tmp/transformer_word2idx_idx2word.pkl", "rb") as f:
  123. dic = pickle.load(f)
  124. f.close()
  125. word2vector = dic['word2vector']
  126. word2index = dic['word2idx']
  127. model.load_weights(save_path+"/model.ckpt")
  128. # 获取数据集长度
  129. with open(data_path, 'r', encoding='utf-8') as file1:
  130. datas_len = len(file1.readlines())
  131. print('共有{}条数据'.format(datas_len))
  132. file1.close()
  133. # 测试
  134. start_time = time.time()
  135. batch_num = datas_len // batch_size # 需要处理的次数
  136. # 记录全部预测结果
  137. results = np.zeros(shape=(batch_num * batch_size, class_num))
  138. for i in range(batch_num):
  139. x_vector, x_index, labels = data2vector.data_processing(
  140. data_path, datas_len, word2vector, word2index,
  141. batch_size, i * batch_size)
  142. x_vector = x_vector[:, :, :, np.newaxis]
  143. scores = model.calls(x_vector) # 获取模型预测值
  144. predict_value = np.argmax(scores, axis=-1)[:, None] # 预测标签(0或1)
  145. actual_value = np.argmax(labels, axis=-1)[:, None] # 实际标签
  146. result = np.concatenate((predict_value, actual_value), axis=1) # 标签拼接对比[预测,实际]
  147. results[i * batch_size: (i + 1) * batch_size, :] = result # 将该批结果存入总结果
  148. print('\r第 {:3} 批数据,共有{}批数据'.format(i+1, batch_num), end='')
  149. times = time.time() - start_time
  150. look_and_save_data(model,results, times,save_path,writing_mode)
  151. # 打印和保存训练过程或预测结果
  152. def look_and_save_data(model, result, this_time, save_path,writing_mode,word2vector=None, word2index=None, index2word=None,
  153. step=None, loss=None, all_time=None):
  154. """
  155. :param result: 预测和标签 [预测,标签]
  156. :param this_time: 本次耗时
  157. :param step: 训练次数
  158. :param loss: 损失值
  159. :param all_time: 总耗时
  160. """
  161. # 计算P、R、F1、Accuracy
  162. TP = len([i for i in result if i.sum() == 2])
  163. TN = len([i for i in result if i.sum() == 0])
  164. FP = len([i for i in result if (i[0] - i[1]) == 1])
  165. FN = len([i for i in result if (i[0] - i[1]) == -1])
  166. P = (TP + 0.0001) / (TP + FP + 0.0001)
  167. R = (TP + 0.0001) / (TP + FN + 0.0001)
  168. F1 = (2 * P * R + 0.00001) / (P + R + 0.00001)
  169. Accuracy = (TP + TN) / len(result)
  170. os.makedirs(save_path, exist_ok=True) # 创建文件目录
  171. # 输出并保存结果
  172. if Train == True: # 训练模式
  173. # 打印并保存训练过程
  174. print("\tstep: {:3} | mean_loss: {:3f} | time: {:3f}m | Accuracy: {:3f} |".format(
  175. step, loss, this_time / 60, Accuracy))
  176. # 保存训练过程的数据
  177. with open(save_path+'/train_process.txt', writing_mode, encoding='utf-8') as file:
  178. file.write(
  179. "step: {:3} | mean_loss: {:3f} | time: {:3f} | P: {:3f} | R: {:3f} | F1: {:3f} | Accuracy: {:3f} |\n".format(
  180. step, loss, all_time, P, R, F1, Accuracy))
  181. file.close()
  182. # 保存模型
  183. model.save_weights(save_path+"/model.ckpt")
  184. os.makedirs(save_path+"/tmp", exist_ok=True)
  185. with open(save_path+"/tmp/transformer_word2idx_idx2word.pkl", "wb") as f:
  186. pickle.dump({"word2vector": word2vector, "word2idx": word2index, "idx2word": index2word}, f)
  187. else: # 预测模式
  188. print("P: {:3f} | R: {:3f} | F1: {:3f} | Accuracy: {:3f} | time: {:3f} |\n".format(
  189. P, R, F1, Accuracy, this_time))
  190. with open(save_path+'/test_result.txt', writing_mode, encoding='utf-8') as file:
  191. file.write("P: {:3f} | R: {:3f} | F1: {:3f} | Accuracy: {:3f} | time: {:3f} |\n".format(
  192. P, R, F1, Accuracy, this_time))
  193. file.close()
  194. # 初始化交叉熵和优化器
  195. cross_entropy = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
  196. optimizer = tf.keras.optimizers.Adam()
  197. writing_mode = 'w' # 初始写入模式为覆盖
  198. save_path = './model_data/balanced_CNN_4_128'
  199. # 模型参数初始化
  200. model = TextCNN(word2vec_size,kernel_num,cnn_layer,learning_rate,class_num,Train)
  201. if Train == True: # 模型训练
  202. word2vector, word2index, index2word = data2vector.word2vec_index(
  203. 'word2vec/douban_comment/fen_ci128/balanced/balanced_data.vector')
  204. train(model,'data_set/douban_comment/balanced/balanced_train.txt', batch_size, steps,
  205. word2vector,word2index, index2word, class_num,cross_entropy,optimizer,save_path,writing_mode)
  206. else: # 测试模型
  207. # 模型参数初始化
  208. test(model,'data_set/douban_comment/balanced/balanced_test.txt', batch_size,
  209. class_num,save_path,writing_mode)

transformer-textCNN.py

  1. import os
  2. import pickle
  3. from all_param import *
  4. import data2vector
  5. import numpy as np
  6. import tensorflow as tf
  7. import time
  8. import block_CNN,block_transformer
  9. class TransformerCNN(tf.keras.Model):
  10. def __init__(self,max_len, word2vec_size, head_num, class_num, learning_rate, Train,kernel_num,cnn_layer):
  11. super(TransformerCNN, self).__init__()
  12. self.transformer = block_transformer.transformer(max_len, word2vec_size, head_num, class_num, learning_rate, Train)
  13. self.CNN = block_CNN.TextCNN(word2vec_size,kernel_num,cnn_layer,learning_rate,class_num,Train)
  14. self.Train = Train
  15. # 初始化交叉熵和优化器
  16. self.cross_entropy = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
  17. self.optimizer = tf.keras.optimizers.Adam()
  18. self.tf_linear = tf.keras.layers.Dense(word2vec_size, kernel_initializer='RandomUniform')
  19. self.cnn_linear = tf.keras.layers.Dense(word2vec_size, kernel_initializer='RandomUniform')
  20. self.linear = tf.keras.layers.Dense(class_num, kernel_initializer='RandomUniform') # 初始化全连接层(linear层)
  21. self.writing_mode = 'w' # 初始写入模式为覆盖
  22. self.attention = None
  23. # 训练
  24. def train(self, data_path, batch_size, steps, word2vector, word2index, class_num, save_path, layer_num):
  25. """
  26. :param data_path: 训练集路径
  27. :param batch_size: 批数
  28. :param steps: 训练次数
  29. """
  30. # 获取数据总数
  31. with open(data_path, 'r', encoding='utf-8') as file1:
  32. datas_len = len(file1.readlines())
  33. print('共有{}条数据'.format(datas_len))
  34. file1.close()
  35. # 训练轮数
  36. all_time = 0 # 记录训练总耗时
  37. for step in range(steps):
  38. start_time = time.time()
  39. # 遍历数据集,分批次输入数据集
  40. data_copies = datas_len // batch_size # 可分的批次数
  41. # data_copies = 10
  42. # 用来记录每一批数据的训练结果
  43. all_loss = []
  44. all_scores = np.zeros(shape=(data_copies * batch_size, class_num),)
  45. all_labels = np.zeros(shape=(data_copies * batch_size, class_num), dtype=float)
  46. for i in range(data_copies):
  47. x_vector, x_index, labels = data2vector.data_processing(
  48. data_path, datas_len, word2vector, word2index,
  49. batch_size, i * batch_size)
  50. # 开始训练并计算损失
  51. with tf.GradientTape() as tape:
  52. scores1 = self.transformer.calls(x_vector, x_index, layer_num) # 获取transformer模型预测值
  53. scores2 = self.CNN.calls(scores1[:, :, :, np.newaxis]) # 获取cnn模型预测值
  54. scores = self.linear(scores2) # 全连接
  55. scores = tf.math.softmax(scores, axis=-1) # 获得类别概率
  56. loss = self.cross_entropy(labels, scores) # 计算交叉熵损失
  57. derivative = tape.gradient(loss, self.trainable_variables) # 自动求导
  58. self.optimizer.apply_gradients(zip(derivative, self.trainable_variables)) # 更新参数
  59. # 记录遍历一遍数据的总结果
  60. all_loss.append(loss)
  61. all_scores[i * batch_size: (i + 1) * batch_size, :] = scores
  62. all_labels[i * batch_size: (i + 1) * batch_size, :] = labels
  63. print('\r共有{}批数据,第 {:3} 批数据,当前损失: {:4f} '.format(data_copies, i, loss), end='')
  64. # 打印并保存本次训练结果
  65. if step % 1 == 0:
  66. this_time = time.time() - start_time # 本次耗时
  67. all_time += this_time # 总耗时
  68. predict_value = np.argmax(all_scores, axis=-1)[:, None] # 预测标签(0或1)
  69. actual_value = np.argmax(all_labels, axis=-1)[:, None] # 实际标签
  70. result = np.concatenate((predict_value, actual_value), axis=1) # 标签拼接对比[预测,实际]
  71. mean_loss = np.array(all_loss).mean()
  72. self.look_and_save_data(result, this_time, save_path, word2vector=word2vector,
  73. word2index=word2index, index2word=index2word, step=step,
  74. loss=mean_loss, all_time=all_time) # 保存和打印
  75. self.writing_mode = 'a'
  76. # 测试
  77. def test(self, data_path, batch_size, layer_num, class_num, save_path):
  78. """
  79. :param data_path: 测试集路径
  80. :param batch_size: 批数
  81. """
  82. # 加载训练好的模型
  83. with open(save_path+"/tmp/transformer_word2idx_idx2word.pkl", "rb") as f:
  84. dic = pickle.load(f)
  85. f.close()
  86. word2vector = dic['word2vector']
  87. word2index = dic['word2idx']
  88. self.load_weights(save_path+"/model.ckpt")
  89. # 获取数据集长度
  90. with open(data_path, 'r', encoding='utf-8') as file1:
  91. datas_len = len(file1.readlines())
  92. print('共有{}条数据'.format(datas_len))
  93. file1.close()
  94. # 测试
  95. start_time = time.time()
  96. batch_num = datas_len // batch_size # 需要处理的次数
  97. # 记录全部预测结果
  98. results = np.zeros(shape=(batch_num * batch_size, class_num))
  99. for i in range(batch_num):
  100. x_vector, x_index, labels = data2vector.data_processing(
  101. data_path, datas_len, word2vector, word2index,
  102. batch_size, i * batch_size)
  103. scores1 = self.transformer.calls(x_vector, x_index, layer_num) # 获取transformer模型预测值
  104. scores2 = self.CNN.calls(scores1[:, :, :, np.newaxis]) # 获取cnn模型预测值
  105. scores = self.linear(scores2) # 全连接
  106. predict_value = np.argmax(scores, axis=-1)[:, None] # 预测标签(0或1)
  107. actual_value = np.argmax(labels, axis=-1)[:, None] # 实际标签
  108. result = np.concatenate((predict_value, actual_value), axis=1) # 标签拼接对比[预测,实际]
  109. results[i * batch_size: (i + 1) * batch_size, :] = result # 将该批结果存入总结果
  110. print('\r第 {:3} 批数据,共有{}批数据'.format(i + 1, batch_num + 1), end='')
  111. times = time.time() - start_time
  112. self.look_and_save_data(results, times,save_path)
  113. # 打印和保存训练过程或预测结果
  114. def look_and_save_data(self, result, this_time, save_path, word2vector=None, word2index=None, index2word=None,
  115. step=None, loss=None, all_time=None):
  116. """
  117. :param result: 预测和标签 [预测,标签]
  118. :param this_time: 本次耗时
  119. :param step: 训练次数
  120. :param loss: 损失值
  121. :param all_time: 总耗时
  122. """
  123. # 计算P、R、F1、Accuracy
  124. TP = len([i for i in result if i.sum() == 2])
  125. TN = len([i for i in result if i.sum() == 0])
  126. FP = len([i for i in result if (i[0] - i[1]) == 1])
  127. FN = len([i for i in result if (i[0] - i[1]) == -1])
  128. P = (TP + 0.0001) / (TP + FP + 0.0001)
  129. R = (TP + 0.0001) / (TP + FN + 0.0001)
  130. F1 = (2 * P * R + 0.00001) / (P + R + 0.00001)
  131. Accuracy = (TP + TN) / len(result)
  132. os.makedirs(save_path, exist_ok=True)
  133. # 输出并保存结果
  134. if self.Train == True: # 训练模式
  135. # 打印并保存训练过程
  136. print("\tstep: {:3} | mean_loss: {:3f} | time: {:3f}m | Accuracy: {:3f} |".format(
  137. step, loss, this_time / 60, Accuracy))
  138. # 保存训练过程的数据
  139. with open(save_path+'/train_process.txt', self.writing_mode, encoding='utf-8') as file:
  140. file.write(
  141. "step: {:3} | mean_loss: {:3f} | time: {:3f} | P: {:3f} | R: {:3f} | F1: {:3f} | Accuracy: {:3f} |\n".format(
  142. step, loss, all_time, P, R, F1, Accuracy))
  143. file.close()
  144. # 保存模型
  145. self.save_weights(save_path+"/model.ckpt")
  146. os.makedirs(save_path+"/tmp", exist_ok=True)
  147. with open(save_path+"/tmp/transformer_word2idx_idx2word.pkl", "wb") as f:
  148. pickle.dump({"word2vector": word2vector, "word2idx": word2index, "idx2word": index2word}, f)
  149. else: # 预测模式
  150. print("P: {:3f} | R: {:3f} | F1: {:3f} | Accuracy: {:3f} | time: {:3f} |\n".format(
  151. P, R, F1, Accuracy, this_time))
  152. with open(save_path+'/test_result.txt', self.writing_mode, encoding='utf-8') as file:
  153. file.write("P: {:3f} | R: {:3f} | F1: {:3f} | Accuracy: {:3f} | time: {:3f} |\n".format(
  154. P, R, F1, Accuracy, this_time))
  155. file.close()
  156. if __name__ == '__main__':
  157. # 模型参数初始化
  158. model = TransformerCNN(max_len, word2vec_size, head_num, class_num, learning_rate, Train,kernel_num,cnn_layer)
  159. save_path = './model_data/balanced_RU_1_64_CNN_4_64_label'
  160. if Train == True: # 模型训练
  161. word2vector, word2index, index2word = data2vector.word2vec_index(
  162. 'word2vec/douban_comment/fen_ci128/balanced/balanced_data.vector')
  163. model.train('data_set/douban_comment/balanced/balanced_train.txt', batch_size, steps, word2vector, word2index,
  164. class_num, save_path,transformer_layer)
  165. else: # 测试模型
  166. # 模型参数初始化
  167. model.test('data_set/douban_comment/balanced/balanced_test.txt',batch_size, transformer_layer, class_num, save_path)

二、BERT-textCNN

get_data.py

  1. import numpy as np
  2. # 数据处理和提取
  3. def get_input(path, data_num, data_batch, data_start_site):
  4. # 读取对应批数的数据
  5. with open(path, 'r', encoding='utf-8') as file1:
  6. data = file1.readlines()
  7. if data_start_site + data_batch > data_num: # 选取数据下标超出列表的长度但小于所取的数据批数时
  8. end_site = data_start_site + data_batch - data_num # 应取数据的末尾位置
  9. data = data[data_start_site:] + data[:end_site]
  10. else:
  11. end_site = data_start_site + data_batch # 应取数据的末尾位置
  12. data = data[data_start_site:end_site]
  13. file1.close()
  14. labels = np.zeros(shape=(len(data)))
  15. comments = []
  16. # 数据处理
  17. for i in range(len(data)):
  18. one_data = data[i].replace('\n', '')
  19. one_data = one_data.split(' ')
  20. label, comment = int(one_data[0]) ,one_data[1:]
  21. if label != 0 and label != 1: # 如果标签不存在,舍弃这条数据
  22. labels[i] = 0
  23. else:
  24. if label == 0:
  25. labels[i] = 0.001
  26. if label == 1:
  27. labels[i] = 1.001
  28. comments.append(''.join(comment))
  29. return labels, comments
  30. if __name__ == '__main__':
  31. with open('data_set/douban_comment/balanced/balanced_train.txt', 'r', encoding='utf-8') as file:
  32. data_len = len(file.readlines())
  33. file.close()
  34. labels, comments = get_input('data_set/douban_comment/balanced/balanced_train.txt', data_len, 10, 0)
  35. print(labels)

bert_torch.py

  1. from transformers import BertModel, BertTokenizer
  2. import torch
  3. #print(torch.cuda.is_available()) # 查看GPU是否可用
  4. #print(torch.cuda.device_count()) # 查看GPU数量
  5. #print(torch.cuda.current_device()) # 查看GPU索引号
  6. #print(torch.cuda.get_device_name(0)) # 根据索引号得到GPU名称
  7. class bert(torch.nn.Module):
  8. def __init__(self):
  9. super(bert, self).__init__()
  10. self.tokenizer = BertTokenizer.from_pretrained('hfl/chinese-bert-wwm') # Bert分词器
  11. self.BERT = BertModel.from_pretrained('hfl/chinese-bert-wwm') # Bert模型,放GPU上
  12. def calls(self,input_list):
  13. batch_tokenized = self.tokenizer.batch_encode_plus(input_list, add_special_tokens=True,
  14. max_length=max_len, padding='max_length',
  15. truncation=True)
  16. input_ids = torch.tensor(batch_tokenized['input_ids'])
  17. attention_mask = torch.tensor(batch_tokenized['attention_mask'])
  18. #with torch.no_grad():
  19. hidden_outputs = self.BERT(input_ids, attention_mask=attention_mask)
  20. outputs = hidden_outputs[0] # [0]表示输出结果(last_hidden_state部分),[:,0,:]表示[CLS]对应的结果
  21. cls = outputs[:, 0, :]
  22. return outputs, cls
  23. if __name__ == '__main__':
  24. import get_data
  25. import numpy as np
  26. import os
  27. import time
  28. from all_param import *
  29. def train(BERT, data_path, epoch, batch_size, class_num, optimizer, line, cross_entropy, save_path, writing_mode, Train):
  30. # 获取数据总数
  31. with open(data_path, 'r', encoding='utf-8') as file1:
  32. datas_len = len(file1.readlines())
  33. file1.close()
  34. print('一共有{}条数据'.format(datas_len))
  35. # 训练
  36. all_time_start = time.time()
  37. torch.cuda.empty_cache()
  38. for e in range(epoch):
  39. this_time_start = time.time() # 起始时间
  40. batch_num = datas_len // batch_size # 可取的批数
  41. batch_num = 2
  42. all_loss = []
  43. all_outputs = torch.tensor(np.zeros(shape=(1, class_num)), dtype=torch.float32)
  44. all_labels = torch.tensor(np.zeros(shape=(1)), dtype=torch.float32)
  45. # 批训练
  46. for batch in range(batch_num):
  47. # 获取数据
  48. labels, comments = get_data.get_input(data_path, datas_len, batch_size, batch)
  49. labels = torch.tensor(labels, dtype=torch.float32).long()
  50. optimizer.zero_grad() # 1.梯度置零
  51. _, cls = BERT.calls(comments) # 2.模型获得结果
  52. cls = line(cls)
  53. #cls = torch.softmax(cls, dim=-1)
  54. loss = cross_entropy(cls, labels) # 3.计算损失
  55. loss.requires_grad_(True)
  56. loss.backward() # 4.反向传播
  57. optimizer.step() # 5.修改参数,w,b
  58. print('\r共有{}批数据,第 {:3} 批数据,当前损失: {:4f} '.format(batch_num, batch, loss), end='')
  59. ## 记录遍历一遍数据的总结果
  60. all_loss.append(loss.item()) # item()返回loss的值
  61. all_outputs = torch.cat((all_outputs, cls), dim=0)
  62. all_labels = torch.cat((all_labels, labels), dim=0)
  63. # 打印并保存本次训练结果
  64. if e % 1 == 0:
  65. this_time = time.time() - this_time_start # 本次耗时
  66. all_time = time.time() - all_time_start # 当前总耗时
  67. predict_value = np.argmax(all_outputs[1:].detach().numpy(), axis=-1)[:, None] # 预测标签(0或1)
  68. actual_value = all_labels[1:].detach().numpy()[:, None] # 实际标签
  69. result = np.concatenate((predict_value, actual_value), axis=1) # 标签拼接对比[预测,实际]
  70. look_and_save_data(BERT, result, this_time, save_path, writing_mode, Train, step=e,
  71. loss=np.array(all_loss).mean(), all_time=all_time)
  72. writing_mode = 'a' # 更改写入模式为追加
  73. def test(BERT, data_path, batch_size, class_num, save_path, writing_mode, Train):
  74. # 获取数据总数
  75. with open(data_path, 'r', encoding='utf-8') as file1:
  76. datas_len = len(file1.readlines())
  77. file1.close()
  78. print('一共有{}条数据'.format(datas_len))
  79. BERT.load_state_dict(torch.load(save_path+"/model.ckpt"))
  80. BERT.eval()
  81. this_time_start = time.time() # 起始时间
  82. batch_num = datas_len // batch_size # 可取的批数
  83. all_outputs = torch.tensor(np.zeros(shape=(1, class_num)), dtype=torch.float32)
  84. all_labels = torch.tensor(np.zeros(shape=(1, class_num)), dtype=torch.float32)
  85. # 批训练
  86. for batch in range(batch_num):
  87. # 获取数据
  88. labels, comments = get_data.get_input(data_path, datas_len, batch_size, batch)
  89. labels = torch.tensor(labels, dtype=torch.float32)
  90. outputs, cls = BERT.call(comments) # 2.模型获得结果
  91. cls = line(cls)
  92. cls = torch.softmax(cls, dim=-1)
  93. # 记录遍历一遍数据的总结果
  94. all_outputs = torch.cat((all_outputs, cls), dim=0)
  95. all_labels = torch.cat((all_labels, labels), dim=0)
  96. print('\r共有{}批数据, 第 {:3} 批数据'.format(batch_num, batch+1), end='')
  97. this_time = time.time() - this_time_start # 本次耗时
  98. predict_value = np.argmax(all_outputs[1:], axis=-1)[:, None] # 预测标签(0或1)
  99. actual_value = np.argmax(all_labels[1:], axis=-1)[:, None] # 实际标签
  100. result = np.concatenate((predict_value, actual_value), axis=1) # 标签拼接对比[预测,实际]
  101. look_and_save_data(BERT, result, this_time, save_path, writing_mode, Train)
  102. # 打印和保存训练过程或预测结果
  103. def look_and_save_data(model, result, this_time, save_path,writing_mode, Train,
  104. step=None, loss=None, all_time=None):
  105. # 计算P、R、F1、Accuracy
  106. TP = len([i for i in result if i.sum() == 2])
  107. TN = len([i for i in result if i.sum() == 0])
  108. FP = len([i for i in result if (i[0] - i[1]) == 1])
  109. FN = len([i for i in result if (i[0] - i[1]) == -1])
  110. P = (TP + 0.0001) / (TP + FP + 0.0001)
  111. R = (TP + 0.0001) / (TP + FN + 0.0001)
  112. F1 = (2 * P * R + 0.00001) / (P + R + 0.00001)
  113. Accuracy = (TP + TN) / len(result)
  114. os.makedirs(save_path, exist_ok=True) # 创建文件目录
  115. # 输出并保存结果
  116. if Train == True: # 训练模式
  117. # 打印并保存训练过程
  118. print("\tstep: {:3} | mean_loss: {:3f} | time: {:3f}m | Accuracy: {:3f} |".format(
  119. step, loss, this_time / 60, Accuracy))
  120. # 保存训练过程的数据
  121. with open(save_path+'/train_process.txt', writing_mode, encoding='utf-8') as file:
  122. file.write(
  123. "step: {:3} | mean_loss: {:3f} | time: {:3f}m | P: {:3f} | R: {:3f} | F1: {:3f} | Accuracy: {:3f} |\n".format(
  124. step, loss, all_time / 60, P, R, F1, Accuracy))
  125. file.close()
  126. # 保存模型
  127. torch.save(model.state_dict(), save_path+"/model.ckpt")
  128. else: # 预测模式
  129. print("P: {:3f} | R: {:3f} | F1: {:3f} | Accuracy: {:3f} | time: {:3f}m |\n".format(
  130. P, R, F1, Accuracy, this_time / 60))
  131. with open(save_path+'/test_result.txt', writing_mode, encoding='utf-8') as file:
  132. file.write("P: {:3f} | R: {:3f} | F1: {:3f} | Accuracy: {:3f} | time: {:3f}m |\n".format(
  133. P, R, F1, Accuracy, this_time / 60))
  134. file.close()
  135. # 初始化交叉熵和优化器
  136. bert = bert()
  137. line = torch.nn.Linear(768, class_num)
  138. cross_entropy = torch.nn.CrossEntropyLoss() # 定义损失函数,交叉熵损失函数
  139. optimizer = torch.optim.Adam(bert.parameters(),lr=learning_rate)
  140. writing_mode = 'w' # 初始写入模式为覆盖
  141. save_path = './model_data/cg'
  142. # 模型参数初始化
  143. if Train == True: # 模型训练
  144. train(bert, 'data_set/douban_comment/balanced/balanced_train.txt', steps, batch_size, class_num,
  145. optimizer, line, cross_entropy, save_path, writing_mode, Train)
  146. else: # 测试模型
  147. # 模型参数初始化
  148. test(bert, 'data_set/douban_comment/balanced/balanced_test.txt', batch_size, class_num,
  149. save_path, writing_mode, Train)

bert_textCNN.py

  1. import os
  2. import time
  3. import numpy as np
  4. import get_data
  5. from all_param import *
  6. import bert_torch
  7. import torch
  8. import math
  9. #from transformers import BertTokenizer, BertModel
  10. class TextCNN(torch.nn.Module):
  11. def __init__(self,embed_dim,kernel_num,cnn_layer,learning_rate,class_num,DEVICE):
  12. super(TextCNN, self).__init__()
  13. # 初始化第一层卷积核大小分别为(2,embed_dim),(3,embed_dim),(4,embed_dim)的卷积层
  14. self.conv = [torch.nn.Conv2d(1,kernel_num,(i,embed_dim)).to(DEVICE) for i in range(2,5)]
  15. self.relu = torch.nn.ReLU()
  16. self.max_pool = torch.nn.MaxPool1d(2,ceil_mode=True) # 最大池化层
  17. self.drop = torch.nn.Dropout(learning_rate)
  18. # 后续的深层卷积层
  19. if cnn_layer>1:
  20. self.conv_add = [torch.nn.Conv1d(int(math.pow(2, i)) * kernel_num,
  21. 2 *int(math.pow(2, i)) * kernel_num, 2).to(DEVICE) for i in range(cnn_layer-1)]
  22. # 根据矩阵变化的规律求出最后得到全连接前的矩阵[batch_size,line_dim]里的dim
  23. line_dim = max_len / 2 # 由第一层池化操作得到的
  24. if cnn_layer > 1:
  25. for i in range(cnn_layer - 1): # 第二层到第cnn_layer层
  26. if i%2==0: # 偶数层刚好卷积后全部池化
  27. line_dim = int((line_dim - 1) / 2)
  28. if i%2==1: # 奇数层卷积后会剩一个没池化到,便多池化一次
  29. line_dim = int((line_dim - 1) / 2) + 1
  30. line_dim = int(math.pow(2, cnn_layer - 1)) * kernel_num * line_dim # 乘上卷积核个
  31. # 初始化全连接层
  32. self.line = torch.nn.Linear(line_dim * 3, class_num)
  33. # 一个cnn结构
  34. def conv_and_pool(self,input,conv):
  35. """
  36. :param input: 输入数据
  37. :param conv: 卷积层
  38. :return:
  39. """
  40. data = conv(input) # 卷积 [batch,kernel_num,max_len,1]
  41. data = data.squeeze(3) # 降维 [batch,kernel_num,max_len]
  42. data = self.relu(data) # relu激活函数
  43. data = self.max_pool(data) # 池化 [batch,kernel_num,max_len/2]
  44. #print(data.shape)
  45. if cnn_layer>1: # 进入深度卷积层
  46. for this_layer in range(len(self.conv_add)): # 例如第二层卷积数据形状
  47. data = self.conv_add[this_layer](data) # 卷积 [batch, kernel_num*2, max_len/2-1]
  48. data = self.relu(data) # relu激活函数[batch, kernel_num*2, max_len/2-1]
  49. data = self.max_pool(data) # 池化 [batch, kernel_num*2, (max_len/2-1)/2]
  50. #print(data.shape)
  51. data = torch.reshape(data,shape=(data.shape[0],-1)) # 展开最后一维进行降维
  52. return data
  53. # 用上2,3,4这三个cnn
  54. def calls(self,input):
  55. """
  56. :param input: 输入数据
  57. :return:
  58. """
  59. datas = []
  60. # 获取三个cnn的结果
  61. for i in range(len(self.conv)):
  62. data = self.conv_and_pool(input,self.conv[i])
  63. datas.append(data)
  64. # 将结果进行拼接
  65. for i in range(1,len(datas)):
  66. datas[0] = torch.cat((datas[0],datas[i]),dim=1)
  67. datas = self.drop(datas[0]) # 防止过拟合
  68. output = self.line(datas) # 全连接
  69. return output
  70. class mymodel(torch.nn.Module):
  71. def __init__(self, embed_dim, kernel_num, cnn_layer, learning_rate, class_num, Train, DEVICE):
  72. super(mymodel, self).__init__()
  73. self.bert = bert_torch.bert(class_num)
  74. self.cnn = TextCNN(embed_dim,kernel_num,cnn_layer,learning_rate,class_num,DEVICE)
  75. # none表示不降维,返回和target相同形状;mean表示对一个batch的损失求均值;sum表示对一个batch的损失求和
  76. self.cross_entropy = torch.nn.CrossEntropyLoss() # 定义损失函数,交叉熵损失函数
  77. self.optimizer = torch.optim.Adam(self.parameters(),lr=learning_rate)
  78. #self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=10, gamma=0.1) # 设置学习率下降策略"
  79. #self.drop = torch.nn.Dropout(learning_rate)
  80. ## 根据矩阵变化的规律求出最后得到全连接前的矩阵[batch_size,line_dim]里的dim
  81. #line_dim = max_len / 2 # 由第一层池化操作得到的
  82. #if cnn_layer > 1:
  83. # for i in range(cnn_layer - 1): # 第二层到第cnn_layer层
  84. # if i % 2 == 0: # 偶数层刚好卷积后全部池化
  85. # line_dim = int((line_dim - 1) / 2)
  86. # if i % 2 == 1: # 奇数层卷积后会剩一个没池化到,便多池化一次
  87. # line_dim = int((line_dim - 1) / 2) + 1
  88. # line_dim = int(math.pow(2, cnn_layer - 1)) * kernel_num * line_dim # 乘上卷积核个数
  89. #
  90. ## 初始化全连接层
  91. #self.line = torch.nn.Linear(line_dim * 3, class_num)
  92. #
  93. self.writing_mode = 'w'
  94. self.Train = Train
  95. def Training(self, data_path, verify_path, max_len, DEVICE, epoch, batch_size, class_num, save_path):
  96. self.train()
  97. # 获取数据总数
  98. with open(data_path, 'r', encoding='utf-8') as file1:
  99. datas_len = len(file1.readlines())
  100. file1.close()
  101. print('一共有{}条数据'.format(datas_len))
  102. # 训练
  103. all_time_start = time.time()
  104. bast_acc = 0
  105. for e in range(epoch):
  106. this_time_start = time.time() # 起始时间
  107. batch_num = datas_len // batch_size # 可取的批数
  108. batch_num = 10
  109. all_loss = []
  110. all_outputs = torch.tensor(np.zeros(shape=(1, class_num)), dtype=torch.float32)
  111. all_labels = torch.tensor(np.zeros(shape=1), dtype=torch.float32)
  112. # 批训练
  113. for batch in range(batch_num):
  114. # 获取数据
  115. labels, comments = get_data.get_input(data_path, datas_len, batch_size, batch)
  116. long_labels = torch.tensor(labels, dtype=torch.float32).long()
  117. self.optimizer.zero_grad() # 1.梯度置零
  118. outputs, _ = self.bert.calls(comments, max_len, DEVICE) # 2.模型获得结果
  119. outputs = outputs.unsqueeze(1)
  120. #print(outputs.shape)
  121. outputs = self.cnn.calls(outputs)
  122. #cls = self.drop(cls) # 防止过拟合
  123. #cls = self.line(cls) # 全连接
  124. #outputs = torch.softmax(outputs, dim=-1)
  125. loss = self.cross_entropy(outputs.to('cpu'), long_labels) # 3.计算损失
  126. #loss.requires_grad_(True)
  127. loss.backward() # 4.反向传播
  128. self.optimizer.step() # 5.修改参数,w,b
  129. ## 记录遍历一遍数据的总结果
  130. all_loss.append(loss.item()) # item()返回loss的值
  131. all_outputs = torch.cat((all_outputs, outputs.to('cpu')), dim=0)
  132. for i in range(len(labels)):
  133. if labels[i] == 0.001:
  134. labels[i] = 0
  135. else:
  136. labels[i] = 1
  137. labels = torch.tensor(labels, dtype=torch.float32)
  138. all_labels = torch.cat((all_labels, labels), dim=0)
  139. ## 选择训练最好的参数保存
  140. #Acc = self.test(verify_path, batch_size, class_num, save_path)
  141. #if Acc > bast_acc:
  142. # bast_acc = Acc
  143. # # 保存模型
  144. # torch.save(self.state_dict(), save_path + "/model.pth")
  145. print('\r训练进度{:2d}%, 共有{}批数据, 已完成{:2d}%, 当前损失: {:4f}, ACC: {} '.format(
  146. int((e) / epoch * 100), batch_num, int((batch + 1) / batch_num * 100),loss, 'None'), end='')
  147. # 打印并保存本次训练结果
  148. if e % 1 == 0:
  149. torch.save(self,save_path + "/model.pth")
  150. this_time = time.time() - this_time_start # 本次耗时
  151. all_time = time.time() - all_time_start # 当前总耗时
  152. predict_value = np.argmax(all_outputs[1:].detach().numpy(), axis=-1)[:, None] # 预测标签(0或1)
  153. actual_value = all_labels[1:].detach().numpy()[:, None] # 实际标签
  154. result = np.concatenate((predict_value, actual_value), axis=1) # 标签拼接对比[预测,实际]
  155. mean_loss = np.array(all_loss).mean()
  156. acc = self.look_and_save_data(result, this_time, save_path, self.writing_mode, self.Train, step=e,
  157. loss=mean_loss, all_time=all_time)
  158. self.writing_mode = 'a' # 更改写入模式为追加
  159. def test(self, data_path, batch_size, class_num, save_path, test_data_save=False):
  160. self.eval()
  161. # 获取数据总数
  162. with open(data_path, 'r', encoding='utf-8') as file1:
  163. datas_len = len(file1.readlines())
  164. file1.close()
  165. print('一共有{}条数据'.format(datas_len))
  166. this_time_start = time.time() # 起始时间
  167. batch_num = datas_len // batch_size # 可取的批数
  168. all_outputs = torch.tensor(np.zeros(shape=(1, class_num)), dtype=torch.float32)
  169. all_labels = torch.tensor(np.zeros(shape=1), dtype=torch.float32)
  170. batch_num = 30
  171. # 批训练
  172. for batch in range(batch_num):
  173. # 获取数据
  174. labels, comments = get_data.get_input(data_path, datas_len, batch_size, batch)
  175. labels = torch.tensor(labels, dtype=torch.float32)
  176. with torch.no_grad(): # 不进行梯度计算,节省内存
  177. outputs, _ = self.bert.calls(comments, max_len, DEVICE) # 2.模型获得结果
  178. outputs = self.cnn.calls(outputs.unsqueeze(1))
  179. #cls = self.drop(cls) # 防止过拟合
  180. #cls = self.line(cls) # 全连接
  181. #outputs = torch.softmax(outputs, dim=-1)
  182. # 记录遍历一遍数据的总结果
  183. all_outputs = torch.cat((all_outputs, outputs.to('cpu')), dim=0)
  184. for i in range(len(labels)):
  185. if labels[i] == 0.001:
  186. labels[i] = 0
  187. else:
  188. labels[i] = 1
  189. labels = torch.tensor(labels, dtype=torch.float32)
  190. all_labels = torch.cat((all_labels, labels), dim=0)
  191. if test_data_save != False:
  192. print('\r共有{}批数据, 测试进度{:2d}% '.format(batch_num, int((batch + 1) / batch_num * 100)), end='')
  193. this_time = time.time() - this_time_start # 本次耗时
  194. all_outputs = np.argmax(all_outputs[1:].detach().numpy(), axis=-1)[:, None] # 预测标签(0或1)
  195. all_labels = all_labels[1:].detach().numpy()[:, None] # 实际标签
  196. all_outputs = np.concatenate((all_outputs, all_labels), axis=1) # 标签拼接对比[预测,实际]
  197. # 计算评价指标并保存训练情况
  198. Acc = self.look_and_save_data(all_outputs, this_time, save_path, self.writing_mode, test_data_save=test_data_save)
  199. return Acc
  200. # 打印和保存训练过程或预测结果
  201. def look_and_save_data(self, result, this_time, save_path, writing_mode, Train=False, step=None, loss=None,
  202. all_time=None, test_data_save=False):
  203. # 计算P、R、F1、Accuracy
  204. TP = len([i for i in result if i.sum() == 2])
  205. TN = len([i for i in result if i.sum() == 0])
  206. FP = len([i for i in result if (i[0] - i[1]) == 1])
  207. FN = len([i for i in result if (i[0] - i[1]) == -1])
  208. P = (TP + 0.0001) / (TP + FP + 0.0001)
  209. R = (TP + 0.0001) / (TP + FN + 0.0001)
  210. F1 = (2 * P * R + 0.00001) / (P + R + 0.00001)
  211. Accuracy = (TP + TN) / len(result)
  212. # 输出并保存结果
  213. if Train == True: # 训练模式
  214. # 打印并保存训练过程
  215. print("\tstep: {:3} | mean_loss: {:3f} | time: {:3f}m | train_data_Acc: {:3f} |".format(
  216. step, loss, this_time / 60, Accuracy))
  217. # 保存训练过程的数据
  218. with open(save_path + '/train_process.txt', writing_mode, encoding='utf-8') as file:
  219. file.write(
  220. "step: {:3} | mean_loss: {:3f} | time: {:3f}m | P: {:3f} | R: {:3f} | F1: {:3f} | train_data_Acc: {:3f} |\n".format(
  221. step, loss, all_time / 60, P, R, F1, Accuracy))
  222. file.close()
  223. ## 保存模型
  224. # torch.save(model.state_dict(), save_path+"/model.pth")
  225. else: # 预测模式
  226. if test_data_save == True:
  227. print("P: {:3f} | R: {:3f} | F1: {:3f} | Accuracy: {:3f} | time: {:3f}m |\n".format(
  228. P, R, F1, Accuracy, this_time / 60))
  229. with open(save_path + '/test_result.txt', writing_mode, encoding='utf-8') as file:
  230. file.write("P: {:3f} | R: {:3f} | F1: {:3f} | Accuracy: {:3f} | time: {:3f}m |\n".format(
  231. P, R, F1, Accuracy, this_time / 60))
  232. file.close()
  233. return Accuracy
  234. if __name__ == '__main__':
  235. #tokenizer = BertTokenizer.from_pretrained('bert-base-chinese') # 加载base模型的对应的切词器
  236. #model = BertModel.from_pretrained('bert-base-chinese')
  237. DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  238. print('GPU: ', '可用' if str(DEVICE) == "cuda" else "不可用") # 查看GPU是否可用
  239. print('torch版本: ', torch.__version__) # 查看torch版本
  240. print('GPU数量: ', torch.cuda.device_count()) # 查看GPU数量
  241. print('GPU索引号: ', torch.cuda.current_device()) # 查看GPU索引号
  242. print('GPU名称: ', torch.cuda.get_device_name(0)) # 根据索引号得到GPU名称
  243. # 获取数据集个数
  244. save_path = 'model_data/balanced_bert_output_CNN_in_50_3_label'
  245. os.makedirs(save_path, exist_ok=True) # 创建保存文件目录
  246. train_path = 'data_set/douban_comment/balanced/balanced_train.txt'
  247. test_path = 'data_set/douban_comment/balanced/balanced_test.txt'
  248. verify_path = 'data_set/douban_comment/balanced/balanced_verify.txt'
  249. if Train == True:
  250. model = mymodel(word2vec_size, kernel_num, cnn_layer, learning_rate, class_num, Train, DEVICE).to(DEVICE)
  251. model.Training(train_path, verify_path, max_len, DEVICE, steps, batch_size, class_num, save_path)
  252. # 自行测试
  253. Train = False
  254. model.test(test_path, batch_size, class_num, save_path, test_data_save=True)
  255. model.test(test_path, batch_size, class_num, save_path, test_data_save=True)
  256. else:
  257. model = torch.load(save_path + "/model.pth") # 加载模型参数
  258. model.test(test_path, batch_size, class_num, save_path, test_data_save=True)

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/小丑西瓜9/article/detail/356540
推荐阅读
相关标签
  

闽ICP备14008679号