
The Illustrated Transformer + DSSM

 

The Illustrated Transformer (Chinese version)

https://blog.csdn.net/qq_41664845/article/details/84969266

The Illustrated Transformer

https://jalammar.github.io/illustrated-transformer/

DSSM | A Transformer-based semantic similarity model (DSSM), with open-source code

https://blog.csdn.net/qq_28385535/article/details/92803375

Transformer explained, with a TensorFlow code walkthrough

https://www.cnblogs.com/zhouxiaosong/p/11032431.html
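
For orientation before diving into the code: the model scores a question q against an answer t with a gamma-smoothed cosine similarity and is trained with the standard DSSM softmax cross-entropy over one positive answer plus the rotated negative samples. This is only a restatement of what the MatchingLayer and Loss blocks in the code below compute:

$$P(t^{+}\mid q)=\frac{\exp\big(\gamma\cos(q,t^{+})\big)}{\sum_{t'\in T_q}\exp\big(\gamma\cos(q,t')\big)},\qquad \mathcal{L}=-\frac{1}{|B|}\sum_{(q,\,t^{+})\in B}\log P(t^{+}\mid q)$$

where T_q is the positive answer together with the negative_sample_num sampled negatives, B is the training batch, and gamma is the smoothing factor passed to the constructor.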

 

# coding=utf-8
"""
author: 王黎成
function: semantic similarity computation using a bidirectional GRU + Transformer as the representation layer
"""
# external libraries
import os
import random

import numpy as np
import tensorflow as tf
from tensorflow.contrib.rnn import GRUCell, DropoutWrapper

# internal library (provides simple_sampling)
from Sampling.RandomSampling.Sampling import *


class TransformerDSSM:
    def __init__(self,
                 q_set=None,  # question set, a 2-D array
                 t_set=None,  # answer set, a 2-D array
                 dict_set=None,  # dictionary, {word: index}
                 vec_set=None,  # embedding vectors, in the same order as dict_set
                 batch_size=None,  # training batch size, defaults to the whole data set
                 hidden_num=256,  # number of hidden units
                 attention_num=512,  # dimension of the global attention layer
                 learning_rate=0.0001,  # learning rate
                 epoch_steps=200,  # number of training epochs
                 gamma=20,  # smoothing factor for the cosine similarity
                 is_train=True,  # whether to train
                 is_extract=False,  # whether to only extract t-set features
                 is_sample=False  # whether to train with random sampling
                 ):
        # external parameters
        self.q_set = q_set
        self.t_set = t_set
        self.dict_set = dict_set
        self.vec_set = vec_set
        # the last row is the vector used for words that are not in the dictionary
        self.vec_set.append([0. for i in range(len(self.vec_set[0]))])
        self.batch_size = batch_size
        self.hidden_num = hidden_num
        self.attention_num = attention_num
        self.learning_rate = learning_rate
        self.epoch_steps = epoch_steps
        self.gamma = gamma
        self.is_train = is_train
        self.is_extract = is_extract
        self.is_sample = is_sample
        self.keep_prob = 0.85
        self.params = {'num_layers': 4, 'num_heads': 8, 'keep_prob': self.keep_prob, 'hidden_size': hidden_num * 2}
        # internal parameters
        self.q_size = 0
        self.negative_sample_num = 0
        self.q_actual_length = []
        self.t_actual_length = []
        self.q_max_length = 0
        self.t_max_length = 0
        self.model_save_name = './ModelMemory/model/transformerDSSM'
        self.model_save_checkpoint = './ModelMemory/model/checkpoint'
        # graph objects
        self.graph = None
        self.session = None
        self.saver = None
        self.q_inputs = None
        self.q_inputs_actual_length = None
        self.t_inputs = None
        self.t_inputs_actual_length = None
        self.t_final_state = None
        self.top_k_answer = None
        self.outputs_prob = None
        self.outputs_index = None
        self.accuracy = None
        self.loss = None
        self.train_op = None
        # transformer representation layer
        self.encoder_stack = TransformerEncoder(self.params)
        self.layer_normalization = LayerNormalization(self.hidden_num * 2)

    def init_model_parameters(self):
        print('Initializing------')
        if not self.is_extract:
            # size of the question set
            self.q_size = len(self.q_set)
            if self.batch_size is None and self.is_train:
                self.batch_size = self.q_size
        if self.is_train:
            self.negative_sample_num = self.batch_size // 10
        if not self.is_extract:
            # actual lengths and maximum length of q_set
            self.q_actual_length = []
            for data in self.q_set:
                self.q_actual_length.append(len(data))
            self.q_max_length = max(self.q_actual_length)
            print('the max length of q set is %d' % self.q_max_length)
            # pad q_set to the maximum length
            for i in range(len(self.q_set)):
                if len(self.q_set[i]) < self.q_max_length:
                    self.q_set[i] = self.q_set[i] + ['UNK' for _ in range(self.q_max_length - len(self.q_set[i]))]
        if self.is_train:
            # actual lengths and maximum length of t_set
            for data in self.t_set:
                self.t_actual_length.append(len(data))
            self.t_max_length = max(self.t_actual_length)
            print('the max length of t set is %d' % self.t_max_length)
            # pad t_set to the maximum length
            for i in range(len(self.t_set)):
                if len(self.t_set[i]) < self.t_max_length:
                    self.t_set[i] = self.t_set[i] + ['UNK' for _ in range(self.t_max_length - len(self.t_set[i]))]
        pass

    def generate_data_set(self):
        if not self.is_extract:
            # convert every character in q_set to its index in the dictionary
            for i in range(len(self.q_set)):
                for j in range(len(self.q_set[i])):
                    if self.q_set[i][j] in self.dict_set:
                        self.q_set[i][j] = self.dict_set[self.q_set[i][j]]
                    else:
                        self.q_set[i][j] = len(self.vec_set) - 1
            self.q_set = np.array(self.q_set)
        if self.is_train:
            # convert every character in t_set to its index in the dictionary
            for i in range(len(self.t_set)):
                for j in range(len(self.t_set[i])):
                    if self.t_set[i][j] in self.dict_set:
                        self.t_set[i][j] = self.dict_set[self.t_set[i][j]]
                    else:
                        self.t_set[i][j] = len(self.vec_set) - 1
            self.t_set = np.array(self.t_set)
        pass

    def presentation_transformer(self, inputs, inputs_actual_length):
        with tf.variable_scope('presentation_layer', reuse=tf.AUTO_REUSE):
            with tf.name_scope('structure_presentation_layer'):
                # forward cell
                fw_cell = GRUCell(num_units=self.hidden_num)
                fw_drop_cell = DropoutWrapper(fw_cell, output_keep_prob=self.keep_prob)
                # backward cell
                bw_cell = GRUCell(num_units=self.hidden_num)
                bw_drop_cell = DropoutWrapper(bw_cell, output_keep_prob=self.keep_prob)
                # bidirectional_dynamic_rnn takes a 3-D tensor [batch_size, n_steps, n_input];
                # the output is a tuple whose elements have the same shape
                if self.is_train and not self.is_extract:
                    output, _ = tf.nn.bidirectional_dynamic_rnn(cell_fw=fw_drop_cell, cell_bw=bw_drop_cell,
                                                                inputs=inputs,
                                                                sequence_length=inputs_actual_length,
                                                                dtype=tf.float32)
                else:
                    output, _ = tf.nn.bidirectional_dynamic_rnn(cell_fw=fw_cell, cell_bw=bw_cell, inputs=inputs,
                                                                sequence_length=inputs_actual_length,
                                                                dtype=tf.float32)
                # the output tuple holds one hidden-state sequence per direction;
                # concatenate them at every time step
                structure_output = tf.concat(output, axis=2)
                structure_output = self.layer_normalization(structure_output)
            with tf.name_scope('transformer_layer'):
                transformer_output = self.encoder_stack(structure_output, self.is_train)
            with tf.name_scope('global_attention_layer'):
                w_omega = tf.get_variable(name='w_omega', shape=[self.hidden_num * 2, self.attention_num],
                                          initializer=tf.random_normal_initializer())
                b_omega = tf.get_variable(name='b_omega', shape=[self.attention_num],
                                          initializer=tf.random_normal_initializer())
                u_omega = tf.get_variable(name='u_omega', shape=[self.attention_num],
                                          initializer=tf.random_normal_initializer())
                v = tf.tanh(tf.tensordot(transformer_output, w_omega, axes=1) + b_omega)
                vu = tf.tensordot(v, u_omega, axes=1, name='vu')  # (B, T) shape
                alphas = tf.nn.softmax(vu, name='alphas')  # (B, T) shape
                # tf.expand_dims adds a dimension at the given axis
                global_attention_output = tf.reduce_sum(transformer_output * tf.expand_dims(alphas, -1), 1)
        return global_attention_output

    def matching_layer_training(self, q_final_state, t_final_state):
        with tf.name_scope('TrainProgress'):
            # negative sampling: rotate the answer states to build negative pairs
            t_temp_state = tf.tile(t_final_state, [1, 1])
            for i in range(self.negative_sample_num):
                rand = int((random.random() + i) * self.batch_size / self.negative_sample_num)
                t_final_state = tf.concat((t_final_state,
                                           tf.slice(t_temp_state, [rand, 0], [self.batch_size - rand, -1]),
                                           tf.slice(t_temp_state, [0, 0], [rand, -1])), 0)
            # ||q|| * ||t||
            q_norm = tf.tile(tf.sqrt(tf.reduce_sum(tf.square(q_final_state), 1, True)),
                             [self.negative_sample_num + 1, 1])
            t_norm = tf.sqrt(tf.reduce_sum(tf.square(t_final_state), 1, True))
            norm_prod = tf.multiply(q_norm, t_norm)
            # q * t^T
            prod = tf.reduce_sum(tf.multiply(tf.tile(q_final_state, [self.negative_sample_num + 1, 1]),
                                             t_final_state), 1, True)
            # cosine similarity, scaled by gamma
            cos_sim_raw = tf.truediv(prod, norm_prod)
            cos_sim = tf.transpose(
                tf.reshape(tf.transpose(cos_sim_raw), [self.negative_sample_num + 1, self.batch_size])) * self.gamma
        return cos_sim

    def matching_layer_infer(self, q_final_state, t_final_state):
        with tf.name_scope('InferProgress'):
            # ||q|| * ||t||
            q_sqrt = tf.sqrt(tf.reduce_sum(tf.square(q_final_state), 1, True))
            t_sqrt = tf.sqrt(tf.reduce_sum(tf.square(t_final_state), 1, True))
            norm_prod = tf.matmul(q_sqrt, t_sqrt, transpose_b=True)
            # q * t^T
            prod = tf.matmul(q_final_state, t_final_state, transpose_b=True)
            # cosine similarity
            cos_sim = tf.truediv(prod, norm_prod)
        return cos_sim

    def build_graph_by_cpu(self):
        # build the dataflow graph used for training on CPU / a single device
        self.graph = tf.Graph()
        with self.graph.as_default():
            with tf.name_scope('placeholder'):
                # Q inputs
                if not self.is_extract:
                    self.q_inputs = tf.placeholder(dtype=tf.int64, shape=[None, None])
                    self.q_inputs_actual_length = tf.placeholder(dtype=tf.int32, shape=[None])
                # T inputs
                if self.is_train:
                    self.t_inputs = tf.placeholder(dtype=tf.int64, shape=[None, self.t_max_length])
                    self.t_inputs_actual_length = tf.placeholder(dtype=tf.int32, shape=[None])
            with tf.name_scope('InputLayer'):
                # embedding matrix
                embeddings = tf.constant(self.vec_set)
                # look up the embedding of every character in the sentence
                if not self.is_extract:
                    q_embeddings = tf.nn.embedding_lookup(embeddings, self.q_inputs)
                if self.is_train:
                    t_embeddings = tf.nn.embedding_lookup(embeddings, self.t_inputs)
            with tf.name_scope('PresentationLayer'):
                if not self.is_extract:
                    q_final_state = self.presentation_transformer(q_embeddings, self.q_inputs_actual_length)
                if self.is_train:
                    self.t_final_state = self.presentation_transformer(t_embeddings, self.t_inputs_actual_length)
                else:
                    self.t_final_state = tf.placeholder(dtype=tf.float32, shape=[None, self.hidden_num * 2])
            if not self.is_extract:
                with tf.name_scope('MatchingLayer'):
                    if self.is_train:
                        cos_sim = self.matching_layer_training(q_final_state, self.t_final_state)
                    else:
                        cos_sim = self.matching_layer_infer(q_final_state, self.t_final_state)
                if not self.is_train:
                    self.top_k_answer = tf.placeholder(dtype=tf.int32)
                    self.outputs_prob, self.outputs_index = tf.nn.top_k(cos_sim, self.top_k_answer)
                else:
                    # softmax normalization
                    prob = tf.nn.softmax(cos_sim)
                    with tf.name_scope('Loss'):
                        # keep the positive samples (first column)
                        hit_prob = tf.slice(prob, [0, 0], [-1, 1])
                        self.loss = -tf.reduce_sum(tf.log(hit_prob)) / self.batch_size
                    with tf.name_scope('Accuracy'):
                        output_train = tf.argmax(prob, axis=1)
                        self.accuracy = tf.reduce_sum(tf.cast(tf.equal(output_train, tf.zeros_like(output_train)),
                                                              dtype=tf.float32)) / self.batch_size
                    # optimize with gradient clipping
                    with tf.name_scope('Train'):
                        optimizer = tf.train.AdamOptimizer(self.learning_rate)
                        # split into a gradient list and a variable list
                        grads, vars = zip(*optimizer.compute_gradients(self.loss))
                        # clip gradients
                        gradients, _ = tf.clip_by_global_norm(grads, 5)
                        # re-zip each gradient with its variable
                        self.train_op = optimizer.apply_gradients(zip(gradients, vars))
            # saver used for checkpointing the model
            self.saver = tf.train.Saver()

    def build_graph_by_gpu(self, gpu_num=1):
        # build the dataflow graph for multi-GPU training
        self.graph = tf.Graph()
        with self.graph.as_default():
            with tf.device("/cpu:0"):
                with tf.name_scope('placeholder'):
                    # Q inputs
                    self.q_inputs = tf.placeholder(dtype=tf.int64, shape=[None, None])
                    self.q_inputs_actual_length = tf.placeholder(dtype=tf.int32, shape=[None])
                    # T inputs
                    self.t_inputs = tf.placeholder(dtype=tf.int64, shape=[None, self.t_max_length])
                    self.t_inputs_actual_length = tf.placeholder(dtype=tf.int32, shape=[None])
                with tf.name_scope('InputLayer'):
                    # embedding matrix
                    embeddings = tf.constant(self.vec_set)
                    # look up the embedding of every character in the sentence
                    q_embeddings = tf.nn.embedding_lookup(embeddings, self.q_inputs)
                    t_embeddings = tf.nn.embedding_lookup(embeddings, self.t_inputs)
                optimizer = tf.train.AdamOptimizer(self.learning_rate)
                tower_grads = []
                loss_list = []
                accuracy_list = []
                with tf.variable_scope(tf.get_variable_scope()):
                    for i in range(gpu_num):
                        with tf.device("/gpu:%d" % i):
                            with tf.name_scope("tower_%d" % i):
                                _q_embeddings = q_embeddings[i * self.batch_size:(i + 1) * self.batch_size]
                                _t_embeddings = t_embeddings[i * self.batch_size:(i + 1) * self.batch_size]
                                q_inputs_actual_length = self.q_inputs_actual_length[
                                    i * self.batch_size:(i + 1) * self.batch_size]
                                t_inputs_actual_length = self.t_inputs_actual_length[
                                    i * self.batch_size:(i + 1) * self.batch_size]
                                with tf.name_scope('PresentationLayer'):
                                    q_final_state = self.presentation_transformer(_q_embeddings,
                                                                                  q_inputs_actual_length)
                                    t_final_state = self.presentation_transformer(_t_embeddings,
                                                                                  t_inputs_actual_length)
                                with tf.name_scope('MatchingLayer'):
                                    cos_sim = self.matching_layer_training(q_final_state, t_final_state)
                                # softmax normalization
                                prob = tf.nn.softmax(cos_sim)
                                with tf.name_scope('Loss'):
                                    # keep the positive samples (first column)
                                    hit_prob = tf.slice(prob, [0, 0], [-1, 1])
                                    cur_loss = -tf.reduce_sum(tf.log(hit_prob)) / self.batch_size
                                    loss_list.append(cur_loss)
                                with tf.name_scope('Accuracy'):
                                    output_train = tf.argmax(prob, axis=1)
                                    cur_accuracy = tf.reduce_sum(
                                        tf.cast(tf.equal(output_train, tf.zeros_like(output_train)),
                                                dtype=tf.float32)) / self.batch_size
                                    accuracy_list.append(cur_accuracy)
                                # optimize with gradient clipping
                                with tf.name_scope('Train'):
                                    # split into a gradient list and a variable list
                                    grads, vars = zip(*optimizer.compute_gradients(cur_loss))
                                    # clip gradients
                                    gradients, _ = tf.clip_by_global_norm(grads, 5)
                                    tower_grads.append(zip(gradients, vars))
                self.loss = tf.reduce_mean(loss_list, 0)
                self.accuracy = tf.reduce_mean(accuracy_list, 0)
                grads = average_gradients(tower_grads)
                self.train_op = optimizer.apply_gradients(grads)
                # saver used for checkpointing the model
                self.saver = tf.train.Saver()

    def train(self, gpu_num=1):
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        with tf.Session(graph=self.graph, config=config) as self.session:
            # check whether a saved model already exists
            if os.path.exists(self.model_save_checkpoint):
                # restore variables
                self.saver.restore(self.session, self.model_save_name)
            else:
                # initialize variables
                self.session.run(tf.global_variables_initializer())
            # iterate with Adam-optimized stochastic gradient descent and print the results
            print('training------')
            index_list = [i for i in range(self.q_size)]
            sample_nums = self.batch_size * gpu_num
            for i in range(self.epoch_steps):
                total_loss = 0
                total_accuracy = 0
                for j in range(self.q_size // sample_nums):
                    if self.is_sample:
                        sample_list = simple_sampling(index_list, sample_nums)
                        q_set = []
                        t_set = []
                        q_actual_length = []
                        t_actual_length = []
                        for index in sample_list:
                            q_set.append(self.q_set[index])
                            t_set.append(self.t_set[index])
                            q_actual_length.append(self.q_actual_length[index])
                            t_actual_length.append(self.t_actual_length[index])
                    else:
                        q_set = self.q_set[j * sample_nums:(j + 1) * sample_nums]
                        t_set = self.t_set[j * sample_nums:(j + 1) * sample_nums]
                        q_actual_length = self.q_actual_length[j * sample_nums:(j + 1) * sample_nums]
                        t_actual_length = self.t_actual_length[j * sample_nums:(j + 1) * sample_nums]
                    feed_dict = {self.q_inputs: q_set, self.q_inputs_actual_length: q_actual_length,
                                 self.t_inputs: t_set, self.t_inputs_actual_length: t_actual_length}
                    _, loss, accuracy = self.session.run([self.train_op, self.loss, self.accuracy],
                                                         feed_dict=feed_dict)
                    total_loss += loss
                    total_accuracy += accuracy
                print('[epoch:%d] loss %f accuracy %f' % (
                    i, total_loss / (self.q_size // sample_nums), total_accuracy / (self.q_size // sample_nums)))
            # save the model
            print('save model------')
            self.saver.save(self.session, self.model_save_name)
        pass

    def start_session(self):
        self.session = tf.Session(graph=self.graph)
        self.saver.restore(self.session, self.model_save_name)

    def inference(self, top_k):
        feed_dict = {self.q_inputs: self.q_set, self.q_inputs_actual_length: self.q_actual_length,
                     self.t_final_state: self.t_set, self.top_k_answer: top_k}
        prob, index = self.session.run([self.outputs_prob, self.outputs_index], feed_dict=feed_dict)
        return prob, index

    def extract_t_pre(self):
        with tf.Session(graph=self.graph) as self.session:
            self.saver.restore(self.session, self.model_save_name)
            feed_dict = {self.t_inputs: self.t_set, self.t_inputs_actual_length: self.t_actual_length}
            t_state = self.session.run(self.t_final_state, feed_dict=feed_dict)
            return t_state


class TransformerEncoder(tf.layers.Layer):
    def __init__(self, params):
        super(TransformerEncoder, self).__init__()
        self.layers = []
        for _ in range(params["num_layers"]):
            self_attention_layer = SelfAttention(params["hidden_size"], params["num_heads"],
                                                 params["keep_prob"])
            feed_forward_network = FeedFowardNetwork(params["hidden_size"], params["keep_prob"])
            self.layers.append([LayNormAdd(self_attention_layer, params),
                                LayNormAdd(feed_forward_network, params)])
        self.output_normalization = LayerNormalization(params["hidden_size"])

    def call(self, encoder_inputs, training):
        for n, layer in enumerate(self.layers):
            self_attention_layer = layer[0]
            feed_forward_network = layer[1]
            with tf.variable_scope("layer_%d" % n):
                with tf.variable_scope("self_attention"):
                    encoder_inputs = self_attention_layer(encoder_inputs, training=training)
                with tf.variable_scope("ffn"):
                    encoder_inputs = feed_forward_network(encoder_inputs, training=training)
        return self.output_normalization(encoder_inputs)


class SelfAttention(tf.layers.Layer):
    def __init__(self, hidden_size, num_heads, keep_prob):
        if hidden_size % num_heads != 0:
            raise ValueError("Hidden size must be evenly divisible by the number of heads.")
        super(SelfAttention, self).__init__()
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.keep_prob = keep_prob
        self.q_dense_layer = tf.layers.Dense(self.hidden_size, use_bias=False, name="q")
        self.k_dense_layer = tf.layers.Dense(self.hidden_size, use_bias=False, name="k")
        self.v_dense_layer = tf.layers.Dense(self.hidden_size, use_bias=False, name="v")
        self.output_dense_layer = tf.layers.Dense(self.hidden_size, use_bias=False, name="output_transform")

    def call(self, x, training):
        q = self.q_dense_layer(x)
        k = self.k_dense_layer(x)
        v = self.v_dense_layer(x)
        q = self.split_heads(q)
        k = self.split_heads(k)
        v = self.split_heads(v)
        # scale the queries by 1/sqrt(depth) before the dot-product attention
        depth = (self.hidden_size // self.num_heads)
        q *= depth ** -0.5
        logits = tf.matmul(q, k, transpose_b=True)
        weights = tf.nn.softmax(logits, name="attention_weights")
        if training:
            weights = tf.nn.dropout(weights, self.keep_prob)
        attention_output = tf.matmul(weights, v)
        attention_output = self.combine_heads(attention_output)
        attention_output = self.output_dense_layer(attention_output)
        return attention_output

    def split_heads(self, x):
        with tf.name_scope("split_heads"):
            batch_size = tf.shape(x)[0]
            length = tf.shape(x)[1]
            depth = (self.hidden_size // self.num_heads)
            x = tf.reshape(x, [batch_size, length, self.num_heads, depth])
            return tf.transpose(x, [0, 2, 1, 3])

    def combine_heads(self, x):
        with tf.name_scope("combine_heads"):
            batch_size = tf.shape(x)[0]
            length = tf.shape(x)[2]
            x = tf.transpose(x, [0, 2, 1, 3])
            return tf.reshape(x, [batch_size, length, self.hidden_size])


class FeedFowardNetwork(tf.layers.Layer):
    def __init__(self, hidden_size, keep_prob):
        super(FeedFowardNetwork, self).__init__()
        self.hidden_size = hidden_size
        self.keep_prob = keep_prob
        self.filter_dense_layer = tf.layers.Dense(self.hidden_size, use_bias=True, activation=tf.nn.swish,
                                                  name="filter_layer")
        self.output_dense_layer = tf.layers.Dense(self.hidden_size, use_bias=True, name="output_layer")

    def call(self, x, training):
        output = self.filter_dense_layer(x)
        if training:
            output = tf.nn.dropout(output, self.keep_prob)
        output = self.output_dense_layer(output)
        return output


class LayNormAdd(tf.layers.Layer):
    def __init__(self, layer, params):
        super(LayNormAdd, self).__init__()
        self.layer = layer
        self.params = params
        self.keep_prob = params["keep_prob"]
        self.layer_norm = LayerNormalization(self.params["hidden_size"])

    def __call__(self, x, training):
        # pre-norm residual connection: x + Dropout(Sublayer(LayerNorm(x)))
        y = self.layer(self.layer_norm(x), training)
        if training:
            y = tf.nn.dropout(y, self.keep_prob)
        return x + y


class LayerNormalization(tf.layers.Layer):
    def __init__(self, hidden_size):
        super(LayerNormalization, self).__init__()
        self.hidden_size = hidden_size

    def build(self, _):
        self.scale = tf.get_variable("layer_norm_scale", [self.hidden_size], initializer=tf.ones_initializer())
        self.bias = tf.get_variable("layer_norm_bias", [self.hidden_size], initializer=tf.zeros_initializer())
        self.built = True

    def call(self, x, epsilon=1e-6):
        mean = tf.reduce_mean(x, axis=[-1], keepdims=True)
        variance = tf.reduce_mean(tf.square(x - mean), axis=[-1], keepdims=True)
        norm_x = (x - mean) * tf.rsqrt(variance + epsilon)
        return norm_x * self.scale + self.bias


def average_gradients(tower_grads):
    # average the gradients computed on each GPU tower, variable by variable
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        grads = []
        for g, _ in grad_and_vars:
            expend_g = tf.expand_dims(g, 0)
            grads.append(expend_g)
        grad = tf.concat(grads, 0)
        grad = tf.reduce_mean(grad, 0)
        v = grad_and_vars[0][1]
        grad_and_var = (grad, v)
        average_grads.append(grad_and_var)
    return average_grads
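
For readers who want to see how the pieces fit together, here is a minimal training sketch (not part of the original code). It assumes a TensorFlow 1.x environment with the project's Sampling package on the path; load_char_corpus() is a hypothetical stand-in for the project's own preprocessing, and the batch_size/epoch_steps values are purely illustrative.

# Minimal usage sketch. load_char_corpus() is hypothetical: it should return
# questions and answers as lists of character lists, a {character: index}
# dictionary, and the matching list of embedding vectors.
q_set, t_set, dict_set, vec_set = load_char_corpus()

model = TransformerDSSM(q_set=q_set, t_set=t_set, dict_set=dict_set, vec_set=vec_set,
                        batch_size=64, epoch_steps=50, is_train=True)
model.init_model_parameters()   # pad sequences and record actual lengths
model.generate_data_set()       # map characters to dictionary indices
model.build_graph_by_cpu()      # or model.build_graph_by_gpu(gpu_num=...) for multi-GPU training
model.train()                   # saves the model to ./ModelMemory/model/transformerDSSM

At answering time the same class is reused in two passes: first build the graph with is_extract=True (and is_train=True) and call extract_t_pre() to precompute the answer representations; then rebuild it with is_train=False, feed those precomputed states in through the t_final_state placeholder (the code reads them from self.t_set), and call inference(top_k) to retrieve the most similar answers for the current q_set.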

 
