2021SC@SDUSC
This post continues my previous one, "PaddleDetection代码解析之Transformer encoder源码实现分析(上)" (PaddleDetection code analysis: Transformer encoder source code implementation, Part 1).
As before, the explanations live in the comments embedded in the code below.
Key dimensions:
self._emb_size = config['hidden_size'] # 768
d_key=self._emb_size // self._n_head,
d_value=self._emb_size // self._n_head,
d_model=self._emb_size,
d_inner_hid=self._emb_size * 4
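For concreteness, here is a minimal sketch of how these settings expand, assuming a bert_base-style configuration (hidden_size = 768, n_head = 12; the concrete numbers are assumptions for illustration, not taken from this file):

hidden_size = 768                    # config['hidden_size'] (assumed bert_base-style)
n_head = 12                          # assumed number of attention heads

d_key = hidden_size // n_head        # 64, per-head key width
d_value = hidden_size // n_head      # 64, per-head value width
d_model = hidden_size                # 768, model width
d_inner_hid = hidden_size * 4        # 3072, inner width of the feed-forward layer

print(d_key, d_value, d_model, d_inner_hid)  # 64 64 768 3072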
Obtaining the q, k and v matrices:
q = self._q_fc(queries)
k = self._k_fc(keys)
v = self._v_fc(values)
class MultiHeadAttentionLayer(Layer):
    """
    MultiHeadAttentionLayer
    """

    def __init__(self,
                 d_key,
                 d_value,
                 d_model,
                 n_head=1,
                 dropout_rate=0.,
                 cache=None,
                 gather_idx=None,
                 static_kv=False,
                 param_initializer=None,
                 name=""):
        super(MultiHeadAttentionLayer, self).__init__()
        self._n_head = n_head
        self._d_key = d_key
        self._d_value = d_value
        self._d_model = d_model
        self._dropout_rate = dropout_rate

        self._q_fc = Linear(
            input_dim=d_model,
            output_dim=d_key * n_head,
            param_attr=fluid.ParamAttr(
                name=name + '_query_fc.w_0', initializer=param_initializer),
            bias_attr=name + '_query_fc.b_0')

        self._k_fc = Linear(
            input_dim=d_model,
            output_dim=d_key * n_head,
            param_attr=fluid.ParamAttr(
                name=name + '_key_fc.w_0', initializer=param_initializer),
            bias_attr=name + '_key_fc.b_0')

        self._v_fc = Linear(
            input_dim=d_model,
            output_dim=d_value * n_head,
            param_attr=fluid.ParamAttr(
                name=name + '_value_fc.w_0', initializer=param_initializer),
            bias_attr=name + '_value_fc.b_0')

        self._proj_fc = Linear(
            input_dim=d_value * n_head,
            output_dim=d_model,
            param_attr=fluid.ParamAttr(
                name=name + '_output_fc.w_0', initializer=param_initializer),
            bias_attr=name + '_output_fc.b_0')

    def forward(self, queries, keys, values, attn_bias):
        """
        forward
        :param queries: query tensor
        :param keys: key tensor (defaults to queries for self-attention)
        :param values: value tensor (defaults to keys)
        :param attn_bias: attention bias / mask
        :return: projected multi-head attention output
        """
        # compute q, k, v
        keys = queries if keys is None else keys
        values = keys if values is None else values
        # obtain the q, k, v matrices
        q = self._q_fc(queries)
        k = self._k_fc(keys)
        v = self._v_fc(values)

        # split heads
        q_hidden_size = q.shape[-1]
        reshaped_q = fluid.layers.reshape(
            x=q,
            shape=[0, 0, self._n_head, q_hidden_size // self._n_head],
            inplace=False)
        transpose_q = fluid.layers.transpose(x=reshaped_q, perm=[0, 2, 1, 3])

        k_hidden_size = k.shape[-1]
        reshaped_k = fluid.layers.reshape(
            x=k,
            shape=[0, 0, self._n_head, k_hidden_size // self._n_head],
            inplace=False)
        transpose_k = fluid.layers.transpose(x=reshaped_k, perm=[0, 2, 1, 3])

        v_hidden_size = v.shape[-1]
        reshaped_v = fluid.layers.reshape(
            x=v,
            shape=[0, 0, self._n_head, v_hidden_size // self._n_head],
            inplace=False)
        transpose_v = fluid.layers.transpose(x=reshaped_v, perm=[0, 2, 1, 3])

        scaled_q = fluid.layers.scale(x=transpose_q, scale=self._d_key**-0.5)
        # scaled dot-product attention
        product = fluid.layers.matmul(
            #x=transpose_q,
            x=scaled_q,
            y=transpose_k,
            transpose_y=True)
        #alpha=self._d_model**-0.5)
        if attn_bias:
            product += attn_bias
        weights = fluid.layers.softmax(product)
        if self._dropout_rate:
            weights_droped = fluid.layers.dropout(
                weights,
                dropout_prob=self._dropout_rate,
                dropout_implementation="upscale_in_train",
                is_test=False)
            out = fluid.layers.matmul(weights_droped, transpose_v)
        else:
            out = fluid.layers.matmul(weights, transpose_v)

        # combine heads
        if len(out.shape) != 4:
            raise ValueError("Input(x) should be a 4-D Tensor.")
        trans_x = fluid.layers.transpose(out, perm=[0, 2, 1, 3])
        final_out = fluid.layers.reshape(
            x=trans_x,
            shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]],
            inplace=False)

        # fc to output
        proj_out = self._proj_fc(final_out)
        return proj_out
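To make the shape bookkeeping in forward easier to follow, here is a small NumPy sketch of the same split-heads / scaled dot-product / combine-heads flow. NumPy and the batch/sequence sizes are used purely for illustration and are not part of the PaddleDetection code:

import numpy as np

batch, seq_len, n_head, d_key = 2, 4, 12, 64   # assumed illustrative sizes
d_model = n_head * d_key                       # 768

q = np.random.rand(batch, seq_len, d_model)
k = np.random.rand(batch, seq_len, d_model)
v = np.random.rand(batch, seq_len, d_model)

def split_heads(x):
    # [batch, seq_len, d_model] -> [batch, n_head, seq_len, d_key]
    return x.reshape(batch, seq_len, n_head, d_key).transpose(0, 2, 1, 3)

def softmax(x):
    e = np.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

qh, kh, vh = split_heads(q), split_heads(k), split_heads(v)

# scale(q) @ k^T, then softmax, mirroring the scaled dot-product in forward
scores = (qh * d_key ** -0.5) @ kh.transpose(0, 1, 3, 2)    # [batch, n_head, seq, seq]
weights = softmax(scores)
out = weights @ vh                                          # [batch, n_head, seq, d_key]

# combine heads: [batch, n_head, seq_len, d_key] -> [batch, seq_len, d_model]
out = out.transpose(0, 2, 1, 3).reshape(batch, seq_len, d_model)
print(out.shape)   # (2, 4, 768)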
Pre-processing:
self._preprocess_layer = PrePostProcessLayer(
    self._preprocess_cmd, d_model, prepostprocess_dropout, name=name + "_pre_att")
Multi-head attention:
self._multihead_attention_layer = MultiHeadAttentionLayer(
    d_key, d_value, d_model, n_head, attention_dropout,
    None, None, False, param_initializer, name=name + "_multi_head_att")
class EncoderSubLayer(Layer):
    """
    EncoderSubLayer
    """

    def __init__(self,
                 hidden_act,
                 n_head,
                 d_key,
                 d_value,
                 d_model,
                 d_inner_hid,
                 prepostprocess_dropout,
                 attention_dropout,
                 relu_dropout,
                 preprocess_cmd="n",
                 postprocess_cmd="da",
                 param_initializer=None,
                 name=""):

        super(EncoderSubLayer, self).__init__()
        self.name = name
        self._preprocess_cmd = preprocess_cmd
        self._postprocess_cmd = postprocess_cmd
        self._prepostprocess_dropout = prepostprocess_dropout
        # pre-processing
        self._preprocess_layer = PrePostProcessLayer(
            self._preprocess_cmd,
            d_model,
            prepostprocess_dropout,
            name=name + "_pre_att")
        # multi-head attention
        self._multihead_attention_layer = MultiHeadAttentionLayer(
            d_key,
            d_value,
            d_model,
            n_head,
            attention_dropout,
            None,
            None,
            False,
            param_initializer,
            name=name + "_multi_head_att")

        self._postprocess_layer = PrePostProcessLayer(
            self._postprocess_cmd,
            d_model,
            self._prepostprocess_dropout,
            name=name + "_post_att")
        self._preprocess_layer2 = PrePostProcessLayer(
            self._preprocess_cmd,
            d_model,
            self._prepostprocess_dropout,
            name=name + "_pre_ffn")

        self._positionwise_feed_forward = PositionwiseFeedForwardLayer(
            hidden_act,
            d_inner_hid,
            d_model,
            relu_dropout,
            param_initializer,
            name=name + "_ffn")

        self._postprocess_layer2 = PrePostProcessLayer(
            self._postprocess_cmd,
            d_model,
            self._prepostprocess_dropout,
            name=name + "_post_ffn")

    def forward(self, enc_input, attn_bias):
        """
        forward
        :param enc_input: encoder input
        :param attn_bias: attention bias
        :return: the result of encoding the input through one encoder sublayer
        """
        # pre-process before multi-head attention
        pre_process_multihead = self._preprocess_layer(enc_input)
        # feed the pre-processed result to the multi-head attention layer
        attn_output = self._multihead_attention_layer(pre_process_multihead,
                                                      None, None, attn_bias)
        # post-process after attention (dropout + residual connection)
        attn_output = self._postprocess_layer(attn_output, enc_input)
        # pre-process before the FFN layer
        pre_process2_output = self._preprocess_layer2(attn_output)
        # result of the FFN layer
        ffd_output = self._positionwise_feed_forward(pre_process2_output)
        # return the post-processed result
        return self._postprocess_layer2(ffd_output, attn_output)
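The preprocess_cmd="n" / postprocess_cmd="da" commands (layer normalization, and dropout plus residual add, as the comments in EncoderLayer below spell out) give every sublayer a pre-LayerNorm structure. Here is a minimal NumPy sketch of that wiring, with the attention and FFN steps replaced by hypothetical identity stand-ins so only the normalize / dropout / residual pattern is visible:

import numpy as np

def layer_norm(x, eps=1e-12):
    # pre-process "n": normalize over the last (d_model) dimension
    mean = x.mean(axis=-1, keepdims=True)
    var = x.var(axis=-1, keepdims=True)
    return (x - mean) / np.sqrt(var + eps)

def post_process(sublayer_out, residual):
    # post-process "da": dropout then residual add
    # (dropout omitted here, i.e. inference-style behaviour)
    return sublayer_out + residual

def attention(x, attn_bias):
    # hypothetical stand-in for MultiHeadAttentionLayer (identity, illustration only)
    return x

def ffn(x):
    # hypothetical stand-in for PositionwiseFeedForwardLayer (identity, illustration only)
    return x

def encoder_sublayer(enc_input, attn_bias):
    # pre-process -> attention -> post-process, then the same pattern around the FFN
    attn_output = post_process(attention(layer_norm(enc_input), attn_bias), enc_input)
    return post_process(ffn(layer_norm(attn_output)), attn_output)

x = np.random.rand(2, 4, 768)             # assumed [batch, seq_len, d_model]
print(encoder_sublayer(x, None).shape)    # (2, 4, 768)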
# Add sublayers using the add_sublayer method:
self.add_sublayer(
    'esl_%d' % i,
    EncoderSubLayer(
        hidden_act, n_head, d_key, d_value, d_model, d_inner_hid,
        prepostprocess_dropout, attention_dropout, relu_dropout,
        preprocess_cmd, postprocess_cmd, param_initializer,
        name=name + '_layer_' + str(i)))
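Before the full EncoderLayer, a tiny hedged sketch of what this pattern buys: add_sublayer registers the child layer with its parent (so the parent tracks its parameters) and also returns it, which is why the result can be appended to a plain Python list at the same time. The TinyStack class below is an illustrative stand-in with Linear children, not part of the analyzed code:

from paddle.fluid.dygraph import Layer, Linear

class TinyStack(Layer):
    def __init__(self, n_layer=3, width=8):
        super(TinyStack, self).__init__()
        self._fc_list = list()
        for i in range(n_layer):
            # add_sublayer registers the child (so parameters() sees it)
            # and returns it, so we can also keep an ordered Python list
            self._fc_list.append(
                self.add_sublayer('fc_%d' % i,
                                  Linear(input_dim=width, output_dim=width)))

    def forward(self, x):
        for fc in self._fc_list:
            x = fc(x)
        return x

Under paddle 1.x this class would be instantiated and called inside a fluid.dygraph.guard() context, just like the encoder itself.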
class EncoderLayer(Layer):
    """
    encoder
    """

    def __init__(self,
                 hidden_act,
                 n_layer,  # number of encoder sublayers / encoder depth
                 n_head,  # number of attention heads
                 d_key,
                 d_value,
                 d_model,
                 d_inner_hid,
                 prepostprocess_dropout,  # dropout rate of the pre/post-process layers
                 attention_dropout,  # dropout rate inside the attention layer
                 relu_dropout,  # dropout rate after the activation in the FFN
                 preprocess_cmd="n",  # pre-processing: layer normalization
                 postprocess_cmd="da",  # post-processing: dropout + residual connection
                 param_initializer=None,
                 name=""):

        super(EncoderLayer, self).__init__()
        self._preprocess_cmd = preprocess_cmd
        self._encoder_sublayers = list()
        self._prepostprocess_dropout = prepostprocess_dropout
        self._n_layer = n_layer
        self._hidden_act = hidden_act
        # post-processing layer, here layer normalization
        self._preprocess_layer = PrePostProcessLayer(
            self._preprocess_cmd, 3, self._prepostprocess_dropout,
            "post_encoder")
        # define n_layer encoder sublayers (12 in bert_base)
        for i in range(n_layer):
            self._encoder_sublayers.append(
                # add each sublayer with the add_sublayer method
                self.add_sublayer(
                    'esl_%d' % i,
                    EncoderSubLayer(
                        hidden_act,
                        n_head,
                        d_key,
                        d_value,
                        d_model,
                        d_inner_hid,
                        prepostprocess_dropout,
                        attention_dropout,
                        relu_dropout,
                        preprocess_cmd,
                        postprocess_cmd,
                        param_initializer,
                        name=name + '_layer_' + str(i))))

    def forward(self, enc_input, attn_bias):
        """
        forward
        :param enc_input: model input
        :param attn_bias: attention bias; whether to keep it depends on the use case
        :return: the encoded result
        """
        # iterate over the encoder sublayers, e.g. 12 (self._n_layer) for bert base
        for i in range(self._n_layer):
            # compute the output of sublayer i from enc_input and attn_bias
            enc_output = self._encoder_sublayers[i](enc_input, attn_bias)
            # the output of this sublayer becomes the input of the next one
            enc_input = enc_output
        # return the final output after the processing layer
        return self._preprocess_layer(enc_output)