
PaddleDetection Code Analysis: Transformer Encoder Source Code Implementation (Part 2)


2021SC@SDUSC

This continues my previous post: PaddleDetection Code Analysis: Transformer Encoder Source Code Implementation (Part 1).

As before, the explanations are given in the comments inside the code below.

MultiHeadAttentionLayer

Key dimensions (as used when constructing the layer):

    self._emb_size = config['hidden_size']   # 768
    d_key=self._emb_size // self._n_head,
    d_value=self._emb_size // self._n_head,
    d_model=self._emb_size,
    d_inner_hid=self._emb_size * 4
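
For the bert_base configuration these work out to the following concrete values (a quick sanity check, assuming hidden_size=768 and n_head=12, the bert_base settings):

    hidden_size = 768                  # config['hidden_size']
    n_head = 12                        # bert_base uses 12 attention heads
    d_key = hidden_size // n_head      # 64
    d_value = hidden_size // n_head    # 64
    d_model = hidden_size              # 768
    d_inner_hid = hidden_size * 4      # 3072, the inner dimension of the FFN
    print(d_key, d_value, d_model, d_inner_hid)  # 64 64 768 3072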

Obtaining the q, k, and v matrices:

  
        q = self._q_fc(queries)
        k = self._k_fc(keys)
        v = self._v_fc(values)
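
Before going through the full class below, here is a NumPy trace of the "split heads" reshape/transpose that follows these projections (batch=2, seq_len=128, d_model=768 and n_head=12 are made-up sizes for illustration):

    import numpy as np

    q = np.zeros((2, 128, 768), dtype="float32")  # output of self._q_fc: [batch, seq_len, n_head * d_key]
    q = q.reshape(2, 128, 12, 768 // 12)          # [batch, seq_len, n_head, d_key]
    q = q.transpose(0, 2, 1, 3)                   # [batch, n_head, seq_len, d_key]
    print(q.shape)                                # (2, 12, 128, 64)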
    class MultiHeadAttentionLayer(Layer):
        """
        MultiHeadAttentionLayer
        """

        def __init__(self,
                     d_key,
                     d_value,
                     d_model,
                     n_head=1,
                     dropout_rate=0.,
                     cache=None,
                     gather_idx=None,
                     static_kv=False,
                     param_initializer=None,
                     name=""):
            super(MultiHeadAttentionLayer, self).__init__()
            self._n_head = n_head
            self._d_key = d_key
            self._d_value = d_value
            self._d_model = d_model
            self._dropout_rate = dropout_rate
            self._q_fc = Linear(
                input_dim=d_model,
                output_dim=d_key * n_head,
                param_attr=fluid.ParamAttr(
                    name=name + '_query_fc.w_0', initializer=param_initializer),
                bias_attr=name + '_query_fc.b_0')
            self._k_fc = Linear(
                input_dim=d_model,
                output_dim=d_key * n_head,
                param_attr=fluid.ParamAttr(
                    name=name + '_key_fc.w_0', initializer=param_initializer),
                bias_attr=name + '_key_fc.b_0')
            self._v_fc = Linear(
                input_dim=d_model,
                output_dim=d_value * n_head,
                param_attr=fluid.ParamAttr(
                    name=name + '_value_fc.w_0', initializer=param_initializer),
                bias_attr=name + '_value_fc.b_0')
            self._proj_fc = Linear(
                input_dim=d_value * n_head,
                output_dim=d_model,
                param_attr=fluid.ParamAttr(
                    name=name + '_output_fc.w_0', initializer=param_initializer),
                bias_attr=name + '_output_fc.b_0')

        def forward(self, queries, keys, values, attn_bias):
            """
            forward
            :param queries:
            :param keys:
            :param values:
            :param attn_bias:
            :return:
            """
            # compute q, k, v
            keys = queries if keys is None else keys
            values = keys if values is None else values
            # project the inputs to get the q, k, v matrices
            q = self._q_fc(queries)
            k = self._k_fc(keys)
            v = self._v_fc(values)
            # split heads
            q_hidden_size = q.shape[-1]
            reshaped_q = fluid.layers.reshape(
                x=q,
                shape=[0, 0, self._n_head, q_hidden_size // self._n_head],
                inplace=False)
            transpose_q = fluid.layers.transpose(x=reshaped_q, perm=[0, 2, 1, 3])
            k_hidden_size = k.shape[-1]
            reshaped_k = fluid.layers.reshape(
                x=k,
                shape=[0, 0, self._n_head, k_hidden_size // self._n_head],
                inplace=False)
            transpose_k = fluid.layers.transpose(x=reshaped_k, perm=[0, 2, 1, 3])
            v_hidden_size = v.shape[-1]
            reshaped_v = fluid.layers.reshape(
                x=v,
                shape=[0, 0, self._n_head, v_hidden_size // self._n_head],
                inplace=False)
            transpose_v = fluid.layers.transpose(x=reshaped_v, perm=[0, 2, 1, 3])
            scaled_q = fluid.layers.scale(x=transpose_q, scale=self._d_key**-0.5)
            # scaled dot-product attention
            product = fluid.layers.matmul(
                # x=transpose_q,
                x=scaled_q,
                y=transpose_k,
                transpose_y=True)
            # alpha=self._d_model**-0.5)
            if attn_bias:
                product += attn_bias
            weights = fluid.layers.softmax(product)
            if self._dropout_rate:
                weights_droped = fluid.layers.dropout(
                    weights,
                    dropout_prob=self._dropout_rate,
                    dropout_implementation="upscale_in_train",
                    is_test=False)
                out = fluid.layers.matmul(weights_droped, transpose_v)
            else:
                out = fluid.layers.matmul(weights, transpose_v)
            # combine heads
            if len(out.shape) != 4:
                raise ValueError("Input(x) should be a 4-D Tensor.")
            trans_x = fluid.layers.transpose(out, perm=[0, 2, 1, 3])
            final_out = fluid.layers.reshape(
                x=trans_x,
                shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]],
                inplace=False)
            # fc to output
            proj_out = self._proj_fc(final_out)
            return proj_out
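
A minimal usage sketch for the class above (not part of the PaddleDetection source; it assumes the Paddle 1.x fluid dygraph imports the class relies on, such as Linear from paddle.fluid.dygraph, and uses made-up shapes):

    import numpy as np
    import paddle.fluid as fluid

    with fluid.dygraph.guard():
        # d_key = d_value = 768 // 12 = 64, matching the dimensions listed earlier
        attn = MultiHeadAttentionLayer(
            d_key=64, d_value=64, d_model=768, n_head=12,
            dropout_rate=0.1, name="demo")
        x = fluid.dygraph.to_variable(
            np.random.rand(2, 128, 768).astype("float32"))
        # self-attention: keys and values fall back to queries when None
        out = attn(x, None, None, None)
        print(out.shape)  # [2, 128, 768]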

EncoderSubLayer

Pre-processing:

 
        self._preprocess_layer = PrePostProcessLayer(
            self._preprocess_cmd,
            d_model,
            prepostprocess_dropout,
            name=name + "_pre_att")

Multi-head attention:

        
        self._multihead_attention_layer = MultiHeadAttentionLayer(
            d_key,
            d_value,
            d_model,
            n_head,
            attention_dropout,
            None,
            None,
            False,
            param_initializer,
            name=name + "_multi_head_att")
    class EncoderSubLayer(Layer):
        """
        EncoderSubLayer
        """

        def __init__(self,
                     hidden_act,
                     n_head,
                     d_key,
                     d_value,
                     d_model,
                     d_inner_hid,
                     prepostprocess_dropout,
                     attention_dropout,
                     relu_dropout,
                     preprocess_cmd="n",
                     postprocess_cmd="da",
                     param_initializer=None,
                     name=""):
            super(EncoderSubLayer, self).__init__()
            self.name = name
            self._preprocess_cmd = preprocess_cmd
            self._postprocess_cmd = postprocess_cmd
            self._prepostprocess_dropout = prepostprocess_dropout
            # pre-processing
            self._preprocess_layer = PrePostProcessLayer(
                self._preprocess_cmd,
                d_model,
                prepostprocess_dropout,
                name=name + "_pre_att")
            # multi-head attention
            self._multihead_attention_layer = MultiHeadAttentionLayer(
                d_key,
                d_value,
                d_model,
                n_head,
                attention_dropout,
                None,
                None,
                False,
                param_initializer,
                name=name + "_multi_head_att")
            self._postprocess_layer = PrePostProcessLayer(
                self._postprocess_cmd,
                d_model,
                self._prepostprocess_dropout,
                name=name + "_post_att")
            self._preprocess_layer2 = PrePostProcessLayer(
                self._preprocess_cmd,
                d_model,
                self._prepostprocess_dropout,
                name=name + "_pre_ffn")
            self._positionwise_feed_forward = PositionwiseFeedForwardLayer(
                hidden_act,
                d_inner_hid,
                d_model,
                relu_dropout,
                param_initializer,
                name=name + "_ffn")
            self._postprocess_layer2 = PrePostProcessLayer(
                self._postprocess_cmd,
                d_model,
                self._prepostprocess_dropout,
                name=name + "_post_ffn")

        def forward(self, enc_input, attn_bias):
            """
            forward
            :param enc_input: encoder input
            :param attn_bias: attention bias
            :return: the result of passing the input through one encoder sub-layer
            """
            # pre-process before the multi-head attention
            pre_process_multihead = self._preprocess_layer(enc_input)
            # feed the pre-processed result to the multi-head attention layer
            attn_output = self._multihead_attention_layer(pre_process_multihead,
                                                          None, None, attn_bias)
            # post-process after the attention
            attn_output = self._postprocess_layer(attn_output, enc_input)
            # pre-process before the FFN layer
            pre_process2_output = self._preprocess_layer2(attn_output)
            # get the FFN layer output
            ffd_output = self._positionwise_feed_forward(pre_process2_output)
            # return the post-processed result
            return self._postprocess_layer2(ffd_output, attn_output)
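
To summarize the data flow of forward, here is a functional sketch with preprocess_cmd="n" (layer normalization) and postprocess_cmd="da" (dropout + residual add) written out explicitly; layer_norm, dropout, attention and ffn stand in for the layers built in __init__, so this illustrates the structure rather than reproducing the actual implementation:

    def encoder_sublayer(enc_input, attn_bias, attention, ffn, layer_norm, dropout):
        # pre-process ("n"): layer norm before the attention
        x = layer_norm(enc_input)
        # multi-head self-attention (queries = keys = values = x)
        attn_out = attention(x, None, None, attn_bias)
        # post-process ("da"): dropout, then residual add with the sub-layer input
        attn_out = enc_input + dropout(attn_out)
        # pre-process ("n") before the position-wise feed-forward network
        y = layer_norm(attn_out)
        ffn_out = ffn(y)
        # post-process ("da") again, with the attention output as the residual branch
        return attn_out + dropout(ffn_out)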

EncoderLayer

Adding the sub-layers with the add_sublayer method:
                self.add_sublayer(
                    'esl_%d' % i,
                    EncoderSubLayer(
                        hidden_act,
                        n_head,
                        d_key,
                        d_value,
                        d_model,
                        d_inner_hid,
                        prepostprocess_dropout,
                        attention_dropout,
                        relu_dropout,
                        preprocess_cmd,
                        postprocess_cmd,
                        param_initializer,
                        name=name + '_layer_' + str(i))))
    class EncoderLayer(Layer):
        """
        encoder
        """

        def __init__(self,
                     hidden_act,
                     n_layer,                 # number of encoder sub-layers / encoder depth
                     n_head,                  # number of attention heads
                     d_key,
                     d_value,
                     d_model,
                     d_inner_hid,
                     prepostprocess_dropout,  # dropout rate of the pre/post-process layers
                     attention_dropout,       # dropout rate inside the attention layer
                     relu_dropout,            # dropout rate after the activation layer
                     preprocess_cmd="n",      # pre-processing: layer normalization
                     postprocess_cmd="da",    # post-processing: dropout + residual connection
                     param_initializer=None,
                     name=""):
            super(EncoderLayer, self).__init__()
            self._preprocess_cmd = preprocess_cmd
            self._encoder_sublayers = list()
            self._prepostprocess_dropout = prepostprocess_dropout
            self._n_layer = n_layer
            self._hidden_act = hidden_act
            # processing layer applied after all sub-layers; here it is layer normalization
            self._preprocess_layer = PrePostProcessLayer(
                self._preprocess_cmd, 3, self._prepostprocess_dropout,
                "post_encoder")
            # define n_layer encoder sub-layers (12 for bert_base)
            for i in range(n_layer):
                self._encoder_sublayers.append(
                    # add each sub-layer with the add_sublayer method
                    self.add_sublayer(
                        'esl_%d' % i,
                        EncoderSubLayer(
                            hidden_act,
                            n_head,
                            d_key,
                            d_value,
                            d_model,
                            d_inner_hid,
                            prepostprocess_dropout,
                            attention_dropout,
                            relu_dropout,
                            preprocess_cmd,
                            postprocess_cmd,
                            param_initializer,
                            name=name + '_layer_' + str(i))))

        def forward(self, enc_input, attn_bias):
            """
            forward
            :param enc_input: model input
            :param attn_bias: attention bias; whether to keep it depends on the specific use case
            :return: the encoded result
            """
            # iterate over the encoder sub-layers, e.g. bert_base has 12 (self._n_layer)
            for i in range(self._n_layer):
                # get the output of the i-th sub-layer, given enc_input and attn_bias
                enc_output = self._encoder_sublayers[i](enc_input, attn_bias)
                # the output of this sub-layer becomes the input of the next one
                enc_input = enc_output
            # return the final processed output
            return self._preprocess_layer(enc_output)
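
A closing note on attn_bias: in BERT-style encoders it is typically built from the input padding mask, so that padded positions receive a large negative bias before the softmax in MultiHeadAttentionLayer (the product += attn_bias line) and end up with near-zero attention weights. The NumPy sketch below illustrates the idea; it is not taken verbatim from the PaddleDetection source:

    import numpy as np

    def build_attn_bias(input_mask, n_head):
        """input_mask: [batch, seq_len], 1 for real tokens, 0 for padding."""
        batch, seq_len = input_mask.shape
        # padded positions get -10000.0, real tokens get 0.0
        bias = (1.0 - input_mask[:, np.newaxis, :]) * -10000.0   # [batch, 1, seq_len]
        # broadcast to one bias row per head and per query position
        bias = np.broadcast_to(bias[:, np.newaxis, :, :],
                               (batch, n_head, seq_len, seq_len))
        return bias.astype("float32")

    mask = np.array([[1, 1, 1, 0]], dtype="float32")   # one sentence, last token is padding
    print(build_attn_bias(mask, n_head=12).shape)      # (1, 12, 4, 4)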