Returning to the overall structure of the T5 model:
(0): T5LayerSelfAttention(
  (SelfAttention): T5Attention(
    (q): Linear(in_features=512, out_features=512, bias=False)
    (k): Linear(in_features=512, out_features=512, bias=False)
    (v): Linear(in_features=512, out_features=512, bias=False)
    (o): Linear(in_features=512, out_features=512, bias=False)
    (relative_attention_bias): Embedding(32, 8)
  )
  (layer_norm): T5LayerNorm()
  (dropout): Dropout(p=0.1, inplace=False)
)
(1): T5LayerFF(
  (DenseReluDense): T5DenseReluDense(
    (wi): Linear(in_features=512, out_features=2048, bias=False)
    (wo): Linear(in_features=2048, out_features=512, bias=False)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (layer_norm): T5LayerNorm()
  (dropout): Dropout(p=0.1, inplace=False)
)
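A printout like this can be reproduced roughly as follows (a minimal sketch, assuming the Hugging Face transformers library and the t5-small checkpoint):

from transformers import T5Model

model = T5Model.from_pretrained("t5-small")
# each encoder block holds a ModuleList of [T5LayerSelfAttention, T5LayerFF]
print(model.encoder.block[0].layer)
# each decoder block holds [T5LayerSelfAttention, T5LayerCrossAttention, T5LayerFF]
print(model.decoder.block[0].layer)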
Next, let's look at the structure of the T5DenseReluDense module.
import torch
from torch import nn

class T5DenseReluDense(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, hidden_states):
        hidden_states = self.wi(hidden_states)
        hidden_states = nn.functional.relu(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.wo(hidden_states)
        return hidden_states
The relevant config parameters here are:
config.d_model = 512
config.d_ff = 2048
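So the layer simply maps d_model -> d_ff -> d_model. A minimal shape check (the SimpleNamespace config below is only a stand-in for the real T5Config, carrying just the fields T5DenseReluDense reads):

from types import SimpleNamespace

# stand-in config with the t5-small values quoted above
config = SimpleNamespace(d_model=512, d_ff=2048, dropout_rate=0.1)
ffn = T5DenseReluDense(config)

x = torch.randn(2, 10, 512)   # (batch, seq_len, d_model)
out = ffn(x)
print(out.shape)              # torch.Size([2, 10, 512]) -- projected up to 2048 and back to 512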
The implementation of the T5LayerNorm layer is also worth a closer look.
Here is the full T5LayerNorm source code:
class T5LayerNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        # layer norm should always be calculated in float32
        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)

        # convert into float16 if necessary
        if self.weight.dtype == torch.float16:
            hidden_states = hidden_states.to(torch.float16)
        return self.weight * hidden_states
The standard layer normalization formula is
y = \frac{x-\mu}{\sqrt{Var(x)+\epsilon}}
where \mu is the mean, Var(x) the variance, and \epsilon a small constant added for numerical stability.
The updated layer normalization used by T5 is instead
y = weight*\frac{x}{\sqrt{Var(x)+\epsilon}}
The corresponding weight parameter is initialized to all ones:
self.weight = nn.Parameter(torch.ones(hidden_size))
For example, given the following tensor:
tensor = torch.FloatTensor([[1, 2, 4, 1],
                            [6, 3, 2, 4],
                            [2, 4, 6, 1]])
computing the mean and the variance over the last dimension gives
E(x) = [2.0,3.75,3.25]
V(x) = [1.5,2.18,3.68]
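These numbers can be checked directly; note that the biased (population) variance is used, i.e. the squared deviations are divided by 4 rather than 3:

import torch

tensor = torch.FloatTensor([[1, 2, 4, 1],
                            [6, 3, 2, 4],
                            [2, 4, 6, 1]])
print(tensor.mean(-1))                  # tensor([2.0000, 3.7500, 3.2500])
print(tensor.var(-1, unbiased=False))   # tensor([1.5000, 2.1875, 3.6875])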
The standard formula would then compute, for each value,
y = \frac{x-\mu}{\sqrt{Var(x)+\epsilon}}
but here this is changed to
y = weight*\frac{x}{\sqrt{Var(x)+\epsilon}}
hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
That is, the subtraction of the mean \mu from x has been dropped, and at the end the result is simply multiplied by self.weight, which starts out as all ones. (Strictly speaking, since the mean is no longer subtracted, the `variance` computed in the code is the mean of x^2 over the last dimension rather than the true variance, so this is effectively an RMS normalization.)
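Putting the two formulas side by side on the example tensor above, a minimal sketch (weight is left at its all-ones initialization and eps = 1e-6, matching the defaults):

import torch

eps = 1e-6
x = torch.FloatTensor([[1, 2, 4, 1],
                       [6, 3, 2, 4],
                       [2, 4, 6, 1]])
weight = torch.ones(4)

# standard layer norm: subtract the mean, then scale by 1/sqrt(Var(x)+eps)
mean = x.mean(-1, keepdim=True)
var = x.var(-1, unbiased=False, keepdim=True)
standard = (x - mean) / torch.sqrt(var + eps)

# T5 style: no mean subtraction; the "variance" is the mean of the squares
t5_variance = x.pow(2).mean(-1, keepdim=True)
t5_norm = weight * x * torch.rsqrt(t5_variance + eps)

print(standard)
print(t5_norm)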
One more point to note: T5LayerFF applies a residual connection:
class T5LayerFF(nn.Module):
    ......

    def forward(self, hidden_states):
        forwarded_states = self.layer_norm(hidden_states)
        forwarded_states = self.DenseReluDense(forwarded_states)
        hidden_states = hidden_states + self.dropout(forwarded_states)
        return hidden_states
Now let's go back and look at the full structure of a T5 decoder block:
(0): T5LayerSelfAttention(
  (SelfAttention): T5Attention(
    (q): Linear(in_features=512, out_features=512, bias=False)
    (k): Linear(in_features=512, out_features=512, bias=False)
    (v): Linear(in_features=512, out_features=512, bias=False)
    (o): Linear(in_features=512, out_features=512, bias=False)
    (relative_attention_bias): Embedding(32, 8)
  )
  (layer_norm): T5LayerNorm()
  (dropout): Dropout(p=0.1, inplace=False)
)
(1): T5LayerCrossAttention(
  (EncDecAttention): T5Attention(
    (q): Linear(in_features=512, out_features=512, bias=False)
    (k): Linear(in_features=512, out_features=512, bias=False)
    (v): Linear(in_features=512, out_features=512, bias=False)
    (o): Linear(in_features=512, out_features=512, bias=False)
  )
  (layer_norm): T5LayerNorm()
  (dropout): Dropout(p=0.1, inplace=False)
)
(2): T5LayerFF(
  (DenseReluDense): T5DenseReluDense(
    (wi): Linear(in_features=512, out_features=2048, bias=False)
    (wo): Linear(in_features=2048, out_features=512, bias=False)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (layer_norm): T5LayerNorm()
  (dropout): Dropout(p=0.1, inplace=False)
)
Comparing the printed structures of T5LayerSelfAttention and T5LayerCrossAttention, no obvious difference stands out at first glance
(though from the printout above, the self-attention layer does carry an extra relative_attention_bias).
Looking at the Decoder part of T5 in bert4keras, we can see that it also adds an a_bias argument.
The Encoder part of T5 (bert4keras):
x = self.apply(
    inputs=[x, x, x, position_bias],
    layer=MultiHeadAttention,
    arguments={'p_bias': 't5_relative'},
    heads=self.num_attention_heads,
    head_size=self.attention_head_size,
    out_dim=self.hidden_size,
    key_size=self.attention_key_size,
    use_bias=False,
    attention_scale=False,
    kernel_initializer=self.initializer,
    name=attention_name
)
The Decoder part of T5 (bert4keras):
x = self.apply(
    inputs=[x, x, x, attention_mask, position_bias[0]],
    layer=MultiHeadAttention,
    arguments={
        'a_bias': True,
        'p_bias': 't5_relative'
    },
    heads=self.num_attention_heads,
    head_size=self.attention_head_size,
    out_dim=self.hidden_size,
    key_size=self.attention_key_size,
    use_bias=False,
    attention_scale=False,
    kernel_initializer=self.initializer,
    name=self_attention_name
)
We can see that the T5 decoder self-attention call above has an extra a_bias argument.
class T5LayerSelfAttention(nn.Module):
    def __init__(self, config, has_relative_attention_bias=False):
        super().__init__()
        self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        position_bias=None,
        layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        output_attentions=False,
    ):
        normed_hidden_states = self.layer_norm(hidden_states)
        attention_output = self.SelfAttention(
            normed_hidden_states,
            mask=attention_mask,
            position_bias=position_bias,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )
        hidden_states = hidden_states + self.dropout(attention_output[0])
        outputs = (hidden_states,) + attention_output[1:]  # add attentions if we output them
        return outputs


class T5LayerCrossAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.EncDecAttention = T5Attention(config, has_relative_attention_bias=False)
        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(
        self,
        hidden_states,
        key_value_states,
        attention_mask=None,
        position_bias=None,
        layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        query_length=None,
        output_attentions=False,
    ):
        normed_hidden_states = self.layer_norm(hidden_states)
        attention_output = self.EncDecAttention(
            normed_hidden_states,
            mask=attention_mask,
            key_value_states=key_value_states,
            position_bias=position_bias,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
            use_cache=use_cache,
            query_length=query_length,
            output_attentions=output_attentions,
        )
        layer_output = hidden_states + self.dropout(attention_output[0])
        outputs = (layer_output,) + attention_output[1:]  # add attentions if we output them
        return outputs
Let's now look at how the encoder and decoder of the T5 model in bert4keras call attention.
The encoder's attention call:
x = self.apply(
    inputs=[x, x, x, position_bias],
    layer=MultiHeadAttention,
    arguments={'p_bias': 't5_relative'},
    heads=self.num_attention_heads,
    head_size=self.attention_head_size,
    out_dim=self.hidden_size,
    key_size=self.attention_key_size,
    use_bias=False,
    attention_scale=False,
    kernel_initializer=self.initializer,
    name=attention_name
)
The decoder's attention calls (two parts: the self-attention and the cross-attention):
x = self.apply(
    inputs=[x, x, x, attention_mask, position_bias[0]],
    layer=MultiHeadAttention,
    arguments={
        'a_bias': True,
        'p_bias': 't5_relative'
    },
    heads=self.num_attention_heads,
    head_size=self.attention_head_size,
    out_dim=self.hidden_size,
    key_size=self.attention_key_size,
    use_bias=False,
    attention_scale=False,
    kernel_initializer=self.initializer,
    name=self_attention_name
)
............
............
x = self.apply(
    inputs=[x, c, c, position_bias[1]],
    layer=MultiHeadAttention,
    arguments={
        'a_bias': None,
        'p_bias': 't5_relative'
    },
    heads=self.num_attention_heads,
    head_size=self.attention_head_size,
    out_dim=self.hidden_size,
    key_size=self.attention_key_size,
    use_bias=False,
    attention_scale=False,
    kernel_initializer=self.initializer,
    name=cross_attention_name
)
From Su Jianlin's (苏神) code, the second call in the decoder above (the cross-attention, with inputs=[x, c, c, ...]) uses the same attention structure as the encoder's call.
(The code throughout the transformer really does all look similar.)
Another difference is that in transformers, an Embedding layer is invoked at the start of the encoder and at the start of the decoder (the relative_attention_bias below, which only the first block carries):
(0): T5LayerSelfAttention(
  (SelfAttention): T5Attention(
    (q): Linear(in_features=512, out_features=512, bias=False)
    (k): Linear(in_features=512, out_features=512, bias=False)
    (v): Linear(in_features=512, out_features=512, bias=False)
    (o): Linear(in_features=512, out_features=512, bias=False)
    (relative_attention_bias): Embedding(32, 8)
  )
  (layer_norm): T5LayerNorm()
  (dropout): Dropout(p=0.1, inplace=False)
)
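For reference, relative_attention_bias here is Embedding(32, 8): 32 relative-position buckets, each mapped to one bias value per attention head. Below is a minimal sketch of how such a table becomes an additive attention bias; the bucket ids are random placeholders, while the real bucketing logic lives in T5Attention.compute_bias / _relative_position_bucket in the transformers source:

import torch
from torch import nn

num_buckets, n_heads = 32, 8
relative_attention_bias = nn.Embedding(num_buckets, n_heads)

q_len, k_len = 5, 5
# placeholder bucket ids; in T5 they are derived from the relative distance between positions
buckets = torch.randint(0, num_buckets, (q_len, k_len))

values = relative_attention_bias(buckets)              # (q_len, k_len, n_heads)
position_bias = values.permute(2, 0, 1).unsqueeze(0)   # (1, n_heads, q_len, k_len)
print(position_bias.shape)                             # torch.Size([1, 8, 5, 5])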
Replacing the T5LayerCrossAttention part with T5LayerSelfAttention raises an error, which finally reveals where the two differ.
Looking carefully at the difference between the T5LayerCrossAttention and T5LayerSelfAttention layers in transformers:
class T5LayerSelfAttention(nn.Module):
    ......
    def forward(
        self,
        hidden_states,
        attention_mask=None,
        position_bias=None,
        layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        output_attentions=False,
    ):

class T5LayerCrossAttention(nn.Module):
    ......
    def forward(
        self,
        hidden_states,
        key_value_states,
        attention_mask=None,
        position_bias=None,
        layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        query_length=None,
        output_attentions=False,
    ):
T5LayerCrossAttention takes two more parameters than T5LayerSelfAttention: key_value_states and query_length.
Everything else is identical.
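A rough sketch of how the two layers are called inside a decoder block (assuming a transformers version where they can be imported from transformers.models.t5.modeling_t5; the tensors are random stand-ins):

import torch
from transformers import T5Config
from transformers.models.t5.modeling_t5 import T5LayerSelfAttention, T5LayerCrossAttention

config = T5Config()                       # t5-small sized defaults: d_model=512, 8 heads
self_attn = T5LayerSelfAttention(config)
cross_attn = T5LayerCrossAttention(config)

decoder_hidden = torch.randn(2, 7, 512)   # decoder-side hidden states
encoder_hidden = torch.randn(2, 11, 512)  # encoder output

# self-attention: q, k and v all come from the decoder states
hidden = self_attn(decoder_hidden)[0]

# cross-attention: queries from the decoder, keys/values from the encoder output
hidden = cross_attn(hidden, key_value_states=encoder_hidden)[0]
print(hidden.shape)                       # torch.Size([2, 7, 512])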
Going back to the model in bert4keras, we find:
x = self.apply(
    inputs=[x, x, x, attention_mask, position_bias[0]],
    layer=MultiHeadAttention,
    arguments={
        'a_bias': True,
        'p_bias': 't5_relative'
    },
    heads=self.num_attention_heads,
    head_size=self.attention_head_size,
    out_dim=self.hidden_size,
    key_size=self.attention_key_size,
    use_bias=False,
    attention_scale=False,
    kernel_initializer=self.initializer,
    name=self_attention_name
)
The a_bias here is essentially the lower-triangular (causal) attention mask.
class LM_Mask(object):
    """Lower-triangular attention mask (for language models).
    """
    def compute_attention_bias(self, inputs=None):
        """Build the mask by comparing positions in the idxs sequence.
        """
        if self.attention_bias is None:

            def lm_mask(s):
                seq_len = K.shape(s)[1]
                idxs = K.arange(0, seq_len)
                mask = idxs[None, :] <= idxs[:, None]
                mask = K.cast(mask, K.floatx())
                return -(1 - mask[None, None]) * 1e12

            self.attention_bias = self.apply(
                inputs=self.inputs[0],
                layer=Lambda,
                function=lm_mask,
                name='Attention-LM-Mask'
            )

        return self.attention_bias
When a_bias == True, the LM_Mask (lower-triangular mask) is added first, followed by the relative position bias; otherwise only the relative position bias is added.
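What lm_mask computes can be written out in a few lines of PyTorch (a sketch of the same lower-triangular bias, not the bert4keras code itself):

import torch

seq_len = 4
idxs = torch.arange(seq_len)
mask = (idxs[None, :] <= idxs[:, None]).float()   # 1 where the key position <= the query position
attention_bias = -(1 - mask[None, None]) * 1e12   # 0 on visible positions, -1e12 on future ones
print(mask)
# tensor([[1., 0., 0., 0.],
#         [1., 1., 0., 0.],
#         [1., 1., 1., 0.],
#         [1., 1., 1., 1.]])
print(attention_bias.shape)                       # torch.Size([1, 1, 4, 4])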