Bert代码详解(一)重点详细_bert bertlayernorm

这是BERT的PyTorch版本(与TensorFlow版本逻辑一致,但更简单一些;看懂了这个,TF版本也就能看懂),地址:https://github.com/huggingface/pytorch-pretrained-BERT 。主要内容在 pytorch_pretrained_bert/modeling.py 文件中。

BertModel 流程详解

# Reshape attention_mask to (batch_size, 1, 1, to_seq_length).
# NOTE(review): the statement that performs that reshape and first assigns
# extended_attention_mask is not shown in this excerpt.
# Cast the mask to the dtype of the model's first parameter.
extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
# Remap the mask values: 1.0 becomes 0.0 and 0.0 becomes -10000.0.
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
# Embed the token ids together with their token type ids.
embedding_output = self.embeddings(input_ids, token_type_ids)
  • 2


# Inputs are input_ids and token_type_ids; per the article both are shaped (batch_size, seq_length).
# For a sentence of length seq_length, the generated position_ids are [0, 1, 2, ..., seq_length - 1].
# NOTE(review): seq_length is not assigned in this excerpt -- presumably it is read from input_ids; confirm.
position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
# Add a leading dimension, then expand to the shape of input_ids.
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
# When no token type ids are supplied, use zeros for every position.
if token_type_ids is None:
    token_type_ids = torch.zeros_like(input_ids)
# Look up the three embeddings: word, position and token type.
words_embeddings = self.word_embeddings(input_ids)
position_embeddings = self.position_embeddings(position_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
# Sum the three embeddings, then apply LayerNorm followed by dropout.
embeddings = words_embeddings + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
# For the difference between LayerNorm and BatchNorm (and what each is for), see the explanations available online.
# Mean over the last dimension.
u = x.mean(-1, keepdim=True)
# Mean squared deviation from that mean, also over the last dimension.
s = (x - u).pow(2).mean(-1, keepdim=True)
# Normalize; variance_epsilon is added under the square root.
x = (x - u) / torch.sqrt(s + self.variance_epsilon)
# Scale by weight and shift by bias.
return self.weight * x + self.bias
# Embed the inputs, then run the encoder on the embeddings and the extended mask.
embedding_output = self.embeddings(input_ids, token_type_ids)
encoded_layers = self.encoder(embedding_output, extended_attention_mask,output_all_encoded_layers=output_all_encoded_layers)
# Transformer architecture reference: https://zhuanlan.zhihu.com/p/39034683        (BE CAUTIOUS!)
# The code below first creates a single layer, then puts num_hidden_layers (12 or 24 per the article) deep copies of it into a list, which becomes self.layer.
layer = BertLayer(config)
self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])
def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True):
    """Run hidden_states through every layer in self.layer, in order.

    Args:
        hidden_states: Input to the first layer. Per the article this is
            embedding_output, shaped [batch_size, seq_length, word_dimension]
            -- TODO confirm against the caller.
        attention_mask: Passed unchanged to every layer. Per the article it
            is shaped [batch_size, 1, 1, seq_length] -- TODO confirm.
        output_all_encoded_layers: When true, collect the output of every
            layer; otherwise collect only the output of the last layer.

    Returns:
        A list of layer outputs: one entry per layer when
        output_all_encoded_layers is true, otherwise a single entry holding
        the final hidden_states.
    """
    all_encoder_layers = []
    for layer_module in self.layer:
        hidden_states = layer_module(hidden_states, attention_mask)
        # Record every intermediate result only when the caller asked for it.
        if output_all_encoded_layers:
            all_encoder_layers.append(hidden_states)
    # Otherwise only the last layer's output is kept (still wrapped in a list).
    if not output_all_encoded_layers:
        all_encoder_layers.append(hidden_states)
    return all_encoder_layers
# Attention sublayer, given the hidden states and the attention mask.
attention_output = self.attention(hidden_states, attention_mask)
# Intermediate sublayer, applied to the attention output.
intermediate_output = self.intermediate(attention_output)
# Output sublayer; it receives both the intermediate output and the attention output.
layer_output = self.output(intermediate_output, attention_output)
不幸的是,这个attention层又引用了其他层,一环套一环。为了看起来方便,我决定将这些被引用层的讲解一并放到这个jupyter cell中,而不像之前那样一个model放在一个jupyter cell中。

# BertAttention takes two inputs. The first is input_tensor (the previous
# hidden_states; for the first layer this is embedding_output), described in
# the article as [batch_size, seq_length, word_dimension].
# The second is attention_mask, described as (batch_size, 1, 1, seq_length).
def forward(self, input_tensor, attention_mask):
    """Apply self-attention, then the output sublayer.

    Args:
        input_tensor: Passed to the self-attention sublayer and, unchanged,
            as the second argument of the output sublayer.
        attention_mask: Forwarded to the self-attention sublayer.

    Returns:
        Whatever self.output returns for (self-attention result, input_tensor).
    """
    attended = self.self(input_tensor, attention_mask)  # BertSelfAttention layer
    return self.output(attended, input_tensor)  # BertSelfOutput layer
# Next comes the exciting self-attention layer (it did not get a Jupyter cell of its own).
        #num_attention_heads: Number of attention heads for each attention layer in the Transformer encoder
        self.num_attention_heads = config.num_attention_heads
        # Size of one head: hidden_size divided by the number of heads, truncated to int.
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        # Combined width of all heads.
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        # Each projection maps hidden_size -> all_head_size, i.e. a hidden_size * all_head_size matrix (768*768 in the article's example).
        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
    # First, the inputs. hidden_states: (batch_size, seq_length, word_dimension = hidden_size = 768) per the article (looking closely at the embedding code, its output dimension is indeed hidden_size).
    # The other input: attention_mask, (batch_size, 1, 1, seq_length) per the article.
    def forward(self, hidden_states, attention_mask):
        """Compute multi-head scaled dot-product self-attention.

        Args:
            hidden_states: Input to the query, key and value projections.
                Per the article: [batch_size, seq_length, hidden_size].
            attention_mask: Added to the scaled attention scores before the
                softmax. Per the article: (batch_size, 1, 1, seq_length).

        Returns:
            The attention output with the head dimensions merged back into
            a single trailing dimension of size self.all_head_size.
        """
        # Roughly: query and key determine the weights, and the weights then
        # multiply value to give the attention result.
        # Each projection is [batch_size, seq_length, hidden_size] x
        # [hidden_size, all_head_size], giving
        # [batch_size, seq_length, all_head_size = 768] in the article's example.
        projections = (self.query, self.key, self.value)
        mixed_layers = [project(hidden_states) for project in projections]
        # transpose_for_scores turns [batch_size, seq_length, all_head_size = 768]
        # into [batch_size, num_attention_heads=12, seq_length, attention_head_size=64]
        # (shapes as given in the article; the helper is defined elsewhere).
        query_heads, key_heads, value_heads = (
            self.transpose_for_scores(mixed) for mixed in mixed_layers
        )

        # query x key^T has shape [batch_size, num_attention_heads, seq_length, seq_length].
        # Look at the last two dimensions, A[seq_length, seq_length]: self-attention
        # is a sentence attending to itself, so A[i][j] is the (attention) weight
        # of word j on word i. Example for "I am so handsome" (numbers made up):
        #                                   I    am    so    handsome
        #                               I   3    4     -10    3
        #                               am  4    6     9      1
        #                               so  2    4     1      2
        #                         handsome  3    12    1      0
        scale = math.sqrt(self.attention_head_size)
        raw_scores = torch.matmul(query_heads, key_heads.transpose(-1, -2)) / scale

        # Suppose "handsome" in the example above is padding, i.e. useless
        # information; then attention_mask = [0,0,0,-10000], and adding it
        # pushes that column's score far down before the softmax.
        masked_scores = raw_scores + attention_mask
        probabilities = nn.Softmax(dim=-1)(masked_scores)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        probabilities = self.dropout(probabilities)

        # Shape becomes [batch_size, num_attention_heads, seq_length, attention_head_size].
        per_head_context = torch.matmul(probabilities, value_heads)

        # Convert [batch_size, num_attention_heads, seq_length, attention_head_size]
        # back to [batch_size, seq_length, all_head_size] -- the shape we started with.
        merged = per_head_context.permute(0, 2, 1, 3).contiguous()
        merged_shape = merged.size()[:-2] + (self.all_head_size,)
        return merged.view(*merged_shape)
# 1. a fully connected layer  2. a dropout layer  3. a layer-norm layer
# Per the article, the final output is [batch_size, seq_length, hidden_size=768].
def forward(self, hidden_states, input_tensor):
    """Project, apply dropout, add input_tensor, then layer-normalize.

    Args:
        hidden_states: Input to the dense projection.
        input_tensor: Added to the projected result before LayerNorm.

    Returns:
        The result of self.LayerNorm applied to the sum.
    """
    projected = self.dense(hidden_states)
    regularized = self.dropout(projected)
    # input_tensor is added before normalization, not after.
    return self.LayerNorm(regularized + input_tensor)
#1、一个全连接层 2、一个激活层
#具体地,输入为[batch_size, seq_length, hidden_size = 768]
    def forward(self, hidden_states):
        """Apply the dense projection followed by the activation function.

        Args:
            hidden_states: Input to self.dense. Per the article:
                [batch_size, seq_length, hidden_size = 768].

        Returns:
            The activated projection. Per the article its shape is
            [batch_size, seq_length, intermediate_size = 4*768].
        """
        # [batch_size, seq_length, all_head_size = 768] x
        # [hidden_size, intermediate_size = 4*768] (the article notes that
        # both the paper and the code use this setting).
        expanded = self.dense(hidden_states)
        return self.intermediate_act_fn(expanded)
# Input shape per the article: [batch_size, seq_length, intermediate_size=4*768]
# Output shape per the article: [batch_size, seq_length, hidden_size=768]
class BertOutput(nn.Module):
    """Project from intermediate_size to hidden_size, then dropout, add and LayerNorm."""

    def __init__(self, config):
        """Build the sublayers from config.

        Reads config.intermediate_size, config.hidden_size and
        config.hidden_dropout_prob.
        """
        super(BertOutput, self).__init__()
        hidden_size = config.hidden_size
        # Submodules are assigned in the same order as before: dense, LayerNorm, dropout.
        self.dense = nn.Linear(config.intermediate_size, hidden_size)
        self.LayerNorm = BertLayerNorm(hidden_size, eps=1e-12)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """Return LayerNorm(dropout(dense(hidden_states)) + input_tensor)."""
        projected = self.dense(hidden_states)
        regularized = self.dropout(projected)
        # input_tensor is added before normalization.
        return self.LayerNorm(regularized + input_tensor)
# Run the encoder; output_all_encoded_layers is forwarded unchanged.
encoded_layers = self.encoder(embedding_output, extended_attention_mask,output_all_encoded_layers=output_all_encoded_layers)
# The last entry of encoded_layers is the input to the pooler.
sequence_output = encoded_layers[-1]
pooled_output = self.pooler(sequence_output)
# When not all layers were requested, return just that last entry instead of the whole sequence.
if not output_all_encoded_layers:
    encoded_layers = encoded_layers[-1]
return encoded_layers, pooled_output
# As explained above, the pooler's input is the output of the last transformer
# layer; per the article it is [batch_size, seq_length, hidden_size].
def forward(self, hidden_states):
    """Pool the sequence by transforming the state at index 0 of the second axis.

    Args:
        hidden_states: Indexed as hidden_states[:, 0], so it must support
            at least two-dimensional indexing.

    Returns:
        self.activation(self.dense(first-token state)).
    """
    # We "pool" the model by simply taking the hidden state corresponding
    # to the first token.
    cls_state = hidden_states[:, 0]
    projected = self.dense(cls_state)
    return self.activation(projected)
