BERT (Bidirectional Encoder Representations from Transformers) is work from Google AI Language (released on arXiv in 2018 and published at NAACL 2019) that in some sense opened a new era for NLP. It uses the encoder architecture of the Transformer and is built around two pre-training tasks, Masked Language Modeling (MLM) and Next Sentence Prediction (NSP): the model is pre-trained on large amounts of unlabeled data and then fine-tuned on the dataset of each downstream task.
Its counterpart is OpenAI's GPT series, which uses the decoder architecture of the Transformer. BERT, however, has been far more influential, with roughly an order of magnitude more citations than the early GPT models, and it is widely used by researchers.
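To make the two pre-training objectives concrete before diving into the code, here is a rough sketch (my own illustration with a toy vocabulary and simplified masking, not code from the paper or the repository) of how one pre-training example is built: sentences A and B are packed as [CLS] A [SEP] B [SEP], segment labels mark which sentence each token belongs to, about 15% of the tokens are masked and become MLM targets, and the is_next label says whether B actually followed A in the corpus.

    import random

    # Toy vocabulary; real BERT uses a WordPiece vocabulary of roughly 30k entries
    vocab = {"[PAD]": 0, "[CLS]": 1, "[SEP]": 2, "[MASK]": 3,
             "the": 4, "cat": 5, "sat": 6, "it": 7, "slept": 8}

    sent_a = ["the", "cat", "sat"]
    sent_b = ["it", "slept"]
    is_next = 1        # B really followed A; a negative sample would use a random B and is_next = 0

    tokens   = ["[CLS]"] + sent_a + ["[SEP]"] + sent_b + ["[SEP]"]
    segments = [1] * (len(sent_a) + 2) + [2] * (len(sent_b) + 1)   # sentence A -> 1, sentence B -> 2

    # Simplified MLM masking: each non-special token is masked with probability 0.15
    mlm_labels = [0] * len(tokens)                                 # 0 = "not a prediction target"
    for i, tok in enumerate(tokens):
        if tok not in ("[CLS]", "[SEP]") and random.random() < 0.15:
            mlm_labels[i] = vocab[tok]                             # remember the original token id
            tokens[i] = "[MASK]"                                   # (the real recipe also keeps/randomizes some tokens)

    input_ids = [vocab[t] for t in tokens]
    print(input_ids, segments, mlm_labels, is_next)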
Here I share my walkthrough of BERT's main network code. Time was limited and this was written somewhat hastily, so errors are hard to avoid; corrections are welcome.
For the code, I use a third-party PyTorch re-implementation.
paper:https://arxiv.org/pdf/1810.04805.pdf
code:https://github.com/codertimo/BERT-pytorch
1. BERT's main network code. I have not included the internals of the Transformer (multi-head attention, sublayer connection, feed-forward network); readers who need them can find them in the repository above.
import torch.nn as nn
import torch
import math

from .attention import MultiHeadedAttention
from .utils import SublayerConnection, PositionwiseFeedForward


#-----------------------------------------------------------------#
#   BERT: Bidirectional Encoder Representations from Transformers
#-----------------------------------------------------------------#
class BERT(nn.Module):

    def __init__(self, vocab_size, hidden=768, n_layers=12, attn_heads=12, dropout=0.1):
        """
        :param vocab_size: vocab size of the total words
        :param hidden:     BERT model hidden size
        :param n_layers:   number of Transformer blocks (layers), i.e. the number of encoder layers
        :param attn_heads: number of attention heads
        :param dropout:    dropout rate
        """
        super(BERT, self).__init__()
        self.hidden = hidden
        self.n_layers = n_layers
        self.attn_heads = attn_heads
        #----------------------------------------------#
        #   For the feed-forward network the paper uses
        #   feed_forward_hidden = 4 * hidden_size
        #----------------------------------------------#
        self.feed_forward_hidden = hidden * 4
        #----------------------------------------------#
        #   BERT's input encoding: the sum of the token,
        #   segment and positional embeddings
        #----------------------------------------------#
        self.embedding = BERTEmbedding(vocab_size=vocab_size, embed_size=hidden)
        #-------------------------------------------------#
        #   A stack of n_layers Transformer encoder blocks
        #-------------------------------------------------#
        self.transformer_blocks = nn.ModuleList(
            [TransformerBlock(hidden, attn_heads, hidden * 4, dropout) for _ in range(n_layers)])

    def forward(self, x, segment_info):
        # attention masking for the padded tokens (id 0),
        # shape: [batch_size, 1, seq_len, seq_len]
        mask = (x > 0).unsqueeze(1).repeat(1, x.size(1), 1).unsqueeze(1)
        #---------------------------------------------------#
        #   Embed the token-id sequence into a sequence of vectors
        #---------------------------------------------------#
        x = self.embedding(x, segment_info)
        #---------------------------------------------------#
        #   Pass through the stack of Transformer encoder layers
        #---------------------------------------------------#
        for transformer in self.transformer_blocks:
            x = transformer.forward(x, mask)

        return x
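As a quick sanity check of the mask construction in forward, the sketch below (my own toy example, not part of the repository) builds the padding mask for a small batch and prints its shape; columns belonging to padding tokens (id 0) are masked out for every query position.

    import torch

    # Toy batch of token ids; 0 is the padding index, matching padding_idx=0 below
    x = torch.tensor([[5, 8, 2, 0, 0],
                      [7, 3, 9, 4, 0]])

    mask = (x > 0).unsqueeze(1).repeat(1, x.size(1), 1).unsqueeze(1)
    print(mask.shape)   # torch.Size([2, 1, 5, 5]) -> [batch_size, 1, seq_len, seq_len]
    print(mask[0, 0])   # each row marks which key positions the queries may attend to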

#---------------------------------------------------------------------------------#
#   Definition of the BERTEmbedding class.
#   The BERT embedding consists of the following features:
#       1. TokenEmbedding      : normal embedding matrix
#       2. PositionalEmbedding : adds positional information using sin, cos
#       3. SegmentEmbedding    : adds sentence segment info, (sent_A:1, sent_B:2)
#---------------------------------------------------------------------------------#
class BERTEmbedding(nn.Module):

    def __init__(self, vocab_size, embed_size, dropout=0.1):
        """
        :param vocab_size: total vocab size
        :param embed_size: embedding size of the token embedding
        :param dropout:    dropout rate
        """
        super(BERTEmbedding, self).__init__()
        self.embed_size = embed_size
        self.token = TokenEmbedding(vocab_size=vocab_size, embed_size=embed_size)
        self.position = PositionalEmbedding(d_model=self.token.embedding_dim)
        self.segment = SegmentEmbedding(embed_size=self.token.embedding_dim)
        self.dropout = nn.Dropout(p=dropout)

    #------------------------------------------------------------------------#
    #   Sum the TokenEmbedding, PositionalEmbedding and SegmentEmbedding,
    #   then apply Dropout to reduce the risk of overfitting; the result is
    #   the output of BERTEmbedding.
    #------------------------------------------------------------------------#
    def forward(self, sequence, segment_label):
        x = self.token(sequence) + self.position(sequence) + self.segment(segment_label)
        return self.dropout(x)


#-----------------------------------------------#
#   Definitions of TokenEmbedding and SegmentEmbedding.
#   Both inherit from nn.Embedding.
#   SegmentEmbedding has 3 entries: 0 for padding,
#   1 for sentence A, 2 for sentence B.
#-----------------------------------------------#
class TokenEmbedding(nn.Embedding):
    def __init__(self, vocab_size, embed_size=512):
        super(TokenEmbedding, self).__init__(vocab_size, embed_size, padding_idx=0)


class SegmentEmbedding(nn.Embedding):
    def __init__(self, embed_size=512):
        super(SegmentEmbedding, self).__init__(3, embed_size, padding_idx=0)

#-----------------------------------#
#   Definition of PositionalEmbedding
#-----------------------------------#
class PositionalEmbedding(nn.Module):

    def __init__(self, d_model, max_len=512):
        super(PositionalEmbedding, self).__init__()
        #------------------------------------------------#
        #   Pre-compute the sinusoidal position encodings
        #------------------------------------------------#
        pe = torch.zeros(max_len, d_model).float()
        pe.requires_grad = False
        position = torch.arange(0, max_len).float().unsqueeze(1)
        div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp()
        #----------------------------------------#
        #   Even embedding dimensions use sin,
        #   odd embedding dimensions use cos
        #----------------------------------------#
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        #------------------------------------------#
        #   Register pe as a buffer so that it is saved
        #   with the state dict but never updated by
        #   gradient descent during training
        #------------------------------------------#
        self.register_buffer('pe', pe)

    def forward(self, x):
        # returns [1, seq_len, d_model]; broadcast over the batch when summed in BERTEmbedding
        return self.pe[:, :x.size(1)]
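For reference, div_term above equals $10000^{-2i/d_{model}}$ written via an exponential, so pe stores the sinusoidal encoding from the original Transformer paper (note that the official BERT learns its position embeddings instead; the fixed sinusoidal variant is a choice of this re-implementation):

\[
PE_{(pos,\,2i)} = \sin\!\left(\frac{pos}{10000^{2i/d_{model}}}\right), \qquad
PE_{(pos,\,2i+1)} = \cos\!\left(\frac{pos}{10000^{2i/d_{model}}}\right)
\]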

#-----------------------------------------------#
#   Definition of the Transformer encoder block.
#   BERT only uses the encoder part of the Transformer;
#   positional and token embeddings are handled in
#   BERTEmbedding above.
#-----------------------------------------------#
class TransformerBlock(nn.Module):
    """
    Bidirectional Encoder = Transformer (self-attention)
    Transformer = MultiHead_Attention + Feed_Forward with sublayer connection
    """

    def __init__(self, hidden, attn_heads, feed_forward_hidden, dropout):
        """
        :param hidden:              hidden size of the transformer
        :param attn_heads:          number of multi-head attention heads
        :param feed_forward_hidden: feed-forward hidden size, usually 4*hidden_size
        :param dropout:             dropout rate
        """
        super(TransformerBlock, self).__init__()
        self.attention = MultiHeadedAttention(h=attn_heads, d_model=hidden)
        self.feed_forward = PositionwiseFeedForward(d_model=hidden, d_ff=feed_forward_hidden, dropout=dropout)
        self.input_sublayer = SublayerConnection(size=hidden, dropout=dropout)
        self.output_sublayer = SublayerConnection(size=hidden, dropout=dropout)
        self.dropout = nn.Dropout(p=dropout)

    #-------------------------------------#
    #   1. multi-head self-attention
    #   2. input_sublayer (add & norm)
    #   3. position-wise feed-forward
    #   4. output_sublayer (add & norm)
    #   5. dropout
    #-------------------------------------#
    def forward(self, x, mask):
        x = self.input_sublayer(x, lambda _x: self.attention.forward(_x, _x, _x, mask=mask))
        x = self.output_sublayer(x, self.feed_forward)
        return self.dropout(x)
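A quick usage sketch of the backbone (my own example, not from the repository; it assumes the classes above, together with the repo's MultiHeadedAttention, SublayerConnection and PositionwiseFeedForward modules, are importable). It maps a batch of token ids plus segment labels to one hidden vector per token:

    import torch

    # Toy configuration chosen for speed; the real "BERT base" uses hidden=768, n_layers=12, attn_heads=12
    bert = BERT(vocab_size=30000, hidden=256, n_layers=4, attn_heads=4, dropout=0.1)

    tokens = torch.randint(1, 30000, (8, 64))    # [batch_size=8, seq_len=64], id 0 is reserved for padding
    segments = torch.cat([torch.ones(8, 32, dtype=torch.long),
                          torch.full((8, 32), 2, dtype=torch.long)], dim=1)  # sentence A = 1, sentence B = 2

    out = bert(tokens, segments)
    print(out.shape)  # torch.Size([8, 64, 256]) -> one hidden vector per input token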
2. The models for BERT's two pre-training tasks: the Masked Language Model (MLM) and the Next Sentence Prediction model.
import torch.nn as nn

from .bert import BERT


#----------------------------------------------------------------#
#   Definition of the BERT Language Model:
#   Next Sentence Prediction Model + Masked Language Model.
#   The annotation "bert: BERT" is a Python type hint: bert is the
#   parameter name inside this class, BERT is the class of the
#   object being passed in.
#----------------------------------------------------------------#
class BERTLM(nn.Module):

    def __init__(self, bert: BERT, vocab_size):
        """
        :param bert:       BERT model which should be trained
        :param vocab_size: total vocab size for masked_lm
        """
        super(BERTLM, self).__init__()
        self.bert = bert
        #-------------------------------------------#
        #   The heads for the two pre-training tasks
        #-------------------------------------------#
        self.next_sentence = NextSentencePrediction(self.bert.hidden)
        self.mask_lm = MaskedLanguageModel(self.bert.hidden, vocab_size)

    #--------------------------------------------------------#
    #   Returns two outputs: NSP log-probabilities and
    #   MLM log-probabilities over the vocabulary
    #--------------------------------------------------------#
    def forward(self, x, segment_label):
        x = self.bert(x, segment_label)
        return self.next_sentence(x), self.mask_lm(x)

#-----------------------------------------------------#
#   Definition of the Next Sentence Prediction model:
#   2-class classification : is_next, is_not_next
#-----------------------------------------------------#
class NextSentencePrediction(nn.Module):

    def __init__(self, hidden):
        """
        :param hidden: BERT model output size
        """
        super(NextSentencePrediction, self).__init__()
        #---------------------------------------------------#
        #   The linear layer maps the BERT output size -> 2,
        #   corresponding to the classes is_next / is_not_next.
        #   dim=-1 applies LogSoftmax over the last dimension.
        #---------------------------------------------------#
        self.linear = nn.Linear(hidden, 2)
        self.softmax = nn.LogSoftmax(dim=-1)

    #----------------------------------------------------------#
    #   x[:, 0] takes the hidden state at position 0 (the [CLS]
    #   token) of every sequence in the batch
    #----------------------------------------------------------#
    def forward(self, x):
        return self.softmax(self.linear(x[:, 0]))

#------------------------------------------------------#
#   Definition of the Masked Language Model head:
#   predicting the original token at each position of the
#   masked input sequence; an n-class classification
#   problem with n = vocab_size
#------------------------------------------------------#
class MaskedLanguageModel(nn.Module):

    def __init__(self, hidden, vocab_size):
        """
        :param hidden:     output size of BERT model
        :param vocab_size: total vocab size
        """
        super(MaskedLanguageModel, self).__init__()
        self.linear = nn.Linear(hidden, vocab_size)
        self.softmax = nn.LogSoftmax(dim=-1)

    #--------------------------------------------------------#
    #   Project every position to the vocabulary and return
    #   the log-probability distribution over tokens
    #--------------------------------------------------------#
    def forward(self, x):
        return self.softmax(self.linear(x))
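Finally, a minimal training-step sketch (my own example, not from the repository; it assumes the classes above are importable). Since both heads end in LogSoftmax, nn.NLLLoss is a natural choice; for the MLM loss, positions that were not masked carry label 0 and are skipped via ignore_index=0 (this labeling convention is an assumption of the sketch):

    import torch
    import torch.nn as nn

    vocab_size = 30000
    bert = BERT(vocab_size=vocab_size, hidden=256, n_layers=4, attn_heads=4)
    model = BERTLM(bert, vocab_size)

    tokens = torch.randint(1, vocab_size, (8, 64))       # masked input ids
    segments = torch.ones(8, 64, dtype=torch.long)       # all sentence A, for simplicity
    is_next = torch.randint(0, 2, (8,))                  # NSP labels: 1 = is_next, 0 = is_not_next
    mlm_labels = torch.zeros(8, 64, dtype=torch.long)    # 0 everywhere except masked positions
    mlm_labels[:, 5] = tokens[:, 5]                      # pretend position 5 was masked

    nsp_out, mlm_out = model(tokens, segments)           # shapes [8, 2] and [8, 64, vocab_size]

    nsp_criterion = nn.NLLLoss()
    mlm_criterion = nn.NLLLoss(ignore_index=0)           # skip positions that are not MLM targets

    nsp_loss = nsp_criterion(nsp_out, is_next)
    mlm_loss = mlm_criterion(mlm_out.transpose(1, 2), mlm_labels)  # NLLLoss expects [N, C, ...]
    loss = nsp_loss + mlm_loss
    print(loss.item())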