
[PyTorch] torch.nn.Transformer Explained and Applied


nn.TransformerEncoderLayer

        This class is a building block of the transformer encoder: it represents a single encoder layer, and the full encoder is simply this TransformerEncoderLayer stacked several times.

Args:
d_model: the number of expected features in the input (required).
nhead: the number of heads in the multiheadattention models (required).
dim_feedforward: the dimension of the feedforward network model (default=2048).
dropout: the dropout value (default=0.1).
activation: the activation function of intermediate layer, relu or gelu (default=relu).

Examples::
encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)
src = torch.rand(10, 32, 512)  # (seq_length, batch, d_model)
out = encoder_layer(src)       # output keeps the same shape as src

        Note that the transformer modules expect input of shape seq_length x batch x dim.

nn.TransformerEncoder

        This is the encoder part of the transformer: initialize it with the encoder layer described above to obtain a TransformerEncoder.

Args
encoder_layer: an instance of the TransformerEncoderLayer() class (required).
num_layers: the number of sub-encoder-layers in the encoder (required).
norm: the layer normalization component (optional).

Examples::
encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)
transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=6)
src = torch.rand(10, 32, 512)
out = transformer_encoder(src)
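
        The optional norm argument adds a final layer normalization after the stacked layers. A minimal sketch (assuming d_model=512):

encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)
transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=6, norm=nn.LayerNorm(512))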

PositionalEncoding

        I will not go through the math in detail here; the key point is that positional encoding produces position information, which is simply added to the embeddings.
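
        For reference, the standard sinusoidal formulation that the code below implements is (pos is the position, i the dimension index):

PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))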

import math

import torch
import torch.nn as nn


class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        # precompute the (max_len, d_model) table of sinusoidal encodings
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)  # (max_len, 1, d_model)
        # register as a buffer: moves with the module but is not a learnable parameter
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: (seq_length, batch, d_model); add the encoding for each position
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)
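
        A quick usage sketch (assuming d_model=512): the module keeps the input shape and only adds position information.

pos_encoder = PositionalEncoding(d_model=512, dropout=0.1)
x = torch.rand(10, 32, 512)   # (seq_length, batch, d_model)
x = pos_encoder(x)            # same shape, now position-aware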

TransformerModel

        The following is based on the PyTorch tutorial.

class First_TransformerModel(nn.Module):
    def __init__(self, ninp=300, nhead=4, nhid=128, nlayers=6, dropout=0.5):
        super(First_TransformerModel, self).__init__()
        from torch.nn import TransformerEncoder, TransformerEncoderLayer
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.ninp = ninp

    def _generate_square_subsequent_mask(self, src, lenths):
        '''
        Build the padding mask.
        src: (max_lenth, num, 300)
        lenths: [lenth1, lenth2, ...]
        '''
        # mask: (num_of_sens, max_lenth); True marks padded positions
        mask = torch.ones(src.size(1), src.size(0)) == 1
        for i in range(len(lenths)):
            lenth = lenths[i]
            for j in range(lenth):
                mask[i][j] = False
        return mask

    def forward(self, src, mask):
        '''
        src: (max_lenth, num_of_all_sens, 300)
        '''
        self.src_mask = mask
        src = src * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, src_key_padding_mask=self.src_mask)
        output = output[0, :, :]  # first position used as the sentence representation
        return output


# PositionalEncoding is the same class as defined above.

        Here we just apply the following to the input src (seq_length x batch x ninp): multiply by sqrt(ninp), pass it through the positional encoder, then through the encoder.

src = src * math.sqrt(self.ninp)
src = self.pos_encoder(src)
output = self.transformer_encoder(src, src_key_padding_mask=self.src_mask)

        One more thing worth mentioning is the mask.

        What is the mask? There are two main kinds: src_mask and src_key_padding_mask. Here we focus on src_key_padding_mask.

        In nn.Transformer, the size of src_key_padding_mask must be N x S, i.e. batch x seq_length. With this mask, the padded positions are ignored, so the attention mechanism no longer takes them into account.

        Note that src_key_padding_mask is a boolean tensor: positions to be ignored should be True, and positions to keep should be False.
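
        As an illustration, here is a minimal, vectorized way to build such a padding mask from a list of true lengths (a sketch; the loop-based version used in the model below is equivalent):

lengths = torch.tensor([5, 3, 4])                         # true length of each sequence in the batch
max_len = 6
positions = torch.arange(max_len).unsqueeze(0)            # (1, max_len)
src_key_padding_mask = positions >= lengths.unsqueeze(1)  # (batch, max_len); True = padded position
# tensor([[False, False, False, False, False,  True],
#         [False, False, False,  True,  True,  True],
#         [False, False, False, False,  True,  True]])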

        Below is my two-level transformer code.

First layer

class First_TransformerModel(nn.Module):
    def __init__(self, ninp=300, nhead=4, nhid=128, nlayers=6, dropout=0.5):
        super(First_TransformerModel, self).__init__()
        from torch.nn import TransformerEncoder, TransformerEncoderLayer
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        # self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        # self.decoder = nn.Linear(ninp, ntoken)

    def _generate_square_subsequent_mask(self, src, lenths):
        '''
        Build the padding mask.
        src: (max_lenth, num, 300)
        lenths: [lenth1, lenth2, ...]
        '''
        # mask: (num_of_sens, max_lenth); True marks padded positions
        mask = torch.ones(src.size(1), src.size(0)) == 1
        for i in range(len(lenths)):
            lenth = lenths[i]
            for j in range(lenth):
                mask[i][j] = False
        # mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        # mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def forward(self, src, mask):
        '''
        src: (max_lenth, num_of_all_sens, 300)
        '''
        self.src_mask = mask
        src = src * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, src_key_padding_mask=self.src_mask)
        output = output[0, :, :]  # first position used as the sentence representation
        # output = self.decoder(output)
        return output


# PositionalEncoding is the same class as defined above.
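
        A minimal usage sketch for this first layer (hypothetical shapes; nlayers is reduced just to keep the example light):

model = First_TransformerModel(ninp=300, nhead=4, nhid=128, nlayers=2)
src = torch.rand(20, 5, 300)                                # (max_lenth, num_of_sentences, 300)
lenths = [20, 15, 12, 20, 7]                                # true (unpadded) length of each sentence
mask = model._generate_square_subsequent_mask(src, lenths)  # (5, 20), True marks padding
out = model(src, mask)
print(out.shape)                                            # torch.Size([5, 300]): one vector per sentence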

Second layer

# second level
class Second_TransformerModel(nn.Module):
    def __init__(self, ninp=300, nhead=4, nhid=128, nlayers=6, dropout=0.5):
        super(Second_TransformerModel, self).__init__()
        from torch.nn import TransformerEncoder, TransformerEncoderLayer
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.ninp = ninp

    def _generate_square_subsequent_mask(self, src, lenths):
        '''
        Build the padding mask.
        src: (num_of_sentences, batch (number of documents), 300)
        lenths: [lenth1, lenth2, ...]
        '''
        # mask: (num_of_docs, max_doc_size); True marks padded positions
        mask = torch.ones(src.size(1), src.size(0)) == 1
        for i in range(len(lenths)):
            lenth = lenths[i]
            for j in range(lenth):
                mask[i][j] = False
        return mask

    def forward(self, src, mask):
        '''
        src: (max_sentence_num, batch (number of documents), 300)
        '''
        self.src_mask = mask
        src = src * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, src_key_padding_mask=self.src_mask)
        # output = self.decoder(output)
        return output

Final code

import numpy as np
import torch.nn.functional as F


class segmentmodel(nn.Module):
    def __init__(self, ninp=300, nhead=4, nhid=128, nlayers=6, dropout=0.5):
        super(segmentmodel, self).__init__()
        self.first_layer = First_TransformerModel(ninp, nhead, nhid, nlayers, dropout)
        self.second_layer = Second_TransformerModel(ninp, nhead, nhid, nlayers, dropout)
        self.linear = nn.Linear(ninp, 2)

    def pad(self, s, max_length):
        s_length = s.size()[0]
        v = s.unsqueeze(0).unsqueeze(0)
        padded = F.pad(v, (0, 0, 0, max_length - s_length))  # (1, 1, max_length, 300)
        shape = padded.size()
        return padded.view(shape[2], 1, shape[3])  # (max_length, 1, 300)

    def pad_document(self, d, max_document_length):
        d_length = d.size()[0]
        v = d.unsqueeze(0).unsqueeze(0)
        padded = F.pad(v, (0, 0, 0, max_document_length - d_length))  # (1, 1, max_document_length, 300)
        shape = padded.size()
        return padded.view(shape[2], 1, shape[3])  # (max_document_length, 1, 300)

    def forward(self, batch):
        batch_size = len(batch)
        sentences_per_doc = []
        all_batch_sentences = []
        for document in batch:
            all_batch_sentences.extend(document)
            sentences_per_doc.append(len(document))
        lengths = [s.size()[0] for s in all_batch_sentences]
        max_length = max(lengths)
        # logger.debug('Num sentences: %s, max sentence length: %s',
        #              sum(sentences_per_doc), max_length)
        padded_sentences = [self.pad(s, max_length) for s in all_batch_sentences]
        big_tensor = torch.cat(padded_sentences, 1)  # (max_length, num_of_sentences, 300)

        mask = self.first_layer._generate_square_subsequent_mask(big_tensor, lengths).cuda()
        firstlayer_out = self.first_layer(src=big_tensor, mask=mask)
        # firstlayer_out: (num_of_sentences, 300)

        # split the sentence vectors back out per document
        encoded_documents = []
        index = 0
        for sentences_count in sentences_per_doc:
            end_index = index + sentences_count
            encoded_documents.append(firstlayer_out[index:end_index, :])
            index = end_index

        # document padding
        doc_sizes = [doc.size()[0] for doc in encoded_documents]
        max_doc_size = np.max(doc_sizes)
        padded_docs = [self.pad_document(d, max_doc_size) for d in encoded_documents]
        docs_tensor = torch.cat(padded_docs, 1)  # (max_doc_size, num_of_docs, 300)

        mask = self.second_layer._generate_square_subsequent_mask(docs_tensor, doc_sizes).cuda()
        second_layer_out = self.second_layer(src=docs_tensor, mask=mask)

        # drop the last sentence of each document
        doc_outputs = []
        for i, doc_len in enumerate(doc_sizes):
            doc_outputs.append(second_layer_out[0:doc_len - 1, i, :])  # -1 to remove the last prediction
        sentence_outputs = torch.cat(doc_outputs, 0)  # (remaining num_of_sentences, 300)

        out = self.linear(sentence_outputs)
        return out
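
        A hypothetical end-to-end sketch of how this model is fed (each document is a list of per-sentence embedding tensors; a GPU is assumed because the forward pass moves the masks to CUDA):

model = segmentmodel(ninp=300, nhead=4, nhid=128, nlayers=2).cuda()
doc1 = [torch.rand(12, 300).cuda(), torch.rand(7, 300).cuda(), torch.rand(9, 300).cuda()]
doc2 = [torch.rand(5, 300).cuda(), torch.rand(11, 300).cuda()]
out = model([doc1, doc2])
# one 2-way score per sentence, excluding each document's last sentence: (3 - 1) + (2 - 1) = 3 rows
print(out.shape)  # torch.Size([3, 2])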

Note that the sentence information extracted by the first layer is represented by a single vector from its output: from the seq_length x N x 300 output we take the first element along the seq_length dimension as the sentence representation, giving an N x 300 tensor.
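
        In other words, a small sketch of this pooling choice:

encoder_out = self.transformer_encoder(src, src_key_padding_mask=mask)  # (seq_length, N, 300)
sentence_repr = encoder_out[0, :, :]                                    # (N, 300)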
Source: https://blog.csdn.net/qq_43645301/article/details/109279616
