import torch
import torch.nn as nn
import torch.nn.functional as F
import math


class Attention(nn.Module):
    def forward(self, Q, K, V, mask=None, dropout=None):
        # scaled dot-product attention: (Q @ K^T) / sqrt(d_k)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(Q.size(-1))
        if mask is not None:
            # Sentences in a batch have different lengths, so the shorter ones are padded with many zeros.
            # When such a sentence goes through self-attention, after computing the scores we set the
            # dependencies of the real tokens on the padded positions to a very large negative value,
            # so that when softmax produces the attention weight vector these entries become (almost) 0
            # and do not affect training.
            scores = scores.masked_fill(mask == 0, -1e9)
        attention = F.softmax(scores, dim=-1)
        if dropout is not None:
            attention = dropout(attention)
        return torch.matmul(attention, V), attention
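A minimal usage sketch of this module (the tensor sizes and the padding mask below are illustrative assumptions, not part of the original code); the point is that padded key positions end up with attention weights of roughly zero:

import torch
import torch.nn as nn

# assumed toy sizes: batch of 2 sentences, max length 4, head dimension 8
Q = torch.randn(2, 4, 8)
K = torch.randn(2, 4, 8)
V = torch.randn(2, 4, 8)

# hypothetical padding mask: 1 = real token, 0 = padding; unsqueeze so it broadcasts over the query axis
mask = torch.tensor([[1, 1, 1, 1],
                     [1, 1, 0, 0]]).unsqueeze(1)   # (2, 1, 4)

attn = Attention()                                 # the class defined above
out, weights = attn(Q, K, V, mask=mask, dropout=nn.Dropout(0.1))
print(out.shape)          # torch.Size([2, 4, 8])
print(weights[1, 0, 2:])  # ~0: the second sentence's padded keys receive no attention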
The Q, K and V projections can also be moved into the Attention module itself, so that the matrix multiplications are carried out inside Attention:

class Attention(nn.Module):
    def __init__(self, d_model, d_q, dropout=0.1):
        super(Attention, self).__init__()
        self.d_model = d_model
        self.d_q = d_q
        self.linear_q = nn.Linear(d_model, d_q)
        self.linear_k = nn.Linear(d_model, d_q)
        self.linear_v = nn.Linear(d_model, d_q)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, mask=None):
        batch_size, seq_len, d_model = x.shape
        # Q, K, V: (batch_size, seq_len, d_q), e.g. (batch_size, seq_len, 64)
        Q = self.linear_q(x)
        K = self.linear_k(x)
        V = self.linear_v(x)
        # scores: (batch_size, seq_len, seq_len)
        scores = torch.matmul(Q, K.transpose(1, 2)) / math.sqrt(Q.size(-1))
        if mask is not None:
            # Shorter sentences in a batch are padded with many zeros; during self-attention
            # the scores at those padded positions are set to a very large negative value,
            # so that after softmax the corresponding attention weights become (almost) 0
            # and do not affect training.
            scores = scores.masked_fill(mask == 0, -1e9)
        attention = F.softmax(scores, dim=-1)
        attention = self.dropout(attention)
        return torch.matmul(attention, V), attention
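A quick shape check for this single-head version (d_model = 768 and d_q = 64 follow the defaults mentioned in the multi-head code below; the input tensor is an assumption for illustration). Note that the output lives in the d_q = 64 subspace rather than d_model, which is exactly what the multi-head variants below address by concatenating the heads and applying output_linear:

import torch

attn = Attention(d_model=768, d_q=64)
x = torch.randn(2, 10, 768)      # (batch_size, seq_len, d_model), assumed sizes
out, weights = attn(x)
print(out.shape)                 # torch.Size([2, 10, 64])
print(weights.shape)             # torch.Size([2, 10, 10])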
import torch.nn as nn

from .single import Attention


class MultiHeadedAttention(nn.Module):
    """
    Take in model size and number of heads.
    """

    def __init__(self, h, d_model, dropout=0.1):
        super().__init__()
        assert d_model % h == 0

        # We assume d_v always equals d_k
        self.d_k = d_model // h
        self.h = h

        self.linear_layers = nn.ModuleList([nn.Linear(d_model, d_model) for _ in range(3)])
        self.output_linear = nn.Linear(d_model, d_model)
        self.attention = Attention()

        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)

        # 1) Do all the linear projections in batch from d_model => h x d_k
        query, key, value = [l(x).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
                             for l, x in zip(self.linear_layers, (query, key, value))]

        # 2) Apply attention on all the projected vectors in batch.
        x, attn = self.attention(query, key, value, mask=mask, dropout=self.dropout)

        # 3) "Concat" using a view and apply a final linear.
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.h * self.d_k)

        return self.output_linear(x)
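A hedged usage sketch, assuming the scaled dot-product Attention from the first snippet is what .single exports and that both classes are in scope; the sizes and the all-ones mask are illustrative:

import torch

mha = MultiHeadedAttention(h=12, d_model=768)
x = torch.randn(2, 10, 768)            # (batch_size, seq_len, d_model), assumed sizes
mask = torch.ones(2, 1, 1, 10)         # broadcasts over heads and query positions
out = mha(x, x, x, mask=mask)          # self-attention: query = key = value
print(out.shape)                       # torch.Size([2, 10, 768])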
class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model, dropout=0.1):
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        # dimension of the token embedding space, 768 by default
        self.d_model = d_model
        # number of heads, 12 by default
        self.h = h
        self.d_q = d_model // h
        self.d_k = d_model // h  # 64 by default
        self.d_v = d_model // h
        self.linear_q = nn.Linear(d_model, d_model)
        self.linear_k = nn.Linear(d_model, d_model)
        self.linear_v = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(p=dropout)
        self.output_linear = nn.Linear(d_model, d_model)

    def forward(self, x, mask=None):
        # For a single head:
        #   Q -> (batch_size, seq_len, 64)
        #   K -> (batch_size, seq_len, 64)
        #   V -> (batch_size, seq_len, 64)
        #   Q @ K.T -> (batch_size, seq_len, seq_len)
        # With multiple heads, we want this process to happen independently in the
        # subspace represented by each head.
        batch_size, seq_len, d_model = x.shape
        # nn.Linear only maps the last dimension, so the reshape below splits that last
        # d_model dimension into h * d_q (or d_k / d_v).
        # After swapping axes 1 and 2 the shape becomes (batch_size, h, seq_len, d_q),
        # i.e. within each head's subspace we obtain seq_len x d_q tensors Q, K, V.
        Q = self.linear_q(x).reshape(batch_size, seq_len, self.h, self.d_q).transpose(1, 2)  # (batch, nh, n, dk)
        K = self.linear_k(x).reshape(batch_size, seq_len, self.h, self.d_k).transpose(1, 2)  # (batch, nh, n, dk)
        V = self.linear_v(x).reshape(batch_size, seq_len, self.h, self.d_v).transpose(1, 2)  # (batch, nh, n, dv)
        # (batch_size, h, seq_len, seq_len)
        scores = torch.matmul(Q, K.transpose(2, 3)) / math.sqrt(self.d_q)
        if mask is not None:
            # same padding-mask trick as above; the mask must broadcast to (batch, h, seq_len, seq_len)
            scores = scores.masked_fill(mask == 0, -1e9)
        # (batch_size, h, seq_len, seq_len)
        attention = torch.softmax(scores, dim=-1)  # (batch, nh, n, n)
        attention = self.dropout(attention)
        # (batch_size, h, seq_len, d_v)
        result = torch.matmul(attention, V)
        # (batch_size, seq_len, h, d_v) -> reshape -> (batch, seq_len, d_model)
        result = result.transpose(1, 2).reshape(batch_size, seq_len, self.d_model)  # (batch, n, d_model)
        return self.output_linear(result)
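Usage of this single-input variant is the same, except Q, K and V are all derived from x inside forward; a short sketch with assumed sizes:

import torch

mha = MultiHeadedAttention(h=12, d_model=768)
x = torch.randn(2, 10, 768)    # (batch_size, seq_len, d_model), assumed sizes
out = mha(x)                   # pass a mask broadcastable to (batch, h, seq_len, seq_len) if needed
print(out.shape)               # torch.Size([2, 10, 768]) -- same shape as the input

Because the output shape matches the input shape, the module can be stacked or dropped into a larger model without any extra reshaping.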