赞
踩
本专栏主要是深度学习/自动驾驶相关的源码实现,获取全套代码请参考
Multi-Head Attention是一种注意力机制,是Transformer的核心机制,就是图中黄色框内的部分。
Multi-Head Attention的原理是通过将模型分为多个头,形成多个子空间,让模型关注不同方面的信息。每个头独立进行注意力运算,得到一个注意力权重矩阵。输出的结果再通过线性变换和拼接操作组合在一起。这样可以提高模型的表示能力和泛化性能。
在Multi-Head Attention中,每个头的权重矩阵是随机初始化生成的,并在训练过程中通过梯度下降等优化算法进行更新。通过这种方式,模型可以学习到如何将输入序列的不同部分关联起来,从而捕获更多的上下文信息。
总之,Multi-Head Attention通过将模型分为多个头,形成多个子空间,让模型关注不同方面的信息,提高了模型的表示能力和泛化性能。它的源码实现基于Scaled Dot-Product Attention,通过并行运算和组合输出来实现多头注意力机制。
from typing import Optional, Tuple

import torch
from torch import nn


class MultiheadAttention(nn.Module):
    """Multi-head scaled dot-product attention with a residual shortcut.

    Inputs are sequence-first: ``query`` is ``[tgt_len, batch, embed_dim]`` and
    ``key``/``value`` are ``[src_len, batch, embed_dim]``.  The embedding is
    split into ``num_heads`` sub-spaces of size ``embed_dim // num_heads``,
    attention is computed independently per head, the heads are concatenated
    and mixed by a linear layer, and the result is added to ``identity``.

    Args:
        embed_dim: Total feature dimension of the model.
        num_heads: Number of attention heads; must divide ``embed_dim``.
        att_dropout: Dropout probability applied to the attention
            probabilities (after softmax).
        out_dropout: Dropout probability applied to the fused output before
            the shortcut is added.
        average_attn_weights: If True, the returned attention map is averaged
            over heads; otherwise the per-head maps are returned.
        use_separate_proj_weight: If False (default) a packed input projection
            (``in_proj_weight`` / ``in_proj_bias``) is created and applied to
            query, key and value.  If True no input projection is created and
            the inputs are used as given.
        device: Device for the created parameters.
        dtype: Dtype for the created parameters.
    """

    def __init__(self, embed_dim, num_heads, att_dropout=0.1, out_dropout=0.1,
                 average_attn_weights=True, use_separate_proj_weight=False,
                 device=None, dtype=None):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.att_dropout = nn.Dropout(att_dropout)
        self.out_dropout = nn.Dropout(out_dropout)
        self.average_attn_weights = average_attn_weights

        self.head_dim = embed_dim // num_heads
        assert self.embed_dim == self.num_heads * self.head_dim, \
            'embed_dim <{}> must be divisible by num_heads <{}>'.format(self.embed_dim, self.num_heads)
        # Scores are divided by sqrt(head_dim) before the softmax.
        self.scale = self.head_dim ** 0.5

        factory_kwargs = {'device': device, 'dtype': dtype}
        # Linear layer that mixes the concatenated heads.
        self.fuse_heads = nn.Linear(self.embed_dim, self.embed_dim, **factory_kwargs)

        self.use_separate_proj_weight = use_separate_proj_weight
        if not use_separate_proj_weight:
            # Packed projection: rows [0:E] -> query, [E:2E] -> key, [2E:3E] -> value.
            self.in_proj_weight = nn.Parameter(torch.empty((3 * embed_dim, embed_dim), **factory_kwargs))
            self.in_proj_bias = nn.Parameter(torch.empty(3 * embed_dim, **factory_kwargs))
        else:
            # Keep the attributes defined so later accesses never raise AttributeError.
            self.register_parameter('in_proj_weight', None)
            self.register_parameter('in_proj_bias', None)
        self._reset_parameters()

    def _reset_parameters(self):
        """Xavier-initialise the packed input projection and zero its bias, if present."""
        if self.in_proj_weight is not None:
            nn.init.xavier_uniform_(self.in_proj_weight)
        if self.in_proj_bias is not None:
            nn.init.constant_(self.in_proj_bias, 0.)

    def forward(self,
                query: torch.Tensor,
                key: torch.Tensor,
                value: torch.Tensor,
                identity: Optional[torch.Tensor] = None,
                query_pos: Optional[torch.Tensor] = None,
                key_pos: Optional[torch.Tensor] = None,
                use_separate_proj_weight: bool = False) -> Tuple[torch.Tensor, torch.Tensor]:
        """Apply multi-head attention and add the shortcut.

        Args:
            query: Tensor of shape ``[tgt_len, batch, embed_dim]``.
            key: Tensor of shape ``[src_len, batch, embed_dim]``.
            value: Tensor with the same shape as ``key``.
            identity: Tensor added to the output as the shortcut.  Defaults to
                ``query`` *before* the positional encoding is added, so the
                positional encoding is not carried into the next layer twice.
            query_pos: Optional positional encoding added to ``query``.
            key_pos: Optional positional encoding added to ``key``.
            use_separate_proj_weight: Accepted for signature compatibility and
                ignored; the flag given to the constructor decides whether the
                input projection is applied.

        Returns:
            A tuple ``(output, attention)``.  ``output`` has shape
            ``[tgt_len, batch, embed_dim]``.  ``attention`` holds the
            post-dropout attention probabilities with shape
            ``[batch, tgt_len, src_len]`` when ``average_attn_weights`` is True,
            otherwise ``[batch, num_heads, tgt_len, src_len]``.
        """
        assert query.dim() == 3 and key.dim() == 3 and value.dim() == 3
        assert key.shape == value.shape, f"key shape {key.shape} does not match value shape {value.shape}"

        tgt_len, bsz, embed_dim = query.shape
        src_len = key.shape[0]

        # The shortcut uses the features without positional encoding.  No
        # in-place operation touches ``query`` below, so no copy is needed.
        if identity is None:
            identity = query

        if query_pos is not None:
            query = query + query_pos
        if key_pos is not None:
            key = key + key_pos

        # Input projection (needed e.g. when the caller passes q = k = v).
        if not self.use_separate_proj_weight:
            assert self.in_proj_weight is not None, "use_separate_proj_weight is False but in_proj_weight is None"
            w_q, w_k, w_v = self.in_proj_weight.chunk(3, dim=0)
            if self.in_proj_bias is None:
                b_q = b_k = b_v = None
            else:
                b_q, b_k, b_v = self.in_proj_bias.chunk(3, dim=0)
            query = nn.functional.linear(query, w_q, b_q)
            key = nn.functional.linear(key, w_k, b_k)
            value = nn.functional.linear(value, w_v, b_v)

        # Split the feature axis into heads: [len, b, n_h * d_h] -> [b, n_h, len, d_h].
        query = query.reshape(tgt_len, bsz, self.num_heads, self.head_dim).permute(1, 2, 0, 3)
        key = key.reshape(src_len, bsz, self.num_heads, self.head_dim).permute(1, 2, 0, 3)
        value = value.reshape(src_len, bsz, self.num_heads, self.head_dim).permute(1, 2, 0, 3)

        # Scaled dot-product attention: [b, n_h, tgt_len, src_len].
        attention = (query @ key.transpose(-2, -1)) / self.scale
        attention = torch.softmax(attention, dim=-1)  # each row sums to 1
        attention = self.att_dropout(attention)

        # [b, n_h, tgt_len, src_len] @ [b, n_h, src_len, d_h] -> [b, n_h, tgt_len, d_h]
        output = attention @ value
        # Merge heads back into sequence-first layout.  The target axis must
        # be moved to the front *before* flattening, otherwise batch and
        # target positions get interleaved: [b, n_h, tgt, d_h] -> [tgt, b, n_h, d_h].
        output = output.permute(2, 0, 1, 3).reshape(tgt_len, bsz, embed_dim)

        # Mix information across heads, then regularise and add the shortcut.
        output = self.fuse_heads(output)
        output = self.out_dropout(output)
        output = output + identity

        if self.average_attn_weights:
            attention = attention.mean(dim=1)

        return output, attention


if __name__ == '__main__':
    query = torch.rand(size=(10, 2, 64))
    key = torch.rand(size=(5, 2, 64))
    value = torch.rand(size=(5, 2, 64))
    query_pos = torch.rand(size=(10, 2, 64))
    key_pos = torch.rand(size=(5, 2, 64))

    att = MultiheadAttention(64, 4)
    # Returns the attended features and the attention matrix.
    output, attention = att(query=query, key=key, value=value, query_pos=query_pos, key_pos=key_pos)
在实现中,我参考pytorch在模块内部加入了输入映射,具体作用请参考下一篇博客。
如需获取全套代码请参考
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。