赞
踩
Norm,也即 Normalization,已经是深度神经网络模型中非常常规的操作了,但它背后的实现,原理和作用等,其实我们可以理解的更细致,本文会以最常用的 BatchNorm 和 LayerNorm 为例(其他 Norm 方法大同小异),通过 Q&A 的形式,去深入理解关于 Norm 的细节知识点。
所有文字不如代码准确,决定先上一个简化版的MyBN1d和MyLN镇楼:
MyBN1d:
- import torch.nn as nn
- import torch
-
class MyBN1d(nn.Module):
    """Minimal re-implementation of ``nn.BatchNorm1d`` for (N, C, L) inputs.

    In training mode it normalizes with the current batch's statistics and
    updates exponential running averages; in eval mode it normalizes with
    the stored running statistics.

    Args:
        momentum: weight of the new batch statistic in the running-average
            update (PyTorch convention: new = (1 - m) * old + m * batch).
        eps: small constant added to the variance to avoid division by zero.
        feat_dim: number of channels C (``num_features`` in nn.BatchNorm1d).
    """

    def __init__(self, momentum=0.1, eps=1e-5, feat_dim=2):
        super(MyBN1d, self).__init__()
        # Momentum used when updating self._running_*.
        self._momentum = momentum
        # Keeps the denominator of the normalization away from zero.
        self._eps = eps

        # running_mean / running_var are saved in the state_dict but are not
        # trained, hence register_buffer rather than nn.Parameter.  Shaped
        # (1, C, 1) so they broadcast directly against (N, C, L) inputs.
        self.register_buffer('_running_mean', torch.zeros(1, feat_dim, 1))
        self.register_buffer('_running_var', torch.ones(1, feat_dim, 1))

        # Learnable affine parameters (gamma / beta).
        self._weight = nn.Parameter(torch.ones(1, feat_dim, 1))
        self._bias = nn.Parameter(torch.zeros(1, feat_dim, 1))

    def forward(self, x):
        # self.training is maintained by nn.Module; net.train() / net.eval()
        # flip it.
        if self.training:
            # Per-channel batch statistics over the batch and length dims.
            # BN normalizes with the *biased* variance (correction=0).
            x_mean = x.mean(dim=(0, 2), keepdim=True)
            x_var = x.var(dim=(0, 2), keepdim=True, correction=0)

            # Running-average update: new = (1 - m) * old + m * batch_stat,
            # written as old += m * (batch_stat - old).
            # Bug fixes vs. the original: the sign of the update was flipped,
            # and running_var was updated from x_mean instead of x_var.
            # no_grad keeps the in-place buffer updates out of the autograd
            # graph (x_mean / x_var carry grad_fn when x requires grad).
            with torch.no_grad():
                # Elements reduced per channel; PyTorch tracks the *unbiased*
                # variance in running_var, hence the n / (n - 1) correction.
                # (Assumes N * L > 1, as BN training requires anyway.)
                n = x.numel() / x.shape[1]
                self._running_mean += self._momentum * (x_mean - self._running_mean)
                self._running_var += self._momentum * (x_var * n / (n - 1) - self._running_var)

            # Training mode normalizes with the *batch* statistics,
            # never with the running averages.
            x_hat = (x - x_mean) / torch.sqrt(x_var + self._eps)
        else:
            # Eval mode: normalize with the stored running statistics.
            x_hat = (x - self._running_mean) / torch.sqrt(self._running_var + self._eps)
        return self._weight * x_hat + self._bias
-
# In CV code, feat_num / num_features / hidden_size all refer to the channel
# dimension — dim 1 of an (N, C, L) tensor.
channels = 3
sample = torch.randn(2, channels, 5)

# Reference implementation from PyTorch.
torch_bn = nn.BatchNorm1d(channels)
print(torch_bn(sample))

# Our hand-rolled version, normalizing the same input.
custom_bn = MyBN1d(feat_dim=channels)
print(custom_bn(sample))

'''
两个输出都是一样的:
tensor([[[-0.9176, 1.4579, 0.2473, 0.7218, 1.0444],
[-0.7802, 0.3168, 0.8793, -0.4985, 2.3281],
[ 1.6315, 0.4317, 0.1170, -0.3623, -1.5179]],
[[ 0.8724, 0.0181, -0.9045, -0.7437, -1.7963],
[ 0.0180, -0.1456, -0.6095, -1.5661, 0.0578],
[-1.6694, 0.4632, -0.6507, 0.9601, 0.5968]]],
grad_fn=<NativeBatchNormBackward0>)
tensor([[[-0.9176, 1.4579, 0.2473, 0.7218, 1.0444],
[-0.7802, 0.3168, 0.8793, -0.4985, 2.3281],
[ 1.6315, 0.4317, 0.1170, -0.3623, -1.5179]],
[[ 0.8724, 0.0181, -0.9045, -0.7437, -1.7963],
[ 0.0180, -0.1456, -0.6095, -1.5661, 0.0578],
[-1.6694, 0.4632, -0.6507, 0.9601, 0.5968]]],
grad_fn=<AddBackward0>)
'''
MyLN:
- import torch
- from torch import nn
-
class MyLN(nn.Module):
    """Minimal re-implementation of ``nn.LayerNorm``.

    Normalizes over the trailing ``normalized_shape`` dimensions of the
    input, then optionally applies a learnable elementwise affine transform.

    Args:
        normalized_shape: an int (last dim only) or a sequence of ints /
            ``torch.Size`` giving the trailing dims to normalize over.
        eps: small constant added to the variance to avoid division by zero.
        elementwise_affine: whether to learn a per-element scale and shift.
    """

    def __init__(self, normalized_shape,
                 eps: float = 1e-5,
                 elementwise_affine: bool = True):
        super(MyLN, self).__init__()
        # Accept an int like nn.LayerNorm does, and store any sequence as a
        # plain tuple — the original kept the argument as-is, so passing a
        # list made the shape check below (list vs. torch.Size) always fail.
        if isinstance(normalized_shape, int):
            normalized_shape = (normalized_shape,)
        self.normalized_shape = tuple(normalized_shape)
        self.eps = eps
        self.elementwise_affine = elementwise_affine
        if self.elementwise_affine:
            # Learnable scale / shift, one element per normalized position.
            self.weight = nn.Parameter(torch.ones(self.normalized_shape))
            self.bias = nn.Parameter(torch.zeros(self.normalized_shape))
        else:
            # Registered as None so attribute access and state_dict behave
            # consistently in both configurations.
            self.register_parameter('weight', None)
            self.register_parameter('bias', None)

    def forward(self, x: torch.Tensor):
        # The trailing dims of x must match normalized_shape exactly.
        # (Raise instead of assert: asserts are stripped under ``-O``.)
        trailing = tuple(x.shape[-len(self.normalized_shape):])
        if trailing != self.normalized_shape:
            raise ValueError(
                f'expected trailing dims {self.normalized_shape}, got {trailing}')
        # Negative indices of the dims to reduce, e.g. (-1, -2) for a
        # 2-element normalized_shape.
        dims = tuple(-(i + 1) for i in range(len(self.normalized_shape)))
        # Mean and *biased* variance (correction=0) over those dims,
        # keepdim=True so they broadcast back over x.  Using the centered
        # variance instead of the original E[x^2] - E[x]^2 form avoids
        # catastrophic cancellation for large-mean inputs; mathematically
        # they are identical.
        mean = x.mean(dim=dims, keepdim=True)
        var = x.var(dim=dims, keepdim=True, correction=0)
        x_norm = (x - mean) / torch.sqrt(var + self.eps)
        # Optional learnable elementwise affine transform.
        if self.elementwise_affine:
            x_norm = self.weight * x_norm + self.bias
        return x_norm
-
if __name__ == '__main__':
    sample = torch.randn(2, 3, 5)

    # Our hand-rolled LayerNorm over the trailing (C, L) dims.
    custom_ln = MyLN(sample.shape[1:])
    print(custom_ln(sample))

    # PyTorch's reference implementation on the same input.
    torch_ln = nn.LayerNorm(sample.shape[1:])
    print(torch_ln(sample))
'''
两个输出都是一样的:
tensor([[[-0.4581, 1.5668, 0.6686, -0.4423, -0.7992],
[ 0.1808, 0.7245, 0.3380, -1.1207, 1.3641],
[ 0.4380, -2.3911, -0.1461, -0.7776, 0.8543]],
[[ 1.3458, 0.4072, -2.2993, -0.7033, -0.7776],
[ 0.3645, 0.2430, 0.0801, 0.5956, 0.5822],
[ 0.1153, 0.9114, -0.0091, 1.0779, -1.9336]]],
grad_fn=<AddBackward0>)
tensor([[[-0.4581, 1.5668, 0.6686, -0.4423, -0.7992],
[ 0.1808, 0.7245, 0.3380, -1.1207, 1.3641],
[ 0.4380, -2.3911, -0.1461, -0.7776, 0.8543]],
[[ 1.3458, 0.4072, -2.2993, -0.7033, -0.7776],
[ 0.3645, 0.2430, 0.0801, 0.5956, 0.5822],
[ 0.1153, 0.9114, -0.0091, 1.0779, -1.9336]]],
grad_fn=<NativeLayerNormBackward0>)
'''
BN在训练和测试时的差异
对于 BN,在训练时,是对每一个 batch 的训练数据进行归一化,也即用每一批数据的均值和方差。
而在测试时,比如进行一个样本的预测,就并没有 batch 的概念,因此,这个时候用的均值和方差是在训练过程中通过滑动平均得到的均值和方差,这个会和模型权重一起,在训练完成后一并保存下来。
对于 BN,是对每一批数据进行归一化到一个相同的分布,而每一批数据的均值和方差会有一定的差别,而不是用固定的值,这个差别实际上也能够增加模型的鲁棒性,并会在一定程度上减少过拟合。
但是一批数据和全量数据的均值和方差相差太多,又无法较好地代表训练集的分布,因此,BN 一般要求将训练集完全打乱,并用一个较大的 batch 值,去缩小与全量数据的差别。
对于卷积层,例如对图像做卷积时,我们往往不会把每一个像素单独当作一个特征来归一化;实际上,每一个像素位置上的取值才对应一个特征,并且同一通道内的所有像素共享同一组卷积核参数,因此 BN 在卷积层上是以通道为单位,在 (N, H, W) 维度上统计均值和方差,同一通道共享同一组归一化统计量和仿射参数。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。