Parameter initialization matters a great deal in neural networks: a suitable initialization makes training easier and improves accuracy. No single initialization fits every case; the choice should be analyzed together with the network architecture, the activation function, and the optimization algorithm.
This post records five ways to initialize parameters (a quick numeric check of the resulting weight scales follows the list):
1. Initialize all parameters to 0.
2. Random initialization: np.random.randn(layers_dims[l], layers_dims[l-1]) * 0.01. The factor 0.01 keeps the initial weights small, which helps avoid vanishing gradients caused by saturated activations. np.random.randn draws from the standard normal distribution (mean 0, variance 1); in statistics, the variance is the average of the squared deviations of the samples from the mean.
3. Xavier initialization: np.random.randn(layers_dims[l], layers_dims[l - 1]) * np.sqrt(1 / layers_dims[l - 1]). This works well when the activation function is tanh.
4. He initialization: np.random.randn(layers_dims[l], layers_dims[l - 1]) * np.sqrt(2 / layers_dims[l - 1]). This works well when the activation function is relu.
5. np.random.randn(layers_dims[l], layers_dims[l - 1]) * np.sqrt(2 / (layers_dims[l] + layers_dims[l - 1])), which scales by both the fan-in and the fan-out of the layer.
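As a quick sanity check of the scale factors above (a minimal sketch; the widths 500 and 300 are arbitrary example values, not taken from the code below), the standard deviation of the weights produced by each scheme can be printed:

import numpy as np

np.random.seed(0)
fan_in, fan_out = 500, 300  # arbitrary example layer widths
schemes = {
    "random * 0.01":      np.random.randn(fan_out, fan_in) * 0.01,
    "xavier sqrt(1/n)":   np.random.randn(fan_out, fan_in) * np.sqrt(1 / fan_in),
    "he sqrt(2/n)":       np.random.randn(fan_out, fan_in) * np.sqrt(2 / fan_in),
    "sqrt(2/(in+out))":   np.random.randn(fan_out, fan_in) * np.sqrt(2 / (fan_in + fan_out)),
}
for name, W in schemes.items():
    print("%-18s std = %.4f" % (name, W.std()))

The all-zero scheme is omitted because its standard deviation is trivially 0; the printed values simply show how each factor rescales the unit-variance output of np.random.randn.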
The complete code is as follows:
# Compare several initialization methods
import numpy as np
import matplotlib.pyplot as plt


# Initialize all parameters to zero
def initialize_parameters_zeros(layers_dims):
    """
    Arguments:
    layers_dims -- python array (list) containing the size of each layer.
    Returns:
    parameters -- python dictionary containing your parameters "W1", "b1", ..., "WL", "bL":
                    W1 -- weight matrix of shape (layers_dims[1], layers_dims[0])
                    b1 -- bias vector of shape (layers_dims[1], 1)
                    ...
                    WL -- weight matrix of shape (layers_dims[L], layers_dims[L-1])
                    bL -- bias vector of shape (layers_dims[L], 1)
    """
    parameters = {}
    L = len(layers_dims)  # number of layers in the network

    for l in range(1, L):
        parameters['W' + str(l)] = np.zeros((layers_dims[l], layers_dims[l - 1]))
        parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))
    return parameters
# Random initialization scaled by 0.01
def initialize_parameters_random(layers_dims):
    """
    Arguments:
    layers_dims -- python array (list) containing the size of each layer.
    Returns:
    parameters -- python dictionary containing your parameters "W1", "b1", ..., "WL", "bL":
                    W1 -- weight matrix of shape (layers_dims[1], layers_dims[0])
                    b1 -- bias vector of shape (layers_dims[1], 1)
                    ...
                    WL -- weight matrix of shape (layers_dims[L], layers_dims[L-1])
                    bL -- bias vector of shape (layers_dims[L], 1)
    """
    np.random.seed(3)  # This seed makes sure your "random" numbers will be the same as ours
    parameters = {}
    L = len(layers_dims)  # integer representing the number of layers
    for l in range(1, L):
        parameters['W' + str(l)] = np.random.randn(layers_dims[l], layers_dims[l - 1]) * 0.01
        parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))
    return parameters
# Xavier initialization
def initialize_parameters_xavier(layers_dims):
    """
    Arguments:
    layers_dims -- python array (list) containing the size of each layer.
    Returns:
    parameters -- python dictionary containing your parameters "W1", "b1", ..., "WL", "bL":
                    W1 -- weight matrix of shape (layers_dims[1], layers_dims[0])
                    b1 -- bias vector of shape (layers_dims[1], 1)
                    ...
                    WL -- weight matrix of shape (layers_dims[L], layers_dims[L-1])
                    bL -- bias vector of shape (layers_dims[L], 1)
    """
    np.random.seed(3)
    parameters = {}
    L = len(layers_dims)  # integer representing the number of layers
    for l in range(1, L):
        parameters['W' + str(l)] = np.random.randn(layers_dims[l], layers_dims[l - 1]) * np.sqrt(1 / layers_dims[l - 1])
        parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))
    return parameters
# He initialization
def initialize_parameters_he(layers_dims):
    """
    Arguments:
    layers_dims -- python array (list) containing the size of each layer.
    Returns:
    parameters -- python dictionary containing your parameters "W1", "b1", ..., "WL", "bL":
                    W1 -- weight matrix of shape (layers_dims[1], layers_dims[0])
                    b1 -- bias vector of shape (layers_dims[1], 1)
                    ...
                    WL -- weight matrix of shape (layers_dims[L], layers_dims[L-1])
                    bL -- bias vector of shape (layers_dims[L], 1)
    """
    np.random.seed(3)
    parameters = {}
    L = len(layers_dims)  # integer representing the number of layers

    for l in range(1, L):
        parameters['W' + str(l)] = np.random.randn(layers_dims[l], layers_dims[l - 1]) * np.sqrt(2 / layers_dims[l - 1])
        parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))
    return parameters
# Method 5: scale by sqrt(2 / (fan_in + fan_out))
def initialize_parameters_yo(layers_dims):
    """
    Arguments:
    layers_dims -- python array (list) containing the size of each layer.
    Returns:
    parameters -- python dictionary containing your parameters "W1", "b1", ..., "WL", "bL":
                    W1 -- weight matrix of shape (layers_dims[1], layers_dims[0])
                    b1 -- bias vector of shape (layers_dims[1], 1)
                    ...
                    WL -- weight matrix of shape (layers_dims[L], layers_dims[L-1])
                    bL -- bias vector of shape (layers_dims[L], 1)
    """
    np.random.seed(3)
    parameters = {}
    L = len(layers_dims)  # integer representing the number of layers

    for l in range(1, L):
        parameters['W' + str(l)] = np.random.randn(layers_dims[l], layers_dims[l - 1]) * np.sqrt(2 / (layers_dims[l] + layers_dims[l - 1]))
        parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))
    return parameters
def relu(Z):
    """
    :param Z: Output of the linear layer
    :return:
    A: output of activation
    """
    A = np.maximum(0, Z)
    return A
def initialize_parameters(layer_dims):
    """
    :param layer_dims: list, the number of units (dimension) in each layer
    :return: dictionary storing the parameters W1, W2, ..., WL, b1, ..., bL
    """
    np.random.seed(3)
    L = len(layer_dims)  # the number of layers in the network
    parameters = {}
    for l in range(1, L):
        parameters["W" + str(l)] = np.random.randn(layer_dims[l], layer_dims[l - 1]) * np.sqrt(2 / (layer_dims[l - 1] + layer_dims[l]))
        parameters["b" + str(l)] = np.zeros((layer_dims[l], 1))
    return parameters
def forward_propagation(initialization="yo"):
    np.random.seed(3)
    data = np.random.randn(1000, 100000)
    print("data shape : ", data.shape)
    layers_dims = [1000, 800, 500, 300, 200, 100, 10]
    num_layers = len(layers_dims)
    # Initialize parameters dictionary.
    if initialization == "zeros":
        parameters = initialize_parameters_zeros(layers_dims)
    elif initialization == "random":
        parameters = initialize_parameters_random(layers_dims)
    elif initialization == "xavier":
        parameters = initialize_parameters_xavier(layers_dims)
    elif initialization == "he":
        parameters = initialize_parameters_he(layers_dims)
    elif initialization == "yo":
        parameters = initialize_parameters_yo(layers_dims)
    A = data
    for l in range(1, num_layers):
        A_pre = A
        W = parameters["W" + str(l)]
        print("W shape : ", W.shape)
        b = parameters["b" + str(l)]
        # np.dot: for two vectors this is the inner product (multiply element-wise, then sum, giving a scalar);
        # for matrices, the first operand's column count must equal the second's row count: (a x b) * (b x c) = (a x c)
        z = np.dot(W, A_pre) + b
        # A = np.tanh(z)  # tanh activation function
        A = relu(z)  # relu activation function
        print("A shape : ", A.shape)
        print(" A flatten shape: ", A.flatten().shape)
        # Histogram of this layer's activations in one panel of a 2x3 grid
        plt.subplot(2, 3, l)
        plt.hist(A.flatten(), facecolor='g')
        plt.xlim([-1, 1])
        plt.yticks([])
    plt.savefig("save_picture/%s.jpg" % initialization, dpi=500)  # the save_picture directory must already exist
    plt.show()
    plt.close()


if __name__ == '__main__':
    forward_propagation()
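To compare all five schemes, the forward pass above can be run once per initialization (a usage sketch, assuming it replaces the single call in the __main__ block and that the save_picture directory exists):

for init in ["zeros", "random", "xavier", "he", "yo"]:
    forward_propagation(initialization=init)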
Below, method 5 is used to initialize the parameters of LeNet-5:
import torch.nn as nn
import numpy as np
import torch

np.random.seed(1307)


class LeNet(nn.Module):
    def __init__(self, cfg):
        super(LeNet, self).__init__()
        self.features = self.features_layers(cfg)
        self.classifier = self.classifier_layers(cfg)

    def forward(self, x):
        out = self.features(x)
        out = out.view(out.size(0), -1)
        out = self.classifier(out)
        return out
    def features_layers(self, cfg):
        layers = []
        in_channels = 1

        conv1 = nn.Conv2d(in_channels, cfg[0], kernel_size=3, stride=1, padding=1)
        layers += [self.init_weight_bias(conv1, [cfg[0], in_channels, 3, 3], cfg[0])]
        # layers += [conv1]
        layers = self.make_activation(cfg[1], layers)
        layers = self.make_pool(cfg[2], layers)

        conv2 = nn.Conv2d(cfg[0], cfg[3], kernel_size=5)
        layers += [self.init_weight_bias(conv2, [cfg[3], cfg[0], 5, 5], cfg[3])]
        # layers += [conv2]
        layers = self.make_activation(cfg[4], layers)
        layers = self.make_pool(cfg[5], layers)

        return nn.Sequential(*layers)
    def classifier_layers(self, cfg):
        layers = []
        num_classes = 10

        linear1 = nn.Linear(cfg[3] * 5 * 5, cfg[6])
        layers += [self.init_weight_bias(linear1, [cfg[6], cfg[3] * 5 * 5], cfg[6])]
        # layers += [linear1]
        layers = self.make_activation(cfg[7], layers)

        linear2 = nn.Linear(cfg[6], cfg[8])
        layers += [self.init_weight_bias(linear2, [cfg[8], cfg[6]], cfg[8])]
        # layers += [linear2]
        layers = self.make_activation(cfg[9], layers)

        linear3 = nn.Linear(cfg[8], num_classes)
        layers += [self.init_weight_bias(linear3, [num_classes, cfg[8]], num_classes)]
        # layers += [linear3]

        return nn.Sequential(*layers)
    def make_activation(self, activation, layers):
        if activation == "relu":
            layers += [nn.ReLU(inplace=True)]
        elif activation == "sigmoid":
            layers += [nn.Sigmoid()]
        elif activation == "tanh":
            layers += [nn.Tanh()]
        else:
            print("the activation is wrong!")
        return layers

    def make_pool(self, pool, layers):
        if pool == "maxpool":
            layers += [nn.MaxPool2d(2)]
        elif pool == "avgpool":
            layers += [nn.AvgPool2d(2)]
        else:
            print("the pooling type is wrong!")
        return layers
    def init_weight_bias(self, layer, weight_size, bias_size):
        # Method 5: standard-normal weights scaled by sqrt(2 / sum of the weight tensor's dimensions)
        length = len(weight_size)
        if length == 2:
            init_weights = torch.Tensor(np.random.randn(weight_size[0], weight_size[1]) *
                                        np.sqrt(2 / (weight_size[1] + weight_size[0])))
        else:
            init_weights = torch.Tensor(np.random.randn(weight_size[0], weight_size[1],
                                                        weight_size[2], weight_size[3]) *
                                        np.sqrt(2 / (weight_size[3] + weight_size[2] + weight_size[1] + weight_size[0])))
        init_bias = torch.Tensor(np.random.uniform(0, 0, bias_size))  # uniform(0, 0) yields an all-zero bias vector

        layer.weight = nn.Parameter(init_weights)
        layer.bias = nn.Parameter(init_bias)

        return layer
if __name__ == '__main__':
    cfg = [5, "sigmoid", "maxpool", 5, "sigmoid", "maxpool", 50, "relu", 150, "relu"]
    net = LeNet(cfg)
    print(net)
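As a quick check that the custom initialization actually replaced PyTorch's defaults (a minimal sketch, assuming it is appended to the script above so that net is in scope), the standard deviation of each weight tensor can be compared against sqrt(2 / sum of its dimensions), the factor used in init_weight_bias:

import numpy as np

for name, p in net.named_parameters():
    if p.dim() > 1:  # weight tensors only; the biases are all zero
        expected = np.sqrt(2 / sum(p.shape))
        print("%s  std = %.4f  expected ~ %.4f" % (name, p.detach().std().item(), expected))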