前面我们已经简单建立一个分类器的神经网络,虽然训练的效果比较一般,不过这就是一个神经网络大体应该具备的特征,后面的优化也就是基于这个不断进行尝试对某些部分进行优化以提高学习效率,我们接下来跳过Pytorch for former torch users,直接来看learning pytorch with examples
import numpy as np
# Two-layer fully connected network trained with plain numpy.
# The forward pass, the loss and the backward pass are all written by hand.

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialize weights.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y.
    h = x.dot(w1)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(w2)

    # Compute the loss (sum of squared errors) and report it every 100 steps.
    loss = np.square(y_pred - y).sum()
    if t % 100 == 99:
        print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    # ReLU passes the gradient through only where its input was non-negative.
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = x.T.dot(grad_h)

    # Update weights with one gradient-descent step.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
而理解了这个,代码中类似h = x.dot(w1)就好理解了
99 611.8403334325828
199 5.780260334791743
299 0.09678974435224459
399 0.0019321130866979581
499 4.126089452091746e-05
我们之前也介绍过什么是tensor,其实也就和numpy array一样,但不同的是pytorch tensor可以在gpu上跑,速度更快,同样,我们用pytorch tensor来写一个两层的神经网络
import torch
# Two-layer network using PyTorch tensors, with the backward pass still
# written by hand (no autograd).
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0")  # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialize weights.
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y.
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)

    # Compute the loss as a Python float and report it every 100 steps.
    loss = (y_pred - y).pow(2).sum().item()
    if t % 100 == 99:
        print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    # ReLU passes the gradient through only where its input was non-negative.
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)

    # Update weights using gradient descent.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
99 688.8875122070312
199 4.103602886199951
299 0.04172804579138756
399 0.0007906379760242999
499 8.704190258868039e-05
import torch
# Two-layer network where autograd computes the backward pass.
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0")  # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialize weights. requires_grad=True asks autograd to track
# operations on them so gradients can be computed in the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y. Intermediate values are not kept
    # because the backward pass is no longer written by hand.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Compute and print loss using operations on Tensors.
    # loss is a Tensor holding a single value; loss.item() returns that
    # value as a Python number.
    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 99:
        print(t, loss.item())

    # Use autograd to compute the backward pass. This call computes the
    # gradient of loss with respect to all Tensors with requires_grad=True.
    # After this call w1.grad and w2.grad hold the gradient of the loss
    # with respect to w1 and w2 respectively.
    loss.backward()

    # Manually update weights using gradient descent. Wrap in torch.no_grad()
    # because the weights have requires_grad=True, but the update itself
    # must not be tracked by autograd.
    # An alternative way is to operate on weight.data and weight.grad.data;
    # tensor.data shares storage with tensor but doesn't track history.
    # torch.optim.SGD can also be used to achieve this.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights; otherwise
        # the next backward() call would accumulate into them.
        w1.grad.zero_()
        w2.grad.zero_()
99 468.9629821777344
199 2.9594504833221436
299 0.023482277989387512
399 0.0004086267144884914
499 5.1561615691753104e-05
import torch
class MyReLU(torch.autograd.Function):
    """Custom autograd Function implementing ReLU.

    Use it through ``MyReLU.apply(tensor)``; forward and backward are static
    methods that receive a context object ``ctx`` instead of ``self``.
    """

    @staticmethod
    def forward(ctx, input):
        """Return ``input`` clamped below at zero.

        The input is stashed on ``ctx`` so that ``backward`` can tell which
        elements were negative.
        """
        ctx.save_for_backward(input)
        return input.clamp(min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient of the loss with respect to the input.

        ``grad_output`` is the gradient of the loss with respect to the
        output; it is passed through unchanged except where the saved input
        was negative, where it is set to zero.
        """
        input, = ctx.saved_tensors
        grad_input = grad_output.clone()
        grad_input[input < 0] = 0
        return grad_input
# Two-layer network whose ReLU is the custom autograd Function MyReLU.
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0")  # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialize weights.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

# To apply a custom Function, use its .apply method; alias it as 'relu'.
relu = MyReLU.apply

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y, using the custom ReLU instead of
    # the built-in clamp.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Compute and print loss using operations on Tensors.
    # loss.item() returns the single value held in loss as a Python number.
    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 99:
        print(t, loss.item())

    # Use autograd to compute the backward pass. This call computes the
    # gradient of loss with respect to all Tensors with requires_grad=True.
    # After this call w1.grad and w2.grad hold the gradient of the loss
    # with respect to w1 and w2 respectively.
    loss.backward()

    # Manually update weights using gradient descent. Wrap in torch.no_grad()
    # because the weights have requires_grad=True, but the update itself
    # must not be tracked by autograd.
    # An alternative way is to operate on weight.data and weight.grad.data;
    # tensor.data shares storage with tensor but doesn't track history.
    # torch.optim.SGD can also be used to achieve this.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights; otherwise
        # the next backward() call would accumulate into them.
        w1.grad.zero_()
        w2.grad.zero_()
99 664.2792358398438
199 3.2187328338623047
299 0.023685619235038757
399 0.00038831226993352175
499 4.969811925548129e-05
nn中定义了一系列可以近似等同于神经网络层的modules,我们来看看用nn来完成two-layer network:
import torch
# Two-layer network built from torch.nn modules, updated by hand.

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Use the nn package to define our model as a sequence of layers.
# nn.Sequential is a Module which contains other Modules, and applies them
# in sequence to produce its output. Each Linear Module computes output from
# input using a linear function, and holds internal Tensors for its weight
# and bias.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

# The nn package also contains definitions of popular loss functions; in this
# case we will use Mean Squared Error (MSE) as our loss function.
# reduction='sum' sums the squared errors instead of averaging them.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model.
    y_pred = model(x)

    # Compute and print loss.
    loss = loss_fn(y_pred, y)
    if t % 100 == 99:
        print(t, loss.item())

    # Zero the gradients before running the backward pass, because
    # backward() accumulates into the existing .grad buffers.
    model.zero_grad()

    # Backward pass: this call will compute gradients for all learnable
    # parameters in the model.
    loss.backward()

    # Update the weights using gradient descent. The update is wrapped in
    # torch.no_grad() so that it is not tracked by autograd.
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad
99 2.496163845062256
199 0.06094813346862793
299 0.003522129962220788
399 0.0002878477971535176
499 2.720016345847398e-05
import torch
# Two-layer network built from torch.nn modules and trained with an
# optimizer from torch.optim instead of hand-written weight updates.

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Use the nn package to define our model as a sequence of layers.
# nn.Sequential is a Module which contains other Modules, and applies them
# in sequence to produce its output. Each Linear Module computes output from
# input using a linear function, and holds internal Tensors for its weight
# and bias.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

# The nn package also contains definitions of popular loss functions; in this
# case we will use Mean Squared Error (MSE) as our loss function.
loss_fn = torch.nn.MSELoss(reduction='sum')

# The optimizer is given the Tensors it should update (the model parameters)
# and the learning rate; here Adam is used.
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model.
    y_pred = model(x)

    # Compute and print loss.
    loss = loss_fn(y_pred, y)
    if t % 100 == 99:
        print(t, loss.item())

    # Before the backward pass, use the optimizer object to zero all of the
    # gradients for the variables it will update. This is because by
    # default, gradients are accumulated in buffers whenever .backward()
    # is called.
    optimizer.zero_grad()

    # Backward pass: this call will compute gradients for all learnable
    # parameters in the model.
    loss.backward()

    # Calling the step function on an Optimizer makes an update to its
    # parameters.
    optimizer.step()
99 51.58766174316406
199 0.7978752851486206
299 0.0029272770043462515
399 9.20035017770715e-06
499 1.124239989991338e-08
(以下说明对应Adam算法的伪代码。)从while循环往下看,第一行是更新step;第二行是计算梯度;第三行计算一阶矩的估计,即mean均值;第四行计算二阶矩的估计,即variance,和方差类似,都是二阶矩的一种;第五、六行则是对mean和var进行校正,因为mean和var的初始值为0,所以它们会向0偏置,这样处理后会减少这种偏置影响;第七行是梯度下降。注意alpha后的梯度是用一阶矩和二阶矩估计的。
import torch
class TwoLayerNet(torch.nn.Module):
    """Two linear layers with a ReLU (clamp at zero) between them."""

    def __init__(self, D_in, H, D_out):
        """Create the two linear layers.

        Args:
            D_in: input feature size of the first linear layer.
            H: output size of the first layer and input size of the second.
            D_out: output feature size of the second linear layer.
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """Apply linear1, clamp negatives to zero, then apply linear2.

        Args:
            x: Tensor passed to the first linear layer.

        Returns:
            The output Tensor of the second linear layer.
        """
        h_relu = self.linear1(x).clamp(min=0)
        y_pred = self.linear2(h_relu)
        return y_pred
# Train the custom TwoLayerNet Module with SGD.

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Construct our model by instantiating the class defined above.
model = TwoLayerNet(D_in, H, D_out)

# Use Mean Squared Error (MSE) from the nn package as the loss function;
# reduction='sum' sums the squared errors instead of averaging them.
criterion = torch.nn.MSELoss(reduction='sum')

# model.parameters() hands the optimizer the learnable parameters of the
# two nn.Linear modules that are members of the model.
learning_rate = 1e-4
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model.
    y_pred = model(x)

    # Compute and print loss.
    loss = criterion(y_pred, y)
    if t % 100 == 99:
        print(t, loss.item())

    # Before the backward pass, use the optimizer object to zero all of the
    # gradients for the variables it will update. This is because by
    # default, gradients are accumulated in buffers whenever .backward()
    # is called.
    optimizer.zero_grad()

    # Backward pass: this call will compute gradients for all learnable
    # parameters in the model.
    loss.backward()

    # Calling the step function on an Optimizer makes an update to its
    # parameters.
    optimizer.step()
import torch
import random
class TwoLayerNet(torch.nn.Module):
    """Network whose depth is chosen at random on every forward pass.

    NOTE(review): despite its name this is not a fixed two-layer network; the
    middle layer is applied between zero and three times per call. The name
    is kept so existing references keep working.
    """

    def __init__(self, D_in, H, D_out):
        """Create the input, middle and output linear layers.

        Args:
            D_in: input feature size of the input layer.
            H: hidden size; the middle layer maps H features to H features.
            D_out: output feature size of the output layer.
        """
        super().__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """Run the input layer, 0-3 passes of the middle layer, then the output layer.

        The same middle_linear module is reused for every repetition, so its
        weights are shared across the repeated hidden layers. A ReLU (clamp
        at zero) follows the input layer and each middle pass.

        Args:
            x: Tensor passed to the input layer.

        Returns:
            The output Tensor of the output layer.
        """
        h_relu = self.input_linear(x).clamp(min=0)
        # random.randint is inclusive on both ends: 0, 1, 2 or 3 repetitions.
        for _ in range(random.randint(0, 3)):
            h_relu = self.middle_linear(h_relu).clamp(min=0)
        y_pred = self.output_linear(h_relu)
        return y_pred
# Train the dynamic-depth network with SGD.

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Construct our model by instantiating the class defined above.
model = TwoLayerNet(D_in, H, D_out)

# Use Mean Squared Error (MSE) from the nn package as the loss function;
# reduction='sum' sums the squared errors instead of averaging them.
criterion = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model.
    y_pred = model(x)

    # Compute and print loss.
    loss = criterion(y_pred, y)
    if t % 100 == 99:
        print(t, loss.item())

    # Before the backward pass, use the optimizer object to zero all of the
    # gradients for the variables it will update. This is because by
    # default, gradients are accumulated in buffers whenever .backward()
    # is called.
    optimizer.zero_grad()

    # Backward pass: this call will compute gradients for all learnable
    # parameters in the model.
    loss.backward()

    # Calling the step function on an Optimizer makes an update to its
    # parameters.
    optimizer.step()
