Written entirely in numpy: the first layer is a linear layer, the second a ReLU activation, and the third another linear layer.
# Two-layer neural network implemented with numpy only
import numpy as np

N, D_in, H, D_out = 64, 1000, 100, 10
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# w1 = np.random.rand(D_in, H)
# w2 = np.random.rand(H, D_out)
# initializing with rand (uniform on [0, 1)) works badly; use randn instead
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for i in range(500):
    # forward pass
    h = x.dot(w1)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(w2)

    # compute loss (sum of squared errors)
    loss = np.square(y_pred - y).sum()
    print(i, loss)

    # backward pass, with gradients derived by hand
    y_pred_grad = 2.0 * (y_pred - y)
    w2_grad = h_relu.T.dot(y_pred_grad)
    h_relu_grad = y_pred_grad.dot(w2.T)
    h_grad = h_relu_grad.copy()
    h_grad[h < 0] = 0
    w1_grad = x.T.dot(h_grad)

    # update weights
    w1 -= learning_rate * w1_grad
    w2 -= learning_rate * w2_grad
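As a quick sanity check on the hand-derived backward pass (my own addition, not from the course), one entry of w1_grad can be compared against a central-difference estimate; the tiny sizes, the seed, and the loss_fn helper below exist only for this check:

# finite-difference check of the hand-derived gradient (hypothetical helper, not from the course)
import numpy as np

np.random.seed(0)
N, D_in, H, D_out = 4, 10, 6, 3          # tiny sizes so the check runs instantly
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

def loss_fn(w1, w2):
    h_relu = np.maximum(x.dot(w1), 0)
    return np.square(h_relu.dot(w2) - y).sum()

# analytic gradient w.r.t. w1, using the same formulas as above
h = x.dot(w1)
h_relu = np.maximum(h, 0)
y_pred = h_relu.dot(w2)
h_grad = (2.0 * (y_pred - y)).dot(w2.T)
h_grad[h < 0] = 0
w1_grad = x.T.dot(h_grad)

# numerical gradient of the same entry via central differences
eps = 1e-6
w1_plus, w1_minus = w1.copy(), w1.copy()
w1_plus[0, 0] += eps
w1_minus[0, 0] -= eps
numeric = (loss_fn(w1_plus, w2) - loss_fn(w1_minus, w2)) / (2 * eps)

print(w1_grad[0, 0], numeric)   # the two numbers should agree to several decimal places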
# Two-layer neural network implemented with torch tensors
import torch

N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

w1 = torch.randn(D_in, H)
w2 = torch.randn(H, D_out)

learning_rate = 1e-6
for i in range(500):
    # forward pass
    h = x.mm(w1)                      # .mm replaces numpy's .dot
    h_relu = torch.clamp(h, min=0)    # clamp replaces np.maximum
    y_pred = h_relu.mm(w2)

    # compute loss; loss is a single-value tensor, so item() extracts the Python number
    loss = (y_pred - y).pow(2).sum()
    print(i, loss.item())

    # backward pass
    y_pred_grad = 2.0 * (y_pred - y)
    w2_grad = h_relu.t().mm(y_pred_grad)    # transposing in torch is .t()
    h_relu_grad = y_pred_grad.mm(w2.t())
    h_grad = h_relu_grad.clone()            # copying in torch is .clone()
    h_grad[h < 0] = 0
    w1_grad = x.t().mm(h_grad)

    # update weights
    w1 -= learning_rate * w1_grad
    w2 -= learning_rate * w2_grad
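Since the torch version mirrors the numpy one operation for operation, it also helps to know how data moves between the two libraries; this small aside is mine, not from the course:

import numpy as np
import torch

a = np.random.randn(3, 4)
t = torch.from_numpy(a)        # shares memory with the numpy array (no copy)
b = t.numpy()                  # back to numpy, still the same underlying buffer
t_f32 = torch.tensor(a, dtype=torch.float32)  # explicit copy + cast to the usual dtype for nn layers
print(t.dtype, t_f32.dtype)    # torch.float64 torch.float32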
Two specific changes compared with the version above: (1) w1 and w2 are created with requires_grad=True so autograd tracks them; (2) the hand-written backward pass is replaced by a single loss.backward(), after which the weights are updated inside torch.no_grad() and the gradients are zeroed.
# Using PyTorch's autograd mechanism
import torch

N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

w1 = torch.randn(D_in, H, requires_grad=True)
w2 = torch.randn(H, D_out, requires_grad=True)

learning_rate = 1e-6
for i in range(500):
    # forward pass
    h = x.mm(w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = h_relu.mm(w2)

    # compute loss; item() extracts the value from the single-element tensor
    loss = (y_pred - y).pow(2).sum()
    print(i, loss.item())

    # backward pass: no manual gradient formulas, just call backward()
    loss.backward()

    # update weights; no_grad() keeps these ops out of the graph and saves memory
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        # gradients accumulate across iterations, so they must be cleared after each update
        # zero_() is an in-place op that zeros w1.grad and w2.grad directly
        w1.grad.zero_()
        w2.grad.zero_()
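The autograd mechanism is easiest to see on a scalar toy example (my own illustration): any tensor created with requires_grad=True gets a .grad attribute that backward() fills in, and that .grad keeps accumulating until it is zeroed:

import torch

x = torch.tensor(3.0, requires_grad=True)
y = x ** 2 + 2 * x            # autograd records these ops in a graph
y.backward()                  # dy/dx = 2x + 2
print(x.grad)                 # tensor(8.)

x.grad.zero_()                # without this, the next backward() would add onto the 8
(x ** 3).backward()           # d(x^3)/dx = 3x^2
print(x.grad)                 # tensor(27.)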
The individual layers are now defined inside the model. In the setup below, the results after re-initializing the weights by hand are noticeably better (somewhat black magic).
# nn.Sequential model
import torch
import torch.nn as nn

N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = nn.Sequential(
    nn.Linear(D_in, H),
    nn.ReLU(),
    nn.Linear(H, D_out)
)

# re-initialize the model's weights w1 and w2; mean and std set the distribution and can be tuned
nn.init.normal_(model[0].weight, mean=0.0, std=1.0)
nn.init.normal_(model[2].weight, mean=0.0, std=1.0)

learning_rate = 1e-6
for i in range(500):
    # forward pass
    y_pred = model(x)

    # compute loss; item() extracts the value from the single-element tensor
    loss = (y_pred - y).pow(2).sum()
    print(i, loss.item())

    # backward pass: just call backward()
    loss.backward()

    # update weights; no_grad() keeps these ops out of the graph and saves memory
    with torch.no_grad():
        # update every parameter
        for para in model.parameters():
            para -= learning_rate * para.grad
    # gradients accumulate across iterations, so clear them with zero_grad()
    model.zero_grad()
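On the "black magic" point: nn.Linear already comes with a sensible default initialization, and torch also ships Kaiming (He) init, which is designed specifically for ReLU networks. A hedged sketch of that alternative (swapping it in is my suggestion, not what the course did):

import torch.nn as nn

model = nn.Sequential(
    nn.Linear(1000, 100),
    nn.ReLU(),
    nn.Linear(100, 10),
)
# Kaiming (He) normal init, scaled for ReLU; biases set to zero
nn.init.kaiming_normal_(model[0].weight, nonlinearity='relu')
nn.init.kaiming_normal_(model[2].weight, nonlinearity='relu')
nn.init.zeros_(model[0].bias)
nn.init.zeros_(model[2].bias)

With the default or Kaiming init the activations and gradients are much smaller than with std=1.0, so a learning rate of 1e-6 would likely make the loss barely move; this ties into the optimizer and learning-rate discussion below.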
Building on the model above, pass all of the model's parameters into torch.optim and set the learning rate.
Optimizers such as Adam and SGD are available.
For SGD the learning rate would normally be well above 1e-6; the code below still ends up with a decent result anyway. I'm not sure why, though presumably it is because the loss uses reduction='sum' and the re-initialized weights make the loss (and hence the gradients) very large, so even a tiny step size moves the parameters appreciably.
# nn.Sequential model trained with torch.optim
import torch
import torch.nn as nn

N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = nn.Sequential(
    nn.Linear(D_in, H),
    nn.ReLU(),
    nn.Linear(H, D_out)
)

# re-initialize the model's weights w1 and w2; mean and std set the distribution and can be tuned
nn.init.normal_(model[0].weight, mean=0.0, std=1.0)
nn.init.normal_(model[2].weight, mean=0.0, std=1.0)

optimizer = torch.optim.SGD(
    params=model.parameters(),
    lr=1e-6
)
loss_fn = nn.MSELoss(reduction='sum')

for i in range(500):
    # forward pass
    y_pred = model(x)

    # compute loss; item() extracts the value from the single-element tensor
    loss = loss_fn(y_pred, y)
    print(i, loss.item())

    # backward pass: just call backward()
    loss.backward()

    # update weights
    optimizer.step()        # step() applies the update to all parameters
    optimizer.zero_grad()   # zero_grad() clears the gradients for the next iteration
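Swapping in a different optimizer only changes the construction line; here is a self-contained variant with Adam at its usual learning rate of 1e-3 (my own variant, written to show that the step()/zero_grad() pattern in the loop stays identical):

import torch
import torch.nn as nn

N, D_in, H, D_out = 64, 1000, 100, 10
x, y = torch.randn(N, D_in), torch.randn(N, D_out)
model = nn.Sequential(nn.Linear(D_in, H), nn.ReLU(), nn.Linear(H, D_out))
loss_fn = nn.MSELoss(reduction='sum')

# Adam instead of SGD: only this line changes, the loop below is identical
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for i in range(500):
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    print(i, loss.item())
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()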
The nn.Sequential model in 3. makes it easy to define the two-layer network, but it is not flexible enough, so nn.Module is used instead.
Two methods need to be defined: __init__, which builds the layers, and forward, which defines the forward pass.
The loss computation and the backward pass stay the same.
# nn.Module version
import torch
import torch.nn as nn

N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

class TwoLayerNet(nn.Module):
    # __init__ initializes the base class and defines the two linear layers
    def __init__(self, D_in, H, D_out):
        super(TwoLayerNet, self).__init__()   # initialize nn.Module
        self.linear1 = nn.Linear(D_in, H)
        self.linear2 = nn.Linear(H, D_out)

    # forward defines the forward pass
    def forward(self, x):
        return self.linear2(torch.clamp(self.linear1(x), min=0))

model = TwoLayerNet(D_in, H, D_out)

# optimizer = torch.optim.Adam(
#     params=model.parameters(),
#     lr=1e-3
# )
optimizer = torch.optim.SGD(
    params=model.parameters(),
    lr=1e-3
)
loss_fn = nn.MSELoss(reduction='sum')

for i in range(500):
    # forward pass
    y_pred = model(x)

    # compute loss; item() extracts the value from the single-element tensor
    loss = loss_fn(y_pred, y)
    print(i, loss.item())

    # backward pass: just call backward()
    loss.backward()

    # update weights
    optimizer.step()        # step() applies the update to all parameters
    optimizer.zero_grad()   # zero_grad() clears the gradients for the next iteration
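The flexibility of nn.Module shows up as soon as the forward pass is no longer a straight pipeline. A small sketch with an extra skip connection (my own example of the pattern, not part of the course):

import torch
import torch.nn as nn

class TwoLayerSkipNet(nn.Module):
    # same two linear layers, plus a direct path from input to output
    def __init__(self, D_in, H, D_out):
        super().__init__()
        self.linear1 = nn.Linear(D_in, H)
        self.linear2 = nn.Linear(H, D_out)
        self.skip = nn.Linear(D_in, D_out)   # arbitrary branching is easy inside forward()

    def forward(self, x):
        h = torch.clamp(self.linear1(x), min=0)
        return self.linear2(h) + self.skip(x)

model = TwoLayerSkipNet(1000, 100, 10)
print(model(torch.randn(64, 1000)).shape)    # torch.Size([64, 10])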
The above is what I took away from the first lesson of a PyTorch beginner course on Bilibili. Personally I found the course really well taught, and anyone interested should go check it out.
The template to settle on in the end is the one in *5.*, i.e. the nn.Module + torch.optim version above:
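Condensed into a skeleton (my own summary, reusing TwoLayerNet, x, y and the sizes defined in the code above; I wrote it with Adam, whereas the code in 5. used SGD):

# the whole recipe: model (nn.Module subclass), loss function, optimizer, then the four-step loop
model = TwoLayerNet(D_in, H, D_out)
loss_fn = nn.MSELoss(reduction='sum')
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for i in range(500):
    y_pred = model(x)             # forward pass
    loss = loss_fn(y_pred, y)     # compute loss
    loss.backward()               # backward pass
    optimizer.step()              # update parameters
    optimizer.zero_grad()         # clear gradients for the next iteration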
end