Hand-building a simple deep learning framework from scratch in Python (a bare-bones version): from implementing a Tensor to building an MLP, with a hand-written loss function and stochastic gradient descent, it supports automatic differentiation and backpropagation for a basic NN and runs on MNIST.
Apart from random (used to generate random numbers for initialising the network parameters), no library needs to be imported.
The Tensor here is a bare-minimum version — roughly a single element of a paddle.Tensor — with only the most basic and most important features: addition, multiplication, ReLU, and backpropagation.
Definition used here: if a node is computed from other nodes, those other nodes are its parents; e.g. for c = a + b, a and b are the parents of c.
Each node records its parents so that backpropagation can compute the gradients of all nodes.
class Tensor:
    def __init__(self, data, _parents=()):
        self.data = data                  # value of the node
        self.grad = 0                     # gradient
        self._backward = lambda: None     # backward function, initialised as a no-op
        self._parents = set(_parents)     # parent nodes of this node

    def __repr__(self):                   # nicer printing
        return f'Tensor(data={self.data})'

    def __add__(self, other):             # addition
        out = Tensor(self.data + other.data, (self, other))
        def _backward():
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad
        out._backward = _backward         # backward function of the output node
        return out

    def __mul__(self, other):             # multiplication
        out = Tensor(self.data * other.data, (self, other))
        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward
        return out

    def relu(self):                       # ReLU activation
        out = Tensor(0 if self.data < 0 else self.data, (self,))
        def _backward():
            self.grad += (self.data > 0) * out.grad
        out._backward = _backward
        return out

    def backward(self):                   # backpropagation: compute the gradients of all nodes
        # collect the nodes in order from the output toward the inputs
        topo = []
        vis = set()
        def build_topo(v):
            if v not in vis:
                v.grad = 0.0              # zero the gradient before accumulating
                vis.add(v)
                topo.append(v)
                for parent in v._parents:
                    build_topo(parent)
        build_topo(self)
        # backpropagate
        self.grad = 1.0
        for v in topo:
            v._backward()
An example: z = 2x + y. The gradient of z with respect to x, i.e. the partial derivative ∂z/∂x, is easily seen to be x.grad = 2.
Now add another relation: x = 3a + b, and compute the gradient of z with respect to a. Differentiating a composite function requires the chain rule:
$\frac{\partial z}{\partial a}=\frac{\partial z}{\partial x}\cdot\frac{\partial x}{\partial a}$
So a.grad = 3 * x.grad = 6, which explains why a node's gradient is multiplied by out.grad.
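As a quick numeric check, the Tensor class above can reproduce this (the backward() call is explained in detail further down; the concrete values here are only illustrative):
# chain-rule check: z = 2x + y with x = 3a + b
a, b, y = Tensor(2.0), Tensor(1.0), Tensor(5.0)
x = Tensor(3.0)*a + b
z = Tensor(2.0)*x + y
z.backward()
print(a.grad)   # 6.0 = dz/dx * dx/da = 2 * 3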
Next, why the gradient is accumulated with +=:
Another example: z = x + x, for which x.grad = 2. If we wrote self.grad = 1.0 * out.grad instead, we would get the incorrect result x.grad = 1.
ReLU is a piecewise function, so its gradient is simply handled case by case.
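A minimal check of both branches (the values are illustrative):
# ReLU gradient: 1 for positive inputs, 0 for negative inputs
p, n = Tensor(3.0), Tensor(-2.0)
q = p.relu() + n.relu()
q.backward()
print(p.grad, n.grad)   # 1.0 0.0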
# Test gradient computation
a = Tensor(2)
b = Tensor(3)
x = a*b
z = x + x
z.grad = 1 # a parent's gradient depends on its child's gradient, so z.grad must be initialised to 1
z._backward() # compute the gradient of z's parent x
x._backward() # compute the gradients of x's parents a and b
a.grad, b.grad, x.grad, z.grad
(6.0, 4.0, 2.0, 1)
Because _backward() only computes the gradients of a node's direct parents, getting the gradients of all nodes this way requires calling _backward() many times, which is inconvenient.
So we define backward(): first traverse the computation graph from the output node to collect all nodes, then call every node's _backward() to compute all gradients.
The nodes are collected starting from the output and working toward the inputs, because a parent's gradient depends on its children's gradients: each node's _backward() must be called before that of its parents.
# Test backward()
a = Tensor(2)
b = Tensor(3)
x = a*b
z = x + x
z.grad = 1 # initialise z.grad to 1
z.backward()
a.grad, b.grad, x.grad, z.grad
(6.0, 4.0, 2.0, 1.0)
A Linear layer amounts to one matrix multiplication and one addition: the input vector is multiplied by the weight matrix w, and a bias b is added.
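Written out, with input vector $x$, weight matrix $w$ and bias $b$, each output element is
$out_i = b_i + \sum_{j} x_j \, w_{j,i}$
which is exactly what the double loop in forward() below computes.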
import random

class Linear:
    def __init__(self, in_features, out_features):
        self.in_features = in_features
        self.out_features = out_features
        # each weight must be its own Tensor object; [Tensor(...)] * n would create
        # n references to a single shared Tensor and tie the parameters together
        self.w = [[Tensor(random.random()) for _ in range(out_features)] for _ in range(in_features)]
        self.b = [Tensor(random.random()) for _ in range(out_features)]

    def __call__(self, x):
        return self.forward(x)

    def forward(self, x):
        # matrix multiplication by hand
        out = [Tensor(0.0) for _ in range(self.out_features)]
        for i in range(self.out_features):
            out[i] = out[i] + self.b[i]
            for j in range(self.in_features):
                out[i] = out[i] + x[j] * self.w[j][i]
        return out

    def parameters(self):
        # all parameters of the Linear layer, used for parameter updates
        return [self.w, self.b]
# Test Linear
net = Linear(2,1)
x = [Tensor(1.0)] * 2
net(x)
[Tensor(data=1.7437665109884302)]
Squared loss is used here.
It compares the true value with the predicted value — for example, a house's actual sale price versus its appraisal.
Suppose $y$ is the true value and $\widehat{y}$ the estimate; we can compare them with
$l(y,\widehat{y})=\frac{1}{2}(y-\widehat{y})^2$
This is called the squared loss (or mean squared loss).
def squared_loss(y_hat, y):
    """Squared loss"""
    loss = Tensor(0.0)
    for i in range(len(y)):
        tmp = y_hat[i] + y[i]*Tensor(-1.0)   # Tensor has no __sub__, so subtract by adding y * (-1)
        tmp = tmp*tmp*Tensor(0.5)
        loss = loss + tmp
    return loss
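A quick hand-checked example (the values are illustrative): the loss of predictions (2.5, 1.0) against targets (2.0, 3.0) should be 0.5·0.5² + 0.5·2² = 2.125.
# check: 0.5*(2.5-2.0)^2 + 0.5*(1.0-3.0)^2 = 0.125 + 2.0 = 2.125
print(squared_loss([Tensor(2.5), Tensor(1.0)], [Tensor(2.0), Tensor(3.0)]).data)   # 2.125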
Stochastic gradient descent is used here.
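Each parameter is nudged against its gradient; for every parameter $\theta$ and learning rate $lr$:
$\theta \leftarrow \theta - lr \cdot \frac{\partial l}{\partial \theta}$
The recursive helper below applies this update to every Tensor in a (possibly nested) parameter list and then zeroes its gradient.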
def sgd(params, lr):
    """Stochastic gradient descent"""
    if isinstance(params, list):
        for i in params:
            sgd(i, lr)                  # recurse into nested parameter lists
    elif isinstance(params, Tensor):
        params.data -= params.grad*lr   # update the parameter against its gradient
        params.grad = 0.0               # reset the gradient after the update
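A one-parameter sanity check (the numbers are illustrative):
# p.data becomes 1.0 - 0.2*0.5 = 0.9, and p.grad is reset to 0.0
p = Tensor(1.0)
p.grad = 0.5
sgd([p], 0.2)
print(p.data, p.grad)   # 0.9 0.0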
import numpy as np  # not really part of the framework; used later for the handwritten-digit dataset

def to_tensor(x):
    """Convert every element of a list / numpy.ndarray into a Tensor"""
    ans = []
    if isinstance(x, list) or isinstance(x, np.ndarray):
        for i in range(len(x)):
            ans.append(to_tensor(x[i]))
    else:
        ans = Tensor(x)
    return ans

# Prepare data
X = [[1,2,3],[2,3,7],[5,3,1]]
Y = [[x_[0] + 2*x_[1] + 4*x_[2]] for x_ in X]  # Y = [[17], [36], [15]]
X = to_tensor(X)
Y = to_tensor(Y)
for i in range(len(X)):
    print(X[i], Y[i])
[Tensor(data=1), Tensor(data=2), Tensor(data=3)] [Tensor(data=17)]
[Tensor(data=2), Tensor(data=3), Tensor(data=7)] [Tensor(data=36)]
[Tensor(data=5), Tensor(data=3), Tensor(data=1)] [Tensor(data=15)]
lr = 0.03            # learning rate
num_epochs = 3       # number of epochs
net = Linear(3, 1)   # build the network
loss = squared_loss  # loss function

for epoch in range(num_epochs):
    for i in range(len(X)):
        l = loss(net(X[i]), Y[i])   # loss between the prediction and the target
        l.backward()
        params = net.parameters()
        sgd(params, lr)
    train_l = 0.0
    for i in range(len(X)):
        train_l += loss(net(X[i]), Y[i]).data
    print(f'epoch{epoch + 1}, loss {float(train_l/len(X)):f}')

net(X[0]), Y[0]
epoch1, loss 5.200388
epoch2, loss 0.407041
epoch3, loss 0.041877
([Tensor(data=17.498340752751304)], [Tensor(data=17)])
Stacking several Linear layers:
class MLP:
    def __init__(self, in_features, outs):
        self.linears = []
        self.num_linears = len(outs)
        for i in range(len(outs)):
            # build one Linear layer per entry of outs; the out_features of one layer
            # becomes the in_features of the next
            self.linears.append(Linear(in_features, outs[i]))
            in_features = outs[i]

    def __call__(self, x):
        return self.forward(x)

    def forward(self, x):
        for i in range(self.num_linears - 1):
            x = self.linears[i](x)
            # apply ReLU after every hidden Linear layer;
            # relu() returns a new Tensor, so rebind x to the activated values
            x = [j.relu() for j in x]
        x = self.linears[-1](x)
        return x

    def parameters(self):
        return [p for linear in self.linears for p in linear.parameters()]
# Prepare data
X = [[1,2,3],[2,3,7],[5,3,1]]
Y = [[x_[0] + 2*x_[1] + 4*x_[2]] for x_ in X]  # Y = [[17], [36], [15]]
X = to_tensor(X)
Y = to_tensor(Y)
for i in range(len(X)):
print(X[i],Y[i])
[Tensor(data=1), Tensor(data=2), Tensor(data=3)] [Tensor(data=17)]
[Tensor(data=2), Tensor(data=3), Tensor(data=7)] [Tensor(data=36)]
[Tensor(data=5), Tensor(data=3), Tensor(data=1)] [Tensor(data=15)]
lr = 0.001
num_epochs = 10
net = MLP(3, [2, 1])
loss = squared_loss

for epoch in range(num_epochs):
    for i in range(len(X)):
        l = loss(net(X[i]), Y[i])   # loss between the prediction and the target
        l.backward()
        params = net.parameters()
        sgd(params, lr)
    train_l = 0.0
    for i in range(len(X)):
        train_l += loss(net(X[i]), Y[i]).data
    print(f'epoch{epoch + 1}, loss {float(train_l/len(X)):f}')

net(X[0]), Y[0]
epoch1, loss 184.394965
epoch2, loss 95.322663
epoch3, loss 30.901516
epoch4, loss 10.462304
epoch5, loss 5.934934
epoch6, loss 4.167137
epoch7, loss 3.059523
epoch8, loss 2.275400
epoch9, loss 1.705741
epoch10, loss 1.289367
([Tensor(data=17.56170670010426)], [Tensor(data=17)])
!mkdir -p /home/aistudio/work/mnist
!unzip /home/aistudio/data/data33695/mnist.zip -d /home/aistudio/work/mnist/
Archive: /home/aistudio/data/data33695/mnist.zip
inflating: /home/aistudio/work/mnist/t10k-images.idx3-ubyte
inflating: /home/aistudio/work/mnist/t10k-labels.idx1-ubyte
inflating: /home/aistudio/work/mnist/train-images.idx3-ubyte
inflating: /home/aistudio/work/mnist/train-labels.idx1-ubyte
import sys
sys.path.append('/home/aistudio/work')
import load_MNIST
import matplotlib.pyplot as plt

def load_datasets(show_examples=False):
    X_train = load_MNIST.load_train_images()
    y_train = load_MNIST.load_train_labels()
    X_test = load_MNIST.load_test_images()
    y_test = load_MNIST.load_test_labels()
    if show_examples is True:
        sample = X_train[1, :, :]
        plt.imshow(sample)
        plt.show()
        print('The sample as a matrix:\n {}'.format(sample))
    return X_train, X_test, y_train, y_test

X_train_, X_test_, y_train_, y_test_ = load_datasets(show_examples=True)
Loading the MNIST handwritten-digit dataset:
Training set images: 28*28, loaded 60000/60000.
Number of training labels: 60000... done.
Test set images: 28*28, loaded 10000/10000.
Number of test labels: 10000... done.
<Figure size 640x480 with 1 Axes>
The sample as a matrix:
 (28x28 array of pixel intensities in the range 0-255)
X_train_.shape, y_train_.shape,type(X_train_),X_test_.shape, y_test_.shape
((60000, 28, 28), (60000,), numpy.ndarray, (10000, 28, 28), (10000,))
# Data preprocessing
X_train = X_train_.reshape(60000,28*28)
y_train = y_train_.reshape(60000,1)
X_test = X_test_.reshape(10000,28*28)
y_test = y_test_.reshape(10000,1)
X_train[0],y_train[0]
# the framework is too crude and training is slow, so only part of the data is used
train_data = 1000
test_data =10
X_train = to_tensor(X_train[0:train_data])
y_train = to_tensor(y_train[0:train_data])
X_test = to_tensor(X_test[0:test_data])
y_test = to_tensor(y_test[0:test_data])
The framework is far too crude and needs optimisation; only a small portion of the data is used here, for demonstration.
lr = 0.00000001
num_epochs = 2
net = Linear(784, 10)
# net = MLP(28*28, [256,10])
loss = squared_loss

for epoch in range(num_epochs):
    for i in range(train_data):
        l = loss(net(X_train[i]), y_train[i])   # loss between the prediction and the target
        l.backward()
        params = net.parameters()
        sgd(params, lr)
        if i % 100 == 0:
            print(f'epoch{epoch + 1}, train loss {float(l.data):f}')
    test_l = 0.0
    for i in range(test_data):
        test_l += loss(net(X_test[i]), y_test[i]).data
    print(f'epoch{epoch + 1}, test loss {float(test_l/len(X_test)):f}')

# net(X_train[0]), y_train[0]
epoch1, train loss 93719477.784360
epoch1, train loss 83988.257223
epoch1, train loss 344.078848
epoch1, train loss 219084.742202
epoch1, train loss 1874046.072795
epoch1, train loss 11194.016174
epoch1, train loss 85165.438772
epoch1, train loss 6325.009070
epoch1, train loss 152919.084357
epoch1, train loss 55067.312630
epoch1, test loss 330436.891082
epoch2, train loss 260784.575575
epoch2, train loss 15026.502161
epoch2, train loss 81925.278769
epoch2, train loss 40676.119289
epoch2, train loss 1202164.376063
epoch2, train loss 23028.411272
epoch2, train loss 20343.179800
epoch2, train loss 18186.245439
epoch2, train loss 15613.012288
epoch2, train loss 15918.717633
epoch2, test loss 223290.439788
A rough training run with a single fully-connected layer (the Paddle version, for comparison).
import paddle
import paddle.nn as nn

# Prepare the data
X_train_p = paddle.to_tensor(X_train_, stop_gradient=True).flatten(1).astype('float32')
X_test_p = paddle.to_tensor(X_test_, stop_gradient=True).flatten(1).astype('float32')
y_train_p = paddle.to_tensor(y_train_, stop_gradient=True).astype('float32')
y_test_p = paddle.to_tensor(y_test_, stop_gradient=True).astype('float32')

batch_size, lr, num_epochs = 256, 0.1, 10
loss = paddle.nn.loss.MSELoss()
net = nn.Linear(28*28, 1)   # a single fully-connected layer
sgd = paddle.optimizer.SGD(learning_rate=0.0000001, parameters=net.parameters())

for epoch in range(num_epochs):
    for i in range(60000):
        l = loss(net(X_train_p[i]), y_train_p[i])   # loss between the prediction and the target
        l.backward()
        sgd.step()
        sgd.clear_grad()
        if i % 30000 == 0:
            print(f'epoch{epoch + 1}, train loss {float(l.value()):f}')
    test_l = 0.0
    for i in range(1000):
        test_l += loss(net(X_test_p[i]), y_test_p[i]).value()
    print(f'epoch{epoch + 1}, test loss {float(test_l/10000):f}')

# rough training result
net(X_train_p[0]), y_train_p[0]
epoch1, train loss 3617.558350
epoch1, train loss 36.344646
epoch1, test loss 2.478513
epoch2, train loss 22.809593
epoch2, train loss 0.706598
epoch2, test loss 1.734603
epoch3, train loss 4.005621
epoch3, train loss 0.181211
epoch3, test loss 1.424313
epoch4, train loss 0.791444
epoch4, train loss 0.112583
epoch4, test loss 1.256442
epoch5, train loss 0.059686
epoch5, train loss 0.095287
epoch5, test loss 1.151413
epoch6, train loss 0.034223
epoch6, train loss 0.085789
epoch6, test loss 1.080396
epoch7, train loss 0.241043
epoch7, train loss 0.075794
epoch7, test loss 1.030066
epoch8, train loss 0.513496
epoch8, train loss 0.064292
epoch8, test loss 0.993136
epoch9, train loss 0.784249
epoch9, train loss 0.052085
epoch9, test loss 0.965241
epoch10, train loss 1.025587
epoch10, train loss 0.040221
epoch10, test loss 0.943633
(Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=False, [3.89197564]), Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, [5.]))