
[Daily] Hand-Writing a Convolutional Neural Network (conv + relu + maxpooling + linear + relu + linear + softmax + cross-entropy loss + regularization)

This is the second course assignment: implementing a CNN from scratch and training it for image classification on CIFAR-10. Word is we will also have to hand-write an RNN later (what a treat).

The trap in this assignment is that the architecture and default hyper-parameters given in the handout perform extremely poorly, and no amount of tuning fixes it. At first I assumed I had a bug, but after checking for a long time I could not find one. I then reproduced the same architecture in Keras and found it really is astonishingly bad (no better than random guessing). Almost any small change helps: adding another convolutional layer, adding another dense layer, or even just switching the optimizer (plain SGD is far worse here than Adagrad or Adam). After finishing this assignment I am convinced that neural networks really are the occult of science, and the theology of the occult.

The code is organized as follows:

./
    train_small.py        # main script
    test_grad.py          # checks the forward and backward passes of the layers in layers.py
    dataset.py            # data loading
    check_gradient.py     # helper functions used by test_grad.py
    nn/
        cnn.py            # the CNN architecture
        layers.py         # the layer classes
        loss.py           # loss functions
        optimizer.py      # the optimizer
        utils.py          # miscellaneous helpers
    cifar-10-batches-py/  # data files, downloaded with:
        # wget http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
        # tar -xzvf cifar-10-python.tar.gz
        # rm cifar-10-python.tar.gz

The code follows below:

train_small.py

#-*- coding:UTF-8 -*-
import os
import sys
sys.path.append("{}\\nn".format(os.getcwd()))  # make the nn package importable; Python has been acting up lately
import numpy as np
from nn.cnn import CNN
from nn.optimizer import SGD
from nn.utils import accuracy
from dataset import get_cifar10_data
from matplotlib import pyplot as plt
from collections import Counter

def train(model,X_train,y_train,X_val,y_val,batch_size,n_epochs,
          lr=1e-2,
          lr_decay=0.8,
          momentum=0.0,
          wd=0.0,
          verbose=True,
          print_level=1,
          ):  # model training
    print("Start training...")
    n_train = X_train.shape[0]  # number of training samples
    iterations_per_epoch = max(n_train//batch_size,1)  # iterations per epoch
    n_iterations = n_epochs*iterations_per_epoch  # total number of iterations
    loss_hist = []  # loss value of every iteration
    opt_params = {"lr":lr,"weight_decay":wd,"momentum":momentum}  # optimizer hyper-parameters, with the learning rate preset
    print("Training samples: {}".format(n_train))
    print("Iterations per epoch: {}".format(iterations_per_epoch))
    print("Epochs: {}".format(n_epochs))
    print("Total iterations: {}".format(n_iterations))
    count = 0
    for epoch in range(n_epochs):  # loop over the epochs
        for t in range(iterations_per_epoch):  # loop over the iterations within one epoch
            count += 1
            batch_mask = np.random.choice(n_train,batch_size)  # sample batch_size indices from the training set
            X_batch = X_train[batch_mask]  # sampled inputs
            y_batch = y_train[batch_mask]  # sampled labels
            loss,score = model.oracle(X_batch,y_batch)  # evaluate the loss and the gradients
            loss_hist.append(loss)  # record the loss
            sgd = SGD(model.param_groups,**opt_params)  # build the optimizer (the momentum state lives in model.param_groups, so rebuilding it each step is harmless)
            sgd.step()  # one step of stochastic gradient descent
            if verbose and t%print_level==0:  # report the training loss
                train_acc = accuracy(score,y_batch)  # accuracy on the batch
                print("(Iteration {}/{},epoch {})loss:{},accu:{}".format(
                    count,n_iterations,epoch,loss_hist[-1],train_acc))
            if t==iterations_per_epoch-1: opt_params["lr"] *= lr_decay  # decay the learning rate once per epoch
    """ plot the training curve """
    plt.close()
    plt.figure()
    plt.plot(loss_hist,label="training loss")
    plt.legend(loc="best")
    plt.show()

if __name__ == "__main__":
    model = CNN()  # initialize the convolutional neural network
    data = get_cifar10_data()  # load the data
    num_train = 100  # number of training samples to use
    data = {  # repack data as a dictionary
        "X_train": data["X_train"][:num_train],
        "y_train": data["y_train"][:num_train],
        "X_val": data["X_val"],
        "y_val": data["y_val"],
    }
    X_train,y_train,X_val,y_val = data["X_train"],data["y_train"],data["X_val"],data["y_val"]
    train(model,X_train,y_train,X_val,y_val,50,50)

test_grad.py

#-*- coding:UTF-8 -*-
import os
import sys
sys.path.append("{}\\nn".format(os.getcwd()))  # make the nn package importable; Python has been acting up lately
import numpy as np
from nn.cnn import CNN
from check_gradient import *
from nn.layers import Conv,MaxPool,Linear,Relu

def rel_error(x,y):  # relative error
    return np.nanmax(np.abs(x-y)/(np.maximum(1e-8,np.abs(x)+np.abs(y))))

""" The relu activation layer is not checked; it was given, so if it is wrong it is not my fault """
check_conv_forward = True
#check_conv_forward = False
check_conv_backward = True
#check_conv_backward = False
check_linear_forward = True
#check_linear_forward = False
check_linear_backward = True
#check_linear_backward = False
check_pool_forward = True
#check_pool_forward = False
check_pool_backward = True
#check_pool_backward = False

if check_conv_forward:  # check the convolution forward pass
    x_shape = (2,3,4,4)  # 2 samples, 3 channels, 4×4 pixels
    w_shape = (3,3,4,4)  # 3 output channels, 3 input channels, 4×4 kernels
    x = np.linspace(-0.1,0.5,num=np.prod(x_shape)).reshape(x_shape)  # 2×3×4×4 input
    w = np.linspace(-0.2,0.3,num=np.prod(w_shape)).reshape(w_shape)  # 3×3×4×4 filters
    b = np.linspace(-0.1,0.2,num=3)  # 3 biases
    conv = Conv(4,4,3,3,2,1)  # convolution layer: 4×4 kernel, 3 input channels, 3 output channels, stride 2, padding 1
    conv.params["w"]["param"] = w  # set the filters
    conv.params["b"]["param"] = b  # set the biases
    out = conv(x)  # Layer objects are callable and return the forward pass output
    correct_out = np.array([  # precomputed correct output of shape 2×3×2×2
        [
            [[-0.08759809,-0.10987781],[-0.18387192,-0.2109216]],
            [[0.21027089,0.21661097],[0.22847626,0.23004637]],
            [[0.50813986,0.54309974],[0.64082444,0.67101435]]
        ],
        [
            [[-0.98053589,-1.03143541],[-1.19128892,-1.24695841]],
            [[0.69108355,0.66880383],[0.59480972,0.56776003]],
            [[2.36270298,2.36904306],[2.38090835,2.38247847]]
        ]
    ])
    print("Testing convolutional forward...")
    print("difference: {}".format(rel_error(out,correct_out)))

if check_conv_backward:  # check the convolution backward pass
    np.random.seed(231)  # fix the random seed
    x = np.random.randn(2,3,16,16)  # 2 samples, 3 channels, 16×16 pixels
    w = np.random.randn(3,3,3,3)  # 3 output channels, 3 input channels, 3×3 kernels
    b = np.random.randn(3,)  # 3 biases
    dout = np.random.randn(2,3,14,14)  # upstream gradient, i.e. the gradient flowing back from the next layer
    conv = Conv(3,3,3,3,1,0)  # 3 input channels, 3 output channels, 3×3 kernel, stride 1, no padding
    conv.params["w"]["param"] = w  # set the filters
    conv.params["b"]["param"] = b  # set the biases
    out = conv(x)  # forward pass on x
    dx = conv.backward(dout,x)  # backward pass for the given dout
    dx_num = eval_numerical_gradient_array(conv,x,dout)  # dx_num has exactly the same shape as x and dx
    params = conv.params  # the layer's parameters
    def fw(v):  # forward output with the filters replaced by v
        tmp = params["w"]["param"]
        params["w"]["param"] = v
        f_w = conv(x)
        params["w"]["param"] = tmp
        return f_w
    def fb(v):  # forward output with the biases replaced by v
        tmp = params["b"]["param"]
        params["b"]["param"] = v
        f_b = conv(x)
        params["b"]["param"] = tmp
        return f_b
    dw = params["w"]["grad"]  # analytic gradient of the filters
    dw_num = eval_numerical_gradient_array(fw,w,dout)  # dw_num has exactly the same shape as w and dw
    db = params["b"]["grad"]  # analytic gradient of the biases
    db_num = eval_numerical_gradient_array(fb,b,dout)  # db_num has exactly the same shape as b and db
    print("Testing convolutional backward")
    print("dx error: {}".format(rel_error(dx_num,dx)))
    print("dw error: {}".format(rel_error(dw_num,dw)))
    print("db error: {}".format(rel_error(db_num,db)))

if check_linear_forward:  # check the linear forward pass
    x_shape = (2,3,4,4)  # 2 samples, 3 channels, 4×4 pixels
    w_shape = (3*4*4,64)  # map 48 features to 64
    b_shape = (1,64)
    x = np.linspace(-0.1,0.5,num=np.prod(x_shape)).reshape(x_shape)
    w = np.linspace(-0.2,0.3,num=np.prod(w_shape)).reshape(w_shape)
    b = np.linspace(-0.1,0.2,num=64).reshape(b_shape)  # 64 biases
    linear = Linear(3*4*4,64)
    linear.params["w"]["param"] = w  # set the weight matrix
    linear.params["b"]["param"] = b  # set the biases
    out = linear(x)  # Layer objects are callable and return the forward pass output
    correct_out = np.dot(x.reshape(2,48),w)+b  # a plain fully connected layer
    print("Testing linear forward...")
    print("difference: {}".format(rel_error(out,correct_out)))

if check_linear_backward:  # check the linear backward pass
    np.random.seed(231)  # fix the random seed
    x = np.random.randn(2,3,4,4)  # 2 samples, 3 channels, 4×4 pixels
    w = np.random.randn(3*4*4,64)  # weight matrix mapping 48 features to 64
    b = np.random.randn(1,64)  # 64 biases
    dout = np.random.randn(2,64)  # upstream gradient flowing back from the next layer
    linear = Linear(3*4*4,64)  # linear layer mapping 48 features to 64
    linear.params["w"]["param"] = w  # set the weight matrix
    linear.params["b"]["param"] = b  # set the biases
    out = linear(x)  # forward pass on x
    dx = linear.backward(dout,x)  # backward pass for the given dout
    dx_num = eval_numerical_gradient_array(linear,x,dout)  # dx_num has exactly the same shape as x
    dx_num = dx_num.reshape(dx_num.shape[0],-1)  # flatten dx_num to a 2-D array
    params = linear.params  # the layer's parameters
    def fw(v):  # forward output with the weight matrix replaced by v
        tmp = params["w"]["param"]
        params["w"]["param"] = v
        f_w = linear(x)
        params["w"]["param"] = tmp
        return f_w
    def fb(v):  # forward output with the biases replaced by v
        tmp = params["b"]["param"]
        params["b"]["param"] = v
        f_b = linear(x)
        params["b"]["param"] = tmp
        return f_b
    dw = params["w"]["grad"]  # analytic gradient of the weights
    dw_num = eval_numerical_gradient_array(fw,w,dout)  # dw_num has exactly the same shape as w and dw
    db = params["b"]["grad"]  # analytic gradient of the biases
    db_num = eval_numerical_gradient_array(fb,b,dout)  # db_num has exactly the same shape as b and db
    print("Testing linear backward")
    print("dx error: {}".format(rel_error(dx_num,dx)))
    print("dw error: {}".format(rel_error(dw_num,dw)))
    print("db error: {}".format(rel_error(db_num,db)))

if check_pool_forward:  # check the max-pooling forward pass
    x_shape = (2,3,4,4)  # 2 samples, 3 channels, 4×4 pixels
    x = np.linspace(-0.1,0.5,num=np.prod(x_shape)).reshape(x_shape)  # 2×3×4×4 input
    pool = MaxPool(kernel_size=2,stride=2,padding=0)  # 2×2 pooling, stride 2, no padding
    out = pool(x)
    out_shape = (2,3,2,2)
    correct_out = np.zeros(out_shape)
    for i in range(out_shape[0]):
        for j in range(out_shape[1]):
            for k in range(out_shape[2]):
                for l in range(out_shape[3]):
                    correct_out[i,j,k,l] = x[i,j,2*k+1,2*l+1]  # x is monotonically increasing, so the bottom-right corner of each window happens to be its maximum
    print("Testing pooling forward...")
    print("difference: {}".format(rel_error(out,correct_out)))

if check_pool_backward:  # check the max-pooling backward pass
    np.random.seed(231)  # fix the random seed
    x = np.random.randn(3,2,8,8)  # random input x
    dout = np.random.randn(3,2,4,4)  # upstream gradient flowing back from the next layer
    pool = MaxPool(kernel_size=2,stride=2,padding=0)  # 2×2 pooling kernel, no padding, stride 2, which maps 8×8 --> 4×4
    out = pool(x)  # forward output for x
    dx = pool.backward(dout,x)  # backward output (gradient) for dout
    dx_num = eval_numerical_gradient_array(pool,x,dout)  # numerical gradient; dx_num has exactly the same shape as x (3,2,8,8)
    print("Testing pooling backward:")
    print("dx error: ",rel_error(dx,dx_num))  # the error should be around 1e-12

check_gradient.py

#-*- coding:UTF-8 -*-
import sys
import numpy as np
from random import randrange

if sys.version_info>=(3,0):  # under python 3.x, xrange has to be redefined
    def xrange(*args,**kwargs):
        return iter(range(*args,**kwargs))

def eval_numerical_gradient(f,x,
                            verbose=True,
                            h=1e-5
                            ):  # a very naive numerical gradient of f at x: f is a function of a single argument, x is the point (or array) to differentiate at
    fx = f(x)  # function value at the given point
    grad = np.zeros_like(x)  # the gradient has the same shape as x
    it = np.nditer(x,flags=["multi_index"],op_flags=["readwrite"])  # iterate over every element of x
    while not it.finished:  # compute one partial derivative at a time
        ix = it.multi_index  # current index
        oldval = x[ix]  # value at that index
        x[ix] = oldval+h  # shift right by h
        fxph = f(x)  # f(x+h)
        x[ix] = oldval-h  # shift left by h
        fxmh = f(x)  # f(x-h)
        x[ix] = oldval  # restore the original value
        grad[ix] = (fxph-fxmh)/(2*h)  # central-difference partial derivative
        if verbose: print(ix,grad[ix])  # print the gradient
        it.iternext()  # move to the next element
    return grad

def eval_numerical_gradient_array(f,x,df,
                                  h=1e-5
                                  ):  # numerical gradient for a function that takes an array and returns an array
    grad = np.zeros_like(x)  # here x has shape N_samples×n_Channels×Height×Width
    it = np.nditer(x,flags=["multi_index"],op_flags=["readwrite"])  # iterate over every element of x
    while not it.finished:
        ix = it.multi_index  # current index
        oldval = x[ix]  # value at that index
        x[ix] = oldval+h  # shift right by h
        pos = f(x).copy()  # f(x+h)
        x[ix] = oldval-h  # shift left by h
        neg = f(x).copy()  # f(x-h)
        x[ix] = oldval  # restore the original value
        grad[ix] = np.sum((pos-neg)*df)/(2*h)  # multiplying by df applies the chain rule: this is the vector-Jacobian product with the upstream gradient
        it.iternext()
    return grad

def eval_numerical_gradient_blobs(f,inputs,output,
                                  h=1e-5
                                  ):  # numerical gradients for a function operating on input and output blobs: f takes several input blobs followed by one output blob to write into, i.e. y=f(x,w,out) where x and w are input blobs and the result is written into out; inputs are the input blobs, output is the output blob, h is the step size
    numeric_diffs = []
    for input_blob in inputs:
        diff = np.zeros_like(input_blob.diffs)
        it = np.nditer(
            input_blob.vals,
            flags=["multi_index"],
            op_flags=["readwrite"]
        )
        while not it.finished:
            idx = it.multi_index
            orig = input_blob.vals[idx]
            input_blob.vals[idx] = orig+h
            f(*(inputs + (output,)))
            pos = np.copy(output.vals)
            input_blob.vals[idx] = orig-h
            f(*(inputs + (output,)))
            neg = np.copy(output.vals)
            input_blob.vals[idx] = orig
            diff[idx] = np.sum((pos-neg)*output.diffs)/(2.0*h)
            it.iternext()
        numeric_diffs.append(diff)
    return numeric_diffs

def eval_numerical_gradient_net(net,inputs,output,
                                h=1e-5
                                ):
    result = eval_numerical_gradient_blobs(
        lambda *args: net.forward(),inputs,output,h=h
    )
    return result

def grad_check_sparse(f,x,analytic_grad,
                      num_checks=10,
                      h=1e-5
                      ):  # spot-check the gradient at a few randomly sampled positions
    for i in range(num_checks):  # check num_checks points
        ix = tuple([randrange(m) for m in x.shape])  # pick a random position
        oldval = x[ix]  # value at that index
        x[ix] = oldval+h  # shift right by h
        fxph = f(x)  # f(x+h)
        x[ix] = oldval-h  # shift left by h
        fxmh = f(x)  # f(x-h)
        x[ix] = oldval  # restore the original value
        grad_numerical = (fxph-fxmh)/(2*h)  # numerical partial derivative
        grad_analytic = analytic_grad[ix]
        error = abs(grad_numerical-grad_analytic)
        total = abs(grad_numerical)+abs(grad_analytic)
        rel_error = error/total
        print("numerical: %f analytic: %f,relative error: %e" % (
            grad_numerical,
            grad_analytic,
            rel_error
        ))
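The multiplication by df in eval_numerical_gradient_array is worth spelling out. The layer computes y = f(x) and df holds the upstream gradient ∂L/∂y, so the backpropagated gradient that the numerical check has to reproduce is the vector-Jacobian product

$$\frac{\partial L}{\partial x_i} = \sum_j \frac{\partial L}{\partial y_j}\,\frac{\partial y_j}{\partial x_i} \approx \sum_j df_j\,\frac{f(x + h e_i)_j - f(x - h e_i)_j}{2h},$$

which is exactly what np.sum((pos - neg) * df) / (2 * h) evaluates for each index i.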

dataset.py

#-*- coding:UTF-8 -*-
import os
import pickle
import numpy as np

def load_cifar_batch(filename):  # load one batch of the cifar-10 dataset
    with open(filename,"rb") as f:
        datadict = pickle.load(f,encoding="latin1")
        X = datadict["data"]  # input data
        Y = datadict["labels"]  # labels
        X = X.reshape(10000,3,32,32).transpose(0,2,3,1).astype("float")  # each batch holds 10000 samples; move the RGB channels to the last axis
        Y = np.array(Y)
        return X,Y

def load_cifar10():  # load the full cifar-10 dataset: 5 training batches plus 1 test_batch
    xs = []
    ys = []
    for b in range(1,6):
        f = os.path.join("cifar-10-batches-py","data_batch_%d" % (b,))
        X,Y = load_cifar_batch(f)
        xs.append(X)
        ys.append(Y)
    Xtr = np.concatenate(xs)  # concatenate the inputs of the 5 batch files
    Ytr = np.concatenate(ys)  # concatenate the labels of the 5 batch files
    del X,Y
    Xte,Yte = load_cifar_batch(
        os.path.join("cifar-10-batches-py","test_batch")
    )
    """
    Xtr: training inputs (50000,32,32,3)
    Ytr: training labels (50000,)
    Xte: test inputs     (10000,32,32,3)
    Yte: test labels     (10000,)
    """
    return Xtr,Ytr,Xte,Yte

def get_cifar10_data(
        n_train=49000,
        n_val=1000,
        n_test=10000,
        subtract_mean=True
        ):  # load cifar-10 from disk and preprocess it; this is the same preprocessing as for the SVM assignment, packaged into one function
    X_train,y_train,X_test,y_test = load_cifar10()  # load everything
    mask = list(range(n_train,n_train + n_val))  # indices of the validation set
    X_val = X_train[mask]
    y_val = y_train[mask]
    mask = list(range(n_train))  # indices of the training set
    X_train = X_train[mask]
    y_train = y_train[mask]
    mask = list(range(n_test))  # indices of the test set
    X_test = X_test[mask]
    y_test = y_test[mask]
    if subtract_mean:  # normalize the data by subtracting the mean image
        mean_image = np.mean(X_train,axis=0)
        X_train -= mean_image
        X_val -= mean_image
        X_test -= mean_image
    """ reorder the axes so that the channel axis comes first """
    X_train = X_train.transpose(0,3,1,2).copy()
    X_val = X_val.transpose(0,3,1,2).copy()
    X_test = X_test.transpose(0,3,1,2).copy()
    return {  # pack the data into a dictionary
        "X_train": X_train,
        "y_train": y_train,
        "X_val": X_val,
        "y_val": y_val,
        "X_test": X_test,
        "y_test": y_test,
    }

if __name__ == "__main__":
    Xtr,ytr,Xte,yte = load_cifar10()
    print(Xtr.shape)
    print(ytr.shape)
    print(Xte.shape)
    print(yte.shape)
    print(set(ytr.tolist()))
    data = get_cifar10_data()
    for key,value in data.items():
        print(key,value.shape)

nn/cnn.py

#-*- coding:UTF-8 -*-
import math
import time
import numpy as np
from loss import SoftmaxCE,softmax
from layers import Conv,Relu,MaxPool,Linear

class CNN(object):  # the CNN architecture: conv + relu + maxpool + linear + relu + linear + softmax
    def __init__(self,
                 image_size=(3,32,32),
                 channels=3,
                 conv_kernel=7,
                 pool_kernel=2,
                 hidden_units=100,
                 n_classes=10,
                 ):  # constructor: initialize the network and define its layers
        """ constructor arguments """
        self.image_size = image_size  # image shape 3×H×W
        self.channels = channels  # number of output channels of the convolution layer
        self.conv_kernel = conv_kernel  # convolution kernel size
        self.pool_kernel = pool_kernel  # pooling kernel size
        self.hidden_units = hidden_units  # number of hidden units in the fully connected part
        self.n_classes = n_classes  # number of output classes
        """ derived attributes """
        channel,height,width = self.image_size  # these track the channels and spatial size through the convolutional part
        self.conv_stride = 1  # convolution stride
        self.conv_padding = 0  # convolution padding
        self.pool_stride = 2  # pooling stride
        self.pool_padding = 0  # pooling padding
        self.conv = Conv(  # convolution layer: 3×32×32 --> 3×(32-7+1)×(32-7+1) --> 3×26×26
            height=self.conv_kernel,
            width=self.conv_kernel,
            in_channels=self.image_size[0],
            out_channels=self.channels,
            stride=self.conv_stride,
            padding=self.conv_padding,
            init_scale=1e-2,
        )
        """ channels and spatial size after the convolution layer """
        channel = self.channels
        height += (2*self.conv_padding-self.conv_kernel)
        height /= self.conv_stride
        height = int(height)+1
        width += (2*self.conv_padding-self.conv_kernel)
        width /= self.conv_stride
        width = int(width)+1
        self.relu1 = Relu()  # activation layer A: 3×26×26 --> 3×26×26
        self.pool = MaxPool(  # pooling layer: 3×26×26 --> 3×13×13
            kernel_size=self.pool_kernel,
            stride=self.pool_stride,
            padding=self.pool_padding,
        )
        """ channels and spatial size after the pooling layer """
        channel = channel
        height += (2*self.pool_padding-self.pool_kernel)
        height /= self.pool_stride
        height = int(height)+1
        width += (2*self.pool_padding-self.pool_kernel)
        width /= self.pool_stride
        width = int(width)+1
        self.linear1 = Linear(  # linear layer A: 3×13×13 --> 507 --> 100
            in_features=channel*height*width,
            out_features=self.hidden_units,
            init_scale=1e-2,
        )
        self.relu2 = Relu()  # activation layer B: 100 --> 100
        self.linear2 = Linear(  # linear layer B: 100 --> 10
            in_features=self.hidden_units,
            out_features=self.n_classes,  # the last layer outputs one score per class
            init_scale=1e-2,
        )
        """ other state """
        self.softmaxce = SoftmaxCE()
        self.param_groups = [  # only the convolution and linear layers carry parameters; pregrad stores the previous update direction and uses the parameter's shape, since grad is still None at this point
            {
                "w": {
                    "param": self.conv.params["w"]["param"],
                    "grad": self.conv.params["w"]["grad"],
                    "pregrad": np.zeros_like(self.conv.params["w"]["param"])
                },
                "b": {
                    "param": self.conv.params["b"]["param"],
                    "grad": self.conv.params["b"]["grad"],
                    "pregrad": np.zeros_like(self.conv.params["b"]["param"])
                },
            },
            {
                "w": {
                    "param": self.linear1.params["w"]["param"],
                    "grad": self.linear1.params["w"]["grad"],
                    "pregrad": np.zeros_like(self.linear1.params["w"]["param"])
                },
                "b": {
                    "param": self.linear1.params["b"]["param"],
                    "grad": self.linear1.params["b"]["grad"],
                    "pregrad": np.zeros_like(self.linear1.params["b"]["param"])
                },
            },
            {
                "w": {
                    "param": self.linear2.params["w"]["param"],
                    "grad": self.linear2.params["w"]["grad"],
                    "pregrad": np.zeros_like(self.linear2.params["w"]["param"])
                },
                "b": {
                    "param": self.linear2.params["b"]["param"],
                    "grad": self.linear2.params["b"]["grad"],
                    "pregrad": np.zeros_like(self.linear2.params["b"]["param"])
                },
            },
        ]

    def oracle(self,x,y):  # compute the loss, the scores and the gradients: x is an N_samples×N_channels×Height×Width tensor, y holds the class labels
        """ forward pass """
        conv_out = self.conv.forward(x)                 # convolution layer
        relu1_out = self.relu1.forward(conv_out)        # activation layer
        pool_out = self.pool.forward(relu1_out)         # pooling layer
        linear1_out = self.linear1.forward(pool_out)    # linear layer
        relu2_out = self.relu2.forward(linear1_out)     # activation layer
        linear2_out = self.linear2.forward(relu2_out)   # linear layer
        """ softmax cross entropy """
        fx,g,s = self.softmaxce(linear2_out,y)  # loss value, gradient w.r.t. the last layer's output, and scores
        """ backward pass """
        linear2_back = self.linear2.backward(g,relu2_out)
        self.update_param()
        relu2_back = self.relu2.backward(linear2_back,linear1_out)
        self.update_param()
        linear1_back = self.linear1.backward(relu2_back,pool_out)
        self.update_param()
        pool_back = self.pool.backward(linear1_back,relu1_out)
        self.update_param()
        relu1_back = self.relu1.backward(pool_back,conv_out)
        self.update_param()
        conv_back = self.conv.backward(relu1_back,x)
        self.update_param()
        return fx,s

    def oracle_time(self,x,y):  # same as oracle, but also times every layer: x is an N_samples×N_channels×Height×Width tensor, y holds the class labels
        """ forward pass """
        t = time.time()
        conv_out = self.conv.forward(x)  # convolution layer
        conv_out_time = time.time()-t
        t = time.time()
        relu1_out = self.relu1.forward(conv_out)  # activation layer
        relu1_out_time = time.time()-t
        t = time.time()
        pool_out = self.pool.forward(relu1_out)  # pooling layer
        pool_out_time = time.time()-t
        t = time.time()
        linear1_out = self.linear1.forward(pool_out)  # linear layer
        linear1_out_time = time.time()-t
        t = time.time()
        relu2_out = self.relu2.forward(linear1_out)  # activation layer
        relu2_out_time = time.time()-t
        t = time.time()
        linear2_out = self.linear2.forward(relu2_out)  # linear layer
        linear2_out_time = time.time()-t
        t = time.time()
        fx,g,s = self.softmaxce(linear2_out,y)  # softmax cross entropy: loss value, gradient and scores
        sf_out_time = time.time()-t
        """ backward pass """
        t = time.time()
        linear2_back = self.linear2.backward(g,relu2_out)
        self.update_param()
        linear2_back_time = time.time()-t
        t = time.time()
        relu2_back = self.relu2.backward(linear2_back,linear1_out)
        self.update_param()
        relu2_back_time = time.time()-t
        t = time.time()
        linear1_back = self.linear1.backward(relu2_back,pool_out)
        self.update_param()
        linear1_back_time = time.time()-t
        t = time.time()
        pool_back = self.pool.backward(linear1_back,relu1_out)
        self.update_param()
        pool_back_time = time.time()-t
        t = time.time()
        relu1_back = self.relu1.backward(pool_back,conv_out)
        self.update_param()
        relu1_back_time = time.time()-t
        t = time.time()
        conv_back = self.conv.backward(relu1_back,x)
        self.update_param()
        conv_back_time = time.time()-t
        timedict = dict(
            conv_out_time=conv_out_time,
            relu1_out_time=relu1_out_time,
            pool_out_time=pool_out_time,
            linear1_out_time=linear1_out_time,
            relu2_out_time=relu2_out_time,
            linear2_out_time=linear2_out_time,
            conv_back_time=conv_back_time,
            relu1_back_time=relu1_back_time,
            pool_back_time=pool_back_time,
            linear1_back_time=linear1_back_time,
            relu2_back_time=relu2_back_time,
            linear2_back_time=linear2_back_time,
        )
        return fx,s,timedict

    def score(self,x):  # class scores for prediction; besides oracle a separate scoring function is needed, which is useful when checking accuracy: x is the input tensor
        conv_out = self.conv(x)
        relu1_out = self.relu1(conv_out)
        pool_out = self.pool(relu1_out)
        linear1_out = self.linear1(pool_out)
        relu2_out = self.relu2(linear1_out)
        linear2_out = self.linear2(relu2_out)
        s = softmax(linear2_out)
        return s

    def update_param(self,):  # refresh the parameter and gradient references in param_groups
        self.param_groups[0]["w"]["param"] = self.conv.params["w"]["param"]
        self.param_groups[0]["w"]["grad"] = self.conv.params["w"]["grad"]
        self.param_groups[0]["b"]["param"] = self.conv.params["b"]["param"]
        self.param_groups[0]["b"]["grad"] = self.conv.params["b"]["grad"]
        self.param_groups[1]["w"]["param"] = self.linear1.params["w"]["param"]
        self.param_groups[1]["w"]["grad"] = self.linear1.params["w"]["grad"]
        self.param_groups[1]["b"]["param"] = self.linear1.params["b"]["param"]
        self.param_groups[1]["b"]["grad"] = self.linear1.params["b"]["grad"]
        self.param_groups[2]["w"]["param"] = self.linear2.params["w"]["param"]
        self.param_groups[2]["w"]["grad"] = self.linear2.params["w"]["grad"]
        self.param_groups[2]["b"]["param"] = self.linear2.params["b"]["param"]
        self.param_groups[2]["b"]["grad"] = self.linear2.params["b"]["grad"]

if __name__ == "__main__":
    cnn = CNN()
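The shape bookkeeping in the constructor follows the standard output-size rule for convolution and pooling. With the defaults used here (32×32 input, 7×7 kernel, stride 1, no padding, then 2×2 max pooling with stride 2) it reproduces the 32 → 26 → 13 progression mentioned in the comments:

$$H_{\text{out}} = \Big\lfloor \frac{H_{\text{in}} + 2P - K}{S} \Big\rfloor + 1, \qquad \frac{32 - 7}{1} + 1 = 26, \qquad \frac{26 - 2}{2} + 1 = 13.$$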

nn/layers.py

#-*- coding:UTF-8 -*-
import abc
import numpy as np
from numba import jit

class Layer(object):  # base class for network layers; it declares abstract methods, so it should not be instantiated directly
    def __init__(self,):  # constructor of the abstract class
        self.params = dict()  # parameter dictionary: keys are parameter names, values are dicts with the two fields "param" and "grad", holding the value and the gradient of that parameter
    @abc.abstractmethod
    def forward(self,x):  # evaluate the layer on the input features: x is the input, the return value f(x) is the output
        pass
    @abc.abstractmethod
    def backward(self,grad_in,x):  # compute the gradients and propagate them backwards; for later use, the parameter gradients are stored in the corresponding slots of self.params: grad_in is the gradient coming back from the next layer, x is the input features, the return value grad_x is the gradient w.r.t. x passed on to the previous layer
        pass
    def __call__(self,*args,**kwargs):  # make Layer objects callable
        return self.forward(*args,**kwargs)

class Conv(Layer):  # convolution layer; parameter w holds the filters, parameter b the biases
    def __init__(self,height,width,
                 in_channels=3,
                 out_channels=3,
                 stride=1,
                 padding=0,
                 init_scale=1e-2,
                 ):  # constructor
        super(Conv,self).__init__()
        """ constructor arguments """
        self.height = height  # kernel height
        self.width = width  # kernel width
        self.in_channels = in_channels  # number of input channels
        self.out_channels = out_channels  # number of output channels
        self.stride = stride  # stride of the kernel
        self.padding = padding  # amount of zero padding around the input
        self.init_scale = init_scale  # scale of the random initialization
        """ base-class parameters: grad is not set at construction time, because repeatedly calling backward on the same object could otherwise accumulate stale gradients """
        self.params["w"] = {  # the way I understand it: each filter has as many channels as the input, each filter produces one output channel, so the number of output channels equals the number of filters
            "param": self.init_scale*np.random.random((self.out_channels,
                self.in_channels,self.height,self.width)),
            "grad": None,
        }
        self.params["b"] = {  # biases
            "param": self.init_scale*np.random.random((self.out_channels,)),
            "grad": None,
        }
    @jit(nopython=False,parallel=True)  # note: with nopython=False numba falls back to object mode, so this decorator adds little and can simply be removed if numba complains
    def forward(self,x):  # forward pass: x is a 4-D tensor, N_samples×n_Channels×Height×Width; the return value out is the output of the convolution
        nSamples,nChannels,height,width = x.shape  # the four dimensions of the input tensor
        assert nChannels==self.in_channels  # sanity check
        outshape = (  # output shape
            nSamples,self.out_channels,  # number of samples, number of output channels
            int((2*self.padding+height-self.height)/self.stride)+1,  # output height
            int((2*self.padding+width-self.width)/self.stride)+1  # output width
        )
        out = np.zeros(outshape)  # initialize the output
        if self.padding:  # pad the input if required
            x_ = np.zeros((
                nSamples,nChannels,
                height+2*self.padding,
                width+2*self.padding
            ))
            x_[:,:,self.padding:-self.padding,
               self.padding:-self.padding] = x
        else: x_ = x.copy()
        for i in range(outshape[0]):  # loop over samples
            for j in range(outshape[1]):  # loop over output channels
                for k in range(outshape[2]):  # loop over output pixels
                    for l in range(outshape[3]):  # clumsy, but an iterator would not be much more convenient here
                        x1,y1 = k*self.stride,l*self.stride
                        x2,y2 = x1+self.height,y1+self.width
                        total = 0
                        for m in range(nChannels):  # loop over the input channels, accumulating the Hadamard products; the pitfall is that the reference results in check_gradient.py assume the first axis of w is the output channel and the second the input channel
                            t1 = x_[i,m,x1:x2,y1:y2]  # input window
                            t2 = self.params["w"]["param"][j,m,:,:]  # matching filter slice
                            total += np.nansum(t1*t2)
                        out[i,j,k,l] = total+self.params["b"]["param"][j]  # do not forget to add the bias
        return out
    @jit(nopython=False,parallel=True)
    def backward(self,grad_in,x):  # backward pass of the convolution layer: grad_in has the same shape as the output of forward
        self.params["w"]["grad"] = np.zeros((
            self.out_channels,self.in_channels,self.height,self.width))  # the gradients are (re)defined only here in backward
        self.params["b"]["grad"] = np.zeros((self.out_channels,))  # the gradients are (re)defined only here in backward
        nSamples,nChannels,height,width = x.shape
        outshape = (  # output shape, which is also the shape of grad_in
            nSamples,self.out_channels,  # number of samples, number of output channels
            int((2*self.padding+height-self.height)/self.stride)+1,  # output height
            int((2*self.padding+width-self.width)/self.stride)+1  # output width
        )
        assert outshape==grad_in.shape  # sanity check
        x_ = np.zeros((  # reconstruct the padded input
            nSamples,nChannels,
            height+2*self.padding,
            width+2*self.padding
        ))
        if self.padding:  # pad the input if required
            x_[:,:,self.padding:-self.padding,
               self.padding:-self.padding] = x
        else: x_ = x.copy()
        grad_x = np.zeros_like(x_)  # first compute the gradient w.r.t. the padded x, then cut out the central part
        """
        https://www.cnblogs.com/pinard/p/6494810.html
        The link above gives a very concise expression for the convolution backward pass with stride 1 and no padding;
        unfortunately the gradients are filled in entry by entry here in the most naive way.
        """
        for i in range(outshape[0]):  # loop over samples
            for j in range(outshape[1]):  # loop over output channels
                self.params["b"]["grad"][j] += np.nansum(grad_in[i,j,:,:])
                for k in range(outshape[2]):  # loop over output pixels
                    for l in range(outshape[3]):  # grad_in necessarily has shape outshape; differentiate the forward expression of each output entry
                        x1,y1 = k*self.stride,l*self.stride
                        x2,y2 = x1+self.height,y1+self.width
                        for m in range(nChannels):
                            grad_x[i,m,x1:x2,y1:y2] += grad_in[i,j,
                                k,l]*self.params["w"]["param"][j,m,:,:]
                            self.params["w"]["grad"][j,m,:,:] += grad_in[
                                i,j,k,l]*x_[i,m,x1:x2,y1:y2]
        grad_x = grad_x[:,:,self.padding:-self.padding,
            self.padding:-self.padding] if self.padding else grad_x  # keep only the non-padded part of grad_x as the gradient passed on
        return grad_x

class Linear(Layer):  # linear (fully connected) layer applying an affine transform: w is an n_in×n_out matrix, b a 1×n_out vector
    def __init__(self,in_features,out_features,
                 init_scale=1e-2
                 ):  # constructor
        super(Linear,self).__init__()
        """ constructor arguments """
        self.in_features = in_features  # number of input features
        self.out_features = out_features  # number of output features
        self.init_scale = init_scale  # scale of the random initialization
        """ base-class parameters """
        self.params["w"] = {  # weight matrix
            "param": self.init_scale*np.random.random((
                self.in_features,self.out_features)),
            "grad": None,  # again, initialized in backward
        }
        self.params["b"] = {  # bias term
            "param": self.init_scale*np.random.random((1,self.out_features)),
            "grad": None,
        }
    def forward(self,x):  # forward pass: x is the input of shape [n,d1,d2,...,dm]; the return value out is the output
        w = self.params["w"]["param"]
        b = self.params["b"]["param"]
        x_ = x.reshape(x.shape[0],-1)  # the fully connected layer flattens each sample into one vector
        out = np.dot(x_,w)+b  # the two operands have shapes n_Sample×n_out and 1×n_out; broadcasting adds b to every sample
        return out
    def backward(self,grad_in,x):  # backward pass of the linear layer: grad_in has the same shape as out in forward
        """
        out = np.dot(x,w) + b;
        x.shape = (n_Sample,in_features);
        w.shape = (in_features,out_features);
        out.shape = grad_in.shape = (n_Sample,out_features);
        b.shape = (1,out_features);
        the gradient w.r.t. b is summed over the samples
        """
        x_ = x.reshape(x.shape[0],-1)
        self.params["w"]["grad"] = np.dot(x_.T,grad_in)  # shapes worked out by matching dimensions
        self.params["b"]["grad"] = np.nansum(grad_in,axis=0)  # b touches every sample, so sum over the samples
        grad_x = np.dot(grad_in,self.params["w"]["param"].T)  # shapes worked out by matching dimensions
        return grad_x

class Relu(Layer):  # activation layer
    def __init__(self):  # constructor
        super(Relu,self).__init__()
    def forward(self,x):  # forward pass
        return np.maximum(x,0)  # keep positive values, zero out negative ones
    def backward(self,grad_in,x):  # backward pass
        return grad_in*(x>0)  # upstream gradient times the gradient of relu

class MaxPool(Layer):  # max-pooling layer
    def __init__(self,kernel_size,
                 stride=2,
                 padding=0
                 ):  # constructor
        super(MaxPool,self).__init__()
        """ constructor arguments """
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        """ base-class parameters """
        self.params = dict()  # the pooling layer has no parameters
    def forward(self,x):  # forward pass: x is a 4-D tensor, N_samples×n_Channels×Height×Width; the return value out is the pooled output
        nSamples,nChannels,height,width = x.shape  # the four dimensions of the input tensor
        outshape = (  # output shape
            nSamples,nChannels,  # number of samples, number of channels
            int((2*self.padding+height-self.kernel_size)/self.stride)+1,  # output height
            int((2*self.padding+width-self.kernel_size)/self.stride)+1  # output width
        )
        out = np.zeros(outshape)  # initialize the output
        if self.padding:  # pad the input if required
            x_ = np.zeros((
                nSamples,nChannels,
                height+2*self.padding,
                width+2*self.padding
            ))
            x_[:,:,self.padding:-self.padding,
               self.padding:-self.padding] = x
        else: x_ = x.copy()
        for i in range(outshape[0]):  # loop over samples
            for j in range(outshape[1]):  # loop over channels
                for k in range(outshape[2]):  # loop over output pixels
                    for l in range(outshape[3]):  # clumsy, but an iterator would not be much more convenient here
                        x1,y1 = k*self.stride,l*self.stride
                        x2,y2 = x1+self.kernel_size,y1+self.kernel_size
                        out[i,j,k,l] = np.nanmax(x_[i,j,x1:x2,y1:y2])  # take the maximum of the window
        return out
    def backward(self,grad_in,x):  # backward pass: grad_in has exactly the shape of the MaxPool output, so we need to find which region of x each element of grad_in corresponds to
        """
        Neither max pooling nor average pooling has learnable parameters;
        the error term only has to be passed back to the previous layer, no parameter gradients are computed.
        Max pooling: the error of the next layer is passed unchanged to the neuron that attained the maximum in the corresponding window; all other neurons get 0.
        Average pooling: the error of the next layer is split evenly over all neurons in the corresponding window.
        """
        nSamples,nChannels,height,width = x.shape  # shape of the input
        outshape = (  # output shape, which is also the shape of grad_in
            nSamples,nChannels,  # number of samples, number of channels
            int((2*self.padding+height-self.kernel_size)/self.stride)+1,  # output height
            int((2*self.padding+width-self.kernel_size)/self.stride)+1  # output width
        )
        grad_in_reshape = grad_in.reshape(outshape)
        x_ = np.zeros((  # reconstruct the padded input
            nSamples,nChannels,
            height+2*self.padding,
            width+2*self.padding
        ))
        if self.padding:  # pad the input if required
            x_[:,:,self.padding:-self.padding,
               self.padding:-self.padding] = x
        else: x_ = x.copy()
        grad_x = np.zeros_like(x_)  # first compute the gradient w.r.t. the padded x, then cut out the central part
        for i in range(outshape[0]):  # loop over samples
            for j in range(outshape[1]):  # loop over channels
                for k in range(outshape[2]):  # loop over output pixels
                    for l in range(outshape[3]):  # grad_in necessarily has shape outshape; differentiate the forward expression of each output entry
                        x1,y1 = k*self.stride,l*self.stride
                        x2,y2 = x1+self.kernel_size,y1+self.kernel_size
                        maxgrid = np.nanmax(x_[i,j,x1:x2,y1:y2])  # maximum of the window that produced this grad_in entry
                        grad_x[i,j,x1:x2,y1:y2] += grad_in_reshape[i,j,
                            k,l]*(x_[i,j,x1:x2,y1:y2]==maxgrid)
        grad_x = grad_x[:,:,self.padding:-self.padding,
            self.padding:-self.padding] if self.padding else grad_x  # keep only the non-padded part of grad_x as the gradient passed on
        return grad_x
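The quadruple loop in Conv.forward is easy to follow but very slow (which is why oracle_time exists in cnn.py). A common vectorization is im2col: unroll every input window into a row and turn the convolution into a single matrix multiplication. The sketch below is not part of the assignment code; conv_forward_im2col is my own helper name, and it is only a minimal illustration of the idea, assuming the same (out_channels, in_channels, kH, kW) weight layout and plain integer stride/padding as layers.py.

import numpy as np

def conv_forward_im2col(x, w, b, stride=1, padding=0):
    """Convolution via im2col: x is (N, C, H, W), w is (F, C, kH, kW), b is (F,)."""
    N, C, H, W = x.shape
    F, _, kH, kW = w.shape
    out_h = (H + 2*padding - kH)//stride + 1
    out_w = (W + 2*padding - kW)//stride + 1
    x_pad = np.pad(x, ((0,0),(0,0),(padding,padding),(padding,padding)))
    # gather every (C, kH, kW) window into one row of `cols`
    cols = np.empty((N*out_h*out_w, C*kH*kW))
    row = 0
    for n in range(N):
        for i in range(out_h):
            for j in range(out_w):
                patch = x_pad[n, :, i*stride:i*stride+kH, j*stride:j*stride+kW]
                cols[row] = patch.ravel()
                row += 1
    # one matrix multiplication replaces the loops over channels and kernel positions
    out = cols @ w.reshape(F, -1).T + b          # shape (N*out_h*out_w, F)
    return out.reshape(N, out_h, out_w, F).transpose(0, 3, 1, 2)

if __name__ == "__main__":
    # quick shape check against the sizes used in test_grad.py
    x = np.random.randn(2, 3, 16, 16)
    w = np.random.randn(3, 3, 3, 3)
    b = np.random.randn(3)
    print(conv_forward_im2col(x, w, b).shape)    # expected: (2, 3, 14, 14)

The same trick can be run in reverse for the backward pass (col2im), which is how most NumPy implementations of this kind of assignment get acceptable training times.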

nn/optimizer.py

#-*- coding:UTF-8 -*-
import abc
import numpy as np

class Optimizer(object):  # abstract optimizer class for first-order optimization of the network (SGD is first order, Newton would be second order): param_groups is the list of all model parameters, configs holds the optimization hyper-parameters
    def __init__(self,param_groups):  # constructor
        self.param_groups = param_groups  # param_groups is a list; each entry usually holds the two dictionaries w and b, which in turn contain the fields param, grad and pregrad
    @abc.abstractmethod
    def step(self):  # perform one update step
        pass

class SGD(Optimizer):  # stochastic gradient descent
    def __init__(self,param_groups,
                 lr=1e-2,
                 weight_decay=0.0,
                 momentum=0.0
                 ):  # constructor; see HW2 for the meaning of the arguments
        super(SGD,self).__init__(param_groups)
        self.configs = dict(  # the config dictionary holds the learning rate, weight decay and momentum
            lr=lr,
            weight_decay=weight_decay,
            momentum=momentum,
        )
    def step(self):  # perform one update step
        lr = self.configs["lr"]  # learning rate
        weight_decay = self.configs["weight_decay"]  # weight decay coefficient
        momentum = self.configs["momentum"]  # momentum coefficient
        """ classical momentum; Nesterov momentum would be more work to implement for little gain here """
        count = 0
        for group in self.param_groups:  # loop over the parameter groups
            count += 1
            for k,p in group.items():  # loop over the parameters of the group
                grad = p["grad"]  # current gradient
                pregrad = p["pregrad"]  # previous update direction
                if k=="w":  # weight matrices are subject to weight decay
                    v = momentum*pregrad - grad - weight_decay*p["param"]
                    p["param"] += lr*v
                else:
                    v = momentum*pregrad - grad
                    p["param"] += lr*v  # biases are not weight-decayed
                p["pregrad"] = v  # remember this update direction

nn/loss.py

#-*- coding:UTF-8 -*-
import numpy as np

def softmax(x):  # given a matrix, subtract the row maximum from each row and compute the row-wise softmax
    x_bar = x-np.nanmax(x,axis=1,keepdims=True)  # subtract the row maximum to avoid numerical overflow
    z = np.nansum(np.exp(x_bar),axis=1,keepdims=True)  # sum of exponentials per row
    return np.exp(x_bar)/z  # softmax output

class SoftmaxCE(object):  # cross-entropy loss computed on top of a softmax transform
    def __init__(self):  # constructor
        pass
    @staticmethod
    def __call__(x,y):  # x is an n_samples×n_features score matrix, y holds the integer class labels 0~9
        sf = softmax(x)  # softmax of x
        n = x.shape[0]  # number of samples
        sf_log = -np.log(sf[range(n),y])  # probability assigned to the correct class of each sample
        loss = np.mean(sf_log)  # average negative log-likelihood, i.e. the cross entropy
        g = sf.copy()
        g[range(n),y] -= 1  # the gradient of softmax cross entropy has a remarkably simple form
        g /= n
        return loss,g,sf  # return the loss, the gradient and the softmax scores
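The "remarkably simple form" used for g above is the standard softmax cross-entropy gradient. For one sample with scores x, softmax probabilities p and correct class y,

$$L = -\log p_y, \qquad \frac{\partial L}{\partial x_j} = p_j - \mathbf{1}[j = y],$$

so over a batch of n samples the gradient is the softmax output minus the one-hot labels, divided by n, which is exactly what g[range(n),y] -= 1 followed by g /= n computes.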

nn/utils.py

#-*- coding:UTF-8 -*-
import numpy as np

def accuracy(score,y):  # classification accuracy, a scalar between 0 and 1: score is an n×n_classes score matrix, y is the length-n label vector
    acc = np.mean(score.argmax(axis=1)==y)
    return acc

Also attached is a Keras reproduction of the architecture above, together with the test code used for tuning on top of it:

#-*- coding:UTF-8 -*-
import time
import numpy as np
import tensorflow as tf
from dataset import get_cifar10_data
from matplotlib import pyplot as plt
from keras.optimizers import SGD
from keras.utils import to_categorical
from keras.models import Sequential,Model
from keras.layers import Dense,Dropout,Activation,Flatten,Conv2D,MaxPooling2D,Embedding,LSTM,Input

def read_data(num_train=100):  # load the data
    data = get_cifar10_data()
    data = {  # repack data as a dictionary
        "X_train": data["X_train"][:num_train].transpose(0,2,3,1),  # move the channel axis to the last dimension
        "y_train": to_categorical(data["y_train"][:num_train],10),  # one-hot encode the labels
        "X_val": data["X_val"].transpose(0,2,3,1),  # move the channel axis to the last dimension
        "y_val": to_categorical(data["y_val"],10),  # one-hot encode the labels
    }
    return data

def model_1():  # conv+relu+maxpool+linear+relu+linear+softmax
    model = Sequential()
    model.add(Conv2D(3,(7,7),activation="relu",input_shape=(32,32,3)))
    model.add(MaxPooling2D(pool_size=(2,2)))
    model.add(Flatten())
    model.add(Dense(100,activation="relu"))
    model.add(Dense(10,activation="softmax"))
    sgd = SGD(lr=0.01,decay=1e-6,momentum=0.9,nesterov=False)
    model.compile(loss="categorical_crossentropy",optimizer=sgd,metrics=["accuracy"])
    model.summary()
    return model

def model_2():  # same, with a second convolution layer and a sigmoid hidden layer
    model = Sequential()
    model.add(Conv2D(3,(7,7),activation="relu",input_shape=(32,32,3)))
    model.add(MaxPooling2D(pool_size=(2,2)))
    model.add(Conv2D(3,(3,3),activation="relu",input_shape=(13,13,3)))
    model.add(Flatten())
    model.add(Dense(100,activation="sigmoid"))
    model.add(Dense(10,activation="softmax"))
    sgd = SGD(lr=0.01,decay=1e-6,momentum=0.9,nesterov=False)
    model.compile(loss="categorical_crossentropy",optimizer=sgd,metrics=["accuracy"])
    model.summary()
    return model

def model_3():  # same as model_1, with two larger sigmoid hidden layers
    model = Sequential()
    model.add(Conv2D(3,(7,7),activation="relu",input_shape=(32,32,3)))
    model.add(MaxPooling2D(pool_size=(2,2)))
    model.add(Flatten())
    model.add(Dense(256,activation="sigmoid"))
    model.add(Dense(128,activation="sigmoid"))
    model.add(Dense(10,activation="softmax"))
    sgd = SGD(lr=0.01,decay=1e-6,momentum=0.9,nesterov=False)
    model.compile(loss="categorical_crossentropy",optimizer=sgd,metrics=["accuracy"])
    model.summary()
    return model

def model_4():  # second convolution layer plus two larger hidden layers
    model = Sequential()
    model.add(Conv2D(3,(7,7),activation="relu",input_shape=(32,32,3)))
    model.add(MaxPooling2D(pool_size=(2,2)))
    model.add(Conv2D(3,(3,3),activation="relu",input_shape=(13,13,3)))
    model.add(Flatten())
    model.add(Dense(256,activation="sigmoid"))
    model.add(Dense(128,activation="sigmoid"))
    model.add(Dense(10,activation="softmax"))
    sgd = SGD(lr=0.01,decay=1e-6,momentum=0.9,nesterov=False)
    model.compile(loss="categorical_crossentropy",optimizer=sgd,metrics=["accuracy"])
    model.summary()
    return model

def model_adagrad():  # same architecture as model_3, trained with adagrad instead of SGD
    model = Sequential()
    model.add(Conv2D(3,(7,7),activation="relu",input_shape=(32,32,3)))
    model.add(MaxPooling2D(pool_size=(2,2)))
    model.add(Flatten())
    model.add(Dense(256,activation="sigmoid"))
    model.add(Dense(128,activation="sigmoid"))
    model.add(Dense(10,activation="softmax"))
    model.compile(loss="categorical_crossentropy",optimizer="adagrad",metrics=["accuracy"])
    model.summary()
    return model

def model_adam():  # same architecture as model_3, trained with adam instead of SGD
    model = Sequential()
    model.add(Conv2D(3,(7,7),activation="relu",input_shape=(32,32,3)))
    model.add(MaxPooling2D(pool_size=(2,2)))
    model.add(Flatten())
    model.add(Dense(256,activation="sigmoid"))
    model.add(Dense(128,activation="sigmoid"))
    model.add(Dense(10,activation="softmax"))
    model.compile(loss="categorical_crossentropy",optimizer="adam",metrics=["accuracy"])
    model.summary()
    return model

if __name__ == "__main__":
    """ P3Q1 """
    model = model_1()
    #model = model_2()
    #model = model_3()
    #model = model_4()
    data = read_data(49000)
    t = time.time()
    model.fit(data["X_train"],data["y_train"],batch_size=50,epochs=10)
    print("Training time: {}".format(time.time()-t))
    score = model.evaluate(data["X_val"],data["y_val"],batch_size=32)
    print(score)
    """ P3Q2 """
    #model = model_adagrad()
    model = model_adam()
    data = read_data(49000)
    t = time.time()
    model.fit(data["X_train"],data["y_train"],batch_size=50,epochs=10)
    print("Training time: {}".format(time.time()-t))
    score = model.evaluate(data["X_val"],data["y_val"],batch_size=32)
    print(score)

Sharing to learn; let's improve together!