
[23-24 Fall Semester] NNDL Assignment 13: 3D Visualization of Optimization Algorithms

Implement the optimization algorithms in code and visualize them in 3D.

1. 3D visualization of the functions

Plot the 3D surfaces of x[0]^2 + x[1]^2 + x[1]^3 + x[0]*x[1] and x^2/20 + y^2.
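As a quick check, the partial derivatives of the first function, which the backward() method below hard-codes, are

    ∂f/∂x[0] = 2·x[0] + x[1],    ∂f/∂x[1] = 2·x[1] + 3·x[1]^2 + x[0].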

The code is as follows:

from mpl_toolkits.mplot3d import Axes3D
import numpy as np
from matplotlib import pyplot as plt
import torch
from nndl.op import Op

# Plot x[0]**2 + x[1]**2 + x[1]**3 + x[0]*x[1]
class OptimizedFunction3D(Op):
    def __init__(self):
        super(OptimizedFunction3D, self).__init__()
        self.params = {'x': 0}
        self.grads = {'x': 0}

    def forward(self, x):
        self.params['x'] = x
        return x[0] ** 2 + x[1] ** 2 + x[1] ** 3 + x[0] * x[1]

    def backward(self):
        x = self.params['x']
        gradient1 = 2 * x[0] + x[1]
        gradient2 = 2 * x[1] + 3 * x[1] ** 2 + x[0]
        grad1 = torch.Tensor([gradient1])
        grad2 = torch.Tensor([gradient2])
        self.grads['x'] = torch.cat([grad1, grad2])

# Use numpy.meshgrid to generate the x1, x2 grids; each axis covers [-3, 3) with step 0.1
x1 = np.arange(-3, 3, 0.1)
x2 = np.arange(-3, 3, 0.1)
x1, x2 = np.meshgrid(x1, x2)
init_x = torch.Tensor(np.array([x1, x2]))

model = OptimizedFunction3D()

# Draw the 3D surface of the function
fig = plt.figure()
ax = plt.axes(projection='3d')
X = init_x[0].numpy()
Y = init_x[1].numpy()
Z = model(init_x).numpy()
ax.plot_surface(X, Y, Z, cmap='rainbow')
ax.set_xlabel('x1')
ax.set_ylabel('x2')
ax.set_zlabel('f(x1,x2)')
plt.show()

# Plot x * x / 20 + y * y
def func(x, y):
    return x * x / 20 + y * y

def paint_loss_func():
    x = np.linspace(-50, 50, 100)  # plot x over [-50, 50], sampled at 100 evenly spaced points
    y = np.linspace(-50, 50, 100)  # plot y over [-50, 50], sampled at 100 evenly spaced points
    X, Y = np.meshgrid(x, y)
    Z = func(X, Y)

    fig = plt.figure()  # figsize=(10, 10)
    ax = Axes3D(fig)
    plt.xlabel('x')
    plt.ylabel('y')
    ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap='rainbow')
    plt.show()

paint_loss_func()

2. Add the optimization algorithms and plot the trajectories

Plot the 3D trajectory plots for x[0]^2 + x[1]^2 + x[1]^3 + x[0]*x[1] and x^2/20 + y^2.

(1) x[0]^2 + x[1]^2 + x[1]^3 + x[0]*x[1]

import torch
import numpy as np
import copy
from matplotlib import pyplot as plt
from matplotlib import animation
from itertools import zip_longest
from nndl.op import Op

class Optimizer(object):  # base class for optimizers
    def __init__(self, init_lr, model):
        """
        Initialize the optimizer.
        """
        # Initial learning rate used in the parameter updates
        self.init_lr = init_lr
        # The model whose parameters this optimizer updates
        self.model = model

    def step(self):
        """
        Define how parameters are updated at each iteration.
        """
        pass

class SimpleBatchGD(Optimizer):
    def __init__(self, init_lr, model):
        super(SimpleBatchGD, self).__init__(init_lr=init_lr, model=model)

    def step(self):
        # Parameter update
        if isinstance(self.model.params, dict):
            for key in self.model.params.keys():
                self.model.params[key] = self.model.params[key] - self.init_lr * self.model.grads[key]

class Adagrad(Optimizer):
    def __init__(self, init_lr, model, epsilon):
        """
        Initialize the Adagrad optimizer.
        Inputs:
        - init_lr: initial learning rate
        - model: the model; model.params stores the parameter values
        - epsilon: a very small constant added for numerical stability
        """
        super(Adagrad, self).__init__(init_lr=init_lr, model=model)
        self.G = {}
        for key in self.model.params.keys():
            self.G[key] = 0
        self.epsilon = epsilon

    def adagrad(self, x, gradient_x, G, init_lr):
        """
        Adagrad update; G accumulates the squared gradients of the parameter.
        """
        G += gradient_x ** 2
        x -= init_lr / torch.sqrt(G + self.epsilon) * gradient_x
        return x, G

    def step(self):
        """
        Parameter update.
        """
        for key in self.model.params.keys():
            self.model.params[key], self.G[key] = self.adagrad(self.model.params[key],
                                                               self.model.grads[key],
                                                               self.G[key],
                                                               self.init_lr)

class RMSprop(Optimizer):
    def __init__(self, init_lr, model, beta, epsilon):
        """
        Initialize the RMSprop optimizer.
        Inputs:
        - init_lr: initial learning rate
        - model: the model; model.params stores the parameter values
        - beta: decay rate
        - epsilon: constant added for numerical stability
        """
        super(RMSprop, self).__init__(init_lr=init_lr, model=model)
        self.G = {}
        for key in self.model.params.keys():
            self.G[key] = 0
        self.beta = beta
        self.epsilon = epsilon

    def rmsprop(self, x, gradient_x, G, init_lr):
        """
        RMSprop update; G is the exponentially weighted moving average of the squared gradients.
        """
        G = self.beta * G + (1 - self.beta) * gradient_x ** 2
        x -= init_lr / torch.sqrt(G + self.epsilon) * gradient_x
        return x, G

    def step(self):
        """Parameter update."""
        for key in self.model.params.keys():
            self.model.params[key], self.G[key] = self.rmsprop(self.model.params[key],
                                                               self.model.grads[key],
                                                               self.G[key],
                                                               self.init_lr)

class Momentum(Optimizer):
    def __init__(self, init_lr, model, rho):
        """
        Initialize the Momentum optimizer.
        Inputs:
        - init_lr: initial learning rate
        - model: the model; model.params stores the parameter values
        - rho: momentum factor
        """
        super(Momentum, self).__init__(init_lr=init_lr, model=model)
        self.delta_x = {}
        for key in self.model.params.keys():
            self.delta_x[key] = 0
        self.rho = rho

    def momentum(self, x, gradient_x, delta_x, init_lr):
        """
        Momentum update; delta_x is the exponentially weighted moving average of the gradients.
        """
        delta_x = self.rho * delta_x - init_lr * gradient_x
        x += delta_x
        return x, delta_x

    def step(self):
        """Parameter update."""
        for key in self.model.params.keys():
            self.model.params[key], self.delta_x[key] = self.momentum(self.model.params[key],
                                                                      self.model.grads[key],
                                                                      self.delta_x[key],
                                                                      self.init_lr)

class Nesterov(Optimizer):
    def __init__(self, init_lr, model, rho):
        """
        Initialize the Nesterov optimizer.
        Inputs:
        - init_lr: initial learning rate
        - model: the model; model.params stores the parameter values
        - rho: momentum factor
        """
        super(Nesterov, self).__init__(init_lr=init_lr, model=model)
        self.delta_x = {}
        for key in self.model.params.keys():
            self.delta_x[key] = 0
        self.rho = rho

    def nesterov(self, x, gradient_x, delta_x, init_lr):
        """
        Nesterov update; delta_x is the exponentially weighted moving average of the gradients.
        """
        delta_x_prev = delta_x
        delta_x = self.rho * delta_x - init_lr * gradient_x
        x += -self.rho * delta_x_prev + (1 + self.rho) * delta_x
        return x, delta_x

    def step(self):
        """Parameter update."""
        for key in self.model.params.keys():
            self.model.params[key], self.delta_x[key] = self.nesterov(self.model.params[key],
                                                                      self.model.grads[key],
                                                                      self.delta_x[key],
                                                                      self.init_lr)

class Adam(Optimizer):
    def __init__(self, init_lr, model, beta1, beta2, epsilon):
        """
        Initialize the Adam optimizer.
        Inputs:
        - init_lr: initial learning rate
        - model: the model; model.params stores the parameter values
        - beta1, beta2: decay rates of the moving averages
        - epsilon: constant added for numerical stability
        """
        super(Adam, self).__init__(init_lr=init_lr, model=model)
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.M, self.G = {}, {}
        for key in self.model.params.keys():
            self.M[key] = 0
            self.G[key] = 0
        self.t = 1

    def adam(self, x, gradient_x, G, M, t, init_lr):
        """
        Adam update.
        Inputs:
        - x: parameter
        - G: moving average of the squared gradients
        - M: moving average of the gradients
        - t: iteration counter
        - init_lr: initial learning rate
        """
        M = self.beta1 * M + (1 - self.beta1) * gradient_x
        G = self.beta2 * G + (1 - self.beta2) * gradient_x ** 2
        M_hat = M / (1 - self.beta1 ** t)
        G_hat = G / (1 - self.beta2 ** t)
        t += 1
        x -= init_lr / torch.sqrt(G_hat + self.epsilon) * M_hat
        return x, G, M, t

    def step(self):
        """Parameter update."""
        for key in self.model.params.keys():
            self.model.params[key], self.G[key], self.M[key], self.t = self.adam(self.model.params[key],
                                                                                 self.model.grads[key],
                                                                                 self.G[key],
                                                                                 self.M[key],
                                                                                 self.t,
                                                                                 self.init_lr)

class OptimizedFunction3D(Op):
    def __init__(self):
        super(OptimizedFunction3D, self).__init__()
        self.params = {'x': 0}
        self.grads = {'x': 0}

    def forward(self, x):
        self.params['x'] = x
        return x[0] ** 2 + x[1] ** 2 + x[1] ** 3 + x[0] * x[1]

    def backward(self):
        x = self.params['x']
        gradient1 = 2 * x[0] + x[1]
        gradient2 = 2 * x[1] + 3 * x[1] ** 2 + x[0]
        grad1 = torch.Tensor([gradient1])
        grad2 = torch.Tensor([gradient2])
        self.grads['x'] = torch.cat([grad1, grad2])

class Visualization3D(animation.FuncAnimation):
    """Animated plot that visualizes the parameter-update trajectories."""
    def __init__(self, *xy_values, z_values, labels=[], colors=[], fig, ax, interval=600, blit=True, **kwargs):
        """
        Initialize the 3D visualization class.
        Inputs:
        xy_values: values along the x and y dimensions
        z_values: values along the z dimension
        labels: label of each parameter-update trajectory
        colors: color of each trajectory
        interval: delay between frames (in milliseconds)
        blit: whether to optimize drawing
        """
        self.fig = fig
        self.ax = ax
        self.xy_values = xy_values
        self.z_values = z_values

        frames = max(xy_value.shape[0] for xy_value in xy_values)
        self.lines = [ax.plot([], [], [], label=label, color=color, lw=2)[0]
                      for _, label, color in zip_longest(xy_values, labels, colors)]

        super(Visualization3D, self).__init__(fig, self.animate, init_func=self.init_animation, frames=frames,
                                              interval=interval, blit=blit, **kwargs)

    def init_animation(self):
        # Initialize the line data
        for line in self.lines:
            line.set_data([], [])
        # line.set_3d_properties(np.asarray([]))  # present in the original program, but raises an error here. Edit by David 2022.12.4
        return self.lines

    def animate(self, i):
        # Feed the x, y, z values of the first i steps into each line
        for line, xy_value, z_value in zip(self.lines, self.xy_values, self.z_values):
            line.set_data(xy_value[:i, 0], xy_value[:i, 1])
            line.set_3d_properties(z_value[:i])
        return self.lines

def train_f(model, optimizer, x_init, epoch):
    x = x_init
    all_x = []
    losses = []
    for i in range(epoch):
        all_x.append(copy.deepcopy(x.numpy()))  # changed shallow copy to deep copy, otherwise the stored values get overwritten. Edit by David 2022.12.4
        loss = model(x)
        losses.append(loss)
        model.backward()
        optimizer.step()
        x = model.params['x']
    return torch.Tensor(np.array(all_x)), losses

# Build 6 models, each paired with a different optimizer
model1 = OptimizedFunction3D()
opt_gd = SimpleBatchGD(init_lr=0.01, model=model1)
model2 = OptimizedFunction3D()
opt_adagrad = Adagrad(init_lr=0.5, model=model2, epsilon=1e-7)
model3 = OptimizedFunction3D()
opt_rmsprop = RMSprop(init_lr=0.1, model=model3, beta=0.9, epsilon=1e-7)
model4 = OptimizedFunction3D()
opt_momentum = Momentum(init_lr=0.01, model=model4, rho=0.9)
model5 = OptimizedFunction3D()
opt_adam = Adam(init_lr=0.1, model=model5, beta1=0.9, beta2=0.99, epsilon=1e-7)
model6 = OptimizedFunction3D()
opt_Nesterov = Nesterov(init_lr=0.1, model=model6, rho=0.9)

models = [model1, model2, model3, model4, model5, model6]
opts = [opt_gd, opt_adagrad, opt_rmsprop, opt_momentum, opt_adam, opt_Nesterov]

x_all_opts = []
z_all_opts = []

# Train with each optimizer
for model, opt in zip(models, opts):
    x_init = torch.FloatTensor([2, 3])
    x_one_opt, z_one_opt = train_f(model, opt, x_init, 150)  # epochs
    # Save the parameter values
    x_all_opts.append(x_one_opt.numpy())
    z_all_opts.append(np.squeeze(z_one_opt))

# Use numpy.meshgrid to generate the x1, x2 grids; each axis covers [-3, 3) with step 0.1
x1 = np.arange(-3, 3, 0.1)
x2 = np.arange(-3, 3, 0.1)
x1, x2 = np.meshgrid(x1, x2)
init_x = torch.Tensor(np.array([x1, x2]))

model = OptimizedFunction3D()

# Draw the 3D surface of the function
fig = plt.figure()
ax = plt.axes(projection='3d')
X = init_x[0].numpy()
Y = init_x[1].numpy()
Z = model(init_x).numpy()  # changed to model(init_x).numpy() David 2022.12.4
ax.plot_surface(X, Y, Z, cmap='rainbow')
ax.set_xlabel('x1')
ax.set_ylabel('x2')
ax.set_zlabel('f(x1,x2)')

labels = ['SGD', 'AdaGrad', 'RMSprop', 'Momentum', 'Adam', 'Nesterov']
colors = ['#8B0000', '#0000FF', '#000000', '#008B00', '#FF0000']

animator = Visualization3D(*x_all_opts, z_values=z_all_opts, labels=labels, colors=colors, fig=fig, ax=ax)
ax.legend(loc='upper left')
plt.show()
animator.save('animation.gif')  # the saved view is partly occluded; could be improved further. Edit by David 2022.12.4

The results are as follows (animation not reproduced here):

(2) x^2/20 + y^2
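For this function the hand-derived gradients used in backward() below are

    ∂f/∂x = 2x/20 = x/10,    ∂f/∂y = 2y,

so the surface is a bowl that is 20 times flatter along x than along y, which is exactly what makes the optimizer comparison interesting.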

import torch
import numpy as np
import copy
from matplotlib import pyplot as plt
from matplotlib import animation
from itertools import zip_longest
from matplotlib import cm

class Op(object):
    def __init__(self):
        pass

    def __call__(self, inputs):
        return self.forward(inputs)

    # Input: tensor inputs
    # Output: tensor outputs
    def forward(self, inputs):
        # return outputs
        raise NotImplementedError

    # Input: gradient of the final output w.r.t. outputs (outputs_grads)
    # Output: gradient of the final output w.r.t. inputs (inputs_grads)
    def backward(self, outputs_grads):
        # return inputs_grads
        raise NotImplementedError

# Base class for optimizers
class Optimizer(object):
    def __init__(self, init_lr, model):
        """
        Initialize the optimizer.
        """
        # Initial learning rate used in the parameter updates
        self.init_lr = init_lr
        # The model whose parameters this optimizer updates
        self.model = model

    def step(self):
        """
        Define how parameters are updated at each iteration.
        """
        pass

# SGD
class SimpleBatchGD(Optimizer):
    def __init__(self, init_lr, model):
        super(SimpleBatchGD, self).__init__(init_lr=init_lr, model=model)

    def step(self):
        # Parameter update
        if isinstance(self.model.params, dict):
            for key in self.model.params.keys():
                self.model.params[key] = self.model.params[key] - self.init_lr * self.model.grads[key]

# Adagrad
class Adagrad(Optimizer):
    def __init__(self, init_lr, model, epsilon):
        """
        Initialize the Adagrad optimizer.
        Inputs:
        - init_lr: initial learning rate
        - model: the model; model.params stores the parameter values
        - epsilon: a very small constant added for numerical stability
        """
        super(Adagrad, self).__init__(init_lr=init_lr, model=model)
        self.G = {}
        for key in self.model.params.keys():
            self.G[key] = 0
        self.epsilon = epsilon

    def adagrad(self, x, gradient_x, G, init_lr):
        """
        Adagrad update; G accumulates the squared gradients of the parameter.
        """
        G += gradient_x ** 2
        x -= init_lr / torch.sqrt(G + self.epsilon) * gradient_x
        return x, G

    def step(self):
        """
        Parameter update.
        """
        for key in self.model.params.keys():
            self.model.params[key], self.G[key] = self.adagrad(self.model.params[key],
                                                               self.model.grads[key],
                                                               self.G[key],
                                                               self.init_lr)

# RMSprop
class RMSprop(Optimizer):
    def __init__(self, init_lr, model, beta, epsilon):
        """
        Initialize the RMSprop optimizer.
        Inputs:
        - init_lr: initial learning rate
        - model: the model; model.params stores the parameter values
        - beta: decay rate
        - epsilon: constant added for numerical stability
        """
        super(RMSprop, self).__init__(init_lr=init_lr, model=model)
        self.G = {}
        for key in self.model.params.keys():
            self.G[key] = 0
        self.beta = beta
        self.epsilon = epsilon

    def rmsprop(self, x, gradient_x, G, init_lr):
        """
        RMSprop update; G is the exponentially weighted moving average of the squared gradients.
        """
        G = self.beta * G + (1 - self.beta) * gradient_x ** 2
        x -= init_lr / torch.sqrt(G + self.epsilon) * gradient_x
        return x, G

    def step(self):
        """Parameter update."""
        for key in self.model.params.keys():
            self.model.params[key], self.G[key] = self.rmsprop(self.model.params[key],
                                                               self.model.grads[key],
                                                               self.G[key],
                                                               self.init_lr)

# Momentum
class Momentum(Optimizer):
    def __init__(self, init_lr, model, rho):
        """
        Initialize the Momentum optimizer.
        Inputs:
        - init_lr: initial learning rate
        - model: the model; model.params stores the parameter values
        - rho: momentum factor
        """
        super(Momentum, self).__init__(init_lr=init_lr, model=model)
        self.delta_x = {}
        for key in self.model.params.keys():
            self.delta_x[key] = 0
        self.rho = rho

    def momentum(self, x, gradient_x, delta_x, init_lr):
        """
        Momentum update; delta_x is the exponentially weighted moving average of the gradients.
        """
        delta_x = self.rho * delta_x - init_lr * gradient_x
        x += delta_x
        return x, delta_x

    def step(self):
        """Parameter update."""
        for key in self.model.params.keys():
            self.model.params[key], self.delta_x[key] = self.momentum(self.model.params[key],
                                                                      self.model.grads[key],
                                                                      self.delta_x[key],
                                                                      self.init_lr)

# Adam
class Adam(Optimizer):
    def __init__(self, init_lr, model, beta1, beta2, epsilon):
        """
        Initialize the Adam optimizer.
        Inputs:
        - init_lr: initial learning rate
        - model: the model; model.params stores the parameter values
        - beta1, beta2: decay rates of the moving averages
        - epsilon: constant added for numerical stability
        """
        super(Adam, self).__init__(init_lr=init_lr, model=model)
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.M, self.G = {}, {}
        for key in self.model.params.keys():
            self.M[key] = 0
            self.G[key] = 0
        self.t = 1

    def adam(self, x, gradient_x, G, M, t, init_lr):
        """
        Adam update.
        Inputs:
        - x: parameter
        - G: moving average of the squared gradients
        - M: moving average of the gradients
        - t: iteration counter
        - init_lr: initial learning rate
        """
        M = self.beta1 * M + (1 - self.beta1) * gradient_x
        G = self.beta2 * G + (1 - self.beta2) * gradient_x ** 2
        M_hat = M / (1 - self.beta1 ** t)
        G_hat = G / (1 - self.beta2 ** t)
        t += 1
        x -= init_lr / torch.sqrt(G_hat + self.epsilon) * M_hat
        return x, G, M, t

    def step(self):
        """Parameter update."""
        for key in self.model.params.keys():
            self.model.params[key], self.G[key], self.M[key], self.t = self.adam(self.model.params[key],
                                                                                 self.model.grads[key],
                                                                                 self.G[key],
                                                                                 self.M[key],
                                                                                 self.t,
                                                                                 self.init_lr)

# The 3D objective function -- implements the forward and backward passes
class OptimizedFunction3D(Op):
    def __init__(self):
        super(OptimizedFunction3D, self).__init__()
        self.params = {'x': 0}
        self.grads = {'x': 0}

    def forward(self, x):
        self.params['x'] = x
        return x[0] * x[0] / 20 + x[1] * x[1] / 1  # x[0] ** 2 + x[1] ** 2 + x[1] ** 3 + x[0] * x[1]

    def backward(self):
        x = self.params['x']
        gradient1 = 2 * x[0] / 20
        gradient2 = 2 * x[1] / 1
        grad1 = torch.Tensor([gradient1])
        grad2 = torch.Tensor([gradient2])
        self.grads['x'] = torch.cat([grad1, grad2])

# Animated 3D plot -- visualizes the parameter-update trajectories
class Visualization3D(animation.FuncAnimation):
    """Animated plot that visualizes the parameter-update trajectories."""
    def __init__(self, *xy_values, z_values, labels=[], colors=[], fig, ax, interval=100, blit=True, **kwargs):
        """
        Initialize the 3D visualization class.
        Inputs:
        xy_values: values along the x and y dimensions
        z_values: values along the z dimension
        labels: label of each parameter-update trajectory
        colors: color of each trajectory
        interval: delay between frames (in milliseconds)
        blit: whether to optimize drawing
        """
        self.fig = fig
        self.ax = ax
        self.xy_values = xy_values
        self.z_values = z_values

        frames = max(xy_value.shape[0] for xy_value in xy_values)
        self.lines = [ax.plot([], [], [], label=label, color=color, lw=2)[0]
                      for _, label, color in zip_longest(xy_values, labels, colors)]
        self.points = [ax.plot([], [], [], color=color, markeredgewidth=1, markeredgecolor='black', marker='o')[0]
                       for _, color in zip_longest(xy_values, colors)]
        # print(self.lines)
        super(Visualization3D, self).__init__(fig, self.animate, init_func=self.init_animation, frames=frames,
                                              interval=interval, blit=blit, **kwargs)

    def init_animation(self):
        # Initialize the line and point data
        for line in self.lines:
            line.set_data_3d([], [], [])
        for point in self.points:
            point.set_data_3d([], [], [])
        return self.points + self.lines

    # Animation step -- update the positions of the lines and points according to frame i
    def animate(self, i):
        # Feed the x, y, z values into the lines and points
        for line, xy_value, z_value in zip(self.lines, self.xy_values, self.z_values):
            line.set_data_3d(xy_value[:i, 0], xy_value[:i, 1], z_value[:i])
        for point, xy_value, z_value in zip(self.points, self.xy_values, self.z_values):
            point.set_data_3d(xy_value[i, 0], xy_value[i, 1], z_value[i])
        return self.points + self.lines

def train_f(model, optimizer, x_init, epoch):
    x = x_init
    all_x = []
    losses = []
    for i in range(epoch):
        all_x.append(copy.deepcopy(x.numpy()))  # changed shallow copy to deep copy, otherwise the stored values get overwritten. Edit by David 2022.12.4
        loss = model(x)
        losses.append(loss)
        model.backward()
        optimizer.step()
        x = model.params['x']
    return torch.Tensor(np.array(all_x)), losses

# Build 5 models, each paired with a different optimizer
model1 = OptimizedFunction3D()
opt_gd = SimpleBatchGD(init_lr=0.95, model=model1)
model2 = OptimizedFunction3D()
opt_adagrad = Adagrad(init_lr=1.5, model=model2, epsilon=1e-7)
model3 = OptimizedFunction3D()
opt_rmsprop = RMSprop(init_lr=0.05, model=model3, beta=0.9, epsilon=1e-7)
model4 = OptimizedFunction3D()
opt_momentum = Momentum(init_lr=0.1, model=model4, rho=0.9)
model5 = OptimizedFunction3D()
opt_adam = Adam(init_lr=0.3, model=model5, beta1=0.9, beta2=0.99, epsilon=1e-7)

models = [model1, model2, model3, model4, model5]
opts = [opt_gd, opt_adagrad, opt_rmsprop, opt_momentum, opt_adam]

x_all_opts = []
z_all_opts = []

# Train with each optimizer
for model, opt in zip(models, opts):
    x_init = torch.FloatTensor([-7, 2])
    x_one_opt, z_one_opt = train_f(model, opt, x_init, 100)  # epochs
    # Save the parameter values
    x_all_opts.append(x_one_opt.numpy())
    z_all_opts.append(np.squeeze(z_one_opt))

# Use numpy.meshgrid to generate the x1, x2 grids over the plotting range
x1 = np.arange(-10, 10, 0.01)
x2 = np.arange(-5, 5, 0.01)
x1, x2 = np.meshgrid(x1, x2)
init_x = torch.Tensor(np.array([x1, x2]))

model = OptimizedFunction3D()

# Draw the 3D surface of the function
fig = plt.figure()
ax = plt.axes(projection='3d')
X = init_x[0].numpy()
Y = init_x[1].numpy()
Z = model(init_x).numpy()  # changed to model(init_x).numpy() David 2022.12.4
surf = ax.plot_surface(X, Y, Z, edgecolor='grey', cmap=cm.coolwarm)
# fig.colorbar(surf, shrink=0.5, aspect=1)
# ax.set_zlim(-3, 2)
ax.set_xlabel('x1')
ax.set_ylabel('x2')
ax.set_zlabel('f(x1,x2)')

labels = ['SGD', 'AdaGrad', 'RMSprop', 'Momentum', 'Adam']
colors = ['#8B0000', '#0000FF', '#000000', '#008B00', '#FF0000']

animator = Visualization3D(*x_all_opts, z_values=z_all_opts, labels=labels, colors=colors, fig=fig, ax=ax)
ax.legend(loc='upper right')
plt.show()

Using the 3D animations, explain in your own words the strengths and weaknesses of each algorithm from the perspectives of trajectory, speed, and so on (see section 4 below).

3. Reproduce the classic CS231n animations


Animations that may help your intuitions about the learning process dynamics. 

Left: Contours of a loss surface and time evolution of different optimization algorithms. Notice the "overshooting" behavior of momentum-based methods, which make the optimization look like a ball rolling down the hill. 

Right: A visualization of a saddle point in the optimization landscape, where the curvature along different dimension has different signs (one dimension curves up and another down). Notice that SGD has a very hard time breaking symmetry and gets stuck on the top. Conversely, algorithms such as RMSprop will see very low gradients in the saddle direction. Due to the denominator term in the RMSprop update, this will increase the effective learning rate along this direction, helping RMSProp proceed. 

The code below adds the Nesterov algorithm (an improvement on the momentum method: before computing the velocity, the current position is first pre-updated, and the velocity is then computed at that look-ahead position).
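In update-rule form (a sketch using the same symbols as the code below: delta_x written as Δx is the velocity, rho is ρ, init_lr is η, g is the gradient at the current position), Momentum and the Nesterov variant implemented here differ as follows:

    Momentum:  Δx ← ρ·Δx − η·g,    x ← x + Δx
    Nesterov:  Δx_new ← ρ·Δx − η·g,    x ← x − ρ·Δx + (1 + ρ)·Δx_new

The second line is the rearranged Nesterov update that the nesterov() method uses; it corresponds to the common reformulation in which the gradient is effectively evaluated at a look-ahead position rather than at the current one.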

Reference: NNDL 作业13 优化算法3D可视化-CSDN博客

import torch
import numpy as np
import copy
from matplotlib import pyplot as plt
from matplotlib import animation
from itertools import zip_longest
from matplotlib import cm

class Op(object):
    def __init__(self):
        pass

    def __call__(self, inputs):
        return self.forward(inputs)

    # Input: tensor inputs
    # Output: tensor outputs
    def forward(self, inputs):
        # return outputs
        raise NotImplementedError

    # Input: gradient of the final output w.r.t. outputs (outputs_grads)
    # Output: gradient of the final output w.r.t. inputs (inputs_grads)
    def backward(self, outputs_grads):
        # return inputs_grads
        raise NotImplementedError

class Optimizer(object):  # base class for optimizers
    def __init__(self, init_lr, model):
        """
        Initialize the optimizer.
        """
        # Initial learning rate used in the parameter updates
        self.init_lr = init_lr
        # The model whose parameters this optimizer updates
        self.model = model

    def step(self):
        """
        Define how parameters are updated at each iteration.
        """
        pass

class SimpleBatchGD(Optimizer):
    def __init__(self, init_lr, model):
        super(SimpleBatchGD, self).__init__(init_lr=init_lr, model=model)

    def step(self):
        # Parameter update
        if isinstance(self.model.params, dict):
            for key in self.model.params.keys():
                self.model.params[key] = self.model.params[key] - self.init_lr * self.model.grads[key]

class Adagrad(Optimizer):
    def __init__(self, init_lr, model, epsilon):
        """
        Initialize the Adagrad optimizer.
        Inputs:
        - init_lr: initial learning rate
        - model: the model; model.params stores the parameter values
        - epsilon: a very small constant added for numerical stability
        """
        super(Adagrad, self).__init__(init_lr=init_lr, model=model)
        self.G = {}
        for key in self.model.params.keys():
            self.G[key] = 0
        self.epsilon = epsilon

    def adagrad(self, x, gradient_x, G, init_lr):
        """
        Adagrad update; G accumulates the squared gradients of the parameter.
        """
        G += gradient_x ** 2
        x -= init_lr / torch.sqrt(G + self.epsilon) * gradient_x
        return x, G

    def step(self):
        """
        Parameter update.
        """
        for key in self.model.params.keys():
            self.model.params[key], self.G[key] = self.adagrad(self.model.params[key],
                                                               self.model.grads[key],
                                                               self.G[key],
                                                               self.init_lr)

class RMSprop(Optimizer):
    def __init__(self, init_lr, model, beta, epsilon):
        """
        Initialize the RMSprop optimizer.
        Inputs:
        - init_lr: initial learning rate
        - model: the model; model.params stores the parameter values
        - beta: decay rate
        - epsilon: constant added for numerical stability
        """
        super(RMSprop, self).__init__(init_lr=init_lr, model=model)
        self.G = {}
        for key in self.model.params.keys():
            self.G[key] = 0
        self.beta = beta
        self.epsilon = epsilon

    def rmsprop(self, x, gradient_x, G, init_lr):
        """
        RMSprop update; G is the exponentially weighted moving average of the squared gradients.
        """
        G = self.beta * G + (1 - self.beta) * gradient_x ** 2
        x -= init_lr / torch.sqrt(G + self.epsilon) * gradient_x
        return x, G

    def step(self):
        """Parameter update."""
        for key in self.model.params.keys():
            self.model.params[key], self.G[key] = self.rmsprop(self.model.params[key],
                                                               self.model.grads[key],
                                                               self.G[key],
                                                               self.init_lr)

class Momentum(Optimizer):
    def __init__(self, init_lr, model, rho):
        """
        Initialize the Momentum optimizer.
        Inputs:
        - init_lr: initial learning rate
        - model: the model; model.params stores the parameter values
        - rho: momentum factor
        """
        super(Momentum, self).__init__(init_lr=init_lr, model=model)
        self.delta_x = {}
        for key in self.model.params.keys():
            self.delta_x[key] = 0
        self.rho = rho

    def momentum(self, x, gradient_x, delta_x, init_lr):
        """
        Momentum update; delta_x is the exponentially weighted moving average of the gradients.
        """
        delta_x = self.rho * delta_x - init_lr * gradient_x
        x += delta_x
        return x, delta_x

    def step(self):
        """Parameter update."""
        for key in self.model.params.keys():
            self.model.params[key], self.delta_x[key] = self.momentum(self.model.params[key],
                                                                      self.model.grads[key],
                                                                      self.delta_x[key],
                                                                      self.init_lr)

class Nesterov(Optimizer):
    def __init__(self, init_lr, model, rho):
        """
        Initialize the Nesterov optimizer.
        Inputs:
        - init_lr: initial learning rate
        - model: the model; model.params stores the parameter values
        - rho: momentum factor
        """
        super(Nesterov, self).__init__(init_lr=init_lr, model=model)
        self.delta_x = {}
        for key in self.model.params.keys():
            self.delta_x[key] = 0
        self.rho = rho

    def nesterov(self, x, gradient_x, delta_x, init_lr):
        """
        Nesterov update; delta_x is the exponentially weighted moving average of the gradients.
        """
        delta_x_prev = delta_x
        delta_x = self.rho * delta_x - init_lr * gradient_x
        x += -self.rho * delta_x_prev + (1 + self.rho) * delta_x
        return x, delta_x

    def step(self):
        """Parameter update."""
        for key in self.model.params.keys():
            self.model.params[key], self.delta_x[key] = self.nesterov(self.model.params[key],
                                                                      self.model.grads[key],
                                                                      self.delta_x[key],
                                                                      self.init_lr)

class Adam(Optimizer):
    def __init__(self, init_lr, model, beta1, beta2, epsilon):
        """
        Initialize the Adam optimizer.
        Inputs:
        - init_lr: initial learning rate
        - model: the model; model.params stores the parameter values
        - beta1, beta2: decay rates of the moving averages
        - epsilon: constant added for numerical stability
        """
        super(Adam, self).__init__(init_lr=init_lr, model=model)
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.M, self.G = {}, {}
        for key in self.model.params.keys():
            self.M[key] = 0
            self.G[key] = 0
        self.t = 1

    def adam(self, x, gradient_x, G, M, t, init_lr):
        """
        Adam update.
        Inputs:
        - x: parameter
        - G: moving average of the squared gradients
        - M: moving average of the gradients
        - t: iteration counter
        - init_lr: initial learning rate
        """
        M = self.beta1 * M + (1 - self.beta1) * gradient_x
        G = self.beta2 * G + (1 - self.beta2) * gradient_x ** 2
        M_hat = M / (1 - self.beta1 ** t)
        G_hat = G / (1 - self.beta2 ** t)
        t += 1
        x -= init_lr / torch.sqrt(G_hat + self.epsilon) * M_hat
        return x, G, M, t

    def step(self):
        """Parameter update."""
        for key in self.model.params.keys():
            self.model.params[key], self.G[key], self.M[key], self.t = self.adam(self.model.params[key],
                                                                                 self.model.grads[key],
                                                                                 self.G[key],
                                                                                 self.M[key],
                                                                                 self.t,
                                                                                 self.init_lr)

class OptimizedFunction3D(Op):
    def __init__(self):
        super(OptimizedFunction3D, self).__init__()
        self.params = {'x': 0}
        self.grads = {'x': 0}

    def forward(self, x):
        self.params['x'] = x
        return - x[0] * x[0] / 2 + x[1] * x[1] / 1  # x[0] ** 2 + x[1] ** 2 + x[1] ** 3 + x[0] * x[1]

    def backward(self):
        x = self.params['x']
        gradient1 = - 2 * x[0] / 2
        gradient2 = 2 * x[1] / 1
        grad1 = torch.Tensor([gradient1])
        grad2 = torch.Tensor([gradient2])
        self.grads['x'] = torch.cat([grad1, grad2])

class Visualization3D(animation.FuncAnimation):
    """Animated plot that visualizes the parameter-update trajectories."""
    def __init__(self, *xy_values, z_values, labels=[], colors=[], fig, ax, interval=100, blit=True, **kwargs):
        """
        Initialize the 3D visualization class.
        Inputs:
        xy_values: values along the x and y dimensions
        z_values: values along the z dimension
        labels: label of each parameter-update trajectory
        colors: color of each trajectory
        interval: delay between frames (in milliseconds)
        blit: whether to optimize drawing
        """
        self.fig = fig
        self.ax = ax
        self.xy_values = xy_values
        self.z_values = z_values

        frames = max(xy_value.shape[0] for xy_value in xy_values)
        self.lines = [ax.plot([], [], [], label=label, color=color, lw=2)[0]
                      for _, label, color in zip_longest(xy_values, labels, colors)]
        self.points = [ax.plot([], [], [], color=color, markeredgewidth=1, markeredgecolor='black', marker='o')[0]
                       for _, color in zip_longest(xy_values, colors)]
        # print(self.lines)
        super(Visualization3D, self).__init__(fig, self.animate, init_func=self.init_animation, frames=frames,
                                              interval=interval, blit=blit, **kwargs)

    def init_animation(self):
        # Initialize the line and point data
        for line in self.lines:
            line.set_data_3d([], [], [])
        for point in self.points:
            point.set_data_3d([], [], [])
        return self.points + self.lines

    def animate(self, i):
        # Feed the x, y, z values into the lines and points
        for line, xy_value, z_value in zip(self.lines, self.xy_values, self.z_values):
            line.set_data_3d(xy_value[:i, 0], xy_value[:i, 1], z_value[:i])
        for point, xy_value, z_value in zip(self.points, self.xy_values, self.z_values):
            point.set_data_3d(xy_value[i, 0], xy_value[i, 1], z_value[i])
        return self.points + self.lines

def train_f(model, optimizer, x_init, epoch):
    x = x_init
    all_x = []
    losses = []
    for i in range(epoch):
        all_x.append(copy.deepcopy(x.numpy()))  # changed shallow copy to deep copy, otherwise the stored values get overwritten. Edit by David 2022.12.4
        loss = model(x)
        losses.append(loss)
        model.backward()
        optimizer.step()
        x = model.params['x']
    return torch.Tensor(np.array(all_x)), losses

# Build 6 models, each paired with a different optimizer
model1 = OptimizedFunction3D()
opt_gd = SimpleBatchGD(init_lr=0.05, model=model1)
model2 = OptimizedFunction3D()
opt_adagrad = Adagrad(init_lr=0.05, model=model2, epsilon=1e-7)
model3 = OptimizedFunction3D()
opt_rmsprop = RMSprop(init_lr=0.05, model=model3, beta=0.9, epsilon=1e-7)
model4 = OptimizedFunction3D()
opt_momentum = Momentum(init_lr=0.05, model=model4, rho=0.9)
model5 = OptimizedFunction3D()
opt_adam = Adam(init_lr=0.05, model=model5, beta1=0.9, beta2=0.99, epsilon=1e-7)
model6 = OptimizedFunction3D()
opt_Nesterov = Nesterov(init_lr=0.1, model=model6, rho=0.9)

models = [model1, model2, model3, model4, model5, model6]
opts = [opt_gd, opt_adagrad, opt_rmsprop, opt_momentum, opt_adam, opt_Nesterov]

x_all_opts = []
z_all_opts = []

# Train with each optimizer
for model, opt in zip(models, opts):
    x_init = torch.FloatTensor([0.00001, 0.5])
    x_one_opt, z_one_opt = train_f(model, opt, x_init, 100)  # epochs
    # Save the parameter values
    x_all_opts.append(x_one_opt.numpy())
    z_all_opts.append(np.squeeze(z_one_opt))

# Use numpy.meshgrid to generate the x1, x2 grids over the plotting range
x1 = np.arange(-1, 2, 0.01)
x2 = np.arange(-1, 1, 0.05)
x1, x2 = np.meshgrid(x1, x2)
init_x = torch.Tensor(np.array([x1, x2]))

model = OptimizedFunction3D()

# Draw the 3D surface of the function
fig = plt.figure()
ax = plt.axes(projection='3d')
X = init_x[0].numpy()
Y = init_x[1].numpy()
Z = model(init_x).numpy()  # changed to model(init_x).numpy() David 2022.12.4
surf = ax.plot_surface(X, Y, Z, edgecolor='grey', cmap=cm.coolwarm)
# fig.colorbar(surf, shrink=0.5, aspect=1)
ax.set_zlim(-3, 2)
ax.set_xlabel('x1')
ax.set_ylabel('x2')
ax.set_zlabel('f(x1,x2)')

labels = ['SGD', 'AdaGrad', 'RMSprop', 'Momentum', 'Adam', 'Nesterov']
colors = ['#8B0000', '#0000FF', '#000000', '#008B00', '#FF0000']

animator = Visualization3D(*x_all_opts, z_values=z_all_opts, labels=labels, colors=colors, fig=fig, ax=ax)
ax.legend(loc='upper right')
plt.show()
# animator.save('teaser' + '.gif', writer='imagemagick', fps=10)  # the saved view is partly occluded; could be improved further. Edit by David 2022.12.4
# save() does not work well here; instead, a screen-recording tool can be used to make the gif: https://pc.qq.com/detail/13/detail_23913.html

The resulting figures are as follows (not reproduced here):

4. Using the 3D animations, explain in your own words the strengths and weaknesses of each algorithm from the perspectives of trajectory, speed, and so on

SGD (stochastic gradient descent)

Trajectory:

1. The trajectory zigzags: the updates change a lot along the y direction but very little along the x direction, and the randomness of the gradient causes oscillation.

Disadvantages:

1. Convergence is relatively slow.
2. In the animation, SGD gets stuck in a local minimum.
3. The learning rate has to be tuned by hand.
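For reference, the update used by SimpleBatchGD above is simply

    x ← x − η·∇f(x)

On an elongated bowl such as x^2/20 + y^2, the gradient component along y is much larger than along x, so a single fixed step size η overshoots in y while barely moving in x — that is the zigzag seen in the animation.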

AdaGrad

Trajectory:

1. The zigzagging is noticeably damped: the gradient along the y axis is large, so the updates start out large, but the per-coordinate scaling shrinks them and weakens further updates along y. The function value therefore moves toward the minimum efficiently.
2. Early on, AdaGrad is faster than RMSprop and SGD, sometimes even faster than Momentum and Nesterov. As training goes on, however, AdaGrad ends up the slowest (the accumulated squared gradients keep shrinking the effective learning rate).

Advantages:

1. Adaptive, per-coordinate learning rate.
2. The zigzagging is damped.

Disadvantages:

1. The effective learning rate keeps decaying, so progress slows in later stages and the method can effectively stop too early.
2. A global learning rate still has to be chosen by hand.
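This behaviour follows directly from the adagrad() update implemented above:

    G ← G + g^2,    x ← x − η/√(G + ε)·g

Because G only ever grows, the effective step η/√(G + ε) shrinks monotonically: large, quickly damped steps early on, and a near-standstill later.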

RMSprop

Trajectory:

1. RMSprop is slower than AdaGrad in the initial phase.
2. Over time AdaGrad slows down, while RMSprop keeps making steady progress.
3. RMSprop and AdaGrad follow essentially the same direction.

Advantage: it fixes AdaGrad's decaying-learning-rate problem by introducing a decay rate.

Disadvantage: one more hyperparameter to set.
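The only change relative to AdaGrad, as in the rmsprop() method above, is that the squared gradients are averaged with a decay rate instead of summed:

    G ← β·G + (1 − β)·g^2,    x ← x − η/√(G + ε)·g

Old gradients are gradually forgotten (β = 0.9 here), so the effective step no longer decays to zero; the price is the extra hyperparameter β. This is also why, in the CS231n saddle-point animation, RMSprop keeps a usable step size along the flat direction, where the gradient (and hence G) is tiny.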

Momentum

Trajectory:

1. Momentum is clearly faster than the other methods, similar to Nesterov.
2. At the start, Momentum oscillates in the wrong direction before correcting itself.
3. Momentum converges quickly to the local minimum and copes with the saddle point.

Advantage: fast convergence and fast training.

Disadvantage: poor directional control; it tends to overshoot before correcting.
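In the momentum() update the velocity is an exponentially weighted sum of past gradients:

    Δx ← ρ·Δx − η·g,    x ← x + Δx

With ρ = 0.9 and a roughly constant gradient, the velocity can build up to about 1/(1 − ρ) = 10 times a single step, which is where both the speed and the initial overshoot in the wrong direction come from.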

Nesterov

Trajectory:

As the animations show, Nesterov is both more accurate in direction and fast, better than Momentum.

Advantages: Nesterov is fast and its trajectory is accurate. As an improvement on Momentum, it is not only fast but also produces a smoother trajectory that heads toward the optimum with better direction.

Adam

Trajectory:

The early updates are large, and later the trajectory flattens out and keeps moving steadily toward the optimum.

Advantages:

It combines the ideas of RMSprop (adapting the learning rate) and Momentum (smoothing the gradient), so the trajectory is stable and convergence is fast.
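Written out, the adam() update above combines exactly those two ingredients, plus bias correction for the early iterations:

    M ← β1·M + (1 − β1)·g,    G ← β2·G + (1 − β2)·g^2
    M̂ = M/(1 − β1^t),    Ĝ = G/(1 − β2^t),    x ← x − η/√(Ĝ + ε)·M̂

M plays Momentum's role (a smoothed gradient direction) and G plays RMSprop's role (per-coordinate step scaling), which matches the stable yet fast trajectory in the animation.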

References:

【23-24 秋学期】NNDL 作业12 优化算法2D可视化-CSDN博客

【23-24 秋学期】NNDL 作业13 优化算法3D可视化-CSDN博客

NNDL 作业13 优化算法3D可视化-CSDN博客
