
Improving BERT's AdamWeightDecayOptimizer during fine-tuning

The Adam implementation shipped with the BERT source code is a simplified variant: it applies decoupled ("correct") L2 weight decay, but it drops the bias correction of the first- and second-moment estimates from the original Adam paper. Below is the simplified optimizer as it appears in the BERT repo, followed by a complete AdamWeightDecay implementation that restores the bias correction; the update rule both versions implement is summarized right after this paragraph.

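For reference, both listings below implement the following update for each parameter theta (a sketch in LaTeX; g_t is the gradient, lambda the weight_decay_rate, eta the learning_rate; the decay term is skipped for parameters matched by exclude_from_weight_decay). The only difference between the two versions is whether the step size eta_t carries the bias-correction factor from the Adam paper:

\begin{aligned}
m_t &= \beta_1 m_{t-1} + (1-\beta_1)\, g_t \\
v_t &= \beta_2 v_{t-1} + (1-\beta_2)\, g_t^2 \\
\theta_t &= \theta_{t-1} - \eta_t \left( \frac{m_t}{\sqrt{v_t} + \epsilon} + \lambda\, \theta_{t-1} \right) \\
\eta_t &= \eta \;\; \text{(BERT, simplified)}
\qquad\qquad
\eta_t = \eta \cdot \frac{\sqrt{1-\beta_2^{\,t}}}{1-\beta_1^{\,t}} \;\; \text{(complete version)}
\end{aligned}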
# AdamWeightDecayOptimizer from the BERT source code:
# Adam without bias correction, plus decoupled ("correct") L2 weight decay.
import re

import tensorflow as tf


class AdamWeightDecayOptimizer(tf.train.Optimizer):
  """A basic Adam optimizer that includes "correct" L2 weight decay."""

  def __init__(self,
               learning_rate,
               weight_decay_rate=0.0,
               beta_1=0.9,
               beta_2=0.999,
               epsilon=1e-6,
               exclude_from_weight_decay=None,
               name="AdamWeightDecayOptimizer"):
    """Constructs a AdamWeightDecayOptimizer."""
    super(AdamWeightDecayOptimizer, self).__init__(False, name)

    self.learning_rate = learning_rate
    self.weight_decay_rate = weight_decay_rate
    self.beta_1 = beta_1
    self.beta_2 = beta_2
    self.epsilon = epsilon
    self.exclude_from_weight_decay = exclude_from_weight_decay

  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
    """See base class."""
    assignments = []
    for (grad, param) in grads_and_vars:
      if grad is None or param is None:
        continue

      param_name = self._get_variable_name(param.name)

      m = tf.get_variable(
          name=param_name + "/adam_m",
          shape=param.shape.as_list(),
          dtype=tf.float32,
          trainable=False,
          initializer=tf.zeros_initializer())
      v = tf.get_variable(
          name=param_name + "/adam_v",
          shape=param.shape.as_list(),
          dtype=tf.float32,
          trainable=False,
          initializer=tf.zeros_initializer())

      # Standard Adam update (note: no bias correction of m and v).
      next_m = (
          tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
      next_v = (
          tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
                                                    tf.square(grad)))

      update = next_m / (tf.sqrt(next_v) + self.epsilon)

      # Just adding the square of the weights to the loss function is *not*
      # the correct way of using L2 regularization/weight decay with Adam,
      # since that will interact with the m and v parameters in strange ways.
      #
      # Instead we want to decay the weights in a manner that doesn't interact
      # with the m/v parameters. This is equivalent to adding the square
      # of the weights to the loss with plain (non-momentum) SGD.
      if self._do_use_weight_decay(param_name):
        update += self.weight_decay_rate * param

      update_with_lr = self.learning_rate * update

      next_param = param - update_with_lr

      assignments.extend(
          [param.assign(next_param),
           m.assign(next_m),
           v.assign(next_v)])
    return tf.group(*assignments, name=name)

  def _do_use_weight_decay(self, param_name):
    """Whether to use L2 weight decay for `param_name`."""
    if not self.weight_decay_rate:
      return False
    if self.exclude_from_weight_decay:
      for r in self.exclude_from_weight_decay:
        if re.search(r, param_name) is not None:
          return False
    return True

  def _get_variable_name(self, param_name):
    """Get the variable name from the tensor name."""
    m = re.match("^(.*):\\d+$", param_name)
    if m is not None:
      param_name = m.group(1)
    return param_name
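The listing above applies the raw learning_rate at every step. The complete version below instead folds Adam's bias correction into the step size, lr_t = lr * sqrt(1 - beta2**t) / (1 - beta1**t). A small plain-Python sketch (step counts chosen only for illustration, BERT's default beta values) of how much that factor matters early in training:

# Effective learning-rate multiplier applied by the complete version:
# lr_t = lr * sqrt(1 - beta2**t) / (1 - beta1**t).
beta1, beta2 = 0.9, 0.999

for t in [1, 10, 100, 1000, 10000]:
    factor = (1.0 - beta2 ** t) ** 0.5 / (1.0 - beta1 ** t)
    print("step %5d  bias-correction multiplier = %.3f" % (t, factor))

# The multiplier is well below 1 during the first few hundred steps and
# tends to 1 as t grows, so the simplified BERT optimizer takes
# proportionally larger steps at the start of fine-tuning.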
# Complete version corresponding to the original Adam paper: the same
# decoupled weight decay, but with bias correction tracked via beta power
# accumulators, implemented in the style of tf.train.AdamOptimizer
# (slots, _prepare / _apply_dense / _finish).
import re

import tensorflow as tf
from tensorflow.python.framework import ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import resource_variable_ops
from tensorflow.python.ops import state_ops


class AdamWeightDecayOptimizer(tf.train.Optimizer):
  """A basic Adam optimizer that includes "correct" L2 weight decay."""

  def __init__(self,
               learning_rate,
               weight_decay_rate=0.0,
               beta_1=0.9,
               beta_2=0.999,
               epsilon=1e-6,
               exclude_from_weight_decay=None,
               name="AdamWeightDecayOptimizer"):
    """Constructs a AdamWeightDecayOptimizer."""
    super(AdamWeightDecayOptimizer, self).__init__(False, name)

    self.learning_rate = learning_rate
    self.weight_decay_rate = weight_decay_rate
    self.beta_1 = beta_1
    self.beta_2 = beta_2
    self.epsilon = epsilon
    self.exclude_from_weight_decay = exclude_from_weight_decay

    # Tensor versions of the hyperparameters, filled in by _prepare().
    self.learning_rate_t = None
    self.weight_decay_rate_t = None
    self.beta_1_t = None
    self.beta_2_t = None
    self.epsilon_t = None

  def _get_beta_accumulators(self):
    with ops.init_scope():
      if tf.executing_eagerly():
        graph = None
      else:
        graph = ops.get_default_graph()
      return (self._get_non_slot_variable("beta1_power", graph=graph),
              self._get_non_slot_variable("beta2_power", graph=graph))

  def _prepare(self):
    self.learning_rate_t = ops.convert_to_tensor(
        self.learning_rate, name='learning_rate')
    self.weight_decay_rate_t = ops.convert_to_tensor(
        self.weight_decay_rate, name='weight_decay_rate')
    self.beta_1_t = ops.convert_to_tensor(self.beta_1, name='beta_1')
    self.beta_2_t = ops.convert_to_tensor(self.beta_2, name='beta_2')
    self.epsilon_t = ops.convert_to_tensor(self.epsilon, name='epsilon')

  def _create_slots(self, var_list):
    # Non-slot accumulators for beta1^t and beta2^t, plus m/v slots per var.
    first_var = min(var_list, key=lambda x: x.name)
    self._create_non_slot_variable(initial_value=self.beta_1,
                                   name="beta1_power",
                                   colocate_with=first_var)
    self._create_non_slot_variable(initial_value=self.beta_2,
                                   name="beta2_power",
                                   colocate_with=first_var)
    for v in var_list:
      self._zeros_slot(v, 'm', self._name)
      self._zeros_slot(v, 'v', self._name)

  def _apply_dense(self, grad, var):
    beta_1_t = math_ops.cast(self.beta_1_t, var.dtype.base_dtype)
    beta_2_t = math_ops.cast(self.beta_2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self.epsilon_t, var.dtype.base_dtype)
    weight_decay_rate_t = math_ops.cast(
        self.weight_decay_rate_t, var.dtype.base_dtype)

    m = self.get_slot(var, 'm')
    v = self.get_slot(var, 'v')
    beta1_power, beta2_power = self._get_beta_accumulators()
    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)

    # Fold the bias correction into the step size:
    # lr_t = lr * sqrt(1 - beta2^t) / (1 - beta1^t).
    learning_rate_t = math_ops.cast(self.learning_rate_t, var.dtype.base_dtype)
    learning_rate_t = (learning_rate_t *
                       math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))

    # Standard Adam update.
    next_m = (
        tf.multiply(beta_1_t, m) +
        tf.multiply(1.0 - beta_1_t, grad))
    next_v = (
        tf.multiply(beta_2_t, v) + tf.multiply(1.0 - beta_2_t,
                                               tf.square(grad)))

    update = next_m / (tf.sqrt(next_v) + epsilon_t)

    if self._do_use_weight_decay(var.name):
      update += weight_decay_rate_t * var

    update_with_lr = learning_rate_t * update

    next_param = var - update_with_lr

    return control_flow_ops.group(*[var.assign(next_param),
                                    m.assign(next_m),
                                    v.assign(next_v)])

  def _resource_apply_dense(self, grad, var):
    # The dense resource-variable update is identical to _apply_dense above.
    return self._apply_dense(grad, var)

  def _apply_sparse_shared(self, grad, var, indices, scatter_add):
    beta_1_t = math_ops.cast(self.beta_1_t, var.dtype.base_dtype)
    beta_2_t = math_ops.cast(self.beta_2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self.epsilon_t, var.dtype.base_dtype)
    weight_decay_rate_t = math_ops.cast(
        self.weight_decay_rate_t, var.dtype.base_dtype)

    m = self.get_slot(var, 'm')
    v = self.get_slot(var, 'v')
    beta1_power, beta2_power = self._get_beta_accumulators()
    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)

    # Bias-corrected step size, as in _apply_dense.
    learning_rate_t = math_ops.cast(self.learning_rate_t, var.dtype.base_dtype)
    learning_rate_t = (learning_rate_t *
                       math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))

    # m_t = beta1 * m + (1 - beta1) * g_t, updated only at `indices`.
    m_t = state_ops.assign(m, m * beta_1_t,
                           use_locking=self._use_locking)
    m_scaled_g_values = grad * (1 - beta_1_t)
    with ops.control_dependencies([m_t]):
      m_t = scatter_add(m, indices, m_scaled_g_values)

    # v_t = beta2 * v + (1 - beta2) * g_t^2, updated only at `indices`.
    v_scaled_g_values = (grad * grad) * (1 - beta_2_t)
    v_t = state_ops.assign(v, v * beta_2_t, use_locking=self._use_locking)
    with ops.control_dependencies([v_t]):
      v_t = scatter_add(v, indices, v_scaled_g_values)

    update = m_t / (math_ops.sqrt(v_t) + epsilon_t)

    if self._do_use_weight_decay(var.name):
      update += weight_decay_rate_t * var

    update_with_lr = learning_rate_t * update

    var_update = state_ops.assign_sub(var,
                                      update_with_lr,
                                      use_locking=self._use_locking)
    return control_flow_ops.group(*[var_update, m_t, v_t])

  def _apply_sparse(self, grad, var):
    return self._apply_sparse_shared(
        grad.values, var, grad.indices,
        lambda x, i, v: state_ops.scatter_add(  # pylint: disable=g-long-lambda
            x, i, v, use_locking=self._use_locking))

  def _resource_scatter_add(self, x, i, v):
    with ops.control_dependencies(
        [resource_variable_ops.resource_scatter_add(
            x.handle, i, v)]):
      return x.value()

  def _resource_apply_sparse(self, grad, var, indices):
    return self._apply_sparse_shared(
        grad, var, indices, self._resource_scatter_add)

  def _do_use_weight_decay(self, param_name):
    """Whether to use L2 weight decay for `param_name`."""
    if not self.weight_decay_rate:
      return False
    if self.exclude_from_weight_decay:
      for r in self.exclude_from_weight_decay:
        if re.search(r, param_name) is not None:
          return False
    return True

  def _finish(self, update_ops, name_scope):
    # Update the power accumulators beta1^t and beta2^t after every step.
    with ops.control_dependencies(update_ops):
      beta1_power, beta2_power = self._get_beta_accumulators()
      with ops.colocate_with(beta1_power):
        update_beta1 = beta1_power.assign(
            beta1_power * self.beta_1_t, use_locking=self._use_locking)
        update_beta2 = beta2_power.assign(
            beta2_power * self.beta_2_t, use_locking=self._use_locking)
    return control_flow_ops.group(*update_ops + [update_beta1, update_beta2],
                                  name=name_scope)
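A minimal usage sketch, assuming TF 1.x graph mode and that the complete class above is the AdamWeightDecayOptimizer in scope. The variable, loss, and hyperparameter values are purely illustrative; in real BERT fine-tuning the optimizer is constructed inside optimization.create_optimizer together with the warmup/decay learning-rate schedule and gradient clipping.

import tensorflow as tf

# Minimal TF 1.x sketch: fit one dummy weight with the optimizer defined
# above. Variable name, loss, and hyperparameters are illustrative only.
w = tf.get_variable("layer/kernel", shape=[4],
                    initializer=tf.ones_initializer())
loss = tf.reduce_sum(tf.square(w))

optimizer = AdamWeightDecayOptimizer(
    learning_rate=2e-5,
    weight_decay_rate=0.01,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-6,
    # BERT excludes LayerNorm and bias parameters from weight decay.
    exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

train_op = optimizer.minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(100):
        _, loss_val = sess.run([train_op, loss])
    print("final loss:", loss_val)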
