
Learning the YOLOv3 TensorFlow code: "Gradient exploded! Please train again and you may need modify some parameters."

This post walks through the training script of a TensorFlow YOLOv3 implementation, from logging setup to the training loop. The error in the title is raised by the script itself (see the ArithmeticError near the end) whenever the running training loss becomes NaN.

 

# setting loggers
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(levelname)s %(message)s',
                    datefmt='%a, %d %b %Y %H:%M:%S',
                    filename=args.progress_log_path, filemode='w')

Set up logging: format defines the layout of each log record, datefmt the timestamp format, and filename/filemode write the records to args.progress_log_path, overwriting the file on each run.
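As a standalone sketch of what this configuration produces (progress.log is a stand-in for args.progress_log_path):

import logging

logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(levelname)s %(message)s',
                    datefmt='%a, %d %b %Y %H:%M:%S',
                    filename='progress.log', filemode='w')

# each call appends one record to progress.log, e.g.:
# Mon, 01 Jan 2024 12:00:00 INFO start to train
logging.info('start to train')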

# setting placeholders
is_training = tf.placeholder(tf.bool, name="phase_train")
handle_flag = tf.placeholder(tf.string, [], name='iterator_handle_flag')
# register the gpu nms operation here for the following evaluation scheme
pred_boxes_flag = tf.placeholder(tf.float32, [1, None, None])
pred_scores_flag = tf.placeholder(tf.float32, [1, None, None])
gpu_nms_op = gpu_nms(pred_boxes_flag, pred_scores_flag, args.class_num, args.nms_topk, args.score_threshold, args.nms_threshold)

tf.placeholder() reserves a slot in the graph at construction time: no input data is passed to the model yet, and only the necessary memory is allocated. Once a session is created and the model is run, data is bound to the placeholders through the feed_dict argument of sess.run().
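A minimal, self-contained sketch of the build-then-feed pattern with a toy graph (not part of the training script):

import tensorflow as tf

# build time: only shape/dtype are declared, no data yet
x = tf.placeholder(tf.float32, [None, 2], name='x')
y = tf.reduce_sum(x * 2.)

# run time: data is bound to the placeholder via feed_dict
with tf.Session() as sess:
    print(sess.run(y, feed_dict={x: [[1., 2.], [3., 4.]]}))  # 20.0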

gpu_nms_op builds the GPU non-maximum-suppression op once, with pred_boxes_flag/pred_scores_flag as its inputs; the evaluation helpers used later feed predictions through these placeholders instead of rebuilding the op each time.

 

##################
# tf.data pipeline
##################
train_dataset = tf.data.TextLineDataset(args.train_file)   # one annotation line per example
train_dataset = train_dataset.shuffle(args.train_img_cnt)  # shuffle over the whole file
train_dataset = train_dataset.batch(args.batch_size)       # group lines into batches
train_dataset = train_dataset.map(
    lambda x: tf.py_func(get_batch_data,
                         inp=[x, args.class_num, args.img_size, args.anchors, 'train', args.multi_scale_train, args.use_mix_up, args.letterbox_resize],
                         Tout=[tf.int64, tf.float32, tf.float32, tf.float32, tf.float32]),
    num_parallel_calls=args.num_threads
)  # parse each batch of lines into tensors
train_dataset = train_dataset.prefetch(args.prefetech_buffer)  # overlap preprocessing with training

val_dataset = tf.data.TextLineDataset(args.val_file)
val_dataset = val_dataset.batch(1)
val_dataset = val_dataset.map(
    lambda x: tf.py_func(get_batch_data,
                         inp=[x, args.class_num, args.img_size, args.anchors, 'val', False, False, args.letterbox_resize],
                         Tout=[tf.int64, tf.float32, tf.float32, tf.float32, tf.float32]),
    num_parallel_calls=args.num_threads
)
# NOTE: prefetch() returns a new dataset; the original line discarded the
# return value, so the val pipeline was never actually prefetched.
val_dataset = val_dataset.prefetch(args.prefetech_buffer)

# one reinitializable iterator serves both datasets
iterator = tf.data.Iterator.from_structure(train_dataset.output_types, train_dataset.output_shapes)
train_init_op = iterator.make_initializer(train_dataset)
val_init_op = iterator.make_initializer(val_dataset)
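Two pieces are worth isolating: tf.py_func, which runs an ordinary NumPy function inside the pipeline, and the reinitializable iterator, which lets a single get_next() op serve both datasets. A toy sketch of the same pattern with made-up data (not from the script):

import numpy as np
import tensorflow as tf

def parse(x):
    # runs as plain Python/NumPy, outside the TF graph
    return np.float32(x) * 2.

train_ds = tf.data.Dataset.from_tensor_slices(np.array([1., 2., 3., 4.], np.float32))
train_ds = train_ds.map(lambda x: tf.py_func(parse, inp=[x], Tout=tf.float32))
val_ds = tf.data.Dataset.from_tensor_slices(np.array([10., 20.], np.float32))
val_ds = val_ds.map(lambda x: tf.py_func(parse, inp=[x], Tout=tf.float32))

# one iterator, re-pointed at either dataset by running an init op
it = tf.data.Iterator.from_structure(train_ds.output_types, train_ds.output_shapes)
next_elem = it.get_next()
train_init = it.make_initializer(train_ds)
val_init = it.make_initializer(val_ds)

with tf.Session() as sess:
    sess.run(train_init)
    print(sess.run(next_elem))  # 2.0
    sess.run(val_init)
    print(sess.run(next_elem))  # 20.0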

get_batch_data() is the plain NumPy function wrapped by tf.py_func above; it decodes one batch of annotation lines into images and ground-truth tensors. Its box-encoding core, process_box(), follows:

 

def process_box(boxes, labels, img_size, class_num, anchors):
    '''
    Generate the y_true label, i.e. the ground truth feature_maps in 3 different scales.
    params:
        boxes: [N, 5] shape, float32 dtype. `x_min, y_min, x_max, y_max, mixup_weight`.
        labels: [N] shape, int64 dtype.
        class_num: int64 num.
        anchors: [9, 2] shape, float32 dtype.
    '''
    anchors_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]

    # convert boxes form:
    # shape: [N, 2]
    # (x_center, y_center)
    box_centers = (boxes[:, 0:2] + boxes[:, 2:4]) / 2
    # (width, height)
    box_sizes = boxes[:, 2:4] - boxes[:, 0:2]

    # [13, 13, 3, 5 + num_class + 1]: `5` means coords and confidence, `1` means mixup weight
    y_true_13 = np.zeros((img_size[1] // 32, img_size[0] // 32, 3, 6 + class_num), np.float32)
    y_true_26 = np.zeros((img_size[1] // 16, img_size[0] // 16, 3, 6 + class_num), np.float32)
    y_true_52 = np.zeros((img_size[1] // 8, img_size[0] // 8, 3, 6 + class_num), np.float32)

    # mixup weight defaults to 1.
    y_true_13[..., -1] = 1.
    y_true_26[..., -1] = 1.
    y_true_52[..., -1] = 1.

    y_true = [y_true_13, y_true_26, y_true_52]

    # [N, 1, 2]
    box_sizes = np.expand_dims(box_sizes, 1)
    # broadcast tricks
    # [N, 1, 2] & [9, 2] ==> [N, 9, 2]
    mins = np.maximum(-box_sizes / 2, -anchors / 2)
    maxs = np.minimum(box_sizes / 2, anchors / 2)
    # [N, 9, 2]
    whs = maxs - mins

    # IoU between each box and each anchor, both centered at the origin: [N, 9]
    iou = (whs[:, :, 0] * whs[:, :, 1]) / (
        box_sizes[:, :, 0] * box_sizes[:, :, 1] + anchors[:, 0] * anchors[:, 1]
        - whs[:, :, 0] * whs[:, :, 1] + 1e-10)
    # [N]
    best_match_idx = np.argmax(iou, axis=1)

    ratio_dict = {1.: 8., 2.: 16., 3.: 32.}
    for i, idx in enumerate(best_match_idx):
        # idx: 0,1,2 ==> feature map 2; 3,4,5 ==> 1; 6,7,8 ==> 0
        feature_map_group = 2 - idx // 3
        # scale ratio: 0,1,2 ==> 8; 3,4,5 ==> 16; 6,7,8 ==> 32
        ratio = ratio_dict[np.ceil((idx + 1) / 3.)]
        x = int(np.floor(box_centers[i, 0] / ratio))
        y = int(np.floor(box_centers[i, 1] / ratio))
        k = anchors_mask[feature_map_group].index(idx)
        c = labels[i]
        # print(feature_map_group, '|', y, x, k, c)

        y_true[feature_map_group][y, x, k, :2] = box_centers[i]
        y_true[feature_map_group][y, x, k, 2:4] = box_sizes[i]
        y_true[feature_map_group][y, x, k, 4] = 1.
        y_true[feature_map_group][y, x, k, 5 + c] = 1.
        y_true[feature_map_group][y, x, k, -1] = boxes[i, -1]

    return y_true_13, y_true_26, y_true_52

np.expand_dims inserts a new axis, here turning box_sizes from [N, 2] into [N, 1, 2] so that it broadcasts against the [9, 2] anchors to give [N, 9, 2].
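A worked NumPy example of the broadcast trick, with one made-up 16x32 box against two made-up anchors (the real code uses all 9):

import numpy as np

box_sizes = np.array([[16., 32.]])             # [N=1, 2]
anchors = np.array([[10., 20.], [40., 40.]])   # [2, 2] here instead of [9, 2]

box_sizes = np.expand_dims(box_sizes, 1)       # [1, 1, 2]
mins = np.maximum(-box_sizes / 2, -anchors / 2)  # broadcasts to [1, 2, 2]
maxs = np.minimum(box_sizes / 2, anchors / 2)
whs = maxs - mins                              # intersection w/h, boxes centered at origin

iou = (whs[..., 0] * whs[..., 1]) / (
    box_sizes[..., 0] * box_sizes[..., 1]
    + anchors[:, 0] * anchors[:, 1]
    - whs[..., 0] * whs[..., 1] + 1e-10)
print(iou)  # [[0.390625 0.32]] -> best-matching anchor is index 0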

# get an element from the chosen dataset iterator
image_ids, image, y_true_13, y_true_26, y_true_52 = iterator.get_next()
y_true = [y_true_13, y_true_26, y_true_52]

# tf.data pipeline will lose the data `static` shape, so we need to set it manually
image_ids.set_shape([None])
image.set_shape([None, None, None, 3])  # NHWC, 3-channel RGB images
for y in y_true:
    y.set_shape([None, None, None, None, None])

Because the tensors come through tf.py_func, their static shapes are lost; set_shape() manually restores the rank (and channel count) so downstream graph construction can perform shape checks.

##################
# Model definition
##################
yolo_model = yolov3(args.class_num, args.anchors, args.use_label_smooth, args.use_focal_loss, args.batch_norm_decay, args.weight_decay, use_static_shape=False)
with tf.variable_scope('yolov3'):
    pred_feature_maps = yolo_model.forward(image, is_training=is_training)
loss = yolo_model.compute_loss(pred_feature_maps, y_true)
y_pred = yolo_model.predict(pred_feature_maps)

l2_loss = tf.losses.get_regularization_loss()  # collect the L2 regularization loss

# setting restore parts and vars to update
saver_to_restore = tf.train.Saver(var_list=tf.contrib.framework.get_variables_to_restore(include=args.restore_include, exclude=args.restore_exclude))
update_vars = tf.contrib.framework.get_variables_to_restore(include=args.update_part)

# scalar summaries for TensorBoard
tf.summary.scalar('train_batch_statistics/total_loss', loss[0])
tf.summary.scalar('train_batch_statistics/loss_xy', loss[1])
tf.summary.scalar('train_batch_statistics/loss_wh', loss[2])
tf.summary.scalar('train_batch_statistics/loss_conf', loss[3])
tf.summary.scalar('train_batch_statistics/loss_class', loss[4])
tf.summary.scalar('train_batch_statistics/loss_l2', l2_loss)
tf.summary.scalar('train_batch_statistics/loss_ratio', l2_loss / loss[0])

# learning rate schedule (with optional linear warm-up)
global_step = tf.Variable(float(args.global_step), trainable=False, collections=[tf.GraphKeys.LOCAL_VARIABLES])
if args.use_warm_up:
    learning_rate = tf.cond(tf.less(global_step, args.train_batch_num * args.warm_up_epoch),
                            lambda: args.learning_rate_init * global_step / (args.train_batch_num * args.warm_up_epoch),
                            lambda: config_learning_rate(args, global_step - args.train_batch_num * args.warm_up_epoch))
else:
    learning_rate = config_learning_rate(args, global_step)
tf.summary.scalar('learning_rate', learning_rate)

# create savers
if not args.save_optimizer:
    saver_to_save = tf.train.Saver()
    saver_best = tf.train.Saver()

optimizer = config_optimizer(args.optimizer_name, learning_rate)

# set dependencies for BN ops
# https://www.cnblogs.com/reaptomorrow-flydream/p/9492191.html
# https://blog.csdn.net/NockinOnHeavensDoor/article/details/80632677
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    # train_op = optimizer.minimize(loss[0] + l2_loss, var_list=update_vars, global_step=global_step)
    # apply gradient clip to avoid gradient exploding
    gvs = optimizer.compute_gradients(loss[0] + l2_loss, var_list=update_vars)
    clip_grad_var = [gv if gv[0] is None else [
        tf.clip_by_norm(gv[0], 100.), gv[1]] for gv in gvs]
    train_op = optimizer.apply_gradients(clip_grad_var, global_step=global_step)  # global_step is incremented here

# create savers (this variant also keeps the optimizer state)
if args.save_optimizer:
    print('Saving optimizer parameters to checkpoint! Remember to restore the global_step in the fine-tuning afterwards.')
    saver_to_save = tf.train.Saver()
    saver_best = tf.train.Saver()
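The warm-up branch ramps the learning rate linearly from 0 to args.learning_rate_init over the first train_batch_num * warm_up_epoch steps, after which config_learning_rate() (the script's own schedule helper) takes over. A plain-Python sketch of the ramp with made-up numbers:

def warmup_lr(step, lr_init=1e-4, train_batch_num=1000, warm_up_epoch=3):
    # linear ramp for the first train_batch_num * warm_up_epoch steps
    warm_up_steps = train_batch_num * warm_up_epoch
    if step < warm_up_steps:
        return lr_init * step / warm_up_steps
    return lr_init  # afterwards the decay schedule applies

print(warmup_lr(0))     # 0.0
print(warmup_lr(1500))  # 5e-05 (halfway through warm-up)
print(warmup_lr(3000))  # 0.0001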

update_ops collects the ops in tf.GraphKeys.UPDATE_OPS, where batch-normalization layers register their moving-mean/variance updates; wrapping the training op in tf.control_dependencies(update_ops) forces those updates to run on every step. The gradient-clipping lines inside the same block are the defense behind the title's error message: every gradient is clipped to L2 norm 100 before apply_gradients, and if the loss still goes NaN the training loop below raises "Gradient exploded!". A sketch of the clipping pattern follows.
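A minimal sketch of the compute/clip/apply pattern on a toy loss (not the YOLOv3 graph), using the same clip norm of 100:

import tensorflow as tf

w = tf.Variable([3., 4.])
loss = 1000. * tf.reduce_sum(w * w)  # deliberately huge gradients

opt = tf.train.GradientDescentOptimizer(1e-3)
gvs = opt.compute_gradients(loss, var_list=[w])
# clip each gradient tensor to L2 norm 100, keeping the (grad, var) pairing
clipped = [gv if gv[0] is None else (tf.clip_by_norm(gv[0], 100.), gv[1]) for gv in gvs]
train_op = opt.apply_gradients(clipped)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    g_raw, g_clip = sess.run([gvs[0][0], clipped[0][0]])
    print(g_raw)   # [6000. 8000.] -> norm 10000
    print(g_clip)  # [60. 80.]     -> rescaled to norm 100
    sess.run(train_op)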

 

The session: everything above only builds the graph; the tf.Session block below actually runs training and per-epoch validation.

with tf.Session() as sess:
    sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
    # saver_to_restore.restore(sess, args.restore_path)
    merged = tf.summary.merge_all()
    writer = tf.summary.FileWriter(args.log_dir, sess.graph)

    print('\n----------- start to train -----------\n')

    best_mAP = -np.Inf

    for epoch in range(args.total_epoches):
        sess.run(train_init_op)
        loss_total, loss_xy, loss_wh, loss_conf, loss_class = AverageMeter(), AverageMeter(), AverageMeter(), AverageMeter(), AverageMeter()

        for i in trange(args.train_batch_num):
            _, summary, __y_pred, __y_true, __loss, __global_step, __lr = sess.run(
                [train_op, merged, y_pred, y_true, loss, global_step, learning_rate],
                feed_dict={is_training: True})

            writer.add_summary(summary, global_step=__global_step)

            loss_total.update(__loss[0], len(__y_pred[0]))
            loss_xy.update(__loss[1], len(__y_pred[0]))
            loss_wh.update(__loss[2], len(__y_pred[0]))
            loss_conf.update(__loss[3], len(__y_pred[0]))
            loss_class.update(__loss[4], len(__y_pred[0]))

            if __global_step % args.train_evaluation_step == 0 and __global_step > 0:
                # recall, precision = evaluate_on_cpu(__y_pred, __y_true, args.class_num, args.nms_topk, args.score_threshold, args.nms_threshold)
                recall, precision = evaluate_on_gpu(sess, gpu_nms_op, pred_boxes_flag, pred_scores_flag, __y_pred, __y_true, args.class_num, args.nms_threshold)

                info = "Epoch: {}, global_step: {} | loss: total: {:.2f}, xy: {:.2f}, wh: {:.2f}, conf: {:.2f}, class: {:.2f} | ".format(
                    epoch, int(__global_step), loss_total.average, loss_xy.average, loss_wh.average, loss_conf.average, loss_class.average)
                info += 'Last batch: rec: {:.3f}, prec: {:.3f} | lr: {:.5g}'.format(recall, precision, __lr)
                print(info)
                logging.info(info)

                writer.add_summary(make_summary('evaluation/train_batch_recall', recall), global_step=__global_step)
                writer.add_summary(make_summary('evaluation/train_batch_precision', precision), global_step=__global_step)

                if np.isnan(loss_total.average):
                    print('****' * 10)
                    raise ArithmeticError(
                        'Gradient exploded! Please train again and you may need modify some parameters.')

        # NOTE: this is just demo. You can set the conditions when to save the weights.
        if epoch % args.save_epoch == 0 and epoch > 0:
            if loss_total.average <= 2.:
                saver_to_save.save(sess, args.save_dir + 'model-epoch_{}_step_{}_loss_{:.4f}_lr_{:.5g}'.format(epoch, int(__global_step), loss_total.average, __lr))

        # switch to validation dataset for evaluation
        if epoch % args.val_evaluation_epoch == 0 and epoch >= args.warm_up_epoch:
            sess.run(val_init_op)

            val_loss_total, val_loss_xy, val_loss_wh, val_loss_conf, val_loss_class = \
                AverageMeter(), AverageMeter(), AverageMeter(), AverageMeter(), AverageMeter()

            val_preds = []

            for j in trange(args.val_img_cnt):
                __image_ids, __y_pred, __loss = sess.run([image_ids, y_pred, loss],
                                                         feed_dict={is_training: False})
                pred_content = get_preds_gpu(sess, gpu_nms_op, pred_boxes_flag, pred_scores_flag, __image_ids, __y_pred)
                val_preds.extend(pred_content)
                val_loss_total.update(__loss[0])
                val_loss_xy.update(__loss[1])
                val_loss_wh.update(__loss[2])
                val_loss_conf.update(__loss[3])
                val_loss_class.update(__loss[4])

            # calc mAP
            rec_total, prec_total, ap_total = AverageMeter(), AverageMeter(), AverageMeter()
            gt_dict = parse_gt_rec(args.val_file, args.img_size, args.letterbox_resize)

            info = '======> Epoch: {}, global_step: {}, lr: {:.6g} <======\n'.format(epoch, __global_step, __lr)

            for ii in range(args.class_num):
                npos, nd, rec, prec, ap = voc_eval(gt_dict, val_preds, ii, iou_thres=args.eval_threshold, use_07_metric=args.use_voc_07_metric)
                info += 'EVAL: Class {}: Recall: {:.4f}, Precision: {:.4f}, AP: {:.4f}\n'.format(ii, rec, prec, ap)
                rec_total.update(rec, npos)
                prec_total.update(prec, nd)
                ap_total.update(ap, 1)
                writer.add_summary(make_summary('evaluation/val_mAP' + 'class' + str(ii), ap), global_step=epoch)

            mAP = ap_total.average
            info += 'EVAL: Recall: {:.4f}, Precision: {:.4f}, mAP: {:.4f}\n'.format(rec_total.average, prec_total.average, mAP)
            info += 'EVAL: loss: total: {:.2f}, xy: {:.2f}, wh: {:.2f}, conf: {:.2f}, class: {:.2f}\n'.format(
                val_loss_total.average, val_loss_xy.average, val_loss_wh.average, val_loss_conf.average, val_loss_class.average)
            print(info)
            logging.info(info)

            if mAP > best_mAP:
                best_mAP = mAP
                saver_best.save(sess, args.save_dir + 'best_model_Epoch_{}_step_{}_mAP_{:.4f}_loss_{:.4f}_lr_{:.7g}'.format(
                    epoch, int(__global_step), best_mAP, val_loss_total.average, __lr))

            writer.add_summary(make_summary('evaluation/val_mAP', mAP), global_step=epoch)
            writer.add_summary(make_summary('evaluation/val_recall', rec_total.average), global_step=epoch)
            writer.add_summary(make_summary('evaluation/val_precision', prec_total.average), global_step=epoch)
            writer.add_summary(make_summary('validation_statistics/total_loss', val_loss_total.average), global_step=epoch)
            writer.add_summary(make_summary('validation_statistics/loss_xy', val_loss_xy.average), global_step=epoch)
            writer.add_summary(make_summary('validation_statistics/loss_wh', val_loss_wh.average), global_step=epoch)
            writer.add_summary(make_summary('validation_statistics/loss_conf', val_loss_conf.average), global_step=epoch)
            writer.add_summary(make_summary('validation_statistics/loss_class', val_loss_class.average), global_step=epoch)

evaluate_on_gpu computes last-batch recall and precision by pushing the predicted boxes and scores through the gpu_nms_op registered at the top of the script.
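Presumably the helper feeds the registered placeholders along these lines (a hedged sketch, not the actual implementation; boxes and scores stand for one image's predictions):

# hypothetical helper illustrating the "register once, feed many times" scheme;
# the real evaluate_on_gpu/get_preds_gpu live in the repo's eval utilities
def run_gpu_nms(sess, gpu_nms_op, pred_boxes_flag, pred_scores_flag, boxes, scores):
    # boxes: [1, num_pred, 4], scores: [1, num_pred, class_num]
    return sess.run(gpu_nms_op,
                    feed_dict={pred_boxes_flag: boxes,
                               pred_scores_flag: scores})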
