        记录MindSpore AI框架使用SSD目标检测算法对图像内容识别的过程、步骤和方法。包括环境准备、下载数据集、数据采样、数据集加载和预处理、构建模型、损失函数、模型训练、模型评估等。




Single Shot MultiBox Detector

使用Nvidia Titan X在VOC 2007测试集上


        达到74.3%mAP(mean Average Precision)以及59FPS;



超越当时最强的Faster RCNN(73.2%mAP)










检测特征层使用3 × 3卷积

















        检测网络通过3 × 3卷积得到输出





        SSD 通过卷积得到最后的边界框







        38 × 38

        19 × 19

        10 × 10

          5 × 5

          3 × 3

          1 × 1





m × n × p形状特征图采用3 × 3 × p小卷积核得到检测值





用数据集COCO 2017







  1. from download import download
  # Fetch the zipped dataset archive and unpack it relative to the current
  # directory. `path` is first the target directory and is then rebound to
  # whatever download() returns.
  dataset_url = "https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/notebook/datasets/ssd_datasets.zip"
  path = "./"
  path = download(dataset_url, path, kind="zip", replace=True)


  # Dataset locations used by the rest of the walkthrough.
  coco_root = "./datasets/"
  anno_json = "./datasets/annotations/instances_val2017.json"

  # Class names in label-index order; index 0 is the background class.
  train_cls = ['background', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
               'train', 'truck', 'boat', 'traffic light', 'fire hydrant',
               'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog',
               'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
               'giraffe', 'backpack', 'umbrella', 'handbag', 'tie',
               'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
               'kite', 'baseball bat', 'baseball glove', 'skateboard',
               'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
               'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
               'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
               'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed',
               'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
               'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
               'refrigerator', 'book', 'clock', 'vase', 'scissors',
               'teddy bear', 'hair drier', 'toothbrush']

  # Reverse lookup: class name -> label index.
  train_cls_dict = {name: index for index, name in enumerate(train_cls)}















  1. import cv2
  2. import numpy as np
  3. def _rand(a=0., b=1.):
  4. return np.random.rand() * (b - a) + a
  def intersect(box_a, box_b):
      """Compute the intersect of two sets of boxes.

      Args:
          box_a: 2-D array; columns 0-1 are the min corner and columns 2-3 the
              max corner of each box.
          box_b: 1-D box indexed the same way (elements 0-1 min, 2-3 max).

      Returns:
          1-D array with the overlap area between every row of `box_a` and
          `box_b` (0 where they do not overlap).
      """
      max_yx = np.minimum(box_a[:, 2:4], box_b[2:4])
      min_yx = np.maximum(box_a[:, :2], box_b[:2])
      # Negative extents mean "no overlap" and are clamped to zero.
      inter = np.clip((max_yx - min_yx), a_min=0, a_max=np.inf)
      return inter[:, 0] * inter[:, 1]
  def jaccard_numpy(box_a, box_b):
      """Compute the jaccard overlap of two sets of boxes.

      Args:
          box_a: 2-D array of boxes, one per row (corner layout as in `intersect`).
          box_b: a single 1-D box.

      Returns:
          1-D array of intersection-over-union values, one per row of `box_a`.
      """
      inter = intersect(box_a, box_b)
      area_a = ((box_a[:, 2] - box_a[:, 0]) *
                (box_a[:, 3] - box_a[:, 1]))
      area_b = ((box_b[2] - box_b[0]) *
                (box_b[3] - box_b[1]))
      union = area_a + area_b - inter
      return inter / union
  def random_sample_crop(image, boxes):
      """Crop images and boxes randomly.

      A minimum-IoU setting is drawn from {None, 0.1, 0.3, 0.5, 0.7, 0.9}.
      When None is drawn the inputs are returned untouched; otherwise up to 50
      random crop windows are tried and the first acceptable one is used.

      Args:
          image: array whose shape unpacks as (height, width, channels).
          boxes: 2-D array whose first four columns are compared against the
              crop window [top, left, bottom, right]; further columns are
              carried through unchanged.

      Returns:
          (cropped_image, kept_boxes) with box corners clipped to the window
          and shifted into its frame, or the original (image, boxes) when no
          window qualified.
      """
      height, width, _ = image.shape
      min_iou = np.random.choice([None, 0.1, 0.3, 0.5, 0.7, 0.9])

      if min_iou is None:
          return image, boxes

      for _ in range(50):
          image_t = image

          w = _rand(0.3, 1.0) * width
          h = _rand(0.3, 1.0) * height

          # aspect ratio constraint b/t .5 & 2
          if h / w < 0.5 or h / w > 2:
              continue

          left = _rand() * (width - w)
          top = _rand() * (height - h)

          rect = np.array([int(top), int(left), int(top + h), int(left + w)])
          overlap = jaccard_numpy(boxes, rect)

          # dropout some boxes: only boxes that touch the window are considered
          drop_mask = overlap > 0
          if not drop_mask.any():
              continue

          # Reject the window when the weakest overlap is below min_iou while
          # the strongest one exceeds min_iou + 0.2.
          if overlap[drop_mask].min() < min_iou and overlap[drop_mask].max() > (min_iou + 0.2):
              continue

          image_t = image_t[rect[0]:rect[2], rect[1]:rect[3], :]

          centers = (boxes[:, :2] + boxes[:, 2:4]) / 2.0

          m1 = (rect[0] < centers[:, 0]) * (rect[1] < centers[:, 1])
          m2 = (rect[2] > centers[:, 0]) * (rect[3] > centers[:, 1])

          # mask in that both m1 and m2 are true (box centre inside the window)
          mask = m1 * m2 * drop_mask

          # have any valid boxes? try again if not
          if not mask.any():
              continue

          # take only matching gt boxes
          boxes_t = boxes[mask, :].copy()

          boxes_t[:, :2] = np.maximum(boxes_t[:, :2], rect[:2])
          boxes_t[:, :2] -= rect[:2]
          boxes_t[:, 2:4] = np.minimum(boxes_t[:, 2:4], rect[2:4])
          boxes_t[:, 2:4] -= rect[:2]

          return image_t, boxes_t
      return image, boxes
  def ssd_bboxes_encode(boxes):
      """Labels anchors with ground truth inputs.

      Relies on the module-level anchor data `default_boxes`, `y1`, `x1`, `y2`,
      `x2`, `vol_anchors` and on `matching_threshold`.

      Args:
          boxes: iterable of ground-truth rows; elements 0-3 are the corner
              coordinates compared against the anchors' (y1, x1, y2, x2) and
              element 4 is the class label.

      Returns:
          bboxes: float32 array (num_anchors, 4) holding the encoded regression
              targets for matched anchors and zeros elsewhere.
          t_label: int32 array (num_anchors,) with the matched label, 0 if none.
          num_match: int32 array of shape (1,) with the number of matched anchors.
      """
      # Derive the anchor count from the anchor table instead of repeating 8732.
      num_anchors = default_boxes.shape[0]

      def jaccard_with_anchors(bbox):
          """Compute jaccard score a box and the anchors."""
          # Intersection bbox and volume.
          ymin = np.maximum(y1, bbox[0])
          xmin = np.maximum(x1, bbox[1])
          ymax = np.minimum(y2, bbox[2])
          xmax = np.minimum(x2, bbox[3])
          w = np.maximum(xmax - xmin, 0.)
          h = np.maximum(ymax - ymin, 0.)

          # Volumes.
          inter_vol = h * w
          union_vol = vol_anchors + (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) - inter_vol
          jaccard = inter_vol / union_vol
          return np.squeeze(jaccard)

      pre_scores = np.zeros((num_anchors,), dtype=np.float32)
      t_boxes = np.zeros((num_anchors, 4), dtype=np.float32)
      t_label = np.zeros((num_anchors,), dtype=np.int64)
      for bbox in boxes:
          label = int(bbox[4])
          scores = jaccard_with_anchors(bbox)
          # Force the best-overlapping anchor above any threshold so every
          # ground-truth box gets at least one anchor.
          idx = np.argmax(scores)
          scores[idx] = 2.0
          mask = (scores > matching_threshold)
          # Only take over anchors that this box overlaps better than earlier ones.
          mask = mask & (scores > pre_scores)
          pre_scores = np.maximum(pre_scores, scores * mask)
          t_label = mask * label + (1 - mask) * t_label
          for i in range(4):
              t_boxes[:, i] = mask * bbox[i] + (1 - mask) * t_boxes[:, i]

      index = np.nonzero(t_label)

      # Transform to tlbr.
      # NOTE(review): the arithmetic below turns corner boxes into
      # (centre_y, centre_x, height, width); the comment above is the original's.
      bboxes = np.zeros((num_anchors, 4), dtype=np.float32)
      bboxes[:, [0, 1]] = (t_boxes[:, [0, 1]] + t_boxes[:, [2, 3]]) / 2
      bboxes[:, [2, 3]] = t_boxes[:, [2, 3]] - t_boxes[:, [0, 1]]

      # Encode features.
      bboxes_t = bboxes[index]
      default_boxes_t = default_boxes[index]
      bboxes_t[:, :2] = (bboxes_t[:, :2] - default_boxes_t[:, :2]) / (default_boxes_t[:, 2:] * 0.1)
      # Clamp the size ratio away from zero before taking the logarithm.
      tmp = np.maximum(bboxes_t[:, 2:4] / default_boxes_t[:, 2:4], 0.000001)
      bboxes_t[:, 2:4] = np.log(tmp) / 0.2
      bboxes[index] = bboxes_t

      num_match = np.array([len(index[0])], dtype=np.int32)
      return bboxes, t_label.astype(np.int32), num_match
  def preprocess_fn(img_id, image, box, is_training):
      """Preprocess function for dataset.

      Args:
          img_id: image identifier, returned unchanged on the evaluation path.
          image: decoded image array whose shape unpacks as (H, W, C).
          box: annotation array; the first four columns are scaled by the image
              height/width and the array is then passed to `ssd_bboxes_encode`.
          is_training: selects the augmentation path (True) or the plain
              resize path (False).

      Returns:
          Training: (image, box, label, num_match).
          Evaluation: (img_id, image, original (height, width) as float32).
      """
      cv2.setNumThreads(2)

      def _to_three_channels(img):
          """Repeat a 2-D image along a new last axis so it has 3 channels."""
          # When the channels of image is 1
          if len(img.shape) == 2:
              img = np.expand_dims(img, axis=-1)
              img = np.concatenate([img, img, img], axis=-1)
          return img

      def _infer_data(image, input_shape):
          """Resize for evaluation and remember the original image size."""
          img_h, img_w, _ = image.shape
          input_h, input_w = input_shape

          image = cv2.resize(image, (input_w, input_h))
          image = _to_three_channels(image)

          return img_id, image, np.array((img_h, img_w), np.float32)

      def _data_aug(image, box, is_training, image_size=(300, 300)):
          """Random crop, resize, random horizontal flip, then encode the boxes."""
          if not is_training:
              return _infer_data(image, image_size)

          h, w = image_size

          # Random crop
          box = box.astype(np.float32)
          image, box = random_sample_crop(image, box)
          ih, iw, _ = image.shape

          # Resize image
          image = cv2.resize(image, (w, h))

          # Flip image or not
          flip = _rand() < .5
          if flip:
              image = cv2.flip(image, 1, dst=None)

          image = _to_three_channels(image)

          # Normalise box corners by the cropped image's height and width.
          box[:, [0, 2]] = box[:, [0, 2]] / ih
          box[:, [1, 3]] = box[:, [1, 3]] / iw

          if flip:
              # Mirror the second/fourth columns and swap them so min stays before max.
              box[:, [1, 3]] = 1 - box[:, [3, 1]]

          box, label, num_match = ssd_bboxes_encode(box)
          return image, box, label, num_match

      return _data_aug(image, box, is_training, image_size=[300, 300])


  1. from mindspore import Tensor
  2. from mindspore.dataset import MindDataset
  3. from mindspore.dataset.vision import Decode, HWC2CHW, Normalize, RandomColorAdjust
  def create_ssd_dataset(mindrecord_file, batch_size=32, device_num=1, rank=0,
                         is_training=True, num_parallel_workers=1, use_multiprocessing=True):
      """Create SSD dataset with MindDataset.

      Args:
          mindrecord_file: MindRecord file passed to `MindDataset`.
          batch_size: number of samples per batch; incomplete batches are dropped.
          device_num: passed as `num_shards`.
          rank: passed as `shard_id`.
          is_training: enables shuffling, the augmentation path of
              `preprocess_fn`, and random colour adjustment.
          num_parallel_workers: worker count for reading and for both map steps.
          use_multiprocessing: passed as `python_multiprocessing` to both map steps.

      Returns:
          The batched dataset. Columns are ["image", "box", "label", "num_match"]
          for training and ["img_id", "image", "image_shape"] otherwise.
      """
      dataset = MindDataset(mindrecord_file, columns_list=["img_id", "image", "annotation"], num_shards=device_num,
                            shard_id=rank, num_parallel_workers=num_parallel_workers, shuffle=is_training)

      decode = Decode()
      dataset = dataset.map(operations=decode, input_columns=["image"])

      change_swap_op = HWC2CHW()
      # Computed from random subset of ImageNet training images
      normalize_op = Normalize(mean=[0.485 * 255, 0.456 * 255, 0.406 * 255],
                               std=[0.229 * 255, 0.224 * 255, 0.225 * 255])
      color_adjust_op = RandomColorAdjust(brightness=0.4, contrast=0.4, saturation=0.4)
      compose_map_func = (lambda img_id, image, annotation: preprocess_fn(img_id, image, annotation, is_training))

      if is_training:
          output_columns = ["image", "box", "label", "num_match"]
          trans = [color_adjust_op, normalize_op, change_swap_op]
      else:
          output_columns = ["img_id", "image", "image_shape"]
          trans = [normalize_op, change_swap_op]

      # First map: crop/resize/encode in Python; second map: per-image transforms.
      dataset = dataset.map(operations=compose_map_func, input_columns=["img_id", "image", "annotation"],
                            output_columns=output_columns, python_multiprocessing=use_multiprocessing,
                            num_parallel_workers=num_parallel_workers)
      dataset = dataset.map(operations=trans, input_columns=["image"], python_multiprocessing=use_multiprocessing,
                            num_parallel_workers=num_parallel_workers)

      dataset = dataset.batch(batch_size, drop_remainder=True)
      return dataset



VGG16 Base Layer

Extra Feature Layer

Detection Layer



VGG16 Base Layer(Backbone Layer)





        fc6转换成3 × 3卷积层block6







        fc7转换成1 × 1卷积层block7

Extra Feature Layer





































                长和宽的计算公式:w_{k}^{a}=s_k\sqrt{a_r}, h_{k}^{a}=s_k/\sqrt{a_r}


                        计算公式:s_{k}^{'}=\sqrt{s_k s_{k+1}}














Detection Layer















  1. from mindspore import nn
  def _make_layer(channels):
      """Build a stack of 3x3 Conv2d + ReLU pairs.

      Args:
          channels: sequence of channel counts; the first entry is the input
              width and every following entry adds one conv/ReLU pair with that
              many output channels.

      Returns:
          nn.SequentialCell containing the layers in order.
      """
      in_channels = channels[0]
      layers = []
      for out_channels in channels[1:]:
          layers.append(nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=3))
          layers.append(nn.ReLU())
          in_channels = out_channels
      return nn.SequentialCell(layers)
  class Vgg16(nn.Cell):
      """VGG16 module.

      Five conv blocks (b1..b5), each followed by a max-pool (m1..m5).
      `construct` returns both the output of block 4 (taken before its pool)
      and the final output after m5.
      """

      def __init__(self):
          super(Vgg16, self).__init__()
          self.b1 = _make_layer([3, 64, 64])
          self.b2 = _make_layer([64, 128, 128])
          self.b3 = _make_layer([128, 256, 256, 256])
          self.b4 = _make_layer([256, 512, 512, 512])
          self.b5 = _make_layer([512, 512, 512, 512])

          self.m1 = nn.MaxPool2d(kernel_size=2, stride=2, pad_mode='SAME')
          self.m2 = nn.MaxPool2d(kernel_size=2, stride=2, pad_mode='SAME')
          self.m3 = nn.MaxPool2d(kernel_size=2, stride=2, pad_mode='SAME')
          self.m4 = nn.MaxPool2d(kernel_size=2, stride=2, pad_mode='SAME')
          # The last pool uses a 3x3 window with stride 1 instead of 2x2/stride 2.
          self.m5 = nn.MaxPool2d(kernel_size=3, stride=1, pad_mode='SAME')

      def construct(self, x):
          """Run the backbone and return (block4_features, final_features)."""
          # block1
          x = self.b1(x)
          x = self.m1(x)

          # block2
          x = self.b2(x)
          x = self.m2(x)

          # block3
          x = self.b3(x)
          x = self.m3(x)

          # block4
          x = self.b4(x)
          block4 = x
          x = self.m4(x)

          # block5
          x = self.b5(x)
          x = self.m5(x)

          return block4, x

  1. import mindspore as ms
  2. import mindspore.nn as nn
  3. import mindspore.ops as ops
  def _last_conv2d(in_channel, out_channel, kernel_size=3, stride=1, pad_mod='same', pad=0):
      """Build a depthwise conv -> BatchNorm -> ReLU6 -> 1x1 conv head.

      Args:
          in_channel: channels of the incoming feature map.
          out_channel: channels produced by the final 1x1 convolution.
          kernel_size: kernel size of the depthwise convolution.
          stride: stride of the depthwise convolution.
          pad_mod: pad mode of the depthwise convolution. Previously this
              argument was ignored and 'same' was always used; the default
              keeps that behaviour.
          pad: padding of the depthwise convolution.

      Returns:
          nn.SequentialCell with the four layers.
      """
      # group == in_channels and equal in/out widths make this conv depthwise.
      depthwise_conv = nn.Conv2d(in_channel, in_channel, kernel_size, stride, pad_mode=pad_mod,
                                 padding=pad, group=in_channel)
      conv = nn.Conv2d(in_channel, out_channel, kernel_size=1, stride=1, padding=0, pad_mode='same', has_bias=True)
      bn = nn.BatchNorm2d(in_channel, eps=1e-3, momentum=0.97,
                          gamma_init=1, beta_init=0, moving_mean_init=0, moving_var_init=1)

      return nn.SequentialCell([depthwise_conv, bn, nn.ReLU6(), conv])
  class FlattenConcat(nn.Cell):
      """FlattenConcat module.

      Moves the channel axis of every input to the end, flattens each input per
      sample, concatenates them and reshapes to (batch, num_ssd_boxes, -1).
      """

      def __init__(self):
          super(FlattenConcat, self).__init__()
          self.num_ssd_boxes = 8732

      def construct(self, inputs):
          """Merge a tuple of 4-D feature maps into one (batch, boxes, -1) tensor."""
          output = ()
          batch_size = ops.shape(inputs[0])[0]
          for x in inputs:
              # Axis 1 goes last before flattening, so values that shared a
              # spatial position stay adjacent.
              x = ops.transpose(x, (0, 2, 3, 1))
              output += (ops.reshape(x, (batch_size, -1)),)
          res = ops.concat(output, axis=1)
          return ops.reshape(res, (batch_size, self.num_ssd_boxes, -1))
  class MultiBox(nn.Cell):
      """
      Multibox conv layers. Each multibox layer contains class conf scores and localization predictions.

      One localization head and one classification head are created per
      feature map; their outputs are merged with `FlattenConcat`.
      """

      def __init__(self):
          super(MultiBox, self).__init__()
          num_classes = 81
          # Channel width of each input feature map and default boxes per location.
          out_channels = [512, 1024, 512, 256, 256, 256]
          num_default = [4, 6, 6, 6, 4, 4]

          loc_layers = []
          cls_layers = []
          for k, out_channel in enumerate(out_channels):
              # 4 regression values per default box.
              loc_layers += [_last_conv2d(out_channel, 4 * num_default[k],
                                          kernel_size=3, stride=1, pad_mod='same', pad=0)]
              # One score per class per default box.
              cls_layers += [_last_conv2d(out_channel, num_classes * num_default[k],
                                          kernel_size=3, stride=1, pad_mod='same', pad=0)]

          self.multi_loc_layers = nn.CellList(loc_layers)
          self.multi_cls_layers = nn.CellList(cls_layers)
          self.flatten_concat = FlattenConcat()

      def construct(self, inputs):
          """Apply head i to inputs[i] and return (locations, class_scores)."""
          loc_outputs = ()
          cls_outputs = ()
          for i in range(len(self.multi_loc_layers)):
              loc_outputs += (self.multi_loc_layers[i](inputs[i]),)
              cls_outputs += (self.multi_cls_layers[i](inputs[i]),)
          return self.flatten_concat(loc_outputs), self.flatten_concat(cls_outputs)
  class SSD300Vgg16(nn.Cell):
      """SSD300Vgg16 module.

      Chains the VGG16 backbone, blocks 6-7, the extra feature layers
      (blocks 8-11) and the MultiBox heads. `construct` returns
      (pred_loc, pred_label), both cast to float32; outside training mode the
      label scores are passed through a sigmoid.
      """

      def __init__(self):
          super(SSD300Vgg16, self).__init__()

          # VGG16 backbone: block1~5
          self.backbone = Vgg16()

          # SSD blocks: block6~7
          self.b6_1 = nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, padding=6, dilation=6, pad_mode='pad')
          self.b6_2 = nn.Dropout(p=0.5)

          self.b7_1 = nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=1)
          self.b7_2 = nn.Dropout(p=0.5)

          # Extra Feature Layers: block8~11
          self.b8_1 = nn.Conv2d(in_channels=1024, out_channels=256, kernel_size=1, padding=1, pad_mode='pad')
          self.b8_2 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=2, pad_mode='valid')

          self.b9_1 = nn.Conv2d(in_channels=512, out_channels=128, kernel_size=1, padding=1, pad_mode='pad')
          self.b9_2 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, pad_mode='valid')

          self.b10_1 = nn.Conv2d(in_channels=256, out_channels=128, kernel_size=1)
          self.b10_2 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, pad_mode='valid')

          self.b11_1 = nn.Conv2d(in_channels=256, out_channels=128, kernel_size=1)
          self.b11_2 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, pad_mode='valid')

          # boxes
          self.multi_box = MultiBox()

      def construct(self, x):
          """Forward pass; trailing comments give each layer's output channels."""
          # VGG16 backbone: block1~5
          block4, x = self.backbone(x)

          # SSD blocks: block6~7
          x = self.b6_1(x)  # 1024
          x = self.b6_2(x)

          x = self.b7_1(x)  # 1024
          x = self.b7_2(x)
          block7 = x

          # Extra Feature Layers: block8~11
          x = self.b8_1(x)  # 256
          x = self.b8_2(x)  # 512
          block8 = x

          x = self.b9_1(x)  # 128
          x = self.b9_2(x)  # 256
          block9 = x

          x = self.b10_1(x)  # 128
          x = self.b10_2(x)  # 256
          block10 = x

          x = self.b11_1(x)  # 128
          x = self.b11_2(x)  # 256
          block11 = x

          # boxes
          multi_feature = (block4, block7, block8, block9, block10, block11)
          pred_loc, pred_label = self.multi_box(multi_feature)
          if not self.training:
              pred_label = ops.sigmoid(pred_label)
          pred_loc = pred_loc.astype(ms.float32)
          pred_label = pred_label.astype(ms.float32)
          return pred_loc, pred_label



预选框与目标类别的置信度误差(confidence loss, conf)

位置误差(localization loss, loc)

              L\left ( x,c,l,g \right )=\frac{1}{N}\left ( L_{conf}\left ( x,c \right )+\alpha L_{loc}\left ( x,l,g \right ) \right )

N 先验框的正样本数量;
c 类别置信度预测值;
l 先验框对应边界框的位置预测值;
g ground truth的位置参数;
α confidence loss和location loss之间的调整比例,默认为1。



采用 Smooth L1 Loss

encode 之后的位置信息

smooth_{L_1}\left (x \right )=\begin{cases} 0.5x^2 & \text{ if } |x|<1 \\ \left | x \right |-0.5 & \text{ otherwise } \end{cases}



  def class_loss(logits, label, gamma=2.0, alpha=0.75):
      """Calculate category losses.

      Computes an element-wise sigmoid cross entropy against one-hot labels and
      weights it with a modulating factor (1 - p_t) ** gamma and an alpha
      balance term.

      Args:
          logits: raw class scores; the size of the last axis is the number of classes.
          label: integer class indices, one-hot encoded internally.
          gamma: exponent of the modulating factor (default 2.0, the value that
              was previously hard-coded).
          alpha: weight for positive entries; negatives get 1 - alpha
              (default 0.75, the value that was previously hard-coded).

      Returns:
          The unreduced loss, same shape as `logits`.
      """
      label = ops.one_hot(label, ops.shape(logits)[-1], Tensor(1.0, ms.float32), Tensor(0.0, ms.float32))
      weight = ops.ones_like(logits)
      pos_weight = ops.ones_like(logits)
      sigmoid_cross_entropy = ops.binary_cross_entropy_with_logits(logits, label, weight.astype(ms.float32), pos_weight.astype(ms.float32))
      sigmoid = ops.sigmoid(logits)
      label = label.astype(ms.float32)
      # p_t is the predicted probability of the true outcome for each entry.
      p_t = label * sigmoid + (1 - label) * (1 - sigmoid)
      modulating_factor = ops.pow(1 - p_t, gamma)
      alpha_weight_factor = label * alpha + (1 - label) * (1 - alpha)
      focal_loss = modulating_factor * alpha_weight_factor * sigmoid_cross_entropy
      return focal_loss












  1. import json
  2. from pycocotools.coco import COCO
  3. from pycocotools.cocoeval import COCOeval
  def apply_eval(eval_param_dict):
      """Run the network over a dataset and return the COCO metric.

      Args:
          eval_param_dict: dict with keys "net" (the inference network),
              "dataset" (yields 'img_id', 'image', 'image_shape') and
              "anno_json" (path to the COCO annotation file).

      Returns:
          The value returned by `COCOMetrics.get_metrics()`.
      """
      net = eval_param_dict["net"]
      net.set_train(False)
      ds = eval_param_dict["dataset"]
      anno_json = eval_param_dict["anno_json"]
      coco_metrics = COCOMetrics(anno_json=anno_json,
                                 classes=train_cls,
                                 num_classes=81,
                                 max_boxes=100,
                                 nms_threshold=0.6,
                                 min_score=0.1)
      for data in ds.create_dict_iterator(output_numpy=True, num_epochs=1):
          img_id = data['img_id']
          img_np = data['image']
          image_shape = data['image_shape']

          output = net(Tensor(img_np))
          # Convert each output once per batch rather than once per image.
          boxes_np = output[0].asnumpy()
          scores_np = output[1].asnumpy()

          for batch_idx in range(img_np.shape[0]):
              pred_batch = {
                  "boxes": boxes_np[batch_idx],
                  "box_scores": scores_np[batch_idx],
                  "img_id": int(np.squeeze(img_id[batch_idx])),
                  "image_shape": image_shape[batch_idx]
              }
              coco_metrics.update(pred_batch)
      eval_metrics = coco_metrics.get_metrics()
      return eval_metrics
  def apply_nms(all_boxes, all_scores, thres, max_boxes):
      """Apply NMS to bboxes.

      Args:
          all_boxes: 2-D array with columns [y1, x1, y2, x2].
          all_scores: 1-D array of scores, one per box.
          thres: boxes whose overlap with an already kept box exceeds this
              value are discarded.
          max_boxes: stop once this many boxes have been kept.

      Returns:
          List of indices into `all_boxes`, highest score first.
      """
      y1 = all_boxes[:, 0]
      x1 = all_boxes[:, 1]
      y2 = all_boxes[:, 2]
      x2 = all_boxes[:, 3]
      # Widths and heights include a +1 term, here and in the overlap below.
      areas = (x2 - x1 + 1) * (y2 - y1 + 1)

      order = all_scores.argsort()[::-1]
      keep = []

      while order.size > 0:
          i = order[0]
          keep.append(i)

          if len(keep) >= max_boxes:
              break

          rest = order[1:]
          xx1 = np.maximum(x1[i], x1[rest])
          yy1 = np.maximum(y1[i], y1[rest])
          xx2 = np.minimum(x2[i], x2[rest])
          yy2 = np.minimum(y2[i], y2[rest])

          w = np.maximum(0.0, xx2 - xx1 + 1)
          h = np.maximum(0.0, yy2 - yy1 + 1)
          inter = w * h

          ovr = inter / (areas[i] + areas[rest] - inter)

          inds = np.where(ovr <= thres)[0]

          # `inds` indexes `rest`, which starts one position into `order`.
          order = order[inds + 1]
      return keep
  class COCOMetrics:
      """Calculate mAP of predicted bboxes.

      Predictions are accumulated image by image with `update` and scored
      against the COCO annotations with `get_metrics`.
      """

      def __init__(self, anno_json, classes, num_classes, min_score, nms_threshold, max_boxes):
          """Load the ground truth and store the post-processing settings.

          Args:
              anno_json: path to the COCO annotation file.
              classes: class names in label-index order.
              num_classes: number of label indices; index 0 is skipped in `update`.
              min_score: scores at or below this value are ignored.
              nms_threshold: overlap threshold passed to `apply_nms`.
              max_boxes: per-class box limit passed to `apply_nms`.
          """
          self.num_classes = num_classes
          self.classes = classes
          self.min_score = min_score
          self.nms_threshold = nms_threshold
          self.max_boxes = max_boxes

          # label index -> class name
          self.val_cls_dict = {i: cls for i, cls in enumerate(classes)}
          self.coco_gt = COCO(anno_json)
          cat_ids = self.coco_gt.loadCats(self.coco_gt.getCatIds())
          # class name -> COCO category id
          self.class_dict = {cat['name']: cat['id'] for cat in cat_ids}

          self.predictions = []
          self.img_ids = []

      def update(self, batch):
          """Post-process one image's predictions and append them.

          Args:
              batch: dict with 'boxes' (per-box [y1, x1, y2, x2], scaled here by
                  the image height/width), 'box_scores' (per-box, per-class
                  scores), 'img_id' and 'image_shape' (height, width).
          """
          pred_boxes = batch['boxes']
          box_scores = batch['box_scores']
          img_id = batch['img_id']
          h, w = batch['image_shape']

          final_boxes = []
          final_label = []
          final_score = []
          self.img_ids.append(img_id)

          # Class 0 is skipped; every other class is filtered and suppressed on its own.
          for c in range(1, self.num_classes):
              class_box_scores = box_scores[:, c]
              score_mask = class_box_scores > self.min_score
              class_box_scores = class_box_scores[score_mask]
              class_boxes = pred_boxes[score_mask] * [h, w, h, w]

              if score_mask.any():
                  nms_index = apply_nms(class_boxes, class_box_scores, self.nms_threshold, self.max_boxes)
                  class_boxes = class_boxes[nms_index]
                  class_box_scores = class_box_scores[nms_index]

                  final_boxes += class_boxes.tolist()
                  final_score += class_box_scores.tolist()
                  final_label += [self.class_dict[self.val_cls_dict[c]]] * len(class_box_scores)

          for loc, label, score in zip(final_boxes, final_label, final_score):
              res = {}
              res['image_id'] = img_id
              # [y1, x1, y2, x2] -> [x, y, width, height]
              res['bbox'] = [loc[1], loc[0], loc[3] - loc[1], loc[2] - loc[0]]
              res['score'] = score
              res['category_id'] = label
              self.predictions.append(res)

      def get_metrics(self):
          """Write the predictions to predictions.json, evaluate, return stats[0].

          Returns 0.0 without evaluating when nothing was predicted, because
          loading an empty result list fails inside pycocotools.
          """
          if not self.predictions:
              return 0.0

          with open('predictions.json', 'w') as f:
              json.dump(self.predictions, f)

          coco_dt = self.coco_gt.loadRes('predictions.json')
          E = COCOeval(self.coco_gt, coco_dt, iouType='bbox')
          E.params.imgIds = self.img_ids
          E.evaluate()
          E.accumulate()
          E.summarize()
          return E.stats[0]
  class SsdInferWithDecoder(nn.Cell):
      """
      SSD Infer wrapper to decode the bbox locations.

      Loads a checkpoint into the wrapped network and turns its location
      offsets into corner boxes clipped to [0, 1].
      """

      def __init__(self, network, default_boxes, ckpt_path):
          """
          Args:
              network: the detection network to wrap.
              default_boxes: default boxes; the last axis is split into its
                  first two entries (centre) and the remaining entries (size).
              ckpt_path: checkpoint file loaded into `network`.
          """
          super(SsdInferWithDecoder, self).__init__()
          param_dict = ms.load_checkpoint(ckpt_path)
          ms.load_param_into_net(network, param_dict)
          self.network = network
          self.default_boxes = default_boxes
          # The same factors (0.1 and 0.2) divide the targets in ssd_bboxes_encode.
          self.prior_scaling_xy = 0.1
          self.prior_scaling_wh = 0.2

      def construct(self, x):
          """Return (decoded boxes as two corner pairs clipped to [0, 1], pred_label)."""
          pred_loc, pred_label = self.network(x)

          default_bbox_xy = self.default_boxes[..., :2]
          default_bbox_wh = self.default_boxes[..., 2:]
          # Undo the encoding: centre offsets are scaled by the default size,
          # sizes are log-ratios.
          pred_xy = pred_loc[..., :2] * self.prior_scaling_xy * default_bbox_wh + default_bbox_xy
          pred_wh = ops.exp(pred_loc[..., 2:] * self.prior_scaling_wh) * default_bbox_wh

          pred_xy_0 = pred_xy - pred_wh / 2.0
          pred_xy_1 = pred_xy + pred_wh / 2.0
          pred_xy = ops.concat((pred_xy_0, pred_xy_1), -1)
          pred_xy = ops.maximum(pred_xy, 0)
          pred_xy = ops.minimum(pred_xy, 1)
          return pred_xy, pred_label



确定训练图片中ground truth(真实目标)匹配的先验框


SSD先验框与ground truth的匹配原则主要有两点:


       正样本:图片中每个ground truth IOU最大的先验框为匹配先验框

       负样本:未能与任何ground truth匹配的先验框,只能与背景匹配










训练中 prior boxes 和 ground truth boxes 匹配的基本思路:

每个prior box回归到ground truth box





















  1. import math
  2. import itertools as it
  3. from mindspore import set_seed
  class GeneratDefaultBoxes():
      """
      Generate Default boxes for SSD.

      Boxes are produced feature map by feature map, then row by row, column by
      column, and finally anchor size by anchor size.
      `self.default_boxes` is a float32 array of shape [num_boxes, 4]; the last dimension is [y, x, h, w].
      `self.default_boxes_tlbr` has the same shape; the last dimension is [y1, x1, y2, x2].
      """

      # Per-feature-map configuration (one entry per detection layer).
      IMG_SIZE = 300
      STEPS = (8, 16, 32, 64, 100, 300)
      FEATURE_SIZES = (38, 19, 10, 5, 3, 1)
      ASPECT_RATIOS = ((2,), (2, 3), (2, 3), (2, 3), (2,), (2,))
      NUM_DEFAULT = (4, 6, 6, 6, 4, 4)
      MIN_SCALE = 0.1
      MAX_SCALE = 0.95

      def __init__(self):
          fk = self.IMG_SIZE / np.array(self.STEPS)
          num_layers = len(self.NUM_DEFAULT)
          scale_rate = (self.MAX_SCALE - self.MIN_SCALE) / (num_layers - 1)
          # One scale per layer plus a final 1.0 so the last layer has a "next" scale.
          scales = [self.MIN_SCALE + scale_rate * i for i in range(num_layers)] + [1.0]
          self.default_boxes = []
          for idex, feature_size in enumerate(self.FEATURE_SIZES):
              sk1 = scales[idex]
              sk2 = scales[idex + 1]
              sk3 = math.sqrt(sk1 * sk2)
              ratios = self.ASPECT_RATIOS[idex]
              if idex == 0 and not ratios:
                  # Only taken when the first layer has no aspect ratios configured.
                  w, h = sk1 * math.sqrt(2), sk1 / math.sqrt(2)
                  all_sizes = [(0.1, 0.1), (w, h), (h, w)]
              else:
                  all_sizes = [(sk1, sk1)]
                  for aspect_ratio in ratios:
                      w, h = sk1 * math.sqrt(aspect_ratio), sk1 / math.sqrt(aspect_ratio)
                      all_sizes.append((w, h))
                      all_sizes.append((h, w))
                  all_sizes.append((sk3, sk3))

              assert len(all_sizes) == self.NUM_DEFAULT[idex]

              for i, j in it.product(range(feature_size), repeat=2):
                  # The cell centre does not depend on the anchor size.
                  cx, cy = (j + 0.5) / fk[idex], (i + 0.5) / fk[idex]
                  for w, h in all_sizes:
                      self.default_boxes.append([cy, cx, h, w])

          def to_tlbr(cy, cx, h, w):
              return cy - h / 2, cx - w / 2, cy + h / 2, cx + w / 2

          # For IoU calculation
          self.default_boxes_tlbr = np.array(tuple(to_tlbr(*i) for i in self.default_boxes), dtype='float32')
          self.default_boxes = np.array(self.default_boxes, dtype='float32')
  # Build the default boxes once and expose both representations.
  _default_box_generator = GeneratDefaultBoxes()
  default_boxes_tlbr = _default_box_generator.default_boxes_tlbr
  default_boxes = _default_box_generator.default_boxes

  # Column vectors of the anchor corners and the anchor areas, used by
  # ssd_bboxes_encode when matching ground-truth boxes.
  y1, x1, y2, x2 = np.split(default_boxes_tlbr[:, :4], 4, axis=-1)
  vol_anchors = (x2 - x1) * (y2 - y1)
  # Minimum overlap for an anchor to be matched to a ground-truth box.
  matching_threshold = 0.5
  1. from mindspore.common.initializer import initializer, TruncatedNormal
  def init_net_param(network, initialize_mode='TruncatedNormal'):
      """Init the parameters in net.

      Every trainable parameter whose name contains none of 'beta', 'gamma'
      or 'bias' is re-initialised.

      Args:
          network: the cell whose trainable parameters are re-initialised.
          initialize_mode: 'TruncatedNormal' (sigma 0.02), or any other value,
              which is handed to `initializer` as the init specification.
      """
      params = network.trainable_params()
      for p in params:
          if 'beta' not in p.name and 'gamma' not in p.name and 'bias' not in p.name:
              if initialize_mode == 'TruncatedNormal':
                  p.set_data(initializer(TruncatedNormal(0.02), p.data.shape, p.data.dtype))
              else:
                  # set_data expects the new data itself, so build it with
                  # initializer() instead of passing shape and dtype to set_data.
                  p.set_data(initializer(initialize_mode, p.data.shape, p.data.dtype))
  def get_lr(global_step, lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch):
      """Generate learning rate array.

      Linear warm-up from `lr_init` towards `lr_max`, followed by cosine decay
      from `lr_max` to `lr_end`.

      Args:
          global_step: number of leading steps to drop from the schedule.
          lr_init: learning rate at step 0.
          lr_end: learning rate the cosine decay ends at.
          lr_max: learning rate reached at the end of warm-up.
          warmup_epochs: number of warm-up epochs.
          total_epochs: total number of epochs.
          steps_per_epoch: number of steps per epoch.

      Returns:
          float32 array with one learning rate per remaining step.
      """
      total_steps = steps_per_epoch * total_epochs
      warmup_steps = steps_per_epoch * warmup_epochs
      lr_each_step = []
      for i in range(total_steps):
          if i < warmup_steps:
              lr = lr_init + (lr_max - lr_init) * i / warmup_steps
          else:
              progress = (i - warmup_steps) / (total_steps - warmup_steps)
              lr = lr_end + (lr_max - lr_end) * (1. + math.cos(math.pi * progress)) / 2.
          # Never hand a negative rate to the optimizer.
          if lr < 0.0:
              lr = 0.0
          lr_each_step.append(lr)

      lr_each_step = np.array(lr_each_step).astype(np.float32)
      return lr_each_step[global_step:]
  1. import mindspore.dataset as ds
  # Turn off the dataset pipeline's shared-memory option before any dataset is built.
  ds.config.set_enable_shared_mem(False)
  1. import time
  2. from mindspore.amp import DynamicLossScaler
  set_seed(1)

  # load data
  mindrecord_dir = "./datasets/MindRecord_COCO"
  mindrecord_file = "./datasets/MindRecord_COCO/ssd.mindrecord0"

  dataset = create_ssd_dataset(mindrecord_file, batch_size=5, rank=0, use_multiprocessing=True)
  dataset_size = dataset.get_dataset_size()

  # Pull one batch up front; the values are overwritten by the training loop.
  image, get_loc, gt_label, num_matched_boxes = next(dataset.create_tuple_iterator())

  # Network definition and initialization
  network = SSD300Vgg16()
  init_net_param(network)

  # Define the learning rate
  lr = Tensor(get_lr(global_step=0 * dataset_size,
                     lr_init=0.001, lr_end=0.001 * 0.05, lr_max=0.05,
                     warmup_epochs=2, total_epochs=60, steps_per_epoch=dataset_size))

  # Define the optimizer
  # NOTE(review): the trailing positional arguments are presumably momentum,
  # weight decay and loss scale. The loss in forward_fn is not scaled and
  # `loss_scaler` below is never applied - confirm the 1024 factor is intended.
  opt = nn.Momentum(filter(lambda x: x.requires_grad, network.get_parameters()), lr,
                    0.9, 0.00015, float(1024))


  # Define the forward procedure
  def forward_fn(x, gt_loc, gt_label, num_matched_boxes):
      """Return the summed localization + classification loss for one batch."""
      pred_loc, pred_label = network(x)
      # Anchors with a label above 0 are the matched (positive) ones.
      mask = ops.less(0, gt_label).astype(ms.float32)
      num_matched_boxes = ops.sum(num_matched_boxes.astype(ms.float32))

      # Positioning loss
      mask_loc = ops.tile(ops.expand_dims(mask, -1), (1, 1, 4))
      smooth_l1 = nn.SmoothL1Loss()(pred_loc, gt_loc) * mask_loc
      loss_loc = ops.sum(ops.sum(smooth_l1, -1), -1)

      # Category loss
      loss_cls = class_loss(pred_label, gt_label)
      loss_cls = ops.sum(loss_cls, (1, 2))

      return ops.sum((loss_cls + loss_loc) / num_matched_boxes)


  grad_fn = ms.value_and_grad(forward_fn, None, opt.parameters, has_aux=False)
  loss_scaler = DynamicLossScaler(1024, 2, 1000)


  # Gradient updates
  def train_step(x, gt_loc, gt_label, num_matched_boxes):
      """Run one forward/backward pass, apply the gradients, return the loss."""
      loss, grads = grad_fn(x, gt_loc, gt_label, num_matched_boxes)
      opt(grads)
      return loss


  print("=================== Starting Training =====================")
  for epoch in range(60):
      network.set_train(True)
      begin_time = time.time()
      for step, (image, get_loc, gt_label, num_matched_boxes) in enumerate(dataset.create_tuple_iterator()):
          loss = train_step(image, get_loc, gt_label, num_matched_boxes)
      end_time = time.time()
      times = end_time - begin_time
      # Report the loss of the last step of the epoch.
      print(f"Epoch:[{int(epoch + 1)}/{int(60)}], "
            f"loss:{loss} , "
            f"time:{times}s ")
  ms.save_checkpoint(network, "ssd-60_9.ckpt")
  print("=================== Training Success =====================")


=================== Training Success =====================





        Average Precision(AP)

        Average Recall(AR)




TP:IoU大于阈值的检测框数量(同一Ground Truth只计算一次)。




精确率(Average Precision,AP):


TP 正样本预测正确的结果

FP 被预测为正样本但预测错误的结果(误检)

【需确认】召回率(Average Recall,AR):


TP 正样本预测正确的结果

FN 正样本被错误预测为负样本的结果(漏检)



(1)类别AP的平均值mAP(mean Average Precision)











  # MindRecord file holding the evaluation split.
  mindrecord_file = "./datasets/MindRecord_COCO/ssd_eval.mindrecord0"
  def ssd_eval(dataset_path, ckpt_path, anno_json):
      """SSD evaluation.

      Args:
          dataset_path: MindRecord file with the evaluation data.
          ckpt_path: checkpoint loaded into a fresh SSD300Vgg16.
          anno_json: COCO annotation file used for scoring.

      Prints the number of images and the resulting mAP; returns nothing.
      """
      batch_size = 1
      ds = create_ssd_dataset(dataset_path, batch_size=batch_size,
                              is_training=False, use_multiprocessing=False)

      network = SSD300Vgg16()
      print("Load Checkpoint!")
      net = SsdInferWithDecoder(network, Tensor(default_boxes), ckpt_path)

      net.set_train(False)
      total = ds.get_dataset_size() * batch_size
      print("\n========================================\n")
      print("total images num: ", total)
      eval_param_dict = {"net": net, "dataset": ds, "anno_json": anno_json}
      mAP = apply_eval(eval_param_dict)
      print("\n========================================\n")
      print(f"mAP: {mAP}")
  def eval_net():
      """Evaluate the checkpoint written by the training script."""
      print("Start Eval!")
      ssd_eval(mindrecord_file, "./ssd-60_9.ckpt", anno_json)


  eval_net()


