SSD is a one-stage object detection method, like YOLO: it performs localization and classification in a single forward pass, which makes it fast.

The main SSD pipeline:

1. Pick a suitable network and take appropriate feature layers from it (or all of them) as the backbone, then append extra convolutional layers to form the SSD network.
2. Take the outputs of 6 of these convolutional layers and apply two convolutions to each:
   - a localization head producing num_anchors x 4 channels (box offsets);
   - a classification head producing num_anchors x num_classes channels.
3. Decode the predictions into boxes (a minimal decoding sketch follows this list).
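The decoding step is not shown in the code below, so here is a minimal sketch of the usual SSD decoding, assuming the anchors are stored as normalized `[x1, y1, x2, y2]` corners (as generated in the anchor-box section later) and the standard SSD variances of 0.1/0.2; the function name `decode_boxes` is illustrative, not from the original code:

```python
import torch

def decode_boxes(mbox_loc, anchors, variances=(0.1, 0.2)):
    """mbox_loc: [num_anchors, 4] offsets predicted by the model.
    anchors:  [num_anchors, 4] normalized corner boxes [x1, y1, x2, y2].
    Returns decoded, normalized corner boxes."""
    anchor_w = anchors[:, 2] - anchors[:, 0]
    anchor_h = anchors[:, 3] - anchors[:, 1]
    anchor_cx = 0.5 * (anchors[:, 0] + anchors[:, 2])
    anchor_cy = 0.5 * (anchors[:, 1] + anchors[:, 3])
    # The offsets shift the anchor center and rescale its width/height
    cx = mbox_loc[:, 0] * anchor_w * variances[0] + anchor_cx
    cy = mbox_loc[:, 1] * anchor_h * variances[0] + anchor_cy
    w = anchor_w * torch.exp(mbox_loc[:, 2] * variances[1])
    h = anchor_h * torch.exp(mbox_loc[:, 3] * variances[1])
    boxes = torch.stack([cx - 0.5 * w, cy - 0.5 * h,
                         cx + 0.5 * w, cy + 0.5 * h], dim=-1)
    # Clip back into the normalized image
    return torch.clamp(boxes, 0.0, 1.0)
```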
For the full code, see CSDN@Bubbliiiing's implementation. Here I implement SSD with a ResNet50 backbone, so it can be compared against the earlier Faster RCNN; the rest of the code can follow his repository. Below is my ResNet50 backbone implementation; feel free to copy and adapt it.
```python
import torch
import torch.nn.functional as F
from torch import nn
from torch.nn import init
from torchvision.models import resnet50


class L2Norm(nn.Module):
    """Channel-wise L2 normalization with a learnable scale, as in the SSD paper."""
    def __init__(self, n_channels, scale):
        super(L2Norm, self).__init__()
        self.n_channels = n_channels
        self.gamma = scale or None
        self.eps = 1e-10
        self.weight = nn.Parameter(torch.Tensor(self.n_channels))
        self.reset_parameters()

    def reset_parameters(self):
        init.constant_(self.weight, self.gamma)

    def forward(self, x):
        norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps
        x = torch.div(x, norm)
        out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(x) * x
        return out


def add_extras(in_channels):
    layers = []
    # Block 6: 19,19,1024 -> 10,10,512
    layers += [nn.Conv2d(in_channels, 512, kernel_size=3, stride=2, padding=1)]
    # Block 7: 10,10,512 -> 5,5,256
    layers += [nn.Conv2d(512, 256, kernel_size=3, stride=2, padding=1)]
    # Block 8: 5,5,256 -> 3,3,128
    layers += [nn.Conv2d(256, 128, kernel_size=3, stride=1)]
    # Block 9: 3,3,128 -> 1,1,256
    layers += [nn.Conv2d(128, 256, kernel_size=3, stride=1)]
    return nn.ModuleList(layers)


def resnet_backbone(pretrained=True, progress=True, **kwargs):
    resnet50_model = resnet50(pretrained=pretrained)
    # Keep conv1/bn1/relu/maxpool + layer1/layer2/layer3 (the first 7 children)
    model = nn.Sequential(*list(resnet50_model.children()))[:7]
    five_outchannels = resnet50_model.layer2[-1].conv3.out_channels  # 512
    six_outchannels = resnet50_model.layer3[-1].conv3.out_channels   # 1024
    return model, five_outchannels, six_outchannels


class SSD_Resnet50(nn.Module):
    def __init__(self, num_classes, pretrained=False):
        super(SSD_Resnet50, self).__init__()
        self.num_classes = num_classes
        # Truncated backbone plus the channel counts of its last two stages:
        # child 6 of the official PyTorch ResNet50 (layer2) gives the first
        # multi-scale feature map, child 7 (layer3) the second.
        self.features, six_outchannels, seven_outchannels = resnet_backbone(pretrained)
        # 4 extra convolutions produce multi-scale feature maps 3 to 6
        self.extras = add_extras(seven_outchannels)
        self.L2Norm = L2Norm(1024, 20)
        mbox = [4, 6, 6, 6, 4, 4]

        loc_layers = []
        conf_layers = []
        # Localization and classification heads for the first two feature maps
        for index, i in enumerate([six_outchannels, seven_outchannels]):
            loc_layers.append(nn.Conv2d(i, mbox[index] * 4, kernel_size=(3, 3), padding=1))
            conf_layers.append(nn.Conv2d(i, mbox[index] * num_classes, kernel_size=(3, 3), padding=1))
        # Heads for feature maps 3 to 6
        for k, v in enumerate(self.extras):
            k += 2
            loc_layers += [nn.Conv2d(v.out_channels, mbox[k] * 4, kernel_size=(3, 3), padding=1)]
            conf_layers += [nn.Conv2d(v.out_channels, mbox[k] * num_classes, kernel_size=(3, 3), padding=1)]
        self.loc = nn.ModuleList(loc_layers)
        self.conf = nn.ModuleList(conf_layers)

    def forward(self, x):
        sources = list()
        loc = list()
        conf = list()
        x = self.features[0](x)
        x = self.features[1](x)
        x = self.features[2](x)
        x = self.features[3](x)
        x = self.features[4](x)
        # First multi-scale feature map, 38x38
        x = self.features[5](x)
        sources.append(x)
        # Second multi-scale feature map, 19x19
        x = self.features[6](x)
        # L2-normalize the features
        x = self.L2Norm(x)
        sources.append(x)
        # Remaining feature maps 3 to 6: 10x10, 5x5, 3x3, 1x1
        for k, v in enumerate(self.extras):
            x = F.relu(v(x), inplace=True)
            sources.append(x)

        for (x, l, c) in zip(sources, self.loc, self.conf):
            # [batch, channels, h, w] -> [batch, h, w, channels]
            loc.append(l(x).permute(0, 2, 3, 1).contiguous())
            conf.append(c(x).permute(0, 2, 3, 1).contiguous())
        # Flatten and concatenate the predictions of all 6 feature maps
        loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1)
        conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1)
        # Reshape loc to [batch_size, num_anchors, 4]
        # and conf to [batch_size, num_anchors, num_classes]
        output = (
            loc.view(loc.size(0), -1, 4),
            conf.view(conf.size(0), -1, self.num_classes),
        )
        return output


if __name__ == '__main__':
    from torchsummary import summary
    model = SSD_Resnet50(10).cuda()
    print(summary(model, (3, 300, 300)))
```
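As a quick sanity check (a hypothetical snippet, assuming the code above is importable), the two outputs should cover 5776 + 2166 + 600 + 150 + 36 + 4 = 8732 anchors:

```python
import torch

model = SSD_Resnet50(num_classes=10)
loc, conf = model(torch.randn(1, 3, 300, 300))
print(loc.shape)   # expected: torch.Size([1, 8732, 4])
print(conf.shape)  # expected: torch.Size([1, 8732, 10])
```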
Anchor box
Like Faster RCNN and YOLOv3/v4/v5, SSD has its own anchor boxes. At the multi-scale feature maps, mbox = (4, 6, 6, 6, 4, 4) anchors are generated per location. For example, on the first feature map (38x38, from ResNet child 6) every pixel gets 4 anchors with aspect ratios [1, 1, 2, 1/2]; on the second (from child 7), 6 anchors with aspect ratios [1, 1, 2, 1/2, 3, 1/3]. These are the aspect_ratios.
So how are the anchor boxes generated?
```python
import numpy as np


class AnchorBox():
    def __init__(self, input_shape, min_size, max_size=None, aspect_ratios=None, flip=True):
        self.input_shape = input_shape
        # With the preset anchors_size = [30, 60, 111, 162, 213, 264, 315],
        # min_size is anchors_size[i] and max_size is anchors_size[i + 1]
        self.min_size = min_size
        self.max_size = max_size
        self.aspect_ratios = []
        for ar in aspect_ratios:
            self.aspect_ratios.append(ar)
            self.aspect_ratios.append(1.0 / ar)

    def call(self, layer_shape, mask=None):
        # --------------------------------- #
        #   Width and height of the feature map,
        #   e.g. 38x38
        # --------------------------------- #
        layer_height = layer_shape[0]
        layer_width = layer_shape[1]
        # --------------------------------- #
        #   Width and height of the input image,
        #   e.g. 300x300
        # --------------------------------- #
        img_height = self.input_shape[0]
        img_width = self.input_shape[1]
        box_widths = []
        box_heights = []
        # --------------------------------- #
        #   After the doubling above, self.aspect_ratios is
        #   [1, 1, 2, 1/2] or
        #   [1, 1, 2, 1/2, 3, 1/3]
        # --------------------------------- #
        for ar in self.aspect_ratios:
            # First a small square
            if ar == 1 and len(box_widths) == 0:
                box_widths.append(self.min_size)  # from the preset anchors_size
                box_heights.append(self.min_size)
            # Then a larger square
            elif ar == 1 and len(box_widths) > 0:
                box_widths.append(np.sqrt(self.min_size * self.max_size))
                box_heights.append(np.sqrt(self.min_size * self.max_size))
            # Then the rectangles
            elif ar != 1:
                box_widths.append(self.min_size * np.sqrt(ar))
                box_heights.append(self.min_size / np.sqrt(ar))
        # --------------------------------- #
        #   Half widths and half heights of all anchors
        # --------------------------------- #
        box_widths = 0.5 * np.array(box_widths)
        box_heights = 0.5 * np.array(box_heights)
        # --------------------------------- #
        #   Stride of this feature map
        # --------------------------------- #
        step_x = img_width / layer_width
        step_y = img_height / layer_height
        # --------------------------------- #
        #   Grid of anchor centers
        # --------------------------------- #
        linx = np.linspace(0.5 * step_x, img_width - 0.5 * step_x, layer_width)
        liny = np.linspace(0.5 * step_y, img_height - 0.5 * step_y, layer_height)
        centers_x, centers_y = np.meshgrid(linx, liny)
        centers_x = centers_x.reshape(-1, 1)
        centers_y = centers_y.reshape(-1, 1)
        # Each anchor needs two (centers_x, centers_y) pairs: the first for the
        # top-left corner, the second for the bottom-right corner
        num_anchors_ = len(self.aspect_ratios)
        anchor_boxes = np.concatenate((centers_x, centers_y), axis=1)
        anchor_boxes = np.tile(anchor_boxes, (1, 2 * num_anchors_))
        # Top-left and bottom-right corners of each anchor
        anchor_boxes[:, ::4] -= box_widths
        anchor_boxes[:, 1::4] -= box_heights
        anchor_boxes[:, 2::4] += box_widths
        anchor_boxes[:, 3::4] += box_heights
        # --------------------------------- #
        #   Normalize the anchors to [0, 1]
        # --------------------------------- #
        anchor_boxes[:, ::2] /= img_width
        anchor_boxes[:, 1::2] /= img_height
        anchor_boxes = anchor_boxes.reshape(-1, 4)
        anchor_boxes = np.minimum(np.maximum(anchor_boxes, 0.0), 1.0)
        return anchor_boxes


# Loop over the 6 feature maps; each uses its preset aspect_ratios,
# giving [4, 6, 6, 6, 4, 4] anchors per location
anchors = []
for i in range(len(feature_heights)):
    anchor_boxes = AnchorBox(input_shape, anchors_size[i], max_size=anchors_size[i + 1],
                             aspect_ratios=aspect_ratios[i]).call([feature_heights[i], feature_widths[i]])
    # This yields 38*38*4=5776, 19*19*6=2166, 600, 150, 36, 4 boxes, 8732 in total
    anchors.append(anchor_boxes)
```
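To tie it together, a usage sketch with presets that match the comments above: the feature-map sizes for a 300x300 input, the stated anchors_size, and aspect-ratio inputs [1, 2] and [1, 2, 3], which the doubling in `AnchorBox.__init__` expands to [1, 1, 2, 1/2] and [1, 1, 2, 1/2, 3, 1/3]. The concrete preset lists are my assumption, inferred from the comments:

```python
input_shape = [300, 300]
anchors_size = [30, 60, 111, 162, 213, 264, 315]
aspect_ratios = [[1, 2], [1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2], [1, 2]]
feature_heights = [38, 19, 10, 5, 3, 1]
feature_widths = [38, 19, 10, 5, 3, 1]

anchors = []
for i in range(len(feature_heights)):
    anchors.append(AnchorBox(input_shape, anchors_size[i], max_size=anchors_size[i + 1],
                             aspect_ratios=aspect_ratios[i]).call([feature_heights[i], feature_widths[i]]))
anchors = np.concatenate(anchors, axis=0)
print(anchors.shape)  # expected: (8732, 4)
```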
The "sixth layer" and "seventh layer" here refer to the children of the official PyTorch ResNet50 (see layer 6 in https://blog.csdn.net/kui9702/article/details/123807917, as shown in the figure there).
Next, let's look at how SSD computes its loss. As the model's forward above shows, each forward pass returns the box-regression predictions and the per-anchor classification predictions.
```python
# Concatenate the regression and (softmaxed) classification predictions from forward
y_pred = torch.cat([y_pred[0], nn.Softmax(-1)(y_pred[1])], dim=-1)
# Classification loss between labels and predictions
conf_loss = self._softmax_loss(y_true[:, :, 4:-1], y_pred[:, :, 4:])
# Regression loss between labels and predictions
loc_loss = self._l1_smooth_loss(y_true[:, :, :4], y_pred[:, :, :4])

# Sum the regression and classification losses over the positive anchors
# (y_true[:, :, -1] is 1 for anchors matched to an object, 0 otherwise)
pos_loc_loss = torch.sum(loc_loss * y_true[:, :, -1], axis=1)
pos_conf_loss = torch.sum(conf_loss * y_true[:, :, -1], axis=1)

# Number of positive samples per image
num_pos = torch.sum(y_true[:, :, -1], axis=-1)
# Total number of anchors per image
num_boxes = y_true.size()[1]
# Number of negatives to keep (hard negative mining ratio)
num_neg = torch.min(self.neg_pos_ratio * num_pos, num_boxes - num_pos)
# Record which images actually have negatives to mine
pos_num_neg_mask = num_neg > 0
# If no image has any, fall back to a fixed number of hard negatives (100 by default)
has_min = torch.sum(pos_num_neg_mask)
num_neg_batch = torch.sum(num_neg) if has_min > 0 else self.negatives_for_hard

# Anchors that belong to neither an object nor the background are
# the hard-to-classify samples
confs_start = 4 + self.background_label_id + 1
confs_end = confs_start + self.num_classes - 1
# Sum the non-background class probabilities; the larger the sum,
# the harder the anchor is to classify
max_confs = torch.sum(y_pred[:, :, confs_start:confs_end], dim=2)
# (To be honest I don't fully understand this step; it may need the paper
# or other references)
max_confs = (max_confs * (1 - y_true[:, :, -1])).view([-1])
_, indices = torch.topk(max_confs, k=int(num_neg_batch.cpu().numpy().tolist()))
neg_conf_loss = torch.gather(conf_loss.view([-1]), 0, indices)

# Final loss, normalized by the number of positives (at least 1)
num_pos = torch.where(num_pos != 0, num_pos, torch.ones_like(num_pos))
total_loss = torch.sum(pos_conf_loss) + torch.sum(neg_conf_loss) + torch.sum(self.alpha * pos_loc_loss)
total_loss = total_loss / torch.sum(num_pos)
```
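The helpers `_l1_smooth_loss` and `_softmax_loss` are referenced above but not shown. Here is a plausible sketch, consistent with how they are called (one loss value per anchor, reduced over the last dimension) but not necessarily the exact implementation; they would live on the same loss class as methods:

```python
import torch

def _l1_smooth_loss(self, y_true, y_pred):
    # Smooth L1 per coordinate, summed over the 4 box coordinates
    # -> one value per anchor
    abs_loss = torch.abs(y_true - y_pred)
    sq_loss = 0.5 * (y_true - y_pred) ** 2
    l1_loss = torch.where(abs_loss < 1.0, sq_loss, abs_loss - 0.5)
    return torch.sum(l1_loss, -1)

def _softmax_loss(self, y_true, y_pred):
    # Cross-entropy against the one-hot labels -> one value per anchor
    y_pred = torch.clamp(y_pred, min=1e-7)
    return -torch.sum(y_true * torch.log(y_pred), axis=-1)
```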
Training results, compared with YOLOv3 and Faster RCNN (left: SSD, right: YOLOv3):
I used the same training set as for YOLOv3. Overall the results feel better than YOLOv3's, but SSD struggles with small objects. I tried both VGG and ResNet50 as the SSD backbone; the improvement with ResNet50 was fairly clear, yet on small-object detection both ResNet and VGG performed poorly.