赞
踩
目录
官方论文地址:https://arxiv.org/pdf/2106.08322.pdf
官方代码地址:https://github.com/microsoft/DynamicHead
在计算机视觉应用中,目标检测的任务是回答“什么物体位于哪里”的问题。如何提高目标检测头的性能已成为现有目标检测工作中的一个关键问题。
在本文中,提出了一种新的目标检测头,它将尺度感知、空间感知和任务感知的注意统一在一个框架中。提出了一种关注目标检测头的新视角。作为一个插件块,动态头可以灵活地集成到任何现有的目标检测器框架中,并在目标检测头中有效地应用了注意力机制,以提高其性能和效率。
下图说明了动态头部方法。它包含三种不同的注意机制,每一种机制都侧重于不同的视角:尺度感知注意力、空间感知注意力和任务感知注意力。同时,还可视化了在每个注意模块之后特征图是如何改进的。
下图是动态头的详细设计。
(a) 显示了每个注意力模块的详细实施情况。
(b) 展示了如何将动态头部块应用于one-stage目标检测。
(c) 展示了如何将动态头部块应用于two-stage目标检测。
ultralytics的版本为8.1.47,如下图所示:
① 添加DynamicHead.py文件
在ultralytics/nn/modules目录下新建DynamicHead.py文件,内容如下所示:
- import torch
- import math
- import torch.nn as nn
- import torch.nn.functional as F
- from mmcv.ops import ModulatedDeformConv2d
- from ultralytics.utils.tal import dist2bbox, make_anchors
-
- __all__ = ['Detect_DynamicHead']
-
- def _make_divisible(v, divisor, min_value=None):
- if min_value is None:
- min_value = divisor
- new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
- # Make sure that round down does not go down by more than 10%.
- if new_v < 0.9 * v:
- new_v += divisor
- return new_v
-
-
class h_swish(nn.Module):
    """Hard-swish activation: ``x * relu6(x + 3) / 6``."""

    def __init__(self, inplace=False):
        """Store whether relu6 may operate in place."""
        super(h_swish, self).__init__()
        self.inplace = inplace

    def forward(self, x):
        """Apply hard-swish element-wise."""
        gate = F.relu6(x + 3.0, inplace=self.inplace)
        return x * gate / 6.0
-
-
class h_sigmoid(nn.Module):
    """Hard-sigmoid activation: ``relu6(x + 3) * h_max / 6``."""

    def __init__(self, inplace=True, h_max=1):
        """Build the relu6 gate; ``h_max`` rescales the saturated output."""
        super(h_sigmoid, self).__init__()
        self.relu = nn.ReLU6(inplace=inplace)
        self.h_max = h_max

    def forward(self, x):
        """Apply hard-sigmoid element-wise."""
        gated = self.relu(x + 3)
        return gated * self.h_max / 6
-
-
class DYReLU(nn.Module):
    """Dynamic ReLU: a piecewise-linear activation whose slopes/intercepts are
    predicted per sample from a globally pooled channel descriptor.

    Used here as the task-aware attention of the dynamic head.

    Args:
        inp: channels of the input used to predict coefficients.
        oup: channels the coefficients are generated for.
        reduction: squeeze ratio of the hidden FC layer.
        lambda_a: slope scale (stored doubled, matching the reference impl).
        K2: if True, predict two (slope, intercept) pairs and take their max.
        use_bias: if True, also predict intercepts.
        use_spatial: if True, add a spatial attention branch.
        init_a: initial offsets for the two slopes.
        init_b: initial offsets for the two intercepts.
    """

    def __init__(self, inp, oup, reduction=4, lambda_a=1.0, K2=True, use_bias=True, use_spatial=False,
                 init_a=(1.0, 0.0), init_b=(0.0, 0.0)):
        # NOTE: defaults were mutable lists in the original; tuples are safer
        # (never shared/mutated across instances) and read identically.
        super(DYReLU, self).__init__()
        self.oup = oup
        self.lambda_a = lambda_a * 2
        self.K2 = K2
        self.avg_pool = nn.AdaptiveAvgPool2d(1)

        self.use_bias = use_bias
        # exp = number of coefficient groups emitted per output channel:
        # 4 -> (a1, b1, a2, b2), 2 -> (a1, b1) or (a1, a2), 1 -> (a1,)
        if K2:
            self.exp = 4 if use_bias else 2
        else:
            self.exp = 2 if use_bias else 1
        self.init_a = init_a
        self.init_b = init_b

        # determine squeeze width of the bottleneck FC
        if reduction == 4:
            squeeze = inp // reduction
        else:
            squeeze = _make_divisible(inp // reduction, 4)

        self.fc = nn.Sequential(
            nn.Linear(inp, squeeze),
            nn.ReLU(inplace=True),
            nn.Linear(squeeze, oup * self.exp),
            h_sigmoid()
        )
        if use_spatial:
            self.spa = nn.Sequential(
                nn.Conv2d(inp, 1, kernel_size=1),
                nn.BatchNorm2d(1),
            )
        else:
            self.spa = None

    def forward(self, x):
        """Apply the dynamically parameterized activation.

        Accepts either a tensor, or a list ``[x_in, x_out]`` where ``x_in``
        drives coefficient prediction and ``x_out`` is what gets activated.
        """
        if isinstance(x, list):
            x_in = x[0]
            x_out = x[1]
        else:
            x_in = x
            x_out = x
        b, c, h, w = x_in.size()
        # Squeeze: global average pool -> FC -> coefficients in [0, 1].
        y = self.avg_pool(x_in).view(b, c)
        y = self.fc(y).view(b, self.oup * self.exp, 1, 1)
        if self.exp == 4:
            a1, b1, a2, b2 = torch.split(y, self.oup, dim=1)
            # Re-center coefficients around their init values (0.5 -> init).
            a1 = (a1 - 0.5) * self.lambda_a + self.init_a[0]
            a2 = (a2 - 0.5) * self.lambda_a + self.init_a[1]

            b1 = b1 - 0.5 + self.init_b[0]
            b2 = b2 - 0.5 + self.init_b[1]
            out = torch.max(x_out * a1 + b1, x_out * a2 + b2)
        elif self.exp == 2:
            if self.use_bias:  # bias but not PL
                a1, b1 = torch.split(y, self.oup, dim=1)
                a1 = (a1 - 0.5) * self.lambda_a + self.init_a[0]
                b1 = b1 - 0.5 + self.init_b[0]
                out = x_out * a1 + b1

            else:
                a1, a2 = torch.split(y, self.oup, dim=1)
                a1 = (a1 - 0.5) * self.lambda_a + self.init_a[0]
                a2 = (a2 - 0.5) * self.lambda_a + self.init_a[1]
                out = torch.max(x_out * a1, x_out * a2)

        elif self.exp == 1:
            a1 = y
            a1 = (a1 - 0.5) * self.lambda_a + self.init_a[0]
            out = x_out * a1

        if self.spa:
            # Spatial attention: per-pixel softmax rescaled by h*w, clamped to [0, 1].
            ys = self.spa(x_in).view(b, -1)
            ys = F.softmax(ys, dim=1).view(b, 1, h, w) * h * w
            ys = F.hardtanh(ys, 0, 3, inplace=True) / 3
            out = out * ys

        return out
-
-
class Conv3x3Norm(torch.nn.Module):
    """3x3 modulated deformable convolution followed by GroupNorm(16 groups)."""

    def __init__(self, in_channels, out_channels, stride):
        """Build the deformable conv (padding 1) and its normalization layer."""
        super(Conv3x3Norm, self).__init__()
        self.conv = ModulatedDeformConv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1)
        self.bn = nn.GroupNorm(num_groups=16, num_channels=out_channels)

    def forward(self, input, **kwargs):
        """Apply deformable conv then GroupNorm; kwargs carry offset/mask."""
        convolved = self.conv(input.contiguous(), **kwargs)
        return self.bn(convolved)
-
-
class DyConv(nn.Module):
    """One dynamic-head block: fuses each feature-pyramid level with its
    neighbours using modulated deformable convs that share a single predicted
    offset/mask, then applies scale-aware attention (AttnConv + h_sigmoid)
    across levels and task-aware attention (DYReLU) on the fused result."""

    def __init__(self, in_channels=256, out_channels=256, conv_func=Conv3x3Norm):
        super(DyConv, self).__init__()

        # Three convs, all driven by the same offset/mask prediction:
        #   [0] stride 1 — applied to the NEXT (coarser) level, later upsampled
        #   [1] stride 1 — applied to the CURRENT level
        #   [2] stride 2 — applied to the PREVIOUS (finer) level to downsample it
        self.DyConv = nn.ModuleList()
        self.DyConv.append(conv_func(in_channels, out_channels, 1))
        self.DyConv.append(conv_func(in_channels, out_channels, 1))
        self.DyConv.append(conv_func(in_channels, out_channels, 2))

        # Scale-aware attention: global pool -> 1x1 conv -> ReLU gives one
        # scalar weight per contributing level.
        self.AttnConv = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(in_channels, 1, kernel_size=1),
            nn.ReLU(inplace=True))

        self.h_sigmoid = h_sigmoid()
        self.relu = DYReLU(in_channels, out_channels)
        # 27 channels = 18 offsets (2 coords * 3*3 kernel) + 9 modulation masks.
        self.offset = nn.Conv2d(in_channels, 27, kernel_size=3, stride=1, padding=1)
        self.init_weights()

    def init_weights(self):
        # Normal(0, 0.01) weights and zero biases for the plain conv layers
        # in both branches (the deformable convs keep their own init).
        for m in self.DyConv.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.normal_(m.weight.data, 0, 0.01)
                if m.bias is not None:
                    m.bias.data.zero_()
        for m in self.AttnConv.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.normal_(m.weight.data, 0, 0.01)
                if m.bias is not None:
                    m.bias.data.zero_()

    def forward(self, x):
        """Fuse each level with its neighbours.

        x: dict mapping level name -> feature tensor, assumed ordered from
        finest to coarsest resolution — TODO confirm against caller.
        Returns a dict with the same keys.
        """
        next_x = {}
        feature_names = list(x.keys())
        for level, name in enumerate(feature_names):

            feature = x[name]

            # One offset/mask prediction per level, shared by all three convs.
            offset_mask = self.offset(feature)
            offset = offset_mask[:, :18, :, :]
            mask = offset_mask[:, 18:, :, :].sigmoid()
            conv_args = dict(offset=offset, mask=mask)

            temp_fea = [self.DyConv[1](feature, **conv_args)]
            if level > 0:
                # Downsample the finer previous level with the stride-2 conv.
                temp_fea.append(self.DyConv[2](x[feature_names[level - 1]], **conv_args))
            if level < len(x) - 1:
                # Convolve the coarser next level, then upsample to this size.
                input = x[feature_names[level + 1]]
                temp_fea.append(F.interpolate(self.DyConv[0](input, **conv_args),
                                              size=[feature.size(2), feature.size(3)]))
            attn_fea = []
            res_fea = []
            for fea in temp_fea:
                res_fea.append(fea)
                attn_fea.append(self.AttnConv(fea))

            # Scale-aware attention: weight each contributing level, average,
            # then apply task-aware DYReLU.
            res_fea = torch.stack(res_fea)
            spa_pyr_attn = self.h_sigmoid(torch.stack(attn_fea))
            mean_fea = torch.mean(res_fea * spa_pyr_attn, dim=0, keepdim=False)
            next_x[name] = self.relu(mean_fea)

        return next_x
-
-
def autopad(k, p=None, d=1):  # kernel, padding, dilation
    """Return padding that produces 'same'-shaped convolution outputs."""
    def dilate(size):
        # effective kernel extent once dilation is applied
        return d * (size - 1) + 1

    if d > 1:
        k = dilate(k) if isinstance(k, int) else [dilate(x) for x in k]
    if p is not None:
        return p
    return k // 2 if isinstance(k, int) else [x // 2 for x in k]
-
-
class Conv(nn.Module):
    """Standard convolution block: Conv2d -> BatchNorm2d -> activation.

    Args mirror (ch_in, ch_out, kernel, stride, padding, groups, dilation,
    activation); padding defaults to 'same' via autopad.
    """

    default_act = nn.SiLU()  # default activation

    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True):
        """Create the conv/bn pair and resolve the activation module."""
        super().__init__()
        self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p, d), groups=g, dilation=d, bias=False)
        self.bn = nn.BatchNorm2d(c2)
        # True -> default SiLU; an nn.Module -> used as-is; anything else -> no-op.
        if act is True:
            self.act = self.default_act
        elif isinstance(act, nn.Module):
            self.act = act
        else:
            self.act = nn.Identity()

    def forward(self, x):
        """Run conv, batch norm, then activation."""
        return self.act(self.bn(self.conv(x)))

    def forward_fuse(self, x):
        """Run conv then activation, skipping BN (used after conv/BN fusion)."""
        return self.act(self.conv(x))
-
class DFL(nn.Module):
    """Integral module of Distribution Focal Loss (DFL).

    Proposed in Generalized Focal Loss https://ieeexplore.ieee.org/document/9792391
    """

    def __init__(self, c1=16):
        """Build a frozen 1x1 conv whose weights are the bin indices [0..c1-1]."""
        super().__init__()
        self.c1 = c1
        bins = torch.arange(c1, dtype=torch.float).view(1, c1, 1, 1)
        self.conv = nn.Conv2d(c1, 1, 1, bias=False).requires_grad_(False)
        self.conv.weight.data[:] = nn.Parameter(bins)

    def forward(self, x):
        """Decode (b, 4*c1, a) distance logits to (b, 4, a) expected distances."""
        b, c, a = x.shape  # batch, channels, anchors
        probs = x.view(b, 4, self.c1, a).transpose(2, 1).softmax(1)
        # Expected value over the c1 bins, per box side, per anchor.
        return self.conv(probs).view(b, 4, a)
-
class Detect_DynamicHead(nn.Module):
    """YOLOv8 Detect head for detection models, with a DyConv (dynamic head)
    tower inserted in front of the standard box/cls branches."""
    dynamic = False  # force grid reconstruction
    export = False  # export mode
    shape = None
    anchors = torch.empty(0)  # init
    strides = torch.empty(0)  # init

    def __init__(self, nc=80, ch=()):
        """Initializes the YOLOv8 detection layer with specified number of classes and channels.

        nc: number of classes; ch: per-level input channel counts.
        """
        super().__init__()
        self.nc = nc  # number of classes
        self.nl = len(ch)  # number of detection layers
        self.reg_max = 16  # DFL channels (ch[0] // 16 to scale 4/8/12/16/20 for n/s/m/l/x)
        self.no = nc + self.reg_max * 4  # number of outputs per anchor
        self.stride = torch.zeros(self.nl)  # strides computed during build
        c2, c3 = max((16, ch[0] // 4, self.reg_max * 4)), max(ch[0], min(self.nc, 100))  # channels
        # cv2: box-regression branch (4*reg_max outputs); cv3: classification branch.
        self.cv2 = nn.ModuleList(
            nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * self.reg_max, 1)) for x in ch)
        self.cv3 = nn.ModuleList(nn.Sequential(Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, self.nc, 1)) for x in ch)
        self.dfl = DFL(self.reg_max) if self.reg_max > 1 else nn.Identity()
        # One DyConv block per detection level, chained sequentially; each
        # block consumes and produces a dict of per-level features.
        # NOTE(review): this assumes all entries of ch are equal, since each
        # DyConv mixes neighbouring levels with a single channel count —
        # TODO confirm against the model yaml.
        dyhead_tower = []
        for i in range(self.nl):
            channel = ch[i]
            dyhead_tower.append(
                DyConv(
                    channel,
                    channel,
                    conv_func=Conv3x3Norm,
                )
            )
        self.add_module('dyhead_tower', nn.Sequential(*dyhead_tower))

    def forward(self, x):
        # Run the dynamic-head tower on a dict {level_index: feature}, then
        # restore the list-of-tensors layout the YOLOv8 head expects.
        tensor_dict = {i: tensor for i, tensor in enumerate(x)}
        x = self.dyhead_tower(tensor_dict)
        x = list(x.values())
        """Concatenates and returns predicted bounding boxes and class probabilities."""
        shape = x[0].shape  # BCHW
        for i in range(self.nl):
            x[i] = torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1)
        if self.training:
            return x
        elif self.dynamic or self.shape != shape:
            # Rebuild anchors/strides only when the input shape changes.
            self.anchors, self.strides = (x.transpose(0, 1) for x in make_anchors(x, self.stride, 0.5))
            self.shape = shape

        x_cat = torch.cat([xi.view(shape[0], self.no, -1) for xi in x], 2)
        # NOTE(review): self.format is not set in this class — presumably
        # assigned by the ultralytics exporter before export; verify.
        if self.export and self.format in ('saved_model', 'pb', 'tflite', 'edgetpu', 'tfjs'):  # avoid TF FlexSplitV ops
            box = x_cat[:, :self.reg_max * 4]
            cls = x_cat[:, self.reg_max * 4:]
        else:
            box, cls = x_cat.split((self.reg_max * 4, self.nc), 1)
        dbox = dist2bbox(self.dfl(box), self.anchors.unsqueeze(0), xywh=True, dim=1) * self.strides

        if self.export and self.format in ('tflite', 'edgetpu'):
            # Normalize xywh with image size to mitigate quantization error of TFLite integer models as done in YOLOv5:
            # https://github.com/ultralytics/yolov5/blob/0c8de3fca4a702f8ff5c435e67f378d1fce70243/models/tf.py#L307-L309
            # See this PR for details: https://github.com/ultralytics/ultralytics/pull/1695
            img_h = shape[2] * self.stride[0]
            img_w = shape[3] * self.stride[0]
            img_size = torch.tensor([img_w, img_h, img_w, img_h], device=dbox.device).reshape(1, 4, 1)
            dbox /= img_size

        y = torch.cat((dbox, cls.sigmoid()), 1)
        return y if self.export else (y, x)

    def bias_init(self):
        """Initialize Detect() biases, WARNING: requires stride availability."""
        m = self  # self.model[-1]  # Detect() module
        # cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1
        # ncf = math.log(0.6 / (m.nc - 0.999999)) if cf is None else torch.log(cf / cf.sum())  # nominal class frequency
        for a, b, s in zip(m.cv2, m.cv3, m.stride):  # from
            a[-1].bias.data[:] = 1.0  # box
            b[-1].bias.data[:m.nc] = math.log(5 / m.nc / (640 / s) ** 2)  # cls (.01 objects, 80 classes, 640 img)
② 修改ultralytics/nn/tasks.py文件
具体的修改内容如下图所示:
可以Ctrl+f然后搜索 “detect”方便后续锁定需要修改的位置。
yolov8_DynamicHead.yaml的内容如下所示:
- # Ultralytics YOLO 🚀, AGPL-3.0 license
相关标签
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。