
【YOLOv8 Improvements [Detection Head]】YOLOv8's "New Head": the Dynamic Head (DynamicHead)

Contents

I DynamicHead

II YOLOv8's "New Head": the Dynamic Head

1 Overall Modifications

2 Configuration File

3 Training

Others


I DynamicHead

Official paper: https://arxiv.org/pdf/2106.08322.pdf

Official code: https://github.com/microsoft/DynamicHead

In computer-vision applications, object detection is the task of answering the question "what objects are located where." How to improve the performance of the detection head has become a key problem in existing object-detection work.

The paper proposes a novel object-detection head that unifies scale-aware, spatial-aware, and task-aware attention in a single framework, offering a new perspective centered on the detection head itself. As a plug-in block, the dynamic head can be flexibly integrated into any existing object-detector framework, applying attention mechanisms within the head to improve both its performance and its efficiency.

The figure below illustrates the dynamic-head approach. It contains three different attention mechanisms, each focusing on a different perspective: scale-aware attention, spatial-aware attention, and task-aware attention. The figure also visualizes how the feature maps improve after each attention module.
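Concretely, given a feature tensor F ∈ R^(L×S×C) (L pyramid levels, S spatial positions, C channels), the paper applies the three attentions sequentially:

W(F) = π_C(π_S(π_L(F) · F) · F) · F

where π_L attends over the level dimension (scale-aware), π_S over the spatial dimension (spatial-aware, realized with deformable convolution), and π_C over the channel dimension (task-aware, realized with a dynamic ReLU). All three mechanisms appear in the code below as the level-attention branch, the ModulatedDeformConv2d layers, and the DYReLU module, respectively.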

The figure below shows the detailed design of the dynamic head.

(a) shows the detailed implementation of each attention module.

(b) shows how the dynamic-head block is applied to a one-stage detector.

(c) shows how the dynamic-head block is applied to a two-stage detector.

II YOLOv8's "New Head": the Dynamic Head

The ultralytics version used here is 8.1.47, as shown in the figure below:

1 Overall Modifications

① Add a DynamicHead.py file

Create a new DynamicHead.py file in the ultralytics/nn/modules directory with the following content:

```python
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.ops import ModulatedDeformConv2d  # requires mmcv; implements the deformable conv used below
from ultralytics.utils.tal import dist2bbox, make_anchors

__all__ = ['Detect_DynamicHead']


def _make_divisible(v, divisor, min_value=None):
    if min_value is None:
        min_value = divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    # Make sure that round down does not go down by more than 10%.
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v


class h_swish(nn.Module):
    """Hard-swish activation."""

    def __init__(self, inplace=False):
        super(h_swish, self).__init__()
        self.inplace = inplace

    def forward(self, x):
        return x * F.relu6(x + 3.0, inplace=self.inplace) / 6.0


class h_sigmoid(nn.Module):
    """Hard-sigmoid activation scaled to [0, h_max]."""

    def __init__(self, inplace=True, h_max=1):
        super(h_sigmoid, self).__init__()
        self.relu = nn.ReLU6(inplace=inplace)
        self.h_max = h_max

    def forward(self, x):
        return self.relu(x + 3) * self.h_max / 6


class DYReLU(nn.Module):
    """Dynamic ReLU: an input-conditioned piecewise-linear activation (the task-aware attention)."""

    def __init__(self, inp, oup, reduction=4, lambda_a=1.0, K2=True, use_bias=True, use_spatial=False,
                 init_a=[1.0, 0.0], init_b=[0.0, 0.0]):
        super(DYReLU, self).__init__()
        self.oup = oup
        self.lambda_a = lambda_a * 2
        self.K2 = K2
        self.avg_pool = nn.AdaptiveAvgPool2d(1)

        self.use_bias = use_bias
        if K2:
            self.exp = 4 if use_bias else 2
        else:
            self.exp = 2 if use_bias else 1
        self.init_a = init_a
        self.init_b = init_b

        # determine squeeze
        if reduction == 4:
            squeeze = inp // reduction
        else:
            squeeze = _make_divisible(inp // reduction, 4)
        # print('reduction: {}, squeeze: {}/{}'.format(reduction, inp, squeeze))
        # print('init_a: {}, init_b: {}'.format(self.init_a, self.init_b))

        self.fc = nn.Sequential(
            nn.Linear(inp, squeeze),
            nn.ReLU(inplace=True),
            nn.Linear(squeeze, oup * self.exp),
            h_sigmoid()
        )
        if use_spatial:
            self.spa = nn.Sequential(
                nn.Conv2d(inp, 1, kernel_size=1),
                nn.BatchNorm2d(1),
            )
        else:
            self.spa = None

    def forward(self, x):
        if isinstance(x, list):
            x_in = x[0]
            x_out = x[1]
        else:
            x_in = x
            x_out = x
        b, c, h, w = x_in.size()
        y = self.avg_pool(x_in).view(b, c)
        y = self.fc(y).view(b, self.oup * self.exp, 1, 1)
        if self.exp == 4:
            a1, b1, a2, b2 = torch.split(y, self.oup, dim=1)
            a1 = (a1 - 0.5) * self.lambda_a + self.init_a[0]  # 1.0
            a2 = (a2 - 0.5) * self.lambda_a + self.init_a[1]
            b1 = b1 - 0.5 + self.init_b[0]
            b2 = b2 - 0.5 + self.init_b[1]
            out = torch.max(x_out * a1 + b1, x_out * a2 + b2)
        elif self.exp == 2:
            if self.use_bias:  # bias but not PL
                a1, b1 = torch.split(y, self.oup, dim=1)
                a1 = (a1 - 0.5) * self.lambda_a + self.init_a[0]  # 1.0
                b1 = b1 - 0.5 + self.init_b[0]
                out = x_out * a1 + b1
            else:
                a1, a2 = torch.split(y, self.oup, dim=1)
                a1 = (a1 - 0.5) * self.lambda_a + self.init_a[0]  # 1.0
                a2 = (a2 - 0.5) * self.lambda_a + self.init_a[1]
                out = torch.max(x_out * a1, x_out * a2)
        elif self.exp == 1:
            a1 = y
            a1 = (a1 - 0.5) * self.lambda_a + self.init_a[0]  # 1.0
            out = x_out * a1

        if self.spa:
            ys = self.spa(x_in).view(b, -1)
            ys = F.softmax(ys, dim=1).view(b, 1, h, w) * h * w
            ys = F.hardtanh(ys, 0, 3, inplace=True) / 3
            out = out * ys

        return out


class Conv3x3Norm(torch.nn.Module):
    """3x3 modulated deformable convolution followed by GroupNorm."""

    def __init__(self, in_channels, out_channels, stride):
        super(Conv3x3Norm, self).__init__()
        self.conv = ModulatedDeformConv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1)
        self.bn = nn.GroupNorm(num_groups=16, num_channels=out_channels)

    def forward(self, input, **kwargs):
        x = self.conv(input.contiguous(), **kwargs)
        x = self.bn(x)
        return x


class DyConv(nn.Module):
    """One dynamic-head block combining scale-, spatial- and task-aware attention across pyramid levels."""

    def __init__(self, in_channels=256, out_channels=256, conv_func=Conv3x3Norm):
        super(DyConv, self).__init__()

        self.DyConv = nn.ModuleList()
        self.DyConv.append(conv_func(in_channels, out_channels, 1))  # applied to the coarser level (then upsampled)
        self.DyConv.append(conv_func(in_channels, out_channels, 1))  # applied to the current level
        self.DyConv.append(conv_func(in_channels, out_channels, 2))  # stride 2, applied to the finer level

        self.AttnConv = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(in_channels, 1, kernel_size=1),
            nn.ReLU(inplace=True))

        self.h_sigmoid = h_sigmoid()
        self.relu = DYReLU(in_channels, out_channels)
        # 27 channels = 2 * 3 * 3 offsets + 3 * 3 modulation masks for the 3x3 deformable conv
        self.offset = nn.Conv2d(in_channels, 27, kernel_size=3, stride=1, padding=1)
        self.init_weights()

    def init_weights(self):
        for m in self.DyConv.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.normal_(m.weight.data, 0, 0.01)
                if m.bias is not None:
                    m.bias.data.zero_()
        for m in self.AttnConv.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.normal_(m.weight.data, 0, 0.01)
                if m.bias is not None:
                    m.bias.data.zero_()

    def forward(self, x):
        next_x = {}
        feature_names = list(x.keys())
        for level, name in enumerate(feature_names):
            feature = x[name]

            offset_mask = self.offset(feature)
            offset = offset_mask[:, :18, :, :]
            mask = offset_mask[:, 18:, :, :].sigmoid()
            conv_args = dict(offset=offset, mask=mask)

            temp_fea = [self.DyConv[1](feature, **conv_args)]

            if level > 0:
                temp_fea.append(self.DyConv[2](x[feature_names[level - 1]], **conv_args))
            if level < len(x) - 1:
                input = x[feature_names[level + 1]]
                temp_fea.append(F.interpolate(self.DyConv[0](input, **conv_args),
                                              size=[feature.size(2), feature.size(3)]))

            attn_fea = []
            res_fea = []
            for fea in temp_fea:
                res_fea.append(fea)
                attn_fea.append(self.AttnConv(fea))

            res_fea = torch.stack(res_fea)
            spa_pyr_attn = self.h_sigmoid(torch.stack(attn_fea))
            # scale-aware attention: weighted mean over the neighbouring pyramid levels
            mean_fea = torch.mean(res_fea * spa_pyr_attn, dim=0, keepdim=False)
            next_x[name] = self.relu(mean_fea)  # task-aware attention

        return next_x


def autopad(k, p=None, d=1):  # kernel, padding, dilation
    """Pad to 'same' shape outputs."""
    if d > 1:
        k = d * (k - 1) + 1 if isinstance(k, int) else [d * (x - 1) + 1 for x in k]  # actual kernel-size
    if p is None:
        p = k // 2 if isinstance(k, int) else [x // 2 for x in k]  # auto-pad
    return p


class Conv(nn.Module):
    """Standard convolution with args(ch_in, ch_out, kernel, stride, padding, groups, dilation, activation)."""

    default_act = nn.SiLU()  # default activation

    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True):
        """Initialize Conv layer with given arguments including activation."""
        super().__init__()
        self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p, d), groups=g, dilation=d, bias=False)
        self.bn = nn.BatchNorm2d(c2)
        self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity()

    def forward(self, x):
        """Apply convolution, batch normalization and activation to input tensor."""
        return self.act(self.bn(self.conv(x)))

    def forward_fuse(self, x):
        """Apply convolution and activation without batch normalization (fused inference)."""
        return self.act(self.conv(x))


class DFL(nn.Module):
    """
    Integral module of Distribution Focal Loss (DFL).

    Proposed in Generalized Focal Loss https://ieeexplore.ieee.org/document/9792391
    """

    def __init__(self, c1=16):
        """Initialize a convolutional layer with a given number of input channels."""
        super().__init__()
        self.conv = nn.Conv2d(c1, 1, 1, bias=False).requires_grad_(False)
        x = torch.arange(c1, dtype=torch.float)
        self.conv.weight.data[:] = nn.Parameter(x.view(1, c1, 1, 1))
        self.c1 = c1

    def forward(self, x):
        """Softmax over the c1 distance bins, then take their expectation via the fixed-weight conv."""
        b, c, a = x.shape  # batch, channels, anchors
        return self.conv(x.view(b, 4, self.c1, a).transpose(2, 1).softmax(1)).view(b, 4, a)
        # return self.conv(x.view(b, self.c1, 4, a).softmax(1)).view(b, 4, a)


class Detect_DynamicHead(nn.Module):
    """YOLOv8 Detect head for detection models, preceded by a tower of DyConv (dynamic head) blocks."""

    dynamic = False  # force grid reconstruction
    export = False  # export mode
    shape = None
    anchors = torch.empty(0)  # init
    strides = torch.empty(0)  # init

    def __init__(self, nc=80, ch=()):
        """Initializes the YOLOv8 detection layer with specified number of classes and channels."""
        super().__init__()
        self.nc = nc  # number of classes
        self.nl = len(ch)  # number of detection layers
        self.reg_max = 16  # DFL channels (ch[0] // 16 to scale 4/8/12/16/20 for n/s/m/l/x)
        self.no = nc + self.reg_max * 4  # number of outputs per anchor
        self.stride = torch.zeros(self.nl)  # strides computed during build
        c2, c3 = max((16, ch[0] // 4, self.reg_max * 4)), max(ch[0], min(self.nc, 100))  # channels
        self.cv2 = nn.ModuleList(
            nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * self.reg_max, 1)) for x in ch)
        self.cv3 = nn.ModuleList(nn.Sequential(Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, self.nc, 1)) for x in ch)
        self.dfl = DFL(self.reg_max) if self.reg_max > 1 else nn.Identity()

        # One DyConv block per detection layer. Note that each DyConv shares its
        # convolutions across all pyramid levels, so every entry of ch must be equal.
        dyhead_tower = []
        for i in range(self.nl):
            channel = ch[i]
            dyhead_tower.append(
                DyConv(
                    channel,
                    channel,
                    conv_func=Conv3x3Norm,
                )
            )
        self.add_module('dyhead_tower', nn.Sequential(*dyhead_tower))

    def forward(self, x):
        """Run the dyhead tower, then concatenate and return predicted boxes and class probabilities."""
        tensor_dict = {i: tensor for i, tensor in enumerate(x)}
        x = self.dyhead_tower(tensor_dict)
        x = list(x.values())
        shape = x[0].shape  # BCHW
        for i in range(self.nl):
            x[i] = torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1)
        if self.training:
            return x
        elif self.dynamic or self.shape != shape:
            self.anchors, self.strides = (x.transpose(0, 1) for x in make_anchors(x, self.stride, 0.5))
            self.shape = shape

        x_cat = torch.cat([xi.view(shape[0], self.no, -1) for xi in x], 2)
        if self.export and self.format in ('saved_model', 'pb', 'tflite', 'edgetpu', 'tfjs'):  # avoid TF FlexSplitV ops
            box = x_cat[:, :self.reg_max * 4]
            cls = x_cat[:, self.reg_max * 4:]
        else:
            box, cls = x_cat.split((self.reg_max * 4, self.nc), 1)
        dbox = dist2bbox(self.dfl(box), self.anchors.unsqueeze(0), xywh=True, dim=1) * self.strides

        if self.export and self.format in ('tflite', 'edgetpu'):
            # Normalize xywh with image size to mitigate quantization error of TFLite integer models as done in YOLOv5:
            # https://github.com/ultralytics/yolov5/blob/0c8de3fca4a702f8ff5c435e67f378d1fce70243/models/tf.py#L307-L309
            # See this PR for details: https://github.com/ultralytics/ultralytics/pull/1695
            img_h = shape[2] * self.stride[0]
            img_w = shape[3] * self.stride[0]
            img_size = torch.tensor([img_w, img_h, img_w, img_h], device=dbox.device).reshape(1, 4, 1)
            dbox /= img_size

        y = torch.cat((dbox, cls.sigmoid()), 1)
        return y if self.export else (y, x)

    def bias_init(self):
        """Initialize Detect() biases, WARNING: requires stride availability."""
        m = self  # self.model[-1]  # Detect() module
        # cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1
        # ncf = math.log(0.6 / (m.nc - 0.999999)) if cf is None else torch.log(cf / cf.sum())  # nominal class frequency
        for a, b, s in zip(m.cv2, m.cv3, m.stride):  # from
            a[-1].bias.data[:] = 1.0  # box
            b[-1].bias.data[:m.nc] = math.log(5 / m.nc / (640 / s) ** 2)  # cls (.01 objects, 80 classes, 640 img)
```
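Before wiring the head into the framework, a quick shape check in isolation can save a debugging round. A minimal sketch, assuming mmcv is installed with its CUDA deformable-conv ops and remembering that all three input levels must share one channel count (the (256, 256, 256) below is a hypothetical choice):

```python
import torch

from ultralytics.nn.modules.DynamicHead import Detect_DynamicHead

# Three pyramid levels with equal channel counts, e.g. after 1x1 projections
head = Detect_DynamicHead(nc=80, ch=(256, 256, 256)).cuda().train()

# P3/P4/P5 feature maps for a 640x640 input (strides 8/16/32)
feats = [torch.randn(1, 256, s, s, device='cuda') for s in (80, 40, 20)]

for out in head(feats):
    print(out.shape)  # e.g. torch.Size([1, 144, 80, 80]); 144 = nc + 4 * reg_max
```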

② Modify the ultralytics/nn/tasks.py file

The specific changes are shown in the figure below:

You can press Ctrl+F and search for "detect" to quickly locate the places that need to be modified. A sketch of the typical edits follows.
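As a minimal sketch of the usual pattern for registering a custom head in ultralytics 8.1.x (the exact surrounding code in tasks.py may differ from this outline):

```python
# In ultralytics/nn/tasks.py -- a sketch, not a verbatim diff.

# 1) Import the new head alongside the other module imports:
from ultralytics.nn.modules.DynamicHead import Detect_DynamicHead

# 2) In parse_model(), add Detect_DynamicHead wherever Detect is handled, e.g.:
#        elif m in (Detect, Segment, Pose, OBB, Detect_DynamicHead):
#            args.append([ch[x] for x in f])

# 3) In DetectionModel.__init__(), include the new head in the isinstance()
#    checks that drive stride computation and bias initialization, e.g.:
#        if isinstance(m, (Detect, Detect_DynamicHead)):
#            ...
#    Detect_DynamicHead subclasses nn.Module, not Detect, so every
#    isinstance(m, Detect) check it must pass needs it added explicitly.
```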

2 Configuration File

The content of yolov8_DynamicHead.yaml is shown below.

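A minimal sketch of such a config, under two assumptions: it keeps the stock yolov8.yaml backbone and neck, and, because DyConv inside the head shares its convolutions across all pyramid levels, the three head inputs are first projected to a common channel width with hypothetical 1x1 Conv layers (indices 22-24); the actual file may differ:

```yaml
# Ultralytics YOLO 🚀, AGPL-3.0 license
# YOLOv8 object detection model with P3-P5 outputs and a Detect_DynamicHead detect layer

# Parameters
nc: 80 # number of classes
scales: # model compound scaling constants, i.e. 'model=yolov8n.yaml' will call this file with scale 'n'
  n: [0.33, 0.25, 1024]
  s: [0.33, 0.50, 1024]
  m: [0.67, 0.75, 768]
  l: [1.00, 1.00, 512]
  x: [1.00, 1.25, 512]

# YOLOv8.0n backbone
backbone:
  # [from, repeats, module, args]
  - [-1, 1, Conv, [64, 3, 2]] # 0-P1/2
  - [-1, 1, Conv, [128, 3, 2]] # 1-P2/4
  - [-1, 3, C2f, [128, True]]
  - [-1, 1, Conv, [256, 3, 2]] # 3-P3/8
  - [-1, 6, C2f, [256, True]]
  - [-1, 1, Conv, [512, 3, 2]] # 5-P4/16
  - [-1, 6, C2f, [512, True]]
  - [-1, 1, Conv, [1024, 3, 2]] # 7-P5/32
  - [-1, 3, C2f, [1024, True]]
  - [-1, 1, SPPF, [1024, 5]] # 9

# YOLOv8.0n head
head:
  - [-1, 1, nn.Upsample, [None, 2, "nearest"]]
  - [[-1, 6], 1, Concat, [1]] # cat backbone P4
  - [-1, 3, C2f, [512]] # 12

  - [-1, 1, nn.Upsample, [None, 2, "nearest"]]
  - [[-1, 4], 1, Concat, [1]] # cat backbone P3
  - [-1, 3, C2f, [256]] # 15 (P3/8-small)

  - [-1, 1, Conv, [256, 3, 2]]
  - [[-1, 12], 1, Concat, [1]] # cat head P4
  - [-1, 3, C2f, [512]] # 18 (P4/16-medium)

  - [-1, 1, Conv, [512, 3, 2]]
  - [[-1, 9], 1, Concat, [1]] # cat head P5
  - [-1, 3, C2f, [1024]] # 21 (P5/32-large)

  # Project all three levels to a common channel count, since DyConv inside
  # Detect_DynamicHead shares its convolutions across pyramid levels.
  - [15, 1, Conv, [256, 1, 1]] # 22
  - [18, 1, Conv, [256, 1, 1]] # 23
  - [21, 1, Conv, [256, 1, 1]] # 24

  - [[22, 23, 24], 1, Detect_DynamicHead, [nc]] # Detect(P3, P4, P5)
```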
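3 Training

With the module, the tasks.py registration, and the config file in place, training uses the standard Ultralytics Python API. A minimal sketch (coco128.yaml is the stock sample dataset; substitute your own data config):

```python
from ultralytics import YOLO

# Build the modified model from the custom config and train it
model = YOLO('yolov8_DynamicHead.yaml')
model.train(data='coco128.yaml', epochs=100, imgsz=640)
```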