由于在速度和精度之间取得了合理的权衡,YOLO 系列已成为最流行的实时目标检测框架。然而,研究发现 NMS 会对 YOLO 的速度和精度产生负面影响。最近,基于 Transformer 的端到端检测器(DETRs)提供了一种替代方案:它们不再需要非极大值抑制(NMS)等后处理步骤,而这些步骤一直是传统实时检测器的瓶颈。然而,高昂的计算成本限制了 DETRs 的实用性,使其无法充分发挥免 NMS 的优势。
RT-DETR-R50 / R101 在 COCO 上实现了 53.1% / 54.3% 的 AP,在 T4 GPU 上达到 108 / 74 FPS,在速度和精度方面都优于此前先进的 YOLO 检测器。此外,RT-DETR-R50 在精度上比 DINO-R50 高出 2.2% AP,在 FPS 上高出约 21 倍。经过 Objects365 预训练后,RT-DETR-R50 / R101 的 AP 达到 55.3% / 56.2%。
官方代码地址:DETRs Beat YOLOs on Real-time Object Detection
① 高效混合编码器:通过解耦尺度内交互和跨尺度融合来处理多尺度特征。这种设计在保持高性能的同时显著降低了计算负担,从而实现实时目标检测。
② 不确定性最小化的查询选择:为解码器提供高质量的初始查询,从而提高检测精度。
下图为 CCFF中的融合块。
class RTDETRDecoder(nn.Module):
    """
    Real-Time Deformable Transformer Decoder (RTDETRDecoder) module for object detection.

    The module projects a list of backbone feature maps to a common hidden dimension, flattens and concatenates
    them, selects the top-scoring encoder positions as initial queries, and runs those queries through a stack of
    deformable Transformer decoder layers to produce bounding boxes and class scores.

    Attributes:
        export (bool): When True, `forward` in eval mode returns only the concatenated prediction tensor.
    """

    export = False  # export mode

    def __init__(
        self,
        nc=80,
        ch=(512, 1024, 2048),
        hd=256,  # hidden dim
        nq=300,  # num queries
        ndp=4,  # num decoder points
        nh=8,  # num head
        ndl=6,  # num decoder layers
        d_ffn=1024,  # dim of feedforward
        dropout=0.0,
        act=nn.ReLU(),
        eval_idx=-1,
        # Training args
        nd=100,  # num denoising
        label_noise_ratio=0.5,
        box_noise_scale=1.0,
        learnt_init_query=False,
    ):
        """
        Initializes the RTDETRDecoder module with the given parameters.

        Args:
            nc (int): Number of classes. Default is 80.
            ch (tuple): Channels in the backbone feature maps. Default is (512, 1024, 2048).
            hd (int): Dimension of hidden layers. Default is 256.
            nq (int): Number of query points. Default is 300.
            ndp (int): Number of decoder points. Default is 4.
            nh (int): Number of heads in multi-head attention. Default is 8.
            ndl (int): Number of decoder layers. Default is 6.
            d_ffn (int): Dimension of the feed-forward networks. Default is 1024.
            dropout (float): Dropout rate. Default is 0.
            act (nn.Module): Activation function. Default is nn.ReLU.
            eval_idx (int): Evaluation index. Default is -1.
            nd (int): Number of denoising. Default is 100.
            label_noise_ratio (float): Label noise ratio. Default is 0.5.
            box_noise_scale (float): Box noise scale. Default is 1.0.
            learnt_init_query (bool): Whether to learn initial query embeddings. Default is False.
        """
        super().__init__()
        self.hidden_dim = hd
        self.nhead = nh
        self.nl = len(ch)  # num level
        self.nc = nc
        self.num_queries = nq
        self.num_decoder_layers = ndl

        # Backbone feature projection: one 1x1 conv + batch norm per input level
        self.input_proj = nn.ModuleList(nn.Sequential(nn.Conv2d(x, hd, 1, bias=False), nn.BatchNorm2d(hd)) for x in ch)
        # NOTE: simplified version but it's not consistent with .pt weights.
        # self.input_proj = nn.ModuleList(Conv(x, hd, act=False) for x in ch)

        # Transformer module
        decoder_layer = DeformableTransformerDecoderLayer(hd, nh, d_ffn, dropout, act, self.nl, ndp)
        self.decoder = DeformableTransformerDecoder(hd, decoder_layer, ndl, eval_idx)

        # Denoising part
        self.denoising_class_embed = nn.Embedding(nc, hd)
        self.num_denoising = nd
        self.label_noise_ratio = label_noise_ratio
        self.box_noise_scale = box_noise_scale

        # Decoder embedding
        self.learnt_init_query = learnt_init_query
        if learnt_init_query:
            self.tgt_embed = nn.Embedding(nq, hd)
        self.query_pos_head = MLP(4, 2 * hd, hd, num_layers=2)

        # Encoder head
        self.enc_output = nn.Sequential(nn.Linear(hd, hd), nn.LayerNorm(hd))
        self.enc_score_head = nn.Linear(hd, nc)
        self.enc_bbox_head = MLP(hd, hd, 4, num_layers=3)

        # Decoder head: one score head and one box head per decoder layer
        self.dec_score_head = nn.ModuleList([nn.Linear(hd, nc) for _ in range(ndl)])
        self.dec_bbox_head = nn.ModuleList([MLP(hd, hd, 4, num_layers=3) for _ in range(ndl)])

        self._reset_parameters()

    def forward(self, x, batch=None):
        """
        Runs the forward pass of the module, returning bounding box and classification scores for the input.

        Args:
            x (list[torch.Tensor]): Backbone feature maps, one per entry of `ch`, each shaped (b, c, h, w).
            batch (Any, optional): Passed through to `get_cdn_group` to build denoising queries.
                NOTE(review): presumably the ground-truth batch dict used during training - confirm against
                `get_cdn_group`.

        Returns:
            In training mode, the tuple `(dec_bboxes, dec_scores, enc_bboxes, enc_scores, dn_meta)`.
            Otherwise `y` when `self.export` is True, else `(y, x)` where `y` concatenates the decoder boxes and
            the sigmoid of the decoder scores along the last dimension and `x` is the training-style tuple.
        """
        # Imported lazily inside the method, as in the original code.
        from ultralytics.models.utils.ops import get_cdn_group

        # Input projection and embedding
        feats, shapes = self._get_encoder_input(x)

        # Prepare denoising training
        dn_embed, dn_bbox, attn_mask, dn_meta = get_cdn_group(
            batch,
            self.nc,
            self.num_queries,
            self.denoising_class_embed.weight,
            self.num_denoising,
            self.label_noise_ratio,
            self.box_noise_scale,
            self.training,
        )

        embed, refer_bbox, enc_bboxes, enc_scores = self._get_decoder_input(feats, shapes, dn_embed, dn_bbox)

        # Decoder
        dec_bboxes, dec_scores = self.decoder(
            embed,
            refer_bbox,
            feats,
            shapes,
            self.dec_bbox_head,
            self.dec_score_head,
            self.query_pos_head,
            attn_mask=attn_mask,
        )
        x = dec_bboxes, dec_scores, enc_bboxes, enc_scores, dn_meta
        if self.training:
            return x
        # (bs, 300, 4+nc); squeeze(0) drops a leading size-1 dimension of the decoder outputs if present
        y = torch.cat((dec_bboxes.squeeze(0), dec_scores.squeeze(0).sigmoid()), -1)
        return y if self.export else (y, x)

    def _generate_anchors(self, shapes, grid_size=0.05, dtype=torch.float32, device="cpu", eps=1e-2):
        """
        Generates anchor bounding boxes for given shapes with specific grid size and validates them.

        Args:
            shapes (list): One `[h, w]` pair per feature level.
            grid_size (float): Base anchor width/height; doubled for each successive level.
            dtype (torch.dtype): Data type of the generated tensors.
            device (str | torch.device): Device of the generated tensors.
            eps (float): Anchors with any coordinate outside the open interval (eps, 1 - eps) are marked invalid.

        Returns:
            anchors (torch.Tensor): Anchors in logit (inverse-sigmoid) space, shape (1, sum(h*w), 4); invalid
                anchors are filled with `inf`.
            valid_mask (torch.Tensor): Boolean mask of shape (1, sum(h*w), 1), True where the anchor is valid.
        """
        anchors = []
        for i, (h, w) in enumerate(shapes):
            sy = torch.arange(end=h, dtype=dtype, device=device)
            sx = torch.arange(end=w, dtype=dtype, device=device)
            grid_y, grid_x = torch.meshgrid(sy, sx, indexing="ij") if TORCH_1_10 else torch.meshgrid(sy, sx)
            grid_xy = torch.stack([grid_x, grid_y], -1)  # (h, w, 2)

            # Normalize cell centers to [0, 1] by the level's width and height
            valid_WH = torch.tensor([w, h], dtype=dtype, device=device)
            grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH  # (1, h, w, 2)
            wh = torch.ones_like(grid_xy, dtype=dtype, device=device) * grid_size * (2.0**i)
            anchors.append(torch.cat([grid_xy, wh], -1).view(-1, h * w, 4))  # (1, h*w, 4)

        anchors = torch.cat(anchors, 1)  # (1, sum(h*w), 4)
        valid_mask = ((anchors > eps) & (anchors < 1 - eps)).all(-1, keepdim=True)  # (1, sum(h*w), 1)
        # Inverse sigmoid, so that a later `.sigmoid()` maps the values back to normalized coordinates
        anchors = torch.log(anchors / (1 - anchors))
        anchors = anchors.masked_fill(~valid_mask, float("inf"))
        return anchors, valid_mask

    def _get_encoder_input(self, x):
        """
        Processes and returns encoder inputs by getting projection features from input and concatenating them.

        Args:
            x (list[torch.Tensor]): Feature maps shaped (b, c, h, w), one per projection in `self.input_proj`.

        Returns:
            feats (torch.Tensor): Projected features flattened per level and concatenated, shape (b, sum(h*w), c).
            shapes (list): `[h, w]` of every projected level, in input order.
        """
        # Get projection features
        x = [self.input_proj[i](feat) for i, feat in enumerate(x)]
        # Get encoder inputs
        feats = []
        shapes = []
        for feat in x:
            h, w = feat.shape[2:]
            # [b, c, h, w] -> [b, h*w, c]
            feats.append(feat.flatten(2).permute(0, 2, 1))
            # [nl, 2]
            shapes.append([h, w])

        # [b, sum(h*w), c]
        feats = torch.cat(feats, 1)
        return feats, shapes

    def _get_decoder_input(self, feats, shapes, dn_embed=None, dn_bbox=None):
        """
        Generates and prepares the input required for the decoder from the provided features and shapes.

        Args:
            feats (torch.Tensor): Flattened encoder features, shape (bs, sum(h*w), c).
            shapes (list): `[h, w]` of every feature level.
            dn_embed (torch.Tensor, optional): Denoising query embeddings, prepended to the selected embeddings.
            dn_bbox (torch.Tensor, optional): Denoising reference boxes, prepended to the selected reference boxes.

        Returns:
            embeddings (torch.Tensor): Initial decoder query embeddings.
            refer_bbox (torch.Tensor): Reference boxes in logit space (before sigmoid).
            enc_bboxes (torch.Tensor): Sigmoid of the selected encoder boxes, shape (bs, num_queries, 4).
            enc_scores (torch.Tensor): Encoder class scores of the selected positions, shape (bs, num_queries, nc).
        """
        bs = feats.shape[0]
        # Prepare input for decoder
        anchors, valid_mask = self._generate_anchors(shapes, dtype=feats.dtype, device=feats.device)
        features = self.enc_output(valid_mask * feats)  # bs, h*w, 256

        enc_outputs_scores = self.enc_score_head(features)  # (bs, h*w, nc)

        # Query selection: keep the num_queries positions with the highest per-position max class score
        # (bs * num_queries,) flattened position indices
        topk_ind = torch.topk(enc_outputs_scores.max(-1).values, self.num_queries, dim=1).indices.view(-1)
        # (bs * num_queries,) matching batch indices, created on the same device as `topk_ind`
        batch_ind = (
            torch.arange(end=bs, dtype=topk_ind.dtype, device=topk_ind.device)
            .unsqueeze(-1)
            .repeat(1, self.num_queries)
            .view(-1)
        )

        # (bs, num_queries, 256)
        top_k_features = features[batch_ind, topk_ind].view(bs, self.num_queries, -1)
        # (bs, num_queries, 4)
        top_k_anchors = anchors[:, topk_ind].view(bs, self.num_queries, -1)

        # Dynamic anchors + static content
        refer_bbox = self.enc_bbox_head(top_k_features) + top_k_anchors

        enc_bboxes = refer_bbox.sigmoid()
        if dn_bbox is not None:
            refer_bbox = torch.cat([dn_bbox, refer_bbox], 1)
        enc_scores = enc_outputs_scores[batch_ind, topk_ind].view(bs, self.num_queries, -1)

        embeddings = self.tgt_embed.weight.unsqueeze(0).repeat(bs, 1, 1) if self.learnt_init_query else top_k_features
        if self.training:
            # Detach so the decoder inputs do not backpropagate into the query selection
            refer_bbox = refer_bbox.detach()
            if not self.learnt_init_query:
                embeddings = embeddings.detach()
        if dn_embed is not None:
            embeddings = torch.cat([dn_embed, embeddings], 1)

        return embeddings, refer_bbox, enc_bboxes, enc_scores

    # TODO
    def _reset_parameters(self):
        """Initializes or resets the parameters of the model's various components with predefined weights and biases."""
        # Class and bbox head init
        bias_cls = bias_init_with_prob(0.01) / 80 * self.nc
        # NOTE: the weight initialization in `linear_init` would cause NaN when training with custom datasets.
        # linear_init(self.enc_score_head)
        constant_(self.enc_score_head.bias, bias_cls)
        constant_(self.enc_bbox_head.layers[-1].weight, 0.0)
        constant_(self.enc_bbox_head.layers[-1].bias, 0.0)
        for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head):
            # linear_init(cls_)
            constant_(cls_.bias, bias_cls)
            constant_(reg_.layers[-1].weight, 0.0)
            constant_(reg_.layers[-1].bias, 0.0)

        linear_init(self.enc_output[0])
        xavier_uniform_(self.enc_output[0].weight)
        if self.learnt_init_query:
            xavier_uniform_(self.tgt_embed.weight)
        xavier_uniform_(self.query_pos_head.layers[0].weight)
        xavier_uniform_(self.query_pos_head.layers[1].weight)
        for layer in self.input_proj:
            xavier_uniform_(layer[0].weight)
