__version__ = '8.0.110'
Prior knowledge
Unified notation
a = na = 3hw = h*w = num_anchor = 8400 = 80*80 + 40*40 + 20*20. Since YOLOv8 is anchor-free, this is simply the total number of anchor points (num_total_anchor).
batch size: bs = b = 32
lhw = 80*80, mhw = 40*40, shw = 20*20 (the three feature-map sizes)
n_max_boxes = max_num_obj = 22, roughly max(batch.len(label))
len(label) is the number of labels of one image in the batch, and max(batch.len(label)) picks the largest of these values.
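To make the notation above concrete, here is a minimal sketch (not the ultralytics source; the name sketch_make_anchors and the 640x640 input with strides 8/16/32 are assumptions for this example) showing where 8400 comes from and roughly what make_anchors returns:

import torch

def sketch_make_anchors(img_size=640, strides=(8, 16, 32), offset=0.5):
    anchor_points, stride_tensor = [], []
    for s in strides:
        h = w = img_size // s                              # 80, 40, 20
        sx = torch.arange(w, dtype=torch.float) + offset   # cell-center x, in feature-map units
        sy = torch.arange(h, dtype=torch.float) + offset   # cell-center y
        sy, sx = torch.meshgrid(sy, sx, indexing='ij')
        anchor_points.append(torch.stack((sx, sy), -1).view(-1, 2))
        stride_tensor.append(torch.full((h * w, 1), float(s)))
    return torch.cat(anchor_points), torch.cat(stride_tensor)

points, strides = sketch_make_anchors()
print(points.shape, strides.shape)  # torch.Size([8400, 2]) torch.Size([8400, 1])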
Loss function code walkthrough
'''
Logic:
1. Concatenate the three prediction feature maps and split them, giving pred_scores (bs, 3hw, 2) and pred_distri (bs, 3hw, 64)
2. Generate anchor_points [3hw, 2] and stride_tensor [3hw, 1]
3. Generate gt_labels and gt_bboxes, both zero-padded
4. Decode pred_distri into xyxy, giving pred_bboxes (bs, 3hw, 4)
5. Use TAL to assign positive/negative samples, producing target_bboxes and target_scores
   (both aligned one-to-one with the predicted bboxes/scores); fg_mask is the final assignment result
6. Compute cls_loss and box_loss
'''
# Criterion class for computing training losses
class Loss:

    def __init__(self, model):  # model must be de-paralleled
        device = next(model.parameters()).device  # get model device
        h = model.args  # hyperparameters

        m = model.model[-1]  # Detect() module
        self.bce = nn.BCEWithLogitsLoss(reduction='none')
        self.hyp = h
        self.stride = m.stride  # model strides
        self.nc = m.nc  # number of classes
        self.no = m.no  # the n in each head's output [b, n, h, w]: 2 (class) + 4 (box) * 16 (reg_max) = 66
        self.reg_max = m.reg_max  # dfl reg_max
        self.device = device

        self.use_dfl = m.reg_max > 1

        self.assigner = TaskAlignedAssigner(topk=10, num_classes=self.nc, alpha=0.5, beta=6.0)
        self.bbox_loss = BboxLoss(m.reg_max - 1, use_dfl=self.use_dfl).to(device)
        self.proj = torch.arange(m.reg_max, dtype=torch.float, device=device)

    def preprocess(self, targets, batch_size, scale_tensor):
        """
        Build a tensor composed of targets padded with zeros.
        counts is the number of labels per image in the batch; the maximum count determines out,
        an all-zero tensor of shape [Batch, max_labels, 5].
        The labels (cls, xyxy) of each image are copied into out[..., :5]: out[..., 0] is cls, out[..., 1:5] is xyxy.
        For images with fewer than max_labels labels, the remaining rows out[B, len(targets):, :] stay all zeros.
        Args:
            targets: torch.Size([na, 6])
            batch_size: int
            scale_tensor: tensor([640, 640, 640, 640])
        return:
            out: torch.Size([Batch, max_labels, 5])
        """
        """Preprocesses the target counts and matches with the input batch size to output a tensor."""
        if targets.shape[0] == 0:
            out = torch.zeros(batch_size, 0, 5, device=self.device)
        else:
            i = targets[:, 0]  # image index
            _, counts = i.unique(return_counts=True)  # number of occurrences of each image index, torch.Size([32])
            counts = counts.to(dtype=torch.int32)
            out = torch.zeros(batch_size, counts.max(), 5, device=self.device)
            for j in range(batch_size):
                matches = i == j
                n = matches.sum()
                if n:
                    # copy the labels (cls, xyxy) of the matching image into out[..., :5];
                    # out[..., 0] is cls, out[..., 1:5] is xyxy
                    # j is the image index, n is its number of labels, out[j, n:] stays all zeros
                    out[j, :n] = targets[matches, 1:]
            # scale_tensor: [640, 640, 640, 640]
            # mul_ multiplies out[..., 1:5] element-wise by scale_tensor
            out[..., 1:5] = xywh2xyxy(out[..., 1:5].mul_(scale_tensor))
        return out

    def bbox_decode(self, anchor_points, pred_dist):
        # bbox_decode decodes the actual xyxy box coordinates from the predicted distribution (pred_dist) and the anchor points.
        """Decode predicted object bounding box coordinates from anchor points and distribution."""
        if self.use_dfl:
            b, a, c = pred_dist.shape  # batch, anchors, channels
            # before the matmul, pred_dist has shape [b, a, 4, 16] and self.proj has shape [16]; the result has shape [b, a, 4]
            # if this is unclear, try a = torch.ones((1, 3, 4, 16)) matmul'd with b = torch.rand(16)
            pred_dist = pred_dist.view(b, a, 4, c // 4).softmax(3).matmul(self.proj.type(pred_dist.dtype))
            # pred_dist = pred_dist.view(b, a, c // 4, 4).transpose(2,3).softmax(3).matmul(self.proj.type(pred_dist.dtype))
            # pred_dist = (pred_dist.view(b, a, c // 4, 4).softmax(2) * self.proj.type(pred_dist.dtype).view(1, 1, -1, 1)).sum(2)
        return dist2bbox(pred_dist, anchor_points, xywh=False)

    # assume a two-class detector ['flame', 'smoke']
    # if the dataset uses mosaic augmentation, the label count per batch_idx no longer matches the label count of the image in im_file
    # list((32, )) denotes a list; (32, ) is shape-like notation
    # stride: the downsampling factor of each head's feature map relative to the model input image
    def __call__(self, preds, batch):
        """
        Args:
            preds: [tensor(Size([32, 66, 80, 80])), tensor(Size([32, 66, 40, 40])), tensor(Size([32, 66, 20, 20]))]
            batch: {'im_file': list((32, )), 'ori_shape': list((32, )), 'resized_shape': list((32, )),
                    'img': tensor(Size[32, 3, 640, 640]), 'cls': tensor(Size[211, 1]),
                    'bboxes': tensor(Size[211, 4]), 'batch_idx': Size[211]}
        """
        """Calculate the sum of the loss for box, cls and dfl multiplied by batch size."""
        loss = torch.zeros(3, device=self.device)  # box, cls, dfl
        feats = preds[1] if isinstance(preds, tuple) else preds  # preds is a list, so feats = preds
        # feats[0].shape[0]: bs    self.no: 66
        # xi: tensor(Size[bs, 66, w*h]); the three heads' feature maps are 20, 40, 80 (large, medium, small objects)
        # pred_distri: tensor(Size[bs, 64, 8400])    pred_scores: tensor(Size[bs, 2, 8400])
        # pred_distri holds the box-distance distributions (decoded to box coordinates later); pred_scores holds the classification scores
        pred_distri, pred_scores = torch.cat([xi.view(feats[0].shape[0], self.no, -1) for xi in feats], 2).split(
            (self.reg_max * 4, self.nc), 1)

        pred_scores = pred_scores.permute(0, 2, 1).contiguous()  # (bs, 3hw, 2)
        pred_distri = pred_distri.permute(0, 2, 1).contiguous()  # (bs, 3hw, 64)

        dtype = pred_scores.dtype  # dtype: torch.float16
        batch_size = pred_scores.shape[0]  # batch_size: 32
        imgsz = torch.tensor(feats[0].shape[2:], device=self.device, dtype=dtype) * self.stride[0]  # image size (h,w), imgsz: [640, 640]
        # anchor_points: torch.Size([8400, 2]); the anchor center points of the three feature maps 80*80, 40*40, 20*20
        # stride_tensor: torch.Size([8400, 1]); the stride corresponding to each center point
        anchor_points, stride_tensor = make_anchors(feats, self.stride, 0.5)

        # targets
        # targets: shape [na, 6]; here na is the total number of labels across the 32 mosaic'd images, 6 is (batch_idx, cls, xyxy)
        targets = torch.cat((batch['batch_idx'].view(-1, 1), batch['cls'].view(-1, 1), batch['bboxes']), 1)
        # targets: torch.Size([bs, n_max_boxes, 5]); here 22 is the maximum label count among the 32 images
        targets = self.preprocess(targets.to(self.device), batch_size, scale_tensor=imgsz[[1, 0, 1, 0]])
        # gt_labels: torch.Size([bs, n_max_boxes, 1]), gt_bboxes: torch.Size([bs, n_max_boxes, 4])
        gt_labels, gt_bboxes = targets.split((1, 4), 2)  # cls, xyxy
        # mask_gt: torch.Size([bs, n_max_boxes, 1])
        # marks whether a gt exists: sum the xyxy values, then gt_(0) sets entries greater than 0 to 1, otherwise 0
        mask_gt = gt_bboxes.sum(2, keepdim=True).gt_(0)

        # pboxes
        # decode pred_distri into box coordinates, pred_bboxes: torch.Size([bs, a, 4])
        pred_bboxes = self.bbox_decode(anchor_points, pred_distri)  # xyxy, (bs, h*w, 4)

        # fg_mask: torch.Size([bs, h*w]); True marks positive samples, False marks negatives
        # target_bboxes: torch.Size([bs, h*w, 4]), aligned one-to-one with pred_bboxes
        # target_bboxes still contains negatives among the h*w entries; negatives do not join the bbox loss and are handled later
        # target_scores: shape (bs, h*w, num_class); for negatives all num_class values are 0
        # e.g. bs=1, h*w=2, num_class=2, target_scores: tensor([[0, 0], [0, 1]]) (the 1 may actually be a fraction)
        # [0, 0] means pd0 is a negative; [0, 1] means pd1 is a positive matched to gt1,
        # i.e. for the two classes {'0': flame, '1': smoke} it corresponds to '1': smoke
        _, target_bboxes, target_scores, fg_mask, _ = self.assigner(
            pred_scores.detach().sigmoid(), (pred_bboxes.detach() * stride_tensor).type(gt_bboxes.dtype),
            anchor_points * stride_tensor, gt_labels, gt_bboxes, mask_gt)

        # used when averaging the classification loss
        # target_scores_sum: tensor(734.898, device='cuda:0')
        # the sum has a fractional part because target_scores was multiplied by a dynamic weight at the end of the assignment
        target_scores_sum = max(target_scores.sum(), 1)

        # cls loss
        # loss[1] = self.varifocal_loss(pred_scores, target_scores, target_labels) / target_scores_sum  # VFL way
        # BCE loss: both positives and negatives take part in the classification loss
        # see https://blog.csdn.net/shilichangtin/article/details/135185583 for details
        # dividing by target_scores_sum averages the loss
        loss[1] = self.bce(pred_scores, target_scores.to(dtype)).sum() / target_scores_sum  # BCE

        # bbox loss
        # only positive samples take part in the bbox loss
        if fg_mask.sum():
            target_bboxes /= stride_tensor
            loss[0], loss[2] = self.bbox_loss(pred_distri, pred_bboxes, anchor_points, target_bboxes, target_scores,
                                              target_scores_sum, fg_mask)

        loss[0] *= self.hyp.box  # box gain
        loss[1] *= self.hyp.cls  # cls gain
        loss[2] *= self.hyp.dfl  # dfl gain

        return loss.sum() * batch_size, loss.detach()  # loss(box, cls, dfl)
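The comment inside bbox_decode suggests playing with a toy matmul; here is a small standalone illustration of that softmax-then-matmul expectation trick (all tensor names below are made up for the example):

import torch

b, a, reg_max = 1, 3, 16
pred_dist = torch.randn(b, a, 4 * reg_max)            # one 16-bin distribution per box edge
proj = torch.arange(reg_max, dtype=torch.float)       # [0, 1, ..., 15]

probs = pred_dist.view(b, a, 4, reg_max).softmax(3)   # normalize each edge's 16 bins
dist = probs.matmul(proj)                              # expectation over the bins -> [b, a, 4]
print(dist.shape)                                      # torch.Size([1, 3, 4])

# the same result written explicitly as a weighted sum
dist_manual = (probs * proj).sum(-1)
print(torch.allclose(dist, dist_manual))               # True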
Bbox loss
class BboxLoss(nn.Module):

    def __init__(self, reg_max, use_dfl=False):
        """Initialize the BboxLoss module with regularization maximum and DFL settings."""
        super().__init__()
        self.reg_max = reg_max
        self.use_dfl = use_dfl

    def forward(self, pred_dist, pred_bboxes, anchor_points, target_bboxes, target_scores, target_scores_sum, fg_mask):
        """IoU loss."""
        # weight: torch.Size([1700, 1])
        weight = target_scores.sum(-1)[fg_mask].unsqueeze(-1)
        # iou: torch.Size([1700, 1])
        # only the positive samples take part in the box loss
        iou = bbox_iou(pred_bboxes[fg_mask], target_bboxes[fg_mask], xywh=False, CIoU=True)
        # analogous to averaging the classification loss; note again that target_scores may be fractional, so weight is fractional too
        loss_iou = ((1.0 - iou) * weight).sum() / target_scores_sum

        # DFL loss
        # DFL loss paper explanation: https://blog.csdn.net/shilichangtin/article/details/135505430
        if self.use_dfl:
            target_ltrb = bbox2dist(anchor_points, target_bboxes, self.reg_max)
            loss_dfl = self._df_loss(pred_dist[fg_mask].view(-1, self.reg_max + 1), target_ltrb[fg_mask]) * weight
            loss_dfl = loss_dfl.sum() / target_scores_sum
        else:
            loss_dfl = torch.tensor(0.0).to(pred_dist.device)

        return loss_iou, loss_dfl
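BboxLoss.forward calls self._df_loss, which is not shown in this post. Roughly, it does the following (a sketch reconstructed from memory of ultralytics 8.0.x, written as a standalone function here, so verify against the source): each continuous target distance is split between its two neighbouring integer bins, and a cross-entropy is taken against each bin, weighted by how close the target is to it.

import torch
import torch.nn.functional as F

def df_loss_sketch(pred_dist, target):
    """Distribution Focal Loss: CE against the two integer bins surrounding each continuous target."""
    tl = target.long()            # left integer bin
    tr = tl + 1                   # right integer bin
    wl = tr - target              # weight of the left bin (larger when the target is closer to it)
    wr = 1 - wl                   # weight of the right bin
    return (F.cross_entropy(pred_dist, tl.view(-1), reduction='none').view(tl.shape) * wl +
            F.cross_entropy(pred_dist, tr.view(-1), reduction='none').view(tl.shape) * wr).mean(-1, keepdim=True)

# pred_dist: one 16-bin distribution per edge of each positive anchor; target: continuous ltrb distances
print(df_loss_sketch(torch.randn(3 * 4, 16), torch.rand(3, 4) * 14.99).shape)  # torch.Size([3, 1])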
DFL loss explained
Detection head code
class Detect(nn.Module):
    """YOLOv8 Detect head for detection models."""
    dynamic = False  # force grid reconstruction
    export = False  # export mode
    shape = None
    anchors = torch.empty(0)  # init
    strides = torch.empty(0)  # init

    def __init__(self, nc=80, ch=()):  # detection layer
        super().__init__()
        self.nc = nc  # number of classes
        self.nl = len(ch)  # number of detection layers
        self.reg_max = 16  # DFL channels (ch[0] // 16 to scale 4/8/12/16/20 for n/s/m/l/x)
        self.no = nc + self.reg_max * 4  # number of outputs per anchor
        self.stride = torch.zeros(self.nl)  # strides computed during build
        c2, c3 = max((16, ch[0] // 4, self.reg_max * 4)), max(ch[0], self.nc)  # channels
        # bbox branch
        self.cv2 = nn.ModuleList(
            nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * self.reg_max, 1)) for x in ch)
        # cls branch
        self.cv3 = nn.ModuleList(nn.Sequential(Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, self.nc, 1)) for x in ch)
        # DFL: converts the probability distribution into offsets from the anchor points to the box edges
        self.dfl = DFL(self.reg_max) if self.reg_max > 1 else nn.Identity()

    def forward(self, x):
        """Concatenates and returns predicted bounding boxes and class probabilities."""
        shape = x[0].shape  # BCHW
        for i in range(self.nl):
            x[i] = torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1)
        # during training, x is returned directly
        if self.training:
            return x
        elif self.dynamic or self.shape != shape:
            self.anchors, self.strides = (x.transpose(0, 1) for x in make_anchors(x, self.stride, 0.5))
            self.shape = shape

        x_cat = torch.cat([xi.view(shape[0], self.no, -1) for xi in x], 2)
        if self.export and self.format in ('saved_model', 'pb', 'tflite', 'edgetpu', 'tfjs'):  # avoid TF FlexSplitV ops
            box = x_cat[:, :self.reg_max * 4]
            cls = x_cat[:, self.reg_max * 4:]
        else:
            box, cls = x_cat.split((self.reg_max * 4, self.nc), 1)
        dbox = dist2bbox(self.dfl(box), self.anchors.unsqueeze(0), xywh=True, dim=1) * self.strides

        y = torch.cat((dbox, cls.sigmoid()), 1)
        return y if self.export else (y, x)
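Both the head above and bbox_decode in the loss call dist2bbox. As a rough sketch of what it does (the name dist2bbox_sketch is made up for illustration; check the ultralytics source for the exact implementation), it turns (left, top, right, bottom) distances plus anchor points into xyxy or xywh boxes:

import torch

def dist2bbox_sketch(distance, anchor_points, xywh=True, dim=-1):
    """Convert (l, t, r, b) distances from anchor points into boxes."""
    lt, rb = distance.chunk(2, dim)        # split into (left, top) and (right, bottom)
    x1y1 = anchor_points - lt              # top-left corner
    x2y2 = anchor_points + rb              # bottom-right corner
    if xywh:
        c_xy = (x1y1 + x2y2) / 2           # box center
        wh = x2y2 - x1y1                   # box width/height
        return torch.cat((c_xy, wh), dim)
    return torch.cat((x1y1, x2y2), dim)    # xyxy, as used in the loss

# an anchor at (10.5, 10.5) with distances (2, 3, 4, 5) -> box (8.5, 7.5, 14.5, 15.5)
print(dist2bbox_sketch(torch.tensor([[2., 3., 4., 5.]]), torch.tensor([[10.5, 10.5]]), xywh=False))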
Explanation of the line self.dfl = DFL(self.reg_max) if self.reg_max > 1 else nn.Identity()
class DFL(nn.Module):
    """
    Integral module of Distribution Focal Loss (DFL).
    Proposed in Generalized Focal Loss https://ieeexplore.ieee.org/document/9792391
    """

    def __init__(self, c1=16):
        """Initialize a convolutional layer with a given number of input channels."""
        super().__init__()
        # build a conv2d whose fixed weights are [0, 1, ..., 7, 8, ..., 15] (since reg_max is 16)
        self.conv = nn.Conv2d(c1, 1, 1, bias=False).requires_grad_(False)
        x = torch.arange(c1, dtype=torch.float)
        self.conv.weight.data[:] = nn.Parameter(x.view(1, c1, 1, 1))
        self.c1 = c1

    def forward(self, x):
        """Applies a transformer layer on input tensor 'x' and returns a tensor."""
        b, c, a = x.shape  # batch, channels, anchors
        # convert the probability distribution into offsets from the anchor points to the box edges
        return self.conv(x.view(b, 4, self.c1, a).transpose(2, 1).softmax(1)).view(b, 4, a)
        # return self.conv(x.view(b, self.c1, 4, a).softmax(1)).view(b, 4, a)
What the self.dfl above computes is essentially
\hat{y}=\sum_{i=0}^{n}P(y_i)y_i=\sum_{i=0}^{15}P(y_i)y_i=0\times{P(y_0)}+1\times{P(y_1)}+\dots+7\times{P(y_7)}+8\times{P(y_8)}+\dots+15\times{P(y_{15})}
This $\hat{y}$ is the offset from an anchor point to a bbox edge.
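A quick numeric check of the DFL module (purely illustrative numbers): feed a distribution that puts all its weight on bins 3 and 4 of one edge, and the module outputs their average, 3.5, which is exactly the expectation above.

import torch
import torch.nn as nn

c1, b, a = 16, 1, 1                        # reg_max bins, 1 image, 1 anchor
conv = nn.Conv2d(c1, 1, 1, bias=False).requires_grad_(False)
conv.weight.data[:] = torch.arange(c1, dtype=torch.float).view(1, c1, 1, 1)

# logits for 4 edges x 16 bins; make the first edge prefer bins 3 and 4 equally
x = torch.full((b, 4 * c1, a), -1e9)
x[0, 3, 0] = 0.0                           # bin 3 of the first edge
x[0, 4, 0] = 0.0                           # bin 4 of the first edge

probs = x.view(b, 4, c1, a).transpose(2, 1).softmax(1)   # softmax over the 16 bins
print(conv(probs).view(b, 4, a)[0, 0, 0])                # ~3.5 = 0.5*3 + 0.5*4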
Training process
Computation flow:
1. During training, the detection head class Detect(nn.Module) directly returns the raw distributions x (which, once the three levels are flattened and concatenated in the loss, has shape [bs, num_anchor, 4 * reg_max + cls_num]; num_anchor is the total number of anchor points across the three detection layers, normally 80*80 + 40*40 + 20*20 = 8400).
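To make the shape bookkeeping of step 1 concrete, here is a standalone toy run (random tensors stand in for the three head outputs; nc is assumed to be 2 as in the flame/smoke example):

import torch

bs, nc, reg_max = 32, 2, 16
no = nc + 4 * reg_max                                           # 66
feats = [torch.randn(bs, no, s, s) for s in (80, 40, 20)]       # training-mode head outputs

x_cat = torch.cat([xi.view(bs, no, -1) for xi in feats], 2)     # (32, 66, 8400)
pred_distri, pred_scores = x_cat.split((4 * reg_max, nc), 1)    # (32, 64, 8400), (32, 2, 8400)
pred_scores = pred_scores.permute(0, 2, 1).contiguous()         # (32, 8400, 2)
pred_distri = pred_distri.permute(0, 2, 1).contiguous()         # (32, 8400, 64)
print(pred_scores.shape, pred_distri.shape)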
2. Then target_box is converted to target_dist, i.e. the gt box is converted into the distances from the anchor points to its four edges.
target_ltrb = bbox2dist(anchor_points, target_bboxes, self.reg_max)
def bbox2dist(anchor_points, bbox, reg_max):
"""Transform bbox(xyxy) to dist(ltrb)."""
x1y1, x2y2 = bbox.chunk(2, -1)
    # anchor_points - x1y1: distance from the anchor points to the left-top corner
    # x2y2 - anchor_points: distance from the anchor points to the right-bottom corner
return torch.cat((anchor_points - x1y1, x2y2 - anchor_points), -1).clamp_(0, reg_max - 0.01) # dist (lt, rb)
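A small worked example of step 2 (arbitrary numbers): an anchor point at (12.5, 20.5) in feature-map units and a gt box (10, 18, 30, 40) give ltrb distances (2.5, 2.5, 17.5, 19.5), and the clamp caps the last two just below reg_max (here reg_max is 15, i.e. m.reg_max - 1 as passed into BboxLoss):

import torch

def bbox2dist_sketch(anchor_points, bbox, reg_max):
    """Same logic as bbox2dist above, repeated so the example is standalone."""
    x1y1, x2y2 = bbox.chunk(2, -1)
    return torch.cat((anchor_points - x1y1, x2y2 - anchor_points), -1).clamp_(0, reg_max - 0.01)

anchor = torch.tensor([[12.5, 20.5]])            # anchor center, feature-map units
gt = torch.tensor([[10., 18., 30., 40.]])        # gt box (xyxy), already divided by the stride
print(bbox2dist_sketch(anchor, gt, reg_max=15))  # tensor([[ 2.5,  2.5, 14.99, 14.99]]) -- r and b are clamped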
3. Compute the DFL loss
# pred_dist[fg_mask]: compute only for the positive samples
# view(-1, self.reg_max + 1): the + 1 is needed because BboxLoss was initialized with m.reg_max - 1, which already subtracted 1
loss_dfl = self._df_loss(pred_dist[fg_mask].view(-1, self.reg_max + 1), target_ltrb[fg_mask]) * weight
Note that pred_dist[fg_mask].view(-1, self.reg_max + 1) has a shape like [num_pos * 4, reg_max], because F.cross_entropy expects an [N, C] input, i.e. one set of bin scores per edge. For a detailed code walkthrough see the DFL loss part of the paper-explanation post.
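To see how a fractional target distance is handled inside that cross-entropy, here is a toy single-edge example (numbers chosen arbitrarily): a target of 2.6 is split between bins 2 and 3 with weights 0.4 and 0.6.

import torch
import torch.nn.functional as F

reg_max = 16
pred = torch.randn(1, reg_max)        # logits of one edge's 16-bin distribution
target = torch.tensor([2.6])          # continuous distance target for that edge

tl = target.long()                    # left bin: 2
tr = tl + 1                           # right bin: 3
wl = tr - target                      # 0.4 -> weight of bin 2
wr = 1 - wl                           # 0.6 -> weight of bin 3

loss = (F.cross_entropy(pred, tl, reduction='none') * wl +
        F.cross_entropy(pred, tr, reduction='none') * wr)
print(loss)                           # loss for this single edge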
Prediction process
1. The model first produces a probability distribution over reg_max (default 16) bins corresponding to $\{0, 1, \dots, 7, 8, \dots, 14, 15\}$; these values are the candidate distances from an anchor point to a bbox edge. Suppose the model predicts $\{0.01, 0.05, \dots, 0.12, 0.23, \dots, 0.01, 0.34\}$: this means the probability that the distance from the anchor point to the bbox edge is 0 equals 0.01, and the probability that the distance is 15 equals 0.34.
2. Then the self.dfl from the detection head code above computes the expectation $\hat{y}$ of the distance from the anchor point to the bbox edge; this $\hat{y}$ is the model's final predicted distance from the anchor point to the bbox edge. The expectation is at most 15, i.e. the largest distance (in feature-map units) the model can predict from an anchor point to a bbox edge is 15.
YOLOv5 used BCEcls = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([h['cls_pw']], device=device)), which defaults to reduction='mean', so the loss is averaged immediately after being computed.
When YOLOv8's detection Loss is initialized, the code is self.bce = nn.BCEWithLogitsLoss(reduction='none') (for how reduction='none' behaves, see my torch.nn.BCEWithLogitsLoss usage post), so the loss has to be averaged manually by dividing by target_scores_sum. Only positives contribute to this denominator, since the scores of negatives are all 0 and the sum effectively excludes them, whereas self.bce(pred_scores, target_scores.to(dtype)) produces values for both positives and negatives, so both take part in the .sum(); this seems somewhat unfair, and I still don't fully understand it (YOLOv5, with reduction='mean', simply divides by the total element count). After reading the GFL paper, I found this division by target_scores_sum is mentioned there too (see the last part of my paper-explanation post), except that GFL does not divide by the sum of target_scores but directly by the number of positive samples $N_{pos}$.
The later box_loss computation also uses target_scores_sum and target_scores for averaging, just like cls_loss (only positive samples take part in the box loss, so the concern above does not arise there).
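A small comparison of the two normalizations discussed above, with toy numbers (not real training values):

import torch
import torch.nn as nn

pred_scores = torch.randn(2, 5, 2)                 # (bs, num_anchor, nc) logits
target_scores = torch.zeros(2, 5, 2)
target_scores[0, 1, 1] = 0.7                       # one positive with a soft (TAL-weighted) target
target_scores[1, 3, 0] = 0.4                       # another positive

bce_none = nn.BCEWithLogitsLoss(reduction='none')
target_scores_sum = max(target_scores.sum(), 1)    # 1.1 -> denominator built from positives only
loss_v8 = bce_none(pred_scores, target_scores).sum() / target_scores_sum

bce_mean = nn.BCEWithLogitsLoss(reduction='mean')  # YOLOv5-style: divide by the total element count
loss_v5_style = bce_mean(pred_scores, target_scores)
print(loss_v8, loss_v5_style)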