WIDERFace数据集可参考本链接:WIDER FACE数据集简介 - 知乎,链接中也有下载地址。





        下载好的数据集解压到你想解压的文件夹中 。以train文件夹为例,里面应该包含一个文件夹和一个文件,如下图


  1. python3 train2yolo.py
  2. python3 val2yolo.py










        训练过程中,会默认下载VOC数据集。但如果我们只想训练人脸识别,这个数据集是不需要的,如果不想下载,可以将download: bash data/scripts/get_voc.sh行注释掉。



python3 train.py --data data/widerface.yaml --cfg models/yolov5s.yaml --weights 'weights/yolov5s.pt'


二 代码详解





  1. #y[..., 5:15] = y[..., 5:15] * 8 - 4
  2. y[..., 5:7] = y[..., 5:7] * self.anchor_grid[i] + self.grid[i].to(x[i].device) * self.stride[i] # landmark x1 y1 得到实际像素坐标
  3. y[..., 7:9] = y[..., 7:9] * self.anchor_grid[i] + self.grid[i].to(x[i].device) * self.stride[i]# landmark x2 y2
  4. y[..., 9:11] = y[..., 9:11] * self.anchor_grid[i] + self.grid[i].to(x[i].device) * self.stride[i]# landmark x3 y3
  5. y[..., 11:13] = y[..., 11:13] * self.anchor_grid[i] + self.grid[i].to(x[i].device) * self.stride[i]# landmark x4 y4
  6. y[..., 13:15] = y[..., 13:15] * self.anchor_grid[i] + self.grid[i].to(x[i].device) * self.stride[i]# landmark x5 y5



y = torch.full_like(x[i], 0) 的作用是创建一个形状与第一个参数(x[i])相同、元素全为第二个参数(此处为0)的tensor



在这里,我们实例化了一个LandmarksLoss类的对象landmarks_loss(传入的1.0对应构造函数的alpha参数)(179行),之后调用它时传入的参数依次为预测值、真实值和mask。

  1. plandmarks = ps[:,5:15]
  2. plandmarks[:, 0:2] = plandmarks[:, 0:2] * anchors[i] #缩放回原来的坐标
  3. plandmarks[:, 2:4] = plandmarks[:, 2:4] * anchors[i]
  4. plandmarks[:, 4:6] = plandmarks[:, 4:6] * anchors[i]
  5. plandmarks[:, 6:8] = plandmarks[:, 6:8] * anchors[i]
  6. plandmarks[:, 8:10] = plandmarks[:,8:10] * anchors[i]
  7. lmark += landmarks_loss(plandmarks, tlandmarks[i], lmks_mask[i]) #这里才是传入参数进行计算 lmark是一个值 forward是在__call__中调用的,而__call__函数是在类的对象使用‘()’时被调用


lks_mask = torch.where(lks < 0, torch.full_like(lks, 0.), torch.full_like(lks, 1.0))



  1. class WingLoss(nn.Module):
  2. def __init__(self, w=10, e=2):
  3. super(WingLoss, self).__init__()
  4. # https://arxiv.org/pdf/1711.06753v4.pdf Figure 5
  5. self.w = w
  6. self.e = e
  7. self.C = self.w - self.w * np.log(1 + self.w / self.e)
  8. def forward(self, x, t, sigma=1): #这里的x,t分别对应之后的pret,truel
  9. weight = torch.ones_like(t) #返回一个元素全为1的张量,形状与t相同
  10. weight[torch.where(t==-1)] = 0
  11. diff = weight * (x - t)
  12. abs_diff = diff.abs()
  13. flag = (abs_diff.data < self.w).float()
  14. y = flag * self.w * torch.log(1 + abs_diff / self.e) + (1 - flag) * (abs_diff - self.C) #全是0,1
  15. return y.sum()
  16. class LandmarksLoss(nn.Module):
  17. # BCEwithLogitLoss() with reduced missing label effects.
  18. def __init__(self, alpha=1.0):
  19. super(LandmarksLoss, self).__init__()
  20. self.loss_fcn = WingLoss()#nn.SmoothL1Loss(reduction='sum')
  21. self.alpha = alpha
  22. def forward(self, pred, truel, mask): #预测的,真实的 600(原来为62*10)(推测是去掉了那些没有标注的值)
  23. loss = self.loss_fcn(pred*mask, truel*mask) #一个值(tensor)
  24. return loss / (torch.sum(mask) + 10e-14)

然后我们回到LandmarksLoss类,如下段代码,我们通过 loss = self.loss_fcn(pred*mask, truel*mask)语句将预测值和真实值(均已乘以mask)传入WingLoss。

  1. class LandmarksLoss(nn.Module):
  2. # BCEwithLogitLoss() with reduced missing label effects.
  3. def __init__(self, alpha=1.0):
  4. super(LandmarksLoss, self).__init__()
  5. self.loss_fcn = WingLoss()#nn.SmoothL1Loss(reduction='sum')
  6. self.alpha = alpha
  7. def forward(self, pred, truel, mask):
  8. loss = self.loss_fcn(pred*mask, truel*mask)
  9. return loss / (torch.sum(mask) + 10e-14)


  1. # 计算objectness的损失
  2. obji = self.BCEobj(pi[..., 4], tobj)
  3. lobj += obji * self.balance[i] # obj loss 也是一个值






  1. def compute_loss(p, targets, model): # predictions, targets, model
  2. device = targets.device
  3. lcls, lbox, lobj, lmark = torch.zeros(1, device=device), torch.zeros(1, device=device), torch.zeros(1, device=device), torch.zeros(1, device=device)
  4. tcls, tbox, indices, anchors, tlandmarks, lmks_mask = build_targets(p, targets, model) # targets
  5. h = model.hyp # hyperparameters
  6. # Define criteria
  7. BCEcls = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([h['cls_pw']], device=device)) # weight=model.class_weights)
  8. BCEobj = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([h['obj_pw']], device=device))
  9. landmarks_loss = LandmarksLoss(1.0) #这里只是初始化了一个landmarks_loss的类
  10. # Class label smoothing https://arxiv.org/pdf/1902.04103.pdf eqn 3
  11. cp, cn = smooth_BCE(eps=0.0)
  12. ...
  13. ...
  14. ...
  15. plandmarks = ps[:,5:13]
  16. plandmarks[:, 0:2] = plandmarks[:, 0:2] * anchors[i]
  17. plandmarks[:, 2:4] = plandmarks[:, 2:4] * anchors[i]
  18. plandmarks[:, 4:6] = plandmarks[:, 4:6] * anchors[i]
  19. plandmarks[:, 6:8] = plandmarks[:, 6:8] * anchors[i]
  20. # plandmarks[:, 8:10] = plandmarks[:,8:10] * anchors[i]
  21. lmark += landmarks_loss(plandmarks, tlandmarks[i], lmks_mask[i])


  1. def build_targets(p, targets, model):
  2. # Build targets for compute_loss(), input targets(image,class,x,y,w,h)
  3. det = model.module.model[-1] if is_parallel(model) else model.model[-1] # Detect() module
  4. na, nt = det.na, targets.shape[0] # number of anchors, targets
  5. tcls, tbox, indices, anch, landmarks, lmks_mask = [], [], [], [], [], []
  6. #gain = torch.ones(7, device=targets.device) # normalized to gridspace gain
  7. gain = torch.ones(15, device=targets.device)
  8. ai = torch.arange(na, device=targets.device).float().view(na, 1).repeat(1, nt) # same as .repeat_interleave(nt)
  9. targets = torch.cat((targets.repeat(na, 1, 1), ai[:, :, None]), 2) # append anchor indices
  10. g = 0.5 # bias
  11. off = torch.tensor([[0, 0],
  12. [1, 0], [0, 1], [-1, 0], [0, -1], # j,k,l,m
  13. # [1, 1], [1, -1], [-1, 1], [-1, -1], # jk,jm,lk,lm
  14. ], device=targets.device).float() * g # offsets
  15. for i in range(det.nl):
  16. anchors = det.anchors[i]
  17. gain[2:6] = torch.tensor(p[i].shape)[[3, 2, 3, 2]] # xyxy gain
  18. #landmarks 10
  19. gain[6:14] = torch.tensor(p[i].shape)[[3, 2, 3, 2, 3, 2, 3, 2]] # xyxy gain
  20. # Match targets to anchors
  21. t = targets * gain
  22. if nt:
  23. # Matches
  24. r = t[:, :, 4:6] / anchors[:, None] # wh ratio
  25. j = torch.max(r, 1. / r).max(2)[0] < model.hyp['anchor_t'] # compare
  26. # j = wh_iou(anchors, t[:, 4:6]) > model.hyp['iou_t'] # iou(3,n)=wh_iou(anchors(3,2), gwh(n,2))
  27. t = t[j] # filter
  28. # Offsets
  29. gxy = t[:, 2:4] # grid xy
  30. gxi = gain[[2, 3]] - gxy # inverse
  31. j, k = ((gxy % 1. < g) & (gxy > 1.)).T
  32. l, m = ((gxi % 1. < g) & (gxi > 1.)).T
  33. j = torch.stack((torch.ones_like(j), j, k, l, m))
  34. t = t.repeat((5, 1, 1))[j]
  35. offsets = (torch.zeros_like(gxy)[None] + off[:, None])[j]
  36. else:
  37. t = targets[0]
  38. offsets = 0
  39. # Define
  40. b, c = t[:, :2].long().T # image, class
  41. gxy = t[:, 2:4] # grid xy
  42. gwh = t[:, 4:6] # grid wh
  43. gij = (gxy - offsets).long()
  44. gi, gj = gij.T # grid xy indices
  45. # Append
  46. a = t[:, 14].long() # anchor indices
  47. indices.append((b, a, gj.clamp_(0, gain[3] - 1), gi.clamp_(0, gain[2] - 1))) # image, anchor, grid indices
  48. tbox.append(torch.cat((gxy - gij, gwh), 1)) # box
  49. anch.append(anchors[a]) # anchors
  50. tcls.append(c) # class
  51. #landmarks
  52. lks = t[:,6:14]
  53. #lks_mask = lks > 0
  54. #lks_mask = lks_mask.float()
  55. lks_mask = torch.where(lks < 0, torch.full_like(lks, 0.), torch.full_like(lks, 1.0))
  56. #应该是关键点的坐标除以anch的宽高才对,便于模型学习。使用gwh会导致不同关键点的编码不同,没有统一的参考标准
  57. lks[:, [0, 1]] = (lks[:, [0, 1]] - gij)
  58. lks[:, [2, 3]] = (lks[:, [2, 3]] - gij)
  59. lks[:, [4, 5]] = (lks[:, [4, 5]] - gij)
  60. lks[:, [6, 7]] = (lks[:, [6, 7]] - gij)
  61. # lks[:, [8, 9]] = (lks[:, [8, 9]] - gij)
  62. lks_mask_new = lks_mask
  63. lmks_mask.append(lks_mask_new)
  64. landmarks.append(lks)
  65. #print('lks: ', lks.size())
  66. return tcls, tbox, indices, anch, landmarks, lmks_mask


三 代码改写



首先找到yolo.py的 Detect类.我们不知道train.py中哪一步用到了Detect类,所以采取调试的方式查找,发现这行代码调用了Detect类(通过Model类)(92行左右)

  1. else:
  2. model = Model(opt.cfg, ch=3, nc=nc).to(device) # create 本行核心


  1. # Build strides, anchors
  2. m = self.model[-1] # Detect() 这里m读取的是yaml中的最后一层,但是还没有运行Detect类
  3. if isinstance(m, Detect):
  4. s = 128 # 2x min stride
  5. m.stride = torch.tensor([s / x.shape[-2] for x in self.forward(torch.zeros(1, ch, s, s))]) # forward
  6. m.anchors /= m.stride.view(-1, 1, 1)
  7. check_anchor_order(m)
  8. self.stride = m.stride
  9. self._initialize_biases() # only run once
  10. # print('Strides: %s' % m.stride.tolist())

上面这段代码第二行m = self.model[-1],是读取yaml,但还没有实际调用Detect.实际调用的是

m.stride = torch....这一行,这一行通过一次前向推理获得了每层的步长。

为了方便对比,我同时运行了原版yolov5 5.0代码,用的数据集是coco128,共80个类,同样在train.py的这一行打了断点。调试可发现,face中Detect层每个卷积的输出通道数为48(一类),原版代码为255(80类)

  1. Detect(
  2. (m): ModuleList(
  3. (0): Conv2d(128, 255, kernel_size=(1, 1), stride=(1, 1))
  4. (1): Conv2d(256, 255, kernel_size=(1, 1), stride=(1, 1))
  5. (2): Conv2d(512, 255, kernel_size=(1, 1), stride=(1, 1))
  6. )
  7. )


face中 48=3*(1+14+1) ,1对应置信度,14对应xywh和五个坐标点,1对应类别(这里的顺序和上面应该不同,推测进行了调序,具体原因下面会说)。因此需要首先将其更改为51=3*(4+1+8+4)(分别为xywh,置信度,坐标点,类别,因为我将类别数改为4了)(但是我们没有办法直接更改,只能先进行下面的步骤)


 m.stride = torch.tensor([s / x.shape[-2] for x in self.forward(torch.zeros(1, ch, s, s))])  # forward

可以看出,self.forward(torch.zeros(1, ch, s, s))是一个list,shape如下(源代码)


torch.zeros(1, ch, s, s)是传入的tensor,对这个tensor进行一次前向推理,输出的就是上面显示的。因此我们继续找到self.forward函数对其进行修改(147行)


  1. def forward(self, x, augment=False, profile=False, visualize=False):
  2. """前向推理"""
  3. if augment:
  4. return self.forward_augment(x) # augmented inference, None
  5. return self.forward_once(x, profile, visualize) # single-scale inference, train


  1. def forward_once(self, x, profile=False, visualize=False):
  2. """正常前向推理,一层一层执行"""
  3. y, dt = [], [] # outputs
  4. for m in self.model:
  5. if m.f != -1: # if not from previous layer
  6. x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f] # from earlier layers
  7. if profile:
  8. # 显示模型的信息,每一层网络信息,推理速度,参数量GFLOPs等
  9. o = thop.profile(m, inputs=(x,), verbose=False)[0] / 1E9 * 2 if thop else 0 # FLOPs
  10. t = time_synchronized()
  11. for _ in range(10):
  12. _ = m(x)
  13. dt.append((time_synchronized() - t) * 100)
  14. if m == self.model[0]:
  15. logger.info(f"{'time (ms)':>10s} {'GFLOPs':>10s} {'params':>10s} {'module'}")
  16. logger.info(f'{dt[-1]:10.2f} {o:10.2f} {m.np:10.0f} {m.type}')
  17. x = m(x) # run
  18. y.append(x if m.i in self.save else None) # save output
  19. if visualize:
  20. # 可视化每一层网络的预测
  21. feature_visualization(x, m.type, m.i, save_dir=visualize)
  22. if profile:
  23. logger.info('%.1fms total' % sum(dt))
  24. return x



  1. if m.f != -1:
  2. x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f]

在m.f != -1的条件下(不来自上一层)

如果isinstance(m.f, int)为真,则x=y[m.f];如果isinstance(m.f, int)为假,说明m.f是一个列表,此时x会变成一个列表:对m.f中的每个j,若j == -1,则取当前的x(上一层的输出),若j != -1,则取y[j]



 self.model, self.save = parse_model(deepcopy(self.yaml), ch=[ch])  # model, savelist


2.yolo.py detect模块改写


       self.no = nc + 5 + 8  # number of outputs per anchor



  1. y = torch.full_like(x[i], 0)
  2. class_range = list(range(5)) + list(range(13,13+self.nc))
  3. y[..., class_range] = x[i][..., class_range].sigmoid()
  4. y[..., 5:13] = x[i][..., 5:13]

                y[..., 5:13] = x[i][..., 5:13]

源代码y最后一维长度为16=(4+1+10+1) ,分别是xyxy,置信度,特征点,类别。新代码有4个类,应该为17=(4+1+8+4)


y[..., 13:15] = y[..., 13:15] * self.anchor_grid[i] + self.grid[i].to(x[i].device) * self.stride[i]# landmark x5 y5

3.loss.py 改写


  1. # Classification
  2. if model.nc > 1: # cls loss (only if multiple classes)
  3. t = torch.full_like(ps[:, 13:], cn, device=device) # targets
  4. t[range(n), tcls[i]] = cp
  5. lcls += BCEcls(ps[:, 13:], t) # BCE
  6. # Append targets to text file
  7. # with open('targets.txt', 'a') as file:
  8. # [file.write('%11.5g ' * 4 % tuple(x) + '\n') for x in torch.cat((txy[i], twh[i]), 1)]
  9. #landmarks loss
  10. #plandmarks = ps[:,5:15].sigmoid() * 8. - 4.
  11. plandmarks = ps[:,5:13]
  12. plandmarks[:, 0:2] = plandmarks[:, 0:2] * anchors[i] #缩放回原来的坐标
  13. plandmarks[:, 2:4] = plandmarks[:, 2:4] * anchors[i]
  14. plandmarks[:, 4:6] = plandmarks[:, 4:6] * anchors[i]
  15. plandmarks[:, 6:8] = plandmarks[:, 6:8] * anchors[i]
  16. # plandmarks[:, 8:10] = plandmarks[:,8:10] * anchors[i]


 gain = torch.ones(15, device=targets.device)


  1. for i in range(det.nl):
  2. anchors = det.anchors[i]
  3. gain[2:6] = torch.tensor(p[i].shape)[[3, 2, 3, 2]] # xyxy gain
  4. #landmarks 10
  5. gain[6:14] = torch.tensor(p[i].shape)[[3, 2, 3, 2, 3, 2, 3, 2]] # xyxy gain

247行,将a = t[:, 16]的16改为14

  1. a = t[:, 14].long() # anchor indices
  2. indices.append((b, a, gj.clamp_(0, gain[3] - 1), gi.clamp_(0, gain[2] - 1))) # image, anchor, grid indices
  3. tbox.append(torch.cat((gxy - gij, gwh), 1)) # box
  4. anch.append(anchors[a]) # anchors
  5. tcls.append(c) # class

254行,将lks = t[:,6:16]的16改为14

  1. #landmarks
  2. lks = t[:,6:14]
  3. #lks_mask = lks > 0
  4. #lks_mask = lks_mask.float()
  5. lks_mask = torch.where(lks < 0, torch.full_like(lks, 0.), torch.full_like(lks, 1.0))

261行,将下面的lks[:, [8, 9]] = (lks[:, [8, 9]] - gij)注释

  1. lks[:, [0, 1]] = (lks[:, [0, 1]] - gij)
  2. lks[:, [2, 3]] = (lks[:, [2, 3]] - gij)
  3. lks[:, [4, 5]] = (lks[:, [4, 5]] - gij)
  4. lks[:, [6, 7]] = (lks[:, [6, 7]] - gij)
  5. # lks[:, [8, 9]] = (lks[:, [8, 9]] - gij)


t = targets * gain

报错:RuntimeError: The size of tensor a (17) must match the size of tensor b (15) at non-singleton dimension 2


 targets = torch.cat((targets.repeat(na, 1, 1), ai[:, :, None]), 2)  # append anchor indices




loss, loss_items = compute_loss(pred, targets.to(device), model)  # loss scaled by batch_size


  1. pbar = enumerate(dataloader)
  2. logger.info(('\n' + '%10s' * 9) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'landmark', 'total', 'targets', 'img_size'))
  3. if rank in [-1, 0]:
  4. pbar = tqdm(pbar, total=nb) # progress bar
  5. optimizer.zero_grad()
  6. for i, (imgs, targets, paths, _) in pbar: # batch -------------------------------------------------------------
  7. ni = i + nb * epoch # number integrated batches (since train start)
  8. imgs = imgs.to(device, non_blocking=True).float() / 255.0 # uint8 to float32, 0-255 to 0.0-1.0


  1. dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt,
  2. hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, rank=rank,
  3. world_size=opt.world_size, workers=opt.workers,
  4. image_weights=opt.image_weights)



        之前我在修改的时候,因为一些意外,忘记修改widerface.yaml,所以可能有些地方没有报错。在对其修改之后,在_methods.py39行出现报错:ValueError: zero-size array to reduction operation maximum which has no identity。


  1. with torch_distributed_zero_first(rank):
  2. dataset = LoadFaceImagesAndLabels(path, imgsz, batch_size,
  3. augment=augment, # augment images
  4. hyp=hyp, # augmentation hyperparameters
  5. rect=rect, # rectangular training
  6. cache_images=cache,
  7. single_cls=opt.single_cls,
  8. stride=int(stride),
  9. pad=pad,
  10. image_weights=image_weights,
  11. )


  1. dataloader = loader(dataset,
  2. batch_size=batch_size,
  3. num_workers=nw,
  4. sampler=sampler,
  5. pin_memory=True,
  6. collate_fn=LoadFaceImagesAndLabels.collate_fn4 if quad else LoadFaceImagesAndLabels.collate_fn)



labels, shapes = zip(*cache.values())

也就是由cache引入的。正常情况下,cache这个dict中应该包含所有读取到的图片及其标签,但这里的cache中全是没有对应txt文件的图片,说明cache的生成出现了问题



  1. else:
  2. cache = self.cache_labels(cache_path) # cache


  1. def cache_labels(self, path=Path('./labels.cache')):
  2. # Cache dataset labels, check images and read shapes
  3. x = {} # dict
  4. nm, nf, ne, nc = 0, 0, 0, 0 # number missing, found, empty, duplicate
  5. pbar = tqdm(zip(self.img_files, self.label_files), desc='Scanning images', total=len(self.img_files))
  6. for i, (im_file, lb_file) in enumerate(pbar):
  7. try:
  8. # verify images
  9. im = Image.open(im_file)
  10. im.verify() # PIL verify
  11. shape = exif_size(im) # image size
  12. assert (shape[0] > 9) & (shape[1] > 9), 'image size <10 pixels'
  13. # verify labels
  14. if os.path.isfile(lb_file):
  15. nf += 1 # label found
  16. with open(lb_file, 'r') as f:
  17. l = np.array([x.split() for x in f.read().strip().splitlines()], dtype=np.float32) # labels
  18. if len(l):
  19. assert l.shape[1] == 15, 'labels require 15 columns each'
  20. assert (l >= -1).all(), 'negative labels'
  21. assert (l[:, 1:] <= 1).all(), 'non-normalized or out of bounds coordinate labels'
  22. assert np.unique(l, axis=0).shape[0] == l.shape[0], 'duplicate labels'
  23. else:
  24. ne += 1 # label empty
  25. l = np.zeros((0, 15), dtype=np.float32)
  26. else:
  27. nm += 1 # label missing
  28. l = np.zeros((0, 15), dtype=np.float32)
  29. x[im_file] = [l, shape]
  30. except Exception as e:
  31. nc += 1
  32. print('WARNING: Ignoring corrupted image and/or label %s: %s' % (im_file, e))
  33. pbar.desc = f"Scanning '{path.parent / path.stem}' for images and labels... " \
  34. f"{nf} found, {nm} missing, {ne} empty, {nc} corrupted"
  35. if nf == 0:
  36. print(f'WARNING: No labels found in {path}. See {help_url}')
  37. x['hash'] = get_hash(self.label_files + self.img_files)
  38. x['results'] = [nf, nm, ne, nc, i + 1]
  39. torch.save(x, path) # save for next time
  40. logging.info(f"New cache created: {path}")
  41. return x


assert l.shape[1] == 13, 'labels require 13 columns each'


  1. else:
  2. ne += 1 # label empty
  3. l = np.zeros((0, 13), dtype=np.float32)
  4. else:
  5. nm += 1 # label missing
  6. l = np.zeros((0, 13), dtype=np.float32)
  7. x[im_file] = [l, shape]


        继续运行,在train.py第193行报错:AssertionError: Label class 5 exceeds nc=1 in data/widerface.yaml. Possible class labels are 0-0

assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (mlc, nc, opt.data, nc - 1)




  1. # labels[:, 13] = np.array(x[:, 13] > 0, dtype=np.int32) * (w * x[:, 13] + padw) + (np.array(x[:, 13] > 0, dtype=np.int32) - 1)
  2. # labels[:, 14] = np.array(x[:, 14] > 0, dtype=np.int32) * (h * x[:, 14] + padh) + (np.array(x[:, 14] > 0, dtype=np.int32) - 1)



  1. if len(labels4):
  2. labels4 = np.concatenate(labels4, 0)
  3. np.clip(labels4[:, 1:5], 0, 2 * s, out=labels4[:, 1:5]) # use with random_perspective
  4. # img4, labels4 = replicate(img4, labels4) # replicate
  5. #landmarks
  6. labels4[:, 5:] = np.where(labels4[:, 5:] < 0, -1, labels4[:, 5:])
  7. labels4[:, 5:] = np.where(labels4[:, 5:] > 2 * s, -1, labels4[:, 5:])
  8. labels4[:, 5] = np.where(labels4[:, 6] == -1, -1, labels4[:, 5])
  9. labels4[:, 6] = np.where(labels4[:, 5] == -1, -1, labels4[:, 6])
  10. labels4[:, 7] = np.where(labels4[:, 8] == -1, -1, labels4[:, 7])
  11. labels4[:, 8] = np.where(labels4[:, 7] == -1, -1, labels4[:, 8])
  12. labels4[:, 9] = np.where(labels4[:, 10] == -1, -1, labels4[:, 9])
  13. labels4[:, 10] = np.where(labels4[:, 9] == -1, -1, labels4[:, 10])
  14. labels4[:, 11] = np.where(labels4[:, 12] == -1, -1, labels4[:, 11])
  15. labels4[:, 12] = np.where(labels4[:, 11] == -1, -1, labels4[:, 12])
  16. # labels4[:, 13] = np.where(labels4[:, 14] == -1, -1, labels4[:, 13])
  17. # labels4[:, 14] = np.where(labels4[:, 13] == -1, -1, labels4[:, 14])

(解释:np.where(condition, x, y)函数用法:满足条件(condition)的位置输出x,不满足的位置输出y。)



  1. if n:
  2. # warp points
  3. #xy = np.ones((n * 4, 3))
  4. xy = np.ones((n * 8, 3))
  5. xy[:, :2] = targets[:, [1, 2, 3, 4, 1, 4, 3, 2, 5, 6, 7, 8, 9, 10, 11, 12]].reshape(n * 8, 2) # x1y1, x2y2, x1y2, x2y1
  6. xy = xy @ M.T # transform
  7. if perspective:
  8. xy = (xy[:, :2] / xy[:, 2:3]).reshape(n, 16) # rescale
  9. else: # affine
  10. xy = xy[:, :2].reshape(n, 16)


  1. landmarks = xy[:, [8, 9, 10, 11, 12, 13, 14, 15]]
  2. mask = np.array(targets[:, 5:] > 0, dtype=np.int32)
  3. landmarks = landmarks * mask
  4. landmarks = landmarks + mask - 1
  5. landmarks = np.where(landmarks < 0, -1, landmarks)
  6. landmarks[:, [0, 2, 4, 6]] = np.where(landmarks[:, [0, 2, 4, 6]] > width, -1, landmarks[:, [0, 2, 4, 6]])
  7. landmarks[:, [1, 3, 5, 7]] = np.where(landmarks[:, [1, 3, 5, 7]] > height, -1,landmarks[:, [1, 3, 5, 7]])
  8. landmarks[:, 0] = np.where(landmarks[:, 1] == -1, -1, landmarks[:, 0])
  9. landmarks[:, 1] = np.where(landmarks[:, 0] == -1, -1, landmarks[:, 1])
  10. landmarks[:, 2] = np.where(landmarks[:, 3] == -1, -1, landmarks[:, 2])
  11. landmarks[:, 3] = np.where(landmarks[:, 2] == -1, -1, landmarks[:, 3])
  12. landmarks[:, 4] = np.where(landmarks[:, 5] == -1, -1, landmarks[:, 4])
  13. landmarks[:, 5] = np.where(landmarks[:, 4] == -1, -1, landmarks[:, 5])
  14. landmarks[:, 6] = np.where(landmarks[:, 7] == -1, -1, landmarks[:, 6])
  15. landmarks[:, 7] = np.where(landmarks[:, 6] == -1, -1, landmarks[:, 7])
  16. # landmarks[:, 8] = np.where(landmarks[:, 9] == -1, -1, landmarks[:, 8])
  17. # landmarks[:, 9] = np.where(landmarks[:, 8] == -1, -1, landmarks[:, 9])
  18. targets[:,5:] = landmarks
  19. xy = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T



  1. if nL:
  2. labels[:, 1:5] = xyxy2xywh(labels[:, 1:5]) # convert xyxy to xywh
  3. labels[:, [2, 4]] /= img.shape[0] # normalized height 0-1
  4. labels[:, [1, 3]] /= img.shape[1] # normalized width 0-1
  5. labels[:, [5, 7, 9, 11]] /= img.shape[1] # normalized landmark x 0-1
  6. labels[:, [5, 7, 9, 11]] = np.where(labels[:, [5, 7, 9, 11]] < 0, -1, labels[:, [5, 7, 9, 11]])
  7. labels[:, [6, 8, 10, 12]] /= img.shape[0] # normalized landmark y 0-1
  8. labels[:, [6, 8, 10, 12]] = np.where(labels[:, [6, 8, 10, 12]] < 0, -1, labels[:, [6, 8, 10, 12]])
  9. if self.augment:
  10. # flip up-down
  11. if random.random() < hyp['flipud']:
  12. img = np.flipud(img)
  13. if nL:
  14. labels[:, 2] = 1 - labels[:, 2]
  15. labels[:, 6] = np.where(labels[:,6] < 0, -1, 1 - labels[:, 6])
  16. labels[:, 8] = np.where(labels[:, 8] < 0, -1, 1 - labels[:, 8])
  17. labels[:, 10] = np.where(labels[:, 10] < 0, -1, 1 - labels[:, 10])
  18. labels[:, 12] = np.where(labels[:, 12] < 0, -1, 1 - labels[:, 12])
  19. # labels[:, 14] = np.where(labels[:, 14] < 0, -1, 1 - labels[:, 14])
  20. # flip left-right
  21. if random.random() < hyp['fliplr']:
  22. img = np.fliplr(img)
  23. if nL:
  24. labels[:, 1] = 1 - labels[:, 1]
  25. labels[:, 5] = np.where(labels[:, 5] < 0, -1, 1 - labels[:, 5])
  26. labels[:, 7] = np.where(labels[:, 7] < 0, -1, 1 - labels[:, 7])
  27. labels[:, 9] = np.where(labels[:, 9] < 0, -1, 1 - labels[:, 9])
  28. labels[:, 11] = np.where(labels[:, 11] < 0, -1, 1 - labels[:, 11])
  29. # labels[:, 13] = np.where(labels[:, 13] < 0, -1, 1 - labels[:, 13])
  30. # #左右镜像的时候,左眼、右眼, 左嘴角、右嘴角无法区分, 应该交换位置,便于网络学习
  31. # eye_left = np.copy(labels[:, [5, 6]])
  32. # mouth_left = np.copy(labels[:, [11, 12]])
  33. # labels[:, [5, 6]] = labels[:, [7, 8]]
  34. # labels[:, [7, 8]] = eye_left
  35. # labels[:, [11, 12]] = labels[:, [13, 14]]
  36. # labels[:, [13, 14]] = mouth_left
  37. labels_out = torch.zeros((nL, 14))


  1. # labels[:, 13] = np.array(x[:, 13] > 0, dtype=np.int32) * (ratio[0] * w * x[:, 13] + pad[0]) + (
  2. # np.array(x[:, 13] > 0, dtype=np.int32) - 1)
  3. # labels[:, 14] = np.array(x[:, 14] > 0, dtype=np.int32) * (ratio[1] * h * x[:, 14] + pad[1]) + (
  4. # np.array(x[:, 14] > 0, dtype=np.int32) - 1)



上述修改完成之后,程序可以正常训练,一切显得如此美好,但是在训练结束的时候,程序突然报错(AssertionError: No results.txt files found in /home/luoxinhao/桌面/yolov5/新版yolov5-face-master/runs/train/exp114, nothing to plot.),主要原因是train.py 400行

plot_results(save_dir=save_dir)  # save as results.png



        if rank in [-1, 0] and epoch > 20:  #

只有在满足如上条件时,才会执行保存结果和模型的代码,但我只训练了十轮,所以默认不会保存。于是我将and epoch > 20 部分注释掉,程序开始执行保存模型的部分,代码段如下:

  1. if ema:
  2. ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride', 'class_weights'])
  3. final_epoch = epoch + 1 == epochs
  4. if not opt.notest or final_epoch: # Calculate mAP
  5. # results, maps, times = test.test(opt.data,
  6. # batch_size=total_batch_size,
  7. # imgsz=imgsz_test,
  8. # model=ema.ema,
  9. # single_cls=opt.single_cls,
  10. # dataloader=testloader,
  11. # save_dir=save_dir,
  12. # plots=False,
  13. # log_imgs=opt.log_imgs if wandb else 0)
  14. print("hello")
  15. # Write
  16. with open(results_file, 'a') as f:
  17. f.write(s + '%10.4g' * 7 % results + '\n') # P, R, mAP@.5, mAP@.5-.95, val_loss(box, obj, cls)
  18. if len(opt.name) and opt.bucket:
  19. os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name))
  20. # Log
  21. tags = ['train/box_loss', 'train/obj_loss', 'train/cls_loss', # train loss
  22. 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95',
  23. 'val/box_loss', 'val/obj_loss', 'val/cls_loss', # val loss
  24. 'x/lr0', 'x/lr1', 'x/lr2'] # params
  25. for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags):
  26. if tb_writer:
  27. tb_writer.add_scalar(tag, x, epoch) # tensorboard
  28. if wandb:
  29. wandb.log({tag: x}) # W&B
  30. # Update best mAP
  31. fi = fitness(np.array(results).reshape(1, -1)) # weighted combination of [P, R, mAP@.5, mAP@.5-.95]
  32. if fi > best_fitness:
  33. best_fitness = fi
  34. # Save model
  35. save = (not opt.nosave) or (final_epoch and not opt.evolve)
  36. if save:
  37. with open(results_file, 'r') as f: # create checkpoint
  38. ckpt = {'epoch': epoch,
  39. 'best_fitness': best_fitness,
  40. 'training_results': f.read(),
  41. 'model': ema.ema,
  42. 'optimizer': None if final_epoch else optimizer.state_dict(),
  43. 'wandb_id': wandb_run.id if wandb else None}
  44. # Save last, best and delete
  45. torch.save(ckpt, last)
  46. if best_fitness == fi:
  47. torch.save(ckpt, best)
  48. del ckpt
  49. # end epoch ----------------------------------------------------------------------------------------------------
  50. # end training

        于是,新的问题出现了:程序在第337行(上述代码test.test行)报错:RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.




385行,nc = prediction.shape[2] - 15中的15改为13

396行,output = [torch.zeros((0, 16), device=prediction.device)] * prediction.shape[0]中的16改为17(但是这句话好像并没有什么作用,就算改成100也可以)

405行,v = torch.zeros((len(l), nc + 15), device=x.device)中的15改为13

408行,v[range(len(l)), l[:, 0].long() + 15] = 1.0 # cls中的15改为13

416行,x[:, 15:] *= x[:, 4:5] # conf = obj_conf * cls_conf中15改为13

这里说明一下,x的构成为(xyxy, conf, landmarks, cls),所以源代码分别是(4+1+10+1=16),而改后的代码为(4+1+8+4)=17

在这里的416行,x将他的前景置信度乘入了类别置信度,因此x[:, 13:]从原来的物体对每个类别置信度变成了对每个类别的总置信度

  1. if multi_label:
  2. i, j = (x[:, 15:] > conf_thres).nonzero(as_tuple=False).T
  3. x = torch.cat((box[i], x[i, j + 15, None], x[:, 5:15] ,j[:, None].float()), 1) #4,1,8,1,
  4. else: # best class only
  5. conf, j = x[:, 15:].max(1, keepdim=True)
  6. x = torch.cat((box, conf, x[:, 5:15], j.float()), 1)[conf.view(-1) > conf_thres]

        上段代码第二行,(x[:, 15:] > conf_thres)得到的是一个元素均为True或False的布尔矩阵,nonzero返回其中非零(即True)元素的索引,也就是得到了那些类别置信度大于阈值的预测在矩阵中的位置索引。i代表行索引,即第几个候选框满足条件;j代表列索引,即该候选框对应第几个类别。

        第三行,是把组成x的各部分拼接起来,分别是xyxy, conf, landmarks, cls。注意最后一个1不是用来补位的,而是torch.cat的语法,意为按列拼接。

        box[i]代表第i组数据(候选框)的box信息,x[i, j + 15, None]代表了其在对应类的置信度(但是我们只用通过j的取值来得知这是哪个类)。x[:, 5:15]代表五个点,后面的j[:, None]代表他是第几个类。

        但是,可能是原作者疏忽(因为yoloface并没有用到multi_label),这段代码执行会报错。经过调试后发现,xyxy, conf, cls三个量的横坐标都是置信度较大,被选中的框,即i中存放的,而 landmarks这一项却是全部的框。因此要将x[:, 5:15]中的第一个冒号改为i。考虑到其他更改(如15变为13),这段代码更改后如下

  1. if multi_label:
  2. i, j = (x[:, 13:] > conf_thres).nonzero(as_tuple=False).T
  3. x = torch.cat((box[i], x[i, j + 13, None], x[i, 5:13] ,j[:, None].float()), 1) #4,1,8,1, (4+1+10+1
  4. else: # best class only
  5. conf, j = x[:, 13:].max(1, keepdim=True)
  6. x = torch.cat((box, conf, x[:, 5:13], j.float()), 1)[conf.view(-1) > conf_thres]



继续在439行,将c = x[:, 15:16] * (0 if agnostic else max_wh) # classes中的15:16改为13:14。


  1. for si, pred in enumerate(output):
  2. pred = torch.cat((pred[:, :5], pred[:, 13:]), 1) # throw landmark in thresh
  3. labels = targets[targets[:, 0] == si, 1:]
  4. nl = len(labels)
  5. tcls = labels[:, 0].tolist() if nl else [] # target class
  6. path = Path(paths[si])
  7. seen += 1





 UserWarning: Named tensors and all their associated APIs are an experimental feature and subject to change. Please do not use them for anything important until they are released as stable. (Triggered internally at  /opt/conda/conda-bld/pytorch_1623448278899/work/c10/core/TensorImpl.h:1156.)
  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)



  1. # coords[:, 8].clamp_(0, img0_shape[1]) # x5
  2. # coords[:, 9].clamp_(0, img0_shape[0]) # y5


  1. clors = [(255,0,0),(0,255,0),(0,0,255),(255,255,0)]
  2. for i in range(4):
  3. point_x = int(landmarks[2 * i] * w)
  4. point_y = int(landmarks[2 * i + 1] * h)
  5. cv2.circle(img, (point_x, point_y), tl+1, clors[i], -1)


        gn_lks = torch.tensor(orgimg.shape)[[1, 0, 1, 0, 1, 0, 1, 0, ]].to(device)  # normalization gain landmarks


            det[:, 5:13] = scale_coords_landmarks(img.shape[2:], det[:, 5:13], orgimg.shape).round()


                landmarks = (det[j, 5:13].view(1, 8) / gn_lks).view(-1).tolist()


                class_num = det[j, 13].cpu().numpy()






RuntimeError: The size of tensor a (51) must match the size of tensor b (279) at non-singleton dimension 0



  1. pg0, pg1, pg2 = [], [], [] # optimizer parameter groups
  2. for k, v in model.named_modules():
  3. if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter): #hasattr:判断物体是否包含相应属性
  4. pg2.append(v.bias) # biases
  5. if isinstance(v, nn.BatchNorm2d):
  6. pg0.append(v.weight) # no decay
  7. elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter):
  8. pg1.append(v.weight) # apply decay


        optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)


    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)  #函数功能:更新学习率


  1. if pretrained:
  2. # Optimizer
  3. if ckpt['optimizer'] is not None:
  4. optimizer.load_state_dict(ckpt['optimizer'])
  5. best_fitness = ckpt['best_fitness']
  6. # Results
  7. if ckpt.get('training_results') is not None:
  8. with open(results_file, 'w') as file:
  9. file.write(ckpt['training_results']) # write results.txt
  10. # Epochs
  11. #start_epoch = ckpt['epoch'] + 1
  12. if opt.resume:
  13. assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % (weights, epochs)
  14. if epochs < start_epoch:
  15. logger.info('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' %
  16. (weights, ckpt['epoch'], epochs))
  17. epochs += ckpt['epoch'] # finetune additional epochs
  18. del ckpt, state_dict











更新: yoloface链接



