参与人员:Kaipeng Zhang, Zhanpeng Zhang, Zhifeng Li, Senior Member, IEEE, and Yu Qiao, Senior Member, IEEE
遮挡。最近的研究表明,深度学习方法可以在这两项任务上取得令人印象深刻的表现。本文提出了一种深度级联多任务框架,利用检测和对齐之间的内在相关性来提高二者的性能。具体而言,我们的框架采用三阶段的级联架构,通过精心设计的深度卷积网络,由粗到细地预测人脸和关键点(landmark)的位置。此外,我们提出了一种新的在线困难样本挖掘(online hard sample mining)策略,可在实践中进一步提升性能。我们的方法在极具挑战性的人脸检测基准FDDB和WIDER FACE以及人脸对齐基准AFLW上取得了优于现有最先进方法的准确率,同时保持实时性能。
def adjust_input(in_data):
    """
        Adjust the input from (h, w, c) to (1, c, h, w) for network input
        and normalise the pixel values.

    Parameters:
    ----------
        in_data: numpy array of shape (h, w, c)
            input data
    Returns:
    -------
        out_data: numpy array of shape (1, c, h, w)
            float32 array holding (in_data - 127.5) * 0.0078125
    """
    # Compare dtypes by value. An identity test (`is not`) only works because
    # numpy happens to reuse one object per builtin dtype.
    if in_data.dtype != np.dtype('float32'):
        out_data = in_data.astype(np.float32)
    else:
        out_data = in_data

    # (h, w, c) -> (c, h, w) -> (1, c, h, w)
    out_data = out_data.transpose((2, 0, 1))
    out_data = np.expand_dims(out_data, 0)
    # 0.0078125 == 1 / 128; the arithmetic allocates a new array, so the
    # caller's data is never modified in place.
    out_data = (out_data - 127.5) * 0.0078125
    return out_data


def generate_bbox(map, reg, scale, threshold):
    """
        Generate bboxes from a score map and its regression output.

    Parameters:
    ----------
        map: numpy array, n x m
            detection score for each position
            (NOTE: the name shadows the builtin `map`; it is kept because it
            is part of the public signature)
        reg: numpy array, 1 x 4 x n x m
            bbox regression values, indexed as reg[0, k, row, col]
        scale: float number
            scale of this detection
        threshold: float number
            detection threshold; positions scoring above it are kept
    Returns:
    -------
        numpy array, k x 9
            one row per kept position:
            x1, y1, x2, y2, score, dx1, dy1, dx2, dy2
            (an empty 1-D array when no position passes the threshold)
    """
    stride = 2
    cellsize = 12

    t_index = np.where(map > threshold)
    rows, cols = t_index[0], t_index[1]

    # find nothing
    if rows.size == 0:
        return np.array([])

    # 4 x k array with the regression values of the kept positions
    offsets = np.array([reg[0, i, rows, cols] for i in range(4)])
    score = map[rows, cols]

    # Map each kept cell back to the un-scaled image: the cell origin is
    # (stride * index + 1) and the cell spans `cellsize` pixels.
    boundingbox = np.vstack([np.round((stride * cols + 1) / scale),
                             np.round((stride * rows + 1) / scale),
                             np.round((stride * cols + 1 + cellsize) / scale),
                             np.round((stride * rows + 1 + cellsize) / scale),
                             score,
                             offsets])

    return boundingbox.T


def detect_first_stage(img, net, scale, threshold):
    """
        Run PNet for the first stage on one scale of the image.

    Parameters:
    ----------
        img: numpy array, bgr order
            input image
        net: PNet worker
            object exposing predict(input_buf)
        scale: float number
            how much should the input image scale
        threshold: float number
            detection threshold forwarded to generate_bbox
    Returns:
    -------
        total_boxes : bboxes
            rows produced by generate_bbox that survive nms,
            or None when nothing is detected
    """
    height, width, _ = img.shape
    hs = int(math.ceil(height * scale))
    ws = int(math.ceil(width * scale))

    # cv2.resize takes the target size as (width, height)
    im_data = cv2.resize(img, (ws, hs))

    # adjust for the network input
    input_buf = adjust_input(im_data)
    output = net.predict(input_buf)
    # output[1][0, 1] is used as the score map, output[0] as the regression
    boxes = generate_bbox(output[1][0, 1, :, :], output[0], scale, threshold)

    if boxes.size == 0:
        return None

    # nms on the (x1, y1, x2, y2, score) columns
    pick = nms(boxes[:, 0:5], 0.5, mode='Union')
    boxes = boxes[pick]
    return boxes


def detect_first_stage_warpper(args):
    """
        Call detect_first_stage with a single tuple of arguments, so that it
        can be used with map-style APIs that pass one item per call.
    """
    return detect_first_stage(*args)
def nms(boxes, overlap_threshold, mode='Union'):
    """
        Non max suppression.

    Parameters:
    ----------
        boxes: numpy array (or nested sequence), n x 5
            input bbox array, one row per box: x1, y1, x2, y2, score
        overlap_threshold: float number
            boxes overlapping an already picked box by more than this
            ratio are discarded
        mode: string
            how to compute the overlap ratio:
            'Min' divides the intersection by the smaller of the two areas,
            anything else ('Union') divides it by the area of the union
    Returns:
    -------
        list with the indexes of the selected bboxes, highest score first
    """
    boxes = np.asarray(boxes)

    # if there are no boxes, return an empty list
    if len(boxes) == 0:
        return []

    # Integer, unsigned and boolean inputs are converted to float: the area
    # arithmetic below would otherwise wrap around for narrow integer types
    # (e.g. a 16 x 16 uint8 box has area 256 -> 0).
    if boxes.dtype.kind in "iub":
        boxes = boxes.astype("float")

    # initialize the list of picked indexes
    pick = []

    # grab the coordinates of the bounding boxes
    x1, y1, x2, y2, score = [boxes[:, i] for i in range(5)]

    # coordinates are inclusive, hence the +1
    area = (x2 - x1 + 1) * (y2 - y1 + 1)
    # ascending by score: the best remaining box is always the last entry
    idxs = np.argsort(score)

    # keep looping while some indexes still remain in the indexes list
    while len(idxs) > 0:
        # pick the best remaining box, compare it with all the others
        last = len(idxs) - 1
        i = idxs[last]
        pick.append(i)
        rest = idxs[:last]

        xx1 = np.maximum(x1[i], x1[rest])
        yy1 = np.maximum(y1[i], y1[rest])
        xx2 = np.minimum(x2[i], x2[rest])
        yy2 = np.minimum(y2[i], y2[rest])

        # width and height of the intersection (0 when the boxes are disjoint)
        w = np.maximum(0, xx2 - xx1 + 1)
        h = np.maximum(0, yy2 - yy1 + 1)

        inter = w * h
        if mode == 'Min':
            overlap = inter / np.minimum(area[i], area[rest])
        else:
            overlap = inter / (area[i] + area[rest] - inter)

        # drop the picked box and every box that overlaps it too much
        idxs = rest[~(overlap > overlap_threshold)]

    return pick
############################################# # first stage ############################################# #for scale in scales: # return_boxes = self.detect_first_stage(img, scale, 0) # if return_boxes is not None: # total_boxes.append(return_boxes) sliced_index = self.slice_index(len(scales)) total_boxes = [] for batch in sliced_index: local_boxes = self.Pool.map( detect_first_stage_warpper, \ zip(repeat(img), self.PNets[:len(batch)], [scales[i] for i in batch], repeat(self.threshold[0])) ) total_boxes.extend(local_boxes) # remove the Nones total_boxes = [ i for i in total_boxes if i is not None] if len(total_boxes) == 0: return None total_boxes = np.vstack(total_boxes) if total_boxes.size == 0: return None # merge the detection from first stage pick = nms(total_boxes[:, 0:5], 0.7, 'Union') total_boxes = total_boxes[pick] bbw = total_boxes[:, 2] - total_boxes[:, 0] + 1 bbh = total_boxes[:, 3] - total_boxes[:, 1] + 1 # refine the bboxes total_boxes = np.vstack([total_boxes[:, 0]+total_boxes[:, 5] * bbw, total_boxes[:, 1]+total_boxes[:, 6] * bbh, total_boxes[:, 2]+total_boxes[:, 7] * bbw, total_boxes[:, 3]+total_boxes[:, 8] * bbh, total_boxes[:, 4] ]) total_boxes = total_boxes.T total_boxes = self.convert_to_square(total_boxes) total_boxes[:, 0:4] = np.round(total_boxes[:, 0:4])
############################################# # second stage ############################################# num_box = total_boxes.shape[0] # pad the bbox [dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph] = self.pad(total_boxes, width, height) # (3, 24, 24) is the input shape for RNet input_buf = np.zeros((num_box, 3, 24, 24), dtype=np.float32) for i in range(num_box): tmp = np.zeros((tmph[i], tmpw[i], 3), dtype=np.uint8) tmp[dy[i]:edy[i]+1, dx[i]:edx[i]+1, :] = img[y[i]:ey[i]+1, x[i]:ex[i]+1, :] input_buf[i, :, :, :] = adjust_input(cv2.resize(tmp, (24, 24))) output = self.RNet.predict(input_buf) # filter the total_boxes with threshold passed = np.where(output[1][:, 1] > self.threshold[1]) total_boxes = total_boxes[passed] if total_boxes.size == 0: return None total_boxes[:, 4] = output[1][passed, 1].reshape((-1,)) reg = output[0][passed] # nms pick = nms(total_boxes, 0.7, 'Union') total_boxes = total_boxes[pick] total_boxes = self.calibrate_box(total_boxes, reg[pick]) total_boxes = self.convert_to_square(total_boxes) total_boxes[:, 0:4] = np.round(total_boxes[:, 0:4])
############################################# # third stage ############################################# num_box = total_boxes.shape[0] # pad the bbox [dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph] = self.pad(total_boxes, width, height) # (3, 48, 48) is the input shape for ONet input_buf = np.zeros((num_box, 3, 48, 48), dtype=np.float32) for i in range(num_box): tmp = np.zeros((tmph[i], tmpw[i], 3), dtype=np.float32) tmp[dy[i]:edy[i]+1, dx[i]:edx[i]+1, :] = img[y[i]:ey[i]+1, x[i]:ex[i]+1, :] input_buf[i, :, :, :] = adjust_input(cv2.resize(tmp, (48, 48))) output = self.ONet.predict(input_buf) # filter the total_boxes with threshold passed = np.where(output[2][:, 1] > self.threshold[2]) total_boxes = total_boxes[passed] if total_boxes.size == 0: return None total_boxes[:, 4] = output[2][passed, 1].reshape((-1,)) reg = output[1][passed] points = output[0][passed] # compute landmark points bbw = total_boxes[:, 2] - total_boxes[:, 0] + 1 bbh = total_boxes[:, 3] - total_boxes[:, 1] + 1 points[:, 0:5] = np.expand_dims(total_boxes[:, 0], 1) + np.expand_dims(bbw, 1) * points[:, 0:5] points[:, 5:10] = np.expand_dims(total_boxes[:, 1], 1) + np.expand_dims(bbh, 1) * points[:, 5:10] # nms total_boxes = self.calibrate_box(total_boxes, reg) pick = nms(total_boxes, 0.7, 'Min') total_boxes = total_boxes[pick] points = points[pick] if not self.accurate_landmark: return total_boxes, points
############################################# # extended stage ############################################# num_box = total_boxes.shape[0] patchw = np.maximum(total_boxes[:, 2]-total_boxes[:, 0]+1, total_boxes[:, 3]-total_boxes[:, 1]+1) patchw = np.round(patchw*0.25) # make it even patchw[np.where(np.mod(patchw,2) == 1)] += 1 input_buf = np.zeros((num_box, 15, 24, 24), dtype=np.float32) for i in range(5): x, y = points[:, i], points[:, i+5] x, y = np.round(x-0.5*patchw), np.round(y-0.5*patchw) [dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph] = self.pad(np.vstack([x, y, x+patchw-1, y+patchw-1]).T, width, height) for j in range(num_box): tmpim = np.zeros((tmpw[j], tmpw[j], 3), dtype=np.float32) tmpim[dy[j]:edy[j]+1, dx[j]:edx[j]+1, :] = img[y[j]:ey[j]+1, x[j]:ex[j]+1, :] input_buf[j, i*3:i*3+3, :, :] = adjust_input(cv2.resize(tmpim, (24, 24))) output = self.LNet.predict(input_buf) pointx = np.zeros((num_box, 5)) pointy = np.zeros((num_box, 5)) for k in range(5): # do not make a large movement tmp_index = np.where(np.abs(output[k]-0.5) > 0.35) output[k][tmp_index[0]] = 0.5 pointx[:, k] = np.round(points[:, k] - 0.5*patchw) + output[k][:, 0]*patchw pointy[:, k] = np.round(points[:, k+5] - 0.5*patchw) + output[k][:, 1]*patchw points = np.hstack([pointx, pointy]) points = points.astype(np.int32) return total_boxes, points
4,多源训练:由于我们在每个CNN中执行不同的任务,学习过程中存在不同类型的训练图像,如人脸、非人脸和部分对齐的人脸。在这种情况下,部分损失函数(即Eq. (1)-(3))不会被使用。例如,对于背景区域的样本,我们只计算检测损失 $L_i^{det}$,另外两项损失设为0。这可以通过样本类型指示器直接实现。
# coding: utf-8
import mxnet as mx
from mtcnn_detector import MtcnnDetector
import cv2
import os
import time

if __name__ == '__main__':
    # Demo: detect faces in one image, save/show the aligned face chips and
    # show the image with boxes and landmarks drawn on it.
    detector = MtcnnDetector(model_folder='model', ctx=mx.cpu(0), num_worker=4, accurate_landmark=False)

    image_path = 'test2.jpg'
    img = cv2.imread(image_path)
    # cv2.imread does not raise on a missing or unreadable file, it returns
    # None; stop here with a clear message instead of failing later inside
    # the detector.
    if img is None:
        raise SystemExit('cannot read image: ' + image_path)

    # run detector
    results = detector.detect_face(img)

    if results is not None:
        total_boxes = results[0]
        points = results[1]

        # extract aligned face chips
        chips = detector.extract_image_chips(img, points, 144, 0.37)
        for i, chip in enumerate(chips):
            cv2.imshow('chip_'+str(i), chip)
            cv2.imwrite('chip_'+str(i)+'.png', chip)

        # draw on a copy so that the source image stays untouched
        draw = img.copy()
        for b in total_boxes:
            cv2.rectangle(draw, (int(b[0]), int(b[1])), (int(b[2]), int(b[3])), (255, 255, 255))

        # each row of points: five x values followed by five y values
        for p in points:
            for i in range(5):
                cv2.circle(draw, (int(p[i]), int(p[i + 5])), 1, (0, 0, 255), 2)

        cv2.imshow("detection result", draw)
        cv2.waitKey(0)

    # --------------
    # test on camera
    # --------------
    # Disabled variant of the demo that reads frames from camera 0.
    # camera = cv2.VideoCapture(0)
    # while True:
    #     grab, frame = camera.read()
    #     img = cv2.resize(frame, (320,180))
    #
    #     t1 = time.time()
    #     results = detector.detect_face(img)
    #     print('time: ',time.time() - t1)
    #
    #     if results is None:
    #         continue
    #
    #     total_boxes = results[0]
    #     points = results[1]
    #
    #     draw = img.copy()
    #     for b in total_boxes:
    #         cv2.rectangle(draw, (int(b[0]), int(b[1])), (int(b[2]), int(b[3])), (255, 255, 255))
    #
    #     for p in points:
    #         for i in range(5):
    #             cv2.circle(draw, (int(p[i]), int(p[i + 5])), 1, (255, 0, 0), 2)
    #     cv2.imshow("detection result", draw)
    #     cv2.waitKey(30)
