当前位置:   article > 正文

多线程异步提高RK3588的NPU占用率,进而提高yolov5s帧率_rk3588 多线程

rk3588 多线程


        作者手头上有一块香橙派5,其搭载有一颗三核心6TOPS算力的NPU, 由于发布时间不长,社区的资料还是比较匮乏, 所以在这里写一下关于如何提高NPU使用率的教程

        文章和代码使用yolov5s进行讲解, 其他模型如resnet之类的同理, 稍作修改就可以使用。 由于已经有很多人(如蓝灵风、孙启尧等)做了如何通过修改模型提高视频推理帧率的教程, 这里我主要讲另外一种提升性能的方法——多线程异步。


        yolov5s模型激活函数为silu, 此激活函数量化类型为float16, 导致推理过程中使用CPU进行计算, 量化效果较糟。 将激活函数换为relu, 可以在牺牲一点精度的情况下获得巨大性能提升, 目前测试约为80 - 83帧, c++优化后或许有上百? 详情可看蓝灵风大佬的演示视频



# Print the NPU load figures exposed by the rknpu driver under debugfs.
sudo cat /sys/kernel/debug/rknpu/load




# Each call initialises the runtime with a different single-core mask.
# NOTE(review): presumably these are alternatives (pick one per RKNNLite
# instance), not a sequence to run on the same object - confirm.
rknn_lite.init_runtime(core_mask=RKNNLite.NPU_CORE_0)
rknn_lite.init_runtime(core_mask=RKNNLite.NPU_CORE_1)
rknn_lite.init_runtime(core_mask=RKNNLite.NPU_CORE_2)


  1. def initRKNN(rknnModel="./rknnModel/yolov5s.rknn", id=0):
  2. rknn_lite = RKNNLite()
  3. ret = rknn_lite.load_rknn(rknnModel)
  4. if ret != 0:
  5. print("Load RKNN rknnModel failed")
  6. exit(ret)
  7. if id == 0:
  8. ret = rknn_lite.init_runtime(core_mask=RKNNLite.NPU_CORE_0)
  9. elif id == 1:
  10. ret = rknn_lite.init_runtime(core_mask=RKNNLite.NPU_CORE_1)
  11. elif id == 2:
  12. ret = rknn_lite.init_runtime(core_mask=RKNNLite.NPU_CORE_2)
  13. elif id == -1:
  14. ret = rknn_lite.init_runtime(core_mask=RKNNLite.NPU_CORE_0_1_2)
  15. else:
  16. ret = rknn_lite.init_runtime()
  17. if ret != 0:
  18. print("Init runtime environment failed")
  19. exit(ret)
  20. print(rknnModel, "\t\tdone")
  21. return rknn_lite


  1. def initRKNNs(rknnModel="./rknnModel/yolov5s.rknn", TPEs=1):
  2. rknn_list = []
  3. for i in range(TPEs):
  4. rknn_list.append(initRKNN(rknnModel, i % 3))
  5. return rknn_list


  1. class rknnPoolExecutor():
  2. def __init__(self, rknnModel, TPEs, func):
  3. self.TPEs = TPEs
  4. self.queue = Queue()
  5. self.rknnPool = initRKNNs(rknnModel, TPEs)
  6. self.pool = ThreadPoolExecutor(max_workers=TPEs)
  7. self.func = func
  8. self.num = 0
  9. def put(self, frame):
  10. self.queue.put(self.pool.submit(
  11. self.func, self.rknnPool[self.num % self.TPEs], frame))
  12. self.num += 1
  13. def get(self):
  14. if self.queue.empty():
  15. return None, False
  16. temp = []
  17. temp.append(self.queue.get())
  18. for frame in as_completed(temp):
  19. return frame.result(), True
  20. def release(self):
  21. self.pool.shutdown()
  22. for rknn_lite in self.rknnPool:
  23. rknn_lite.release()


  1. # 线程数
  2. TPEs = 6
  3. # 初始化rknn池
  4. pool = rknnPoolExecutor(
  5. rknnModel=modelPath,
  6. TPEs=TPEs,
  7. func=myFunc)
  8. # 初始化异步所需要的帧
  9. if (cap.isOpened()):
  10. for i in range(TPEs + 1):
  11. ret, frame = cap.read()
  12. if not ret:
  13. cap.release()
  14. del pool
  15. exit(-1)
  16. pool.put(frame)


  1. frames, loopTime, initTime = 0, time.time(), time.time()
  2. while (cap.isOpened()):
  3. frames += 1
  4. ret, frame = cap.read()
  5. if not ret:
  6. break
  7. pool.put(frame)
  8. frame, flag = pool.get()
  9. if flag == False:
  10. break
  11. cv2.imshow('test', frame)
  12. if cv2.waitKey(1) & 0xFF == ord('q'):
  13. break
  14. if frames % 30 == 0:
  15. print("30帧平均帧率:\t", 30 / (time.time() - loopTime), "帧")
  16. loopTime = time.time()


# Overall average: frames counted by the loop divided by the seconds
# elapsed since initTime.
print("总平均帧率\t", frames / (time.time() - initTime))
# Release the capture and the RKNN thread pool
cap.release()
cv2.destroyAllWindows()
pool.release()


        测试模型来源 yolov5s,激活函数为silu(非relu优化版本)


        测试视频为 新宝岛





  • yolov5s在6线程下NPU利用率仅有50 - 60%左右, 性能劣化原因猜想:
    1. 受GIL限制, python的多线程是伪多线程, 换为c++或许在8线程前仍有较大提升
    2. rk3588的CPU性能跟不上, 对OpenCV绘框部分做c++优化或许有提升


        可移步rknn多线程获取yolov5s, resnet26, resnet50的rknn模型、完整代码和演示视频


  1. import cv2
  2. import time
  3. from rknnpool import rknnPoolExecutor
  4. # 图像处理函数,实际应用过程中需要自行修改
  5. from func import myFunc
  6. cap = cv2.VideoCapture('./video/islandBenchmark.mp4')
  7. # cap = cv2.VideoCapture(0)
  8. modelPath = "./rknnModel/yolov5s.rknn"
  9. # 线程数
  10. TPEs = 6
  11. # 初始化rknn池
  12. pool = rknnPoolExecutor(
  13. rknnModel=modelPath,
  14. TPEs=TPEs,
  15. func=myFunc)
  16. # 初始化异步所需要的帧
  17. if (cap.isOpened()):
  18. for i in range(TPEs + 1):
  19. ret, frame = cap.read()
  20. if not ret:
  21. cap.release()
  22. del pool
  23. exit(-1)
  24. pool.put(frame)
  25. frames, loopTime, initTime = 0, time.time(), time.time()
  26. while (cap.isOpened()):
  27. frames += 1
  28. ret, frame = cap.read()
  29. if not ret:
  30. break
  31. pool.put(frame)
  32. frame, flag = pool.get()
  33. if flag == False:
  34. break
  35. cv2.imshow('test', frame)
  36. if cv2.waitKey(1) & 0xFF == ord('q'):
  37. break
  38. if frames % 30 == 0:
  39. print("30帧平均帧率:\t", 30 / (time.time() - loopTime), "帧")
  40. loopTime = time.time()
  41. print("总平均帧率\t", frames / (time.time() - initTime))
  42. # 释放cap和rknn线程池
  43. cap.release()
  44. cv2.destroyAllWindows()
  45. pool.release()


  1. from queue import Queue
  2. from rknnlite.api import RKNNLite
  3. from concurrent.futures import ThreadPoolExecutor, as_completed
  4. def initRKNN(rknnModel="./rknnModel/yolov5s.rknn", id=0):
  5. rknn_lite = RKNNLite()
  6. ret = rknn_lite.load_rknn(rknnModel)
  7. if ret != 0:
  8. print("Load RKNN rknnModel failed")
  9. exit(ret)
  10. if id == 0:
  11. ret = rknn_lite.init_runtime(core_mask=RKNNLite.NPU_CORE_0)
  12. elif id == 1:
  13. ret = rknn_lite.init_runtime(core_mask=RKNNLite.NPU_CORE_1)
  14. elif id == 2:
  15. ret = rknn_lite.init_runtime(core_mask=RKNNLite.NPU_CORE_2)
  16. elif id == -1:
  17. ret = rknn_lite.init_runtime(core_mask=RKNNLite.NPU_CORE_0_1_2)
  18. else:
  19. ret = rknn_lite.init_runtime()
  20. if ret != 0:
  21. print("Init runtime environment failed")
  22. exit(ret)
  23. print(rknnModel, "\t\tdone")
  24. return rknn_lite
  25. def initRKNNs(rknnModel="./rknnModel/yolov5s.rknn", TPEs=1):
  26. rknn_list = []
  27. for i in range(TPEs):
  28. rknn_list.append(initRKNN(rknnModel, i % 3))
  29. return rknn_list
  30. class rknnPoolExecutor():
  31. def __init__(self, rknnModel, TPEs, func):
  32. self.TPEs = TPEs
  33. self.queue = Queue()
  34. self.rknnPool = initRKNNs(rknnModel, TPEs)
  35. self.pool = ThreadPoolExecutor(max_workers=TPEs)
  36. self.func = func
  37. self.num = 0
  38. def put(self, frame):
  39. self.queue.put(self.pool.submit(
  40. self.func, self.rknnPool[self.num % self.TPEs], frame))
  41. self.num += 1
  42. def get(self):
  43. if self.queue.empty():
  44. return None, False
  45. temp = []
  46. temp.append(self.queue.get())
  47. for frame in as_completed(temp):
  48. return frame.result(), True
  49. def release(self):
  50. self.pool.shutdown()
  51. for rknn_lite in self.rknnPool:
  52. rknn_lite.release()


  1. #以下代码改自https://github.com/rockchip-linux/rknn-toolkit2/tree/master/examples/onnx/yolov5
  2. import cv2
  3. import numpy as np
  4. from rknnlite.api import RKNNLite
# Not referenced by any of the code shown in this file.
QUANTIZE_ON = True
# OBJ_THRESH: minimum objectness and minimum best-class score for a box to
# be kept. NMS_THRESH: overlap above which a lower-scored box is suppressed.
# IMG_SIZE: side length in pixels that frames are resized to before inference.
OBJ_THRESH, NMS_THRESH, IMG_SIZE = 0.25, 0.45, 640
# Labels indexed by predicted class id when captions are drawn. Several
# entries carry a trailing space, which appears verbatim in the caption.
CLASSES = ("person", "bicycle", "car", "motorbike ", "aeroplane ", "bus ", "train", "truck ", "boat", "traffic light",
           "fire hydrant", "stop sign ", "parking meter", "bench", "bird", "cat", "dog ", "horse ", "sheep", "cow", "elephant",
           "bear", "zebra ", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite",
           "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife ",
           "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza ", "donut", "cake", "chair", "sofa",
           "pottedplant", "bed", "diningtable", "toilet ", "tvmonitor", "laptop ", "mouse ", "remote ", "keyboard ", "cell phone", "microwave ",
           "oven ", "toaster", "sink", "refrigerator ", "book", "clock", "vase", "scissors ", "teddy bear ", "hair drier", "toothbrush ")
  14. def sigmoid(x):
  15. return 1 / (1 + np.exp(-x))
  16. def xywh2xyxy(x):
  17. # Convert [x, y, w, h] to [x1, y1, x2, y2]
  18. y = np.copy(x)
  19. y[:, 0] = x[:, 0] - x[:, 2] / 2 # top left x
  20. y[:, 1] = x[:, 1] - x[:, 3] / 2 # top left y
  21. y[:, 2] = x[:, 0] + x[:, 2] / 2 # bottom right x
  22. y[:, 3] = x[:, 1] + x[:, 3] / 2 # bottom right y
  23. return y
  24. def process(input, mask, anchors):
  25. anchors = [anchors[i] for i in mask]
  26. grid_h, grid_w = map(int, input.shape[0:2])
  27. box_confidence = sigmoid(input[..., 4])
  28. box_confidence = np.expand_dims(box_confidence, axis=-1)
  29. box_class_probs = sigmoid(input[..., 5:])
  30. box_xy = sigmoid(input[..., :2])*2 - 0.5
  31. col = np.tile(np.arange(0, grid_w), grid_w).reshape(-1, grid_w)
  32. row = np.tile(np.arange(0, grid_h).reshape(-1, 1), grid_h)
  33. col = col.reshape(grid_h, grid_w, 1, 1).repeat(3, axis=-2)
  34. row = row.reshape(grid_h, grid_w, 1, 1).repeat(3, axis=-2)
  35. grid = np.concatenate((col, row), axis=-1)
  36. box_xy += grid
  37. box_xy *= int(IMG_SIZE/grid_h)
  38. box_wh = pow(sigmoid(input[..., 2:4])*2, 2)
  39. box_wh = box_wh * anchors
  40. box = np.concatenate((box_xy, box_wh), axis=-1)
  41. return box, box_confidence, box_class_probs
  42. def filter_boxes(boxes, box_confidences, box_class_probs):
  43. """Filter boxes with box threshold. It's a bit different with origin yolov5 post process!
  44. # Arguments
  45. boxes: ndarray, boxes of objects.
  46. box_confidences: ndarray, confidences of objects.
  47. box_class_probs: ndarray, class_probs of objects.
  48. # Returns
  49. boxes: ndarray, filtered boxes.
  50. classes: ndarray, classes for boxes.
  51. scores: ndarray, scores for boxes.
  52. """
  53. boxes = boxes.reshape(-1, 4)
  54. box_confidences = box_confidences.reshape(-1)
  55. box_class_probs = box_class_probs.reshape(-1, box_class_probs.shape[-1])
  56. _box_pos = np.where(box_confidences >= OBJ_THRESH)
  57. boxes = boxes[_box_pos]
  58. box_confidences = box_confidences[_box_pos]
  59. box_class_probs = box_class_probs[_box_pos]
  60. class_max_score = np.max(box_class_probs, axis=-1)
  61. classes = np.argmax(box_class_probs, axis=-1)
  62. _class_pos = np.where(class_max_score >= OBJ_THRESH)
  63. boxes = boxes[_class_pos]
  64. classes = classes[_class_pos]
  65. scores = (class_max_score * box_confidences)[_class_pos]
  66. return boxes, classes, scores
  67. def nms_boxes(boxes, scores):
  68. """Suppress non-maximal boxes.
  69. # Arguments
  70. boxes: ndarray, boxes of objects.
  71. scores: ndarray, scores of objects.
  72. # Returns
  73. keep: ndarray, index of effective boxes.
  74. """
  75. x = boxes[:, 0]
  76. y = boxes[:, 1]
  77. w = boxes[:, 2] - boxes[:, 0]
  78. h = boxes[:, 3] - boxes[:, 1]
  79. areas = w * h
  80. order = scores.argsort()[::-1]
  81. keep = []
  82. while order.size > 0:
  83. i = order[0]
  84. keep.append(i)
  85. xx1 = np.maximum(x[i], x[order[1:]])
  86. yy1 = np.maximum(y[i], y[order[1:]])
  87. xx2 = np.minimum(x[i] + w[i], x[order[1:]] + w[order[1:]])
  88. yy2 = np.minimum(y[i] + h[i], y[order[1:]] + h[order[1:]])
  89. w1 = np.maximum(0.0, xx2 - xx1 + 0.00001)
  90. h1 = np.maximum(0.0, yy2 - yy1 + 0.00001)
  91. inter = w1 * h1
  92. ovr = inter / (areas[i] + areas[order[1:]] - inter)
  93. inds = np.where(ovr <= NMS_THRESH)[0]
  94. order = order[inds + 1]
  95. keep = np.array(keep)
  96. return keep
  97. def yolov5_post_process(input_data):
  98. masks = [[0, 1, 2], [3, 4, 5], [6, 7, 8]]
  99. anchors = [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45],
  100. [59, 119], [116, 90], [156, 198], [373, 326]]
  101. boxes, classes, scores = [], [], []
  102. for input, mask in zip(input_data, masks):
  103. b, c, s = process(input, mask, anchors)
  104. b, c, s = filter_boxes(b, c, s)
  105. boxes.append(b)
  106. classes.append(c)
  107. scores.append(s)
  108. boxes = np.concatenate(boxes)
  109. boxes = xywh2xyxy(boxes)
  110. classes = np.concatenate(classes)
  111. scores = np.concatenate(scores)
  112. nboxes, nclasses, nscores = [], [], []
  113. for c in set(classes):
  114. inds = np.where(classes == c)
  115. b = boxes[inds]
  116. c = classes[inds]
  117. s = scores[inds]
  118. keep = nms_boxes(b, s)
  119. nboxes.append(b[keep])
  120. nclasses.append(c[keep])
  121. nscores.append(s[keep])
  122. if not nclasses and not nscores:
  123. return None, None, None
  124. boxes = np.concatenate(nboxes)
  125. classes = np.concatenate(nclasses)
  126. scores = np.concatenate(nscores)
  127. return boxes, classes, scores
  128. def draw(image, boxes, scores, classes):
  129. for box, score, cl in zip(boxes, scores, classes):
  130. top, left, right, bottom = box
  131. # print('class: {}, score: {}'.format(CLASSES[cl], score))
  132. # print('box coordinate left,top,right,down: [{}, {}, {}, {}]'.format(top, left, right, bottom))
  133. top = int(top)
  134. left = int(left)
  135. right = int(right)
  136. bottom = int(bottom)
  137. cv2.rectangle(image, (top, left), (right, bottom), (255, 0, 0), 2)
  138. cv2.putText(image, '{0} {1:.2f}'.format(CLASSES[cl], score),
  139. (top, left - 6),
  141. 0.6, (0, 0, 255), 2)
  142. def letterbox(im, new_shape=(640, 640), color=(0, 0, 0)):
  143. shape = im.shape[:2] # current shape [height, width]
  144. if isinstance(new_shape, int):
  145. new_shape = (new_shape, new_shape)
  146. r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
  147. ratio = r, r # width, height ratios
  148. new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
  149. dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - \
  150. new_unpad[1] # wh padding
  151. dw /= 2 # divide padding into 2 sides
  152. dh /= 2
  153. if shape[::-1] != new_unpad: # resize
  154. im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
  155. top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
  156. left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
  157. im = cv2.copyMakeBorder(im, top, bottom, left, right,
  158. cv2.BORDER_CONSTANT, value=color) # add border
  159. return im, ratio, (dw, dh)
  160. def myFunc(rknn_lite, IMG):
  161. img = cv2.cvtColor(IMG, cv2.COLOR_BGR2RGB)
  162. img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
  163. outputs = rknn_lite.inference(inputs=[img])
  164. input0_data = outputs[0]
  165. input1_data = outputs[1]
  166. input2_data = outputs[2]
  167. input0_data = input0_data.reshape([3, -1]+list(input0_data.shape[-2:]))
  168. input1_data = input1_data.reshape([3, -1]+list(input1_data.shape[-2:]))
  169. input2_data = input2_data.reshape([3, -1]+list(input2_data.shape[-2:]))
  170. input_data = list()
  171. input_data.append(np.transpose(input0_data, (2, 3, 0, 1)))
  172. input_data.append(np.transpose(input1_data, (2, 3, 0, 1)))
  173. input_data.append(np.transpose(input2_data, (2, 3, 0, 1)))
  174. boxes, classes, scores = yolov5_post_process(input_data)
  175. img_1 = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
  176. if boxes is not None:
  177. draw(img_1, boxes, scores, classes)
  178. return img_1

