树莓派4B使用ncnn部署yolov5-Lite,推理耗时 247ms 包含前后处理_树莓派4b配置yolov5


一. 引言


二. 版本选择


1. ncnn版本



git clone https://github.com/Tencent/ncnn.git
sudo apt-get install -y gfortran
sudo apt-get install -y libprotobuf-dev libleveldb-dev libsnappy-dev libopencv-dev libhdf5-serial-dev protobuf-compiler
sudo apt-get install --no-install-recommends libboost-all-dev
sudo apt-get install -y libgflags-dev libgoogle-glog-dev liblmdb-dev libatlas-base-dev
cd ~/ncnn
mkdir build
cd build
cmake ..
make -j4
make install
2. yolov5-Lite版本

由于作者版本更新,我目前使用最新的版本时候,有一些bug在ncnn上,没来及修改,于是我使用了之前的版本。推荐这个版本 yolov5-Lite版本 https://github.com/ppogg/YOLOv5-Lite/tree/v1.4


git clone https://github.com/ppogg/YOLOv5-Lite.git
# pip install -r requirements.txt

# base ----------------------------------------

# logging -------------------------------------
# wandb

# plotting ------------------------------------

# export --------------------------------------
# coremltools>=4.1
# onnx>=1.9.1
# scikit-learn==0.19.2  # for coreml quantization

# extras --------------------------------------
thop  # FLOPS computation
pycocotools>=2.0  # COCO mAP
1. pt 转 onnx

cd ~/yolov5-Lite-1.4/
python3 export.py --weights "./best.pt" --batch-size 1 --img-size 320
pip3 install onnx-simplifier
python -m onnxsim best.onnx best-sim.onnx
2. ncnn 模型转换

cd ~/ncnn/build/tools
./onnx2ncnn ./best-sim.onnx ./best-sim.param ./best-sim.bin
./ncnnoptimize ./best-sim.param ./best-sim.bin ./best-sim-fp16.param ./best-sim-fp16.bin 65536
其中 65536 是将模型转换为 fp16 的开关
65536 来自源码,可通过 vim ~/ncnn/tools/ncnnoptimize.cpp 查看
fp16 设置源码

3. 修改best-sim.param文件


修改前:
Reshape  Reshape_468 1 1 632 650 0=6400 1=85 2=3
Reshape  Reshape_484 1 1 652 670 0=1600 1=85 2=3
Reshape  Reshape_500 1 1 672 690 0=400 1=85 2=3
修改后:
Reshape  Reshape_468 1 1 632 650 0=-1 1=85 2=3
Reshape  Reshape_484 1 1 652 670 0=-1 1=85 2=3
Reshape  Reshape_500 1 1 672 690 0=-1 1=85 2=3
其中 632 表示输入 blob 名称,650 表示输出 blob 名称;0 表示第 1 维度的个数,1 表示第 2 维度的个数,2 表示第 3 维度的个数
#include "layer.h"
#include "net.h"
#include "simpleocv.h"
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/opencv.hpp>
#include <float.h>
#include <stdio.h>
#include <sys/time.h>
#include <algorithm>
#include <chrono>
#include <cmath>
#include <iostream>
#include <vector>
using namespace std;  
using namespace cv;  
using namespace std::chrono;  
// 0 : FP16
// 1 : INT8
#define USE_INT8 0
// 0 : Image
// 1 : Camera
#define USE_CAMERA 0
struct Object
    cv::Rect_<float> rect;
    int label;
    float prob;
static inline float intersection_area(const Object& a, const Object& b)
    cv::Rect_<float> inter = a.rect & b.rect;
    return inter.area();
static void qsort_descent_inplace(std::vector<Object>& faceobjects, int left, int right)
    int i = left;
    int j = right;
    float p = faceobjects[(left + right) / 2].prob;
    while (i <= j)
        while (faceobjects[i].prob > p)
        while (faceobjects[j].prob < p)
        if (i <= j)
            // swap
            std::swap(faceobjects[i], faceobjects[j]);
    #pragma omp parallel sections
        #pragma omp section
            if (left < j) qsort_descent_inplace(faceobjects, left, j);
        #pragma omp section
            if (i < right) qsort_descent_inplace(faceobjects, i, right);
static void qsort_descent_inplace(std::vector<Object>& faceobjects)
    if (faceobjects.empty())
    qsort_descent_inplace(faceobjects, 0, faceobjects.size() - 1);
static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold)
    const int n = faceobjects.size();
    std::vector<float> areas(n);
    for (int i = 0; i < n; i++)
        areas[i] = faceobjects[i].rect.area();
    for (int i = 0; i < n; i++)
        const Object& a = faceobjects[i];
        int keep = 1;
        for (int j = 0; j < (int)picked.size(); j++)
            const Object& b = faceobjects[picked[j]];
            // intersection over union
            float inter_area = intersection_area(a, b);
            float union_area = areas[i] + areas[picked[j]] - inter_area;
            // float IoU = inter_area / union_area
            if (inter_area / union_area > nms_threshold)
                keep = 0;
        if (keep)
/// Logistic function: 1 / (1 + e^-x).
static inline float sigmoid(float x)
{
    return 1.f / (1.f + std::exp(-x));
}
/// Inverse of the logistic function: returns x such that 1 / (1 + e^-x) == y.
/// Evaluated in double precision, then narrowed to float.
static inline float unsigmoid(float y)
{
    return static_cast<float>(-1.0 * std::log((1.0 / y) - 1.0));
}
/// Decode one output blob of the network into candidate boxes.
///
/// @param anchors        flat list of anchor sizes, read as (width, height) pairs
/// @param stride         divides the padded input size to get the grid size and
///                       scales cell offsets back to input pixels
/// @param in_pad         padded network input; only its w and h are read
/// @param feat_blob      one channel per anchor, one row per grid cell; each row is
///                       read as x, y, w, h, box score, then one score per class
/// @param prob_threshold minimum sigmoid(box score) * sigmoid(best class score)
/// @param objects        output: accepted candidates are appended, with boxes
///                       expressed in `in_pad` pixel coordinates
static void generate_proposals(const ncnn::Mat &anchors, int stride, const ncnn::Mat &in_pad,
                               const ncnn::Mat &feat_blob, float prob_threshold,
                               std::vector <Object> &objects)
{
    const int num_grid = feat_blob.h;

    // The blob stores the grid flattened into rows: derive one side from the
    // longer padded side and the other from the total row count.
    int num_grid_x;
    int num_grid_y;
    if (in_pad.w > in_pad.h)
    {
        num_grid_x = in_pad.w / stride;
        num_grid_y = num_grid / num_grid_x;
    }
    else
    {
        num_grid_y = in_pad.h / stride;
        num_grid_x = num_grid / num_grid_y;
    }

    const int num_class = feat_blob.w - 5;
    const int num_anchors = anchors.w / 2;

    for (int q = 0; q < num_anchors; q++)
    {
        const float anchor_w = anchors[q * 2];
        const float anchor_h = anchors[q * 2 + 1];

        const ncnn::Mat feat = feat_blob.channel(q);

        for (int i = 0; i < num_grid_y; i++)
        {
            for (int j = 0; j < num_grid_x; j++)
            {
                const float *featptr = feat.row(i * num_grid_x + j);

                // confidence = box_prob * class_prob and class_prob <= 1, so a cell
                // whose box probability is already below the threshold can never be
                // accepted: skip it before scanning the class scores.
                const float box_prob = sigmoid(featptr[4]);
                if (box_prob < prob_threshold)
                    continue;

                // find class index with max class score
                int class_index = 0;
                float class_score = -FLT_MAX;
                for (int k = 0; k < num_class; k++)
                {
                    const float score = featptr[5 + k];
                    if (score > class_score)
                    {
                        class_index = k;
                        class_score = score;
                    }
                }

                const float confidence = box_prob * sigmoid(class_score);
                if (confidence < prob_threshold)
                    continue;

                const float dx = sigmoid(featptr[0]);
                const float dy = sigmoid(featptr[1]);
                const float dw = sigmoid(featptr[2]);
                const float dh = sigmoid(featptr[3]);

                // box centre and size in padded-input pixels
                const float pb_cx = (dx * 2.f - 0.5f + j) * stride;
                const float pb_cy = (dy * 2.f - 0.5f + i) * stride;
                const float pb_w = (dw * 2.f) * (dw * 2.f) * anchor_w;
                const float pb_h = (dh * 2.f) * (dh * 2.f) * anchor_h;

                const float x0 = pb_cx - pb_w * 0.5f;
                const float y0 = pb_cy - pb_h * 0.5f;
                const float x1 = pb_cx + pb_w * 0.5f;
                const float y1 = pb_cy + pb_h * 0.5f;

                Object obj;
                obj.rect.x = x0;
                obj.rect.y = y0;
                obj.rect.width = x1 - x0;
                obj.rect.height = y1 - y0;
                obj.label = class_index;
                obj.prob = confidence;

                objects.push_back(obj);
            }
        }
    }
}
/// Run the detector on one BGR image and store the final boxes in `objects`,
/// expressed in the coordinates of the original image.
///
/// Steps: resize so the longer side equals target_size, pad each side up to a
/// multiple of 32, run the net, decode the three output blobs (strides 8, 16,
/// 32), sort the candidates by score, apply NMS, then undo the padding and
/// scaling and clip every box to the image.
///
/// The net is constructed and loaded on every call.
///
/// @param bgr     input image in BGR channel order
/// @param objects output: replaced with the detections that survive NMS
/// @return 0 on success, -1 when the image is empty or the model cannot be loaded
static int detect_yolov5(const cv::Mat& bgr, std::vector<Object>& objects)
{
    objects.clear();

    // an empty frame would make the scale computation below divide by zero
    if (bgr.empty())
        return -1;

    ncnn::Net yolov5;

    // NOTE(review): the int8 branch body and the #else/#endif lines are missing
    // from the article's listing; reconstructed here, confirm against your source.
#if USE_INT8
    yolov5.opt.use_int8_inference = true;
#else
    yolov5.opt.use_vulkan_compute = true;
    yolov5.opt.use_bf16_storage = true;
#endif

    // NOTE(review): the load_param()/load_model() calls are also missing from the
    // listing. The names below are the FP16 files written by the ncnnoptimize
    // step of this article; adjust the paths (or substitute int8 files).
    if (yolov5.load_param("best-sim-fp16.param") != 0 || yolov5.load_model("best-sim-fp16.bin") != 0)
    {
        fprintf(stderr, "failed to load model files\n");
        return -1;
    }

    const int target_size = 320;
    const float prob_threshold = 0.60f;
    const float nms_threshold = 0.60f;

    const int img_w = bgr.cols;
    const int img_h = bgr.rows;

    // letterbox: scale the longer side to target_size, keep the aspect ratio
    int w = img_w;
    int h = img_h;
    float scale = 1.f;
    if (w > h)
    {
        scale = (float)target_size / w;
        w = target_size;
        h = static_cast<int>(h * scale);
    }
    else
    {
        scale = (float)target_size / h;
        h = target_size;
        w = static_cast<int>(w * scale);
    }

    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h);

    // pad both sides up to the next multiple of 32, split evenly around the image
    const int wpad = (w + 31) / 32 * 32 - w;
    const int hpad = (h + 31) / 32 * 32 - h;
    ncnn::Mat in_pad;
    ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f);

    const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
    in_pad.substract_mean_normalize(0, norm_vals);

    ncnn::Extractor ex = yolov5.create_extractor();

    ex.input("images", in_pad);

    std::vector<Object> proposals;

    // stride 8
    {
        ncnn::Mat out;
        ex.extract("output", out);

        ncnn::Mat anchors(6);
        anchors[0] = 10.f;
        anchors[1] = 13.f;
        anchors[2] = 16.f;
        anchors[3] = 30.f;
        anchors[4] = 33.f;
        anchors[5] = 23.f;

        std::vector<Object> objects8;
        generate_proposals(anchors, 8, in_pad, out, prob_threshold, objects8);

        proposals.insert(proposals.end(), objects8.begin(), objects8.end());
    }

    // stride 16
    {
        ncnn::Mat out;
        ex.extract("671", out);

        ncnn::Mat anchors(6);
        anchors[0] = 30.f;
        anchors[1] = 61.f;
        anchors[2] = 62.f;
        anchors[3] = 45.f;
        anchors[4] = 59.f;
        anchors[5] = 119.f;

        std::vector<Object> objects16;
        generate_proposals(anchors, 16, in_pad, out, prob_threshold, objects16);

        proposals.insert(proposals.end(), objects16.begin(), objects16.end());
    }

    // stride 32
    {
        ncnn::Mat out;
        ex.extract("691", out);

        ncnn::Mat anchors(6);
        anchors[0] = 116.f;
        anchors[1] = 90.f;
        anchors[2] = 156.f;
        anchors[3] = 198.f;
        anchors[4] = 373.f;
        anchors[5] = 326.f;

        std::vector<Object> objects32;
        generate_proposals(anchors, 32, in_pad, out, prob_threshold, objects32);

        proposals.insert(proposals.end(), objects32.begin(), objects32.end());
    }

    // sort all proposals by score from highest to lowest
    qsort_descent_inplace(proposals);

    // apply nms with nms_threshold
    std::vector<int> picked;
    nms_sorted_bboxes(proposals, picked, nms_threshold);

    const int count = static_cast<int>(picked.size());

    // size the output before indexing into it
    objects.resize(count);
    for (int i = 0; i < count; i++)
    {
        objects[i] = proposals[picked[i]];

        // adjust offset to original unpadded
        float x0 = (objects[i].rect.x - (wpad / 2)) / scale;
        float y0 = (objects[i].rect.y - (hpad / 2)) / scale;
        float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale;
        float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale;

        // clip
        x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
        y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
        x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
        y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);

        objects[i].rect.x = x0;
        objects[i].rect.y = y0;
        objects[i].rect.width = x1 - x0;
        objects[i].rect.height = y1 - y0;
    }

    return 0;
}
/// Draw every detection on a copy of `bgr`, log it to stderr, show the result in
/// the "camera" window and write it to result.jpg.
///
/// @param bgr     image the detections refer to; it is not modified
/// @param objects detections to draw
static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
{
    static const char* class_names[] = {
        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
        "hair drier", "toothbrush"
    };
    const int num_names = static_cast<int>(sizeof(class_names) / sizeof(class_names[0]));

    cv::Mat image = bgr.clone();

    for (size_t i = 0; i < objects.size(); i++)
    {
        const Object& obj = objects[i];

        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);

        cv::rectangle(image, obj.rect, cv::Scalar(0, 255, 0));

        // the class count comes from the model, so the index may fall outside this table
        const char* name = (obj.label >= 0 && obj.label < num_names) ? class_names[obj.label] : "unknown";

        char text[256];
        snprintf(text, sizeof(text), "%s %.1f%%", name, obj.prob * 100);

        int baseLine = 0;
        const cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

        // keep the label inside the image: clamp at the top and right edges
        int x = obj.rect.x;
        int y = obj.rect.y - label_size.height - baseLine;
        if (y < 0)
            y = 0;
        if (x + label_size.width > image.cols)
            x = image.cols - label_size.width;

        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
                      cv::Scalar(255, 255, 255), -1);

        cv::putText(image, text, cv::Point(x, y + label_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
    }

    cv::imshow("camera", image);
    cv::imwrite("result.jpg", image);
}
int main(int argc, char** argv)
    cv::VideoCapture capture;
    capture.open(0);  //修改这个参数可以选择打开想要用的摄像头
    cv::Mat frame;
    int FPS = 0;  
    int total_frames = 0;  
    high_resolution_clock::time_point t1, t2; 
    while (true)
        capture >> frame;
        cv::Mat m = frame;
        cv::Mat f = frame;
        std::vector<Object> objects;
        auto start_time = std::chrono::high_resolution_clock::now();  // 记录开始时间 
        detect_yolov5(frame, objects);
        auto end_time = std::chrono::high_resolution_clock::now();  // 记录结束时间  
        auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);  // 计算执行时间 
        float fps = (float)(1000)/duration.count();
        draw_objects(m, objects);
        cout << "FPS: " << fps << endl;  
        //int fps = 1000/duration.count();
        //int x = m.cols-50;
        //int y = m.rows-50;
        //cv::putText(f, to_string(fps), cv::Point(100, 100), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); 
        //if (cv::waitKey(30) >= 0)
int main(int argc, char** argv)
    if (argc != 2)
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    const char* imagepath = argv[1];
    struct timespec begin, end;
    long time;
    clock_gettime(CLOCK_MONOTONIC, &begin);
    cv::Mat m = cv::imread(imagepath, 1);
    if (m.empty())
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    std::vector<Object> objects;
    detect_yolov5(m, objects);
    clock_gettime(CLOCK_MONOTONIC, &end);
    time = (end.tv_sec - begin.tv_sec) + (end.tv_nsec - begin.tv_nsec);
    printf(">> Time : %lf ms\n", (double)time/1000000);
    draw_objects(m, objects);
    return 0;
ex.extract("output", out)
Permute Transpose_469 1 1 650 output 0=1
ex.extract("671", out)
Permute Transpose_485 1 1 670 671 0=1
ex.extract("691", out)
Permute Transpose_501 1 1 690 691 0=1
ncnn::Mat anchors(6);
        anchors[0] = 10.f;
        anchors[1] = 13.f;
        anchors[2] = 16.f;
        anchors[3] = 30.f;
        anchors[4] = 33.f;
        anchors[5] = 23.f;

anchors[0] = 30.f;
        anchors[1] = 61.f;
        anchors[2] = 62.f;
        anchors[3] = 45.f;
        anchors[4] = 59.f;
        anchors[5] = 119.f;

anchors[0] = 116.f;
        anchors[1] = 90.f;
        anchors[2] = 156.f;
        anchors[3] = 198.f;
        anchors[4] = 373.f;
        anchors[5] = 326.f;
export ncnn_DIR=~/ncnn/build/install/lib/cmake/ncnn
cd ~/ncnn
mkdir test
cmake_minimum_required(VERSION 3.10.2)

# set(ncnn_DIR "/xxx/path/to/ncnn/xxx/lib/cmake/ncnn")

find_package(ncnn REQUIRED)
find_package(OpenCV REQUIRED)

add_executable(ncnnv5lite v5lite-s.cpp)

target_link_libraries(ncnnv5lite ncnn ${OpenCV_LIBS})
cp ~/ncnn/v5lite-s.cpp ~/ncnn/test/
  • 1


cd ~/ncnn/test
mkdir build
cd build
cmake ..
make -j4
五. 推理结果

推理耗时 247ms 包含前后处理

