OpenCV实战——使用YOLO进行目标检测_yolo目标检测

作者：不正经 | 2024-04-09 07:00:23

踩

yolo目标检测

OpenCV实战——使用YOLO进行目标检测

0. 前言

在本节中，我们将使用 YOLO 算法执行目标检测。目标检测是计算机视觉中的一项常见任务，借助深度学习技术，我们可以实现高准确度的检测。YOLO 在 COCO 数据集(数据集中包含 80 个类别和超过 300000 张图像)中可以达到 60.6mAP (20 fps) 或 33mAP (220 fps)。

1. YOLO 模型简介

YOLO 是深度学习网络目标检测的一类重要分枝，其将输入图像划分为 SxS 网格。对于每个网格，YOLO 检查 B 个边界框，然后深度学习模型提取每个网格的边界框、包含可能对象的置信度以及每个边界框中(训练数据集中)每个类别的置信度：

YOLO 网格
YOLO 使用 19x19 个网格，每个网格包含 5 个边界框，训练数据集中包含 80 个类别。网络的输出结果为 19x19x425，其中 425 来自边界框 (x,y,width,height)、边界框中是否包含对象的置信度、对象属于每个类别(共 80 个类别)的置信度：

5_bounding box*(x,y,w,h,object_confidence,classify_confidence[80])=5*(4 + 1 + 80)
1

YOLO 架构基于 DarkNet (包含 53 层网络)，YOLO 在 DarkNet 的基础上增加了 53 层网络，共 106 层网络。如果我们需要预测速度更快的架构，可以使用包含较少网络层 TinyYOLO 架构。

2. 基于 YOLO 实现目标检测

在本节中，我们使用与深度学习简介一节相同的函数和类来加载模型、预处理图像和预测结果，同时介绍非极大值抑制 (non-maximum suppression, NMS)，以及绘制带有标签的预测结果：

(1) 创建 object_detection_yolo.cpp 文件，导入所需的头文件，初始化所需的全局变量：

#include <fstream>
#include <sstream>
#include <iostream>

#include <opencv2/core.hpp>
#include <opencv2/dnn.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>

using namespace cv;
using namespace dnn;
using namespace std;

// Initialize the parameters
float confThreshold = 0.5; // Confidence threshold
float nmsThreshold = 0.4;  // Non-maximum suppression threshold
int inpWidth = 416;  // Width of network's input image
int inpHeight = 416; // Height of network's input image
vector<string> classes;
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19

(2) 我们从 main 函数开始，首先读取存储模型可以预测的所有类别的文件：

int main(int argc, char** argv) {
    // 加载类别名
    string classesFile = "data/coco.names";
    ifstream ifs(classesFile.c_str());
    string line;
    while (getline(ifs, line)) classes.push_back(line);
1
2
3
4
5
6

(3) 使用模型定义和权重文件加载模型：

    // 提供模型的配置和权重文件
    String modelConfiguration = "data/yolov3.cfg";
    String modelWeights = "data/yolov3.weights";
    // 加载网络
    Net net = readNetFromDarknet(modelConfiguration, modelWeights);
1
2
3
4
5

(4) 加载图像并将其转换为 blob：

    Mat input, blob;
    input= imread(argv[1]);
    if (input.empty()) {
        cout << "No input image" << endl;
        return 0;
    }
    // 创建输入
    blobFromImage(input, blob, 1/255.0, Size(inpWidth, inpHeight), Scalar(0,0,0), true, false);
1
2
3
4
5
6
7
8

(5) 使用 setInput 和 forward 函数检测所有对象及其类别：

    // 设定网络输入
    net.setInput(blob);
    // 执行前向传播
    vector<Mat> outs;
    net.forward(outs, getOutputsNames(net));
1
2
3
4
5

(6) 对输出结果进行后处理，绘制检测到的目标及预测置信度：

    // 移除低置信度边界框
    postprocess(input, outs);
1
2

(7) 在 postprocess 函数中，存储所有预测置信度高于 confThreshold 的边界框框：

    vector<int> classIds;
    vector<float> confidences;
    vector<Rect> boxes;
    for (size_t i = 0; i < outs.size(); ++i) {
        // 扫描网络输出的所有边界框，仅保留具有高置信度分数的边界框
        // 将边界框的类标签指定为边界框得分最高的类别
        float* data = (float*)outs[i].data;
        for (int j = 0; j < outs[i].rows; ++j, data += outs[i].cols) {
            Mat scores = outs[i].row(j).colRange(5, outs[i].cols);
            Point classIdPoint;
            double confidence;
            // 获取最大分数的值和位置
            minMaxLoc(scores, 0, &confidence, 0, &classIdPoint);
            if (confidence > confThreshold) {
                int centerX = (int)(data[0] * frame.cols);
                int centerY = (int)(data[1] * frame.rows);
                int width = (int)(data[2] * frame.cols);
                int height = (int)(data[3] * frame.rows);
                int left = centerX - width / 2;
                int top = centerY - height / 2;
                
                classIds.push_back(classIdPoint.x);
                confidences.push_back((float)confidence);
                boxes.push_back(Rect(left, top, width, height));
            }
        }
    }
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27

(8) 使用 NMSBoxes 函数应用非极大值抑制，只得到具有高置信度的非重叠边界框并进行绘制：

    // 执行非极大值抑制
    // 消除具有较低置信度的冗余重叠边界框
    vector<int> indices;
    NMSBoxes(boxes, confidences, confThreshold, nmsThreshold, indices);
    for (size_t i = 0; i < indices.size(); ++i) {
        int idx = indices[i];
        Rect box = boxes[idx];
        drawPred(classIds[idx], confidences[idx], box.x, box.y,
                 box.x + box.width, box.y + box.height, frame);
    }
1
2
3
4
5
6
7
8
9
10

使用 YOLO 执行目标检测的结果如下所示：

检测结果

3. 完整代码

完整代码 object_detection_yolo.cpp 如下所示：

#include <fstream>
#include <sstream>
#include <iostream>

#include <opencv2/core/core.hpp>
#include <opencv2/dnn.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/highgui/highgui.hpp>

using namespace cv;
using namespace dnn;
using namespace std;

// 初始化参数
float confThreshold = 0.5;  // 置信度阈值
float nmsThreshold = 0.4;   // 非极大值抑制阈值
int inpWidth = 416;         // 网络输入图像宽度
int inpHeight = 416;        // 网络输入图像高度
vector<string> classes;

// 绘制预测边界框
void drawPred(int classId, float conf, int left, int top, int right, int bottom, Mat& frame) {
    // 绘制显示边界框矩形
    rectangle(frame, Point(left, top), Point(right, bottom), Scalar(255, 255, 255), 1);
    // 获取类别名的标签及其置信度
    string conf_label = format("%.2f", conf);
    string label="";
    if (!classes.empty()) {
        label = classes[classId] + ":" + conf_label;
    }
    // 在边界框顶部显示标签
    int baseLine;
    Size labelSize = getTextSize(label, FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
    top = max(top, labelSize.height);
    rectangle(frame, Point(left, top - labelSize.height), Point(left + labelSize.width, top + baseLine), Scalar(255, 255, 255), FILLED);
    putText(frame, label, Point(left, top), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0,0,0),1,LINE_AA);
}

// 使用非最大值抑制移除置信度低的边界框
void postprocess(Mat& frame, const vector<Mat>& outs) {
    vector<int> classIds;
    vector<float> confidences;
    vector<Rect> boxes;
    for (size_t i = 0; i < outs.size(); ++i) {
        // 扫描网络输出的所有边界框，仅保留具有高置信度分数的边界框
        // 将边界框的类标签指定为边界框得分最高的类别
        float* data = (float*)outs[i].data;
        for (int j = 0; j < outs[i].rows; ++j, data += outs[i].cols) {
            Mat scores = outs[i].row(j).colRange(5, outs[i].cols);
            Point classIdPoint;
            double confidence;
            // 获取最大分数的值和位置
            minMaxLoc(scores, 0, &confidence, 0, &classIdPoint);
            if (confidence > confThreshold) {
                int centerX = (int)(data[0] * frame.cols);
                int centerY = (int)(data[1] * frame.rows);
                int width = (int)(data[2] * frame.cols);
                int height = (int)(data[3] * frame.rows);
                int left = centerX - width / 2;
                int top = centerY - height / 2;
                
                classIds.push_back(classIdPoint.x);
                confidences.push_back((float)confidence);
                boxes.push_back(Rect(left, top, width, height));
            }
        }
    }
    
    // 执行非极大值抑制
    // 消除具有较低置信度的冗余重叠边界框
    vector<int> indices;
    NMSBoxes(boxes, confidences, confThreshold, nmsThreshold, indices);
    for (size_t i = 0; i < indices.size(); ++i) {
        int idx = indices[i];
        Rect box = boxes[idx];
        drawPred(classIds[idx], confidences[idx], box.x, box.y,
                 box.x + box.width, box.y + box.height, frame);
    }
}

// 获取输出层的名称
vector<String> getOutputsNames(const Net& net) {
    static vector<String> names;
    if (names.empty()) {
        // 获取输出层的索引
        vector<int> outLayers = net.getUnconnectedOutLayers();
        // 获取网络中所有层的名称
        vector<String> layersNames = net.getLayerNames();
        // 获取names变量中输出层的名称
        names.resize(outLayers.size());
        for (size_t i = 0; i < outLayers.size(); ++i) {
            names[i] = layersNames[outLayers[i] - 1];
        }
    }
    return names;
}

int main(int argc, char** argv) {
    // 加载类别名
    string classesFile = "data/coco.names";
    ifstream ifs(classesFile.c_str());
    string line;
    while (getline(ifs, line)) classes.push_back(line);
    // 提供模型的配置和权重文件
    String modelConfiguration = "data/yolov3.cfg";
    String modelWeights = "data/yolov3.weights";
    // 加载网络
    Net net = readNetFromDarknet(modelConfiguration, modelWeights);
    net.setPreferableBackend(DNN_BACKEND_OPENCV);
    net.setPreferableTarget(DNN_TARGET_CPU);
    
    Mat input, blob;
    input= imread(argv[1]);
    if (input.empty()) {
        cout << "No input image" << endl;
        return 0;
    }
    // 创建输入
    blobFromImage(input, blob, 1/255.0, Size(inpWidth, inpHeight), Scalar(0,0,0), true, false);
    // 设定网络输入
    net.setInput(blob);
    // 执行前向传播
    vector<Mat> outs;
    net.forward(outs, getOutputsNames(net));
    // 移除低置信度边界框
    postprocess(input, outs);
    vector<double> layersTimes;
    double freq = getTickFrequency() / 1000;
    double t = net.getPerfProfile(layersTimes) / freq;
    string label = format("Inference time for compute the image : %.2f ms", t);
    cout << label << endl;
    
    imshow("YOLOv3", input);
    waitKey(0);
    return 0;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136