model=yolov8n.yaml uses the standard version, model=yolov8n-p2.yaml the small-object detection version, and model=yolov8n-p6.yaml the high-resolution version. A rough training invocation is sketched below.
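A rough training sketch, assuming the ultralytics CLI (the dataset yaml and the hyperparameters here are placeholders, not from the original post):

yolo detect train model=yolov8n-p2.yaml data=coco128.yaml epochs=100 imgsz=640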
Download address for the pretrained .pt files:
https://github.com/ultralytics/assets/releases
Screenshot of the training/export results: it shows the input and output shapes, the ONNX opset, CUDA, Python, and torch versions, and the detection time.
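For reference, a minimal export command that produces such an ONNX file, assuming the ultralytics CLI (the opset and imgsz values are assumptions; match them to your own environment and to the 1280×1280 input used below):

yolo export model=yolov8n-seg.pt format=onnx opset=12 imgsz=1280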
Opening yolov8n-seg.onnx with Netron shows the model's inputs and outputs.
Input node name: "images"; data: float32[1,3,1280,1280]
[One special note: some CSDN articles show an output different from this one, namely "output0" with data float32[1,116,8400] (the standard 640×640, 80-class seg model), where 116 = 4 + 80 + 32: 4 for the box cx cy w h, 80 for the per-class confidences, and 32 for the mask coefficients used by segmentation.]
Output node 1 name: "output0"; data: float32[1,37,33600], where 37 = 4 + 1 + 32: 4 for the box cx cy w h, 1 for the confidence of the single class, and 32 mask coefficients used to compute the mask data. (The 33600 candidates come from the three detection scales of a 1280×1280 input: 160² + 80² + 40² = 25600 + 6400 + 1600 = 33600; at 640×640 the same sum gives 80² + 40² + 20² = 8400.)
Output node 2 name: "output1"; data: float32[1,32,320,320] for this 1280×1280 model (the prototype map is input/4, so a 640×640 model gives [1,32,160,160]; the code below uses _segWidth = _segHeight = 320 accordingly). Taking the matrix product of the last 32 fields of an output0 candidate with the output1 data yields the mask data of the corresponding object.
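To make that matrix product concrete, here is a minimal sketch, not the full pipeline below. coeffPtr and protoPtr are hypothetical pointers; note that in the raw [1,37,33600] layout one box's 32 coefficients are strided by Num_box, so they must first be gathered into a contiguous row (the full code below does this with a column Rect plus a transpose):

// Minimal sketch: mask of one detection = sigmoid(coeffs x protos).
// coeffPtr: the detection's 32 mask coefficients, already gathered contiguously.
// protoPtr: output1 data treated as a 32 x (segW*segH) matrix.
#include <opencv2/opencv.hpp>

cv::Mat maskForOneBox(const float* coeffPtr, const float* protoPtr,
                      int segW = 320, int segH = 320)
{
    cv::Mat coeffs(1, 32, CV_32F, const_cast<float*>(coeffPtr));           // 1 x 32
    cv::Mat protos(32, segW * segH, CV_32F, const_cast<float*>(protoPtr)); // 32 x (segW*segH)
    cv::Mat m = coeffs * protos;      // mathematical matrix product: 1 x (segW*segH)
    cv::exp(-m, m);
    m = 1.0 / (1.0 + m);              // sigmoid
    return m.reshape(1, segH) > 0.5f; // segH x segW binary mask
}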
The detection result is shown above: 25 ms.
See also: https://blog.csdn.net/qq_41043389/article/details/128682057
// Generate a TensorRT engine from ONNX -- with the memory releases added
#include <iostream>
#include <fstream>
#include "logging.h"
#include "NvOnnxParser.h"
#include "NvInfer.h"

using namespace nvinfer1;
using namespace nvonnxparser;

static Logger gLogger;

int main(int argc, char** argv)
{
    // default paths, used when no command-line arguments are given
    const char* onnxFile = "E:/firefoxDown/yolov8-instance-seg-tensorrt-main/build/best_cu102_rail.onnx";
    const char* engineFile = "yolov8s_blob_best_rail.engine";
    if (argc >= 3) {
        onnxFile = argv[1];
        engineFile = argv[2];
    }

    // 1. Create the builder, network, and ONNX parser
    IBuilder* builder = createInferBuilder(gLogger);
    const auto explicitBatch = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    INetworkDefinition* network = builder->createNetworkV2(explicitBatch);
    nvonnxparser::IParser* parser = nvonnxparser::createParser(*network, gLogger);
    if (!parser->parseFromFile(onnxFile, static_cast<int>(Logger::Severity::kWARNING))) {
        for (int i = 0; i < parser->getNbErrors(); ++i) {
            std::cout << parser->getError(i)->desc() << std::endl;
        }
        std::cerr << "failed to parse the onnx model" << std::endl;
        return -1;
    }
    std::cout << "successfully loaded the onnx model" << std::endl;

    // 2. Build the engine
    unsigned int maxBatchSize = 1;
    builder->setMaxBatchSize(maxBatchSize);
    IBuilderConfig* config = builder->createBuilderConfig();
    config->setMaxWorkspaceSize(1 << 20); // 1MB
    //config->setMaxWorkspaceSize(128 * (1 << 20)); // 128MB
    config->setFlag(BuilderFlag::kFP16);
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    if (!engine) {
        std::cerr << "failed to build the engine" << std::endl;
        return -1;
    }

    // 3. Serialize the model
    IHostMemory* gieModelStream = engine->serialize();
    std::ofstream p(engineFile, std::ios::binary);
    if (!p) {
        std::cerr << "could not open plan output file" << std::endl;
        return -1;
    }
    p.write(reinterpret_cast<const char*>(gieModelStream->data()), gieModelStream->size());
    gieModelStream->destroy();
    std::cout << "successfully generated the trt engine model" << std::endl;

    parser->destroy();
    config->destroy();
    engine->destroy();
    network->destroy();
    builder->destroy();
    return 0;
}
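As an aside (not in the original post): if the trtexec tool bundled with TensorRT is available, the same ONNX-to-engine conversion can usually be done without writing any code:

trtexec --onnx=best_cu102_rail.onnx --saveEngine=yolov8s_blob_best_rail.engine --fp16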
// Inference: adjust the parameters below to match your own ONNX structure
#include "NvInfer.h" #include "cuda_runtime_api.h" #include "NvInferPlugin.h" #include "logging.h" #include <opencv2/opencv.hpp> #include "utils.h" #include <string> using namespace nvinfer1; using namespace cv; // stuff we know about the network and the input/output blobs static const int INPUT_H = 640*2; static const int INPUT_W = 640*2; static const int _segWidth = 160*2; static const int _segHeight = 160*2; static const int _segChannels = 32; static const int CLASSES = 1;//80 static const int Num_box = 33600;// 8400; static const int OUTPUT_SIZE = Num_box * (CLASSES+4 + _segChannels);//output0 static const int OUTPUT_SIZE1 = _segChannels * _segWidth * _segHeight ;//output1 static const float CONF_THRESHOLD = 0.1; static const float NMS_THRESHOLD = 0.5; static const float MASK_THRESHOLD = 0.5; const char* INPUT_BLOB_NAME = "images"; const char* OUTPUT_BLOB_NAME = "output0";//detect const char* OUTPUT_BLOB_NAME1 = "output1";//mask // #define fengeflag struct OutputSeg { int id; //结果类别id float confidence; //结果置信度 cv::Rect box; //矩形框 cv::Mat boxMask; //矩形框内mask,节省内存空间和加快速度 }; void DrawPred(Mat& img,std:: vector<OutputSeg> result) { //生成随机颜色 std::vector<Scalar> color; srand(time(0)); for (int i = 0; i < CLASSES; i++) { int b = rand() % 256; int g = rand() % 256; int r = rand() % 256; color.push_back(Scalar(b, g, r)); } Mat mask = img.clone(); for (int i = 0; i < result.size(); i++) { int left, top; left = result[i].box.x; top = result[i].box.y; int color_num = i; rectangle(img, result[i].box, color[result[i].id], 2, 8); mask(result[i].box).setTo(color[result[i].id], result[i].boxMask); char label[100]; sprintf(label, "%d:%.2f", result[i].id, result[i].confidence); //std::string label = std::to_string(result[i].id) + ":" + std::to_string(result[i].confidence); int baseLine; Size labelSize = getTextSize(label, FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); top = max(top, labelSize.height); putText(img, label, Point(left, top), FONT_HERSHEY_SIMPLEX, 1, color[result[i].id], 2); } addWeighted(img, 0.5, mask, 0.8, 1, img); //将mask加在原图上面 } static Logger gLogger; void doInference(IExecutionContext& context, float* input, float* output, float* output1, int batchSize,bool fengeFlag) { const ICudaEngine& engine = context.getEngine(); // Pointers to input and output device buffers to pass to engine. // Engine requires exactly IEngine::getNbBindings() number of buffers. assert(engine.getNbBindings() == 3); void* buffers[3]; // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME); const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); int outputIndex1 = 0; if (fengeFlag) outputIndex1 = engine.getBindingIndex(OUTPUT_BLOB_NAME1); // Create GPU buffers on device CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float)));// CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float))); if (fengeFlag) CHECK(cudaMalloc(&buffers[outputIndex1], batchSize * OUTPUT_SIZE1 * sizeof(float))); // cudaMalloc分配内存 cudaFree释放内存 cudaMemcpy或 cudaMemcpyAsync 在主机和设备之间传输数据 // cudaMemcpy cudaMemcpyAsync 显式地阻塞传输 显式地非阻塞传输 // Create stream cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueue(batchSize, buffers, stream, nullptr); CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); if (fengeFlag) CHECK(cudaMemcpyAsync(output1, buffers[outputIndex1], batchSize * OUTPUT_SIZE1 * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); // Release stream and buffers cudaStreamDestroy(stream); CHECK(cudaFree(buffers[inputIndex])); CHECK(cudaFree(buffers[outputIndex])); if (fengeFlag) CHECK(cudaFree(buffers[outputIndex1])); } int main(int argc, char** argv) { if (argc < 2) { argv[1] = "E:/firefoxDown/yolov8-instance-seg-tensorrt-main/gitupdate/yolov8-TensorRT/build/yolov8s_blob_best_rail.engine"; argv[2] = "E:/E/testyolo/rail_data/images/val/rail_10.jpg"; } // create a model using the API directly and serialize it to a stream char* trtModelStream{ nullptr }; //char* trtModelStream==nullptr; 开辟空指针后 要和new配合使用,比如89行 trtModelStream = new char[size] size_t size{ 0 };//与int固定四个字节不同有所不同,size_t的取值range是目标平台下最大可能的数组尺寸,一些平台下size_t的范围小于int的正数范围,又或者大于unsigned int. 
使用Int既有可能浪费,又有可能范围不够大。 std::ifstream file(argv[1], std::ios::binary); if (file.good()) { std::cout << "load engine success" << std::endl; file.seekg(0, file.end);//指向文件的最后地址 size = file.tellg();//把文件长度告诉给size file.seekg(0, file.beg);//指回文件的开始地址 trtModelStream = new char[size];//开辟一个char 长度是文件的长度 assert(trtModelStream); file.read(trtModelStream, size);//将文件内容传给trtModelStream file.close();//关闭 } else { std::cout << "load engine failed" << std::endl; return 1; } Mat src = imread(argv[2], 1); if (src.empty()) { std::cout << "image load faild" << std::endl; return 1; } int img_width = src.cols; int img_height = src.rows; // Subtract mean from image static float data[3 * INPUT_H * INPUT_W]; Mat pr_img0, pr_img; std::vector<int> padsize; pr_img = preprocess_img(src, INPUT_H, INPUT_W, padsize); // Resize int newh = padsize[0], neww = padsize[1], padh = padsize[2], padw = padsize[3]; float ratio_h = (float)src.rows / newh; float ratio_w = (float)src.cols / neww; int i = 0;// [1,3,INPUT_H,INPUT_W] for (int row = 0; row < INPUT_H; ++row) { uchar* uc_pixel = pr_img.data + row * pr_img.step;//pr_img.step=widthx3 就是每一行有width个3通道的值 for (int col = 0; col < INPUT_W; ++col) { data[i] = (float)uc_pixel[2] / 255.0; data[i + INPUT_H * INPUT_W] = (float)uc_pixel[1] / 255.0; data[i + 2 * INPUT_H * INPUT_W] = (float)uc_pixel[0] / 255.; uc_pixel += 3; ++i; } } IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); bool didInitPlugins = initLibNvInferPlugins(nullptr, ""); ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr); assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); delete[] trtModelStream; // Run inference static float prob[OUTPUT_SIZE]; static float prob1[OUTPUT_SIZE1]; auto start = std::chrono::system_clock::now(); doInference(*context, data, prob, prob1, 1,true); auto end = std::chrono::system_clock::now(); //std::cout << "推理时间:" << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl; std::vector<int> classIds;//结果id数组 std::vector<float> confidences;//结果每个id对应置信度数组 std::vector<cv::Rect> boxes;//每个id矩形框 std::vector<cv::Mat> picked_proposals; //后续计算mask // 处理box int net_length = CLASSES + 4 + _segChannels; cv::Mat out1 = cv::Mat(net_length, Num_box, CV_32F, prob); start = std::chrono::system_clock::now(); for (int i = 0; i < Num_box; i++) { //输出是1*net_length*Num_box;所以每个box的属性是每隔Num_box取一个值,共net_length个值 cv::Mat scores = out1(Rect(i, 4, 1, CLASSES)).clone(); Point classIdPoint; double max_class_socre; minMaxLoc(scores, 0, &max_class_socre, 0, &classIdPoint); max_class_socre = (float)max_class_socre; if (max_class_socre >= CONF_THRESHOLD) { cv::Mat temp_proto = out1(Rect(i, 4 + CLASSES, 1, _segChannels)).clone(); picked_proposals.push_back(temp_proto.t()); float x = (out1.at<float>(0, i) - padw) * ratio_w; //cx float y = (out1.at<float>(1, i) - padh) * ratio_h; //cy float w = out1.at<float>(2, i) * ratio_w; //w float h = out1.at<float>(3, i) * ratio_h; //h int left = MAX((x - 0.5 * w), 0); int top = MAX((y - 0.5 * h), 0); int width = (int)w; int height = (int)h; if (width <= 0 || height <= 0) { continue; } classIds.push_back(classIdPoint.y); confidences.push_back(max_class_socre); boxes.push_back(Rect(left, top, width, height)); } } //执行非最大抑制以消除具有较低置信度的冗余重叠框(NMS) std::vector<int> nms_result; cv::dnn::NMSBoxes(boxes, confidences, CONF_THRESHOLD, NMS_THRESHOLD, nms_result); std::vector<cv::Mat> temp_mask_proposals; std::vector<OutputSeg> output; 
Rect holeImgRect(0, 0, src.cols, src.rows); for (int i = 0; i < nms_result.size(); ++i) { int idx = nms_result[i]; OutputSeg result; result.id = classIds[idx]; result.confidence = confidences[idx]; result.box = boxes[idx]& holeImgRect; output.push_back(result); temp_mask_proposals.push_back(picked_proposals[idx]); } // 处理mask Mat maskProposals; for (int i = 0; i < temp_mask_proposals.size(); ++i) maskProposals.push_back(temp_mask_proposals[i]); Mat protos = Mat(_segChannels, _segWidth * _segHeight, CV_32F, prob1); Mat matmulRes = (maskProposals * protos).t();//n*32 32*25600 A*B是以数学运算中矩阵相乘的方式实现的,要求A的列数等于B的行数时 Mat masks = matmulRes.reshape(output.size(), { _segWidth,_segHeight });//n*160*160 std::vector<Mat> maskChannels; cv::split(masks, maskChannels); Rect roi(int((float)padw / INPUT_W * _segWidth), int((float)padh / INPUT_H * _segHeight), int(_segWidth - padw / 2), int(_segHeight - padh / 2)); for (int i = 0; i < output.size(); ++i) { Mat dest, mask; cv::exp(-maskChannels[i], dest);//sigmoid dest = 1.0 / (1.0 + dest);// dest = dest(roi); resize(dest, mask, cv::Size(src.cols, src.rows), INTER_NEAREST); //crop----截取box中的mask作为该box对应的mask Rect temp_rect = output[i].box; mask = mask(temp_rect) > MASK_THRESHOLD; output[i].boxMask = mask; } end = std::chrono::system_clock::now(); std::cout << "后处理时间:" << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl; DrawPred(src, output); imwrite("dest.bmp", src); cv::imshow("output.jpg", src); char c = cv::waitKey(0); // Destroy the engine context->destroy(); engine->destroy(); runtime->destroy(); return 0; }
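Note that preprocess_img comes from the repo's utils.h, which is not shown in this post. Judging from how padsize = {newh, neww, padh, padw} is consumed above, it is a standard letterbox resize; a minimal sketch of an equivalent implementation (an assumption, not the original utils.h) could be:

// Hypothetical letterbox: scale keeping the aspect ratio, pad with gray,
// and return {newh, neww, padh, padw} through padsize.
#include <opencv2/opencv.hpp>
#include <algorithm>
#include <vector>

cv::Mat preprocess_img(const cv::Mat& img, int input_h, int input_w, std::vector<int>& padsize)
{
    float r = std::min((float)input_w / img.cols, (float)input_h / img.rows);
    int neww = (int)(img.cols * r);
    int newh = (int)(img.rows * r);
    int padw = (input_w - neww) / 2;
    int padh = (input_h - newh) / 2;

    cv::Mat resized;
    cv::resize(img, resized, cv::Size(neww, newh));
    cv::Mat out(input_h, input_w, CV_8UC3, cv::Scalar(114, 114, 114)); // gray borders
    resized.copyTo(out(cv::Rect(padw, padh, neww, newh)));

    padsize = { newh, neww, padh, padw };
    return out;
}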