
TensorRT Conversion Tutorial Based on ResNet (ONNX to engine and wts to engine)

Converting ONNX to a TensorRT engine model

I recently wanted to verify the different paths for converting a PyTorch-trained model to TensorRT, and to better understand engine acceleration through the C++ API (the Python API is comparatively simple and is not covered here). This article uses the ResNet classification network.

The article is organized into six parts: the first builds the network in Python and produces the pt/wts/onnx files; the second builds the engine with the C++ API; the third builds the engine from ONNX in C++; the fourth reports performance on Windows and Linux (added 2022-09-14); the fifth covers verification; the sixth explains how to compile and run the engine on Linux.

Code: ResNet.zip

Link: https://pan.baidu.com/s/1ujX19IUV0EPSIMyIcBnClA?pwd=r63z
Extraction code: r63z

Version: TensorRT 8.4 (any version above 8.0 should work)

I. Obtaining wts and onnx files with torchvision (language: Python)

①. This code uses torchvision to obtain the ResNet18 classification weights and converts them to wts and onnx.

from torchvision.transforms import transforms
import torch
import torchvision.models as models
import struct

transform_train = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])
transforms_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

def build_model():
    model = models.resnet18(pretrained=True)
    model = model.eval()
    model = model.cuda()
    torch.save(model, "./resnet18.pth")

def get_wts(model_path='./resnet18.pth', save_wts_path="./resnet18.wts"):
    net = torch.load(model_path)
    net = net.cuda()
    net = net.eval()
    print('model: ', net)
    # print('state dict: ', net.state_dict().keys())
    tmp = torch.ones(1, 3, 224, 224).cuda()
    print('input: ', tmp)
    out = net(tmp)
    print('output:', out)
    f = open(save_wts_path, 'w')
    f.write("{}\n".format(len(net.state_dict().keys())))
    for k, v in net.state_dict().items():
        print('key: ', k)
        print('value: ', v.shape)
        vr = v.reshape(-1).cpu().numpy()
        f.write("{} {}".format(k, len(vr)))
        for vv in vr:
            f.write(" ")
            f.write(struct.pack(">f", float(vv)).hex())
        f.write("\n")

def get_onnx(model_path='./resnet18.pth', save_onnx_path="./resnet18.onnx"):
    # Export a static ONNX model; if the inference input shape differs from this one, it will fail
    input_data = torch.randn(2, 3, 224, 224).cuda()
    model = torch.load(model_path).cuda()
    input_names = ["data"] + ["called_%d" % i for i in range(2)]
    output_names = ["prob"]
    torch.onnx.export(
        model,
        input_data,
        save_onnx_path,
        verbose=True,
        input_names=input_names,
        output_names=output_names
    )

if __name__ == '__main__':
    # build_model()
    # get_wts(model_path='./resnet18.pth', save_wts_path="./resnet18.wts")
    get_onnx(model_path='./resnet18.pth', save_onnx_path="./resnet18.onnx")
ResNet classification (Python)

II. Converting ResNet classification to TensorRT with the C++ API (language: C++/TensorRT)

①. The following code converts the ResNet classifier to TensorRT; it compiles with the Visual Studio compiler.

resnet18.cpp:
#include "NvInfer.h"
#include "cuda_runtime_api.h"
//#include "logging.h"
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include <chrono>
#include <cmath>
#include <cassert>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/opencv.hpp>

using namespace std;

#define CHECK(status) \
    do\
    {\
        auto ret = (status);\
        if (ret != 0)\
        {\
            std::cerr << "Cuda failure: " << ret << std::endl;\
            abort();\
        }\
    } while (0)

// stuff we know about the network and the input/output blobs
static const int INPUT_H = 224;
static const int INPUT_W = 224;
static const int OUTPUT_SIZE = 1000;
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";

using namespace nvinfer1;

//static Logger gLogger;
// Build the logger
class Logger : public ILogger
{
    void log(Severity severity, const char* msg) noexcept override
    {
        // suppress info-level messages
        if (severity <= Severity::kWARNING)
            std::cout << msg << std::endl;
    }
} gLogger;

// Load weights from files shared with TensorRT samples.
// TensorRT weight files have a simple space delimited format:
// [type] [size] <data x size in hex>
std::map<std::string, Weights> loadWeights(const std::string file)
{
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;
    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");
    // Read number of weight blobs
    int32_t count;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");
    while (count--)
    {
        Weights wt{ DataType::kFLOAT, nullptr, 0 };
        uint32_t size;
        // Read name and type of blob
        std::string name;
        input >> name >> std::dec >> size;
        wt.type = DataType::kFLOAT;
        // Load blob
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(val) * size));
        for (uint32_t x = 0, y = size; x < y; ++x)
        {
            input >> std::hex >> val[x];
        }
        wt.values = val;
        wt.count = size;
        weightMap[name] = wt;
    }
    return weightMap;
}

IScaleLayer* addBatchNorm2d(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
    float* gamma = (float*)weightMap[lname + ".weight"].values;
    float* beta = (float*)weightMap[lname + ".bias"].values;
    float* mean = (float*)weightMap[lname + ".running_mean"].values;
    float* var = (float*)weightMap[lname + ".running_var"].values;
    int len = weightMap[lname + ".running_var"].count;
    std::cout << "len " << len << std::endl;

    float* scval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    for (int i = 0; i < len; i++) {
        scval[i] = gamma[i] / sqrt(var[i] + eps);
    }
    Weights scale{ DataType::kFLOAT, scval, len };

    float* shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    for (int i = 0; i < len; i++) {
        shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps);
    }
    Weights shift{ DataType::kFLOAT, shval, len };

    float* pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    for (int i = 0; i < len; i++) {
        pval[i] = 1.0;
    }
    Weights power{ DataType::kFLOAT, pval, len };

    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;
    IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(scale_1);
    return scale_1;
}

IActivationLayer* basicBlock(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, int inch, int outch, int stride, std::string lname) {
    Weights emptywts{ DataType::kFLOAT, nullptr, 0 };

    IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ 3, 3 }, weightMap[lname + "conv1.weight"], emptywts);
    assert(conv1);
    conv1->setStrideNd(DimsHW{ stride, stride });
    conv1->setPaddingNd(DimsHW{ 1, 1 });

    IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + "bn1", 1e-5);
    IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU);
    assert(relu1);

    IConvolutionLayer* conv2 = network->addConvolutionNd(*relu1->getOutput(0), outch, DimsHW{ 3, 3 }, weightMap[lname + "conv2.weight"], emptywts);
    assert(conv2);
    conv2->setPaddingNd(DimsHW{ 1, 1 });

    IScaleLayer* bn2 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + "bn2", 1e-5);

    IElementWiseLayer* ew1;
    if (inch != outch) {
        IConvolutionLayer* conv3 = network->addConvolutionNd(input, outch, DimsHW{ 1, 1 }, weightMap[lname + "downsample.0.weight"], emptywts);
        assert(conv3);
        conv3->setStrideNd(DimsHW{ stride, stride });
        IScaleLayer* bn3 = addBatchNorm2d(network, weightMap, *conv3->getOutput(0), lname + "downsample.1", 1e-5);
        ew1 = network->addElementWise(*bn3->getOutput(0), *bn2->getOutput(0), ElementWiseOperation::kSUM);
    }
    else {
        ew1 = network->addElementWise(input, *bn2->getOutput(0), ElementWiseOperation::kSUM);
    }
    IActivationLayer* relu2 = network->addActivation(*ew1->getOutput(0), ActivationType::kRELU);
    assert(relu2);
    return relu2;
}

// Create the engine using only the API and not any parser.
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, string wts_path = "../resnet18.wts")
{
    INetworkDefinition* network = builder->createNetworkV2(0U);

    // Create input tensor of shape { 3, INPUT_H, INPUT_W } with name INPUT_BLOB_NAME
    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{ 3, INPUT_H, INPUT_W });
    assert(data);

    std::map<std::string, Weights> weightMap = loadWeights(wts_path);
    Weights emptywts{ DataType::kFLOAT, nullptr, 0 };

    IConvolutionLayer* conv1 = network->addConvolutionNd(*data, 64, DimsHW{ 7, 7 }, weightMap["conv1.weight"], emptywts);
    assert(conv1);
    conv1->setStrideNd(DimsHW{ 2, 2 });
    conv1->setPaddingNd(DimsHW{ 3, 3 });

    IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), "bn1", 1e-5);
    IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU);
    assert(relu1);

    IPoolingLayer* pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{ 3, 3 });
    assert(pool1);
    pool1->setStrideNd(DimsHW{ 2, 2 });
    pool1->setPaddingNd(DimsHW{ 1, 1 });

    IActivationLayer* relu2 = basicBlock(network, weightMap, *pool1->getOutput(0), 64, 64, 1, "layer1.0.");
    IActivationLayer* relu3 = basicBlock(network, weightMap, *relu2->getOutput(0), 64, 64, 1, "layer1.1.");
    IActivationLayer* relu4 = basicBlock(network, weightMap, *relu3->getOutput(0), 64, 128, 2, "layer2.0.");
    IActivationLayer* relu5 = basicBlock(network, weightMap, *relu4->getOutput(0), 128, 128, 1, "layer2.1.");
    IActivationLayer* relu6 = basicBlock(network, weightMap, *relu5->getOutput(0), 128, 256, 2, "layer3.0.");
    IActivationLayer* relu7 = basicBlock(network, weightMap, *relu6->getOutput(0), 256, 256, 1, "layer3.1.");
    IActivationLayer* relu8 = basicBlock(network, weightMap, *relu7->getOutput(0), 256, 512, 2, "layer4.0.");
    IActivationLayer* relu9 = basicBlock(network, weightMap, *relu8->getOutput(0), 512, 512, 1, "layer4.1.");

    IPoolingLayer* pool2 = network->addPoolingNd(*relu9->getOutput(0), PoolingType::kAVERAGE, DimsHW{ 7, 7 });
    assert(pool2);
    pool2->setStrideNd(DimsHW{ 1, 1 });

    IFullyConnectedLayer* fc1 = network->addFullyConnected(*pool2->getOutput(0), 1000, weightMap["fc.weight"], weightMap["fc.bias"]);
    assert(fc1);

    fc1->getOutput(0)->setName(OUTPUT_BLOB_NAME);
    std::cout << "set name out" << std::endl;
    network->markOutput(*fc1->getOutput(0));

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(1 << 20);
    //config->setFlag(nvinfer1::BuilderFlag::kFP16);
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    std::cout << "build out" << std::endl;

    // Don't need the network any more
    network->destroy();

    // Release host memory
    for (auto& mem : weightMap)
    {
        free((void*)(mem.second.values));
    }
    return engine;
}

void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream)
{
    string wts_path = "./resnet18.wts";
    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();
    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT, wts_path);
    assert(engine != nullptr);
    // Serialize the engine
    (*modelStream) = engine->serialize();
    // Close everything down
    engine->destroy();
    builder->destroy();
    config->destroy();
}

void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();
    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];
    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));
    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));
    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
    context.enqueue(batchSize, buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    // Release stream and buffers
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

// Pack the image into the flat (1-D) batched layout that TensorRT expects as input
void ProcessImage(cv::Mat image, float input_data[]) {
    // Only one image is handled; the result is a 1-D array of [batch*3*INPUT_W*INPUT_H]
    // The preprocessing below is a shortcut (BGR->RGB and /255 only, no mean/std normalization)
    cv::resize(image, image, cv::Size(INPUT_W, INPUT_H), 0, 0, cv::INTER_LINEAR);
    std::vector<cv::Mat> InputImage;
    InputImage.push_back(image);
    int ImgCount = InputImage.size();
    //float input_data[BatchSize * 3 * INPUT_H * INPUT_W];
    for (int b = 0; b < ImgCount; b++) {
        cv::Mat img = InputImage.at(b);
        int w = img.cols;
        int h = img.rows;
        int i = 0;
        for (int row = 0; row < h; ++row) {
            uchar* uc_pixel = img.data + row * img.step;
            for (int col = 0; col < INPUT_W; ++col) {
                input_data[b * 3 * INPUT_H * INPUT_W + i] = (float)uc_pixel[2] / 255.0;
                input_data[b * 3 * INPUT_H * INPUT_W + i + INPUT_H * INPUT_W] = (float)uc_pixel[1] / 255.0;
                input_data[b * 3 * INPUT_H * INPUT_W + i + 2 * INPUT_H * INPUT_W] = (float)uc_pixel[0] / 255.0;
                uc_pixel += 3;
                ++i;
            }
        }
    }
}

int get_trtengine() {
    IHostMemory* modelStream{ nullptr };
    APIToModel(1, &modelStream);
    assert(modelStream != nullptr);

    std::ofstream p("./resnet18.engine", std::ios::binary);
    if (!p)
    {
        std::cerr << "could not open plan output file" << std::endl;
        return -1;
    }
    p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
    modelStream->destroy();
    return 0;
}

int infer() {
    // Load the serialized engine
    char* trtModelStream{ nullptr };
    size_t size{ 0 };
    std::ifstream file("./resnet18.engine", std::ios::binary);
    if (file.good()) {
        file.seekg(0, file.end);
        size = file.tellg();
        file.seekg(0, file.beg);
        trtModelStream = new char[size];
        assert(trtModelStream);
        file.read(trtModelStream, size);
        file.close();
    }

    // Deserialize into an engine and create the execution context
    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;

    //********************* inference *********************//
    // Inference loop
    float time_read_img = 0.0;
    float time_infer = 0.0;
    static float prob[OUTPUT_SIZE];
    for (int i = 0; i < 1000; i++) {
        // Preprocess the image into the fixed-size input
        auto start = std::chrono::system_clock::now();   // timing
        std::string path = "./1.jpg";
        std::cout << "img_path=" << path << endl;
        static float data[3 * INPUT_H * INPUT_W];
        cv::Mat img = cv::imread(path);
        ProcessImage(img, data);
        auto end = std::chrono::system_clock::now();
        time_read_img = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() + time_read_img;

        // Run inference
        start = std::chrono::system_clock::now();   // timing
        doInference(*context, data, prob, 1);
        end = std::chrono::system_clock::now();
        time_infer = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() + time_infer;
        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;

        // Post-process the output (argmax over the 1000 classes)
        //std::cout <<"prob="<<prob << std::endl;
        float cls_float = prob[0];
        int cls_id = 0;
        for (int j = 0; j < OUTPUT_SIZE; j++) {
            if (cls_float < prob[j]) {
                cls_float = prob[j];
                cls_id = j;
            }
        }
        std::cout << "i=" << i << "\tcls_id=" << cls_id << "\t cls_float=" << cls_float << std::endl;
    }
    std::cout << "C++2engine" << "mean read img time =" << time_read_img / 1000 << "ms\t" << "mean infer img time =" << time_infer / 1000 << "ms" << std::endl;

    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();
    return 0;
}

int main(int argc, char** argv)
{
    //string mode = argv[1];
    string mode = "-d";   // hard-coded mode, convenient when building on Windows
    //if (std::string(argv[1]) == "-s") {
    if (mode == "-s") {
        get_trtengine();
    }
    //else if (std::string(argv[1]) == "-d") {
    else if (mode == "-d") {
        infer();
    }
    else {
        return -1;
    }
    return 0;
}

②. If you need to build and run on Linux, the CMakeLists.txt is:

cmake_minimum_required(VERSION 2.6)
project(resnet)
add_definitions(-std=c++11)
option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)
include_directories(${PROJECT_SOURCE_DIR}/include)
# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
include_directories(/usr/include/x86_64-linux-gnu/)
link_directories(/usr/lib/x86_64-linux-gnu/)
add_executable(resnet18 ${PROJECT_SOURCE_DIR}/resnet18.cpp)
target_link_libraries(resnet18 nvinfer)
target_link_libraries(resnet18 cudart)
add_definitions(-O2 -pthread)

③. Prediction results in Visual Studio:

Both test images fall into the same broad category, so the conversion appears to be correct.

④. Prediction results on Linux:

III. Converting ResNet classification to TensorRT from ONNX with the C++ API (language: C++/TensorRT)

①. The following code converts the ResNet classifier to TensorRT via ONNX; it compiles with the Visual Studio compiler.

resnet18.cpp:
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include <chrono>
#include <cmath>
#include <cassert>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/opencv.hpp>
// header for the ONNX parser
#include "NvOnnxParser.h"

using namespace nvonnxparser;
using namespace std;

#define CHECK(status) \
    do\
    {\
        auto ret = (status);\
        if (ret != 0)\
        {\
            std::cerr << "Cuda failure: " << ret << std::endl;\
            abort();\
        }\
    } while (0)

// stuff we know about the network and the input/output blobs
static const int INPUT_H = 224;
static const int INPUT_W = 224;
static const int OUTPUT_SIZE = 1000;
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";

using namespace nvinfer1;

//static Logger gLogger;
// Build the logger
class Logger : public ILogger
{
    void log(Severity severity, const char* msg) noexcept override
    {
        // suppress info-level messages
        if (severity <= Severity::kWARNING)
            std::cout << msg << std::endl;
    }
} gLogger;

// Create the engine from the ONNX file using the parser (not the layer-by-layer API).
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config)
{
    const char* onnx_path = "./resnet18.onnx";

    INetworkDefinition* network = builder->createNetworkV2(1U);   // important: must be 1U (explicit batch); 0U does not work here
    IParser* parser = createParser(*network, gLogger);
    parser->parseFromFile(onnx_path, static_cast<int32_t>(ILogger::Severity::kWARNING));
    for (int32_t i = 0; i < parser->getNbErrors(); ++i) { std::cout << parser->getError(i)->desc() << std::endl; }
    std::cout << "successfully load the onnx model" << std::endl;

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(1 << 20);
    config->setFlag(nvinfer1::BuilderFlag::kFP16);   // precision flag
    //config->setFlag(nvinfer1::BuilderFlag::kINT8);
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    std::cout << "successfully create engine " << std::endl;

    // Clean up
    network->destroy();
    parser->destroy();
    return engine;
}

void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream)
{
    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();
    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = createEngine(maxBatchSize, builder, config);
    assert(engine != nullptr);
    // Serialize the engine
    (*modelStream) = engine->serialize();
    // Close everything down
    engine->destroy();
    builder->destroy();
    config->destroy();
}

void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();
    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];
    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));
    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));
    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
    context.enqueue(batchSize, buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    // Release stream and buffers
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

// Pack the image into the flat (1-D) batched layout that TensorRT expects as input
void ProcessImage(cv::Mat image, float input_data[]) {
    // Only one image is handled; the result is a 1-D array of [batch*3*INPUT_W*INPUT_H]
    // The preprocessing below is a shortcut (BGR->RGB and /255 only, no mean/std normalization)
    cv::resize(image, image, cv::Size(INPUT_W, INPUT_H), 0, 0, cv::INTER_LINEAR);
    std::vector<cv::Mat> InputImage;
    InputImage.push_back(image);
    int ImgCount = InputImage.size();
    //float input_data[BatchSize * 3 * INPUT_H * INPUT_W];
    for (int b = 0; b < ImgCount; b++) {
        cv::Mat img = InputImage.at(b);
        int w = img.cols;
        int h = img.rows;
        int i = 0;
        for (int row = 0; row < h; ++row) {
            uchar* uc_pixel = img.data + row * img.step;
            for (int col = 0; col < INPUT_W; ++col) {
                input_data[b * 3 * INPUT_H * INPUT_W + i] = (float)uc_pixel[2] / 255.0;
                input_data[b * 3 * INPUT_H * INPUT_W + i + INPUT_H * INPUT_W] = (float)uc_pixel[1] / 255.0;
                input_data[b * 3 * INPUT_H * INPUT_W + i + 2 * INPUT_H * INPUT_W] = (float)uc_pixel[0] / 255.0;
                uc_pixel += 3;
                ++i;
            }
        }
    }
}

int get_trtengine() {
    IHostMemory* modelStream{ nullptr };
    APIToModel(1, &modelStream);
    assert(modelStream != nullptr);

    std::ofstream p("./resnet18.engine", std::ios::binary);
    if (!p)
    {
        std::cerr << "could not open plan output file" << std::endl;
        return -1;
    }
    p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
    modelStream->destroy();
    return 0;
}

int infer() {
    // Load the serialized engine
    char* trtModelStream{ nullptr };
    size_t size{ 0 };
    std::ifstream file("./resnet18.engine", std::ios::binary);
    if (file.good()) {
        file.seekg(0, file.end);
        size = file.tellg();
        file.seekg(0, file.beg);
        trtModelStream = new char[size];
        assert(trtModelStream);
        file.read(trtModelStream, size);
        file.close();
    }

    // Deserialize into an engine and create the execution context
    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;

    //********************* inference *********************//
    // Inference loop
    float time_read_img = 0.0;
    float time_infer = 0.0;
    static float prob[OUTPUT_SIZE];
    for (int i = 0; i < 1000; i++) {
        // Preprocess the image into the fixed-size input
        auto start = std::chrono::system_clock::now();   // timing
        std::string path = "./1.jpg";
        std::cout << "img_path=" << path << endl;
        static float data[3 * INPUT_H * INPUT_W];
        cv::Mat img = cv::imread(path);
        ProcessImage(img, data);
        auto end = std::chrono::system_clock::now();
        time_read_img = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() + time_read_img;

        // Run inference
        start = std::chrono::system_clock::now();   // timing
        doInference(*context, data, prob, 1);
        end = std::chrono::system_clock::now();
        time_infer = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() + time_infer;
        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;

        // Post-process the output (argmax over the 1000 classes)
        //std::cout <<"prob="<<prob << std::endl;
        float cls_float = prob[0];
        int cls_id = 0;
        for (int j = 0; j < OUTPUT_SIZE; j++) {
            if (cls_float < prob[j]) {
                cls_float = prob[j];
                cls_id = j;
            }
        }
        std::cout << "i=" << i << "\tcls_id=" << cls_id << "\t cls_float=" << cls_float << std::endl;
    }
    std::cout << "C++2engine" << "mean read img time =" << time_read_img / 1000 << "ms\t" << "mean infer img time =" << time_infer / 1000 << "ms" << std::endl;

    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();
    return 0;
}

int main(int argc, char** argv)
{
    //string mode = argv[1];
    string mode = "-d";   // hard-coded mode, convenient when building on Windows
    //if (std::string(argv[1]) == "-s") {
    if (mode == "-s") {
        get_trtengine();
    }
    //else if (std::string(argv[1]) == "-d") {
    else if (mode == "-d") {
        infer();
    }
    else {
        return -1;
    }
    return 0;
}

ONNX-to-engine conversion on Windows with Visual Studio and TensorRT 8.4:

②. I also tried optimizing the ONNX model with onnx-simplifier. This model is already in its simplest form, but for models that can be simplified, inference would presumably be a bit faster. A sketch of that step is shown below.
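For reference, a minimal sketch of the simplification step using the onnxsim Python package might look like the following; the `resnet18_sim.onnx` output name is my own illustration, not part of the original project:

    # Hypothetical sketch: simplify the exported ONNX model with onnx-simplifier (pip install onnx onnxsim)
    import onnx
    from onnxsim import simplify

    model = onnx.load("./resnet18.onnx")          # ONNX file exported in Part I
    model_sim, ok = simplify(model)               # fold constants and remove redundant nodes
    assert ok, "simplified ONNX model failed the check"
    onnx.save(model_sim, "./resnet18_sim.onnx")   # use this file in place of resnet18.onnx if it passes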

IV. Performance testing

Performance results (test platform: Windows 10, CUDA 11.4, TensorRT 8.4, RTX 2060):

Performance results (test platform: Ubuntu 18.04, CUDA 11.3, TensorRT 8.2, RTX 2060) (added 2022-09-14):

Note: times are the average over 1000 images.

Note: the Windows 10 and Ubuntu results come from two separate machines. Image reading and preprocessing run on the CPU; they could later be moved to CUDA for additional speedup.

V. Verification

①. Convert ONNX to an engine in Python, then load and run it from C++.

Conclusion: on Windows this works! This is exciting, because it means an engine serialized from Python can be loaded directly from C++, with no need to build the engine in C++ at all.

Note: inference time became nearly 2x longer.

Python code to convert the ONNX model into an engine (note: use the same TensorRT version for conversion and for C++ inference):

onnx2engine.py
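The original listing of onnx2engine.py is not reproduced here; the sketch below, assuming the file names used above and the TensorRT 8.x Python API, shows roughly what such a script looks like (it mirrors the C++ parser path, and is not the author's exact code):

    # Hypothetical sketch of onnx2engine.py: build resnet18.engine from resnet18.onnx in Python
    import tensorrt as trt

    logger = trt.Logger(trt.Logger.WARNING)

    def onnx2engine(onnx_path="./resnet18.onnx", engine_path="./resnet18.engine"):
        builder = trt.Builder(logger)
        # Explicit batch, the same as createNetworkV2(1U) in the C++ code
        network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        parser = trt.OnnxParser(network, logger)
        with open(onnx_path, "rb") as f:
            if not parser.parse(f.read()):
                for i in range(parser.num_errors):
                    print(parser.get_error(i))
                raise RuntimeError("failed to parse " + onnx_path)
        config = builder.create_builder_config()
        config.max_workspace_size = 1 << 20
        # config.set_flag(trt.BuilderFlag.FP16)
        serialized = builder.build_serialized_network(network, config)
        with open(engine_path, "wb") as f:
            f.write(serialized)

    if __name__ == "__main__":
        onnx2engine()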

C++ inference code (the inference code from Part II or III also works); this version has been simplified:

#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include <chrono>
#include <cmath>
#include <cassert>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/opencv.hpp>

using namespace std;

#define CHECK(status) \
    do\
    {\
        auto ret = (status);\
        if (ret != 0)\
        {\
            std::cerr << "Cuda failure: " << ret << std::endl;\
            abort();\
        }\
    } while (0)

// stuff we know about the network and the input/output blobs
static const int INPUT_H = 224;
static const int INPUT_W = 224;
static const int OUTPUT_SIZE = 1000;
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";

using namespace nvinfer1;

//static Logger gLogger;
// Build the logger
class Logger : public ILogger
{
    void log(Severity severity, const char* msg) noexcept override
    {
        // suppress info-level messages
        if (severity <= Severity::kWARNING)
            std::cout << msg << std::endl;
    }
} gLogger;

void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();
    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];
    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));
    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));
    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
    context.enqueue(batchSize, buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    // Release stream and buffers
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

// Pack the image into the flat (1-D) batched layout that TensorRT expects as input
void ProcessImage(cv::Mat image, float input_data[]) {
    // Only one image is handled; the result is a 1-D array of [batch*3*INPUT_W*INPUT_H]
    // The preprocessing below is a shortcut (BGR->RGB and /255 only, no mean/std normalization)
    cv::resize(image, image, cv::Size(INPUT_W, INPUT_H), 0, 0, cv::INTER_LINEAR);
    std::vector<cv::Mat> InputImage;
    InputImage.push_back(image);
    int ImgCount = InputImage.size();
    //float input_data[BatchSize * 3 * INPUT_H * INPUT_W];
    for (int b = 0; b < ImgCount; b++) {
        cv::Mat img = InputImage.at(b);
        int w = img.cols;
        int h = img.rows;
        int i = 0;
        for (int row = 0; row < h; ++row) {
            uchar* uc_pixel = img.data + row * img.step;
            for (int col = 0; col < INPUT_W; ++col) {
                input_data[b * 3 * INPUT_H * INPUT_W + i] = (float)uc_pixel[2] / 255.0;
                input_data[b * 3 * INPUT_H * INPUT_W + i + INPUT_H * INPUT_W] = (float)uc_pixel[1] / 255.0;
                input_data[b * 3 * INPUT_H * INPUT_W + i + 2 * INPUT_H * INPUT_W] = (float)uc_pixel[0] / 255.0;
                uc_pixel += 3;
                ++i;
            }
        }
    }
}

int infer() {
    // Load the serialized engine
    char* trtModelStream{ nullptr };
    size_t size{ 0 };
    std::ifstream file("./resnet18.engine", std::ios::binary);
    if (file.good()) {
        file.seekg(0, file.end);
        size = file.tellg();
        file.seekg(0, file.beg);
        trtModelStream = new char[size];
        assert(trtModelStream);
        file.read(trtModelStream, size);
        file.close();
    }

    // Deserialize into an engine and create the execution context
    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;

    //********************* inference *********************//
    // Inference loop
    float time_read_img = 0.0;
    float time_infer = 0.0;
    static float prob[OUTPUT_SIZE];
    for (int i = 0; i < 1000; i++) {
        // Preprocess the image into the fixed-size input
        auto start = std::chrono::system_clock::now();   // timing
        std::string path = "./1.jpg";
        std::cout << "img_path=" << path << endl;
        static float data[3 * INPUT_H * INPUT_W];
        cv::Mat img = cv::imread(path);
        ProcessImage(img, data);
        auto end = std::chrono::system_clock::now();
        time_read_img = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() + time_read_img;

        // Run inference
        start = std::chrono::system_clock::now();   // timing
        doInference(*context, data, prob, 1);
        end = std::chrono::system_clock::now();
        time_infer = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() + time_infer;
        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;

        // Post-process the output (argmax over the 1000 classes)
        //std::cout <<"prob="<<prob << std::endl;
        float cls_float = prob[0];
        int cls_id = 0;
        for (int j = 0; j < OUTPUT_SIZE; j++) {
            if (cls_float < prob[j]) {
                cls_float = prob[j];
                cls_id = j;
            }
        }
        std::cout << "i=" << i << "\tcls_id=" << cls_id << "\t cls_float=" << cls_float << std::endl;
    }
    std::cout << "C++2engine" << "mean read img time =" << time_read_img / 1000 << "ms\t" << "mean infer img time =" << time_infer / 1000 << "ms" << std::endl;

    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();
    return 0;
}

int main(int argc, char** argv)
{
    infer();
    return 0;
}
resnet18_infer.cpp

Note: due to environment constraints I did not ultimately verify the ONNX-to-engine path on the server, but its CMakeLists can be adapted from the wts-to-engine one.

VI. Building the engine on Linux (added 2022-09-14)

This section describes how to compile and run in an Ubuntu (Linux) environment. I will call the network built with the C++ API "Cengine" and the one built from ONNX "Oengine". The focus here is on how to write the CMakeLists.txt for each:

CMakeLists.txt for Cengine:

cmake_minimum_required(VERSION 2.6)
project(resnet)
add_definitions(-std=c++11)
option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)
include_directories(${PROJECT_SOURCE_DIR}/include)
# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
include_directories(/home/ubuntu/soft/TensorRT-8.2.5.1/include/)
link_directories(/home/ubuntu/soft/TensorRT-8.2.5.1/lib/)
#include_directories(/usr/include/x86_64-linux-gnu/)
#link_directories(/usr/lib/x86_64-linux-gnu/)
# opencv
find_package(OpenCV REQUIRED)
include_directories(${OpenCV_INCLUDE_DIRS})
add_executable(resnet18 ${PROJECT_SOURCE_DIR}/main.cpp)
target_link_libraries(resnet18 nvinfer)
target_link_libraries(resnet18 cudart)
target_link_libraries(resnet18 ${OpenCV_LIBS})
add_definitions(-O2 -pthread)

CMakeLists.txt for Oengine:

cmake_minimum_required(VERSION 2.6)
project(resnet)
add_definitions(-std=c++11)
option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)
include_directories(${PROJECT_SOURCE_DIR}/include)
# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
include_directories(/home/ubuntu/soft/TensorRT-8.2.5.1/include/)
link_directories(/home/ubuntu/soft/TensorRT-8.2.5.1/lib/)
include_directories(/home/ubuntu/soft/TensorRT-8.2.5.1/samples/common/)
#link_directories(/home/ubuntu/soft/TensorRT-8.2.5.1/lib/stubs/)
# opencv
find_package(OpenCV REQUIRED)
include_directories(${OpenCV_INCLUDE_DIRS})
add_executable(resnet18 ${PROJECT_SOURCE_DIR}/main.cpp)
target_link_libraries(resnet18 nvinfer)
target_link_libraries(resnet18 cudart)
target_link_libraries(resnet18 ${OpenCV_LIBS})
target_link_libraries(resnet18 /home/ubuntu/soft/TensorRT-8.2.5.1/lib/stubs/libnvonnxparser.so)
add_definitions(-O2 -pthread)

The above are the CMakeLists files for building the engine from ONNX and from the C++ API. The differences are mainly in which libraries are linked and which headers are included; see other blogs or online material for further details.

Additional note: the ONNX CMakeLists.txt above has already been verified with yolov5 and yolov7; it compiles and runs.

The ResNet code has been covered above, so I will not repeat it here; the full code can be downloaded from the link shared at the top.

Test results:
