This post covers TensorRT acceleration of YOLOv5 using the C++ tensorrtx project. Many thanks to the open-source authors for their generous work.
Prerequisites: the environment required by YOLOv5 is already set up; the Jetson Nano ships with TensorRT out of the box.
Now to the main topic, starting with how to use the tensorrtx project.
Step 1: generate the .wts file needed later. Prepare the tensorrtx yolov5 code, the official yolov5 code, and the model weights (the official yolov5s.pt is used as the example), then copy gen_wts.py from the tensorrtx code into the yolov5 code folder. Run
python3 gen_wts.py -w yolov5s.pt -o yolov5s.wts
to generate the .wts file. Note that on the Jetson Nano the plain python command still points to Python 2.x, while the YOLOv5 code expects Python 3.x, so invoke python3 rather than python.
Step 2: build the tensorrtx project with cmake and run it.
Create a build folder under the tensorrtx yolov5 code and copy the .wts file generated in Step 1 into it. Enter the build folder and run the following commands to build the project:
cmake ..
make
This produces the yolov5 executable.
Then use the following command to serialize the TensorRT engine file (the trailing s tells tensorrtx to build the yolov5s variant):
sudo ./yolov5 -s yolov5s.wts yolov5s.engine s
Next, prepare some test images; the images shipped with yolov5 are used here. Delete the samples folder in the tensorrtx yolov5 directory, recreate an empty samples folder, and copy the test images from the yolov5 repository into it.
(I found that my cp command here was wrong and copied the images into the tensorrtx root directory; make sure they actually end up inside the samples folder.)
Now run the executable against the engine file generated by tensorrtx (remember to go back to the build directory first):
sudo ./yolov5 -d yolov5s.engine ../samples
The model has not been warmed up, so it is normal for the first image to take noticeably longer.
First, the Qt .pro file; below is the configuration for the Jetson Nano environment. Briefly: besides the libraries and headers needed for OpenCV, Qt, libtorch and TensorRT, one very important part is compiling the .cu file. Some tensorrtx functions use CUDA for GPU-accelerated preprocessing, so the project must be configured with rules for compiling CUDA source files.
#-------------------------------------------------
#
# Project created by QtCreator 2022-04-06T13:51:43
#
#-------------------------------------------------

QT       += core gui

greaterThan(QT_MAJOR_VERSION, 4): QT += widgets

TARGET = guiTensorRT
TEMPLATE = app

# The following define makes your compiler emit warnings if you use
# any feature of Qt which has been marked as deprecated (the exact warnings
# depend on your compiler). Please consult the documentation of the
# deprecated API in order to know how to port your code away from it.
DEFINES += QT_DEPRECATED_WARNINGS

# In order to do so, uncomment the following line.
# You can also select to disable deprecated APIs only up to a certain version of Qt.
#DEFINES += QT_DISABLE_DEPRECATED_BEFORE=0x060000    # disables all the APIs deprecated before Qt 6.0.0

SOURCES += \
    main.cpp \
    gui.cpp \
    yolov5tr.cpp \
    common.cpp

HEADERS += \
    gui.h \
    yolov5tr.h \
    common.h

# opencv
INCLUDEPATH += \
    /usr/include/opencv4/opencv2 \
    /usr/include/opencv4

LIBS += /usr/lib/aarch64-linux-gnu/libopencv*

# libtorch
#INCLUDEPATH += \
#    /home/nvidia/.local/lib/python3.6/site-packages/torch/include/torch/csrc/api/include \
#    /home/nvidia/.local/lib/python3.6/site-packages/torch/include
#LIBS += \
#    /home/nvidia/.local/lib/python3.6/site-packages/torch/lib/*.so \
#    -L/home/nvidia/.local/lib/python3.6/site-packages/torch/lib \
#    -Wl,--no-as-needed -ltorch_cuda    # force to link torch_cuda

INCLUDEPATH += \
    /usr/local/cuda-10.2/targets/aarch64-linux/include/ \
    /usr/local/cuda-10.2/include/ \
    /usr/include/aarch64-linux-gnu \
    /usr/src/tensorrt/samples/common/

LIBS += \
    /usr/local/cuda-10.2/targets/aarch64-linux/lib/*.so \
    /usr/local/cuda-10.2/lib64/*.so \
    /usr/lib/aarch64-linux-gnu/*.so    # cannot link opencv lib twice

# tensorRT
INCLUDEPATH += \
    /home/nvidia/Desktop/yolov5-tensorRT

LIBS += \
    /home/nvidia/Desktop/yolov5-tensorRT/build/*.so \
    -L/home/nvidia/Desktop/yolov5-tensorRT/build/

# cuda
# yolov5 tensorrt dir
HEADERS += \
    /home/nvidia/Desktop/yolov5-tensorRT/preprocess.h

CUDA_SOURCES += \
    /home/nvidia/Desktop/yolov5-tensorRT/preprocess.cu

CUDA_SDK = "/usr/local/cuda-10.2/"    # Path to cuda SDK install
CUDA_DIR = "/usr/local/cuda-10.2/"    # Path to cuda toolkit install

# DO NOT EDIT BEYOND THIS UNLESS YOU KNOW WHAT YOU ARE DOING....
SYSTEM_NAME = ubuntu          # Depending on your system either 'Win32', 'x64', or 'Win64'
SYSTEM_TYPE = 64              # '32' or '64', depending on your system
CUDA_ARCH = sm_53             # Type of CUDA architecture, for example 'compute_10', 'compute_11', 'sm_10' 'sm_50'
# https://blog.csdn.net/lb1244206405/article/details/90718040
NVCC_OPTIONS = --use_fast_math

# include paths
INCLUDEPATH += $$CUDA_DIR/include

# library directories
QMAKE_LIBDIR += $$CUDA_DIR/lib64/

CUDA_OBJECTS_DIR = ./

# Add the necessary libraries
CUDA_LIBS = -lcuda -lcudart

# The following makes sure all path names (which often include spaces) are put between quotation marks
CUDA_INC = $$join(INCLUDEPATH,'" -I"','-I"','"')
#LIBS += $$join(CUDA_LIBS,'.so ', '', '.so')
LIBS += $$CUDA_LIBS

# Configuration of the Cuda compiler
CONFIG(debug, debug|release) {
    # Debug mode
    cuda_d.input = CUDA_SOURCES
    cuda_d.output = $$CUDA_OBJECTS_DIR/${QMAKE_FILE_BASE}.o
    cuda_d.commands = $$CUDA_DIR/bin/nvcc -D_DEBUG $$NVCC_OPTIONS $$CUDA_INC $$NVCC_LIBS --machine $$SYSTEM_TYPE -arch=$$CUDA_ARCH -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}
    cuda_d.dependency_type = TYPE_C
    QMAKE_EXTRA_COMPILERS += cuda_d
} else {
    # Release mode
    cuda.input = CUDA_SOURCES
    cuda.output = $$CUDA_OBJECTS_DIR/${QMAKE_FILE_BASE}.o
    cuda.commands = $$CUDA_DIR/bin/nvcc $$NVCC_OPTIONS $$CUDA_INC $$NVCC_LIBS --machine $$SYSTEM_TYPE -arch=$$CUDA_ARCH -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}
    cuda.dependency_type = TYPE_C
    QMAKE_EXTRA_COMPILERS += cuda
}
Based on tensorrtx's yolov5.cpp, I then wrapped a yolov5 class to make it easier to use in a project: the model is loaded once and kept in memory, so there is no need to reload it for every inference. A short usage sketch follows the implementation below.
yolov5tr.h
#ifndef YOLOV5TR_H
#define YOLOV5TR_H

#include <iostream>
#include <chrono>
#include <cmath>
#include "cuda_utils.h"
#include "logging.h"
#include "yololayer.h"
#include "utils.h"
#include "calibrator.h"
#include "preprocess.h"
#include "common.h"

using namespace cv;

class yolov5TR
{
private:
    void doInference(IExecutionContext& context, cudaStream_t& stream, void **buffers, float* output, int batchSize);
    ICudaEngine* engine = nullptr;
    IExecutionContext* context = nullptr;
    IRuntime* runtime = nullptr;
    float* buffers[2];
    uint8_t* img_host = nullptr;
    uint8_t* img_device = nullptr;
    cudaStream_t stream;
    int inputIndex;
    int outputIndex;
    std::string engineName;

public:
    yolov5TR();
    ~yolov5TR();
    yolov5TR(std::string engine_name);
    std::vector<Yolo::Detection> DoInference(Mat img, clock_t* allTime);
};

#endif // YOLOV5TR_H
#include "yolov5tr.h" #define USE_FP16 // set USE_INT8 or USE_FP16 or USE_FP32 #define DEVICE 0 // GPU id #define NMS_THRESH 0.5 #define CONF_THRESH 0.1 #define BATCH_SIZE 1 #define MAX_IMAGE_INPUT_SIZE_THRESH 3000 * 3000 // ensure it exceed the maximum size in the input images ! // stuff we know about the network and the input/output blobs static const int INPUT_H = Yolo::INPUT_H; static const int INPUT_W = Yolo::INPUT_W; static const int CLASS_NUM = Yolo::CLASS_NUM; static const int OUTPUT_SIZE = Yolo::MAX_OUTPUT_BBOX_COUNT * sizeof(Yolo::Detection) / sizeof(float) + 1; // we assume the yololayer outputs no more than MAX_OUTPUT_BBOX_COUNT boxes that conf >= 0.1 const char* YOLOv5_INPUT_BLOB_NAME = "data"; const char* YOLOv5_OUTPUT_BLOB_NAME = "prob"; class Logger : public ILogger { void log(Severity severity, const char* msg) noexcept override { // suppress info-level messages if (severity <= Severity::kWARNING) std::cout << msg << std::endl; } }; static Logger gLogger; yolov5TR::~yolov5TR(){ // Release stream and buffers cudaStreamDestroy(stream); CUDA_CHECK(cudaFree(img_device)); CUDA_CHECK(cudaFreeHost(img_host)); CUDA_CHECK(cudaFree(buffers[inputIndex])); CUDA_CHECK(cudaFree(buffers[outputIndex])); // Destroy the engine context->destroy(); engine->destroy(); runtime->destroy(); std::cout<<"deleted"<<std::endl; } yolov5TR::yolov5TR(std::string engine_name){ engineName = engine_name; std::ifstream fin(engine_name); std::string cached_engine = ""; while (fin.peek() != EOF){ std::stringstream buffer; buffer << fin.rdbuf(); cached_engine.append(buffer.str()); } fin.close(); runtime = createInferRuntime(gLogger); assert(runtime != nullptr); this->engine = runtime->deserializeCudaEngine(cached_engine.data(), cached_engine.size()); assert(engine != nullptr); context = engine->createExecutionContext(); assert(context != nullptr); assert(engine->getNbBindings() == 2); inputIndex = engine->getBindingIndex(YOLOv5_INPUT_BLOB_NAME); outputIndex = engine->getBindingIndex(YOLOv5_OUTPUT_BLOB_NAME); assert(inputIndex == 0); assert(outputIndex == 1); //Create GPU buffers on device CUDA_CHECK(cudaMalloc((void**)&buffers[inputIndex], BATCH_SIZE * 3 * INPUT_H * INPUT_W * sizeof(float))); CUDA_CHECK(cudaMalloc((void**)&buffers[outputIndex], BATCH_SIZE * OUTPUT_SIZE * sizeof(float))); // Create stream CUDA_CHECK(cudaStreamCreate(&stream)); CUDA_CHECK(cudaMallocHost((void**)&img_host, MAX_IMAGE_INPUT_SIZE_THRESH * 3)); CUDA_CHECK(cudaMalloc((void**)&img_device, MAX_IMAGE_INPUT_SIZE_THRESH * 3)); } void yolov5TR::doInference(IExecutionContext& context, cudaStream_t& stream, void **buffers, float* output, int batchSize) { // infer on the batch asynchronously, and DMA output back to host context.enqueue(batchSize, buffers, stream, nullptr); CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); } std::vector<Yolo::Detection> yolov5TR::DoInference(Mat img,clock_t*allTime){ float* buffer_idx = (float*)buffers[inputIndex]; size_t size_image = img.cols * img.rows * 3; //copy data to pinned memory memcpy(img_host,img.data,size_image); //copy data to device memory CUDA_CHECK(cudaMemcpyAsync(img_device,img_host,size_image,cudaMemcpyHostToDevice,stream)); preprocess_kernel_img(img_device, img.cols, img.rows, buffer_idx, INPUT_W, INPUT_H, stream); // Run inference auto start = std::chrono::system_clock::now(); static float prob[OUTPUT_SIZE]; doInference(*context, stream, (void**)buffers, prob, BATCH_SIZE); auto end = 
std::chrono::system_clock::now(); std::cout << "inference time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl; *allTime = *allTime + std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count(); std::vector<std::vector<Yolo::Detection>> batch_res(1); auto& res1 = batch_res[0]; nms(res1, prob, CONF_THRESH, NMS_THRESH); auto& res = batch_res[0]; for (size_t j = 0; j < res.size(); j++) { cv::Rect r = get_rect(img, res[j].bbox); cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2); cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2); } return res; }
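To show how the class is meant to be used, here is a minimal usage sketch: the engine is deserialized and the buffers are allocated once in the constructor, and DoInference() is then called repeatedly on frames. The main() below is not part of my project; the engine path and the image file names are placeholders chosen for illustration.

// Minimal usage sketch (hypothetical main.cpp): load the engine once, reuse it for every frame.
#include <opencv2/opencv.hpp>
#include "yolov5tr.h"

int main()
{
    // Deserialize the engine and allocate GPU buffers once; the path is a placeholder.
    yolov5TR detector("yolov5s.engine");

    clock_t allTime = 0;  // accumulated pure inference time, filled in by DoInference()

    cv::Mat frame = cv::imread("test.jpg");
    if (frame.empty())
        return -1;

    // The first call also acts as a warm-up, so it is expected to be slower.
    std::vector<Yolo::Detection> dets = detector.DoInference(frame, &allTime);
    std::cout << "detections: " << dets.size()
              << ", accumulated inference time: " << allTime << " ms" << std::endl;

    // DoInference() draws the boxes onto the input Mat, so the frame can be saved or shown directly.
    cv::imwrite("result.jpg", frame);
    return 0;
}

Because the engine, execution context and CUDA buffers live in the object, constructing one detector and reusing it in a loop (for example over camera frames in the Qt GUI) avoids paying the model-loading cost on every image.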
I also meant to demonstrate the Qt GUI part of the code, but it is not cleaned up yet, and I do not have a USB camera at hand right now, so that will have to wait; for now I have written up the essential parts.
As for TensorRT acceleration from Python: the official yolov5 repository actually provides export.py for exporting the model. You can export directly with the official code, and the official inference code also supports running inference with the exported TensorRT engine. Even versions with a modified backbone can be exported and used this way.