这里使用GitHub上shouxieai的 infer框架 对YOLOv8模型进行加速推理,操作过程如下所示:
(1).CUDA: 11.8
(2).cuDNN: 8.7.0
(4).ONNX: 1.16.0
(5).OpenCV: 4.10.0
2.clone infer代码:https://github.com/shouxieai/infer
3.使用 https://blog.csdn.net/fengbingchun/article/details/140691177 中采用的数据集生成best.onnx,训练代码如下所示:
- import argparse
- import colorama
- from ultralytics import YOLO
- import torch
def parse_args():
    """Parse the command-line options of the YOLOv8 training script.

    Returns:
        argparse.Namespace: holds ``yaml`` (str), ``epochs`` (int) and
        ``task`` (either "detect" or "segment"). All three options are
        required; argparse exits with a usage message if one is missing.
    """
    cli = argparse.ArgumentParser(description="YOLOv8 train")

    # Every option is mandatory, so only the per-option settings are listed here.
    options = (
        ("--yaml", {"type": str, "help": "yaml file"}),
        ("--epochs", {"type": int, "help": "number of training"}),
        ("--task", {"type": str, "choices": ["detect", "segment"], "help": "specify what kind of task"}),
    )
    for flag, settings in options:
        cli.add_argument(flag, required=True, **settings)

    return cli.parse_args()
def train(task, yaml, epochs):
    """Train a YOLOv8 model, validate it and export it to several formats.

    Args:
        task: "detect" loads ``yolov8n.pt``; "segment" loads ``yolov8n-seg.pt``.
        yaml: dataset description file passed to ``model.train(data=...)``.
        epochs: number of training epochs.

    Raises:
        RuntimeError: if ``task`` is neither "detect" nor "segment".
    """
    if task == "detect":
        model = YOLO("yolov8n.pt")  # load a pretrained model
    elif task == "segment":
        model = YOLO("yolov8n-seg.pt")  # load a pretrained model
    else:
        print(colorama.Fore.RED + "Error: unsupported task:", task)
        # A bare `raise` here has no active exception to re-raise and would only
        # report "No active exception to reraise"; raise an explicit error instead.
        raise RuntimeError(f"unsupported task: {task}")

    model.train(data=yaml, epochs=epochs, imgsz=640)  # train the model
    model.val()  # It'll automatically evaluate the data you trained, no arguments needed, dataset and settings remembered

    # model.export(format="onnx") #, dynamic=True) # export the model, cannot specify dynamic=True, opencv does not support
    model.export(format="onnx", opset=12, simplify=True, dynamic=False, imgsz=640)
    model.export(format="torchscript")  # libtorch
    model.export(format="engine", imgsz=640, dynamic=False, verbose=False, batch=1, workspace=2)  # tensorrt fp32
    # model.export(format="engine", imgsz=640, dynamic=True, verbose=True, batch=4, workspace=2, half=True) # tensorrt fp16
    # model.export(format="engine", imgsz=640, dynamic=True, verbose=True, batch=4, workspace=2, int8=True, data=yaml) # tensorrt int8
if __name__ == "__main__":
    # Example invocation:
    # python test_yolov8_train.py --yaml datasets/melon_new_detect/melon_new_detect.yaml --epochs 1000 --task detect
    colorama.init()
    args = parse_args()

    # Report which device is available before training starts.
    if torch.cuda.is_available():
        print("Running on GPU")
    else:
        print("Running on CPU")

    train(args.task, args.yaml, args.epochs)

    print(colorama.Fore.GREEN + "====== execution completed ======")
python v8trans.py best.onnx
注:yolov8 onnx的输出为NHW,而infer框架的输出只支持NWH,因此需要在原始onnx的输出之前添加一个Transpose节点
5.从 https://docs.nvidia.com/deeplearning/cudnn/archives/cudnn-870/install-guide/index.html#install-zlib-windows 下载zlib123dllx64.zip,解压缩将其中的zlibwapi.dll拷贝到C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\bin目录下
- trtexec.exe --onnx=best.transd.onnx --saveEngine=best.transd.fp32.engine
- trtexec.exe --onnx=best.transd.onnx --fp16 --saveEngine=best.transd.fp16.engine
- trtexec.exe --onnx=best.transd.onnx --int8 --saveEngine=best.transd.int8.engine
- #include <iostream>
- #include <filesystem>
- #include <vector>
- #include <fstream>
- #include <sstream>
- #include <random>
- #include <map>
- #include <memory>
- #include <chrono>
- #include <string>
- #include <algorithm>
- #include <opencv2/opencv.hpp>
- #include "yolo.hpp"
- namespace {
// Detection thresholds handed to the model loader.
constexpr float confidence_threshold{ 0.45f }; // confidence threshold
constexpr float nms_threshold{ 0.50f }; // nms threshold

// Input/output locations, given as relative paths.
// String literals are arrays of const char, so the pointee must be const:
// binding them to a plain `char*` is ill-formed in ISO C++11 and later.
constexpr const char* engine_file{ "../../../data/best.transd.fp32.engine" };
constexpr const char* images_dir{ "../../../data/images/predict" };
constexpr const char* result_dir{ "../../../data/result" };
constexpr const char* classes_file{ "../../../data/images/labels.txt" };
// Reads class names from a text file, one class per line.
// Only the text before the first space of each line is kept as the name.
// Blank lines are skipped and a trailing '\r' (CRLF line ending) is removed,
// so neither produces an empty or corrupted class name.
// Returns an empty vector (after printing an error) if the file cannot be opened.
std::vector<std::string> parse_classes_file(const char* name)
{
    std::vector<std::string> classes;

    std::ifstream file(name);
    if (!file.is_open()) {
        std::cerr << "Error: fail to open classes file: " << name << std::endl;
        return classes;
    }

    std::string line;
    while (std::getline(file, line)) {
        // A file written with CRLF endings leaves '\r' at the end of the line.
        if (!line.empty() && line.back() == '\r') {
            line.pop_back();
        }
        if (line.empty()) {
            continue;
        }

        // substr(0, npos) yields the whole line when there is no space.
        classes.emplace_back(line.substr(0, line.find(' ')));
    }

    // The stream closes itself when `file` goes out of scope.
    return classes;
}
// Lists the regular files directly inside directory `name` (no recursion).
// The returned map is keyed by file name and maps to the entry's path string.
auto get_dir_images(const char* name)
{
    std::map<std::string, std::string> found; // file name -> path of that file

    std::filesystem::directory_iterator cursor(name);
    const std::filesystem::directory_iterator finish;
    for (; cursor != finish; ++cursor) {
        // Sub-directories and other non-regular entries are ignored.
        if (!cursor->is_regular_file())
            continue;

        const std::filesystem::path& entry_path = cursor->path();
        found[entry_path.filename().string()] = entry_path.string();
    }

    return found;
}
- auto get_random_color(int labels_number)
- {
- std::random_device rd;
- std::mt19937 gen(rd());
- std::uniform_int_distribution<int> dis(100, 255);
- std::vector<cv::Scalar> colors;
- for (auto i = 0; i < labels_number; ++i) {
- colors.emplace_back(cv::Scalar(dis(gen), dis(gen), dis(gen)));
- }
- return colors;
- }
- } // namespace
- int main()
- {
- namespace fs = std::filesystem;
- if (!fs::exists(result_dir)) {
- fs::create_directories(result_dir);
- }
- auto classes = parse_classes_file(classes_file);
- if (classes.size() == 0) {
- std::cerr << "Error: fail to parse classes file: " << classes_file << std::endl;
- return -1;
- }
- std::cout << "classes: ";
- for (const auto& val : classes) {
- std::cout << val << " ";
- }
- std::cout << std::endl;
- auto colors = get_random_color(classes.size());
- auto model = yolo::load(engine_file, yolo::Type::V8, confidence_threshold, nms_threshold);
- for (auto i = 0; i < 10; ++i) {
- std::cout << "i: " << i << std::endl;
- for (const auto& [key, val] : get_dir_images(images_dir)) {
- cv::Mat frame = cv::imread(val, cv::IMREAD_COLOR);
- if (frame.empty()) {
- std::cerr << "Warning: unable to load image: " << val << std::endl;
- continue;
- }
- auto tstart = std::chrono::high_resolution_clock::now();
- auto objs = model->forward(yolo::Image(frame.data, frame.cols, frame.rows));
- auto tend = std::chrono::high_resolution_clock::now();
- std::cout << "elapsed millisenconds: " << std::chrono::duration_cast<std::chrono::milliseconds>(tend - tstart).count() << " ms" << std::endl;
- for (const auto& obj : objs) {
- cv::rectangle(frame, cv::Point(obj.left, obj.top), cv::Point(obj.right, obj.bottom), colors[obj.class_label], 2);
- std::string class_string = classes[obj.class_label] + ' ' + std::to_string(obj.confidence).substr(0, 4);
- cv::Size text_size = cv::getTextSize(class_string, cv::FONT_HERSHEY_DUPLEX, 1, 2, 0);
- cv::Rect text_box(obj.left, obj.top - 40, text_size.width + 10, text_size.height + 20);
- cv::rectangle(frame, text_box, colors[obj.class_label], cv::FILLED);
- cv::putText(frame, class_string, cv::Point(obj.left + 5, obj.top - 10), cv::FONT_HERSHEY_DUPLEX, 1, cv::Scalar(0, 0, 0), 2, 0);
- }
- std::string path(result_dir);
- path += "/" + key;
- cv::imwrite(path, frame);
- }
- }
- std::cout << "test finish" << std::endl;
- return 0;
- }
(3).从构建器生成序列化时序缓存(serialized timing cache)。
(2).--minShapes=<shapes>, --optShapes=<shapes>, and --maxShapes=<shapes>:指定用于构建engine的输入shapes的范围。仅当输入模型为ONNX格式时才需要。
(5).--fp16, --bf16, --int8, --fp8, --noTF32, and --best:指定network-level精度。
(6).--stronglyTyped:创建strongly typed网络。
(7).--sparsity=[disable|enable|force]:指定是否使用支持结构化稀疏性(structured sparsity)的策略。
(11).--dumpLayerInfo, --exportLayerInfo=<file>:打印/保存engine的layer信息。
如:--layerPrecisions=*:fp16,layer_1:fp32 将除layer_1之外的所有层的精度设置为FP16,而layer_1的精度将设置为FP32。
(15).--versionCompatible, --vc:为engine构建和推理启用版本兼容模式。
(1).--loadEngine=<file>:从序列化计划文件加载engine,而不是从输入ONNX模型构建它。如果输入模型是ONNX格式或者engine是使用明确的batch dimension构建的,则改用--shapes。
(4).--noDataTransfers:关闭host to device和device to host的数据传输。
(6).--dumpProfile, --exportProfile=<file>:打印/保存每层性能概况。
(7).--dumpLayerInfo, --exportLayerInfo=<file>:打印/保存engine的层信息。
