TensorRT inference always runs on an NVIDIA GPU environment.
trtexec
Convert the ONNX model into a TensorRT engine:
trtexec --onnx=resnet50/model.onnx --saveEngine=resnet_engine_intro.trt --explicitBatch
# Parameter explanation
--maxBatch: sets an upper limit on the batch size, for the case where the input batch size is not fixed
--explicitBatch: the batch size is explicit, inferred automatically from the ONNX model's structure
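When the batch dimension really is dynamic, an explicit-batch engine also needs an optimization profile telling TensorRT the allowed shape range (the trtexec counterparts are --minShapes/--optShapes/--maxShapes). Below is a minimal Python-API sketch, assuming an input tensor named "input" with shape -1x3x224x224; both the name and the resolution are assumptions, not taken from the model above.

import tensorrt as trt

logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
config = builder.create_builder_config()
profile = builder.create_optimization_profile()
# min / opt / max shapes for the assumed input tensor "input": batch 1, 8 and 32
profile.set_shape("input", (1, 3, 224, 224), (8, 3, 224, 224), (32, 3, 224, 224))
config.add_optimization_profile(profile)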
import tensorrt as trt

def generate_engine(onnx_path, engine_path):
    # 1. Create the TRT logger
    logger = trt.Logger(trt.Logger.WARNING)
    # Initialize the TensorRT plugin library
    trt.init_libnvinfer_plugins(logger, namespace="")
    # 2. Create a builder and hand it the logger
    builder = trt.Builder(logger)
    # 3. Create a builder config that tells TRT how to optimize the model
    config = builder.create_builder_config()
    # Set the workspace memory limit
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 20)  # 1 MiB
    # Set the precision
    config.set_flag(trt.BuilderFlag.FP16)
    # INT8 would additionally require calibration
    # 4. Create a network. EXPLICIT_BATCH: the batch dimension is explicit in the network definition
    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    # Create the ONNX parser
    parser = trt.OnnxParser(network, logger)
    # Parse the ONNX model and populate the network
    success = parser.parse_from_file(onnx_path)
    # Error handling
    for idx in range(parser.num_errors):
        print(parser.get_error(idx))
    if not success:
        raise RuntimeError(f"Failed to parse {onnx_path}")
    # 5. Serialize the engine, i.e. build the trt engine model
    serialized_engine = builder.build_serialized_network(network, config)
    # Save the serialized engine for later use. The engine is not portable:
    # it is tied to the TensorRT version and the GPU type it was built on.
    with open(engine_path, "wb") as f:
        f.write(serialized_engine)
    # 6. Deserializing the engine is done through the Runtime interface,
    # i.e. loading the engine model for inference:
    # runtime = trt.Runtime(logger)
    # engine = runtime.deserialize_cuda_engine(serialized_engine)
    # with open("sample.engine", "rb") as f:
    #     serialized_engine = f.read()
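A quick usage sketch (the file names below are just placeholders):

if __name__ == "__main__":
    generate_engine("resnet50/model.onnx", "resnet50_fp16.engine")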
Object concepts used for inference (Logger, Runtime, Engine, ExecutionContext):
Taking YOLOv7 as an example: read the image, then apply letterbox, normalization, etc. — exactly the same preprocessing as the training input, except for data augmentation!
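A minimal sketch of that preprocessing, assuming a 640x640 input and the usual YOLO-style gray padding value 114 (this is illustrative, not YOLOv7's exact code):

import cv2
import numpy as np

def preprocess(img_bgr, new_shape=(640, 640)):
    # Letterbox: resize while keeping the aspect ratio, pad the remainder with gray (114)
    h, w = img_bgr.shape[:2]
    r = min(new_shape[0] / h, new_shape[1] / w)
    nh, nw = int(round(h * r)), int(round(w * r))
    resized = cv2.resize(img_bgr, (nw, nh), interpolation=cv2.INTER_LINEAR)
    canvas = np.full((new_shape[0], new_shape[1], 3), 114, dtype=np.uint8)
    top, left = (new_shape[0] - nh) // 2, (new_shape[1] - nw) // 2
    canvas[top:top + nh, left:left + nw] = resized
    # BGR -> RGB, HWC -> CHW, scale to [0, 1], add the batch dimension
    img = canvas[:, :, ::-1].transpose(2, 0, 1).astype(np.float32) / 255.0
    return np.ascontiguousarray(img[None])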
Method 1 (device buffers held as PyTorch tensors):
import numpy as np
import torch
import tensorrt as trt
from collections import OrderedDict, namedtuple

def infer(img_data, engine_path):
    # img_data is a torch tensor that already lives on the GPU
    device = img_data.device
    # 1. Logger
    logger = trt.Logger(trt.Logger.INFO)
    # 2. The Runtime deserializes the trt engine model
    runtime = trt.Runtime(logger)
    trt.init_libnvinfer_plugins(logger, '')  # initialize TensorRT plugins
    with open(engine_path, "rb") as f:
        serialized_engine = f.read()
    engine = runtime.deserialize_cuda_engine(serialized_engine)
    # 3. Bind the inputs and outputs
    bindings = OrderedDict()
    Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr'))
    fp16 = False
    for index in range(engine.num_bindings):
        name = engine.get_binding_name(index)
        dtype = trt.nptype(engine.get_binding_dtype(index))
        shape = tuple(engine.get_binding_shape(index))
        data = torch.from_numpy(np.empty(shape, dtype=np.dtype(dtype))).to(device)
        # Tensor.data_ptr() is the address (an int) of the tensor's first element
        bindings[name] = Binding(name, dtype, shape, data, int(data.data_ptr()))
        if engine.binding_is_input(index) and dtype == np.float16:
            fp16 = True  # the engine expects an FP16 input
    # Record the device pointer of every input/output binding
    binding_addrs = OrderedDict((n, d.ptr) for n, d in bindings.items())
    # 4. Bind the input data and run inference; the results are written into the output tensors
    context = engine.create_execution_context()
    binding_addrs['images'] = int(img_data.data_ptr())
    context.execute_v2(list(binding_addrs.values()))
    # 5. Fetch the results (by the input/output names chosen when exporting the ONNX model)
    nums = bindings['num'].data[0]
    boxes = bindings['boxes'].data[0]
    scores = bindings['scores'].data[0]
    classes = bindings['classes'].data[0]
    return nums, boxes, scores, classes
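A hedged usage sketch tying the pieces together, assuming the preprocess() helper sketched above and an end-to-end YOLOv7 engine whose input binding is named 'images' (the image and engine paths are placeholders):

import cv2
import torch

img = cv2.imread("test.jpg")                         # placeholder image path
x = torch.from_numpy(preprocess(img)).to("cuda:0")   # must live on the GPU: only the pointer is passed
# cast with x.half() first if the engine's input binding is FP16
nums, boxes, scores, classes = infer(x, "yolov7_end2end.engine")  # placeholder engine path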
Method 2 (device buffers allocated with pycuda):
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # creates the CUDA context pycuda needs

def infer2(engine_path, img_data):
    logger = trt.Logger(trt.Logger.WARNING)
    runtime = trt.Runtime(logger)
    trt.init_libnvinfer_plugins(logger, '')  # initialize TensorRT plugins
    with open(engine_path, "rb") as f:
        serialized_engine = f.read()
    engine = runtime.deserialize_cuda_engine(serialized_engine)
    context = engine.create_execution_context()
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding))
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Page-locked host buffer and the matching device buffer
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Record the device address of every input/output binding
        bindings.append(int(device_mem))
        if engine.binding_is_input(binding):
            inputs.append({'host': host_mem, 'device': device_mem})
        else:
            outputs.append({'host': host_mem, 'device': device_mem})
    # Put the input into host memory
    inputs[0]['host'] = np.ravel(img_data)
    # Transfer data to the GPU
    for inp in inputs:
        cuda.memcpy_htod_async(inp['device'], inp['host'], stream)
    # Run inference
    context.execute_async_v2(
        bindings=bindings,
        stream_handle=stream.handle)
    # Fetch outputs from the GPU
    for out in outputs:
        cuda.memcpy_dtoh_async(out['host'], out['device'], stream)
    # Synchronize the stream
    stream.synchronize()
    data = [out['host'] for out in outputs]
    return data
Post-processing such as NMS follows.
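If the exported ONNX model does not already contain an NMS node, a plain NumPy sketch of class-agnostic NMS could look like this (the IoU threshold is just an example value):

import numpy as np

def nms(boxes, scores, iou_thres=0.45):
    # boxes: (N, 4) as x1, y1, x2, y2; scores: (N,)
    order = scores.argsort()[::-1]  # indices sorted by descending score
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        if order.size == 1:
            break
        # IoU of the current best box against the remaining boxes
        x1 = np.maximum(boxes[i, 0], boxes[order[1:], 0])
        y1 = np.maximum(boxes[i, 1], boxes[order[1:], 1])
        x2 = np.minimum(boxes[i, 2], boxes[order[1:], 2])
        y2 = np.minimum(boxes[i, 3], boxes[order[1:], 3])
        inter = np.clip(x2 - x1, 0, None) * np.clip(y2 - y1, 0, None)
        area_i = (boxes[i, 2] - boxes[i, 0]) * (boxes[i, 3] - boxes[i, 1])
        area_o = (boxes[order[1:], 2] - boxes[order[1:], 0]) * (boxes[order[1:], 3] - boxes[order[1:], 1])
        iou = inter / (area_i + area_o - inter + 1e-7)
        # Drop boxes overlapping the kept box beyond the threshold
        order = order[1:][iou <= iou_thres]
    return np.array(keep)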
References:
https://github.com/Linaom1214/TensorRT-For-YOLO-Series
https://docs.nvidia.com/deeplearning/tensorrt/quick-start-guide/index.html