
TensorRT Acceleration Example Code

For part of the detailed workflow, please read the earlier post TensorRT加速; this article adds a few supplements on top of it. The example here does semantic segmentation, so the model input has shape (3, 512, 512) and the output has shape (512, 512).

import sys
sys.path.insert(0, '.')
import argparse
import torch
import torch.nn as nn
from PIL import Image
import numpy as np
import cv2

import lib.transform_cv2 as T
from lib.models import model_factory
from configs import set_cfg_from_file
import onnx
import onnxruntime

torch.set_grad_enabled(False)
np.random.seed(123)

# args
parse = argparse.ArgumentParser()
parse.add_argument('--config', dest='config', type=str, default='configs/bisenetv1_steel_t.py',)
parse.add_argument('--weight-path', type=str, default='./res/model_final_120_0.864089846611023.pth',)
parse.add_argument('--img-path', dest='img_path', type=str, default='./datasets/steel_total/image/train/11_7.jpg',)
args = parse.parse_args()
cfg = set_cfg_from_file(args.config)

# define model
net = model_factory[cfg.model_type](cfg.n_cats, aux_mode='pred')
net.load_state_dict(torch.load(args.weight_path, map_location='cpu'), strict=False)
# the model instance is now built with trained weights; switch to evaluation mode
net.eval()

# input names; a list, since a model may have multiple inputs
input_names = ['input']
# output names; a list, since a model may have multiple outputs
output_names = ['output']
# a dummy input, used to trace the model and to verify the exported ONNX model
input = torch.rand(1, 3, 512, 512)
output_path = "bisenet.onnx"
# export
torch.onnx.export(net, input, output_path,
                  export_params=True,
                  opset_version=11,
                  do_constant_folding=True,
                  input_names=input_names,
                  output_names=output_names)

# load the ONNX model
onnx_model = onnx.load("bisenet.onnx")
onnx_model_graph = onnx_model.graph
onnx_session = onnxruntime.InferenceSession(onnx_model.SerializeToString())

# test the ONNX model with a random tensor
x = torch.randn(1, 3, 512, 512).numpy()
onnx_output = onnx_session.run(output_names, {input_names[0]: x})[0]

print(f"PyTorch output: {net(torch.from_numpy(x)).detach().numpy()[0, :5]}")
print(f"ONNX output: {onnx_output[0, :5]}")


The code below is based on the blog post TensorRT加速方法介绍(python pytorch模型).

import torch
import torchvision
from PIL import Image
from torchvision import transforms
import torchvision.models as models
import matplotlib.pyplot as plt
import time
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import pdb
import os
import numpy as np
import cv2

# This logger is required to build an engine
TRT_LOGGER = trt.Logger()

filename = "./datasets/steel_total/image/train/11_1.jpg"
engine_file_path = "bisenet_engine.trt"


class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        """Within this context, host_mom means the cpu memory and device means the GPU memory
        """
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()


def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate a page-locked (pinned) host buffer, so asynchronous
        # host-device copies are possible, plus a matching device buffer
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))

    return inputs, outputs, bindings, stream


def do_inference(context, bindings, inputs, outputs, stream):
    # Transfer input data from the CPU to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]

    # Run inference. execute_async_v2 is the call for explicit-batch engines,
    # such as those built from ONNX models.
    t_model = time.perf_counter()
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    print(f'inference call cost: {time.perf_counter() - t_model:.8f}s')

    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]

    # Synchronize the stream
    stream.synchronize()

    # Return only the host outputs.
    return [out.host for out in outputs]


print("Reading engine from file {}".format(engine_file_path))
with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())

# create the context for this engine
context = engine.create_execution_context()

# allocate buffers for input and output
inputs, outputs, bindings, stream = allocate_buffers(engine)  # host/device buffers, binding pointers, CUDA stream

normalize = transforms.Normalize(mean=(0.3442322, 0.3442322, 0.3442322),  # per-channel dataset statistics
    std=(0.21136102, 0.21136102, 0.21136102))

transform = transforms.Compose([
    transforms.Resize(512),
    transforms.ToTensor(), normalize]
)

t_model = time.perf_counter()

# read the input image
img = Image.open(filename)
#print(img.size)

# resize, convert to tensor, and normalize
img_p = transform(img)
#print(img_p.shape)

# add a batch dimension
img_normalize = torch.unsqueeze(img_p, 0)
#print(img_normalize.shape)

# output
#shape_of_output = (512, 512)

# convert to numpy
img_normalize_np = img_normalize.cpu().data.numpy()

# Copy the input into the page-locked host buffer (np.copyto keeps the
# pinned allocation intact, unlike rebinding inputs[0].host)
np.copyto(inputs[0].host, img_normalize_np.ravel())
#print(inputs[0].host.shape)

# Do Inference
trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)  # numpy data
print(f'do inference cost:{time.perf_counter() - t_model:.8f}s')

print(len(trt_outputs))

# reshape the flat output back to (512, 512) and scale to 0-255 for saving
pred = trt_outputs[0].reshape(512, 512)*255
#pred = palette[out]
cv2.imwrite('./res.jpg', pred)
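
As a sanity check, the TensorRT result can be compared against ONNX Runtime on the same preprocessed input, mirroring the PyTorch-vs-ONNX comparison in the first script. A minimal sketch, assuming the onnx_session from the first script is available in the same process:

# run ONNX Runtime on the identical input and compare element-wise
onnx_pred = onnx_session.run(['output'], {'input': img_normalize_np})[0]
trt_pred = trt_outputs[0].reshape(onnx_pred.shape)
diff = np.abs(onnx_pred.astype(np.float32) - trt_pred.astype(np.float32))
print(f'max abs diff between ONNX and TensorRT: {diff.max()}')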