当前位置:   article > 正文

TensorRT 模型加速_自己的模型框架如何进行tensorrt加速


TensorRT 框架模型加速

  1. TensorRT配置

    conda install pycuda
    #下载 TensorRT框架
    pip install /home/s4/Downloads/TensorRT/TensorRT-
    # 添加系统路径
    sudo gedit ~/.bashrc
    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/s4/Downloads/TensorRT/TensorRT-
    export LIBRARY_PATH=/home/s4/Downloads/TensorRT/TensorRT-:$LIBRARY_PATH
    source ~/.bashrc
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
  2. pt模型转通用onnx模型

    import torch
    import torch.onnx as onnx
    def convert2onnx(model, input_size, batch_size, save_path):
        """Convert a PyTorch model into an ONNX file.

        :param model: original model
        :param input_size: the input image size the original model requires,
            without the batch dimension. Should be a list object; it is not
            modified by this function.
        :param batch_size: batch size used in the predict process. Should be
            an int object.
        :param save_path: onnx file path
        """
        # Build the full input shape in a new list so the caller's
        # ``input_size`` is not mutated (repeated calls would otherwise keep
        # prepending batch dimensions).
        dummy_shape = [batch_size, *input_size]
        dummy_input = torch.randn(dummy_shape)
        # convert a pytorch model to onnx file
        torch.onnx.export(model, dummy_input, save_path, verbose=False)
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
  3. onnx 模型转 tensorRT推断用的.engine模型

    使用tensorRT 自带的trtexec工具进行转码。

    trtexec --onnx=xxx.onnx --saveEngine=xxx.engine --fp16
    • 1
  4. 使用trt模型推断

    import torch
    from torchvision.transforms import Normalize
    import numpy as np
    import pycuda.driver as cuda
    # 处理读入内存的图像数据
    def preprocess_image(img, f_type=16):
        """Normalize one image and return it as a NumPy array of the chosen precision.

        The input is wrapped as a torch tensor, its last axis is moved to the
        front (two transposes: axes 0<->2, then 1<->2), and a per-channel
        mean/std normalization is applied.

        :param img: NumPy array holding one image in memory.
            NOTE(review): presumably a float (H, W, C) array with 3 channels --
            confirm against the caller.
        :param f_type: floating-point width of the result: 16 -> float16,
            32 -> float32, any other value -> float64.
        :return: normalized image as a NumPy array with the selected dtype.
        """
        norm = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        # Move the last axis to the front before normalizing.
        result = norm(torch.from_numpy(img).transpose(0, 2).transpose(1, 2))
        if f_type == 16:
            return np.array(result, dtype=np.float16)
        elif f_type == 32:
            return np.array(result, dtype=np.float32)
        else:
            # Fallback for every other f_type; without this branch the
            # float64 return was unreachable and the function returned None.
            return np.array(result, dtype=np.float64)
    # 使用TensorRT工具进行预测
    def predict(batch, d_input, d_output, output, stream, bindings, context):  # result gets copied into output
        """Run one inference on ``stream`` and return the filled ``output`` buffer.

        :param batch: host array holding the input data to upload.
        :param d_input: device allocation that receives ``batch``.
        :param d_output: device allocation the predictions are read back from.
        :param output: host array that receives the predictions (filled in place).
        :param stream: CUDA stream on which the copies and the execution are queued.
        :param bindings: binding list passed to ``context.execute_async_v2``.
        :param context: execution context exposing ``execute_async_v2``.
        :return: ``output``, after all work queued on ``stream`` has finished.
        """
        # transfer input data to device
        cuda.memcpy_htod_async(d_input, batch, stream)
        # execute model
        context.execute_async_v2(bindings, stream.handle, None)
        # transfer predictions back
        cuda.memcpy_dtoh_async(output, d_output, stream)
        # synchronize: everything above is only queued asynchronously, so wait
        # for the stream to finish before handing ``output`` to the caller
        stream.synchronize()
        return output
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    # TensorRT 使用demo.展示TensorRT框架加速效果。
    from commo.torchTool import preprocess_image, predict
    import tensorrt as trt
    import pycuda.driver as cuda
    import pycuda.autoinit
    import numpy as np
    import cv2 as cv
    import time
    import torchvision.models as models
    import torch
    import torch.onnx
    BATCH_SIZE = 32
    # build the ResNet-50 model on the GPU in eval mode
    # NOTE(review): stock torchvision `resnet50` does not document a `channels`
    # keyword -- confirm which `models` package this is meant to run against.
    resnet50_gpu = models.resnet50(num_classes=1000, channels=3).to('cuda').eval()
    # load data as input_batch
    dummy_input = torch.randn(BATCH_SIZE, 3, 640, 640)
    url = '../image/1.jpg'
    img = cv.imread(url)
    img = cv.resize(img, (640, 640), interpolation=cv.INTER_AREA)
    # Repeat the single image BATCH_SIZE times along a new leading axis.
    # NOTE(review): the original statement was cut off after the trailing comma;
    # `dtype=np.float32` is the assumed missing argument -- confirm.
    input_batch = np.array(
        np.repeat(np.expand_dims(np.array(img, dtype=np.float32), axis=0), BATCH_SIZE, axis=0),
        dtype=np.float32,
    )
    # reorder axes (0, 1, 2, 3) -> (0, 3, 1, 2) for the torch model
    input_batch_chw = torch.from_numpy(input_batch).transpose(1, 3).transpose(2, 3)
    input_batch_gpu = input_batch_chw.to("cuda")
    # execute torch model in cuda without TensorRT
    t1 = time.time()
    with torch.no_grad():
        predictions = np.array(resnet50_gpu(input_batch_gpu).cpu())
    t2 = time.time()
    print('pytorch model use {} ms!'.format((t2 - t1) * 1000))
    # five highest-scoring indices of the first batch element
    indices = (-predictions[0]).argsort()[:5]
    print("Class | Likelihood (torch)")
    res = list(zip(indices, predictions[0][indices]))
    print('predict result is :{}'.format(res))
    # per-image preprocessing for the TensorRT engine (float16 by default)
    preprocessed_images = np.array([preprocess_image(image) for image in input_batch])
    print('preprocessed shape is {}'.format(preprocessed_images.shape))
    runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING))
    # read the serialized engine; the context manager closes the file handle
    with open("../trt/resnet_engine_pytorch.trt", "rb") as f:
        engine = runtime.deserialize_cuda_engine(f.read())
    context = engine.create_execution_context()
    # host buffer that receives the engine output
    # NOTE(review): 25200 * 12 is hard-coded -- confirm it matches the engine's output size
    output = np.empty([BATCH_SIZE, 25200 * 12], dtype=np.float16)
    print('Output size is {}'.format(output.shape))
    # allocate device memory
    d_input = cuda.mem_alloc(1 * input_batch.nbytes)
    d_output = cuda.mem_alloc(1 * output.nbytes)
    bindings = [int(d_input), int(d_output)]
    stream = cuda.Stream()
    # execute the TensorRT engine and time it
    t1 = time.time()
    pred = predict(preprocessed_images, d_input, d_output, output, stream, bindings, context)
    res = np.resize(pred, (32, 25200, 12))
    t2 = time.time()
    print('trf model use {}ms! '.format((t2 - t1) * 1000))
    # seven highest-scoring indices of the first batch element
    indices = (-pred[0]).argsort()[:7]
    print("Class | Probability (trf)")
    print(list(zip(indices, pred[0][indices])))
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
    • 44
    • 45
    • 46
    • 47
    • 48
    • 49
    • 50
    • 51
    • 52
    • 53
    • 54
    • 55
    • 56
    • 57
    • 58
    • 59
    • 60
    • 61
    • 62
    • 63
    • 64
    • 65
    • 66
    • 67
    • 68
