
Deformable Convolution: Principle, Implementation, and Engineering Deployment (DCNv2)


Introduction

Using the object detection algorithm CenterNet as a running example, this article introduces the algorithmic principle of deformable convolution, its Python implementation, and how to deploy it in production.

The Principle of Deformable Convolution

Deformable convolution currently comes in two versions: DCN V1, and DCN V2, which was developed as an improvement on V1. For the detailed algorithm, read the original papers or refer to:

https://cloud.tencent.com/developer/article/1679389
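
In brief, for an output location $p_0$, modulated deformable convolution (DCN V2) computes

$$y(p_0) = \sum_{k=1}^{K} w_k \cdot x(p_0 + p_k + \Delta p_k) \cdot \Delta m_k,$$

where $p_k$ runs over the $K$ regular sampling positions of the kernel, $\Delta p_k$ are the learned offsets introduced in DCN V1, and $\Delta m_k \in [0, 1]$ are the learned modulation scalars added in DCN V2. Since $p_0 + p_k + \Delta p_k$ is in general fractional, $x$ is read with bilinear interpolation.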

Implementing Deformable Convolution

This section mainly follows the walkthrough in: https://cloud.tencent.com/developer/article/1638363
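
The core of the forward pass is easy to state: for every output position and every kernel point, add the learned offset to the regular sampling position, read the input there with bilinear interpolation (the coordinates are generally fractional), and scale the result by the learned mask value. The snippet below is a minimal CPU sketch of that sampling step only, not the implementation used later (the CUDA kernel in dcn_v2_im2col_cuda.cu does the same thing on the GPU while writing im2col columns); it assumes a single H×W channel stored row-major with zero padding outside the image, and the function names are made up for illustration.

#include <cmath>

// Bilinear read at fractional coordinates (y, x); values outside the H x W
// plane are treated as zero (zero padding).
static float bilinear_at(const float* data, int H, int W, float y, float x) {
    const int y0 = static_cast<int>(std::floor(y));
    const int x0 = static_cast<int>(std::floor(x));
    const float ly = y - y0, lx = x - x0;
    auto at = [&](int yy, int xx) -> float {
        if (yy < 0 || yy >= H || xx < 0 || xx >= W) return 0.f;
        return data[yy * W + xx];
    };
    return (1 - ly) * (1 - lx) * at(y0, x0)     + (1 - ly) * lx * at(y0, x0 + 1) +
           ly       * (1 - lx) * at(y0 + 1, x0) + ly       * lx * at(y0 + 1, x0 + 1);
}

// One modulated deformable sample (DCN V2): regular grid position plus the
// learned offset, scaled by the learned mask value in [0, 1].
static float dcn_v2_sample(const float* data, int H, int W,
                           float grid_y, float grid_x,
                           float offset_y, float offset_x, float mask) {
    return mask * bilinear_at(data, H, W, grid_y + offset_y, grid_x + offset_x);
}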

Deploying DCN V2 in CenterNet with TensorRT

The code referenced here is mainly: https://github.com/CaoWGG/TensorRT-CenterNet


A few points need to be stated up front:

  • To quantize and optimize the model, the CenterNet model is first exported to ONNX, and the ONNX model is then converted to a TensorRT engine using the officially released onnx-tensorrt library.

  • Since TensorRT does not natively support the deformable convolution operation, it has to be implemented as a custom plugin.

The DCNv2 plugin for CenterNet is defined in the onnx-tensorrt folder, in the files:

dcn_v2_im2col_cuda.cu and dcn_v2_im2col_cuda.h; DCNv2.cpp and DCNv2.h. DCNv2.cpp and DCNv2.h contain the custom TensorRT plugin itself; the code is as follows:

DCNv2Plugin::DCNv2Plugin(int in_channel,
                         int out_channel,
                         int kernel_H,
                         int kernel_W,
                         int deformable_group,
                         int dilation,
                         int groups,
                         int padding,
                         int stride,
                         nvinfer1::Weights const &weight, nvinfer1::Weights const &bias):_in_channel(in_channel),
                        _out_channel(out_channel),_kernel_H(kernel_H),_kernel_W(kernel_W),_deformable_group(deformable_group),
                         _dilation(dilation),_groups(groups),_padding(padding),_stride(stride),_initialized(false){

    if (weight.type == nvinfer1::DataType::kFLOAT)
    {
        _h_weight.assign((float*)weight.values,(float*)weight.values+weight.count);
    } else { throw std::runtime_error("Unsupported weight dtype"); }

    if (bias.type == nvinfer1::DataType::kFLOAT)
    {
        _h_bias.assign((float*)bias.values,(float*)bias.values+bias.count);
    } else { throw std::runtime_error("Unsupported bias dtype"); }

}

// Initialization: allocate device memory in advance and copy the weights, bias and helper buffers to the GPU
int DCNv2Plugin::initialize() {
    if(_initialized) return 0;
    auto _output_dims = this->getOutputDimensions(0, &this->getInputDims(0), 3);
    assert(is_CHW(this->getInputDims(0)));
    assert(is_CHW(_output_dims));
    size_t ones_size = _output_dims.d[1]*_output_dims.d[2]* sizeof(float);
    size_t weight_size = _h_weight.size()* sizeof(float);
    size_t bias_size = _h_bias.size()* sizeof(float);
    float *ones_cpu = new float[ones_size/ sizeof(float)];
    for (int i = 0; i < ones_size/ sizeof(float); i++) {
        ones_cpu[i] = 1.0;
    }
    CHECK_CUDA(cudaMalloc((void**)&_d_columns, _in_channel * _kernel_H * _kernel_W * ones_size));
    CHECK_CUDA(cudaMalloc((void**)&_d_ones, ones_size));
    CHECK_CUDA(cudaMalloc((void**)&_d_weight, weight_size));
    CHECK_CUDA(cudaMalloc((void**)&_d_bias, bias_size));
    CHECK_CUDA(cudaMemcpy(_d_ones, ones_cpu, ones_size, cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(_d_weight, _h_weight.data(), weight_size, cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(_d_bias, _h_bias.data(), bias_size, cudaMemcpyHostToDevice));
    delete[] ones_cpu;
    _initialized = true;

    return 0;
}

// Release the device memory allocated in initialize()
void DCNv2Plugin::terminate() {
    if (!_initialized) {
        return;
    }
    cudaFree(_d_columns);
    cudaFree(_d_bias);
    cudaFree(_d_weight);
    cudaFree(_d_ones);
    _initialized = false;
}

DCNv2Plugin::~DCNv2Plugin() {
    terminate();
}

// Report whether a given data type / format combination is supported
bool DCNv2Plugin::supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) const {

    return (type == nvinfer1::DataType::kFLOAT);
}

// Compute the output dimensions (CHW, without the batch dimension) from the input dimensions
nvinfer1::Dims DCNv2Plugin::getOutputDimensions(int index, const nvinfer1::Dims *inputDims, int nbInputs) {
    assert(index == 0);
    assert(inputDims);
    assert(nbInputs == 3);
    nvinfer1::Dims const& input = inputDims[0];
    assert(is_CHW(input));
    nvinfer1::Dims output;
    output.nbDims = input.nbDims;
    for( int d=0; d<input.nbDims; ++d ) {
        output.type[d] = input.type[d];
        output.d[d] = input.d[d];
    }
    output.d[0] = _out_channel;
    // Standard convolution output size; e.g. a 3x3 kernel with stride 1, padding 1
    // and dilation 1 keeps the spatial size unchanged: (H + 2 - 3) / 1 + 1 = H.
    output.d[1] = (output.d[1] + 2 * _padding - (_dilation * (_kernel_H - 1) + 1)) / _stride + 1;
    output.d[2] = (output.d[2] + 2 * _padding - (_dilation * (_kernel_W - 1) + 1)) / _stride + 1;
    return output;
}

// Return the amount of extra device workspace this plugin needs (none here; its buffers are allocated in initialize())
size_t DCNv2Plugin::getWorkspaceSize(int maxBatchSize) const {
    return 0;
}

// Execution function of the op implemented by this plugin
int DCNv2Plugin::enqueue(int batchSize, const void *const *inputs, void **outputs, void *workspace,
                         cudaStream_t stream) {
    float alpha, beta;
    int m, n, k;

    cublasHandle_t handle = blas_handle();
    const float* input = static_cast<const float *>(inputs[0]);
    const float* offset = static_cast<const float *>(inputs[1]);
    const float* mask = static_cast<const float *>(inputs[2]);
    float * output = static_cast<float *>(outputs[0]);
    nvinfer1::Dims input_dims = this->getInputDims(0);
    assert(batchSize==1);
    int h = input_dims.d[1];
    int w = input_dims.d[2];
    int height_out = (h + 2 * _padding - (_dilation * (_kernel_H - 1) + 1)) / _stride + 1;
    int width_out = (w + 2 * _padding - (_dilation * (_kernel_W - 1) + 1)) / _stride + 1;
    m = _out_channel;
    n = height_out * width_out;
    k = 1;
    alpha = 1.0;
    beta = 0.0;
    /// output  nxm
    /// ones    1xn  T ->> nx1
    /// bias    1xm
    /// ones x bias = nxm
    //  add bias
    cublasSgemm(handle,
                CUBLAS_OP_T, CUBLAS_OP_N,
                n, m, k,&alpha,
                _d_ones, k,
                _d_bias, k,&beta,
                output, n);
    // im2col (offset and mask)
    modulated_deformable_im2col_cuda(stream,input,offset,mask,
                                     1, _in_channel, h, w,
                                     height_out, width_out, _kernel_H, _kernel_W,
                                     _padding, _padding, _stride, _stride, _dilation, _dilation,
                                     _deformable_group, _d_columns);
    m = _out_channel;
    n = height_out * width_out;
    k = _in_channel * _kernel_H * _kernel_W;
    alpha = 1.0;
    beta = 1.0;
    // im2col conv
    cublasSgemm(handle,
                CUBLAS_OP_N, CUBLAS_OP_N,
                n, m, k,&alpha,
                _d_columns, n,
                _d_weight, k,
                &beta,
                output, n);
    return 0;
}
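
The two cublasSgemm calls implement the usual im2col formulation of convolution. A short explanation of the code above, with $m = C_{out}$, $n = H_{out} \cdot W_{out}$ and $k = C_{in} \cdot K_H \cdot K_W$: the first GEMM multiplies a column of ones by the bias vector, which simply broadcasts each channel's bias over all $n$ output positions; modulated_deformable_im2col_cuda then writes the deformably sampled, mask-modulated input patches into the $k \times n$ columns buffer; and the second GEMM accumulates the convolution on top of the bias:

$$\text{output}_{m \times n} \mathrel{+}= W_{m \times k} \cdot \text{columns}_{k \times n}.$$

Because cuBLAS is column-major, the code passes the operands in transposed order (columns first, weights second), which produces the row-major $m \times n$ output directly.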

The op defined by the plugin above then has to be registered as an importer in the builtin_op_importers.cpp file of onnx-tensorrt:

DEFINE_BUILTIN_OP_IMPORTER(DCNv2) {
    ASSERT(inputs.at(0).is_tensor(),  ErrorCode::kUNSUPPORTED_NODE); // input
    ASSERT(inputs.at(1).is_tensor(), ErrorCode::kUNSUPPORTED_NODE); // offset
    ASSERT(inputs.at(2).is_tensor(), ErrorCode::kUNSUPPORTED_NODE); // mask
    ASSERT(inputs.at(3).is_weights(), ErrorCode::kUNSUPPORTED_NODE); // weight

    auto kernel_weights = inputs.at(3).weights();
    nvinfer1::Weights bias_weights;
    if( inputs.size() == 5 ) {
        ASSERT(inputs.at(4).is_weights(), ErrorCode::kUNSUPPORTED_NODE);
        auto shaped_bias_weights = inputs.at(4).weights();
        ASSERT(shaped_bias_weights.shape.nbDims == 1, ErrorCode::kINVALID_NODE);
        ASSERT(shaped_bias_weights.shape.d[0] == kernel_weights.shape.d[0], ErrorCode::kINVALID_NODE);
        bias_weights = shaped_bias_weights;
    } else {
        bias_weights = ShapedWeights::empty(kernel_weights.type);
    }
    int out_channel,in_channel,kernel_H,kernel_W,deformable_group,dilation,groups,padding,stride;
    out_channel = kernel_weights.shape.d[0];
    in_channel = kernel_weights.shape.d[1];
    kernel_H = kernel_weights.shape.d[2];
    kernel_W = kernel_weights.shape.d[3];

    OnnxAttrs attrs(node);
    deformable_group = attrs.get("deformable_group", 1);
    dilation = attrs.get("dilation", 1);
    groups = attrs.get("groups", 1);
    padding = attrs.get("padding", 1);
    stride = attrs.get("stride", 1);
    RETURN_FIRST_OUTPUT(
            ctx->addPlugin(
                    new DCNv2Plugin(in_channel,out_channel,kernel_H,kernel_W,deformable_group,
                            dilation,groups,padding,stride, kernel_weights, bias_weights),
                    {&inputs.at(0).tensor(),&inputs.at(1).tensor(),&inputs.at(2).tensor()}));
}

It is also registered in builtin_plugins.cpp:

REGISTER_BUILTIN_PLUGIN("DCNv2", DCNv2Plugin);

Then the plugin source files have to be added to the CMakeLists.txt of onnx-tensorrt, so that they are compiled into the nvonnxparser_plugin library and linked against CUDA and TensorRT:

# Plugin source files
set(PLUGIN_SOURCES
  FancyActivation.cu
  ResizeNearest.cu
  Split.cu
  dcn_v2_im2col_cuda.cu
  InstanceNormalization.cpp
  DCNv2.cpp
  plugin.cpp
  )
  
# Build the plugin library and link its dependencies
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC --expt-extended-lambda")
if(${CMAKE_VERSION} VERSION_LESS ${CMAKE_VERSION_THRESHOLD})
  CUDA_INCLUDE_DIRECTORIES(${CUDNN_INCLUDE_DIR} ${TENSORRT_INCLUDE_DIR})
  CUDA_ADD_LIBRARY(nvonnxparser_plugin STATIC ${PLUGIN_SOURCES})
else()
  include_directories(${CUDNN_INCLUDE_DIR} ${TENSORRT_INCLUDE_DIR})
  add_library(nvonnxparser_plugin STATIC ${PLUGIN_SOURCES})
endif()
target_include_directories(nvonnxparser_plugin PUBLIC ${CUDA_INCLUDE_DIRS} ${ONNX_INCLUDE_DIRS} ${TENSORRT_INCLUDE_DIR} ${CUDNN_INCLUDE_DIR})
target_link_libraries(nvonnxparser_plugin ${TENSORRT_LIBRARY} cuda cudart cublas)

Finally, link against the library built above and use the standard TensorRT C++ API to convert the model and run forward inference.
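
As a rough illustration of that last step, the sketch below builds an engine from the exported ONNX model. It is a minimal sketch only, written against the TensorRT 7/8-style API (signatures differ between versions); the Logger class, the file name centernet.onnx and the workspace size are placeholders, and error handling and cleanup are omitted.

#include <iostream>

#include "NvInfer.h"
#include "NvOnnxParser.h"

// Minimal logger required by the TensorRT builder and parser.
class Logger : public nvinfer1::ILogger {
    void log(Severity severity, const char* msg) noexcept override {
        if (severity <= Severity::kWARNING) std::cout << msg << std::endl;
    }
};

int main() {
    Logger logger;
    auto* builder = nvinfer1::createInferBuilder(logger);
    const auto flags = 1U << static_cast<uint32_t>(
        nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    auto* network = builder->createNetworkV2(flags);

    // The parser built from the patched onnx-tensorrt sources recognizes the
    // custom DCNv2 node and inserts the DCNv2Plugin while parsing the graph.
    auto* parser = nvonnxparser::createParser(*network, logger);
    parser->parseFromFile("centernet.onnx",  // placeholder path
                          static_cast<int>(nvinfer1::ILogger::Severity::kWARNING));

    auto* config = builder->createBuilderConfig();
    config->setMaxWorkspaceSize(1 << 28);  // 256 MB, placeholder
    auto* engine = builder->buildEngineWithConfig(*network, *config);
    // ... serialize the engine, or create an execution context and run inference ...
    return engine != nullptr ? 0 : 1;
}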
