
TensorRT Notes 10 (yolov5 post-processing)


I. Introduction

  • The yolov5 output is a tensor of shape 25200 × 85.
    (1) 85 = 5 + 80: cx, cy, width, height, objness (whether the box contains an object), followed by 80 per-class confidence scores.
    (2) 25200 = (20 × 20 + 40 × 40 + 80 × 80) × 3, where 20 × 20, 40 × 40 and 80 × 80 are the grid sizes of the deep, middle and shallow feature maps; the shallow layer predicts small objects and the deep layer predicts large objects. The × 3 is because each grid cell has three anchors. (A quick layout check follows this list.)
    (Figure: prediction boxes on the 20 × 20 feature map)
  • Use a CUDA kernel to decode the yolov5 inference result and recover the boxes.
  • During development, you can convert the PyTorch output to numpy, write it to a file with tobytes, and read it back in C++ for quick investigation and debugging; the post-processing can then be studied without running TensorRT inference at all.
  • For a complex implementation like GPU decoding, implement the CPU version first, then port it.
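
As a sanity check of those shapes, here is a minimal standalone sketch (just the arithmetic from the list above, nothing model-specific):

#include <cstdio>

int main(){
    int num_classes = 80;
    int cols = 5 + num_classes;                         // cx, cy, w, h, objness + 80 class scores
    int rows = (20*20 + 40*40 + 80*80) * 3;             // three anchors per grid cell
    std::printf("rows = %d, cols = %d\n", rows, cols);  // prints rows = 25200, cols = 85
    return 0;
}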

II. Post-processing: CPU Decoding

Key ideas of CPU decoding

  • Avoid redundant computation: some math operations cost far more time than many if statements, so reducing the number of computations is the key to performance.
  • Optimize the NMS implementation: for example, mark suppressed boxes with remove_flags instead of erasing them, and pre-allocate the output with reserve.

1. Decoding returns the prediction boxes

The return type is vector<Box>, where Box is the prediction-box struct:

struct Box{
    float left, top, right, bottom, confidence;
    int label;

    Box() = default;
    Box(float left, float top, float right, float bottom, float confidence, int label)
        : left(left), top(top), right(right), bottom(bottom), confidence(confidence), label(label){}
};

2. Avoid redundant computation: check objness first

auto data = load_file("predict.data");
auto image = cv::imread("input-image.jpg");
float* ptr = (float*)data.data();
int nelem = data.size() / sizeof(float);
int ncols = 85;
int nrows = nelem / ncols;
auto boxes = cpu_decode(ptr, nrows, ncols);

vector<Box> cpu_decode(float* predict, int rows, int cols, float confidence_threshold = 0.25f, float nms_threshold = 0.45f){
    vector<Box> boxes;
    int num_classes = cols - 5;
    for(int i = 0; i < rows; ++i){
        float* pitem = predict + i * cols;
        float objness = pitem[4];
        if(objness < confidence_threshold)
            continue;
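load_file above is a helper the excerpt never defines; a minimal sketch that reads the whole binary file dumped from Python via tobytes (an assumed implementation; any equivalent reader works):

#include <fstream>
#include <string>
#include <vector>

// read an entire binary file into memory
std::vector<char> load_file(const std::string& file){
    std::ifstream in(file, std::ios::binary | std::ios::ate);  // open at the end to get the size
    if(!in) return {};
    size_t size = (size_t)in.tellg();
    std::vector<char> data(size);
    in.seekg(0);
    in.read(data.data(), size);
    return data;
}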

3. Avoid redundant computation: then check the confidence

        float* pclass = pitem + 5;
        int label     = std::max_element(pclass, pclass + num_classes) - pclass;
        float prob    = pclass[label];
        float confidence = prob * objness;
        if(confidence < confidence_threshold)
            continue;

4. Boxes that contain an object and whose confidence exceeds the threshold are stored in boxes

        float cx     = pitem[0];
        float cy     = pitem[1];
        float width  = pitem[2];
        float height = pitem[3];
        float left   = cx - width * 0.5f;
        float top    = cy - height * 0.5f;
        float right  = cx + width * 0.5f;
        float bottom = cy + height * 0.5f;
        boxes.emplace_back(left, top, right, bottom, confidence, label);  // label is already an int; no cast needed

5. Preparing for NMS

  • Sort boxes by confidence in descending order; take the lambda parameters by reference to avoid copies:
    std::sort(boxes.begin(), boxes.end(), [](Box& a, Box& b){return a.confidence > b.confidence;});
  • To avoid the cost of shifting elements when erasing boxes from the vector, use remove_flags to mark whether a box has been suppressed (true = removed):
    std::vector<bool> remove_flags(boxes.size());
  • Pre-allocate the output with vector::reserve, so repeated push_back calls do not trigger reallocations and hurt performance:
    std::vector<Box> box_result;
    box_result.reserve(boxes.size());

6. The IoU lambda

    auto iou = [](const Box& a, const Box& b){
        float cross_left   = std::max(a.left, b.left);
        float cross_top    = std::max(a.top, b.top);
        float cross_right  = std::min(a.right, b.right);
        float cross_bottom = std::min(a.bottom, b.bottom);

        float cross_area = std::max(0.0f, cross_right - cross_left) * std::max(0.0f, cross_bottom - cross_top);
        float union_area = std::max(0.0f, a.right - a.left) * std::max(0.0f, a.bottom - a.top) 
                         + std::max(0.0f, b.right - b.left) * std::max(0.0f, b.bottom - b.top) - cross_area;
        if(cross_area == 0 || union_area == 0) return 0.0f;
        return cross_area / union_area;
    };

7. NMS implementation

    for(int i = 0; i < boxes.size(); ++i){
        if(remove_flags[i]) continue;  // skip boxes already suppressed

        auto& ibox = boxes[i];
        box_result.emplace_back(ibox);
        for(int j = i + 1; j < boxes.size(); ++j){
            if(remove_flags[j]) continue;

            auto& jbox = boxes[j];
            if(ibox.label == jbox.label){
                // class matched
                if(iou(ibox, jbox) >= nms_threshold)
                    remove_flags[j] = true;
            }
        }
    }

8. Drawing the detection boxes

    for(auto& box : boxes){
        cv::rectangle(image, cv::Point(box.left, box.top), cv::Point(box.right, box.bottom), cv::Scalar(0, 255, 0), 2);
        cv::putText(image, cv::format("%.2f", box.confidence), cv::Point(box.left, box.top - 7), 0, 0.8, cv::Scalar(0, 0, 255), 2, 16);
    }

    cv::imwrite("image-draw.jpg", image);

III. Post-processing: GPU Decoding

Key ideas of GPU decoding

  • On the CPU a vector can hold an array whose size is unknown in advance; GPU decoding instead uses the layout [count, box1, box2, …], which requires an upper bound on the number of boxes.
  • Elements are appended to the array with atomicAdd, which returns the insertion index (see the sketch below).
  • Avoid unnecessary computation, just as in the CPU version.
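
A minimal sketch of the [count, item, item, …] append pattern (illustrative only, not the article's kernel; the filter condition is made up):

static __global__ void append_if_positive(float* out, int max_items, const float* in, int n){
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if(i >= n || in[i] <= 0) return;           // arbitrary filter, stands in for the confidence test
    int index = (int)atomicAdd(out, 1.0f);     // out[0] is the count; returns the value before the add
    if(index >= max_items) return;             // bound the "dynamic array"
    out[1 + index] = in[i];                    // payload starts right after the counter
}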

1. Create a stream

    cudaStream_t stream = nullptr;
    checkRuntime(cudaStreamCreate(&stream));

2. Allocate memory

  • Define the maximum number of boxes: max_objects = 1000.
  • Each box has 7 elements, NUM_BOX_ELEMENT = 7: left, top, right, bottom, confidence, class, keepflag (whether the box is kept; 1 = keep).
  • In the output buffer size, the leading sizeof(float) holds count and max_objects * NUM_BOX_ELEMENT * sizeof(float) holds the boxes.
    float* predict_device = nullptr;
    float* output_device = nullptr;
    float* output_host = nullptr;
    int max_objects = 1000;
    int NUM_BOX_ELEMENT = 7;  // left, top, right, bottom, confidence, class, keepflag
    checkRuntime(cudaMalloc(&predict_device, rows * cols * sizeof(float)));
    // the leading sizeof(float) holds count; max_objects * NUM_BOX_ELEMENT * sizeof(float) holds the boxes
    checkRuntime(cudaMalloc(&output_device, sizeof(float) + max_objects * NUM_BOX_ELEMENT * sizeof(float)));
    checkRuntime(cudaMallocHost(&output_host, sizeof(float) + max_objects * NUM_BOX_ELEMENT * sizeof(float)));

    checkRuntime(cudaMemcpyAsync(predict_device, predict, rows * cols * sizeof(float), cudaMemcpyHostToDevice, stream));

3. GPU decoding

Call the CUDA decode entry point:

decode_kernel_invoker(
        predict_device, rows, cols - 5, confidence_threshold, 
        nms_threshold, nullptr, output_device, max_objects, NUM_BOX_ELEMENT, stream
    );
  • Define the block and grid sizes (for num_bboxes = 25200: block = 512, grid = ⌈25200 / 512⌉ = 50):
    auto block = num_bboxes > 512 ? 512 : num_bboxes;
    auto grid = (num_bboxes + block - 1) / block;  // round up

4. The decode kernel

Launch the decode kernel:

    decode_kernel<<<grid, block, 0, stream>>>(
        predict, num_bboxes, num_classes, confidence_threshold, 
        invert_affine_matrix, parray, max_objects, NUM_BOX_ELEMENT
    );
  • The decode kernel computes essentially the same thing as the CPU decode, again avoiding unnecessary work.

  • Launch one thread per candidate box, 25200 in total.

  • int index = atomicAdd(parray, 1); maintains count: parray = [count, box1, box2, box3, …]. atomicAdd performs count += 1 but returns the value before the increment, so it doubles as the insertion index (see CUDA Notes 1 on atomicAdd for details).

  • Boxes beyond the maximum are dropped with if(index >= max_objects) return;, which is what makes the bounded "dynamic array" work.

static __global__ void decode_kernel(
    float* predict, int num_bboxes, int num_classes, float confidence_threshold, 
    float* invert_affine_matrix, float* parray, int max_objects, int NUM_BOX_ELEMENT
){  
    int position = blockDim.x * blockIdx.x + threadIdx.x;
    if (position >= num_bboxes) return;

    float* pitem     = predict + (5 + num_classes) * position;
    float objectness = pitem[4];
    if(objectness < confidence_threshold)
        return;

    float* class_confidence = pitem + 5;
    float confidence        = *class_confidence++;
    int label               = 0;
    for(int i = 1; i < num_classes; ++i, ++class_confidence){
        if(*class_confidence > confidence){
            confidence = *class_confidence;
            label      = i;
        }
    }

    confidence *= objectness;
    if(confidence < confidence_threshold)
        return;

    int index = atomicAdd(parray, 1);
    if(index >= max_objects)
        return;

    float cx         = *pitem++;
    float cy         = *pitem++;
    float width      = *pitem++;
    float height     = *pitem++;
    float left   = cx - width * 0.5f;
    float top    = cy - height * 0.5f;
    float right  = cx + width * 0.5f;
    float bottom = cy + height * 0.5f;
    // affine_project(invert_affine_matrix, left,  top,    &left,  &top);
    // affine_project(invert_affine_matrix, right, bottom, &right, &bottom);

    // left, top, right, bottom, confidence, class, keepflag
    float* pout_item = parray + 1 + index * NUM_BOX_ELEMENT;
    *pout_item++ = left;
    *pout_item++ = top;
    *pout_item++ = right;
    *pout_item++ = bottom;
    *pout_item++ = confidence;
    *pout_item++ = label;
    *pout_item++ = 1; // 1 = keep, 0 = ignore
}

5. The NMS kernel

Launch the NMS kernel:

  • Launch max_objects threads, one per possible output box:
    block = max_objects > 512 ? 512 : max_objects;
    grid = (max_objects + block - 1) / block;
    fast_nms_kernel<<<grid, block, 0, stream>>>(parray, max_objects, nms_threshold, NUM_BOX_ELEMENT);
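
Putting the two launches together, decode_kernel_invoker (called in step 3 but never shown in full) plausibly looks like this; a sketch assembled from the pieces above, not the article's verbatim code:

static void decode_kernel_invoker(
    float* predict, int num_bboxes, int num_classes, float confidence_threshold,
    float nms_threshold, float* invert_affine_matrix, float* parray,
    int max_objects, int NUM_BOX_ELEMENT, cudaStream_t stream
){
    // one thread per candidate box
    auto block = num_bboxes > 512 ? 512 : num_bboxes;
    auto grid  = (num_bboxes + block - 1) / block;
    decode_kernel<<<grid, block, 0, stream>>>(
        predict, num_bboxes, num_classes, confidence_threshold,
        invert_affine_matrix, parray, max_objects, NUM_BOX_ELEMENT
    );

    // one thread per possible output box
    block = max_objects > 512 ? 512 : max_objects;
    grid  = (max_objects + block - 1) / block;
    fast_nms_kernel<<<grid, block, 0, stream>>>(parray, max_objects, nms_threshold, NUM_BOX_ELEMENT);
}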

6. Implementing the NMS kernel

  • Each box independently compares itself against every other box to decide whether it survives NMS; a box yields only to a same-class, overlapping box with higher confidence, and on a confidence tie the index decides, so exactly one of the pair survives rather than both suppressing each other.
  • Because the comparisons run in parallel and the if conditions filter out most pairs early, this is very fast.
static __global__ void fast_nms_kernel(float* bboxes, int max_objects, float threshold, int NUM_BOX_ELEMENT){

    int position = (blockDim.x * blockIdx.x + threadIdx.x);
    int count = min((int)*bboxes, max_objects);
    if (position >= count) 
        return;
    
    // left, top, right, bottom, confidence, class, keepflag
    float* pcurrent = bboxes + 1 + position * NUM_BOX_ELEMENT;
    for(int i = 0; i < count; ++i){
        float* pitem = bboxes + 1 + i * NUM_BOX_ELEMENT;
        if(i == position || pcurrent[5] != pitem[5]) continue;

        if(pitem[4] >= pcurrent[4]){
            if(pitem[4] == pcurrent[4] && i < position)
                continue;

            float iou = box_iou(
                pcurrent[0], pcurrent[1], pcurrent[2], pcurrent[3],
                pitem[0],    pitem[1],    pitem[2],    pitem[3]
            );

            if(iou > threshold){
                pcurrent[6] = 0;  // 1=keep, 0=ignore
                return;
            }
        }
    }
} 
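
box_iou is a __device__ helper the excerpt does not include; a minimal sketch that mirrors the CPU IoU lambda above (an assumed implementation):

static __device__ float box_iou(
    float aleft, float atop, float aright, float abottom,
    float bleft, float btop, float bright, float bbottom
){
    // intersection rectangle
    float cleft   = max(aleft, bleft);
    float ctop    = max(atop, btop);
    float cright  = min(aright, bright);
    float cbottom = min(abottom, bbottom);

    float c_area = max(cright - cleft, 0.0f) * max(cbottom - ctop, 0.0f);
    if(c_area == 0.0f) return 0.0f;

    float a_area = max(0.0f, aright - aleft) * max(0.0f, abottom - atop);
    float b_area = max(0.0f, bright - bleft) * max(0.0f, bbottom - btop);
    return c_area / (a_area + b_area - c_area);  // intersection over union
}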

7. Collecting the detection results

Copy the output back to the host. Note that the count at output_host[0] can exceed max_objects (atomicAdd keeps incrementing past the cap), so it is clamped with min, and only boxes whose keepflag survived NMS are collected:

    checkRuntime(cudaMemcpyAsync(output_host, output_device, 
        sizeof(int) + max_objects * NUM_BOX_ELEMENT * sizeof(float), 
        cudaMemcpyDeviceToHost, stream
    ));
    checkRuntime(cudaStreamSynchronize(stream));

    int num_boxes = min((int)output_host[0], max_objects);
    for(int i = 0; i < num_boxes; ++i){
        float* ptr = output_host + 1 + NUM_BOX_ELEMENT * i;
        int keep_flag = ptr[6];
        if(keep_flag){
            box_result.emplace_back(
                ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], (int)ptr[5]
            );
        }
    }
    checkRuntime(cudaStreamDestroy(stream));
    checkRuntime(cudaFree(predict_device));
    checkRuntime(cudaFree(output_device));
    checkRuntime(cudaFreeHost(output_host));
    return box_result;