赞
踩
返回类型为 vector<Box>，其中 Box 为预测框的结构体：
// One detected bounding box.
//   left, top, right, bottom : box corners as produced by the decoders
//                              (centre +/- half of width/height).
//   confidence               : objectness multiplied by the best class score.
//   label                    : index of the best-scoring class.
struct Box{
    // Default member initializers so that a default-constructed Box holds
    // zeros instead of indeterminate values.
    float left = 0.0f, top = 0.0f, right = 0.0f, bottom = 0.0f, confidence = 0.0f;
    int label = 0;

    Box() = default;

    Box(float left, float top, float right, float bottom, float confidence, int label):
        left(left), top(top), right(right), bottom(bottom), confidence(confidence), label(label){}
};
// Load the raw prediction blob and the image the boxes will later be drawn on.
auto data = load_file("predict.data");
auto image = cv::imread("input-image.jpg");
// View the loaded buffer as a flat float array
// (assumes load_file returns a contiguous byte container — TODO confirm).
float* ptr = (float*)data.data();
int nelem = data.size() / sizeof(float);
// 85 floats per row: cpu_decode reads the first 5 as cx, cy, width, height,
// objectness and treats the remaining (ncols - 5) as class scores.
int ncols = 85;
int nrows = nelem / ncols;
// Decode on the CPU using cpu_decode's default thresholds.
auto boxes = cpu_decode(ptr, nrows, ncols);
// Decode a row-major prediction buffer into boxes on the CPU and run per-class NMS.
//
// predict              : rows * cols floats; each row is cx, cy, width, height,
//                        objectness, followed by (cols - 5) class scores.
// rows, cols           : dimensions of predict.
// confidence_threshold : a row is dropped when its objectness, or
//                        objectness * best class score, is below this value.
// nms_threshold        : a lower-confidence box with the same label is
//                        suppressed when its IoU with a kept box is >= this value.
//
// Returns the kept boxes in descending confidence order. An empty vector is
// returned when predict is null, rows <= 0, or there are no class columns.
vector<Box> cpu_decode(float* predict, int rows, int cols, float confidence_threshold = 0.25f, float nms_threshold = 0.45f){

    const int num_classes = cols - 5;

    // Without at least one class column max_element would run on an empty
    // range and the following read would be out of bounds.
    if(predict == nullptr || rows <= 0 || num_classes <= 0)
        return {};

    std::vector<Box> boxes;
    for(int i = 0; i < rows; ++i){
        const float* pitem = predict + i * cols;
        const float objness = pitem[4];
        if(objness < confidence_threshold)
            continue;

        // Pick the best-scoring class for this row.
        const float* pclass = pitem + 5;
        const int label = static_cast<int>(std::max_element(pclass, pclass + num_classes) - pclass);
        const float prob = pclass[label];
        const float confidence = prob * objness;
        if(confidence < confidence_threshold)
            continue;

        // Convert centre/size to corner coordinates.
        const float cx = pitem[0];
        const float cy = pitem[1];
        const float width = pitem[2];
        const float height = pitem[3];
        const float left = cx - width * 0.5f;
        const float top = cy - height * 0.5f;
        const float right = cx + width * 0.5f;
        const float bottom = cy + height * 0.5f;
        boxes.emplace_back(left, top, right, bottom, confidence, label);
    }

    // Highest confidence first, so each kept box can only suppress the boxes after it.
    std::sort(boxes.begin(), boxes.end(), [](const Box& a, const Box& b){
        return a.confidence > b.confidence;
    });

    std::vector<bool> remove_flags(boxes.size());
    std::vector<Box> box_result;
    box_result.reserve(boxes.size());

    // Intersection-over-union of two boxes; 0 when they do not overlap or are degenerate.
    auto iou = [](const Box& a, const Box& b){
        float cross_left = std::max(a.left, b.left);
        float cross_top = std::max(a.top, b.top);
        float cross_right = std::min(a.right, b.right);
        float cross_bottom = std::min(a.bottom, b.bottom);

        float cross_area = std::max(0.0f, cross_right - cross_left) * std::max(0.0f, cross_bottom - cross_top);
        float union_area = std::max(0.0f, a.right - a.left) * std::max(0.0f, a.bottom - a.top)
                         + std::max(0.0f, b.right - b.left) * std::max(0.0f, b.bottom - b.top) - cross_area;
        if(cross_area == 0 || union_area == 0) return 0.0f;
        return cross_area / union_area;
    };

    const int num_boxes = static_cast<int>(boxes.size());
    for(int i = 0; i < num_boxes; ++i){
        // Skip boxes that an earlier, more confident box already suppressed.
        if(remove_flags[i]) continue;

        const Box& ibox = boxes[i];
        box_result.emplace_back(ibox);

        for(int j = i + 1; j < num_boxes; ++j){
            if(remove_flags[j]) continue;

            const Box& jbox = boxes[j];
            // Only boxes of the same class compete with each other.
            if(ibox.label == jbox.label){
                if(iou(ibox, jbox) >= nms_threshold)
                    remove_flags[j] = true;
            }
        }
    }
    return box_result;
}
// Overlay every box in `boxes` on the image: a green rectangle plus the
// confidence, printed in red just above the top-left corner, then save it.
for(const auto& det : boxes){
    const cv::Point top_left(det.left, det.top);
    const cv::Point bottom_right(det.right, det.bottom);
    cv::rectangle(image, top_left, bottom_right, cv::Scalar(0, 255, 0), 2);

    const auto score_text = cv::format("%.2f", det.confidence);
    const cv::Point text_origin(det.left, det.top - 7);
    cv::putText(image, score_text, text_origin, 0, 0.8, cv::Scalar(0, 0, 255), 2, 16);
}
cv::imwrite("image-draw.jpg", image);
// Create the stream that all copies and kernels below are issued on.
cudaStream_t stream = nullptr;
checkRuntime(cudaStreamCreate(&stream));

float* predict_device = nullptr;
float* output_device = nullptr;
float* output_host = nullptr;
int max_objects = 1000;
int NUM_BOX_ELEMENT = 7; // left, top, right, bottom, confidence, class, keepflag

// Input: rows * cols floats. sizeof comes first so the product is computed in size_t.
const auto predict_bytes = sizeof(float) * rows * cols;
// Output: one float holding the box count, followed by max_objects box slots.
const auto output_bytes = sizeof(float) + max_objects * NUM_BOX_ELEMENT * sizeof(float);

checkRuntime(cudaMalloc(&predict_device, predict_bytes));
checkRuntime(cudaMalloc(&output_device, output_bytes));
checkRuntime(cudaMallocHost(&output_host, output_bytes));

// cudaMalloc does not clear memory, and decode_kernel increments the count
// with atomicAdd, so the count slot must start at zero (all-zero bytes == 0.0f).
checkRuntime(cudaMemsetAsync(output_device, 0, sizeof(float), stream));
checkRuntime(cudaMemcpyAsync(predict_device, predict, predict_bytes, cudaMemcpyHostToDevice, stream));
调用cuda函数
// Run GPU decode + NMS on `stream`. cols - 5 is the number of class columns
// (each row starts with 5 box fields). nullptr is passed in the position that
// presumably maps to invert_affine_matrix — the invoker's signature is not
// shown here, confirm against its declaration.
decode_kernel_invoker(
predict_device, rows, cols - 5, confidence_threshold,
nms_threshold, nullptr, output_device, max_objects, NUM_BOX_ELEMENT, stream
);
// Launch configuration for the decode pass: at most 512 threads per block,
// with enough blocks that every candidate box gets its own thread.
// NOTE(review): block is 0 when num_bboxes == 0, so the division below would
// divide by zero — confirm callers never pass an empty prediction set.
auto block = num_bboxes > 512 ? 512 : num_bboxes;
auto grid = (num_bboxes + block - 1) / block; // round up
调用解码核函数
// Decode pass: each thread inspects one row of predict and appends the boxes
// that pass the confidence threshold to parray. No dynamic shared memory is
// requested (third launch argument is 0) and the launch is queued on `stream`.
decode_kernel<<<grid, block, 0, stream>>>(
predict, num_bboxes, num_classes, confidence_threshold,
invert_affine_matrix, parray, max_objects, NUM_BOX_ELEMENT
);
解码核函数计算类似于cpu解码,避免不必要的计算
开启总框数量个线程,25200
`int index = atomicAdd(parray, 1);` 用于统计并分配输出位置。parray 的内存布局为 [count, box1, box2, box3, …]；atomicAdd 对 count 执行 count += 1，但返回的是相加之前的 count，因此每个线程拿到的 index 互不相同。详细见 CUDA 笔记一（atomicAdd）。
当通过阈值的框数量超过 max_objects 时，多出的线程直接 return，从而以“动态数组”的方式写入结果：`if(index >= max_objects) return;`
// Decode one prediction row per thread and append surviving boxes to parray.
//
// Launch layout : 1D grid of 1D blocks with at least num_bboxes threads in
//                 total; extra threads exit immediately. No shared memory.
// predict       : num_bboxes rows of (5 + num_classes) floats:
//                 cx, cy, width, height, objectness, class scores...
// parray        : [count, box0, box1, ...]; every box occupies
//                 NUM_BOX_ELEMENT floats, of which 7 are written here:
//                 left, top, right, bottom, confidence, class, keepflag.
//                 count is kept as a float and only ever incremented, so the
//                 caller must set it to 0 before the launch. It can end up
//                 larger than max_objects; readers clamp it.
// invert_affine_matrix : currently unused — the affine_project calls below
//                 are commented out.
static __global__ void decode_kernel(
    float* predict, int num_bboxes, int num_classes, float confidence_threshold,
    float* invert_affine_matrix, float* parray, int max_objects, int NUM_BOX_ELEMENT
){
    int position = blockDim.x * blockIdx.x + threadIdx.x;
    if (position >= num_bboxes) return;

    float* pitem = predict + (5 + num_classes) * position;
    float objectness = pitem[4];
    // Cheap rejection before scanning the class scores.
    if(objectness < confidence_threshold)
        return;

    // Arg-max over the class scores.
    float* class_confidence = pitem + 5;
    float confidence = *class_confidence++;
    int label = 0;
    for(int i = 1; i < num_classes; ++i, ++class_confidence){
        if(*class_confidence > confidence){
            confidence = *class_confidence;
            label = i;
        }
    }

    confidence *= objectness;
    if(confidence < confidence_threshold)
        return;

    // Reserve an output slot: atomicAdd returns the count before the
    // increment, so every surviving thread gets a distinct index.
    int index = static_cast<int>(atomicAdd(parray, 1.0f));
    if(index >= max_objects)
        return;

    float cx = *pitem++;
    float cy = *pitem++;
    float width = *pitem++;
    float height = *pitem++;
    float left = cx - width * 0.5f;
    float top = cy - height * 0.5f;
    float right = cx + width * 0.5f;
    float bottom = cy + height * 0.5f;
    // affine_project(invert_affine_matrix, left, top, &left, &top);
    // affine_project(invert_affine_matrix, right, bottom, &right, &bottom);

    // left, top, right, bottom, confidence, class, keepflag
    float* pout_item = parray + 1 + index * NUM_BOX_ELEMENT;
    *pout_item++ = left;
    *pout_item++ = top;
    *pout_item++ = right;
    *pout_item++ = bottom;
    *pout_item++ = confidence;
    *pout_item++ = label;
    *pout_item++ = 1; // 1 = keep, 0 = ignore
}
// NMS pass: size the launch for the maximum number of stored boxes; the
// kernel itself exits for threads beyond the actual count.
// NOTE(review): block is 0 when max_objects == 0, which would divide by zero
// on the next line — confirm max_objects is always positive.
block = max_objects > 512 ? 512 : max_objects;
grid = (max_objects + block - 1) / block;
fast_nms_kernel<<<grid, block, 0, stream>>>(parray, max_objects, nms_threshold, NUM_BOX_ELEMENT);
// Per-class non-maximum suppression over the boxes stored in bboxes.
//
// Launch layout : 1D grid of 1D blocks, one thread per stored box; threads
//                 whose index is >= min(count, max_objects) exit immediately.
// bboxes        : [count, box0, box1, ...]; each box has NUM_BOX_ELEMENT
//                 floats laid out as
//                 left, top, right, bottom, confidence, class, keepflag.
// threshold     : the current box is dropped when its IoU with a stronger box
//                 of the same class is strictly greater than this value.
//
// Each thread only ever writes its own keepflag. Boxes are compared
// regardless of whether the other box has itself been suppressed.
static __global__ void fast_nms_kernel(float* bboxes, int max_objects, float threshold, int NUM_BOX_ELEMENT){

    int position = (blockDim.x * blockIdx.x + threadIdx.x);
    // The stored count may exceed max_objects, so clamp it.
    int count = min((int)*bboxes, max_objects);
    if (position >= count)
        return;

    // left, top, right, bottom, confidence, class, keepflag
    float* pcurrent = bboxes + 1 + position * NUM_BOX_ELEMENT;
    for(int i = 0; i < count; ++i){
        float* pitem = bboxes + 1 + i * NUM_BOX_ELEMENT;
        // Skip self and boxes of a different class.
        if(i == position || pcurrent[5] != pitem[5]) continue;

        if(pitem[4] >= pcurrent[4]){
            // On equal confidence only a higher-index box may suppress this
            // one, so two tied boxes never suppress each other.
            if(pitem[4] == pcurrent[4] && i < position)
                continue;

            float iou = box_iou(
                pcurrent[0], pcurrent[1], pcurrent[2], pcurrent[3],
                pitem[0],    pitem[1],    pitem[2],    pitem[3]
            );

            if(iou > threshold){
                pcurrent[6] = 0; // 1 = keep, 0 = ignore
                return;
            }
        }
    }
}
// Copy the count slot plus every box slot back to the host buffer. The count
// is stored as a float, so the size is written with sizeof(float) to match
// the allocations of output_device / output_host.
checkRuntime(cudaMemcpyAsync(
    output_host, output_device,
    sizeof(float) + max_objects * NUM_BOX_ELEMENT * sizeof(float),
    cudaMemcpyDeviceToHost, stream
));
// The copy is asynchronous; wait before reading output_host.
checkRuntime(cudaStreamSynchronize(stream));

// The device-side count can exceed max_objects, so clamp it.
int num_boxes = min((int)output_host[0], max_objects);
for(int i = 0; i < num_boxes; ++i){
    // left, top, right, bottom, confidence, class, keepflag
    float* ptr = output_host + 1 + NUM_BOX_ELEMENT * i;
    int keep_flag = ptr[6];
    if(keep_flag){
        box_result.emplace_back(
            ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], (int)ptr[5]
        );
    }
}

// Release the stream and all buffers before returning.
checkRuntime(cudaStreamDestroy(stream));
checkRuntime(cudaFree(predict_device));
checkRuntime(cudaFree(output_device));
checkRuntime(cudaFreeHost(output_host));
return box_result;
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。