赞
踩
核心代码(NPP 实现):
// 2. npp 图像预处理 bool keepRation = 0 ,keepCenter= 0; int width_in = img.cols; int height_in = img.rows; NppiSize srcSize = {width_in, height_in}; NppiRect srcROI = {0, 0, width_in, height_in}; int dst_width = inputDim.d[2]; int dst_height = inputDim.d[1]; NppiSize dstSize = {dst_width, dst_height}; NppiRect dstROI = {0, 0, dst_width, dst_height}; int bgr2rgb[3] = {2, 1, 0}; Npp32f m_scale[3] = {0.003921569, 0.003921569, 0.003921569}; Npp32f* r_plane = (Npp32f*)(mCudaBuffers[0]); Npp32f* g_plane = (Npp32f*)(mCudaBuffers[0] + dst_width*dst_height*sizeof(float) ); Npp32f* b_plane = (Npp32f*)(mCudaBuffers[0] + dst_width*dst_height*2*sizeof(float) ); Npp32f* dst_planes[3] = {r_plane, g_plane, b_plane}; CUDA_CHECK(cudaMemcpy(mCudaImg, img.data, img.step[0]*img.rows, cudaMemcpyHostToDevice)); nppiResize_8u_C3R( (Npp8u*)mCudaImg, width_in * 3, srcSize, srcROI, (Npp8u*)gpu_img_resize_buf, dst_width * 3, dstSize, dstROI, NPPI_INTER_LINEAR); nppiConvert_8u32f_C3R( (Npp8u*)gpu_img_resize_buf, dst_width*3, (Npp32f*)gpu_img_plane, dst_width*3*sizeof(float), dstSize); // 转成32float nppiMulC_32f_C3IR(m_scale, (Npp32f*)gpu_img_plane, dst_width*3*sizeof(float), dstSize); // 每个通道 × scale nppiCopy_32f_C3P3R( (Npp32f*)gpu_img_plane, dst_width*3*sizeof(float), dst_planes, dst_width*sizeof(float), dstSize );
NPP 实现耗时(每次输出为截至当前的累计平均值):
cost: 113 ms cost: 68 ms cost: 52.6667 ms cost: 45.5 ms cost: 41 ms cost: 37.8333 ms cost: 35.7143 ms cost: 34 ms cost: 32.7778 ms cost: 31.6 ms cost: 30.8182 ms cost: 30.0833 ms cost: 29.6154 ms cost: 29.0714 ms cost: 28.6667 ms cost: 28.3125 ms cost: 27.8824 ms cost: 27.6111 ms cost: 27.3158 ms cost: 27.05 ms cost: 26.8095 ms cost: 26.5455 ms cost: 26.3478 ms cost: 26.125 ms cost: 25.96 ms cost: 25.8077 ms cost: 25.6667 ms cost: 25.5 ms cost: 25.3793 ms cost: 25.2333 ms cost: 25.129 ms cost: 25 ms cost: 24.9091 ms cost: 24.7941 ms cost: 24.7143 ms cost: 24.5833 ms cost: 24.5135 ms cost: 24.3947 ms cost: 24.3077 ms cost: 24.2 ms cost: 24.0976 ms
核心代码(自己写的 CUDA 核函数实现):
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <resize.h>
#include <stdio.h>

// Reads one source pixel and returns it scaled to [0, 1].
// Coordinates outside [0, w) x [0, h) yield the constant fill value 0.5 for all
// three channels. `src` is indexed as a tightly packed row-major image (y * w + x).
__forceinline__ __device__ float3 get(uchar3* src, int x, int y, int w, int h) {
    if (x < 0 || x >= w || y < 0 || y >= h) {
        return make_float3(0.5f, 0.5f, 0.5f);
    }
    const uchar3 px = src[y * w + x];
    return make_float3(float(px.x) / 255.0f, float(px.y) / 255.0f, float(px.z) / 255.0f);
}

// Bilinear resize + normalization, packed uchar3 -> planar float.
//
// Launch layout: 1-D grid of 1-D blocks, one thread per destination pixel; threads whose
// flat index is >= dstW * dstH exit immediately. No shared memory is used.
//
// src            packed 3-channel 8-bit image, srcW x srcH, tightly packed rows
// dst            planar output, 3 planes of dstW * dstH floats each; source channel k -> plane k
// scaleX/scaleY  destination -> source coordinate scale
// shiftX/shiftY  offset (in destination pixels) of the image inside the output
__global__ void resizeNormKernel(uchar3* src, float* dst, int dstW, int dstH, int srcW, int srcH,
                                 float scaleX, float scaleY, float shiftX, float shiftY) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= dstW * dstH) return;
    const int x = idx % dstW;
    const int y = idx / dstW;

    // Inverse mapping from destination pixel centre to source coordinates
    // (same half-pixel convention as OpenCV).
    const float sx = (x - shiftX + 0.5f) * scaleX - 0.5f;
    const float sy = (y - shiftY + 0.5f) * scaleY - 0.5f;

    // floorf, not a plain int cast: a cast truncates toward zero, which for negative
    // coordinates selects the wrong neighbour and produces negative blend weights.
    const int y_low = (int)floorf(sy);
    const int x_low = (int)floorf(sx);
    const int y_high = y_low + 1;
    const int x_high = x_low + 1;

    const float ly = sy - y_low;
    const float lx = sx - x_low;
    const float hy = 1.0f - ly;
    const float hx = 1.0f - lx;
    const float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

    const float3 v1 = get(src, x_low, y_low, srcW, srcH);
    const float3 v2 = get(src, x_high, y_low, srcW, srcH);
    const float3 v3 = get(src, x_low, y_high, srcW, srcH);
    const float3 v4 = get(src, x_high, y_high, srcW, srcH);

    const int stride = dstW * dstH;  // elements per output plane
    const int out = y * dstW + x;
    dst[out]              = w1 * v1.x + w2 * v2.x + w3 * v3.x + w4 * v4.x;
    dst[stride + out]     = w1 * v1.y + w2 * v2.y + w3 * v3.y + w4 * v4.y;
    dst[stride * 2 + out] = w1 * v1.z + w2 * v2.z + w3 * v3.z + w4 * v4.z;
}

// Enqueues the resize + normalize kernel on `stream` (asynchronous; no synchronization here).
//
// p           device pointer to the packed 3-channel 8-bit source image (w x h)
// d           device pointer to the planar float output (3 * in_w * in_h floats)
// w, h        SOURCE image width / height
// in_w, in_h  OUTPUT (network input) width / height
// keepration  use one common scale (the larger of the two axis scales) so the aspect
//             ratio is preserved; uncovered output pixels receive the 0.5 fill value
// keepcenter  with keepration, centre the image in the output instead of anchoring it
//             at the top-left corner
//
// Returns 0 on success, otherwise a cudaError_t value cast to int
// (cudaErrorInvalidValue for null pointers or non-positive sizes, or the launch error).
int resizeAndNorm(void* p, float* d, int w, int h, int in_w, int in_h,
                  bool keepration, bool keepcenter, cudaStream_t stream) {
    // Reject inputs that would divide by zero below or produce an empty grid.
    if (p == nullptr || d == nullptr || w <= 0 || h <= 0 || in_w <= 0 || in_h <= 0) {
        return (int)cudaErrorInvalidValue;
    }

    float scaleX = (w * 1.0f / in_w);
    float scaleY = (h * 1.0f / in_h);
    float shiftX = 0.f, shiftY = 0.f;
    if (keepration) {
        scaleX = scaleY = scaleX > scaleY ? scaleX : scaleY;
    }
    if (keepration && keepcenter) {
        shiftX = (in_w - w / scaleX) / 2.f;
        shiftY = (in_h - h / scaleY) / 2.f;
    }

    const int n = in_w * in_h;  // one thread per output pixel
    const int blockSize = 256;
    const int gridSize = (n + blockSize - 1) / blockSize;
    resizeNormKernel<<<gridSize, blockSize, 0, stream>>>((uchar3*)(p), d, in_w, in_h, w, h,
                                                         scaleX, scaleY, shiftX, shiftY);
    // Kernel launches do not return a status; surface bad-configuration errors here.
    return (int)cudaGetLastError();
}
自己写的 CUDA 函数耗时(每次输出为截至当前的累计平均值):
cost: 21 ms cost: 22 ms cost: 22 ms cost: 22.75 ms cost: 22.8 ms cost: 23.1667 ms cost: 23.1429 ms cost: 23 ms cost: 23.1111 ms cost: 22.9 ms cost: 23 ms cost: 22.9167 ms cost: 23.1538 ms cost: 23.1429 ms cost: 23 ms cost: 22.875 ms cost: 22.7647 ms cost: 22.6667 ms cost: 22.6316 ms cost: 22.55 ms cost: 22.5714 ms cost: 22.5 ms cost: 22.5217 ms cost: 22.4583 ms cost: 22.48 ms cost: 22.4231 ms cost: 22.4444 ms cost: 22.3929 ms cost: 22.4138 ms cost: 22.3667 ms cost: 22.3871 ms cost: 22.3438 ms cost: 22.3636 ms cost: 22.3235 ms cost: 22.3143 ms cost: 22.25 ms cost: 22.2162 ms cost: 22.1579 ms cost: 22.1538 ms cost: 22.1 ms cost: 22.0732 ms
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。