赞
踩
https://github.com/Keylost/BilinearImageResize
YOLOv3中处理一张1080P的图片,resize到输入416*416尺寸,调用内部接口做cpu resize,可能80%~90%的时间耗在图像解码、resize上,对比推理时间耗时严重。尝试用cuda做外部resize。
修改下工程用于Ubuntu16.04,1080ti显卡,提供个包其中需要cmakelist修改下opencv路径。
https://pan.baidu.com/s/10RC1Lvxt4FFg5bsbrtnX8w
resizeGPU.cu
-
- #include "resizeGPU.cuh"
- //#define _DEBUG
-
- #define BLOCK_DIM 64
- #define threadNum 1024
- #define WARP_SIZE 32
- #define elemsPerThread 1
-
- int32_t* deviceDataResized; //отмасштабированное изображение в памяти GPU
- int32_t* deviceData; //оригинальное изображение в памяти GPU
- int32_t* hostOriginalImage;
- int32_t* hostResizedImage;
-
- void reAllocPinned(int w, int h, int w2, int h2, int32_t* dataSource)
- {
- cudaMallocHost((void**)&hostOriginalImage, w*h* sizeof(int32_t)); // host pinned
- cudaMallocHost((void**)&hostResizedImage, w2*h2 * sizeof(int32_t)); // host pinned
- memcpy(hostOriginalImage, dataSource, w*h * sizeof(int32_t));
-
- return;
- }
-
- void freePinned()
- {
- cudaFreeHost(hostOriginalImage);
- cudaFreeHost(hostResizedImage);
-
- return;
- }
-
- void initGPU(const int maxResolutionX, const int maxResolutionY)
- {
- cudaMalloc((void**)&deviceDataResized, maxResolutionX*maxResolutionY * sizeof(int32_t));
- cudaMalloc((void**)&deviceData, maxResolutionX*maxResolutionY * sizeof(int32_t));
-
- return;
- }
-
- void deinitGPU()
- {
- cudaFree(deviceData);
- cudaFree(deviceDataResized);
-
- return;
- }
-
- __global__ void SomeKernel(int32_t* originalImage, int32_t* resizedImage, int w, int h, int w2, int h2/*, float x_ratio, float y_ratio*/)
- {
- __shared__ int32_t tile[1024];
- const float x_ratio = ((float)(w - 1)) / w2;
- const float y_ratio = ((float)(h - 1)) / h2;
- //const int blockbx = blockIdx.y * w2 + blockIdx.x*BLOCK_DIM;
- //unsigned int threadId = blockIdx.x * threadNum*elemsPerThread + threadIdx.x;
- unsigned int threadId = blockIdx.x * threadNum*elemsPerThread + threadIdx.x*elemsPerThread;
- //__shared__ float result[threadNum*elemsPerThread];
- unsigned int shift = 0;
- //int32_t a, b, c, d, x, y, index;
- while((threadId < w2*h2 && shift<elemsPerThread))
- {
- const int32_t i = threadId / w2;
- const int32_t j = threadId - (i*w2);
- //float x_diff, y_diff, blue, red, green;
-
- const int32_t x = (int)(x_ratio * j);
- const int32_t y = (int)(y_ratio * i);
- const float x_diff = (x_ratio * j) - x;
- const float y_diff = (y_ratio * i) - y;
- const int32_t index = (y*w + x);
- const int32_t a = originalImage[index];
- const int32_t b = originalImage[index + 1];
- const int32_t c = originalImage[index + w];
- const int32_t d = originalImage[index + w + 1];
- // blue element
- // Yb = Ab(1-w)(1-h) + Bb(w)(1-h) + Cb(h)(1-w) + Db(wh)
- const float blue = (a & 0xff)*(1 - x_diff)*(1 - y_diff) + (b & 0xff)*(x_diff)*(1 - y_diff) +
- (c & 0xff)*(y_diff)*(1 - x_diff) + (d & 0xff)*(x_diff*y_diff);
-
- // green element
- // Yg = Ag(1-w)(1-h) + Bg(w)(1-h) + Cg(h)(1-w) + Dg(wh)
- const float green = ((a >> 8) & 0xff)*(1 - x_diff)*(1 - y_diff) + ((b >> 8) & 0xff)*(x_diff)*(1 - y_diff) +
- ((c >> 8) & 0xff)*(y_diff)*(1 - x_diff) + ((d >> 8) & 0xff)*(x_diff*y_diff);
- // red element
- // Yr = Ar(1-w)(1-h) + Br(w)(1-h) + Cr(h)(1-w) + Dr(wh)
- const float red = ((a >> 16) & 0xff)*(1 - x_diff)*(1 - y_diff) + ((b >> 16) & 0xff)*(x_diff)*(1 - y_diff) +
- ((c >> 16) & 0xff)*(y_diff)*(1 - x_diff) + ((d >> 16) & 0xff)*(x_diff*y_diff);
- /*
- resizedImage[threadId] =
- 0xff000000 |
- ((((int32_t)red) << 16) & 0xff0000) |
- ((((int32_t)green) << 8) & 0xff00) |
- ((int32_t)blue);
- */
- tile[threadIdx.x] =
- 0xff000000 |
- ((((int32_t)red) << 16) & 0xff0000) |
- ((((int32_t)green) << 8) & 0xff00) |
- ((int32_t)blue);
-
- threadId++;
- //threadId+= WARP_SIZE;
- shift++;
- }
-
- __syncthreads();
- threadId = blockIdx.x * threadNum*elemsPerThread + threadIdx.x*elemsPerThread;
- resizedImage[threadId] = tile[threadIdx.x];
- /*
- shift--;
- threadId = blockIdx.x * threadNum*elemsPerThread + threadIdx.x*elemsPerThread+ shift;
-
- while (shift >= 0)
- {
- resizedImage[threadId] = tile[shift];
- shift--;
- threadId--;
- }
- */
- }
-
-
-
- int32_t* resizeBilinear_gpu(int w, int h, int w2, int h2)
- {
- #ifdef _DEBUG
- cudaError_t error; //store cuda error codes
- #endif
- int length = w2 * h2;
-
- // Копирование исходных данных в GPU для обработки
- cudaMemcpy(deviceData, hostOriginalImage, w*h * sizeof(int32_t), cudaMemcpyHostToDevice);
- //cudaMemcpy2D(deviceData, w * sizeof(int32_t), hostOriginalImage, w * sizeof(int32_t), w * sizeof(int32_t), h, cudaMemcpyHostToDevice);
- //error = cudaMemcpyToSymbol(deviceData, pixels, w*h * sizeof(int32_t),0, cudaMemcpyHostToDevice);
- #ifdef _DEBUG
- if (error != cudaSuccess)
- {
- printf("cudaMemcpy (pixels->deviceData), returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
- exit(EXIT_FAILURE);
- }
- #endif
-
- dim3 threads = dim3(threadNum, 1,1); //block size 32,32,x
- dim3 blocks = dim3(w2*h2/ threadNum*elemsPerThread, 1,1);
- //printf("Blockdim.x %d\n", blocks.x);
- //printf("thrdim.x %d\n", threads.x);
-
- // Запуск ядра из (length / 256) блоков по 256 потоков,
- // предполагая, что length кратно 256
- SomeKernel << <blocks, threads >> >(deviceData, deviceDataResized, w, h, w2, h2/*, x_ratio, y_ratio*/);
-
- cudaDeviceSynchronize();
- // Считывание результата из GPU
- cudaMemcpy(hostResizedImage, deviceDataResized, length * sizeof(int32_t), cudaMemcpyDeviceToHost);
-
- return hostResizedImage;
- }
converter.cpp
- #include "converter.hpp"
-
- int32_t* cvtMat2Int32(const cv::Mat& srcImage)
- {
- int32_t *result = new int32_t[srcImage.cols*srcImage.rows];
- int offset = 0;
-
- for (int i = 0; i<srcImage.cols*srcImage.rows * 3; i += 3)
- {
- int32_t blue = srcImage.data[i];
- int32_t green = srcImage.data[i + 1];
- int32_t red = srcImage.data[i + 2];
- result[offset++] =
- 0xff000000 |
- ((((int32_t)red) << 16) & 0xff0000) |
- ((((int32_t)green) << 8) & 0xff00) |
- ((int32_t)blue);
- }
-
- return result;
- }
-
- void cvtInt322Mat(int32_t *pxArray, cv::Mat& outImage)
- {
- int offset = 0;
- for (int i = 0; i<outImage.cols*outImage.rows * 3; i += 3)
- {
- int32_t a = pxArray[offset++];
- int32_t blue = a & 0xff;
- int32_t green = ((a >> 8) & 0xff);
- int32_t red = ((a >> 16) & 0xff);
- outImage.data[i] = blue;
- outImage.data[i + 1] = green;
- outImage.data[i + 2] = red;
- }
- return;
- }
resizeCPU.cpp
-
- #include "resizeCPU.hpp"
-
- int* resizeBilinear_cpu(int32_t* pixels, int w, int h, int w2, int h2)
- {
- int32_t* temp = new int32_t[w2*h2];
- int32_t a, b, c, d, x, y, index;
- float x_ratio = ((float)(w - 1)) / w2;
- float y_ratio = ((float)(h - 1)) / h2;
- float x_diff, y_diff, blue, red, green;
- int offset = 0;
- for (int i = 0; i<h2; i++)
- {
- for (int j = 0; j<w2; j++)
- {
- x = (int)(x_ratio * j);
- y = (int)(y_ratio * i);
- x_diff = (x_ratio * j) - x;
- y_diff = (y_ratio * i) - y;
- index = (y*w + x);
- a = pixels[index];
- b = pixels[index + 1];
- c = pixels[index + w];
- d = pixels[index + w + 1];
-
- // blue element
- // Yb = Ab(1-w)(1-h) + Bb(w)(1-h) + Cb(h)(1-w) + Db(wh)
- blue = (a & 0xff)*(1 - x_diff)*(1 - y_diff) + (b & 0xff)*(x_diff)*(1 - y_diff) +
- (c & 0xff)*(y_diff)*(1 - x_diff) + (d & 0xff)*(x_diff*y_diff);
-
- // green element
- // Yg = Ag(1-w)(1-h) + Bg(w)(1-h) + Cg(h)(1-w) + Dg(wh)
- green = ((a >> 8) & 0xff)*(1 - x_diff)*(1 - y_diff) + ((b >> 8) & 0xff)*(x_diff)*(1 - y_diff) +
- ((c >> 8) & 0xff)*(y_diff)*(1 - x_diff) + ((d >> 8) & 0xff)*(x_diff*y_diff);
- // red element
- // Yr = Ar(1-w)(1-h) + Br(w)(1-h) + Cr(h)(1-w) + Dr(wh)
- red = ((a >> 16) & 0xff)*(1 - x_diff)*(1 - y_diff) + ((b >> 16) & 0xff)*(x_diff)*(1 - y_diff) +
- ((c >> 16) & 0xff)*(y_diff)*(1 - x_diff) + ((d >> 16) & 0xff)*(x_diff*y_diff);
- temp[offset++] =
- 0xff000000 |
- ((((int32_t)red) << 16) & 0xff0000) |
- ((((int32_t)green) << 8) & 0xff00) |
- ((int32_t)blue);
- }
- }
- return temp;
- }
对比下结果,在1080ti下,resize 1080P图片到416*416尺寸,cuda resize 1.6ms,cpu resize 3.8ms,darknet内部接口cpu resize 8.0ms。cpu resize相比darknet resize 接口主要是移位操作有提速,cuda resize处理时间减少很多,但是需要做数据类型Mat与Int32相互转换。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。