当前位置:   article > 正文

预处理优化——cuda bilinear resize_cuda resize

cuda resize

https://github.com/Keylost/BilinearImageResize

YOLOv3中处理一张1080P的图片,resize到输入416*416尺寸,调用内部接口做cpu resize,可能80%~90%的时间耗在图像解码、resize上,对比推理时间耗时严重。尝试用cuda做外部resize。

修改下工程用于Ubuntu16.04,1080ti显卡,提供个包其中需要cmakelist修改下opencv路径。

https://pan.baidu.com/s/10RC1Lvxt4FFg5bsbrtnX8w

resizeGPU.cu

  1. #include "resizeGPU.cuh"
  2. //#define _DEBUG
  3. #define BLOCK_DIM 64
  4. #define threadNum 1024
  5. #define WARP_SIZE 32
  6. #define elemsPerThread 1
  7. int32_t* deviceDataResized; //отмасштабированное изображение в памяти GPU
  8. int32_t* deviceData; //оригинальное изображение в памяти GPU
  9. int32_t* hostOriginalImage;
  10. int32_t* hostResizedImage;
  11. void reAllocPinned(int w, int h, int w2, int h2, int32_t* dataSource)
  12. {
  13. cudaMallocHost((void**)&hostOriginalImage, w*h* sizeof(int32_t)); // host pinned
  14. cudaMallocHost((void**)&hostResizedImage, w2*h2 * sizeof(int32_t)); // host pinned
  15. memcpy(hostOriginalImage, dataSource, w*h * sizeof(int32_t));
  16. return;
  17. }
  18. void freePinned()
  19. {
  20. cudaFreeHost(hostOriginalImage);
  21. cudaFreeHost(hostResizedImage);
  22. return;
  23. }
  24. void initGPU(const int maxResolutionX, const int maxResolutionY)
  25. {
  26. cudaMalloc((void**)&deviceDataResized, maxResolutionX*maxResolutionY * sizeof(int32_t));
  27. cudaMalloc((void**)&deviceData, maxResolutionX*maxResolutionY * sizeof(int32_t));
  28. return;
  29. }
  30. void deinitGPU()
  31. {
  32. cudaFree(deviceData);
  33. cudaFree(deviceDataResized);
  34. return;
  35. }
  36. __global__ void SomeKernel(int32_t* originalImage, int32_t* resizedImage, int w, int h, int w2, int h2/*, float x_ratio, float y_ratio*/)
  37. {
  38. __shared__ int32_t tile[1024];
  39. const float x_ratio = ((float)(w - 1)) / w2;
  40. const float y_ratio = ((float)(h - 1)) / h2;
  41. //const int blockbx = blockIdx.y * w2 + blockIdx.x*BLOCK_DIM;
  42. //unsigned int threadId = blockIdx.x * threadNum*elemsPerThread + threadIdx.x;
  43. unsigned int threadId = blockIdx.x * threadNum*elemsPerThread + threadIdx.x*elemsPerThread;
  44. //__shared__ float result[threadNum*elemsPerThread];
  45. unsigned int shift = 0;
  46. //int32_t a, b, c, d, x, y, index;
  47. while((threadId < w2*h2 && shift<elemsPerThread))
  48. {
  49. const int32_t i = threadId / w2;
  50. const int32_t j = threadId - (i*w2);
  51. //float x_diff, y_diff, blue, red, green;
  52. const int32_t x = (int)(x_ratio * j);
  53. const int32_t y = (int)(y_ratio * i);
  54. const float x_diff = (x_ratio * j) - x;
  55. const float y_diff = (y_ratio * i) - y;
  56. const int32_t index = (y*w + x);
  57. const int32_t a = originalImage[index];
  58. const int32_t b = originalImage[index + 1];
  59. const int32_t c = originalImage[index + w];
  60. const int32_t d = originalImage[index + w + 1];
  61. // blue element
  62. // Yb = Ab(1-w)(1-h) + Bb(w)(1-h) + Cb(h)(1-w) + Db(wh)
  63. const float blue = (a & 0xff)*(1 - x_diff)*(1 - y_diff) + (b & 0xff)*(x_diff)*(1 - y_diff) +
  64. (c & 0xff)*(y_diff)*(1 - x_diff) + (d & 0xff)*(x_diff*y_diff);
  65. // green element
  66. // Yg = Ag(1-w)(1-h) + Bg(w)(1-h) + Cg(h)(1-w) + Dg(wh)
  67. const float green = ((a >> 8) & 0xff)*(1 - x_diff)*(1 - y_diff) + ((b >> 8) & 0xff)*(x_diff)*(1 - y_diff) +
  68. ((c >> 8) & 0xff)*(y_diff)*(1 - x_diff) + ((d >> 8) & 0xff)*(x_diff*y_diff);
  69. // red element
  70. // Yr = Ar(1-w)(1-h) + Br(w)(1-h) + Cr(h)(1-w) + Dr(wh)
  71. const float red = ((a >> 16) & 0xff)*(1 - x_diff)*(1 - y_diff) + ((b >> 16) & 0xff)*(x_diff)*(1 - y_diff) +
  72. ((c >> 16) & 0xff)*(y_diff)*(1 - x_diff) + ((d >> 16) & 0xff)*(x_diff*y_diff);
  73. /*
  74. resizedImage[threadId] =
  75. 0xff000000 |
  76. ((((int32_t)red) << 16) & 0xff0000) |
  77. ((((int32_t)green) << 8) & 0xff00) |
  78. ((int32_t)blue);
  79. */
  80. tile[threadIdx.x] =
  81. 0xff000000 |
  82. ((((int32_t)red) << 16) & 0xff0000) |
  83. ((((int32_t)green) << 8) & 0xff00) |
  84. ((int32_t)blue);
  85. threadId++;
  86. //threadId+= WARP_SIZE;
  87. shift++;
  88. }
  89. __syncthreads();
  90. threadId = blockIdx.x * threadNum*elemsPerThread + threadIdx.x*elemsPerThread;
  91. resizedImage[threadId] = tile[threadIdx.x];
  92. /*
  93. shift--;
  94. threadId = blockIdx.x * threadNum*elemsPerThread + threadIdx.x*elemsPerThread+ shift;
  95. while (shift >= 0)
  96. {
  97. resizedImage[threadId] = tile[shift];
  98. shift--;
  99. threadId--;
  100. }
  101. */
  102. }
  103. int32_t* resizeBilinear_gpu(int w, int h, int w2, int h2)
  104. {
  105. #ifdef _DEBUG
  106. cudaError_t error; //store cuda error codes
  107. #endif
  108. int length = w2 * h2;
  109. // Копирование исходных данных в GPU для обработки
  110. cudaMemcpy(deviceData, hostOriginalImage, w*h * sizeof(int32_t), cudaMemcpyHostToDevice);
  111. //cudaMemcpy2D(deviceData, w * sizeof(int32_t), hostOriginalImage, w * sizeof(int32_t), w * sizeof(int32_t), h, cudaMemcpyHostToDevice);
  112. //error = cudaMemcpyToSymbol(deviceData, pixels, w*h * sizeof(int32_t),0, cudaMemcpyHostToDevice);
  113. #ifdef _DEBUG
  114. if (error != cudaSuccess)
  115. {
  116. printf("cudaMemcpy (pixels->deviceData), returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
  117. exit(EXIT_FAILURE);
  118. }
  119. #endif
  120. dim3 threads = dim3(threadNum, 1,1); //block size 32,32,x
  121. dim3 blocks = dim3(w2*h2/ threadNum*elemsPerThread, 1,1);
  122. //printf("Blockdim.x %d\n", blocks.x);
  123. //printf("thrdim.x %d\n", threads.x);
  124. // Запуск ядра из (length / 256) блоков по 256 потоков,
  125. // предполагая, что length кратно 256
  126. SomeKernel << <blocks, threads >> >(deviceData, deviceDataResized, w, h, w2, h2/*, x_ratio, y_ratio*/);
  127. cudaDeviceSynchronize();
  128. // Считывание результата из GPU
  129. cudaMemcpy(hostResizedImage, deviceDataResized, length * sizeof(int32_t), cudaMemcpyDeviceToHost);
  130. return hostResizedImage;
  131. }

converter.cpp

  1. #include "converter.hpp"
  2. int32_t* cvtMat2Int32(const cv::Mat& srcImage)
  3. {
  4. int32_t *result = new int32_t[srcImage.cols*srcImage.rows];
  5. int offset = 0;
  6. for (int i = 0; i<srcImage.cols*srcImage.rows * 3; i += 3)
  7. {
  8. int32_t blue = srcImage.data[i];
  9. int32_t green = srcImage.data[i + 1];
  10. int32_t red = srcImage.data[i + 2];
  11. result[offset++] =
  12. 0xff000000 |
  13. ((((int32_t)red) << 16) & 0xff0000) |
  14. ((((int32_t)green) << 8) & 0xff00) |
  15. ((int32_t)blue);
  16. }
  17. return result;
  18. }
  19. void cvtInt322Mat(int32_t *pxArray, cv::Mat& outImage)
  20. {
  21. int offset = 0;
  22. for (int i = 0; i<outImage.cols*outImage.rows * 3; i += 3)
  23. {
  24. int32_t a = pxArray[offset++];
  25. int32_t blue = a & 0xff;
  26. int32_t green = ((a >> 8) & 0xff);
  27. int32_t red = ((a >> 16) & 0xff);
  28. outImage.data[i] = blue;
  29. outImage.data[i + 1] = green;
  30. outImage.data[i + 2] = red;
  31. }
  32. return;
  33. }

resizeCPU.cpp

  1. #include "resizeCPU.hpp"
  2. int* resizeBilinear_cpu(int32_t* pixels, int w, int h, int w2, int h2)
  3. {
  4. int32_t* temp = new int32_t[w2*h2];
  5. int32_t a, b, c, d, x, y, index;
  6. float x_ratio = ((float)(w - 1)) / w2;
  7. float y_ratio = ((float)(h - 1)) / h2;
  8. float x_diff, y_diff, blue, red, green;
  9. int offset = 0;
  10. for (int i = 0; i<h2; i++)
  11. {
  12. for (int j = 0; j<w2; j++)
  13. {
  14. x = (int)(x_ratio * j);
  15. y = (int)(y_ratio * i);
  16. x_diff = (x_ratio * j) - x;
  17. y_diff = (y_ratio * i) - y;
  18. index = (y*w + x);
  19. a = pixels[index];
  20. b = pixels[index + 1];
  21. c = pixels[index + w];
  22. d = pixels[index + w + 1];
  23. // blue element
  24. // Yb = Ab(1-w)(1-h) + Bb(w)(1-h) + Cb(h)(1-w) + Db(wh)
  25. blue = (a & 0xff)*(1 - x_diff)*(1 - y_diff) + (b & 0xff)*(x_diff)*(1 - y_diff) +
  26. (c & 0xff)*(y_diff)*(1 - x_diff) + (d & 0xff)*(x_diff*y_diff);
  27. // green element
  28. // Yg = Ag(1-w)(1-h) + Bg(w)(1-h) + Cg(h)(1-w) + Dg(wh)
  29. green = ((a >> 8) & 0xff)*(1 - x_diff)*(1 - y_diff) + ((b >> 8) & 0xff)*(x_diff)*(1 - y_diff) +
  30. ((c >> 8) & 0xff)*(y_diff)*(1 - x_diff) + ((d >> 8) & 0xff)*(x_diff*y_diff);
  31. // red element
  32. // Yr = Ar(1-w)(1-h) + Br(w)(1-h) + Cr(h)(1-w) + Dr(wh)
  33. red = ((a >> 16) & 0xff)*(1 - x_diff)*(1 - y_diff) + ((b >> 16) & 0xff)*(x_diff)*(1 - y_diff) +
  34. ((c >> 16) & 0xff)*(y_diff)*(1 - x_diff) + ((d >> 16) & 0xff)*(x_diff*y_diff);
  35. temp[offset++] =
  36. 0xff000000 |
  37. ((((int32_t)red) << 16) & 0xff0000) |
  38. ((((int32_t)green) << 8) & 0xff00) |
  39. ((int32_t)blue);
  40. }
  41. }
  42. return temp;
  43. }

 

对比下结果,在1080ti下,resize 1080P图片到416*416尺寸,cuda resize 1.6ms,cpu resize 3.8ms,darknet内部接口cpu resize 8.0ms。cpu resize相比darknet resize 接口主要是移位操作有提速,cuda resize处理时间减少很多,但是需要做数据类型Mat与Int32相互转换。

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/知新_RL/article/detail/432422
推荐阅读
相关标签
  

闽ICP备14008679号