当前位置:   article > 正文

基于CUDA编程二维卷积(Conv2)实现_cuda卷积算法

cuda卷积算法

最近做的CUDA C++的高性能并行作业

本人小菜鸡一个,padding,stride...,全局/共享内存啥的都没做

如果你的输入矩阵不是方阵,可自行修改输入大小的宽高,本文默认方阵宽高皆为arr_size

1. 二维卷积如下,便不多做解释(图片来自网络,侵权请联系删除)

 2. CPU串行实现代码

  // CPU reference: slides a filter_size x filter_size window over the
  // arr_size x arr_size input and stores the weighted sum of each window.
  // The filter is applied as-is (no flipping), and only positions where the
  // window fits completely inside the input are produced, so the output has
  // (arr_size - filter_size + 1) rows and columns.
  //
  //   filter      filter_size row pointers, each to filter_size floats
  //   arr         arr_size row pointers, each to arr_size floats
  //   result      row pointers receiving the output; written at [row][col]
  //   filter_size side length of the filter
  //   arr_size    side length of the input
  void Conv2(float** filter, float** arr, float** result, int filter_size, int arr_size) {
      const int out_dim = arr_size - filter_size + 1;
      for (int row = 0; row < out_dim; ++row) {
          for (int col = 0; col < out_dim; ++col) {
              float acc = 0;
              // Accumulate row by row, left to right, across the window.
              for (int fr = 0; fr < filter_size; ++fr) {
                  const float* taps = filter[fr];
                  const float* window = arr[row + fr] + col;
                  for (int fc = 0; fc < filter_size; ++fc) {
                      acc += taps[fc] * window[fc];
                  }
              }
              result[row][col] = acc;
          }
      }
  }

3.GPU并行代码

此处采用了一维数组表示二维数组的方法

  // Naive 2D sliding-window kernel: one thread produces one output element.
  //
  // Launch layout: 2D grid of 2D blocks; the x dimension indexes output
  // columns and the y dimension indexes output rows. Threads that fall outside
  // the (arr_size - filter_size + 1)^2 output area do nothing, so the grid may
  // over-cover the output. No shared memory is used.
  //
  //   filter      filter_size * filter_size weights, row-major
  //   arr         arr_size * arr_size input, row-major
  //   result      output; element (Row, Col) is stored at Row * arr_size + Col,
  //               i.e. with a row stride of arr_size (not the output width)
  //   filter_size side length of the filter
  //   arr_size    side length of the input
  __global__
  void convolution_2D_basic(const float* filter, const float* arr, float* result, int filter_size, int arr_size)
  {
      const int outDim = arr_size - filter_size + 1;
      const int Col = blockIdx.x * blockDim.x + threadIdx.x;
      const int Row = blockIdx.y * blockDim.y + threadIdx.y;
      if (Row >= outDim || Col >= outDim)
      {
          return;
      }

      // Because Row/Col < outDim, every tap (Row + i, Col + j) is already
      // inside the input, so no per-tap bounds test is needed.
      float pixVal = 0.0f;
      for (int i = 0; i < filter_size; i++)
      {
          // size_t keeps the flat index from overflowing 32-bit int on large inputs.
          const size_t rowBase = (size_t)(Row + i) * (size_t)arr_size + (size_t)Col;
          for (int j = 0; j < filter_size; j++)
          {
              pixVal += filter[i * filter_size + j] * arr[rowBase + j];
          }
      }
      result[(size_t)Row * (size_t)arr_size + (size_t)Col] = pixVal;
  }

4. 文件结构和使用

创建一个CUDA runtime 项目(我的项目名:cuda_code)

项目下文件结构如下:

5.完整代码 

 源.cpp完整代码:

  1. #include <stdio.h>
  2. #include <stdlib.h>
  3. #include <time.h>
  4. #include <iostream>
  5. #include"device_launch_parameters.h"
  6. #include"cuda_runtime.h"
  // Side length of the square filter, as a compile-time constant.
  #define FILTER_WIDTH 3

  // Problem dimensions shared by the CPU and GPU paths.
  int filter_size(FILTER_WIDTH);   // filter side length handed to Conv2 / Conv2Kernel
  int arr_size(1024);              // side length of the square input matrix
  // Side length used to allocate result storage. This is arr_size + filter - 1,
  // which is larger than the (arr_size - filter_size + 1) extent that Conv2
  // actually writes.
  int result_size((arr_size + FILTER_WIDTH) - 1);

  // GPU entry point; declared here with C linkage and defined in kernel.cu.
  extern "C" void Conv2Kernel(float** arr, float** pFilter, int filter_size, int arr_size, int result_size);
  // Serial reference implementation used for the CPU timing.
  // For every position where a filter_size x filter_size window fits inside
  // the arr_size x arr_size input, stores the sum of filter[m][n] * window[m][n]
  // into result. The filter is not flipped. Output extent is
  // (arr_size - filter_size + 1) in both dimensions.
  void Conv2(float** filter, float** arr, float** result, int filter_size, int arr_size) {
      const int extent = arr_size - filter_size + 1;
      int top = 0;
      while (top < extent) {
          int left = 0;
          while (left < extent) {
              float sum = 0;
              for (int dy = 0; dy < filter_size; ++dy) {
                  const float* weights = filter[dy];
                  const float* source = arr[top + dy];
                  for (int dx = 0; dx < filter_size; ++dx) {
                      sum += weights[dx] * source[left + dx];
                  }
              }
              result[top][left] = sum;
              ++left;
          }
          ++top;
      }
  }
  26. int main()
  27. {
  28. int dev = 0;
  29. cudaDeviceProp devProp;
  30. cudaGetDeviceProperties(&devProp, dev);
  31. std::cout << "使用GPU device " << dev << ": " << devProp.name << std::endl;
  32. std::cout << "SM的数量:" << devProp.multiProcessorCount << std::endl;
  33. std::cout << "每个线程块的共享内存大小:" << devProp.sharedMemPerBlock / 1024.0 << " KB" << std::endl;
  34. std::cout << "每个Grid的Block数:" << devProp.maxGridSize[0] << " x " << devProp.maxGridSize[1] << " x " << devProp.maxGridSize[2] << std::endl;
  35. std::cout << "每个线程块的最大线程数:" << devProp.maxThreadsPerBlock << std::endl;
  36. std::cout << "每个EM的最大线程数:" << devProp.maxThreadsPerMultiProcessor << std::endl;
  37. std::cout << "每个EM的最大线程束数:" << devProp.maxThreadsPerMultiProcessor / 32 << std::endl;
  38. clock_t CPU_start, CPU_stop;
  39. // Array, filter, result
  40. float** pFilter = new float*[filter_size];
  41. for (int i = 0; i < filter_size; i++)
  42. {
  43. pFilter[i] = new float[filter_size];
  44. }
  45. float** arr = new float*[arr_size];
  46. for (int i = 0; i < arr_size; i++)
  47. {
  48. arr[i] = new float[arr_size];
  49. }
  50. float** res = new float*[result_size];
  51. for (int i = 0; i < result_size; i++)
  52. {
  53. res[i] = new float[result_size];
  54. }
  55. //initialization
  56. for (int i = 0; i < filter_size; i++) {
  57. for (int j = 0; j < filter_size; j++)
  58. pFilter[i][j] = rand() % 11;
  59. }
  60. for (int i = 0; i < arr_size; i++) {
  61. for (int j = 0; j < arr_size; j++)
  62. arr[i][j] = rand() % 11;
  63. }
  64. CPU_start = clock();
  65. Conv2(pFilter, arr, res, filter_size, arr_size);
  66. CPU_stop = clock();
  67. float CPU_time = (float)(CPU_stop - CPU_start) / CLOCKS_PER_SEC;
  68. printf("-------------------CPU version Done!------------------\n");
  69. printf("CPU time:%f \n", CPU_time);
  70. Conv2Kernel(arr, pFilter, filter_size, arr_size, result_size);
  71. }

kernel.cu完整代码:

  1. #include"device_launch_parameters.h"
  2. #include"cuda_runtime.h"
  3. #include <stdlib.h>
  4. #include <stdio.h>
  5. #include <time.h>
  // Kernel function: naive 2D sliding-window sum, one thread per output element.
  //
  // Launch layout: 2D grid of 2D blocks; x indexes output columns, y indexes
  // output rows. Threads outside the (arr_size - filter_size + 1)^2 output area
  // exit immediately, so the grid may over-cover the output. No shared memory
  // is used.
  //
  //   filter      filter_size * filter_size weights, row-major
  //   arr         arr_size * arr_size input, row-major
  //   result      output; element (Row, Col) is stored at Row * arr_size + Col,
  //               i.e. with a row stride of arr_size (not the output width)
  //   filter_size side length of the filter
  //   arr_size    side length of the input
  __global__
  void convolution_2D_basic(const float* filter, const float* arr, float* result, int filter_size, int arr_size)
  {
      const int outDim = arr_size - filter_size + 1;
      const int Col = blockIdx.x * blockDim.x + threadIdx.x;
      const int Row = blockIdx.y * blockDim.y + threadIdx.y;
      if (Row >= outDim || Col >= outDim)
      {
          return;
      }

      // Row/Col < outDim guarantees every tap (Row + i, Col + j) lies inside
      // the input, so the per-tap bounds test is unnecessary.
      float pixVal = 0.0f;
      for (int i = 0; i < filter_size; i++)
      {
          // size_t keeps the flat index from overflowing 32-bit int on large inputs.
          const size_t rowBase = (size_t)(Row + i) * (size_t)arr_size + (size_t)Col;
          for (int j = 0; j < filter_size; j++)
          {
              pixVal += filter[i * filter_size + j] * arr[rowBase + j];
          }
      }
      result[(size_t)Row * (size_t)arr_size + (size_t)Col] = pixVal;
  }
  // Evaluates a CUDA runtime call; on failure prints the error and exits.
  #ifndef CONV2_CUDA_CHECK
  #define CONV2_CUDA_CHECK(call)                                              \
      do {                                                                    \
          cudaError_t conv2_status_ = (call);                                 \
          if (conv2_status_ != cudaSuccess) {                                 \
              fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                      cudaGetErrorString(conv2_status_));                     \
              exit(EXIT_FAILURE);                                             \
          }                                                                   \
      } while (0)
  #endif

  /*
   * Host wrapper for the GPU path: flattens the row-pointer matrices, uploads
   * them, launches convolution_2D_basic, copies the output back and prints the
   * kernel time measured with CUDA events (copies are outside the timed span).
   *
   *   arr         arr_size row pointers, each to arr_size floats (host)
   *   pFilter     filter_size row pointers, each to filter_size floats (host)
   *   filter_size side length of the filter
   *   arr_size    side length of the input
   *   result_size side length used to size the output buffers; at least
   *               arr_size * arr_size elements are always allocated because the
   *               kernel writes its output with a row stride of arr_size
   *
   * The output is copied into a temporary host buffer that is released before
   * returning; nothing is handed back to the caller. Any CUDA failure terminates
   * the process with a diagnostic.
   */
  extern "C" void Conv2Kernel(float** arr, float** pFilter, int filter_size, int arr_size, int result_size)
  {
      const int out_size = arr_size - filter_size + 1;
      if (arr == NULL || pFilter == NULL || filter_size <= 0 || out_size <= 0)
      {
          fprintf(stderr, "Conv2Kernel: nothing to compute (filter_size=%d, arr_size=%d)\n",
                  filter_size, arr_size);
          return;
      }

      // Element counts in size_t so large sizes do not overflow int.
      const size_t arr_elems = (size_t)arr_size * (size_t)arr_size;
      const size_t filter_elems = (size_t)filter_size * (size_t)filter_size;
      size_t result_elems = result_size > 0 ? (size_t)result_size * (size_t)result_size : 0;
      if (result_elems < arr_elems)
      {
          // The kernel addresses the output as Row * arr_size + Col.
          result_elems = arr_elems;
      }

      float* arr_1D = (float*)malloc(arr_elems * sizeof(float));
      float* result_1D = (float*)malloc(result_elems * sizeof(float));
      float* filter_1D = (float*)malloc(filter_elems * sizeof(float));
      if (arr_1D == NULL || result_1D == NULL || filter_1D == NULL)
      {
          fprintf(stderr, "Conv2Kernel: host allocation failed\n");
          free(arr_1D);
          free(result_1D);
          free(filter_1D);
          return;
      }

      // Flatten the row-pointer matrices into row-major 1D arrays.
      for (int i = 0; i < arr_size; i++)
      {
          for (int j = 0; j < arr_size; j++)
          {
              arr_1D[(size_t)i * arr_size + j] = arr[i][j];
          }
      }
      for (int i = 0; i < filter_size; i++)
      {
          for (int j = 0; j < filter_size; j++)
          {
              filter_1D[i * filter_size + j] = pFilter[i][j];
          }
      }

      float *device_input_arr = NULL, *device_output_arr = NULL, *device_filter_arr = NULL;
      CONV2_CUDA_CHECK(cudaMalloc((void**)&device_input_arr, sizeof(float) * arr_elems));
      CONV2_CUDA_CHECK(cudaMalloc((void**)&device_output_arr, sizeof(float) * result_elems));
      CONV2_CUDA_CHECK(cudaMalloc((void**)&device_filter_arr, sizeof(float) * filter_elems));

      cudaEvent_t start, stop;
      CONV2_CUDA_CHECK(cudaEventCreate(&start));
      CONV2_CUDA_CHECK(cudaEventCreate(&stop));

      CONV2_CUDA_CHECK(cudaMemcpy(device_input_arr, arr_1D, sizeof(float) * arr_elems, cudaMemcpyHostToDevice));
      CONV2_CUDA_CHECK(cudaMemcpy(device_filter_arr, filter_1D, sizeof(float) * filter_elems, cudaMemcpyHostToDevice));
      // Zero the output instead of uploading an uninitialised host buffer.
      CONV2_CUDA_CHECK(cudaMemset(device_output_arr, 0, sizeof(float) * result_elems));

      // 16 x 16 = 256 threads per block; the grid is the ceiling division of
      // the output extent. dim3 must be constructed with (...) arguments:
      // "dim3 x = (a, b)" is a comma expression and yields dim3(b).
      const dim3 threads(16, 16);
      const dim3 blocks((out_size + threads.x - 1) / threads.x,
                        (out_size + threads.y - 1) / threads.y);

      CONV2_CUDA_CHECK(cudaEventRecord(start, 0));
      // Argument order must match the kernel signature: filter, input, output.
      convolution_2D_basic<<<blocks, threads>>>(device_filter_arr, device_input_arr, device_output_arr, filter_size, arr_size);
      CONV2_CUDA_CHECK(cudaGetLastError());
      CONV2_CUDA_CHECK(cudaEventRecord(stop, 0));
      CONV2_CUDA_CHECK(cudaEventSynchronize(stop));

      CONV2_CUDA_CHECK(cudaMemcpy(result_1D, device_output_arr, sizeof(float) * arr_elems, cudaMemcpyDeviceToHost));

      float GPU_time;
      CONV2_CUDA_CHECK(cudaEventElapsedTime(&GPU_time, start, stop));
      printf("-------------------GPU version Done!------------------\n");
      printf("GPU_Time: %f \n", GPU_time);

      CONV2_CUDA_CHECK(cudaEventDestroy(start));
      CONV2_CUDA_CHECK(cudaEventDestroy(stop));
      CONV2_CUDA_CHECK(cudaFree(device_input_arr));
      CONV2_CUDA_CHECK(cudaFree(device_output_arr));
      CONV2_CUDA_CHECK(cudaFree(device_filter_arr));
      free(arr_1D);
      free(result_1D);
      free(filter_1D);
  }

6.如果想读入图片数据,操作如下

  • 为项目配置opencv(网上随便一搜,便能找到)
  • 添加头文件,替换初始化arr部分的代码
    1. #include <opencv2/core/core.hpp>
    2. #include <opencv2/highgui/highgui.hpp>
    3. using namespace cv;
    4. //读取图像img。0表示转换为灰度图像(单通道)读入
    5. Mat img = imread("image.jpg", 0);
    6. int row = img.rows;
    7. int col = img.cols;
    8. float** arr = new float*[row];
    9. for (int i = 0; i < row; i++)
    10. {
    11. arr[i] = new float[col];
    12. }
    13. for (int i = 0; i < row; i++) {
    14. for (int j = 0; j < col; j++) {
    15. arr[i][j] = img.at<uchar>(i, j);
    16. }
    17. }

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/凡人多烦事01/article/detail/617273
推荐阅读
相关标签
  

闽ICP备14008679号