当前位置:   article > 正文

CUDA学习笔记07:shared memory Code

CUDA学习笔记07:shared memory Code

参考视频

宝藏up主!CUDA编程模型系列七(利用shared memory优化矩阵转置)_哔哩哔哩_bilibili

代码

  1. #define BLOCK_SIZE 32 // Side length of the square thread block; also sizes the shared-memory tile.
  2. #define M 3000 // Number of columns of the input matrix (= rows of the transposed output).
  3. #define N 1000 // Number of rows of the input matrix (= columns of the transposed output).
  4. __managed__ int matrix[N][M]; // Input, N x M; filled on the host in transpose_test and read by the kernels.
  5. __managed__ int gpu_matrix[M][N]; // M x N transpose written by the GPU kernel and compared on the host.
  6. __managed__ int cpu_matrix[M][N]; // M x N reference transpose written by cpu_matrix_transpose.
  /*
   * Straightforward transpose: each thread copies one element of the N x M
   * matrix `in` to its mirrored position in the M x N matrix `out`.
   *
   * Launch layout: 2D grid of 2D blocks; the x dimension spans the M columns
   * of `in` and the y dimension spans its N rows. Threads that fall outside
   * the matrix do nothing.
   */
  __global__ void gpu_matrix_transpose(int in[N][M], int out[M][N])
  {
      const int col = blockIdx.x * blockDim.x + threadIdx.x;
      const int row = blockIdx.y * blockDim.y + threadIdx.y;

      // The grid is rounded up to whole blocks, so edge threads may be out of range.
      if (col >= M || row >= N)
      {
          return;
      }

      out[col][row] = in[row][col];
  }
  /*
   * Transposes the N x M matrix `in` into the M x N matrix `out`, staging each
   * tile in shared memory before writing it back.
   *
   * Launch layout: 2D grid of 2D blocks; the x dimension spans the M columns
   * of `in` and the y dimension spans its N rows. The tile indexing below
   * relies on square blocks of BLOCK_SIZE x BLOCK_SIZE threads.
   * Shared memory: one static tile of BLOCK_SIZE x (BLOCK_SIZE + 1) ints per block.
   */
  __global__ void gpu_shared_matrix_transpose(int in[N][M], int out[M][N])
  {
      int x = threadIdx.x + blockDim.x * blockIdx.x;
      int y = threadIdx.y + blockDim.y * blockIdx.y;

      // Padding the inner dimension by one element is what avoids shared-memory
      // bank conflicts on the column-wise read below; the outer dimension does
      // not need padding, so the tile keeps exactly BLOCK_SIZE rows.
      __shared__ int ken[BLOCK_SIZE][BLOCK_SIZE + 1];

      if (x < M && y < N)
      {
          ken[threadIdx.y][threadIdx.x] = in[y][x];
      }

      // Kept outside the guard so that every thread of the block reaches the barrier.
      __syncthreads();

      // Swap the block offsets so that threadIdx.x indexes the inner (column)
      // dimension of `out`; the tile is then read with its indices exchanged.
      int x1 = threadIdx.x + blockDim.y * blockIdx.y;
      int y1 = threadIdx.y + blockDim.x * blockIdx.x;
      if (x1 < N && y1 < M)
      {
          out[y1][x1] = ken[threadIdx.x][threadIdx.y];
      }
  }
  /*
   * Host reference implementation: writes the transpose of the N x M matrix
   * `in` into the M x N matrix `out`, walking `in` row by row.
   */
  void cpu_matrix_transpose(int in[N][M], int out[M][N])
  {
      for (int row = 0; row < N; ++row)
      {
          const int *src_row = in[row];
          for (int col = 0; col < M; ++col)
          {
              out[col][row] = src_row[col];
          }
      }
  }
  /* Prints a CUDA runtime failure to stderr; does nothing when status is cudaSuccess. */
  static void report_cuda_error(cudaError_t status, const char *what)
  {
      if (status != cudaSuccess)
      {
          fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
      }
  }

  /*
   * Fills `matrix` with pseudo-random values, transposes it repeatedly on the
   * GPU and once on the CPU, times both with CUDA events, and prints whether
   * the two results match together with the CPU time and the GPU time
   * averaged over the GPU runs.
   *
   * CUDA failures are reported on stderr instead of being silently ignored;
   * the function still runs to completion and prints its usual output.
   */
  void transpose_test()
  {
      const int kGpuRuns = 20; // number of timed kernel launches; GPU time is averaged over them

      // Fill the input with values in [0, 1023].
      for (int y = 0; y < N; y++) {
          for (int x = 0; x < M; x++) {
              matrix[y][x] = rand() % 1024;
          }
      }

      cudaEvent_t start, stop_gpu, stop_cpu;
      report_cuda_error(cudaEventCreate(&start), "cudaEventCreate(start)");
      report_cuda_error(cudaEventCreate(&stop_gpu), "cudaEventCreate(stop_gpu)");
      report_cuda_error(cudaEventCreate(&stop_cpu), "cudaEventCreate(stop_cpu)");

      report_cuda_error(cudaEventRecord(start), "cudaEventRecord(start)");
      report_cuda_error(cudaEventSynchronize(start), "cudaEventSynchronize(start)");

      // Round the grid up so that partial tiles at the matrix edges are covered.
      dim3 dimGrid((M + BLOCK_SIZE - 1) / BLOCK_SIZE, (N + BLOCK_SIZE - 1) / BLOCK_SIZE);
      dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
      for (int i = 0; i < kGpuRuns; i++)
      {
          gpu_matrix_transpose<<<dimGrid, dimBlock>>>(matrix, gpu_matrix);
          //gpu_shared_matrix_transpose<<<dimGrid, dimBlock>>>(matrix, gpu_matrix);
          // A launch does not return a status itself: fetch launch errors
          // explicitly, then pick up execution errors at the synchronization.
          report_cuda_error(cudaGetLastError(), "kernel launch");
          report_cuda_error(cudaDeviceSynchronize(), "kernel execution");
      }
      report_cuda_error(cudaEventRecord(stop_gpu), "cudaEventRecord(stop_gpu)");
      report_cuda_error(cudaEventSynchronize(stop_gpu), "cudaEventSynchronize(stop_gpu)");

      // The CPU reference runs between the stop_gpu and stop_cpu events, so the
      // interval between those two events is used as the CPU time.
      cpu_matrix_transpose(matrix, cpu_matrix);
      report_cuda_error(cudaEventRecord(stop_cpu), "cudaEventRecord(stop_cpu)");
      report_cuda_error(cudaEventSynchronize(stop_cpu), "cudaEventSynchronize(stop_cpu)");

      float time_cpu = 0.0f, time_gpu = 0.0f;
      report_cuda_error(cudaEventElapsedTime(&time_gpu, start, stop_gpu), "cudaEventElapsedTime(gpu)");
      report_cuda_error(cudaEventElapsedTime(&time_cpu, stop_gpu, stop_cpu), "cudaEventElapsedTime(cpu)");

      // The elements are integers, so compare them exactly and stop scanning at
      // the first mismatch.
      bool errors = false;
      for (int y = 0; y < M && !errors; y++) {
          for (int x = 0; x < N; x++) {
              if (cpu_matrix[y][x] != gpu_matrix[y][x])
              {
                  errors = true;
                  break;
              }
          }
      }

      printf("Result: %s \n", errors ? "Errors" : "Pass");
      printf("CPU time: %.2f \n GPU time: %.2f \n", time_cpu, time_gpu / kGpuRuns);

      // Release the timing events created above.
      report_cuda_error(cudaEventDestroy(start), "cudaEventDestroy(start)");
      report_cuda_error(cudaEventDestroy(stop_gpu), "cudaEventDestroy(stop_gpu)");
      report_cuda_error(cudaEventDestroy(stop_cpu), "cudaEventDestroy(stop_cpu)");
  }

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/2023面试高手/article/detail/293190
推荐阅读
相关标签
  

闽ICP备14008679号