// Source: Bilibili tutorial "CUDA Programming Model, part 7:
// optimizing matrix transpose with shared memory" (CUDA编程模型系列七).
// Tile edge length: each CUDA block is BLOCK_SIZE x BLOCK_SIZE threads.
#define BLOCK_SIZE 32
// Matrix dimensions: the input is N rows x M columns; the transpose is M x N.
#define M 3000
#define N 1000

// Unified (managed) memory: accessible from both host and device without
// explicit cudaMemcpy. `matrix` is the N x M input; `gpu_matrix` and
// `cpu_matrix` hold the M x N transpose computed on device and host.
__managed__ int matrix[N][M];
__managed__ int gpu_matrix[M][N];
__managed__ int cpu_matrix[M][N];
-
// Naive transpose kernel: out = in^T.
// Expects a 2D launch where x covers the M columns and y the N rows of `in`.
// The read `in[y][x]` is coalesced (adjacent threads read adjacent columns),
// but the write `out[x][y]` is strided by N — the shared-memory version
// below exists to fix exactly that.
__global__ void gpu_matrix_transpose(int in[N][M], int out[M][N])
{
    int x = threadIdx.x + blockDim.x * blockIdx.x;  // column index into `in`
    int y = threadIdx.y + blockDim.y * blockIdx.y;  // row index into `in`

    // M and N are not multiples of BLOCK_SIZE, so edge blocks contain
    // threads that fall outside the matrix and must do nothing.
    if (x < M && y < N)
    {
        out[x][y] = in[y][x];
    }
}
-
// Tiled transpose using shared memory so that BOTH the global-memory read
// and the global-memory write are coalesced. Each block stages a
// BLOCK_SIZE x BLOCK_SIZE tile of `in` in shared memory, then writes it out
// transposed. Launch with the same grid/block shape as gpu_matrix_transpose.
__global__ void gpu_shared_matrix_transpose(int in[N][M], int out[M][N])
{
    int x = threadIdx.x + blockDim.x * blockIdx.x;  // column index into `in`
    int y = threadIdx.y + blockDim.y * blockIdx.y;  // row index into `in`

    // Pad only the inner dimension by 1 so the transposed (column-wise)
    // reads below spread across shared-memory banks instead of conflicting.
    __shared__ int tile[BLOCK_SIZE][BLOCK_SIZE + 1];

    // Coalesced load: adjacent threads (threadIdx.x) read adjacent columns.
    if (x < M && y < N)
    {
        tile[threadIdx.y][threadIdx.x] = in[y][x];
    }
    // Barrier kept outside the divergent guard: every thread in the block
    // must reach it before any thread reads the tile back.
    __syncthreads();

    // Output coordinates: the block's tile position swaps roles, so
    // blockIdx.x now selects rows of `out` and blockIdx.y its columns.
    int x1 = threadIdx.x + blockDim.y * blockIdx.y;  // column index into `out`
    int y1 = threadIdx.y + blockDim.x * blockIdx.x;  // row index into `out`

    if (x1 < N && y1 < M)
    {
        // Coalesced store: adjacent threads write adjacent columns of `out`,
        // reading the tile with swapped thread indices (the transpose).
        out[y1][x1] = tile[threadIdx.x][threadIdx.y];
    }
}
-
// CPU reference transpose: out = in^T. Used to validate the GPU results.
void cpu_matrix_transpose(int in[N][M], int out[M][N])
{
    for (int y = 0; y < N; y++) {
        for (int x = 0; x < M; x++) {
            out[x][y] = in[y][x];
        }
    }
}
-
// Fills the input matrix with random values, runs the GPU transpose 20 times
// (timed with CUDA events), runs the CPU reference once, verifies the two
// results agree, and prints the timings (GPU time is the per-iteration mean,
// in milliseconds as returned by cudaEventElapsedTime).
void transpose_test()
{
    for (int y = 0; y < N; y++) {
        for (int x = 0; x < M; x++) {
            matrix[y][x] = rand() % 1024;
        }
    }

    cudaEvent_t start, stop_gpu, stop_cpu;
    cudaEventCreate(&start);
    cudaEventCreate(&stop_gpu);
    cudaEventCreate(&stop_cpu);

    cudaEventRecord(start);
    cudaEventSynchronize(start);

    // Ceil-division grid so edge tiles cover the whole M x N matrix.
    dim3 dimGrid((M + BLOCK_SIZE - 1) / BLOCK_SIZE, (N + BLOCK_SIZE - 1) / BLOCK_SIZE);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);

    for (int i = 0; i < 20; i++)
    {
        gpu_matrix_transpose<<<dimGrid, dimBlock>>>(matrix, gpu_matrix);
        //gpu_shared_matrix_transpose<<<dimGrid, dimBlock>>>(matrix, gpu_matrix);
    }
    // Surface launch-configuration errors. The event sync below waits for all
    // queued kernels, so no per-iteration cudaDeviceSynchronize is needed
    // (the original's in-loop sync only added overhead to the measurement).
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("Kernel launch failed: %s\n", cudaGetErrorString(err));
    }

    cudaEventRecord(stop_gpu);
    cudaEventSynchronize(stop_gpu);

    cpu_matrix_transpose(matrix, cpu_matrix);
    cudaEventRecord(stop_cpu);
    cudaEventSynchronize(stop_cpu);

    float time_cpu, time_gpu;
    cudaEventElapsedTime(&time_gpu, start, stop_gpu);
    cudaEventElapsedTime(&time_cpu, stop_gpu, stop_cpu);

    // The matrices hold ints, so compare exactly (the original used a float
    // tolerance, which is meaningless here). The `!errors` condition makes
    // the inner break actually stop the whole scan on the first mismatch.
    bool errors = false;
    for (int y = 0; y < M && !errors; y++) {
        for (int x = 0; x < N; x++) {
            if (cpu_matrix[y][x] != gpu_matrix[y][x])
            {
                errors = true;
                break;
            }
        }
    }
    printf("Result: %s \n", errors ? "Errors" : "Pass");
    printf("CPU time: %.2f \n GPU time: %.2f \n", time_cpu, time_gpu / 20);

    // Release the timing events (the original leaked them).
    cudaEventDestroy(start);
    cudaEventDestroy(stop_gpu);
    cudaEventDestroy(stop_cpu);
}
// Page footer from the scraped source: Copyright © 2003-2013 www.wpsshop.cn. All rights reserved.