CUDA学习笔记07：shared memory Code

作者：2023面试高手 | 2024-03-23 04:44:39

踩

参考视频

宝藏up主！CUDA编程模型系列七(利用shared memory优化矩阵转置)_哔哩哔哩_bilibili

代码


// Edge length of the square thread block; also the edge length of the
// shared-memory tile used by gpu_shared_matrix_transpose.
#define BLOCK_SIZE 32
// Number of columns of the input matrix (and rows of the transposed output).
#define M 3000
// Number of rows of the input matrix (and columns of the transposed output).
#define N 1000
 
// Input matrix, N rows by M columns. Declared __managed__ so the same symbol is
// used from host code and passed directly to the kernels.
__managed__ int matrix[N][M];
// Transposed result written by the GPU kernels, M rows by N columns.
__managed__ int gpu_matrix[M][N];
// Transposed result written by the CPU reference, M rows by N columns.
__managed__ int cpu_matrix[M][N];
 
// Straightforward transpose kernel: every thread copies one element,
// in[row][col] -> out[col][row].
//
// Launch layout: 2D grid of 2D blocks. The x dimension spans the M columns of
// `in` and the y dimension spans its N rows. Threads that land outside the
// matrix do nothing.
__global__ void gpu_matrix_transpose(int in[N][M], int out[M][N])
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard clause for threads beyond the matrix edge.
    if (col >= M || row >= N)
    {
        return;
    }

    out[col][row] = in[row][col];
}
 
// Tiled transpose kernel that stages each block's elements in shared memory.
//
// Each block loads one tile of `in` into shared memory, synchronizes, and then
// writes the tile back transposed. In the write phase threadIdx.x indexes the
// column of `out`, so consecutive threadIdx.x values store to consecutive
// elements of an output row.
//
// Launch layout: 2D grid of 2D blocks; x spans the M columns of `in`, y spans
// its N rows. Precondition: blockDim.x == blockDim.y == BLOCK_SIZE, because the
// tile has BLOCK_SIZE rows/columns and the block offsets are swapped between
// the load and the store phase.
// Shared memory: BLOCK_SIZE * (BLOCK_SIZE + 1) ints, statically allocated.
__global__ void gpu_shared_matrix_transpose(int in[N][M], int out[M][N])
{
    // Coordinates of the element this thread loads from `in`.
    int x = threadIdx.x + blockDim.x * blockIdx.x;
    int y = threadIdx.y + blockDim.y * blockIdx.y;

    // The tile is read column-wise below (ken[threadIdx.x][threadIdx.y]), which
    // would cause shared-memory bank conflicts with an unpadded row length.
    // Padding the inner dimension by one element is enough to avoid them; the
    // outer dimension needs no padding.
    __shared__ int ken[BLOCK_SIZE][BLOCK_SIZE + 1];

    if (x < M && y < N)
    {
        ken[threadIdx.y][threadIdx.x] = in[y][x];
    }
    // Barrier is outside the branch so every thread of the block reaches it.
    __syncthreads();

    // Coordinates of the element this thread stores to `out`: the block
    // offsets are swapped relative to the load, the thread offsets are not.
    int x1 = threadIdx.x + blockDim.y * blockIdx.y;
    int y1 = threadIdx.y + blockDim.x * blockIdx.x;

    // This condition matches exactly the tile entries that were written above,
    // so no uninitialized shared memory is read.
    if (x1 < N && y1 < M)
    {
        out[y1][x1] = ken[threadIdx.x][threadIdx.y];
    }
}
 
// Host-side reference transpose: for every element of the N x M input,
// stores in[row][col] into out[col][row].
void cpu_matrix_transpose(int in[N][M], int out[M][N])
{
    int row = 0;
    while (row < N) {
        const int *src_row = in[row];
        for (int col = 0; col < M; ++col) {
            out[col][row] = src_row[col];
        }
        ++row;
    }
}
 
// Fills the input matrix with random values, runs the GPU transpose kernel
// several times and the CPU reference once, times both with CUDA events, and
// prints whether the two results match together with the measured times (ms).
//
// The GPU time printed is the average over all kernel runs; the CPU time is
// the elapsed time between the stop_gpu and stop_cpu events, which brackets
// the single cpu_matrix_transpose call.
void transpose_test()
{
    // Fill the input with pseudo-random values in [0, 1023].
    for (int y = 0; y < N; y++) {
        for (int x = 0; x < M; x++) {
            matrix[y][x] = rand() % 1024;
        }
    }

    cudaEvent_t start, stop_gpu, stop_cpu;
    cudaEventCreate(&start);
    cudaEventCreate(&stop_gpu);
    cudaEventCreate(&stop_cpu);

    cudaEventRecord(start);
    cudaEventSynchronize(start);

    // Ceil-divide so the grid covers matrices whose sides are not multiples of
    // BLOCK_SIZE; the kernels bounds-check the excess threads.
    dim3 dimGrid((M + BLOCK_SIZE - 1) / BLOCK_SIZE, (N + BLOCK_SIZE - 1) / BLOCK_SIZE);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);

    const int kRuns = 20;
    cudaError_t status = cudaSuccess;
    for (int i = 0; i < kRuns && status == cudaSuccess; i++)
    {
        gpu_matrix_transpose<<<dimGrid, dimBlock>>>(matrix, gpu_matrix);
        //gpu_shared_matrix_transpose<<<dimGrid, dimBlock>>>(matrix, gpu_matrix);

        // A launch does not return an error itself: configuration errors show
        // up in cudaGetLastError(), execution errors at the next sync.
        status = cudaGetLastError();
        if (status == cudaSuccess)
        {
            status = cudaDeviceSynchronize();
        }
    }
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
    }

    cudaEventRecord(stop_gpu);
    cudaEventSynchronize(stop_gpu);

    cpu_matrix_transpose(matrix, cpu_matrix);
    cudaEventRecord(stop_cpu);
    cudaEventSynchronize(stop_cpu);

    float time_cpu, time_gpu;
    cudaEventElapsedTime(&time_gpu, start, stop_gpu);
    cudaEventElapsedTime(&time_cpu, stop_gpu, stop_cpu);

    // A failed kernel run already counts as an error. Otherwise compare the
    // integer results exactly and stop at the first mismatch.
    bool errors = (status != cudaSuccess);
    for (int y = 0; y < M && !errors; y++) {
        for (int x = 0; x < N; x++) {
            if (cpu_matrix[y][x] != gpu_matrix[y][x])
            {
                errors = true;
                break;
            }
        }
    }
    printf("Result: %s \n", errors ? "Errors" : "Pass");
    printf("CPU time: %.2f \n GPU time: %.2f \n", time_cpu, time_gpu / kRuns);

    // Release the timing events created above.
    cudaEventDestroy(start);
    cudaEventDestroy(stop_gpu);
    cudaEventDestroy(stop_cpu);
}

声明：本文内容由网友自发贡献，不代表【wpsshop博客】立场，版权归原作者所有，本站不承担相应法律责任。如您发现有侵权的内容，请联系我们。转载请注明出处：https://www.wpsshop.cn/w/2023面试高手/article/detail/293190