Matrix multiplication maps naturally onto the GPU's many parallel threads, but the straightforward kernel has every thread re-read the same rows of A and columns of B from global memory, and those redundant loads dominate the runtime. Shared memory, by contrast, is visible to every thread within a block, so combining it with tiling lets each block stage a tile of the operands into shared memory once and have all of its threads reuse it, greatly reducing global-memory traffic.
The theory is covered in plenty of places online, so it is not repeated here. The catch is that many of the shared-memory implementations circulating online are wrong: most of them happen to produce correct results only when blockDim.x == blockDim.y, where the indices coincidentally line up, and give wrong answers the moment the two differ. The sketch right below shows the fragile pattern; a corrected version follows it.
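For concreteness, here is the pattern used by many of those tutorials, written out as a minimal illustrative kernel (the identifiers are chosen to match the corrected listing below; this is a sketch of the common buggy version, not the fix):

// Typical online version (illustrative; this is the FRAGILE pattern, not the fix).
// It is only correct when blockDim.x == blockDim.y == TILE_WIDTH.
#define TILE_WIDTH 32
__global__ void matrixMultiplySharedNaive(float *A, float *B, float *C,
                                          int numARows, int numAColumns,
                                          int numBRows, int numBColumns)
{
    __shared__ float sharedM[TILE_WIDTH][TILE_WIDTH]; // square tiles assumed
    __shared__ float sharedN[TILE_WIDTH][TILE_WIDTH];
    int tx = threadIdx.x, ty = threadIdx.y;
    int row = blockIdx.y * blockDim.y + ty;
    int col = blockIdx.x * blockDim.x + tx;
    float Csub = 0.0f;
    for (int i = 0; i < (numAColumns + TILE_WIDTH - 1) / TILE_WIDTH; i++) {
        // One element per thread: with blockDim = (8, 4) only a 4x8 corner of
        // each 32x32 tile is ever written...
        sharedM[ty][tx] = (row < numARows && i*TILE_WIDTH + tx < numAColumns)
                          ? A[row*numAColumns + i*TILE_WIDTH + tx] : 0.0f;
        sharedN[ty][tx] = (i*TILE_WIDTH + ty < numBRows && col < numBColumns)
                          ? B[(i*TILE_WIDTH + ty)*numBColumns + col] : 0.0f;
        __syncthreads();
        // ...and this loop then reads shared-memory entries no thread wrote.
        for (int j = 0; j < TILE_WIDTH; j++)
            Csub += sharedM[ty][j] * sharedN[j][tx];
        __syncthreads();
    }
    if (row < numARows && col < numBColumns)
        C[row*numBColumns + col] = Csub;
}

With a 32x32 block every tile entry does get written, which is why the bug goes unnoticed. The corrected version below instead sizes the two tiles to the block (blockdimy x tile_width for A's tile, tile_width x blockdimx for B's tile) and guards each load, so any rectangular block works as long as tile_width does not exceed either block dimension: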
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <assert.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define dim 128
#define M dim
#define K dim
#define N dim
// Each thread loads at most one element of each tile per iteration, so
// tile_width must not exceed min(blockdimx, blockdimy) or parts of the
// tiles would be left uninitialized.
#define tile_width 4
#define blockdimx 8 // blockDim.x
#define blockdimy 4 // blockDim.y

void initial(float *array, int size)
{
    for (int i = 0; i < size; i++) {
        array[i] = (float)(1);
    }
}

void printMatrix(float *array, int row, int col)
{
    printf("\n");
    float *p = array;
    for (int y = 0; y < row; y++) {
        for (int x = 0; x < col; x++) {
            printf("%.1f ", p[x]);
        }
        p = p + col;
        printf("\n");
    }
}

void multiplicateMatrixOnHost(float *array_A, float *array_B, float *array_C,
                              int M_p, int K_p, int N_p)
{
    for (int i = 0; i < M_p; i++) {
        for (int j = 0; j < N_p; j++) {
            float sum = 0;
            for (int k = 0; k < K_p; k++) {
                sum += array_A[i*K_p + k] * array_B[k*N_p + j];
            }
            array_C[i*N_p + j] = sum;
        }
    }
}

__global__ void multiplicateMatrixOnDevice(float *array_A, float *array_B, float *array_C,
                                           int M_p, int K_p, int N_p)
{
    int ix = threadIdx.x + blockDim.x*blockIdx.x; // column index of C
    int iy = threadIdx.y + blockDim.y*blockIdx.y; // row index of C
    if (ix < N_p && iy < M_p) {
        float sum = 0;
        for (int k = 0; k < K_p; k++) {
            sum += array_A[iy*K_p + k] * array_B[k*N_p + ix];
        }
        array_C[iy*N_p + ix] = sum;
    }
}

// Compute C = A * B
// Called as matrixMultiplyShared<<<grid, block>>>(d_A, d_B, d_C, M, K, K, N, M, N)
__global__ void matrixMultiplyShared(float *A, float *B, float *C,
                                     int numARows, int numAColumns,
                                     int numBRows, int numBColumns,
                                     int numCRows, int numCColumns)
{
    // 1. Fix relative to the common online code: the tiles are rectangular,
    // sized to the (possibly non-square) block instead of tile_width x tile_width
    __shared__ float sharedM[blockdimy][tile_width]; // tile of A
    __shared__ float sharedN[tile_width][blockdimx]; // tile of B

    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int row = by * blockDim.y + ty; // row in A, and the row of C this thread computes
    int col = bx * blockDim.x + tx; // column in B, and the column of C this thread computes
    float Csub = 0.0;

    for (int i = 0; i < (int)(ceil((float)numAColumns / tile_width)); i++) {
        // Nested guards: when the block is not square, a thread whose tx is out
        // of range for the tile must not clobber valid entries with 0
        if (tx < tile_width) {
            if ((i*tile_width + tx) < numAColumns && row < numARows)
                sharedM[ty][tx] = A[row*numAColumns + i*tile_width + tx];
            else
                sharedM[ty][tx] = 0.0;
        }
        // 2. Fix relative to the common online code: B's tile is indexed with
        // ty as the row and tx as the column; the two are not interchangeable
        if (ty < tile_width) {
            if (i*tile_width + ty < numBRows && col < numBColumns)
                sharedN[ty][tx] = B[(i*tile_width + ty)*numBColumns + col];
            else
                sharedN[ty][tx] = 0.0;
        }
        __syncthreads();

        // Debug dump of the tiles seen by block (0, 1), kept from development:
        // if (blockIdx.x == 0 && blockIdx.y == 1 && threadIdx.x == 0 && threadIdx.y == 0) {
        //     printf("sharedM: \n");
        //     for (int r = 0; r < blockdimy; ++r) {
        //         for (int c = 0; c < tile_width; ++c) printf("%.1f ", sharedM[r][c]);
        //         printf("\n");
        //     }
        //     printf("sharedN: \n");
        //     for (int r = 0; r < tile_width; ++r) {
        //         for (int c = 0; c < blockdimx; ++c) printf("%.1f ", sharedN[r][c]);
        //         printf("\n");
        //     }
        // }

        // 3. Fix relative to the common online code: the dot product runs over
        // the tile_width dimension shared by both tiles
        for (int j = 0; j < tile_width; j++)
            Csub += sharedM[ty][j] * sharedN[j][tx];
        __syncthreads();
    }

    if (row < numCRows && col < numCColumns)
        C[row*numCColumns + col] = Csub;
}

int main(int argc, char **argv)
{
    clock_t start = 0, finish = 0;
    float time;
    int Axy = M * K;
    int Bxy = K * N;
    int Cxy = M * N;
    float *h_A, *h_B, *hostRef, *deviceRef;
    h_A = (float*)malloc(Axy * sizeof(float));
    h_B = (float*)malloc(Bxy * sizeof(float));
    hostRef = (float*)malloc(Cxy * sizeof(float));
    deviceRef = (float*)malloc(Cxy * sizeof(float));
    initial(h_A, Axy);
    initial(h_B, Bxy);

    // CPU reference
    start = clock();
    multiplicateMatrixOnHost(h_A, h_B, hostRef, M, K, N);
    finish = clock();
    time = (float)(finish - start) / CLOCKS_PER_SEC;
    printf("\n");
    printf("------------------------------------------------------------------------------------\n");
    printf("Computing matrix product using multiplicateMatrixOnHost \n");
    printf("Matrix_hostRef: (%d x %d) CPU time: %lfs\n", M, N, time);
    printf("------------------------------------------------------------------------------------\n");

    float *d_A, *d_B, *d_C;
    cudaMalloc((void**)&d_A, Axy * sizeof(float));
    cudaMalloc((void**)&d_B, Bxy * sizeof(float));
    cudaMalloc((void**)&d_C, Cxy * sizeof(float));
    cudaMemcpy(d_A, h_A, Axy * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, Bxy * sizeof(float), cudaMemcpyHostToDevice);

    int dimx = blockdimx;
    int dimy = blockdimy;
    dim3 block(dimx, dimy);
    // grid.x covers the N columns of C, grid.y covers the M rows
    dim3 grid((N + block.x - 1) / block.x, (M + block.y - 1) / block.y);

    // Plain GPU kernel, no shared memory
    cudaEvent_t gpustart, gpustop;
    float elapsedTime = 0.0;
    cudaEventCreate(&gpustart);
    cudaEventCreate(&gpustop);
    cudaEventRecord(gpustart, 0);
    multiplicateMatrixOnDevice<<<grid, block>>>(d_A, d_B, d_C, M, K, N);
    cudaDeviceSynchronize();
    cudaEventRecord(gpustop, 0);
    cudaEventSynchronize(gpustop);
    cudaEventElapsedTime(&elapsedTime, gpustart, gpustop);
    cudaEventDestroy(gpustart);
    cudaEventDestroy(gpustop);
    cudaMemcpy(deviceRef, d_C, Cxy * sizeof(float), cudaMemcpyDeviceToHost);
    printf("\n\n");
    printf("------------------------------------------------------------------------------------\n");
    printf("Computing matrix product using multiplicateMatrixOnDevice \n");
    printf("Matrix_deviceRef: (%d x %d) <<<(%d, %d), (%d, %d)>>> GPU time: %fs\n",
           M, N, grid.x, grid.y, block.x, block.y, elapsedTime / 1000);
    printf("------------------------------------------------------------------------------------\n");

    // Shared-memory (tiled) kernel
    elapsedTime = 0.0;
    cudaEventCreate(&gpustart);
    cudaEventCreate(&gpustop);
    cudaEventRecord(gpustart, 0);
    matrixMultiplyShared<<<grid, block>>>(d_A, d_B, d_C, M, K, K, N, M, N);
    cudaDeviceSynchronize();
    cudaEventRecord(gpustop, 0);
    cudaEventSynchronize(gpustop);
    cudaEventElapsedTime(&elapsedTime, gpustart, gpustop);
    cudaEventDestroy(gpustart);
    cudaEventDestroy(gpustop);
    cudaMemcpy(deviceRef, d_C, Cxy * sizeof(float), cudaMemcpyDeviceToHost);
    printf("\n\n");
    printf("------------------------------------------------------------------------------------\n");
    printf("Computing matrix product using matrixMultiplyShared \n");
    printf("Matrix_deviceRef: (%d x %d) <<<(%d, %d), (%d, %d)>>> GPU time: %fs\n",
           M, N, grid.x, grid.y, block.x, block.y, elapsedTime / 1000);
    printf("------------------------------------------------------------------------------------\n");
    // printMatrix(deviceRef, M, N);
    return 0;
}
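To try the listing, save it as, say, matmul_shared.cu (any filename works) and build it with the CUDA toolkit's nvcc; no extra libraries are needed:

    nvcc matmul_shared.cu -o matmul_shared
    ./matmul_shared

Because every input element is initialized to 1.0, every entry of C should equal K (128.0 here), which the commented-out printMatrix call makes easy to eyeball. The program times both kernels but never actually compares deviceRef against hostRef; a minimal check (a hypothetical addition, not part of the listing above) could be placed just before return 0:

    // Hypothetical correctness check, not part of the original listing:
    int errors = 0;
    for (int i = 0; i < Cxy; i++)
        if (fabsf(hostRef[i] - deviceRef[i]) > 1e-3f) errors++;
    printf("%s (%d mismatches out of %d elements)\n",
           errors ? "Result WRONG" : "Result OK", errors, Cxy);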