
[Model Inference Optimization Notes] Accelerating Matrix Multiplication with CUDA

Matrix multiplication maps naturally onto the GPU's many parallel threads, but the naive kernel keeps re-reading the same data from global memory into registers, which costs time. Since shared memory is visible to every thread in a block, combining it with tiling removes most of these redundant global loads.
I will skip the theory, which is covered extensively elsewhere. The key point is that many shared-memory matmul examples circulating online are wrong: most of them only happen to produce correct results when blockDim.x == blockDim.y, because the indices coincidentally line up; if the two differ, the output is wrong. Here is a corrected version:


#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <assert.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define dim 128
#define M dim
#define K dim
#define N dim

#define tile_width 4   // tile depth along K; must not exceed blockdimx or blockdimy

#define blockdimx 8 // blockDim.x
#define blockdimy 4 // blockDim.y
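
// The tiled kernel stages A through the threads with tx < tile_width and B
// through the threads with ty < tile_width, so a full tile is covered only
// when tile_width does not exceed either block dimension; enforce that here.
static_assert(tile_width <= blockdimx, "tile_width must not exceed blockDim.x");
static_assert(tile_width <= blockdimy, "tile_width must not exceed blockDim.y");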


void initial(float *array, int size)
{
	for (int i = 0; i < size; i++)
	{
		array[i] = (float)(1);
	}
}

void printMatrix(float *array, int row, int col)
{
    printf("\n");
	float *p = array;
	for (int y = 0; y < row; y++)
	{
		for (int x = 0; x < col; x++)
		{
			printf("%.1f ", p[x]);
		}
		p = p + col;
		printf("\n");
	}
	return;
}

void  multiplicateMatrixOnHost(float *array_A, float *array_B, float *array_C, int M_p, int K_p, int N_p)
{
	for (int i = 0; i < M_p; i++)
	{
		for (int j = 0; j < N_p; j++)
		{
			float sum = 0;
			for (int k = 0; k < K_p; k++)
			{
				sum += array_A[i*K_p + k] * array_B[k*N_p + j];
			}
			array_C[i*N_p + j] = sum;
		}
	}

}

__global__ void multiplicateMatrixOnDevice(float *array_A, float *array_B, float *array_C, int M_p, int K_p, int N_p)
{
	int ix = threadIdx.x + blockDim.x*blockIdx.x; // column index into C (and B)
	int iy = threadIdx.y + blockDim.y*blockIdx.y; // row index into C (and A)

	if (ix < N_p && iy < M_p)
	{
		float sum = 0;
		for (int k = 0; k < K_p; k++)
		{
			sum += array_A[iy*K_p + k] * array_B[k*N_p + ix];
		}
		array_C[iy*N_p + ix] = sum;
	}
}
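
// Note: in multiplicateMatrixOnDevice every thread issues 2*K_p global-memory
// loads, and each element of A is re-read by every thread computing the same
// output row (one per column of C). The tiled kernel below stages that reuse
// through shared memory, so each tile element is fetched from global memory
// once per block.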

// Compute C = A * B
//  M, K, K, N, M, N
__global__ void matrixMultiplyShared(float *A, float *B, float *C,
	int numARows, int numAColumns, int numBRows, int numBColumns, int numCRows, int numCColumns)
{
	// Fix 1 (vs. the common online versions): the tile shapes must follow the
	// block shape. sharedM holds a blockDim.y x tile_width slice of A, and
	// sharedN a tile_width x blockDim.x slice of B.
	__shared__ float sharedM[blockdimy][tile_width];
	__shared__ float sharedN[tile_width][blockdimx];

	int bx = blockIdx.x;   
	int by = blockIdx.y;   
	int tx = threadIdx.x; 
	int ty = threadIdx.y; 

	int row = by * blockDim.y + ty;  // row index in A, and in C
	int col = bx * blockDim.x + tx;  // column index in B, and in C


	float Csub = 0.0;   
	for (int i = 0; i < (int)(ceil((float)numAColumns / tile_width)); i++)
	{
        // Nested guards: with a non-square block, a thread whose tx falls outside
        // the tile must not take the else branch and overwrite valid data with 0
		if (tx < tile_width) {
            if ((i*tile_width + tx) < numAColumns && row < numARows) {
                sharedM[ty][tx] = A[row*numAColumns + i*tile_width + tx];
            }
            else 
                sharedM[ty][tx] = 0.0;
        }

        // Fix 2: index the B tile by ty (the row within the tile), guarded by ty
        if (ty < tile_width) {
            if (i*tile_width + ty < numBRows && col < numBColumns)
                sharedN[ty][tx] = B[(i*tile_width + ty)*numBColumns + col];
            else
                sharedN[ty][tx] = 0.0;
        }
		__syncthreads();

        // if (blockIdx.x == 0 && blockIdx.y == 1 && threadIdx.x == 0 && threadIdx.y ==0 ) {
        //     // printf("i = %d\n", i);
        //     printf("sharedM: \n");
        //     for (int i = 0; i < blockdimy; ++i) {
        //         for (int j = 0; j < tile_width; ++j) {
        //             printf("%.1f ",  sharedM[i][j]);
        //         }
        //         printf("\n");
        //     }
        //     printf("sharedN: \n");
        //     for (int i = 0; i < tile_width; ++i) {
        //         for (int j = 0; j < blockdimx; ++j) {
        //             printf("%.1f ", sharedN[i][j]);
        //         }
        //         printf("\n");
        //     }
        // }

		// Fix 3: accumulate along the tile: sharedM[ty][j] * sharedN[j][tx]
		for (int j = 0; j < tile_width; j++)
			Csub += sharedM[ty][j] * sharedN[j][tx];
		__syncthreads();
	}

	if (row < numCRows && col < numCColumns)
		C[row*numCColumns + col] = Csub;

}
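
// Worked trace (with blockDim = (8,4) and tile_width = 4): thread (tx=5, ty=2)
// in block (bx=0, by=1) owns C[row=6][col=5]. In tile step i it loads
// sharedN[2][5] = B[(4*i + 2)*N + 5] since ty < tile_width, but skips the
// sharedM load since tx = 5 >= tile_width; threads with tx < 4 fill
// sharedM[2][0..3] from row 6 of A instead. After the barrier, the inner loop
// accumulates sharedM[2][j] * sharedN[j][5] for j = 0..3.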


int main(int argc, char **argv)
{
	clock_t start = 0, finish = 0;
	float time;

    int Axy = M * K;
	int Bxy = K * N;
	int Cxy = M * N;

    float *h_A, *h_B, *hostRef, *deviceRef;
	h_A = (float*)malloc(Axy * sizeof(float));
	h_B = (float*)malloc(Bxy * sizeof(float));

	hostRef = (float*)malloc(Cxy * sizeof(float));
	deviceRef = (float*)malloc(Cxy * sizeof(float));

    initial(h_A, Axy);
	initial(h_B, Bxy);

    // for cpu
	start = clock();
	multiplicateMatrixOnHost(h_A, h_B, hostRef, M, K, N);
	finish = clock();
	time = (float)(finish - start) / CLOCKS_PER_SEC;
	printf("\n");
	printf("------------------------------------------------------------------------------------\n");
	printf("Computing matrix product using multiplicateMatrixOnHost \n");
    printf("Matrix_hostRef: (%d × %d)  CPU运行时间为:%lfs\n", M, N, time);
	printf("------------------------------------------------------------------------------------\n");


    float *d_A, *d_B, *d_C;
	cudaMalloc((void**)&d_A, Axy * sizeof(float));
	cudaMalloc((void**)&d_B, Bxy * sizeof(float));
	cudaMalloc((void**)&d_C, Cxy * sizeof(float));

    cudaMemcpy(d_A, h_A, Axy * sizeof(float), cudaMemcpyHostToDevice);
	cudaMemcpy(d_B, h_B, Bxy * sizeof(float), cudaMemcpyHostToDevice);

    int dimx = blockdimx;
    int dimy = blockdimy;
    dim3 block(dimx, dimy);
    // grid.x must cover the columns (N) and grid.y the rows (M); swapping them
    // only happens to work here because M == N
    dim3 grid((N + block.x - 1) / block.x, (M + block.y - 1) / block.y);

    cudaEvent_t gpustart, gpustop;

    // for common gpu 
    float elapsedTime = 0.0;
    cudaEventCreate(&gpustart);
    cudaEventCreate(&gpustop);

    cudaEventRecord(gpustart, 0);
    multiplicateMatrixOnDevice<<<grid,block>>> (d_A, d_B, d_C, M, K, N);
    // matrixMultiplyShared << < grid, block >> > (d_A, d_B, d_C, M, K, K, N, M, N);
    cudaDeviceSynchronize();
    cudaEventRecord(gpustop, 0);
    cudaEventSynchronize(gpustop);

    cudaEventElapsedTime(&elapsedTime, gpustart, gpustop);
    cudaEventDestroy(gpustart);
    cudaEventDestroy(gpustop);
    cudaMemcpy(deviceRef, d_C, Cxy * sizeof(float), cudaMemcpyDeviceToHost);
    printf("\n\n");
	printf("------------------------------------------------------------------------------------\n");
	printf("Computing matrix product using multiplicateMatrixOnDevice \n");
    printf("Matrix_deviceRef: (%d × %d)  <<<(%d, %d), (%d, %d)>>>  GPU运行时间为:%fs\n",
			M, N, grid.x, grid.y, block.x, block.y, elapsedTime / 1000);
	printf("------------------------------------------------------------------------------------\n");


    // for shared memory opt 
    elapsedTime = 0.0;
    cudaEventCreate(&gpustart);
    cudaEventCreate(&gpustop);

    cudaEventRecord(gpustart, 0);
    // multiplicateMatrixOnDevice<<<grid,block>>> (d_A, d_B, d_C, M, K, N);
    matrixMultiplyShared<<<grid, block>>>(d_A, d_B, d_C, M, K, K, N, M, N);
    cudaDeviceSynchronize();
    cudaEventRecord(gpustop, 0);
    cudaEventSynchronize(gpustop);

    cudaEventElapsedTime(&elapsedTime, gpustart, gpustop);
    cudaEventDestroy(gpustart);
    cudaEventDestroy(gpustop);
    cudaMemcpy(deviceRef, d_C, Cxy * sizeof(float), cudaMemcpyDeviceToHost);
    printf("\n\n");
	printf("------------------------------------------------------------------------------------\n");
	printf("Computing matrix product using matrixMultiplyShared \n");
    printf("Matrix_deviceRef: (%d × %d)  <<<(%d, %d), (%d, %d)>>>  GPU运行时间为:%fs\n",
			M, N, grid.x, grid.y, block.x, block.y, elapsedTime / 1000);
	printf("------------------------------------------------------------------------------------\n");


    // printMatrix(deviceRef, M, N);
    // release device and host buffers
    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
    free(h_A); free(h_B); free(hostRef); free(deviceRef);
    return 0;
}
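
One thing the listing never does is compare hostRef against deviceRef, even though it computes both. A minimal verification helper that could be dropped into the file is sketched below; the name checkResult and the tolerance are my own choices, not part of the original code:

// Hypothetical helper: element-wise comparison of the CPU and GPU results.
// Call as checkResult(hostRef, deviceRef, Cxy) after the cudaMemcpy back.
void checkResult(const float *hostRef, const float *deviceRef, int n)
{
	const float eps = 1.0e-5f;
	for (int i = 0; i < n; i++)
	{
		if (fabsf(hostRef[i] - deviceRef[i]) > eps)
		{
			printf("Mismatch at %d: host %f vs device %f\n", i, hostRef[i], deviceRef[i]);
			return;
		}
	}
	printf("Results match.\n");
}

Since initial() fills both inputs with 1.0f, every entry of C should equal K (128.0f here), which also makes a quick spot check with printMatrix easy.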
