赞
踩
找了很久都没有找到调用 CUDA 库中 cublasCgemmBatched 做复数矩阵批量乘法的示例,于是自己尝试写了一个。
1,新建一个cuda项目
配置相关的包含目录、库目录和附加依赖项(如下):
C:\ProgramData\NVIDIA Corporation\CUDA Samples\v10.0\common\inc
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.0\include
C:\ProgramData\NVIDIA Corporation\CUDA Samples\v10.0\common\lib
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.0\lib
cublas.lib
cuda.lib
cudadevrt.lib
cudart.lib
cudart_static.lib
cufft.lib
cufftw.lib
curand.lib
cusolver.lib
cusparse.lib
粘贴源代码。
- #include <stdio.h>
- #include <stdlib.h>
- #include <string.h>
- #include <ctype.h>
- #include <math.h>
-
- #include <cuda_runtime.h>
- #include <cublas_v2.h>
-
- #include <helper_cuda.h>
- //#include "batchCUBLAS.h"
-
-
// Element type used for every matrix in this demo: cuComplex
// (single-precision complex, a float2-style struct with .x = re, .y = im).
#define T_ELEM cuComplex
// Process exit code used when the test cannot run.
#define CUBLASTEST_FAILED 1
const char *sSDKname = "batchCUBLAS";
-
-
-
/* Return the larger of two ints. */
static __inline__ int imax(int x, int y)
{
    if (x < y)
        return y;
    return x;
}
/* George Marsaglia's KISS pseudo-random generator, combining a
 * multiply-with-carry (MWC), a 3-shift register (SHR3) and a linear
 * congruential generator (CONG).  State lives in function-local statics,
 * so the sequence is deterministic across calls but the function is not
 * thread-safe.  The three sub-generators update disjoint state, so
 * expanding the original macros into ordered statements preserves the
 * exact output sequence. */
static __inline__ unsigned cuRand(void)
{
    static unsigned int mwc_z = 362436069, mwc_w = 521288629;
    static unsigned int shr3  = 123456789, cong  = 380116160;

    /* Multiply-with-carry component. */
    mwc_z = 36969 * (mwc_z & 65535) + (mwc_z >> 16);
    mwc_w = 18000 * (mwc_w & 65535) + (mwc_w >> 16);
    unsigned int mwc = (mwc_z << 16) + mwc_w;

    /* Linear congruential component. */
    cong = 69069 * cong + 1234567;

    /* 3-shift register component. */
    shr3 ^= shr3 << 17;
    shr3 ^= shr3 >> 13;
    shr3 ^= shr3 << 5;

    return (mwc ^ cong) + shr3;
}
- #include <windows.h>
/* Wall-clock time in seconds (Windows-only).  Prefers the high-resolution
 * performance counter; falls back to GetTickCount() (millisecond
 * granularity) when the counter is unavailable.  The capability probe and
 * tick period are computed once and cached in statics. */
static __inline__ double second(void)
{
    static int probed = 0;              /* one-time probe already done?   */
    static BOOL useQpc;                 /* high-res counter available?    */
    static double secondsPerTick;       /* 1 / counter frequency          */
    LARGE_INTEGER t;

    if (!probed)
    {
        useQpc = QueryPerformanceFrequency(&t);
        secondsPerTick = 1.0 / (double)t.QuadPart;
        probed = 1;
    }

    if (!useQpc)
    {
        return (double)GetTickCount() / 1000.0;
    }

    QueryPerformanceCounter(&t);
    return (double)t.QuadPart * secondsPerTick;
}
// Options describing one batched-GEMM test run.
struct gemmOpts
{
    int m;            // GEMM dimension m (not read anywhere in this demo)
    int n;            // GEMM dimension n (not read anywhere in this demo)
    int k;            // GEMM dimension k (not read anywhere in this demo)
    //testMethod test_method;
    char *elem_type;  // element-type name (unused here)
    int N; // number of multiplications
};
// Parameters forwarded directly to cublasCgemmBatched:
// C = alpha * op(A) * op(B) + beta * C for every matrix in the batch.
struct gemmTestParams
{
    cublasOperation_t transa;  // op applied to A (CUBLAS_OP_N/T/C)
    cublasOperation_t transb;  // op applied to B
    int m;                     // rows of op(A) and of C
    int n;                     // columns of op(B) and of C
    int k;                     // columns of op(A) == rows of op(B)
    T_ELEM alpha;              // complex scalar multiplying A*B
    T_ELEM beta;               // complex scalar multiplying the old C
};
/* Release every host and device allocation made by main().  Intended to be
 * safe with partially-initialized state: each pointer is tested before use,
 * including the pointer arrays themselves.  (The original dereferenced
 * devPtrA/B/C[i] without checking devPtrA/B/C for NULL and would crash if
 * one of the host mallocs had failed.)  Note the entries are only
 * meaningful if the arrays were zero-initialized before the cudaMallocs. */
#define CLEANUP() \
    do { \
        if (A) free (A); \
        if (B) free (B); \
        if (C) free (C); \
        for(int i = 0; i < opts.N; ++i) { \
            if (devPtrA && devPtrA[i]) cudaFree(devPtrA[i]); \
            if (devPtrB && devPtrB[i]) cudaFree(devPtrB[i]); \
            if (devPtrC && devPtrC[i]) cudaFree(devPtrC[i]); \
        } \
        if (devPtrA) free(devPtrA); \
        if (devPtrB) free(devPtrB); \
        if (devPtrC) free(devPtrC); \
        if (devPtrA_dev) cudaFree(devPtrA_dev); \
        if (devPtrB_dev) cudaFree(devPtrB_dev); \
        if (devPtrC_dev) cudaFree(devPtrC_dev); \
        fflush (stdout); \
    } while (0)
-
/* Fill a column-major rows x cols complex matrix (leading dimension lda)
 * with a debug pattern: element (i, j) gets real part i and imaginary
 * part j, which makes the operands easy to recognize in printed output. */
void fillupMatrixDebug(T_ELEM *A, int lda, int rows, int cols)
{
    for (int col = 0; col < cols; ++col)
    {
        T_ELEM *column = &A[lda * col];
        for (int row = 0; row < rows; ++row)
        {
            column[row].x = row;
            column[row].y = col;
        }
    }
}
/* Print a column-major rows x cols complex matrix (leading dimension lda)
 * to stdout as "+re+imi" entries separated by tabs.
 * NOTE(review): the outer loop walks columns and a newline is emitted per
 * column, so each printed line is a matrix COLUMN (i.e. the matrix appears
 * transposed).  Preserved as-is to keep the original output format. */
void PrintMatrixDebug(T_ELEM *A, int lda, int rows, int cols)
{
    printf("========The matrix is \n");
    for (int col = 0; col < cols; ++col)
    {
        for (int row = 0; row < rows; ++row)
        {
            T_ELEM v = A[row + lda * col];
            printf("%+.02f%+.02fi\t", v.x, v.y);
        }
        printf(" \n");
    }
}
-
/* Demo: multiply opts.N pairs of 5x5 single-precision complex matrices on
 * the GPU with a single cublasCgemmBatched call, fetch one result back,
 * and print A, B and C for visual verification.
 *
 * Fixes relative to the pasted original:
 *  - "&para;ms.alpha"/"&para;ms.beta" (an HTML-escaping artifact in the
 *    paste) restored to &params.alpha / &params.beta;
 *  - C is zeroed before upload instead of sending uninitialized memory;
 *  - fillupMatrixDebug/PrintMatrixDebug receive each matrix's own
 *    (lda, rows, cols) instead of (m, n, k), which only looked right
 *    because m == n == k == 5;
 *  - the asynchronous GEMM is synchronized before the timer stops;
 *  - allocation and cuBLAS statuses are checked;
 *  - dead code (random re-assignment of m/n/k, unused locals) removed. */
int main(int argc, char *argv[])
{
    printf("%s Starting...\n\n", sSDKname);

    int dev = findCudaDevice(argc, (const char **)argv);

    if (dev == -1)
    {
        return CUBLASTEST_FAILED;
    }

    cublasHandle_t handle;

    if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stdout, "CUBLAS initialization failed!\n");
        exit(EXIT_FAILURE);
    }

    cublasStatus_t status1, status2, status3;
    T_ELEM *A = NULL;
    T_ELEM *B = NULL;
    T_ELEM *C = NULL;
    /* Host-side arrays of device pointers, one matrix per batch entry. */
    T_ELEM **devPtrA = NULL;
    T_ELEM **devPtrB = NULL;
    T_ELEM **devPtrC = NULL;
    /* Device-side copies of those arrays: cublasCgemmBatched requires the
     * pointer arrays themselves to reside in device memory. */
    T_ELEM **devPtrA_dev = NULL;
    T_ELEM **devPtrB_dev = NULL;
    T_ELEM **devPtrC_dev = NULL;

    int matrixM, matrixN, matrixK;
    int rowsA, rowsB, rowsC;
    int colsA, colsB, colsC;
    int matrixSizeA, matrixSizeB, matrixSizeC;
    double start, stop;
    gemmOpts opts;
    opts.N = 20;  /* batch count: number of independent multiplications */
    gemmTestParams params;

    printf("Testing Batch INV Cublas\n");

    matrixM = 5;
    matrixN = 5;
    matrixK = 5;

    /* cuBLAS is column-major; each leading dimension equals the row count
     * of the (untransposed) matrix. */
    rowsA = imax(1, matrixM);
    colsA = imax(1, matrixK);
    rowsB = imax(1, matrixK);
    colsB = imax(1, matrixN);
    rowsC = imax(1, matrixM);
    colsC = imax(1, matrixN);

    matrixSizeA = rowsA * colsA;
    matrixSizeB = rowsB * colsB;
    matrixSizeC = rowsC * colsC;

    params.transa = CUBLAS_OP_N;
    params.transb = CUBLAS_OP_N;
    params.m = matrixM;
    params.n = matrixN;
    params.k = matrixK;
    /* C = 1 * A * B + 0 * C, i.e. a plain product. */
    params.alpha.x = 1.0f; params.alpha.y = 0.0f;
    params.beta.x = 0.0f;  params.beta.y = 0.0f;

    /* calloc (not malloc) so the entries start NULL and cleanup never
     * frees an uninitialized device pointer. */
    devPtrA = (T_ELEM **)calloc(opts.N, sizeof(T_ELEM *));
    devPtrB = (T_ELEM **)calloc(opts.N, sizeof(*devPtrB));
    devPtrC = (T_ELEM **)calloc(opts.N, sizeof(*devPtrC));

    if (!devPtrA || !devPtrB || !devPtrC)
    {
        fprintf(stderr, "!!!! host pointer-array allocation failed\n");
        /* Do not use CLEANUP() here: its free loop indexes these arrays. */
        free(devPtrA); free(devPtrB); free(devPtrC);
        cublasDestroy(handle);
        return CUBLASTEST_FAILED;
    }

    for (int i = 0; i < opts.N; i++)
    {
        cudaError_t err1 = cudaMalloc((void **)&devPtrA[i], matrixSizeA * sizeof(T_ELEM));
        cudaError_t err2 = cudaMalloc((void **)&devPtrB[i], matrixSizeB * sizeof(devPtrB[0][0]));
        cudaError_t err3 = cudaMalloc((void **)&devPtrC[i], matrixSizeC * sizeof(devPtrC[0][0]));

        if (err1 != cudaSuccess || err2 != cudaSuccess || err3 != cudaSuccess)
        {
            fprintf(stderr, "!!!! device matrix allocation failed (batch %d)\n", i);
            CLEANUP();
            cublasDestroy(handle);
            return CUBLASTEST_FAILED;
        }
    }

    /* For batched processing the arrays of pointers must live on the
     * device as well: allocate them and copy the host arrays over. */
    cudaError_t err1 = cudaMalloc((void **)&devPtrA_dev, opts.N * sizeof(*devPtrA));
    cudaError_t err2 = cudaMalloc((void **)&devPtrB_dev, opts.N * sizeof(*devPtrB));
    cudaError_t err3 = cudaMalloc((void **)&devPtrC_dev, opts.N * sizeof(*devPtrC));

    if (err1 == cudaSuccess)
        err1 = cudaMemcpy(devPtrA_dev, devPtrA, opts.N * sizeof(*devPtrA), cudaMemcpyHostToDevice);
    if (err2 == cudaSuccess)
        err2 = cudaMemcpy(devPtrB_dev, devPtrB, opts.N * sizeof(*devPtrB), cudaMemcpyHostToDevice);
    if (err3 == cudaSuccess)
        err3 = cudaMemcpy(devPtrC_dev, devPtrC, opts.N * sizeof(*devPtrC), cudaMemcpyHostToDevice);

    if (err1 != cudaSuccess || err2 != cudaSuccess || err3 != cudaSuccess)
    {
        fprintf(stderr, "!!!! device pointer-array setup failed\n");
        CLEANUP();
        cublasDestroy(handle);
        return CUBLASTEST_FAILED;
    }

    A = (T_ELEM *)malloc(matrixSizeA * sizeof(A[0]));
    B = (T_ELEM *)malloc(matrixSizeB * sizeof(B[0]));
    C = (T_ELEM *)malloc(matrixSizeC * sizeof(C[0]));

    if (!A || !B || !C)
    {
        fprintf(stderr, "!!!! host matrix allocation failed\n");
        CLEANUP();
        cublasDestroy(handle);
        return CUBLASTEST_FAILED;
    }

    printf("#### args: lda=%d ldb=%d ldc=%d\n", rowsA, rowsB, rowsC);

    /* Fill A and B with a recognizable pattern; zero C so no uninitialized
     * bytes are uploaded (the original left C uninitialized). */
    fillupMatrixDebug(A, rowsA, rowsA, colsA);
    fillupMatrixDebug(B, rowsB, rowsB, colsB);
    memset(C, 0, matrixSizeC * sizeof(C[0]));

    /* Upload the same A, B, C into every batch slot. */
    for (int i = 0; i < opts.N; i++)
    {
        status1 = cublasSetMatrix(rowsA, colsA, sizeof(A[0]), A, rowsA, devPtrA[i], rowsA);
        status2 = cublasSetMatrix(rowsB, colsB, sizeof(B[0]), B, rowsB, devPtrB[i], rowsB);
        status3 = cublasSetMatrix(rowsC, colsC, sizeof(C[0]), C, rowsC, devPtrC[i], rowsC);

        if (status1 != CUBLAS_STATUS_SUCCESS || status2 != CUBLAS_STATUS_SUCCESS ||
            status3 != CUBLAS_STATUS_SUCCESS)
        {
            fprintf(stderr, "!!!! cublasSetMatrix failed (batch %d)\n", i);
            CLEANUP();
            cublasDestroy(handle);
            return CUBLASTEST_FAILED;
        }
    }

    start = second();

    status1 = cublasCgemmBatched(handle, params.transa, params.transb, params.m, params.n,
                                 params.k, &params.alpha, (const T_ELEM **)devPtrA_dev, rowsA,
                                 (const T_ELEM **)devPtrB_dev, rowsB, &params.beta,
                                 devPtrC_dev, rowsC, opts.N);

    /* The launch is asynchronous: synchronize so the timer measures the
     * actual GPU work rather than just the enqueue. */
    cudaDeviceSynchronize();
    stop = second();

    if (status1 != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! cublasCgemmBatched failed with status %d\n", (int)status1);
        CLEANUP();
        cublasDestroy(handle);
        return CUBLASTEST_FAILED;
    }

    /* Every batch entry holds the same product; download one copy. */
    status3 = cublasGetMatrix(rowsC, colsC, sizeof(C[0]), devPtrC[0], rowsC, C, rowsC);

    if (status3 != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! cublasGetMatrix failed\n");
        CLEANUP();
        cublasDestroy(handle);
        return CUBLASTEST_FAILED;
    }

    /* PrintMatrixDebug(ptr, lda, rows, cols): pass each matrix's own
     * dimensions. */
    PrintMatrixDebug(A, rowsA, rowsA, colsA);
    PrintMatrixDebug(B, rowsB, rowsB, colsB);
    PrintMatrixDebug(C, rowsC, rowsC, colsC);

    fprintf(stdout, "^^^^ elapsed = %10.8f sec \n", (stop - start));

    CLEANUP();
    cublasDestroy(handle);
    printf("\nTest Summary\n");
    system("pause");
    return 0;
}

输出结果,验证乘法结果是正确的。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。