
Complex Matrix Multiplication with the cuBLAS Library in CUDA under Visual Studio (C++ GPU complex matrix operations)


I searched for a long time but could not find an example that calls cublasCgemmBatched from the CUDA cuBLAS library to multiply complex matrices, so I tried writing one myself.
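
For reference, cublasCgemmBatched performs the same single-precision complex GEMM as cublasCgemm, but on a whole batch at once: it computes C[i] = alpha * op(A[i]) * op(B[i]) + beta * C[i] for every i in 0..batchCount-1. Its declaration in cublas_v2.h looks roughly like the following (the exact const qualifiers on the pointer arrays differ slightly between CUDA versions):

cublasStatus_t cublasCgemmBatched(cublasHandle_t handle,
                                  cublasOperation_t transa, cublasOperation_t transb,
                                  int m, int n, int k,
                                  const cuComplex       *alpha,
                                  const cuComplex *const Aarray[], int lda,
                                  const cuComplex *const Barray[], int ldb,
                                  const cuComplex       *beta,
                                  cuComplex       *const Carray[], int ldc,
                                  int batchCount);

The important detail is that Aarray, Barray and Carray must be device-resident arrays of device pointers. That is why the program below first allocates one device buffer per matrix, keeps the pointers in host arrays (devPtrA/devPtrB/devPtrC), and then copies those pointer arrays to the device (devPtrA_dev/devPtrB_dev/devPtrC_dev).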

1. Create a new CUDA project.

Configure the relevant include and library paths and the linker dependencies (a minimal linkage check is sketched after the lists below):

 

Additional include directories:

C:\ProgramData\NVIDIA Corporation\CUDA Samples\v10.0\common\inc
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.0\include

Additional library directories (for a 64-bit build the toolkit libraries are usually under lib\x64):

C:\ProgramData\NVIDIA Corporation\CUDA Samples\v10.0\common\lib
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.0\lib

 

Linker additional dependencies (for this sample, cublas.lib and cudart.lib are the essential ones; the rest do no harm):

cublas.lib

cuda.lib
cudadevrt.lib
cudart.lib
cudart_static.lib
cufft.lib
cufftw.lib
curand.lib
cusolver.lib
cusparse.lib
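
Before building the full sample, you may want to confirm that the include paths and linker settings above are picked up correctly. The snippet below is a minimal linkage check of my own (not part of the original project); it only needs cublas.lib and cudart.lib and simply prints the cuBLAS version:

#include <cstdio>
#include <cuda_runtime.h>
#include <cublas_v2.h>

int main()
{
    cublasHandle_t handle;
    if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS)
    {
        printf("cuBLAS initialization failed\n");
        return 1;
    }
    int version = 0;
    cublasGetVersion(handle, &version);   // e.g. 10000 for the CUDA 10.0 cuBLAS
    printf("cuBLAS version: %d\n", version);
    cublasDestroy(handle);
    return 0;
}

If this compiles, links and runs, the project configuration is correct and you can move on to the real code.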

Paste in the source code below.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <helper_cuda.h>
//#include "batchCUBLAS.h"

#define T_ELEM cuComplex
#define CUBLASTEST_FAILED 1

const char *sSDKname = "batchCUBLAS";

static __inline__ int imax(int x, int y)
{
    return (x > y) ? x : y;
}

/* George Marsaglia's fast inline random number generator (KISS) */
static __inline__ unsigned cuRand(void)
{
#define CUDA_ZNEW (cuda_z = 36969 * (cuda_z & 65535) + (cuda_z >> 16))
#define CUDA_WNEW (cuda_w = 18000 * (cuda_w & 65535) + (cuda_w >> 16))
#define CUDA_MWC  ((CUDA_ZNEW << 16) + CUDA_WNEW)
#define CUDA_SHR3 (cuda_jsr = cuda_jsr ^ (cuda_jsr << 17), \
                   cuda_jsr = cuda_jsr ^ (cuda_jsr >> 13), \
                   cuda_jsr = cuda_jsr ^ (cuda_jsr << 5))
#define CUDA_CONG (cuda_jcong = 69069 * cuda_jcong + 1234567)
#define KISS      ((CUDA_MWC ^ CUDA_CONG) + CUDA_SHR3)
    static unsigned int cuda_z = 362436069, cuda_w = 521288629;
    static unsigned int cuda_jsr = 123456789, cuda_jcong = 380116160;
    return KISS;
}

#include <windows.h>

/* Wall-clock timer in seconds (uses the high-resolution counter when available) */
static __inline__ double second(void)
{
    LARGE_INTEGER t;
    static double oofreq;
    static int checkedForHighResTimer;
    static BOOL hasHighResTimer;

    if (!checkedForHighResTimer)
    {
        hasHighResTimer = QueryPerformanceFrequency(&t);
        oofreq = 1.0 / (double)t.QuadPart;
        checkedForHighResTimer = 1;
    }

    if (hasHighResTimer)
    {
        QueryPerformanceCounter(&t);
        return (double)t.QuadPart * oofreq;
    }
    else
    {
        return (double)GetTickCount() / 1000.0;
    }
}

struct gemmOpts
{
    int m;
    int n;
    int k;
    //testMethod test_method;
    char *elem_type;
    int N;                     // number of multiplications in the batch
};

struct gemmTestParams
{
    cublasOperation_t transa;
    cublasOperation_t transb;
    int m;
    int n;
    int k;
    T_ELEM alpha;
    T_ELEM beta;
};

#define CLEANUP()                                   \
    do {                                            \
        if (A) free(A);                             \
        if (B) free(B);                             \
        if (C) free(C);                             \
        for (int i = 0; i < opts.N; ++i) {          \
            if (devPtrA[i]) cudaFree(devPtrA[i]);   \
            if (devPtrB[i]) cudaFree(devPtrB[i]);   \
            if (devPtrC[i]) cudaFree(devPtrC[i]);   \
        }                                           \
        if (devPtrA) free(devPtrA);                 \
        if (devPtrB) free(devPtrB);                 \
        if (devPtrC) free(devPtrC);                 \
        if (devPtrA_dev) cudaFree(devPtrA_dev);     \
        if (devPtrB_dev) cudaFree(devPtrB_dev);     \
        if (devPtrC_dev) cudaFree(devPtrC_dev);     \
        fflush(stdout);                             \
    } while (0)

/* Fill a column-major rows x cols matrix: element (i, j) gets real part i and imaginary part j */
void fillupMatrixDebug(T_ELEM *A, int lda, int rows, int cols)
{
    for (int j = 0; j < cols; j++)
    {
        for (int i = 0; i < rows; i++)
        {
            A[i + lda * j].x = (float)i;
            A[i + lda * j].y = (float)j;
        }
    }
}

/* Print a column-major matrix; note that each printed line is one column of the matrix */
void PrintMatrixDebug(T_ELEM *A, int lda, int rows, int cols)
{
    printf("========The matrix is \n");
    for (int j = 0; j < cols; j++)
    {
        for (int i = 0; i < rows; i++)
        {
            printf("%+.02f%+.02fi\t", A[i + lda * j].x, A[i + lda * j].y);
        }
        printf(" \n");
    }
}

int main(int argc, char *argv[])
{
    printf("%s Starting...\n\n", sSDKname);

    int dev = findCudaDevice(argc, (const char **)argv);
    if (dev == -1)
    {
        return CUBLASTEST_FAILED;
    }

    cublasHandle_t handle;
    if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stdout, "CUBLAS initialization failed!\n");
        exit(EXIT_FAILURE);
    }

    cublasStatus_t status1, status2, status3;
    T_ELEM *A = NULL;
    T_ELEM *B = NULL;
    T_ELEM *C = NULL;
    T_ELEM **devPtrA = 0;            // host arrays of device pointers, one matrix per batch entry
    T_ELEM **devPtrB = 0;
    T_ELEM **devPtrC = 0;
    T_ELEM **devPtrA_dev = NULL;     // device copies of those pointer arrays
    T_ELEM **devPtrB_dev = NULL;
    T_ELEM **devPtrC_dev = NULL;
    int matrixM, matrixN, matrixK;
    int rowsA, rowsB, rowsC;
    int colsA, colsB, colsC;
    int m, n, k;
    int matrixSizeA, matrixSizeB, matrixSizeC;
    double start, stop;
    gemmOpts opts;
    opts.N = 20;                     // batch size: 20 independent multiplications
    gemmTestParams params;

    printf("Testing batched complex GEMM (cublasCgemmBatched)\n");

    matrixM = 5;
    matrixN = 5;
    matrixK = 5;
    rowsA = imax(1, matrixM);
    colsA = imax(1, matrixK);
    rowsB = imax(1, matrixK);
    colsB = imax(1, matrixN);
    rowsC = imax(1, matrixM);
    colsC = imax(1, matrixN);
    matrixSizeA = rowsA * colsA;
    matrixSizeB = rowsB * colsB;
    matrixSizeC = rowsC * colsC;

    params.transa = CUBLAS_OP_N;
    params.transb = CUBLAS_OP_N;
    m = matrixM;
    n = matrixN;
    k = matrixK;
    params.m = m;
    params.n = n;
    params.k = k;
    params.alpha.x = 1.0f; params.alpha.y = 0.0f;    // C = 1 * A * B + 0 * C
    params.beta.x  = 0.0f; params.beta.y  = 0.0f;

    /* Allocate one device matrix per batch entry */
    devPtrA = (T_ELEM **)malloc(opts.N * sizeof(T_ELEM *));
    devPtrB = (T_ELEM **)malloc(opts.N * sizeof(*devPtrB));
    devPtrC = (T_ELEM **)malloc(opts.N * sizeof(*devPtrC));

    for (int i = 0; i < opts.N; i++)
    {
        cudaError_t err1 = cudaMalloc((void **)&devPtrA[i], matrixSizeA * sizeof(T_ELEM));
        cudaError_t err2 = cudaMalloc((void **)&devPtrB[i], matrixSizeB * sizeof(devPtrB[0][0]));
        cudaError_t err3 = cudaMalloc((void **)&devPtrC[i], matrixSizeC * sizeof(devPtrC[0][0]));
    }

    /* For batched processing the arrays of pointers must themselves live on the device */
    cudaError_t err1 = cudaMalloc((void **)&devPtrA_dev, opts.N * sizeof(*devPtrA));
    cudaError_t err2 = cudaMalloc((void **)&devPtrB_dev, opts.N * sizeof(*devPtrB));
    cudaError_t err3 = cudaMalloc((void **)&devPtrC_dev, opts.N * sizeof(*devPtrC));
    err1 = cudaMemcpy(devPtrA_dev, devPtrA, opts.N * sizeof(*devPtrA), cudaMemcpyHostToDevice);
    err2 = cudaMemcpy(devPtrB_dev, devPtrB, opts.N * sizeof(*devPtrB), cudaMemcpyHostToDevice);
    err3 = cudaMemcpy(devPtrC_dev, devPtrC, opts.N * sizeof(*devPtrC), cudaMemcpyHostToDevice);

    A = (T_ELEM *)malloc(matrixSizeA * sizeof(A[0]));
    B = (T_ELEM *)malloc(matrixSizeB * sizeof(B[0]));
    C = (T_ELEM *)malloc(matrixSizeC * sizeof(C[0]));

    printf("#### args: lda=%d ldb=%d ldc=%d\n", rowsA, rowsB, rowsC);

    /* Leftover from the original batchCUBLAS sample: these random sizes are not used below */
    m = cuRand() % matrixM;
    n = cuRand() % matrixN;
    k = cuRand() % matrixK;

    /* Fill the host matrices (real part = row index, imaginary part = column index) */
    memset(A, 0xFF, matrixSizeA * sizeof(A[0]));
    fillupMatrixDebug(A, rowsA, rowsA, colsA);
    memset(B, 0xFF, matrixSizeB * sizeof(B[0]));
    fillupMatrixDebug(B, rowsB, rowsB, colsB);
    memset(C, 0, matrixSizeC * sizeof(C[0]));

    /* Upload the same A, B and a zeroed C to every batch entry */
    for (int i = 0; i < opts.N; i++)
    {
        status1 = cublasSetMatrix(rowsA, colsA, sizeof(A[0]), A, rowsA, devPtrA[i], rowsA);
        status2 = cublasSetMatrix(rowsB, colsB, sizeof(B[0]), B, rowsB, devPtrB[i], rowsB);
        status3 = cublasSetMatrix(rowsC, colsC, sizeof(C[0]), C, rowsC, devPtrC[i], rowsC);
    }

    start = second();

    /* One call multiplies all opts.N pairs: C[i] = alpha * A[i] * B[i] + beta * C[i] */
    status1 = cublasCgemmBatched(handle, params.transa, params.transb, params.m, params.n,
                                 params.k, &params.alpha, (const T_ELEM **)devPtrA_dev, rowsA,
                                 (const T_ELEM **)devPtrB_dev, rowsB, &params.beta,
                                 devPtrC_dev, rowsC, opts.N);
    if (status1 != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stdout, "cublasCgemmBatched failed with status %d\n", (int)status1);
    }

    /* Copy the results back; every batch entry holds the same product, so only the last C is kept */
    for (int i = 0; i < opts.N; i++)
    {
        status3 = cublasGetMatrix(rowsC, colsC, sizeof(C[0]), devPtrC[i], rowsC, C, rowsC);
    }

    PrintMatrixDebug(A, rowsA, params.m, params.k);
    PrintMatrixDebug(B, rowsB, params.k, params.n);
    PrintMatrixDebug(C, rowsC, params.m, params.n);

    stop = second();
    fprintf(stdout, "^^^^ elapsed = %10.8f sec (includes the copy-back and printing)\n", (stop - start));

    CLEANUP();
    cublasDestroy(handle);
    printf("\nTest Summary\n");
    system("pause");
    return 0;
}

Run the program and inspect the output; it confirms that the complex multiplication result is correct.
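
If you prefer to verify programmatically rather than by inspecting the printout, you can compare the matrix copied back from the GPU against a naive host-side multiply. The helper below is a sketch of my own (the function name and tolerance are arbitrary, not part of the original code); it assumes the same column-major cuComplex layout used in the program and relies on the complex helpers from cuComplex.h:

#include <cuComplex.h>

/* Hypothetical helper: naive host-side C = A * B for column-major matrices,
   compared element-wise against the result fetched from the GPU. */
static int checkAgainstHostGemm(const cuComplex *A, const cuComplex *B,
                                const cuComplex *Cgpu,
                                int m, int n, int k,
                                int lda, int ldb, int ldc)
{
    const float tol = 1e-3f;                       // loose single-precision tolerance
    for (int j = 0; j < n; j++)
    {
        for (int i = 0; i < m; i++)
        {
            cuComplex acc = make_cuComplex(0.0f, 0.0f);
            for (int p = 0; p < k; p++)
            {
                /* acc += A(i,p) * B(p,j) */
                acc = cuCaddf(acc, cuCmulf(A[i + lda * p], B[p + ldb * j]));
            }
            if (cuCabsf(cuCsubf(acc, Cgpu[i + ldc * j])) > tol)
            {
                return 0;                          // mismatch
            }
        }
    }
    return 1;                                      // all elements agree within tolerance
}

In the program above it could be called right after the cublasGetMatrix loop, for example as checkAgainstHostGemm(A, B, C, params.m, params.n, params.k, rowsA, rowsB, rowsC). For the specific 5 x 5 input used here (both A and B have real part equal to the row index and imaginary part equal to the column index), the expected product has real part 10*(i - j) and imaginary part 5*i*j + 30 at row i, column j, which is what the printed C matrix should show.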

 
