
Are the values of cublasSgetriBatched's input matrix A changed (in other words, polluted) by the computation? The answer is No.


The test program below is adapted from an NVIDIA sample source file; running it shows that the element values of the input matrix A passed to Sgetri are not changed.
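This observation is consistent with the function prototype in recent cuBLAS documentation, where Aarray is const-qualified; it is worth confirming against the cublas_api.h header of your own toolkit, since older releases may differ:

// Prototype as given in current cuBLAS documentation (assumption: a recent
// toolkit). Note the const qualifiers: getri reads the LU factors from
// Aarray and writes the inverse into the separate output array Carray.
cublasStatus_t cublasSgetriBatched(cublasHandle_t handle, int n,
                                   const float* const Aarray[], int lda,
                                   const int* PivotArray,
                                   float* const Carray[], int ldc,
                                   int* infoArray, int batchSize);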

To build it, place the following .cu file into the corresponding BLAS sample folder of the CUDA samples, adjust the file-name variables in that folder's makefile accordingly, then compile and run:
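Alternatively, if editing the makefile is inconvenient, a direct nvcc invocation along these lines should also work; the include path for helper_cuda.h varies between CUDA sample versions, and the file name getri_test.cu is just a placeholder, so adjust both for your setup:

nvcc getri_test.cu -I/usr/local/cuda/samples/common/inc -o getri_test -lcublas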

/*
 * Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */
/*
 * This example demonstrates how to use the cuBLAS library API
 * for lower-upper (LU) decomposition of a matrix. LU decomposition
 * factors a matrix as the product of a lower triangular matrix and
 * an upper triangular matrix.
 *
 * https://en.wikipedia.org/wiki/LU_decomposition
 *
 * This sample uses BATCH_SIZE matrices of size N x N and performs
 * LU decomposition of them using the batched decomposition API
 * of the cuBLAS library. To test the correctness of the upper and
 * lower matrices generated, they are multiplied and compared with
 * the original input matrix.
 *
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h> /* for memset() */

// cuda libraries and helpers
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <helper_cuda.h>

// configurable parameters
// dimension of matrix
#define N 170
#define BATCH_SIZE 1

// use double precision data type
//LL: #define DOUBLE_PRECISION /* comment this to use single precision */
#ifdef DOUBLE_PRECISION
#define DATA_TYPE double
#define MAX_ERROR 1e-15
#else
#define DATA_TYPE float
#define MAX_ERROR 1e-6
#endif /* DOUBLE_PRECISION */

// use pivot vector while decomposing
#define PIVOT /* comment this to disable pivot use */
// helper functions

// wrapper around cublas<t>getrfBatched()
cublasStatus_t cublasXgetrfBatched(cublasHandle_t handle, int n,
                                   DATA_TYPE* const A[], int lda,
                                   int* P, int* info, int batchSize) {
#ifdef DOUBLE_PRECISION
  return cublasDgetrfBatched(handle, n, A, lda, P, info, batchSize);
#else
  return cublasSgetrfBatched(handle, n, A, lda, P, info, batchSize);
#endif
}

// wrapper around malloc
// clears the allocated memory to 0
// terminates the program if malloc fails
void* xmalloc(size_t size) {
  void* ptr = malloc(size);
  if (ptr == NULL) {
    printf("> ERROR: malloc for size %zu failed..\n", size);
    exit(EXIT_FAILURE);
  }
  memset(ptr, 0, size);
  return ptr;
}
// initialize identity matrix
void initIdentityMatrix(DATA_TYPE* mat) {
  // clear the matrix
  memset(mat, 0, N * N * sizeof(DATA_TYPE));
  // set all diagonal elements to 1
  for (int i = 0; i < N; i++) {
    mat[(i * N) + i] = 1.0;
  }
}

// initialize matrix with all elements as 0
void initZeroMatrix(DATA_TYPE* mat) {
  memset(mat, 0, N * N * sizeof(DATA_TYPE));
}

// fill random values into a column-major matrix
void initRandomMatrix(DATA_TYPE* mat) {
  for (int i = 0; i < N; i++) {
    for (int j = 0; j < N; j++) {
      mat[(j * N) + i] =
          (DATA_TYPE)1.0 + ((DATA_TYPE)rand() / (DATA_TYPE)RAND_MAX);
    }
  }
  // make the matrix diagonally dominant to ensure it is invertible
  for (int i = 0; i < N; i++) {
    mat[(i * N) + i] += (DATA_TYPE)N;
  }
}
// print column-major matrix
void printMatrix(DATA_TYPE* mat) {
  for (int i = 0; i < N; i++) {
    for (int j = 0; j < N; j++) {
      printf("%20.16f ", mat[(j * N) + i]);
    }
    printf("\n");
  }
  printf("\n");
}

// matrix multiplication
void matrixMultiply(DATA_TYPE* res, DATA_TYPE* mat1, DATA_TYPE* mat2) {
  initZeroMatrix(res);
  for (int i = 0; i < N; i++) {
    for (int j = 0; j < N; j++) {
      for (int k = 0; k < N; k++) {
        res[(j * N) + i] += mat1[(k * N) + i] * mat2[(j * N) + k];
      }
    }
  }
}
// check matrix equality up to a relative error bound
bool checkRelativeError(DATA_TYPE* mat1, DATA_TYPE* mat2, DATA_TYPE maxError) {
  DATA_TYPE err = (DATA_TYPE)0.0;
  DATA_TYPE refNorm = (DATA_TYPE)0.0;
  DATA_TYPE relError = (DATA_TYPE)0.0;
  DATA_TYPE relMaxError = (DATA_TYPE)0.0;

  for (int i = 0; i < N * N; i++) {
    refNorm = fabs(mat1[i]);
    err = fabs(mat1[i] - mat2[i]);
    if (refNorm != 0.0 && err > 0.0) {
      relError = err / refNorm;
      relMaxError = MAX(relMaxError, relError);
    }
    if (relMaxError > maxError) return false;
  }
  return true;
}
// decode lower and upper matrices from the single matrix
// returned by getrfBatched()
void getLUdecoded(DATA_TYPE* mat, DATA_TYPE* L, DATA_TYPE* U) {
  // init L as identity matrix
  initIdentityMatrix(L);
  // copy lower triangular values from mat to L (skip diagonal)
  for (int i = 0; i < N; i++) {
    for (int j = 0; j < i; j++) {
      L[(j * N) + i] = mat[(j * N) + i];
    }
  }
  // init U as all zero
  initZeroMatrix(U);
  // copy upper triangular values from mat to U
  for (int i = 0; i < N; i++) {
    for (int j = i; j < N; j++) {
      U[(j * N) + i] = mat[(j * N) + i];
    }
  }
}

// generate permutation matrix from pivot vector
void getPmatFromPivot(DATA_TYPE* Pmat, int* P) {
  int pivot[N];

  // the pivot vector is base-1; convert it to base-0
  for (int i = 0; i < N; i++) {
    P[i]--;
  }

  // generate permutation vector from pivot:
  // initialize pivot with identity sequence
  for (int k = 0; k < N; k++) {
    pivot[k] = k;
  }

  // swap the indices according to pivot vector
  for (int k = 0; k < N; k++) {
    int q = P[k];
    // swap pivot(k) and pivot(q)
    int s = pivot[k];
    int t = pivot[q];
    pivot[k] = t;
    pivot[q] = s;
  }

  // generate permutation matrix from pivot vector
  initZeroMatrix(Pmat);
  for (int i = 0; i < N; i++) {
    int j = pivot[i];
    Pmat[(j * N) + i] = (DATA_TYPE)1.0;
  }
}

// print an n x n column-major float matrix stored with leading dimension lda
void printFloatMatrix(float* A, int n, int lda) {
  for (int i = 0; i < n; i++) {
    for (int j = 0; j < n; j++) {
      printf(" %7.4f", A[i + j * lda]);
    }
    printf("\n");
  }
  printf("\n\n");
}
int main(int argc, char** argv) {
  // cuBLAS variables
  cublasStatus_t status;
  cublasHandle_t handle;

  // host variables
  size_t matSize = N * N * sizeof(DATA_TYPE);
  DATA_TYPE* h_AarrayInput;
  DATA_TYPE* h_AarrayOutput;
  DATA_TYPE* h_ptr_array[BATCH_SIZE];
  int* h_pivotArray;
  int* h_infoArray;

  // device variables
  DATA_TYPE* d_Aarray;
  DATA_TYPE** d_ptr_array;
  int* d_pivotArray;
  int* d_infoArray;

  int err_count = 0;

  // seed rand() with a fixed value so runs are reproducible
  srand(12345);

  // find cuda device
  printf("> initializing..\n");
  int dev = findCudaDevice(argc, (const char**)argv);
  if (dev == -1) {
    return (EXIT_FAILURE);
  }

  // initialize cuBLAS
  status = cublasCreate(&handle);
  if (status != CUBLAS_STATUS_SUCCESS) {
    printf("> ERROR: cuBLAS initialization failed..\n");
    return (EXIT_FAILURE);
  }

#ifdef DOUBLE_PRECISION
  printf("> using DOUBLE precision..\n");
#else
  printf("> using SINGLE precision..\n");
#endif

#ifdef PIVOT
  printf("> pivot ENABLED..\n");
#else
  printf("> pivot DISABLED..\n");
#endif

  // allocate memory for host variables
  h_AarrayInput = (DATA_TYPE*)xmalloc(BATCH_SIZE * matSize);
  h_AarrayOutput = (DATA_TYPE*)xmalloc(BATCH_SIZE * matSize);
  h_pivotArray = (int*)xmalloc(N * BATCH_SIZE * sizeof(int));
  h_infoArray = (int*)xmalloc(BATCH_SIZE * sizeof(int));

  // allocate memory for device variables
  checkCudaErrors(cudaMalloc((void**)&d_Aarray, BATCH_SIZE * matSize));
  checkCudaErrors(cudaMalloc((void**)&d_pivotArray, N * BATCH_SIZE * sizeof(int)));
  checkCudaErrors(cudaMalloc((void**)&d_infoArray, BATCH_SIZE * sizeof(int)));
  checkCudaErrors(cudaMalloc((void**)&d_ptr_array, BATCH_SIZE * sizeof(DATA_TYPE*)));

  // fill matrices with random data
  printf("> generating random matrices..\n");
  for (int i = 0; i < BATCH_SIZE; i++) {
    initRandomMatrix(h_AarrayInput + (i * N * N));
  }

  // copy data to device from host
  printf("> copying data from host memory to GPU memory..\n");
  checkCudaErrors(cudaMemcpy(d_Aarray, h_AarrayInput, BATCH_SIZE * matSize,
                             cudaMemcpyHostToDevice));

  // create pointer array for matrices
  for (int i = 0; i < BATCH_SIZE; i++)
    h_ptr_array[i] = d_Aarray + (i * N * N);

  // copy pointer array to device memory
  checkCudaErrors(cudaMemcpy(d_ptr_array, h_ptr_array,
                             BATCH_SIZE * sizeof(DATA_TYPE*),
                             cudaMemcpyHostToDevice));

  // perform LU decomposition
  printf("> performing LU decomposition..\n");
#ifdef PIVOT
  status = cublasXgetrfBatched(handle, N, d_ptr_array, N, d_pivotArray,
                               d_infoArray, BATCH_SIZE);
#else
  status = cublasXgetrfBatched(handle, N, d_ptr_array, N, NULL,
                               d_infoArray, BATCH_SIZE);
#endif /* PIVOT */
  if (status != CUBLAS_STATUS_SUCCESS) {
    printf("> ERROR: cublasXgetrfBatched() failed with error %s..\n",
           _cudaGetErrorEnum(status));
    return (EXIT_FAILURE);
  }

  // copy data to host from device
  printf("> copying data from GPU memory to host memory..\n");
  checkCudaErrors(cudaMemcpy(h_AarrayOutput, d_Aarray, BATCH_SIZE * matSize,
                             cudaMemcpyDeviceToHost));

  // snapshot of the LU factors right after getrf, for comparison later
  printf("A_LU after getrf =\n");
  printFloatMatrix(h_AarrayOutput, N, N);

  checkCudaErrors(cudaMemcpy(h_infoArray, d_infoArray, BATCH_SIZE * sizeof(int),
                             cudaMemcpyDeviceToHost));
#ifdef PIVOT
  checkCudaErrors(cudaMemcpy(h_pivotArray, d_pivotArray,
                             N * BATCH_SIZE * sizeof(int),
                             cudaMemcpyDeviceToHost));
#endif /* PIVOT */
  // verify the result
  //LL: P*A = L*U
  printf("> verifying the result..\n");
  for (int i = 0; i < BATCH_SIZE; i++) {
    if (h_infoArray[i] == 0) {
      DATA_TYPE* A = h_AarrayInput + (i * N * N);
      DATA_TYPE* LU = h_AarrayOutput + (i * N * N);
      DATA_TYPE L[N * N];
      DATA_TYPE U[N * N];
      getLUdecoded(LU, L, U);

      // test P * A = L * U
      int* P = h_pivotArray + (i * N);
      DATA_TYPE Pmat[N * N];
#ifdef PIVOT
      getPmatFromPivot(Pmat, P);
#else
      initIdentityMatrix(Pmat);
#endif /* PIVOT */

      // perform matrix multiplication
      DATA_TYPE PxA[N * N];
      DATA_TYPE LxU[N * N];
      matrixMultiply(PxA, Pmat, A);
      matrixMultiply(LxU, L, U);

      // check for equality of matrices
      if (!checkRelativeError(PxA, LxU, (DATA_TYPE)MAX_ERROR)) {
        printf("> ERROR: accuracy check failed for matrix number %05d..\n",
               i + 1);
        err_count++;
      }
    } else if (h_infoArray[i] > 0) {
      printf("> execution for matrix %05d is successful, but U is singular and U(%d,%d) = 0..\n",
             i + 1, h_infoArray[i] - 1, h_infoArray[i] - 1);
    } else {  // (h_infoArray[i] < 0)
      printf("> ERROR: matrix %05d has an illegal value at index %d = %lf..\n",
             i + 1, -h_infoArray[i],
             *(h_AarrayInput + (i * N * N) + (-h_infoArray[i])));
    }
  }
  // ------------------------------------------------------------------
  // LL: now invert the LU-factorized matrices with getriBatched, writing
  // the inverse into a separate output array C, then re-read A to see
  // whether getri touched its input.
  //status = cublasXgetrfBatched(handle, N, d_ptr_array, N, d_pivotArray, d_infoArray, BATCH_SIZE);
  //cublasDgetrfBatched(handle, n, Aarray, lda, PivotArray, infoArray, batchSize);
  DATA_TYPE* d_Carray;
  DATA_TYPE** d_ptr_C_array;
  checkCudaErrors(cudaMalloc((void**)&d_Carray, BATCH_SIZE * matSize));
  checkCudaErrors(cudaMalloc((void**)&d_ptr_C_array, BATCH_SIZE * sizeof(DATA_TYPE*)));
  for (int i = 0; i < BATCH_SIZE; i++)
    h_ptr_array[i] = d_Carray + (i * N * N);

  // copy pointer array to device memory
  checkCudaErrors(cudaMemcpy(d_ptr_C_array, h_ptr_array,
                             BATCH_SIZE * sizeof(DATA_TYPE*),
                             cudaMemcpyHostToDevice));

  cublasSgetriBatched(handle, N, d_ptr_array, N, d_pivotArray, d_ptr_C_array,
                      N, d_infoArray, BATCH_SIZE);

  DATA_TYPE* h_getri_Aarray;
  h_getri_Aarray = (DATA_TYPE*)malloc(BATCH_SIZE * matSize);
  printf("LL: h_getri_Aarray = %p\n", (void*)h_getri_Aarray);
  checkCudaErrors(cudaMemcpy(h_getri_Aarray, d_Aarray, BATCH_SIZE * matSize,
                             cudaMemcpyDeviceToHost));

  // print A again; if getri leaves its input alone, this output matches
  // the "A_LU after getrf" printout above
  printf("A_LU after getri =\n");
  printFloatMatrix(h_getri_Aarray, N, N);
  // ------------------------------------------------------------------
  // free device variables
  checkCudaErrors(cudaFree(d_ptr_C_array));
  checkCudaErrors(cudaFree(d_Carray));
  checkCudaErrors(cudaFree(d_ptr_array));
  checkCudaErrors(cudaFree(d_infoArray));
  checkCudaErrors(cudaFree(d_pivotArray));
  checkCudaErrors(cudaFree(d_Aarray));

  // free host variables
  if (h_getri_Aarray) free(h_getri_Aarray);
  if (h_infoArray) free(h_infoArray);
  if (h_pivotArray) free(h_pivotArray);
  if (h_AarrayOutput) free(h_AarrayOutput);
  if (h_AarrayInput) free(h_AarrayInput);

  // destroy cuBLAS handle
  status = cublasDestroy(handle);
  if (status != CUBLAS_STATUS_SUCCESS) {
    printf("> ERROR: cuBLAS uninitialization failed..\n");
    return (EXIT_FAILURE);
  }

  if (err_count > 0) {
    printf("> TEST FAILED for %d matrices, with precision: %g\n", err_count,
           MAX_ERROR);
    return (EXIT_FAILURE);
  }

  printf("> TEST SUCCESSFUL, with precision: %g\n", MAX_ERROR);
  return (EXIT_SUCCESS);
}
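As written, the check is done by eye: the 170 x 170 matrix printed after getrf has to be compared visually with the one printed after getri. A programmatic comparison is less error-prone; here is a minimal sketch (a hypothetical addition, not part of the original program) that could be placed right after the "A_LU after getri" printout, reusing buffers the program already holds:

/* Hypothetical addition: compare the LU factors captured after getrf
   (h_AarrayOutput) with the contents of d_Aarray re-read after getri
   (h_getri_Aarray). A byte-for-byte match confirms that getriBatched
   left its input array untouched. */
if (memcmp(h_AarrayOutput, h_getri_Aarray, BATCH_SIZE * matSize) == 0) {
  printf("> getri did NOT modify the input LU factors..\n");
} else {
  printf("> getri MODIFIED the input LU factors!\n");
}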
