An example of computing matrix inverses with cublasSgetrfBatched

This post walks through a small CUDA program that inverts a square matrix on the GPU with cuBLAS. The inversion takes two batched calls: cublasSgetrfBatched performs an LU factorization with partial pivoting, and cublasSgetriBatched then computes the inverse from the LU factors. A naive matrix-multiply kernel afterwards checks that M x Inv(M) is the identity. Two variants are given: mat_inv.cu fills the matrix with random values, and mat_inv_userInput.cu reads it from stdin.
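Before the full listings, here is a minimal sketch of the core call sequence for a single n x n matrix (invert_one and its buffer names are illustrative, not from the original code; error checking and the host/device copies of the matrix data are omitted, and the full program below adds them). Note that row- versus column-major storage does not matter for the inversion itself, since (A^T)^-1 = (A^-1)^T:

#include <cublas_v2.h>

// d_A holds the n x n input on the device; d_Ainv receives the inverse.
void invert_one(float *d_A, float *d_Ainv, int n)
{
    cublasHandle_t h;
    cublasCreate_v2(&h);

    int *d_piv, *d_info;                       // pivot indices, status code
    cudaMalloc(&d_piv, n * sizeof(int));
    cudaMalloc(&d_info, sizeof(int));

    // The batched APIs take a device array of device pointers,
    // even for a batch of one.
    float **d_Aarr, **d_Carr;
    cudaMalloc(&d_Aarr, sizeof(float *));
    cudaMalloc(&d_Carr, sizeof(float *));
    cudaMemcpy(d_Aarr, &d_A, sizeof(float *), cudaMemcpyHostToDevice);
    cudaMemcpy(d_Carr, &d_Ainv, sizeof(float *), cudaMemcpyHostToDevice);

    // Step 1: in-place batched LU factorization with partial pivoting.
    cublasSgetrfBatched(h, n, d_Aarr, n, d_piv, d_info, 1);
    // Step 2: compute the inverse from the LU factors.
    cublasSgetriBatched(h, n, (const float **)d_Aarr, n, d_piv,
                        d_Carr, n, d_info, 1);

    cudaFree(d_piv); cudaFree(d_info);
    cudaFree(d_Aarr); cudaFree(d_Carr);
    cublasDestroy_v2(h);
}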

//mat_inv.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cublas_v2.h>

// Abort with file/line context if a CUDA runtime call fails.
#define cudacall(call)                                                      \
do                                                                          \
{                                                                           \
    cudaError_t err = (call);                                               \
    if (cudaSuccess != err)                                                 \
    {                                                                       \
        fprintf(stderr, "CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", \
                __FILE__, __LINE__, cudaGetErrorString(err));               \
        cudaDeviceReset();                                                  \
        exit(EXIT_FAILURE);                                                 \
    }                                                                       \
} while (0)

// Abort with file/line context if a cuBLAS call fails.
#define cublascall(call)                                                    \
do                                                                          \
{                                                                           \
    cublasStatus_t status = (call);                                         \
    if (CUBLAS_STATUS_SUCCESS != status)                                    \
    {                                                                       \
        fprintf(stderr, "CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", \
                __FILE__, __LINE__, status);                                \
        cudaDeviceReset();                                                  \
        exit(EXIT_FAILURE);                                                 \
    }                                                                       \
} while (0)
//
//
// Invert batchSize n-by-n matrices: src[i] -> dst[i] (host pointers).
// Uses batched LU factorization (cublasSgetrfBatched) followed by
// batched inversion of the factors (cublasSgetriBatched).
void invert(float **src, float **dst, int n, int batchSize)
{
    cublasHandle_t handle;
    cublascall(cublasCreate_v2(&handle));

    // Pivot indices (n per matrix) and per-matrix status codes, on the device.
    int *P, *INFO;
    cudacall(cudaMalloc(&P, n * batchSize * sizeof(int)));
    cudacall(cudaMalloc(&INFO, batchSize * sizeof(int)));

    int lda = n;

    // The batched APIs take a device array of device pointers. A_dflat is one
    // contiguous allocation; A_d holds pointers to each n*n slice of it.
    float **A = (float **)malloc(batchSize * sizeof(float *));
    float **A_d, *A_dflat;
    cudacall(cudaMalloc(&A_d, batchSize * sizeof(float *)));
    cudacall(cudaMalloc(&A_dflat, n * n * batchSize * sizeof(float)));
    A[0] = A_dflat;
    for (int i = 1; i < batchSize; i++)
        A[i] = A[i - 1] + (n * n);
    cudacall(cudaMemcpy(A_d, A, batchSize * sizeof(float *), cudaMemcpyHostToDevice));
    for (int i = 0; i < batchSize; i++)
        cudacall(cudaMemcpy(A_dflat + (i * n * n), src[i], n * n * sizeof(float), cudaMemcpyHostToDevice));

    // In-place batched LU factorization with partial pivoting.
    cublascall(cublasSgetrfBatched(handle, n, A_d, lda, P, INFO, batchSize));

    int *INFOh = (int *)malloc(batchSize * sizeof(int));
    cudacall(cudaMemcpy(INFOh, INFO, batchSize * sizeof(int), cudaMemcpyDeviceToHost));
    for (int i = 0; i < batchSize; i++)
        if (INFOh[i] != 0)
        {
            fprintf(stderr, "Factorization of matrix %d failed: matrix may be singular\n", i);
            cudaDeviceReset();
            exit(EXIT_FAILURE);
        }

    // Output pointer array, set up the same way as A_d/A_dflat.
    float **C = (float **)malloc(batchSize * sizeof(float *));
    float **C_d, *C_dflat;
    cudacall(cudaMalloc(&C_d, batchSize * sizeof(float *)));
    cudacall(cudaMalloc(&C_dflat, n * n * batchSize * sizeof(float)));
    C[0] = C_dflat;
    for (int i = 1; i < batchSize; i++)
        C[i] = C[i - 1] + (n * n);
    cudacall(cudaMemcpy(C_d, C, batchSize * sizeof(float *), cudaMemcpyHostToDevice));

    // Compute the inverses from the LU factors.
    cublascall(cublasSgetriBatched(handle, n, (const float **)A_d, lda, P, C_d, lda, INFO, batchSize));

    cudacall(cudaMemcpy(INFOh, INFO, batchSize * sizeof(int), cudaMemcpyDeviceToHost));
    for (int i = 0; i < batchSize; i++)
        if (INFOh[i] != 0)
        {
            fprintf(stderr, "Inversion of matrix %d failed: matrix may be singular\n", i);
            cudaDeviceReset();
            exit(EXIT_FAILURE);
        }

    for (int i = 0; i < batchSize; i++)
        cudacall(cudaMemcpy(dst[i], C_dflat + (i * n * n), n * n * sizeof(float), cudaMemcpyDeviceToHost));

    free(INFOh);
    cudaFree(A_d); cudaFree(A_dflat); free(A);
    cudaFree(C_d); cudaFree(C_dflat); free(C);
    cudaFree(P); cudaFree(INFO); cublasDestroy_v2(handle);
}
//
//
// Naive dense matrix-multiply kernel, used only to check M * Inv(M) = I.
__global__ void MatrixMulKernel(float *Md, float *Nd, float *Pd, int Width)
{
    // 2D thread ID
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Pvalue accumulates the Pd element computed by this thread.
    float Pvalue = 0;
    if (col < Width && row < Width)
    {
        for (int k = 0; k < Width; ++k)
        {
            float Mdelement = Md[row * Width + k];
            float Ndelement = Nd[k * Width + col];
            Pvalue += Mdelement * Ndelement;
        }
        Pd[row * Width + col] = Pvalue;
    }
}
void mul(float *M, float *N, int Width)
{
    float *P = (float *)malloc(Width * Width * sizeof(float));
    float *Md, *Nd, *Pd;
    unsigned long int size = Width * Width * sizeof(float);

    // Transfer M and N to device memory.
    cudacall(cudaMalloc((void **)&Md, size));
    cudacall(cudaMemcpy(Md, M, size, cudaMemcpyHostToDevice));
    cudacall(cudaMalloc((void **)&Nd, size));
    cudacall(cudaMemcpy(Nd, N, size, cudaMemcpyHostToDevice));

    // Allocate P on the device.
    cudacall(cudaMalloc((void **)&Pd, size));

    // Set up the execution configuration: a single block if the matrix fits
    // within the 1024-thread block limit, otherwise a grid of 32x32 blocks.
    dim3 dimBlock(Width, Width);
    dim3 dimGrid(1, 1);
    if (Width * Width > 1024)
    {
        dimGrid.x = (Width - 1) / 32 + 1;
        dimGrid.y = (Width - 1) / 32 + 1;
        dimBlock.x = 32;
        dimBlock.y = 32;
    }

    // Launch the device computation threads!
    MatrixMulKernel<<<dimGrid, dimBlock>>>(Md, Nd, Pd, Width);

    // Transfer P from device to host.
    cudacall(cudaMemcpy(P, Pd, size, cudaMemcpyDeviceToHost));

    // Free device matrices.
    cudaFree(Md);
    cudaFree(Nd);
    cudaFree(Pd);

    int i;
    fprintf(stdout, "\n\n");
    if (Width < 11)
    {
        fprintf(stdout, "\n\nMatrix Multiplication, M x Inv(M) :\n\n");
        for (i = 0; i < Width * Width; i++)
        {
            if (P[i])
                fprintf(stdout, "%10f ", P[i]);
            else
                fprintf(stdout, "%9f ", P[i]);
            if ((i + 1) % Width == 0)
                fprintf(stdout, "\n");
        }
    }
    else
    {
        FILE *fp = fopen("Mat_Inv_out", "a");
        if (!fp)
        {
            fprintf(stderr, "Failed to open Mat_Inv_out.\n");
            exit(1);
        }
        fprintf(fp, "\n\nMatrix Multiplication, M x Inv(M) :\n\n");
        for (i = 0; i < Width * Width; i++)
        {
            if (P[i])
                fprintf(fp, "%10f ", P[i]);
            else
                fprintf(fp, "%9f ", P[i]);
            if ((i + 1) % Width == 0)
                fprintf(fp, "\n");
        }
        fclose(fp);
    }
    free(M);
    free(N);
    free(P);
}
//
//
// Fill a w-by-w matrix with random values read from /dev/urandom.
void fill(float *h, int w)
{
    unsigned int i, num;
    int divide;
    FILE *f = fopen("/dev/urandom", "r");
    if (!f) {
        fprintf(stderr, "Failed to open /dev/urandom\n");
        exit(1);
    }
    for (i = 0; i < (unsigned int)(w * w); i++)
    {
        fread(&num, sizeof(unsigned int), 1, f);
        fread(&divide, sizeof(int), 1, f);
        if (divide == 0)   // avoid a (very unlikely) division by zero
            divide = 1;
        h[i] = ((float)num) / ((float)divide);
    }
    fclose(f);
    /* Alternative: pseudo-random values via rand()
    srand((unsigned int)time(NULL));
    for (i = 0; i < w * w; i++)
        h[i] = ((float)rand() / (float)(RAND_MAX)) * 99;
    */
}
//
//
// Build a batch of one n-by-n matrix, invert it, print (or write to a file)
// the input and its inverse, then verify the result with a matrix multiply.
void test_invert(int n)
{
    const int mybatch = 1;

    size_t mat1_size = sizeof(float) * n * n;
    float *mat1 = (float *)malloc(mat1_size);
    fill(mat1, n);

    float *result_flat = (float *)malloc(mybatch * n * n * sizeof(float));
    float **results = (float **)malloc(mybatch * sizeof(float *));
    for (int i = 0; i < mybatch; i++)
        results[i] = result_flat + (i * n * n);

    float **inputs = (float **)malloc(mybatch * sizeof(float *));
    inputs[0] = mat1;

    invert(inputs, results, n, mybatch);

    if (n < 11)
    {
        for (int qq = 0; qq < mybatch; qq++)
        {
            if (mybatch == 1)
                fprintf(stdout, "Input Matrix, M :\n\n");
            else
                fprintf(stdout, "Input Matrix %d:\n\n", qq);
            for (int i = 0; i < n; i++)
            {
                for (int j = 0; j < n; j++)
                {
                    if (inputs[qq][i * n + j])
                        fprintf(stdout, "%12f ", inputs[qq][i * n + j]);
                    else
                        fprintf(stdout, "%11f ", inputs[qq][i * n + j]);
                }
                fprintf(stdout, "\n");
            }
        }
        fprintf(stdout, "\n\n");
        for (int qq = 0; qq < mybatch; qq++)
        {
            if (mybatch == 1)
                fprintf(stdout, "Inverse of the Input Matrix, Inv(M):\n\n");
            else
                fprintf(stdout, "Inverse Matrix %d:\n\n", qq);
            for (int i = 0; i < n; i++)
            {
                for (int j = 0; j < n; j++)
                {
                    if (results[qq][i * n + j])
                        fprintf(stdout, "%10f ", results[qq][i * n + j]);
                    else
                        fprintf(stdout, "%9f ", results[qq][i * n + j]);
                }
                fprintf(stdout, "\n");
            }
        }
    }
    else // order greater than 10 x 10: write the results to a file instead
    {
        printf("\nThe matrix is too large to display in the terminal.\n"
               "Please open the file Mat_Inv_out in the current folder to see the output.\n\n");
        FILE *fp = fopen("Mat_Inv_out", "w");
        if (!fp)
        {
            fprintf(stderr, "Failed to open Mat_Inv_out.\n");
            exit(1);
        }
        for (int qq = 0; qq < mybatch; qq++)
        {
            if (mybatch == 1)
                fprintf(fp, "Input Matrix, M:\n\n");
            else
                fprintf(fp, "Input Matrix %d:\n\n", qq);
            for (int i = 0; i < n; i++)
            {
                for (int j = 0; j < n; j++)
                {
                    if (inputs[qq][i * n + j])
                        fprintf(fp, "%12f ", inputs[qq][i * n + j]);
                    else
                        fprintf(fp, "%11f ", inputs[qq][i * n + j]);
                }
                fprintf(fp, "\n");
            }
        }
        fprintf(fp, "\n\n");
        for (int qq = 0; qq < mybatch; qq++)
        {
            if (mybatch == 1)
                fprintf(fp, "Inverse of the Input Matrix, Inv(M):\n\n");
            else
                fprintf(fp, "Inverse %d:\n\n", qq);
            for (int i = 0; i < n; i++)
            {
                for (int j = 0; j < n; j++)
                {
                    if (results[qq][i * n + j])
                        fprintf(fp, "%10f ", results[qq][i * n + j]);
                    else
                        fprintf(fp, "%9f ", results[qq][i * n + j]);
                }
                fprintf(fp, "\n");
            }
        }
        fclose(fp);
    } // end of if/else for output

    // Check the result: M x Inv(M) should be the identity.
    mul(inputs[0], results[0], n);
}
//
//
int main(int argc, char *argv[])
{
    if (argc != 2)
    {
        printf("Usage: %s <matrix_width>\n", argv[0]);
        return 0;
    }
    int w = atoi(argv[1]);
    test_invert(w);
    return 0;
}
/*
Sample runs (these transcripts are from t540.cu, a variant of this code that
inverts a batch of four matrices):

$ nvcc -arch=sm_20 -o t540 t540.cu -lcublas
$ ./t540
Input 0:
0.000000 3.000000 4.000000
1.000000 3.000000 10.000000
4.000000 9.000000 16.000000
Input 1:
0.500000 3.000000 4.000000
1.000000 3.000000 10.000000
4.000000 9.000000 16.000000
Input 2:
0.000000 3.000000 4.000000
1.000000 5.000000 6.000000
9.000000 8.000000 2.000000
Input 3:
22.000000 3.000000 4.000000
1.000000 5.000000 6.000000
9.000000 8.000000 2.000000
Inverse 0:
-0.700000 -0.200000 0.300000
0.400000 -0.266667 0.066667
-0.050000 0.200000 -0.050000
Inverse 1:
-1.076923 -0.307692 0.461538
0.615385 -0.205128 -0.025641
-0.076923 0.192308 -0.038462
Inverse 2:
-4.750000 3.250000 -0.250000
6.500000 -4.500000 0.500000
-4.625000 3.375000 -0.375000
Inverse 3:
0.045894 -0.031401 0.002415
-0.062802 -0.009662 0.154589
0.044686 0.179952 -0.129227
$
$ nvcc -arch=sm_20 -o t540 t540.cu -lcublas
$ ./t540
Enter the order of the square matrix :4
Input 0:
-0.100222 -2.553872 -69.072723 0.016120
-2.752346 -1.230871 1.997387 0.606710
-0.029929 -0.583444 2.733107 0.254404
-1.844285 -0.070541 1.906255 10.758991
Inverse 0:
0.017501 -0.374555 0.713068 0.004234
-0.056876 -0.005437 -1.457745 0.034861
-0.012399 0.000729 0.052888 -0.001273
0.004824 -0.064370 0.103305 0.094125
*/
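The verification kernel above is a deliberately simple hand-rolled GEMM. As a possible alternative (not part of the original post), the multiply could be delegated to cuBLAS itself; a minimal sketch, reusing the cublascall macro defined above (mul_cublas is an illustrative name):

// cuBLAS is column-major, so a row-major buffer is seen as its transpose.
// Swapping the operands computes N^T * M^T = (M * N)^T in column-major
// order, which is exactly M * N when Pd is read back row-major.
void mul_cublas(const float *Md, const float *Nd, float *Pd, int n)
{
    cublasHandle_t h;
    cublascall(cublasCreate_v2(&h));
    const float alpha = 1.0f, beta = 0.0f;
    cublascall(cublasSgemm(h, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n,
                           &alpha, Nd, n, Md, n, &beta, Pd, n));
    cublascall(cublasDestroy_v2(h));
}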

//mat_inv_userInput.cu
//
// Identical to mat_inv.cu above except for input handling: the matrix order
// and entries are read from stdin instead of being generated randomly.
// The includes, the cudacall/cublascall macros, invert(), MatrixMulKernel(),
// and mul() are the same as in mat_inv.cu and are not repeated here.
//
//
// Read the w*w matrix entries from stdin, row by row.
void fill(float *h, int w)
{
    for (int i = 0; i < w * w; i++)
        scanf("%f", &h[i]);
}
//
//
// Same as test_invert(int n) in mat_inv.cu, except that the order n is
// prompted for and read from stdin instead of being passed as a parameter;
// the rest of the body (allocation, invert(), printing, and the final
// mul() check) is unchanged.
void test_invert()
{
    int n;
    printf("Enter the order of the square matrix :");
    scanf("%d", &n);
    const int mybatch = 1;
    /* ... remainder identical to test_invert(int n) in mat_inv.cu ... */
}
//
//
int main()
{
    test_invert();
    return 0;
}
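Since this variant reads everything from stdin (first the order n, then the n*n entries row by row), it can also be driven non-interactively. For example, to invert a 2 x 2 matrix with the user_mat_inv binary built by the Makefile below (an illustrative invocation, assuming the build succeeds):

$ printf '2\n1 2\n3 4\n' | ./user_mat_inv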

# Makefile
# (recipe lines must be indented with a hard tab)
default: all

all:
	nvcc -arch=sm_70 -o mat_inv mat_inv.cu -lcublas -lm

user:
	nvcc -arch=sm_70 -o user_mat_inv mat_inv_userInput.cu -lcublas -lm

clean:
	rm -f mat_inv user_mat_inv

.PHONY: default all user clean

$ make
$ ./mat_inv 3

stdout:

Input Matrix, M :

-3.662523 -1.507356 1.094427
-0.853989 1.768155 45.548477
-5.994538 6.792342 -323.404816

Inverse of the Input Matrix, Inv(M):

-0.217016 -0.118222 -0.017385
-0.135258 0.293317 0.040853
0.001182 0.008352 -0.001912

Matrix Multiplication, M x Inv(M) :

1.000000 0.000000 -0.000000
0.000000 1.000000 -0.000000
0.000000 -0.000000 1.000000
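The product prints as an exact identity here, but %f only shows six decimal places, so single-precision round-off may simply be hidden by the formatting. A small host-side check that could be added at the end of mul(), just before free(P), to surface the actual error (a hypothetical addition, not in the original code):

    // Hypothetical addition to mul(): report the largest deviation of
    // P = M * Inv(M) from the identity matrix (fabsf comes from <math.h>,
    // which the file already includes).
    float max_dev = 0.0f;
    for (int r = 0; r < Width; r++)
        for (int c = 0; c < Width; c++) {
            float expect = (r == c) ? 1.0f : 0.0f;
            float dev = fabsf(P[r * Width + c] - expect);
            if (dev > max_dev) max_dev = dev;
        }
    printf("max |M*Inv(M) - I| = %g\n", max_dev);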
