赞
踩
- //mat_inv.cu
- #include <stdio.h>
- #include <stdlib.h>
- #include <cublas_v2.h>
- #include<math.h>
-
-
/*
 * cudacall(call): evaluate a CUDA runtime call exactly once. If it does not
 * return cudaSuccess, print the file, line and cudaGetErrorString() text to
 * stderr, reset the device and terminate with EXIT_FAILURE.
 */
#define cudacall(call)                                                         \
    do {                                                                       \
        const cudaError_t cuda_rc_ = (call);                                   \
        if (cuda_rc_ != cudaSuccess) {                                         \
            fprintf(stderr,                                                    \
                    "CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n",        \
                    __FILE__, __LINE__, cudaGetErrorString(cuda_rc_));         \
            cudaDeviceReset();                                                 \
            exit(EXIT_FAILURE);                                                \
        }                                                                      \
    } while (0)
-
/*
 * cublascall(call): evaluate a cuBLAS call exactly once. If the status is not
 * CUBLAS_STATUS_SUCCESS, print the file, line and numeric status code to
 * stderr, reset the device and terminate with EXIT_FAILURE.
 */
#define cublascall(call)                                                       \
    do {                                                                       \
        const cublasStatus_t cublas_rc_ = (call);                              \
        if (cublas_rc_ != CUBLAS_STATUS_SUCCESS) {                             \
            fprintf(stderr,                                                    \
                    "CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n",        \
                    __FILE__, __LINE__, cublas_rc_);                           \
            cudaDeviceReset();                                                 \
            exit(EXIT_FAILURE);                                                \
        }                                                                      \
    } while (0)
-
-
- //
- //
-
/*
 * Invert a batch of n x n single-precision matrices on the GPU using cuBLAS
 * (cublasSgetrfBatched for the LU factorisation, cublasSgetriBatched for the
 * inversion).
 *
 *   src        host array of batchSize pointers, each to n*n input floats
 *   dst        host array of batchSize pointers, each to n*n output floats
 *   n          order of every matrix (must be > 0)
 *   batchSize  number of matrices   (must be > 0)
 *
 * cuBLAS interprets each matrix as column-major, so a row-major input is seen
 * as its transpose; since inv(A^T) == inv(A)^T the data copied back into dst
 * is the row-major inverse of the row-major input.
 *
 * Any failure (invalid sizes, host allocation, CUDA/cuBLAS error, or a
 * non-zero INFO entry from either cuBLAS step) prints a message to stderr and
 * terminates the process with EXIT_FAILURE.
 */
void invert(float** src, float** dst, int n, int batchSize)
{
    if (n <= 0 || batchSize <= 0)
    {
        fprintf(stderr, "invert: n and batchSize must be positive (n = %d, batchSize = %d)\n",
                n, batchSize);
        exit(EXIT_FAILURE);
    }

    /* All sizes are computed in size_t so n*n*batchSize cannot overflow int. */
    const size_t matElems  = (size_t)n * (size_t)n;
    const size_t matBytes  = matElems * sizeof(float);
    const size_t ptrBytes  = (size_t)batchSize * sizeof(float *);
    const size_t infoBytes = (size_t)batchSize * sizeof(int);
    const int lda = n;

    /* Host-side staging: device pointer tables and the INFO read-back buffer
       (heap allocated instead of a variable-length array). */
    float **A     = (float **)malloc(ptrBytes);
    float **C     = (float **)malloc(ptrBytes);
    int   *INFOh  = (int *)malloc(infoBytes);
    if (!A || !C || !INFOh)
    {
        fprintf(stderr, "invert: host allocation failed\n");
        exit(EXIT_FAILURE);
    }

    cublasHandle_t handle;
    cublascall(cublasCreate_v2(&handle));

    /* P holds n pivot indices per matrix, INFO one status word per matrix. */
    int *P, *INFO;
    cudacall(cudaMalloc(&P, (size_t)n * (size_t)batchSize * sizeof(int)));
    cudacall(cudaMalloc(&INFO, infoBytes));

    /* Input matrices: one flat device buffer plus a device table of pointers
       into it, as required by the batched cuBLAS interface. */
    float **A_d, *A_dflat;
    cudacall(cudaMalloc(&A_d, ptrBytes));
    cudacall(cudaMalloc(&A_dflat, matBytes * (size_t)batchSize));

    for (int i = 0; i < batchSize; i++)
        A[i] = A_dflat + (size_t)i * matElems;
    cudacall(cudaMemcpy(A_d, A, ptrBytes, cudaMemcpyHostToDevice));

    for (int i = 0; i < batchSize; i++)
        cudacall(cudaMemcpy(A[i], src[i], matBytes, cudaMemcpyHostToDevice));

    /* Step 1: in-place LU factorisation of every matrix. */
    cublascall(cublasSgetrfBatched(handle, n, A_d, lda, P, INFO, batchSize));

    cudacall(cudaMemcpy(INFOh, INFO, infoBytes, cudaMemcpyDeviceToHost));
    for (int i = 0; i < batchSize; i++)
        if (INFOh[i] != 0)
        {
            fprintf(stderr, "Factorization of matrix %d Failed: Matrix may be singular\n", i);
            cudaDeviceReset();
            exit(EXIT_FAILURE);
        }

    /* Output matrices: same flat-buffer + pointer-table layout. */
    float **C_d, *C_dflat;
    cudacall(cudaMalloc(&C_d, ptrBytes));
    cudacall(cudaMalloc(&C_dflat, matBytes * (size_t)batchSize));

    for (int i = 0; i < batchSize; i++)
        C[i] = C_dflat + (size_t)i * matElems;
    cudacall(cudaMemcpy(C_d, C, ptrBytes, cudaMemcpyHostToDevice));

    /* Step 2: out-of-place inversion from the LU factors. */
    cublascall(cublasSgetriBatched(handle, n, (const float **)A_d, lda, P, C_d, lda, INFO, batchSize));

    cudacall(cudaMemcpy(INFOh, INFO, infoBytes, cudaMemcpyDeviceToHost));
    for (int i = 0; i < batchSize; i++)
        if (INFOh[i] != 0)
        {
            fprintf(stderr, "Inversion of matrix %d Failed: Matrix may be singular\n", i);
            cudaDeviceReset();
            exit(EXIT_FAILURE);
        }

    for (int i = 0; i < batchSize; i++)
        cudacall(cudaMemcpy(dst[i], C[i], matBytes, cudaMemcpyDeviceToHost));

    /* Release device and host resources. */
    cudacall(cudaFree(A_d));
    cudacall(cudaFree(A_dflat));
    cudacall(cudaFree(C_d));
    cudacall(cudaFree(C_dflat));
    cudacall(cudaFree(P));
    cudacall(cudaFree(INFO));
    cublascall(cublasDestroy_v2(handle));

    free(A);
    free(C);
    free(INFOh);
}
-
-
-
- //
- //
-
-
-
/*
 * One thread per output element of Pd = Md * Nd, where all three matrices are
 * Width x Width and indexed as [row*Width + col].
 * The x launch dimension selects the column, the y dimension the row.
 * Threads that fall outside the matrix return without touching memory.
 */
__global__ void MatrixMulKernel(float *Md, float *Nd, float *Pd, int Width)
{
    const int c = blockIdx.x * blockDim.x + threadIdx.x;
    const int r = blockIdx.y * blockDim.y + threadIdx.y;

    /* Guard: the grid may be larger than the matrix. */
    if (c >= Width || r >= Width)
        return;

    /* Dot product of row r of Md with column c of Nd. */
    float acc = 0;
    int k = 0;
    while (k < Width)
    {
        const float lhs = Md[r * Width + k];
        const float rhs = Nd[k * Width + c];
        acc += (lhs * rhs);
        ++k;
    }

    Pd[r * Width + c] = acc;
}
-
-
-
/*
 * Write the heading and the Width x Width row-major matrix P to `out`, one
 * matrix row per line. Exact zeros are printed in a field one character
 * narrower than non-zero values.
 */
static void print_product(FILE *out, const float *P, int Width)
{
    fprintf(out, "\n\nMatrix Multiplication, M x Inv(M) :\n\n");
    for (int r = 0; r < Width; r++)
    {
        for (int c = 0; c < Width; c++)
        {
            const float v = P[(size_t)r * Width + c];
            if (v)
                fprintf(out, "%10f ", v);
            else
                fprintf(out, "%9f ", v);
        }
        fprintf(out, "\n");
    }
}

/*
 * Multiply two Width x Width row-major host matrices on the GPU and report
 * the product: to stdout when Width < 11, otherwise appended to the file
 * "Mat_Inv_out" in the current directory.
 *
 *   M, N   host matrices of Width*Width floats
 *   Width  matrix order (must be > 0)
 *
 * Ownership: M and N are released with free() before returning, so they must
 * come from malloc() and must not be used by the caller afterwards.
 *
 * Any CUDA error, allocation failure or failure to open the output file
 * terminates the process.
 */
void mul(float* M,float* N,int Width)
{
    if (Width <= 0)
    {
        fprintf(stderr, "mul: Width must be positive (got %d)\n", Width);
        exit(EXIT_FAILURE);
    }

    /* size_t arithmetic: Width*Width*sizeof(float) must not overflow int. */
    const size_t elems = (size_t)Width * (size_t)Width;
    const size_t size  = elems * sizeof(float);

    float *P = (float *)malloc(size);
    if (!P)
    {
        fprintf(stderr, "mul: host allocation failed\n");
        exit(EXIT_FAILURE);
    }

    float *Md, *Nd, *Pd;

    /* Transfer M and N to device memory, allocate P on the device. */
    cudacall(cudaMalloc((void**)&Md, size));
    cudacall(cudaMemcpy(Md, M, size, cudaMemcpyHostToDevice));

    cudacall(cudaMalloc((void**)&Nd, size));
    cudacall(cudaMemcpy(Nd, N, size, cudaMemcpyHostToDevice));

    cudacall(cudaMalloc((void**)&Pd, size));

    /* Launch configuration: a single Width x Width block while the matrix has
       at most 1024 elements, otherwise a grid of 32 x 32 blocks covering it
       (ceil(Width / 32) blocks per dimension). */
    dim3 dimBlock(Width, Width);
    dim3 dimGrid(1, 1);
    if (elems > 1024)
    {
        dimGrid.x = (Width - 1) / 32 + 1;
        dimGrid.y = (Width - 1) / 32 + 1;
        dimBlock.x = 32;
        dimBlock.y = 32;
    }

    MatrixMulKernel<<<dimGrid,dimBlock>>>(Md, Nd, Pd, Width);
    /* Launch-configuration errors are only visible through cudaGetLastError. */
    cudacall(cudaGetLastError());

    /* Blocking copy: also surfaces any error raised while the kernel ran. */
    cudacall(cudaMemcpy(P, Pd, size, cudaMemcpyDeviceToHost));

    cudacall(cudaFree(Md));
    cudacall(cudaFree(Nd));
    cudacall(cudaFree(Pd));

    fprintf(stdout, "\n\n");

    if (Width < 11)
    {
        print_product(stdout, P, Width);
    }
    else
    {
        /* Append: the caller has already written the matrices to this file. */
        FILE *fp = fopen("Mat_Inv_out", "a");
        if (!fp)
        {
            fprintf(stderr, "Failed to open Mat_Inv_out.\n");
            exit(1);
        }
        print_product(fp, P, Width);
        fclose(fp);
    }

    free(M);
    free(N);
    free(P);
}
-
-
-
- //
- //
-
-
-
/*
 * Fill the w x w matrix h with random values read from /dev/urandom.
 * Each element is (float)num / (float)divide, where num is a random
 * unsigned int and divide a random non-zero int.
 *
 *   h  destination buffer with room for w*w floats
 *   w  matrix order
 *
 * Exits the process with status 1 if /dev/urandom cannot be opened or read.
 */
void fill(float* h, int w)
{
    FILE *f = fopen("/dev/urandom", "r");
    if (!f) {
        fprintf(stderr, "Failed open file\n");
        exit(1);
    }

    const size_t count = (size_t)((long long)w * (long long)w);

    for (size_t i = 0; i < count; i++)
    {
        unsigned int num;
        int divide;

        do
        {
            if (fread(&num, sizeof(unsigned int), 1, f) != 1 ||
                fread(&divide, sizeof(int), 1, f) != 1)
            {
                fprintf(stderr, "Failed to read random data\n");
                fclose(f);
                exit(1);
            }
        } while (divide == 0);  /* redraw: a zero divisor would give inf/NaN */

        h[i] = ((float)num)/((float)divide);
    }

    fclose(f);
}
-
-
- //
- //
-
-
/*
 * Print `batch` n x n row-major matrices to `out`. Each matrix is preceded by
 * `soleTitle` when batch == 1, otherwise by `indexedTitle`, a printf format
 * that receives the matrix index. Non-zero entries are printed in a field of
 * `width` characters, exact zeros in width - 1.
 */
static void print_matrix_batch(FILE *out, float **mats, int n, int batch,
                               const char *soleTitle, const char *indexedTitle,
                               int width)
{
    for (int q = 0; q < batch; q++)
    {
        if (batch == 1)
            fputs(soleTitle, out);
        else
            fprintf(out, indexedTitle, q);

        for (int i = 0; i < n; i++)
        {
            for (int j = 0; j < n; j++)
            {
                const float v = mats[q][(size_t)i * n + j];
                fprintf(out, "%*f ", v ? width : width - 1, v);
            }
            fprintf(out, "\n");
        }
    }
}

/*
 * Build one random n x n matrix, invert it on the GPU, report the input and
 * its inverse, then multiply the two as a check.
 *
 * For n < 11 everything is printed to stdout; for larger n the matrices are
 * written to the file "Mat_Inv_out" (truncated first) and mul() appends the
 * product to it.
 *
 *   n  matrix order (must be > 0)
 *
 * Invalid n, allocation failure or failure to open the output file
 * terminates the process.
 */
void test_invert(int n )
{
    const int mybatch = 1;

    if (n <= 0)
    {
        fprintf(stderr, "Matrix order must be a positive integer (got %d).\n", n);
        exit(EXIT_FAILURE);
    }

    /* Element/byte counts in size_t (a float cannot represent large byte
       counts exactly, which would under-allocate the buffers). */
    const size_t matElems = (size_t)n * (size_t)n;

    float  *mat1        = (float *)malloc(matElems * sizeof(float));
    float  *result_flat = (float *)malloc((size_t)mybatch * matElems * sizeof(float));
    float **results     = (float **)malloc(mybatch * sizeof(float *));
    float **inputs      = (float **)malloc(mybatch * sizeof(float *));
    if (!mat1 || !result_flat || !results || !inputs)
    {
        fprintf(stderr, "Host allocation failed.\n");
        exit(EXIT_FAILURE);
    }

    fill(mat1, n);

    for (int i = 0; i < mybatch; i++)
        results[i] = result_flat + (size_t)i * matElems;

    inputs[0] = mat1;

    invert(inputs, results, n, mybatch);

    if (n < 11)
    {
        print_matrix_batch(stdout, inputs, n, mybatch,
                           "Input Matrix, M :\n\n", "Input Matrix %d:\n\n", 12);
        fprintf(stdout, "\n\n");
        print_matrix_batch(stdout, results, n, mybatch,
                           "Inverse of the Input Matrix, Inv(M):\n\n", "Inverse Matrix %d:\n\n", 10);
    }
    else // order of the matrix is more than 10 x 10 then output the results in the file
    {
        printf("\nThe order of matrix is too large to display in terminal\n, Please open the file : Mat_Inv_out located in the current folder. To see the output.\n\n");

        FILE *fp = fopen("Mat_Inv_out", "w");
        if (!fp)
        {
            fprintf(stderr, "Failed to open Mat_Inv_out.\n");
            exit(1);
        }

        print_matrix_batch(fp, inputs, n, mybatch,
                           "Input Matrix , M:\n\n", "Input Matrix %d:\n\n", 12);
        fprintf(fp, "\n\n");
        print_matrix_batch(fp, results, n, mybatch,
                           "Inverse of the Input Matrix, Inv(M):\n\n", "Inverse %d:\n\n", 10);

        fclose(fp);
    }

    /* mul() frees both matrices it is given (mat1 and result_flat), so only
       the two pointer tables remain to be released here. */
    mul(inputs[0], results[0], n);

    free(inputs);
    free(results);
}
-
-
- //
- //
-
-
/*
 * Entry point: expects exactly one argument, the order of the square matrix.
 * Prints a usage line and returns 0 when the argument count is wrong;
 * returns EXIT_FAILURE when the argument is not a valid positive integer.
 */
int main(int argc, char *argv[])
{
    if (argc != 2)
    {
        printf("Usage: %s <matrix_width>\n", argv[0]);
        return 0;
    }

    /* Largest order whose square still fits in a 32-bit int; the rest of the
       program indexes matrices with int arithmetic on n*n. */
    const long maxOrder = 46340;

    /* strtol instead of atoi so that garbage, zero and negative input are
       rejected rather than silently turned into a bogus order. */
    char *end = NULL;
    const long parsed = strtol(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || parsed <= 0 || parsed > maxOrder)
    {
        fprintf(stderr, "Invalid matrix width '%s': expected an integer in [1, %ld]\n",
                argv[1], maxOrder);
        return EXIT_FAILURE;
    }

    test_invert((int)parsed);
    return 0;
}
-
- /*
- $ nvcc -arch=sm_20 -o t540 t540.cu -lcublas
- $ ./t540
- Input 0:
- 0.000000 3.000000 4.000000
- 1.000000 3.000000 10.000000
- 4.000000 9.000000 16.000000
- Input 1:
- 0.500000 3.000000 4.000000
- 1.000000 3.000000 10.000000
- 4.000000 9.000000 16.000000
- Input 2:
- 0.000000 3.000000 4.000000
- 1.000000 5.000000 6.000000
- 9.000000 8.000000 2.000000
- Input 3:
- 22.000000 3.000000 4.000000
- 1.000000 5.000000 6.000000
- 9.000000 8.000000 2.000000
- Inverse 0:
- -0.700000 -0.200000 0.300000
- 0.400000 -0.266667 0.066667
- -0.050000 0.200000 -0.050000
- Inverse 1:
- -1.076923 -0.307692 0.461538
- 0.615385 -0.205128 -0.025641
- -0.076923 0.192308 -0.038462
- Inverse 2:
- -4.750000 3.250000 -0.250000
- 6.500000 -4.500000 0.500000
- -4.625000 3.375000 -0.375000
- Inverse 3:
- 0.045894 -0.031401 0.002415
- -0.062802 -0.009662 0.154589
- 0.044686 0.179952 -0.129227
- $
- $ nvcc -arch=sm_20 -o t540 t540.cu -lcublas
- $ ./t540
- Enter the order of the square matrix :4
- Input 0:
- -0.100222 -2.553872 -69.072723 0.016120
- -2.752346 -1.230871 1.997387 0.606710
- -0.029929 -0.583444 2.733107 0.254404
- -1.844285 -0.070541 1.906255 10.758991
- Inverse 0:
- 0.017501 -0.374555 0.713068 0.004234
- -0.056876 -0.005437 -1.457745 0.034861
- -0.012399 0.000729 0.052888 -0.001273
- 0.004824 -0.064370 0.103305 0.094125
- */

- //mat_inv_userInput.cu
- #include <stdio.h>
- #include <stdlib.h>
- #include <cublas_v2.h>
- #include<math.h>
-
-
/*
 * cudacall(call): evaluate a CUDA runtime call exactly once. If it does not
 * return cudaSuccess, print the file, line and cudaGetErrorString() text to
 * stderr, reset the device and terminate with EXIT_FAILURE.
 */
#define cudacall(call)                                                         \
    do {                                                                       \
        const cudaError_t cuda_rc_ = (call);                                   \
        if (cuda_rc_ != cudaSuccess) {                                         \
            fprintf(stderr,                                                    \
                    "CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n",        \
                    __FILE__, __LINE__, cudaGetErrorString(cuda_rc_));         \
            cudaDeviceReset();                                                 \
            exit(EXIT_FAILURE);                                                \
        }                                                                      \
    } while (0)
-
/*
 * cublascall(call): evaluate a cuBLAS call exactly once. If the status is not
 * CUBLAS_STATUS_SUCCESS, print the file, line and numeric status code to
 * stderr, reset the device and terminate with EXIT_FAILURE.
 */
#define cublascall(call)                                                       \
    do {                                                                       \
        const cublasStatus_t cublas_rc_ = (call);                              \
        if (cublas_rc_ != CUBLAS_STATUS_SUCCESS) {                             \
            fprintf(stderr,                                                    \
                    "CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n",        \
                    __FILE__, __LINE__, cublas_rc_);                           \
            cudaDeviceReset();                                                 \
            exit(EXIT_FAILURE);                                                \
        }                                                                      \
    } while (0)
-
-
- //
- //
-
/*
 * Invert a batch of n x n single-precision matrices on the GPU using cuBLAS
 * (cublasSgetrfBatched for the LU factorisation, cublasSgetriBatched for the
 * inversion).
 *
 *   src        host array of batchSize pointers, each to n*n input floats
 *   dst        host array of batchSize pointers, each to n*n output floats
 *   n          order of every matrix (must be > 0)
 *   batchSize  number of matrices   (must be > 0)
 *
 * cuBLAS interprets each matrix as column-major, so a row-major input is seen
 * as its transpose; since inv(A^T) == inv(A)^T the data copied back into dst
 * is the row-major inverse of the row-major input.
 *
 * Any failure (invalid sizes, host allocation, CUDA/cuBLAS error, or a
 * non-zero INFO entry from either cuBLAS step) prints a message to stderr and
 * terminates the process with EXIT_FAILURE.
 */
void invert(float** src, float** dst, int n, int batchSize)
{
    if (n <= 0 || batchSize <= 0)
    {
        fprintf(stderr, "invert: n and batchSize must be positive (n = %d, batchSize = %d)\n",
                n, batchSize);
        exit(EXIT_FAILURE);
    }

    /* All sizes are computed in size_t so n*n*batchSize cannot overflow int. */
    const size_t matElems  = (size_t)n * (size_t)n;
    const size_t matBytes  = matElems * sizeof(float);
    const size_t ptrBytes  = (size_t)batchSize * sizeof(float *);
    const size_t infoBytes = (size_t)batchSize * sizeof(int);
    const int lda = n;

    /* Host-side staging: device pointer tables and the INFO read-back buffer
       (heap allocated instead of a variable-length array). */
    float **A     = (float **)malloc(ptrBytes);
    float **C     = (float **)malloc(ptrBytes);
    int   *INFOh  = (int *)malloc(infoBytes);
    if (!A || !C || !INFOh)
    {
        fprintf(stderr, "invert: host allocation failed\n");
        exit(EXIT_FAILURE);
    }

    cublasHandle_t handle;
    cublascall(cublasCreate_v2(&handle));

    /* P holds n pivot indices per matrix, INFO one status word per matrix. */
    int *P, *INFO;
    cudacall(cudaMalloc(&P, (size_t)n * (size_t)batchSize * sizeof(int)));
    cudacall(cudaMalloc(&INFO, infoBytes));

    /* Input matrices: one flat device buffer plus a device table of pointers
       into it, as required by the batched cuBLAS interface. */
    float **A_d, *A_dflat;
    cudacall(cudaMalloc(&A_d, ptrBytes));
    cudacall(cudaMalloc(&A_dflat, matBytes * (size_t)batchSize));

    for (int i = 0; i < batchSize; i++)
        A[i] = A_dflat + (size_t)i * matElems;
    cudacall(cudaMemcpy(A_d, A, ptrBytes, cudaMemcpyHostToDevice));

    for (int i = 0; i < batchSize; i++)
        cudacall(cudaMemcpy(A[i], src[i], matBytes, cudaMemcpyHostToDevice));

    /* Step 1: in-place LU factorisation of every matrix. */
    cublascall(cublasSgetrfBatched(handle, n, A_d, lda, P, INFO, batchSize));

    cudacall(cudaMemcpy(INFOh, INFO, infoBytes, cudaMemcpyDeviceToHost));
    for (int i = 0; i < batchSize; i++)
        if (INFOh[i] != 0)
        {
            fprintf(stderr, "Factorization of matrix %d Failed: Matrix may be singular\n", i);
            cudaDeviceReset();
            exit(EXIT_FAILURE);
        }

    /* Output matrices: same flat-buffer + pointer-table layout. */
    float **C_d, *C_dflat;
    cudacall(cudaMalloc(&C_d, ptrBytes));
    cudacall(cudaMalloc(&C_dflat, matBytes * (size_t)batchSize));

    for (int i = 0; i < batchSize; i++)
        C[i] = C_dflat + (size_t)i * matElems;
    cudacall(cudaMemcpy(C_d, C, ptrBytes, cudaMemcpyHostToDevice));

    /* Step 2: out-of-place inversion from the LU factors. */
    cublascall(cublasSgetriBatched(handle, n, (const float **)A_d, lda, P, C_d, lda, INFO, batchSize));

    cudacall(cudaMemcpy(INFOh, INFO, infoBytes, cudaMemcpyDeviceToHost));
    for (int i = 0; i < batchSize; i++)
        if (INFOh[i] != 0)
        {
            fprintf(stderr, "Inversion of matrix %d Failed: Matrix may be singular\n", i);
            cudaDeviceReset();
            exit(EXIT_FAILURE);
        }

    for (int i = 0; i < batchSize; i++)
        cudacall(cudaMemcpy(dst[i], C[i], matBytes, cudaMemcpyDeviceToHost));

    /* Release device and host resources. */
    cudacall(cudaFree(A_d));
    cudacall(cudaFree(A_dflat));
    cudacall(cudaFree(C_d));
    cudacall(cudaFree(C_dflat));
    cudacall(cudaFree(P));
    cudacall(cudaFree(INFO));
    cublascall(cublasDestroy_v2(handle));

    free(A);
    free(C);
    free(INFOh);
}
-
-
-
- //
- //
-
-
-
/*
 * One thread per output element of Pd = Md * Nd, where all three matrices are
 * Width x Width and indexed as [row*Width + col].
 * The x launch dimension selects the column, the y dimension the row.
 * Threads that fall outside the matrix return without touching memory.
 */
__global__ void MatrixMulKernel(float *Md, float *Nd, float *Pd, int Width)
{
    const int c = blockIdx.x * blockDim.x + threadIdx.x;
    const int r = blockIdx.y * blockDim.y + threadIdx.y;

    /* Guard: the grid may be larger than the matrix. */
    if (c >= Width || r >= Width)
        return;

    /* Dot product of row r of Md with column c of Nd. */
    float acc = 0;
    int k = 0;
    while (k < Width)
    {
        const float lhs = Md[r * Width + k];
        const float rhs = Nd[k * Width + c];
        acc += (lhs * rhs);
        ++k;
    }

    Pd[r * Width + c] = acc;
}
-
-
-
/*
 * Write the heading and the Width x Width row-major matrix P to `out`, one
 * matrix row per line. Exact zeros are printed in a field one character
 * narrower than non-zero values.
 */
static void print_product(FILE *out, const float *P, int Width)
{
    fprintf(out, "\n\nMatrix Multiplication, M x Inv(M) :\n\n");
    for (int r = 0; r < Width; r++)
    {
        for (int c = 0; c < Width; c++)
        {
            const float v = P[(size_t)r * Width + c];
            if (v)
                fprintf(out, "%10f ", v);
            else
                fprintf(out, "%9f ", v);
        }
        fprintf(out, "\n");
    }
}

/*
 * Multiply two Width x Width row-major host matrices on the GPU and report
 * the product: to stdout when Width < 11, otherwise appended to the file
 * "Mat_Inv_out" in the current directory.
 *
 *   M, N   host matrices of Width*Width floats
 *   Width  matrix order (must be > 0)
 *
 * Ownership: M and N are released with free() before returning, so they must
 * come from malloc() and must not be used by the caller afterwards.
 *
 * Any CUDA error, allocation failure or failure to open the output file
 * terminates the process.
 */
void mul(float* M,float* N,int Width)
{
    if (Width <= 0)
    {
        fprintf(stderr, "mul: Width must be positive (got %d)\n", Width);
        exit(EXIT_FAILURE);
    }

    /* size_t arithmetic: Width*Width*sizeof(float) must not overflow int. */
    const size_t elems = (size_t)Width * (size_t)Width;
    const size_t size  = elems * sizeof(float);

    float *P = (float *)malloc(size);
    if (!P)
    {
        fprintf(stderr, "mul: host allocation failed\n");
        exit(EXIT_FAILURE);
    }

    float *Md, *Nd, *Pd;

    /* Transfer M and N to device memory, allocate P on the device. */
    cudacall(cudaMalloc((void**)&Md, size));
    cudacall(cudaMemcpy(Md, M, size, cudaMemcpyHostToDevice));

    cudacall(cudaMalloc((void**)&Nd, size));
    cudacall(cudaMemcpy(Nd, N, size, cudaMemcpyHostToDevice));

    cudacall(cudaMalloc((void**)&Pd, size));

    /* Launch configuration: a single Width x Width block while the matrix has
       at most 1024 elements, otherwise a grid of 32 x 32 blocks covering it
       (ceil(Width / 32) blocks per dimension). */
    dim3 dimBlock(Width, Width);
    dim3 dimGrid(1, 1);
    if (elems > 1024)
    {
        dimGrid.x = (Width - 1) / 32 + 1;
        dimGrid.y = (Width - 1) / 32 + 1;
        dimBlock.x = 32;
        dimBlock.y = 32;
    }

    MatrixMulKernel<<<dimGrid,dimBlock>>>(Md, Nd, Pd, Width);
    /* Launch-configuration errors are only visible through cudaGetLastError. */
    cudacall(cudaGetLastError());

    /* Blocking copy: also surfaces any error raised while the kernel ran. */
    cudacall(cudaMemcpy(P, Pd, size, cudaMemcpyDeviceToHost));

    cudacall(cudaFree(Md));
    cudacall(cudaFree(Nd));
    cudacall(cudaFree(Pd));

    fprintf(stdout, "\n\n");

    if (Width < 11)
    {
        print_product(stdout, P, Width);
    }
    else
    {
        /* Append: the caller has already written the matrices to this file. */
        FILE *fp = fopen("Mat_Inv_out", "a");
        if (!fp)
        {
            fprintf(stderr, "Failed to open Mat_Inv_out.\n");
            exit(1);
        }
        print_product(fp, P, Width);
        fclose(fp);
    }

    free(M);
    free(N);
    free(P);
}
-
-
-
- //
- //
-
-
-
/*
 * Read the w*w elements of a w x w matrix from standard input into h, in
 * the order they are typed (whitespace separated, parsed with "%f").
 *
 *   h  destination buffer with room for w*w floats
 *   w  matrix order
 *
 * Exits with EXIT_FAILURE if an element is missing or is not a number,
 * instead of leaving the remaining elements uninitialised.
 */
void fill(float* h, int w)
{
    const size_t count = (size_t)((long long)w * (long long)w);

    for (size_t i = 0; i < count; i++)
    {
        if (scanf("%f", &h[i]) != 1)
        {
            fprintf(stderr, "Failed to read matrix element %lu of %lu\n",
                    (unsigned long)(i + 1), (unsigned long)count);
            exit(EXIT_FAILURE);
        }
    }
}
-
-
- //
- //
-
-
/*
 * Print `batch` n x n row-major matrices to `out`. Each matrix is preceded by
 * `soleTitle` when batch == 1, otherwise by `indexedTitle`, a printf format
 * that receives the matrix index. Non-zero entries are printed in a field of
 * `width` characters, exact zeros in width - 1.
 */
static void print_matrix_batch(FILE *out, float **mats, int n, int batch,
                               const char *soleTitle, const char *indexedTitle,
                               int width)
{
    for (int q = 0; q < batch; q++)
    {
        if (batch == 1)
            fputs(soleTitle, out);
        else
            fprintf(out, indexedTitle, q);

        for (int i = 0; i < n; i++)
        {
            for (int j = 0; j < n; j++)
            {
                const float v = mats[q][(size_t)i * n + j];
                fprintf(out, "%*f ", v ? width : width - 1, v);
            }
            fprintf(out, "\n");
        }
    }
}

/*
 * Ask the user for the order n and the n*n elements of a square matrix,
 * invert it on the GPU, report the input and its inverse, then multiply the
 * two as a check.
 *
 * For n < 11 everything is printed to stdout; for larger n the matrices are
 * written to the file "Mat_Inv_out" (truncated first) and mul() appends the
 * product to it.
 *
 * Invalid input, allocation failure or failure to open the output file
 * terminates the process.
 */
void test_invert( )
{
    int n;

    printf("Enter the order of the square matrix :");
    /* Without this check a failed scanf would leave n uninitialised. */
    if (scanf("%d", &n) != 1 || n <= 0)
    {
        fprintf(stderr, "Invalid matrix order: expected a positive integer.\n");
        exit(EXIT_FAILURE);
    }
    const int mybatch = 1;

    /* Element/byte counts in size_t (a float cannot represent large byte
       counts exactly, which would under-allocate the buffers). */
    const size_t matElems = (size_t)n * (size_t)n;

    float  *mat1        = (float *)malloc(matElems * sizeof(float));
    float  *result_flat = (float *)malloc((size_t)mybatch * matElems * sizeof(float));
    float **results     = (float **)malloc(mybatch * sizeof(float *));
    float **inputs      = (float **)malloc(mybatch * sizeof(float *));
    if (!mat1 || !result_flat || !results || !inputs)
    {
        fprintf(stderr, "Host allocation failed.\n");
        exit(EXIT_FAILURE);
    }

    fill(mat1, n);

    for (int i = 0; i < mybatch; i++)
        results[i] = result_flat + (size_t)i * matElems;

    inputs[0] = mat1;

    invert(inputs, results, n, mybatch);

    if (n < 11)
    {
        print_matrix_batch(stdout, inputs, n, mybatch,
                           "Input Matrix, M :\n\n", "Input Matrix %d:\n\n", 12);
        fprintf(stdout, "\n\n");
        print_matrix_batch(stdout, results, n, mybatch,
                           "Inverse of the Input Matrix, Inv(M):\n\n", "Inverse Matrix %d:\n\n", 10);
    }
    else // order of the matrix is more than 10 x 10 then output the results in the file
    {
        printf("\nThe order of matrix is too large to display in terminal\n, Please open the file : Mat_Inv_out located in the current folder. To see the output.\n\n");

        FILE *fp = fopen("Mat_Inv_out", "w");
        if (!fp)
        {
            fprintf(stderr, "Failed to open Mat_Inv_out.\n");
            exit(1);
        }

        print_matrix_batch(fp, inputs, n, mybatch,
                           "Input Matrix , M:\n\n", "Input Matrix %d:\n\n", 12);
        fprintf(fp, "\n\n");
        print_matrix_batch(fp, results, n, mybatch,
                           "Inverse of the Input Matrix, Inv(M):\n\n", "Inverse %d:\n\n", 10);

        fclose(fp);
    }

    /* mul() frees both matrices it is given (mat1 and result_flat), so only
       the two pointer tables remain to be released here. */
    mul(inputs[0], results[0], n);

    free(inputs);
    free(results);
}
-
- //
- //
-
-
/*
 * Entry point of the interactive variant: all input (matrix order and
 * elements) is requested by test_invert() itself.
 */
int main()
{
    test_invert();   /* prompts, inverts, prints and verifies */
    return EXIT_SUCCESS;
}
-
- /*
- $ nvcc -arch=sm_20 -o t540 t540.cu -lcublas
- $ ./t540
- Input 0:
- 0.000000 3.000000 4.000000
- 1.000000 3.000000 10.000000
- 4.000000 9.000000 16.000000
- Input 1:
- 0.500000 3.000000 4.000000
- 1.000000 3.000000 10.000000
- 4.000000 9.000000 16.000000
- Input 2:
- 0.000000 3.000000 4.000000
- 1.000000 5.000000 6.000000
- 9.000000 8.000000 2.000000
- Input 3:
- 22.000000 3.000000 4.000000
- 1.000000 5.000000 6.000000
- 9.000000 8.000000 2.000000
- Inverse 0:
- -0.700000 -0.200000 0.300000
- 0.400000 -0.266667 0.066667
- -0.050000 0.200000 -0.050000
- Inverse 1:
- -1.076923 -0.307692 0.461538
- 0.615385 -0.205128 -0.025641
- -0.076923 0.192308 -0.038462
- Inverse 2:
- -4.750000 3.250000 -0.250000
- 6.500000 -4.500000 0.500000
- -4.625000 3.375000 -0.375000
- Inverse 3:
- 0.045894 -0.031401 0.002415
- -0.062802 -0.009662 0.154589
- 0.044686 0.179952 -0.129227
- $
- $ nvcc -arch=sm_20 -o t540 t540.cu -lcublas
- $ ./t540
- Enter the order of the square matrix :4
- Input 0:
- -0.100222 -2.553872 -69.072723 0.016120
- -2.752346 -1.230871 1.997387 0.606710
- -0.029929 -0.583444 2.733107 0.254404
- -1.844285 -0.070541 1.906255 10.758991
- Inverse 0:
- 0.017501 -0.374555 0.713068 0.004234
- -0.056876 -0.005437 -1.457745 0.034861
- -0.012399 0.000729 0.052888 -0.001273
- 0.004824 -0.064370 0.103305 0.094125
- */

#Makefile
# Builds the two matrix-inversion demos with nvcc and links against cuBLAS.
#   make / make all : random-input demo  -> ./mat_inv <matrix_width>
#   make user       : stdin-input demo   -> ./user_mat_inv
#   make clean      : remove both executables
# NVCC and ARCH can be overridden on the command line, e.g. `make ARCH=sm_80`.

NVCC ?= nvcc
# One architecture for both targets (sm_20 is no longer accepted by current nvcc).
ARCH ?= sm_70
LIBS  = -lcublas -lm

default: all

all: mat_inv

mat_inv: mat_inv.cu
	$(NVCC) -arch=$(ARCH) -o mat_inv mat_inv.cu $(LIBS)

user: user_mat_inv

user_mat_inv: mat_inv_userInput.cu
	$(NVCC) -arch=$(ARCH) -o user_mat_inv mat_inv_userInput.cu $(LIBS)

# -f: do not fail when an executable has not been built yet.
clean:
	rm -f mat_inv user_mat_inv

.PHONY: default all user clean
$make
$./mat_inv 3
std out:
- Input Matrix, M :
-
- -3.662523 -1.507356 1.094427
- -0.853989 1.768155 45.548477
- -5.994538 6.792342 -323.404816
-
-
- Inverse of the Input Matrix, Inv(M):
-
- -0.217016 -0.118222 -0.017385
- -0.135258 0.293317 0.040853
- 0.001182 0.008352 -0.001912
-
-
-
-
- Matrix Multiplication, M x Inv(M) :
-
- 1.000000 0.000000 -0.000000
- 0.000000 1.000000 -0.000000
- 0.000000 -0.000000 1.000000

Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。