赞
踩
搭建一个能够调用cublas函数的环境中编译运行:
- //#include "device_launch_parameters.h"
- #include <cublas_v2.h>
- #include <cuda_runtime.h>
-
- #include <stdio.h>
- #include <stdlib.h>
- #include <string.h>
-
- int printMatrix(char* matrixName, int m, int n, float* a_matrix, int lda) {
- printf("\n%s =\n", matrixName);
-
- for (int i = 0; i < m; i++) {
- printf("\n");
- for (int j = 0; j < n; j++) {
- printf(" %8.5f ", a_matrix[i + j * lda]);
- }
- }
-
- printf("\n");
-
- return 0;
- }
-
- void printVector(const char* log, int* Vector, int size) {
- printf("\n%s\n",log);
- for (int i = 0; i < size; i++) {
- printf(" %d ", Vector[i]);
- }
- printf("\n");
-
- }
-
- void initMatrix(float* A, int dim, int seed) {
-
- srand(2022 + seed);
- printf("Starting Init Matrix ...\n");
- for (int i = 0; i < dim * dim; i++) {
- A[i] = (float)((rand() % 100) - 50) / 100.0 + 1.0;
- //LL:: printf("A[%d]=%f ",i,A[i]);
- }
- printf("Init is OK.\n");
- }
-
- void do_dgemm(int M, int N, int K, float* A, int lda, float* B, int ldb, float* C, int ldc) {
- for (int i = 0; i < M; i++) {
- for (int j = 0; j < N; j++) {
- C[i + j * ldc] = 0.0; //C(i,j)=0.0;
-
- for (int k = 0; k < K; k++) {
- //C(i,j) = ΣA(i,k)B(k,j) k=0,1, ... ,K-1
- C[i + j * ldc] += A[i + k * lda] * B[k + j * ldb];
- }
- }
- }
- }
-
-
- float* d_Get3DInv(float* A, int n, int batchedSize)
- {
- cublasHandle_t cu_cublasHandle;
- cublasCreate(&cu_cublasHandle);
-
- float** AarrayDev;
- float** CarrayDev;
-
- float** AarrayHost=NULL;
- AarrayHost = (float**)malloc(batchedSize * sizeof(float*));
-
- float** CarrayHost = NULL;
- CarrayHost = (float**)malloc(batchedSize * sizeof(float*));
-
- float* A_data_dev;
- float* C_data_dev;
- int* LUPivots_dev;
- int* LUInfo_dev;
- size_t size_data_A = batchedSize * n * n * sizeof(float);
-
- cudaMalloc(&AarrayDev, batchedSize * sizeof(float*));
- cudaMalloc(&CarrayDev, batchedSize * sizeof(float*));
- //cudaMalloc(&adC, sizeof(float*));
- cudaMalloc(&A_data_dev, size_data_A);
- for (int i = 0; i < batchedSize; i++) {
- AarrayHost[i] = A_data_dev + i * n * n;
- }
-
- cudaMalloc(&C_data_dev, size_data_A);
- for (int i = 0; i < batchedSize; i++) {
- CarrayHost[i] = C_data_dev + i * n * n;
- }
-
- cudaMalloc(&LUPivots_dev, batchedSize * n * sizeof(int));
- cudaMalloc(&LUInfo_dev, batchedSize * sizeof(int));
-
- cudaMemcpy(A_data_dev, A, size_data_A, cudaMemcpyHostToDevice);
- cudaMemcpy(AarrayDev, AarrayHost, batchedSize*sizeof(float*), cudaMemcpyHostToDevice);
- cudaMemcpy(CarrayDev, CarrayHost, batchedSize*sizeof(float*), cudaMemcpyHostToDevice);
-
- cublasSgetrfBatched(cu_cublasHandle, n, AarrayDev, n, LUPivots_dev, LUInfo_dev, batchedSize);
- cudaDeviceSynchronize();
-
- float* resGETRF = (float*)malloc(size_data_A);
- cudaMemcpy(resGETRF, A_data_dev, size_data_A, cudaMemcpyDeviceToHost);
-
- int* pivot = (int*)malloc(n * batchedSize*sizeof(int));
- cudaMemcpy(pivot, LUPivots_dev, n * batchedSize*sizeof(int), cudaMemcpyDeviceToHost);
- for (int i = 0; i < batchedSize; i++) {
- printVector("Pivot",pivot+i*n,n);
- }
-
- for (int i = 0; i < batchedSize; i++) {
- printMatrix("Matrix A_LU", n, n, resGETRF+i*n*n, n);
- }
-
- cublasSgetriBatched(cu_cublasHandle, n, (const float**)AarrayDev, n, LUPivots_dev, CarrayDev, n, LUInfo_dev, batchedSize);
- cudaDeviceSynchronize();
-
- float* res = (float*)malloc(size_data_A);
- cudaMemcpy(res, C_data_dev, size_data_A, cudaMemcpyDeviceToHost);
-
-
- cudaFree(LUInfo_dev);
- cudaFree(LUPivots_dev);
- cudaFree(C_data_dev);
- cudaFree(A_data_dev);
- cudaFree(CarrayDev);
- cudaFree(AarrayDev);
-
- cublasDestroy(cu_cublasHandle);
-
- free(AarrayHost);
- free(CarrayHost);
-
- return res;
- }
- int main(int argc, char* argv[]) {
- int n = 5;
- int batchedSize = 2;
- int J = 1;
- float* A = (float*)malloc(batchedSize * n * n * sizeof(float));
-
- for (int i = 0; i < batchedSize; i++) {
- initMatrix(A + i * n * n, n, i);
- }
-
- for (int i = 0; i < batchedSize; i++) {
- printMatrix("Matrix A", n, n, A + i*n*n, n);
- }
-
- float* inv = d_Get3DInv(A, n, batchedSize);
-
- for (int i = 0; i < batchedSize; i++) {
- printMatrix("Matrix A'", n, n, inv+i*n*n, n);
- }
-
- float* C_gemm = (float*)malloc(n * n * sizeof(float));
-
- for (int i = 0; i < batchedSize; i++) {
- do_dgemm(n, n, n, A+i*n*n, n, inv+i*n*n, n, C_gemm, n);
- printMatrix("Matrix A*A'", n, n, C_gemm, n);
- }
-
- free(C_gemm);
-
- if (inv != NULL) {
- free(inv);
- inv = NULL;
- }
-
- free(A);
-
- return 0;
- }
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。