当前位置:   article > 正文

一个简洁的cublasSmatinvBatched应用示例

cublasSmatinvBatched

可以简单地粘贴放入一个cuBLAS sample的文件中替代运行:

  1. //一个简洁的cublasSmatinvBatched 示例:
  2. /* Includes, system */
  3. #include <stdio.h>
  4. #include <stdlib.h>
  5. #include <string.h>
  6. /* Includes, cuda */
  7. #include <cublas_v2.h>
  8. #include <cuda_runtime.h>
  9. #include <helper_cuda.h>
  10. /* Matrix size */
  11. #define N (2)
  12. #define BATCH_SIZE (1)
  13. /* Main */
  14. int main(int argc, char **argv) {
  15. cublasStatus_t status;
  16. float* h_A;
  17. float* d_A = 0;
  18. float* d_Ainv = 0;
  19. float* h_Ainv = 0;
  20. int n2 = N * N;
  21. int* info=NULL;
  22. float** A=NULL;
  23. float** Ainv=NULL;
  24. cublasHandle_t handle;
  25. printf("LL:: main()\n");
  26. int dev = findCudaDevice(argc, (const char **)argv);
  27. if (dev == -1) {
  28. return EXIT_FAILURE;
  29. }
  30. printf("simpleCUBLAS_Smatinv test running..\n");
  31. status = cublasCreate(&handle);
  32. if (status != CUBLAS_STATUS_SUCCESS) {
  33. fprintf(stderr, "!!!! CUBLAS initialization error\n");
  34. return EXIT_FAILURE;
  35. }
  36. h_A = reinterpret_cast<float *>(malloc(n2 * sizeof(h_A[0])));
  37. if (h_A == 0) {
  38. fprintf(stderr, "!!!! host memory allocation error (A)\n");
  39. return EXIT_FAILURE;
  40. }
  41. h_Ainv = reinterpret_cast<float *>(malloc(n2 * sizeof(h_Ainv[0])));
  42. if (h_Ainv == 0) {
  43. fprintf(stderr, "!!!! host memory allocation error (A)\n");
  44. return EXIT_FAILURE;
  45. }
  46. for (int i = 0; i < n2; i++) {
  47. h_A[i] = rand() / static_cast<float>(RAND_MAX);
  48. }
  49. cudaMalloc(&A, sizeof(float*));
  50. cudaMalloc(&Ainv, sizeof(float*));
  51. if (cudaMalloc(reinterpret_cast<void **>(&d_A), n2 * sizeof(d_A[0])) !=
  52. cudaSuccess) {
  53. fprintf(stderr, "!!!! device memory allocation error (allocate d_A)\n");
  54. return EXIT_FAILURE;
  55. }
  56. cudaMemcpy(A, &d_A, sizeof(float*), cudaMemcpyHostToDevice);
  57. if (cudaMalloc(reinterpret_cast<void **>(&d_Ainv), n2 * sizeof(d_Ainv[0])) !=
  58. cudaSuccess) {
  59. fprintf(stderr, "!!!! device memory allocation error (allocate d_Ainv)\n");
  60. return EXIT_FAILURE;
  61. }
  62. cudaMemcpy(Ainv, &d_Ainv, sizeof(float*), cudaMemcpyHostToDevice);
  63. if (cudaMalloc(reinterpret_cast<void **>(&info), BATCH_SIZE*sizeof(int)) !=
  64. cudaSuccess) {
  65. fprintf(stderr, "!!!! device memory allocation error (allocate A)\n");
  66. return EXIT_FAILURE;
  67. }
  68. status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1);
  69. if (status != CUBLAS_STATUS_SUCCESS) {
  70. fprintf(stderr, "!!!! device access error (write A)\n");
  71. return EXIT_FAILURE;
  72. }
  73. status = cublasSmatinvBatched(handle, N, A, N,
  74. Ainv, N, info, BATCH_SIZE);
  75. if (status != CUBLAS_STATUS_SUCCESS) {
  76. fprintf(stderr, "!!!! kernel execution error.\n");
  77. return EXIT_FAILURE;
  78. }
  79. status = cublasGetVector(n2, sizeof(float), d_Ainv, 1, h_Ainv, 1);
  80. if (status != CUBLAS_STATUS_SUCCESS) {
  81. fprintf(stderr, "!!!! device access error (read C)\n");
  82. return EXIT_FAILURE;
  83. }
  84. printf("A =\n");
  85. for(int i=0; i<n2; i++){
  86. if(i%N==0)printf("\n");
  87. printf("%3.3f ",h_A[i]);
  88. }
  89. printf("\ninversion of A:\n");
  90. printf("Ainv =\n");
  91. for(int i=0; i<n2; i++){
  92. if(i%N==0) printf("\n");
  93. printf("%3.3f ",h_Ainv[i]);
  94. }
  95. printf("\n\n");
  96. free(h_A);
  97. free(h_Ainv);
  98. if(cudaFree(d_A) != cudaSuccess) {
  99. fprintf(stderr, "!!!! memory free error (d_A)\n");
  100. return EXIT_FAILURE;
  101. }
  102. if (cudaFree(d_Ainv) != cudaSuccess) {
  103. fprintf(stderr, "!!!! memory free error (d_Ainv)\n");
  104. return EXIT_FAILURE;
  105. }
  106. status = cublasDestroy(handle);
  107. if (status != CUBLAS_STATUS_SUCCESS) {
  108. fprintf(stderr, "!!!! shutdown error (A)\n");
  109. return EXIT_FAILURE;
  110. }
  111. return 0;
  112. }

不检查版:

  1. //一个简洁的cublasSmatinvBatched 示例:
  2. #include <stdio.h>
  3. #include <stdlib.h>
  4. #include <string.h>
  5. #include <cublas_v2.h>
  6. #include <cuda_runtime.h>
  7. #define N (5)
  8. #define BATCH_SIZE (1)
  9. int NV_smatinv(float* matrixA, int n2);
  10. /* cublasStatus_t cublasSmatinvBatched(cublasHandle_t handle, int n, const float *A[], int lda,
  11. float *Ainv[], int lda_inv, int *info, int batchSize); */
  12. int main(){
  13. float* matrixA;
  14. matrixA = reinterpret_cast<float *>(malloc(N*N*sizeof(matrixA[0])));
  15. for(int i=0; i<N*N; i++){
  16. matrixA[i] = rand() / static_cast<float>(RAND_MAX);
  17. }
  18. NV_smatinv(matrixA, N*N);
  19. //IX_smatinv(matrixA, N*N);
  20. free(matrixA);
  21. return 0;
  22. }
  23. int NV_smatinv(float* matrixA, int n2) {
  24. cublasStatus_t status;
  25. float* h_A;
  26. float* d_A = 0;
  27. float* d_Ainv = 0;
  28. float* h_Ainv = 0;
  29. int* info=NULL;
  30. float** A=NULL; //LL:: array of matrices in d_A
  31. float** Ainv=NULL; //LL:: array of inversion of matrices d_Ainv
  32. cublasHandle_t handle;
  33. status = cublasCreate(&handle);
  34. h_A = reinterpret_cast<float *>(malloc(n2 * sizeof(h_A[0])));
  35. h_Ainv = reinterpret_cast<float *>(malloc(n2 * sizeof(h_Ainv[0])));
  36. memcpy(h_A, matrixA, n2*sizeof(matrixA[0]));
  37. cudaMalloc(&A, sizeof(float*)); //LL:: this example only has one float Matrix
  38. cudaMalloc(&Ainv, sizeof(float*)); //LL::
  39. if (cudaMalloc(reinterpret_cast<void **>(&d_A), n2 * sizeof(d_A[0])) !=
  40. cudaSuccess) {
  41. fprintf(stderr, "!!!! device memory allocation error (allocate d_A)\n");
  42. return EXIT_FAILURE;
  43. }
  44. cudaMemcpy(A, &d_A, sizeof(float*), cudaMemcpyHostToDevice);
  45. if (cudaMalloc(reinterpret_cast<void **>(&d_Ainv), n2 * sizeof(d_Ainv[0])) !=
  46. cudaSuccess) {
  47. fprintf(stderr, "!!!! device memory allocation error (allocate d_Ainv)\n");
  48. return EXIT_FAILURE;
  49. }
  50. cudaMemcpy(Ainv, &d_Ainv, sizeof(float*), cudaMemcpyHostToDevice);
  51. if (cudaMalloc(reinterpret_cast<void **>(&info), BATCH_SIZE*sizeof(int)) !=
  52. cudaSuccess) {
  53. fprintf(stderr, "!!!! device memory allocation error (allocate A)\n");
  54. return EXIT_FAILURE;
  55. }
  56. cudaMemcpy(d_A, h_A, n2*sizeof(h_A[0]), cudaMemcpyHostToDevice);
  57. if (status != CUBLAS_STATUS_SUCCESS) {
  58. fprintf(stderr, "!!!! device access error (write A)\n");
  59. return EXIT_FAILURE;
  60. }
  61. /* cublasStatus_t cublasSmatinvBatched(cublasHandle_t handle, int n, const float *A[], int lda,
  62. float *Ainv[], int lda_inv, int *info, int batchSize); */
  63. status = cublasSmatinvBatched(handle, N, A, N,
  64. Ainv, N, info, BATCH_SIZE);
  65. if (status != CUBLAS_STATUS_SUCCESS) {
  66. fprintf(stderr, "!!!! kernel execution error.\n");
  67. return EXIT_FAILURE;
  68. }
  69. cudaMemcpy(h_Ainv, d_Ainv, n2*sizeof(h_Ainv[0]), cudaMemcpyDeviceToHost);
  70. printf("\nnew A ="); for(int i=0; i<n2; i++){ if(i%N==0)printf("\n"); printf("%3.3f ",h_A[i]); }
  71. printf("\n\nnew Ainv ="); for(int i=0; i<n2; i++){ if(i%N==0) printf("\n"); printf("%3.3f ",h_Ainv[i]); } printf("\n\n");
  72. free(h_A);
  73. free(h_Ainv);
  74. cudaFree(d_A);
  75. cudaFree(d_Ainv);
  76. cudaFree(A);
  77. cudaFree(Ainv);
  78. cudaFree(info);
  79. status = cublasDestroy(handle);
  80. return 0;
  81. }

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/盐析白兔/article/detail/245514
推荐阅读
相关标签
  

闽ICP备14008679号