After several days of fiddling I finally got cuBLAS matrix inversion working, but I still have quite a few questions, since the code was pieced together from other people's programs found online. My main points of confusion are the questions below; I hope someone more experienced can offer a few pointers, and I'd welcome any discussion.
① When initializing the matrix, why can't matHost[0] be initialized the way the two commented-out lines in my code do it? Initializing it that way gives the compile error: expected an expression.
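For context, here is a minimal sketch of one way I could imagine writing it in a single statement (my own illustration, not part of the program I got working; the array name init and the use of std::copy from <algorithm> are my additions). Presumably the error comes from matHost[0] being a plain float* rather than an array, so it cannot take a brace-enclosed list in an assignment, whereas a named array can take one in its declaration:

    //sketch only: brace-initialize a local array, then copy it into the heap buffer
    const float init[N*N] = {-0.997497f, 0.617481f, -0.299417f,
                              0.127171f,  0.170019f,  0.791925f,
                             -0.613392f, -0.0402539f, 0.64568f};
    std::copy(init, init + N*N, matHost[0]);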
② Why do we need to define a pointer array srchd that lives on the host but whose entries point to device memory, then define another pointer srcDptr on the device, and finally copy srchd into srcDptr? Why can't srchd be dropped altogether, allocating srcDptr directly on the device and copying the data from matHost straight into srcDptr, i.e.:
float **srcDptr;
cudaMalloc((void**)&srcDptr, sizeof(float*) * NUM);
for(int i = 0; i < NUM; i++)
{
    cudaMalloc((void**)&srcDptr[i], sizeof(float) * N * N);
    cudaMemcpy(srcDptr[i], matHost[i], sizeof(float) * N * N, cudaMemcpyHostToDevice);
}
and then pass srcDptr to cublasSgetrfBatched.
When I tested this it did not work, and I don't know why.
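In case it helps frame the question, here is a sketch of an alternative I have not actually tried (my own illustration; the variable name d_mat is made up). My suspicion is that the problem is touching srcDptr[i] from host code: srcDptr holds a device address, so cudaMalloc((void**)&srcDptr[i], ...) is asked to write through a device address on the host side. The sketch instead keeps each device pointer in a temporary host variable and copies the pointer value itself into the device-side array:

    float **srcDptr;
    cudaMalloc((void**)&srcDptr, sizeof(float*) * NUM);
    for(int i = 0; i < NUM; i++)
    {
        float *d_mat;   //host variable holding a device pointer (name is mine)
        cudaMalloc((void**)&d_mat, sizeof(float) * N * N);
        cudaMemcpy(d_mat, matHost[i], sizeof(float) * N * N, cudaMemcpyHostToDevice);
        //copy the pointer value into slot i of the device-resident pointer array
        cudaMemcpy(srcDptr + i, &d_mat, sizeof(float*), cudaMemcpyHostToDevice);
    }

The downside is that without a host-side record of each d_mat there is nothing to pass to cudaFree later, which may be why keeping the srchd intermediary around is convenient.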
③ What exactly is the pivot parameter of cublasSgetrfBatched? Is it the leading principal minors of the matrix? I ask because LU decomposition is only possible when all the leading principal minors are nonzero; is that what it is for?
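To make the question concrete, this is the kind of check I have in mind (my own sketch, assuming pivotArray has already been filled by the cublasSgetrfBatched call in the code below). As far as I understand, it should print N row-interchange indices per matrix recorded during partial pivoting, rather than principal minors, but I am not sure:

    //sketch: copy the pivot sequence back to the host and print it
    int *pivotHost = new int[N * NUM];
    cudaMemcpy(pivotHost, pivotArray, sizeof(int) * N * NUM, cudaMemcpyDeviceToHost);
    for(int j = 0; j < NUM; j++)
    {
        std::cout << "pivot sequence of matrix " << j << ":";
        for(int i = 0; i < N; i++)
            std::cout << " " << pivotHost[j * N + i];
        std::cout << std::endl;
    }
    delete []pivotHost;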
Here is the code I got working:
#include<iostream>
#include"cuda_runtime.h"
#include<cublas_v2.h>
#include<stdlib.h>
#include<time.h>


//order of each matrix
#define N 3
//number of matrices in the batch
#define NUM 2

int main()
{
    //allocate a two-dimensional array on the host
    float **matHost = new float*[NUM];
    for(int i=0;i<NUM;i++)
        matHost[i] = new float[N*N];

    //matHost[0] = {-0.997497,0.617481,-0.299417,0.127171,0.170019,
    //0.791925,-0.613392,-0.0402539,0.64568};
    matHost[0][0] = -0.997497;
    matHost[0][1] = 0.617481;
    matHost[0][2] = -0.299417;
    matHost[0][3] = 0.127171;
    matHost[0][4] = 0.170019;
    matHost[0][5] = 0.791925;
    matHost[0][6] = -0.613392;
    matHost[0][7] = -0.0402539;
    matHost[0][8] = 0.64568;


    //initialize the remaining matrices; they are all copies of the first one
    for(int j=1;j<NUM;j++)
    {
        for(int i=0;i<N*N;i++)
        {
            matHost[j][i] = matHost[0][i];
        }
    }

    //the pointer array lives on the host, but its entries point to device memory
    float **srchd = new float*[NUM];

    for(int i=0;i<NUM;i++)
    {
        cudaMalloc((void**)&srchd[i],sizeof(float)*N*N);
        cudaMemcpy(srchd[i],matHost[i],sizeof(float)*N*N,cudaMemcpyHostToDevice);
    }

    float **srcDptr;
    cudaMalloc((void**)&srcDptr,sizeof(float*)*NUM);
    cudaMemcpy(srcDptr,srchd,sizeof(float*)*NUM,cudaMemcpyHostToDevice);


    //records whether each LU factorization succeeded; 0 means success
    int *infoArray;
    cudaMalloc((void**)&infoArray,sizeof(int)*NUM);

    int *pivotArray;
    cudaMalloc((void**)&pivotArray,sizeof(int)*N*NUM);

    cublasHandle_t cublasHandle;
    cublasCreate(&cublasHandle);

    //LU factorization, performed in place
    cublasSgetrfBatched(cublasHandle,N,srcDptr,N,pivotArray,infoArray,NUM);

    float **resulthd = new float*[NUM];
    for(int i=0;i<NUM;i++)
        cudaMalloc((void**)&resulthd[i],sizeof(float)*N*N);

    float **resultDptr;
    cudaMalloc((void**)&resultDptr,sizeof(float*)*NUM);
    cudaMemcpy(resultDptr,resulthd,sizeof(float*)*NUM,cudaMemcpyHostToDevice);

    //turn the LU factors into the inverse matrices
    cublasSgetriBatched(cublasHandle,N,(const float**)srcDptr,N,pivotArray,resultDptr,N,infoArray,NUM);

    float **invresult = new float*[NUM];
    for(int i=0;i<NUM;i++)
    {
        invresult[i] = new float[N*N];
        //note: copy from resulthd[i], not resultDptr[i], otherwise it fails
        cudaMemcpy(invresult[i],resulthd[i],sizeof(float)*N*N,cudaMemcpyDeviceToHost);
    }


    int *infoArrayHost = new int[NUM];
    cudaMemcpy(infoArrayHost,infoArray,sizeof(int)*NUM,cudaMemcpyDeviceToHost);

    std::cout<<"info array:"<<std::endl;
    for(int i=0;i<NUM;i++)
        std::cout<<infoArrayHost[i]<<" ";
    std::cout<<std::endl;

    cublasDestroy(cublasHandle);

    std::cout<<"matrix inverse result:"<<std::endl;
    for(int i=0;i<N*N;i++)
    {
        if(i%N == 0)
            std::cout<<std::endl;

        std::cout<<invresult[0][i]<<" ";
    }
    std::cout<<std::endl;

    //free memory
    for(int i=0;i<NUM;i++)
    {
        cudaFree(srchd[i]);
        delete []matHost[i];
        matHost[i] = NULL;
        cudaFree(resulthd[i]);
        delete []invresult[i];
        invresult[i] = NULL;
    }

    delete []matHost;
    matHost = NULL;
    delete []resulthd;
    resulthd = NULL;
    delete []invresult;
    invresult = NULL;

    delete []infoArrayHost;
    infoArrayHost = NULL;

    delete []srchd;
    srchd = NULL;

    cudaFree(infoArray);
    cudaFree(pivotArray);
    cudaFree(srcDptr);
    cudaFree(resultDptr);

    return 0;
}
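For reference, I build this with nvcc and link against the cuBLAS library, roughly: nvcc main.cu -o invtest -lcublas (the file and output names are just whatever I happened to use).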