当前位置:   article > 正文

c++并行计算(基于cuda)_deviceprop.major == 9999

deviceprop.major == 9999

1. 检查NVIDIA显卡驱动程序,我这里装系统时就已经安装了,于是去官网下载更新了版本;
GeForce Game Ready 驱动程序 | 511.79 | Windows 10 64-bit, Windows 11 | NVIDIA 下载(Chinese Simplified,发布日期 2022.2.14):https://www.nvidia.cn/Download/driverResults.aspx/187102/cn/

2. 查看驱动程序对应的CUDA版本,如图所示

3.安装cuda11.6,官网下载:

CUDA Toolkit 11.6 Downloads | NVIDIA Developer:https://developer.nvidia.com/cuda-downloads

4. 测试是否安装成功;

cmd下运行 nvcc -V

下载 cuda示例代码

https://github.com/nvidia/cuda-samples

5. 安装cudnn,没有找到11.6对应版本,地址:

https://developer.nvidia.com/rdp/cudnn-download

下载需要登录,但是注册失败,于是使用迅雷下载绕过登录,版本为8.2.1对应cuda11.X。

 将cudnn解压并复制到cuda目录下同名文件夹

 6. 打开VS2017,新建项目,选择

 7. 自动生成一个.cu程序,实现数组相加功能。

CUDA程序执行过程:

  • 分配host内存,并进行数据初始化;
  • 分配device内存,并从host将数据拷贝到device上;
  • 调用CUDA的核函数在device上完成指定的运算;
  • 将device上的运算结果拷贝到host上;
  • 释放device和host上分配的内存。

kernel核函数是在device上线程中并行执行的函数,用__global__符号声明,在调用时需要用<<<grid, block>>>来指定一个kernel函数要执行的线程数量,在CUDA中,每个线程会分配一个唯一的线程号thread ID,这个ID值可以通过核函数的内置变量threadIdx来获得。

一个kernel核函数在device上执行时实际上启动了很多线程,一个kernel所启动的所有线程称为一个线程格(grid),同一个线程格上的线程共享相同的全局内存空间;一个线程格又分为很多线程块(block),一个线程块里面包含很多线程。

8.编写矩阵乘法运算程序:

  1. #include "cuda_runtime.h"
  2. #include "device_launch_parameters.h"
  3. #include <stdio.h>
  4. #include <stdlib.h>
  5. #include <time.h>
  6. const int width = 1 << 3;
  7. const int height = 1 << 3;
  8. void getCudaInformation();
  9. // 矩阵相乘kernel函数,2-D,每个线程计算一个元素Cij
  10. __global__ void matMulKernel(float *A, float *B, float *C) {
  11. float Cvalue = 0.0;
  12. int row = threadIdx.y + blockIdx.y * blockDim.y;
  13. int col = threadIdx.x + blockIdx.x * blockDim.x;
  14. for (int i = 0; i < width; ++i) {
  15. Cvalue += A[row * width + i] * B[i * width + col];
  16. }
  17. //Cvalue = A[row * width + col] + B[row * width + col];
  18. C[row * width + col] = Cvalue;
  19. }
  20. cudaError_t mulWithCuda(float *A, float *B, float *C, unsigned int width, unsigned int height);
  21. int main()
  22. {
  23. //设备测试
  24. getCudaInformation();
  25. //数组运算
  26. float *A, *B, *C;
  27. //分配内存
  28. //int nBytes = width * height * sizeof(float);
  29. A = (float *)malloc(width * height * sizeof(float));
  30. B = (float *)malloc(width * height * sizeof(float));
  31. C = (float *)malloc(width * height * sizeof(float));
  32. // 初始化A矩阵所有元素为1.0,B矩阵所有元素为2.0
  33. for (int i = 0; i < width * height; ++i) {
  34. A[i] = 1.0;
  35. B[i] = 2.0;
  36. C[i] = 10.0;
  37. }
  38. //计时
  39. double startTime1 = clock();//1计时开始
  40. // Add vectors in parallel.
  41. cudaError_t cudaStatus = mulWithCuda(A, B, C, width, height);
  42. if (cudaStatus != cudaSuccess) {
  43. fprintf(stderr, "addWithCuda failed!");
  44. return 1;
  45. }
  46. //计时
  47. double endTime1 = clock();//1计时结束
  48. for (int i = 0; i < width * height; ++i)
  49. {
  50. printf("%f ", C[i]);
  51. if (((i + 1) / width != 0) && ((i + 1) % width == 0))
  52. printf("\n");
  53. }
  54. //运行时间
  55. printf("%.9f \n", (double)(endTime1 - startTime1) / CLOCKS_PER_SEC);
  56. // cudaDeviceReset must be called before exiting in order for profiling and
  57. // tracing tools such as Nsight and Visual Profiler to show complete traces.
  58. cudaStatus = cudaDeviceReset();
  59. if (cudaStatus != cudaSuccess) {
  60. fprintf(stderr, "cudaDeviceReset failed!");
  61. return 1;
  62. }
  63. //释放内存
  64. free(A);
  65. free(B);
  66. free(C);
  67. return 0;
  68. }
  69. // 使用CUDA进行矩阵乘法
  70. cudaError_t mulWithCuda(float *A, float *B, float *C, unsigned int width, unsigned int height)
  71. {
  72. float *dev_a;
  73. float *dev_b;
  74. float *dev_c;
  75. cudaError_t cudaStatus;
  76. // Choose which GPU to run on, change this on a multi-GPU system.
  77. cudaStatus = cudaSetDevice(0);
  78. if (cudaStatus != cudaSuccess) {
  79. fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
  80. goto Error;
  81. }
  82. // Allocate GPU buffers for three vectors (two input, one output) .
  83. int nBytes = width * height * sizeof(float);
  84. cudaStatus = cudaMalloc((void**)&dev_c, nBytes);
  85. if (cudaStatus != cudaSuccess) {
  86. fprintf(stderr, "cudaMalloc failed!");
  87. goto Error;
  88. }
  89. cudaStatus = cudaMalloc((void**)&dev_a, nBytes);
  90. if (cudaStatus != cudaSuccess) {
  91. fprintf(stderr, "cudaMalloc failed!");
  92. goto Error;
  93. }
  94. cudaStatus = cudaMalloc((void**)&dev_b, nBytes);
  95. if (cudaStatus != cudaSuccess) {
  96. fprintf(stderr, "cudaMalloc failed!");
  97. goto Error;
  98. }
  99. // Copy input vectors from host memory to GPU buffers.
  100. cudaStatus = cudaMemcpy(dev_a, A, nBytes, cudaMemcpyHostToDevice);
  101. if (cudaStatus != cudaSuccess) {
  102. fprintf(stderr, "cudaMemcpy failed!");
  103. goto Error;
  104. }
  105. cudaStatus = cudaMemcpy(dev_b, B, nBytes, cudaMemcpyHostToDevice);
  106. if (cudaStatus != cudaSuccess) {
  107. fprintf(stderr, "cudaMemcpy failed!");
  108. goto Error;
  109. }
  110. // 定义kernel的blocksize为(32, 32),那么grid大小为(32, 32)
  111. dim3 blockSize(32, 32);
  112. dim3 gridSize((width + blockSize.x - 1) / blockSize.x,
  113. (height + blockSize.y - 1) / blockSize.y);
  114. // Launch a kernel on the GPU with one thread for each element.
  115. matMulKernel << <gridSize, blockSize >> > (dev_a, dev_b, dev_c);
  116. // 同步device数据保证结果能正确访问
  117. //cudaDeviceSynchronize();
  118. // Check for any errors launching the kernel
  119. cudaStatus = cudaGetLastError();
  120. if (cudaStatus != cudaSuccess) {
  121. fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
  122. goto Error;
  123. }
  124. // cudaDeviceSynchronize waits for the kernel to finish, and returns
  125. // any errors encountered during the launch.
  126. cudaStatus = cudaDeviceSynchronize();
  127. if (cudaStatus != cudaSuccess) {
  128. fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
  129. goto Error;
  130. }
  131. // Copy output vector from GPU buffer to host memory.
  132. cudaStatus = cudaMemcpy(C, dev_c, nBytes, cudaMemcpyDeviceToHost);
  133. if (cudaStatus != cudaSuccess) {
  134. fprintf(stderr, "cudaMemcpy failed!");
  135. goto Error;
  136. }
  137. Error:
  138. cudaFree(dev_c);
  139. cudaFree(dev_a);
  140. cudaFree(dev_b);
  141. return cudaStatus;
  142. }
  143. // 设备测试
  144. void getCudaInformation()
  145. {
  146. int deviceCount;
  147. cudaGetDeviceCount(&deviceCount);
  148. int dev;
  149. for (dev = 0; dev < deviceCount; dev++) {
  150. int driver_version(0), runtime_version(0);
  151. cudaDeviceProp deviceProp;
  152. cudaGetDeviceProperties(&deviceProp, dev);
  153. if (dev == 0)
  154. if (deviceProp.minor = 9999 && deviceProp.major == 9999)
  155. printf("\n");
  156. printf("\nDevice%d:\"%s\"\n", dev, deviceProp.name);
  157. cudaDriverGetVersion(&driver_version);
  158. printf("CUDA驱动版本: %d.%d\n", driver_version / 1000, (driver_version % 1000) / 10);
  159. cudaRuntimeGetVersion(&runtime_version);
  160. printf("CUDA运行时版本: %d.%d\n", runtime_version / 1000, (runtime_version % 1000) / 10);
  161. printf("设备计算能力: %d.%d\n", deviceProp.major, deviceProp.minor);
  162. printf("Total amount of Global Memory: %u bytes\n", deviceProp.totalGlobalMem);
  163. printf("Number of SMs: %d\n", deviceProp.multiProcessorCount);
  164. printf("Total amount of Constant Memory: %u bytes\n", deviceProp.totalConstMem);
  165. printf("Total amount of Shared Memory per block: %u bytes\n", deviceProp.sharedMemPerBlock);
  166. printf("Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
  167. printf("Warp size: %d\n", deviceProp.warpSize);
  168. printf("Maximum number of threads per SM: %d\n", deviceProp.maxThreadsPerMultiProcessor);
  169. printf("Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock);
  170. printf("Maximum size of each dimension of a block: %d x %d x %d\n", deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]);
  171. printf("Maximum size of each dimension of a grid: %d x %d x %d\n", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]);
  172. printf("Maximum memory pitch: %u bytes\n", deviceProp.memPitch);
  173. printf("Texture alignmemt: %u bytes\n", deviceProp.texturePitchAlignment);
  174. printf("Clock rate: %.2f GHz\n", deviceProp.clockRate * 1e-6f);
  175. printf("Memory Clock rate: %.0f MHz\n", deviceProp.memoryClockRate * 1e-3f);
  176. printf("Memory Bus Width: %d-bit\n", deviceProp.memoryBusWidth);
  177. printf("--------------------------------------------------------------\n");
  178. }
  179. }

运行结果:

 基于GPU实现了8*8的矩阵相乘。

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/AllinToyou/article/detail/245693
推荐阅读
相关标签
  

闽ICP备14008679号