#include <iostream>
#include <time.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <device_launch_parameters.h>
#include <opencv2/opencv.hpp>

/*================================
 * @brief Many blocks, many threads, using an atomic add.
 * The code is simple, but atomic operations serialize access to the data,
 * so frequent atomics hurt performance. Here atomicAdd is executed
 * blockDim.x * gridDim.x times (once per thread).
 =================================*/
__global__ void sumOfCuda1(float* pData, float* pSum, int N)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int nStep = blockDim.x * gridDim.x;

    double dSum = 0.0;
    for (int i = tid; i < N; i += nStep)
    {
        dSum += pData[i];
    }
    atomicAdd(pSum, (float)dSum); // one atomic add per thread
}

const int N = 5120 * 5120;
const int THREAD_NUM = 1024;  // a block may hold at most 1024 threads
const int BLOCK_NUM = 2048;

/*================================
 * @brief Each thread first accumulates its own partial sum and stores it in
 * shared memory; the block then reduces those partial sums, and finally the
 * per-block results are combined with a single atomicAdd per block.
 * A variable declared __shared__ lives in shared memory, which is shared by
 * every thread of the block; it is on-chip and fast, so latency is not a
 * concern. __syncthreads() is a CUDA intrinsic: every thread of the block
 * must reach this point before any thread continues. When accessing shared
 * memory, watch out for bank conflicts.
 =================================*/
__global__ void sumOfCuda2(float* pfData, float* pSum, int N)
{
    __shared__ double share_dTemp[THREAD_NUM];
    const int nStep = gridDim.x * blockDim.x;
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;

    double dTempSum = 0.0;
    for (int i = tid; i < N; i += nStep)
    {
        dTempSum += pfData[i];
    }
    share_dTemp[threadIdx.x] = dTempSum;
    __syncthreads(); // wait until every thread has written its partial sum

    // Each slot of shared memory now holds one thread's partial sum.
    // Reduce the blockDim.x partial sums of this block in shared memory.
    for (int i = blockDim.x / 2; i != 0; i /= 2)
    {
        if (threadIdx.x < i)
        {
            share_dTemp[threadIdx.x] += share_dTemp[threadIdx.x + i];
        }
        __syncthreads();
    }

    if (0 == threadIdx.x)
    {
        atomicAdd(pSum, (float)share_dTemp[0]); // one atomic add per block
    }
}

int main()
{
    cv::Mat matBgrImg = cv::imread("1.jpg");
    cv::resize(matBgrImg, matBgrImg, cv::Size(5120, 5120));
    cv::Mat matGrayImg;
    cv::cvtColor(matBgrImg, matGrayImg, cv::COLOR_BGR2GRAY);

    cv::Mat matF32;
    matGrayImg.convertTo(matF32, CV_32FC1);
    matF32 = matF32 / 255.;

    int nStep = matF32.step;
    printf("matF32 h = %d, w = %d, channel = %d, step = %d \n",
           matF32.rows, matF32.cols, matF32.channels(), nStep);

    const int LOOPS = 10000;

    // CPU reference: sum the image LOOPS times with cv::sum
    clock_t t0 = clock();
    for (int i = 0; i < LOOPS; i++)
    {
        cv::sum(matF32)[0];
    }
    clock_t t1 = clock();
    std::cout << "cpu cost time is " << 1000.0 * (t1 - t0) / CLOCKS_PER_SEC << " ms" << std::endl;

    // Copy the image data to the device once
    void* pvData = malloc(N * sizeof(float));
    memcpy(pvData, (unsigned char*)matF32.data, N * sizeof(float));
    float* pfData_dev = NULL;
    cudaMalloc((void**)&pfData_dev, N * sizeof(float));
    cudaMemcpy(pfData_dev, pvData, N * sizeof(float), cudaMemcpyHostToDevice);

    float fSum = 0.0;
    float* pfSum_dev = NULL;
    cudaMalloc((void**)&pfSum_dev, THREAD_NUM * sizeof(float));
    // printf("block num = %d, thread num = %d \n", BLOCK_NUM, THREAD_NUM);

    clock_t start = clock();
    for (int i = 0; i < LOOPS; ++i)
    {
        cudaMemset(pfSum_dev, 0, THREAD_NUM * sizeof(float));
        sumOfCuda2<<<BLOCK_NUM, THREAD_NUM>>>(pfData_dev, pfSum_dev, N);

        float pfSum[THREAD_NUM] = { 0.0 };
        cudaMemcpy(pfSum, pfSum_dev, THREAD_NUM * sizeof(float), cudaMemcpyDeviceToHost);
        fSum = 0.0;
        for (int j = 0; j < THREAD_NUM; ++j)
        {
            fSum += pfSum[j];
        }
    }
    clock_t t2 = clock();
    std::cout << "gpu cost time is " << 1000.0 * (t2 - start) / CLOCKS_PER_SEC << " ms" << std::endl;
    // std::cout << fSum << std::endl;

    cudaFree(pfSum_dev);
    cudaFree(pfData_dev);
    free(pvData);
    return 0;
}
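The shared-memory reduction in sumOfCuda2 can also be done entirely in registers with warp shuffles, which avoids shared memory (and therefore bank conflicts) and needs no __syncthreads() inside a warp. The kernel below is a sketch, not part of the original code: the name sumOfCudaShfl is invented here, and it assumes a CUDA 9+ toolchain (for __shfl_down_sync) and that blockDim.x is a multiple of warpSize, which holds for the 1024-thread launch above.

__global__ void sumOfCudaShfl(const float* pData, float* pSum, int N)
{
    // Grid-stride accumulation, exactly as in the kernels above.
    double dSum = 0.0;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x)
    {
        dSum += pData[i];
    }

    // Reduce within each warp using register shuffles: no shared memory,
    // no bank conflicts, no __syncthreads().
    float fVal = (float)dSum;
    for (int nOffset = warpSize / 2; nOffset > 0; nOffset /= 2)
    {
        fVal += __shfl_down_sync(0xffffffff, fVal, nOffset);
    }

    // Lane 0 of every warp now holds that warp's sum: one atomic add per warp.
    if ((threadIdx.x & (warpSize - 1)) == 0)
    {
        atomicAdd(pSum, fVal);
    }
}

It can be launched exactly like sumOfCuda2, e.g. sumOfCudaShfl<<<BLOCK_NUM, THREAD_NUM>>>(pfData_dev, pfSum_dev, N); the number of atomics drops from one per block to one per warp, which is still far fewer than the one-per-thread scheme of sumOfCuda1.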
#include "cuda_runtime.h" #include "device_launch_parameters.h" #include "npp.h" #include <opencv2/opencv.hpp> int main() { cv::Mat matBrgImg = cv::imread("1.jpg"); int nWidth = matBrgImg.cols; int nHeight = matBrgImg.rows; int nStep = matBrgImg.step; // 每一行的步长,这里 = nWidth * 3 cv::Mat matYuvImg; cv::cvtColor(matBrgImg, matYuvImg, cv::COLOR_BGR2YUV); Npp8u* pu8YUV_dev = NULL; cudaMalloc((void**)& pu8YUV_dev, nWidth * nHeight * 3 * sizeof(Npp8u)); cudaMemcpy(pu8YUV_dev, (Npp8u*)matYuvImg.data, nWidth * nHeight * 3 * sizeof(Npp8u), cudaMemcpyHostToDevice); NppStatus nppRet = NPP_NO_ERROR; NppiSize nppSize{ nWidth, nHeight }; int nLineStep_npp = 0; Npp8u* pu8BGR_dev = nppiMalloc_8u_C3(nWidth, nHeight, &nLineStep_npp); printf("nLineStep_npp = %d \n", nLineStep_npp); nppRet = nppiYUVToBGR_8u_C3R(pu8YUV_dev, nStep, pu8BGR_dev, nStep, nppSize); printf("nppRet = %d \n", nppRet); unsigned char* pu8Bgr_host = NULL; pu8Bgr_host = (unsigned char*)malloc(nWidth * nHeight * 3); memset(pu8Bgr_host, 0, nWidth * nHeight * 3); cudaMemcpy(pu8Bgr_host, pu8BGR_dev, nWidth * nHeight * 3, cudaMemcpyDeviceToHost); cv::Mat newimage(nHeight, nWidth, CV_8UC3); memcpy(newimage.data, pu8Bgr_host, nWidth * nHeight * 3); cv::imwrite("YUV2BGR.jpg", newimage); if (NULL != pu8BGR_dev) { nppiFree(pu8BGR_dev); pu8BGR_dev = NULL; } if (NULL != pu8YUV_dev) { cudaFree(pu8YUV_dev); pu8YUV_dev = NULL; } if (NULL != pu8Bgr_host) { free(pu8Bgr_host); pu8Bgr_host = NULL; } return 0; }
#include <stdio.h>
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

const int N = 2048;
const int threadnum = 32; // 32 threads per block

/* CPU dot product, used as the reference result */
template <typename T>
void dot_cpu(T* a, T* b, T* c, int n)
{
    double dTemp = 0;
    for (int i = 0; i < n; ++i)
    {
        dTemp += a[i] * b[i];
    }
    *c = dTemp;
}

/* Single block, interleaved (scattered) reduction */
template <typename T>
__global__ void dot_gpu_1(T* a, T* b, T* c, int n)
{
    __shared__ T tmp[threadnum];
    const int tid = threadIdx.x; // thread index within the block
    const int t_n = blockDim.x;  // number of threads in the block

    int nTid = tid;
    double dTemp = 0.0;
    while (nTid < n)
    {
        dTemp += a[nTid] * b[nTid];
        nTid += t_n;
    }
    tmp[tid] = dTemp; // store each thread's partial dot product in shared memory
    __syncthreads();  // wait until every thread has finished the step above

    // Interleaved reduction: in round k, threads whose index is a multiple of 2^k
    // add in the element 2^(k-1) positions to their right.
    int i = 2, j = 1;
    while (i <= threadnum)
    {
        if (tid % i == 0)
        {
            tmp[tid] += tmp[tid + j];
        }
        __syncthreads();
        i *= 2;
        j *= 2;
    }

    if (0 == tid)
    {
        c[0] = tmp[0];
    }
}

/* Single block, sequential (low-index-thread) reduction */
template <typename T>
__global__ void dot_gpu_2(T* a, T* b, T* c, int n)
{
    __shared__ T tmp[threadnum];
    const int nThreadIdX = threadIdx.x; // thread index within the block
    const int nBlockDimX = blockDim.x;  // number of threads in the block

    int nTid = nThreadIdX;
    double dTemp = 0.0;
    while (nTid < n)
    {
        dTemp += a[nTid] * b[nTid];
        nTid += nBlockDimX;
    }
    tmp[nThreadIdX] = dTemp; // store each thread's partial dot product in shared memory
    __syncthreads();         // wait until every thread has finished the step above

    // Sequential reduction: the lower half of the threads adds in the upper half.
    int i = threadnum / 2;
    while (i != 0)
    {
        if (nThreadIdX < i)
        {
            tmp[nThreadIdX] += tmp[nThreadIdX + i];
        }
        __syncthreads();
        i /= 2;
    }

    if (0 == nThreadIdX)
    {
        c[0] = tmp[0];
    }
}

/* Multiple blocks, multiple threads */
template <typename T>
__global__ void dot_gpu_3(T* a, T* b, T* c, int n)
{
    __shared__ T aTmp[threadnum];
    const int nThreadIdX = threadIdx.x;                  // thread index within the block
    const int nStep = gridDim.x * blockDim.x;            // stride = total number of threads
    int nTidIdx = blockIdx.x * blockDim.x + threadIdx.x; // global thread index

    double dTemp = 0.0;
    while (nTidIdx < n)
    {
        dTemp += a[nTidIdx] * b[nTidIdx];
        nTidIdx += nStep;
    }
    aTmp[nThreadIdX] = dTemp; // store the partial dot product in this block's shared memory
    __syncthreads();          // wait until every thread has finished the step above

    int i = threadnum / 2;
    while (i != 0)
    {
        if (nThreadIdX < i)
        {
            aTmp[nThreadIdX] += aTmp[nThreadIdX + i];
        }
        __syncthreads();
        i /= 2;
    }

    if (0 == nThreadIdX)
    {
        c[blockIdx.x] = aTmp[0]; // one partial result per block; c must hold gridDim.x elements
    }
}

int main()
{
    float a[N], b[N];
    float c = 0;
    for (int i = 0; i < N; ++i) // fill the input vectors
    {
        a[i] = i * 1.0;
        b[i] = 1.0;
    }

    float* d_a = 0, * d_b = 0, * d_c = 0;
    cudaMalloc(&d_a, N * sizeof(float));
    cudaMemcpy(d_a, a, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMalloc(&d_b, N * sizeof(float));
    cudaMemcpy(d_b, b, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMalloc(&d_c, sizeof(float));

    dot_cpu(a, b, &c, N);
    //dot_gpu_1<<<1, threadnum>>>(d_a, d_b, d_c, N);
    //dot_gpu_2<<<1, threadnum>>>(d_a, d_b, d_c, N);
    //dot_gpu_3<<<1, threadnum>>>(d_a, d_b, d_c, N);
    //cudaMemcpy(&c, d_c, sizeof(float), cudaMemcpyDeviceToHost);
    std::cout << c << std::endl;

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}
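Note that dot_gpu_3 writes one partial result per block, so once it is launched with more than one block, the output buffer must hold gridDim.x floats and the per-block sums still have to be added on the host (or by a second kernel). The fragment below is a sketch of how the launch section of main could look; it is not in the original listing, the block count blocknum = 4 is chosen only for illustration, and d_a, d_b, N and threadnum are taken from the code above.

    // Hypothetical multi-block driver for dot_gpu_3.
    const int blocknum = 4;
    float partial[blocknum] = { 0.f };
    float* d_partial = 0;
    cudaMalloc(&d_partial, blocknum * sizeof(float));

    dot_gpu_3<<<blocknum, threadnum>>>(d_a, d_b, d_partial, N); // one partial sum per block
    cudaMemcpy(partial, d_partial, blocknum * sizeof(float), cudaMemcpyDeviceToHost);

    float result = 0.f;
    for (int i = 0; i < blocknum; ++i) // final reduction of the per-block sums on the CPU
    {
        result += partial[i];
    }
    std::cout << result << std::endl; // should match the dot_cpu result for this small N
    cudaFree(d_partial);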