
CUDA Image Processing with the NPP Library: Joint CUDA and OpenCV Programming

Summing All Pixel Values of an Image

#include <iostream>
#include <time.h>
#include <npp.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <device_launch_parameters.h>
#include <cooperative_groups.h>
#include <opencv2/opencv.hpp>


/*================================
* @brief Multiple blocks, multiple threads.
* Uses an atomic add: the code is simple, but atomic operations serialize access
* to the target address, so frequent atomics hurt performance.
* Here atomicAdd is called blockDim.x * gridDim.x times in total.
=================================*/
__global__ void sumOfCuda1(float* pData, float* pSum, int N)
{

	const int tid = blockIdx.x * blockDim.x + threadIdx.x;
	const int nStep = blockDim.x * gridDim.x;

	double dSum = 0.0;
	for (int i = tid; i < N; i += nStep)
	{
		dSum += pData[i];
	}
	atomicAdd(pSum, (float)dSum); // atomically accumulate this thread's partial sum into the global result
}

const int N = 5120 * 5120;
const int THREAD_NUM = 1024; // CUDA allows at most 1024 threads per block; 2048 would make the launch fail
const int BLOCK_NUM = 2048;

/*================================
* @brief Each thread first accumulates its own partial sum. The partial sums are
* then reduced within the block through shared memory, and finally one atomicAdd
* per block combines the block results.
* A variable declared with __shared__ lives in shared memory, which is shared by
* every thread of the block and has much lower latency than global memory.
* __syncthreads() is a CUDA intrinsic: every thread of the block must reach it
* before any thread continues. When working with shared memory, also watch out
* for bank conflicts.
=================================*/
__global__ void sumOfCuda2(float* pfData, float* pSum, int N)
{
	// printf("blockDim.x = %d\n", blockDim.x);
	__shared__ double share_dTemp[THREAD_NUM];
	const int nStep = gridDim.x * blockDim.x;
	const int tid = blockIdx.x * blockDim.x + threadIdx.x;
	double dTempSum = 0.0;
	for (int i = tid; i < N; i += nStep)
	{
		dTempSum += pfData[i];
	}
	share_dTemp[threadIdx.x] = dTempSum;
	__syncthreads(); // wait until every thread of the block has stored its partial sum

	// Each of the blockDim.x threads now holds one partial sum in shared memory;
	// reduce them within the block by halving the number of active threads each step.

	for (int i = blockDim.x / 2; i != 0; i /= 2)
	{
		if (threadIdx.x < i)
		{
			share_dTemp[threadIdx.x] += share_dTemp[threadIdx.x + i];
		}
		__syncthreads();
	}

	if (0 == threadIdx.x)
	{
		atomicAdd(pSum, share_dTemp[0]);
	}
}
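As the comment above notes, a shared-memory reduction still has to care about bank conflicts and ends with one atomicAdd per block. A further variant (not part of the original code, just a sketch) reduces each warp with __shfl_down_sync, which needs no shared memory inside a warp. The kernel name sumOfCuda3 is made up here, and the sketch assumes blockDim.x is a multiple of warpSize (true for 1024 threads).

// Hypothetical variant: warp-shuffle reduction, then one atomicAdd per block.
__global__ void sumOfCuda3(float* pfData, float* pSum, int N)
{
	const int tid = blockIdx.x * blockDim.x + threadIdx.x;
	const int nStep = gridDim.x * blockDim.x;

	float fPartial = 0.0f;
	for (int i = tid; i < N; i += nStep)
	{
		fPartial += pfData[i];
	}

	// Reduce within each warp; afterwards lane 0 of every warp holds that warp's sum.
	for (int offset = warpSize / 2; offset > 0; offset /= 2)
	{
		fPartial += __shfl_down_sync(0xffffffff, fPartial, offset);
	}

	__shared__ float warpSums[32]; // one slot per warp, enough for blockDim.x <= 1024
	const int lane = threadIdx.x % warpSize;
	const int warpId = threadIdx.x / warpSize;
	if (lane == 0)
	{
		warpSums[warpId] = fPartial;
	}
	__syncthreads();

	// Let the first warp reduce the per-warp sums, then do a single atomicAdd per block.
	if (warpId == 0)
	{
		const int numWarps = (blockDim.x + warpSize - 1) / warpSize;
		float fBlockSum = (lane < numWarps) ? warpSums[lane] : 0.0f;
		for (int offset = warpSize / 2; offset > 0; offset /= 2)
		{
			fBlockSum += __shfl_down_sync(0xffffffff, fBlockSum, offset);
		}
		if (lane == 0)
		{
			atomicAdd(pSum, fBlockSum);
		}
	}
}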


int main()
{
	cv::Mat matBgrImg = cv::imread("1.jpg");
	cv::resize(matBgrImg, matBgrImg, cv::Size(5120, 5120));
	cv::Mat matGrayIMg;
	cv::cvtColor(matBgrImg, matGrayIMg, cv::COLOR_BGR2GRAY);
	cv::Mat matF32;
	matGrayIMg.convertTo(matF32, CV_32FC1);
	matF32 = matF32 / 255.;
	int nH = matF32.rows;
	int nW = matF32.cols;
	int nStep = matF32.step;
	printf("matF32 h = %d, w = %d, channel = %d, step = %d \n", matF32.rows, matF32.cols, matF32.channels(), nStep);

	int LOOPS = 10000;

	clock_t t0 = clock();
	for (int i = 0; i < LOOPS; i++)
	{
		cv::sum(matF32)[0];
	}
	clock_t t1 = clock();
	std::cout << "cpu cost time is " << 1000.0 * (t1 - t0) / CLOCKS_PER_SEC << " ms" << std::endl;

	void* pvData = malloc(1 * 1 * N * sizeof(float));
	memcpy(pvData, (unsigned char*)matF32.data, N * sizeof(float));
	float* pfData_dev = NULL;

	cudaMalloc((void**)& pfData_dev, N * sizeof(float));
	cudaMemcpy(pfData_dev, pvData, N * sizeof(float), cudaMemcpyHostToDevice);

	float fSum = 0.0;
	clock_t start = clock();
	float* pfSum_dev = NULL;
	cudaMalloc((void**)& pfSum_dev, THREAD_NUM * sizeof(float));

	//printf("block num = %d, thread num = %d \n", BLOCK_NUM, THREAD_NUM);

	for (int i = 0; i < LOOPS; ++i)
	{
		cudaMemset(pfSum_dev, 0, THREAD_NUM * sizeof(float));
		sumOfCuda2<<<BLOCK_NUM, THREAD_NUM>>>(pfData_dev, pfSum_dev, N);

		float pfSum[THREAD_NUM] = { 0.0 };
		cudaMemcpy(pfSum, pfSum_dev, THREAD_NUM * sizeof(float), cudaMemcpyDeviceToHost);
		fSum = 0.0;
		for (int j = 0; j < THREAD_NUM; ++j)
		{
			fSum += pfSum[j];
		}
	}
	
	clock_t t2 = clock();
	std::cout << "gpu cost time is " << 1000.0 * (t2 - start) / CLOCKS_PER_SEC << " ms" << std::endl;

	//std::cout << fSum << std::endl;

	cudaFree(pfSum_dev);
	cudaFree(pfData_dev);
	free(pvData);
	return 0;
}
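Since the article is about NPP, the same reduction can also be delegated to the library instead of a hand-written kernel. The sketch below assumes the standard NPP image-sum API (nppiSumGetBufferHostSize_32f_C1R / nppiSum_32f_C1R); check the exact signatures against your NPP version, and note that both the scratch buffer and the Npp64f result must live in device memory.

// Sketch: summing the same CV_32FC1 image with NPP (assumed API, verify against your NPP version).
// pfData_dev is the continuous device copy of the image, as prepared in main above.
double sumWithNpp(const Npp32f* pfData_dev, int nW, int nH)
{
	NppiSize oRoi{ nW, nH };
	int nSrcStep = nW * sizeof(Npp32f); // row pitch of the device buffer in bytes

	int nBufferSize = 0;
	nppiSumGetBufferHostSize_32f_C1R(oRoi, &nBufferSize); // query scratch-buffer size

	Npp8u* pScratch_dev = NULL;
	cudaMalloc((void**)&pScratch_dev, nBufferSize);

	Npp64f* pdSum_dev = NULL;
	cudaMalloc((void**)&pdSum_dev, sizeof(Npp64f));

	nppiSum_32f_C1R(pfData_dev, nSrcStep, oRoi, pScratch_dev, pdSum_dev);

	Npp64f dSum = 0.0;
	cudaMemcpy(&dSum, pdSum_dev, sizeof(Npp64f), cudaMemcpyDeviceToHost);

	cudaFree(pdSum_dev);
	cudaFree(pScratch_dev);
	return (double)dSum;
}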

YUV to BGR Conversion

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "npp.h"

#include <opencv2/opencv.hpp>

int main()
{
	cv::Mat matBrgImg = cv::imread("1.jpg");
	int nWidth = matBrgImg.cols;
	int nHeight = matBrgImg.rows;
	int nStep = matBrgImg.step; // bytes per row; here this equals nWidth * 3
	cv::Mat matYuvImg;
	cv::cvtColor(matBrgImg, matYuvImg, cv::COLOR_BGR2YUV);

	Npp8u* pu8YUV_dev = NULL;
	cudaMalloc((void**)& pu8YUV_dev, nWidth * nHeight * 3 * sizeof(Npp8u));
	cudaMemcpy(pu8YUV_dev, (Npp8u*)matYuvImg.data, nWidth * nHeight * 3 * sizeof(Npp8u), cudaMemcpyHostToDevice);

	NppStatus nppRet = NPP_NO_ERROR;
	NppiSize nppSize{ nWidth, nHeight };
	int nLineStep_npp = 0;
	Npp8u* pu8BGR_dev = nppiMalloc_8u_C3(nWidth, nHeight, &nLineStep_npp);
	printf("nLineStep_npp = %d \n", nLineStep_npp);

	nppRet = nppiYUVToBGR_8u_C3R(pu8YUV_dev, nStep, pu8BGR_dev, nLineStep_npp, nppSize);
	printf("nppRet = %d \n", nppRet);

	// The NPP buffer is pitched: nLineStep_npp may be larger than nWidth * 3,
	// so copy it back row by row with cudaMemcpy2D instead of a flat cudaMemcpy.
	unsigned char* pu8Bgr_host = (unsigned char*)malloc(nWidth * nHeight * 3);
	cudaMemcpy2D(pu8Bgr_host, nWidth * 3, pu8BGR_dev, nLineStep_npp,
		nWidth * 3, nHeight, cudaMemcpyDeviceToHost);

	cv::Mat newimage(nHeight, nWidth, CV_8UC3);
	memcpy(newimage.data, pu8Bgr_host, nWidth * nHeight * 3);

	cv::imwrite("YUV2BGR.jpg", newimage);

	if (NULL != pu8BGR_dev)
	{
		nppiFree(pu8BGR_dev);
		pu8BGR_dev = NULL;
	}

	if (NULL != pu8YUV_dev)
	{
		cudaFree(pu8YUV_dev);
		pu8YUV_dev = NULL;
	}

	if (NULL != pu8Bgr_host)
	{
		free(pu8Bgr_host);
		pu8Bgr_host = NULL;
	}

	return 0;
}
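For the "joint programming" part, the explicit host round trip above can also be handed to OpenCV by viewing the pitched NPP buffer as a cv::cuda::GpuMat. This is only a sketch, not part of the original code, and it requires OpenCV built with the CUDA modules; downloadNppBgr is a hypothetical helper name.

// Sketch: let OpenCV perform the pitched device-to-host copy.
#include <opencv2/core/cuda.hpp>

cv::Mat downloadNppBgr(Npp8u* pu8BGR_dev, int nWidth, int nHeight, int nLineStep_npp)
{
	// GpuMat can reference external device memory with an explicit step (no copy here).
	cv::cuda::GpuMat gpuBgr(nHeight, nWidth, CV_8UC3, pu8BGR_dev, nLineStep_npp);

	cv::Mat hostBgr;
	gpuBgr.download(hostBgr); // pitched device-to-host copy handled by OpenCV
	return hostBgr;
}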

Vector Dot Product (CPU and GPU Reductions)

#include <stdio.h>
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"


const int N = 2048;
const int threadnum = 32; // 32 threads per block


/* CPU vector dot product */
template <typename T>
void dot_cpu(T* a, T* b, T* c, int n)
{
	double dTemp = 0;
	for (int i = 0; i < n; ++i)
	{
		dTemp += a[i] * b[i];
	}
	*c = dTemp;
}


/* single block, interleaved-addressing reduction */
template <typename T>
__global__ void dot_gpu_1(T* a, T* b, T* c, int n)
{
	__shared__ T tmp[threadnum];
	const int tid = threadIdx.x; // thread index within the block
	const int t_n = blockDim.x; // number of threads in the block
	int nTid = tid;
	double dTemp = 0.0;
	while (nTid < n)
	{
		dTemp += a[nTid] * b[nTid];
		nTid += t_n;
	}
	tmp[tid] = dTemp; // store each thread's partial dot product in shared memory
	__syncthreads(); // wait until every thread has finished the step above

	int i = 2, j = 1;
	while (i <= threadnum)
	{
		if (tid % i == 0)
		{
			tmp[tid] += tmp[tid + j];
		}
		__syncthreads();
		i *= 2;
		j *= 2;
	}
	if (0 == tid)
	{
		c[0] = tmp[0];
	}
}

/* single block, sequential-addressing reduction (fewer active threads each step) */
template <typename T>
__global__ void dot_gpu_2(T* a, T* b, T* c, int n)
{
	__shared__ T tmp[threadnum];
	const int nThreadIdX = threadIdx.x; // thread index within the block
	const int nBlockDimX = blockDim.x; // number of threads in the block
	int nTid = nThreadIdX;
	double dTemp = 0.0;
	while (nTid < n)
	{
		dTemp += a[nTid] * b[nTid];
		nTid += nBlockDimX;
	}
	tmp[nThreadIdX] = dTemp; // store each thread's partial dot product in shared memory
	__syncthreads(); // wait until every thread has finished the step above

	int i = threadnum / 2;
	while (i != 0)
	{
		if (nThreadIdX < i)
		{
			tmp[nThreadIdX] += tmp[nThreadIdX + i];
		}
		__syncthreads(); // wait until every thread has finished this reduction step
		i /= 2;
	}
	if (0 == nThreadIdX)
	{
		c[0] = tmp[0];
	}
}

/* multi-block, multi-thread dot product */
template <typename T>
__global__ void dot_gpu_3(T* a, T* b, T* c, int n)
{
	__shared__ T aTmp[threadnum];
	const int nThreadIdX = threadIdx.x; // thread index within the block
	const int nStep = gridDim.x * blockDim.x; // stride: the total number of launched threads
	int nTidIdx = blockIdx.x * blockDim.x + threadIdx.x; // global thread index

	double dTemp = 0.0;
	while (nTidIdx < n)
	{
		dTemp += a[nTidIdx] * b[nTidIdx];
		nTidIdx += nStep;
	}
	aTmp[nThreadIdX] = dTemp; // store each thread's partial dot product in this block's shared memory
	__syncthreads(); // wait until every thread has finished the step above

	int i = threadnum / 2;
	while (i != 0)
	{
		if (nThreadIdX < i)
		{
			aTmp[nThreadIdX] += aTmp[nThreadIdX + i];
		}
		__syncthreads(); // wait until every thread has finished this reduction step
		i /= 2;
	}

	if (0 == nThreadIdX)
	{
		c[blockIdx.x] = aTmp[0];
	}

}


int main()
{
	float a[N], b[N];
	float c = 0;
	for (int i = 0; i < N; ++i) // initialize arrays a and b
	{
		a[i] = i * 1.0;
		b[i] = 1.0;
	}

	float* d_a = 0, * d_b = 0, * d_c = 0;
	cudaMalloc(&d_a, N * sizeof(float));
	cudaMemcpy(d_a, a, N * sizeof(float), cudaMemcpyHostToDevice);

	cudaMalloc(&d_b, N * sizeof(float));
	cudaMemcpy(d_b, b, N * sizeof(float), cudaMemcpyHostToDevice);

	cudaMalloc(&d_c, sizeof(float));
	dot_cpu(a, b, &c, N);
	//dot_gpu_1<<<1, threadnum>>>(d_a, d_b, d_c, N);
	//dot_gpu_2<<<1, threadnum>>>(d_a, d_b, d_c, N);
	//dot_gpu_3<<<1, threadnum>>>(d_a, d_b, d_c, N);
	//cudaMemcpy(&c, d_c, sizeof(float), cudaMemcpyDeviceToHost);
	std::cout << c << std::endl;

	cudaFree(d_a);
	cudaFree(d_b);
	cudaFree(d_c);
	return 0;
}
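dot_gpu_3 writes one partial result per block into c[blockIdx.x], so a multi-block launch needs the host (or another kernel) to add up the block partials; the commented-out calls above only ever use a single block. The fragment below is a sketch of the multi-block path, meant to sit inside main after d_a and d_b have been copied to the device; the block count of 8 is purely illustrative.

	// Sketch: multi-block launch of dot_gpu_3 plus host-side accumulation of the block partials.
	const int blocknum = 8; // illustrative block count, not from the original code

	float* d_partial = NULL;
	cudaMalloc(&d_partial, blocknum * sizeof(float));

	dot_gpu_3<<<blocknum, threadnum>>>(d_a, d_b, d_partial, N);

	float partial[blocknum] = { 0 };
	cudaMemcpy(partial, d_partial, blocknum * sizeof(float), cudaMemcpyDeviceToHost);

	float result = 0.0f;
	for (int i = 0; i < blocknum; ++i)
	{
		result += partial[i]; // final reduction of the per-block sums on the CPU
	}
	std::cout << "gpu dot product = " << result << std::endl;

	cudaFree(d_partial);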