1. Introduction
This article walks through a small example of CUDA parallel acceleration: speeding up nearest-neighbor interpolation for image resizing.
2. Code Implementation
In nearest-neighbor scaling, every destination pixel (i, j) is computed by the same rule, independently of all the others: it simply copies the source pixel at (round(i / fRows), round(j / fCols)), where fRows and fCols are the vertical and horizontal scale factors. Since no output pixel depends on any other, all of them can be computed in parallel; OpenCV's resize is parallelized in the same spirit.
//main.cu
#include "cuda_runtime.h"

#include <windows.h>
#include <iostream>

#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>

using namespace cv;
using namespace std;

// CPU reference implementation: nearest-neighbor resize
void resizeImage(const Mat &_src, Mat &_dst, const Size &s)
{
    _dst = Mat::zeros(s, CV_8UC3);
    double fRows = s.height / (double)_src.rows;  // vertical scale factor
    double fCols = s.width / (double)_src.cols;   // horizontal scale factor
    int pX = 0;
    int pY = 0;
    for (int i = 0; i != _dst.rows; ++i){
        for (int j = 0; j != _dst.cols; ++j){
            // map each destination pixel back to its nearest source pixel
            pX = cvRound(i / fRows);
            pY = cvRound(j / fCols);
            if (pX < _src.rows && pX >= 0 && pY < _src.cols && pY >= 0){
                _dst.at<Vec3b>(i, j)[0] = _src.at<Vec3b>(pX, pY)[0];
                _dst.at<Vec3b>(i, j)[1] = _src.at<Vec3b>(pX, pY)[1];
                _dst.at<Vec3b>(i, j)[2] = _src.at<Vec3b>(pX, pY)[2];
            }
        }
    }
}

// Pick the first CUDA-capable device (compute capability >= 1.0)
bool initCUDA()
{
    int count;
    cudaGetDeviceCount(&count);
    if (count == 0){
        fprintf(stderr, "There is no device.\n");
        return false;
    }

    int i;
    for (i = 0; i < count; i++){
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess){
            if (prop.major >= 1){
                break;
            }
        }
    }

    if (i == count){
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }

    cudaSetDevice(i);
    return true;
}

// One thread handles one destination pixel; the grid is laid out as
// (dst rows, dst cols) with a single thread per block.
__global__ void kernel(uchar* _src_dev, uchar* _dst_dev, int _src_step, int _dst_step,
    int _src_rows, int _src_cols, int _dst_rows, int _dst_cols)
{
    int i = blockIdx.x;  // destination row
    int j = blockIdx.y;  // destination column

    double fRows = _dst_rows / (float)_src_rows;
    double fCols = _dst_cols / (float)_src_cols;

    int pX = (int)(i / fRows);
    int pY = (int)(j / fCols);
    if (pX < _src_rows && pX >= 0 && pY < _src_cols && pY >= 0){
        // copy the three channels of the nearest source pixel
        *(_dst_dev + i*_dst_step + 3 * j + 0) = *(_src_dev + pX*_src_step + 3 * pY + 0);
        *(_dst_dev + i*_dst_step + 3 * j + 1) = *(_src_dev + pX*_src_step + 3 * pY + 1);
        *(_dst_dev + i*_dst_step + 3 * j + 2) = *(_src_dev + pX*_src_step + 3 * pY + 2);
    }
}

// GPU implementation: upload the source image, launch one block per
// destination pixel, then copy the result back.
// Assumes both Mats are continuous (step == 3 * cols).
void resizeImageGpu(const Mat &_src, Mat &_dst, const Size &s)
{
    _dst = Mat(s, CV_8UC3);
    uchar *src_data = _src.data;
    int width = _src.cols;
    int height = _src.rows;
    uchar *src_dev, *dst_dev;

    cudaMalloc((void**)&src_dev, 3 * width * height * sizeof(uchar));
    cudaMalloc((void**)&dst_dev, 3 * s.width * s.height * sizeof(uchar));
    cudaMemcpy(src_dev, src_data, 3 * width * height * sizeof(uchar), cudaMemcpyHostToDevice);

    int src_step = _src.step;
    int dst_step = _dst.step;

    dim3 grid(s.height, s.width);
    kernel<<<grid, 1>>>(src_dev, dst_dev, src_step, dst_step, height, width, s.height, s.width);

    cudaMemcpy(_dst.data, dst_dev, 3 * s.width * s.height * sizeof(uchar), cudaMemcpyDeviceToHost);

    // release device buffers to avoid leaking GPU memory
    cudaFree(src_dev);
    cudaFree(dst_dev);
}

int main()
{
    Mat src = cv::imread("E:\\学习资料\\测试标准图\\lena.bmp", 1);
    Mat dst_cpu;

    double start = GetTickCount();
    resizeImage(src, dst_cpu, Size(src.cols * 2, src.rows * 2));
    double end = GetTickCount();

    cout << "CPU resize time: " << end - start << "\n";

    initCUDA();

    Mat dst_gpu;

    start = GetTickCount();
    resizeImageGpu(src, dst_gpu, Size(src.cols * 2, src.rows * 2));
    end = GetTickCount();
    cout << "GPU resize time: " << end - start << "\n";

    cv::imshow("Demo", dst_cpu);
    waitKey(0);

    return 0;
}
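The kernel above is launched with a single thread per block, which is easy to reason about but leaves most of each multiprocessor idle. A more common configuration uses a 2D thread block (for example 16x16) and derives the pixel coordinates from both the block and thread indices. The following is only an illustrative sketch, not part of the original program; the name kernelBlocked and the 16x16 block size are my own choices, and the kernel body is otherwise the same nearest-neighbor copy.

// Sketch: same nearest-neighbor mapping, but with a 16x16 thread block.
// Because the grid is rounded up, a bounds check against the destination
// size is required.
__global__ void kernelBlocked(uchar* _src_dev, uchar* _dst_dev, int _src_step, int _dst_step,
    int _src_rows, int _src_cols, int _dst_rows, int _dst_cols)
{
    int i = blockIdx.y * blockDim.y + threadIdx.y;  // destination row
    int j = blockIdx.x * blockDim.x + threadIdx.x;  // destination column
    if (i >= _dst_rows || j >= _dst_cols) return;

    int pX = (int)(i * (float)_src_rows / _dst_rows);  // nearest source row
    int pY = (int)(j * (float)_src_cols / _dst_cols);  // nearest source column
    if (pX >= 0 && pX < _src_rows && pY >= 0 && pY < _src_cols){
        *(_dst_dev + i*_dst_step + 3 * j + 0) = *(_src_dev + pX*_src_step + 3 * pY + 0);
        *(_dst_dev + i*_dst_step + 3 * j + 1) = *(_src_dev + pX*_src_step + 3 * pY + 1);
        *(_dst_dev + i*_dst_step + 3 * j + 2) = *(_src_dev + pX*_src_step + 3 * pY + 2);
    }
}

// Launch: each 16x16 block covers a 16x16 tile of destination pixels.
// dim3 block(16, 16);
// dim3 grid((s.width + block.x - 1) / block.x, (s.height + block.y - 1) / block.y);
// kernelBlocked<<<grid, block>>>(src_dev, dst_dev, src_step, dst_step, height, width, s.height, s.width);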
3. Experimental Results
The experiments were run with VS2013 + CUDA 7.0 + OpenCV 2.4.9. When enlarging the 512*512 Lena image to 1024*1024, the GPU version is more than twice as fast as the CPU version. For downscaling, however, GPU acceleration is not necessarily faster, because uploading the data to the device takes time of its own.
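To see how much of the GPU time is spent on the kernel itself versus the host-device transfers, CUDA events can be used to bracket just the kernel launch inside resizeImageGpu. The snippet below is a minimal sketch of that measurement, not part of the original program; the variable names are illustrative.

// Sketch: time only the kernel with CUDA events, excluding the memcpy cost.
cudaEvent_t evStart, evStop;
cudaEventCreate(&evStart);
cudaEventCreate(&evStop);

cudaEventRecord(evStart, 0);
kernel<<<grid, 1>>>(src_dev, dst_dev, src_step, dst_step, height, width, s.height, s.width);
cudaEventRecord(evStop, 0);
cudaEventSynchronize(evStop);   // wait for the kernel to finish

float kernelMs = 0.0f;
cudaEventElapsedTime(&kernelMs, evStart, evStop);  // elapsed time in milliseconds
printf("kernel only: %f ms\n", kernelMs);

cudaEventDestroy(evStart);
cudaEventDestroy(evStop);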