赞
踩
可能是我的使用方式不对,直接调用C++ OpenCV api比用CV_CUDA快很多。
/* * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include <common/NvDecoder.h> #include <common/TestUtils.h> #include <cuda_runtime_api.h> #include <cvcuda/OpCustomCrop.hpp> #include <cvcuda/OpResize.hpp> #include <getopt.h> #include <cmath> #include <opencv2/opencv.hpp> #include <nvcv/Image.hpp> #include <nvcv/Tensor.hpp> #include <chrono> using namespace std; using namespace chrono; /** * @brief Crop and Resize sample app. * * The Crop and Resize is a simple pipeline which demonstrates usage of * CVCuda Tensor along with a few operators. * * Input Batch Tensor -> Crop -> Resize -> WriteImage */ /** * @brief Utility to show usage of sample app * **/ void showUsage() { std::cout << "usage: ./nvcv_cropandresize_app -i <image file path or image directory -b <batch size>" << std::endl; } /** * @brief Utility to parse the command line arguments * **/ int ParseArgs(int argc, char *argv[], std::string &imagePath, uint32_t &batchSize) { static struct option long_options[] = { { "help", no_argument, 0, 'h'}, {"imagePath", required_argument, 0, 'i'}, { "batch", required_argument, 0, 'b'}, { 0, 0, 0, 0} }; int long_index = 0; int opt = 0; while ((opt = getopt_long(argc, argv, "hi:b:", long_options, &long_index)) != -1) { switch (opt) { case 'h': showUsage(); return -1; break; case 'i': imagePath = optarg; break; case 'b': batchSize = std::stoi(optarg); break; case ':': showUsage(); return -1; default: break; } } std::ifstream imageFile(imagePath); if (!imageFile.good()) { showUsage(); std::cerr << "Image path '" + imagePath + "' does not exist\n"; return -1; } return 0; } int main(int argc, char *argv[]) { // Default parameters std::string imagePath = "test.jpg"; uint32_t batchSize = 1; cv::Mat imgMat = cv::imread(imagePath); // Parse the command line paramaters to override the default parameters int retval = ParseArgs(argc, argv, imagePath, batchSize); if (retval != 0) { return retval; } // NvJpeg is used to decode the images to the color format required. // Since we need a contiguous buffer for batched input, a buffer is // preallocated based on the maximum image dimensions and batch size // for NvJpeg to write into. // Note : The maximum input image dimensions needs to be updated in case // of testing with different test images int maxImageWidth = 1920; int maxImageHeight = 1080; int maxChannels = 3; // tag: Create the cuda stream cudaStream_t stream; CHECK_CUDA_ERROR(cudaStreamCreate(&stream)); // tag: Allocate input tensor // Allocating memory for RGBI input image batch of uint8_t data type // without padding since NvDecode utility currently doesnt support // Padded buffers. nvcv::TensorDataStridedCuda::Buffer inBuf; inBuf.strides[3] = sizeof(uint8_t); inBuf.strides[2] = maxChannels * inBuf.strides[3]; inBuf.strides[1] = maxImageWidth * inBuf.strides[2]; inBuf.strides[0] = maxImageHeight * inBuf.strides[1]; CHECK_CUDA_ERROR(cudaMallocAsync(&inBuf.basePtr, batchSize * inBuf.strides[0], stream)); // tag: Tensor Requirements // Calculate the requirements for the RGBI uint8_t Tensor which include // pitch bytes, alignment, shape and tensor layout nvcv::Tensor::Requirements inReqs = nvcv::Tensor::CalcRequirements(batchSize, {maxImageWidth, maxImageHeight}, nvcv::FMT_RGB8); // Create a tensor buffer to store the data pointer and pitch bytes for each plane nvcv::TensorDataStridedCuda inData(nvcv::TensorShape{inReqs.shape, inReqs.rank, inReqs.layout}, nvcv::DataType{inReqs.dtype}, inBuf); // TensorWrapData allows for interoperation of external tensor representations with CVCUDA Tensor. nvcv::Tensor inTensor = nvcv::TensorWrapData(inData); // tag: Image Loading // NvJpeg is used to load the images to create a batched input device buffer. uint8_t *gpuInput = reinterpret_cast<uint8_t *>(inBuf.basePtr); CHECK_CUDA_ERROR(cudaMemcpyAsync(gpuInput, imgMat.data, inBuf.strides[0], cudaMemcpyHostToDevice)); // The total images is set to the same value as batch size for testing uint32_t totalImages = batchSize; // Format in which the decoded output will be saved //nvjpegOutputFormat_t outputFormat = NVJPEG_OUTPUT_RGBI; //NvDecode(imagePath, batchSize, totalImages, outputFormat, gpuInput); // tag: The input buffer is now ready to be used by the operators // Set parameters for Crop and Resize // ROI dimensions to crop in the input image int cropX = 150; int cropY = 50; int cropWidth = 800; int cropHeight = 1000; // Set the resize dimensions int resizeWidth = 1600; int resizeHeight = 2000; // Initialize the CVCUDA ROI struct NVCVRectI crpRect = {cropX, cropY, cropWidth, cropHeight}; cv::Rect Rect(cropX, cropY, cropWidth, cropHeight); auto t1=std::chrono::steady_clock::now(); // 裁剪图像 cv::Mat cropImg = imgMat(Rect); // 调整图像大小 cv::resize(cropImg, cropImg, cv::Size(resizeWidth, resizeHeight)); auto t2=std::chrono::steady_clock::now(); double dr_ms=std::chrono::duration<double,std::milli>(t2-t1).count(); std::cout << "opencv costs: " << dr_ms << "ms" << std::endl; // tag: Allocate Tensors for Crop and Resize // Create a CVCUDA Tensor based on the crop window size. nvcv::Tensor cropTensor(batchSize, {cropWidth, cropHeight}, nvcv::FMT_RGB8); // Create a CVCUDA Tensor based on resize dimensions nvcv::Tensor resizedTensor(batchSize, {resizeWidth, resizeHeight}, nvcv::FMT_RGB8); // tag: Initialize operators for Crop and Resize cvcuda::CustomCrop cropOp; cvcuda::Resize resizeOp; cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start); // tag: Executes the CustomCrop operation on the given cuda stream cropOp(stream, inTensor, cropTensor, crpRect); // Resize operator can now be enqueued into the same stream resizeOp(stream, cropTensor, resizedTensor, NVCV_INTERP_LINEAR); // tag: Profile section cudaEventRecord(stop); cudaEventSynchronize(stop); float operatorms = 0; cudaEventElapsedTime(&operatorms, start, stop); std::cout << "Time for Crop and Resize : " << operatorms << " ms" << std::endl; // tag: Copy the buffer to CPU and write resized image into .bmp file WriteRGBITensor(resizedTensor, stream); // tag: Clean up CHECK_CUDA_ERROR(cudaStreamDestroy(stream)); // tag: End of Sample }
输出
opencv costs: 3.16336ms
Time for Crop and Resize : 200.148 ms
Writing to ./cvcudatest_0.jpg 4800 1600 2000
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。