【cuda】——npp/cuda图像预处理resize+norm对比

1. npp

核心代码：

// 2. NPP image preprocessing:
//    upload -> resize (8u, packed) -> convert to 32f -> scale -> split into planes.
bool keepRation = 0, keepCenter = 0;   // not read by the NPP path shown here

// Source geometry, taken from the host image.
int width_in = img.cols;
int height_in = img.rows;
// Row pitch of the host image in bytes. The upload below copies
// img.step[0] * img.rows bytes, so the same pitch must be handed to NPP;
// width_in * 3 is only equal to it when the rows carry no padding.
int src_step = static_cast<int>(img.step[0]);
NppiSize srcSize = {width_in, height_in};
NppiRect srcROI = {0, 0, width_in, height_in};

// Destination geometry, taken from the network input dimensions
// (d[2] is used as width and d[1] as height).
int dst_width = inputDim.d[2];
int dst_height = inputDim.d[1];
NppiSize dstSize = {dst_width, dst_height};
NppiRect dstROI  = {0, 0, dst_width, dst_height};

// Row pitches (bytes) of the intermediate and output buffers.
int u8_packed_step  = dst_width * 3;                                    // 8u,  3 channels interleaved
int f32_packed_step = static_cast<int>(dst_width * 3 * sizeof(float));  // 32f, 3 channels interleaved
int f32_plane_step  = static_cast<int>(dst_width * sizeof(float));      // 32f, single plane

int bgr2rgb[3] = {2, 1, 0};   // not read in this snippet
// Per-channel multiplier, approximately 1/255.
Npp32f m_scale[3] = {0.003921569f, 0.003921569f, 0.003921569f};

// Three consecutive float planes inside mCudaBuffers[0]. The offsets are taken
// on the typed pointer (in elements), so they are correct whatever pointer
// type mCudaBuffers[0] has.
// NOTE(review): no channel swap happens in this snippet, so if img is stored
// as BGR the plane called r_plane receives the B channel - confirm the order
// the network expects.
size_t plane_elems = static_cast<size_t>(dst_width) * dst_height;
Npp32f* r_plane = (Npp32f*)(mCudaBuffers[0]);
Npp32f* g_plane = r_plane + plane_elems;
Npp32f* b_plane = r_plane + 2 * plane_elems;
Npp32f* dst_planes[3] = {r_plane, g_plane, b_plane};

// Upload the whole host image, padding included.
CUDA_CHECK(cudaMemcpy(mCudaImg, img.data, img.step[0]*img.rows, cudaMemcpyHostToDevice));

// NOTE(review): the NppStatus returned by the calls below is not inspected;
// consider checking it against NPP_SUCCESS in the enclosing function.
// Bilinear resize of the packed 8-bit image.
nppiResize_8u_C3R( (Npp8u*)mCudaImg, src_step, srcSize, srcROI, (Npp8u*)gpu_img_resize_buf, u8_packed_step, dstSize, dstROI, NPPI_INTER_LINEAR);
// Convert to 32-bit float.
nppiConvert_8u32f_C3R( (Npp8u*)gpu_img_resize_buf, u8_packed_step, (Npp32f*)gpu_img_plane, f32_packed_step, dstSize);
// Multiply every channel by its scale factor, in place.
nppiMulC_32f_C3IR(m_scale, (Npp32f*)gpu_img_plane, f32_packed_step, dstSize);
// Split the interleaved image into the three destination planes.
nppiCopy_32f_C3P3R( (Npp32f*)gpu_img_plane, f32_packed_step, dst_planes, f32_plane_step, dstSize );

耗时

cost: 113  ms
cost: 68  ms
cost: 52.6667  ms
cost: 45.5  ms
cost: 41  ms
cost: 37.8333  ms
cost: 35.7143  ms
cost: 34  ms
cost: 32.7778  ms
cost: 31.6  ms
cost: 30.8182  ms
cost: 30.0833  ms
cost: 29.6154  ms
cost: 29.0714  ms
cost: 28.6667  ms
cost: 28.3125  ms
cost: 27.8824  ms
cost: 27.6111  ms
cost: 27.3158  ms
cost: 27.05  ms
cost: 26.8095  ms
cost: 26.5455  ms
cost: 26.3478  ms
cost: 26.125  ms
cost: 25.96  ms
cost: 25.8077  ms
cost: 25.6667  ms
cost: 25.5  ms
cost: 25.3793  ms
cost: 25.2333  ms
cost: 25.129  ms
cost: 25  ms
cost: 24.9091  ms
cost: 24.7941  ms
cost: 24.7143  ms
cost: 24.5833  ms
cost: 24.5135  ms
cost: 24.3947  ms
cost: 24.3077  ms
cost: 24.2  ms
cost: 24.0976  ms

2. cuda

核心代码：

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <resize.h>
#include <stdio.h>
// Reads one source pixel and converts it to floats in [0, 1].
//   src  : packed 3-channel 8-bit image, row-major, w pixels per row (no padding)
//   x, y : pixel coordinates; may lie outside the image
//   w, h : image width and height in pixels
// Coordinates outside [0, w) x [0, h) return a constant (0.5, 0.5, 0.5) border.
__forceinline__ __device__ float3 get(const uchar3* src, int x, int y, int w, int h) {
    if (x < 0 || x >= w || y < 0 || y >= h) {
        return make_float3(0.5f, 0.5f, 0.5f);
    }
    const uchar3 temp = src[y * w + x];
    return make_float3(float(temp.x) / 255.0f,
                       float(temp.y) / 255.0f,
                       float(temp.z) / 255.0f);
}

// Bilinear resize + normalisation, writing a planar float image.
//   src            : packed 3-channel 8-bit source image (srcW x srcH)
//   dst            : output, three consecutive planes of dstW * dstH floats,
//                    in the same channel order as the source
//   scaleX/scaleY  : source pixels per destination pixel
//   shiftX/shiftY  : offset, in destination pixels, subtracted before scaling
// Launch layout: 1-D grid of 1-D blocks, one thread per destination pixel;
// surplus threads in the last block exit immediately. No shared memory is used.
__global__ void resizeNormKernel(uchar3* src, float *dst, int dstW, int dstH, int srcW, int srcH,
                                 float scaleX, float scaleY, float shiftX, float shiftY) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int x = idx % dstW;
    const int y = idx / dstW;
    if (x >= dstW || y >= dstH)
        return;

    // Inverse mapping from the destination pixel centre to source coordinates
    // (half-pixel-centre convention, as used by OpenCV's resize).
    const float w = (x - shiftX + 0.5f) * scaleX - 0.5f;
    const float h = (y - shiftY + 0.5f) * scaleY - 0.5f;

    // Use floor rather than an int cast: a cast truncates toward zero, which
    // for negative coordinates selects the wrong neighbours and produces
    // weights outside [0, 1].
    const float h_floor = floorf(h);
    const float w_floor = floorf(w);
    const int h_low = (int)h_floor;
    const int w_low = (int)w_floor;
    const int h_high = h_low + 1;
    const int w_high = w_low + 1;

    // Fractional parts and the four bilinear weights.
    const float lh = h - h_floor;
    const float lw = w - w_floor;
    const float hh = 1.0f - lh, hw = 1.0f - lw;
    const float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;

    // Four neighbouring source pixels (constant border outside the image).
    const float3 v1 = get(src, w_low,  h_low,  srcW, srcH);
    const float3 v2 = get(src, w_high, h_low,  srcW, srcH);
    const float3 v3 = get(src, w_low,  h_high, srcW, srcH);
    const float3 v4 = get(src, w_high, h_high, srcW, srcH);

    // One output plane per channel.
    const int stride = dstW * dstH;
    const int pix = y * dstW + x;
    dst[pix]              = w1 * v1.x + w2 * v2.x + w3 * v3.x + w4 * v4.x;
    dst[stride + pix]     = w1 * v1.y + w2 * v2.y + w3 * v3.y + w4 * v4.y;
    dst[stride * 2 + pix] = w1 * v1.z + w2 * v2.z + w3 * v3.z + w4 * v4.z;
}

// Enqueues resizeNormKernel on `stream`.
//   p          : device pointer to the packed 3-channel 8-bit source image (w x h)
//   d          : device pointer to the output, 3 * in_w * in_h floats (planar)
//   w, h       : source image size in pixels
//   in_w, in_h : output size in pixels
//   keepration : use one common scale (the larger of the two axis scales)
//   keepcenter : together with keepration, offset the image by half of the
//                leftover space on each axis
//   stream     : CUDA stream the kernel is launched on; the call does not
//                synchronise
// Returns 0 when the kernel was launched, -1 on invalid arguments or when the
// launch itself reported an error.
int resizeAndNorm(void * p,float *d,int w,int h,int in_w,int in_h, bool keepration ,bool keepcenter,cudaStream_t stream){
    // Reject inputs that would divide by zero or request an empty grid.
    if (p == nullptr || d == nullptr || w <= 0 || h <= 0 || in_w <= 0 || in_h <= 0) {
        return -1;
    }

    float scaleX = (w * 1.0f / in_w);
    float scaleY = (h * 1.0f / in_h);
    float shiftX = 0.f, shiftY = 0.f;
    if (keepration) {
        scaleX = scaleY = scaleX > scaleY ? scaleX : scaleY;
    }
    if (keepration && keepcenter) {
        shiftX = (in_w - w / scaleX) / 2.f;
        shiftY = (in_h - h / scaleY) / 2.f;
    }

    // One thread per output pixel.
    const int n = in_w * in_h;
    const int blockSize = 1024;
    const int gridSize = (n + blockSize - 1) / blockSize;
    resizeNormKernel<<<gridSize, blockSize, 0, stream>>>((uchar3*)(p), d, in_w, in_h, w, h,
                                                         scaleX, scaleY, shiftX, shiftY);

    // Launch-configuration errors are only visible through cudaGetLastError().
    return cudaGetLastError() == cudaSuccess ? 0 : -1;
}

自写 CUDA 函数耗时

cost: 21  ms
cost: 22  ms
cost: 22  ms
cost: 22.75  ms
cost: 22.8  ms
cost: 23.1667  ms
cost: 23.1429  ms
cost: 23  ms
cost: 23.1111  ms
cost: 22.9  ms
cost: 23  ms
cost: 22.9167  ms
cost: 23.1538  ms
cost: 23.1429  ms
cost: 23  ms
cost: 22.875  ms
cost: 22.7647  ms
cost: 22.6667  ms
cost: 22.6316  ms
cost: 22.55  ms
cost: 22.5714  ms
cost: 22.5  ms
cost: 22.5217  ms
cost: 22.4583  ms
cost: 22.48  ms
cost: 22.4231  ms
cost: 22.4444  ms
cost: 22.3929  ms
cost: 22.4138  ms
cost: 22.3667  ms
cost: 22.3871  ms
cost: 22.3438  ms
cost: 22.3636  ms
cost: 22.3235  ms
cost: 22.3143  ms
cost: 22.25  ms
cost: 22.2162  ms
cost: 22.1579  ms
cost: 22.1538  ms
cost: 22.1  ms
cost: 22.0732  ms

3. 总结

自写 CUDA 算子在开始阶段明显更快（NPP 的第一个输出为 113 ms，自写 CUDA 为 21 ms），后续二者相差不大（约 24 ms 对 22 ms）。
自写 CUDA 算子需要掌握 C、CUDA 以及数据排列方式，但灵活度更大。
使用 NPP 只需要了解数据排列方式，但灵活度不够。

【cuda】——npp/cuda图像预处理resize+norm对比相关推荐

Cuda编程加速图像预处理
文章目录项目简介加速效果 Cuda代码 Git地址项目简介基于 cuda 和 opencv 环境目标: 单独使用,以加速图像处理操作: 结合 TensorRT 使用,进一步加快推理速度加速 ...
cv::cuda与CUDA的NPP库、距离变换和分水岭并行版本尝试
因为不想什么函数都自己写设备核函数,看到opencv有对应的cuda版本的函数比如滤波,然而CUDA的NPP库也提供了对应的滤波函数,我不知道哪个性能更高(当然肯定要比纯CPU版本快,但我没测试过) ...
图像预处理库CV-CUDA开源了，打破预处理瓶颈，提升推理吞吐量20多倍
点击上方"小白学视觉",选择加"星标"或"置顶" 重磅干货,第一时间送达作者:思当 CPU 图像预处理成为视觉任务的瓶颈,最新开源的CV ...
【CV学习笔记】图像预处理warpaffine
1.前言在学习图像预处理的时候发现,之前用的图像预处理方法一般为 resize和letter box,这两种方法比较低效,后来在手写AI中接触到了warpaffine,只需要一步就能够对图像进行预处 ...
pytorch —— 图像预处理模块（Transforms）
transforms运行机制 torchvision是pytorch的计算机视觉工具包,在torchvision中有三个主要的模块: torchvision.transforms,常用的图像预处理方法 ...
【CV学习笔记】图像预处理warpaffine-cuda加速
1.前言在上个学习笔记中学习warpaffine,并且在opencv下面实现了图像的预处理,而warpaffine可以很好的利用cuda加速来实现,于是基于手写AI的项目,又学习了warpaffie ...
图像预处理之减去RGB均值
减去RGB均值(实例以DIV2K数据集为例) 在计算机视觉领域中,一定免不了的就是图像预处理中的逐个样本减去mean值的过程,那么为什么一定要做这一步呢? 为什么每张图片都要减去数据集均值呢? 原因 ...
pytorch 图像预处理之减去均值，除以方差
在使用 torchvision.transforms进行数据处理时我们经常进行的操作是: transforms.Normalize((0.485,0.456,0.406), (0.229,0.224, ...
如何利用图像预处理提高OCR的准确性？
点击上方"小白学视觉",选择加"星标"或"置顶" 重磅干货,第一时间送达 OCR代表光学字符识别,将文档照片或场景照片转换为机器编码的文本. ...
计算机视觉——图像预处理及边缘检测
1.为什么要进行图像预处理? \qquad当对图像进行边缘.轮廓的检测时,图像噪声会对检测产生不利影响,并且为了帮助模型专注于一般细节并获得更高的准确度,我们需要对图像进行预处理. \qquad预处理 ...

【cuda】——npp/cuda图像预处理resize+norm对比

1. npp

2. cuda

3. 总结

【cuda】——npp/cuda图像预处理resize+norm对比相关推荐

最新文章

热门文章