CUDA精进之路（四）：图像处理——Sobel算子边缘检测

引言

刚开始接触图像处理时，我自己实现的第一个程序是通过笔记本摄像头采集图像，再利用OpenCV自带的算法库进行Canny算子边缘检测。当时看到程序运行后，视频窗口实时显示经Canny算子边缘分割后的图像，觉得十分有科技感。后来我慢慢开始自己编写边缘检测的源代码。本博客以Sobel算子为例，介绍如何通过CUDA实现边缘检测。

任务要求

输入一张图片，将其转为灰度图后，通过CUDA在GPU上对图片进行Sobel算子边缘检测，最后将结果传回CPU并显示，要求输出图像与CPU实现的结果一致。

实现思路

关于Sobel算子的边缘检测原理，可参考博客《Sobel边缘检测算法》。
由于检测的原理是分别在Gx和Gy两个方向上做卷积，故在CUDA实现时我们需要正确索引到以目标像素点为中心的3×3邻域中各个元素的位置。图像从CPU端传给GPU后是一段一维连续的内存，这增大了索引的难度，故在block和grid的设计上，我把整张图像完整地映射到了grid中，每个thread对应一个像素，通过二维线程索引(x, y)计算一维偏移 y × 图像宽度 + x，从而准确定位内存中的像素。

实现环境

VS2013 + CUDA7.5 + Opencv2.4.13

实现代码

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda.h>
#include <device_functions.h>
#include <opencv2\opencv.hpp>
#include <iostream>
using namespace std;
using namespace cv;//Sobel算子边缘检测核函数
/**
 * Sobel edge-detection kernel for a single-channel 8-bit image.
 * Each thread handles exactly one pixel.
 *
 * Launch layout: a 2D grid of 2D blocks that covers at least
 * imgWidth x imgHeight threads; the x dimension indexes columns and the
 * y dimension indexes rows. No shared memory is used.
 *
 * @param dataIn    device pointer to the input image, row-major with
 *                  imgWidth bytes per row and no row padding
 * @param dataOut   device pointer to the output image, same layout as dataIn
 * @param imgHeight image height in pixels (number of rows)
 * @param imgWidth  image width in pixels (number of columns)
 */
__global__ void sobelInCuda(unsigned char *dataIn, unsigned char *dataOut, int imgHeight, int imgWidth)
{
    const int xIndex = threadIdx.x + blockIdx.x * blockDim.x;
    const int yIndex = threadIdx.y + blockIdx.y * blockDim.y;

    // The grid is rounded up to whole blocks, so some threads fall outside
    // the image; they must not touch memory.
    if (xIndex >= imgWidth || yIndex >= imgHeight)
    {
        return;
    }

    const int index = yIndex * imgWidth + xIndex;

    // Border pixels have no complete 3x3 neighbourhood. Write 0 explicitly so
    // the output is fully defined even when dataOut was not cleared by the
    // caller (cudaMalloc does not zero memory).
    if (xIndex == 0 || xIndex == imgWidth - 1 || yIndex == 0 || yIndex == imgHeight - 1)
    {
        dataOut[index] = 0;
        return;
    }

    // Row pointers for the 3x3 neighbourhood centred on (xIndex, yIndex).
    const unsigned char *rowUp   = dataIn + (yIndex - 1) * imgWidth;
    const unsigned char *rowMid  = dataIn + yIndex * imgWidth;
    const unsigned char *rowDown = dataIn + (yIndex + 1) * imgWidth;

    // Horizontal gradient: right column minus left column, weights 1-2-1.
    const int Gx = (rowUp[xIndex + 1] + 2 * rowMid[xIndex + 1] + rowDown[xIndex + 1])
                 - (rowUp[xIndex - 1] + 2 * rowMid[xIndex - 1] + rowDown[xIndex - 1]);

    // Vertical gradient: top row minus bottom row, weights 1-2-1.
    const int Gy = (rowUp[xIndex - 1] + 2 * rowUp[xIndex] + rowUp[xIndex + 1])
                 - (rowDown[xIndex - 1] + 2 * rowDown[xIndex] + rowDown[xIndex + 1]);

    // NOTE(review): the averaged magnitude can exceed 255 on very sharp edges
    // and is truncated by the store to unsigned char. This is kept as-is so
    // the result stays identical to the CPU implementation in this file.
    dataOut[index] = (abs(Gx) + abs(Gy)) / 2;
}
// Sobel edge detection: CPU implementation
/**
 * CPU reference implementation of the Sobel edge detector.
 *
 * Only interior pixels (rows 1..imgHeight-2, columns 1..imgWidth-2) are
 * written; the border of dstImg keeps whatever value it had on entry.
 *
 * @param srcImg    single-channel 8-bit input image
 * @param dstImg    single-channel 8-bit output image; it shares its pixel
 *                  buffer with the caller's Mat, so results are visible there
 * @param imgHeight number of rows to process
 * @param imgWidth  number of columns to process
 */
void sobel(Mat srcImg, Mat dstImg, int imgHeight, int imgWidth)
{
    for (int i = 1; i < imgHeight - 1; i++)
    {
        const uchar *dataUp = srcImg.ptr<uchar>(i - 1);
        const uchar *data = srcImg.ptr<uchar>(i);
        const uchar *dataDown = srcImg.ptr<uchar>(i + 1);
        uchar *out = dstImg.ptr<uchar>(i);

        for (int j = 1; j < imgWidth - 1; j++)
        {
            // Horizontal gradient: right column minus left column.
            const int Gx = (dataUp[j + 1] + 2 * data[j + 1] + dataDown[j + 1])
                         - (dataUp[j - 1] + 2 * data[j - 1] + dataDown[j - 1]);
            // Vertical gradient: top row minus bottom row.
            const int Gy = (dataUp[j - 1] + 2 * dataUp[j] + dataUp[j + 1])
                         - (dataDown[j - 1] + 2 * dataDown[j] + dataDown[j + 1]);
            // NOTE(review): values above 255 are truncated by the store to
            // uchar; the CUDA kernel uses the same expression.
            out[j] = (abs(Gx) + abs(Gy)) / 2;
        }
    }
}

// Reports a failed CUDA runtime call on stderr; returns true on success.
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    cerr << what << " failed: " << cudaGetErrorString(status) << endl;
    return false;
}

// Runs sobelInCuda on src and stores the result in dst (same size, CV_8UC1).
// Returns false if any CUDA call fails; device memory is released either way.
static bool sobelOnGpu(const Mat &src, Mat &dst)
{
    // The device buffers are densely packed, so the host data must be too.
    const Mat packed = src.isContinuous() ? src : src.clone();
    const int imgHeight = packed.rows;
    const int imgWidth = packed.cols;
    const size_t numBytes = (size_t)imgHeight * (size_t)imgWidth * sizeof(unsigned char);

    unsigned char *d_in = NULL;
    unsigned char *d_out = NULL;

    // Allocate device memory, clear the output (cudaMalloc does not zero it,
    // and the whole buffer is copied back), then upload the input image.
    bool ok = cudaOk(cudaMalloc((void**)&d_in, numBytes), "cudaMalloc(d_in)")
           && cudaOk(cudaMalloc((void**)&d_out, numBytes), "cudaMalloc(d_out)")
           && cudaOk(cudaMemset(d_out, 0, numBytes), "cudaMemset(d_out)")
           && cudaOk(cudaMemcpy(d_in, packed.data, numBytes, cudaMemcpyHostToDevice),
                     "cudaMemcpy(host to device)");

    if (ok)
    {
        // One thread per pixel; round the grid up so the whole image is covered.
        dim3 threadsPerBlock(32, 32);
        dim3 blocksPerGrid((imgWidth + threadsPerBlock.x - 1) / threadsPerBlock.x,
                           (imgHeight + threadsPerBlock.y - 1) / threadsPerBlock.y);

        sobelInCuda<<<blocksPerGrid, threadsPerBlock>>>(d_in, d_out, imgHeight, imgWidth);

        // A launch reports configuration errors only through cudaGetLastError();
        // the blocking copy back to the host then surfaces execution errors.
        ok = cudaOk(cudaGetLastError(), "sobelInCuda launch")
          && cudaOk(cudaMemcpy(dst.data, d_out, numBytes, cudaMemcpyDeviceToHost),
                    "cudaMemcpy(device to host)");
    }

    // Release device memory on every path; freeing a null pointer is a no-op.
    cudaFree(d_in);
    cudaFree(d_out);
    return ok;
}

/**
 * Loads "1.jpg" as grayscale, smooths it with a 3x3 Gaussian filter, runs the
 * Sobel detector on the CPU and on the GPU, reports how many pixels differ
 * between the two results, and displays the images.
 *
 * @return 0 on success, 1 if the image cannot be loaded or a CUDA call fails
 */
int main()
{
    Mat grayImg = imread("1.jpg", 0);
    if (grayImg.empty())
    {
        cerr << "Failed to load image 1.jpg" << endl;
        return 1;
    }

    int imgHeight = grayImg.rows;
    int imgWidth = grayImg.cols;

    // Gaussian smoothing before edge detection.
    Mat gaussImg;
    GaussianBlur(grayImg, gaussImg, Size(3, 3), 0, 0, BORDER_DEFAULT);

    // Sobel on the CPU.
    Mat dst(imgHeight, imgWidth, CV_8UC1, Scalar(0));
    sobel(gaussImg, dst, imgHeight, imgWidth);

    // Sobel on the GPU; dstImg receives the image copied back from the device.
    Mat dstImg(imgHeight, imgWidth, CV_8UC1, Scalar(0));
    if (!sobelOnGpu(gaussImg, dstImg))
    {
        return 1;
    }

    // Check that the GPU output matches the CPU output.
    Mat diff;
    compare(dst, dstImg, diff, CMP_NE);
    cout << "Pixels differing between CPU and CUDA results: " << countNonZero(diff) << endl;

    // Show the original image and both results.
    imshow("Original", grayImg);
    imshow("Sobel (CPU)", dst);
    imshow("Sobel (CUDA)", dstImg);
    waitKey(0);

    return 0;
}

实现结果

原图

CPU实现后图像

CUDA实现后图像

通过比对发现CUDA输出结果与CPU实现输出结果一致~