我的CUDA學習之旅2——圖像形態學腐蝕、膨脹CUDA實現

01-28

引言

由於上兩周雜七雜八的事情比較多加上自己寫的演算法有些問題，一直改bug。。。。沒時間繼續寫博客，今天開始補上博客。從這篇文章起，開始將一些較為典型的OpenCV演算法通過CUDA進行實現，本文實現的為圖像處理中最為常見的形態學腐蝕以及膨脹，由於本文目的在於演算法移植後的驗證，故在圖片的選擇上用小圖像作為輸入的示例圖像，有不當之處歡迎評論或私信~

任務要求

輸入一張圖片，將其轉為灰度圖後，通過CUDA在GPU中對圖片實現形態學腐蝕、膨脹操作，最後將結果輸出至CPU並進行顯示，要求輸出圖與用OpenCV實現後的結果一致。

實現思路

關於腐蝕與膨脹的演算法原理，網上已有完備的資料，在這裡不再複述，具體原理可見「圖像的腐蝕原理」。

由於是對經典演算法的移植，故在thread以及block的設計上不能單單針對某一張圖片，而是要通用；同時為了儘可能提高運算速度，將block設計為32*32、共1024個thread的大小（本人顯卡為NVIDIA GeForce GT 755M），block數量則根據傳入圖片的大小動態變化。

實現環境

VS2013 + CUDA7.5 + Opencv2.4.13

實現代碼

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda.h>
#include <device_functions.h>
#include <opencv2/opencv.hpp>
#include <cstdio>
#include <cstdlib>
#include <iostream>
using namespace std;
using namespace cv;

// Abort with file/line information when a CUDA runtime call fails, so that
// an early failure does not silently corrupt every later step.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Erosion of a single-channel 8-bit image with a rectangular element.
//
// Launch layout: 2D grid of 2D blocks, one thread per output pixel; the grid
// may overhang the image (threads outside the image exit immediately).
// dataIn / dataOut: device buffers of imgWidth * imgHeight bytes, row-major,
//                   no row padding.
// erodeElement:     element size; the anchor is at (width / 2, height / 2).
//
// Each output pixel is the minimum of the input over the element window
// clipped to the image, i.e. samples outside the image are ignored.
__global__ void erodeInCuda(const unsigned char *dataIn, unsigned char *dataOut, Size erodeElement, int imgWidth, int imgHeight)
{
    // Pixel coordinates handled by this thread.
    int xIndex = threadIdx.x + blockIdx.x * blockDim.x;
    int yIndex = threadIdx.y + blockIdx.y * blockDim.y;

    // The grid is rounded up to whole blocks, so tail threads must not touch memory.
    if (xIndex >= imgWidth || yIndex >= imgHeight)
    {
        return;
    }

    int halfEW = erodeElement.width / 2;
    int halfEH = erodeElement.height / 2;

    // Window [begin, end] relative to the anchor, clipped to the image.
    // The upper offset is (size - 1 - half): equal to half for odd sizes.
    int xBegin = xIndex - halfEW;
    int xEnd = xIndex + erodeElement.width - 1 - halfEW;
    int yBegin = yIndex - halfEH;
    int yEnd = yIndex + erodeElement.height - 1 - halfEH;
    if (xBegin < 0) xBegin = 0;
    if (yBegin < 0) yBegin = 0;
    if (xEnd > imgWidth - 1) xEnd = imgWidth - 1;
    if (yEnd > imgHeight - 1) yEnd = imgHeight - 1;

    // Keep the running minimum in a register; write global memory once.
    unsigned char minValue = dataIn[yIndex * imgWidth + xIndex];
    for (int y = yBegin; y <= yEnd; y++)
    {
        for (int x = xBegin; x <= xEnd; x++)
        {
            unsigned char value = dataIn[y * imgWidth + x];
            if (value < minValue)
            {
                minValue = value;
            }
        }
    }
    dataOut[yIndex * imgWidth + xIndex] = minValue;
}

// Dilation of a single-channel 8-bit image with a rectangular element.
//
// Launch layout: 2D grid of 2D blocks, one thread per output pixel; the grid
// may overhang the image (threads outside the image exit immediately).
// dataIn / dataOut: device buffers of imgWidth * imgHeight bytes, row-major,
//                   no row padding.
// dilateElement:    element size; the anchor is at (width / 2, height / 2).
//
// Each output pixel is the maximum of the input over the element window
// clipped to the image, i.e. samples outside the image are ignored.
__global__ void dilateInCuda(const unsigned char *dataIn, unsigned char *dataOut, Size dilateElement, int imgWidth, int imgHeight)
{
    // Pixel coordinates handled by this thread.
    int xIndex = threadIdx.x + blockIdx.x * blockDim.x;
    int yIndex = threadIdx.y + blockIdx.y * blockDim.y;

    // The grid is rounded up to whole blocks, so tail threads must not touch memory.
    if (xIndex >= imgWidth || yIndex >= imgHeight)
    {
        return;
    }

    int halfEW = dilateElement.width / 2;
    int halfEH = dilateElement.height / 2;

    // Window [begin, end] relative to the anchor, clipped to the image.
    int xBegin = xIndex - halfEW;
    int xEnd = xIndex + dilateElement.width - 1 - halfEW;
    int yBegin = yIndex - halfEH;
    int yEnd = yIndex + dilateElement.height - 1 - halfEH;
    if (xBegin < 0) xBegin = 0;
    if (yBegin < 0) yBegin = 0;
    if (xEnd > imgWidth - 1) xEnd = imgWidth - 1;
    if (yEnd > imgHeight - 1) yEnd = imgHeight - 1;

    // Keep the running maximum in a register; write global memory once.
    unsigned char maxValue = dataIn[yIndex * imgWidth + xIndex];
    for (int y = yBegin; y <= yEnd; y++)
    {
        for (int x = xBegin; x <= xEnd; x++)
        {
            unsigned char value = dataIn[y * imgWidth + x];
            if (value > maxValue)
            {
                maxValue = value;
            }
        }
    }
    dataOut[yIndex * imgWidth + xIndex] = maxValue;
}


// Loads "1.jpg" as grayscale, runs erosion and dilation on the GPU, runs the
// same operations with OpenCV on the CPU, and reports how many pixels differ.
// Returns 0 on success, 1 if the image cannot be loaded.
int main()
{
    Mat grayImg = imread("1.jpg", 0); // input image, loaded directly as grayscale
    if (grayImg.empty())
    {
        cerr << "Failed to load image: 1.jpg" << endl;
        return 1;
    }
    // The kernels index the buffer as tightly packed rows.
    if (!grayImg.isContinuous())
    {
        grayImg = grayImg.clone();
    }

    unsigned char *d_in = NULL;   // input image on the device
    unsigned char *d_out1 = NULL; // eroded image on the device
    unsigned char *d_out2 = NULL; // dilated image on the device

    int imgWidth = grayImg.cols;
    int imgHeight = grayImg.rows;
    size_t imgBytes = (size_t)imgWidth * imgHeight * sizeof(unsigned char);

    Mat dstImg1(imgHeight, imgWidth, CV_8UC1, Scalar(0)); // eroded image on the host
    Mat dstImg2(imgHeight, imgWidth, CV_8UC1, Scalar(0)); // dilated image on the host

    // Allocate device memory.
    CUDA_CHECK(cudaMalloc((void**)&d_in, imgBytes));
    CUDA_CHECK(cudaMalloc((void**)&d_out1, imgBytes));
    CUDA_CHECK(cudaMalloc((void**)&d_out2, imgBytes));

    // Upload the input image.
    CUDA_CHECK(cudaMemcpy(d_in, grayImg.data, imgBytes, cudaMemcpyHostToDevice));

    // 32 x 32 threads per block; enough blocks to cover the whole image.
    dim3 threadsPerBlock(32, 32);
    dim3 blocksPerGrid((imgWidth + threadsPerBlock.x - 1) / threadsPerBlock.x, (imgHeight + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // Structuring element size (width 3, height 5).
    Size Element(3, 5);

    // CUDA erosion.
    erodeInCuda<<<blocksPerGrid, threadsPerBlock>>>(d_in, d_out1, Element, imgWidth, imgHeight);
    CUDA_CHECK(cudaGetLastError()); // launch-configuration errors
    // The blocking copy also surfaces errors raised while the kernel ran.
    CUDA_CHECK(cudaMemcpy(dstImg1.data, d_out1, imgBytes, cudaMemcpyDeviceToHost));

    // CPU erosion (OpenCV reference).
    Mat erodeImg;
    Mat element = getStructuringElement(MORPH_RECT, Size(3, 5));
    erode(grayImg, erodeImg, element);

    // CUDA dilation.
    dilateInCuda<<<blocksPerGrid, threadsPerBlock>>>(d_in, d_out2, Element, imgWidth, imgHeight);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpy(dstImg2.data, d_out2, imgBytes, cudaMemcpyDeviceToHost));

    // CPU dilation (OpenCV reference).
    Mat dilateImg;
    dilate(grayImg, dilateImg, element);

    // Compare the GPU results against the OpenCV reference.
    cout << "erode mismatched pixels: " << countNonZero(dstImg1 != erodeImg) << endl;
    cout << "dilate mismatched pixels: " << countNonZero(dstImg2 != dilateImg) << endl;

    // Release device memory.
    CUDA_CHECK(cudaFree(d_in));
    CUDA_CHECK(cudaFree(d_out1));
    CUDA_CHECK(cudaFree(d_out2));

    return 0;
}

實現結果

原灰度圖

腐蝕後圖片

膨脹後圖片

通過比對發現CUDA輸出結果與OpenCV輸出結果一致~