我的CUDA學習之旅2——圖像形態學腐蝕、膨脹CUDA實現
引言
由於上兩周雜七雜八的事情比較多,加上自己寫的演算法有些問題,一直在改bug……沒時間繼續寫博客,今天開始補上。從這篇文章起,開始將一些較為典型的OpenCV演算法通過CUDA進行實現。本文實現的是圖像處理中最為常見的形態學腐蝕以及膨脹。由於本文目的在於演算法移植後的驗證,故在圖片的選擇上用小圖像作為輸入的示例圖像,有不當之處歡迎評論或私信~
任務要求
輸入一張圖片,將其轉為灰度圖後,通過CUDA在GPU中對圖片實現形態學腐蝕、膨脹操作,最後將結果輸出至CPU並進行顯示,要求輸出圖與用OpenCV實現後的結果一致。
實現思路
關於腐蝕與膨脹的演算法原理,網上已有完備的資料,在這裡不再複述,具體原理可見「圖像的腐蝕原理」。
由於是對經典演算法的移植,故在thread以及block的設計上不能單單針對某一張圖片,而是要通用,同時為了儘可能提高運算速度,將其設計為32*32的1024個thread大小的block(本人顯卡Nvidia GeForce 755 M),block數量則是根據傳入圖片的大小動態變化。
實現環境
VS2013 + CUDA7.5 + Opencv2.4.13
實現代碼
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda.h>
#include <device_functions.h>
#include <opencv2/opencv.hpp>
#include <cstdio>
#include <cstdlib>
#include <iostream>
using namespace std;
using namespace cv;

// Abort with a readable message if a CUDA runtime call fails. Without this,
// an early failure (e.g. a failed allocation) silently poisons every later call.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            abort();                                                          \
        }                                                                     \
    } while (0)

// Erosion with a rectangular structuring element.
//
// Launch layout: 2D grid of 2D blocks, one thread per output pixel. The grid
// may be larger than the image; surplus threads return immediately.
//
// dataIn       : device pointer to the 8-bit single-channel input, tightly
//                packed (row stride == imgWidth bytes).
// dataOut      : device pointer to the output, same layout; must not alias dataIn.
// erodeElement : width/height of the rectangular element.
// imgWidth/imgHeight : image size in pixels.
//
// Each output pixel is the minimum of the input over the element window
// centred on it. The window is clipped to the image, so out-of-image
// neighbours are ignored. The window extent follows OpenCV's default anchor
// (size / 2), i.e. offsets [-size/2, size-1-size/2] on each axis.
__global__ void erodeInCuda(unsigned char *dataIn, unsigned char *dataOut, Size erodeElement, int imgWidth, int imgHeight)
{
    // Global pixel coordinates handled by this thread.
    int xIndex = threadIdx.x + blockIdx.x * blockDim.x;
    int yIndex = threadIdx.y + blockIdx.y * blockDim.y;

    // The grid is rounded up to whole blocks, so threads past the image edge
    // must not touch memory.
    if (xIndex >= imgWidth || yIndex >= imgHeight)
    {
        return;
    }

    int halfEW = erodeElement.width / 2;
    int halfEH = erodeElement.height / 2;

    // Window bounds, clipped to the image.
    int xBegin = xIndex - halfEW;
    int xEnd   = xIndex + (erodeElement.width - 1 - halfEW);
    int yBegin = yIndex - halfEH;
    int yEnd   = yIndex + (erodeElement.height - 1 - halfEH);
    xBegin = xBegin < 0 ? 0 : xBegin;
    yBegin = yBegin < 0 ? 0 : yBegin;
    xEnd   = xEnd > imgWidth - 1 ? imgWidth - 1 : xEnd;
    yEnd   = yEnd > imgHeight - 1 ? imgHeight - 1 : yEnd;

    // Keep the running minimum in a local and store it once at the end,
    // instead of re-reading/re-writing global memory on every comparison.
    unsigned char minVal = dataIn[yIndex * imgWidth + xIndex];
    for (int y = yBegin; y <= yEnd; y++)
    {
        for (int x = xBegin; x <= xEnd; x++)
        {
            unsigned char v = dataIn[y * imgWidth + x];
            if (v < minVal)
            {
                minVal = v;
            }
        }
    }
    dataOut[yIndex * imgWidth + xIndex] = minVal;
}

// Dilation with a rectangular structuring element.
//
// Same launch layout, memory layout and window rules as erodeInCuda; each
// output pixel is the maximum of the input over the clipped element window.
__global__ void dilateInCuda(unsigned char *dataIn, unsigned char *dataOut, Size dilateElement, int imgWidth, int imgHeight)
{
    // Global pixel coordinates handled by this thread.
    int xIndex = threadIdx.x + blockIdx.x * blockDim.x;
    int yIndex = threadIdx.y + blockIdx.y * blockDim.y;

    // Surplus threads of the rounded-up grid do nothing.
    if (xIndex >= imgWidth || yIndex >= imgHeight)
    {
        return;
    }

    int halfEW = dilateElement.width / 2;
    int halfEH = dilateElement.height / 2;

    // Window bounds, clipped to the image.
    int xBegin = xIndex - halfEW;
    int xEnd   = xIndex + (dilateElement.width - 1 - halfEW);
    int yBegin = yIndex - halfEH;
    int yEnd   = yIndex + (dilateElement.height - 1 - halfEH);
    xBegin = xBegin < 0 ? 0 : xBegin;
    yBegin = yBegin < 0 ? 0 : yBegin;
    xEnd   = xEnd > imgWidth - 1 ? imgWidth - 1 : xEnd;
    yEnd   = yEnd > imgHeight - 1 ? imgHeight - 1 : yEnd;

    // Running maximum kept in a local, stored once.
    unsigned char maxVal = dataIn[yIndex * imgWidth + xIndex];
    for (int y = yBegin; y <= yEnd; y++)
    {
        for (int x = xBegin; x <= xEnd; x++)
        {
            unsigned char v = dataIn[y * imgWidth + x];
            if (v > maxVal)
            {
                maxVal = v;
            }
        }
    }
    dataOut[yIndex * imgWidth + xIndex] = maxVal;
}

// Loads "1.jpg" as grayscale, runs erosion and dilation on the GPU, runs the
// same operations with OpenCV on the CPU, and reports how many pixels differ.
// Returns 0 when both GPU results equal the OpenCV results, non-zero otherwise.
int main()
{
    // Flag 0 makes imread return a single-channel 8-bit image.
    Mat grayImg = imread("1.jpg", 0);
    if (grayImg.empty())
    {
        cerr << "Failed to load image 1.jpg" << endl;
        return EXIT_FAILURE;
    }
    // The kernels assume tightly packed rows.
    if (!grayImg.isContinuous())
    {
        grayImg = grayImg.clone();
    }

    int imgWidth = grayImg.cols;
    int imgHeight = grayImg.rows;
    size_t imgBytes = (size_t)imgWidth * (size_t)imgHeight * sizeof(unsigned char);

    // Host buffers receiving the GPU results.
    Mat dstImg1(imgHeight, imgWidth, CV_8UC1, Scalar(0)); // eroded image
    Mat dstImg2(imgHeight, imgWidth, CV_8UC1, Scalar(0)); // dilated image

    // Device buffers: input, eroded output, dilated output.
    unsigned char *d_in = NULL;
    unsigned char *d_out1 = NULL;
    unsigned char *d_out2 = NULL;
    CUDA_CHECK(cudaMalloc((void**)&d_in, imgBytes));
    CUDA_CHECK(cudaMalloc((void**)&d_out1, imgBytes));
    CUDA_CHECK(cudaMalloc((void**)&d_out2, imgBytes));

    // Upload the input image.
    CUDA_CHECK(cudaMemcpy(d_in, grayImg.data, imgBytes, cudaMemcpyHostToDevice));

    // 32x32 threads per block; enough blocks to cover the image (ceil-div).
    dim3 threadsPerBlock(32, 32);
    dim3 blocksPerGrid((imgWidth + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (imgHeight + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // Structuring element size (width 3, height 5), shared by GPU and CPU paths.
    Size Element(3, 5);
    Mat element = getStructuringElement(MORPH_RECT, Element);

    // GPU erosion. The blocking cudaMemcpy also surfaces in-kernel faults.
    erodeInCuda<<<blocksPerGrid, threadsPerBlock>>>(d_in, d_out1, Element, imgWidth, imgHeight);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpy(dstImg1.data, d_out1, imgBytes, cudaMemcpyDeviceToHost));

    // CPU erosion (OpenCV reference).
    Mat erodeImg;
    erode(grayImg, erodeImg, element);

    // GPU dilation.
    dilateInCuda<<<blocksPerGrid, threadsPerBlock>>>(d_in, d_out2, Element, imgWidth, imgHeight);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpy(dstImg2.data, d_out2, imgBytes, cudaMemcpyDeviceToHost));

    // CPU dilation (OpenCV reference).
    Mat dilateImg;
    dilate(grayImg, dilateImg, element);

    // Release device memory.
    CUDA_CHECK(cudaFree(d_in));
    CUDA_CHECK(cudaFree(d_out1));
    CUDA_CHECK(cudaFree(d_out2));

    // Compare GPU results against the OpenCV references pixel by pixel.
    int erodeDiff = countNonZero(dstImg1 != erodeImg);
    int dilateDiff = countNonZero(dstImg2 != dilateImg);
    cout << "erode mismatching pixels: " << erodeDiff << endl;
    cout << "dilate mismatching pixels: " << dilateDiff << endl;

    return (erodeDiff == 0 && dilateDiff == 0) ? EXIT_SUCCESS : EXIT_FAILURE;
}
實現結果
原灰度圖

腐蝕後圖片

膨脹後圖片

通過比對發現CUDA輸出結果與OpenCV輸出結果一致~
推薦閱讀:
※【源碼眾讀】之問題解答,cpp_Part_3
※C++實現神經網路之四—神經網路的預測和輸入輸出的解析
※《C++ Primer》讀書筆記-第十一章 03 關聯容器操作
※當刷題遇見吐槽和涼席椅墊,完美!
※15個C++項目列表
