没想到我 2010 年买的笔记本显卡 GT330M 竟然还能跑 CUDA,果断小试了一把:环境为 CUDA 6.5 + VS2012,用积分图实现了一个均值滤波,效果类似于 OpenCV 的 blur() 函数。
使用lena.jpg做测试,效果如下:
代码在此:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <opencv2\opencv.hpp>
using namespace std;
using namespace cv;
// In-place inclusive prefix sum down every image column, applied to both
// the float value plane and the int count plane.
//
// Despite the name, each thread owns one column: the column index comes from
// the x launch dimension only, and the thread walks that column from row 1 to
// row imgH-1, so element (row, col) ends up holding the sum of the original
// elements (0..row, col). Both buffers are indexed as row * imgW + col.
//
// Launch layout: 1-D in x with at least imgW threads in total; the y and z
// dimensions are not consulted.
//
//   pIntegImgLena  device buffer of imgW * imgH floats, updated in place
//   pPtsImg        device buffer of imgW * imgH ints, updated in place
//   imgW, imgH     image width and height in elements
__global__ void rowAddKernel(float* pIntegImgLena,int* pPtsImg,int imgW,int imgH)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (col >= imgW || imgH < 1)
    {
        return;  // surplus thread in the last block, or nothing to accumulate
    }

    // Carry the running totals in registers; they always equal the value
    // most recently stored for the previous row of this column.
    float valueAcc = pIntegImgLena[col];
    int countAcc = pPtsImg[col];

    int offset = col + imgW;  // flat index of (row 1, col)
    for (int row = 1; row < imgH; ++row, offset += imgW)
    {
        valueAcc += pIntegImgLena[offset];
        countAcc += pPtsImg[offset];
        pIntegImgLena[offset] = valueAcc;
        pPtsImg[offset] = countAcc;
    }
}
// In-place inclusive prefix sum along every image row, applied to both the
// float value plane and the int count plane.
//
// Despite the name, each thread owns one row: the row index comes from the
// y launch dimension only, and the thread sweeps that row from column 1 to
// column imgW-1, so element (row, col) ends up holding the sum of the
// original elements (row, 0..col). Both buffers are indexed as
// row * imgW + col, which means neighbouring threads touch addresses that
// are imgW elements apart.
//
// Launch layout: 1-D in y with at least imgH threads in total; the x and z
// dimensions are not consulted.
//
//   pIntegImgLena  device buffer of imgW * imgH floats, updated in place
//   pPtsImg        device buffer of imgW * imgH ints, updated in place
//   imgW, imgH     image width and height in elements
__global__ void colAddKernel(float* pIntegImgLena,int* pPtsImg,int imgW,int imgH)
{
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (row >= imgH || imgW < 1)
    {
        return;  // surplus thread in the last block, or nothing to accumulate
    }

    float* const valueRow = pIntegImgLena + row * imgW;
    int* const countRow = pPtsImg + row * imgW;

    // Running totals mirror the value last written to the previous column.
    float valueAcc = valueRow[0];
    int countAcc = countRow[0];
    for (int col = 1; col < imgW; ++col)
    {
        valueAcc += valueRow[col];
        countAcc += countRow[col];
        valueRow[col] = valueAcc;
        countRow[col] = countAcc;
    }
}
// Box (mean) filter evaluated from two summed-area tables.
//
// One thread produces one output pixel (x from the x launch dimension, y from
// the y launch dimension). The window is the (2*win+1) x (2*win+1) square
// centred on the pixel, clipped to the image, and the output is
// (sum of values in the window) / (sum of counts in the window), truncated to
// an 8-bit value.
//
// Both tables must be INCLUSIVE 2-D prefix sums indexed as row * imgW + col,
// i.e. entry (r, c) holds the total over rows 0..r and columns 0..c. The sum
// over the inclusive rectangle [top..bot] x [left..right] is therefore
//     T(bot, right) - T(top-1, right) - T(bot, left-1) + T(top-1, left-1)
// where any term with index -1 is zero. Reading the table at (top, left)
// instead of (top-1, left-1) would silently drop the window's first row and
// column and make the divisor zero for win == 0.
//
// Launch layout: 2-D, at least imgW threads in x and imgH threads in y.
//
//   pImgLena       output, imgW * imgH bytes
//   pIntegImgLena  summed-area table of pixel values (read only here)
//   pPtsImg        summed-area table of per-pixel counts (read only here)
//   imgW, imgH     image width and height in pixels
//   win            window half-width; negative values are treated as 0
__global__ void filterKernel(uchar* pImgLena,float* pIntegImgLena,int* pPtsImg,int imgW,int imgH,int win)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= imgW || y >= imgH)
    {
        return;
    }

    // A radius beyond the image size selects the whole image anyway; clamping
    // keeps x + radius / y + radius from overflowing and rejects negatives.
    const int radius = min(max(win, 0), max(imgW, imgH));

    // Inclusive far corner of the clipped window.
    const int right = min(x + radius, imgW - 1);
    const int bot = min(y + radius, imgH - 1);
    // Exclusive near corner: one before the first row/column of the window;
    // -1 means the window touches the image border and nothing is subtracted.
    const int left = max(x - radius, 0) - 1;
    const int top = max(y - radius, 0) - 1;

    const int botRow = bot * imgW;
    float sum = pIntegImgLena[botRow + right];
    int cnt = pPtsImg[botRow + right];

    if (left >= 0)
    {
        sum -= pIntegImgLena[botRow + left];
        cnt -= pPtsImg[botRow + left];
    }
    if (top >= 0)
    {
        const int topRow = top * imgW;
        sum -= pIntegImgLena[topRow + right];
        cnt -= pPtsImg[topRow + right];
        if (left >= 0)
        {
            // This corner was subtracted twice above; add it back once.
            sum += pIntegImgLena[topRow + left];
            cnt += pPtsImg[topRow + left];
        }
    }

    // cnt can only be non-positive if the count plane contains zeros; emit 0
    // rather than dividing by zero.
    float value = (cnt > 0) ? (sum / cnt) : 0.0f;
    // Keep the float -> uchar conversion inside the representable range.
    value = fminf(fmaxf(value, 0.0f), 255.0f);
    pImgLena[y * imgW + x] = (uchar)value;
}
// Prints a diagnostic for a failed CUDA runtime call; returns true on success.
static bool cudaOk(cudaError_t err, const char* what)
{
    if (err == cudaSuccess)
    {
        return true;
    }
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
    return false;
}

// Checks the launch that was just issued, then waits for it so that faults
// raised while the kernel runs are attributed to the right stage.
static bool kernelOk(const char* what)
{
    return cudaOk(cudaGetLastError(), what) &&
           cudaOk(cudaDeviceSynchronize(), what);
}

// Runs the three GPU stages (two prefix-sum passes, then the filter) on an
// 8-bit single-channel image and stores the result in dstGray.
// win is forwarded to filterKernel as the window half-width.
// Returns false, after printing a diagnostic, if any CUDA call fails.
static bool boxFilterOnGpu(const Mat& srcGray, Mat& dstGray, int win)
{
    const int imgH = srcGray.rows;
    const int imgW = srcGray.cols;
    // Widen before multiplying so the byte counts below cannot overflow int.
    const size_t numPixels = (size_t)imgH * (size_t)imgW;
    if (numPixels == 0)
    {
        fprintf(stderr, "boxFilterOnGpu: empty input image\n");
        return false;
    }

    // Host-side seeds for the two tables: the pixel values as float and a
    // count of 1 per pixel. convertTo/create allocate fresh continuous
    // buffers, so each can be uploaded with a single flat copy.
    Mat integImgLena;
    srcGray.convertTo(integImgLena, CV_32FC1);
    Mat ptsImg(srcGray.size(), CV_32SC1, Scalar(1));
    dstGray.create(srcGray.size(), CV_8UC1);

    uchar* pImgLena = NULL;       // filtered output (written by filterKernel)
    float* pIntegImgLena = NULL;  // value table, accumulated in place
    int* pPtsImg = NULL;          // count table, accumulated in place

    bool ok =
        cudaOk(cudaMalloc(&pImgLena, numPixels * sizeof(uchar)), "cudaMalloc output") &&
        cudaOk(cudaMalloc(&pIntegImgLena, numPixels * sizeof(float)), "cudaMalloc value table") &&
        cudaOk(cudaMalloc(&pPtsImg, numPixels * sizeof(int)), "cudaMalloc count table") &&
        // The output buffer is fully overwritten by filterKernel, so only the
        // two tables need to be uploaded.
        cudaOk(cudaMemcpy(pIntegImgLena, integImgLena.data, numPixels * sizeof(float),
                          cudaMemcpyHostToDevice), "upload value table") &&
        cudaOk(cudaMemcpy(pPtsImg, ptsImg.data, numPixels * sizeof(int),
                          cudaMemcpyHostToDevice), "upload count table");

    // Whole-warp block sizes; 8-thread blocks leave most of each warp idle.
    const unsigned int kThreads1D = 128;
    const unsigned int kThreads2D = 16;

    if (ok)
    {
        // Stage 1: one thread per column (x dimension).
        const dim3 block(kThreads1D, 1);
        const dim3 grid((imgW + block.x - 1) / block.x, 1);
        rowAddKernel<<<grid, block>>>(pIntegImgLena, pPtsImg, imgW, imgH);
        ok = kernelOk("rowAddKernel");
    }
    if (ok)
    {
        // Stage 2: one thread per row (y dimension).
        const dim3 block(1, kThreads1D);
        const dim3 grid(1, (imgH + block.y - 1) / block.y);
        colAddKernel<<<grid, block>>>(pIntegImgLena, pPtsImg, imgW, imgH);
        ok = kernelOk("colAddKernel");
    }
    if (ok)
    {
        // Stage 3: one thread per output pixel.
        const dim3 block(kThreads2D, kThreads2D);
        const dim3 grid((imgW + block.x - 1) / block.x, (imgH + block.y - 1) / block.y);
        filterKernel<<<grid, block>>>(pImgLena, pIntegImgLena, pPtsImg, imgW, imgH, win);
        ok = kernelOk("filterKernel");
    }
    if (ok)
    {
        ok = cudaOk(cudaMemcpy(dstGray.data, pImgLena, numPixels * sizeof(uchar),
                               cudaMemcpyDeviceToHost), "download result");
    }

    // cudaFree accepts NULL, so this is safe after a partial allocation.
    cudaFree(pImgLena);
    cudaFree(pIntegImgLena);
    cudaFree(pPtsImg);
    return ok;
}

// Loads data/lena.jpg as grayscale, shows it, mean-filters it on the GPU and
// shows the result. Returns 0 on success and 1 if the image cannot be loaded
// or any CUDA call fails.
int main()
{
    // Load the source image; flag 0 requests a single-channel grayscale image.
    string imgPath="data/lena.jpg";
    Mat imgLena=imread(imgPath, 0);
    if (imgLena.empty())
    {
        fprintf(stderr, "failed to load image: %s\n", imgPath.c_str());
        return 1;
    }

    namedWindow("lena");
    imshow("lena", imgLena);
    waitKey(0);

    // Filter on the GPU.
    const int win=3;
    Mat filterLena;
    if (!boxFilterOnGpu(imgLena, filterLena, win))
    {
        return 1;
    }

    namedWindow("filterLena");
    imshow("filterLena", filterLena);
    waitKey(0);
    return 0;
}