CUDA纹理内存带有专用的只读缓存,并针对二维空间局部性的访问做了优化;当相邻线程读取相邻像素时,它通常比直接访问全局内存更快。因此处理图像数据时,使用纹理内存是一个提升性能的好方法。
下面贴一段自己写的简单代码,使用纹理内存实现两幅图像的加权和。
输入:两幅图 lena, moon
输出:两幅图像的加权和(代码中两幅图的权重均为 0.5)
1 #include <opencv2\opencv.hpp>
2 #include <iostream>
3 #include <string>
4 #include <cuda.h>
5 #include <cuda_runtime.h>
6 #include <device_launch_parameters.h>
7
8 using namespace std;
9 using namespace cv;
10
11 // 2D texture references, one per input image: texels are uchar4 and are returned to the kernel as normalized floats (cudaReadModeNormalizedFloat).
12 texture <uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> refTex1;
13 texture <uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> refTex2;
14 // CUDA arrays that hold the image data the textures are bound to.
15 cudaArray* cuArray1;
16 cudaArray* cuArray2;
17 // Channel format descriptor for uchar4 texels (not a channel count); used when allocating the CUDA arrays.
18 cudaChannelFormatDesc cuDesc = cudaCreateChannelDesc<uchar4>();
19
20
// Blends two normalized texture samples and converts the result to an 8-bit value.
// The blend is clamped to [0, 1] and rounded to nearest, so blending two equal
// inputs reproduces the input instead of occasionally landing one level below it.
static __device__ __forceinline__ uchar blendToByte(float a, float b, float alpha)
{
    const float mixed = __saturatef(alpha * a + (1.0f - alpha) * b);
    return static_cast<uchar>(mixed * 255.0f + 0.5f);
}

/**
 * Writes the equal-weight (alpha = 0.5) blend of the images bound to refTex1
 * and refTex2 into an interleaved 8-bit output buffer.
 *
 * Launch layout: 2D grid of 2D blocks, one thread per output pixel. Threads
 * that fall outside the image do nothing, so the grid may be rounded up.
 * No shared memory is used.
 *
 * @param pDstImgData  Device buffer indexed as (y * imgWidth + x) * channels,
 *                     i.e. tightly packed rows with `channels` bytes per pixel.
 * @param imgHeight    Image height in pixels.
 * @param imgWidth     Image width in pixels.
 * @param channels     Bytes per output pixel. The first three receive the
 *                     blended x/y/z texture components; a fourth, if present,
 *                     is set to 0. Fewer than three are written when
 *                     channels < 3 so neighbouring pixels are never touched.
 */
__global__ void weightAddKerkel(uchar *pDstImgData, int imgHeight, int imgWidth, int channels)
{
    const int tidx = blockDim.x * blockIdx.x + threadIdx.x;
    const int tidy = blockDim.y * blockIdx.y + threadIdx.y;

    if (tidx >= imgWidth || tidy >= imgHeight)
        return;

    // Sample at the texel centre. With unnormalized coordinates, (x + 0.5, y + 0.5)
    // returns exactly texel (x, y) under both point and linear filtering, whereas
    // integer coordinates under linear filtering average a 2x2 neighbourhood.
    const float u = tidx + 0.5f;
    const float v = tidy + 0.5f;
    const float4 lenaBGR = tex2D(refTex1, u, v);
    const float4 moonBGR = tex2D(refTex2, u, v);

    const float alpha = 0.5f;

    // size_t arithmetic keeps the byte offset from overflowing on large images.
    const size_t idx = (static_cast<size_t>(tidy) * imgWidth + tidx) * channels;
    uchar *pixel = pDstImgData + idx;

    if (channels > 0) pixel[0] = blendToByte(lenaBGR.x, moonBGR.x, alpha);
    if (channels > 1) pixel[1] = blendToByte(lenaBGR.y, moonBGR.y, alpha);
    if (channels > 2) pixel[2] = blendToByte(lenaBGR.z, moonBGR.z, alpha);
    if (channels > 3) pixel[3] = 0;
}
41
// Texture reference type shared by both input images.
typedef texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> ImageTexture;

// Reports a failed CUDA runtime call on stderr; returns true when the call succeeded.
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Allocates a CUDA array for a 4-channel 8-bit image, uploads the pixels and
// binds the array to the given texture reference. Returns false on any failure;
// whatever was allocated is left in `arr` for the caller to free.
static bool uploadToTexture(ImageTexture &tex, cudaArray *&arr, const Mat &img)
{
    tex.addressMode[0] = cudaAddressModeClamp;
    tex.addressMode[1] = cudaAddressModeClamp;
    tex.normalized = false;
    // Point filtering: each fetch returns exactly one texel, which is what a
    // per-pixel blend needs. Linear filtering would mix neighbouring texels.
    tex.filterMode = cudaFilterModePoint;

    if (!cudaOk(cudaMallocArray(&arr, &cuDesc, img.cols, img.rows), "cudaMallocArray"))
        return false;

    // The 2D copy honours the Mat row stride (img.step), so it is also correct
    // for images whose rows are padded or that are views into a larger Mat.
    const size_t rowBytes = static_cast<size_t>(img.cols) * img.elemSize();
    if (!cudaOk(cudaMemcpy2DToArray(arr, 0, 0, img.data, img.step, rowBytes, img.rows,
                                    cudaMemcpyHostToDevice),
                "cudaMemcpy2DToArray"))
        return false;

    return cudaOk(cudaBindTextureToArray(tex, arr), "cudaBindTextureToArray");
}

/**
 * Loads data/lena.jpg and data/moon.jpg, blends them on the GPU through
 * texture memory and shows the result in a window.
 *
 * @return 0 on success, 1 if an image cannot be loaded or a CUDA call fails.
 */
int main()
{
    Mat Lena = imread("data/lena.jpg");
    Mat moon = imread("data/moon.jpg");
    if (Lena.empty() || moon.empty())
    {
        std::cerr << "Failed to load data/lena.jpg or data/moon.jpg" << std::endl;
        return 1;
    }

    // Both textures are sampled with the same pixel coordinates, so the second
    // image must have the same dimensions as the first.
    if (moon.size() != Lena.size())
        resize(moon, moon, Lena.size());

    // uchar4 texels need four channels per pixel.
    cvtColor(Lena, Lena, CV_BGR2BGRA);
    cvtColor(moon, moon, CV_BGR2BGRA);
    const int imgWidth = Lena.cols;
    const int imgHeight = Lena.rows;
    const int channels = Lena.channels();
    const size_t dstBytes = static_cast<size_t>(imgWidth) * imgHeight * channels * sizeof(uchar);

    // Output image (host) and its device-side buffer.
    Mat dstImg = Mat::zeros(imgHeight, imgWidth, CV_8UC4);
    uchar *pDstImgData = NULL;

    bool ok = uploadToTexture(refTex1, cuArray1, Lena)
           && uploadToTexture(refTex2, cuArray2, moon)
           && cudaOk(cudaMalloc(&pDstImgData, dstBytes), "cudaMalloc");

    if (ok)
    {
        // One thread per pixel; the grid is rounded up and the kernel bounds-checks.
        dim3 block(8, 8);
        dim3 grid((imgWidth + block.x - 1) / block.x, (imgHeight + block.y - 1) / block.y);
        weightAddKerkel<<<grid, block>>>(pDstImgData, imgHeight, imgWidth, channels);

        // A launch reports configuration errors via cudaGetLastError and
        // execution errors at the next synchronizing call.
        ok = cudaOk(cudaGetLastError(), "weightAddKerkel launch")
          && cudaOk(cudaDeviceSynchronize(), "weightAddKerkel execution")
          && cudaOk(cudaMemcpy(dstImg.data, pDstImgData, dstBytes, cudaMemcpyDeviceToHost),
                    "cudaMemcpy (device to host)");
    }

    // Release GPU resources on both the success and the failure path.
    // Freeing a null pointer/array is a no-op.
    cudaUnbindTexture(refTex1);
    cudaUnbindTexture(refTex2);
    cudaFreeArray(cuArray1);
    cudaFreeArray(cuArray2);
    cuArray1 = NULL;
    cuArray2 = NULL;
    cudaFree(pDstImgData);

    if (!ok)
        return 1;

    // Display the blended image until a key is pressed.
    namedWindow("show");
    imshow("show", dstImg);
    waitKey(0);
    return 0;
}