将两个数组进行加和后赋给另外一个数组,这是CUDA中自带的例程
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);
__global__ void addKernel(int *c, const int *a, const int *b)
{
int i = threadIdx.x;
c[i] = a[i] + b[i];
}
__device__ int getGlobalIdx_1d_1d() {
return blockIdx.x * blockDim.x + threadIdx.x;
}
int main()
{
const int arraySize = 5;
const int a[arraySize] = { 1, 2, 3, 4, 5 };
const int b[arraySize] = { 10, 20, 30, 40, 50 };
int c[arraySize] = { 0 };
// Add vectors in parallel.
cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addWithCuda failed!");
return 1;
}
printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
c[0], c[1], c[2], c[3], c[4]);
// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
}
return 0;
}
// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
int *dev_a = 0;
int *dev_b = 0;
int *dev_c = 0;
cudaError_t cudaStatus;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
// Launch a kernel on the GPU with one thread for each element.
addKernel<<<1, size>>>(dev_c, dev_a, dev_b);
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
goto Error;
}
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
Error:
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
return cudaStatus;
}
下面我们对这个程序进行说明:
cuda_runtime.h
和device_launch_parameters.h
是调用CUDA的API必须包含的;global__` 和 `__device
在前面的文章中讲过,不再赘述;addKernel
函数中,使用了threadIdx.x
,这是将Block中的线程按一维排列进行计算;包括getGloablIdx_1d_1d
函数中的blockIdx.x
blockDim.x
我们后面会在后面详细讲到,这里先记住;main
函数中,先定义了两个数组,在addWitCuda
中完成两个数组的加法运算;cudaSetDevice
是用来选择GPU的API,由于我这里只有一个GPU,因此设置为0;cudaMalloc
函数为是三个数组在GPU上分配空间,这个函数跟C
中的malloc
函数很像,但这个是指在GPU(即显存)中分配一块空间,那参数值中为什么是两个*
呢?我们先来看这个函数的原型:cudaError_t cudaMalloc (void **devPtr, size_t size );
所有的CUDA API返回值都是CUDA中定义的一个错误代码,这种返回值的方式也是我们在写程序中经常用到的。这也意味着我们如果想得到某个结果,只能通过参数引用的方式,而我们定义的dev_a
本来就是指针,又加了个&
,所以前面是两个*
。
cudaMemcpy
函数将CPU上的数组拷贝到GPU上,可以看到该函数的参数中有一个是cudaMemcpyHostToDevice
;addKernel<<<1, size>>>()
中的<<<1, size>>>
表示线程的索引方式,具体可参考另一篇文章《CUDA核函数与线程索引方式》在学习过程中,如果遇到怎么都不能理解的东西,可以先把这种用法记住,等后面写的多了,自然也就理解了
上面这个程序是学习CUDA最开始接触的程序,就跟刚开始学习任何一门编程语言时,第一个例子是“Hello World”一样,我们在上面的程序中加入计时功能,看下在GPU中执行向量加法需要多长时间,再跟CPU的执行时间做对比。
在addWithCuda
函数中加入计时功能:
// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda(int *c, int *a, int *b, unsigned int size)
{
int *dev_a = 0;
int *dev_b = 0;
int *dev_c = 0;
cudaError_t cudaStatus;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
// start timing
float milliseconds = 0;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
// Launch a kernel on the GPU with one thread for each element.
addKernel<<<1, 1000>>>(dev_c, dev_a, dev_b);
// finish timing
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&milliseconds, start, stop);
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
goto Error;
}
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
// performace
std::cout << "Time used on GPU: " << milliseconds << "ms" << std::endl;
Error:
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
return cudaStatus;
}
CPU计算向量和的代码,使用最原始的方法,跟GPU代码一样,都不做任何优化。
#include <iostream>
#include <Windows.h>
static double now() {
LARGE_INTEGER freq;
LARGE_INTEGER counter;
QueryPerformanceFrequency(&freq);
QueryPerformanceCounter(&counter);
return counter.QuadPart / double(freq.QuadPart);
}
int main() {
const int arraySize = 100000000;
int *a = new int[arraySize];
int *b = new int[arraySize];
int *c = new int[arraySize];
// set initial values
for (int i = 0; i < arraySize; ++i) {
a[i] = i;
b[i] = i + 10;
}
// calculate vector add
auto startTime = now();
for (int k = 0; k < arraySize; ++k) {
c[k] = a[k] + b[k];
}
auto endTime = now();
std::cout << "cpu calculate time:" << (endTime - startTime) << "s" << std::endl;
return 0;
}
在Release下执行结果如下:
Time used on GPU: 0.004192ms
cpu calculate time:0.158441s
可以看到,最简单的向量和程序,GPU中运行速度要比CPU快很多。
看到这里,可能很多同学有疑惑,觉得GPU的计时有问题,因为如果使用GPU计算的话,还要把数据先传到GPU,GPU处理完成后子再传回给CPU,这两个传输时间也应该算进去。如果把传输时间也算进去的话,要比只使用CPU计算慢,说明很多时间都花在了数据的传输上。后面,我们还会对GPU代码做一步步的优化。