首页
学习
活动
专区
工具
TVP
发布
社区首页 >专栏 >GPU并行计算之向量和

GPU并行计算之向量和

作者头像
猫叔Rex
发布2020-08-18 14:40:06
1.2K0
发布2020-08-18 14:40:06
举报
文章被收录于专栏:科学计算科学计算

  将两个数组进行加和后赋给另外一个数组,这是CUDA中自带的例程

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);

__global__ void addKernel(int *c, const int *a, const int *b)
{
    int i = threadIdx.x;
    c[i] = a[i] + b[i];
}


__device__ int getGlobalIdx_1d_1d() {
    return blockIdx.x * blockDim.x + threadIdx.x;
}


int main()
{
    const int arraySize = 5;
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };

    // Add vectors in parallel.
    cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
        c[0], c[1], c[2], c[3], c[4]);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output)    .
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}

下面我们对这个程序进行说明:

  • 最开始的两个头文件 cuda_runtime.hdevice_launch_parameters.h是调用CUDA的API必须包含的;
  • global__` 和 `__device在前面的文章中讲过,不再赘述;
  • addKernel函数中,使用了threadIdx.x,这是将Block中的线程按一维排列进行计算;包括getGloablIdx_1d_1d函数中的blockIdx.x blockDim.x我们后面会在后面详细讲到,这里先记住;
  • main函数中,先定义了两个数组,在addWitCuda中完成两个数组的加法运算;
  • cudaSetDevice是用来选择GPU的API,由于我这里只有一个GPU,因此设置为0;
  • 使用cudaMalloc函数为是三个数组在GPU上分配空间,这个函数跟C中的malloc函数很像,但这个是指在GPU(即显存)中分配一块空间,那参数值中为什么是两个*呢?我们先来看这个函数的原型:
cudaError_t cudaMalloc (void **devPtr, size_t  size ); 

所有的CUDA API返回值都是CUDA中定义的一个错误代码,这种返回值的方式也是我们在写程序中经常用到的。这也意味着我们如果想得到某个结果,只能通过参数引用的方式,而我们定义的dev_a本来就是指针,又加了个&,所以前面是两个*

  • 使用cudaMemcpy函数将CPU上的数组拷贝到GPU上,可以看到该函数的参数中有一个是cudaMemcpyHostToDevice
  • addKernel<<<1, size>>>()中的<<<1, size>>>表示线程的索引方式,具体可参考另一篇文章《CUDA核函数与线程索引方式》

在学习过程中,如果遇到怎么都不能理解的东西,可以先把这种用法记住,等后面写的多了,自然也就理解了

  上面这个程序是学习CUDA最开始接触的程序,就跟刚开始学习任何一门编程语言时,第一个例子是“Hello World”一样,我们在上面的程序中加入计时功能,看下在GPU中执行向量加法需要多长时间,再跟CPU的执行时间做对比。

addWithCuda函数中加入计时功能:

// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda(int *c, int *a, int *b, unsigned int size)
{
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output)    .
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // start timing
    float milliseconds = 0;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, 1000>>>(dev_c, dev_a, dev_b);

    // finish timing
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&milliseconds, start, stop);

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // performace
    std::cout << "Time used on GPU: " << milliseconds << "ms" << std::endl;

Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}

CPU计算向量和的代码,使用最原始的方法,跟GPU代码一样,都不做任何优化。

#include <iostream>
#include <Windows.h>

static double now() {
    LARGE_INTEGER freq;
    LARGE_INTEGER counter;
    QueryPerformanceFrequency(&freq);
    QueryPerformanceCounter(&counter);
    return counter.QuadPart / double(freq.QuadPart);
}


int main() {
    const int arraySize = 100000000;
    int *a = new int[arraySize];
    int *b = new int[arraySize];
    int *c = new int[arraySize];

    // set initial values
    for (int i = 0; i < arraySize; ++i) {
        a[i] = i;
        b[i] = i + 10;
    }

    // calculate vector add 
    auto startTime = now();
    for (int k = 0; k < arraySize; ++k) {
        c[k] = a[k] + b[k];
    }
    auto endTime = now();
    std::cout << "cpu calculate time:" << (endTime - startTime) << "s" << std::endl;

    return 0;
}

在Release下执行结果如下:

Time used on GPU: 0.004192ms

cpu calculate time:0.158441s

可以看到,最简单的向量和程序,GPU中运行速度要比CPU快很多。

  看到这里,可能很多同学有疑惑,觉得GPU的计时有问题,因为如果使用GPU计算的话,还要把数据先传到GPU,GPU处理完成后子再传回给CPU,这两个传输时间也应该算进去。如果把传输时间也算进去的话,要比只使用CPU计算慢,说明很多时间都花在了数据的传输上。后面,我们还会对GPU代码做一步步的优化。

本文参与 腾讯云自媒体分享计划,分享自微信公众号。
原始发表:2020-08-17,如有侵权请联系 cloudcommunity@tencent.com 删除

本文分享自 傅里叶的猫 微信公众号,前往查看

如有侵权,请联系 cloudcommunity@tencent.com 删除。

本文参与 腾讯云自媒体分享计划  ,欢迎热爱写作的你一起参与!

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档