前往小程序,Get更优阅读体验!
立即前往
发布
社区首页 >专栏 >cuda runtime/driver API解析

cuda runtime/driver API解析

原创
作者头像
tankaro
发布2025-03-04 20:21:22
发布2025-03-04 20:21:22
1500
代码可运行
举报
文章被收录于专栏:NCCLcuda
运行总次数:0
代码可运行

环境

硬件: NVIDIA GeForce RTX 4060 Laptop GPU

软件:ubuntu22.04.3 cuda-12.4 nvcc 12.4.131

cuda API分类

  1. 种类 CUDA提供了三种不同的API:Runtime API、Driver API和Libraries-API。
  2. 说明 Libraries-API由Runtime API封装而来,Runtime API由Driver API封装而来;Driver API调用KMD内核层,如下图所示。

runtime API example

最简单的记忆方法:相关API以cuda开头。

  1. 最佳参考代码是NV提供的samples代码,链接如下: https://github.com/NVIDIA/cuda-samples.git
  2. 简单示例 Makefile如下代码,注意sm_89确定方法,注意链接动态库是cudart
代码语言:txt
复制
# Makefile for the *runtime* API example.
# Fix: the original built driver_api_* sources and a PTX file that the
# runtime example never loads, and linked -lcuda which only the Driver API
# needs. The run log shows the binary is built from runtime_api_example.cu.
#
# -arch=sm_89: compute capability of the RTX 4060 (Ada); query yours with
# `nvidia-smi --query-gpu=compute_cap --format=csv` or the deviceQuery sample.
all: hello

runtime_api_example.o: runtime_api_example.cu
	nvcc runtime_api_example.cu -c -arch=sm_89 -o runtime_api_example.o -I./

# Runtime API programs only need the CUDA runtime library (-lcudart).
hello: runtime_api_example.o
	g++ $^ -o hello -L /usr/local/cuda/lib64 -lcudart

.PHONY: clean
clean:
	-rm -rf hello *.o *.ptx

runtime_api_example.h

代码语言:h
复制
#ifndef __RUNTIME_API_H__
#define __RUNTIME_API_H__

/* Number of elements in the vector that the example kernel operates on. */
#define MAX_NUM (16)

#endif /* __RUNTIME_API_H__ */

runtime_api_example.cu

代码语言:cu
复制
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <stdio.h>

// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>

#include "driver_api_example.h"

/**
 * CUDA Kernel Device code
 *
 * Increments each of the numElements entries of A in place by 1
 * (despite the file's "vector addition" banner, there is no B or C).
 * One element per thread over a flat 1-D grid; the i < numElements
 * guard discards the excess threads of the final (partial) block.
 */
__global__ void sum_self(int *A, int numElements)
{
    // Flat global thread index across the 1-D grid.
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < numElements)
    {
        A[i] = A[i] + 1;
    }
}
/* Print the first cnt elements of h_A, space-separated, then a newline. */
void show_h_A_value(int *h_A, int cnt)
{
    int i = 0;
    while (i < cnt) {
        printf("%d ", h_A[i]);
        ++i;
    }
    printf("\n");
}
/**
 * Host main routine for the Runtime API example.
 *
 * Allocates a host vector h_A[i] = i, copies it to the device, launches
 * sum_self to increment every element by 1, copies the result back and
 * prints it. Exits with EXIT_FAILURE on any CUDA error.
 */
int main(int argc, char *argv[])
{
    // Error code to check return values for CUDA calls.
    cudaError_t err = cudaSuccess;

    // Vector length and its size in bytes.
    int numElements = MAX_NUM;
    size_t size = numElements * sizeof(int);
    printf("[Vector addition of %d elements]\n", numElements);

    // Allocate and initialize the host input vector A.
    int *h_A = (int *)malloc(size);
    if (h_A == NULL) {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }
    for (int i = 0; i < numElements; ++i) {
        h_A[i] = i;
    }
    show_h_A_value(h_A, numElements);

    // Allocate the device input vector A.
    int *d_A = NULL;
    err = cudaMalloc((void **)&d_A, size);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n",
            cudaGetErrorString(err));
        free(h_A);  // release host memory on every error path (was leaked)
        exit(EXIT_FAILURE);
    }

    // Copy the host input vector to device memory.
    printf("Copy input data from the host memory to the CUDA device\n");
    err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        fprintf(stderr,
            "Failed to copy vector A from host to device (error code %s)!\n",
            cudaGetErrorString(err));
        cudaFree(d_A);
        free(h_A);
        exit(EXIT_FAILURE);
    }

    // Launch the kernel: ceil-div grid so the tail elements are covered;
    // the kernel's own bounds check discards the excess threads.
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid,
            threadsPerBlock);
    sum_self<<<blocksPerGrid, threadsPerBlock>>>(d_A, numElements);

    // cudaGetLastError catches launch-configuration errors; the synchronize
    // surfaces asynchronous execution errors from inside the kernel, which
    // would otherwise be mis-attributed to the next memcpy.
    err = cudaGetLastError();
    if (err == cudaSuccess) {
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to launch sum_self kernel (error code %s)!\n",
            cudaGetErrorString(err));
        cudaFree(d_A);
        free(h_A);
        exit(EXIT_FAILURE);
    }

    // Copy the device result back into host memory.
    printf("Copy output data from the CUDA device to the host memory\n");
    err = cudaMemcpy(h_A, d_A, size, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        // Was "vector C" -- this program only has a vector A.
        fprintf(stderr,
            "Failed to copy vector A from device to host (error code %s)!\n",
            cudaGetErrorString(err));
        cudaFree(d_A);
        free(h_A);
        exit(EXIT_FAILURE);
    }

    show_h_A_value(h_A, numElements);

    // Free device global memory.
    err = cudaFree(d_A);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to free device vector A (error code %s)!\n",
            cudaGetErrorString(err));
        free(h_A);
        exit(EXIT_FAILURE);
    }

    // Free host memory.
    free(h_A);
    printf("Done\n");
    return 0;
}
 

运行结果

代码语言:shell
复制
./runtime_api_example.elf
[Vector addition of 16 elements]
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
Copy input data from the host memory to the CUDA device
CUDA kernel launch with 1 blocks of 256 threads
Copy output data from the CUDA device to the host memory
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
Done

driver API example

最简单的记忆方法:相关API以cu开头。

  1. 最佳参考代码是NV提供的 samples代码,链接如下:
代码语言:html
复制
https://github.com/NVIDIA/cuda-samples.git
  2. 简单示例 Makefile如下代码,注意sm_89确定方法;注意Driver API需要链接动态库cuda(-lcuda),本例同时还链接了cudart
代码语言:txt
复制
# The binary is named after the containing directory, e.g. foo/ -> foo.elf.
dir_name := $(notdir $(shell pwd))
TARGET   := $(dir_name).elf

all: $(TARGET) driver_api_kernel.ptx

# Device code is compiled to PTX and loaded at runtime by the host program.
%.ptx: %.cu
	nvcc $< -ptx -arch=sm_89 -o $@ -I./

# .cu objects go through nvcc ...
%.cu.o: %.cu
	nvcc $< -c -arch=sm_89 -o $@ -I./

# ... while plain .cpp host code only needs g++ plus the CUDA headers.
%.cpp.o: %.cpp
	g++ $< -c -o $@ -I /usr/local/cuda/include

# Link step: -lcuda for the Driver API, -lcudart for the runtime library.
$(TARGET): driver_api_kernel.cu.o driver_api_example.cpp.o
	g++ $^ -o $(TARGET) -lcuda -L /usr/local/cuda/lib64 -lcudart

.PHONY: clean
clean:
	-rm -rf $(TARGET) *.o *.ptx
 

driver_api_example.cpp

代码语言:cpp
代码运行次数:0
复制
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

 #include <stdio.h>
 #include <string.h>
 #include <iostream>
 #include <cstring>
 #include <cuda.h>
 
 // includes, project
 
 // includes, CUDA
 #include <builtin_types.h>
 #include "driver_api_example.h"

#ifndef checkCudaErrors
#define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__)

// Inline SDK-style helper: print a readable Driver API error and abort.
// Fix: cuGetErrorString sets *pStr to NULL when the error code is not
// recognized (returning CUDA_ERROR_INVALID_VALUE), and passing NULL to a
// "%s" conversion is undefined behavior -- fall back to a placeholder.
inline void __checkCudaErrors(CUresult err, const char *file, const int line) {
  if (CUDA_SUCCESS != err) {
    const char *errorStr = NULL;
    if (cuGetErrorString(err, &errorStr) != CUDA_SUCCESS || errorStr == NULL) {
      errorStr = "Unknown CUDA error";
    }
    fprintf(stderr,
            "checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, "
            "line %i.\n",
            err, errorStr, file, line);
    exit(EXIT_FAILURE);
  }
}
#endif
/* Print the first cnt elements of h_A, space-separated, then a newline. */
void show_h_A_value(int *h_A, int cnt)
{
    for (int *p = h_A; p < h_A + cnt; ++p) {
        printf("%d ", *p);
    }
    printf("\n");
}
/**
 * Host main routine
 */
int main(int argc, char *argv[])
{
    CUdevice cuDevice;
    CUcontext cuContext;
    // Error code to check return values for CUDA calls
    //cudaError_t err = cudaSuccess;
    // Print the vector length to be used, and compute its size
    int numElements = MAX_NUM;
    size_t size = numElements * sizeof(int);
    printf("[Vector addition of %d elements]\n", numElements);
    // Allocate the host input vector A
    int *h_A = (int *)malloc(size);
    // Verify that allocations succeeded
    if (h_A == NULL) {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }

    // Initialize the host input vectors
    for (int i = 0; i < numElements; ++i) {
        h_A[i] = i;
    }
    show_h_A_value(h_A, numElements);

    // Initialize
    int deviceCount = 0;
    checkCudaErrors(cuInit(0));
    checkCudaErrors(cuDeviceGetCount(&deviceCount));
    printf("devicecount=%d\n", deviceCount);

    checkCudaErrors(cuDeviceGet(&cuDevice, 0));
    char name[100];
    cuDeviceGetName(name, 100, cuDevice);
    printf("> Using device 0: %s\n", name);

    CUresult err1 = cuCtxCreate(&cuContext, 0, cuDevice);
    if (err1 != CUDA_SUCCESS) {
        fprintf(stderr, "* Error initializing the CUDA context.\n");
        cuCtxDestroy(cuContext);
        exit(-1);
    }
    
    CUdeviceptr d_a;
    //CUdeviceptr *d_a = &d_a1;
    checkCudaErrors( cuMemAlloc(&d_a, sizeof(int) * numElements) );
    checkCudaErrors(cuMemcpyHtoD(d_a, h_A, size));


    char       *module_file = (char*) "driver_api_kernel.ptx";
    char       *kernel_name = (char*) "sum_self";
    CUmodule   module;

    err1 = cuModuleLoad(&module, module_file);
    if (err1 != CUDA_SUCCESS) {
        fprintf(stderr, "* Error loading the module %s\n", module_file);
        cuCtxDestroy(cuContext);
        exit(-1);
    }

    CUfunction function;
    err1 = cuModuleGetFunction(&function, module, kernel_name);
 
    if (err1 != CUDA_SUCCESS) {
        fprintf(stderr, "* Error getting kernel function %s\n", kernel_name);
        cuCtxDestroy(cuContext);
        exit(-1);
    }

    void *args[3] = { &d_a};
    // grid for kernel: <<<MAX_NUM, 1>>>
    checkCudaErrors( cuLaunchKernel(function, MAX_NUM, 1, 1,  // Nx1x1 blocks
                                    1, 1, 1,            // 1x1x1 threads
                                    0, 0, args, 0) );
    checkCudaErrors(cuMemcpyDtoH(h_A, d_a, size));
    show_h_A_value(h_A, numElements);
    checkCudaErrors( cuMemFree(d_a) );

    checkCudaErrors(cuCtxDestroy(cuContext));

    // Free host memory
    free(h_A);
    printf("Done\n");
    return 0;
}
 

driver_api_example.h

代码语言:c
代码运行次数:0
复制
/* Shared constants for the Driver API example (host code and kernel). */
/*
 * Fix: the guard used to be __RUNTIME_API_H__, identical to the guard in
 * runtime_api_example.h, so including both headers in one translation unit
 * silently skipped the second. Also, identifiers starting with a double
 * underscore are reserved in C/C++; use a unique, non-reserved guard.
 */
#ifndef DRIVER_API_EXAMPLE_H
#define DRIVER_API_EXAMPLE_H

// number of the vectors to sum
#define MAX_NUM                     (16)
// value that the sum_self kernel adds to every element
#define ADD_VALUE                   (5)

#endif // DRIVER_API_EXAMPLE_H

driver_api_kernel.cu

代码语言:c
代码运行次数:0
复制
 #include "driver_api_example.h"

// Adds ADD_VALUE to each element of a, in place.
// Indexed by blockIdx.x only, so it assumes a <<<N, 1>>> launch (one thread
// per block) -- this matches the cuLaunchKernel(function, MAX_NUM,...,1,1,1,...)
// call in driver_api_example.cpp; a must hold at least MAX_NUM ints.
// extern "C" disables C++ name mangling so cuModuleGetFunction can find the
// symbol "sum_self" in the compiled PTX.
extern "C" __global__ void sum_self(int *a)
{
    int tid = blockIdx.x;
    if (tid < MAX_NUM)
        a[tid] = a[tid] + ADD_VALUE;
}

编译后生成driver_api_kernel.ptx文件(原文此处为PTX文件内容截图,此处从略)。

运行结果:

代码语言:shell
复制
./driver_api_example.elf
[Vector addition of 16 elements]
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
devicecount=1
> Using device 0: NVIDIA GeForce RTX 4060 Laptop GPU
5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
Done

参考链接

代码语言:html
复制
https://blog.csdn.net/eloudy/article/details/143242909
https://zhuanlan.zhihu.com/p/595143420
https://zhuanlan.zhihu.com/p/685978812

原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。

如有侵权,请联系 cloudcommunity@tencent.com 删除。

原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。

如有侵权,请联系 cloudcommunity@tencent.com 删除。

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
目录
  • 环境
  • cuda API分类
  • driver API example
  • 参考链接
领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档