硬件: NVIDIA GeForce RTX 4060 Laptop GPU
软件:ubuntu22.04.3 cuda-12.4 nvcc 12.4.131
# Default target: the demo binary plus the PTX module that the Driver API
# example loads at runtime.
all: hello driver_api_kernel.ptx
# Compile the kernel source to PTX (-arch=sm_89: Ada Lovelace, RTX 40xx).
driver_api_kernel.ptx: driver_api_kernel.cu
nvcc driver_api_kernel.cu -ptx -arch=sm_89 -o driver_api_kernel.ptx -I./
# Compile the host-side example with nvcc.
driver_api_example.o: driver_api_example.cu
nvcc driver_api_example.cu -c -arch=sm_89 -o driver_api_example.o -I./
# Link with g++: -lcuda is the Driver API, -lcudart the Runtime API.
hello: driver_api_example.o
g++ $^ -o hello -lcuda -L /usr/local/cuda/lib64 -lcudart
.PHONY: clean
# Leading '-' lets clean succeed even when the files do not exist.
clean:
-rm -rf hello *.o *.ptx
runtime_api_example.h
/*
 * Guard renamed from __RUNTIME_API_H__: identifiers containing a double
 * underscore are reserved in C/C++, and the old name collided with the
 * guard used by driver_api_example.h, so including both headers would
 * silently skip one of them.
 */
#ifndef RUNTIME_API_EXAMPLE_H
#define RUNTIME_API_EXAMPLE_H
// number of the vectors to sum
#define MAX_NUM (16)
#endif // RUNTIME_API_EXAMPLE_H
runtime_api_example.cu
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <stdio.h>
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
#include "driver_api_example.h"
/**
* CUDA Kernel Device code
*
* Computes the vector addition of A and B into C. The 3 vectors have the same
* number of elements numElements.
*/
__global__ void sum_self(int *A, int numElements)
{
    // One thread per element: flat global index with a tail guard, since
    // the grid may overshoot numElements.
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= numElements)
        return;
    A[idx] += 1;
}
// Print the first cnt elements of h_A, space-separated, ending in a newline.
void show_h_A_value(int *h_A, int cnt)
{
    int idx = 0;
    while (idx < cnt) {
        printf("%d ", h_A[idx]);
        ++idx;
    }
    printf("\n");
}
/**
* Host main routine
*/
/**
 * Host main routine for the Runtime API demo.
 *
 * Allocates a host vector of MAX_NUM ints, copies it to the device,
 * launches sum_self to increment every element in place, copies the
 * result back and prints it before and after.
 */
int main(int argc, char *argv[])
{
    // Error code to check return values for CUDA calls.
    cudaError_t err = cudaSuccess;
    // Print the vector length to be used, and compute its byte size.
    int numElements = MAX_NUM;
    size_t size = numElements * sizeof(int);
    printf("[Vector addition of %d elements]\n", numElements);
    // Allocate the host input vector A.
    int *h_A = (int *)malloc(size);
    // Verify that the allocation succeeded.
    if (h_A == NULL) {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }
    // Initialize the host input vector with 0..numElements-1.
    for (int i = 0; i < numElements; ++i) {
        h_A[i] = i;
    }
    show_h_A_value(h_A, numElements);
    // Allocate the device vector A.
    int *d_A = NULL;
    err = cudaMalloc((void **)&d_A, size);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n",
                cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    // Copy the host input vector A to the device vector in device memory.
    printf("Copy input data from the host memory to the CUDA device\n");
    err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        fprintf(stderr,
                "Failed to copy vector A from host to device (error code %s)!\n",
                cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    // Launch the sum_self CUDA kernel (ceil-div grid, tail guarded in-kernel).
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid,
           threadsPerBlock);
    sum_self<<<blocksPerGrid, threadsPerBlock>>>(d_A, numElements);
    // cudaGetLastError catches launch-configuration errors only.
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to launch sum_self kernel (error code %s)!\n",
                cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    // Synchronize so asynchronous execution errors (e.g. illegal address)
    // are reported here rather than blamed on the next API call.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "sum_self kernel execution failed (error code %s)!\n",
                cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    // Copy the device result vector back to the host vector.
    printf("Copy output data from the CUDA device to the host memory\n");
    err = cudaMemcpy(h_A, d_A, size, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        // Message fixed: this copies vector A (the original said "vector C").
        fprintf(stderr,
                "Failed to copy vector A from device to host (error code %s)!\n",
                cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    show_h_A_value(h_A, numElements);
    // Free device global memory.
    err = cudaFree(d_A);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to free device vector A (error code %s)!\n",
                cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    // Free host memory.
    free(h_A);
    printf("Done\n");
    return 0;
}
运行结果
./runtime_api_example.elf
[Vector addition of 16 elements]
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
Copy input data from the host memory to the CUDA device
CUDA kernel launch with 1 blocks of 256 threads
Copy output data from the CUDA device to the host memory
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
Done
最简单的记忆方法:Driver API 的函数以 cu 开头(如 cuInit、cuMemAlloc、cuLaunchKernel),而 Runtime API 的函数以 cuda 开头(如 cudaMalloc、cudaMemcpy)。
https://github.com/NVIDIA/cuda-samples.git
# Name the binary after the containing directory, e.g. driver_api_example.elf.
current_dir_name := $(notdir $(shell pwd))
TARGET := $(current_dir_name).elf
all: $(TARGET) driver_api_kernel.ptx
# Pattern rule: compile any .cu to PTX, to be loaded at runtime via cuModuleLoad.
%.ptx: %.cu
nvcc $< -ptx -arch=sm_89 -o $@ -I./
# Pattern rule: compile .cu sources with nvcc (-arch=sm_89: RTX 40xx).
%.cu.o: %.cu
nvcc $< -c -arch=sm_89 -o $@ -I./
# Pattern rule: compile plain C++ host code with g++ against the CUDA headers.
%.cpp.o: %.cpp
g++ $< -c -o $@ -I /usr/local/cuda/include
# Link with g++: -lcuda is the Driver API, -lcudart the Runtime API.
$(TARGET): driver_api_kernel.cu.o driver_api_example.cpp.o
g++ $^ -o $(TARGET) -lcuda -L /usr/local/cuda/lib64 -lcudart
.PHONY: clean
clean:
-rm -rf $(TARGET) *.o *.ptx
driver_api_example.cpp
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <stdio.h>
#include <string.h>
#include <iostream>
#include <cstring>
#include <cuda.h>
// includes, project
// includes, CUDA
#include <builtin_types.h>
#include "driver_api_example.h"
#ifndef checkCudaErrors
#define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__)
// These are the inline versions for all of the SDK helper functions
inline void __checkCudaErrors(CUresult err, const char *file, const int line) {
if (CUDA_SUCCESS != err) {
const char *errorStr = NULL;
cuGetErrorString(err, &errorStr);
fprintf(stderr,
"checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, "
"line %i.\n",
err, errorStr, file, line);
exit(EXIT_FAILURE);
}
}
#endif
// Print the first cnt elements of h_A, space-separated, ending in a newline.
void show_h_A_value(int *h_A, int cnt)
{
    for (int *p = h_A; p != h_A + cnt; ++p) {
        printf("%d ", *p);
    }
    printf("\n");
}
/**
* Host main routine
*/
int main(int argc, char *argv[])
{
CUdevice cuDevice;
CUcontext cuContext;
// Error code to check return values for CUDA calls
//cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = MAX_NUM;
size_t size = numElements * sizeof(int);
printf("[Vector addition of %d elements]\n", numElements);
// Allocate the host input vector A
int *h_A = (int *)malloc(size);
// Verify that allocations succeeded
if (h_A == NULL) {
fprintf(stderr, "Failed to allocate host vectors!\n");
exit(EXIT_FAILURE);
}
// Initialize the host input vectors
for (int i = 0; i < numElements; ++i) {
h_A[i] = i;
}
show_h_A_value(h_A, numElements);
// Initialize
int deviceCount = 0;
checkCudaErrors(cuInit(0));
checkCudaErrors(cuDeviceGetCount(&deviceCount));
printf("devicecount=%d\n", deviceCount);
checkCudaErrors(cuDeviceGet(&cuDevice, 0));
char name[100];
cuDeviceGetName(name, 100, cuDevice);
printf("> Using device 0: %s\n", name);
CUresult err1 = cuCtxCreate(&cuContext, 0, cuDevice);
if (err1 != CUDA_SUCCESS) {
fprintf(stderr, "* Error initializing the CUDA context.\n");
cuCtxDestroy(cuContext);
exit(-1);
}
CUdeviceptr d_a;
//CUdeviceptr *d_a = &d_a1;
checkCudaErrors( cuMemAlloc(&d_a, sizeof(int) * numElements) );
checkCudaErrors(cuMemcpyHtoD(d_a, h_A, size));
char *module_file = (char*) "driver_api_kernel.ptx";
char *kernel_name = (char*) "sum_self";
CUmodule module;
err1 = cuModuleLoad(&module, module_file);
if (err1 != CUDA_SUCCESS) {
fprintf(stderr, "* Error loading the module %s\n", module_file);
cuCtxDestroy(cuContext);
exit(-1);
}
CUfunction function;
err1 = cuModuleGetFunction(&function, module, kernel_name);
if (err1 != CUDA_SUCCESS) {
fprintf(stderr, "* Error getting kernel function %s\n", kernel_name);
cuCtxDestroy(cuContext);
exit(-1);
}
void *args[3] = { &d_a};
// grid for kernel: <<<MAX_NUM, 1>>>
checkCudaErrors( cuLaunchKernel(function, MAX_NUM, 1, 1, // Nx1x1 blocks
1, 1, 1, // 1x1x1 threads
0, 0, args, 0) );
checkCudaErrors(cuMemcpyDtoH(h_A, d_a, size));
show_h_A_value(h_A, numElements);
checkCudaErrors( cuMemFree(d_a) );
checkCudaErrors(cuCtxDestroy(cuContext));
// Free host memory
free(h_A);
printf("Done\n");
return 0;
}
driver_api_example.h
/*
 * Guard renamed from __RUNTIME_API_H__: identifiers containing a double
 * underscore are reserved in C/C++, and the old name was identical to the
 * guard in runtime_api_example.h, so a translation unit including both
 * headers would silently drop one of them.
 */
#ifndef DRIVER_API_EXAMPLE_H
#define DRIVER_API_EXAMPLE_H
// number of the vectors to sum
#define MAX_NUM (16)
// value the sum_self kernel adds to every element
#define ADD_VALUE (5)
#endif // DRIVER_API_EXAMPLE_H
driver_api_kernel.cu
#include "driver_api_example.h"
// Launched as <<<MAX_NUM, 1>>>: one single-thread block per element, so
// blockIdx.x selects the element. extern "C" keeps the symbol unmangled
// for cuModuleGetFunction.
extern "C" __global__ void sum_self(int *a)
{
    const int elem = blockIdx.x;
    if (elem >= MAX_NUM)
        return;
    a[elem] += ADD_VALUE;
}
编译后生成的 driver_api_kernel.ptx 文件内容(篇幅原因此处略去)。
运行结果:
./driver_api_example.elf
[Vector addition of 16 elements]
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
devicecount=1
> Using device 0: NVIDIA GeForce RTX 4060 Laptop GPU
5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
Done
https://blog.csdn.net/eloudy/article/details/143242909
https://zhuanlan.zhihu.com/p/595143420
https://zhuanlan.zhihu.com/p/685978812
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。