硬件: NVIDIA GeForce RTX 4060 Laptop GPU
软件:ubuntu22.04.3 cuda-12.4 nvcc 12.4.131
# Default target: the demo binary plus the PTX module that the Driver API
# example loads at runtime.
all: hello driver_api_kernel.ptx
# Compile the kernel source to PTX (-arch=sm_89: Ada Lovelace, RTX 40xx).
driver_api_kernel.ptx: driver_api_kernel.cu
nvcc driver_api_kernel.cu -ptx -arch=sm_89 -o driver_api_kernel.ptx -I./
# Compile the host-side example with nvcc.
driver_api_example.o: driver_api_example.cu
nvcc driver_api_example.cu -c -arch=sm_89 -o driver_api_example.o -I./
# Link with g++: -lcuda is the Driver API, -lcudart the Runtime API.
hello: driver_api_example.o
g++ $^ -o hello -lcuda -L /usr/local/cuda/lib64 -lcudart
.PHONY: clean
# Leading '-' lets clean succeed even when the files do not exist.
clean:
-rm -rf hello *.o *.ptx
runtime_api_example.h
/*
 * Guard renamed from __RUNTIME_API_H__: identifiers containing a double
 * underscore are reserved in C/C++, and the old name collided with the
 * guard used by driver_api_example.h, so including both headers would
 * silently skip one of them.
 */
#ifndef RUNTIME_API_EXAMPLE_H
#define RUNTIME_API_EXAMPLE_H
// number of the vectors to sum
#define MAX_NUM (16)
#endif // RUNTIME_API_EXAMPLE_H
runtime_api_example.cu
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <stdio.h>
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
#include "driver_api_example.h"
/**
* CUDA Kernel Device code
*
* Computes the vector addition of A and B into C. The 3 vectors have the same
* number of elements numElements.
*/
__global__ void sum_self(int *A, int numElements)
{
    // One thread per element: flat global index with a tail guard, since
    // the grid may overshoot numElements.
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= numElements)
        return;
    A[idx] += 1;
}
// Print the first cnt elements of h_A, space-separated, ending in a newline.
void show_h_A_value(int *h_A, int cnt)
{
    int idx = 0;
    while (idx < cnt) {
        printf("%d ", h_A[idx]);
        ++idx;
    }
    printf("\n");
}
/**
* Host main routine
*/
/**
 * Host main routine for the Runtime API demo.
 *
 * Allocates a host vector of MAX_NUM ints, copies it to the device,
 * launches sum_self to increment every element in place, copies the
 * result back and prints it before and after.
 */
int main(int argc, char *argv[])
{
    // Error code to check return values for CUDA calls.
    cudaError_t err = cudaSuccess;
    // Print the vector length to be used, and compute its byte size.
    int numElements = MAX_NUM;
    size_t size = numElements * sizeof(int);
    printf("[Vector addition of %d elements]\n", numElements);
    // Allocate the host input vector A.
    int *h_A = (int *)malloc(size);
    // Verify that the allocation succeeded.
    if (h_A == NULL) {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }
    // Initialize the host input vector with 0..numElements-1.
    for (int i = 0; i < numElements; ++i) {
        h_A[i] = i;
    }
    show_h_A_value(h_A, numElements);
    // Allocate the device vector A.
    int *d_A = NULL;
    err = cudaMalloc((void **)&d_A, size);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n",
                cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    // Copy the host input vector A to the device vector in device memory.
    printf("Copy input data from the host memory to the CUDA device\n");
    err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        fprintf(stderr,
                "Failed to copy vector A from host to device (error code %s)!\n",
                cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    // Launch the sum_self CUDA kernel (ceil-div grid, tail guarded in-kernel).
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid,
           threadsPerBlock);
    sum_self<<<blocksPerGrid, threadsPerBlock>>>(d_A, numElements);
    // cudaGetLastError catches launch-configuration errors only.
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to launch sum_self kernel (error code %s)!\n",
                cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    // Synchronize so asynchronous execution errors (e.g. illegal address)
    // are reported here rather than blamed on the next API call.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "sum_self kernel execution failed (error code %s)!\n",
                cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    // Copy the device result vector back to the host vector.
    printf("Copy output data from the CUDA device to the host memory\n");
    err = cudaMemcpy(h_A, d_A, size, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        // Message fixed: this copies vector A (the original said "vector C").
        fprintf(stderr,
                "Failed to copy vector A from device to host (error code %s)!\n",
                cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    show_h_A_value(h_A, numElements);
    // Free device global memory.
    err = cudaFree(d_A);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to free device vector A (error code %s)!\n",
                cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    // Free host memory.
    free(h_A);
    printf("Done\n");
    return 0;
}
运行结果
./runtime_api_example.elf
[Vector addition of 16 elements]
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
Copy input data from the host memory to the CUDA device
CUDA kernel launch with 1 blocks of 256 threads
Copy output data from the CUDA device to the host memory
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
Done
最简单的记忆方法:Driver API 的函数以 cu 开头(如 cuInit、cuMemAlloc、cuLaunchKernel),而 Runtime API 的函数以 cuda 开头(如 cudaMalloc、cudaMemcpy)。
https://github.com/NVIDIA/cuda-samples.git
# Name the binary after the containing directory, e.g. driver_api_example.elf.
current_dir_name := $(notdir $(shell pwd))
TARGET := $(current_dir_name).elf
all: $(TARGET) driver_api_kernel.ptx
# Pattern rule: compile any .cu to PTX, to be loaded at runtime via cuModuleLoad.
%.ptx: %.cu
nvcc $< -ptx -arch=sm_89 -o $@ -I./
# Pattern rule: compile .cu sources with nvcc (-arch=sm_89: RTX 40xx).
%.cu.o: %.cu
nvcc $< -c -arch=sm_89 -o $@ -I./
# Pattern rule: compile plain C++ host code with g++ against the CUDA headers.
%.cpp.o: %.cpp
g++ $< -c -o $@ -I /usr/local/cuda/include
# Link with g++: -lcuda is the Driver API, -lcudart the Runtime API.
$(TARGET): driver_api_kernel.cu.o driver_api_example.cpp.o
g++ $^ -o $(TARGET) -lcuda -L /usr/local/cuda/lib64 -lcudart
.PHONY: clean
clean:
-rm -rf $(TARGET) *.o *.ptx
driver_api_example.cpp
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <stdio.h>
#include <string.h>
#include <iostream>
#include <cstring>
#include <cuda.h>
// includes, project
// includes, CUDA
#include <builtin_types.h>
#include "driver_api_example.h"
#ifndef checkCudaErrors
#define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__)
// These are the inline versions for all of the SDK helper functions
inline void __checkCudaErrors(CUresult err, const char *file, const int line) {
if (CUDA_SUCCESS != err) {
const char *errorStr = NULL;
cuGetErrorString(err, &errorStr);
fprintf(stderr,
"checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, "
"line %i.\n",
err, errorStr, file, line);
exit(EXIT_FAILURE);
}
}
#endif
// Print the first cnt elements of h_A, space-separated, ending in a newline.
void show_h_A_value(int *h_A, int cnt)
{
    for (int *p = h_A; p != h_A + cnt; ++p) {
        printf("%d ", *p);
    }
    printf("\n");
}
/**
* Host main routine
*/
int main(int argc, char *argv[])
{
CUdevice cuDevice;
CUcontext cuContext;
// Error code to check return values for CUDA calls
//cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = MAX_NUM;
size_t size = numElements * sizeof(int);
printf("[Vector addition of %d elements]\n", numElements);
// Allocate the host input vector A
int *h_A = (int *)malloc(size);
// Verify that allocations succeeded
if (h_A == NULL) {
fprintf(stderr, "Failed to allocate host vectors!\n");
exit(EXIT_FAILURE);
}
// Initialize the host input vectors
for (int i = 0; i < numElements; ++i) {
h_A[i] = i;
}
show_h_A_value(h_A, numElements);
// Initialize
int deviceCount = 0;
checkCudaErrors(cuInit(0));
checkCudaErrors(cuDeviceGetCount(&deviceCount));
printf("devicecount=%d\n", deviceCount);
checkCudaErrors(cuDeviceGet(&cuDevice, 0));
char name[100];
cuDeviceGetName(name, 100, cuDevice);
printf("> Using device 0: %s\n", name);
CUresult err1 = cuCtxCreate(&cuContext, 0, cuDevice);
if (err1 != CUDA_SUCCESS) {
fprintf(stderr, "* Error initializing the CUDA context.\n");
cuCtxDestroy(cuContext);
exit(-1);
}
CUdeviceptr d_a;
//CUdeviceptr *d_a = &d_a1;
checkCudaErrors( cuMemAlloc(&d_a, sizeof(int) * numElements) );
checkCudaErrors(cuMemcpyHtoD(d_a, h_A, size));
char *module_file = (char*) "driver_api_kernel.ptx";
char *kernel_name = (char*) "sum_self";
CUmodule module;
err1 = cuModuleLoad(&module, module_file);
if (err1 != CUDA_SUCCESS) {
fprintf(stderr, "* Error loading the module %s\n", module_file);
cuCtxDestroy(cuContext);
exit(-1);
}
CUfunction function;
err1 = cuModuleGetFunction(&function, module, kernel_name);
if (err1 != CUDA_SUCCESS) {
fprintf(stderr, "* Error getting kernel function %s\n", kernel_name);
cuCtxDestroy(cuContext);
exit(-1);
}
void *args[3] = { &d_a};
// grid for kernel: <<<MAX_NUM, 1>>>
checkCudaErrors( cuLaunchKernel(function, MAX_NUM, 1, 1, // Nx1x1 blocks
1, 1, 1, // 1x1x1 threads
0, 0, args, 0) );
checkCudaErrors(cuMemcpyDtoH(h_A, d_a, size));
show_h_A_value(h_A, numElements);
checkCudaErrors( cuMemFree(d_a) );
checkCudaErrors(cuCtxDestroy(cuContext));
// Free host memory
free(h_A);
printf("Done\n");
return 0;
}
driver_api_example.h
/*
 * Guard renamed from __RUNTIME_API_H__: identifiers containing a double
 * underscore are reserved in C/C++, and the old name was identical to the
 * guard in runtime_api_example.h, so a translation unit including both
 * headers would silently drop one of them.
 */
#ifndef DRIVER_API_EXAMPLE_H
#define DRIVER_API_EXAMPLE_H
// number of the vectors to sum
#define MAX_NUM (16)
// value the sum_self kernel adds to every element
#define ADD_VALUE (5)
#endif // DRIVER_API_EXAMPLE_H
driver_api_kernel.cu
#include "driver_api_example.h"
// Launched as <<<MAX_NUM, 1>>>: one single-thread block per element, so
// blockIdx.x selects the element. extern "C" keeps the symbol unmangled
// for cuModuleGetFunction.
extern "C" __global__ void sum_self(int *a)
{
    const int elem = blockIdx.x;
    if (elem >= MAX_NUM)
        return;
    a[elem] += ADD_VALUE;
}
编译后生成的 driver_api_kernel.ptx 文件内容(篇幅原因此处略去)。
运行结果:
./driver_api_example.elf
[Vector addition of 16 elements]
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
devicecount=1
> Using device 0: NVIDIA GeForce RTX 4060 Laptop GPU
5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
Done
https://blog.csdn.net/eloudy/article/details/143242909
https://zhuanlan.zhihu.com/p/595143420
https://zhuanlan.zhihu.com/p/685978812
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。