文章/答案/技术大牛

发布

社区首页 >问答首页 >当指定结束位时，DeviceRadixSort失败

问当指定结束位时，DeviceRadixSort失败
EN

Stack Overflow用户

提问于 2022-02-27 14:13:38

回答 1查看 118关注 0票数 1

我正在使用CUB库的GPU基数排序算法对N个32位无符号整数进行排序，这些整数的值只使用其32位中从最低有效位开始的k位。

因此，为了提高排序性能，我在调用cub::DeviceRadixSort::SortKeys时指定了位子范围[begin_bit, end_bit)。我使用的是CUB的最新版本(1.16.0)。

然而，当使用位范围[begin_bit=0，end_bit=k]对10亿个键进行排序时，如果k取{20, 19, 18}中的值（例如 ./cub_sort_test 1000000000 0 20），SortKeys就会崩溃（并非每次必现，但几乎总会发生），并报告非法内存访问错误。

我在Volta和Ampere架构的NVIDIA GPU上分别使用CUDA 11.4和11.2版本进行了测试。以前有人遇到过这种情况吗？或者知道如何修复吗？下面是最小的可复现示例代码：

// HOW TO BUILD: nvcc -O3 -std=c++17 -Xcompiler -fopenmp cub_sort_test.cu -o cub_sort_test
#include <cub/cub.cuh>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/system/cuda/experimental/pinned_allocator.h>

#include <algorithm>
#include <chrono>
#include <iostream>
#include <parallel/algorithm>
#include <random>
#include <vector>
#include <iostream>

// When DEBUG is defined, CheckCudaError reports the status of the wrapped call.
#define DEBUG

// CheckCudaError(instruction): evaluates `instruction` exactly once.
//  - With DEBUG defined, the resulting status is handed to AssertNoCudaError
//    together with the file and line of the call site.
//  - Without DEBUG, the call is still executed but its result is discarded.
// Both variants expand to a single `do { ... } while (0)` statement, so the
// macro must be followed by `;` and is safe inside an unbraced if/else.
#ifdef DEBUG
#define CheckCudaError(instruction) \
  do { AssertNoCudaError((instruction), __FILE__, __LINE__); } while (0)
#else
#define CheckCudaError(instruction) \
  do { static_cast<void>(instruction); } while (0)
#endif

// Prints a diagnostic line to std::cout when a CUDA call did not return
// cudaSuccess. Despite the name, it only reports: it neither aborts nor throws.
//
// error_code: status returned by the CUDA call under test.
// file, line: call-site location (supplied by the CheckCudaError macro).
inline void AssertNoCudaError(cudaError_t error_code, const char* file, int line) {
  if (error_code == cudaSuccess) {
    return;  // Nothing to report.
  }

  const char* const description = cudaGetErrorString(error_code);
  std::cout << "Error: " << description << " " << file << " " << line << "\n";
}

// Alias for a thrust::host_vector that allocates through Thrust's experimental
// CUDA pinned_allocator (presumably page-locked host memory -- confirm against
// the Thrust version in use).
template <typename T>
using PinnedHostVector = thrust::host_vector<T, thrust::system::cuda::experimental::pinned_allocator<T>>;

// Creates a std::mt19937 seeded from a single 32-bit value.
//
// The value is expanded into a seed sequence with as many words as are needed
// to cover the engine's state: each word is the previous one rotated left by
// one bit (the first word is the rotated input). The sequence is passed
// through std::seed_seq to initialise the engine.
//
// distribution_seed: starting value of the rotation sequence.
// Returns the seeded engine; the same input always yields the same engine.
std::mt19937 SeedRandomGenerator(uint32_t distribution_seed) {
    constexpr size_t kStateBytes = std::mt19937::state_size * sizeof(std::mt19937::result_type);
    constexpr size_t kSeedWords = kStateBytes / sizeof(std::seed_seq::result_type);

    std::vector<std::seed_seq::result_type> seed_words;
    seed_words.reserve(kSeedWords);
    for (size_t word = 0; word < kSeedWords; ++word) {
        // Rotate left by one bit: the top bit wraps around to bit 0.
        const uint32_t wrapped_bit = distribution_seed >> 31;
        distribution_seed = (distribution_seed << 1) | wrapped_bit;
        seed_words.push_back(distribution_seed);
    }

    std::seed_seq seed_sequence(seed_words.begin(), seed_words.end());
    std::mt19937 engine(seed_sequence);
    return engine;
}

// Reproducer: fills a host vector with random 32-bit keys, copies them to the
// GPU, sorts them with cub::DeviceRadixSort::SortKeys over the bit range
// [0, bit_entropy + 1), copies the result back and checks it with
// std::is_sorted.
//
// Command line: <num_keys> <gpu_id> <bit_entropy>
// Returns -1 on a wrong argument count, 0 otherwise (also when the sortedness
// check fails -- that case is only reported on std::cout).
int main(int argc, char* argv[]) {

    if (argc != 4) {
        std::cerr << "Usage: ./cub-sort-test <num_keys> <gpu_id> <bit_entropy>" << std::endl;
        return -1;
    }

    // No validation: std::stoull/std::stoi throw on malformed input, and a
    // negative <bit_entropy> wraps around when converted to size_t.
    size_t num_keys = std::stoull(argv[1]);
    int gpu = std::stoi(argv[2]);
    size_t bit_entropy = std::stoi(argv[3]);

    cudaStream_t stream;
    CheckCudaError(cudaSetDevice(gpu));
    CheckCudaError(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

    PinnedHostVector<uint32_t> keys(num_keys);

    // Key generation: every OpenMP thread owns a generator seeded from its
    // thread number and fills its share of `keys`.
#pragma omp parallel num_threads(64)
    {
        // NOTE(review): `1` is an int, so this shift is evaluated in int; it is
        // not well defined for bit_entropy >= 31 (assuming 32-bit int), and the
        // 32-bit special case below only runs after the shift was evaluated.
        uint32_t max = (1 << bit_entropy) - 1;
  
      if (bit_entropy == sizeof(uint32_t) * 8) {
        max = std::numeric_limits<uint32_t>::max();
      } else if (bit_entropy == 1) {
        // Values are drawn from [0, max) and truncated, so max == 1 would only
        // ever produce 0; 2 lets both 0 and 1 occur.
        max = 2;
      }
  
      std::mt19937 random_generator = SeedRandomGenerator(2147483647 + static_cast<size_t>(omp_get_thread_num()));
      std::uniform_real_distribution<double> uniform_dist(0, max);
  
#pragma omp for schedule(static)
      for (size_t i = 0; i < num_keys; ++i) {
        // Truncating cast: the real-valued sample becomes an integer key.
        keys[i] = static_cast<uint32_t>(uniform_dist(random_generator));
      }
    }

    thrust::device_vector<uint32_t> device_vector(num_keys);
    thrust::copy(keys.begin(), keys.end(), device_vector.begin());

    CheckCudaError(cudaDeviceSynchronize());

    // First call passes NULL as temporary storage; its only visible use here is
    // to obtain num_temporary_bytes for the allocation below (CUB's size-query
    // convention -- confirm against the CUB documentation).
    // NOTE(review): the keys-in and keys-out arguments are the same device
    // buffer in both calls; confirm that SortKeys permits aliased input and
    // output. The returned status of both SortKeys calls is not checked.
    size_t num_temporary_bytes = 0;
    cub::DeviceRadixSort::SortKeys(
        NULL, num_temporary_bytes, thrust::raw_pointer_cast(device_vector.data()),
        thrust::raw_pointer_cast(device_vector.data()), num_keys, 0, bit_entropy + 1, stream); // bit subrange is [begin_bit, end_bit), thus bit_entropy + 1

    uint8_t* temporary_storage = nullptr;
    CheckCudaError(cudaMalloc(reinterpret_cast<void**>(&temporary_storage), num_temporary_bytes));

    // Second call performs the sort on `stream` using the allocated storage.
    cub::DeviceRadixSort::SortKeys(
    (void*)temporary_storage, num_temporary_bytes, thrust::raw_pointer_cast(device_vector.data()),
    thrust::raw_pointer_cast(device_vector.data()), num_keys, 0, bit_entropy + 1, stream);

    // Wait for the sort before reading the result back to the host.
    CheckCudaError(cudaStreamSynchronize(stream));

    thrust::copy(device_vector.begin(), device_vector.end(), keys.begin());

    CheckCudaError(cudaFree(temporary_storage));

    // Host-side verification of the device result.
    if (std::is_sorted(keys.begin(), keys.end()) == false) {
        std::cout << "Error: Sorting failed." << std::endl;
    }

    return 0;
}

cuda

nvidia

gpgpu

thrust

cub

回答 1

Stack Overflow用户

回答已采纳

发布于 2022-02-27 15:17:18

代码的问题在于您没有正确使用SortKeys。SortKeys不支持原地（in-place）排序，您需要为排序后的数据提供单独的输出缓冲区。

#include <cub/cub.cuh>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/system/cuda/experimental/pinned_allocator.h>

#include <algorithm>
#include <chrono>
#include <iostream>
#include <parallel/algorithm>
#include <random>
#include <vector>
#include <iostream>

// When DEBUG is defined, CheckCudaError reports the status of the wrapped call.
#define DEBUG

// CheckCudaError(instruction): evaluates `instruction` exactly once.
//  - With DEBUG defined, the resulting status is handed to AssertNoCudaError
//    together with the file and line of the call site.
//  - Without DEBUG, the call is still executed but its result is discarded.
// Both variants expand to a single `do { ... } while (0)` statement, so the
// macro must be followed by `;` and is safe inside an unbraced if/else.
#ifdef DEBUG
#define CheckCudaError(instruction) \
  do { AssertNoCudaError((instruction), __FILE__, __LINE__); } while (0)
#else
#define CheckCudaError(instruction) \
  do { static_cast<void>(instruction); } while (0)
#endif

// Prints a diagnostic line to std::cout when a CUDA call did not return
// cudaSuccess. Despite the name, it only reports: it neither aborts nor throws.
//
// error_code: status returned by the CUDA call under test.
// file, line: call-site location (supplied by the CheckCudaError macro).
inline void AssertNoCudaError(cudaError_t error_code, const char* file, int line) {
  if (error_code == cudaSuccess) {
    return;  // Nothing to report.
  }

  const char* const description = cudaGetErrorString(error_code);
  std::cout << "Error: " << description << " " << file << " " << line << "\n";
}

// Alias for a thrust::host_vector that allocates through Thrust's experimental
// CUDA pinned_allocator (presumably page-locked host memory -- confirm against
// the Thrust version in use).
template <typename T>
using PinnedHostVector = thrust::host_vector<T, thrust::system::cuda::experimental::pinned_allocator<T>>;

// Creates a std::mt19937 seeded from a single 32-bit value.
//
// The value is expanded into a seed sequence with as many words as are needed
// to cover the engine's state: each word is the previous one rotated left by
// one bit (the first word is the rotated input). The sequence is passed
// through std::seed_seq to initialise the engine.
//
// distribution_seed: starting value of the rotation sequence.
// Returns the seeded engine; the same input always yields the same engine.
std::mt19937 SeedRandomGenerator(uint32_t distribution_seed) {
    constexpr size_t kStateBytes = std::mt19937::state_size * sizeof(std::mt19937::result_type);
    constexpr size_t kSeedWords = kStateBytes / sizeof(std::seed_seq::result_type);

    std::vector<std::seed_seq::result_type> seed_words;
    seed_words.reserve(kSeedWords);
    for (size_t word = 0; word < kSeedWords; ++word) {
        // Rotate left by one bit: the top bit wraps around to bit 0.
        const uint32_t wrapped_bit = distribution_seed >> 31;
        distribution_seed = (distribution_seed << 1) | wrapped_bit;
        seed_words.push_back(distribution_seed);
    }

    std::seed_seq seed_sequence(seed_words.begin(), seed_words.end());
    std::mt19937 engine(seed_sequence);
    return engine;
}

int main(int argc, char* argv[]) {

    if (argc != 4) {
        std::cerr << "Usage: ./cub-sort-test <num_keys> <gpu_id> <bit_entropy>" << std::endl;
        return -1;
    }

    size_t num_keys = std::stoull(argv[1]);
    int gpu = std::stoi(argv[2]);
    size_t bit_entropy = std::stoi(argv[3]);

    cudaStream_t stream;
    CheckCudaError(cudaSetDevice(gpu));
    CheckCudaError(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

    PinnedHostVector<uint32_t> keys(num_keys);

#pragma omp parallel num_threads(64)
    {
        uint32_t max = (1 << bit_entropy) - 1;
  
      if (bit_entropy == sizeof(uint32_t) * 8) {
        max = std::numeric_limits<uint32_t>::max();
      } else if (bit_entropy == 1) {
        max = 2;
      }
  
      std::mt19937 random_generator = SeedRandomGenerator(2147483647 + static_cast<size_t>(omp_get_thread_num()));
      std::uniform_real_distribution<double> uniform_dist(0, max);
  
#pragma omp for schedule(static)
      for (size_t i = 0; i < num_keys; ++i) {
        keys[i] = static_cast<uint32_t>(uniform_dist(random_generator));
      }
    }

    thrust::device_vector<uint32_t> device_vector(num_keys);
    thrust::copy(keys.begin(), keys.end(), device_vector.begin());

    thrust::device_vector<uint32_t> device_vector_sorted(num_keys);

    CheckCudaError(cudaDeviceSynchronize());

    size_t num_temporary_bytes = 0;
    cub::DeviceRadixSort::SortKeys(
        NULL, num_temporary_bytes, thrust::raw_pointer_cast(device_vector.data()),
        thrust::raw_pointer_cast(device_vector_sorted.data()), num_keys, 0, bit_entropy + 1, stream); // bit subrange is [begin_bit, end_bit), thus bit_entropy + 1

    uint8_t* temporary_storage = nullptr;
    CheckCudaError(cudaMalloc(reinterpret_cast<void**>(&temporary_storage), num_temporary_bytes));

    cub::DeviceRadixSort::SortKeys(
    (void*)temporary_storage, num_temporary_bytes, thrust::raw_pointer_cast(device_vector.data()),
    thrust::raw_pointer_cast(device_vector_sorted.data()), num_keys, 0, bit_entropy + 1, stream);

    CheckCudaError(cudaStreamSynchronize(stream));

    thrust::copy(device_vector_sorted.begin(), device_vector_sorted.end(), keys.begin());

    CheckCudaError(cudaFree(temporary_storage));

    if (std::is_sorted(keys.begin(), keys.end()) == false) {
        std::cout << "Error: Sorting failed." << std::endl;
    }

    return 0;
}

如果未排序的数组在排序后不再使用、可以被覆盖，我建议改用接受DoubleBuffer<Keys>的重载，以减少内存占用。否则，排序时会额外分配一个临时键数组，因为const Key*输入不能被覆盖。

票数 3

页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://stackoverflow.com/questions/71285448

复制

相似问题

问当指定结束位时，DeviceRadixSort失败
EN

回答 1

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问当指定结束位时，DeviceRadixSort失败EN

回答 1

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问当指定结束位时，DeviceRadixSort失败
EN