文章/答案/技术大牛

发布

社区首页 >问答首页 >cudaErrorIllegalAdress on cudaMemcpy

问cudaErrorIllegalAdress on cudaMemcpy
EN

Stack Overflow用户

提问于 2016-04-08 20:26:59

回答 1查看 1.6K关注 0票数 1

我是 CUDA 新手，正在尝试编写一段在球面上生成随机点的代码。代码如下。

    // Seeds one MRG32k3a generator state per launched thread.
    //
    // Each thread derives its flat 1-D global index and initialises the state
    // slot at that index with seed 0, the index as the subsequence number and
    // offset 0.
    //
    // There is no bounds check here: every launched thread writes its slot, so
    // the launch must not contain more threads than `state` has entries.
    __global__
    void setup_kernel(curandStateMRG32k3a *state)
    {
      const int tid = blockIdx.x * blockDim.x + threadIdx.x;
      curandStateMRG32k3a *slot = state + tid;
      curand_init(0, tid, 0, slot);
    }

    /**
     * Writes one random point on the unit sphere per thread into x[i], y[i], z[i].
     *
     * Layout: 1-D grid of 1-D blocks; thread i handles element i. The launch may
     * contain more threads than elements; surplus threads exit before touching
     * any buffer.
     *
     * @param x, y, z           output component arrays, at least numberOfElements long
     * @param numberOfElements  number of points to generate
     * @param state             per-element generator states (expected to have been
     *                          initialised, e.g. by setup_kernel), at least
     *                          numberOfElements long
     *
     * Sampling follows Marsaglia's method: draw (a, b) uniformly from the square
     * [-1, 1]^2, reject until a^2 + b^2 <= 1, then map the pair onto the sphere.
     */
    __global__
    void computeRandomVectors(float* x, float* y, float* z, unsigned int numberOfElements,curandStateMRG32k3a *state)
    {
      const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;

      // Bounds-check BEFORE reading state[i]: threads in the grid tail would
      // otherwise read past the end of the state array.
      if(i >= numberOfElements)
        return;

      curandStateMRG32k3a localState = state[i];

      // curand_uniform returns values in (0, 1]; every candidate (including the
      // first one) is mapped to (-1, 1] so that all four quadrants are sampled.
      float a, b, s;
      do
        {
          a = curand_uniform(&localState) * 2.0f - 1.0f;
          b = curand_uniform(&localState) * 2.0f - 1.0f;
          s = a * a + b * b;
        }
      while(s > 1.0f);

      // s <= 1 is guaranteed here, and the same s is reused below, so the
      // square-root argument can never become slightly negative through rounding.
      const float scale = 2.0f * sqrtf(1.0f - s);
      x[i] = a * scale;
      y[i] = b * scale;
      z[i] = 1.0f - 2.0f * s;

      // Persist the advanced generator so a later launch continues the sequence
      // instead of replaying the same numbers.
      state[i] = localState;
    }

    // Fills h_x, h_y and h_z with the values produced on the device by
    // computeRandomVectors. The length of h_x determines how many elements are
    // generated; results are staged through temporary host buffers and copied
    // into the three vectors element by element at the end.
    //
    // This listing is kept exactly as posted; the NOTE(review) comments below
    // mark the defects visible in it.
    void generatePointsOnASphere(thrust::host_vector<float>& h_x,        thrust::host_vector<float>& h_y, thrust::host_vector<float>& h_z)
    {
      // NOTE(review): because of `&&`, this only bails out when h_x differs in
      // size from BOTH h_y and h_z; a mismatch with just one of them passes.
      // `||` looks like what was intended.
      if(h_x.size() != h_y.size() && h_x.size() != h_z.size())
        {
          std::cout << "The three component vectors have unmatching  size()" << std::endl;
          return;
        }

      // `size` is the buffer length in BYTES, not the element count.
      size_t size = h_x.size() * sizeof(float);

      // Zero-initialised host staging buffers, one per component.
      float* h_p_x = (float*) calloc(h_x.size(),sizeof(float));
      float* h_p_y = (float*) calloc(h_x.size(),sizeof(float));
      float* h_p_z = (float*) calloc(h_x.size(),sizeof(float));
      // NOTE(review): returning here does not free whichever of the three
      // buffers were allocated successfully.
      if(h_p_x==NULL || h_p_y==NULL || h_p_z==NULL)
        {
          std::cout << "Host memory allocation failure" << std::endl;
           return;
         }

      float* d_p_x;
      float* d_p_y;
      float* d_p_z;

    // Device buffers for the three components.
    // NOTE(review): the early return below releases neither the host staging
    // buffers nor any device buffer that was already allocated.
    if(cudaMalloc((void **)&d_p_x,size) != cudaSuccess || 
     cudaMalloc((void **)&d_p_y,size) != cudaSuccess ||
     cudaMalloc((void **)&d_p_z,size) != cudaSuccess)
    {
      std::string errorString(cudaGetErrorName(cudaGetLastError()));
      std::cout << errorString << std::endl;
      std::cout << "Device memory allocation failure" << std::endl;
      return;
    }
    // One generator state per element of h_x.
    // NOTE(review): this early return also leaks everything allocated above.
    curandStateMRG32k3a *devStates;
    if(cudaMalloc((void **)&devStates, h_x.size() * sizeof(curandStateMRG32k3a)) != cudaSuccess)
    {
      std::string errorString(cudaGetErrorName(cudaGetLastError()));
      std::cout << errorString << std::endl;
      std::cout << "Random generator states memory allocation failure" << std::endl;
      return;
    }

  int threads = 256;
  // NOTE(review): `size` is a byte count, so this requests sizeof(float) times
  // more blocks than elements / threads. setup_kernel has no bounds check and
  // devStates holds only h_x.size() states, so the surplus threads write past
  // the end of devStates. Deriving the block count from h_x.size(), rounded
  // up, and bounds-checking inside setup_kernel would avoid this.
  dim3 grid = size / threads;
  // Neither kernel launch in this function is followed by a
  // cudaGetLastError() check, so a fault raised inside a kernel is first
  // reported by a later CUDA call such as cudaMemcpy.
  setup_kernel<<<grid,threads>>>(devStates);

  // Uploads the zeroed staging buffers. A failure is reported but execution
  // continues (there is no return in this branch).
  if(cudaMemcpy(d_p_x,h_p_x,size,cudaMemcpyHostToDevice) != cudaSuccess ||
     cudaMemcpy(d_p_y,h_p_y,size,cudaMemcpyHostToDevice) != cudaSuccess ||
     cudaMemcpy(d_p_z,h_p_z,size,cudaMemcpyHostToDevice) != cudaSuccess)
    {
      std::string errorString(cudaGetErrorName(cudaGetLastError()));
      std::cout << errorString << std::endl;
      std::cout <<  "Host to Device memory copy failure" << std::endl;
    }

  // size / sizeof(float) recovers the element count; the grid is the same
  // oversized grid computed above.
  computeRandomVectors<<< grid, threads >>>(d_p_x,d_p_y,d_p_z,size / sizeof(float), devStates);

  // Downloads the results into the staging buffers. A failure is reported but
  // execution continues into the copy loop below.
  if(cudaMemcpy(h_p_x,d_p_x,size,cudaMemcpyDeviceToHost) != cudaSuccess ||
     cudaMemcpy(h_p_y,d_p_y,size,cudaMemcpyDeviceToHost) != cudaSuccess ||
     cudaMemcpy(h_p_z,d_p_z,size,cudaMemcpyDeviceToHost) != cudaSuccess)
    {
      std::string errorString(cudaGetErrorName(cudaGetLastError()));
      std::cout << errorString << std::endl;      
      std::cout <<  "Device to Host memory copy failure" << std::endl;
    }
  // Copy the staged values into the caller's vectors.
  for(size_t i = 0; i < h_x.size(); ++i)
    {
      h_x[i] = h_p_x[i];
      h_y[i] = h_p_y[i];
      h_z[i] = h_p_z[i];
    }

  free (h_p_x);
  free (h_p_y);
  free (h_p_z);
  cudaFree (devStates);
  cudaFree (d_p_x);
  cudaFree (d_p_y);
  cudaFree (d_p_z);
  // NOTE(review): cudaDeviceReset() resets the device for the whole process,
  // not just this function's allocations -- confirm callers hold no other
  // device resources when this runs.
  cudaDeviceReset();
}

当向量中的元素数小于 4000 时（我尝试了 1K、2K、3K 和 4K），这段代码可以正常工作。否则，它会在第一个 cudaMemcpy 处报告非法地址错误（cudaErrorIllegalAddress）。我认为并没有耗尽内存，因为我使用的是 GTX 980（4GB 全局内存）。有人知道该如何解决这个问题吗？

编辑：建议修改后的代码如下：

// Seeds the MRG32k3a generator state of every element below numberOfElements.
//
// Each thread derives its flat 1-D global index; threads whose index is not
// below numberOfElements leave without touching `state`. The remaining threads
// initialise their own slot with seed 0, the index as the subsequence number
// and offset 0.
__global__
void setup_kernel(curandStateMRG32k3a *state, unsigned int numberOfElements)
{
  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if(tid >= numberOfElements)
    return;

  curand_init(0, tid, 0, state + tid);
}

/**
 * Writes one random point on the unit sphere per thread into x[i], y[i], z[i].
 *
 * Layout: 1-D grid of 1-D blocks; thread i handles element i. The launch may
 * contain more threads than elements (the grid is rounded up to whole blocks);
 * surplus threads exit before touching any buffer.
 *
 * @param x, y, z           output component arrays, at least numberOfElements long
 * @param numberOfElements  number of points to generate
 * @param state             per-element generator states (expected to have been
 *                          initialised, e.g. by setup_kernel), at least
 *                          numberOfElements long
 *
 * Sampling follows Marsaglia's method: draw (a, b) uniformly from the square
 * [-1, 1]^2, reject until a^2 + b^2 <= 1, then map the pair onto the sphere.
 */
__global__
void computeRandomVectors(float* x, float* y, float* z, unsigned int numberOfElements,curandStateMRG32k3a *state)
{
  const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;

  // Bounds-check BEFORE reading state[i]: with a rounded-up grid the tail
  // threads would otherwise read past the end of the state array, which is an
  // illegal address.
  if(i >= numberOfElements)
    return;

  curandStateMRG32k3a localState = state[i];

  // curand_uniform returns values in (0, 1]; every candidate (including the
  // first one) is mapped to (-1, 1] so that all four quadrants are sampled.
  float a, b, s;
  do
    {
      a = curand_uniform(&localState) * 2.0f - 1.0f;
      b = curand_uniform(&localState) * 2.0f - 1.0f;
      s = a * a + b * b;
    }
  while(s > 1.0f);

  // s <= 1 is guaranteed here, and the same s is reused below, so the
  // square-root argument can never become slightly negative through rounding.
  const float scale = 2.0f * sqrtf(1.0f - s);
  x[i] = a * scale;
  y[i] = b * scale;
  z[i] = 1.0f - 2.0f * s;

  // Persist the advanced generator so a later launch continues the sequence
  // instead of replaying the same numbers.
  state[i] = localState;
}

 // Reports a failed CUDA call: prints the error name followed by `what`.
 // Returns true when `status` is cudaSuccess, false otherwise.
 static bool cudaStepSucceeded(cudaError_t status, const char* what)
 {
   if(status == cudaSuccess)
     return true;

   std::cout << cudaGetErrorName(status) << std::endl;
   std::cout << what << std::endl;
   // Clear the recorded error so it is not reported a second time by a later
   // cudaGetLastError() call.
   (void) cudaGetLastError();
   return false;
 }

 /**
  * Fills h_x, h_y and h_z with random points on the unit sphere, one point per
  * index, generated on the device by setup_kernel + computeRandomVectors.
  *
  * @param h_x, h_y, h_z  output component vectors; all three must already have
  *                       the same size, which is the number of points generated.
  *
  * On any failure a diagnostic is printed to std::cout, every resource that
  * was acquired is released, and the output vectors are left unmodified.
  * An empty input is a no-op.
  */
 void generatePointsOnASphere(thrust::host_vector<float>& h_x, thrust::host_vector<float>& h_y, thrust::host_vector<float>& h_z)
 {
   const size_t count = h_x.size();

   // `||`, not `&&`: a mismatch with EITHER other vector is an error.
   if(count != h_y.size() || count != h_z.size())
     {
       std::cout << "The three component vectors have unmatching size()" << std::endl;
       return;
     }

   // Nothing to generate; a zero-block grid would be an invalid launch.
   if(count == 0)
     return;

   // The kernels take the element count as unsigned int.
   const unsigned int numberOfElements = static_cast<unsigned int>(count);
   if(static_cast<size_t>(numberOfElements) != count)
     {
       std::cout << "Element count does not fit in unsigned int" << std::endl;
       return;
     }

   const size_t size = count * sizeof(float);   // bytes per component buffer

   // All device pointers start as NULL so the single cleanup path at the end
   // can free them unconditionally (cudaFree(NULL) is a no-op).
   float* d_p_x = NULL;
   float* d_p_y = NULL;
   float* d_p_z = NULL;
   curandStateMRG32k3a* devStates = NULL;

   bool ok =
     cudaStepSucceeded(cudaMalloc((void **)&d_p_x, size), "Device memory allocation failure") &&
     cudaStepSucceeded(cudaMalloc((void **)&d_p_y, size), "Device memory allocation failure") &&
     cudaStepSucceeded(cudaMalloc((void **)&d_p_z, size), "Device memory allocation failure") &&
     cudaStepSucceeded(cudaMalloc((void **)&devStates, count * sizeof(curandStateMRG32k3a)),
                       "Random generator states memory allocation failure");

   if(ok)
     {
       const int threads = 512;
       // Grid size is derived from the ELEMENT count (not the byte count) and
       // rounded up so the tail elements are covered; both kernels bounds-check.
       const dim3 grid(static_cast<unsigned int>((count + threads - 1) / threads));

       setup_kernel<<<grid,threads>>>(devStates, numberOfElements);
       // Launch-configuration errors are only visible through cudaGetLastError().
       ok = cudaStepSucceeded(cudaGetLastError(), "Random generator setup kernel launch failure");

       if(ok)
         {
           computeRandomVectors<<< grid, threads >>>(d_p_x, d_p_y, d_p_z, numberOfElements, devStates);
           // Faults raised while the kernels run surface at the synchronisation.
           ok = cudaStepSucceeded(cudaGetLastError(), "Random vectors kernel launch failure") &&
                cudaStepSucceeded(cudaDeviceSynchronize(), "Kernel execution failure");
         }
     }

   if(ok)
     {
       // The kernel overwrites every element, so results are copied straight
       // into the caller's vectors; no zero-filled staging buffers are needed.
       ok = cudaStepSucceeded(cudaMemcpy(&h_x[0], d_p_x, size, cudaMemcpyDeviceToHost),
                              "Device to Host memory copy failure") &&
            cudaStepSucceeded(cudaMemcpy(&h_y[0], d_p_y, size, cudaMemcpyDeviceToHost),
                              "Device to Host memory copy failure") &&
            cudaStepSucceeded(cudaMemcpy(&h_z[0], d_p_z, size, cudaMemcpyDeviceToHost),
                              "Device to Host memory copy failure");
     }

   // Single cleanup path, reached on success and on every failure above.
   cudaFree (devStates);
   cudaFree (d_p_x);
   cudaFree (d_p_y);
   cudaFree (d_p_z);
   // Kept from the original. NOTE(review): cudaDeviceReset() resets the device
   // for the whole process -- confirm callers hold no other device resources.
   cudaDeviceReset();
 }

很抱歉继续在这里追问，但我想，弄清楚我现在错在哪里，可能会让我对 CUDA 有更好的理解。现在，当 h_x.size() 为 20k 时，我在设备到主机（device->host）的 cudaMemcpy 上得到了 cudaErrorIllegalAddress。我仍然不明白为什么这段代码对较小的规模有效，而对较大的规模无效。

c++

cuda

thrust

回答 1

Stack Overflow用户

回答已采纳

发布于 2016-04-08 21:50:22

问题在于：

  size_t size = h_x.size() * sizeof(float);

  ...
  int threads = 256;
  dim3 grid = size / threads;

您的 size 变量是以字节为单位的（元素个数乘以 sizeof(float)），因此不适合用来计算网格大小。您应该像这样计算网格大小：

  dim3 grid = h_x.size() / threads;

或者类似的写法。还请注意，除非向量长度（h_x.size()）能被 threads（即 256）整除，否则这种写法不会初始化所有的 curand 状态。解决这个问题的方法是在 setup_kernel 中加入线程索引检查，类似于另一个内核中的检查：

// Seeds the MRG32k3a generator state of every element with index below `size`.
//
// Threads whose flat 1-D global index is `size` or larger do nothing, so the
// grid may be rounded up to whole blocks. Each remaining thread initialises
// its own slot with seed 0, its index as the subsequence number and offset 0.
__global__
void setup_kernel(curandStateMRG32k3a *state, int size)
{
  const int globalIndex = blockDim.x * blockIdx.x + threadIdx.x;
  const bool inRange = globalIndex < size;
  if (!inRange)
    return;

  curand_init(0, globalIndex, 0, state + globalIndex);
}

并启动足够多的线程来覆盖向量大小：

  dim3 grid = (h_x.size()+threads-1) / threads;

票数 2

页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://stackoverflow.com/questions/36508747

复制

相似问题

问cudaErrorIllegalAdress on cudaMemcpy
EN

回答 1

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问cudaErrorIllegalAdress on cudaMemcpyEN

回答 1

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问cudaErrorIllegalAdress on cudaMemcpy
EN