问为什么这个库达内核会产生不确定的结果？
EN

Stack Overflow用户

提问于 2015-07-03 13:12:41

回答 1查看 776关注 0票数 1

我在一个更大的代码示例中构造了一个我所面临的问题的最小示例。在这个例子中，我希望找到一些数据ys与一个函数fs的平方误差之和，但是我想一次在多个函数上这样做，所以我将fs创建为一个矩阵。原始数据的长度为gridSize，我希望一次对nGrids函数执行此成本函数，因此fs的大小为nGrids*gridSize。

我发现CUDA内核以不确定的方式给出了不可靠的结果，这使我相信我没有正确地执行线程处理(这是我的第一个CUDA内核！)我在这个程序上运行了cuda-memcheck，它没有显示错误。

为了测试这些错误的零散性，我编写了一个脚本，运行了100次，并比较了结果被随机关闭的频率。我发现，当gridSize增长时，关闭它的可能性更大：

gridSize ...  Errors
     300 ...   0/100
     400 ...   0/100
     450 ...   4/100
     500 ...   5/100
     550 ...  55/100
     600 ...  59/100
     650 ... 100/100

这里的想法是让每个块在一个网格上工作，当我想要增强并行性时，只需要调用多个CUDA块。因此，我称之为12个街区，因为这里有12个网格。对于这段代码，我永远不会有超过1000个的gridSize，所以我将把Nthreads留在1024 (因为我的NVIDIA GTX 770上每个块有1024个线程)。

以下是代码：

#include <stdio.h>

#define nGrids 12
#define gridSize 700

void H_get_costs(float* h_xs, float* h_ys, float* h_fs, float* h_costs);
void D_get_costs(float* h_xs, float* h_ys, float* h_fs, float* d_costs);

/**************\
 * cuda Costs *
\**************/
__global__ void cuCosts(float* d_xs, float* d_ys, float* d_fs, float* d_costs) {
    int ir = threadIdx.x;
    int ig = blockIdx.x;

    __shared__ float diff[1024];

    diff[ir] = 0.0;
    __syncthreads();

    if( ir < gridSize-1 && ig < nGrids) {
        diff[ir] =   (d_ys[ir] - d_fs[ig*gridSize + ir])*(d_ys[ir] - d_fs[ig*gridSize + ir]);
        __syncthreads();
        // reduction
        for(int s=1; s < blockDim.x; s*=2) {
            if( ir%(2*s) == 0 && ir+s < gridSize){
                diff[ir] += diff[ir+s];
            }
        }
        __syncthreads();
        d_costs[ig] = diff[0];
    }
    __syncthreads();
}


/****************\
 * Main routine *
\****************/
int main(int argc, char** argv) {

    float h_xs[gridSize];
    float h_ys[gridSize];
    float h_fs[gridSize*nGrids];

    for( int ir = 0; ir < gridSize; ir++) {
        h_xs[ir] = (float)ir/10.0;
        h_ys[ir] = (float)ir/10.0;
    }

    for(int ir = 0; ir < gridSize; ir++) {
        for(int jgrid = 0; jgrid < nGrids; jgrid++) {
            float trand = 2.0*((float)rand()/(float)RAND_MAX) - 1.0;
            h_fs[jgrid*gridSize + ir] = h_ys[ir] + trand;
        }
    }

    float h_costs[nGrids];
    float d_costs[nGrids];

    // get all of the costs (on the host)
    H_get_costs(h_xs, h_ys, h_fs, h_costs);

    // get all of the costs (on the device)
    D_get_costs(h_xs, h_ys, h_fs, d_costs);

    // Print the grids
    /*
    for(int ir = 0; ir < gridSize; ir++) {
        printf("%10.5e %15.5e", h_xs[ir], h_ys[ir]);
        for(int jg = 0; jg < nGrids; jg++) {
            printf("%15.5e", h_fs[jg*gridSize + ir]);
        }
        printf("\n");
    }
    */

   // print the results
    printf("--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
    printf("%-25s  ", "Host ... ");
    for(int ig = 0; ig < nGrids; ig++) {
        printf("%15.5e", h_costs[ig]);
    }
    printf("\n");
    printf("--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
    printf("%-25s  ", "Device ... ");
    for(int ig = 0; ig < nGrids; ig++) {
        printf("%15.5e", d_costs[ig]);
    }
    printf("\n");
    printf("--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
    printf("%-25s  ", "Difference ... ");
    for(int ig = 0; ig < nGrids; ig++) {
        printf("%15.5e", d_costs[ig]-h_costs[ig]);
    }
    printf("\n");

    return 0;
}

/*******************************\
 * get the costs (on the host) *
\*******************************/
void H_get_costs(float* h_xs, float* h_ys, float* h_fs, float* h_costs) {
    for(int ig = 0; ig < nGrids; ig++) { h_costs[ig] = 0.0; }
    for(int ir = 0; ir < gridSize-1; ir++) {
        for(int ig = 0; ig < nGrids; ig++) {
            h_costs[ig] += (h_ys[ir] - h_fs[ig*gridSize + ir])*(h_ys[ir] - h_fs[ig*gridSize + ir]);
        }
    }
}

/**************************\
 * wrapper for cuda costs *
\**************************/
void D_get_costs(float* h_xs_p, float* h_ys_p, float* h_fs_p, float* r_costs) {
    float* d_xs;
    float* d_ys;
    float* d_fs;

    float* d_costs; // device costs
    float* t_costs; // temporary costs

    cudaMalloc( (void**)&d_xs, gridSize*sizeof(float) );
    cudaMalloc( (void**)&d_ys, gridSize*sizeof(float) );
    cudaMalloc( (void**)&d_fs, nGrids*gridSize*sizeof(float) );
    cudaMalloc( (void**)&d_costs, nGrids*sizeof(float) );

    t_costs = (float*)malloc(nGrids*sizeof(float));

    cudaMemcpy( d_xs, h_xs_p, gridSize*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy( d_ys, h_ys_p, gridSize*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy( d_fs, h_fs_p, nGrids*gridSize*sizeof(float), cudaMemcpyHostToDevice);

    int Nthreads = 1024;
    int Nblocks = nGrids;

    cuCosts<<<Nblocks, Nthreads>>>(d_xs, d_ys, d_fs, d_costs);

    cudaMemcpy( t_costs, d_costs, nGrids*sizeof(float), cudaMemcpyDeviceToHost);

    for(int ig = 0; ig < nGrids; ig++) {
        r_costs[ig] = t_costs[ig];
    }

    cudaFree( d_xs );
    cudaFree( d_ys );
    cudaFree( d_fs );
}

如果有关系的话，下面是我的硬件规格：

 CUDA Device Query (Runtime API) version (CUDART static linking)

Detected 1 CUDA Capable device(s)

Device 0: "GeForce GTX 770"
  CUDA Driver Version / Runtime Version          6.0 / 5.5
  CUDA Capability Major/Minor version number:    3.0
  Total amount of global memory:                 2047 MBytes (2146762752 bytes)
  ( 8) Multiprocessors, (192) CUDA Cores/MP:     1536 CUDA Cores
  GPU Clock rate:                                1084 MHz (1.08 GHz)
  Memory Clock rate:                             3505 Mhz
  Memory Bus Width:                              256-bit
  L2 Cache Size:                                 524288 bytes
  Maximum Texture Dimension Size (x,y,z)         1D=(65536), 2D=(65536, 65536), 3D=(4096, 4096, 4096)
  Maximum Layered 1D Texture Size, (num) layers  1D=(16384), 2048 layers
  Maximum Layered 2D Texture Size, (num) layers  2D=(16384, 16384), 2048 layers
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total number of registers available per block: 65536
  Warp size:                                     32
  Maximum number of threads per multiprocessor:  2048
  Maximum number of threads per block:           1024
  Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
  Max dimension size of a grid size    (x,y,z): (2147483647, 65535, 65535)
  Maximum memory pitch:                          2147483647 bytes
  Texture alignment:                             512 bytes
  Concurrent copy and kernel execution:          Yes with 1 copy engine(s)
  Run time limit on kernels:                     Yes
  Integrated GPU sharing Host Memory:            No
  Support host page-locked memory mapping:       Yes
  Alignment requirement for Surfaces:            Yes
  Device has ECC support:                        Disabled
  Device supports Unified Addressing (UVA):      Yes
  Device PCI Bus ID / PCI location ID:           1 / 0
  Compute Mode:
     < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >

deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 6.0, CUDA Runtime Version = 5.5, NumDevs = 1, Device0 = GeForce GTX 770
Result = PASS

cuda

回答 1

Stack Overflow用户

回答已采纳

发布于 2015-07-03 13:44:15

您的内核代码有多个同步问题，导致了这个问题。首先，在__syncthreads()调用周围有分支，这是数据自动化系统中未定义的行为。然后在还原循环中缺少一个同步点，这意味着翘曲到翘曲积累是不正确的。就像这样：

__global__ void cuCosts(float* d_xs, float* d_ys, 
                        float* d_fs, float* d_costs)
{
    int ir = threadIdx.x;
    int ig = blockIdx.x;

    __shared__ float diff[1024];

    diff[ir] = 0.0;
    __syncthreads();

    if( ir < gridSize-1 && ig < nGrids) {
        diff[ir] =   (d_ys[ir] - d_fs[ig*gridSize + ir])*(d_ys[ir] - d_fs[ig*gridSize + ir]);
    }
    __syncthreads();

    // reduction
    for(int s=1; s < blockDim.x; s*=2) {
        if( ir%(2*s) == 0 && ir+s < gridSize){
            diff[ir] += diff[ir+s];
        }
        __syncthreads();
    }
    d_costs[ig] = diff[0];
}

应该正确使用免责声明，在浏览器中编写，不经过测试，自行使用。

票数 1

页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://stackoverflow.com/questions/31207729

复制

相似问题

问为什么这个库达内核会产生不确定的结果？
EN

回答 1

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问为什么这个库达内核会产生不确定的结果？EN

回答 1

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问为什么这个库达内核会产生不确定的结果？
EN