#include <stdio.h>
#include <cuda_runtime.h>
// switch the compiler flag if you don't have the sdk's helper_cuda.h file
#if 1
#include "helper_cuda.h"
#define checkCudaErrors(val) (val)
#define getLastCudaError(msg)
#ifdef __CDT_PARSER__
#define __global__
#define __device__
#define __shared__
#define __host__
// compute sum of val over num threads
__device__ float localSum(const float& val, volatile float* reductionSpace, const uint& localId)
reductionSpace[localId] = val; // load data into shared mem
// complete loop unroll
if (localId < 128) reductionSpace[localId] += reductionSpace[localId + 128];
if (localId < 64) reductionSpace[localId] += reductionSpace[localId + 64];
// within one warp (=32 threads) instructions are SIMD synchronous
// -> __syncthreads() not needed
if (localId < 32)
reductionSpace[localId] += reductionSpace[localId + 32];
reductionSpace[localId] += reductionSpace[localId + 16];
reductionSpace[localId] += reductionSpace[localId + 8];
reductionSpace[localId] += reductionSpace[localId + 4];
reductionSpace[localId] += reductionSpace[localId + 2];
reductionSpace[localId] += reductionSpace[localId + 1];
## Edit: Here we need to sync in order to guarantee that the thread with ID 0 is also done... ##
return reductionSpace[0];
__global__ void d_kernel(float* od, int n)
extern __shared__ float reductionSpace[];
int g_idx = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned int linId = threadIdx.x;
__shared__ float partialSums[21];
float tmp[6] =
{ 0, 0, 0, 0, 0, 0 };
// for simplification all computations are remove - this version still shows the same behaviour
if (g_idx < n)
tmp[0] = 1.0f;
tmp[1] = 1.0f;
tmp[2] = 1.0f;
tmp[3] = 1.0f;
tmp[4] = 1.0f;
tmp[5] = 1.0f;
float res = 0.0f;
int c = 0;
for (int i = 0; i < 6; ++i)
for (int j = i; j < 6; ++j, ++c)
res = tmp[i] * tmp[j];
// compute the sum of the values res for blockDim.x threads. This uses
// the shared memory reductionSpace for calculations
partialSums[c] = localSum(res, reductionSpace, linId);
// write back the sum values for this block
if (linId < 21)
atomicAdd(&od[linId], partialSums[linId]);
int main()
int w = 320;
int h = 240;
int n = w * h;
// ------------------------------------------------------------------------------------
float *d_out;
checkCudaErrors(cudaMalloc(&d_out, 21 * sizeof(float)));
float* h_out = new float[21];
int dimBlock = 256;
int dimGrid = (n - 1) / dimBlock + 1;
int sharedMemSize = dimBlock * sizeof(float);
printf("w: %d\n", w);
printf("h: %d\n", h);
printf("dimBlock: %d\n", dimBlock);
printf("dimGrid: %d\n", dimGrid);
printf("sharedMemSize: %d\n", sharedMemSize);
int failcounter = 0;
float target = (float) n;
int c = 0;
// ------------------------------------------------------------------------------------
// run the kernel for 200 times
for (int run = 0; run < 200; ++run)
cudaMemset(d_out, 0, 21 * sizeof(float));
d_kernel<<<dimGrid, dimBlock, sharedMemSize>>>(d_out, n);;
checkCudaErrors(cudaMemcpy(h_out, d_out, 21 * sizeof(float), cudaMemcpyDeviceToHost));
// check if the output has target value
// since all threads get value 1 the kernel output corresponds to counting the elements which is w*h=n
bool failed = false;
for (int i = 0; i < 21; ++i)
if (abs(h_out[i] - target) > 0.01f)
failed = true;
// if failed, print the elements to show which one failed
if (failed)
c = 0;
for (int i = 0; i < 6; ++i)
for (int j = i; j < 6; ++j, ++c)
printf("%10.7f ", h_out[c]);
printf("failcounter: %d\n", failcounter);
// ------------------------------------------------------------------------------------
delete[] h_out;
// ------------------------------------------------------------------------------------
return 0;
我现在测试了很多东西,我发现它必须用BlockSize来做一些事情。如果我将其简化为smth <=64并相应地更改localSum(),那么一切都按照预期的方式工作。
// compute sum of val over num threads
__device__ float localSum(const float& val, volatile float* reductionSpace, const uint& localId)
reductionSpace[localId] = val; // load data into shared mem
for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
if (localId < s)
reductionSpace[localId] += reductionSpace[localId + s];
return reductionSpace[0];
// compute sum of val over num threads
__device__ float localSum(const float& val, volatile float* reductionSpace, const uint& localId)
reductionSpace[localId] = val; // load data into shared mem
for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1)
if (localId < s)
reductionSpace[localId] += reductionSpace[localId + s];
if (localId < 32)
reductionSpace[localId] += reductionSpace[localId + 32];
reductionSpace[localId] += reductionSpace[localId + 16];
reductionSpace[localId] += reductionSpace[localId + 8];
reductionSpace[localId] += reductionSpace[localId + 4];
reductionSpace[localId] += reductionSpace[localId + 2];
reductionSpace[localId] += reductionSpace[localId + 1];
return reductionSpace[0];
发布于 2015-05-01 02:44:28
解决办法太简单了,我几乎羞于告诉它。我眼花缭乱,四处张望,却没有看到最明显的代码。在localSum()中的返回语句之前缺少一个简单的localSum()。Bc最后一次弯曲本身是同时执行的,但不能保证使用threadID 0的那个已经完成.这是个愚蠢的错误,我只是没看到。