Cuda not giving correct answer when array size is larger than 1,000,000

Question

I have written a simple sum-reduction program that seems to work fine until I increase the array size to 1 million. What could be the problem?

// Threads per block. sumKernel sizes its shared-memory buffer with this value and
// starts its halving loop at BLOCK_SIZE/2, so it should be a power of two;
// otherwise some elements would be left out of the per-block sum.
#define BLOCK_SIZE 128
// Number of input elements. sumKernel also uses this as the bound for its loads.
#define ARRAY_SIZE 10000

// Host wrapper (defined further down): copies `input` to the GPU, launches
// sumKernel with `totalBlocks` blocks, and copies one sum per block into `output`.
cudaError_t addWithCuda(const long *input, long *output, int totalBlocks, size_t size);

// Per-block sum reduction.
//
// Each thread loads one element of `input` into shared memory (0 for indices at or
// beyond ARRAY_SIZE), then the block folds the buffer in half repeatedly until
// element 0 holds the block's total, which thread 0 stores in output[blockIdx.x].
//
// Expects a 1D launch with blockDim.x == BLOCK_SIZE; the shared buffer is sized by
// BLOCK_SIZE and the halving loop only covers every element when BLOCK_SIZE is a
// power of two.
__global__ void sumKernel(const long *input, long *output)
{
    __shared__ long partial[BLOCK_SIZE];

    const int localIdx = threadIdx.x;
    const int globalIdx = blockDim.x * blockIdx.x + localIdx;

    // Out-of-range threads contribute the additive identity.
    partial[localIdx] = (globalIdx < ARRAY_SIZE) ? input[globalIdx] : 0;
    __syncthreads();

    for (int stride = BLOCK_SIZE / 2; stride > 0; stride /= 2)
    {
        if (localIdx < stride)
        {
            partial[localIdx] += partial[localIdx + stride];
        }
        // Barrier stays outside the branch so every thread in the block reaches it.
        __syncthreads();
    }

    if (localIdx == 0)
    {
        output[blockIdx.x] = partial[0];
    }
}

// Host driver: fills an array with 1..ARRAY_SIZE, asks the GPU for one partial sum
// per block via addWithCuda, adds the partial sums on the host and prints the total.
//
// Returns 0 on success and 1 if a host allocation or any CUDA step fails.
int main()
{
    // Ceil-divide so a final, partially filled block covers the tail elements.
    int totalBlocks = (ARRAY_SIZE + BLOCK_SIZE - 1) / BLOCK_SIZE;

    long *input = (long*) malloc(ARRAY_SIZE * sizeof(long));
    long *output = (long*) malloc(totalBlocks * sizeof(long));
    if (input == NULL || output == NULL) {
        fprintf(stderr, "malloc failed!");
        free(input);   // free(NULL) is a no-op, so this is safe for either failure
        free(output);
        return 1;
    }

    for (int i = 0; i < ARRAY_SIZE; i++) {
        input[i] = i + 1;
    }

    // Compute the per-block partial sums on the GPU.
    cudaError_t cudaStatus = addWithCuda(input, output, totalBlocks, ARRAY_SIZE);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        free(input);
        free(output);
        return 1;
    }

    // The grand total is ARRAY_SIZE*(ARRAY_SIZE+1)/2, which is 500,000,500,000 for
    // one million elements. That does not fit in a 32-bit `long` (e.g. on Windows),
    // so accumulate in `long long`, which is at least 64 bits everywhere.
    long long ans = 0;
    for (int i = 0; i < totalBlocks; i++) {
        ans += output[i];
    }

    // Host buffers are no longer needed once the total has been computed.
    free(input);
    free(output);

    printf("Final Ans : %lld", ans);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    // Keep the console window open until a key is pressed.
    getchar();

    return 0;
}

     // Helper that runs sumKernel on the GPU.
     //
     // input       host array of `size` elements to be summed
     // output      host array of `totalBlocks` elements; receives one sum per block
     // totalBlocks number of thread blocks to launch (BLOCK_SIZE threads each)
     // size        number of elements in `input`
     //
     // Returns cudaSuccess, or the first CUDA error encountered. Device buffers are
     // released on every path.
     //
     // NOTE: sumKernel bounds-checks against the ARRAY_SIZE macro rather than `size`,
     // so callers should pass size == ARRAY_SIZE.
     cudaError_t addWithCuda(const long *input, long *output, int totalBlocks, size_t size)
     {
         long *dev_input = 0;
         long *dev_output = 0;

         cudaError_t cudaStatus;

         // Choose which GPU to run on, change this on a multi-GPU system.
         cudaStatus = cudaSetDevice(0);
         if (cudaStatus != cudaSuccess) {
             fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
             goto Error;
         }

         // Allocate GPU buffers for the input elements and the per-block sums.
         cudaStatus = cudaMalloc((void**)&dev_input, size * sizeof(long));
         if (cudaStatus != cudaSuccess) {
             fprintf(stderr, "cudaMalloc failed!");
             goto Error;
         }

         cudaStatus = cudaMalloc((void**)&dev_output, totalBlocks * sizeof(long));
         if (cudaStatus != cudaSuccess) {
             fprintf(stderr, "cudaMalloc failed!");
             goto Error;
         }

         // Copy the input elements from host memory to the GPU buffer.
         cudaStatus = cudaMemcpy(dev_input, input, size * sizeof(long), cudaMemcpyHostToDevice);
         if (cudaStatus != cudaSuccess) {
             fprintf(stderr, "cudaMemcpy failed!");
             goto Error;
         }

         // `output` is deliberately not copied to the device: thread 0 of every block
         // overwrites its own slot, so uploading the caller's uninitialized buffer
         // would only read indeterminate host memory.

         // Launch a kernel on the GPU with one thread for each element.
         sumKernel<<<totalBlocks, BLOCK_SIZE>>>(dev_input, dev_output);

         // A launch does not return a status itself; launch-configuration errors are
         // reported through cudaGetLastError.
         cudaStatus = cudaGetLastError();
         if (cudaStatus != cudaSuccess) {
             fprintf(stderr, "sumKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
             goto Error;
         }

         // cudaDeviceSynchronize waits for the kernel to finish, and returns
         // any errors encountered while it was running.
         cudaStatus = cudaDeviceSynchronize();
         if (cudaStatus != cudaSuccess) {
             fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching sumKernel!\n", cudaStatus);
             goto Error;
         }

         // Copy the per-block sums from the GPU buffer to host memory.
         cudaStatus = cudaMemcpy(output, dev_output, totalBlocks * sizeof(long), cudaMemcpyDeviceToHost);
         if (cudaStatus != cudaSuccess) {
             fprintf(stderr, "cudaMemcpy failed!");
             goto Error;
         }

     Error:
         // Reached on success as well as on failure; cudaFree accepts null pointers.
         cudaFree(dev_input);
         cudaFree(dev_output);

         return cudaStatus;
     }

Just for reference, in case it has something to do with my GPU device: my GPU is a GTX 650 Ti, and here is the info about it:

Maximum number of threads per multiprocessor: 2048

Maximum number of threads per block: 1024

Maximum sizes of each dimension of a block: 1024 x 1024 x 64

Maximum sizes of each dimension of a grid: 2147483647 x 65535 x 65535

Maximum memory pitch: 2147483647 bytes

Texture alignment: 512 bytes

Please do proper cuda error checking on all cuda API calls and kernel calls. Then, if no errors are reported, please run your code through cuda-memcheck i.e. cuda-memcheck myapp and identify if any errors are reported by cuda-memcheck. Please also properly indent and format your code so it is more readable. — Robert Crovella

Shahzaib Ali Khan Shahzaib Ali Khan · Accepted Answer · 2014-01-19T13:48:24

Actually, the answer could not fit in a `long` either, so after switching the data types to `long double` the issue was resolved. Thanks, all!

Cuda not giving correct answer when array size is larger than 1,000,000

3 Answers