Why does this cuda kernel yield non-deterministic results?

Question

I have constructed a minimum example of a problem I am facing in a larger sample of code. In this example, I want to find the sum of squares error of some data ys to a function fs, but I want to do it on multiple functions at a time, so I create fs as a matrix. The original data has length gridSize and I want to perform this cost function on nGrids functions at a time, so fs is of size nGrids*gridSize.

I find that the CUDA kernel gives unreliable results in a non-deterministic fashion, which leads me to believe that I am not performing my threading correctly (this is my first CUDA kernel!). I have run cuda-memcheck on this program and it reveals no errors.

To test the sporadic nature of these errors, I wrote a script to run it 100 times and compare how often the results were randomly off. I find that there is a greater chance of it being off when gridSize grows:

gridSize ...  Errors
     300 ...   0/100
     400 ...   0/100
     450 ...   4/100
     500 ...   5/100
     550 ...  55/100
     600 ...  59/100
     650 ... 100/100

The idea here was to have each block work on one grid and just call multiple CUDA blocks when I want to step up the parallelism. Hence, I call 12 blocks here since there are 12 grids. I will never, for this code, have a gridSize more than 1000, so I will leave Nthreads at 1024 (since I have 1024 threads per block on my NVIDIA GTX 770).

Here is the code:

#include <stdio.h>

#define nGrids 12
#define gridSize 700

void H_get_costs(float* h_xs, float* h_ys, float* h_fs, float* h_costs);
void D_get_costs(float* h_xs, float* h_ys, float* h_fs, float* d_costs);

/**************\
 * cuda Costs *
\**************/
__global__ void cuCosts(float* d_xs, float* d_ys, float* d_fs, float* d_costs) {
    int ir = threadIdx.x;
    int ig = blockIdx.x;

    __shared__ float diff[1024];

    diff[ir] = 0.0;
    __syncthreads();

    if( ir < gridSize-1 && ig < nGrids) {
        diff[ir] =   (d_ys[ir] - d_fs[ig*gridSize + ir])*(d_ys[ir] - d_fs[ig*gridSize + ir]);
        __syncthreads();
        // reduction
        for(int s=1; s < blockDim.x; s*=2) {
            if( ir%(2*s) == 0 && ir+s < gridSize){
                diff[ir] += diff[ir+s];
            }
        }
        __syncthreads();
        d_costs[ig] = diff[0];
    }
    __syncthreads();
}


/****************\
 * Main routine *
\****************/
int main(int argc, char** argv) {

    float h_xs[gridSize];
    float h_ys[gridSize];
    float h_fs[gridSize*nGrids];

    for( int ir = 0; ir < gridSize; ir++) {
        h_xs[ir] = (float)ir/10.0;
        h_ys[ir] = (float)ir/10.0;
    }

    for(int ir = 0; ir < gridSize; ir++) {
        for(int jgrid = 0; jgrid < nGrids; jgrid++) {
            float trand = 2.0*((float)rand()/(float)RAND_MAX) - 1.0;
            h_fs[jgrid*gridSize + ir] = h_ys[ir] + trand;
        }
    }

    float h_costs[nGrids];
    float d_costs[nGrids];

    // get all of the costs (on the host)
    H_get_costs(h_xs, h_ys, h_fs, h_costs);

    // get all of the costs (on the device)
    D_get_costs(h_xs, h_ys, h_fs, d_costs);

    // Print the grids
    /*
    for(int ir = 0; ir < gridSize; ir++) {
        printf("%10.5e %15.5e", h_xs[ir], h_ys[ir]);
        for(int jg = 0; jg < nGrids; jg++) {
            printf("%15.5e", h_fs[jg*gridSize + ir]);
        }
        printf("\n");
    }
    */

   // print the results
    printf("--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
    printf("%-25s  ", "Host ... ");
    for(int ig = 0; ig < nGrids; ig++) {
        printf("%15.5e", h_costs[ig]);
    }
    printf("\n");
    printf("--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
    printf("%-25s  ", "Device ... ");
    for(int ig = 0; ig < nGrids; ig++) {
        printf("%15.5e", d_costs[ig]);
    }
    printf("\n");
    printf("--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
    printf("%-25s  ", "Difference ... ");
    for(int ig = 0; ig < nGrids; ig++) {
        printf("%15.5e", d_costs[ig]-h_costs[ig]);
    }
    printf("\n");

    return 0;
}

/*******************************\
 * get the costs (on the host) *
\*******************************/
void H_get_costs(float* h_xs, float* h_ys, float* h_fs, float* h_costs) {
    for(int ig = 0; ig < nGrids; ig++) { h_costs[ig] = 0.0; }
    for(int ir = 0; ir < gridSize-1; ir++) {
        for(int ig = 0; ig < nGrids; ig++) {
            h_costs[ig] += (h_ys[ir] - h_fs[ig*gridSize + ir])*(h_ys[ir] - h_fs[ig*gridSize + ir]);
        }
    }
}

/**************************\
 * wrapper for cuda costs *
\**************************/
void D_get_costs(float* h_xs_p, float* h_ys_p, float* h_fs_p, float* r_costs) {
    float* d_xs;
    float* d_ys;
    float* d_fs;

    float* d_costs; // device costs
    float* t_costs; // temporary costs

    cudaMalloc( (void**)&d_xs, gridSize*sizeof(float) );
    cudaMalloc( (void**)&d_ys, gridSize*sizeof(float) );
    cudaMalloc( (void**)&d_fs, nGrids*gridSize*sizeof(float) );
    cudaMalloc( (void**)&d_costs, nGrids*sizeof(float) );

    t_costs = (float*)malloc(nGrids*sizeof(float));

    cudaMemcpy( d_xs, h_xs_p, gridSize*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy( d_ys, h_ys_p, gridSize*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy( d_fs, h_fs_p, nGrids*gridSize*sizeof(float), cudaMemcpyHostToDevice);

    int Nthreads = 1024;
    int Nblocks = nGrids;

    cuCosts<<<Nblocks, Nthreads>>>(d_xs, d_ys, d_fs, d_costs);

    cudaMemcpy( t_costs, d_costs, nGrids*sizeof(float), cudaMemcpyDeviceToHost);

    for(int ig = 0; ig < nGrids; ig++) {
        r_costs[ig] = t_costs[ig];
    }

    cudaFree( d_xs );
    cudaFree( d_ys );
    cudaFree( d_fs );
}

If it matters, here is are the specs of my hardware:

 CUDA Device Query (Runtime API) version (CUDART static linking)

Detected 1 CUDA Capable device(s)

Device 0: "GeForce GTX 770"
  CUDA Driver Version / Runtime Version          6.0 / 5.5
  CUDA Capability Major/Minor version number:    3.0
  Total amount of global memory:                 2047 MBytes (2146762752 bytes)
  ( 8) Multiprocessors, (192) CUDA Cores/MP:     1536 CUDA Cores
  GPU Clock rate:                                1084 MHz (1.08 GHz)
  Memory Clock rate:                             3505 Mhz
  Memory Bus Width:                              256-bit
  L2 Cache Size:                                 524288 bytes
  Maximum Texture Dimension Size (x,y,z)         1D=(65536), 2D=(65536, 65536), 3D=(4096, 4096, 4096)
  Maximum Layered 1D Texture Size, (num) layers  1D=(16384), 2048 layers
  Maximum Layered 2D Texture Size, (num) layers  2D=(16384, 16384), 2048 layers
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total number of registers available per block: 65536
  Warp size:                                     32
  Maximum number of threads per multiprocessor:  2048
  Maximum number of threads per block:           1024
  Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
  Max dimension size of a grid size    (x,y,z): (2147483647, 65535, 65535)
  Maximum memory pitch:                          2147483647 bytes
  Texture alignment:                             512 bytes
  Concurrent copy and kernel execution:          Yes with 1 copy engine(s)
  Run time limit on kernels:                     Yes
  Integrated GPU sharing Host Memory:            No
  Support host page-locked memory mapping:       Yes
  Alignment requirement for Surfaces:            Yes
  Device has ECC support:                        Disabled
  Device supports Unified Addressing (UVA):      Yes
  Device PCI Bus ID / PCI location ID:           1 / 0
  Compute Mode:
     < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >

deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 6.0, CUDA Runtime Version = 5.5, NumDevs = 1, Device0 = GeForce GTX 770
Result = PASS

__syncthreads() should be called by all threads in a block otherwise you get UB. — KiaMorot

Unknown Unknown · Accepted Answer · 2015-07-03T13:44:15

Your kernel code has multiple sychronisation issues which are causing the problem. For one, there is branching around a __syncthreads() call, which is undefined behaviour in CUDA. Then you are lacking a synchronisation point in the reduction loop, meaning that warp to warp accumulation is incorrect. Something like this:

__global__ void cuCosts(float* d_xs, float* d_ys, 
                        float* d_fs, float* d_costs)
{
    int ir = threadIdx.x;
    int ig = blockIdx.x;

    __shared__ float diff[1024];

    diff[ir] = 0.0;
    __syncthreads();

    if( ir < gridSize-1 && ig < nGrids) {
        diff[ir] =   (d_ys[ir] - d_fs[ig*gridSize + ir])*(d_ys[ir] - d_fs[ig*gridSize + ir]);
    }
    __syncthreads();

    // reduction
    for(int s=1; s < blockDim.x; s*=2) {
        if( ir%(2*s) == 0 && ir+s < gridSize){
            diff[ir] += diff[ir+s];
        }
        __syncthreads();
    }
    d_costs[ig] = diff[0];
}

should probably work correctly [disclaimer, written in browser, not tested, use at own risk]

Why does this cuda kernel yield non-deterministic results?

1 Answers