I am using the following CUDA kernel:
__global__
// Single-block reduction: sums data[0..MAX_INDEX-1] into *sum_ptr.
// Expects a one-block launch (the caller uses <<<1, 32>>>).
// block_sum is seeded with 2, presumably to account for the prime 2,
// which the odd-candidates-only data array cannot represent -- TODO confirm
// against the generator kernel.
void sum_worker(int *data, int *sum_ptr)
{
__shared__ int block_sum;
int idx = threadIdx.x;
int thread_sum = 0; // per-thread partial sum, accumulated in a register
if (threadIdx.x == 0)
block_sum = 2; // shared memory is uninitialized; seed exactly once
// Block-strided loop: thread t handles elements t, t+blockDim.x, t+2*blockDim.x, ...
for (int i = idx; i < MAX_INDEX; i += blockDim.x)
thread_sum += data[i];
__syncthreads(); // make the block_sum seed visible to all threads before the atomics
atomicAdd(&block_sum, thread_sum); // fold each thread's partial into the shared total
__syncthreads(); // all atomics must complete before thread 0 reads the total
if (threadIdx.x == 0)
*sum_ptr = block_sum; // publish the block total to the output pointer
}
It is launched using this code:
sum_worker<<<1, 32>>>(primes_or_zeros, sum_buffer);
And it is working fine (no runtime errors and produces the correct result). However, if I change i += blockDim.x to i += 32 I get an error the next time I call cudaDeviceSynchronize():
Cuda error 'an illegal memory access was encountered' in primes_gpu.cu at line 97
Running the kernel with cuda-memcheck:
========= Invalid __global__ read of size 4
========= at 0x00000108 in /home/clifford/Work/handicraft/2016/perfmeas/primes_gpu.cu:35:sum_worker(int*, int*)
========= by thread (31,0,0) in block (0,0,0)
========= Address 0x703b70d7c is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x472225]
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcudart.so.7.5 [0x146ad]
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcudart.so.7.5 (cudaLaunch + 0x143) [0x2ece3]
========= Host Frame:./perfmeas [0x17c7]
========= Host Frame:./perfmeas [0x16b7]
========= Host Frame:./perfmeas [0x16e2]
========= Host Frame:./perfmeas [0x153f]
========= Host Frame:./perfmeas [0xdcd]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf0) [0x20830]
========= Host Frame:./perfmeas [0xf39]
....
Address 0x703b70d7c is indeed out of bounds for data: The array starts at 0x703b40000 and has MAX_INDEX elements. MAX_INDEX is 50000 in this test. (0x703b70d7c - 0x703b40000) / 4 = 50015.
Adding an additional check for i >= 50000 makes the problem magically go away:
for (int i = idx; i < MAX_INDEX; i += 32) {
if (i >= MAX_INDEX)
printf("WTF!\n");
thread_sum += data[i];
}
Is this a bug in CUDA or am I doing something stupid here?
I'm using CUDA 7.5 on Ubuntu 16.04. Output of nvcc --version:
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2015 NVIDIA Corporation
Built on Tue_Aug_11_14:27:32_CDT_2015
Cuda compilation tools, release 7.5, V7.5.17
The complete source code for this test case can be found here:
http://svn.clifford.at/handicraft/2016/perfmeas
(Run with options -gx. This version is using i += blockDim.x. Change that to i += 32 to reproduce the issue.)
Edit: @njuffa said in the comments he doesn't want to follow links off stack overflow because he is "too scared [his] computer might catch something" and would prefer a test case that he can copy&paste from stack overflow directly. So here it goes:
#include <string.h>
#include <stdio.h>
#include <stdbool.h>
#include <math.h>
#define MAX_PRIMES 100000
#define MAX_INDEX (MAX_PRIMES/2)
__global__
// One thread per odd candidate: sets data[idx] = 2*idx+1 when that candidate
// is prime, 0 otherwise. data[0] (candidate 1) is forced to 0; the prime 2 is
// handled separately by sum_worker's block_sum = 2 seed.
void primes_worker(int *data)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx >= MAX_INDEX) // guard the grid tail: the grid may overshoot MAX_INDEX
return;
int p = 2*idx+1; // map index -> odd candidate 1, 3, 5, ...
// Trial division by odd divisors up to sqrt(p).
for (int i = 3; i*i <= p; i += 2) {
if (p % i == 0) {
data[idx] = 0; // composite
return;
}
}
data[idx] = idx ? p : 0; // idx == 0 is candidate 1, which is not prime
}
__global__
// Single-block reduction: sums data[0..MAX_INDEX-1] (plus the seed value 2)
// into *sum_ptr. Expects a one-block launch (the caller uses <<<1, 32>>>).
// ENABLE_BUG swaps the loop stride from the runtime value blockDim.x to the
// compile-time constant 32 -- mathematically identical for a 32-thread block,
// but it triggers the out-of-bounds access under discussion.
void sum_worker(int *data, int *sum_ptr)
{
__shared__ int block_sum;
int idx = threadIdx.x;
int thread_sum = 0; // per-thread partial sum, accumulated in a register
if (threadIdx.x == 0)
block_sum = 2; // shared memory is uninitialized; seed exactly once (accounts for the prime 2)
#ifdef ENABLE_BUG
// Constant stride: the compiler can unroll this loop at compile time.
for (int i = idx; i < MAX_INDEX; i += 32)
thread_sum += data[i];
#else
// Runtime stride: same trip count for a 32-thread block, but not unrollable.
for (int i = idx; i < MAX_INDEX; i += blockDim.x)
thread_sum += data[i];
#endif
__syncthreads(); // make the block_sum seed visible to all threads before the atomics
atomicAdd(&block_sum, thread_sum); // fold each thread's partial into the shared total
__syncthreads(); // all atomics must complete before thread 0 reads the total
if (threadIdx.x == 0)
*sum_ptr = block_sum; // publish the block total (sum_ptr is pinned host memory in this program)
}
int *primes_or_zeros; // device array of MAX_INDEX ints (allocated in primes_gpu_init)
int *sum_buffer;      // single int in pinned host memory, written directly by sum_worker
// Allocates the device flag array and the pinned host result buffer.
// Errors are printed but not fatal; later calls will then fail as well.
void primes_gpu_init()
{
cudaError_t err;
err = cudaMalloc((void**)&primes_or_zeros, sizeof(int)*MAX_INDEX); // device memory
if (err != cudaSuccess)
printf("Cuda error '%s' in %s at line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
err = cudaMallocHost((void**)&sum_buffer, sizeof(int)); // pinned host memory, device-writable here
if (err != cudaSuccess)
printf("Cuda error '%s' in %s at line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
}
// Releases the buffers allocated by primes_gpu_init (device + pinned host).
void primes_gpu_done()
{
cudaError_t err;
err = cudaFree(primes_or_zeros); // counterpart of cudaMalloc
if (err != cudaSuccess)
printf("Cuda error '%s' in %s at line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
err = cudaFreeHost(sum_buffer); // counterpart of cudaMallocHost
if (err != cudaSuccess)
printf("Cuda error '%s' in %s at line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
}
// Runs the sieve kernel followed by the reduction kernel, synchronizes,
// and returns the sum of all primes below MAX_PRIMES.
int primes_gpu()
{
int num_blocks = (MAX_INDEX + 31) / 32; // ceil-div so the grid covers every candidate
int num_treads = 32; // threads per block ("treads" sic)
primes_worker<<<num_blocks, num_treads>>>(primes_or_zeros);
sum_worker<<<1, 32>>>(primes_or_zeros, sum_buffer); // single-block reduction (same stream, runs after primes_worker)
cudaError_t err = cudaDeviceSynchronize(); // surfaces asynchronous kernel faults, e.g. illegal memory access
if (err != cudaSuccess)
printf("Cuda error '%s' in %s at line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
return *sum_buffer; // safe to read: the synchronize above completed the kernels
}
// Test driver: computes the sum of primes below MAX_PRIMES on the GPU and
// checks it against the known reference value 454396537.
int main()
{
primes_gpu_init();
int result = primes_gpu();
printf("Result: %d\n", result);
if (result != 454396537) { // reference sum of primes < 100000
printf("Incorrect result!\n");
return 1; // note: buffers are intentionally not freed on this early-exit path
}
primes_gpu_done();
return 0;
}
Usage:
$ nvcc -o demo demo.cu
$ ./demo
Result: 454396537
$ nvcc -D ENABLE_BUG -o demo demo.cu
$ ./demo
Cuda error 'an illegal memory access was encountered' in demo.cu at line 99
Result: 0
Incorrect result!
Follow-up comments from njuffa (reconstructed; some comments were truncated at the start):

- "[...] #include <stdbool.h>. Still trying to figure out what is going on. As a quick experiment, try reducing ptxas optimization level via -Xptxas -O2, then -Xptxas -O1, then -Xptxas -O0. There is residual risk in compiling code from SO, but at least one can inspect it for anything suspicious up front (before running)." - njuffa
- "[The problem reproduces at the default optimization level but not at] -Xptxas -O1, so this may hint at a back-end code generation issue. Do you see the same at your end? I haven't been able to track down the issue in the machine code yet, because I managed to confuse myself royally, chasing up a dead end." - njuffa
- "[Compiling for a different architecture] (-arch=sm_30, but running on an sm_50 device) also makes the problem disappear, again hinting at an issue with PTXAS optimizations (the PTXAS component in the driver is more recent than the PTXAS component of the CUDA 7.5 toolchain). This suggests that whatever the exact problem is, it may already be fixed in CUDA 8.0 RC (not sure whether that is a realistic option for you to try)." - njuffa
- "[When the loop increment is a runtime value] (threadIdx.x) the loop remains rolled. When it is a compile-time constant (32) it is unrolled by a factor of 4. The unrolled loop has an access out-of-bounds. Interestingly enough, when I unroll the loop explicitly by placing #pragma unroll 4 directly before the loop, I get almost identical machine code, which however works properly! The difference is likely which part of the compiler does the unrolling, frontend or backend. So this looks like a ptxas bug with unrolling loops." - njuffa