I am using the following CUDA kernel:
__global__
// Single-block reduction: sums data[0..MAX_INDEX-1] into *sum_ptr.
// Expects a one-block launch (the caller uses <<<1, 32>>>).
// block_sum is seeded with 2, presumably to account for the prime 2,
// which the odd-candidates-only data array cannot represent -- TODO confirm
// against the generator kernel.
void sum_worker(int *data, int *sum_ptr)
{
__shared__ int block_sum;
int idx = threadIdx.x;
int thread_sum = 0; // per-thread partial sum, accumulated in a register
if (threadIdx.x == 0)
block_sum = 2; // shared memory is uninitialized; seed exactly once
// Block-strided loop: thread t handles elements t, t+blockDim.x, t+2*blockDim.x, ...
for (int i = idx; i < MAX_INDEX; i += blockDim.x)
thread_sum += data[i];
__syncthreads(); // make the block_sum seed visible to all threads before the atomics
atomicAdd(&block_sum, thread_sum); // fold each thread's partial into the shared total
__syncthreads(); // all atomics must complete before thread 0 reads the total
if (threadIdx.x == 0)
*sum_ptr = block_sum; // publish the block total to the output pointer
}
It is launched using this code:
sum_worker<<<1, 32>>>(primes_or_zeros, sum_buffer);
And it is working fine (no runtime errors and produces the correct result). However, if I change i += blockDim.x to i += 32 I get an error the next time I call cudaDeviceSynchronize():
Cuda error 'an illegal memory access was encountered' in primes_gpu.cu at line 97
Running the kernel with cuda-memcheck:
========= Invalid __global__ read of size 4
========= at 0x00000108 in /home/clifford/Work/handicraft/2016/perfmeas/primes_gpu.cu:35:sum_worker(int*, int*)
========= by thread (31,0,0) in block (0,0,0)
========= Address 0x703b70d7c is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x472225]
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcudart.so.7.5 [0x146ad]
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcudart.so.7.5 (cudaLaunch + 0x143) [0x2ece3]
========= Host Frame:./perfmeas [0x17c7]
========= Host Frame:./perfmeas [0x16b7]
========= Host Frame:./perfmeas [0x16e2]
========= Host Frame:./perfmeas [0x153f]
========= Host Frame:./perfmeas [0xdcd]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf0) [0x20830]
========= Host Frame:./perfmeas [0xf39]
....
Address 0x703b70d7c is indeed out of bounds for data: The array starts at 0x703b40000 and has MAX_INDEX elements. MAX_INDEX is 50000 in this test. (0x703b70d7c - 0x703b40000) / 4 = 50015.
Adding an additional check for i >= 50000 makes the problem magically go away:
for (int i = idx; i < MAX_INDEX; i += 32) {
if (i >= MAX_INDEX)
printf("WTF!\n");
thread_sum += data[i];
}
Is this a bug in CUDA or am I doing something stupid here?
I'm using CUDA 7.5 on Ubuntu 16.04. Output of nvcc --version:
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2015 NVIDIA Corporation
Built on Tue_Aug_11_14:27:32_CDT_2015
Cuda compilation tools, release 7.5, V7.5.17
The complete source code for this test case can be found here:
http://svn.clifford.at/handicraft/2016/perfmeas
(Run with options -gx. This version is using i += blockDim.x. Change that to i += 32 to reproduce the issue.)
Edit: @njuffa said in the comments he doesn't want to follow links off stack overflow because he is "too scared [his] computer might catch something" and would prefer a test case that he can copy&paste from stack overflow directly. So here it goes:
#include <string.h>
#include <stdio.h>
#include <stdbool.h>
#include <math.h>
#define MAX_PRIMES 100000
#define MAX_INDEX (MAX_PRIMES/2)
__global__
// One thread per odd candidate: sets data[idx] = 2*idx+1 when that candidate
// is prime, 0 otherwise. data[0] (candidate 1) is forced to 0; the prime 2 is
// handled separately by sum_worker's block_sum = 2 seed.
void primes_worker(int *data)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx >= MAX_INDEX) // guard the grid tail: the grid may overshoot MAX_INDEX
return;
int p = 2*idx+1; // map index -> odd candidate 1, 3, 5, ...
// Trial division by odd divisors up to sqrt(p).
for (int i = 3; i*i <= p; i += 2) {
if (p % i == 0) {
data[idx] = 0; // composite
return;
}
}
data[idx] = idx ? p : 0; // idx == 0 is candidate 1, which is not prime
}
__global__
// Single-block reduction: sums data[0..MAX_INDEX-1] (plus the seed value 2)
// into *sum_ptr. Expects a one-block launch (the caller uses <<<1, 32>>>).
// ENABLE_BUG swaps the loop stride from the runtime value blockDim.x to the
// compile-time constant 32 -- mathematically identical for a 32-thread block,
// but it triggers the out-of-bounds access under discussion.
void sum_worker(int *data, int *sum_ptr)
{
__shared__ int block_sum;
int idx = threadIdx.x;
int thread_sum = 0; // per-thread partial sum, accumulated in a register
if (threadIdx.x == 0)
block_sum = 2; // shared memory is uninitialized; seed exactly once (accounts for the prime 2)
#ifdef ENABLE_BUG
// Constant stride: the compiler can unroll this loop at compile time.
for (int i = idx; i < MAX_INDEX; i += 32)
thread_sum += data[i];
#else
// Runtime stride: same trip count for a 32-thread block, but not unrollable.
for (int i = idx; i < MAX_INDEX; i += blockDim.x)
thread_sum += data[i];
#endif
__syncthreads(); // make the block_sum seed visible to all threads before the atomics
atomicAdd(&block_sum, thread_sum); // fold each thread's partial into the shared total
__syncthreads(); // all atomics must complete before thread 0 reads the total
if (threadIdx.x == 0)
*sum_ptr = block_sum; // publish the block total (sum_ptr is pinned host memory in this program)
}
int *primes_or_zeros; // device array of MAX_INDEX ints (allocated in primes_gpu_init)
int *sum_buffer;      // single int in pinned host memory, written directly by sum_worker
// Allocates the device flag array and the pinned host result buffer.
// Errors are printed but not fatal; later calls will then fail as well.
void primes_gpu_init()
{
cudaError_t err;
err = cudaMalloc((void**)&primes_or_zeros, sizeof(int)*MAX_INDEX); // device memory
if (err != cudaSuccess)
printf("Cuda error '%s' in %s at line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
err = cudaMallocHost((void**)&sum_buffer, sizeof(int)); // pinned host memory, device-writable here
if (err != cudaSuccess)
printf("Cuda error '%s' in %s at line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
}
// Releases the buffers allocated by primes_gpu_init (device + pinned host).
void primes_gpu_done()
{
cudaError_t err;
err = cudaFree(primes_or_zeros); // counterpart of cudaMalloc
if (err != cudaSuccess)
printf("Cuda error '%s' in %s at line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
err = cudaFreeHost(sum_buffer); // counterpart of cudaMallocHost
if (err != cudaSuccess)
printf("Cuda error '%s' in %s at line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
}
// Runs the sieve kernel followed by the reduction kernel, synchronizes,
// and returns the sum of all primes below MAX_PRIMES.
int primes_gpu()
{
int num_blocks = (MAX_INDEX + 31) / 32; // ceil-div so the grid covers every candidate
int num_treads = 32; // threads per block ("treads" sic)
primes_worker<<<num_blocks, num_treads>>>(primes_or_zeros);
sum_worker<<<1, 32>>>(primes_or_zeros, sum_buffer); // single-block reduction (same stream, runs after primes_worker)
cudaError_t err = cudaDeviceSynchronize(); // surfaces asynchronous kernel faults, e.g. illegal memory access
if (err != cudaSuccess)
printf("Cuda error '%s' in %s at line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
return *sum_buffer; // safe to read: the synchronize above completed the kernels
}
// Test driver: computes the sum of primes below MAX_PRIMES on the GPU and
// checks it against the known reference value 454396537.
int main()
{
primes_gpu_init();
int result = primes_gpu();
printf("Result: %d\n", result);
if (result != 454396537) { // reference sum of primes < 100000
printf("Incorrect result!\n");
return 1; // note: buffers are intentionally not freed on this early-exit path
}
primes_gpu_done();
return 0;
}
Usage:
$ nvcc -o demo demo.cu
$ ./demo
Result: 454396537
$ nvcc -D ENABLE_BUG -o demo demo.cu
$ ./demo
Cuda error 'an illegal memory access was encountered' in demo.cu at line 99
Result: 0
Incorrect result!
Follow-up comments from njuffa (reconstructed; some comments were truncated at the start):

- "[...] #include <stdbool.h>. Still trying to figure out what is going on. As a quick experiment, try reducing ptxas optimization level via -Xptxas -O2, then -Xptxas -O1, then -Xptxas -O0. There is residual risk in compiling code from SO, but at least one can inspect it for anything suspicious up front (before running)." - njuffa
- "[The problem reproduces at the default optimization level but not at] -Xptxas -O1, so this may hint at a back-end code generation issue. Do you see the same at your end? I haven't been able to track down the issue in the machine code yet, because I managed to confuse myself royally, chasing up a dead end." - njuffa
- "[Compiling for a different architecture] (-arch=sm_30, but running on an sm_50 device) also makes the problem disappear, again hinting at an issue with PTXAS optimizations (the PTXAS component in the driver is more recent than the PTXAS component of the CUDA 7.5 toolchain). This suggests that whatever the exact problem is, it may already be fixed in CUDA 8.0 RC (not sure whether that is a realistic option for you to try)." - njuffa
- "[When the loop increment is a runtime value] (threadIdx.x) the loop remains rolled. When it is a compile-time constant (32) it is unrolled by a factor of 4. The unrolled loop has an access out-of-bounds. Interestingly enough, when I unroll the loop explicitly by placing #pragma unroll 4 directly before the loop, I get almost identical machine code, which however works properly! The difference is likely which part of the compiler does the unrolling, frontend or backend. So this looks like a ptxas bug with unrolling loops." - njuffa