Several things were wrong in your code.
- Using double precision on a device with compute capability (CC) < 1.3, which does not support it.
- The size of your thread blocks: as you said, CC <= 1.3 means at most 512 threads per block, but you used 1024 threads per block. I guess you could use `__CUDA_ARCH__` if you do need some multi-architecture code.
- No error checking or memory checking (`cuda-memcheck`). You may allocate more memory than you have, or use more threads/blocks than your GPU can handle, and you will not detect it.
Consider the following example based on your code (I am using `float` instead of `double`):
#include <cuda.h>
#include <stdio.h> // printf
// Extent of the problem: the kernel writes d_distances[row * SIZE + col] for
// row, col < SIZE, i.e. a SIZE x SIZE table.
#define SIZE 988
// Number of blocks per grid dimension.
// NOTE(review): GRID_SIZE * BLOCK_SIZE = 32 * 16 = 512, which is smaller than
// SIZE (988), so a GRID_SIZE x GRID_SIZE grid of BLOCK_SIZE x BLOCK_SIZE blocks
// provides fewer threads per dimension than there are rows/columns.
#define GRID_SIZE 32
// Number of threads per block dimension (BLOCK_SIZE * BLOCK_SIZE threads per block).
#define BLOCK_SIZE 16 // set to 16 instead of 32 for instance
// Checks for CUDA errors at the call site, reporting the current file and line.
#define CUDA_CHECK_ERROR() __cuda_check_errors(__FILE__, __LINE__)
// Wraps a CUDA API call: passes its return code to __cuda_safe_call together
// with the current file and line.
#define CUDA_SAFE_CALL(err) __cuda_safe_call(err, __FILE__, __LINE__)
// See: http://codeyarns.com/2011/03/02/how-to-do-error-checking-in-cuda/
// Reports any pending CUDA error and terminates the program.
//
// Intended to be called right after a kernel launch (via CUDA_CHECK_ERROR).
// Two sources of failure are checked, in order:
//   1. cudaGetLastError(): errors recorded by the launch itself, such as an
//      invalid grid/block configuration. Kernel launches return no status,
//      so this is how launch failures are retrieved.
//   2. cudaDeviceSynchronize(): waits for the device and returns errors that
//      occurred while previously launched work was executing.
//
// filename / line_number identify the call site and are only used in the
// diagnostic message. On error the message is printed and the process exits
// with status -1; on success the function returns normally.
inline void
__cuda_check_errors (const char *filename, const int line_number)
{
  // Launch-time errors first: they are recorded at launch and must be
  // queried explicitly.
  cudaError err = cudaGetLastError ();
  // Only block on the device if the launch itself was accepted.
  if (err == cudaSuccess)
    err = cudaDeviceSynchronize ();
  if (err != cudaSuccess)
    {
      printf ("CUDA error %i at %s:%i: %s\n",
              err, filename, line_number, cudaGetErrorString (err));
      exit (-1);
    }
}
// Validates the status code returned by a CUDA runtime API call.
//
// err         - status returned by the wrapped call.
// filename    - source file of the call site (used in the message only).
// line_number - source line of the call site (used in the message only).
//
// Returns normally when err is cudaSuccess; otherwise prints a diagnostic
// and exits the process with status -1.
inline void
__cuda_safe_call (cudaError err, const char *filename, const int line_number)
{
  // Nothing to report: the common, successful path.
  if (err == cudaSuccess)
    return;

  const char *description = cudaGetErrorString (err);
  printf ("CUDA error %i at %s:%i: %s\n",
          err, filename, line_number, description);
  exit (-1);
}
// Fills the SIZE x SIZE table d_distances with pairwise distances between
// the SIZE points stored in d_coordinates.
//
// d_distances   - output, indexed as d_distances[row * SIZE + col].
// d_coordinates - input, two floats per point: element [i * 2 + 0] and
//                 element [i * 2 + 1]. The formula below treats them as the
//                 two angles of a spherical-law-of-cosines distance
//                 (presumably latitude and longitude in radians -- confirm
//                 against the code that fills the buffer).
//
// Launch layout: 2D grid of 2D blocks. Both dimensions are covered with
// strided loops, so every (row, col) pair below SIZE is computed regardless
// of how many blocks/threads are launched. No shared memory is used.
//
// All arithmetic is single precision (acosf, float literals), so no
// double-precision operations are needed on the device.
__global__ void
createDistanceTable (float *d_distances, float *d_coordinates)
{
  // Scale factor applied to the central angle; presumably the Earth radius
  // in kilometres -- confirm.
  const float radius = 6371.0f;

  // Total number of threads along each grid dimension.
  const int stride_x = (int) (gridDim.x * blockDim.x);
  const int stride_y = (int) (gridDim.y * blockDim.y);

  for (int row = blockIdx.y * blockDim.y + threadIdx.y; row < SIZE;
       row += stride_y)
    {
      // Terms that depend only on the row are computed once per row.
      const float row_a = d_coordinates[row * 2 + 0];
      const float row_b = d_coordinates[row * 2 + 1];
      const float sin_row = __sinf (row_a);
      const float cos_row = __cosf (row_a);

      for (int col = blockIdx.x * blockDim.x + threadIdx.x; col < SIZE;
           col += stride_x)
        {
          const float col_a = d_coordinates[col * 2 + 0];
          const float col_b = d_coordinates[col * 2 + 1];

          float cos_angle = sin_row * __sinf (col_a) +
            cos_row * __cosf (col_a) * __cosf (col_b - row_b);

          // Rounding can push the value slightly outside [-1, 1] (e.g. for
          // identical points), which would make acosf return NaN. Plain
          // comparisons are used so that a NaN input still propagates.
          if (cos_angle > 1.0f)
            cos_angle = 1.0f;
          else if (cos_angle < -1.0f)
            cos_angle = -1.0f;

          d_distances[row * SIZE + col] = acosf (cos_angle) * radius;
        }
    }
}
// Example driver: allocates the device buffers, launches
// createDistanceTable once, checks for errors and releases the buffers.
// Every CUDA API call is wrapped in CUDA_SAFE_CALL and the launch is
// followed by CUDA_CHECK_ERROR, so any failure terminates the program with
// a diagnostic. Returns 0 on success.
int
main ()
{
  float *d_distances = NULL;
  float *d_coordinates = NULL;

  // Output table: SIZE x SIZE floats.
  const size_t distances_bytes = (size_t) SIZE * SIZE * sizeof (float);
  // Input: the kernel indexes d_coordinates[i * 2 + 0] and [i * 2 + 1] for
  // i < SIZE, i.e. exactly 2 * SIZE floats.
  const size_t coordinates_bytes = (size_t) SIZE * 2 * sizeof (float);

  CUDA_SAFE_CALL (cudaMalloc (&d_distances, distances_bytes));
  CUDA_SAFE_CALL (cudaMalloc (&d_coordinates, coordinates_bytes));

  // This example has no real input data; zero the coordinates so the kernel
  // does not read uninitialized device memory.
  CUDA_SAFE_CALL (cudaMemset (d_coordinates, 0, coordinates_bytes));

  // Round up so that blocks_per_dim * BLOCK_SIZE >= SIZE and the whole table
  // is covered even when SIZE is not a multiple of BLOCK_SIZE.
  const unsigned int blocks_per_dim = (SIZE + BLOCK_SIZE - 1) / BLOCK_SIZE;
  dim3 dimGrid (blocks_per_dim, blocks_per_dim);
  dim3 dimBlock (BLOCK_SIZE, BLOCK_SIZE);

  createDistanceTable <<< dimGrid, dimBlock >>> (d_distances, d_coordinates);
  CUDA_CHECK_ERROR ();

  CUDA_SAFE_CALL (cudaFree (d_distances));
  CUDA_SAFE_CALL (cudaFree (d_coordinates));
  return 0;
}
Compilation command (change architecture accordingly):
nvcc prog.cu -g -G -lineinfo -gencode arch=compute_11,code=sm_11 -o prog
With 32x32 block on CC 2.0 or 16x16 on CC 1.1:
cuda-memcheck ./prog
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
With 33x33 block on CC 2.0 or 32x32 block on CC 1.1:
cuda-memcheck ./prog
========= CUDA-MEMCHECK
========= Program hit error 9 on CUDA API call to cudaLaunch
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/nvidia-current-updates/libcuda.so [0x26a230]
========= Host Frame:/opt/cuda/lib64/libcudart.so.5.0 (cudaLaunch + 0x242) [0x2f592]
========= Host Frame:./prog [0xc76]
========= Host Frame:./prog [0xa99]
========= Host Frame:./prog [0xac4]
========= Host Frame:./prog [0x9d1]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xed) [0x2176d]
========= Host Frame:./prog [0x859]
=========
========= ERROR SUMMARY: 1 error
Error 9:
/**
* This indicates that a kernel launch is requesting resources that can
* never be satisfied by the current device. Requesting more shared memory
* per block than the device supports will trigger this error, as will
* requesting too many threads or blocks. See ::cudaDeviceProp for more
* device limitations.
*/ cudaErrorInvalidConfiguration = 9,
cuda-memcheck
and some error checking? – BenC