CUDA illegal memory access

Question

I'm trying to get this code to work with a 3D type structure. I'm using Cuda's 2D functions. So the host side linear data ('board') is of size width * height * depth, and the 2D mallocs are width x height*depth (here the width and height are both DIMxDIM elements). The kernel processes data from A to B. I am getting an illegal memory access error (using the memory checker), at the line

dst[offset] = curr;

The error goes away if I change the malloc to HEIGHT * 2, but the sizes seem to match. What am I missing? Other criticism is welcome too, I'm new to both C++ and CUDA.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
#include <stdlib.h>

typedef signed int sint;
typedef unsigned int uint;

#define DIM 512
#define TPB 32 // Threads per block

#define CLEARANCE 5
#define MAPLAYERS 2
#define WIDTH (sizeof(sint) * DIM)
#define HEIGHT (DIM * MAPLAYERS)

void route(sint *A, size_t &pitchA, sint *B, size_t &pitchB, sint *board, int src, int dest);
__global__ void map(sint *src, size_t pitchSrc, sint *dst, size_t pitchDst, unsigned long *index);
__device__ bool hasClearance(sint* src, sint x, sint y, sint z, size_t pitch);
__device__ bool inBounds(sint x, sint y, sint z, sint xoff, sint yoff, sint zoff);
__device__ inline long long calcOffset(sint x, sint y, sint z, sint xoff, sint yoff, sint zoff, size_t pitch);

dim3 blocks(DIM / TPB, DIM / TPB, MAPLAYERS);
dim3 threads(TPB, TPB);

/** CUDA Error Check */
#define CER(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        int tmp;
        std::cin >> tmp;
        exit(code);
    }
}

int main(void) {

    sint *A;
    sint *B;
    size_t pitchA, pitchB;
    sint *board = new sint[WIDTH*HEIGHT];

    CER(cudaMallocPitch(&A, &pitchA, WIDTH, HEIGHT));
    CER(cudaMallocPitch(&B, &pitchB, WIDTH, HEIGHT));
    CER(cudaMemset2D(A, pitchA, 0, WIDTH, HEIGHT));
    CER(cudaMemset2D(B, pitchA, 0, WIDTH, HEIGHT));

    route(A, pitchA, B, pitchB, board, 0, DIM*DIM - 1);

    CER(cudaFree(A));
    CER(cudaFree(B));
    delete[] board;
}

void route(sint *A, size_t &pitchA, sint *B, size_t &pitchB, sint *board, int src, int dest) {
    unsigned long *dev_index;
    unsigned long index = NULL;

    CER(cudaMalloc((void**)&dev_index, sizeof(unsigned long)));
    CER(cudaMemcpy(dev_index, &index, sizeof(unsigned long), cudaMemcpyHostToDevice));

    CER(cudaMemcpy2D(A, pitchA, board, WIDTH, WIDTH, HEIGHT, cudaMemcpyHostToDevice));
    CER(cudaMemcpy2D(B, pitchB, board, WIDTH, WIDTH, HEIGHT, cudaMemcpyHostToDevice));

    map << <blocks, threads >> >(B, pitchB, A, pitchA, dev_index);
    CER(cudaPeekAtLastError());
    CER(cudaMemcpy(&index, dev_index, sizeof(unsigned long), cudaMemcpyDeviceToHost));
    if (index != NULL) {
        // break condition
    }

}

__global__ void map(sint *src, size_t pitchSrc, sint *dst, size_t pitchDst, unsigned long *index) {
    unsigned int x = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int y = threadIdx.y + blockIdx.y * blockDim.y;
    unsigned int z = blockIdx.z + blockIdx.z * blockDim.z;
    unsigned long long offset = calcOffset(x, y, z, 0, 0, 0, pitchDst);

    sint curr;

    if (!inBounds(x, y, z, 0, 0, 0))
        return;

    curr = src[calcOffset(x, y, z, 0, 0, 0, pitchSrc)];
    if (z % 2 == 0 && curr == 0 && hasClearance(src, x, y, z, pitchSrc)) {
        // Processing
    }
    else
        dst[offset] = 1;

    return;
}

/** Finds linear offset for a given pixel and offset. */
__device__ inline long long calcOffset(sint x, sint y, sint z, sint xoff, sint yoff, sint zoff, size_t pitch) {
    return (x + xoff) + (y + yoff) * pitch + ((z + zoff) * pitch * (HEIGHT / MAPLAYERS));
}


/** Checks if position is valid on the map. */
__device__ bool inBounds(sint x, sint y, sint z, sint xoff, sint yoff, sint zoff) {
    if (0 > (x + xoff) || (x + xoff) >= DIM || 0 > (y + yoff) || (y + yoff) >= DIM || 0 > (z + zoff) || (z + zoff) >= MAPLAYERS)
        return false;
    return true;
}


/** Returns true if a block has clearnace */
__device__ bool hasClearance(sint* src, sint x, sint y, sint z, size_t pitch) {
    for (int c = -CLEARANCE; c <= CLEARANCE; c++) {
        for (int r = -CLEARANCE; r <= CLEARANCE; r++){
            if (inBounds(x, y, z, r, c, 0)){
                if (src[calcOffset(x, y, z, r, c, 0, pitch)] == 2 || src[calcOffset(x, y, z, r, c, 0, pitch)] == 1)
                    return false;
            }
            else {
                return false;
            }
        }
    }
    return true;
}

The output of the CUDA debugger:

Memory Checker detected 384 access violations.
error = access violation on load (global memory)
gridid = 18
blockIdx = {0,8,0}
threadIdx = {0,4,0}
address = 0x05d08000
accessSize = 4

Please provide a code that someone else could compile. If you're not sure what that is, copy the code out of this question into a brand new project, and keep fixing all the compile errors until there aren't any compile errors left. Then make sure that code demonstrates the access violation that you're asking about. Then paste that fixed code back into the question. — Robert Crovella

Robert Crovella Robert Crovella · Accepted Answer · 2015-04-21T04:41:06

This doesn't look right:

sint *board = new sint[WIDTH*HEIGHT];

I think you meant this:

sint *board = new sint[DIM*HEIGHT];

This doesn't look right:

unsigned int z = blockIdx.z + blockIdx.z * blockDim.z;

I think you meant this:

unsigned int z = threadIdx.z + blockIdx.z * blockDim.z;

But the crux of the issue is that you are using pitch values (which are counting bytes of row width) in arithmetic that is counting indexes into sint arrays. You need to scale your pitch values by sizeof(sint) when you are calculating indices this way. Even that is not quite correct. The proper thing to do is cast to a unsigned char pointer, do your arithmetic in rows times pitch (i.e. bytes), then cast back the start-of-the row pointer from unsigned char to sint, and then index from there via (x+xoff). In effect, this means your calcOffset routine needs to be re-written, and needs to accept the underlying pointer as a parameter, and return a pointer.

So this code has those changes:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
#include <stdlib.h>

typedef signed int sint;
typedef unsigned int uint;

#define DIM 512
#define TPB 32 // Threads per block

#define CLEARANCE 5
#define MAPLAYERS 2
#define WIDTH (sizeof(sint) * DIM)
#define HEIGHT (DIM * MAPLAYERS)

void route(sint *A, size_t &pitchA, sint *B, size_t &pitchB, sint *board, int src, int dest);
__global__ void map(sint *src, size_t pitchSrc, sint *dst, size_t pitchDst, unsigned long *index);
__device__ bool hasClearance(sint* src, sint x, sint y, sint z, size_t pitch);
__device__ bool inBounds(sint x, sint y, sint z, sint xoff, sint yoff, sint zoff);
__device__ inline sint * calcOffset(sint *ptr, sint x, sint y, sint z, sint xoff, sint yoff, sint zoff, size_t pitch);

dim3 blocks(DIM / TPB, DIM / TPB, MAPLAYERS);
dim3 threads(TPB, TPB);

/** CUDA Error Check */
#define CER(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        int tmp;
        std::cin >> tmp;
        exit(code);
    }
}

int main(void) {

    sint *A;
    sint *B;
    size_t pitchA, pitchB;
    sint *board = new sint[DIM*HEIGHT];

    CER(cudaMallocPitch(&A, &pitchA, WIDTH, HEIGHT));
    CER(cudaMallocPitch(&B, &pitchB, WIDTH, HEIGHT));
    CER(cudaMemset2D(A, pitchA, 0, WIDTH, HEIGHT));
    CER(cudaMemset2D(B, pitchA, 0, WIDTH, HEIGHT));

    route(A, pitchA, B, pitchB, board, 0, DIM*DIM - 1);

    CER(cudaFree(A));
    CER(cudaFree(B));
    delete[] board;
}

void route(sint *A, size_t &pitchA, sint *B, size_t &pitchB, sint *board, int src, int dest) {
    unsigned long *dev_index;
    unsigned long index = 0;

    CER(cudaMalloc((void**)&dev_index, sizeof(unsigned long)));
    CER(cudaMemcpy(dev_index, &index, sizeof(unsigned long), cudaMemcpyHostToDevice));

    CER(cudaMemcpy2D(A, pitchA, board, WIDTH, WIDTH, HEIGHT, cudaMemcpyHostToDevice));
    CER(cudaMemcpy2D(B, pitchB, board, WIDTH, WIDTH, HEIGHT, cudaMemcpyHostToDevice));

    map << <blocks, threads >> >(B, pitchB, A, pitchA, dev_index);
    CER(cudaPeekAtLastError());
    CER(cudaMemcpy(&index, dev_index, sizeof(unsigned long), cudaMemcpyDeviceToHost));
    if (index != 0) {
        // break condition
    }

}

__global__ void map(sint *src, size_t pitchSrc, sint *dst, size_t pitchDst, unsigned long *index) {
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int z = threadIdx.z + blockIdx.z * blockDim.z;
    sint *dst_offset = calcOffset(dst, x, y, z, 0, 0, 0, pitchDst);

    sint curr;

    if (!inBounds(x, y, z, 0, 0, 0))
        return;

    curr = *calcOffset(src, x, y, z, 0, 0, 0, pitchSrc);
    if (z % 2 == 0 && curr == 0 && hasClearance(src, x, y, z, pitchSrc)) {
        // Processing
    }
    else
        *dst_offset = 1;

    return;
}

/** Finds linear offset for a given pixel and offset. */
__device__ sint* calcOffset(sint *ptr, sint x, sint y, sint z, sint xoff, sint yoff, sint zoff, size_t pitch) {
    unsigned char *my_ptr = reinterpret_cast<unsigned char *>(ptr);
    return (x + xoff) + reinterpret_cast<sint *>(my_ptr + (((y + yoff) * pitch) + ((z + zoff) * pitch * (HEIGHT / MAPLAYERS))));
}


/** Checks if position is valid on the map. */
__device__ bool inBounds(sint x, sint y, sint z, sint xoff, sint yoff, sint zoff) {
    if (0 > (x + xoff) || (x + xoff) >= DIM || 0 > (y + yoff) || (y + yoff) >= DIM || 0 > (z + zoff) || (z + zoff) >= MAPLAYERS)
        return false;
    return true;
}


/** Returns true if a block has clearnace */
__device__ bool hasClearance(sint* src, sint x, sint y, sint z, size_t pitch) {
    for (int c = -CLEARANCE; c <= CLEARANCE; c++) {
        for (int r = -CLEARANCE; r <= CLEARANCE; r++){
            if (inBounds(x, y, z, r, c, 0)){
                if ((*calcOffset(src, x, y, z, r, c, 0, pitch) == 2) || (*calcOffset(src, x, y, z, r, c, 0, pitch)) == 1)
                    return false;
            }
            else {
                return false;
            }
        }
    }
    return true;
}

In the future, you might want to get your code working using non-pitched allocations. Once you have things working, you can see if adding pitched allocations give you any performance advantage.

It also occurs to me that even this won't work if (x+xoff) is negative (or if (x+xoff) causes indexing into the next row). You can't index backward from one row to the previous row (or to the next row) in a pitched allocation this way. It would be necessary to first resolve (x+xoff) into the actual row referenced, and then develop an index into that row, and then do your pitched calculations targetting that row.

CUDA illegal memory access

1 Answers