
I am trying to implement a basic device array type in CUDA, as an exercise. As a design goal, it should mimic the std::array interface. While implementing operator+, I am getting an illegal memory access error and I can't decipher why. Here is the code.

#include <iostream>
#include <array>

enum class memcpy_t {
    host_to_host,
    host_to_device,
    device_to_host,
    device_to_device
};

bool check_cuda_err() {
    cudaError_t err = cudaGetLastError();
    if(err == cudaSuccess) {
        return true;
    }
    else {
        std::cerr << "Cuda Error: " << cudaGetErrorString(err) << "\n" << std::flush;
        return false;
    }
}

template <typename T, std::size_t N>
struct cuda_allocator {
    using pointer = T*;

    static void allocate(T *&dev_mem) {
        cudaMalloc(&dev_mem, N * sizeof(T));
    }

    static void deallocate(T *dev_mem) {
        cudaFree(dev_mem);
    }

    template <memcpy_t ct>
    static void copy (T *dst, T *src) {
        switch(ct) {
        case memcpy_t::host_to_host:
            cudaMemcpy(dst, src, N * sizeof(T), cudaMemcpyHostToHost);
            break;
        case memcpy_t::host_to_device:
            cudaMemcpy(dst, src, N * sizeof(T), cudaMemcpyHostToDevice);
            break;
        case memcpy_t::device_to_host:
            cudaMemcpy(dst, src, N * sizeof(T), cudaMemcpyDeviceToHost);
            break;
        case memcpy_t::device_to_device:
            cudaMemcpy(dst, src, N * sizeof(T), cudaMemcpyDeviceToDevice);
            break;
        default:
            break;
        }
    }
};

template <typename T, std::size_t N>
struct gpu_array {
    using allocator = cuda_allocator<T, N>;
    using pointer = typename allocator::pointer;
    using value_type = T;
    using iterator = T*;
    using const_iterator = T const*;

    gpu_array() {
       allocator::allocate(data);
    }

    gpu_array(std::array<T, N> host_arr) {
        allocator::allocate(data);
        allocator::template copy<memcpy_t::host_to_device>(data, host_arr.begin());
    }

    gpu_array& operator=(gpu_array const& o) {
        //allocator::allocate(data);
        allocator::template copy<memcpy_t::device_to_device>(data, o.begin());
    }

    operator std::array<T, N>() {
        std::array<T, N> res;
        allocator::template copy<memcpy_t::device_to_host>(res.begin(), data);
        return res;
    }

    ~gpu_array() {
        allocator::deallocate(data);
    }

    __device__ iterator begin() { return data; }
    __device__ iterator end() { return data + N; }
    __device__ const_iterator begin() const { return data; }
    __device__ const_iterator end() const { return data + N; }

private:
    T* data;
};

template <typename T, std::size_t N>
__global__ void add_kernel(gpu_array<T,N> **r,
                           gpu_array<T,N> const* a1,
                           gpu_array<T,N> const* a2) {
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    printf("Index: %d\n", i);
    (*r)->begin()[i] = a1->begin()[i] + a2->begin()[i];
}

template <typename T, std::size_t N>
gpu_array<T, N> operator+(gpu_array<T,N> const&a1,
                          gpu_array<T,N> const&a2)
{
    gpu_array<T, N> *res = new gpu_array<T, N>;
    add_kernel<<<(N+3)/4, 4>>>(&res, &a1, &a2);
    cudaDeviceSynchronize();
    check_cuda_err();
    // ignore memory leak for now
    return *res;
}
const int N = 1<<3;

int main() {
    std::array<float, N> x,y;

    for (int i = 0; i < N; i++) {
        x[i] = 1.0f;
        y[i] = 2.0f;
    } 

    gpu_array<float, N> dx{x};
    gpu_array<float, N> dy{y};
    check_cuda_err(); // shows no error for memcpy
    std::array<float, N> res = dx + dy;

    for(const auto& elem : res) {
        std::cout << elem << ", ";
    }
}

I am creating an array of size 8 to test things. As you can see, check_cuda_err() shows no error after the gpu_array initialization from the host arrays, so I am guessing the data copies work correctly. But in the kernel, when I index the device arrays, I get the illegal memory access error. Here is the output:

Index: 0
Index: 1
Index: 2
Index: 3
Index: 4
Index: 5
Index: 6
Index: 7
Cuda Error: an illegal memory access was encountered
9.45143e-39, 0, 6.39436e-39, 0, 0, 0, 0, 0,

As you can see, I've printed the computed index for each thread and nothing seems to be out of bounds. So, what might cause this illegal memory access error? By the way, cuda-memcheck says:

Invalid global read of size 8

and later

Address 0x7fff9f4c6ec0 is out of bounds

but I've printed the indices, and I don't know why the access is out of bounds.

kernels don't support pass by reference for arguments, even if the compiler tries to build code which does – talonmies
And because of that, I think your basic design is not going to work – talonmies
I will look into adding a layer of indirection then. – meguli
@talonmies I changed the kernel as shown in the edit above. Do you see anything fishy about the array access? – meguli
They were, but the source of the error was right there in them -- gpu_array<T, N> *res = new gpu_array<T, N>. You can't pass that to a kernel: it is a host pointer. Same problem as using references; you wind up with a host address in device code, which breaks. And if you pass by value you will get scope problems. Like I said, I don't think this is going to work – talonmies

1 Answer


We have seen two versions of the code in this question, and unfortunately both exhibit variations of the same problem.

The first used references as arguments to the kernel:

template <typename T, std::size_t N>
__global__ void add_kernel(gpu_array<T,N> &r,
                           gpu_array<T,N> const& a1,
                           gpu_array<T,N> const& a2) {
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    printf("Index: %d\n", i);
    r.begin()[i] = a1.begin()[i] + a2.begin()[i];
}

template <typename T, std::size_t N>
gpu_array<T, N> operator+(gpu_array<T,N> const& a1,
                          gpu_array<T,N> const& a2)
{
    gpu_array<T, N> res;
    add_kernel<<<(N+3)/4, 4>>>(res, a1, a2);
    cudaDeviceSynchronize();
    check_cuda_err();
    return res;
}

While this is clean and elegant, and references are fully supported in CUDA kernel code, passing kernel arguments by reference from the host leaves host addresses as the arguments the device sees, because the CUDA toolchain, like every other C++ compiler I am aware of, implements references using pointers. The result is a kernel runtime error for illegal addresses.
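To make the mechanism concrete, here is a minimal sketch (the names dev_view and fill are illustrative, not from the question). Kernel parameters are copied by value into device-visible argument space, so a trivially-copyable struct carrying a device pointer crosses the boundary safely, whereas a reference parameter would smuggle a host stack address into device code:

struct dev_view {
    float *ptr;   // device pointer obtained from cudaMalloc
    int    n;     // element count
};

__global__ void fill(dev_view v, float value) {   // v arrives by value
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < v.n)
        v.ptr[i] = value;   // valid: ptr is a device address
}

// Usage sketch: dev_view v{dev_ptr, 8}; fill<<<1, 8>>>(v, 1.0f);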

The second uses pointer indirection instead of references and winds up passing host pointers to the GPU, which fails in essentially the same way as the first version:

template <typename T, std::size_t N>
__global__ void add_kernel(gpu_array<T,N> **r,
                           gpu_array<T,N> const* a1,
                           gpu_array<T,N> const* a2) {
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    printf("Index: %d\n", i);
    (*r)->begin()[i] = a1->begin()[i] + a2->begin()[i];
}

template <typename T, std::size_t N>
gpu_array<T, N> operator+(gpu_array<T,N> const&a1,
                          gpu_array<T,N> const&a2)
{
    gpu_array<T, N> *res = new gpu_array<T, N>;
    add_kernel<<<(N+3)/4, 4>>>(&res, &a1, &a2); 
    cudaDeviceSynchronize();
    check_cuda_err();
    // ignore memory leak for now
    return *res;
}

The only safe way to pass this structure directly to a device kernel is by value. However, that means the by-value copies will go out of scope and trigger destruction, which deallocates the memory backing the arrays and results in unexpected errors of a different kind.
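One way to salvage the design (a sketch under my own assumptions; the gpu_span type and the view() accessor are additions, not code from the question) is to keep gpu_array as the owning RAII wrapper on the host and hand kernels a non-owning, trivially-copyable view by value. Because the view owns nothing and has a trivial destructor, copies going out of scope cannot free the device allocation:

#include <cstddef>

// Non-owning view over a device allocation. Safe to pass to kernels by
// value; its implicit destructor frees nothing.
template <typename T, std::size_t N>
struct gpu_span {
    T *data;   // raw device pointer, owned by a gpu_array elsewhere

    __device__ T*       begin()       { return data; }
    __device__ T*       end()         { return data + N; }
    __device__ T const* begin() const { return data; }
    __device__ T const* end()   const { return data + N; }
};

template <typename T, std::size_t N>
__global__ void add_kernel(gpu_span<T, N> r,        // all passed by value
                           gpu_span<T, N> a1,
                           gpu_span<T, N> a2) {
    std::size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N)   // guard in case the grid overshoots N
        r.begin()[i] = a1.begin()[i] + a2.begin()[i];
}

gpu_array would then expose a hypothetical view() accessor returning gpu_span<T, N>{data}, and operator+ would launch with add_kernel<<<(N+3)/4, 4>>>(res.view(), a1.view(), a2.view()). Note that gpu_array itself still needs a correct copy or move constructor before it can be returned by value safely.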