I am trying to implement a basic device array type on CUDA, as an exercise. It should mimic the std::array interface, as a design goal. While implementing operator+
, I am getting illegal memory access error and I can't decipher why.
Here is the code.
#include <iostream>
#include <array>
enum class memcpy_t {
host_to_host,
host_to_device,
device_to_host,
device_to_device
};
bool check_cuda_err() {
cudaError_t err = cudaGetLastError();
if(err == cudaSuccess) {
return true;
}
else {
std::cerr << "Cuda Error: " << cudaGetErrorString(err) << "\n" << std::flush;
return false;
}
}
template <typename T, std::size_t N>
struct cuda_allocator {
using pointer = T*;
static void allocate(T *&dev_mem) {
cudaMalloc(&dev_mem, N * sizeof(T));
}
static void deallocate(T *dev_mem) {
cudaFree(dev_mem);
}
template <memcpy_t ct>
static void copy (T *dst, T *src) {
switch(ct) {
case memcpy_t::host_to_host:
cudaMemcpy(dst, src, N * sizeof(T), cudaMemcpyHostToHost);
break;
case memcpy_t::host_to_device:
cudaMemcpy(dst, src, N * sizeof(T), cudaMemcpyHostToDevice);
break;
case memcpy_t::device_to_host:
cudaMemcpy(dst, src, N * sizeof(T), cudaMemcpyDeviceToHost);
break;
case memcpy_t::device_to_device:
cudaMemcpy(dst, src, N * sizeof(T), cudaMemcpyDeviceToDevice);
break;
default:
break;
}
}
};
template <typename T, std::size_t N>
struct gpu_array {
using allocator = cuda_allocator<T, N>;
using pointer = typename allocator::pointer;
using value_type = T;
using iterator = T*;
using const_iterator = T const*;
gpu_array() {
allocator::allocate(data);
}
gpu_array(std::array<T, N> host_arr) {
allocator::allocate(data);
allocator::template copy<memcpy_t::host_to_device>(data, host_arr.begin());
}
gpu_array& operator=(gpu_array const& o) {
//allocator::allocate(data);
allocator::template copy<memcpy_t::device_to_device>(data, o.begin());
}
operator std::array<T, N>() {
std::array<T, N> res;
allocator::template copy<memcpy_t::device_to_host>(res.begin(), data);
return res;
}
~gpu_array() {
allocator::deallocate(data);
}
__device__ iterator begin() { return data; }
__device__ iterator end() { return data + N; }
__device__ const_iterator begin() const { return data; }
__device__ const_iterator end() const { return data + N; }
private:
T* data;
};
template <typename T, std::size_t N>
__global__ void add_kernel(gpu_array<T,N> **r,
gpu_array<T,N> const* a1,
gpu_array<T,N> const* a2) {
int i = blockIdx.x*blockDim.x + threadIdx.x;
printf("Index: %d\n", i);
(*r)->begin()[i] = a1->begin()[i] + a2->begin()[i];
}
template <typename T, std::size_t N>
gpu_array<T, N> operator+(gpu_array<T,N> const&a1,
gpu_array<T,N> const&a2)
{
gpu_array<T, N> *res = new gpu_array<T, N>;
add_kernel<<<(N+3)/4, 4>>>(&res, &a1, &a2);
cudaDeviceSynchronize();
check_cuda_err();
// ignore memory leak for now
return *res;
}
const int N = 1<<3;
int main() {
std::array<float, N> x,y;
for (int i = 0; i < N; i++) {
x[i] = 1.0f;
y[i] = 2.0f;
}
gpu_array<float, N> dx{x};
gpu_array<float, N> dy{y};
check_cuda_err(); // shows no error for memcpy
std::array<float, N> res = dx + dy;
for(const auto& elem : res) {
std::cout << elem << ", ";
}
}
I am creating a size 8 array, to test things. As you can see, cuda_check_err()
shows no error after gpu_array
initialization from host arrays. I am guessing copying data works correctly. But in the kernel, when I index the device arrays, I am getting illegal memory access
error. Here is the output:
Index: 0
Index: 1
Index: 2
Index: 3
Index: 4
Index: 5
Index: 6
Index: 7
Cuda Error: an illegal memory access was encountered
9.45143e-39, 0, 6.39436e-39, 0, 0, 0, 0, 0,
As you can see, I've printed computed index for each thread and nothing seems to be out of bounds. So, what might cause this illegal memory access error? By the way, cuda-memcheck
says:
Invalid global read of size 8
and later
Address 0x7fff9f4c6ec0 is out of bounds
but I've printed the indices, don't know why it is out of bounds.
gpu_array<T, N> *res = new gpu_array<T, N>
. Can't pass that to a kernel, it is a host pointer, Same problem as using references, you wind up with a host address in device code which breaks. And if you pass by value you will get scope problems. Like I said, I don't think this is going to work – talonmies