an illegal memory access was encountered

Question

I am a beginner at CUDA programming, writing a program composed of a single file main.cu which is shown below.

#include <iostream>
#include <opencv2/opencv.hpp>

#define DEBUG(str) std::cerr << "\033[1;37m" << __FILE__ << ":" << __LINE__ << ": \033[1;31merror:\033[0m " << str << std::endl;

#define CUDADEBUG(cudaError)      \
    if (cudaError != cudaSuccess) \
        DEBUG(cudaGetErrorString(cudaError));

#define ERROR(str)  \
    {               \
        DEBUG(str); \
        exit(1);    \
    }

__global__ void makeGrey(
    unsigned char *&pimage,
    const int &cn,
    const size_t &total)
{
    unsigned i = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned icn = i * cn;

    printf("%u\n", i);

    if (i < total)
    {
        float result = pimage[icn + 0] * .114 +
                       pimage[icn + 1] * .587 +
                       pimage[icn + 2] * .299;
        pimage[icn + 0] = result; //B
        pimage[icn + 1] = result; //G
        pimage[icn + 2] = result; //R
        // pimage[icn + 3] *= result; //A
    }
}

int main(int argc, char **argv)
{
    if (argc != 3)
        ERROR("usage: executable in out");

    cv::Mat image;
    unsigned char *dimage;

    image = cv::imread(argv[1], cv::IMREAD_UNCHANGED);
    if (!image.data)
        ERROR("Image null");

    if (image.empty())
        ERROR("Image empty");

    if (!image.isContinuous())
        ERROR("image is not continuous");

    const size_t N = image.total();
    const int cn = image.channels();
    const size_t numOfElems = cn * N;
    const int blockSize = 512;
    const int gridSize = (N - 1) / blockSize + 1;

    CUDADEBUG(cudaMalloc(&dimage, numOfElems * sizeof(unsigned char)));
    CUDADEBUG(cudaMemcpy(dimage, image.data, numOfElems * sizeof(unsigned char), cudaMemcpyHostToDevice));

    makeGrey<<<gridSize, blockSize>>>(dimage, cn, N);
    cudaError_t errSync = cudaGetLastError();
    cudaError_t errAsync = cudaDeviceSynchronize();
    if (errSync != cudaSuccess)
        std::cerr << "Sync kernel error: " << cudaGetErrorString(errSync) << std::endl;
    if (errAsync != cudaSuccess)
        std::cerr << "Async kernel error: " << cudaGetErrorString(errAsync) << std::endl;

    CUDADEBUG(cudaMemcpy(image.data, dimage, numOfElems * sizeof(unsigned char), cudaMemcpyDeviceToHost)); //line 73
    CUDADEBUG(cudaFree(dimage));                                                                           //line 74

    cv::imwrite(argv[2], image);
    return 0;
}

When I execute the program, I get

Async kernel error: an illegal memory access was encountered
/path-to-main.cu:73: error: an illegal memory access was encountered
/path-to-main.cu:74: error: an illegal memory access was encountered

I checked CV_VERSION macro which is 4.5.3-dev, and Cuda Toolkit 11.4 is installed (nvcc version 11.4). Also afaik, the kernel does not execute at all (I used Nsight gdb debugger and printf). I could not understand why I am accessing an illegal memory area. I appreciate any help. Thank you in advance.

Your GPU function makeGrey takes it's arguments by reference, those values live on the stack, not in GPU-memory, take them by value instead. — Kaldrr