I am a beginner at CUDA programming, writing a program composed of a single file main.cu which is shown below.
#include <iostream>
#include <opencv2/opencv.hpp>
#define DEBUG(str) std::cerr << "\033[1;37m" << __FILE__ << ":" << __LINE__ << ": \033[1;31merror:\033[0m " << str << std::endl;
#define CUDADEBUG(cudaError) \
if (cudaError != cudaSuccess) \
DEBUG(cudaGetErrorString(cudaError));
#define ERROR(str) \
{ \
DEBUG(str); \
exit(1); \
}
__global__ void makeGrey(
unsigned char *&pimage,
const int &cn,
const size_t &total)
{
unsigned i = blockDim.x * blockIdx.x + threadIdx.x;
unsigned icn = i * cn;
printf("%u\n", i);
if (i < total)
{
float result = pimage[icn + 0] * .114 +
pimage[icn + 1] * .587 +
pimage[icn + 2] * .299;
pimage[icn + 0] = result; //B
pimage[icn + 1] = result; //G
pimage[icn + 2] = result; //R
// pimage[icn + 3] *= result; //A
}
}
int main(int argc, char **argv)
{
if (argc != 3)
ERROR("usage: executable in out");
cv::Mat image;
unsigned char *dimage;
image = cv::imread(argv[1], cv::IMREAD_UNCHANGED);
if (!image.data)
ERROR("Image null");
if (image.empty())
ERROR("Image empty");
if (!image.isContinuous())
ERROR("image is not continuous");
const size_t N = image.total();
const int cn = image.channels();
const size_t numOfElems = cn * N;
const int blockSize = 512;
const int gridSize = (N - 1) / blockSize + 1;
CUDADEBUG(cudaMalloc(&dimage, numOfElems * sizeof(unsigned char)));
CUDADEBUG(cudaMemcpy(dimage, image.data, numOfElems * sizeof(unsigned char), cudaMemcpyHostToDevice));
makeGrey<<<gridSize, blockSize>>>(dimage, cn, N);
cudaError_t errSync = cudaGetLastError();
cudaError_t errAsync = cudaDeviceSynchronize();
if (errSync != cudaSuccess)
std::cerr << "Sync kernel error: " << cudaGetErrorString(errSync) << std::endl;
if (errAsync != cudaSuccess)
std::cerr << "Async kernel error: " << cudaGetErrorString(errAsync) << std::endl;
CUDADEBUG(cudaMemcpy(image.data, dimage, numOfElems * sizeof(unsigned char), cudaMemcpyDeviceToHost)); //line 73
CUDADEBUG(cudaFree(dimage)); //line 74
cv::imwrite(argv[2], image);
return 0;
}
When I execute the program, I get
Async kernel error: an illegal memory access was encountered
/path-to-main.cu:73: error: an illegal memory access was encountered
/path-to-main.cu:74: error: an illegal memory access was encountered
I checked CV_VERSION macro which is 4.5.3-dev, and Cuda Toolkit 11.4 is installed (nvcc version 11.4). Also afaik, the kernel does not execute at all (I used Nsight gdb debugger and printf). I could not understand why I am accessing an illegal memory area. I appreciate any help. Thank you in advance.
makeGreytakes it's arguments by reference, those values live on the stack, not in GPU-memory, take them by value instead. - Kaldrr