RPI GPU horrible performance with OpenCL

Question

im trying to process a camerafeed of 640x480@25fps using OpenCL to let the GPU do the image processing and OpenCV to capture the images, the problem is the horrible performance of the Kernel

Kernel:

 __kernel void brightness(__global uchar *A, uchar B, __global uchar *C) {
int i = get_global_id(0);

C[i] = (A[i]+B) >= 255 ? 255 : (A[i]+B);
}

setupGPU:

    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;
    cl_uint     ret_num_devices;
    cl_uint     ret_num_platforms;
    cl_int      ret = 0;

    //load the OpenCL code
    FILE *fp;
    char *source_str;
    size_t source_size;

    fp = fopen("./OpenCLFiles/brightness.cl", "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    source_str = (char*)malloc(MAX_SOURCE_SIZE);
    source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose( fp );

    //get platfor and device information
    ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_ALL, 1, &device_id, &ret_num_devices);

    //create context
    context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
    if (ret != 0)
        std::cerr << getErrorString(ret) << std::endl;

    //create command queue
    command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret);
    if (ret != 0)
        std::cerr << getErrorString(ret) << std::endl;

    //create a program
    program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
    if (ret != 0)
        std::cerr << getErrorString(ret) << std::endl;

    //build the program        
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
    if (ret != 0)
        std::cerr << getErrorString(ret) << std::endl;

    //create the kernels
    brightnessKernel = clCreateKernel(program, "brightness", &ret);
    if (ret != 0)
        std::cerr << getErrorString(ret) << std::endl;

    // Create memory buffers on the device for each vector 
    cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY, listSize * sizeof(uchar), NULL, &ret);
    if (ret != 0)
        std::cerr << getErrorString(ret) << std::endl;

    cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, listSize * sizeof(uchar), NULL, &ret);
    if (ret != 0)
        std::cerr << getErrorString(ret) << std::endl;
            //convert captured image to gray
            Mat greyImage;
            cvtColor(image, greyImage, COLOR_BGR2GRAY);          

            //"convert" Mat image to input array
            uchar* input = greyImage.isContinuous()? greyImage.data: greyImage.clone().data;
            //allocate memory for output array
            uchar* output = (uchar*)malloc(sizeof(uchar)*listSize);

            //write input array into GPU memory buffer
            ret = clEnqueueWriteBuffer(command_queue, inputBuffer, CL_TRUE, 0, listSize * sizeof(uchar), input, 0, NULL, &eventWrite);
            if (ret != 0)
                std::cerr << getErrorString(ret) << std::endl;

            // Set the arguments of the kernel
            ret = clSetKernelArg(brightnessKernel, 0, sizeof(cl_mem), (void *) &inputBuffer);
            if (ret != 0)
                std::cerr << getErrorString(ret) << std::endl;
            ret = clSetKernelArg(brightnessKernel, 1, sizeof(brightnessValue), (void *) &brightnessValue);
            if (ret != 0)
                std::cerr << getErrorString(ret) << std::endl;
            ret = clSetKernelArg(brightnessKernel, 2, sizeof(cl_mem), (void *) &outputBuffer);
            if (ret != 0)
                std::cerr << getErrorString(ret) << std::endl;

            // Execute the OpenCL kernel
            size_t global_item_size = listSize; // Process the entire lists
            size_t local_item_size = 12;
            ret = clEnqueueNDRangeKernel(command_queue, brightnessKernel, 1, NULL, 
                    &global_item_size, &local_item_size, 0, NULL, &eventKernel);
            if (ret != 0)
                std::cerr << getErrorString(ret) << std::endl;

            // Read the memory buffer on the device to the local variable C
            ret = clEnqueueReadBuffer(command_queue, outputBuffer, CL_TRUE, 0, 
                    listSize * sizeof(uchar), output, 0, NULL, &eventRead);
            if (ret != 0)
                printf("error writing to output buffer: %d\n\n\n", ret);

            // Display the result to the screen
            Mat inputImage(image.rows, image.cols, CV_8UC1, input);
            Mat test(image.rows, image.cols, CV_8UC1, output);

            imshow("Input", inputImage); 
            imshow("Convertedx2", test);

using OpenCL's profiling events these are the results:

OpenCL clEnqueueWriteBuffer: 1.792 ms;
OpenCL Kernel execution time: 85.851 ms;
OpenCL clEnqueueReadBuffer: 1.581 ms;

if i change the kernel line

C[i] = (A[i]+B) >= 255 ? 255 : (A[i]+B);

to

C[i] = A[i];

it gets even worse:
OpenCL clEnqueueWriteBuffer: 1.266 ms;
OpenCL Kernel execution time: 177.103 ms;
OpenCL clEnqueueReadBuffer: 1.656 ms;

With the GPU's theoretical performance of 24 GFLOPS I expected WAY better results, somewhere around less than 1 ms.

Try decorating inputs with const and inputs-outputs with restrict keywords. — huseyin tugrul buyukisik

mogu mogu · Accepted Answer · 2019-09-13T10:51:24

With the GPU's theoretical performance of 24 GFLOPS I expected WAY better results, somewhere around less than 1 ms.

Your expectations are unrealistic (sorry for bad news). GPU can do 24 GFLOPS if it's not limited by memory, IOW when everything is in registers. If your kernel is so simple as the one you posted, you're limited by memory performance, which on RPi 3 is about 2GB/s.

So your limits depends on which RPi you have, and what pixel format you use; e.g. 8bit RGBA it's 4bytes/pixel, 640x480 is about 1.2MB; at RPi3's 2GB/s that's 0.6 ms just to read all pixels from the image. Another 0.6 ms to write the calculated pixels back. Do not expect less than 1ms for that image format.

As to why your real numbers are so much worse - no idea. But if all your kernels are so simple, it might not be worth dealing with GPU at all; just do it on the CPU. GPU only makes sense for computationally heavy kernels.

(BTW, OpenCL has an "add_sat" builtin - addition with saturation)

RPI GPU horrible performance with OpenCL

1 Answers