All, I wrote a very simple OpenCL kernel which transforms an RGB image to gray scale using simple averaging.
Some background:
- The image is stored in mapped memory, as a 24 bit, non padded memory block
- The output array is stored in pinned memory (mapped with
clEnqueueMapBuffer) and is 8 bpp - There are two buffers allocated on the device (
clCreateBuffer), one is specifically read (which weclWriteBufferinto before the kernel starts) and the other is specifically write (which weclReadBufferafter the kernel finishes)
I am running this on a 1280x960 image. A serial version of the algorithm averages 60ms, the OpenCL kernel averages 200ms!!! I'm doing something wrong but I have no idea how to proceed, what to optimize. (Timing my reads/writes without a kernel call, the algorithm runs in 15ms)
I am attaching the kernel setup (sizes and arguments) as well as the kernel
EDIT: So I wrote an even dumber kernel, that does no global memory accesses inside it, and it was only 150ms... This is still ridiculously slow. I thought maybe I'm messing up with global memory reads, they have to be 4 byte aligned or something? Nope...
Edit 2: Removing the all parameters from my kernel gave me significant speed up... I'm confused I thought that since I'm clEnqueueWriteBuffer the kernel should be doing no memory transfer from host->device and device->host....
Edit 3: Figured it out, but I still don't understand why. If anyone could explain it I would be glad to award correct answer to them. The problem was passing the custom structs by value. It looks like I'll need to allocate a global memory location for them and pass their cl_mems
Kernel Call:
//Copy input to device
result = clEnqueueWriteBuffer(handles->queue, d_input_data, CL_TRUE, 0, h_input.widthStep*h_input.height, (void *)input->imageData, 0, 0, 0);
if(check_result(result, "opencl_rgb_to_gray", "Failed to write to input buffer on device!")) return 0;
//Set kernel arguments
result = clSetKernelArg(handles->current_kernel, 0, sizeof(OpenCLImage), (void *)&h_input);
if(check_result(result, "opencl_rgb_to_gray", "Failed to set input struct.")) return 0;
result = clSetKernelArg(handles->current_kernel, 1, sizeof(cl_mem), (void *)&d_input_data);
if(check_result(result, "opencl_rgb_to_gray", "Failed to set input data.")) return 0;
result = clSetKernelArg(handles->current_kernel, 2, sizeof(OpenCLImage), (void *)&h_output);
if(check_result(result, "opencl_rgb_to_gray", "Failed to set output struct.")) return 0;
result = clSetKernelArg(handles->current_kernel, 3, sizeof(cl_mem), (void *)&d_output_data);
if(check_result(result, "opencl_rgb_to_gray", "Failed to set output data.")) return 0;
//Determine run parameters
global_work_size[0] = input->width;//(unsigned int)((input->width / (float)local_work_size[0]) + 0.5);
global_work_size[1] = input->height;//(unsigned int)((input->height/ (float)local_work_size[1]) + 0.5);
printf("Global Work Group Size: %d %d\n", global_work_size[0], global_work_size[1]);
//Call kernel
result = clEnqueueNDRangeKernel(handles->queue, handles->current_kernel, 2, 0, global_work_size, local_work_size, 0, 0, 0);
if(check_result(result, "opencl_rgb_to_gray", "Failed to run kernel!")) return 0;
result = clFinish(handles->queue);
if(check_result(result, "opencl_rgb_to_gray", "Failed to finish!")) return 0;
//Copy output
result = clEnqueueReadBuffer(handles->queue, d_output_data, CL_TRUE, 0, h_output.widthStep*h_output.height, (void *)output->imageData, 0, 0, 0);
if(check_result(result, "opencl_rgb_to_gray", "Failed to write to output buffer on device!")) return 0;
Kernel:
typedef struct OpenCLImage_t
{
int width;
int widthStep;
int height;
int channels;
} OpenCLImage;
__kernel void opencl_rgb_kernel(OpenCLImage input, __global unsigned char* input_data, OpenCLImage output, __global unsigned char * output_data)
{
int pixel_x = get_global_id(0);
int pixel_y = get_global_id(1);
unsigned char * cur_in_pixel, *cur_out_pixel;
float avg = 0;
cur_in_pixel = (unsigned char *)(input_data + pixel_y*input.widthStep + pixel_x * input.channels);
cur_out_pixel = (unsigned char *)(output_data + pixel_y*output.widthStep + pixel_x * output.channels);
avg += cur_in_pixel[0];
avg += cur_in_pixel[1];
avg+= cur_in_pixel[2];
avg /=3.0f;
if(avg > 255.0)
avg = 255.0;
else if(avg < 0)
avg = 0;
*cur_out_pixel = avg;
}
clGetWorkGroupInfoto determine my local work group sizes. - Constantin