0
votes

I'm having a problem with some openCL code I'm writing.

I've written a collection of utility functions to remove some boilerplate code from where I'm using it. The test method runs at the beginning and works absolutely fine, the code being below:

void openCLtest(char *arg_program, char *arg_device)
{
    cl_int ret;

    cl_device_id device_id = getDeviceId(atoi(arg_program), atoi(arg_device));
    cl_context context = get_cl_context(&device_id);
    cl_command_queue queue = get_cl_command_queue(&context, &device_id);
    cl_kernel kernel = compileCLkernel(&context, &device_id, "src/hello.cl", "hello");
    cl_mem memobj = clCreateBuffer(context, CL_MEM_READ_WRITE, MEM_SIZE * sizeof(char), NULL, &ret);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to Allocate Buffer\n");
        exit(1);
    }
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobj);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to set kernel Arg\n");
        exit(1);
    }
    ret = clEnqueueTask(queue, kernel, 0, NULL, NULL);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to Enqueue Task\n");
        exit(1);
    }

    ret = clFinish(queue);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to wait for finish\n");
        exit(1);
    }

    char string[MEM_SIZE];
    ret = clEnqueueReadBuffer(queue, memobj, CL_TRUE, 0, MEM_SIZE * sizeof(char), string, 0, NULL, NULL);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to read buffer\n");
        exit(1);
    }

    printf("CL Produced: %s\n", string);

    ret = clFlush(queue);
    ret = clFinish(queue);
     if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to Wait for test queue to finish\n");
        exit(1);
    }
    ret = clReleaseKernel(kernel);
    ret = clReleaseMemObject(memobj);
    ret = clReleaseCommandQueue(queue);
    ret = clReleaseContext(context);
}

This code works fine, and I then extracted the code into more functions which can be used for the real openCL I'm writing.

The same principle has been applied in the rest of the code, but this time, it doesn't work.

main:

openCLtest(argv[2], argv[3]); //This is the code above and works great

cl_device_id device_id = getDeviceId(atoi(argv[2]), atoi(argv[3]));
cl_context context = get_cl_context(&device_id);
cl_command_queue queue = get_cl_command_queue(&context, &device_id);

....

double *coords_3D = cl_extrude_coords(&device_id, &context, &queue, coords_2D, nodes, LAYERS, LAYER_HEIGHT);

cl_extrude_coords:

double *cl_extrude_coords(cl_device_id* device_id, cl_context* context, cl_command_queue* queue, double *coords, int nodes, int layers, double layer_height)
{

    cl_int ret;

    cl_kernel extrude_coords = compileCLkernel(context, device_id, "src/OpenCL_Kernels/extrude_coords.cl", "extrude_coords");

    cl_mem coords_2d = clCreateBuffer(*context, CL_MEM_READ_ONLY, sizeof(coords) / sizeof(coords[0]), NULL, &ret);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to Create coords_2d CL Buffer %d\n", ret);
        exit(1);
    }
    cl_mem result = clCreateBuffer(*context, CL_MEM_WRITE_ONLY, sizeof(double) * nodes * 3 * layers, NULL, &ret);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to Create result CL Buffer %d\n", ret);
        exit(1);
    }

    ret = clEnqueueWriteBuffer(*queue, coords_2d, CL_TRUE, 0, sizeof(coords) / sizeof(coords[0]), (const void *)&coords, 0, NULL, NULL);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed enqueue coords_2d write to buffer %d\n", ret);
        exit(1);
    }

    ret = clSetKernelArg(extrude_coords, 0, sizeof(cl_mem), (void *)&coords_2d);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to Set kernel argument coords_2d %d\n", ret);
        exit(1);
    }
    ret = clSetKernelArg(extrude_coords, 1, sizeof(cl_mem), (void *)&result);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to Set kernel argument result CL Buffer %d\n", ret);
        exit(1);
    }
    ret = clSetKernelArg(extrude_coords, 2, sizeof(double), (void *)&layer_height);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to Set kernel argument layers %d\n", ret);
        exit(1);
    }

    size_t gWorkSize[]  = {nodes, layers};

    cl_event clEvent;
    ret = clEnqueueNDRangeKernel(*queue, extrude_coords, 2, NULL, (const size_t *)&gWorkSize, NULL, 0, NULL, &clEvent);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Enqueue Extrude Coordinates Kernel\n");
        exit(1);
    }

    double *res = (double *)malloc(sizeof(double) * nodes * 3 * layers);

    ret = clFinish(*queue);
        if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to wait for queue to finish in extrude_coords %d\n", ret);
        exit(1);
    }

    ret = clEnqueueReadBuffer(*queue, result, CL_TRUE, 0, sizeof(double) * nodes * 3 * layers, (void *)res, 1, &clEvent, NULL);
        if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to Enqueue the extrude_coords result buffer read %d\n", ret);
        exit(1);
    }

    ret = clReleaseKernel(extrude_coords);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to release kernel\n");
        exit(1);
    }
    ret = clReleaseMemObject(coords_2d);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to release result memory object\n");
        exit(1);
    }
    ret = clReleaseMemObject(result);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to release result memory object\n");
        exit(1);
    }

    return res;

}

cl kernel:

#pragma OPENCL EXTENSION cl_khr_fp64: enable

__kernel void extrude_coords(__global const double * coords, __global double * res, const double layer_height){

    uint i=get_global_id(0);
    uint j=get_global_id(1);
    uint layers=get_global_size(0);

    res[3*(i*layers + j)] = coords[2*i];
    res[3*(i*layers + j) + 1] = coords[2*i + 1];
    res[3*(i*layers + j) + 2] = layer_height * j;

}

This function however, does not work, throwing the error below when clFinish(queue) is called.

Failed to wait for queue to finish in extrude_coords -36

Looking this up, I can see -36 is CL_INVALID_COMMAND_QUEUE. If I don't exit here, I then get an error thrown at the buffer read, error code -5, CL_OUT_OF_RESOURCES.

I'm not sure what is going wrong. The values of nodes and layers when this code is being tested are 151731 and 101 respectively. I'm not sure if that has something to do with it.

Does anyone have any ideas on what could be the issue and how to fix it, or even any suggestions on whether this structure for the code is a good idea. The plan was by passing the queue, context and device ID, each function can produce and execute its own kernel(s) to do something with the queue etc being released at the end of the program when they're no longer needed.

Any help would be appreciated, I've been stumped on this for several hours now.

EDIT:

I have since tried changinging the calling convention of clEnqueueNDRange in extrude_coords to

ret = clEnqueueNDRangeKernel(*queue, extrude_coords, 2, NULL, (const size_t *)&gWorkSize[0], NULL, 0, NULL, &clEvent);

as suggested in an answer but this does not work. Testing with printf("%d\n", &gWorkSize == &gWorkSize[0]); shows that the two pointers are functionally the same, so this is not the issue.

I then went on to modify the test openCL code to use clEnqueueNDRange instead of clEnqueueTask as follows:

size_t gWorkSize[]  = {1, 1};
// ret = clEnqueueTask(queue, kernel, 0, NULL, NULL);
ret = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, (const size_t *)&gWorkSize, NULL, 0, NULL, NULL);

This still all works correctly, so something else is clearly wrong... I'm still not sure what...

2
Can you verify that sizeof(coords) / sizeof(coords[0]) is computing the correct value?Austin
That's the problem! It was returning one as both sizeof(coords) and sizeof(coords[0]) were returning 4, the size of a double, giving one. The CL Kernel was therefore seg faulting. I just hadn't realised that due to the asynchronous enqueue of the kernel, the clEnqueueNDRangeKernel() was returning CL_SUCCESS with the error being shown by the GPU invalidating the queue and context! If you write an actual answer saying as much, I shall mark you down as correct!rcbevans
Sure, answer is up. Glad the issue is resolved!Austin

2 Answers

1
votes

The sizeof(coords) / sizeof(coords[0]) will not give the array size in C/C++. Best to use sizeof(coords)*elementsInCoords and pass in elementsInCoords. Alternatively, setup coords to be a std::vector<> and pass that around since you can get a data pointer out of it and the size as well.

0
votes

Look at code:

size_t gWorkSize[]  = {nodes, layers};

cl_event clEvent;
ret = clEnqueueNDRangeKernel(*queue, extrude_coords, 2, NULL, (const size_t *)&gWorkSize, NULL, 0, NULL, &clEvent);

&gWorkSize is of type size_t (*)[2], while argument must be of type const size_t*

Try this:

ret = clEnqueueNDRangeKernel(*queue, extrude_coords, 2, NULL, &gWorkSize[0], NULL, 0, NULL, &clEvent);