I'm using OpenCL 2.0 dynamic parallelism feature and have each workitem enqueue another kernel with single workitem. When work completion time of child kernel is high, parent kernel completes before children and memory consistency is not preserved and corrupt data (randomly updated data items) is returned.
Since clFinish() and clEnqueueMarkerWithWaitList() is for host-only queues, I can't use them for this default-on-device-out-of-order-queue.
How can I make child kernels finish before some synchronization point or at least before a buffer-read command so that memory consistency is achieved?
Here is the code:
__kernel void test( __global float *xyz,__global float *xyzn,__global float *xyzo,__global float * arguments)
{
int threadId=get_global_id(0);
float dx=xyz[threadId*3]-arguments[2];float dy=xyz[threadId*3+1]-arguments[3];float t=arguments[1];
float ctr=arguments[0];float wave=0.02f*ctr*sin(40.0f*t+100.0f*sqrt(dx*dx+dy*dy));
xyzo[threadId*3]=xyz[threadId*3]+xyzn[threadId*3]*wave; // wave equation for all surface vertices
xyzo[threadId*3+1]=xyz[threadId*3+1]+xyzn[threadId*3+1]*wave; // wave equation for all surface vertices
xyzo[threadId*3+2]=xyz[threadId*3+2]+xyzn[threadId*3+2]*wave; // wave equation for all surface vertices
}
__kernel void waveEquation( __global float *xyz,__global float *xyzn,__global float *xyzo,__global float * arguments)
{
int threadId=get_global_id(0);
if(threadId<arguments[4])
{
queue_t q = get_default_queue();
ndrange_t ndrange = ndrange_1D(threadId,1,1);
void (^my_block_A)(void) = ^{test(xyz,xyzn,xyzo,arguments);};
enqueue_kernel(q, CLK_ENQUEUE_FLAGS_NO_WAIT,ndrange,my_block_A);
}
}
when parent kernel has only 1-2 workitems, it works fine but there are normally 256*224 workitems for parent kernel and child kernels cannot complete before data is accessed from host(after clFinish())
Here is construction of default queue(different than the queue for parent-kernel)
commandQueue = cl::CommandQueue(context, device,
CL_QUEUE_ON_DEVICE|
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
CL_QUEUE_ON_DEVICE_DEFAULT, &err);
edit: this way of creating the queue also does not make it synchronizable:
cl_uint qs=device.getInfo<CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE>();
cl_queue_properties qprop[] = { CL_QUEUE_SIZE, qs, CL_QUEUE_PROPERTIES,
(cl_command_queue_properties)(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
CL_QUEUE_ON_DEVICE |
CL_QUEUE_ON_DEVICE_DEFAULT |
CL_QUEUE_PROFILING_ENABLE), 0 };
device_queue = clCreateCommandQueueWithProperties(context.get(),
device.get(), qprop, &err);
device=RX550, driver=17.6.2, 64 bit build.
User Parallel Highway's solution also didn't work:
if(threadId<arguments[4])
{
clk_event_t markerEvent;
clk_event_t events[1];
queue_t q = get_default_queue();
ndrange_t ndrange = ndrange_1D(threadId,1,1);
void (^my_block_A)(void) = ^{test(xyz,xyzn,xyzo,arguments);};
enqueue_kernel(q, CLK_ENQUEUE_FLAGS_NO_WAIT,ndrange,0,NULL,&events[0],my_block_A);
enqueue_marker(q, 1, events, &markerEvent);
release_event(events[0]);
release_event(markerEvent);
}
This didn't work:
queue_t q = get_default_queue();
ndrange_t ndrange = ndrange_1D(threadId,1,1);
void (^my_block_A)(void) = ^{test(xyz,xyzn,xyzo,arguments);};
int ctr=0;
while((enqueue_kernel(q, CLK_ENQUEUE_FLAGS_NO_WAIT,ndrange,my_block_A)&
( CLK_DEVICE_QUEUE_FULL|
CLK_EVENT_ALLOCATION_FAILURE|
CLK_OUT_OF_RESOURCES |
CLK_INVALID_NDRANGE |
CLK_INVALID_QUEUE |
CLK_INVALID_EVENT_WAIT_LIST |
CLK_INVALID_ARG_SIZE
))>0 )
{
}
this doesn't work but completes so there is no infinite loop.