I have the following problem, and I'm wondering which is the best approach:
// Example kernel contrasting two ways of storing a single result in output[0].
// V1 and V2 are the alternatives being compared; both appear here together.
__kernel void test(__global int* output){
// ... Code execution to define myValue.
// V1 : Only the work-item whose global id is 0 performs the store;
// every other work-item skips it.
if(get_global_id(0) == 0) output[0] = myValue;
// V2 : Every work-item performs the same store to the same global
// memory location, output[0]. (Is there any lock?)
output[0] = myValue;
}
Both work on my AMD GPU, but I don't know which one is the better approach.
EDIT:
Based on kanna's answer, I added more code to give more information (since I'm currently working on it, it is being updated as I go).
My goal is to keep track of the head/next_head from each kernel while keeping the memory block pointers consistent across work-groups.
In my first approach, I modified the head directly through the global memory pointer, which caused problems when the number of work-groups was higher: the block locations got out of sync. With the following code, everything seems to run as expected and each work-group accesses the same block pointers, even though the code afterwards indexes into those blocks using get_global_id.
So I'm looking for OpenCL good practices to improve this code and make sure I won't run into any bottlenecks in the future. Feel free to advise on the following code.
// Bump allocator over a caller-supplied heap.
//   sizePtr : number of bytes to reserve
//   heap    : base address of the byte buffer the block is taken from
//   head    : *head is the current offset into heap; it is advanced by sizePtr
// Returns the address heap + (value of *head on entry).
// NOTE(review): the offset is not rounded up to any alignment and is not
// checked against the heap's capacity; callers that convert the result to a
// wider element type presumably need to keep offsets suitably aligned - confirm.
__global void* malloc(size_t sizePtr, __global uchar* heap, ulong* head){
    // Remember where the new block starts before moving the head past it.
    const ulong offset = *head;
    *head = offset + sizePtr;
    return heap + offset;
}
// Takes two per-work-item blocks out of the shared heap and publishes the
// heap offset that follows them through next[0].
//   heap : byte buffer the blocks are taken from
//   head : head[0] is the heap offset at which this kernel starts allocating
//   next : next[0] receives the offset just past the blocks taken here
__kernel void test(__global uchar* heap,
                   __global ulong* head,
                   __global ulong* next){
    // One element per work-item in each block: with a global size of 1000
    // that is 1000 bytes of uchar plus 4000 bytes of int.
    const uint g_size = get_global_size(0);

    // Each work-item keeps its own copy of the starting offset read from
    // head[0], so every work-item in every work-group begins at the same
    // position in the heap.
    ulong local_head = *head;

    // Pointers into one huge memory block (heap), which allows fewer memory
    // transfers between kernels. Only the offsets need to be tracked
    // (work in progress).
    __global uchar* block1 = malloc(g_size * sizeof(uchar), heap, &local_head);
    __global int* block2 = malloc(g_size * sizeof(int), heap, &local_head);

    // Process the blocks in here, indexing them with get_global_id(0).

    // V1
    if(get_global_id(0) == 0){
        *next = local_head;
    }

    // V2
    *next = local_head;

    // If head[0] was 0, local_head is now 5000 in every work-item,
    // whichever work-group it belongs to.
}