I have written a kernel in opencl using local memory to get the faster execution. This is the first time I am using local memory. My global_work_size = 16 and local_work_size = 8.
Opencl kernel: mapper.cl
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
typedef struct data
{
double dattr[10];
int d_id;
int bestCent;
}Data;
typedef struct cent
{
double cattr[5];
int c_id;
}Cent;
__kernel void mapper(__global int *keyMobj, __global int *valueMobj, __global Data *dataMobj, __global Cent *centMobj)
{
int bx = get_group_id(0);
int tx = get_local_id(0);
int size = get_local_size(0);
__local double localData[8][2];
__local double localCent[2][2];
__local int local_id[8];
int index = tx + bx*size;
int j,k,color=0;
double dmin=1000000.0, dx;
for(j=0; j<2; j++)
{
if(tx<2)
localCent[tx][j] = centMobj[tx].cattr[j];
localData[tx][j] = dataMobj[index].dattr[j];
local_id[tx] = dataMobj[index].d_id;
}
barrier(CLK_LOCAL_MEM_FENCE);
for(j=0; j<2; j++)
{
dx = 0.0;
for(k=0; k<2; k++)
dx+= ((localCent[j][k] - localData[index][k]) * (localCent[j][k] - localData[index][k]));
if(dx<dmin)
{ color = j;
dmin = dx;
}
}
keyMobj[index] = color;
valueMobj[index] = local_id[tx];
}
In the above kernel, I am fetching first 8 objects of structure dataMobj and 2 objects of centMobj from global memory to the local memory localData and localCent respectively. As far as my programming is concerned I have used the proper for loops for data fetching.
But it gives me correct execution for first 8 work-items(for the first work-group) only. It doesn't fetch the the data using next remaining 8 work-items for remaining block.
Please tell me how this exactly works and let me know if I am going wrong. I have also used the memory fence for thread synchronization.