I have written a opencl code snippet which I want to improve further:
global_size is 1920x1080
local size is kept NULL. I have left this to compiler.
__kernel void experiment(__read_only image2d_t YIn, __write_only image2d_t YOut)
{
uint4 best_suited=0;
uint4 temp =0;
int best_sum,ssum;
int2 coord_src = (int2)(get_global_id(0), 2*get_global_id(1)+1);
const sampler_t smp = CLK_FILTER_NEAREST | CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE;
uint4 pixel1 = read_imageui(YIn, smp, coord_src + (int2)(-3,0));
uint4 pixel2 = read_imageui(YIn, smp, coord_src + (int2)(-2,0));
uint4 pixel3 = read_imageui(YIn, smp, coord_src + (int2)(-1,0));
uint4 pixel4 = read_imageui(YIn, smp, coord_src + (int2)( 0,0));
uint4 pixel5 = read_imageui(YIn, smp, coord_src + (int2)( 1,0));
uint4 pixel6 = read_imageui(YIn, smp, coord_src + (int2)( 2,0));
uint4 pixel7 = read_imageui(YIn, smp, coord_src + (int2)( 3,0));
/* Read luma pixels of next line */
uint4 pixel_nxt1 = read_imageui(YIn, smp, coord_src + (int2)(-3,2));
uint4 pixel_nxt2 = read_imageui(YIn, smp, coord_src + (int2)(-2,2));
uint4 pixel_nxt3 = read_imageui(YIn, smp, coord_src + (int2)(-1,2));
uint4 pixel_nxt4 = read_imageui(YIn, smp, coord_src + (int2)( 0,2));
uint4 pixel_nxt5 = read_imageui(YIn, smp, coord_src + (int2)( 1,2));
uint4 pixel_nxt6 = read_imageui(YIn, smp, coord_src + (int2)( 2,2));
uint4 pixel_nxt7 = read_imageui(YIn, smp, coord_src + (int2)( 3,2));
/* main loop: */
{
best_sum= abs_diff(pixel3.x,pixel_nxt4.x) + abs_diff(pixel4.x,pixel_nxt5.x) + abs_diff(pixel5.x,pixel_nxt6.x) -8;
best_suited.x = (pixel4.x+pixel_nxt2.x) >> 1;
sum = abs_diff(pixel2.x,pixel_nxt2.x) + abs_diff(pixel3.x,pixel_nxt6.x) + abs_diff(pixel4.x,pixel_nxt1.x);
if (sum < best_sum)
{
best_sum = sum;
best_suited.x = (pixel3.x+pixel_nxt3.x) >> 1;
sum = abs_diff(pixel1.x,pixel_nxt5.x) + abs_diff(pixel2.x,pixel_nxt6.x) + abs_diff(pixel3.x,pixel_nxt7.x) + 16;
if (sum < best_sum)
{
best_sum = sum;
best_suited.x = (pixel5.x+pixel_nxt1.x) >> 1;
}
}
sum = abs_diff(pixel4.x,pixel_nxt5.x) + abs_diff(pixel5.x,pixel_nxt2.x) + abs_diff(pixel6.x,pixel_nxt1.x);
if (sum < best_sum)
{
best_sum = sum;
best_suited.x = (pixel4.x+pixel_nxt3.x)>> 1;
sum = abs_diff(pixel5.x,pixel_nxt3.x) + abs_diff(pixel6.x,pixel_nxt4.x) + abs_diff(pixel7.x,pixel_nxt3.x);
if (sum < best_sum)
{
best_sum = sum;
best_suited.x = (pixel6.x+pixel_nxt2.x) >> 1;
}
}
}
/* Pix4(0,0) is the current pixel in below calculations */
write_imageui(YOut, coord_src, pixel4);
/* store the result: */
write_imageui(YOut, coord_src+(int2)(0,1),best_suited);
}
I have tried the following things: 1) abs_diff is the inbuilt function and by replacing abs_diff with the bitwise code is not giving any improvement.
2) Analysed its performance using intel Vtune and saw execution units are idle for 30% of time. GPU memory read is 7.6GB/sec and write is 3.942GB/sec.Number of L3 cache misses is close to 177x10^9 and Computing Thread are close to 35 lacs. Also Sampler Bottlenecks are 8.3%.
Thinking further: 1) I don't know whether reading the data in local memory will benefit me or not. Since local memory cache line access is same as accessing L3 cache on intel architecture. And reading via image api's I am already accessing the cache memory for image objects i.e. Texture memory. The only help I can think can be reducing sampler bottlenecks if I write code something like this: __local smem [256] ; smem[get_local_id(0) = read_imageui(YIn, smp, coord_src);
2) I also don't know what should be the optimal work group size here.
Can anyone explain to me, in full detail, how this code can be optimized? How can I reduce my execution idle time, computing threads, L3 cache misses, and increase my GPU memory read and write access. If you can re-write the code that will be really helpful.