I wrote an OpenCL kernel that performs a box blur on an input matrix. The implementation was originally written for a GPU, and uses local memory to store the neighborhoods of work items in a work group. Then, I ran the kernel on a CPU and compared the running times to an implemenation which relied on caching reads from global memory automatically instead of manually storing them in local memory first.
Under the assumption that a CPU has no "local memory" and instead uses RAM, using local memory on a CPU should do more harm than good. However, the "local memory" kernel was faster than the one that relied on caching by 10ms (~112ms vs. ~122ms on a 8192x8192 matrix with Work Item / Work Group / "number of values calculated by each work item" settings deemed optimal for both implementations since they were found by an auto-tuner for both kernels separately).
The kernels were run on a Intel Xeon E5-1620 v2 CPU using an OpenCL intel platform available on the host.
What are reasons for this to happen?
"Local Memory" kernel: Each work item works on a "block" of values. Each block is copied to shared memory, and its neighborhood is copied to local memory depending on where the block is in the work group so no values are copied twice. Then, after the barrier, the final value is calculated.
The code below is the X-direction kernel; the y-direction kernel is exactly the same except for the direction in which the values are inspect to calculate the output value.
__kernel void boxblur_x (__read_only __global float* image,
__local float* localmem,
__write_only __global float* output)
{
// size of input and output matrix
int MATRIX_SIZE_Y = IMAGE_HEIGHT;
int MATRIX_SIZE_X = IMAGE_WIDTH;
int MATRIX_SIZE = MATRIX_SIZE_Y * MATRIX_SIZE_X;
// mask size
int S_L = MASK_SIZE_LEFT;
int S_U = 0;
int S_R = MASK_SIZE_RIGHT;
int S_D = 0;
int SHAPE_SIZE_Y = S_U + S_D + 1;
int SHAPE_SIZE_X = S_L + S_R + 1;
int SHAPE_SIZE = SHAPE_SIZE_Y * SHAPE_SIZE_X;
// tuning parameter
// ---------------------------------------------------------------
//work items in y/x dimension per work group
int NUM_WI_Y = get_local_size(1);
int NUM_WI_X = get_local_size(0);
//size of blocks
int BLOCKHEIGHT = X_BLOCKHEIGHT;
int BLOCKWIDTH = X_BLOCKWIDTH;
//position in matrix
int GLOBAL_POS_X = get_global_id(0) * BLOCKWIDTH;
int GLOBAL_POS_Y = get_global_id(1) * BLOCKHEIGHT;
//localMemory size
int LOCALMEM_WIDTH = S_L + NUM_WI_X * BLOCKWIDTH + S_R;
//position in localmem
int LOCAL_POS_X = S_L + get_local_id(0) * BLOCKWIDTH;
int LOCAL_POS_Y = S_U + get_local_id(1) * BLOCKHEIGHT;
// copy values to shared memory
for (int i = 0; i < BLOCKHEIGHT; i++)
{
for (int j = 0; j < BLOCKWIDTH; j++)
{
localmem[(LOCAL_POS_X + j) + (LOCAL_POS_Y + i) * LOCALMEM_WIDTH] = image[GLOBAL_POS_X + j + (GLOBAL_POS_Y + i) * MATRIX_SIZE_X];
}
}
// only when all work items have arrived here,
// computation continues - otherwise, not all needed
// values might be available in local memory
barrier (CLK_LOCAL_MEM_FENCE);
for (int i = 0; i < BLOCKHEIGHT; i++)
{
for (int j = 0; j < BLOCKWIDTH; j++)
{
float sum = 0;
for (int b = 0; b <= S_L + S_R; b++)
{
sum += localmem[(get_local_id(0) * BLOCKWIDTH + j + b) + (get_local_id(1) * BLOCKHEIGHT + i) * LOCALMEM_WIDTH];
}
// divide by size of mask
float pixelValue = sum / SHAPE_SIZE;
// write new pixel value to output image
output[GLOBAL_POS_X + j + ((GLOBAL_POS_Y + i) * get_global_size(0) * BLOCKWIDTH)] = pixelValue;
}
}
}
"L1 Caching kernel": Despite the many defines, it does exactly the same, but relies on global memory caching of the blocks instead of explicitly managing local memory.
#define WG_BLOCK_SIZE_Y ( OUTPUT_SIZE_Y / NUM_WG_Y )
#define WG_BLOCK_SIZE_X ( OUTPUT_SIZE_X / NUM_WG_X )
#define WI_BLOCK_SIZE_Y ( WG_BLOCK_SIZE_Y / NUM_WI_Y )
#define WI_BLOCK_SIZE_X ( WG_BLOCK_SIZE_X / NUM_WI_X )
#define WG_BLOCK_OFFSET_Y ( WG_BLOCK_SIZE_Y * WG_ID_Y )
#define WG_BLOCK_OFFSET_X ( WG_BLOCK_SIZE_X * WG_ID_X )
#define WI_BLOCK_OFFSET_Y ( WI_BLOCK_SIZE_Y * WI_ID_Y )
#define WI_BLOCK_OFFSET_X ( WI_BLOCK_SIZE_X * WI_ID_X )
#define NUM_CACHE_BLOCKS_Y ( WI_BLOCK_SIZE_Y / CACHE_BLOCK_SIZE_Y )
#define NUM_CACHE_BLOCKS_X ( WI_BLOCK_SIZE_X / CACHE_BLOCK_SIZE_X )
#define CACHE_BLOCK_OFFSET_Y ( CACHE_BLOCK_SIZE_Y * ii )
#define CACHE_BLOCK_OFFSET_X ( CACHE_BLOCK_SIZE_X * jj )
#define reorder(j) ( ( (j) / WI_BLOCK_SIZE_X) + ( (j) % WI_BLOCK_SIZE_X) * NUM_WI_X )
#define reorder_inv(j) reorder(j)
#define view( i, j, x, y ) input[ ((i) + (x)) * INPUT_SIZE_X + ((j) + (y)) ]
#define a_wg( i, j, x, y ) view( WG_BLOCK_OFFSET_Y + (i), WG_BLOCK_OFFSET_X + reorder(j), (x), (y) )
#define a_wi( i, j, x, y ) a_wg( WI_BLOCK_OFFSET_Y + (i), WI_BLOCK_OFFSET_X + (j) , (x), (y) )
#define a_cache( i, j, x, y ) a_wi( CACHE_BLOCK_OFFSET_Y + (i), CACHE_BLOCK_OFFSET_X + (j) , (x), (y) )
#define res_wg( i, j ) output[ (WG_BLOCK_OFFSET_Y + i) * OUTPUT_SIZE_X + WG_BLOCK_OFFSET_X + reorder_inv(j) ]
#define res(i, j) output[ (i) * OUTPUT_SIZE_X + (j) ]
#define res_wg( i, j ) res( WG_BLOCK_OFFSET_Y + (i) , WG_BLOCK_OFFSET_X + reorder_inv(j) )
#define res_wi( i, j ) res_wg( WI_BLOCK_OFFSET_Y + (i) , WI_BLOCK_OFFSET_X + (j) )
#define res_cache( i, j ) res_wi( CACHE_BLOCK_OFFSET_Y + (i), CACHE_BLOCK_OFFSET_X + (j) )
float f_stencil( __global float* input, int ii, int jj, int i, int j )
{
// indices
const int WG_ID_X = get_group_id(0);
const int WG_ID_Y = get_group_id(1);
const int WI_ID_X = get_local_id(0);
const int WI_ID_Y = get_local_id(1);
// computation
float sum = 0;
for( int y = 0 ; y < SHAPE_SIZE_Y ; ++y )
for( int x = 0 ; x < SHAPE_SIZE_X ; ++x)
sum += a_cache(i, j, y, x);
return sum / SHAPE_SIZE;
}
__kernel void stencil( __global float* input,
__global float* output
)
{
//indices
const int WG_ID_X = get_group_id(0);
const int WG_ID_Y = get_group_id(1);
const int WI_ID_X = get_local_id(0);
const int WI_ID_Y = get_local_id(1);
// iteration over cache blocks
for( int ii=0 ; ii < NUM_CACHE_BLOCKS_Y ; ++ii )
for( int jj=0 ; jj < NUM_CACHE_BLOCKS_X ; ++jj )
// iteration within a cache block
for( int i=0 ; i < CACHE_BLOCK_SIZE_Y ; ++i )
for( int j=0 ; j < CACHE_BLOCK_SIZE_X ; ++j )
res_cache( i, j ) = f_stencil( input, ii, jj, i , j );
}
fix5
, note how I'm explicitly directing the compiler with the use of the extraAp
,Up
, andLp
variables to precalc the loop invariants. – Craig Estey