I'm trying to store 8 unsigned char data to global memory. However, cuda built vector type only support to uchar4. So, I write the uchar_8 by myself. However, when I profile the code, I find the store behavior takes two STG.E instructions rather than one STG.E.64. I know cuda have STG.E.64, so how can I change my code to let compiler to do that.
I have added __align(8)__ directive on my structure, but it still not work. My cuda version is cuda8.0
typedef struct __align__(8){
unsigned char x0;
unsigned char y0;
unsigned char z0;
unsigned char w0;
unsigned char x1;
unsigned char y1;
unsigned char z1;
unsigned char w1;
}uchar_8;
_global__ void yuv420_to_rgb_gpu_(
const uchar2* y_component,
const unsigned char* u_component,
const unsigned char* v_component,
uchar_8* rgb_data,
uint len,
uint width
){
uint bx = blockIdx.x;
uint tx = threadIdx.x;
uint current_index = bx*BLOCK_SIZE + tx;
uchar2 y_tmp;
uchar_8 rgb_tmp;
if(current_index < len){
unsigned char u_data = u_component[current_index];
uint current_line = current_index / width ;
unsigned char v_data = v_component[current_index];
uint current_col = current_index - current_line * width;
uint index_00 = 2*current_line*width + current_col;
//uint index_01 = 2*current_line*width + 2*current_col + 1;
y_tmp = y_component[index_00];
unsigned char y_data_00 = y_tmp.x;
unsigned char y_data_01 = y_tmp.y;
uint index_10 = index_00 + width;
// uint index_11 = 2*current_line*width + 2*current_col + width + 1;
y_tmp = y_component[index_10];
unsigned char y_data_10 = y_tmp.x;
unsigned char y_data_11 = y_tmp.y;
float r_component_0;
float g_component_0;
float b_component_0;
float r_component_1;
float g_component_1;
float b_component_1;
float r_v_tmp = (1.4075f * (v_data-128.0f));
float g_v_tmp = (0.7169f * (v_data-128.0f));
float g_u_tmp = (0.3455f * (u_data-128.0f));
float y_u_tmp = (1.7790f * (u_data-128.0f));
r_component_0 = y_data_00 + r_v_tmp;
g_component_0 = y_data_00 - g_v_tmp - g_u_tmp;
b_component_0 = y_data_00 + y_u_tmp;
r_component_1 = y_data_01 + r_v_tmp;
g_component_1 = y_data_01 - g_v_tmp - g_u_tmp;
b_component_1 = y_data_01 + y_u_tmp;
rgb_tmp.x0 = float_to_char(r_component_0);
rgb_tmp.y0 = float_to_char(g_component_0);
rgb_tmp.z0 = float_to_char(b_component_0);
rgb_tmp.w0 = 0;
rgb_tmp.x1 = float_to_char(r_component_1);
rgb_tmp.y1 = float_to_char(g_component_1);
rgb_tmp.z1 = float_to_char(b_component_1);
rgb_tmp.w1 = 0;
rgb_data[index_00] = rgb_tmp;
r_component_0 = y_data_10 + r_v_tmp;
g_component_0 = y_data_10 - g_v_tmp - g_u_tmp;
b_component_0 = y_data_10 + y_u_tmp;
r_component_1 = y_data_11 + r_v_tmp;
g_component_1 = y_data_11 - g_v_tmp - g_u_tmp;
b_component_1 = y_data_11 + y_u_tmp;
rgb_tmp.x0 = float_to_char(r_component_0);
rgb_tmp.y0 = float_to_char(g_component_0);
rgb_tmp.z0 = float_to_char(b_component_0);
rgb_tmp.w0 = 0;
rgb_tmp.x1 = float_to_char(r_component_1);
rgb_tmp.y1 = float_to_char(g_component_1);
rgb_tmp.z1 = float_to_char(b_component_1);
rgb_tmp.w1 = 0;
// tmp.w = 0;
rgb_data[index_10] = rgb_tmp;
}
}
the memory store only occur at rgb_data[index_00] = rgb_tmp; andrgb_data[index_10] = rgb_tmp; And the disassembly code like this
BFI R3, R7, 0x808, R14;
BFI R5, R9, 0x808, R8;
LEA R4.CC, R2.reuse, c[0x0][0x158], 0x3;
BFI R6, R6, 0x810, R3;
BFI R5, R0, 0x810, R5;
LEA.HI.X R3, R2, c[0x0][0x15c], RZ, 0x3;
MOV R2, R4;
BFI R0, RZ, 0x818, R6;
{ BFI R4, RZ, 0x818, R5;
STG.E [R2], R0; }
STG.E [R2+0x4], R4;
EXIT;
ST.E.64in the SASS. When I compile it with CUDA 8 I don't. Switch to a newer version of CUDA. - Robert Crovella