0
votes

I'm trying to store 8 unsigned char data to global memory. However, cuda built vector type only support to uchar4. So, I write the uchar_8 by myself. However, when I profile the code, I find the store behavior takes two STG.E instructions rather than one STG.E.64. I know cuda have STG.E.64, so how can I change my code to let compiler to do that.

I have added __align(8)__ directive on my structure, but it still not work. My cuda version is cuda8.0

typedef struct __align__(8){
    unsigned char x0;
    unsigned char y0;
    unsigned char z0;
    unsigned char w0;
    unsigned char x1;
    unsigned char y1;
    unsigned char z1;
    unsigned char w1;
}uchar_8;

_global__ void yuv420_to_rgb_gpu_(
        const uchar2*  y_component,
        const unsigned char*   u_component,
        const unsigned char*  v_component,
        uchar_8*  rgb_data,
        uint len,
        uint width
){
    uint bx = blockIdx.x;
    uint tx = threadIdx.x;
    uint current_index = bx*BLOCK_SIZE + tx;
    uchar2 y_tmp;
    uchar_8 rgb_tmp;

    if(current_index < len){
        unsigned char u_data = u_component[current_index];
        uint current_line = current_index / width ;
        unsigned char v_data = v_component[current_index];
        uint current_col = current_index - current_line * width;
        uint index_00 = 2*current_line*width + current_col;
        //uint index_01 = 2*current_line*width + 2*current_col + 1;
        y_tmp = y_component[index_00];
        unsigned char y_data_00 = y_tmp.x;
        unsigned char y_data_01 = y_tmp.y;
        uint index_10 = index_00 + width;
       // uint index_11 = 2*current_line*width + 2*current_col + width + 1;
        y_tmp = y_component[index_10];
        unsigned char y_data_10 = y_tmp.x;
        unsigned char y_data_11 = y_tmp.y;

        float r_component_0;
        float g_component_0;
        float b_component_0;
        float r_component_1;
        float g_component_1;
        float b_component_1;



        float r_v_tmp = (1.4075f * (v_data-128.0f));
        float g_v_tmp = (0.7169f * (v_data-128.0f));
        float g_u_tmp = (0.3455f * (u_data-128.0f));
        float y_u_tmp = (1.7790f * (u_data-128.0f));


        r_component_0 = y_data_00 + r_v_tmp;
        g_component_0 = y_data_00 - g_v_tmp - g_u_tmp;
        b_component_0 = y_data_00 + y_u_tmp;
        r_component_1 = y_data_01 + r_v_tmp;
        g_component_1 = y_data_01 - g_v_tmp - g_u_tmp;
        b_component_1 = y_data_01 + y_u_tmp;

        rgb_tmp.x0 = float_to_char(r_component_0);
        rgb_tmp.y0 = float_to_char(g_component_0);
        rgb_tmp.z0 = float_to_char(b_component_0);
        rgb_tmp.w0 = 0;
        rgb_tmp.x1 = float_to_char(r_component_1);
        rgb_tmp.y1 = float_to_char(g_component_1);
        rgb_tmp.z1 = float_to_char(b_component_1);
        rgb_tmp.w1 = 0;
        rgb_data[index_00] = rgb_tmp;




        r_component_0 = y_data_10 + r_v_tmp;
        g_component_0 = y_data_10 - g_v_tmp - g_u_tmp;
        b_component_0 = y_data_10 + y_u_tmp;
        r_component_1 = y_data_11 + r_v_tmp;
        g_component_1 = y_data_11 - g_v_tmp - g_u_tmp;
        b_component_1 = y_data_11 + y_u_tmp;

        rgb_tmp.x0 = float_to_char(r_component_0);
        rgb_tmp.y0 = float_to_char(g_component_0);
        rgb_tmp.z0 = float_to_char(b_component_0);
        rgb_tmp.w0 = 0;
        rgb_tmp.x1 = float_to_char(r_component_1);
        rgb_tmp.y1 = float_to_char(g_component_1);
        rgb_tmp.z1 = float_to_char(b_component_1);
        rgb_tmp.w1 = 0;
       // tmp.w = 0;
        rgb_data[index_10] = rgb_tmp;




    }
}

the memory store only occur at rgb_data[index_00] = rgb_tmp; andrgb_data[index_10] = rgb_tmp; And the disassembly code like this

        BFI R3, R7, 0x808, R14;
        BFI R5, R9, 0x808, R8;
        LEA R4.CC, R2.reuse, c[0x0][0x158], 0x3;
        BFI R6, R6, 0x810, R3;
        BFI R5, R0, 0x810, R5;
        LEA.HI.X R3, R2, c[0x0][0x15c], RZ, 0x3;
        MOV R2, R4;
        BFI R0, RZ, 0x818, R6;
        {         BFI R4, RZ, 0x818, R5;
        STG.E [R2], R0;        }
        STG.E [R2+0x4], R4;
        EXIT;
1
when I compile your code with cuda 10.1 I see ST.E.64 in the SASS. When I compile it with CUDA 8 I don't. Switch to a newer version of CUDA. - Robert Crovella

1 Answers

2
votes

Just expanding comments into an answer:

Every older version of the compiler I tested (8.0, 9.1, 10.0) will emit two st.global.v4.u8 instructions in PTX (i.e. two 32bit writes) for the uchar_8 assignment at the end of your kernel. CUDA 10.1, on the other hand, emits a single st.global.v4.u16 instruction to handle the write.

So the solution will be upgrading to CUDA 10.1. Any toolkit before that will not honor the 64 bit write request.