
According to the OpenCL guide for the Mali-T600 series GPUs, CL_MEM_ALLOC_HOST_PTR should be used to avoid any data copy and to improve performance.

Today I measured memory-copy time using CL_MEM_ALLOC_HOST_PTR on an Arndale board with a Mali-T604 GPU, and compared it against clEnqueueWriteBuffer. Overall I do not get much performance improvement from CL_MEM_ALLOC_HOST_PTR, because clEnqueueMapBuffer takes almost the same time as clEnqueueWriteBuffer. The test kernel is a vector addition.

How I tested: instead of creating a pointer with malloc and transferring its data to the device, I first created a buffer with CL_MEM_ALLOC_HOST_PTR, then mapped it with clEnqueueMapBuffer. This returns a pointer, and I filled the memory it points to with data. The mapping step itself takes time, almost as long as clEnqueueWriteBuffer, so in this example I did not get any significant improvement from CL_MEM_ALLOC_HOST_PTR.
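
For reference, the baseline path I am comparing against looks roughly like this (a sketch; context, commandQueue, ELE_NUM and err match the snippet further down, the rest is illustrative):

// Baseline: plain device buffer + clEnqueueWriteBuffer from a malloc'd host array.
cl_float *host_a = (cl_float *)malloc(sizeof(cl_float) * ELE_NUM);
for (int i = 0; i < ELE_NUM; i++)
    host_a[i] = 100.0f;

cl_mem bufA = clCreateBuffer(context, CL_MEM_READ_ONLY,
                             sizeof(cl_float) * ELE_NUM, NULL, &err);

// Blocking write: copies the host array into the device buffer.
err = clEnqueueWriteBuffer(commandQueue, bufA, CL_TRUE, 0,
                           sizeof(cl_float) * ELE_NUM, host_a, 0, NULL, NULL);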

My question is: why is the mapping time so large when I use CL_MEM_ALLOC_HOST_PTR?

Here are the performance measurements. Number of elements: 10000000, kernel: vector addition, all times in microseconds.

Normal read/write buffer:
    buffer creation time: 20
    clEnqueueWriteBuffer time: 108019

CL_MEM_ALLOC_HOST_PTR, filling the mapped pointer directly with data:
    filling the pointer returned by clEnqueueMapBuffer: 208009
    mapping time: 81346
    unmapping time: 269

CL_MEM_ALLOC_HOST_PTR, copying from a malloc'd pointer into the mapped pointer with memcpy:
    mapping time: 64134
    unmapping time: 190
    memcpy time (from the already created malloc pointer to the pinned host pointer): 56987

Here is the code snippet I used for the CL_MEM_ALLOC_HOST_PTR path:

    // Time buffer creation + mapping
    start = getTime();
    a_st = getTime();
    bufferA = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, sizeof(cl_float) * ELE_NUM, NULL, &err);
    cl_float *src_a = (cl_float *)clEnqueueMapBuffer(commandQueue, bufferA, CL_TRUE, CL_MAP_WRITE,
                                                     0, sizeof(cl_float) * ELE_NUM, 0, NULL, NULL, &err);

    bufferB = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, sizeof(cl_float) * ELE_NUM, NULL, &err);
    cl_float *src_b = (cl_float *)clEnqueueMapBuffer(commandQueue, bufferB, CL_TRUE, CL_MAP_WRITE,
                                                     0, sizeof(cl_float) * ELE_NUM, 0, NULL, NULL, &err);
    clFinish(commandQueue);
    a_en = getTime();
    a_time = a_time + (a_en - a_st);   // creation + mapping time

    // Time filling the mapped pointers with data
    pfill_s = getTime();
    for (int i = 0; i < ELE_NUM; i++) {
        src_a[i] = 100.0;
        src_b[i] = 11.1;
    }
    pfill_e = getTime();
    pfill_time = pfill_time + (pfill_e - pfill_s);

    // Time unmapping
    b_st = getTime();
    clEnqueueUnmapMemObject(commandQueue, bufferB, src_b, 0, NULL, NULL);
    clEnqueueUnmapMemObject(commandQueue, bufferA, src_a, 0, NULL, NULL);
    clFinish(commandQueue);
    b_en = getTime();
    b_time = b_time + (b_en - b_st);

    end = getTime();
    creat_buffer += (end - start);
    bufferC = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, sizeof(cl_float) * ELE_NUM, NULL, &err);
Comment from DarkZeros: You should copy the data into the mapped pointer with memcpy(). Writing it element by element will probably defeat the DMA transfer, which is what you gain from Map() compared with Read(). I saw this behaviour on NVIDIA.
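
In code, that suggestion corresponds roughly to the memcpy variant measured above (a sketch; src_data stands for an already-filled malloc'd array and is illustrative):

// Map the pinned buffer, then copy the prepared data in one bulk memcpy
// instead of writing element by element through the mapped pointer.
cl_float *mapped = (cl_float *)clEnqueueMapBuffer(commandQueue, bufferA, CL_TRUE,
                                                  CL_MAP_WRITE, 0,
                                                  sizeof(cl_float) * ELE_NUM,
                                                  0, NULL, NULL, &err);
memcpy(mapped, src_data, sizeof(cl_float) * ELE_NUM);
clEnqueueUnmapMemObject(commandQueue, bufferA, mapped, 0, NULL, NULL);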

1 Answer


According to your source code, you are measuring the host-side time of the OpenCL operations. This generally gives poor results, because the time you get consists of the device-side operation time (which is what you want) plus various overheads. OpenCL events are a powerful tool for performance measurement and task synchronization, and they can be used to get exact operation times. To see how memory mapping performs in terms of speed, I recommend running the code below:

static void checkError(cl_int ret_code, int line)
{
    if(ret_code != CL_SUCCESS){
        fprintf(stderr, "Error %d happened at line %d.\n", ret_code, line);
    }
}

static long getRunTime(cl_event evt){
    cl_ulong 
        start = 0, 
        end   = 0;

    cl_int ret = CL_SUCCESS;

    ret = clWaitForEvents(1, &evt);
    checkError(ret, __LINE__);

    ret = clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START,
            sizeof(cl_ulong), &start, NULL);
    checkError(ret, __LINE__);

    ret = clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong),
            &end, NULL);
    checkError(ret, __LINE__);

    return end - start;
}

int 
    kilobyte = 1024,
    megabyte = kilobyte * 1024;

cl_int ret = CL_SUCCESS;

for(int size = 64 * kilobyte; size < 10 * megabyte; size *= 2){
    cl_mem buff1 = clCreateBuffer(some_cl_context, CL_MEM_ALLOC_HOST_PTR, size, NULL, &ret);
    checkError(ret, __LINE__);

    cl_mem buff2 = clCreateBuffer(some_cl_context, CL_MEM_READ_WRITE, size, NULL, &ret);
    checkError(ret, __LINE__);

    void *host_buffer = malloc(size);
    cl_event evt;

    // Checking clEnqueueMapBuffer time
    void *mapped = clEnqueueMapBuffer(..., buff1, ..., &evt, &ret);
    checkError(ret, __LINE__);

    long time = getRunTime(evt);
    fprintf(stdout, "clEnqueueMapBuffer: size: %d bytes time: %l nanoseconds.\n");

    clEnqueueUnmapMemObject(..., buff1, ...);

    // Checking clEnqueueReadBuffer from CL_MEM_ALLOC_HOST_PTR
    ret  = clEnqueueReadBuffer(..., buff1, ..., host_buffer, ..., size, ..., &evt);
    checkError(ret, __LINE__);

    time = getRunTime(evt);
    fprintf(stdout, "clEnqueueReadBuffer from CL_MEM_ALLOC_HOST_PTR: size: %d bytes time: %l nanoseconds.\n");

    // Checking clEnqueueReadBuffer from CL_MEM_READ_WRITE
    ret  = clEnqueueReadBuffer(..., buff2, ..., host_buffer, ..., size, ..., &evt);
    checkError(ret, __LINE__);

    time = getRunTime(evt);
    fprintf(stdout, "clEnqueueReadBuffer from CL_MEM_READ_WRITE: size: %d bytes time: %l nanoseconds.\n");

    clReleaseMemObject(buff1);
    clReleaseMemObject(buff2);
    free(host_buffer);
}
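
Note that clGetEventProfilingInfo only returns valid timestamps if the command queue was created with profiling enabled. A minimal sketch (some_cl_device is an assumed, illustrative device handle):

// The queue used for the measurements above must have profiling enabled,
// otherwise the profiling queries fail with CL_PROFILING_INFO_NOT_AVAILABLE.
cl_int err = CL_SUCCESS;
cl_command_queue queue = clCreateCommandQueue(some_cl_context, some_cl_device,
                                              CL_QUEUE_PROFILING_ENABLE, &err);
checkError(err, __LINE__);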

Please share the timing results you get.