OpenCL Transfer rate exceed PCI-e Bandwidth

Question

I made an OpenCL program and use pinned memory (CL_MEM_ALLOC_HOST_PTR) to get a higher transfer rate from device to host.

The transfer rate is increased as I expected (get transfer rate using AMD APP Profiler 2.4). The problem is the transfer rate is higher than PCIe bandwidth (93703 GB /s) for matrix 4096 x 4096 (64 MB).

It happened too when I use zero copy buffer ( CL_MEM_ALLOC_HOST_PTR + clEnqueueMapBuffer). I search some information that it is true if pinned memory and zero copy buffer have high transfer rate but it still limited with PCIe bandwidth for discrete GPU. So, is it normal if the transfer rate exceed PCIe bandwidth (using PCIe bandwidth 2.0 x 16)?

My OS is Windows 7 64 bit. I use AMD APP SDK 2.6 and discrete GPU AMD HD 6630M.

Edit: Here is the code:

#include <Windows.h>
#include <iostream>
#include <fstream>
#include <string>
using namespace std;

#ifdef __APPLE__   
   #include <OpenCL/opencl.h>   
#else  
   #include <CL/cl.h>   
#endif 

#define MAX_SOURCE_SIZE (0x100000)

cl_context context = NULL; 
cl_command_queue queue = NULL; 
cl_program program = NULL; 

void MatrixMul(cl_mem d_A, cl_mem d_B, cl_mem d_C, int size)
{
cl_int err;
cl_kernel naive;

// Create Kernel Object Bound To Kernel Function 
naive = clCreateKernel(program, "naiveAlgorithm", &err);

//Set size of global work item and work tem in each work goups
int globalsize = size;
int localsize;

if(globalsize >= 16)
{
    localsize =16;
}else
{
    localsize = globalsize;
}

size_t global_work_items [2] = {globalsize, globalsize};
size_t local_work_items  [2] = {localsize, localsize};

// Setup Kernel Argument
err = clSetKernelArg(naive, 0, sizeof(cl_mem), (void *)&d_A);
err = clSetKernelArg(naive, 1, sizeof(cl_mem), (void *)&d_B);
err = clSetKernelArg(naive, 2, sizeof(cl_mem), (void *)&d_C);
err = clSetKernelArg(naive, 3, sizeof(cl_int), (void *)&size);



// Execute OpenCL kernel for Naive Algorithm
err = clEnqueueNDRangeKernel(queue, naive, 2, NULL, global_work_items, local_work_items, 0, NULL, NULL);
clFinish(queue);

//Release Kernel
err = clReleaseKernel(naive);
}

void Naive(cl_float* matrixA, cl_float* matrixB, cl_float* matrixC, int size)
{
int err;
// OpenCL device memory for matrices
cl_mem d_A;
cl_mem d_B;
cl_mem d_C;

// Allocate Device Memory For Input And Output
d_A = clCreateBuffer(context,  CL_MEM_READ_ONLY   ,   sizeof(cl_float)*size*size, 0, &err);
d_B = clCreateBuffer(context,  CL_MEM_READ_ONLY   ,   sizeof(cl_float)*size*size, 0, &err);
d_C = clCreateBuffer(context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR ,sizeof(cl_float)*size*size, 0,&err);     

// Copy Host Memory To Memory Device
err = clEnqueueWriteBuffer(queue, d_A, CL_FALSE, 0, sizeof(cl_float)*size*size, matrixA, 0, NULL, NULL); 
err = clEnqueueWriteBuffer(queue, d_B, CL_FALSE, 0, sizeof(cl_float)*size*size, matrixB, 0, NULL, NULL); 

MatrixMul(d_A, d_B, d_C, size);

err = clEnqueueReadBuffer(queue, d_C, CL_TRUE, 0, sizeof(cl_float)*size*size, matrixC, 0, NULL, NULL);

err = clReleaseMemObject(d_A);
err = clReleaseMemObject(d_B);
err = clReleaseMemObject(d_C);
}



//Main Function
int main(int argc, char **argv)
{
//Size of matrix for Strassen Algorithm
cl_int size = 4096; 

//Matrix for input and output
cl_float * matrixA;
cl_float * matrixB;
cl_float * matrixC;

//Allocate  and init memory for the host
matrixA = (cl_float *) malloc(size*size*sizeof(cl_float));
matrixB = (cl_float *) malloc(size*size*sizeof(cl_float));
matrixC = (cl_float *) malloc(size*size*sizeof(cl_float));

//Fill matrix
fillMatrix(matrixA,size);
fillMatrix(matrixB,size);

//print input for matrix A and B
cout<<"Input for matrix A :"<<endl;
printMatrix(matrixA, size*size, size);
cout<<"Input for matrix B :"<<endl;
printMatrix(matrixB, size*size, size);

cl_int err;     // error code   

cl_platform_id* platforms;
cl_uint platformCount;

cl_device_id device;

int platformtype = 0; //if 0 using amd app sdk but if 1 using intel sdk

clGetPlatformIDs(0, NULL, &platformCount); //get number of platform
platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount); 
clGetPlatformIDs(platformCount, platforms, NULL);  //get list of platform
clGetDeviceIDs (platforms [platformtype], CL_DEVICE_TYPE_GPU, 1, &device, NULL); //get list of devices

const cl_context_properties contextProperties [] =
{CL_CONTEXT_PLATFORM,
     reinterpret_cast<cl_context_properties> (platforms [platformtype]),
     0, 0
};


context = clCreateContext(contextProperties, 1, &device, NULL, NULL, &err);
    ![enter image description here][2]queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);


//Load Kernel Source 
FILE *fp;
const char fileName[] = "./MatMul_Kernel.cl";
size_t source_size;
char *source_str;

fp = fopen(fileName, "r");
if (!fp) 
{
    fprintf(stderr, "Failed to load kernel.\n");
    exit(1);
}
source_str = (char *)malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);

// Create Program Object 
program = clCreateProgramWithSource(context, 1, (const char **) &source_str,(const size_t *),
    &source_size, &err); 

// Build Program 
    err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);

Naive(matrixA, matrixB, matrixC, size);

    //Cleanup all memory
err = clFlush(queue);
    err = clFinish(queue);
    err = clReleaseProgram(program);
    err = clReleaseCommandQueue(queue);
    err = clReleaseContext(context);

// Display result of matrix multiplication
cout<<"Output for matrix C :"<<endl;
    printMatrix(matrixC, size*size, size);
cout<<endl;

free(matrixA);
    free(matrixB);
    free(matrixC);
free(source_str);

    return 0;
}

And here is the kernel code:

 __kernel void naiveAlgorithm(__global float *A, __global float *B, __global float *C, int size) {

 int tx = get_global_id(0); //2D Thread IDx
 int ty = get_global_id(1); //2D Thread IDy

 float sum = 0;

 //Calculate result of one element of Matrix C
 for (int k = 0; k < size; k++) {
    sum += A[ty*size+k] * B[k*size+tx];
 }
  C[ty*size+tx] = sum;
 }

And here is the image:

enter image description here

What is your code? You are probably seen the effects of asynchronous transfers. — DarkZeros
Thanks for reply, I already add the code and image from AMD APP Profiler. — arvin99
Interesting results, It could be either that your kernel is directly writing on host memory or that the profiler is not profiling properly. However, those transfer rates are very high to be real (including the overhead). — DarkZeros
Because if your kernel directly writes to host memory. Then the transfer is just an NOP operation that takes no time. — DarkZeros
It looks like a bug on AMD APP Profiler. Look at the time consumed by clEnqueueReadBuffer but nothing is shown on the Data Transfer row except for that little blip at the end. My guess is that it didn't time the transfer correctly and therefore got the bandwidth measurement wrong. You could see of Code XL fixes this, or just use the readback timing to figure out your bandwidth. — Dithermaster

chippies chippies · Accepted Answer · 2014-05-19T12:21:16

I see that your output array is actually located in host memory because of the CL_MEM_ALLOC_HOST_PTR flag in the following line:

d_C = clCreateBuffer(context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR ,sizeof(cl_float)*size*size, 0,&err);

This means that you should be using clEnqueueMapBuffer, followed by using the matrix in whatever way you see fit, followed by clEnqueueUnmapMemObject. There is no need for the array matrixC since d_C is already in host memory.

The data transfer from GPU to host actually happens while your kernel is running. The map call makes sure that all data has finished moving from the GPU to the CPU. That is why the transfer times are actually so small.

I can't find any documentation on whether clEnqueueReadBuffer works for pinned memory or not. I also see that you are retrieving the error codes of each operation but do not check these error codes, hence your code may be silently failing.

Regarding the large difference between the time taken by clEnqueueReadBuffer and the time spent transferring data, note that all queued operations don't immediately get dispatched to the GPU. One source of delay is the Windows display driver model (WDDM) for graphics cards. The +-20 micro-seconds used for the clEnqueueReadBuffer sounds right for this delay (I've actually seen longer delays).

OpenCL Transfer rate exceed PCI-e Bandwidth

1 Answers