I made an OpenCL program and use pinned memory (CL_MEM_ALLOC_HOST_PTR
) to get a higher transfer rate from device to host.
The transfer rate is increased as I expected (get transfer rate using AMD APP Profiler 2.4). The problem is the transfer rate is higher than PCIe bandwidth (93703 GB /s) for matrix 4096 x 4096 (64 MB).
It happened too when I use zero copy buffer ( CL_MEM_ALLOC_HOST_PTR + clEnqueueMapBuffer). I search some information that it is true if pinned memory and zero copy buffer have high transfer rate but it still limited with PCIe bandwidth for discrete GPU. So, is it normal if the transfer rate exceed PCIe bandwidth (using PCIe bandwidth 2.0 x 16)?
My OS is Windows 7 64 bit. I use AMD APP SDK 2.6 and discrete GPU AMD HD 6630M.
Edit: Here is the code:
#include <Windows.h>
#include <iostream>
#include <fstream>
#include <string>
using namespace std;
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#define MAX_SOURCE_SIZE (0x100000)
cl_context context = NULL;
cl_command_queue queue = NULL;
cl_program program = NULL;
void MatrixMul(cl_mem d_A, cl_mem d_B, cl_mem d_C, int size)
{
cl_int err;
cl_kernel naive;
// Create Kernel Object Bound To Kernel Function
naive = clCreateKernel(program, "naiveAlgorithm", &err);
//Set size of global work item and work tem in each work goups
int globalsize = size;
int localsize;
if(globalsize >= 16)
{
localsize =16;
}else
{
localsize = globalsize;
}
size_t global_work_items [2] = {globalsize, globalsize};
size_t local_work_items [2] = {localsize, localsize};
// Setup Kernel Argument
err = clSetKernelArg(naive, 0, sizeof(cl_mem), (void *)&d_A);
err = clSetKernelArg(naive, 1, sizeof(cl_mem), (void *)&d_B);
err = clSetKernelArg(naive, 2, sizeof(cl_mem), (void *)&d_C);
err = clSetKernelArg(naive, 3, sizeof(cl_int), (void *)&size);
// Execute OpenCL kernel for Naive Algorithm
err = clEnqueueNDRangeKernel(queue, naive, 2, NULL, global_work_items, local_work_items, 0, NULL, NULL);
clFinish(queue);
//Release Kernel
err = clReleaseKernel(naive);
}
void Naive(cl_float* matrixA, cl_float* matrixB, cl_float* matrixC, int size)
{
int err;
// OpenCL device memory for matrices
cl_mem d_A;
cl_mem d_B;
cl_mem d_C;
// Allocate Device Memory For Input And Output
d_A = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(cl_float)*size*size, 0, &err);
d_B = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(cl_float)*size*size, 0, &err);
d_C = clCreateBuffer(context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR ,sizeof(cl_float)*size*size, 0,&err);
// Copy Host Memory To Memory Device
err = clEnqueueWriteBuffer(queue, d_A, CL_FALSE, 0, sizeof(cl_float)*size*size, matrixA, 0, NULL, NULL);
err = clEnqueueWriteBuffer(queue, d_B, CL_FALSE, 0, sizeof(cl_float)*size*size, matrixB, 0, NULL, NULL);
MatrixMul(d_A, d_B, d_C, size);
err = clEnqueueReadBuffer(queue, d_C, CL_TRUE, 0, sizeof(cl_float)*size*size, matrixC, 0, NULL, NULL);
err = clReleaseMemObject(d_A);
err = clReleaseMemObject(d_B);
err = clReleaseMemObject(d_C);
}
//Main Function
int main(int argc, char **argv)
{
//Size of matrix for Strassen Algorithm
cl_int size = 4096;
//Matrix for input and output
cl_float * matrixA;
cl_float * matrixB;
cl_float * matrixC;
//Allocate and init memory for the host
matrixA = (cl_float *) malloc(size*size*sizeof(cl_float));
matrixB = (cl_float *) malloc(size*size*sizeof(cl_float));
matrixC = (cl_float *) malloc(size*size*sizeof(cl_float));
//Fill matrix
fillMatrix(matrixA,size);
fillMatrix(matrixB,size);
//print input for matrix A and B
cout<<"Input for matrix A :"<<endl;
printMatrix(matrixA, size*size, size);
cout<<"Input for matrix B :"<<endl;
printMatrix(matrixB, size*size, size);
cl_int err; // error code
cl_platform_id* platforms;
cl_uint platformCount;
cl_device_id device;
int platformtype = 0; //if 0 using amd app sdk but if 1 using intel sdk
clGetPlatformIDs(0, NULL, &platformCount); //get number of platform
platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
clGetPlatformIDs(platformCount, platforms, NULL); //get list of platform
clGetDeviceIDs (platforms [platformtype], CL_DEVICE_TYPE_GPU, 1, &device, NULL); //get list of devices
const cl_context_properties contextProperties [] =
{CL_CONTEXT_PLATFORM,
reinterpret_cast<cl_context_properties> (platforms [platformtype]),
0, 0
};
context = clCreateContext(contextProperties, 1, &device, NULL, NULL, &err);
![enter image description here][2]queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
//Load Kernel Source
FILE *fp;
const char fileName[] = "./MatMul_Kernel.cl";
size_t source_size;
char *source_str;
fp = fopen(fileName, "r");
if (!fp)
{
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char *)malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
// Create Program Object
program = clCreateProgramWithSource(context, 1, (const char **) &source_str,(const size_t *),
&source_size, &err);
// Build Program
err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
Naive(matrixA, matrixB, matrixC, size);
//Cleanup all memory
err = clFlush(queue);
err = clFinish(queue);
err = clReleaseProgram(program);
err = clReleaseCommandQueue(queue);
err = clReleaseContext(context);
// Display result of matrix multiplication
cout<<"Output for matrix C :"<<endl;
printMatrix(matrixC, size*size, size);
cout<<endl;
free(matrixA);
free(matrixB);
free(matrixC);
free(source_str);
return 0;
}
And here is the kernel code:
__kernel void naiveAlgorithm(__global float *A, __global float *B, __global float *C, int size) {
int tx = get_global_id(0); //2D Thread IDx
int ty = get_global_id(1); //2D Thread IDy
float sum = 0;
//Calculate result of one element of Matrix C
for (int k = 0; k < size; k++) {
sum += A[ty*size+k] * B[k*size+tx];
}
C[ty*size+tx] = sum;
}
And here is the image: