I am currently porting a CUDA code that finds shortest paths from each node to other nodes in a (undirected) graph. So basically, the CUDA code constructs a graph read from a text file. Then it proceeds to build adjancent arrays h_v and h_e.
For example
A B
A C
B C
gives
h_v[0] = 0, h_e[0]=1
h_v[1] = 0, h_e[1]=2
h_v[2] = 1, h_e[2]=2
Then it calls the kernel to compute shortest paths from each node using BFS.
The cuda host code is as follow:
int cc_bfs(int n_count, int e_count, int *h_v, int *h_e, float *h_cc, bool ec){
int *d_v, *d_e;
cudaCheckError(cudaMalloc((void **)&d_v, sizeof(int)*e_count));
cudaCheckError(cudaMalloc((void **)&d_e, sizeof(int)*e_count));
cudaCheckError(cudaMemcpy(d_v, h_v, sizeof(int)*e_count, cudaMemcpyHostToDevice));
cudaCheckError(cudaMemcpy(d_e, h_e, sizeof(int)*e_count, cudaMemcpyHostToDevice));
int *d_d, *d_dist;
cudaCheckError(cudaMalloc((void **)&d_d, sizeof(int)*n_count));
cudaCheckError(cudaMalloc((void **)&d_dist, sizeof(int)));
int *h_d;
h_d=(int *)malloc(sizeof(int)*n_count);
bool *d_continue;
cudaCheckError(cudaMalloc((void**)&d_continue, sizeof(bool)));
for(int s=0; s<n_count; s++){ //BIG FOR LOOP
//////code to initalize h_d[i]
for(int i=0; i<n_count; i++)
h_d[i]=-1;
h_d[s]=0; //for marking the root
cudaCheckError(cudaMemcpy(d_d, h_d, sizeof(int)*n_count, cudaMemcpyHostToDevice));
//////////////////////////////
///////////////////////////////
int threads_per_block=e_count;
int blocks=1;
if(e_count>MAX_THREADS_PER_BLOCK){
blocks = (int)ceil(e_count/(float)MAX_THREADS_PER_BLOCK);
threads_per_block = MAX_THREADS_PER_BLOCK;
}
dim3 grid(blocks);
dim3 threads(threads_per_block);
/////////////////////////////////
bool h_continue;
int h_dist=0;
cudaCheckError(cudaMemset(d_dist, 0, sizeof(int)));
do{
h_continue=false;
cudaCheckError(cudaMemcpy(d_continue, &h_continue, sizeof(bool), cudaMemcpyHostToDevice));
cc_bfs_kernel<<<grid, threads>>>(d_v, d_e, d_d, d_continue, d_dist, e_count);
checkCUDAError("Kernel invocation");
cudaThreadSynchronize();
h_dist++;
cudaCheckError(cudaMemcpy(d_dist, &h_dist, sizeof(int), cudaMemcpyHostToDevice));//for what?
cudaCheckError(cudaMemcpy(&h_continue, d_continue, sizeof(bool), cudaMemcpyDeviceToHost));
}while(h_continue);
///////////////////
//then code to read back h_d from device
}
And here is cuda kernel
__global__ void cc_bfs_kernel(int *d_v, int *d_e, int *d_d,
bool *d_continue, int *d_dist, int e_count){
int tid = blockIdx.x*blockDim.x + threadIdx.x;
if(tid<e_count){
/* for each edge (u, w) */
int u=d_v[tid];
int w=d_e[tid];
if(d_d[u]==*d_dist){ //of the interest root
if(d_d[w]==-1){ //not yet check
*d_continue=true; //continue
d_d[w]=*d_dist+1; //increase
}
}
}
}
Here is my effort to port it to openCL. I am just an amateur in openCL, so I am trying the best to port the original code line by line :(
openCL host code
cl_mem d_d= clCreateBuffer(context,CL_MEM_WRITE_ONLY| CL_MEM_USE_HOST_PTR,sizeof(int)*n_count, NULL,NULL);
cl_mem d_dist= clCreateBuffer(context,CL_MEM_READ_WRITE| CL_MEM_USE_HOST_PTR,sizeof(int), NULL,NULL);
int *h_d;
h_d=(int *)malloc(sizeof(int)*n_count);
cl_mem d_continue = clCreateBuffer(context,CL_MEM_READ_WRITE| CL_MEM_USE_HOST_PTR,sizeof(bool), NULL,NULL);
float* h_cc;
h_cc = (float *)malloc(sizeof(float)*n_count);
cl_mem d_v= clCreateBuffer(context,CL_MEM_READ_ONLY| CL_MEM_USE_HOST_PTR,sizeof(int)*e_count, NULL,NULL);
cl_mem d_e= clCreateBuffer(context,CL_MEM_READ_ONLY| CL_MEM_USE_HOST_PTR,sizeof(int)*e_count, NULL,NULL);
err = clEnqueueWriteBuffer(queue, d_v, CL_TRUE, 0, e_count * sizeof(int), host_v, 0, NULL, NULL);
err = clEnqueueWriteBuffer(queue, d_e, CL_TRUE, 0, e_count * sizeof(int), host_e, 0, NULL, NULL);
size_t global_size= e_count;
for(int s=0; s<n_count; s++)
{ //BIG LOOP
//initalize h_d[i]
for(int i=0; i<n_count; i++)
h_d[i]=-1;
h_d[s]=0;
//copy h_d to d_d
err = clEnqueueWriteBuffer(queue, d_d, CL_TRUE, 0,
n_count * sizeof(int), h_d, 0, NULL, NULL);
bool h_continue;
int h_dist=0;
int mark = 0;
int* h_id;
h_id= (int*) malloc(sizeof(int)*e_count);
cl_mem id= clCreateBuffer(context,CL_MEM_WRITE_ONLY| CL_MEM_USE_HOST_PTR,
sizeof(int)*e_count, NULL,NULL);
do{
h_continue=false;
err = clEnqueueWriteBuffer(queue, d_continue, CL_TRUE, 0,
sizeof(bool), &h_continue, 0, NULL, NULL);
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&d_v);
err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&d_e);
err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&d_d);
err = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&d_continue);
err = clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&d_dist);
err = clSetKernelArg(kernel, 5, sizeof(int), (void *)&e_count);
err = clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *)&id);
/////EXECUTE
cl_event sync1;
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
&global_size, NULL, 0, NULL, &sync1); //wait for this to finish (to synchronize)
err = clWaitForEvents(1, &sync1);
clReleaseEvent(sync1);
///////////////////
err = clEnqueueReadBuffer(queue, id, CL_TRUE, 0,
sizeof(int)*e_count, h_id, 0, NULL, NULL);
printf("e_count = %d error : %d\n",e_count, err);//check error?
for(int j = 0; j< e_count; j++)
{
printf("%d ",h_id[j]);
}
h_dist++;
mark++;//for debug
err = clEnqueueWriteBuffer(queue, d_dist, CL_TRUE, 0,
sizeof(int), &h_dist, 0, NULL, NULL);
err = clEnqueueReadBuffer(queue, d_continue, CL_TRUE, 0,
sizeof(bool), &h_continue, 0, NULL, NULL);
}
while(h_continue);
err = clEnqueueReadBuffer(queue, d_d, CL_TRUE, 0,
n_count*sizeof(int), h_d, 0, NULL, NULL);
and openCL kernel
__kernel void cc_bfs_kernel(__global int *d_v, __global int *d_e, __global int *d_d,
__global bool *d_continue, __global int *d_dist, const int e_count, __global int *id)
{
int tid = get_global_id(0)-get_global_offset(0);
//barrier(CLK_GLOBAL_MEM_FENCE);
for (int i = 0; i< e_count; i++)
{
id[i]=i;
}
if(tid<e_count){
id[tid]= tid;
/* for each edge (u, w) */
int u=d_v[tid];
int w=d_e[tid];
if(d_d[u]==*d_dist){ //of the interest root
if(d_d[w]==-1)
{ //not yet check
*d_continue=true; //continue
d_d[w]=*d_dist+1; //increase
}
}
}
}
The code cant give the correct result, so I debug it by printing some values (the tid inside the kernel, the marks value to check how many times the code goes through the while loop). Sadly, the tid gives rubbish values,and it goes through the while loop only once.Could you please pointing out what I am missing here?
I have another doubt: How can I do something similar as cudathreadsynchronize()? In this version of openCL, I associate clEnqueueNDRangeKernel with a command event and wait for it, but apparently I seems not to work :(
Thank you greatly.