Shortest paths by BFS, porting a code from CUDA to openCL

Question

I am currently porting a CUDA code that finds shortest paths from each node to other nodes in a (undirected) graph. So basically, the CUDA code constructs a graph read from a text file. Then it proceeds to build adjancent arrays h_v and h_e.

For example
A B
A C
B C
gives 
h_v[0] = 0, h_e[0]=1
h_v[1] = 0, h_e[1]=2
h_v[2] = 1, h_e[2]=2

Then it calls the kernel to compute shortest paths from each node using BFS.

The cuda host code is as follow:

int cc_bfs(int n_count, int e_count, int *h_v, int *h_e, float *h_cc, bool ec){
  int *d_v, *d_e;


  cudaCheckError(cudaMalloc((void **)&d_v, sizeof(int)*e_count));
  cudaCheckError(cudaMalloc((void **)&d_e, sizeof(int)*e_count)); 

  cudaCheckError(cudaMemcpy(d_v, h_v, sizeof(int)*e_count, cudaMemcpyHostToDevice));
  cudaCheckError(cudaMemcpy(d_e, h_e, sizeof(int)*e_count, cudaMemcpyHostToDevice));

  int *d_d, *d_dist; 

  cudaCheckError(cudaMalloc((void **)&d_d, sizeof(int)*n_count));
  cudaCheckError(cudaMalloc((void **)&d_dist, sizeof(int)));

  int *h_d;
  h_d=(int *)malloc(sizeof(int)*n_count);
  bool *d_continue;
  cudaCheckError(cudaMalloc((void**)&d_continue, sizeof(bool)));

  for(int s=0; s<n_count; s++){ //BIG FOR LOOP

    //////code to initalize h_d[i]
    for(int i=0; i<n_count; i++)
      h_d[i]=-1;
    h_d[s]=0; //for marking the root
    cudaCheckError(cudaMemcpy(d_d, h_d, sizeof(int)*n_count, cudaMemcpyHostToDevice));
    //////////////////////////////

    ///////////////////////////////
    int threads_per_block=e_count;
    int blocks=1;
    if(e_count>MAX_THREADS_PER_BLOCK){
      blocks = (int)ceil(e_count/(float)MAX_THREADS_PER_BLOCK); 
      threads_per_block = MAX_THREADS_PER_BLOCK; 
    }
    dim3 grid(blocks);
    dim3 threads(threads_per_block);
    /////////////////////////////////

    bool h_continue;
    int h_dist=0;
    cudaCheckError(cudaMemset(d_dist, 0, sizeof(int)));
    do{
      h_continue=false;
      cudaCheckError(cudaMemcpy(d_continue, &h_continue, sizeof(bool), cudaMemcpyHostToDevice));
      cc_bfs_kernel<<<grid, threads>>>(d_v, d_e, d_d, d_continue, d_dist, e_count);
      checkCUDAError("Kernel invocation");
      cudaThreadSynchronize();
      h_dist++;
      cudaCheckError(cudaMemcpy(d_dist, &h_dist, sizeof(int), cudaMemcpyHostToDevice));//for what?
      cudaCheckError(cudaMemcpy(&h_continue, d_continue, sizeof(bool), cudaMemcpyDeviceToHost));
    }while(h_continue);

    ///////////////////
    //then code to read back h_d from device


}

And here is cuda kernel

__global__ void cc_bfs_kernel(int *d_v, int *d_e, int  *d_d,
    bool *d_continue, int *d_dist, int e_count){
  int tid = blockIdx.x*blockDim.x + threadIdx.x;
  if(tid<e_count){
    /* for each edge (u, w) */
    int u=d_v[tid];
    int w=d_e[tid];
    if(d_d[u]==*d_dist){ //of the interest root
      if(d_d[w]==-1){ //not yet check
        *d_continue=true; //continue
        d_d[w]=*d_dist+1; //increase
      }   
    }   
  }
}

Here is my effort to port it to openCL. I am just an amateur in openCL, so I am trying the best to port the original code line by line :(

openCL host code

cl_mem d_d= clCreateBuffer(context,CL_MEM_WRITE_ONLY| CL_MEM_USE_HOST_PTR,sizeof(int)*n_count, NULL,NULL);
cl_mem d_dist= clCreateBuffer(context,CL_MEM_READ_WRITE| CL_MEM_USE_HOST_PTR,sizeof(int), NULL,NULL);
int *h_d;
h_d=(int *)malloc(sizeof(int)*n_count);
cl_mem d_continue = clCreateBuffer(context,CL_MEM_READ_WRITE| CL_MEM_USE_HOST_PTR,sizeof(bool), NULL,NULL);
float* h_cc;
h_cc = (float *)malloc(sizeof(float)*n_count);

cl_mem d_v= clCreateBuffer(context,CL_MEM_READ_ONLY| CL_MEM_USE_HOST_PTR,sizeof(int)*e_count, NULL,NULL);
cl_mem d_e= clCreateBuffer(context,CL_MEM_READ_ONLY| CL_MEM_USE_HOST_PTR,sizeof(int)*e_count, NULL,NULL);

err = clEnqueueWriteBuffer(queue, d_v, CL_TRUE, 0, e_count * sizeof(int), host_v, 0, NULL, NULL);
err = clEnqueueWriteBuffer(queue, d_e, CL_TRUE, 0, e_count * sizeof(int), host_e, 0, NULL, NULL);

size_t global_size= e_count;

for(int s=0; s<n_count; s++)
{ //BIG LOOP

    //initalize h_d[i]
    for(int i=0; i<n_count; i++)
        h_d[i]=-1;
    h_d[s]=0;

    //copy h_d to d_d
     err = clEnqueueWriteBuffer(queue, d_d, CL_TRUE, 0,
        n_count * sizeof(int), h_d, 0, NULL, NULL);

    bool h_continue;
    int h_dist=0;
    int mark = 0;
    int* h_id;
    h_id= (int*) malloc(sizeof(int)*e_count);
    cl_mem id= clCreateBuffer(context,CL_MEM_WRITE_ONLY| CL_MEM_USE_HOST_PTR,
            sizeof(int)*e_count, NULL,NULL);


    do{
        h_continue=false;
        err = clEnqueueWriteBuffer(queue, d_continue, CL_TRUE, 0,
          sizeof(bool), &h_continue, 0, NULL, NULL);


        err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&d_v);
        err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&d_e);
        err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&d_d);
        err = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&d_continue);
        err = clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&d_dist);
        err = clSetKernelArg(kernel, 5, sizeof(int), (void *)&e_count);
        err = clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *)&id);

        /////EXECUTE
        cl_event sync1;
        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, 
            &global_size, NULL, 0, NULL, &sync1); //wait for this to finish (to synchronize)                
        err = clWaitForEvents(1, &sync1);
        clReleaseEvent(sync1);          
        ///////////////////


        err = clEnqueueReadBuffer(queue, id, CL_TRUE, 0,
        sizeof(int)*e_count, h_id, 0, NULL, NULL);

        printf("e_count = %d error : %d\n",e_count, err);//check error?

        for(int j = 0; j< e_count; j++)
        {
            printf("%d ",h_id[j]);
        }

        h_dist++;
        mark++;//for debug
        err = clEnqueueWriteBuffer(queue, d_dist, CL_TRUE, 0,
        sizeof(int), &h_dist, 0, NULL, NULL);
        err = clEnqueueReadBuffer(queue, d_continue, CL_TRUE, 0,
        sizeof(bool), &h_continue, 0, NULL, NULL);
    }
    while(h_continue);

    err = clEnqueueReadBuffer(queue, d_d, CL_TRUE, 0,
        n_count*sizeof(int), h_d, 0, NULL, NULL);

and openCL kernel

__kernel void cc_bfs_kernel(__global int *d_v, __global int *d_e, __global int  *d_d,
    __global bool *d_continue, __global int *d_dist, const int e_count, __global int *id)
{


        int tid = get_global_id(0)-get_global_offset(0);
        //barrier(CLK_GLOBAL_MEM_FENCE);

        for (int i = 0; i< e_count; i++)
        {
            id[i]=i;
        }

        if(tid<e_count){
            id[tid]= tid;
            /* for each edge (u, w) */
            int u=d_v[tid];
            int w=d_e[tid];
            if(d_d[u]==*d_dist){ //of the interest root
                if(d_d[w]==-1)
                { //not yet check 
                    *d_continue=true; //continue
                    d_d[w]=*d_dist+1; //increase
                }   
            }   
        }
}

The code cant give the correct result, so I debug it by printing some values (the tid inside the kernel, the marks value to check how many times the code goes through the while loop). Sadly, the tid gives rubbish values,and it goes through the while loop only once.Could you please pointing out what I am missing here?

I have another doubt: How can I do something similar as cudathreadsynchronize()? In this version of openCL, I associate clEnqueueNDRangeKernel with a command event and wait for it, but apparently I seems not to work :(

Thank you greatly.

Pragmateek Pragmateek · Accepted Answer · 2012-12-29T12:24:00

First you should ensure that every step is correct by checking the error codes.

As an example, AFAIK, this code is not valid :

clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, sizeof(int)*e_count, NULL,NULL)

because you are asking to use an allocated memory area whereas you're not providing any : the host_ptr parameter is indeed NULL.

Either remove this flag or if you really want host memory specify : CL_MEM_ALLOC_HOST_PTR.

Check the API documentation to know for each function how to retrieve status : either using the return value or a dedicated parameter (the last one) like clCreateBuffer does : http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clCreateBuffer.html

For your code it should give a CL_INVALID_HOST_PTR error.

Shortest paths by BFS, porting a code from CUDA to openCL

1 Answers