0
votes

I am starting out OpenCL by converting existing C codes to an OpenCL. I am getting strange results with the both CPU and GPU calculation. Their values change 'every time' when I run the code. When I compare with the normal C, I would get 'somewhat' acceptable results from the CPU (but, still the results are not identical with the that of native C or even other languages), but when I run the 'exact same' code with GPU, I get gibberish results.

Here is my code on the Host

#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
#include <math.h>

double *arange(double start, double end, double step)
{
   // 'arange' routine.
   int i;
   int arr_size = ((end - start) / step) + 1;
   double *output = malloc(arr_size * sizeof(double));

   for(i=0;i<arr_size;i++)
   {
      output[i] = start + (step * i);
   }

   return output;
}

int main()
{
   // This code executes on the OpenCL Host

   // Host data
   double nu_ini = 100.0, nu_end = 2000.0, nu_step = 1.0;
   double *delnu = arange(nu_ini, nu_end, nu_step);
   double *nu, *inten, A, *gam_air, gam_self, E_pprime, *n_air, *del_air;
   double *gamma, *f; 
   double prs = 950.0;

   int i, j, dum, lines=0, ID, delnu_size = (((nu_end - nu_ini)/nu_step) + 1);
   FILE *fp = fopen("h2o_HITRAN.par","r");
   char string[320];


   while(!feof(fp))
   {
     dum = fgetc(fp);
     if(dum == '\n')
     {
       lines++;
     }
   }

   rewind(fp);

   nu       = malloc(lines * sizeof(double));
   inten    = malloc(lines * sizeof(double));
   gam_air  = malloc(lines * sizeof(double));
   n_air    = malloc(lines * sizeof(double));
   del_air  = malloc(lines * sizeof(double));
   gamma    = malloc(lines * sizeof(double));
   f        = malloc(delnu_size * sizeof(double));

   i=0;
   while(fgets(string, 320, fp))
   {
      sscanf(string, "%2d %12lf %10le %10le %5lf %5lf %10lf %4lf %8lf", &ID, &nu[i], &inten[i], &A, &gam_air[i], &gam_self, &E_pprime, &n_air[i], &del_air[i]);
      i++;
   }

   size_t line_siz = sizeof(double) * lines;
   size_t delnu_siz = sizeof(double) * delnu_size;

   // gamma calculation
   for(i=0;i<lines;i++)
   {
      gamma[i] = pow((296.0/300.0),n_air[i]) * (gam_air[i]*(prs/1013.0));
   }


   // Use this to check the output of each API call
   cl_int status;

   // Retrieve the number of Platforms
   cl_uint numPlatforms = 0;
   status = clGetPlatformIDs(0, NULL, &numPlatforms);

   // Allocate enough space for each Platform
   cl_platform_id *platforms = NULL;
   platforms = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id));

   // Fill in the Platforms
   status = clGetPlatformIDs(numPlatforms, platforms, NULL);

   // Retrieve the number of Devices
   cl_uint numDevices = 0;
   status = clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);

   // Allocate enough spaces for each Devices
   char name_data[100];
   int *comp_units;
   cl_device_fp_config cfg;
   cl_device_id *devices = NULL;
   devices = (cl_device_id*)malloc(numDevices*sizeof(cl_device_id));

   // Fill in the Devices
   status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);

   // Create a context and associate it with the devices
   cl_context context = NULL;
   context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);

   // Create a command queue and associate it with the devices
   cl_command_queue cmdQueue = NULL;
   cmdQueue = clCreateCommandQueueWithProperties(context, devices[0], 0, &status);



   // Create a buffer objects that will contain the data from the host array 'buf_xxxx'
   cl_mem buf_inten     = NULL;
   cl_mem buf_gamma     = NULL;
   cl_mem buf_delnu     = NULL;
   cl_mem buf_nu        = NULL;
   cl_mem buf_del_air   = NULL;
   cl_mem buf_f         = NULL;

   buf_inten   = clCreateBuffer(context, CL_MEM_READ_ONLY, line_siz, NULL, &status);
   buf_gamma   = clCreateBuffer(context, CL_MEM_READ_ONLY, line_siz, NULL, &status);
   buf_delnu   = clCreateBuffer(context, CL_MEM_READ_ONLY, delnu_siz, NULL, &status);
   buf_nu      = clCreateBuffer(context, CL_MEM_READ_ONLY, line_siz, NULL, &status);
   buf_del_air = clCreateBuffer(context, CL_MEM_READ_ONLY, line_siz, NULL, &status);
   buf_f       = clCreateBuffer(context, CL_MEM_READ_ONLY, delnu_siz, NULL, &status);



   // Write input array A to the Device buffer 'buf_xxx'
   status = clEnqueueWriteBuffer(cmdQueue, buf_inten, CL_FALSE, 0, line_siz, inten, 0, NULL, NULL);
   status = clEnqueueWriteBuffer(cmdQueue, buf_gamma, CL_FALSE, 0, line_siz, gamma, 0, NULL, NULL);
   status = clEnqueueWriteBuffer(cmdQueue, buf_delnu, CL_FALSE, 0, delnu_siz, delnu, 0, NULL, NULL);
   status = clEnqueueWriteBuffer(cmdQueue, buf_nu, CL_FALSE, 0, line_siz, nu, 0, NULL, NULL);
   status = clEnqueueWriteBuffer(cmdQueue, buf_del_air, CL_FALSE, 0, line_siz, del_air, 0, NULL, NULL);


   // Create Program with the source code
   cl_program program = NULL;
   size_t program_size;
   char *program_Source;
   FILE *program_handle = fopen("abs_calc.cl","r");

   fseek(program_handle, 0, SEEK_END);
   program_size = ftell(program_handle);
   rewind(program_handle);
   program_Source = (char*)malloc(program_size+1);
   program_Source[program_size] = '\0';
   fread(program_Source, sizeof(char), program_size, program_handle);
   fclose(program_handle);

   program = clCreateProgramWithSource(context, 1, (const char**)&program_Source, &program_size, &status);

   // Compile the Program for the Device
   status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL);

   // Create the vector addition kernel
   cl_kernel kernel = NULL;
   kernel = clCreateKernel(program, "abs_cross", &status);

   // Associate the input and output buffers with the kernel
   status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_inten);
   status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_gamma);
   status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &buf_delnu);
   status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &buf_nu);
   status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &buf_del_air);
   status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &buf_f);

   // Define index space (global work size) of work items for execution.
   // A workgroup size (local work size) is not required, but can be used.
   size_t globalWorkSize[2] = {lines, delnu_size};


   // Execute the kernel for execution
   status = clEnqueueNDRangeKernel(cmdQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);

   // Read the Device output buffer to the host output array
   clEnqueueReadBuffer(cmdQueue, buf_f, CL_TRUE, 0, delnu_siz, f, 0, NULL, NULL);

   // Verify the output
   FILE *file = fopen("opencl_output","w");

   for(i=0;i<delnu_size;i++)
   {
      fprintf(file, "%le %le\n", delnu[i], f[i]);
   }

   // Free OpenCL resources
   clReleaseKernel(kernel);
   clReleaseProgram(program);
   clReleaseCommandQueue(cmdQueue);
   clReleaseMemObject(buf_nu);
   clReleaseMemObject(buf_inten);
   clReleaseMemObject(buf_del_air);
   clReleaseMemObject(buf_gamma);
   clReleaseMemObject(buf_f);
   clReleaseMemObject(buf_delnu);
   clReleaseContext(context);

   // Free host resources
   free(nu);
   free(inten);
   free(gam_air);
   free(n_air);
   free(del_air);
   free(delnu);
   free(gamma);
   free(f);
   free(platforms);
   free(devices);
   fclose(fp);
   fclose(file);
   return 0;
}

and this is my kernel code

#pragma OPENCL EXTENSION cl_khr_fp64 : enable
kernel void abs_cross(global double *inten,
                      global double *gamma,
                      global double *delnu,
                      global double *nu,
                      global double *del_air,
                      global double *f)
{
   double pie = 4.0*atan(1.0);

   int i = get_global_id(0);
   int j = get_global_id(1);

   f[j] += inten[i] * ((1.0/pie) * (gamma[i] / (pown(gamma[i],2) + pown((delnu[j] - nu[i] + del_air[i] * 950.0/1013.0),2))));

}

Am I doing something wrong?

Thank you.

3
What GPU are you using? Are you sure it supports the cl_khr_fp64 extension?Jovasa
As per OpenCL specification minimum accuracy in ULP for pown is 4 ulp. Math library normally has more precision and likely returns correctly rounded results i.e 0.5ulp. Could you add some reference code for actual computation your are doing? Inside kernel output is written to f[j] and it seems wrong, you might need atomic addition.kanna

3 Answers

2
votes

You appear to be running a 2D global work size, but storing into a location based only on dimension 1 (not 0). Therefore multiple work items are storing into the same location using +=. You have a race condition. You could use atomics to solve this, but it will likely slow the performance down too much. Therefore, you should store intermediate results and then do a parallel reduction operation.

0
votes

I am using AMD W2100, and yes, I have printed out all the supported extension and it included cl_khr_fp64 extension.

Sorry, I forgot to include the original calculation. The actual calculation goes like the following..

for(i=0,i<lines;i++)
{
for(j=0;j<delnu_size;j++)
{
  f[j] += inten[i] * ((1.0/pie) * (gamma[i] / (pow(gamma[i],2) + pow((delnu[j] - nu[i] + del_air[i] * 950.0/1013.0),2))));
}
}
0
votes

I would write OpenCL kernel as below, Without using atomics and only single work dimension. global_work_size = delnu_size There could be a better way but its the simplest one.

__kernel void test(__global double *gamma, 
                   __global double *inten,
                   __global double *delnu,
                   __global double *delair,
                   __global double *f,
                   const int lines)
{
   double pie = 4.0*atan(1.0);
   int j = get_global_id(0);
   f[j] = 0;
   for(i=0,i<lines;i++)
   {
     f[j] += inten[i] * ((1.0/pie) * (gamma[i] / (pow(gamma[i],2) + pow((delnu[j] - nu[i] + del_air[i] * 950.0/1013.0),2))));
   }
}

You need to understand how OpenCL kernel is executed. You can think of it as large number of threads executing concurrently and each thread could be identified with get_global_id