OpenCL GPU calculation wrong

Question

I am starting out OpenCL by converting existing C codes to an OpenCL. I am getting strange results with the both CPU and GPU calculation. Their values change 'every time' when I run the code. When I compare with the normal C, I would get 'somewhat' acceptable results from the CPU (but, still the results are not identical with the that of native C or even other languages), but when I run the 'exact same' code with GPU, I get gibberish results.

Here is my code on the Host

#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
#include <math.h>

double *arange(double start, double end, double step)
{
   // 'arange' routine.
   int i;
   int arr_size = ((end - start) / step) + 1;
   double *output = malloc(arr_size * sizeof(double));

   for(i=0;i<arr_size;i++)
   {
      output[i] = start + (step * i);
   }

   return output;
}

int main()
{
   // This code executes on the OpenCL Host

   // Host data
   double nu_ini = 100.0, nu_end = 2000.0, nu_step = 1.0;
   double *delnu = arange(nu_ini, nu_end, nu_step);
   double *nu, *inten, A, *gam_air, gam_self, E_pprime, *n_air, *del_air;
   double *gamma, *f; 
   double prs = 950.0;

   int i, j, dum, lines=0, ID, delnu_size = (((nu_end - nu_ini)/nu_step) + 1);
   FILE *fp = fopen("h2o_HITRAN.par","r");
   char string[320];


   while(!feof(fp))
   {
     dum = fgetc(fp);
     if(dum == '\n')
     {
       lines++;
     }
   }

   rewind(fp);

   nu       = malloc(lines * sizeof(double));
   inten    = malloc(lines * sizeof(double));
   gam_air  = malloc(lines * sizeof(double));
   n_air    = malloc(lines * sizeof(double));
   del_air  = malloc(lines * sizeof(double));
   gamma    = malloc(lines * sizeof(double));
   f        = malloc(delnu_size * sizeof(double));

   i=0;
   while(fgets(string, 320, fp))
   {
      sscanf(string, "%2d %12lf %10le %10le %5lf %5lf %10lf %4lf %8lf", &ID, &nu[i], &inten[i], &A, &gam_air[i], &gam_self, &E_pprime, &n_air[i], &del_air[i]);
      i++;
   }

   size_t line_siz = sizeof(double) * lines;
   size_t delnu_siz = sizeof(double) * delnu_size;

   // gamma calculation
   for(i=0;i<lines;i++)
   {
      gamma[i] = pow((296.0/300.0),n_air[i]) * (gam_air[i]*(prs/1013.0));
   }


   // Use this to check the output of each API call
   cl_int status;

   // Retrieve the number of Platforms
   cl_uint numPlatforms = 0;
   status = clGetPlatformIDs(0, NULL, &numPlatforms);

   // Allocate enough space for each Platform
   cl_platform_id *platforms = NULL;
   platforms = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id));

   // Fill in the Platforms
   status = clGetPlatformIDs(numPlatforms, platforms, NULL);

   // Retrieve the number of Devices
   cl_uint numDevices = 0;
   status = clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);

   // Allocate enough spaces for each Devices
   char name_data[100];
   int *comp_units;
   cl_device_fp_config cfg;
   cl_device_id *devices = NULL;
   devices = (cl_device_id*)malloc(numDevices*sizeof(cl_device_id));

   // Fill in the Devices
   status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);

   // Create a context and associate it with the devices
   cl_context context = NULL;
   context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);

   // Create a command queue and associate it with the devices
   cl_command_queue cmdQueue = NULL;
   cmdQueue = clCreateCommandQueueWithProperties(context, devices[0], 0, &status);



   // Create a buffer objects that will contain the data from the host array 'buf_xxxx'
   cl_mem buf_inten     = NULL;
   cl_mem buf_gamma     = NULL;
   cl_mem buf_delnu     = NULL;
   cl_mem buf_nu        = NULL;
   cl_mem buf_del_air   = NULL;
   cl_mem buf_f         = NULL;

   buf_inten   = clCreateBuffer(context, CL_MEM_READ_ONLY, line_siz, NULL, &status);
   buf_gamma   = clCreateBuffer(context, CL_MEM_READ_ONLY, line_siz, NULL, &status);
   buf_delnu   = clCreateBuffer(context, CL_MEM_READ_ONLY, delnu_siz, NULL, &status);
   buf_nu      = clCreateBuffer(context, CL_MEM_READ_ONLY, line_siz, NULL, &status);
   buf_del_air = clCreateBuffer(context, CL_MEM_READ_ONLY, line_siz, NULL, &status);
   buf_f       = clCreateBuffer(context, CL_MEM_READ_ONLY, delnu_siz, NULL, &status);



   // Write input array A to the Device buffer 'buf_xxx'
   status = clEnqueueWriteBuffer(cmdQueue, buf_inten, CL_FALSE, 0, line_siz, inten, 0, NULL, NULL);
   status = clEnqueueWriteBuffer(cmdQueue, buf_gamma, CL_FALSE, 0, line_siz, gamma, 0, NULL, NULL);
   status = clEnqueueWriteBuffer(cmdQueue, buf_delnu, CL_FALSE, 0, delnu_siz, delnu, 0, NULL, NULL);
   status = clEnqueueWriteBuffer(cmdQueue, buf_nu, CL_FALSE, 0, line_siz, nu, 0, NULL, NULL);
   status = clEnqueueWriteBuffer(cmdQueue, buf_del_air, CL_FALSE, 0, line_siz, del_air, 0, NULL, NULL);


   // Create Program with the source code
   cl_program program = NULL;
   size_t program_size;
   char *program_Source;
   FILE *program_handle = fopen("abs_calc.cl","r");

   fseek(program_handle, 0, SEEK_END);
   program_size = ftell(program_handle);
   rewind(program_handle);
   program_Source = (char*)malloc(program_size+1);
   program_Source[program_size] = '\0';
   fread(program_Source, sizeof(char), program_size, program_handle);
   fclose(program_handle);

   program = clCreateProgramWithSource(context, 1, (const char**)&program_Source, &program_size, &status);

   // Compile the Program for the Device
   status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL);

   // Create the vector addition kernel
   cl_kernel kernel = NULL;
   kernel = clCreateKernel(program, "abs_cross", &status);

   // Associate the input and output buffers with the kernel
   status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_inten);
   status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_gamma);
   status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &buf_delnu);
   status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &buf_nu);
   status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &buf_del_air);
   status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &buf_f);

   // Define index space (global work size) of work items for execution.
   // A workgroup size (local work size) is not required, but can be used.
   size_t globalWorkSize[2] = {lines, delnu_size};


   // Execute the kernel for execution
   status = clEnqueueNDRangeKernel(cmdQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);

   // Read the Device output buffer to the host output array
   clEnqueueReadBuffer(cmdQueue, buf_f, CL_TRUE, 0, delnu_siz, f, 0, NULL, NULL);

   // Verify the output
   FILE *file = fopen("opencl_output","w");

   for(i=0;i<delnu_size;i++)
   {
      fprintf(file, "%le %le\n", delnu[i], f[i]);
   }

   // Free OpenCL resources
   clReleaseKernel(kernel);
   clReleaseProgram(program);
   clReleaseCommandQueue(cmdQueue);
   clReleaseMemObject(buf_nu);
   clReleaseMemObject(buf_inten);
   clReleaseMemObject(buf_del_air);
   clReleaseMemObject(buf_gamma);
   clReleaseMemObject(buf_f);
   clReleaseMemObject(buf_delnu);
   clReleaseContext(context);

   // Free host resources
   free(nu);
   free(inten);
   free(gam_air);
   free(n_air);
   free(del_air);
   free(delnu);
   free(gamma);
   free(f);
   free(platforms);
   free(devices);
   fclose(fp);
   fclose(file);
   return 0;
}

and this is my kernel code

#pragma OPENCL EXTENSION cl_khr_fp64 : enable
kernel void abs_cross(global double *inten,
                      global double *gamma,
                      global double *delnu,
                      global double *nu,
                      global double *del_air,
                      global double *f)
{
   double pie = 4.0*atan(1.0);

   int i = get_global_id(0);
   int j = get_global_id(1);

   f[j] += inten[i] * ((1.0/pie) * (gamma[i] / (pown(gamma[i],2) + pown((delnu[j] - nu[i] + del_air[i] * 950.0/1013.0),2))));

}

Am I doing something wrong?

Thank you.

What GPU are you using? Are you sure it supports the cl_khr_fp64 extension? — Jovasa
As per OpenCL specification minimum accuracy in ULP for pown is 4 ulp. Math library normally has more precision and likely returns correctly rounded results i.e 0.5ulp. Could you add some reference code for actual computation your are doing? Inside kernel output is written to f[j] and it seems wrong, you might need atomic addition. — kanna

Dithermaster Dithermaster · Accepted Answer · 2017-03-14T20:03:38

You appear to be running a 2D global work size, but storing into a location based only on dimension 1 (not 0). Therefore multiple work items are storing into the same location using +=. You have a race condition. You could use atomics to solve this, but it will likely slow the performance down too much. Therefore, you should store intermediate results and then do a parallel reduction operation.

OpenCL GPU calculation wrong

3 Answers