OpenCL - GPU sum and CPU sum not the same

Question

I'm new to OpenCL.

I wrote a program that should do parallel reduction on 1 million elements array. In the last part of the code I'm comparing the CPU sum and GPU sum , they are not the same, and that is the problem. My local size is 64. From index "90" the sum in the GPU is starting to get bigger.

EDIT: if i sum smaller number (now i sum 0 - 1m) suppose 1's the final sum is correct.

KERNEL:

__kernel void gpuSumfunc(  __global float *vec ,__global float* sum, int n)          
{
    __local float tempSum[64];

    const int i;                                                        
    const int globalID = get_global_id(0); //BLOCK_DIM*BLOCK_IND+THREAD_ID
    const int tid = get_local_id(0);         //THREAD_ID
    const int BlockDIM = get_local_size(0);//BLOCK_DIM=64

    if (globalID < n)
    {
        tempSum[tid] = vec[globalID];    //Inserting global data to local data

    }
    else
    {
        tempSum[tid] = 0;
    }
        barrier(CLK_LOCAL_MEM_FENCE);    //Wating for all the threads to copy their data

        for (i = BlockDIM / 2; i > 0; i /= 2)
        {

            if (tid < i)
            {

                tempSum[tid] += tempSum[tid + i];
            }
            barrier(CLK_LOCAL_MEM_FENCE);

        }

        if (tid == 0)
        {
            sum[get_group_id(0)] = tempSum[0];
        }

    }

MAIN:

//HOST-cpu
    float *h_a;//input
    float *h_b;//output
    float *h_s;
    //DEVICE-gpu
    cl_mem d_a;//input buffer
    cl_mem d_b;//Output

               //Kernel File
    FILE* fileKernel;

    //Memory allocation - cpu input 
    vector = (float*)malloc(n * sizeof(float));
    h_a = (float*)malloc(n * sizeof(float));
    h_b = (float*)malloc(n * sizeof(float));
    h_s = (float*)malloc(n * sizeof(float));

    *vector = { 0 };
    *h_a = { 0 };
    *h_b = { 0 };
    *h_s = { 0 };


    //Initializing Data for gpu
    for (i = 0; i < n; i++) {
        h_a[i] = i;//(float)i;
    }


    //Initializing Data for cpu
    for (i = 0; i < n; i++) {
        vector[i] = i;//(float)i;
    }
    fileKernel = fopen("KernelCode.cl", "r");
    if (!fileKernel)
    {
        printf("Cannot open kernel file!\n");
        exit(1);
    }

    // Read kernel code
    kernelSource = (char*)malloc(MAX_SOURCE_SIZE);
    source_size = fread(kernelSource, 1, MAX_SOURCE_SIZE, fileKernel);
    fclose(fileKernel);


    error = clGetPlatformIDs(2, cp_Platform, NULL); //array with two devices

    error = clGetDeviceIDs(cp_Platform[1], CL_DEVICE_TYPE_GPU, 1, &Device_ID, NULL); // cp_platform[1] = Nvidia GPU

    context = clCreateContext(NULL, 1, &Device_ID, NULL, NULL, &error); // creating openCL context 

    queue = clCreateCommandQueue(context, Device_ID, 0, &error); // creating command queue, executing openCL context on device cp_Platform[1] 


    globalSize = ceil(n / (float)localSize)*localSize;

    d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, n * sizeof(float), NULL, NULL);
    d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, n * sizeof(float), NULL, NULL);

    error = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0, n * sizeof(float), h_a, 0, NULL, NULL); //Enqueue commands to write to a buffer object from host memory.
    error |= clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0,n * sizeof(float), h_s, 0, NULL, NULL); //Enqueue commands to write to a buffer object from host memory.


    program = clCreateProgramWithSource(context, 1, (const char **)& kernelSource, (const size_t *)&source_size, &error); //this function creates a program object for this specific openCL context
    error = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); //compiles and links a program executable from the program source


    kernel = clCreateKernel(program, "gpuSumfunc", &error); //creating kernel object 
    error = clGetKernelWorkGroupInfo(kernel, Device_ID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void*)&workGroupSize, NULL);
    error = clGetKernelWorkGroupInfo(kernel, Device_ID, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), (void*)&pWorkGroupSize, NULL);
    error = clGetDeviceInfo(Device_ID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(NumOfCU), &NumOfCU, NULL);

    error |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a); //Used to set the argument value for a specific argument of a kernel.
    error |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
    error |= clSetKernelArg(kernel, 2, sizeof(int), &n);
    error |= clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL); // Enqueues a command to execute a kernel on a device.

    clFinish(queue);

    clEnqueueReadBuffer(queue, d_b, CL_TRUE, 0, n*sizeof(float) , h_b, 0, NULL, NULL); ////writing data from the device (d_b) to host(h_b)
    clock_t end = clock();

    for (i = 0; i < (n+localSize-1)/localSize; i++)
    {
        gpuSum += h_b[i];
        cpuSum = cpuSumfunc(vector, 64*(i+1));
    if ((gpuSum - cpuSum) > Tolerance)
        {
            printf("\nfailed! for index:%d",i);
            printf("\nCPU sum = %f", cpuSum);
            printf("\nGPU sum = %f\n", gpuSum);
        }
        else
        {
            printf("\nPassed! for index:%d",i);
            printf("\nCPU sum: %.2f", cpuSum);
            printf("\nGPU sum: %.2f\n", gpuSum);
        }
    }


    // cpu


    time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
    //printf("\nTotal program's running time is: %.2f\n", time_spent);

    free(h_a);
    free(h_b);
    free(h_s);
    free(vector);
    //free(kernelSource);
    clReleaseProgram(program);
    clReleaseContext(context);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(queue);
}

float cpuSumfunc(float * vec, int n)
{

    float sum = 0;
    int i;

    for (i = 0; i < n; i++)
    {
        sum += vec[i];

    }
    return sum;
}

This clearly isn't a CUDA related question, so why tag it as one? — talonmies
How much is the difference? Can you show some actual values? — rustyx
@RustyX Well at index 90 the gpu sum is bigger by 16 from cpu sum, but its get bigger and bigger; cpu sum =16956560 gpu sum =16956576 — MishaOk

DarkZeros DarkZeros · Accepted Answer · 2018-01-04T15:18:35

Float32 values are not accurate enough for your summation operation and there will be rounding errors which will be different in CPU and GPU devices.

16956560 needs 25bits for accurate representation. Float32 only provides 23bits of accuracy. Which means: 16956560 + 1 = 16956560 if the operation is performed in Float32.

The differences in both devices is that:

Ordering: CPU and GPU will sum in different order, having different rounding errors.
Accuracy: Most CPUs (x86 and so) use internal 48bits float math, and then save it as 32bits. While GPUs do all math in pure 32bits.

You can solve it by using Float64 (double) or using integers (int64_t = Long).

Note: Actually, your GPU sum is more accurate than the CPU one, since it first packs small values together, and then adds those big values with the final sum.

OpenCL - GPU sum and CPU sum not the same

1 Answers