1
votes

I'm a beginner in OpenCL and am trying to run the sample codes of the "OpenLC in Action" book. I have the following code to get the preferred vector width of my device. The platforms detected on my computer are from Intel Core i7 and HD graphics and another one from NVIDIA GeForce 940M. Whenever I run the code, it gives "1" for vector width of every type unless type double which is zero because it is not supported. Even when I change the platform in my computer to check its devices, results are the same. I ran the code on an AMD computer and it seemed to work properly because it gave me different numbers for different types. But, I am not sure why this code keeps giving me "1" for every type on different platforms of my computer. Any ideas? Here is the output: enter image description here

Here is the code:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <CL/cl.h>

int main(){

    cl_int err, i, j;
    cl_platform_id *platforms;
    cl_device_id *devices;
    cl_uint num_platforms, num_devices, vector_width;
    size_t plat_name_size, devi_name_size;
    char *plat_name_data, *devi_name_data;


    err = clGetPlatformIDs(1, NULL, &num_platforms);
    if (err < 0){
        perror("No platform is found");
        exit(1);
    }
    platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id)*num_platforms);
    clGetPlatformIDs(num_platforms, platforms, NULL);

    printf("Number of found platforms is %d\n ", num_platforms);

    for (i = 0; i < num_platforms; i++){

        err = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 0, NULL, &plat_name_size);
        if (err < 0){
            perror("Couldn't read platform name.");
            exit(1);
        }
        plat_name_data = (char*)malloc(plat_name_size);
        clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, plat_name_size, plat_name_data, NULL);
        printf("Platform No.%d is: %s\n", i, plat_name_data);

        err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 1, NULL, &num_devices);
        if (err < 0){
            perror("No device is found in this platform");
            exit(1);
        }
        devices = (cl_device_id*)malloc(sizeof(cl_device_id)*(num_devices));
        clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
        printf("Number of devices found in this platform is: %d\n", num_devices);
        for (j = 0; j < num_devices; j++){
            err = clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 0, NULL, &devi_name_size);
            if (err < 0){
                perror("Couldn't read the device name.");
                exit(1);
            }
            devi_name_data = (char*)malloc(devi_name_size);
            clGetDeviceInfo(devices[j], CL_DEVICE_NAME, devi_name_size, devi_name_data, NULL);
            printf("Device No.%d name is: %s\n", j + 1, devi_name_data);
            if (strstr(devi_name_data, "GeForce 940M")){
                clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, 
                    sizeof(cl_uint), &vector_width, NULL);
                printf("Preferred vector width in chars: %u\n", vector_width);
                clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT,
                    sizeof(cl_uint), &vector_width, NULL);
                printf("Preferred vector width in shorts: %u\n", vector_width);
                clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT,
                    sizeof(cl_uint), &vector_width, NULL);
                printf("Preferred vector width in ints: %u\n", vector_width);
                clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG,
                    sizeof(cl_uint), &vector_width, NULL);
                printf("Preferred vector width in longs: %u\n", vector_width);
                clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT,
                    sizeof(cl_uint), &vector_width, NULL);
                printf("Preferred vector width in floats: %u\n", vector_width);
                clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE,
                    sizeof(cl_uint), &vector_width, NULL);
                printf("Preferred vector width in doubles: %u\n", vector_width);
            }
        }

    }
    return 0;
}
2

2 Answers

2
votes

Short answer: You are querying it correctly, and the platform compiler knows what is the best vector width size. So yes, it is correct the value of 1.

Long answer: For a CPU (any type of CPU) it is likely to prefer non vectored. Specially on Intel CPU + Compiler, since the Intel compiler does the vectorization as part of the optimization process, so it prefers the user NOT to vectorize the code in the first place.

Indeed it looks like nVIDIA also prefers the user to input non vectorized code. It does not mean code will run slower if vectorized already. It just means the compiler (due to the optimization techniques it has) prefers the code to be unvectorized.

Updates to the OpenCL drivers may lead to a change of these values. Also, you should take them as orientative. Other factors as: local memory usage, coalesced global access, local size, etc... are way more important usually.

2
votes

Here is one experiment that I've done to see how vectorized operations perform in a device which prefers to do scalar operations. I have implemented the reduction algorithm with two different kernels. The first kernel treats data as scalars while the second treats data as float4 vectors (Codes are given below). Here is the execution results. It is clear that although the NVIDIA device prefers non-vectorized operation, vectorized operation is faster.

Preferred vector width: 1 reduction_scalar: Check passed. Total time = 4471424 reduction_vector: Check passed. Total time = 1723776

And here is the code:

#define _CRT_SECURE_NO_WARNINGS
#define PROGRAM_FILE "reduction.cl"

#define ARRAY_SIZE 1048576
#define NUM_KERNELS 2

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif

/* Find a GPU or CPU associated with the first available platform */
cl_device_id create_device() {

    cl_platform_id platform;
    cl_device_id dev;
    int err;

    /* Identify a platform */
    err = clGetPlatformIDs(1, &platform, NULL);
    if (err < 0) {
        perror("Couldn't identify a platform");
        exit(1);
    }

    /* Access a device */
    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, NULL);
    if (err == CL_DEVICE_NOT_FOUND) {
        printf(" GPU is not first! Going on CPU :(");
        err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &dev, NULL);
    }
    if (err < 0) {
        perror("Couldn't access any devices");
        exit(1);
    }

    return dev;
}

/* Create program from a file and compile it */
cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {

    cl_program program;
    FILE *program_handle;
    char *program_buffer, *program_log;
    size_t program_size, log_size;
    int err;

    /* Read program file and place content into buffer */
    program_handle = fopen(filename, "r");
    if (program_handle == NULL) {
        perror("Couldn't find the program file");
        exit(1);
    }
    fseek(program_handle, 0, SEEK_END);
    program_size = ftell(program_handle);
    rewind(program_handle);
    program_buffer = (char*)malloc(program_size + 1);
    program_buffer[program_size] = '\0';
    fread(program_buffer, sizeof(char), program_size, program_handle);
    fclose(program_handle);

    /* Create program from file */
    program = clCreateProgramWithSource(ctx, 1,
        (const char**)&program_buffer, &program_size, &err);
    if (err < 0) {
        perror("Couldn't create the program");
        exit(1);
    }
    free(program_buffer);

    /* Build program */
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err < 0) {

        /* Find size of log and print to std output */
        clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
            0, NULL, &log_size);
        program_log = (char*)malloc(log_size + 1);
        program_log[log_size] = '\0';
        clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
            log_size + 1, program_log, NULL);
        printf("%s\n", program_log);
        free(program_log);
        exit(1);
    }

    return program;
}

int main() {

    /* OpenCL structures */
    cl_device_id device;
    cl_context context;
    cl_program program;
    cl_kernel kernel[NUM_KERNELS];
    cl_command_queue queue;
    cl_event prof_event;
    cl_int i, j, err, preferred_width;
    size_t local_size, global_size;
    char kernel_names[NUM_KERNELS][20] =
    { "reduction_scalar", "reduction_vector" };

    /* Data and buffers */
    float *data = (float *)malloc(sizeof(float)* ARRAY_SIZE);
    //float data[ARRAY_SIZE];
    float sum, actual_sum, *scalar_sum, *vector_sum;
    cl_mem data_buffer, scalar_sum_buffer, vector_sum_buffer;
    cl_int num_groups;
    cl_ulong time_start, time_end, total_time;

    /* Initialize data */
    for (i = 0; i<ARRAY_SIZE; i++) {
        data[i] = 1.0f*i;
    }

    /* Create device and determine local size */
    device = create_device();
    clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT,
        sizeof(preferred_width), &preferred_width, NULL);
    printf("Preferred vector width: %d\n", preferred_width);
    err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
        sizeof(local_size), &local_size, NULL);
    if (err < 0) {
        perror("Couldn't obtain device information");
        exit(1);
    }

    /* Allocate and initialize output arrays */
    num_groups = ARRAY_SIZE / local_size;
    scalar_sum = (float*)malloc(num_groups * sizeof(float));
    vector_sum = (float*)malloc(num_groups / 4 * sizeof(float));
    for (i = 0; i<num_groups; i++) {
        scalar_sum[i] = 0.0f;
    }
    for (i = 0; i<num_groups / 4; i++) {
        vector_sum[i] = 0.0f;
    }

    /* Create a context */
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    if (err < 0) {
        perror("Couldn't create a context");
        exit(1);
    }

    /* Build program */
    program = build_program(context, device, PROGRAM_FILE);

    /* Create data buffer */
    data_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
        CL_MEM_COPY_HOST_PTR, ARRAY_SIZE * sizeof(float), data, &err);
    scalar_sum_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
        CL_MEM_COPY_HOST_PTR, num_groups * sizeof(float), scalar_sum, &err);
    vector_sum_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
        CL_MEM_COPY_HOST_PTR, num_groups * sizeof(float), vector_sum, &err);
    if (err < 0) {
        perror("Couldn't create a buffer");
        exit(1);
    };

    /* Create a command queue */
    queue = clCreateCommandQueue(context, device,
        CL_QUEUE_PROFILING_ENABLE, &err);
    if (err < 0) {
        perror("Couldn't create a command queue");
        exit(1);
    };

    for (i = 0; i<NUM_KERNELS; i++) {

        /* Create a kernel */
        kernel[i] = clCreateKernel(program, kernel_names[i], &err);
        if (err < 0) {
            perror("Couldn't create a kernel");
            exit(1);
        };

        /* Create kernel arguments */
        err = clSetKernelArg(kernel[i], 0, sizeof(cl_mem), &data_buffer);
        if (i == 0) {
            global_size = ARRAY_SIZE;
            err |= clSetKernelArg(kernel[i], 1, local_size * sizeof(float), NULL);
            err |= clSetKernelArg(kernel[i], 2, sizeof(cl_mem), &scalar_sum_buffer);
        }
        else {
            global_size = ARRAY_SIZE / 4;
            err |= clSetKernelArg(kernel[i], 1, local_size * 4 * sizeof(float), NULL);
            err |= clSetKernelArg(kernel[i], 2, sizeof(cl_mem), &vector_sum_buffer);
        }
        if (err < 0) {
            perror("Couldn't create a kernel argument");
            exit(1);
        }

        /* Enqueue kernel */
        err = clEnqueueNDRangeKernel(queue, kernel[i], 1, NULL, &global_size,
            &local_size, 0, NULL, &prof_event);
        if (err < 0) {
            perror("Couldn't enqueue the kernel");
            exit(1);
        }

        /* Finish processing the queue and get profiling information */
        clFinish(queue);
        clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_START,
            sizeof(time_start), &time_start, NULL);
        clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_END,
            sizeof(time_end), &time_end, NULL);
        total_time = time_end - time_start;

        /* Read the result */
        if (i == 0) {
            err = clEnqueueReadBuffer(queue, scalar_sum_buffer, CL_TRUE, 0,
                num_groups * sizeof(float), scalar_sum, 0, NULL, NULL);
            if (err < 0) {
                perror("Couldn't read the buffer");
                exit(1);
            }
            sum = 0.0f;
            for (j = 0; j<num_groups; j++) {
                sum += scalar_sum[j];
            }
        }
        else {
            err = clEnqueueReadBuffer(queue, vector_sum_buffer, CL_TRUE, 0,
                num_groups / 4 * sizeof(float), vector_sum, 0, NULL, NULL);
            if (err < 0) {
                perror("Couldn't read the buffer");
                exit(1);
            }
            sum = 0.0f;
            for (j = 0; j<num_groups / 4; j++) {
                sum += vector_sum[j];
            }
        }

        /* Check result */
        printf("%s: ", kernel_names[i]);
        actual_sum = 1.0f * ARRAY_SIZE / 2 * (ARRAY_SIZE - 1);
        if (fabs(sum - actual_sum) > 0.01*fabs(sum))
            printf("Check failed.\n");
        else
            printf("Check passed.\n");
        printf("Total time = %lu\n\n", total_time);

        /* Deallocate event */
        clReleaseEvent(prof_event);
    }

    /* Deallocate resources */
    free(scalar_sum);
    free(vector_sum);
    for (i = 0; i<NUM_KERNELS; i++) {
        clReleaseKernel(kernel[i]);
    }
    clReleaseMemObject(scalar_sum_buffer);
    clReleaseMemObject(vector_sum_buffer);
    clReleaseMemObject(data_buffer);
    clReleaseCommandQueue(queue);
    clReleaseProgram(program);
    clReleaseContext(context);
    return 0;
}

and the kernels:

__kernel void reduction_scalar(__global float* data, 
      __local float* partial_sums, __global float* output) {

   int lid = get_local_id(0);
   int group_size = get_local_size(0);

   partial_sums[lid] = data[get_global_id(0)];
   barrier(CLK_LOCAL_MEM_FENCE);

   for(int i = group_size/2; i>0; i >>= 1) {
      if(lid < i) {
         partial_sums[lid] += partial_sums[lid + i];
      }
      barrier(CLK_LOCAL_MEM_FENCE);
   }

   if(lid == 0) {
      output[get_group_id(0)] = partial_sums[0];
   }
}

__kernel void reduction_vector(__global float4* data, 
      __local float4* partial_sums, __global float* output) {

   int lid = get_local_id(0);
   int group_size = get_local_size(0);

   partial_sums[lid] = data[get_global_id(0)];
   barrier(CLK_LOCAL_MEM_FENCE);

   for(int i = group_size/2; i>0; i >>= 1) {
      if(lid < i) {
         partial_sums[lid] += partial_sums[lid + i];
      }
      barrier(CLK_LOCAL_MEM_FENCE);
   }

   if(lid == 0) {
      output[get_group_id(0)] = dot(partial_sums[0], (float4)(1.0f));
   }
}