0
votes

I have been working on a C/C++ OpenCL solution for the past few weeks. For my solution, I need to pass a class from the CPU (host) to the GPU (device). When I try to pass the class as a kernel argument, I get the error "Unknown Type-Identifier Class". My question is whether OpenCL on the Intel platform allows a class to be passed to a kernel, or whether any workaround is available. In CUDA I have seen some examples of this, and it works perfectly fine on that platform. For OpenCL, however, I have not been able to find any references or examples related to this question. I would be really thankful for any help with this issue. I have posted the same question on the Intel website, but to no avail. If someone would be kind enough to help me understand where I am going wrong or how I should proceed, I would be really thankful.

//HOST SIDE CODE

#include<stdio.h>
#include<stdlib.h>
#include<iostream>
#include"CL/cl.h"


// Host-side record that main() hands to the kernel through an OpenCL buffer.
// Both members are public so the host can fill them in directly.
class test {
public:
    cl_int  a;   // value written by the host and by the kernel
    cl_char b;   // second member; not touched by the code shown here
};

int main()
{
    test *tempstruct = new test;

    cl_platform_id platfrom_id;
    cl_device_id device_id; // compute device id 
    cl_context context; // compute context
    cl_command_queue commands; // compute command queue
    cl_program program; // compute program
    cl_kernel kernel; // compute kernel

    int err;

    err = clGetPlatformIDs(1, &platfrom_id, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to create a platfrom group!\n");
        return -1;
    }

    err = clGetDeviceIDs(platfrom_id, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);

    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to create a device group!\n");
        return -1;
    }

    context = clCreateContext(0, 1, &device_id, NULL, NULL, NULL);

    if (!context)
    {
        printf("Error: Failed to create a compute context!\n");
        return -1;
    }

    commands = clCreateCommandQueue(context, device_id, 0, NULL);

    if (!commands)
    {
        printf("Error: Failed to create a command commands!\n");
        return -1;
    }

    #define MAX_SOURCE_SIZE (0x100000) 
    FILE *fp, *fp1;
    char filename[] = "Template.cl";
    fp = fopen(filename, "r");
    if (fp == NULL)
    {
        printf("\n file  not found \n");
        return -1;
    }
    char * source_str = (char*)malloc(MAX_SOURCE_SIZE);
    size_t size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose(fp);

    cl_mem classobj = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, sizeof(tempstruct), &tempstruct, &err);

    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to allocate device memory!\n");
        return -1;
    }

    program = clCreateProgramWithSource(context, 1, (const char **)& source_str, (const size_t *)&size, &err);

    if (!program)
    {
        printf("Error: Failed to create program with source!\n");
        return -1;
    }

    err = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);

    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to build program executable!\n");
        return -1;
    }


    test  *resptr = (test *)clEnqueueMapBuffer(commands, classobj, CL_TRUE, CL_MAP_WRITE, NULL, sizeof(test), NULL, NULL, NULL, &err);
    // INITIALISATION OF CLASS

    tempstruct->a = 10;
    if (!resptr)
    {
        printf("Error: Failed to create enqueuemapbuffer!\n");
        return -1;
    }

    err = clEnqueueUnmapMemObject(commands, classobj, resptr, 0, NULL, NULL);

    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to write to source array!\n");
        return -1;
    }

    kernel = clCreateKernel(program, "CLASS", &err);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel!\n");
        return -1;
    }

    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &classobj);

    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to set kernel arguments! %d\n", err);
        return -1;
    }

    size_t globalsize = 1;
    size_t local = 1;

    err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &globalsize, &local, 0, NULL, NULL);
    if (err)
    {
        printf("Error: Failed to execute nd range!\n");
        return -1;
    }

    test  *resptr1 = (test *)clEnqueueMapBuffer(commands, classobj, CL_TRUE, CL_MAP_READ, NULL, sizeof(test), NULL, NULL, NULL, &err);

    err = clEnqueueUnmapMemObject(commands, classobj, resptr1, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to read output array! %d\n", err);
        return -1;
    }

    // again i am printing the class value

    printf("\n in cpu side = %d\n", tempstruct->a);

}


//HOST END


//DEVICE SIDE(KERNEL CODE)

 // filename :  Template.cl

// Device-side declaration intended to mirror the host's `test` record.
// NOTE(review): this is the declaration the compiler errors listed under
// ERRORS below point at ("unknown type name 'class'", "unknown type name
// 'test'"). OpenCL C kernels are C99-based, so `class` and `public:` are not
// available there; a plain struct is what the kernel language accepts.
// NOTE(review): cl_int / cl_char are presumably only provided by the host
// header CL/cl.h, which this .cl file does not include - confirm; the
// kernel-side spellings would be `int` / `char`.
class test
{
public:
    cl_int a;   // integer member the kernel assigns to
    cl_char b;  // character member; unused by the kernel shown below
};

// Kernel entry point: stores 10 in member `a` of the object that `inclass`
// points to in global memory, then prints the stored value from the device.
__kernel void CLASS(__global test *inclass)
{
    inclass[0].a = 10;
    printf("\n in kernel side = %d \n", inclass[0].a);
}

//KERNEL END

ERRORS:

I am getting all of these errors on the kernel side only:

1) Error Tempalte.CL unknown type name 'test'
2) Error Tempalte.CL expected ';' after top level declarator
3) Error Tempalte.CL program scope variables are required to be declared in constant address space
4) Error Tempalte.CL unknown type name 'class'

QUERY:

Q) My main question is: how do I pass a class to a kernel on Intel architecture?
* I have successfully passed a class to a kernel on AMD. Whenever I try the same code on Intel, it shows the above four errors.
* Is there an alternative method for passing a class to a kernel on Intel, or is it possible at all to pass a class to a kernel on Intel architecture?

2
You need SYCL for that; it lets you use nearly pure C++. Can you post the code that produces the error? Which version of OpenCL do you use? – huseyin tugrul buyukisik
@huseyin tugrul buyukisik, I have added the code as per your request. Let me know what you can find from it. Thanks in advance. – Harrisson

2 Answers

4
votes

OpenCL uses C99. So you can pass structs, but not classes, to the kernel.

As huseyin tugrul buyukisik says, you can use SYCL, which supports c++14 (or thereabouts).

Alternatively, if you want to support both NVIDIA® CUDA™ and OpenCL, you could write it only in NVIDIA® CUDA™, and then use https://github.com/hughperkins/cuda-on-cl to run the NVIDIA® CUDA™ code on OpenCL 1.2 GPU devices. Full disclosure: I'm the author of cuda-on-cl, and it's a bit of a work-in-progress for now. It does work though, with some caveats/limitations. It can handle full-blown C++11, templates, classes etc. For example, it can be used to compile and run Eigen on OpenCL 1.2 GPUs https://bitbucket.org/hughperkins/eigen/src/eigen-cl/unsupported/test/cuda-on-cl/?at=eigen-cl

2
votes

If SYCL (and Hugh Perkins's nice solution) is not an option for you, and if your class doesn't have any methods, you can use structs instead (serialize to a byte array when copying to the device):

/* One warrior record. The members are kept in this exact order because the
 * order of declaration fixes where each member sits inside the struct. */
typedef struct Warrior_tag Warrior;

struct Warrior_tag
{
    int   id;
    float hp;
    int   strength;
    int   dexterity;
    int   constitution;
};

/* A mage: an id followed by a fixed array of 90 embedded Warrior records
 * (stored by value, not by pointer). */
typedef struct Mage_tag Mage;

struct Mage_tag
{
    int     id;
    Warrior summoned_warriors[90];
};
// should be more than 32*90 + 32*90 => 5760(2.8k *2) => 8192(4k*2) bytes
// because id + padding = 90 warriors or it doesn't work in runtime
// reversing order of fields should make it 4k + 4 bytes


// Demonstration kernel: copies the first five Warrior records out of global
// memory and builds two Mage values from them with compound literals.
// It reads warriors[0] through warriors[4], so the buffer must hold at least
// five elements. Nothing is written back to `warriors`.
__kernel void test0(__global Warrior *warriors)
{
    int id = get_global_id(0);

    Warrior battal_gazi   = warriors[0];
    Warrior achilles      = warriors[1];
    Warrior aramis        = warriors[2];
    Warrior hattori_hanzo = warriors[3];
    Warrior ip_man        = warriors[4];

    // Array slots not listed in the initializer are zero-initialised.
    Mage nakano = (Mage){
        .id = 0,
        .summoned_warriors = { battal_gazi, achilles }
    };
    Mage gul_dan = (Mage){
        .id = 0,
        .summoned_warriors = { aramis, hattori_hanzo, ip_man }
    };
}

You are then responsible for handling the alignment and sizes of the structs. For example, the fields of the Warrior struct total 20 bytes, but the struct is likely to occupy 32 bytes on the device side (because of rules that force its size in memory to be a power of 2), so you must account for this on the host side and lay out the data according to the device's alignment and variable sizes. That is without even mentioning endianness, which is a pain to handle for "write once, run everywhere". So you should run it only on the computer it is optimized for.

Pack the biggest fields at the top of the struct and add the smaller ones at the bottom. Calculate their in-struct alignment as powers of 2 too! Keep an eye on float3, int3 and similar not-so-well-documented implementations, as they may or may not use float4, int4 in the background. If the performance of global memory access is not important to you, you can simply pick a big size N for every struct smaller than that, for simplicity, and store the relative byte offsets of the fields at the struct's beginning byte, such as the byte address of the hp field at the top of a Warrior struct (packed as 4 bytes into a single int). The device side can then query at which byte a field starts. (Endianness can make this trickier, so don't use buffer copies for pure structs.)

If aligning the struct fields on the host side is not an option:

  • send arrays of fields to a constructor kernel (float array -> hp, int array -> id)
  • construct the structs on the device using a kernel (a buffer that exists only on the device side; each Warrior is built from the arrays of hp, id, ...)
  • don't fiddle with alignments or sizes anymore; just make the buffer large enough to fit all structs inside. Picking 32 * number of warriors bytes should be enough for a warrior array.
  • when it works, return the results to the host side as arrays again, after using another kernel to expand the structs into arrays on the device side.