5
votes

I played a bit with the experimental device lambdas that where introduced in CUDA 7.5 and promoted in this blog post by Mark Harris.

For the following example I removed a lot of stuff that is not needed to show my problem (my actual implementation looks a bit nicer...).

I tried to write a foreach function that operates either on vectors on device (1 thread per element) or host (serial) depending on a template parameter. With this foreach function I can easily implement BLAS functions. As an example I use assigning a scalar to each component of a vector (I attach the complete code in the end):

template<bool onDevice> void assignScalar( size_t size, double* vector, double a )
{
    auto assign = [=] __host__ __device__ ( size_t index ) { vector[index] = a; };
    if( onDevice )
    {
        foreachDevice( size, assign );
    }
    else
    {
        foreachHost( size, assign );
    }
}

However, this code gives a compiler error because of the __host__ __device__ lambda:

The closure type for a lambda ("lambda ->void") cannot be used in the template argument type of a __global__ function template instantiation, unless the lambda is defined within a __device__ or __global__ function

I get the same error if I remove the __device__ from the lambda expression and I get no compile error if I remove __host__ (only __device__ lambda), but in this case the host part is not executed...

If I define the lambda as either __host__ or __device__ separately, the code compiles and works as expected.

template<bool onDevice> void assignScalar2( size_t size, double* vector, double a )
{
    if( onDevice )
    {
        auto assign = [=] __device__ ( size_t index ) { vector[index] = a; };
        foreachDevice( size, assign );
    }
    else
    {
        auto assign = [=] __host__ ( size_t index ) { vector[index] = a; };
        foreachHost( size, assign );
    }
}

However, this introduces code duplication and actually makes the whole idea of using lambdas useless for this example.

Is there a way to accomplish what I want to do or is this a bug in the experimental feature? Actually, defining a __host__ __device__ lambda is explicitly mentioned in the first example in the programming guide. Even for that simpler example (just return a constant value from the lambda) I couldn't find a way to use the lambda expression on both host and device.

Here is the full code, compile with options -std=c++11 --expt-extended-lambda:

#include <iostream>
using namespace std;

template<typename Operation> void foreachHost( size_t size, Operation o )
{
    for( size_t i = 0; i < size; ++i )
    {
        o( i );
    }
}

template<typename Operation> __global__ void kernel_foreach( Operation o )
{
    size_t index = blockIdx.x * blockDim.x + threadIdx.x;
    o( index );
}

template<typename Operation> void foreachDevice( size_t size, Operation o )
{
    size_t blocksize = 32;
    size_t gridsize = size/32;
    kernel_foreach<<<gridsize,blocksize>>>( o );
}

__global__ void printFirstElementOnDevice( double* vector )
{
    printf( "dVector[0] = %f\n", vector[0] );
}

void assignScalarHost( size_t size, double* vector, double a )
{
    auto assign = [=] ( size_t index ) { vector[index] = a; };
    foreachHost( size, assign );
}

void assignScalarDevice( size_t size, double* vector, double a )
{
    auto assign = [=] __device__ ( size_t index ) { vector[index] = a; };
    foreachDevice( size, assign );
}

// compile error:
template<bool onDevice> void assignScalar( size_t size, double* vector, double a )
{
    auto assign = [=]  __host__ __device__ ( size_t index ) { vector[index] = a; };
    if( onDevice )
    {
        foreachDevice( size, assign );
    }
    else
    {
        foreachHost( size, assign );
    }
}

// works:
template<bool onDevice> void assignScalar2( size_t size, double* vector, double a )
{
    if( onDevice )
    {
        auto assign = [=] __device__ ( size_t index ) { vector[index] = a; };
        foreachDevice( size, assign );
    }
    else
    {
        auto assign = [=] __host__ ( size_t index ) { vector[index] = a; };
        foreachHost( size, assign );
    }
}

int main()
{
    size_t SIZE = 32;

    double* hVector = new double[SIZE];
    double* dVector;
    cudaMalloc( &dVector, SIZE*sizeof(double) );

    // clear memory
    for( size_t i = 0; i < SIZE; ++i )
    {
        hVector[i] = 0;
    }
    cudaMemcpy( dVector, hVector, SIZE*sizeof(double), cudaMemcpyHostToDevice );

    assignScalarHost( SIZE, hVector, 1.0 );
    cout << "hVector[0] = " << hVector[0] << endl;

    assignScalarDevice( SIZE, dVector, 2.0 );
    printFirstElementOnDevice<<<1,1>>>( dVector );
    cudaDeviceSynchronize();

    assignScalar2<false>( SIZE, hVector, 3.0 );
    cout << "hVector[0] = " << hVector[0] << endl;

    assignScalar2<true>( SIZE, dVector, 4.0 );
    printFirstElementOnDevice<<<1,1>>>( dVector );
    cudaDeviceSynchronize();

//  assignScalar<false>( SIZE, hVector, 5.0 );
//  cout << "hVector[0] = " << hVector[0] << endl;
//
//  assignScalar<true>( SIZE, dVector, 6.0 );
//  printFirstElementOnDevice<<<1,1>>>( dVector );
//  cudaDeviceSynchronize();

    cudaError_t error = cudaGetLastError();
    if(error!=cudaSuccess)
    {
        cout << "ERROR: " << cudaGetErrorString(error);
    }
}

I used the production release of CUDA 7.5.

Update

I tried this third version for the assignScalar function:

template<bool onDevice> void assignScalar3( size_t size, double* vector, double a )
{
#ifdef __CUDA_ARCH__
#define LAMBDA_HOST_DEVICE __device__
#else
#define LAMBDA_HOST_DEVICE __host__
#endif

    auto assign = [=] LAMBDA_HOST_DEVICE ( size_t index ) { vector[index] = a; };
    if( onDevice )
    {
        foreachDevice( size, assign );
    }
    else
    {
        foreachHost( size, assign );
    }
}

It compiles and runs without error, but the device version (assignScalar3<true>) is not executed. Actually, I thought that __CUDA_ARCH__ will always be undefined (since the function is not __device__) but I checked explicitly that there is a compile path where it is defined.

1
I think the error is instructive, and it may be a further implementation limitation that is not clearly spelled out in the documentation. If you follow the suggestion of the reported error, and mark the assignScalar templated function as __host__ __device__ I think you can get past this particular issue. That will then raise compiler warnings, which could be safely ignored, or perhaps worked around with use of the __CUDA_ARCH__ macro, to get to a clean compile. At that point, I think you will then perhaps stumble onto some sort of implementation bug. I have no other info at this time.Robert Crovella
I would say the error is misleading since it is not correct if you check the example assignScalar2. There the lambda is used in the same way and is not defined within a __device__ or __global__ function.havogt
@RobertCrovella As you say, making the assignScalar functions resolves the error, but not the problem, because the function is only called from host (actually neither the host nor the device foreach is called when I follow the suggestion). But your comment made me think about a third version which I will add to the question.havogt
"the function is only called from the host" is the implementation bug I referred to. Actually the device version starts to get called, but the kernel launch process fails at runtime for some reason. You can discover this if you poke around with gdb and the profilers.Robert Crovella
Second, restriction 6 from the programming guide: "When parsing a function, the CUDA compiler assigns a counter value to each __device__ lambda within that function. This counter value is used in the substituted named type passed to the host compiler. Hence, whether or not a __device__ lambda is defined within a function should not depend on a particular value of __CUDA_ARCH__, or on __CUDA_ARCH__ being undefined."harrism

1 Answers

3
votes

The task that I tried to accomplish with the examples provided in the question is not possible with CUDA 7.5, though it was not explicitly excluded from the allowed cases for the experimental lambda support.

NVIDIA announced that CUDA Toolkit 8.0 will support __host__ __device__ lambdas as an experimental feature, according to the blog post CUDA 8 Features Revealed.

I verified that my example works with the CUDA 8 Release Candidate (Cuda compilation tools, release 8.0, V8.0.26).

Here is the code that I finally used, compiled with nvcc -std=c++11 --expt-extended-lambda:

#include <iostream>
using namespace std;

template<typename Operation> __global__ void kernel_foreach( Operation o )
{
    size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    o( i );
}

template<bool onDevice, typename Operation> void foreach( size_t size, Operation o )
{
    if( onDevice )
    {
        size_t blocksize = 32;
        size_t gridsize = size/32;
        kernel_foreach<<<gridsize,blocksize>>>( o );
    }
    else
    {
        for( size_t i = 0; i < size; ++i )
        {
            o( i );
        }
    }
}

__global__ void printFirstElementOnDevice( double* vector )
{
    printf( "dVector[0] = %f\n", vector[0] );
}

template<bool onDevice> void assignScalar( size_t size, double* vector, double a )
{
    auto assign = [=]  __host__ __device__ ( size_t i ) { vector[i] = a; };
    foreach<onDevice>( size, assign );
}

int main()
{
    size_t SIZE = 32;

    double* hVector = new double[SIZE];
    double* dVector;
    cudaMalloc( &dVector, SIZE*sizeof(double) );

    // clear memory
    for( size_t i = 0; i < SIZE; ++i )
    {
        hVector[i] = 0;
    }
    cudaMemcpy( dVector, hVector, SIZE*sizeof(double), cudaMemcpyHostToDevice );

    assignScalar<false>( SIZE, hVector, 3.0 );
    cout << "hVector[0] = " << hVector[0] << endl;

    assignScalar<true>( SIZE, dVector, 4.0 );
    printFirstElementOnDevice<<<1,1>>>( dVector );
    cudaDeviceSynchronize();

    cudaError_t error = cudaGetLastError();
    if(error!=cudaSuccess)
    {
        cout << "ERROR: " << cudaGetErrorString(error);
    }
}