1
votes

The code below is legal C++ (compiles clean with g++ -Wall):

#ifdef MAKE_COMPILE
#define __restrict__ /* empty */
#define NO_FORWARD_DECLARATIONS
#endif

#include <stdio.h>

template <class T>
struct Array
{
int width, height;
T *ptr;
};

#ifdef HAVE_CUDA
template<typename T, int KernelSize>
     static __global__ void genConvolve_kernel(const T __restrict__ * inputImageArray , T __restrict__ * outputImageArray , int inputWidth , int outputWidth )
{
    if ((threadIdx.x == 4) && (threadIdx.y == 2))
       printf("Hello world from CUDA!\n");
}
#endif

#ifndef NO_FORWARD_DECLARATIONS
template <typename T, int KernelSize>
     void genConvolve_cuda(const Array<T> & kernelArray , const Array<T> & inputImageArray , Array<T> & outputImageArray , int blockWidth=16, int blockHeight=16);

template <typename T, int KernelSize>
     void genConvolve_cuda_deviceptrs( const T __restrict__ * inputImageArray , T __restrict__ * outputImageArray , int inputWidth , int outputWidth , int outputHeight , int blockWidth=16, int blockHeight=16);
#endif

template <typename T, int KernelSize>
     void genConvolve_cuda_deviceptrs( const T __restrict__ * inputImageArray , T __restrict__ * outputImageArray , int inputWidth , int outputWidth , int outputHeight , int blockWidth=16, int blockHeight=16)
{
#ifdef HAVE_CUDA
    dim3 block(blockWidth,blockHeight);
    dim3 grid(1,1);
    genConvolve_kernel<T,KernelSize><<<grid,block>>>(inputImageArray,outputImageArray,inputWidth,outputWidth);
#else
    printf("Hello, world!\n");
#endif
}

template <typename T, int KernelSize>
     void genConvolve_cuda( const Array<T> & kernelArray , const Array<T> & inputImageArray , Array<T> & outputImageArray , int blockWidth=16, int blockHeight=16)
{
    genConvolve_cuda_deviceptrs<T,KernelSize>((const T *)inputImageArray.ptr,outputImageArray.ptr, inputImageArray.width, outputImageArray.width, outputImageArray.height, blockWidth, blockHeight);
}

int main(int argc, char *argv[])
{
    Array<float> a;

    genConvolve_cuda<float,3>(a,a,a);
#ifdef HAVE_CUDA
    cudaDeviceSynchronize();
#endif

    return 0;
}

However, when I try to compile this with nvcc I get the errors:

nvcc t.cu

t.cu(39): warning: specifying a default argument when redeclaring an unreferenced function template is nonstandard

t.cu(39): warning: redefinition of default argument

t.cu(51): warning: specifying a default argument when redeclaring an unreferenced function template is nonstandard

t.cu(51): warning: redefinition of default argument

t.cu(53): error: template instantiation resulted in unexpected function type of "void (const float *, float *, int, int, int, int, int)" (the meaning of a name may have changed since the template declaration -- the type of the template is "void (const __restrict__ T *, __restrict__ T *, int, int, int, int, int)") detected during: instantiation of "genConvolve_cuda_deviceptrs" based on template arguments (53): here instantiation of "void genConvolve_cuda(const Array &, const Array &, Array &, int, int) [with T=float, KernelSize=3]" (60): here

(line numbers lightly offset as I clean-up the example before posting.)

The warnings and errors go away when I define -DMAKE_COMPILE; however, I really would like to specify the forward declarations in a header file, and to use restrict !

So two questions:

  1. How to specify forward declarations of template functions with NVCC when there are default function arguments (in my case blockWidth and blockHeight?)
  2. How to properly use __restrict__ with template arguments?
1
The default template parameter question is just this: stackoverflow.com/questions/4906116/… . Only define them in one place (forward declaration or definition), but not both.talonmies
The default template question above is about default template arguments; the issue above is on default parameters. Note that the code does compile OK with g++ (so it seems legal C++), the problem is with nvcc.Klamer Schutte
Reading through the error message from the compiler it seems to me the issue is not due to __restrict__, but rather due to const? The template has const T*, T*, ... but the instantiation has const Array &, const Array &, ..., not const Array &, Array &, .... The fact that g++ compiles the code may be a red herring, since [1] this code has CUDA specific code paths prsumably not processed by g++ [2] a particular compiler's behavior is not the final measure of a code's standard compliance (or lack thereof).njuffa
njuffa, code has overloaded calls to const Array<T> & , const Array<T> & , Array<T> & ,.. as well as calls to const T * i, T * , ... -- the ones using const having one more argument (which in the real code is transferred to the Cuda code by using a __constant__ array not show here. I accepted the answer below by Robert, pointing to me having ordered * and __restrict__ the wrong way around.Klamer Schutte

1 Answers

4
votes

How to properly use __restrict__ with template arguments?

After conferring with colleagues, it was pointed out to me that this __restrict__ usage:

const T __restrict__ * inputImageArray ...

is questionable. In order for __restrict__ to have any effect, it is expected to be placed between the asterisk and the pointer name:

const T * __restrict__ inputImageArray ...

(gcc reference, and CUDA reference)

In the non-standard usage you have shown, gcc seems to allow this but silently "drops" the intent; the effect of __restrict__ is not applied in that case. In this respect, it is true that CUDA differs from gcc behavior. However because it is questionable usage as described above, it's unlikely that nvcc would be modified to "fix" this issue.

You can make the compile error disappear in the code you have shown if you switch to standard __restrict__ usage. This is recommended anyway if your intent is to declare to the compiler that these are in fact restricted pointers:

#ifdef MAKE_COMPILE
#define __restrict__ /* empty */
#define NO_FORWARD_DECLARATIONS
#endif

#include <stdio.h>

template <class T>
struct Array
{
int width, height;
T *ptr;
};

#ifdef HAVE_CUDA
template<typename T, int KernelSize>
     static __global__ void genConvolve_kernel(const T * __restrict__ inputImageArray , T * __restrict__ outputImageArray , int inputWidth , int outputWidth )
{
    if ((threadIdx.x == 4) && (threadIdx.y == 2))
       printf("Hello world from CUDA!\n");
}
#endif

#ifndef NO_FORWARD_DECLARATIONS
template <typename T, int KernelSize>
     void genConvolve_cuda(const Array<T> & kernelArray , const Array<T> & inputImageArray , Array<T> & outputImageArray , int blockWidth=16, int blockHeight=16);

template <typename T, int KernelSize>
     void genConvolve_cuda_deviceptrs( const T * __restrict__ inputImageArray , T * __restrict__ outputImageArray , int inputWidth , int outputWidth , int outputHeight , int blockWidth=16, int blockHeight=16);
#endif

template <typename T, int KernelSize>
     void genConvolve_cuda_deviceptrs( const T * __restrict__ inputImageArray , T * __restrict__ outputImageArray , int inputWidth , int outputWidth , int outputHeight , int blockWidth=16, int blockHeight=16)
{
#ifdef HAVE_CUDA
    dim3 block(blockWidth,blockHeight);
    dim3 grid(1,1);
    genConvolve_kernel<T,KernelSize><<<grid,block>>>(inputImageArray,outputImageArray,inputWidth,outputWidth);
#else
    printf("Hello, world!\n");
#endif
}

template <typename T, int KernelSize>
     void genConvolve_cuda( const Array<T> & kernelArray , const Array<T> & inputImageArray , Array<T> & outputImageArray , int blockWidth=16, int blockHeight=16)
{
    genConvolve_cuda_deviceptrs<T,KernelSize>((const T *)inputImageArray.ptr,outputImageArray.ptr, inputImageArray.width, outputImageArray.width, outputImageArray.height, blockWidth, blockHeight);
}

int main(int argc, char *argv[])
{
    Array<float> a;

    genConvolve_cuda<float,3>(a,a,a);
#ifdef HAVE_CUDA
    cudaDeviceSynchronize();
#endif

    return 0;
}

The warnings remain; that appears to be a separate issue:

t986.cu(33): warning: specifying a default argument when redeclaring an unreferenced function template is nonstandard

t986.cu(33): warning: redefinition of default argument

t986.cu(45): warning: specifying a default argument when redeclaring an unreferenced function template is nonstandard

t986.cu(45): warning: redefinition of default argument

Those warnings can be made to disappear if the default (template) function arguments are included on the first declaration but not the subsequent declarations, as follows:

#ifdef MAKE_COMPILE
#define __restrict__ /* empty */
#define NO_FORWARD_DECLARATIONS
#endif

#include <stdio.h>

template <class T>
struct Array
{
int width, height;
T *ptr;
};

#ifdef HAVE_CUDA
template<typename T, int KernelSize>
     static __global__ void genConvolve_kernel(const T * __restrict__ inputImageArray , T * __restrict__ outputImageArray , int inputWidth , int outputWidth )
{
    if ((threadIdx.x == 4) && (threadIdx.y == 2))
       printf("Hello world from CUDA!\n");
}
#endif

#ifndef NO_FORWARD_DECLARATIONS
template <typename T, int KernelSize>
     void genConvolve_cuda(const Array<T> & kernelArray , const Array<T> & inputImageArray , Array<T> & outputImageArray , int blockWidth=16, int blockHeight=16);

template <typename T, int KernelSize>
     void genConvolve_cuda_deviceptrs( const T * __restrict__ inputImageArray , T * __restrict__ outputImageArray , int inputWidth , int outputWidth , int outputHeight , int blockWidth=16, int blockHeight=16);
#endif

template <typename T, int KernelSize>
     void genConvolve_cuda_deviceptrs( const T * __restrict__ inputImageArray , T * __restrict__ outputImageArray , int inputWidth , int outputWidth , int outputHeight , int blockWidth, int blockHeight)
{
#ifdef HAVE_CUDA
    dim3 block(blockWidth,blockHeight);
    dim3 grid(1,1);
    genConvolve_kernel<T,KernelSize><<<grid,block>>>(inputImageArray,outputImageArray,inputWidth,outputWidth);
#else
    printf("Hello, world!\n");
#endif
}

template <typename T, int KernelSize>
     void genConvolve_cuda( const Array<T> & kernelArray , const Array<T> & inputImageArray , Array<T> & outputImageArray , int blockWidth, int blockHeight)
{
    genConvolve_cuda_deviceptrs<T,KernelSize>((const T *)inputImageArray.ptr,outputImageArray.ptr, inputImageArray.width, outputImageArray.width, outputImageArray.height, blockWidth, blockHeight);
}

int main(int argc, char *argv[])
{
    Array<float> a;

    genConvolve_cuda<float,3>(a,a,a);
#ifdef HAVE_CUDA
    cudaDeviceSynchronize();
#endif

    return 0;
}

although I agree that still differs from g++ behavior. The gnu tools may still be the unusual case here, however. The redefinition of default arguments is still unexpected, and both clang and cl.exe (microsoft) will have issues with it.