The code below is legal C++ (compiles clean with g++ -Wall):
#ifdef MAKE_COMPILE
#define __restrict__ /* empty */
#define NO_FORWARD_DECLARATIONS
#endif
#include <stdio.h>
// Plain aggregate describing a 2-D buffer of T: two integer dimensions and a
// raw pointer to the elements. It declares no constructors or destructor, so
// it never allocates or releases what `ptr` refers to.
template <typename T>
struct Array
{
    int width;
    int height;
    T *ptr;
};
#ifdef HAVE_CUDA
// Placeholder kernel: only the thread whose threadIdx is (x == 4, y == 2)
// prints a greeting; every other thread returns immediately. None of the
// parameters (nor KernelSize) are read by the body.
//
// NOTE(review): `__restrict__` is written before the `*` on both pointer
// parameters. The discussion after this listing attributes the nvcc error to
// that ordering in the host wrapper; the pointer-qualifying spelling is
// presumably `const T * __restrict__` / `T * __restrict__` -- confirm and
// apply here as well.
template<typename T, int KernelSize>
static __global__ void genConvolve_kernel(const T __restrict__ * inputImageArray , T __restrict__ * outputImageArray , int inputWidth , int outputWidth )
{
    const bool isReportingThread = (threadIdx.x == 4) && (threadIdx.y == 2);
    if (!isReportingThread)
        return;
    printf("Hello world from CUDA!\n");
}
#endif
#ifndef NO_FORWARD_DECLARATIONS
// Forward declaration of the Array-based host entry point.
// blockWidth and blockHeight both default to 16 here.
// NOTE(review): the definition further down repeats the same default
// arguments; nvcc reports that as "redefinition of default argument" (see the
// quoted warnings). Presumably the defaults should be spelled on only one of
// the two declarations -- confirm which one callers see first.
template <typename T, int KernelSize>
void genConvolve_cuda(const Array<T> & kernelArray , const Array<T> & inputImageArray , Array<T> & outputImageArray , int blockWidth=16, int blockHeight=16);
// Forward declaration of the raw-pointer host wrapper.
// blockWidth and blockHeight both default to 16 here.
// NOTE(review): (1) the definition further down repeats these default
// arguments, which nvcc flags as "redefinition of default argument";
// (2) `__restrict__` is placed before the `*`, so it does not qualify the
// pointer itself -- the discussion after this listing identifies that ordering
// as the cause of the nvcc instantiation error. Presumably the intended
// spelling is `const T * __restrict__` / `T * __restrict__`; confirm.
template <typename T, int KernelSize>
void genConvolve_cuda_deviceptrs( const T __restrict__ * inputImageArray , T __restrict__ * outputImageArray , int inputWidth , int outputWidth , int outputHeight , int blockWidth=16, int blockHeight=16);
#endif
// Host wrapper taking raw pointers.
//  - With HAVE_CUDA: launches genConvolve_kernel<T,KernelSize> on a 1 x 1 grid
//    with a single block of blockWidth x blockHeight threads, forwarding the
//    two pointers, inputWidth and outputWidth. The launch is not followed by
//    an error check or a synchronization in this function.
//  - Without HAVE_CUDA: prints a greeting to stdout and ignores all arguments.
// outputHeight is accepted but not used in either configuration.
//
// NOTE(review): `__restrict__` precedes the `*` in this signature; the
// discussion after this listing names that ordering as the cause of the nvcc
// "unexpected function type" error. The default arguments are also repeated
// from the forward declaration when NO_FORWARD_DECLARATIONS is not defined,
// which nvcc warns about.
template <typename T, int KernelSize>
void genConvolve_cuda_deviceptrs( const T __restrict__ * inputImageArray , T __restrict__ * outputImageArray , int inputWidth , int outputWidth , int outputHeight , int blockWidth=16, int blockHeight=16)
{
#ifndef HAVE_CUDA
    // Host-only build: there is nothing to launch.
    printf("Hello, world!\n");
#else
    const dim3 gridDims(1,1);
    const dim3 blockDims(blockWidth,blockHeight);
    genConvolve_kernel<T,KernelSize><<<gridDims,blockDims>>>(inputImageArray,outputImageArray,inputWidth,outputWidth);
#endif
}
// Array-based host entry point: unpacks the pointers and dimensions of the
// input/output descriptors and forwards them, together with the block size,
// to genConvolve_cuda_deviceptrs<T,KernelSize>.
// kernelArray is accepted but not used by this function.
//
// NOTE(review): the default arguments below duplicate those on the forward
// declaration when NO_FORWARD_DECLARATIONS is not defined; nvcc reports this
// as "redefinition of default argument".
template <typename T, int KernelSize>
void genConvolve_cuda( const Array<T> & kernelArray , const Array<T> & inputImageArray , Array<T> & outputImageArray , int blockWidth=16, int blockHeight=16)
{
    // The input is only read through a pointer-to-const.
    const T *src = inputImageArray.ptr;
    T *dst = outputImageArray.ptr;

    genConvolve_cuda_deviceptrs<T,KernelSize>(
        src,
        dst,
        inputImageArray.width,
        outputImageArray.width,
        outputImageArray.height,
        blockWidth,
        blockHeight);
}
// Test driver: runs genConvolve_cuda<float,3> once, using the same descriptor
// as kernel, input and output, with the default block size.
// argc/argv are not used.
int main(int argc, char *argv[])
{
    // Value-initialize the descriptor (width = 0, height = 0, ptr = null).
    // A default-initialized `Array<float> a;` leaves all three members
    // indeterminate, and genConvolve_cuda reads them to build its call
    // arguments, which is undefined behavior.
    Array<float> a = Array<float>();
    genConvolve_cuda<float,3>(a,a,a);
#ifdef HAVE_CUDA
    // Wait for the launched kernel to finish before exiting; presumably this
    // is also what makes the device-side printf output appear -- confirm.
    cudaDeviceSynchronize();
#endif
    return 0;
}
However, when I try to compile this with nvcc I get the errors:
nvcc t.cu
t.cu(39): warning: specifying a default argument when redeclaring an unreferenced function template is nonstandard
t.cu(39): warning: redefinition of default argument
t.cu(51): warning: specifying a default argument when redeclaring an unreferenced function template is nonstandard
t.cu(51): warning: redefinition of default argument
t.cu(53): error: template instantiation resulted in unexpected function type of "void (const float *, float *, int, int, int, int, int)" (the meaning of a name may have changed since the template declaration -- the type of the template is "void (const __restrict__ T *, __restrict__ T *, int, int, int, int, int)") detected during: instantiation of "genConvolve_cuda_deviceptrs" based on template arguments (53): here instantiation of "void genConvolve_cuda(const Array &, const Array &, Array &, int, int) [with T=float, KernelSize=3]" (60): here
(The line numbers are slightly offset because I cleaned up the example before posting.)
The warnings and errors go away when I define -DMAKE_COMPILE; however, I would really like to specify the forward declarations in a header file, and to use __restrict__!
So two questions:
- How to specify forward declarations of template functions with NVCC when there are default function arguments (in my case blockWidth and blockHeight)?
- How to properly use __restrict__ with template arguments?
[…] `__restrict__`, but rather due to `const`? The template has `const T*, T*, ...` but the instantiation has `const Array &, const Array &, ...`, not `const Array &, Array &, ...`. The fact that g++ compiles the code may be a red herring, since [1] this code has CUDA-specific code paths presumably not processed by g++, and [2] a particular compiler's behavior is not the final measure of a code's standard compliance (or lack thereof). – njuffa
[…] `const Array<T> & , const Array<T> & , Array<T> & ,..`
as well as calls to `const T * i, T * , ...` -- the ones using `const` having one more argument (which in the real code is transferred to the CUDA code by using a `__constant__` array not shown here). I accepted the answer below by Robert, pointing to me having ordered `*` and `__restrict__` the wrong way around. – Klamer Schutte