0
votes

I'm trying to play with mixing CUDA and C++. I encountered the following error:

main.cpp: define "main()". Call "gpu_main()" and "add_test()"
|
|--> add_func.cu: define "gpu_main()" and "__global__ void add()" as kernel. The "add()" will call "add_test()"
|
|--> basic_add.cu: define "__host__ __device__ int add_test(int a, int b)"

I compile the code this way:

nvcc basic_add.cu -c
nvcc -rdc=true add_func.cu -c
g++ main.cpp -c
g++ -o main main.o basic_add.o add_func.o -lcudart -L/usr/local/cuda/lib64

At the 2nd step, it gave me this error:

add_func.cu(14): error: calling a host function("add_test") from a global function("add") is not allowed

add_func.cu(14): error: identifier "add_test" is undefined in device code

Does anyone have any idea of how to fix this problem? Or is it simply not possible to call a host & device function defined in a separate file from a kernel? Thanks.


The code is as following (just for reference):

  • basic_add.h:
#ifndef BASIC_ADD_H_
#define BASIC_ADD_H_

// NOTE(review): declared here as a plain host function, but the definition in
// basic_add.cu (below) carries __host__ __device__. The kernel in add_func.cu
// sees only this host-only declaration, which is why nvcc reports
// "calling a host function from a global function" — the qualifiers must
// match between declaration and definition.
int add_test( int a, int b );

#endif
  • basic_add.cu:
// Adds two ints; compiled for both host and device. Note the qualifiers here
// do NOT appear on the declaration in basic_add.h above — that mismatch is
// what the question is about.
__host__ __device__ int add_test(int a, int b)
{
    return a + b;
}
  • add_func.h
#ifndef ADD_FUNC_H_
#define ADD_FUNC_H_

#include <iostream>
#include <math.h>
#include "basic_add.h"

// Host entry point defined in add_func.cu; launches the add() kernel and
// checks the result. Called from main.cpp.
int gpu_main(void);

#endif
  • add_func.cu
#include "add_func.h"

// Kernel function to add the elements of two arrays.
// Grid-stride indexing is set up, but the unconditional `break` below means
// each thread handles at most one element — full coverage of n elements
// relies on gridDim.x * blockDim.x >= n.
__global__
void add(int n, float *x, float *y)
{
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;
  // Debug output only; device printf is serialized and slow.
  printf("gridDim %d, blockDim %d, blockIdx %d, threadIdx %d\n", gridDim.x, blockDim.x, blockIdx.x, threadIdx.x);
  for (int i = index; i < n; i += stride)
  {
    // This is add_func.cu line 14 — the call the nvcc errors quoted above
    // point at: basic_add.h declares add_test() as host-only, so it cannot
    // be called from device code as declared.
    y[i] = add_test(x[i],y[i]);
    printf("blockIdx %d, threadIdx %d, %d\n", blockIdx.x, threadIdx.x, i);
    break;
  }
}

// Allocates two managed arrays, launches the add() kernel over them, and
// prints the maximum deviation from the expected value 3.0f.
// NOTE(review): no CUDA API return codes are checked and there is no
// cudaGetLastError() after the launch — kernel failures would be silent here.
int gpu_main(void)
{
  int N = 1<<10;  // 1024 elements
  float *x, *y;

  // Allocate Unified Memory – accessible from CPU or GPU
  cudaMallocManaged(&x, N*sizeof(float));
  cudaMallocManaged(&y, N*sizeof(float));

  // initialize x and y arrays on the host
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  // Run kernel on the N (= 1<<10) elements on the GPU
  int blockSize = 256;
  int numBlocks = (N + blockSize - 1) / blockSize;  // ceil-div launch config
  add<<<numBlocks, blockSize>>>(N, x, y);

  // Wait for GPU to finish before accessing on host
  cudaDeviceSynchronize();

  // Check for errors (all values should be 3.0f)
  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = fmax(maxError, fabs(y[i]-3.0f));
  std::cout << "Max error: " << maxError << std::endl;

  // Free memory
  cudaFree(x);
  cudaFree(y);
  
  return 0;
}
  • main.cpp:
#include <iostream>
#include <math.h>
#include "add_func.h"
#include "basic_add.h"

// Host-only entry point, compiled with g++. Calls the GPU routine and then
// calls add_test() directly on the host (via the host-only declaration in
// basic_add.h).
int main(void)
{
  gpu_main();
  int a = add_test(1,2);
  std::cout << a << std::endl;
  
  return 0;
}
1

1 Answer

0
votes

One of the things you are trying to do here isn't workable. If you want a function decorated with __host__ __device__, you should first of all decorate it the same way everywhere — including in the header file where you declare it. Furthermore, such a function won't be directly callable from a .cpp file unless you compile that .cpp file with nvcc and pass -x cu as a compile command line switch, so from my perspective you may as well just put it in a .cu file.

You're also not doing relocatable device code linking properly, but that is fixable.

If you want to have a __host__ __device__ function callable from a .cpp file compiled with e.g. g++, then the only suggestion I have is to provide a wrapper for it.

The following is the closest I could come to what you have:

$ cat basic_add.h
#ifndef BASIC_ADD_H_
#define BASIC_ADD_H_

// Declaration now carries the same __host__ __device__ qualifiers as the
// definition in basic_add.cu, so device code that includes this header can
// legally call it. Note this header is now only includable from nvcc-compiled
// translation units (the qualifiers are CUDA-specific).
__host__ __device__ int add_test( int a, int b );

#endif
$ cat basic_add.cu
// Adds two ints; callable from host and device code compiled by nvcc.
__host__ __device__ int add_test(int a, int b)
{
            return a + b;
}

// Plain host wrapper so g++-compiled .cpp files (which cannot parse the
// __host__ __device__ declaration) can still reach add_test().
int my_add_test(int a, int b){ return add_test(a,b);} //wrapper
$ cat add_func.h
#ifndef ADD_FUNC_H_
#define ADD_FUNC_H_

#include <iostream>
#include <math.h>
// Host-only declarations — deliberately no CUDA qualifiers and no
// basic_add.h include, so this header is safe for g++-compiled main.cpp.
int my_add_test(int a, int b);
int gpu_main(void);

#endif
$ cat add_func.cu
#include "basic_add.h"
#include <iostream>

// Kernel function to add the elements of two arrays.
// Grid-stride indexing is set up, but the unconditional `break` means each
// thread handles at most one element — coverage of all n elements relies on
// gridDim.x * blockDim.x >= n (true for the launch in gpu_main below).
__global__
void add(int n, float *x, float *y)
{
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;
  printf("gridDim %d, blockDim %d, blockIdx %d, threadIdx %d\n", gridDim.x, blockDim.x, blockIdx.x, threadIdx.x);
  for (int i = index; i < n; i += stride)
  {
    // Legal now: basic_add.h declares add_test() as __host__ __device__,
    // and the files are built with relocatable device code (-dc) so the
    // device-side definition links across translation units.
    y[i] = add_test(x[i],y[i]);
    printf("blockIdx %d, threadIdx %d, %d\n", blockIdx.x, threadIdx.x, i);
    break;
  }
}

// Allocates two managed arrays, launches the add() kernel over them, and
// prints the maximum deviation from the expected value 3.0f.
// NOTE(review): as in the question's version, CUDA API return codes go
// unchecked and there is no cudaGetLastError() after the launch.
int gpu_main(void)
{
  int N = 1<<10;  // 1024 elements
  float *x, *y;

  // Allocate Unified Memory . accessible from CPU or GPU
  cudaMallocManaged(&x, N*sizeof(float));
  cudaMallocManaged(&y, N*sizeof(float));

  // initialize x and y arrays on the host
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  // Run kernel on the N (= 1<<10) elements on the GPU
  int blockSize = 256;
  int numBlocks = (N + blockSize - 1) / blockSize;  // ceil-div launch config
  add<<<numBlocks, blockSize>>>(N, x, y);

  // Wait for GPU to finish before accessing on host
  cudaDeviceSynchronize();

  // Check for errors (all values should be 3.0f)
  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = fmax(maxError, fabs(y[i]-3.0f));
  std::cout << "Max error: " << maxError << std::endl;

  // Free memory
  cudaFree(x);
  cudaFree(y);

  return 0;
}
$ cat main.cpp
#include <iostream>
#include <math.h>
#include "add_func.h"

// Host-only entry point, compiled with g++. Reaches the device-capable
// add_test() only through the my_add_test() host wrapper, since the
// __host__ __device__ declaration is not visible to g++.
int main(void)
{
          gpu_main();
          int a = my_add_test(1,2);
          std::cout << a << std::endl;
          return 0;
}
$ nvcc -dc basic_add.cu
$ nvcc -dc add_func.cu
$ nvcc -dlink -o add.dlink.o add_func.o basic_add.o   
$ g++ -c main.cpp
$ g++ main.o add.dlink.o add_func.o basic_add.o -o test -L/usr/local/cuda/lib64 -lcudart
$