3
votes

I noticed a weird phenomenon that allows you to launch a host function using the triple angle bracket notation in CUDA. To test this, I wrote a simple kernel that copies data between two integer arrays. Note that I am running all of this code on a Tesla K40 and compiling with -gencode arch=compute_35,code=sm_35:

#ifndef HOST_LAUNCH_H
#define HOST_LAUNCH_H
using namespace std;

// Assumes input and output are both length 32

// Device kernel: each thread copies exactly one int from input to output.
// Contract (per the surrounding text): both arrays hold 32 ints and the
// launch supplies exactly 32 threads in total, so no bounds guard is needed.
__global__ void CopyKernel(const int* input, int* output) {
  const size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  output[idx] = input[idx];
}

// Host wrapper: copies 32 ints from host `input` to host `output` by
// staging through device memory and launching CopyKernel with <<<1,32>>>.
// Assumes both host arrays hold exactly 32 ints.
//
// Fix over the original: the two cudaMalloc calls are now checked. The
// original ignored their results, so an allocation failure would launch
// the kernel on null device pointers (silent device-side fault).
__host__ void Copy(const int* input, int* output) {
  const size_t bytes = 32 * sizeof(int);
  int* d_input = 0;
  int* d_output = 0;
  if (cudaMalloc((void**)&d_input, bytes) != cudaSuccess ||
      cudaMalloc((void**)&d_output, bytes) != cudaSuccess) {
    // Bail out without touching `output`; cudaFree(0) is a safe no-op.
    cudaFree(d_input);
    cudaFree(d_output);
    return;
  }
  cudaMemcpy(d_input, input, bytes, cudaMemcpyHostToDevice);
  CopyKernel<<<1,32>>>(d_input, d_output);
  // The blocking D2H memcpy synchronizes with the kernel, so no explicit
  // cudaDeviceSynchronize() is required before reading `output`.
  cudaMemcpy(output, d_output, bytes, cudaMemcpyDeviceToHost);
  cudaFree(d_input);
  cudaFree(d_output);
}

#endif

I then wrote the following unit test:

#include "host_launch.h"
#include <assert.h>
using namespace std;

// Baseline test: launches CopyKernel directly via <<<1,32>>> and checks
// that the device round-trip reproduces the input sequence {0,...,31}.
__host__ void TestKernelLaunch() {
  const int n = 32;
  const size_t bytes = n * sizeof(int);

  int input[n];
  int output[n];
  for (int k = 0; k < n; ++k) {
    input[k] = k;
    output[k] = 0;
  }

  int* d_input = 0;
  int* d_output = 0;
  cudaMalloc((void**)&d_input, bytes);
  cudaMalloc((void**)&d_output, bytes);
  cudaMemcpy(d_input, input, bytes, cudaMemcpyHostToDevice);

  // The host buffer must still be untouched before results come back.
  for (int k = 0; k < n; ++k)
    assert(output[k] == 0);

  CopyKernel<<<1,n>>>(d_input, d_output);
  cudaMemcpy(output, d_output, bytes, cudaMemcpyDeviceToHost);

  for (int k = 0; k < n; ++k)
    assert(output[k] == k);

  cudaFree(d_input);
  cudaFree(d_output);
}

// Deliberately non-compiling experiment (and never called from main):
// uncommenting the <<< >>> launch of the __host__ function Copy triggers
// "a host function call cannot be configured" at compile time. With the
// launch commented out, d_output is never written, so the final asserts
// would fail if this were ever run — it exists only to show the error.
__host__ void TestHostLaunch() {
  const int n = 32;
  const size_t bytes = n * sizeof(int);

  int input[n];
  int output[n];
  for (int k = 0; k < n; ++k) {
    input[k] = k + 1;
    output[k] = 0;
  }

  int* d_input = 0;
  int* d_output = 0;
  cudaMalloc((void**)&d_input, bytes);
  cudaMalloc((void**)&d_output, bytes);
  cudaMemcpy(d_input, input, bytes, cudaMemcpyHostToDevice);

  for (int k = 0; k < n; ++k)
    assert(output[k] == 0);

  //Copy<<<1,32>>>(d_input, d_output);
  cudaMemcpy(output, d_output, bytes, cudaMemcpyDeviceToHost);

  for (int k = 0; k < n; ++k)
    assert(output[k] == k + 1);

  cudaFree(d_input);
  cudaFree(d_output);
}

// The surprising case: launches `f` through <<< >>> where `f` is a plain
// host function pointer. This bypasses nvcc's compile-time __global__
// check and (as the answer below explains) is undefined behaviour that
// happens to work; kept exactly as the text describes it.
__host__ void TestFunctionPointerLaunch(void (*f)(const int*, int*)) {
  const int n = 32;
  const size_t bytes = n * sizeof(int);

  int input[n];
  int output[n];
  for (int k = 0; k < n; ++k) {
    input[k] = k + 2;
    output[k] = 0;
  }

  int* d_input = 0;
  int* d_output = 0;
  cudaMalloc((void**)&d_input, bytes);
  cudaMalloc((void**)&d_output, bytes);
  cudaMemcpy(d_input, input, bytes, cudaMemcpyHostToDevice);

  for (int k = 0; k < n; ++k)
    assert(output[k] == 0);

  f<<<1,32>>>(d_input, d_output);
  cudaMemcpy(output, d_output, bytes, cudaMemcpyDeviceToHost);

  for (int k = 0; k < n; ++k)
    assert(output[k] == k + 2);

  cudaFree(d_input);
  cudaFree(d_output);
}

// Driver: a direct kernel launch, then the same launch routed through a
// function pointer — once with the real kernel, once with the host
// wrapper Copy (the case the question is about). TestHostLaunch is
// intentionally not called: it only exists to show the compile error.
int main() {
  TestKernelLaunch();
  TestFunctionPointerLaunch(CopyKernel);
  TestFunctionPointerLaunch(Copy);
  return 0;
}

If I uncomment the line:

//Copy<<<1,32>>>(d_input, d_output);

I get:

host_launch_unittest.cu(49): error: a host function call cannot be configured

But the equivalent is performed with:

f<<<1,32>>>(d_input, d_output);

in TestFunctionPointerLaunch, and it passes all of the assertions. I'm just wondering what the GPU is actually doing under the hood that makes this host function launch behave correctly. I wrote these tests to isolate the behavior, but also found that it works for more complicated kernels/host functions. Also, I decided to time these to see if they were somehow compiled to equivalent operations:

#include "host_launch.h"
#include <iostream>
#include <assert.h>
using namespace std;

// Times `copy_count` back-to-back launches of `f` (each copying one
// 32-int chunk) and returns the mean time per launch in milliseconds.
// `f` is either the kernel wrapper (CopyKernel) or the host wrapper
// (Copy), both launched through <<< >>> on the pointer.
//
// Fixes over the original:
//  - The staging buffers were variable-length arrays of 32*copy_count
//    ints on the stack (~1.25 MB each for copy_count=10000) — VLAs are
//    non-standard C++ and this size risks stack overflow. They are now
//    heap-allocated.
//  - The two cudaEvent_t handles were leaked; they are now destroyed.
__host__ float MeanCopyTime(const int copy_count, void (*f)(const int*, int*)) {
  const int n = 32 * copy_count;
  const size_t bytes = n * sizeof(int);
  int* input = new int[n];
  int* output = new int[n];
  for (int i = 0; i < n; i++) {
    input[i] = i;
    output[i] = 0;
  }

  int* d_input = 0;
  int* d_output = 0;
  cudaMalloc((void**)&d_input, bytes);
  cudaMalloc((void**)&d_output, bytes);
  cudaMemcpy(d_input, input, bytes, cudaMemcpyHostToDevice);
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  // Only the launch loop sits between the two event records, so the
  // measured interval covers launch overhead plus kernel execution.
  cudaEventRecord(start);
  for (int i = 0; i < copy_count; i++)
    f<<<1,32>>>(d_input + i * 32, d_output + i * 32);
  cudaEventRecord(stop);

  cudaEventSynchronize(stop);
  float msecs = 0;
  cudaEventElapsedTime(&msecs, start, stop);
  cudaMemcpy(output, d_output, bytes, cudaMemcpyDeviceToHost);

  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  cudaFree(d_input);
  cudaFree(d_output);
  // Verify every chunk was actually copied before trusting the timing.
  for (int i = 0; i < n; i++) {
    assert(output[i] == i);
  }
  delete[] input;
  delete[] output;
  return msecs / copy_count;
}

// Benchmark driver: compares mean per-launch time of the true kernel
// against the host wrapper, both invoked through a function pointer.
int main() {
  const int copy_count = 10000;
  cout << endl;
  cout << "Average Kernel Launch Time: "
       << MeanCopyTime(copy_count, CopyKernel) << endl;
  cout << "Average Host Function Launch Time: "
       << MeanCopyTime(copy_count, Copy) << endl;
  cout << endl;
  return 0;
}

For my architecture, this returns:

Average Kernel Launch Time: 0.00420756
Average Host Function Launch Time: 0.169097

Again any thoughts on what's happening here would be greatly appreciated.

1

1 Answer

1
votes

I can see why this might be a bit confusing, but despite what you might think is happening, Copy is never running on the GPU. The CopyKernel is being called three times on the device, but all of the launches are being initiated on the host. Here is how.

The first insight required is to demystify how kernels are compiled and how their launches actually work in the CUDA runtime API. When nvcc compiles your CopyKernel and a runtime API style launch for that kernel, a pair of host functions is emitted which looks like this:

// (Quoted verbatim for illustration: this is the host-side stub pair that
// nvcc itself emits for CopyKernel and its runtime-API launch — it is not
// part of the user's sources, so it is reproduced unmodified.)
void __device_stub__Z10CopyKernelPKiPi(const int *__par0, int *__par1)
{
    if (cudaSetupArgument((void *)(char *)&__par0, sizeof(__par0), (size_t)0Ui64) != cudaSuccess) return;
    if (cudaSetupArgument((void *)(char *)&__par1, sizeof(__par1), (size_t)8Ui64) != cudaSuccess) return;
    {
       volatile static char *__f; 
       __f = ((char *)((void ( *)(const int *, int *))CopyKernel)); 
       (void)cudaLaunch(((char *)((void ( *)(const int *, int *))CopyKernel)));
    };
}

// Host-callable wrapper with the kernel's own name; it only forwards the
// arguments to the stub above, which pushes them to the driver.
void CopyKernel( const int *__cuda_0,int *__cuda_1)
{
    __device_stub__Z10CopyKernelPKiPi( __cuda_0,__cuda_1);
}

These provide a wrapper around the necessary API calls to push the kernel arguments to the CUDA driver and launch the kernel. You will notice that the execution configuration for the kernel is not handled within these functions. Instead, whenever a CopyKernel<<< >>>() call is encountered by the preprocessor, this sort of code is emitted:

(cudaConfigureCall(1, 32)) ? (void)0 : (CopyKernel)(d_input, d_output); 

i.e. the kernel launch configuration is pushed to the driver, and then the wrapper function is called, where the arguments are pushed to the driver and the kernel launched.

So what happens in TestFunctionPointerLaunch? Basically the same thing. This code

f<<<1,32>>>(d_input, d_output);

is compiled to this by the CUDA front end preprocessor

(cudaConfigureCall(1, 32)) ? (void)0 : f(d_input, d_output); 

i.e. launch parameters for a kernel launch are pushed onto the driver, and the host function supplied as f is called. If f happened to be a kernel wrapper function (i.e. CopyKernel), then a kernel launch will result via the API calls which the wrapper contains, otherwise it won't. If f happens to be a host function which itself contains a runtime API kernel call (i.e. Copy), then that host code will do the same thing, and a kernel launch will eventually result, just further down the call stack.

This is how you can provide either CopyKernel or Copy as an argument to TestFunctionPointerLaunch and it will still work. Technically, it is undefined behaviour, because the way that kernel launches work internally inside the CUDA runtime API is deliberately opaque and implementation details might change over time. But right now it works.

The reason why

Copy<<<1,32>>>(d_input, d_output);

doesn't compile, is because Copy is a host function and nvcc can detect that at compile time -- in the language specification only __global__ functions can be launched, and the compiler enforces this check.

But when you pass a function pointer, the compiler cannot apply that check. The code which is produced happens to work with either a host function or a host kernel wrapper function because the runtime support code doesn't (and probably can't) emit code which could perform introspection on the function pointer and identify that the function pointer isn't going to call a kernel. So the language specification requirements are skipped and things accidentally work.

I would strongly recommend not trying to rely on this behaviour.