I would like to create a list of function pointers dynamically on the CPU (with some sort of `push_back()` method called from `main()`) and copy it to a GPU `__constant__` or `__device__` array, without needing to resort to static `__device__` function pointers. I believe this question is related to my problem; however, my goal is to create the `__host__` function pointer array iteratively and then copy it to the `__constant__` function pointer array, instead of initialising the latter on declaration.
A working code example with static function pointers (as seen here or here) would be:
#ifndef COMMON_H
#define COMMON_H
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#define num_functions 3
// Wraps a CUDA runtime call and reports any failure together with its call site.
// The do/while(0) wrapper makes the macro expand to a single statement, so it
// stays correct inside an unbraced if/else.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Prints a diagnostic to stderr when `code` is not cudaSuccess.
//   code  - status returned by a CUDA runtime call
//   file  - source file of the call site (supplied by gpuErrchk)
//   line  - source line of the call site (supplied by gpuErrchk)
//   abort - when true (the default), the process exits with `code` as its status
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
// fptr_t: pointer to a function returning void that takes two int lvalue
// references (int&, int&). The function list constant_fList stores entries
// of this type.
typedef void (*fptr_t)(int&, int&);
// some examples of void(int&, int&) functions...
// Device-side example: prints both operands and their sum.
__device__ void Add(int &a, int &b)
{
    const int sum = a + b;
    printf("Add... %i + %i = %i\n", a, b, sum);
}
// Device-side example: prints both operands and their difference (a - b).
__device__ void Subtract(int &a, int &b)
{
    const int difference = a - b;
    printf("Subtract... %i - %i = %i\n", a, b, difference);
}
// Device-side example: prints both operands and their product.
__device__ void Multiply(int &a, int &b)
{
    const int product = a * b;
    printf("Multiply... %i * %i = %i\n", a, b, product);
}
// List of function pointers in device constant memory (__constant__), with
// room for num_functions entries. kernel() reads it, indexing by threadIdx.x.
__constant__ fptr_t constant_fList[num_functions];
// Kernel called from main(): each thread selects the entry of constant_fList
// whose index equals its threadIdx.x and applies it to (a, b).
// Only threadIdx.x is used for the selection, so every block behaves the same;
// threads with threadIdx.x >= num_functions do nothing.
// Precondition: the selected constant_fList entries must hold valid
// device-side function addresses before the launch.
__global__ void kernel(int a, int b) {
    if (threadIdx.x < num_functions) {
        fptr_t f = constant_fList[threadIdx.x];
        f(a, b);  // indirect call through the selected function pointer
    }
}
#include "common.h"
// Static device function pointers: one __device__ variable per function, each
// initialised with the corresponding __device__ function. The host reads these
// variables back with cudaMemcpyFromSymbol in loadList_staticpointers().
__device__ fptr_t p_Add = Add;
__device__ fptr_t p_Sub = Subtract;
__device__ fptr_t p_Mul = Multiply;
// Load function list to constant memory.
// Reads the value stored in each static device pointer (p_Add, p_Sub, p_Mul)
// into a host staging array, then copies the whole array to constant_fList.
// Every CUDA call is checked through gpuErrchk.
void loadList_staticpointers() {
    fptr_t h_fList[num_functions];
    gpuErrchk( cudaMemcpyFromSymbol(&h_fList[0], p_Add, sizeof(fptr_t)) );
    gpuErrchk( cudaMemcpyFromSymbol(&h_fList[1], p_Sub, sizeof(fptr_t)) );
    gpuErrchk( cudaMemcpyFromSymbol(&h_fList[2], p_Mul, sizeof(fptr_t)) );
    gpuErrchk( cudaMemcpyToSymbol(constant_fList, h_fList, num_functions * sizeof(fptr_t)) );
}
// Entry point of the static-pointer example: fills constant_fList, then
// launches one block with one thread per list entry so that each thread
// calls one of the functions.
int main() {
    // constant_fList must be populated before the kernel reads it.
    loadList_staticpointers();

    int a = 12, b = 15;
    kernel<<<1, num_functions>>>(a, b);
    gpuErrchk( cudaGetLastError() );        // reports launch errors
    gpuErrchk( cudaDeviceSynchronize() );   // reports execution errors and flushes device printf output
    return 0;
}
Specs: GeForce GTX 670, compiled for `-arch=sm_30`, CUDA 6.5, Ubuntu 14.04.
I wish to avoid the use of static device function pointers, as appending each function would require code maintenance on the user side - declaration of a new static pointer like `p_Add` or `p_Mul`, manipulation of `void loadList_staticpointers()`, etc. To make it clear, I am trying something like the following (crashing) code:
#include "common.h"
#include <vector>
// Global variable: list of function pointers in host memory.
// Filled by addFunc() and copied to constant_fList by UploadVector().
std::vector<fptr_t> vec_fList;
// Appends one function pointer to the host-side list vec_fList.
void addFunc(fptr_t f)
{
    vec_fList.push_back(f);
}
// Upload the functions in the std::vector<fptr_t> to GPU memory.
// NOTE: the pointer values are copied exactly as stored in vec_fList. If they
// are CPU-side addresses, constant_fList ends up holding host addresses and
// the kernel call crashes.
// An empty list is a no-op; a list longer than constant_fList is rejected
// with a message instead of attempting an oversized copy.
void UploadVector() {
    const size_t count = vec_fList.size();
    if (count == 0) return;  // nothing to copy
    if (count > num_functions) {
        fprintf(stderr, "UploadVector: %lu functions registered, but constant_fList holds only %d\n",
                (unsigned long)count, num_functions);
        exit(1);
    }
    fptr_t* h_vpointer = vec_fList.data();
    gpuErrchk( cudaMemcpyToSymbol(constant_fList, h_vpointer, count * sizeof(fptr_t)) );
}
// Entry point of the dynamic-list attempt: registers the functions through
// addFunc(), uploads the host-side list, and launches the kernel.
// This is the approach that does NOT work: the values registered here are
// obtained in host code, so they are not usable device-side function addresses.
// The checks after the launch report the resulting failure instead of letting
// the program exit silently.
int main() {
    addFunc(Add);
    addFunc(Subtract);
    addFunc(Multiply);
    UploadVector();

    int a = 12, b = 15;
    kernel<<<1,3>>>(a, b); // Wrong to call a host-side function pointer from a kernel
    gpuErrchk( cudaGetLastError() );        // reports launch errors
    gpuErrchk( cudaDeviceSynchronize() );   // reports the execution error caused by the bad pointers
    return 0;
}
My understanding is that function pointers pointing to host addresses are copied to the GPU and are unusable by the kernel, which needs pointers pointing to GPU addresses when the function `f(a,b)` is called. Populating a host-side array with device-side pointers would work for me with raw data (see this question) but not with function pointers. Trivial attempts with Unified Memory have failed as well... so far, I have only found static device-side pointers to work. Is there no other way to copy a dynamically created CPU array of function pointers onto the GPU?
Why not use a `switch` inside the kernel? – m.s.
A `switch` inside the kernel involves manipulation of the code on the user side - each time they introduce a new function (that they already need to code), a new case has to be introduced for the `switch`. I do not want the user to need to understand CUDA mechanics but provide them with some `addFunc(f)` mechanism to be called from `main()`; the only C++ code would involve `f`. – user3519303