I ran into a compilation problem when trying to compile an MPI+CUDA mixed program with dynamic parallelism support.

Source code of dyn_pal.cu:

#include <stdio.h>
#include <cuda.h>
#define N 100
#define M 32
#define K 2
__device__ volatile int vint = 0;
__global__ void entry( volatile int* foo ) {
    for (int i = 0; i < N; ++i) {
        atomicAdd((int*)foo, 1);
    }
}

extern "C" __global__ void diverge_cta( volatile int *foo )
{
    __shared__ int x;
    if ((threadIdx.x % 32) != 0) { return; }   // only one thread per warp continues
    // entry(foo);   // original design: each thread calls entry()
    if (threadIdx.x == 0) {
        entry<<<1,M>>>( foo );                 // device-side launch (dynamic parallelism)
        cudaDeviceSynchronize();
        x = 5;
        return;
    }
    __syncthreads();
    atomicAdd((int*)foo, x);
}

extern "C" void mycal(int myrank){
    int *foo; int h_foo;
    cudaMalloc((void**)&foo, sizeof(int)); cudaMemset(foo, 0, sizeof(int));
    printf("foo addr: 0x%x\n", (unsigned)(size_t)foo);
    diverge_cta<<<K,M*32>>>( foo );
    cudaDeviceSynchronize();
    cudaMemcpy(&h_foo, foo, sizeof(int), cudaMemcpyDeviceToHost);
    if (h_foo == K*(M*N+5*(M-1))) {
        printf("simple_scan_test test PASSED\n");
    } else {
        printf("Result: %d\n", h_foo); printf("simple_scan_test test FAILED\n");
    }
}
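(For reference, the expected value K*(M*N+5*(M-1)) breaks down as follows: in each of the K blocks, the child kernel entry<<<1,M>>> contributes M threads × N increments = M*N, and each of the remaining M-1 warp leaders adds x = 5.)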

Source code of the MPI driver, indexed_gpu.c:

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
void diverge_cta( volatile int *foo);
void mycal(int myrank);

int main(int argc, char *argv[]){
  int myid, numprocs, i, j, k;
  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
  MPI_Comm_rank(MPI_COMM_WORLD, &myid);
  mycal(myid);
  MPI_Barrier(MPI_COMM_WORLD);
  MPI_Finalize();
  return EXIT_SUCCESS;
}

The compilation commands I used:

  • nvcc -arch=sm_35 -dc dyn_pal.cu -o dynpal.o -lcudadevrt
  • mpicc -c indexed_gpu.c -o mpi_bench.o
  • mpicc mpi_bench.o dynpal.o -o gpu_idx -L/opt/cuda/5.5/lib64 -lcudart

Error I got:

dynpal.o: In function `__sti____cudaRegisterAll_42_tmpxft_000044ae_00000000_6_dyn_pal_cpp1_ii_vint()':
tmpxft_000044ae_00000000-3_dyn_pal.cudafe1.cpp:(.text+0x314): undefined reference to `__cudaRegisterLinkedBinary_42_tmpxft_000044ae_00000000_6_dyn_pal_cpp1_ii_vint'
collect2: ld returned 1 exit status

By the way, if I omit '-dc' when compiling the CUDA object file (the first command above), I get this error instead:

dyn_pal.cu(25): error: kernel launch from __device__ or __global__ functions requires separate compilation mode
1 error detected in the compilation of "/tmp/tmpxft_00004303_00000000-6_dyn_pal.cpp1.ii".

Without MPI, the pure CUDA version with dynamic parallelism support compiles and runs successfully on my Kepler GPUs.
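(Presumably via a single nvcc invocation such as nvcc -arch=sm_35 -rdc=true test_main.cu dyn_pal.cu -o dyn_test -lcudadevrt, where test_main.cu is a hypothetical non-MPI driver that calls mycal; when nvcc performs the final link itself, the device-link step happens implicitly.)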

I am wondering: is CUDA dynamic parallelism supported in mixed MPI+CUDA programs?

Thanks a lot.


1 Answer

  1. I suspect the indexed_gpu.c source code you have posted isn't really what you are using. You don't need to include cuda.h and cuda_runtime.h in that file, but according to my understanding of MPI, you do need to include mpi.h (see the sketch after this list).
  2. With this compile command: nvcc -arch=sm_35 -dc dyn_pal.cu -o dynpal.o -lcudadevrt, the -lcudadevrt switch is unnecessary. It is a linker switch, and this command doesn't link anything.
  3. Referring to the nvcc documentation, when the host linker is used for the final link of the executable (as with mpicc here), an extra device-link step is required under separate compilation and linking (which dynamic parallelism needs).
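For reference, a minimal sketch of the MPI driver with those include changes applied (same body as the posted file, only the headers and declarations differ):

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>            /* required for MPI_Init, MPI_Comm_rank, etc. */

void mycal(int myrank);     /* defined in dyn_pal.cu */

int main(int argc, char *argv[]){
  int myid, numprocs;
  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
  MPI_Comm_rank(MPI_COMM_WORLD, &myid);
  mycal(myid);
  MPI_Barrier(MPI_COMM_WORLD);
  MPI_Finalize();
  return EXIT_SUCCESS;
}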

With the following sequence of commands and the above changes, I was able to compile and link your code:

nvcc -arch=sm_35 -dc dyn_pal.cu -o dynpal.o
mpicc -c indexed_gpu.c -o mpi_bench.o
nvcc -arch=sm_35 -dlink dynpal.o -o dynpal_link.o -lcudadevrt
mpicc mpi_bench.o dynpal.o dynpal_link.o -o gpu_idx -L/opt/cuda/5.5/lib64 -lcudart
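The extra nvcc -dlink step produces dynpal_link.o, which contains the device-code linkage (including the cudadevrt pieces that dynamic parallelism depends on) that a host linker such as the one mpicc invokes cannot generate on its own; both dynpal.o and dynpal_link.o must then be passed to the final host link.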