I ran into a compilation problem when trying to compile an MPI+CUDA mixed program with dynamic parallelism support.
Source code of dyn_pal.cu:
#include <stdio.h>
#include <cuda.h>

#define N 100
#define M 32
#define K 2

__device__ volatile int vint = 0;

__global__ void entry( volatile int *foo ) {
    for (int i = 0; i < N; ++i) {
        atomicAdd((int *)foo, 1);
    }
}

extern "C" __global__ void diverge_cta( volatile int *foo )
{
    __shared__ int x;
    if ((threadIdx.x % 32) != 0) { return; }
    //// entry(foo); // original design: each thread calls entry()
    if (threadIdx.x == 0) {
        entry<<<1,M>>>( foo );   // device-side launch (dynamic parallelism)
        cudaDeviceSynchronize();
        x = 5;
        return;
    }
    __syncthreads();
    atomicAdd((int *)foo, x);
}
extern "C" void mycal(int myrank){
int *foo; int h_foo;
cudaMalloc((void**)&foo, sizeof(int)); cudaMemset(foo, 0, sizeof(int));
printf("foo addr: 0x%x\n", (unsigned)(size_t)foo);
diverge_cta<<<K,M*32>>>( foo );
cudaDeviceSynchronize();
cudaMemcpy(&h_foo, foo, sizeof(int), cudaMemcpyDeviceToHost);
if (h_foo == K*(M*N+5*(M-1))) {
printf("simple_scan_test test PASSED\n");
} else {
printf("Result: %d\n", h_foo); printf("simple_scan_test test FAILED\n");
}
}
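For reference, the expected value K*(M*N + 5*(M-1)) breaks down like this: in each of the K blocks, the child grid launched by thread 0 contributes M*N increments of 1, and each of the remaining M-1 warp leaders adds x = 5 after the __syncthreads(). With K = 2, M = 32, N = 100 that is 2 * (3200 + 155) = 6710.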
Source code of the MPI program, indexed_gpu.c:
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>   /* needed for the MPI_* calls below */
#include <cuda.h>
#include <cuda_runtime.h>

void diverge_cta( volatile int *foo );
void mycal(int myrank);

int main(int argc, char *argv[]) {
    int myid, numprocs;
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
    mycal(myid);
    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Finalize();
    return EXIT_SUCCESS;
}
The compilation commands I used:
- nvcc -arch=sm_35 -dc dyn_pal.cu -o dynpal.o -lcudadevrt
- mpicc -c indexed_gpu.c -o mpi_bench.o
- mpicc mpi_bench.o dynpal.o -o gpu_idx -L/opt/cuda/5.5/lib64 -lcudart
Error I got:
dynpal.o: In function `__sti____cudaRegisterAll_42_tmpxft_000044ae_00000000_6_dyn_pal_cpp1_ii_vint()':
tmpxft_000044ae_00000000-3_dyn_pal.cudafe1.cpp:(.text+0x314): undefined reference to `__cudaRegisterLinkedBinary_42_tmpxft_000044ae_00000000_6_dyn_pal_cpp1_ii_vint'
collect2: ld returned 1 exit status
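The undefined reference to __cudaRegisterLinkedBinary makes me suspect that the relocatable device code produced by -dc is never device-linked, because the final link is done by mpicc instead of nvcc. My guess (untested; the output name dynpal_link.o is just a placeholder) is that an explicit device-link step is needed before the host link, something like:
- nvcc -arch=sm_35 -dlink dynpal.o -o dynpal_link.o -lcudadevrt
- mpicc mpi_bench.o dynpal.o dynpal_link.o -o gpu_idx -L/opt/cuda/5.5/lib64 -lcudart -lcudadevrt
Can anyone confirm whether that is the right way to link?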
BTW, if I don't include '-dc' when compiling the CUDA object file (the first nvcc command above), I get this error instead:
dyn_pal.cu(25): error: kernel launch from device or global functions requires separate compilation mode
1 error detected in the compilation of "/tmp/tmpxft_00004303_00000000-6_dyn_pal.cpp1.ii".
That part makes sense to me, since device-side kernel launches require relocatable device code (-dc / -rdc=true).
If I don't use MPI at all, the pure CUDA program with dynamic parallelism support compiles and runs successfully on my Kepler GPUs.
I am wondering: is CUDA dynamic parallelism supported in mixed MPI+CUDA programs?
Thanks a lot.