Dynamic Parallelism Invalid File Format

Question

I am facing a problem in correctly compiling CUDA code containing dynamic parallelism. The problem is that compilation and linking show no error, but the generated file is invalid executable.

Configuration:

Tesla K40, Ubuntu 14.04 LTS, CUDA 7.5

Compilation Command:

nvcc -o cdp -rdc=true -dc -dlink -arch=sm_35 cdp.cu -lcudadevrt

Code:

#include <iostream>
#include <cuda_runtime.h>

using namespace std;

__global__ void kernel_find(int* data, int count, int value, int* index)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if(idx<count)
    {
        bool exists = (data[idx] == value);

        if(exists)
            atomicMin(index, idx);
    }
}

__host__ __device__ int find_device(int* data, int count, int value)
{
    int* idx = new int;
    (*idx) = count;

    dim3 block(8);
    dim3 grid((count + block.x - 1)/block.x);

    kernel_find<<<grid, block>>>(data, count, value, idx);

    cudaDeviceSynchronize();

    int retval = *idx;
    delete idx;

    return retval;
}

__global__ void kernel_find_bulk(int* data, int count, const int* toFind, int* foundIndices, int toFindCount)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if(idx<toFindCount)
    {
        int val = toFind[idx];

        int foundIndex = find_device(data, count, val);

        foundIndices[idx] = foundIndex;
    }
}

int main()
{
    const int count = 100, toFindCount = 10;

    int *data, *toFind, *foundIndices;

    cudaMallocManaged(&data, count * sizeof(int));
    cudaMallocManaged(&toFind, toFindCount * sizeof(int));
    cudaMallocManaged(&foundIndices, toFindCount * sizeof(int));

    for(int i=0; i<count; i++)
    {
        data[i] = rand() % 30;
    }

    for(int i=0; i<toFindCount; i++)
    {
        toFind[i] = i;
    }


    dim3 block(8);
    dim3 grid((toFindCount + block.x - 1)/block.x);

    kernel_find_bulk<<<grid, block>>>(data, count, toFind, foundIndices, toFindCount);

    cudaDeviceSynchronize();


    for(int i=0; i<toFindCount; i++)
    {
        if(foundIndices[i] < count)
        {
            cout<<toFind[i]<<" found at index "<<foundIndices[i]<<endl;
        }
        else
        {
            cout<<toFind[i]<<" not found"<<endl;
        }
    }

    return 0;
}

If I try to run the executable, I get Permission denied error. If permissions are changed forcefully using chmod, the error changes to cannot execute binary file: Exec format error.

I can't figure out the solution, as CUDA dynamic parallelism samples are running fine and CUDA programs without Dynamic Parallelism are also working fine. Any help would be appreciated.

Output of `file` command:

cdp: ELF 64-bit LSB relocatable, x86-64, version 1 (SYSV), not stripped

Output of `objdump -f` command:

cdp: file format elf64-x86-64 architecture: i386:x86-64, flags 0x00000011: HAS_RELOC, HAS_SYMS start address 0x0000000000000000

I don't think you can do the device compile, device link, and host link in a single command. What does file or objdump tell you about the emitted compiler output. I would guess it is a device elf object file. — talonmies
@talonmies... output of both commands added. Do I have to compile and link separately? — sgarizvi
device compile, device link, and final link can be performed in a single command. As the answer by @talonmies indicates, the correct command line switch for that is -rdc=true (only). -dc is in many ways like the -c switch. If you specify that, you will get a device compile only, regardless of other switches specified. This creates a non-executable object, which is the source of some of the difficulties you outline in your question (and -o cdp further clouds the affair.) Note that multiple source files and/or objects can be specified using this approach (single command). — Robert Crovella
@RobertCrovella... Thankyou for you input. Eventually, I ended up doing separate compilation. — sgarizvi

talonmies talonmies · Accepted Answer · 2016-02-09T11:58:44

If you run your compile command with the --dryrun option:

$ nvcc --dryrun -o cdp -rdc=true -dc -dlink -arch=sm_35 cdp.cu -lcudadevrt
#$ _SPACE_= 
#$ _CUDART_=cudart
#$ _HERE_=/opt/cuda-7.5/bin
#$ _THERE_=/opt/cuda-7.5/bin
#$ _TARGET_SIZE_=
#$ _TARGET_DIR_=
#$ _TARGET_SIZE_=64
#$ TOP=/opt/cuda-7.5/bin/..
#$ NVVMIR_LIBRARY_DIR=/opt/cuda-7.5/bin/../nvvm/libdevice
#$ LD_LIBRARY_PATH=/opt/cuda-7.5/bin/../lib:/opt/cuda-7.5/lib64
#$ PATH=/opt/cuda-7.5/bin/../open64/bin:/opt/cuda-7.5/bin/../nvvm/bin:/opt/cuda-7.5/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/opt/cuda-7.5/bin
#$ INCLUDES="-I/opt/cuda-7.5/bin/..//include"  
#$ LIBRARIES=  "-L/opt/cuda-7.5/bin/..//lib64/stubs" "-L/opt/cuda-7.5/bin/..//lib64"
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ gcc -D__CUDA_ARCH__=350 -E -x c++     -DCUDA_DOUBLE_MATH_FUNCTIONS  -D__CUDACC__ -D__NVCC__ -D__CUDACC_RDC__  "-I/opt/cuda-7.5/bin/..//include"   -D"__CUDACC_VER__=70517" -D"__CUDACC_VER_BUILD__=17" -D"__CUDACC_VER_MINOR__=5" -D"__CUDACC_VER_MAJOR__=7" -include "cuda_runtime.h" -m64 "cdp.cu" > "/tmp/tmpxft_000022ba_00000000-7_cdp.cpp1.ii" 
#$ cudafe --allow_managed --m64 --gnu_version=40603 -tused --no_remove_unneeded_entities  --device-c --gen_c_file_name "/tmp/tmpxft_000022ba_00000000-4_cdp.cudafe1.c" --stub_file_name "/tmp/tmpxft_000022ba_00000000-4_cdp.cudafe1.stub.c" --gen_device_file_name "/tmp/tmpxft_000022ba_00000000-4_cdp.cudafe1.gpu" --nv_arch "compute_35" --gen_module_id_file --module_id_file_name "/tmp/tmpxft_000022ba_00000000-3_cdp.module_id" --include_file_name "tmpxft_000022ba_00000000-2_cdp.fatbin.c" "/tmp/tmpxft_000022ba_00000000-7_cdp.cpp1.ii" 
#$ gcc -D__CUDA_ARCH__=350 -E -x c     -DCUDA_DOUBLE_MATH_FUNCTIONS  -D__CUDACC__ -D__NVCC__ -D__CUDACC_RDC__ -D__CUDANVVM__  -D__CUDA_PREC_DIV -D__CUDA_PREC_SQRT "-I/opt/cuda-7.5/bin/..//include"   -m64 "/tmp/tmpxft_000022ba_00000000-4_cdp.cudafe1.gpu" > "/tmp/tmpxft_000022ba_00000000-8_cdp.cpp2.i" 
#$ cudafe -w --allow_managed --m64 --gnu_version=40603 --c  --device-c --gen_c_file_name "/tmp/tmpxft_000022ba_00000000-9_cdp.cudafe2.c" --stub_file_name "/tmp/tmpxft_000022ba_00000000-9_cdp.cudafe2.stub.c" --gen_device_file_name "/tmp/tmpxft_000022ba_00000000-9_cdp.cudafe2.gpu" --nv_arch "compute_35" --module_id_file_name "/tmp/tmpxft_000022ba_00000000-3_cdp.module_id" --include_file_name "tmpxft_000022ba_00000000-2_cdp.fatbin.c" "/tmp/tmpxft_000022ba_00000000-8_cdp.cpp2.i" 
#$ gcc -D__CUDA_ARCH__=350 -E -x c     -DCUDA_DOUBLE_MATH_FUNCTIONS  -D__CUDABE__ -D__CUDANVVM__  -D__CUDA_PREC_DIV -D__CUDA_PREC_SQRT "-I/opt/cuda-7.5/bin/..//include"   -m64 "/tmp/tmpxft_000022ba_00000000-9_cdp.cudafe2.gpu" > "/tmp/tmpxft_000022ba_00000000-10_cdp.cpp3.i" 
#$ filehash -s "--compile-only  " "/tmp/tmpxft_000022ba_00000000-10_cdp.cpp3.i" > "/tmp/tmpxft_000022ba_00000000-11_cdp.hash"
#$ gcc -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_RDC__  "-I/opt/cuda-7.5/bin/..//include"   -D"__CUDACC_VER__=70517" -D"__CUDACC_VER_BUILD__=17" -D"__CUDACC_VER_MINOR__=5" -D"__CUDACC_VER_MAJOR__=7" -include "cuda_runtime.h" -m64 "cdp.cu" > "/tmp/tmpxft_000022ba_00000000-5_cdp.cpp4.ii" 
#$ cudafe++ --allow_managed --m64 --gnu_version=40603 --parse_templates  --device-c --gen_c_file_name "/tmp/tmpxft_000022ba_00000000-4_cdp.cudafe1.cpp" --stub_file_name "tmpxft_000022ba_00000000-4_cdp.cudafe1.stub.c" --module_id_file_name "/tmp/tmpxft_000022ba_00000000-3_cdp.module_id" "/tmp/tmpxft_000022ba_00000000-5_cdp.cpp4.ii" 
#$ cicc  -arch compute_35 -m64 -ftz=0 -prec_div=1 -prec_sqrt=1 -fmad=1 -nvvmir-library "/opt/cuda-7.5/bin/../nvvm/libdevice/libdevice.compute_35.10.bc"  --device-c --orig_src_file_name "cdp.cu"  "/tmp/tmpxft_000022ba_00000000-10_cdp.cpp3.i" -o "/tmp/tmpxft_000022ba_00000000-6_cdp.ptx"
#$ ptxas  -arch=sm_35 -m64 --compile-only  "/tmp/tmpxft_000022ba_00000000-6_cdp.ptx"  -o "/tmp/tmpxft_000022ba_00000000-13_cdp.sm_35.cubin" 
#$ fatbinary --create="/tmp/tmpxft_000022ba_00000000-2_cdp.fatbin" -64 --key="xxxxxxxxxx" --cmdline="--compile-only  " "--image=profile=sm_35,file=/tmp/tmpxft_000022ba_00000000-13_cdp.sm_35.cubin" "--image=profile=compute_35,file=/tmp/tmpxft_000022ba_00000000-6_cdp.ptx" --embedded-fatbin="/tmp/tmpxft_000022ba_00000000-2_cdp.fatbin.c" --cuda --device-c
#$ rm /tmp/tmpxft_000022ba_00000000-2_cdp.fatbin
#$ gcc -D__CUDA_ARCH__=350 -E -x c++     -DCUDA_DOUBLE_MATH_FUNCTIONS   -D__CUDA_PREC_DIV -D__CUDA_PREC_SQRT "-I/opt/cuda-7.5/bin/..//include"   -m64 "/tmp/tmpxft_000022ba_00000000-4_cdp.cudafe1.cpp" > "/tmp/tmpxft_000022ba_00000000-14_cdp.ii" 
#$ gcc -c -x c++ "-I/opt/cuda-7.5/bin/..//include"   -fpreprocessed -m64 -o "cdp" "/tmp/tmpxft_000022ba_00000000-14_cdp.ii"

it becomes obvious that this has only emitted a host object file with an embedded cubin payload. There is no host code compilation or linking to an executable, which is confirmed by the output of objdump posted in an edit to your question.

The complicating factor here is that you must perform device independent compilation to use dynamic parallelism and then link the device code, but you only have a single source file, so the conventional build approach (device compile, device link, host compile) would fail with duplicate symbols.

The solution seems to be this:

$ nvcc --dryrun -o cdp -rdc=true  -arch=sm_35 cdp.cu 
#$ _SPACE_= 
#$ _CUDART_=cudart
#$ _HERE_=/opt/cuda-7.5/bin
#$ _THERE_=/opt/cuda-7.5/bin
#$ _TARGET_SIZE_=
#$ _TARGET_DIR_=
#$ _TARGET_SIZE_=64
#$ TOP=/opt/cuda-7.5/bin/..
#$ NVVMIR_LIBRARY_DIR=/opt/cuda-7.5/bin/../nvvm/libdevice
#$ LD_LIBRARY_PATH=/opt/cuda-7.5/bin/../lib:/opt/cuda-7.5/lib64
#$ PATH=/opt/cuda-7.5/bin/../open64/bin:/opt/cuda-7.5/bin/../nvvm/bin:/opt/cuda-7.5/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/opt/cuda-7.5/bin
#$ INCLUDES="-I/opt/cuda-7.5/bin/..//include"  
#$ LIBRARIES=  "-L/opt/cuda-7.5/bin/..//lib64/stubs" "-L/opt/cuda-7.5/bin/..//lib64"
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ gcc -D__CUDA_ARCH__=350 -E -x c++     -DCUDA_DOUBLE_MATH_FUNCTIONS  -D__CUDACC__ -D__NVCC__ -D__CUDACC_RDC__  "-I/opt/cuda-7.5/bin/..//include"   -D"__CUDACC_VER__=70517" -D"__CUDACC_VER_BUILD__=17" -D"__CUDACC_VER_MINOR__=5" -D"__CUDACC_VER_MAJOR__=7" -include "cuda_runtime.h" -m64 "cdp.cu" > "/tmp/tmpxft_00002454_00000000-9_cdp.cpp1.ii" 
#$ cudafe --allow_managed --m64 --gnu_version=40603 -tused --no_remove_unneeded_entities  --device-c --gen_c_file_name "/tmp/tmpxft_00002454_00000000-4_cdp.cudafe1.c" --stub_file_name "/tmp/tmpxft_00002454_00000000-4_cdp.cudafe1.stub.c" --gen_device_file_name "/tmp/tmpxft_00002454_00000000-4_cdp.cudafe1.gpu" --nv_arch "compute_35" --gen_module_id_file --module_id_file_name "/tmp/tmpxft_00002454_00000000-3_cdp.module_id" --include_file_name "tmpxft_00002454_00000000-2_cdp.fatbin.c" "/tmp/tmpxft_00002454_00000000-9_cdp.cpp1.ii" 
#$ gcc -D__CUDA_ARCH__=350 -E -x c     -DCUDA_DOUBLE_MATH_FUNCTIONS  -D__CUDACC__ -D__NVCC__ -D__CUDACC_RDC__ -D__CUDANVVM__  -D__CUDA_PREC_DIV -D__CUDA_PREC_SQRT "-I/opt/cuda-7.5/bin/..//include"   -m64 "/tmp/tmpxft_00002454_00000000-4_cdp.cudafe1.gpu" > "/tmp/tmpxft_00002454_00000000-10_cdp.cpp2.i" 
#$ cudafe -w --allow_managed --m64 --gnu_version=40603 --c  --device-c --gen_c_file_name "/tmp/tmpxft_00002454_00000000-11_cdp.cudafe2.c" --stub_file_name "/tmp/tmpxft_00002454_00000000-11_cdp.cudafe2.stub.c" --gen_device_file_name "/tmp/tmpxft_00002454_00000000-11_cdp.cudafe2.gpu" --nv_arch "compute_35" --module_id_file_name "/tmp/tmpxft_00002454_00000000-3_cdp.module_id" --include_file_name "tmpxft_00002454_00000000-2_cdp.fatbin.c" "/tmp/tmpxft_00002454_00000000-10_cdp.cpp2.i" 
#$ gcc -D__CUDA_ARCH__=350 -E -x c     -DCUDA_DOUBLE_MATH_FUNCTIONS  -D__CUDABE__ -D__CUDANVVM__  -D__CUDA_PREC_DIV -D__CUDA_PREC_SQRT "-I/opt/cuda-7.5/bin/..//include"   -m64 "/tmp/tmpxft_00002454_00000000-11_cdp.cudafe2.gpu" > "/tmp/tmpxft_00002454_00000000-12_cdp.cpp3.i" 
#$ filehash -s "--compile-only  " "/tmp/tmpxft_00002454_00000000-12_cdp.cpp3.i" > "/tmp/tmpxft_00002454_00000000-13_cdp.hash"
#$ gcc -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_RDC__  "-I/opt/cuda-7.5/bin/..//include"   -D"__CUDACC_VER__=70517" -D"__CUDACC_VER_BUILD__=17" -D"__CUDACC_VER_MINOR__=5" -D"__CUDACC_VER_MAJOR__=7" -include "cuda_runtime.h" -m64 "cdp.cu" > "/tmp/tmpxft_00002454_00000000-5_cdp.cpp4.ii" 
#$ cudafe++ --allow_managed --m64 --gnu_version=40603 --parse_templates  --device-c --gen_c_file_name "/tmp/tmpxft_00002454_00000000-4_cdp.cudafe1.cpp" --stub_file_name "tmpxft_00002454_00000000-4_cdp.cudafe1.stub.c" --module_id_file_name "/tmp/tmpxft_00002454_00000000-3_cdp.module_id" "/tmp/tmpxft_00002454_00000000-5_cdp.cpp4.ii" 
#$ cicc  -arch compute_35 -m64 -ftz=0 -prec_div=1 -prec_sqrt=1 -fmad=1 -nvvmir-library "/opt/cuda-7.5/bin/../nvvm/libdevice/libdevice.compute_35.10.bc"  --device-c --orig_src_file_name "cdp.cu"  "/tmp/tmpxft_00002454_00000000-12_cdp.cpp3.i" -o "/tmp/tmpxft_00002454_00000000-6_cdp.ptx"
#$ ptxas  -arch=sm_35 -m64 --compile-only  "/tmp/tmpxft_00002454_00000000-6_cdp.ptx"  -o "/tmp/tmpxft_00002454_00000000-15_cdp.sm_35.cubin" 
#$ fatbinary --create="/tmp/tmpxft_00002454_00000000-2_cdp.fatbin" -64 --key="xxxxxxxxxx" --cmdline="--compile-only  " "--image=profile=sm_35,file=/tmp/tmpxft_00002454_00000000-15_cdp.sm_35.cubin" "--image=profile=compute_35,file=/tmp/tmpxft_00002454_00000000-6_cdp.ptx" --embedded-fatbin="/tmp/tmpxft_00002454_00000000-2_cdp.fatbin.c" --cuda --device-c
#$ rm /tmp/tmpxft_00002454_00000000-2_cdp.fatbin
#$ gcc -D__CUDA_ARCH__=350 -E -x c++     -DCUDA_DOUBLE_MATH_FUNCTIONS   -D__CUDA_PREC_DIV -D__CUDA_PREC_SQRT "-I/opt/cuda-7.5/bin/..//include"   -m64 "/tmp/tmpxft_00002454_00000000-4_cdp.cudafe1.cpp" > "/tmp/tmpxft_00002454_00000000-16_cdp.ii" 
#$ gcc -c -x c++ "-I/opt/cuda-7.5/bin/..//include"   -fpreprocessed -m64 -o "/tmp/tmpxft_00002454_00000000-17_cdp.o" "/tmp/tmpxft_00002454_00000000-16_cdp.ii" 
#$ nvlink --arch=sm_35 --register-link-binaries="/tmp/tmpxft_00002454_00000000-7_cdp_dlink.reg.c" -m64   "-L/opt/cuda-7.5/bin/..//lib64/stubs" "-L/opt/cuda-7.5/bin/..//lib64" -cpu-arch=X86_64 "/tmp/tmpxft_00002454_00000000-17_cdp.o"  -lcudadevrt  -o "/tmp/tmpxft_00002454_00000000-18_cdp_dlink.sm_35.cubin"
#$ fatbinary --create="/tmp/tmpxft_00002454_00000000-8_cdp_dlink.fatbin" -64 --key="cdp_dlink" --cmdline="--compile-only  " -link "--image=profile=sm_35,file=/tmp/tmpxft_00002454_00000000-18_cdp_dlink.sm_35.cubin" --embedded-fatbin="/tmp/tmpxft_00002454_00000000-8_cdp_dlink.fatbin.c" 
#$ rm /tmp/tmpxft_00002454_00000000-8_cdp_dlink.fatbin
#$ gcc -c -x c++ -DFATBINFILE="\"/tmp/tmpxft_00002454_00000000-8_cdp_dlink.fatbin.c\"" -DREGISTERLINKBINARYFILE="\"/tmp/tmpxft_00002454_00000000-7_cdp_dlink.reg.c\"" -I. "-I/opt/cuda-7.5/bin/..//include"   -D"__CUDACC_VER__=70517" -D"__CUDACC_VER_BUILD__=17" -D"__CUDACC_VER_MINOR__=5" -D"__CUDACC_VER_MAJOR__=7" -m64 -o "/tmp/tmpxft_00002454_00000000-19_cdp_dlink.o" "/opt/cuda-7.5/bin/crt/link.stub" 
#$ g++ -m64 -o "cdp" -Wl,--start-group "/tmp/tmpxft_00002454_00000000-19_cdp_dlink.o" "/tmp/tmpxft_00002454_00000000-17_cdp.o"   "-L/opt/cuda-7.5/bin/..//lib64/stubs" "-L/opt/cuda-7.5/bin/..//lib64" -lcudadevrt  -lcudart_static  -lrt -lpthread  -ldl  -Wl,--end-group

i.e. just pass -rdc=true. It seems for the single source file case, the necessary device link stage in implicitly performed, and the result is an executable which should work:

$ nvcc -o cdp -rdc=true  -arch=sm_35 cdp.cu 
$ file cdp
cdp: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked (uses shared libs), for GNU/Linux 2.6.24, BuildID[sha1]=0xdcd6119fb9e2efdf2759093e8e9b762d0a55ddfd, not stripped

Note that I haven't run this because I am doing the build on a system with a GPU without dynamic parallelism support.