I am facing a problem compiling CUDA code that uses dynamic parallelism. Compilation and linking report no errors, but the generated file is not a valid executable.
Configuration:
Tesla K40, Ubuntu 14.04 LTS, CUDA 7.5
Compilation Command:
nvcc -o cdp -rdc=true -dc -dlink -arch=sm_35 cdp.cu -lcudadevrt
Code:
#include <iostream>
#include <cuda_runtime.h>
using namespace std;
// Search kernel: each thread inspects one element of `data` (1D launch,
// global thread id = element position). Every thread whose element equals
// `value` folds its position into *index with atomicMin, so after the grid
// completes *index holds the minimum of its initial value and all matching
// positions.
__global__ void kernel_find(int* data, int count, int value, int* index)
{
    const int pos = threadIdx.x + blockDim.x * blockIdx.x;

    // Threads past the end of the array have nothing to inspect.
    if (pos >= count)
        return;

    // Non-matching elements do not contribute to the result.
    if (data[pos] != value)
        return;

    atomicMin(index, pos);
}
// Finds the smallest position in data[0..count) whose element equals `value`
// by launching kernel_find and waiting for it to finish.
//
// Returns that position, or `count` when the value is absent. `count` is also
// returned when the scratch allocation or the synchronization fails, so
// callers can keep treating "result >= count" as "no usable index".
//
// Callable from both host and device code. When called from device code the
// launch below is a nested (dynamic parallelism) launch, so the translation
// unit must be built with relocatable device code and linked against the
// device runtime library.
__host__ __device__ int find_device(int* data, int count, int value)
{
    // Nothing to search: a launch here would have an empty grid and leave the
    // result at `count` anyway.
    if (count <= 0)
        return count;

    // The result cell must be readable by both the launched kernel and the
    // code in this function. On the device path the device heap is used; on
    // the host path a plain host `new` would hand the kernel a host pointer,
    // so a managed allocation is used instead.
    int* idx = NULL;
#ifdef __CUDA_ARCH__
    idx = new int;
    if (idx == NULL)
        return count;
#else
    if (cudaMallocManaged(&idx, sizeof(int)) != cudaSuccess)
        return count;
#endif

    // Start at `count` so that atomicMin in kernel_find leaves it untouched
    // when no element matches.
    *idx = count;

    dim3 block(8);
    dim3 grid((count + block.x - 1) / block.x);
    kernel_find<<<grid, block>>>(data, count, value, idx);

    // Only trust the result cell if the kernel is known to have completed.
    const bool finished = (cudaDeviceSynchronize() == cudaSuccess);
    const int retval = finished ? *idx : count;

#ifdef __CUDA_ARCH__
    delete idx;
#else
    cudaFree(idx);
#endif
    return retval;
}
// Bulk lookup kernel (1D launch): thread q reads toFind[q], searches `data`
// for it through find_device(), and writes the returned index to
// foundIndices[q]. Threads with q >= toFindCount do nothing.
__global__ void kernel_find_bulk(int* data, int count, const int* toFind, int* foundIndices, int toFindCount)
{
    const int query = threadIdx.x + blockDim.x * blockIdx.x;

    // Surplus threads in the last block have no query to serve.
    if (query >= toFindCount)
        return;

    const int wanted = toFind[query];
    foundIndices[query] = find_device(data, count, wanted);
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool checkCuda(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
        return true;
    cerr << what << " failed: " << cudaGetErrorString(status) << endl;
    return false;
}

// Fills a managed array with pseudo-random values in [0, 30), looks up the
// values 0..9 on the GPU with kernel_find_bulk, and prints where each was
// found. Returns 0 on success and 1 if any CUDA call fails.
int main()
{
    const int count = 100, toFindCount = 10;
    int *data = NULL, *toFind = NULL, *foundIndices = NULL;

    // Short-circuiting stops at the first failed allocation; pointers that
    // were never allocated stay NULL, which cudaFree accepts below.
    bool ok = checkCuda(cudaMallocManaged(&data, count * sizeof(int)), "cudaMallocManaged(data)")
           && checkCuda(cudaMallocManaged(&toFind, toFindCount * sizeof(int)), "cudaMallocManaged(toFind)")
           && checkCuda(cudaMallocManaged(&foundIndices, toFindCount * sizeof(int)), "cudaMallocManaged(foundIndices)");

    if (ok)
    {
        for (int i = 0; i < count; i++)
        {
            data[i] = rand() % 30;
        }
        for (int i = 0; i < toFindCount; i++)
        {
            toFind[i] = i;
        }

        dim3 block(8);
        dim3 grid((toFindCount + block.x - 1) / block.x);
        kernel_find_bulk<<<grid, block>>>(data, count, toFind, foundIndices, toFindCount);

        // A launch does not return an error itself: configuration errors show
        // up via cudaGetLastError, execution errors at the synchronization.
        // The host must not read the managed buffers before the sync succeeds.
        ok = checkCuda(cudaGetLastError(), "kernel_find_bulk launch")
          && checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
    }

    if (ok)
    {
        for (int i = 0; i < toFindCount; i++)
        {
            // find_device reports "absent" as an index equal to count.
            if (foundIndices[i] < count)
            {
                cout<<toFind[i]<<" found at index "<<foundIndices[i]<<endl;
            }
            else
            {
                cout<<toFind[i]<<" not found"<<endl;
            }
        }
    }

    // Release the managed buffers on every path.
    cudaFree(data);
    cudaFree(toFind);
    cudaFree(foundIndices);

    return ok ? 0 : 1;
}
If I try to run the executable, I get a `Permission denied` error. If the permissions are changed forcefully using `chmod`, the error changes to `cannot execute binary file: Exec format error`.
I can't figure out the solution, as CUDA dynamic parallelism samples are running fine and CUDA programs without Dynamic Parallelism are also working fine. Any help would be appreciated.
Output of the `file` command:
cdp: ELF 64-bit LSB relocatable, x86-64, version 1 (SYSV), not stripped
Output of the `objdump -f` command:
cdp:     file format elf64-x86-64
architecture: i386:x86-64, flags 0x00000011:
HAS_RELOC, HAS_SYMS
start address 0x0000000000000000
Comments:
… `file` or `objdump` tell you about the emitted compiler output. I would guess it is a device ELF object file. – talonmies
… `-rdc=true` (only). `-dc` is in many ways like the `-c` switch. If you specify that, you will get a device compile only, regardless of other switches specified. This creates a non-executable object, which is the source of some of the difficulties you outline in your question (and `-o cdp` further clouds the affair). Note that multiple source files and/or objects can be specified using this approach (single command). – Robert Crovella