
I am running C and Python code that adds two arrays on the GPU, but I found that the Python code is about 100 times faster than the C code.

Here is my code:

Python (@cuda.jit)

import sys
import time
import numpy as np
from numba import cuda

@cuda.jit('void(float32[:], float32[:], float32[:])')
def cu_add(a,b,c):

    # compute the global thread index from block and thread indices
    bx = cuda.blockIdx.x
    bw = cuda.blockDim.x
    tx = cuda.threadIdx.x

    i = tx + bx * bw

    # guard against threads past the end of the arrays
    if i >= c.size:
        return

    c[i] = a[i] + b[i]



def main(num):

    device = cuda.get_current_device()

    #num = 100
    #Host memory

    a = np.full(num, 1.0, dtype = np.float32)
    b = np.full(num, 1.0, dtype = np.float32)


    #create device memory

    d_a = cuda.to_device(a)
    d_b = cuda.to_device(b)
    d_c = cuda.device_array_like(a)

    #tpb = device.WARP_SIZE
    tpb = 1024

    bpg = int(np.ceil(float(num)/tpb))

    print('Blocks per grid:', bpg)
    print('Threads per block', tpb)

    #launch kernel
    st = time.time()

    cu_add[bpg, tpb](d_a, d_b, d_c)

    et = time.time()

    print "Time taken ", (et - st), " seconds"
    c = d_c.copy_to_host()

    for i in range(1000):
        if c[i] != 2.0:
            raise Exception
    #print c
if __name__ == "__main__":
    main(int(sys.argv[1]))

Run: python numba_vec_add_float.py 697932185

Output: Blocks per grid: 681575 Threads per block 1024 Time taken 0.000330924987793 seconds

CUDA C

#define MEMSIZE (2.6L * 1024L * 1024L * 1024L)   /* ~2.6 GiB per array */
#include <stdio.h>
#include <stdlib.h>

__global__ void add(float *a, float *b, float *c, unsigned long long num) {
    unsigned long long idx = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (idx < num) {
        c[idx] = a[idx] + b[idx];
    }
}

int main() {

    cudaEvent_t start, stop;
    cudaError_t err;

    float *a, *b, *d_a, *c, *d_b, *d_c;
    unsigned long long num = MEMSIZE / 4;
    float elapsedTime;

    /* allocate device memory */
    err = cudaMalloc((void **)&d_a, MEMSIZE);
    if (err != cudaSuccess) {
        printf("failed to allocate memory to d_a\n");
        exit(0);
    }

    err = cudaMalloc((void **)&d_b, MEMSIZE);
    if (err != cudaSuccess) {
        printf("failed to allocate memory to d_b\n");
        exit(0);
    }

    err = cudaMalloc((void **)&d_c, MEMSIZE);
    if (err != cudaSuccess) {
        printf("failed to allocate memory to d_c\n");
        exit(0);
    }

    /* allocate and initialize host memory */
    a = (float *)malloc(MEMSIZE);
    if (a == NULL) {
        printf("failed to allocate memory to a\n");
        exit(0);
    }

    b = (float *)malloc(MEMSIZE);
    if (b == NULL) {
        printf("failed to allocate memory to b\n");
        exit(0);
    }

    c = (float *)malloc(MEMSIZE);
    if (c == NULL) {
        printf("failed to allocate memory to c\n");
        exit(0);
    }

    for (unsigned long long i = 0; i < num; i++) {
        float v = i / 1000.0;
        a[i] = v;
        b[i] = v;
    }

    /* copy the inputs to the device */
    err = cudaMemcpy(d_a, a, MEMSIZE, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        printf("failed to copy memory from host to device\n");
        exit(0);
    }

    err = cudaMemcpy(d_b, b, MEMSIZE, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        printf("failed to copy memory from host to device\n");
        exit(0);
    }

    /* launch configuration */
    int thr = 1024;
    long int bloc = (num / thr) + 1;
    printf("Blocks per grid: %ld", bloc);
    printf("\nThreads per block: %d", thr);

    /* time the kernel with CUDA events */
    cudaEventCreate(&start);
    cudaEventRecord(start, 0);

    add<<<bloc, thr>>>(d_a, d_b, d_c, num);

    cudaError_t errSync  = cudaGetLastError();
    cudaError_t errAsync = cudaDeviceSynchronize();
    if (errSync != cudaSuccess) {
        printf("Sync kernel error: %s\n", cudaGetErrorString(errSync));
        exit(0);
    }
    if (errAsync != cudaSuccess) {
        printf("Async kernel error: %s\n", cudaGetErrorString(errAsync));
        exit(0);
    }

    cudaEventCreate(&stop);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);

    printf("\nGPU time --> %f milliseconds\n", elapsedTime);
    printf("GPU time --> %f seconds\n", elapsedTime / 1000);

    /* copy the result back and clean up */
    err = cudaMemcpy(c, d_c, MEMSIZE, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        printf("failed to copy memory from device to host\n");
        exit(0);
    }

    free(a); free(b); free(c);

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    return 0;
}

Compile: nvcc --gpu-architecture=compute_61 nvidia_vector_addition.cu

Run: ./a.out

Output: Blocks per grid: 681575 Threads per block: 1024 GPU time --> 34.359295 milliseconds GPU time --> 0.034359 seconds

From these numbers, the @cuda.jit Python version appears to be about 103 times faster than the CUDA C version. Can anyone clarify what I am doing right or wrong?

You are not measuring the execution time correctly. – talonmies

1 Answer


In the numba case, you are only measuring kernel launch overhead, not the full time it takes to run the kernel: the launch is asynchronous, so the host call returns as soon as the kernel has been queued, well before it finishes executing. In the CUDA C case you are measuring the full time it takes to run the kernel, because cudaDeviceSynchronize() makes the host wait for the kernel to complete before the stop event is recorded.

To make the numba case perform a similar measurement to the CUDA C case, try this modification:

#launch kernel
mystream = cuda.stream()
st = time.time()

cu_add[bpg, tpb, mystream](d_a, d_b, d_c)
mystream.synchronize()   # wait for the kernel to finish before stopping the clock
et = time.time()

(from here).
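If you want the numba measurement to mirror the CUDA C version even more closely, numba also exposes CUDA events. The following is only a sketch, assuming numba's cuda.event() / cuda.event_elapsed_time() API and reusing d_a, d_b, d_c, bpg and tpb from the question:

from numba import cuda

# create timing-capable events (assumes the numba.cuda event API)
start = cuda.event(timing=True)
stop = cuda.event(timing=True)

start.record()
cu_add[bpg, tpb](d_a, d_b, d_c)
stop.record()
stop.synchronize()                          # block until the kernel and the stop event complete

ms = cuda.event_elapsed_time(start, stop)   # elapsed time in milliseconds
print("Kernel time:", ms, "milliseconds")

Like the cudaEvent_t timing in the C program, this measures the interval between two events recorded on the device, so it includes the kernel execution rather than just the launch overhead.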