I am running a C program and a Python program that each add two arrays on the GPU, but I found that the Python code appears to be about 100 times faster than the C code.
Here is my code:
@cuda.jit Python
import sys
import time
import numpy as np
from numba import cuda
@cuda.jit('void(float32[:], float32[:], float32[:])')
def cu_add(a, b, c):
    bx = cuda.blockIdx.x
    bw = cuda.blockDim.x
    tx = cuda.threadIdx.x
    i = tx + bx * bw
    if i >= c.size:  # guard threads past the end of the arrays
        return
    c[i] = a[i] + b[i]
def main(num):
    device = cuda.get_current_device()
    # num = 100
    # Host memory
    a = np.full(num, 1.0, dtype=np.float32)
    b = np.full(num, 1.0, dtype=np.float32)
    # Create device memory
    d_a = cuda.to_device(a)
    d_b = cuda.to_device(b)
    d_c = cuda.device_array_like(a)
    # tpb = device.WARP_SIZE
    tpb = 1024
    bpg = int(np.ceil(float(num) / tpb))
    print 'Blocks per grid:', bpg
    print 'Threads per block', tpb
    # Launch kernel
    st = time.time()
    cu_add[bpg, tpb](d_a, d_b, d_c)
    et = time.time()
    print "Time taken ", (et - st), " seconds"
    c = d_c.copy_to_host()
    for i in xrange(1000):
        if c[i] != 2.0:
            raise Exception
    # print c

if __name__ == "__main__":
    main(int(sys.argv[1]))
Run : python numba_vec_add_float.py 697932185
Output : Blocks per grid: 681575 Threads per block 1024 Time taken 0.000330924987793 seconds
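One caveat I am aware of: Numba kernel launches are asynchronous, so the time.time() pair above may only be capturing the launch overhead rather than the kernel execution itself. A minimal sketch of a blocking measurement, using cuda.synchronize() with the same d_a, d_b, d_c and launch configuration as above:

st = time.time()
cu_add[bpg, tpb](d_a, d_b, d_c)  # returns as soon as the launch is queued
cuda.synchronize()               # block until the kernel has actually finished
et = time.time()
print "Time taken ", (et - st), " seconds"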
CUDA C
#include <stdio.h>
#define MEMSIZE (2.6L * 1024L * 1024L * 1024L)  /* ~2.6 GiB per array */
__global__ void add(float *a, float *b, float *c, unsigned long long num) {
    unsigned long long idx = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (idx < num) {
        c[idx] = a[idx] + b[idx];
    }
}
int main() {
    cudaEvent_t start, stop;
    cudaError_t err;
    float *a, *b, *c, *d_a, *d_b, *d_c;
    unsigned long long num = MEMSIZE / 4;
    float elapsedTime;

    err = cudaMalloc((void **)&d_a, MEMSIZE);
    if (err != cudaSuccess) {
        printf("failed to allocate memory to d_a\n");
        exit(1);
    }
    err = cudaMalloc((void **)&d_b, MEMSIZE);
    if (err != cudaSuccess) {
        printf("failed to allocate memory to d_b\n");
        exit(1);
    }
    err = cudaMalloc((void **)&d_c, MEMSIZE);
    if (err != cudaSuccess) {
        printf("failed to allocate memory to d_c\n");
        exit(1);
    }
    a = (float *)malloc(MEMSIZE);
    if (a == NULL) {
        printf("failed to allocate memory to a\n");
        exit(1);
    }
    b = (float *)malloc(MEMSIZE);
    if (b == NULL) {
        printf("failed to allocate memory to b\n");
        exit(1);
    }
    c = (float *)malloc(MEMSIZE);
    if (c == NULL) {
        printf("failed to allocate memory to c\n");
        exit(1);
    }
    for (unsigned long long i = 0; i < num; i++) {
        float v = i / 1000.0;
        a[i] = v;
        b[i] = v;
    }
    err = cudaMemcpy(d_a, a, MEMSIZE, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        printf("failed to copy memory from host to device\n");
        exit(1);
    }
    err = cudaMemcpy(d_b, b, MEMSIZE, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        printf("failed to copy memory from host to device\n");
        exit(1);
    }
    int thr = 1024;
    long int bloc = (num / thr) + 1;
    printf("Blocks per grid: %ld", bloc);
    printf("\nThreads per block: %d", thr);
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    add<<<bloc, thr>>>(d_a, d_b, d_c, num);
    cudaError_t errSync = cudaGetLastError();
    cudaError_t errAsync = cudaDeviceSynchronize();
    if (errSync != cudaSuccess) {
        printf("Sync kernel error: %s\n", cudaGetErrorString(errSync));
        exit(1);
    }
    if (errAsync != cudaSuccess) {
        printf("Async kernel error: %s\n", cudaGetErrorString(errAsync));
        exit(1);
    }
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);
    printf("\nGPU time --> %f milliseconds\n", elapsedTime);
    printf("GPU time --> %f seconds\n", elapsedTime / 1000);
    err = cudaMemcpy(c, d_c, MEMSIZE, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        printf("failed to copy memory from device to host\n");
        exit(1);
    }
    free(a); free(b); free(c);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}
Compile : nvcc --gpu-architecture=compute_61 nvidia_vector_addition.cu
Run : ./a.out
Output : Blocks per grid: 681575 Threads per block: 1024 GPU time --> 34.359295 milliseconds GPU time --> 0.034359 seconds
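For a like-for-like comparison, the Python measurement could also use CUDA events, the way the C version does. A minimal sketch using Numba's event API (cuda.event and cuda.event_elapsed_time), assuming the same arrays and launch configuration as in the Python listing above:

start = cuda.event(timing=True)
stop = cuda.event(timing=True)
start.record()
cu_add[bpg, tpb](d_a, d_b, d_c)
stop.record()
stop.synchronize()  # wait for the stop event before reading the elapsed time
print "Kernel time ", cuda.event_elapsed_time(start, stop), " milliseconds"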
From these numbers, the @cuda.jit Python version appears to be about 103 times faster than the CUDA C version. Can anyone clarify what I am doing right or wrong here?