I've been testing some basic CUDA functions using the Numba package. My main goal is to implement a Richardson-Lucy algorithm on the GPU. The algorithm can be accelerated, and one of the main steps in doing so can be summarized by the following dummy function:
eps = 1e-12  # small positive constant to guard against division by zero (exact value not critical)

def dummy(arr1, arr2):
    return (arr1 * arr2).sum() / ((arr2**2).sum() + eps)
This function runs reasonably fast on the CPU, but I'd like to keep everything on the GPU to avoid host <-> device copies.
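The GPU version I have in mind would chain the helpers defined below and keep every intermediate on the device (a sketch; dummy_gpu is a hypothetical name, not part of my benchmark):

def dummy_gpu(d_arr1, d_arr2):
    # CUDA ufuncs return device arrays when fed device arrays,
    # so the elementwise products never leave the GPU
    d_num = mult_gpu(d_arr1, d_arr2)
    d_den = mult_gpu(d_arr2, d_arr2)
    # only the two scalar sums come back to the host
    return sum_gpu(d_num) / (sum_gpu(d_den) + eps)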
To compare the speeds of the different calculations I wrote a short set of functions:
import numpy as np
from numba import njit, jit
import numba
import numba.cuda as cuda
import timeit
import time
# define our functions
@numba.vectorize(["float32(float32, float32)", "float64(float64, float64)"], target="cuda")
def add_gpu(a, b):
    return a + b

@numba.vectorize(["float32(float32, float32)", "float64(float64, float64)"], target="cuda")
def mult_gpu(a, b):
    return a * b

@cuda.reduce
def sum_gpu(a, b):
    return a + b

@cuda.jit
def add_gpu_1d(a, b, c):
    x = cuda.grid(1)
    if x < c.size:
        c[x] = a[x] + b[x]

@cuda.jit
def mult_gpu_1d(a, b, c):
    x = cuda.grid(1)
    if x < c.size:
        c[x] = a[x] * b[x]

@cuda.jit
def mult_gpu_2d(a, b, c):
    x, y = cuda.grid(2)
    if x < c.shape[0] and y < c.shape[1]:
        c[x, y] = a[x, y] * b[x, y]

@cuda.jit
def add_gpu_2d(a, b, c):
    x, y = cuda.grid(2)
    if x < c.shape[0] and y < c.shape[1]:
        c[x, y] = a[x, y] + b[x, y]
and some timer functions:
def avg_t(t, num):
    return np.mean(t) / num

def format_t(t):
    """Format a time in seconds as a human-readable string."""
    if t < 1e-3:
        return "{:.1f} us".format(t * 1e6)
    elif t < 1:
        return "{:.1f} ms".format(t * 1e3)
    else:
        return "{:.1f} s".format(t)
def test_1d_times(data_len, dtype=np.float32):
    num_times = 10
    title = "Testing 1D Data, Data length = {}, data type = {}".format(data_len, dtype)
    print(len(title) * "=")
    print(title)
    print(len(title) * "=")
    t = time.time()
    arr1, arr2 = np.empty((2, data_len), dtype=dtype)
    d_arr1 = cuda.to_device(arr1)
    d_arr2 = cuda.to_device(arr2)
    d_result = cuda.device_array_like(d_arr1)
    print("Data generated in " + format_t(time.time() - t))
    print("d_arr1 dtype =", d_arr1.dtype)
    print("d_arr1 size = ", d_arr1.size)
    print()
    # explicit launch configuration for the @cuda.jit kernels
    threads_per_block = 256
    blocks_per_grid = (d_result.size + threads_per_block - 1) // threads_per_block
    print("Testing multiplication times")
    print("----------------------------")
    t = timeit.repeat((lambda: arr1 * arr2), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: mult_gpu(d_arr1, d_arr2)), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: mult_gpu_1d[blocks_per_grid, threads_per_block](d_arr1, d_arr2, d_result)), number=num_times)
    print("cuda_mult_1d time = " + format_t(avg_t(t, num_times)))
    print()
    print("Testing sum times")
    print("------------------")
    t = timeit.repeat((lambda: arr1 + arr2), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: add_gpu(d_arr1, d_arr2)), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: add_gpu_1d[blocks_per_grid, threads_per_block](d_arr1, d_arr2, d_result)), number=num_times)
    print("cuda_add_1d time = " + format_t(avg_t(t, num_times)))
    print()
    print("Testing reduction times")
    print("-----------------------")
    t = timeit.repeat((lambda: arr1.sum()), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: add_gpu.reduce(d_arr1)), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: sum_gpu(d_arr1)), number=num_times)
    print("sum_gpu time = " + format_t(avg_t(t, num_times)))
    print()
def test_2d_times(data_len, dtype=np.float32):
    num_times = 10
    title = "Testing 2D Data, Data length = {}, data type = {}".format(data_len, dtype)
    print(len(title) * "=")
    print(title)
    print(len(title) * "=")
    t = time.time()
    arr1, arr2 = np.empty((2, data_len, data_len), dtype=dtype)
    d_arr1 = cuda.to_device(arr1)
    d_arr2 = cuda.to_device(arr2)
    d_result = cuda.device_array_like(d_arr1)
    print("Data generated in " + format_t(time.time() - t))
    print("d_arr1 dtype =", d_arr1.dtype)
    print("d_arr1 size = ", d_arr1.size)
    print()
    # explicit 2D launch configuration for the @cuda.jit kernels
    tpb = (16, 16)
    bpg = ((d_result.shape[0] + tpb[0] - 1) // tpb[0],
           (d_result.shape[1] + tpb[1] - 1) // tpb[1])
    print("Testing multiplication times")
    print("----------------------------")
    t = timeit.repeat((lambda: arr1 * arr2), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: mult_gpu(d_arr1, d_arr2)), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: mult_gpu_2d[bpg, tpb](d_arr1, d_arr2, d_result)), number=num_times)
    print("cuda_mult_2d time = " + format_t(avg_t(t, num_times)))
    print()
    print("Testing sum times")
    print("------------------")
    t = timeit.repeat((lambda: arr1 + arr2), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: add_gpu(d_arr1, d_arr2)), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: add_gpu_2d[bpg, tpb](d_arr1, d_arr2, d_result)), number=num_times)
    print("cuda_add_2d time = " + format_t(avg_t(t, num_times)))
    print()
    print("Testing reduction times")
    print("-----------------------")
    t = timeit.repeat((lambda: arr1.sum()), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: add_gpu.reduce(d_arr1.ravel())), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: sum_gpu(d_arr1.ravel())), number=num_times)
    print("sum_gpu time = " + format_t(avg_t(t, num_times)))
    print()
Running the test functions
numba.cuda.detect()
test_1d_times(2**24)
test_2d_times(2**12)
test_1d_times(2**24, dtype=np.float64)
test_2d_times(2**12, dtype=np.float64)
gives the following output:
Found 1 CUDA devices
id 0 b'GeForce GTX TITAN X' [SUPPORTED]
compute capability: 5.2
pci device id: 0
pci bus id: 3
Summary:
1/1 devices are supported
============================================================================
Testing 1D Data, Data length = 16777216, data type = <class 'numpy.float32'>
============================================================================
Data generated in 88.2 ms
d_arr1 dtype = float32
d_arr1 size = 16777216
Testing multiplication times
----------------------------
cpu/numpy time = 35.8 ms
cuda vectorize time = 122.8 ms
cuda_mult_1d time = 206.8 us
Testing sum times
------------------
cpu/numpy time = 35.8 ms
cuda vectorize time = 106.1 ms
cuda_add_1d time = 212.6 us
Testing reduction times
-----------------------
cpu/numpy time = 16.7 ms
cuda vectorize time = 11.1 ms
sum_gpu time = 127.3 ms
========================================================================
Testing 2D Data, Data length = 4096, data type = <class 'numpy.float32'>
========================================================================
Data generated in 80.0 ms
d_arr1 dtype = float32
d_arr1 size = 16777216
Testing multiplication times
----------------------------
cpu/numpy time = 35.4 ms
cuda vectorize time = 97.9 ms
cuda_mult_2d time = 208.9 us
Testing sum times
------------------
cpu/numpy time = 36.3 ms
cuda vectorize time = 94.5 ms
cuda_add_2d time = 250.8 us
Testing reduction times
-----------------------
cpu/numpy time = 16.4 ms
cuda vectorize time = 15.8 ms
sum_gpu time = 125.4 ms
============================================================================
Testing 1D Data, Data length = 16777216, data type = <class 'numpy.float64'>
============================================================================
Data generated in 171.0 ms
d_arr1 dtype = float64
d_arr1 size = 16777216
Testing multiplication times
----------------------------
cpu/numpy time = 73.2 ms
cuda vectorize time = 114.9 ms
cuda_mult_1d time = 201.9 us
Testing sum times
------------------
cpu/numpy time = 71.4 ms
cuda vectorize time = 71.0 ms
cuda_add_1d time = 217.2 us
Testing reduction times
-----------------------
cpu/numpy time = 29.0 ms
cuda vectorize time = 12.8 ms
sum_gpu time = 123.5 ms
========================================================================
Testing 2D Data, Data length = 4096, data type = <class 'numpy.float64'>
========================================================================
Data generated in 301.8 ms
d_arr1 dtype = float64
d_arr1 size = 16777216
Testing multiplication times
----------------------------
cpu/numpy time = 73.7 ms
cuda vectorize time = 84.2 ms
cuda_mult_2d time = 226.2 us
Testing sum times
------------------
cpu/numpy time = 74.9 ms
cuda vectorize time = 84.3 ms
cuda_add_2d time = 208.7 us
Testing reduction times
-----------------------
cpu/numpy time = 29.9 ms
cuda vectorize time = 14.3 ms
sum_gpu time = 121.2 ms
It seems like the @numba.vectorize(target="cuda") functions perform slower than both the CPU and the custom @cuda.jit functions, while the @cuda.jit functions give the expected orders-of-magnitude speedup and nearly constant run times (results not shown). On the other hand, the @cuda.reduce-based sum_gpu runs significantly slower than either the vectorized reduction (add_gpu.reduce) or the CPU sum.
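One caveat I should note: @cuda.jit kernel launches are asynchronous, so a bare timeit around a launch may mostly measure launch overhead. A timing helper that forces the device to finish would look something like this (time_gpu is my own name, not a Numba API):

def time_gpu(fn, number=10):
    cuda.synchronize()  # drain any pending work before starting the clock
    start = time.time()
    for _ in range(number):
        fn()
    cuda.synchronize()  # wait for every launched kernel to actually finish
    return (time.time() - start) / number

Inside the test functions this would be used as, e.g., time_gpu(lambda: mult_gpu_1d[blocks_per_grid, threads_per_block](d_arr1, d_arr2, d_result)).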
Is there a reason for the poor performance of the @numba.vectorize(target="cuda") and @cuda.reduce functions? Is it possible to write a CUDA reduction kernel using just Numba?
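To make the last question concrete, the kind of hand-written reduction I have in mind is sketched below: a shared-memory tree reduction where each block writes one partial sum. TPB, the partial_sum name, and the float32-only handling are my own illustrative choices:

TPB = 256  # threads per block; a power of two keeps the tree reduction simple

@cuda.jit
def partial_sum(arr, partials):
    sm = cuda.shared.array(TPB, numba.float32)
    tid = cuda.threadIdx.x
    i = cuda.grid(1)
    # each thread loads one element (zero-padding past the end of arr)
    sm[tid] = arr[i] if i < arr.size else 0.0
    cuda.syncthreads()
    # pairwise tree reduction in shared memory
    s = TPB // 2
    while s > 0:
        if tid < s:
            sm[tid] += sm[tid + s]
        cuda.syncthreads()
        s //= 2
    if tid == 0:
        partials[cuda.blockIdx.x] = sm[0]

Each launch leaves one partial sum per block, e.g. partial_sum[blocks, TPB](d_arr1, d_partials) with blocks = (d_arr1.size + TPB - 1) // TPB, and the much smaller partials array can then be reduced by a second launch or on the host.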
EDIT: Looks like this is a legitimate bug in Numba: https://github.com/numba/numba/issues/2266, https://github.com/numba/numba/issues/2268