I am very new to OpenCL and this is my first program. I implemented a simple sinc filtering of waveforms. The code works; however, I have two questions:
1. Once I increase the size of the input matrix (numrows needs to go up to 100 000) I get "clEnqueueReadBuffer failed: OUT_OF_RESOURCES", even though the matrix is relatively small (a few MB). I think this is to some extent related to the work-group size, but could someone elaborate on how I could fix this issue? Could it be a driver issue?

UPDATE: leaving the work-group size as None crashes. Setting it to (1, 600) on the Nvidia GPU and (1, 50) on the Intel HD lets me go up to some 6400 rows; for anything larger the Nvidia GPU crashes, and the Intel HD just freezes and does nothing (0% in the resource monitor). The first sketch after this list shows how I query the device work-group limits.

2. I have an Intel HD4600 and an Nvidia K1100M GPU available, yet the Intel is about 2 times faster. I understand this is partly because the Intel shares host memory, so I don't have to copy my arrays over to the discrete GPU's own memory, but I expected only a marginal difference. Is this normal, or should my code be better optimized for the GPU? (resolved; the second sketch after this list is how I separate transfer time from kernel time)
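For reference, here is a minimal diagnostic sketch of how I query the work-group limits of each device with pyopencl (standalone, not part of the filtering code; the printed attributes are pyopencl's standard device properties):

import pyopencl as cl

# List every device and the limits that constrain the allowed local (work-group) size.
for platform in cl.get_platforms():
    for device in platform.get_devices():
        print(platform.name, "/", device.name)
        print("  max work-group size:", device.max_work_group_size)
        print("  max work-item sizes:", device.max_work_item_sizes)
        print("  local memory (KiB):", device.local_mem_size // 1024)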
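And this is a rough sketch of how I separate transfer time from kernel time using the profiling already enabled on my queue. It assumes the same ctx, queue, prg, M, y and res objects as in resample_opencl below, but uses an explicit upload instead of COPY_HOST_PTR so the upload gets its own event:

mf = cl.mem_flags
y_g = cl.Buffer(ctx, mf.READ_ONLY, y.nbytes)          # device buffer, no implicit copy
res_g = cl.Buffer(ctx, mf.WRITE_ONLY, res.nbytes)

upload_evt = cl.enqueue_copy(queue, y_g, y)           # host -> device
kernel_evt = prg.resample(queue, res.shape, (1, 200), M, y_g, res_g)
download_evt = cl.enqueue_copy(queue, res, res_g)     # device -> host
download_evt.wait()                                   # in-order queue, so earlier work is done too

for name, evt in (("upload", upload_evt), ("kernel", kernel_evt), ("download", download_evt)):
    print(name, (evt.profile.end - evt.profile.start) * 1e-9, "s")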
Thanks for your help!
from __future__ import absolute_import, print_function
import numpy as np
import pyopencl as cl
import os
os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'
import matplotlib.pyplot as plt
def resample_opencl(y, key='GPU'):
    #
    # selecting to run on GPU or CPU
    #
    newlen = 1200
    my_platform = cl.get_platforms()[0]
    device = my_platform.get_devices()[0]
    for found_platform in cl.get_platforms():
        if (key == 'GPU') and (found_platform.name == 'NVIDIA CUDA'):
            my_platform = found_platform
            device = my_platform.get_devices()[0]
            print("using GPU")
    #
    # Create context for GPU/CPU
    #
    ctx = cl.Context([device])
    #
    # Create queue for each kernel execution
    #
    queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
    # queue = cl.CommandQueue(ctx)
    prg = cl.Program(ctx, """
    __kernel void resample(
        int M,
        __global const float *y_g,
        __global float *res_g)
    {
        int row = get_global_id(0);
        int col = get_global_id(1);
        int gs = get_global_size(1);
        __private float tmp, tmp2, x;
        __private float t;
        t = (float)(col) / 2 + 1;
        tmp = 0;
        tmp2 = 0;
        for (int i = 0; i < M; i++)
        {
            x = (float)(i + 1);
            tmp2 = (t - x) * 3.14159;
            if (t == x) {
                tmp += y_g[row * M + i];
            }
            else
                tmp += y_g[row * M + i] * sin(tmp2) / tmp2;
        }
        res_g[row * gs + col] = tmp;
    }
    """).build()
    mf = cl.mem_flags
    y_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=y)
    res = np.zeros((np.shape(y)[0], newlen)).astype(np.float32)
    res_g = cl.Buffer(ctx, mf.WRITE_ONLY, res.nbytes)
    M = np.array(600).astype(np.int32)
    prg.resample(queue, res.shape, (1, 200), M, y_g, res_g)
    event = cl.enqueue_copy(queue, res, res_g)
    print("success")
    event.wait()
    return res, event
if __name__ == "__main__":
    #
    # this is the number I need to increase (up to some 100 000)
    #
    numrows = 2000
    Gaussian = lambda t: 10 * np.exp(-(t - 50)**2 / (2. * 2**2))
    x = np.linspace(1, 101, 600, endpoint=False).astype(np.float32)
    t = np.linspace(1, 101, 1200, endpoint=False).astype(np.float32)
    y = np.zeros((numrows, np.size(x)))
    y[:] = Gaussian(x).astype(np.float32)
    y = y.astype(np.float32)
    res, event = resample_opencl(y, 'GPU')
    print("OpenCL GPU profiler", (event.profile.end - event.profile.start) * 1e-9)
    #
    # test plot if it worked
    #
    plt.figure()
    plt.plot(x, y[1, :], '+')
    plt.plot(t, res[1, :])
    plt.show()
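In case it helps anyone reproduce this, here is a quick NumPy sketch (not optimized) that mirrors the kernel's index-space math, t = col/2 + 1 and x = i + 1, using np.sinc, which already handles the t == x case. I use it with the y and res from the script above to sanity-check the OpenCL output on a few rows:

def resample_numpy(y, newlen=1200):
    # Reference sinc interpolation mirroring the OpenCL kernel:
    # res[row, col] = sum_i y[row, i] * sinc(t_col - x_i)
    M = y.shape[1]
    t = np.arange(newlen, dtype=np.float32) / 2 + 1   # output positions, as in the kernel
    x = np.arange(1, M + 1, dtype=np.float32)         # input sample positions
    weights = np.sinc(t[:, None] - x[None, :])        # shape (newlen, M)
    return (y @ weights.T).astype(np.float32)         # shape (numrows, newlen)

# loose tolerance: the kernel uses a truncated 3.14159 and float32 accumulation
print("matches NumPy:", np.allclose(res[:4], resample_numpy(y[:4]), atol=1e-2))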