Does anyone have experience running AMD's OpenCL FFT library (http://developer.amd.com/libraries/appmathlibs/pages/default.aspx) on NVIDIA GPUs?
I'm trying to port an existing algorithm from CUDA (with the most recent CUFFT) to OpenCL. The new code runs fine on an AMD GPU but not on my NVIDIA GPU. The NVIDIA GPU is recognized properly, but the resulting array contains only zeros, and no errors are reported. By the way, the code also runs fine on an Intel Core i3 CPU, so my code seems to be correct.
Both AMD and NVIDIA seem to decline support for this combination.
Any ideas?
EDIT:
My environment is Windows 7 Professional x64, and I'm using the Visual Studio C++ Professional IDE with its built-in x86 compiler. The NVIDIA GPU is a GeForce GTX 560 Ti (MSI N560GTX-Ti Twin Frozr II/OC 1GB). The CPU that works well is an Intel Core i3-2100 (2x3.1GHz), and then there is the Radeon HD 6850 (Sapphire Radeon HD 6850 1GB). I tried compiling the code against the newest OpenCL releases from AMD, NVIDIA and Intel, with the same results, and of course I have the newest developer drivers installed.
Here is my pretty basic sample code ...
#include <stdio.h>
#include <stdlib.h>
#include <complex>
#include <clAmdFft.h>
#if defined (__APPLE__) || defined(MACOSX)
#include <OpenCL/opencl.h>
#else
#include <CL/opencl.h>
#endif
// Typedef for complex field objects
using namespace std;
typedef std::complex<float> cl_compl_flt;
int main(int argc, char* argv[])
{
cl_uint width = 1024, height = 1024; // Field dimensions
cl_uint cl_platformsN = 0; // Platform count
cl_platform_id *cl_platformIDs = NULL; // IDs of OpenCL platforms
cl_uint cl_deviceCount = 0; // Device count
cl_device_id *cl_devices = NULL; // Device IDs
cl_int cl_err = 0; // Buffer for error informations
cl_context cl_dev_context; // Context
cl_command_queue cl_queue; // Queue
clAmdFftSetupData fftSetupData; // FFT setup data
clAmdFftPlanHandle fftPlan; // FFT plan
clAmdFftDim fftDim = CLFFT_2D; // FFT dimension
size_t fftSize[2]; // FFT size
fftSize[0] = width;
fftSize[1] = height;
cl_mem d_data; // Device level data
cl_compl_flt* h_src; // Host level input data
cl_compl_flt* h_res; // Host level output data
// Allocate host memory
h_src = (cl_compl_flt*)malloc(width*height*sizeof(cl_compl_flt));
h_res = (cl_compl_flt*)malloc(width*height*sizeof(cl_compl_flt));
// Get source field
createPinholeField( h_src, width, height, 5 );
// Get FFT version
checkCL( clAmdFftInitSetupData(&fftSetupData) );
printf("Using clAmdFft %u.%u.%u\n",fftSetupData.major,fftSetupData.minor,fftSetupData.patch);
// Get available platforms
checkCL( clGetPlatformIDs ( 0, NULL, &cl_platformsN));
cl_platformIDs = (cl_platform_id*) malloc( cl_platformsN * sizeof(cl_platform_id));
checkCL( clGetPlatformIDs( cl_platformsN, cl_platformIDs, NULL) );
// Loop over platforms
for( cl_uint i = 0; i < cl_platformsN; i++)
{
// Get number of available devices for this platform
checkCL( clGetDeviceIDs( cl_platformIDs[i], CL_DEVICE_TYPE_ALL, NULL, NULL, &cl_deviceCount));
// Skip platform if no device available
if(cl_deviceCount < 1)
continue;
// Get available device IDs for this platform
cl_devices = (cl_device_id*) malloc( cl_deviceCount * sizeof(cl_device_id));
checkCL( clGetDeviceIDs( cl_platformIDs[i], CL_DEVICE_TYPE_ALL, cl_deviceCount, cl_devices, NULL));
// Print platform name
char platform_name[1024];
checkCL( clGetPlatformInfo( cl_platformIDs[i], CL_PLATFORM_NAME, 1024, &platform_name, NULL) );
printf("\nCompute using OpenCl platfrom #%i [ %s ]\n", i,platform_name);
// Loop over devices
for( cl_uint j = 0; j < cl_deviceCount; j++)
{
// Print device name and type
cl_device_type device_type;
char device_name[1024];
checkCL( clGetDeviceInfo( cl_devices[j], CL_DEVICE_NAME, 1024, &device_name, NULL) );
checkCL( clGetDeviceInfo( cl_devices[j],CL_DEVICE_TYPE, sizeof(cl_device_type), &device_type, NULL) );
printf("\n\tUsing OpenCl device #%i [ %s -- %s ]\n", j, device_name, getDevTypeString(device_type));
// Create OpenCL context
cl_context_properties cps[3] =
{
CL_CONTEXT_PLATFORM,
(cl_context_properties)cl_platformIDs[i],
0
};
cl_dev_context = clCreateContext( cps, cl_deviceCount, cl_devices, NULL, NULL, &cl_err);
checkCL( cl_err);
// Create command queue
cl_queue = clCreateCommandQueue( cl_dev_context, cl_devices[j], CL_QUEUE_PROFILING_ENABLE, &cl_err);
checkCL( cl_err);
// Create device buffer
d_data = clCreateBuffer( cl_dev_context, CL_MEM_READ_WRITE, width*height*sizeof(cl_compl_flt), NULL, &cl_err);
checkCL( cl_err);
// Setup FFT
checkCL( clAmdFftSetup(&fftSetupData) );
// Create FFT plan
checkCL( clAmdFftCreateDefaultPlan( &fftPlan, cl_dev_context, fftDim, fftSize) );
// Copy data from host to device
clEnqueueWriteBuffer( cl_queue, d_data, CL_TRUE, 0, width*height*sizeof(cl_compl_flt), h_src, 0, NULL, NULL);
// Execute FFT
checkCL( clAmdFftEnqueueTransform( fftPlan, CLFFT_FORWARD, 1, &cl_queue, 0, NULL, NULL, &d_data, NULL, NULL) );
clFinish( cl_queue);
// Copy result from device to host
checkCL( clEnqueueReadBuffer(cl_queue, d_data, CL_TRUE, 0, width*height*sizeof(cl_compl_flt), h_res, 0, NULL, NULL) );
clFinish( cl_queue);
// Save result
char filename[512];
sprintf( filename, "raw/result_%u_%u_in.raw",i,j);
printf("\tSave result to \"%s\" ", filename);
saveRawData( h_res, filename, width, height, true);
printf("\n");
// Free FFT plan
checkCL( clAmdFftDestroyPlan( &fftPlan) );
// Free FFT
checkCL( clAmdFftTeardown() );
// Free device memory
checkCL( clReleaseMemObject(d_data) );
// Release OpenCL context and queue
checkCL( clReleaseCommandQueue( cl_queue ) );
checkCL( clReleaseContext( cl_dev_context) );
}
// Free OpenCL devices
free( cl_devices);
}
free( h_src);
free( h_res);
printf("\n\nPress any key ...");
getchar();
return 0;
}
and the additional helper functions it uses ...
// Generate a pinhole: fills `data` (row-major, width*height elements) with a
// centred disc. Elements whose rounded-up distance from the field centre is
// at most `radius` get the value (255, 0); all others get (0, 0).
//
//   data    caller-owned buffer of at least width*height elements
//   width   field width in elements
//   height  field height in elements
//   radius  disc radius in elements; a value below 1 selects half of the
//           smaller field dimension
//
// `data` is received by value, so a buffer allocated in here could never be
// handed back to the caller and would only leak. A NULL `data` is therefore
// treated as "nothing to fill" and the function returns immediately.
void createPinholeField( cl_compl_flt* data, cl_uint width, cl_uint height, cl_uint radius)
{
    if (data == NULL)
        return;

    if (radius < 1)
        radius = (width > height) ? height / 2 : width / 2;

    const cl_float min_val = 0.0f;
    const cl_float max_val = 255.0f;

    // Field centre in (fractional) element coordinates
    const double centre_x = width / 2.;
    const double centre_y = height / 2.;

    for (cl_uint y = 0; y < height; y++)
    {
        const double dy = y - centre_y;
        for (cl_uint x = 0; x < width; x++)
        {
            const double dx = x - centre_x;
            const bool inside = ceil( sqrt( dx * dx + dy * dy ) ) <= radius;

            // Index in size_t so y*width is not evaluated in 32-bit arithmetic
            data[(size_t)y * width + x] = cl_compl_flt( inside ? max_val : min_val, 0.f);
        }
    }
}
// Save a cl_compl_flt array as an unsigned char raw image file.
//
// The magnitude of every element is taken, the minimum and maximum magnitude
// are determined (and printed when `print_minmax` is set), and the values are
// mapped linearly to 0..255 before being written as width*height bytes.
// The upper end of the mapping is 1% of the maximum magnitude, so everything
// above that level saturates at 255.
//
//   char_array    field to save, width*height elements
//   filepath      destination file; its directory must already exist
//   width/height  field dimensions in elements
//   print_minmax  print " [min=… , max=…] " to stdout when true
//
// Nothing is written (a message goes to stderr) when the input is empty,
// memory cannot be allocated, or the file cannot be opened. When the mapping
// range is not positive (e.g. a constant field such as all zeros) every pixel
// is written as 0 instead of dividing by zero.
void saveRawData( cl_compl_flt* char_array, const char* filepath, cl_uint width, cl_uint height, bool print_minmax )
{
    const size_t count = (size_t)width * (size_t)height;
    if (char_array == NULL || filepath == NULL || count == 0)
    {
        fprintf(stderr, "saveRawData: nothing to save\n");
        return;
    }

    cl_float* abs_v = (cl_float*) malloc(count * sizeof(cl_float));
    cl_uchar* temp = (cl_uchar*) malloc(count * sizeof(cl_uchar));
    if (abs_v == NULL || temp == NULL)
    {
        fprintf(stderr, "saveRawData: out of memory\n");
        free(abs_v);
        free(temp);
        return;
    }

    // Magnitudes and their range in a single pass
    abs_v[0] = std::abs(char_array[0]);
    cl_float min = abs_v[0];
    cl_float max = abs_v[0];
    for (size_t i = 1; i < count; i++)
    {
        abs_v[i] = std::abs(char_array[i]);
        if (abs_v[i] < min)
            min = abs_v[i];
        if (abs_v[i] > max)
            max = abs_v[i];
    }

    if (print_minmax)
        printf(" [min=%f , max=%f] ",min,max);

    // Contrast boost: map [min, 1% of max] onto the full 8-bit range
    max *= .01f;
    const cl_float range = max - min;

    for (size_t i = 0; i < count; i++)
    {
        cl_float level = 0.0f;
        if (range > 0.0f)
        {
            level = (abs_v[i] - min) / range;
            // Clamp to [0,1]; the negated comparison also catches NaN
            if (!(level >= 0.0f))
                level = 0.0f;
            else if (level > 1.0f)
                level = 1.0f;
        }
        // Scale first, convert to 8 bit afterwards
        temp[i] = (cl_uchar)(255.0f * level);
    }

    FILE *pFile = fopen(filepath, "wb");
    if (pFile == NULL)
    {
        fprintf(stderr, "saveRawData: cannot open \"%s\" for writing\n", filepath);
    }
    else
    {
        if (fwrite(temp, 1, count, pFile) != count)
            fprintf(stderr, "saveRawData: short write to \"%s\"\n", filepath);
        if (fclose(pFile) != 0)
            fprintf(stderr, "saveRawData: error closing \"%s\"\n", filepath);
    }

    free(abs_v);
    free(temp);
}
// Check the status code returned by an OpenCL call.
// Returns true when the code equals CL_SUCCESS. For any other code the error
// ID is printed, the program waits for a key press on stdin and then exits
// with that code as its exit status.
bool checkCL( cl_int oclErrorCode)
{
    const bool succeeded = (oclErrorCode == CL_SUCCESS);

    if (!succeeded)
    {
        printf("\n\nAn OpenCL related error occured!\nError ID #%d\nPress ENTER to exit the program...\n\n", oclErrorCode);
        getchar();
        exit( oclErrorCode);   // does not return
    }

    return succeeded;
}
// Get device type as string.
// Maps CL_DEVICE_TYPE_CPU, CL_DEVICE_TYPE_GPU and CL_DEVICE_TYPE_ACCELERATOR
// to "CPU", "GPU" and "ACCELERATOR"; every other value yields "DEFAULT".
// The returned pointer refers to a string literal and must not be modified
// or freed by the caller.
char* getDevTypeString(cl_device_type type)
{
    const char* label = "DEFAULT";

    if (type == CL_DEVICE_TYPE_CPU)
        label = "CPU";
    else if (type == CL_DEVICE_TYPE_GPU)
        label = "GPU";
    else if (type == CL_DEVICE_TYPE_ACCELERATOR)
        label = "ACCELERATOR";

    return const_cast<char*>(label);
}
I hope this helps to narrow down the problem.
P.S.: Images can be seen here: http://devgurus.amd.com/thread/159149