Can you post some code? A CPU implementation can be found in the samples/cpp folder called matcher_simple.cpp. Are you able to run that? I also have run the GPU version of SURF on OpenCV with no problem using:
SURF_GPU surf(1000, 4, 2, false, 0.5);
// detecting keypoints & computing descriptors
GpuMat keypoints1GPU, keypoints2GPU;
GpuMat descriptors1GPU, descriptors2GPU;
surf(img1, GpuMat(), keypoints1GPU, descriptors1GPU);
surf(img2, GpuMat(), keypoints2GPU, descriptors2GPU);
cout << "FOUND " << keypoints1GPU.cols << " keypoints on first image" << endl;
cout << "FOUND " << keypoints2GPU.cols << " keypoints on second image" << endl;
// matching descriptors
BruteForceMatcher_GPU< L2<float> > matcher;
GpuMat trainIdx, distance;
matcher.matchSingle(descriptors1GPU, descriptors2GPU, trainIdx, distance);
// downloading results
vector<KeyPoint> keypoints1, keypoints2;
vector<float> descriptors1, descriptors2;
vector<DMatch> matches;
surf.downloadKeypoints(keypoints1GPU, keypoints1);
surf.downloadKeypoints(keypoints2GPU, keypoints2);
surf.downloadDescriptors(descriptors1GPU, descriptors1);
surf.downloadDescriptors(descriptors2GPU, descriptors2);
BruteForceMatcher_GPU< L2<float> >::matchDownload(trainIdx, distance, matches);
// drawing the results
Mat img_matches, image1, image2;
img1.download(image1);
img2.download(image2);
drawMatches(image1, keypoints1, image2, keypoints2, matches, img_matches);