I am testing Nvidia Cublas Library on my GTX Titan. I have the following code:
#include "cublas.h"
#include <stdlib.h>
#include <conio.h>
#include <Windows.h>
#include <iostream>
#include <iomanip>
/* Vector size */
#define N (1024 * 1024 * 32)
/* Main */
int main(int argc, char** argv)
{
LARGE_INTEGER frequency;
LARGE_INTEGER t1, t2;
float* h_A;
float* h_B;
float* d_A = 0;
float* d_B = 0;
/* Initialize CUBLAS */
cublasInit();
/* Allocate host memory for the vectors */
h_A = (float*)malloc(N * sizeof(h_A[0]));
h_B = (float*)malloc(N * sizeof(h_B[0]));
/* Fill the vectors with test data */
for (int i = 0; i < N; i++)
{
h_A[i] = rand() / (float)RAND_MAX;
h_B[i] = rand() / (float)RAND_MAX;
}
QueryPerformanceFrequency(&frequency);
QueryPerformanceCounter(&t1);
/* Allocate device memory for the vectors */
cublasAlloc(N, sizeof(d_A[0]), (void**)&d_A);
cublasAlloc(N, sizeof(d_B[0]), (void**)&d_B);
/* Initialize the device matrices with the host vectors */
cublasSetVector(N, sizeof(h_A[0]), h_A, 1, d_A, 1);
cublasSetVector(N, sizeof(h_B[0]), h_B, 1, d_B, 1);
/* Performs operation using cublas */
float res = cublasSdot(N, d_A, 1, d_B, 1);
/* Memory clean up */
cublasFree(d_A);
cublasFree(d_B);
QueryPerformanceCounter(&t2);
double elapsedTime = (t2.QuadPart - t1.QuadPart) * 1000.0 / frequency.QuadPart;
std::cout << "GPU time = " << std::setprecision(16) << elapsedTime << std::endl;
std::cout << "GPU result = " << res << std::endl;
QueryPerformanceFrequency(&frequency);
QueryPerformanceCounter(&t1);
float sum = 0.;
for (int i = 0; i < N; i++) {
sum += h_A[i] * h_B[i];
}
QueryPerformanceCounter(&t2);
elapsedTime = (t2.QuadPart - t1.QuadPart) * 1000.0 / frequency.QuadPart;
std::cout << "CPU time = " << std::setprecision(16) << elapsedTime << std::endl;
std::cout << "CPU result = " << sum << std::endl;
free(h_A);
free(h_B);
/* Shutdown */
cublasShutdown();
getch();
return EXIT_SUCCESS;
}
When I run the code I get the following result:
GPU time = 164.7487009845991
GPU result = 8388851
CPU time = 45.22368030957917
CPU result = 7780599.5
Why using cublas library on GTX Titan is 3 times slower than calculations on one Xeon 2.4GHz IvyBridge core? When I increase or decrease the vector sizes, I get the same results: GPU is slower than CPU. Double precision doesn't change it.