I am trying to understand the difference in performance between single and double precision on our GPU workstation.
Our workstation is equipped with two TITAN RTX GPUs, but I am running the benchmark on a single TITAN RTX. I am testing the performance with cuBLAS matrix-matrix multiplications: I multiply 8192x8192 matrices filled with random floats or doubles. To make sure the mistake is not on my end, I also repeat the procedure in Python using the CuPy library, and the results are very similar.
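Since the machine has two cards, I pin the process to one of them before doing anything else. A minimal sketch, assuming device 0 is the card I want to benchmark (the actual ordering can be checked with nvidia-smi):

#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    // Pin this process to a single GPU; device 0 is an assumption here,
    // verify the ordering with nvidia-smi or cudaGetDeviceProperties first.
    cudaError_t st = cudaSetDevice(0);
    if (st != cudaSuccess)
        printf("cudaSetDevice failed: %s\n", cudaGetErrorString(st));
    return 0;
}

Setting CUDA_VISIBLE_DEVICES=0 in the environment achieves the same thing without code changes.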
The test results are ~75 ms per multiplication for floats and ~2,000 ms per multiplication for doubles.
If I had an older GPU, this would make a lot of sense: 75 × 32 = 2,400 ≈ 2,000, i.e. double-precision performance would be ~32 times worse, as expected from the arithmetic-instructions table at https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#arithmetic-instructions.
However, my GPU has compute capability 7.5, so from that table I expect double-precision performance to degrade only by a factor of 2.
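For context, here is the back-of-the-envelope arithmetic behind the comparison. The 16.3 TFLOP/s FP32 peak is the commonly quoted boost figure for the TITAN RTX, and the 1:32 FP64 ratio is the one my measurement suggests; both are assumptions here, not values measured on my card:

#include <cstdio>

int main()
{
    // Ideal GEMM time = FLOPs / peak throughput.
    const double N = 8192.0;
    const double flops = 2.0 * N * N * N;      // ~1.1e12 FLOPs per GEMM
    const double peak_fp32 = 16.3e12;          // assumed FP32 peak, FLOP/s
    const double peak_fp64 = peak_fp32 / 32.0; // assumed 1:32 FP64 ratio
    printf("ideal FP32: %.0f ms\n", flops / peak_fp32 * 1e3); // ~67 ms
    printf("ideal FP64: %.0f ms\n", flops / peak_fp64 * 1e3); // ~2159 ms
    return 0;
}

Both numbers are close to what I measure (~75 ms and ~2,000 ms), which is why the observed ratio looks like 32 rather than 2 to me.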
Other info: Ubuntu 18 LTS, nvcc 10.2, driver 440.82.
Here is the CUDA code:
#include <iostream>
#include <chrono>
#include <string>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include <math.h>
#include <stdio.h>
#include <cuda.h>
#include <sstream>
#include <time.h>
// Robert Jenkins' 96-bit mix function, used only to derive a seed for rand().
unsigned long mix(unsigned long a, unsigned long b, unsigned long c)
{
    a=a-b; a=a-c; a=a^(c >> 13);
    b=b-c; b=b-a; b=b^(a << 8);
    c=c-a; c=c-b; c=c^(b >> 13);
    a=a-b; a=a-c; a=a^(c >> 12);
    b=b-c; b=b-a; b=b^(a << 16);
    c=c-a; c=c-b; c=c^(b >> 5);
    a=a-b; a=a-c; a=a^(c >> 3);
    b=b-c; b=b-a; b=b^(a << 10);
    c=c-a; c=c-b; c=c^(b >> 15);
    return c;
}
using namespace std;
int main()
{
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
    printf("Detected %d devices \n", deviceCount);
    printf("Device %d has compute capability %d.%d:\n\t maxshmem %zu. \n\t maxthreads per block %d. \n\t max threads dim %d. %d. %d.\n ", 0,
           deviceProp.major, deviceProp.minor, deviceProp.sharedMemPerBlock, deviceProp.maxThreadsPerBlock, deviceProp.maxThreadsDim[0],
           deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]);

    cudaEvent_t start_d, stop_d;
    cudaEventCreate(&start_d);
    cudaEventCreate(&stop_d);

    // RNG initialization
    unsigned long seed = mix(clock(), time(NULL), 0);
    srand(seed);
    int N = 8192;
    int Nloops = 2;
    size_t memsize = N * N * sizeof(double);

    double *a = (double *)malloc(memsize);
    double *b = (double *)malloc(memsize);
    double *c = (double *)malloc(memsize);
    for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++){
            a[i*N+j] = ((double)rand() / RAND_MAX);
            b[i*N+j] = ((double)rand() / RAND_MAX);
        }

    double *a_d, *b_d, *c_d;
    cudaMalloc((void **)&a_d, memsize);
    cudaMalloc((void **)&b_d, memsize);
    cudaMalloc((void **)&c_d, memsize);
    cudaMemcpy(a_d, a, memsize, cudaMemcpyHostToDevice);
    cudaMemcpy(b_d, b, memsize, cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);
    double alpha = 1.0;
    double beta = 0.0;
    auto start = chrono::steady_clock::now();
    clock_t start1 = clock();
    cudaEventRecord(start_d);
    cudaError_t cuerr = cudaGetLastError(); // read the error once: calling
    if (cuerr != cudaSuccess)               // cudaGetLastError() twice would clear it
        printf("%s \n", cudaGetErrorString(cuerr));
    for (int i = 0; i < Nloops; i++)
        cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, a_d, N, b_d, N, &beta, c_d, N);
    cudaEventRecord(stop_d);
    cudaDeviceSynchronize();
    auto end = chrono::steady_clock::now();
    start1 = clock() - start1;
    cudaEventSynchronize(stop_d);
    cublasDestroy(handle);

    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start_d, stop_d);
    std::cout << "Cuda event " << milliseconds / Nloops << " ms" << endl;
    std::cout << " time elapsed " << start1 / (double)CLOCKS_PER_SEC / Nloops << '\n';
    cout << "time elapsed for 1 multiplication: " << ((double)chrono::duration_cast<chrono::microseconds>(end - start).count()) / (Nloops * 1000.0) << " milliseconds" << endl;

    free(a); free(b); free(c);
    cudaFree(a_d); cudaFree(b_d); cudaFree(c_d);
}
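For the ~75 ms single-precision figure I use the same structure with float buffers and cublasSgemm. A condensed, self-contained sketch (not the exact code I ran; error checks and the extra timers omitted for brevity):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include "cublas_v2.h"

int main()
{
    const int N = 8192, Nloops = 2;
    size_t memsize = (size_t)N * N * sizeof(float);

    // Host inputs: same random fill as the double version, but in float.
    float *a = (float *)malloc(memsize);
    float *b = (float *)malloc(memsize);
    for (long i = 0; i < (long)N * N; i++) {
        a[i] = (float)rand() / RAND_MAX;
        b[i] = (float)rand() / RAND_MAX;
    }

    float *a_d, *b_d, *c_d;
    cudaMalloc((void **)&a_d, memsize);
    cudaMalloc((void **)&b_d, memsize);
    cudaMalloc((void **)&c_d, memsize);
    cudaMemcpy(a_d, a, memsize, cudaMemcpyHostToDevice);
    cudaMemcpy(b_d, b, memsize, cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);
    float alpha = 1.0f, beta = 0.0f;

    // Time Nloops single-precision GEMMs with CUDA events.
    cudaEvent_t start_d, stop_d;
    cudaEventCreate(&start_d);
    cudaEventCreate(&stop_d);
    cudaEventRecord(start_d);
    for (int i = 0; i < Nloops; i++)
        cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
                    &alpha, a_d, N, b_d, N, &beta, c_d, N);
    cudaEventRecord(stop_d);
    cudaEventSynchronize(stop_d);

    float ms = 0;
    cudaEventElapsedTime(&ms, start_d, stop_d);
    printf("Cuda event %f ms\n", ms / Nloops);

    cublasDestroy(handle);
    cudaFree(a_d); cudaFree(b_d); cudaFree(c_d);
    free(a); free(b);
    return 0;
}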
And this is the Python code, which yields consistent results:
import cupy as cp
import time

iterations = 2
a = cp.random.rand(8192, 8192).astype(cp.float64)
b = cp.random.rand(8192, 8192).astype(cp.float64)

def ab(a, b, iterations):
    for i in range(iterations):
        cp.matmul(a, b, out=None)

ab(a, b, 1)  # warm up
cp.cuda.Device(0).synchronize()

t1 = time.time()
ab(a, b, iterations)
cp.cuda.Device(0).synchronize()
t2 = time.time()

total = (t2 - t1) / iterations
print(total)