Problem with response times in CUDA why cpu is faster than gpu?

Question

Hi, I have a problem with response times: I am getting a longer response time on the GPU than on the CPU. The algorithm used is a matrix multiplication algorithm

timed using the following functions:

// Start timers: create two CUDA events and record the first one on stream 0.
cudaEvent_t timer1, timer2;
cudaEventCreate(&timer1);
cudaEventCreate(&timer2);
cudaEventRecord(timer1, 0);
// Block the host until the start event has actually been reached.
cudaEventSynchronize(timer1);

// Stop timers: record the second event on stream 0 and wait for it.
    cudaEventRecord(timer2, 0);
    // NOTE(review): timer1 was already waited on right after it was recorded,
    // so this second wait on it looks redundant.
    cudaEventSynchronize(timer1);
    cudaEventSynchronize(timer2);
    float elapsed;
    // cudaEventElapsedTime reports the time between the two events in milliseconds.
    cudaEventElapsedTime(&elapsed, timer1, timer2);

    // NOTE(review): the events are never passed to cudaEventDestroy; this
    // snippet relies on cudaDeviceReset to release device resources.
    cudaDeviceReset();
    return elapsed;

here is my code on GPU:

/**
 * Multiplies two host matrices on the GPU and returns the elapsed time.
 *
 * Computes C = A * B, where A (hostPtr) is Nrows x n, B (hostPtr2) is
 * n x Ncols and C (hostPtr3) is Nrows x Ncols. All three host buffers are
 * dense row-major arrays of float.
 *
 * The timed interval covers device allocation, both host-to-device copies,
 * the kernel, the device-to-host copy and the frees.
 *
 * @param hostPtr   host matrix A, Nrows rows of n floats
 * @param hostPtr2  host matrix B, n rows of Ncols floats
 * @param hostPtr3  host output matrix C, Nrows rows of Ncols floats
 * @param size      unused; kept so existing callers keep compiling
 * @param Ncols     number of columns of B and C
 * @param Nrows     number of rows of A and C
 * @param n         shared dimension (columns of A, rows of B)
 * @return elapsed time in milliseconds; 0.0f when any dimension is
 *         non-positive (nothing is computed); -1.0f when a CUDA call failed
 *         (hostPtr3 is then not valid).
 */
float Mult_gpu(float* hostPtr, float* hostPtr2, float* hostPtr3, int size, int Ncols, int Nrows, int n) {
    (void)size;  // the launch grid is derived from the output shape instead

    // Degenerate shapes would produce zero-sized allocations and an invalid
    // (zero-block) launch configuration.
    if (Ncols <= 0 || Nrows <= 0 || n <= 0) {
        return 0.0f;
    }

    check("Creating timers");
    cudaEvent_t timer1, timer2;
    cudaEventCreate(&timer1);
    cudaEventCreate(&timer2);
    cudaEventRecord(timer1, 0);
    cudaEventSynchronize(timer1);

    /**********************************/
    /*** Device matrix allocation   ***/
    /**********************************/
    const size_t rowBytesA = (size_t)n * sizeof(float);       // logical row width of A
    const size_t rowBytesBC = (size_t)Ncols * sizeof(float);  // logical row width of B and C

    // The Mult kernel receives a single pitch for all three matrices, so every
    // allocation is requested with the same (widest) row width and the
    // resulting pitches are verified to agree before launching.
    const size_t allocRowBytes = rowBytesA > rowBytesBC ? rowBytesA : rowBytesBC;

    float* devPtr = 0;
    float* devPtr2 = 0;
    float* devPtr3 = 0;
    size_t pitch = 0;
    size_t pitch2 = 0;
    size_t pitch3 = 0;

    cudaError_t status = cudaMallocPitch(&devPtr, &pitch, allocRowBytes, Nrows);
    if (status == cudaSuccess) {
        status = cudaMallocPitch(&devPtr2, &pitch2, allocRowBytes, n);
    }
    if (status == cudaSuccess) {
        status = cudaMallocPitch(&devPtr3, &pitch3, allocRowBytes, Nrows);
    }
    if (status == cudaSuccess && (pitch2 != pitch || pitch3 != pitch)) {
        status = cudaErrorInvalidPitchValue;
    }

    /**********************************/
    /*** Host -> device transfers   ***/
    /**********************************/
    if (status == cudaSuccess) {
        status = cudaMemcpy2D(devPtr, pitch, hostPtr, rowBytesA, rowBytesA, Nrows, cudaMemcpyHostToDevice);
    }
    if (status == cudaSuccess) {
        status = cudaMemcpy2D(devPtr2, pitch, hostPtr2, rowBytesBC, rowBytesBC, n, cudaMemcpyHostToDevice);
    }

    /**********************************/
    /*** Kernel launch              ***/
    /**********************************/
    if (status == cudaSuccess) {
        dim3 block(32, 32);  // threads per block
        // One thread per element of C: x spans the columns, y spans the rows
        // (ceil-division so partial tiles at the edges are still covered).
        dim3 grid((Ncols + block.x - 1) / block.x, (Nrows + block.y - 1) / block.y);

        Mult<<<grid, block>>>(devPtr, devPtr2, devPtr3, pitch, Ncols, Nrows, n);
        status = cudaGetLastError();  // launch-configuration errors
    }
    if (status == cudaSuccess) {
        status = cudaDeviceSynchronize();  // errors raised while the kernel ran
    }

    /**********************************/
    /*** Device -> host transfer    ***/
    /**********************************/
    if (status == cudaSuccess) {
        status = cudaMemcpy2D(hostPtr3, rowBytesBC, devPtr3, pitch, rowBytesBC, Nrows, cudaMemcpyDeviceToHost);
    }

    // cudaFree accepts a null pointer, so this is safe on every path.
    cudaFree(devPtr);
    cudaFree(devPtr2);
    cudaFree(devPtr3);

    // Stop timers
    cudaEventRecord(timer2, 0);
    cudaEventSynchronize(timer2);
    float elapsed = 0.0f;
    cudaEventElapsedTime(&elapsed, timer1, timer2);
    cudaEventDestroy(timer1);
    cudaEventDestroy(timer2);

    // Kept from the original: the device is reset after every call.
    cudaDeviceReset();
    return status == cudaSuccess ? elapsed : -1.0f;
}

and here is my code on CPU

   /**
    * Reference matrix multiplication on the host, timed with CUDA events.
    *
    * Computes C = A * B for dense row-major float matrices:
    * A is Nrows x n, B is n x Ncols, C is Nrows x Ncols.
    *
    * @param hostPtrA  input matrix A (Nrows * n floats)
    * @param HostPtrB  input matrix B (n * Ncols floats)
    * @param hostPtrC  output matrix C (Nrows * Ncols floats), overwritten
    * @param Ncols     number of columns of B and C
    * @param Nrows     number of rows of A and C
    * @param n         shared dimension (columns of A, rows of B)
    * @return time between the two recorded events in milliseconds; 0.0f if
    *         the event API could not produce a measurement.
    */
   float Mult_cpu(float* hostPtrA, float* HostPtrB, float* hostPtrC, int Ncols, int Nrows, int n)
{
    cudaEvent_t timer1, timer2;
    cudaEventCreate(&timer1);
    cudaEventCreate(&timer2);
    cudaEventRecord(timer1, 0);
    cudaEventSynchronize(timer1);

    for (int i = 0; i < Nrows; ++i) {
        // Row i of A and of C; hoisted because they do not depend on j or k.
        const float* rowA = hostPtrA + i * n;
        float* rowC = hostPtrC + i * Ncols;
        for (int j = 0; j < Ncols; ++j) {
            float suma = 0;
            for (int k = 0; k < n; ++k) {
                suma += rowA[k] * HostPtrB[k * Ncols + j];
            }
            rowC[j] = suma;
        }
    }

    // Stop timers
    cudaEventRecord(timer2, 0);
    cudaEventSynchronize(timer2);
    // Initialised so a failed event call cannot leak an indeterminate value.
    float elapsed = 0.0f;
    cudaEventElapsedTime(&elapsed, timer1, timer2);

    // This function does not reset the device, so the events must be
    // released explicitly or they leak on every call.
    cudaEventDestroy(timer1);
    cudaEventDestroy(timer2);

    return elapsed;
}

When I use a 500x500 matrix, or a matrix of any other size, the CPU is faster than the GPU and I don't understand why. I don't know whether the problem is my kernel or the CUDA functions that I'm using.

my kernel code

/**
 * Matrix multiplication kernel: C = A * B, one thread per element of C.
 *
 * Launch layout: 2D grid of 2D blocks; the x dimension spans the columns of C
 * and the y dimension spans its rows. Threads that fall outside the
 * Nrows x Ncols output do nothing. No shared memory is used.
 *
 * Precondition: all three device matrices are pitched allocations that share
 * the same row pitch, because a single pitch value is used to address
 * A, B and C.
 *
 * @param devPtrA  device matrix A, Nrows rows of n floats
 * @param devPtrB  device matrix B, n rows of Ncols floats
 * @param devPtrC  device output matrix C, Nrows rows of Ncols floats
 * @param pitch    row pitch in bytes, common to A, B and C
 * @param Ncols    number of columns of B and C
 * @param Nrows    number of rows of A and C
 * @param n        shared dimension (columns of A, rows of B)
 */
__global__ void Mult(float* devPtrA, float* devPtrB, float* devPtrC, size_t pitch, int Ncols, int Nrows, int n)
{
    const int r = blockDim.y * blockIdx.y + threadIdx.y;  // row of C handled by this thread
    const int c = blockDim.x * blockIdx.x + threadIdx.x;  // column of C handled by this thread

    // r indexes rows and c indexes columns, so each is checked against its
    // own extent; this matters as soon as the matrices are not square.
    if (r >= Nrows || c >= Ncols) {
        return;
    }

    // Row r of A does not change inside the dot-product loop.
    const float* rowA = (const float*)((const char*)devPtrA + r * pitch);

    float acc = 0.0f;  // must start at zero: it is accumulated with +=
    for (int k = 0; k < n; ++k) {
        const float* rowB = (const float*)((const char*)devPtrB + k * pitch);
        acc += rowA[k] * rowB[c];
    }

    float* rowC = (float*)((char*)devPtrC + r * pitch);
    rowC[c] = acc;
}

Person.Woman.Man.Camera.TV Person.Woman.Man.Camera.TV · Accepted Answer · 2020-04-03T16:44:04

You should read on the concept of SIMT architecture, CUDA execution model and branch divergence. Analyze your CUDA kernel performance with a profiler. I suspect that the condition if ((r < Ncols) && (c < Nrows)) in your kernel causes threads in each warp to diverge and hence the reduced performance. Also pitch affects the global memory access pattern in your code which is another factor in the performance of CUDA kernels. Some other excellent optimization tips can be found here.

Problem with response times in CUDA why cpu is faster than gpu?

2 Answers