0
votes

I am working on Harris corner detection on GPU. I am observing unusual behavior of my CPU performance.

Following is my Main.cpp file. If I run it with the call to my kernel wrapper function (this function launches the various GPU kernels) "Harris_Algo(a,d_a,d_g,dx_My,dy_My,dxdy_My,suppressed,corner_response,Res,Height,length,SIZE);" commented out, my function calls to read the next frame and convert it to grayscale (cap.read(Masrc) and cvtColor( Masrc, src, CV_BGR2GRAY )) take on average 0.003 and 0.004 seconds per frame.

Surprisingly when I uncomment my GPU kernel call function "Harris_Algo" the same CPU function (cap.read(Masrc) and cvtColor( Masrc, src, CV_BGR2GRAY )) takes on an average 0.009 and 0.008 Sec per frame.

As timing is very critical in my application, this variation is killing the advantage I am getting by using the GPU. These two function calls have nothing to do with the GPU, yet they take more time when I call the GPU function (kernel).

What I think is happening is that calling my GPU function (kernel) increases the overhead on the CPU, so its utilization increases and performance decreases. But this variation is huge. Is there any other suitable method to do this?

Any help is appreciated.

I am using Jetson TK1 GPU board.

Main.cpp file

#include <iostream>
#include <time.h>
#include <fstream>
#include "opencv2/imgproc/imgproc.hpp"
#include "opencv2/highgui/highgui.hpp"

using namespace std;
using namespace cv;
 // Host-side wrappers implemented in Kernal.cu (Main.cpp is compiled by g++, so
 // these are plain C++ declarations with no CUDA syntax; linked against the
 // nvcc-compiled object file).

 // Releases the device buffers previously obtained via MemAlloc.
 void Cuda_Free(unsigned char *d_a,unsigned char *d_g,int *dx_My,int *dy_My,int *dxdy_My,int *suppressed,int *corner_response);

// Runs the full GPU pipeline on one grayscale frame: copies `a` to the device,
// launches the filter/gradient/Harris/suppression kernels, and copies the
// suppressed response back into the host buffer `Res`.
void Harris_Algo(unsigned char *a,unsigned char *d_a,unsigned char *d_g,int *dx_My,int *dy_My,int *dxdy_My,int *suppressed,int *corner_response,int *Res, int Height,int length,int SIZE);

// Allocates the per-frame device buffers (pointers passed by reference so the
// caller's variables receive the device addresses).
void MemAlloc(unsigned char *&d_a,unsigned char *&d_g,int *&dx_My,int *&dy_My,int *&dxdy_My,int *&suppressed,int *&corner_response,int SIZE);


    // Entry point: opens the video named on the command line, converts each
    // frame to grayscale, and runs the GPU Harris pipeline on it, printing
    // per-stage timings.
    int main(int argc, char** argv)
    {
        // Bug fix: guard against a missing argument before dereferencing argv[1].
        if (argc < 2)
        {
            std::cout << "Usage: " << argv[0] << " <video file>" << std::endl;
            return -1;
        }

        cv::VideoCapture cap(argv[1]);

        if (!cap.isOpened())
        {
            std::cout << "!!! Failed to open file: " << argv[1] << std::endl;
            return -1;
        }

        double time_spent;
        // NOTE(review): clock() measures CPU time, not wall-clock time, so these
        // intervals also grow whenever the CPU spins waiting on the GPU (e.g.
        // inside a blocking cudaMemcpy). For wall-clock timing prefer
        // clock_gettime(CLOCK_MONOTONIC, ...) or cv::getTickCount().
        clock_t begin3, end3, begin4;
        bool start = false;

        Mat src;
        // Bug fix: initialize every pointer so the cleanup path below is safe
        // even when no frame is ever read (start stays false, nothing allocated).
        unsigned char *a = NULL, *d_a = NULL, *d_g = NULL;
        int *dx_My = NULL, *Res = NULL;
        int *dy_My = NULL;
        int *dxdy_My = NULL;
        int *suppressed = NULL;
        int *corner_response = NULL;
        int length;
        int Height;
        int SIZE;
        Size S;
        VideoWriter outputVideo;

        Mat Masrc;
        for(;;)
        {
            begin4 = clock();
            begin3 = clock();
            if (!cap.read(Masrc))
                break;
            end3 = clock();
            time_spent = (double)(end3 - begin3) / CLOCKS_PER_SEC;
            cout<<"Read Frame              : "<<time_spent<<endl;

            begin3 = clock();
            cvtColor( Masrc, src, CV_BGR2GRAY );
            end3 = clock();
            time_spent = (double)(end3 - begin3) / CLOCKS_PER_SEC;
            cout<<"Gray Convert             : "<<time_spent<<endl;

            begin3 = clock();
            // One-time allocation: the frame geometry is fixed for the whole
            // video, so host and device buffers are created on the first frame
            // only (cudaMalloc per frame would be far more expensive).
            if(start == false)
            {
                length     = src.cols;
                Height     = src.rows;

                cout<<"Width"<<length<<endl;
                cout<<"Height"<<Height<<endl;
                SIZE = ((length)*(Height));

                Res = new int [SIZE];

                MemAlloc(d_a,d_g,dx_My,dy_My,dxdy_My,suppressed,corner_response,SIZE);

                start = true;
            }

            // src.data stays valid until the next cvtColor overwrites src.
            a = src.data;

            end3 = clock();
            time_spent = (double)(end3 - begin3) / CLOCKS_PER_SEC;
            cout<<"Initial Processsing Time              : "<<time_spent<<endl;

            Harris_Algo(a,d_a,d_g,dx_My,dy_My,dxdy_My,suppressed,corner_response,Res,Height,length,SIZE);

            begin3 = clock();

          // imshow( "Harris_OUT", Masrc );
         //   char key = cvWaitKey(1);
         //   if (key == 27) // ESC
          //      break;

            end3 = clock();
            time_spent = (double)(end3 - begin3) / CLOCKS_PER_SEC;
            cout<<"Time After Displaying image on Output : "<<time_spent<<endl;
            time_spent = (double)(end3 - begin4) / CLOCKS_PER_SEC;
            cout<<"Overall Time of entire program exec   : "<<time_spent<<endl;
            cout<<"-----------------------------------------------------------------------------"<<endl;

        }

        // Free device memory only if it was actually allocated.
        if (start)
            Cuda_Free(d_a,d_g,dx_My,dy_My,dxdy_My,suppressed,corner_response);
        // Bug fix: memory from new[] must be released with delete[] (the
        // original 'delete Res' is undefined behaviour). delete[] NULL is a no-op.
        delete [] Res;
        cvWaitKey(0);
        return 0;
    }

Kernal.cu

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <time.h>
#include <fstream>

using namespace std;
// Body elided in the post ("/*...*/"). Presumably computes a per-pixel Harris
// corner response from the gradient-product buffers dx_My/dy_My/dxdy_My into
// corner_response -- confirm against the full kernel source. Launched with
// SIZE/512+1 blocks of 512 threads (one thread per pixel).
__global__ void Harris_Mat(int *corner_response,int* dx_My,int* dy_My,int* dxdy_My,int rows, int cols,int Size)
{

   /*...*/
}

// Body elided in the post ("/* ... */"). Presumably performs non-maximum
// suppression of the corner response (input) into output -- confirm against
// the full kernel source. Launched with SIZE/512+1 blocks of 512 threads.
__global__ void Supress_Neighbour(int *input,int *output, int rows, int cols, int Size) 
{
   /* ... */
}

// Smoothing pass over the grayscale frame D (row stride `Length`, total
// `size_m` pixels) writing into M. One thread per pixel; out-of-range threads
// from the padded launch (SIZE/512+1 blocks) are rejected by the bounds guard.
__global__ void VectorGauss(unsigned char *D, unsigned char *M,int Length, int size_m )
{
    int i =  blockIdx . x * blockDim . x + threadIdx . x;
    // Bug fix: the guard used bitwise '&'; use logical '&&' for the bounds test.
    if(i>0 && i<size_m)
    {
        if ((i%Length) ==(0) || (i%Length) == (Length-1)|| (i<Length) || (i>(size_m-Length))){
            M[i] = 0;
            // Bug fix: border pixels must not fall through to the neighbourhood
            // sum below -- for pixels in the last rows, D[2*Length+i] reads past
            // the end of the buffer.
            return;
        }

        // Bug fix: the last term was 'D[2*Length+i+]' (a syntax error); by the
        // column pattern of the other terms it should be D[2*Length+i+1].
        float Val = (( D[i] +(D[Length+i]) + D[2*Length+i]) +(D[i]+ (D[Length+i])+ D[2*Length+i])
                +(D[i+1] + D[i+Length+1] + D[2*Length+i+1]));
        // NOTE(review): Val is computed but never stored to M[i] in the posted
        // code -- the write-back was presumably elided when the question was
        // trimmed; confirm against the full kernel source.
        (void)Val;
    }
}

// Body elided in the post ("/* ... */"). Presumably computes per-pixel gradient
// products (dx*dx, dy*dy, dx*dy) from the smoothed image D into dx/dy/dxdy --
// confirm against the full kernel source. Launched with SIZE/512+1 blocks of
// 512 threads.
__global__ void VectorAdd(unsigned char *D,int* dx,int* dy,int* dxdy,int Length, int size_m)
{

/* ... */
}


// One-time device allocations for the whole video (SIZE = width * height
// pixels), reused for every frame because cudaMalloc is expensive. The pointer
// parameters are references so the caller's variables receive the device
// addresses. Buffers are released in Cuda_Free.
__host__ void MemAlloc(unsigned char *&d_a,unsigned char *&d_g,int *&dx_My,int *&dy_My,int *&dxdy_My,int *&suppressed,int *&corner_response,int SIZE)
{
    // Bug fix: the original ignored every cudaMalloc return code, so an
    // allocation failure surfaced only later as a crash inside the kernels.
    // Short-circuit on the first failure and report it.
    cudaError_t err = cudaSuccess;

    if (err == cudaSuccess) err = cudaMalloc (&d_a,SIZE*sizeof(unsigned char));
    if (err == cudaSuccess) err = cudaMalloc (&d_g,SIZE*sizeof(unsigned char));
    if (err == cudaSuccess) err = cudaMalloc (&dx_My,SIZE*sizeof(int));
    if (err == cudaSuccess) err = cudaMalloc (&dy_My,SIZE*sizeof(int));
    if (err == cudaSuccess) err = cudaMalloc (&dxdy_My,SIZE*sizeof(int));
    if (err == cudaSuccess) err = cudaMalloc (&suppressed,SIZE*sizeof(int));
    if (err == cudaSuccess) err = cudaMalloc (&corner_response,SIZE*sizeof(int));

    if (err != cudaSuccess)
        cerr << "MemAlloc: cudaMalloc failed: " << cudaGetErrorString(err) << endl;
}

// Runs the full Harris pipeline on one frame: uploads the grayscale image `a`
// (SIZE bytes), launches the four kernels, and downloads the suppressed
// response into the host buffer `Res`. All device buffers were allocated once
// in MemAlloc.
__host__ void Harris_Algo(unsigned char *a,unsigned char *d_a,unsigned char *d_g,int *dx_My,int *dy_My,int *dxdy_My,int *suppressed,int *corner_response,int *Res, int Height,int length,int SIZE)
{
    double time_spent;
    clock_t begin3, end3;
    begin3 = clock();

    // Synchronous host-to-device upload of the frame.
    cudaMemcpy(d_a,a,SIZE*sizeof(unsigned char),cudaMemcpyHostToDevice);

    // One thread per pixel: 512 threads per block, SIZE/512+1 blocks.
    VectorGauss<<< SIZE/512+1,512>>>(d_a,d_g,length,SIZE);

    VectorAdd<<< SIZE/512+1,512>>>(d_g,dx_My,dy_My,dxdy_My,length,SIZE);

    Harris_Mat<<< SIZE/512+1,512>>>(corner_response,dx_My,dy_My,dxdy_My,Height,length,SIZE);

    Supress_Neighbour<<< SIZE/512+1,512>>>(corner_response, suppressed,Height, length, SIZE); 

    // Bug fix: kernel launches fail silently; surface launch-configuration
    // errors instead of letting the memcpy below return garbage.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        cerr << "Harris_Algo: kernel launch failed: " << cudaGetErrorString(err) << endl;

    // This blocking device-to-host copy also waits for all four kernels to
    // finish, so the interval below includes the whole GPU pipeline. Note that
    // clock() reports CPU time, not wall time: any CPU cycles the driver burns
    // waiting here may also be what inflates the timings of the other host
    // calls measured in main().
    cudaMemcpy(Res,suppressed,SIZE*sizeof(int),cudaMemcpyDeviceToHost);

    // Catch asynchronous execution errors surfaced by the synchronizing copy.
    err = cudaGetLastError();
    if (err != cudaSuccess)
        cerr << "Harris_Algo: CUDA error: " << cudaGetErrorString(err) << endl;

    end3 = clock();
    time_spent = (double)(end3 - begin3) / CLOCKS_PER_SEC;

    cout<<"Processsing Time of Algorithm         : "<<time_spent<<endl;
}

// Releases every device buffer obtained in MemAlloc. cudaFree takes a void*,
// so the differently-typed pointers can be collected and freed in one loop.
__host__ void Cuda_Free(unsigned char *d_a,unsigned char *d_g,int *dx_My,int *dy_My,int *dxdy_My,int *suppressed,int *corner_response)
{
    void *device_buffers[] = {
        d_a, d_g, dx_My, dy_My, dxdy_My, corner_response, suppressed
    };
    const size_t count = sizeof(device_buffers) / sizeof(device_buffers[0]);

    for (size_t k = 0; k < count; ++k)
        cudaFree(device_buffers[k]);
}

I have used NVCC to compile, and have also tried both (NVCC and g++), but the result is the same.

running my code using

g++-4.8 -c Main.cpp
nvcc -c Kernal.cu
g++-4.8 -o Output Main.o Kernal.o -L/usr/local/cuda/lib -lcudart -lcuda `pkg-config opencv --cflags --libs`
1
I think your CUDA memory allocation is creating the overhead. I think since you statically define the memory only once, and clear only once. Why dont you try cudaMalloc and cudaFree for each frame (inside this function Harris_Algo) since in your algorithm previous info is not required.kcc__
Yes, CUDA Memory allocation is expensive that is the reason i am doing it only once to process entire video frames. If i do it for every frames then my overhead will be huge. I have tried that also but overall time will be more if we allocate memory for every frame.Vinay Patel
Your entire time measurement approach is wrong. Please read the man page for clock(). You cannot use clock to time code in the way you are doing. CPU seconds and seconds are not the same thingtalonmies
@talonmies The issue here is calling GPU function increases the time for CPU functions. Even if its CPU second, its values is more when calling GPU function.Vinay Patel
cplusplus.com/reference/ctime/clock This link explains use of clock(). IVinay Patel

1 Answers

1
votes

I see two main reasons for which you have longer times on CPU when calling your function related to the GPU :

  • It calls two copies, one from RAM to VRAM, and one from VRAM back to RAM, with cudaMemCpy. This has a cost.
  • The second copy is called after your kernel launches, it makes you wait for the GPU to finish the computation, as cudaMemCpy is blocking/sync.

You may get a performance enhancement by doing this computation on the GPU, but if the memory copies cost more, then you get less performance than if you did everything on the CPU side.