I am working on Harris corner detection on GPU. I am observing unusual behavior of my CPU performance.
Following is my Main.cpp file if I run this call by commenting my kernal function (This function calls various kernal of GPU) "Harris_Algo(a,d_a,d_g,dx_My,dy_My,dxdy_My,suppressed,corner_response,Res,Height,length,SIZE);"
my function call to read next frame and converting to gray scale ("cap.read(Masrc) and cvtColor( Masrc, src, CV_BGR2GRAY )) takes average 0.003 and 0.004 Sec per frame.
Surprisingly when I uncomment my GPU kernel call function "Harris_Algo" the same CPU function (cap.read(Masrc) and cvtColor( Masrc, src, CV_BGR2GRAY )) takes on an average 0.009 and 0.008 Sec per frame.
As timing is very critical in my application this variation is killing the advantage I am getting by using GPU.This two function call has nothing to do with GPU still it is taking more time when I call GPU function (kernel).
What, I think is calling my GPU funtion (Kernel) increase the overhead on CPU so its utilization increases and performance decreases. But this variation is huge. Any other suitable method to do this.
Any help is appreciated.
I am using Jetson TK1 GPU board.
Main.cpp file
#include <iostream>
#include <time.h>
#include <fstream>
#include "opencv2/imgproc/imgproc.hpp"
#include "opencv2/highgui/highgui.hpp"
using namespace std;
using namespace cv;
void Cuda_Free(unsigned char *d_a,unsigned char *d_g,int *dx_My,int *dy_My,int *dxdy_My,int *suppressed,int *corner_response);
void Harris_Algo(unsigned char *a,unsigned char *d_a,unsigned char *d_g,int *dx_My,int *dy_My,int *dxdy_My,int *suppressed,int *corner_response,int *Res, int Height,int length,int SIZE);
void MemAlloc(unsigned char *&d_a,unsigned char *&d_g,int *&dx_My,int *&dy_My,int *&dxdy_My,int *&suppressed,int *&corner_response,int SIZE);
int main(int argc, char** argv)
{
cv::VideoCapture cap(argv[1]);
if (!cap.isOpened())
{
std::cout << "!!! Failed to open file: " << argv[1] << std::endl;
return -1;
}
double time_spent;
clock_t begin3, end3,begin4;
bool start = false;
Mat src;
unsigned char *a,*d_a,*d_g;
int *dx_My,*Res;
int *dy_My;
int *dxdy_My;
int *suppressed;
int *corner_response;
int length;
int Height;
int SIZE;
Size S;
VideoWriter outputVideo;
Mat Masrc;
for(;;)
{
begin4 = clock();
begin3 = clock();
if (!cap.read(Masrc))
break;
end3 = clock();
time_spent = (double)(end3 - begin3) / CLOCKS_PER_SEC;
cout<<"Read Frame : "<<time_spent<<endl;
begin3 = clock();
cvtColor( Masrc, src, CV_BGR2GRAY );
end3 = clock();
time_spent = (double)(end3 - begin3) / CLOCKS_PER_SEC;
cout<<"Gray Convert : "<<time_spent<<endl;
begin3 = clock();
if(start == false)
{
length = src.cols;
Height = src.rows;
cout<<"Width"<<length<<endl;
cout<<"Height"<<Height<<endl;
SIZE = ((length)*(Height));
Res = new int [SIZE];
MemAlloc(d_a,d_g,dx_My,dy_My,dxdy_My,suppressed,corner_response,SIZE);
start = true;
}
a = src.data;
end3 = clock();
time_spent = (double)(end3 - begin3) / CLOCKS_PER_SEC;
cout<<"Initial Processsing Time : "<<time_spent<<endl;
Harris_Algo(a,d_a,d_g,dx_My,dy_My,dxdy_My,suppressed,corner_response,Res,Height,length,SIZE);
begin3 = clock();
// imshow( "Harris_OUT", Masrc );
// char key = cvWaitKey(1);
// if (key == 27) // ESC
// break;
end3 = clock();
time_spent = (double)(end3 - begin3) / CLOCKS_PER_SEC;
cout<<"Time After Displaying image on Output : "<<time_spent<<endl;
time_spent = (double)(end3 - begin4) / CLOCKS_PER_SEC;
cout<<"Overall Time of entire program exec : "<<time_spent<<endl;
cout<<"-----------------------------------------------------------------------------"<<endl;
}
Cuda_Free(d_a,d_g,dx_My,dy_My,dxdy_My,suppressed,corner_response);
delete Res;
cvWaitKey(0);
}
Kernal.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <time.h>
#include <fstream>
using namespace std;
__global__ void Harris_Mat(int *corner_response,int* dx_My,int* dy_My,int* dxdy_My,int rows, int cols,int Size)
{
/*...*/
}
__global__ void Supress_Neighbour(int *input,int *output, int rows, int cols, int Size)
{
/* ... */
}
__global__ void VectorGauss(unsigned char *D, unsigned char *M,int Length, int size_m )
{
float Val;
int i = blockIdx . x * blockDim . x + threadIdx . x;
if(i>0 & i<size_m)
{
if ((i%Length) ==(0) || (i%Length) == (Length-1)|| (i<Length) || (i>(size_m-Length))){
M[i] = 0;
}
Val = (( D[i] +(D[Length+i]) + D[2*Length+i]) +(D[i]+ (D[Length+i])+ D[2*Length+i])
+(D[i+1] + D[i+Length+1] + D[2*Length+i+]));
}
}
__global__ void VectorAdd(unsigned char *D,int* dx,int* dy,int* dxdy,int Length, int size_m)
{
/* ... */
}
__host__ void MemAlloc(unsigned char *&d_a,unsigned char *&d_g,int *&dx_My,int *&dy_My,int *&dxdy_My,int *&suppressed,int *&corner_response,int SIZE)
{
cudaMalloc (&d_a,SIZE*sizeof(unsigned char));
cudaMalloc (&d_g,SIZE*sizeof(unsigned char));
cudaMalloc (&dx_My,SIZE*sizeof(int));
cudaMalloc (&dy_My,SIZE*sizeof(int));
cudaMalloc (&dxdy_My,SIZE*sizeof(int));
cudaMalloc (&suppressed,SIZE*sizeof(int));
cudaMalloc (&corner_response,SIZE*sizeof(int));
}
__host__ void Harris_Algo(unsigned char *a,unsigned char *d_a,unsigned char *d_g,int *dx_My,int *dy_My,int *dxdy_My,int *suppressed,int *corner_response,int *Res, int Height,int length,int SIZE)
{
double time_spent;
clock_t begin3, end3;
begin3 = clock();
cudaMemcpy(d_a,a,SIZE*sizeof(unsigned char),cudaMemcpyHostToDevice);
VectorGauss<<< SIZE/512+1,512>>>(d_a,d_g,length,SIZE);
VectorAdd<<< SIZE/512+1,512>>>(d_g,dx_My,dy_My,dxdy_My,length,SIZE);
Harris_Mat<<< SIZE/512+1,512>>>(corner_response,dx_My,dy_My,dxdy_My,Height,length,SIZE);
Supress_Neighbour<<< SIZE/512+1,512>>>(corner_response, suppressed,Height, length, SIZE);
cudaMemcpy(Res,suppressed,SIZE*sizeof(int),cudaMemcpyDeviceToHost);
end3 = clock();
time_spent = (double)(end3 - begin3) / CLOCKS_PER_SEC;
cout<<"Processsing Time of Algorithm : "<<time_spent<<endl;
}
__host__ void Cuda_Free(unsigned char *d_a,unsigned char *d_g,int *dx_My,int *dy_My,int *dxdy_My,int *suppressed,int *corner_response)
{
cudaFree(d_a);
cudaFree(d_g);
cudaFree(dx_My);
cudaFree(dy_My);
cudaFree(dxdy_My);
cudaFree(corner_response);
cudaFree(suppressed);
}
I have use NVCC to compile and also used (NVCC and g++) both but same result.
running my code using
g++-4.8 -c Main.cpp
nvcc -c Kernal.cu
g++-4.8 -o Output Main.o Kernal.o -L/usr/local/cuda/lib -lcudart -lcuda `pkg-config opencv --cflags --libs`
clock()
. You cannot use clock to time code in the way you are doing. CPU seconds and seconds are not the same thing – talonmies