I currently have three methos of measuring the elapsed time, two using CUDA events and the other recording start and end UNIX. The ones using CUDA events measure two things, one measures the entire outer loop time, and the other sum all kernel execution times.
Here's the code:
int64 x1, x2;
cudaEvent_t start;
cudaEvent_t end;
cudaEvent_t s1, s2;
float timeValue;
#define timer_s cudaEventRecord(start, 0);
#define timer_e cudaEventRecord(end, 0); cudaEventSynchronize(end); cudaEventElapsedTime( &timeValue, start, end ); printf("time: %f ms \n", timeValue);
cudaEventCreate( &start );
cudaEventCreate( &end );
cudaEventCreate( &s1 );
cudaEventCreate( &s2 );
cudaEventRecord(s1, 0);
x1 = GetTimeMs64();
for(int r = 0 ; r < 2 ; r++)
{
timer_s
kernel1<<<1, x>>>(gl_devdata_ptr);
cudaThreadSynchronize();
timer_e
sum += timeValue;
for(int j = 0 ; j < 5; j++)
{
timer_s
kernel2<<<1,x>>>(gl_devdata_ptr);
cudaThreadSynchronize();
timer_e
sum += timeValue;
timer_s
kernel3<<<1,x>>>(gl_devdata_ptr);
cudaThreadSynchronize();
timer_e
sum += timeValue;
}
timer_s
kernel4<<<y, x>>> (gl_devdata_ptr);
cudaThreadSynchronize();
timer_e
sum += timeValue;
}
x2 = GetTimeMs64();
cudaEventRecord(s2, 0);
cudaEventSynchronize(s2);
cudaEventElapsedTime( &timeValue, s1, s2 );
printf("elapsed cuda : %f ms \n", timeValue);
printf("elapsed sum : %f ms \n", sum);
printf("elapsed win : %d ms \n", x2-x1);
The GetTimeMs64 is something I found here on StackOverflow:
int64 GetTimeMs64()
{
/* Windows */
FILETIME ft;
LARGE_INTEGER li;
uint64 ret;
/* Get the amount of 100 nano seconds intervals elapsed since January 1, 1601 (UTC) and copy it
* to a LARGE_INTEGER structure. */
GetSystemTimeAsFileTime(&ft);
li.LowPart = ft.dwLowDateTime;
li.HighPart = ft.dwHighDateTime;
ret = li.QuadPart;
ret -= 116444736000000000LL; /* Convert from file time to UNIX epoch time. */
ret /= 10000; /* From 100 nano seconds (10^-7) to 1 millisecond (10^-3) intervals */
return ret;
}
Those aren't the real variable names nor the right kernel names, I just removed some to make the code smaller.
So the problem is, every measure gives me a really different total time.
Some examples I just ran:
elapsed cuda : 21.076832
elapsed sum : 4.177984
elapsed win : 27
So why is there such a huge difference? The sum of all kernel calls is around 4 ms, where are the other 18ms? CPU time?