Is there a predefined time required for an offload call to transfer the data (parameters) of a function from the host to an Intel MIC (Xeon Phi coprocessor, 3120 series)?
Specifically, I make an offload call ("#pragma offload target(mic)") for a function that I want to be executed on the MIC. The function has 15 parameters (pointers and variables), and I have already confirmed that the parameters are passed correctly to the MIC. However, I have simplified the code in order to measure the time needed to pass the parameters, so it now contains just one simple "printf()" call. I use "gettimeofday()" from the "sys/time.h" header file to measure time, as shown in the code below:
Some hardware information for the host: Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz / CentOS release 6.8 / PCI Express Revision 2.0
main.c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/time.h>
#include <string.h>
/* Globals mirrored on the coprocessor: the target(mic) attribute makes a
   MIC-side copy of each symbol so the offloaded code can access it. */
__attribute__ (( target (mic))) unsigned long long ForSolution = 0;
__attribute__ (( target (mic))) unsigned long long sufficientSol = 1;
__attribute__ (( target (mic))) float timer = 0.0;
/* Prototype of the offloaded routine (defined in function.c). */
__attribute__ (( target (mic))) void function(float *grid, float *displ, unsigned long long *li, unsigned long long *repet, float *solution, unsigned long long dim, unsigned long long numOfa, unsigned long long numLoops, unsigned long long numBlock, unsigned long long thread, unsigned long long blockGrid, unsigned long long station, unsigned long long bytesSol, unsigned long long totalSol, volatile unsigned long long *prog);
float *grid, *displ, *solution;
/* NOTE(review): 'repet' is declared as a SCALAR here, yet it is used with a
   length() clause in the pragma below and passed where the prototype expects
   'unsigned long long *repet' -- confirm it should be a pointer. */
unsigned long long *li,repet;
volatile unsigned long long *prog;
unsigned long long dim = 10, grid_a = 3, numLoops = 2, numBlock = 0;
unsigned long long thread = 220, blockGrid = 0, station = 12;
unsigned long long station_at = 8, bytesSol, totalSol;
/* NOTE(review): the statements from here on appear outside any function in
   this excerpt -- presumably they live inside main() in the full file. */
bytesSol = dim*sizeof(float);
/* Round 1 GiB down to a whole multiple of bytesSol. */
totalSol = ((1024 * 1024 * 1024) / bytesSol) * bytesSol;
/******** Some memcpy() functions here for the pointers*********/
/* 'start'/'end' are presumably 'struct timeval' objects declared in the
   omitted code -- TODO confirm. */
gettimeofday(&start, NULL);
/* NOTE(review): two likely problems with the timing below.
   1) The length() clause of in/out takes an ELEMENT count, not a byte
      count; multiplying by sizeof() here transfers sizeof()-times more
      data than intended (e.g. out(solution:length(totalSol/sizeof(float)))
      moves totalSol/4 floats = totalSol bytes only by accident, while the
      in() clauses move 8x/4x the intended amount). Verify against the
      Intel offload pragma documentation.
   2) The FIRST offload in a process also pays the one-time coprocessor
      initialization / target-image upload cost, so this measurement is
      init + transfer, not transfer alone. Consider issuing an empty
      warm-up offload (or OFFLOAD_INIT=on_start) before timing -- this is
      presumably the bulk of the 3.5 s observed. */
#pragma offload target(mic) \
in(grid:length(dim * grid_a * sizeof(float))) \
in(displ:length(station * station_at * sizeof(float))) \
in(li:length(dim * sizeof(unsigned long long))) \
in(repet:length(dim * sizeof(unsigned long long))) \
out(solution:length(totalSol/sizeof(float))) \
in(dim,grid_a,numLoops,numBlock,thread,blockGrid,station,bytesSol,totalSol) \
in(prog:length(sizeof(volatile unsigned long long))) \
inout(ForSolution,sufficientSol,timer)
{
function(grid, displ, li, repet, solution, dim, grid_a, numLoops, numBlock, thread, blockGrid, station, bytesSol, totalSol, prog);
}
gettimeofday(&end, NULL);
/* Host wall-clock time of the whole offload minus the device-side compute
   time ('timer', written by function() and copied back via inout). */
printf("Time to tranfer data on Intel Xeon Phi: %f sec\n", (((end.tv_sec - start.tv_sec) * 1000000.0 + (end.tv_usec - start.tv_usec)) / 1000000.0) - timer);
printf("Time for calculations: %f sec\n", timer);
function.c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/time.h>
#include <string.h>
#include <omp.h>
/*
 * Offloaded kernel body. In this stripped-down benchmark version it only
 * prints a message and stores the elapsed wall-clock time of that print
 * into the file-scope 'timer' variable (declared with the target(mic)
 * attribute in the main translation unit). All parameters are currently
 * unused; they exist so the full 15-argument marshalling cost is measured.
 */
void function(float *grid, float *displ, unsigned long long *li, unsigned long long *repet, float *solution, unsigned long long dim, unsigned long long numOfa, unsigned long long numLoops, unsigned long long numBlock, unsigned long long thread, unsigned long long blockGrid, unsigned long long station, unsigned long long bytesSol, unsigned long long totalSol, volatile unsigned long long *prog)
{
    struct timeval t_begin, t_done;

    gettimeofday(&t_begin, NULL);
    printf("Hello World!!!\n");
    gettimeofday(&t_done, NULL);

    /* Convert the timeval delta to seconds and publish it for the host. */
    double elapsed_us = (t_done.tv_sec - t_begin.tv_sec) * 1000000.0
                      + (t_done.tv_usec - t_begin.tv_usec);
    timer = elapsed_us / 1000000.0;
}
Results of terminal:
Time to tranfer data on Intel Xeon Phi: 3.512706 sec
Time for calculations: 0.000002 sec
Hello World!!!
The code requires 3.5 seconds to complete the "offload target" section. Is this result normal? Is there any way to reduce this significant delay of the offload call?