I'm struggling with manual vectorization on the MIC (Intel Xeon Phi coprocessor). I'm working on a simple computation benchmark (benchmarking CPU vs. MIC and analyzing the effect of automatic vs. manual vectorization), and I wanted to try the effect of intrinsics. Here is my problem: on the CPU, I observe a performance gain of about 30% with the __m256 intrinsic functions (compared with the CPU without intrinsics), but on the MIC with the __m512 intrinsics the performance is the same as on the MIC without intrinsics (OpenMP + intrinsics). Is this normal?
- MIC+INTR ~ 3.18 sec
- MIC ~ 3.19 sec
- CPU+INTR ~ 4.31 sec
- CPU ~ 6.47 sec
The options I used (Intel compiler):
- To compile for MIC + intrinsics: -O3 -openmp -DWITH_INTR -restrict
- To compile for MIC: -O3 -openmp -restrict
- To compile for CPU + intrinsics: -O3 -openmp -DWITH_INTR -no-offload -restrict
- To compile for CPU: -O3 -openmp -no-offload -restrict
My hardware configuration:
- CPU: Intel(R) Xeon(R) CPU E5-2680 0 @ 2.70GHz | SandyBridge (2x8cores | 32 threads)
- MIC: Intel(R) Xeon Phi(TM) coprocessor x100 family (61 cores | 244 threads)
The code looks long, but that is only because it contains three versions of the calculation: one without intrinsics, one with 256-bit vectors and one with 512-bit vectors.
Here is the code for anyone who wants to reproduce the results:
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <immintrin.h>
#include <offload.h>
#include <omp.h>
/* Problem sizes. The expansions are parenthesized so that they behave as a
 * single value inside larger expressions: without the parentheses, N-1 would
 * parse as 2<<(17-1) because '-' binds tighter than '<<'. */
#define N (2<<17)   /* outer loop iterations: 262144 */
#define P (2<<14)   /* inner loop iterations: 32768 */
/*
 * Benchmark kernel.
 *
 * For every i in [0, N) this sums exp(1/(b[j]+1)) / log(1024*c[j]+230) over
 * j in [0, P) and stores the sum in a[i]. The outer loop is distributed over
 * OpenMP threads with a dynamic schedule. The elapsed wall-clock time of the
 * whole function is printed at the end.
 *
 * Three variants of the inner loop are selected at compile time:
 *   - WITH_INTR and __MIC__ defined: 512-bit intrinsics, 8 doubles per step
 *   - WITH_INTR defined only:        256-bit intrinsics, 4 doubles per step
 *   - otherwise:                     scalar loop annotated with #pragma simd
 *
 * a: output, written at indices 0..N-1.
 * b, c: inputs, read at indices 0..P-1.
 * All three pointers are declared to the compiler as 64-byte aligned via
 * __assume_aligned, and the intrinsic paths use aligned loads on b and c.
 */
__declspec(target(mic:0)) void testVctr( double * restrict a, double * restrict b, double * restrict c )
{
    double t1 = omp_get_wtime();
    omp_set_num_threads(omp_get_max_threads());
    __assume_aligned( a, 64 );
    __assume_aligned( b, 64 );
    __assume_aligned( c, 64 );
    int i;
    int j;
    int k;
#ifdef WITH_INTR
    /* Constants broadcast to every vector lane, built once outside the loops. */
#ifdef __MIC__
    __m512d n1 = _mm512_set1_pd( 1. );
    __m512d n1024 = _mm512_set1_pd( 1024. );
    __m512d n230 = _mm512_set1_pd( 230. );
#else
    __m256d n1 = _mm256_set1_pd( 1. );
    __m256d n1024 = _mm256_set1_pd( 1024. );
    __m256d n230 = _mm256_set1_pd( 230. );
#endif
#endif
#pragma omp parallel for private( i, j, k ) schedule( dynamic )
    for( i=0; i<N; ++i )
    {
#ifdef WITH_INTR
#ifdef __MIC__
        /* Aligned stack scratch for the horizontal sum. Being declared inside
         * the parallel loop body it is private to each iteration, and it
         * avoids a heap allocation (and an unchecked NULL) per iteration. */
        double lanes[8] __attribute__((aligned(64)));
        __m512d res = _mm512_setzero_pd(), r0, r1;
        for( j=0; j<P; j+=8 )
        {
            /* r0 = exp( 1 / (b + 1) ) */
            r0 = _mm512_load_pd( &b[j] );
            r0 = _mm512_add_pd( r0, n1 );
            r0 = _mm512_div_pd( n1, r0 );
            r0 = _mm512_exp_pd( r0 );
            /* r1 = log( 1024 * c + 230 ) */
            r1 = _mm512_load_pd( &c[j] );
            r1 = _mm512_mul_pd( r1, n1024 );
            r1 = _mm512_add_pd( r1, n230 );
            r1 = _mm512_log_pd( r1 );
            /* accumulate r0 / r1 lane by lane */
            r0 = _mm512_div_pd( r0, r1 );
            res = _mm512_add_pd( res, r0 );
        }
        /* Horizontal sum of the 8 partial sums, in lane order. */
        _mm512_store_pd( lanes, res );
        double tmp = 0.;
        for( k=0; k<8; ++k )
            tmp += lanes[k];
        a[i] = tmp;
#else
        /* Aligned stack scratch for the horizontal sum; see the reasoning in
         * the 512-bit branch (no per-iteration heap allocation). */
        double lanes[4] __attribute__((aligned(64)));
        __m256d res = _mm256_setzero_pd(), r0, r1;
        for( j=0; j<P; j+=4 )
        {
            /* r0 = exp( 1 / (b + 1) ) */
            r0 = _mm256_load_pd( &b[j] );
            r0 = _mm256_add_pd( r0, n1 );
            r0 = _mm256_div_pd( n1, r0 );
            r0 = _mm256_exp_pd( r0 );
            /* r1 = log( 1024 * c + 230 ) */
            r1 = _mm256_load_pd( &c[j] );
            r1 = _mm256_mul_pd( r1, n1024 );
            r1 = _mm256_add_pd( r1, n230 );
            r1 = _mm256_log_pd( r1 );
            /* accumulate r0 / r1 lane by lane */
            r0 = _mm256_div_pd( r0, r1 );
            res = _mm256_add_pd( res, r0 );
        }
        /* Horizontal sum of the 4 partial sums, in lane order. */
        _mm256_store_pd( lanes, res );
        double tmp = 0.;
        for( k=0; k<4; ++k )
            tmp += lanes[k];
        a[i] = tmp;
#endif
#else
        double res = 0.;
        /* res is accumulated across iterations, so it is declared as a
         * reduction variable for the forced vectorization. */
#pragma simd reduction(+:res)
        for( j=0; j<P; ++j )
        {
            double tmp0 = 1./(b[j]+1.);
            double tmp1 = exp( tmp0 );
            double tmp2 = c[j] * 1024;
            double tmp3 = tmp2 + 230;
            double tmp4 = log( tmp3 );
            double tmp5 = tmp1 / tmp4;
            res += tmp5;
        }
        a[i] = res;
#endif
    }
    printf("\nElapsed time: %f sec\n", omp_get_wtime() - t1 );
}
/*
 * Benchmark driver.
 *
 * Allocates a (N doubles), b and c (P doubles each) with 64-byte alignment,
 * fills b and c, calls testVctr inside an offload region targeting mic:0,
 * prints a sanity value computed from the last element of a, and frees the
 * arrays.
 *
 * Returns 0 on success, 1 if any allocation fails.
 */
int main( void )
{
    int i;
    printf("\nOuter loop (N) %d iterations \nInner loop (P) %d iterations\n", N, P );
    double * restrict a = (double * restrict) _mm_malloc( (size_t)( (N) * sizeof(double) ), 64 );
    double * restrict b = (double * restrict) _mm_malloc( (size_t)( (P) * sizeof(double) ), 64 );
    double * restrict c = (double * restrict) _mm_malloc( (size_t)( (P) * sizeof(double) ), 64 );
    if( !a || !b || !c )
    {
        /* Release whatever was obtained before reporting the failure. */
        fprintf( stderr, "Allocation failed\n" );
        if( a ) _mm_free( (double * restrict) a );
        if( b ) _mm_free( (double * restrict) b );
        if( c ) _mm_free( (double * restrict) c );
        return 1;
    }
    for( i=0; i<P; ++i )
    {
        /* NOTE: rand()/RAND_MAX is an integer division, so each value is 0
         * unless rand() returns exactly RAND_MAX. This is kept as is because
         * the "(~ 1.)" check printed below matches all-zero inputs:
         * 2 * exp(1) / log(230) is approximately 1. */
        b[i] = rand()/RAND_MAX;
        c[i] = rand()/RAND_MAX;
    }
#pragma offload target( mic : 0 ) \
out( a : length( N ) align(512) ) \
in ( b : length( P ) align(512) ) \
in ( c : length( P ) align(512) )
    testVctr( a, b, c );
    /* (N) is parenthesized so that the subtraction applies to the whole
     * value of N even when the macro body is an unparenthesized shift
     * expression; a[N-1] would then index a[2<<16] instead of the last
     * element. */
    printf( "\nCheck last result: %f (~ 1.)\n", a[(N)-1]*2./(P) );
    _mm_free( (double * restrict) a );
    _mm_free( (double * restrict) b );
    _mm_free( (double * restrict) c );
    return 0;
}
Perhaps I missed something in the code, or some option in the compilation command.
I'm willing to try any suggestion.
Thank you.
GS