Should Eigen dense matrix * dense vector multiplication be 7 times slower than GSL?

Question

The answer to this question of mine made me expect that (for matrices with 1/4 of non-vanishing entries) in Eigen the product Dense Matrix * Dense Vector should outperform Sparse Matrix*Dense Vector.

Not only do I see the opposite, but also both are outperformed by GSL by a factor of 7 and 4 respectively.

Am I using Eigen incorrectly? Am I timing carelessly? I am very startled.

My compile options read:

ludi@ludi-M17xR4:~/Desktop/tests$ g++ -o eigenfill.x eigenfill.cc -L/usr/local/lib -lgsl -lgslcblas && ./eigenfill.x

My code reads:

#include <iostream>
#include <stdio.h>
#include <stdlib.h>     
#include <Eigen/Sparse>
#include <Eigen/Dense>
#include <gsl/gsl_matrix.h>
#include <sys/time.h>
#include <gsl/gsl_blas.h>
#define helix 100
#define rows helix*helix
#define cols rows
#define filling rows/4
#define REPS 10


using namespace Eigen;

/*-- DECLARATIONES --*/
int FillSparseMatrix(SparseMatrix<double> & mat);
int FillDenseMatrices(MatrixXd & Mat, gsl_matrix *testmat);
double vee(int i, int j);
int set_vectors_randomly(gsl_vector * v2, VectorXd v1);

int main()
{
int rep;
    struct timeval tval_before, tval_after, tval_result;

gsl_matrix *testmat     = gsl_matrix_calloc(rows, cols);
gsl_vector *v2      =gsl_vector_calloc(cols);
gsl_vector *prod    =gsl_vector_calloc(cols);

SparseMatrix<double> mat(rows,cols);         // default is column major
MatrixXd Mat(rows,cols);         // default is column major
VectorXd v1(cols), vv1(cols);

FillSparseMatrix(mat);
FillDenseMatrices(Mat, testmat);
    printf("\n/*--- --- --- ---*/\n");
for(rep=0;rep<REPS;rep++)
{
set_vectors_randomly(v2, v1);

    gettimeofday(&tval_before, NULL);       
vv1 = mat*v1;
    gettimeofday(&tval_after, NULL);
    timersub(&tval_after, &tval_before, &tval_result);

    printf("Time for one product, SPARSE EIGEN / secs: %ld.%06ld\n", (long int)tval_result.tv_sec, (long int)tval_result.tv_usec);
    gettimeofday(&tval_before, NULL);       
gsl_blas_dgemv( CblasNoTrans,1.0, testmat, v2, 0.0, prod);
    gettimeofday(&tval_after, NULL);
    timersub(&tval_after, &tval_before, &tval_result);
    printf("Time for one product, GSL / secs: %ld.%06ld\n", (long int)tval_result.tv_sec, (long int)tval_result.tv_usec);

    gettimeofday(&tval_before, NULL);       
vv1 = Mat*v1;
    gettimeofday(&tval_after, NULL);
    timersub(&tval_after, &tval_before, &tval_result);
    printf("Time for one product, DENSE EIGEN / secs: %ld.%06ld\n", (long int)tval_result.tv_sec, (long int)tval_result.tv_usec);
    printf("/*--- --- --- ---*/\n\n");


  //std::cout << mat << std::endl;
}
gsl_matrix_free(testmat);   
printf("--- --- --->DONE\n");
return(0);
}

/*-- --*/
int FillSparseMatrix(SparseMatrix<double> &mat)
{
int i, j;
Eigen::VectorXd Vres;
mat.reserve(Eigen::VectorXi::Constant(cols,filling));

printf("Filling Sparse Matrix ...");
    for(i=0;i<rows;i++)
    {
        if(i%2500==0){printf("i= %i\n", i);}
    for(j=0;j<cols;j++)
        {
        if (vee(i,j) != 0){mat.insert(i,j) = vee(i,j);    /*alternative: mat.coeffRef(i,j) += v_ij;*/ }
        }

    }

return(0);
}
/*-- --*/

/*-- --*/
int FillDenseMatrices(MatrixXd &Mat, gsl_matrix * testmat)
{
int i, j;
Eigen::VectorXd Vres;
double aux;
printf("Filling Dense Matrix ...");
    for(i=0;i<rows;i++)
    {
        if(i%2500==0){printf("i= %i\n", i);}
    for(j=0;j<cols;j++)
        {
        aux = vee(i,j);
        if (aux != 0)
        {
        Mat(i,j) = aux;    
        gsl_matrix_set(testmat, i, j, aux);
        }
        }

    }
return(0);
}
/*-- --*/

double vee(int i, int j)
{
    double result = 0.0;

    if(i%4 == 0){result =1.0;}

    return result;
}
/*-- --*/
int set_vectors_randomly(gsl_vector * v2, VectorXd v1){
printf("Setting vectors rendomly anew ...\n");
for (int j=0;j<cols;j++) 
{
double r=drand48();
v1(j) =r;
gsl_vector_set(v2, j, r);

}
return(0);
}
/*-- --*/

You should add an optimization flag -O2 or -O3 when compiling. — Ari Hietanen
Indeed. Now both outperform GSL and the sparse one is best. Would you like to write an answer and get some points? ;) — Ludi
You can speed it up a little bit more by adding the -DNDEBUG flag. This will disable bounds checking. — Avi Ginsburg
Can the culprit explain why he down voted this question? This seems important and non-trivial to me! — Ludi

Avi Ginsburg Avi Ginsburg · Accepted Answer · 2015-10-28T08:41:16

With Eigen, performance is abysmal when compiling without compiler optimizations. There are several ways to increase performance dramatically:

With optimizations turned on (-O2 or -O3 in g++) performance can be two-three orders of magnitude higher.
An additional (smaller) speedup can be attained by defining NDEBUG before including the Eigen library. This disables bounds checking, so make sure there are no issues before enabling this feature.
Eigen can also take advantage of vectorization (SSE as of 3.2.6 and AVX as of 3.3, PowerPC and ARM as well) leading to further speedups. Just enable the relevant flags in your compiler.
Enabling OMP can lead to speedups as well. Again, enable the relevant flags in your compiler and Eigen will do the rest.

Should Eigen dense matrix * dense vector multiplication be 7 times slower than GSL?

2 Answers