I'm trying to do a multiplication of to larger matrices (1000x1000 to 5000x5000 double precision). I have to use OpenMP to parallelise the multiplication. The parallel for loop is processed by p number of threads and they are scheduled correctly I guess based on printing out omp_get_thread_num(). I'm running on a 4 core CPU and have confirmed that the max number of threads is 4. The CPU's are virtual if that makes any difference. The problem is that the run time doesn't decrease when I change the nb of threads.
I have checked that the
libgomp
library is installed byldconfig -p | grep -i "gomp"
.I have tried changing the place of the parallel loop to one of the nested loops.
I have tried changing the scheduling and chunk size.
#include <stdio.h> #include <stdlib.h> #include <omp.h> #include <time.h> double** createMatrix(int N) { double** rndMatrix; srand48((long int)time(NULL)); rndMatrix = malloc(sizeof(double*)*N); int n,m; for(n=0; n<N; n++){ rndMatrix[n] = malloc(sizeof(double*)*N); for (m=0;m<N;m++){ rndMatrix[n][m] = drand48(); } } return rndMatrix; } void problem1(double** a, double** b, int N, int p){ int i,k,j; int g; double** c; c = malloc(sizeof(double*)*N); for(g=0; g<N; ++g) c[g] = malloc(sizeof(double*)*N); //Timer start clock_t tStart = clock(); //time_t tStart, tEnd; //tStart =time(NULL); //Parallelised part #pragma omp parallel shared(a,b,c,N) private(i,k,j) num_threads(p) { #pragma omp for schedule(static) nowait for(i=0; i<N; ++i){ for(j=0; j<N; ++j){ double sum = 0; for(k=0; k<N; ++k){ sum += a[i][k] * b[k][j]; } c[i][j]=sum; } } } //Timer end printf("Time taken: %.2fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC); //tEnd = time(NULL); //printf("Time taken: %ds\n", tEnd - tStart); } int main(void) { int p=0; int N=0; //User input: printf("Enter matrix dimension:\n"); scanf("%d", &N); printf("Please enter nb of threads:\n"); scanf("%d", &p); double **a; double **b; a = createMatrix(N); sleep(2); b = createMatrix(N); problem1(a,b,N,p); return 0; }
clock
andtime
? – Zulan