I have the following code that uses OMP to parallelize a monte carlo method. My question is why does the serial version of the code (monte_carlo_serial) run a lot faster than the parallel version (monte_carlo_parallel). I am running the code on a machine with 32 cores and get the following result printed to the console:
-bash-4.1$ gcc -fopenmp hello.c ;
-bash-4.1$ ./a.out
Pi (Serial): 3.140856
Time taken 0 seconds 50 milliseconds
Pi (Parallel): 3.132103
Time taken 127 seconds 990 milliseconds
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
#include <time.h>
int niter = 1000000; //number of iterations per FOR loop
int monte_carlo_parallel() {
double x,y; //x,y value for the random coordinate
int i; //loop counter
int count=0; //Count holds all the number of how many good coordinates
double z; //Used to check if x^2+y^2<=1
double pi; //holds approx value of pi
int numthreads = 32;
#pragma omp parallel firstprivate(x, y, z, i) reduction(+:count) num_threads(numthreads)
{
srand48((int)time(NULL) ^ omp_get_thread_num()); //Give random() a seed value
for (i=0; i<niter; ++i) //main loop
{
x = (double)drand48(); //gets a random x coordinate
y = (double)drand48(); //gets a random y coordinate
z = ((x*x)+(y*y)); //Checks to see if number is inside unit circle
if (z<=1)
{
++count; //if it is, consider it a valid random point
}
}
}
pi = ((double)count/(double)(niter*numthreads))*4.0;
printf("Pi (Parallel): %f\n", pi);
return 0;
}
int monte_carlo_serial(){
double x,y; //x,y value for the random coordinate
int i; //loop counter
int count=0; //Count holds all the number of how many good coordinates
double z; //Used to check if x^2+y^2<=1
double pi; //holds approx value of pi
srand48((int)time(NULL) ^ omp_get_thread_num()); //Give random() a seed value
for (i=0; i<niter; ++i) //main loop
{
x = (double)drand48(); //gets a random x coordinate
y = (double)drand48(); //gets a random y coordinate
z = ((x*x)+(y*y)); //Checks to see if number is inside unit circle
if (z<=1)
{
++count; //if it is, consider it a valid random point
}
}
pi = ((double)count/(double)(niter))*4.0;
printf("Pi (Serial): %f\n", pi);
return 0;
}
void main(){
clock_t start = clock(), diff;
monte_carlo_serial();
diff = clock() - start;
int msec = diff * 1000 / CLOCKS_PER_SEC;
printf("Time taken %d seconds %d milliseconds \n", msec/1000, msec%1000);
start = clock(), diff;
monte_carlo_parallel();
diff = clock() - start;
msec = diff * 1000 / CLOCKS_PER_SEC;
printf("Time taken %d seconds %d milliseconds \n", msec/1000, msec%1000);
}
drand48()
isn't thread-safe as it uses a global state (look atdrand48_r()
for possible replacement if you want to stick to this RNG); and 2/clock()
gives you CPU time, not elapsed time... You should useomp_get_wtime()
for all you timing tasks here. Finally, your issue has nothing to do with false sharing ofcount
. – Gilles