I just started to use OpenMP to do parallel computing in C++. The program has a bad parallel performance. Since I don't know many multi-threading profiling tool (unlike simple gprof for single thread), I wrote a sample program to test the performance.
I have a 2D matrix(N * N), with each element a 3d vector(x, y, z). I simply do a double for loop to set each value in the matrix:
for (int i = 0; i < N; ++i) {
for (int j = 0; j < N; ++j) {
vectorStack[i][j] = VECTOR3D(1.0*i*i, 1.0*j*j, 1.0*i*j);
}
}
where VECTOR3D
is a simple class has x, y, z
attributes:
class VECTOR3D {
double x, y, z; // component along each axis
}
On the other hand, I can also use a (N * N * 3) 3D array to do this:
for (int i = 0; i < N; ++i) {
for (int j = 0; j < N; ++j) {
arrayHeap[i][j][0] = (1.0*i*i);
arrayHeap[i][j][1] = (1.0*j*j);
arrayHeap[i][j][2] = (1.0*i*j);
}
}
From the memory aspect, there are also several different choices, such as use raw pointers with manually allocate and deallocate:
double ***arrayHeap;
arrayHeap = new double** [N];
for(int i = 0; i < N; ++i) {
arrayHeap[i] = new double* [N];
for(int j = 0; j < N; ++j) {
arrayHeap[i][j] = new double[3];
}
}
or simply use std::vector
:
vector< vector<VECTOR3D>> vectorStack(N, vector<VECTOR3D>(N, VECTOR3D(0, 0, 0)));
I also considered to manually allocate continuous memory for arrays as did in LAMMP(Molecular Simulation Package source code.
So my results for N=10000
are listed here:
For single thread:
OMP_NUM_THREADS=1 ./a.out
Allocating memory for array on heap...
======= Array on heap Results =======
Finished within time (total): 0.720385 seconds
Finished within time (real): 0.720463 seconds
Deallocating memory for array on heap...
Allocating memory for array continous...
======= Array continuous Results =======
Finished within time (total): 0.819733 seconds
Finished within time (real): 0.819774 seconds
Deallocating memory for array continuous...
Allocating memory for vector on heap...
======= Vector on heap Results =======
Finished within time (total): 3.08715 seconds
Finished within time (real): 3.08725 seconds
Deallocating memory for vector on heap...
Allocating memory for vector on stack...
======= Vector on stack Results =======
Finished within time (total): 1.49888 seconds
Finished within time (real): 1.49895 seconds
For multi-threads (threads=4):
OMP_NUM_THREADS=4 ./a.out
Allocating memory for array on heap...
======= Array on heap Results =======
Finished within time (total): 2.29184 seconds
Finished within time (real): 0.577807 seconds
Deallocating memory for array on heap...
Allocating memory for array continous...
======= Array continuous Results =======
Finished within time (total): 1.79501 seconds
Finished within time (real): 0.454139 seconds
Deallocating memory for array continuous...
Allocating memory for vector on heap...
======= Vector on heap Results =======
Finished within time (total): 6.80917 seconds
Finished within time (real): 1.92541 seconds
Deallocating memory for vector on heap...
Allocating memory for vector on stack...
======= Vector on stack Results =======
Finished within time (total): 1.64086 seconds
Finished within time (real): 0.411 seconds
The overall parallel efficiency is not good. Unexpected, the fancy continuous memory allocating is not helpful?! Why is this happening? It seems the std::vector
is good enough for this case?
Could someone explain the results for me? and I also need suggestions on how to improve the code.
Thanks very much!!!
Attached all the source code. (please directly go to main, there are several functions to manually manage the memory at the beginning).
#include <iostream>
#include <omp.h>
#include <vector>
#include <stdlib.h>
#include <cinttypes>
#include "vector3d.h"
typedef int64_t bigint;
void *smalloc(bigint nbytes, const char *name)
{
if (nbytes == 0) return NULL;
void *ptr = malloc(nbytes);
return ptr;
}
template <typename TYPE>
TYPE ***create(TYPE ***&array, int n1, int n2, int n3, const char *name)
{
bigint nbytes = ((bigint) sizeof(TYPE)) * n1*n2*n3;
TYPE *data = (TYPE *) smalloc(nbytes,name);
nbytes = ((bigint) sizeof(TYPE *)) * n1*n2;
TYPE **plane = (TYPE **) smalloc(nbytes,name);
nbytes = ((bigint) sizeof(TYPE **)) * n1;
array = (TYPE ***) smalloc(nbytes,name);
int i,j;
bigint m;
bigint n = 0;
for (i = 0; i < n1; i++) {
m = ((bigint) i) * n2;
array[i] = &plane[m];
for (j = 0; j < n2; j++) {
plane[m+j] = &data[n];
n += n3;
}
}
return array;
}
template <typename TYPE>
TYPE ***create3d_offset(TYPE ***&array, int n1lo, int n1hi,
int n2, int n3, const char *name)
{
int n1 = n1hi - n1lo + 1;
create(array,n1,n2,n3,name);
array -= n1lo;
return array;
}
void sfree(void *ptr) {
if (ptr == NULL) return;
free(ptr);
}
template <typename TYPE>
void destroy(TYPE ***&array)
{
if (array == NULL) return;
sfree(array[0][0]);
sfree(array[0]);
sfree(array);
array = NULL;
}
template <typename TYPE>
void destroy3d_offset(TYPE ***&array, int offset)
{
if (array == NULL) return;
sfree(&array[offset][0][0]);
sfree(&array[offset][0]);
sfree(&array[offset]);
array = NULL;
}
////////////////////////////////////////////////////////
////////////////////////////////////////////////////////
////////////////////////////////////////////////////////
int main() {
using namespace std;
const int N = 10000;
///////////////////////////////////////
double sum = 0.0;
clock_t t;
double startTime, stopTime, secsElapsed;
printf("\n\nAllocating memory for array on heap...\n");
double ***arrayHeap;
arrayHeap = new double** [N];
for(int i = 0; i < N; ++i) {
arrayHeap[i] = new double* [N];
for(int j = 0; j < N; ++j) {
arrayHeap[i][j] = new double[3];
}
}
printf("======= Array on heap Results =======\n");
sum = 0.0;
t = clock();
startTime = omp_get_wtime();
#pragma omp parallel
{
//#pragma omp for schedule(dynamic)
//#pragma omp for collapse(2)
#pragma omp for
for (int i = 0; i < N; ++i) {
for (int j = 0; j < N; ++j) {
arrayHeap[i][j][0] = (1.0*i*i);
arrayHeap[i][j][1] = (1.0*j*j);
arrayHeap[i][j][2] = (1.0*i*j);
}
}
}
t = clock() - t;
cout << "Finished within time (total): " << ((double) t) / CLOCKS_PER_SEC << " seconds" << endl;
stopTime = omp_get_wtime();
secsElapsed = stopTime - startTime;
cout << "Finished within time (real): " << secsElapsed << " seconds" << endl;
printf("Deallocating memory for array on heap...\n");
for(int i = 0; i < N; ++i) {
for(int j = 0; j < N; ++j) {
delete [] arrayHeap[i][j];
}
delete [] arrayHeap[i];
}
delete [] arrayHeap;
///////////////////////////////////////
printf("\n\nAllocating memory for array continous...\n");
double ***array_continuous;
create3d_offset(array_continuous,0, N, N, 3, "array");
printf("======= Array continuous Results =======\n");
sum = 0.0;
t = clock();
startTime = omp_get_wtime();
#pragma omp parallel
{
//#pragma omp for schedule(dynamic)
//#pragma omp for collapse(2)
#pragma omp for
for (int i = 0; i < N; ++i) {
for (int j = 0; j < N; ++j) {
array_continuous[i][j][0] = (1.0*i*i);
array_continuous[i][j][1] = (1.0*j*j);
array_continuous[i][j][2] = (1.0*i*j);
}
}
}
t = clock() - t;
cout << "Finished within time (total): " << ((double) t) / CLOCKS_PER_SEC << " seconds" << endl;
stopTime = omp_get_wtime();
secsElapsed = stopTime - startTime;
cout << "Finished within time (real): " << secsElapsed << " seconds" << endl;
printf("Deallocating memory for array continuous...\n");
destroy3d_offset(array_continuous, 0);
///////////////////////////////////////k
printf("\n\nAllocating memory for vector on heap...\n");
VECTOR3D ***vectorHeap;
vectorHeap = new VECTOR3D**[N];
for(int i = 0; i < N; ++i) {
vectorHeap[i] = new VECTOR3D* [N];
for(int j = 0; j < N; ++j) {
vectorHeap[i][j] = new VECTOR3D(0,0,0);
}
}
printf("======= Vector on heap Results =======\n");
sum = 0.0;
t = clock();
startTime = omp_get_wtime();
#pragma omp parallel
{
//#pragma omp for schedule(dynamic)
//#pragma omp for collapse(2)
#pragma omp for
for (int i = 0; i < N; ++i) {
for (int j = 0; j < N; ++j) {
vectorHeap[i][j] = new VECTOR3D(1.0*i*i, 1.0*j*j, 1.0*i*j);
}
}
}
t = clock() - t;
cout << "Finished within time (total): " << ((double) t) / CLOCKS_PER_SEC << " seconds" << endl;
stopTime = omp_get_wtime();
secsElapsed = stopTime - startTime;
cout << "Finished within time (real): " << secsElapsed << " seconds" << endl;
printf("Deallocating memory for vector on heap...\n");
for(int i = 0; i < N; ++i) {
for(int j = 0; j < N; ++j) {
delete [] vectorHeap[i][j];
}
delete [] vectorHeap[i];
}
delete [] vectorHeap;
/////////////////////////////////////////////////
printf("\n\nAllocating memory for vector on stack...\n");
vector< vector<VECTOR3D>> vectorStack(N, vector<VECTOR3D>(N, VECTOR3D(0, 0, 0)));
printf("======= Vector on stack Results =======\n");
sum = 0.0;
t = clock();
startTime = omp_get_wtime();
#pragma omp parallel
{
//#pragma omp for schedule(dynamic)
//#pragma omp for collapse(2)
#pragma omp for
for (int i = 0; i < N; ++i) {
for (int j = 0; j < N; ++j) {
vectorStack[i][j] = VECTOR3D(1.0*i*i, 1.0*j*j, 1.0*i*j);
}
}
}
t = clock() - t;
cout << "Finished within time (total): " << ((double) t) / CLOCKS_PER_SEC << " seconds" << endl;
stopTime = omp_get_wtime();
secsElapsed = stopTime - startTime;
cout << "Finished within time (real): " << secsElapsed << " seconds" << endl;
/////////////////////////////////
return 0;
}
And the VECTOR3D
class:
#ifndef _VECTOR3D_H
#define _VECTOR3D_H
#include <iostream>
#include <cmath>
#include <iomanip>
#include <limits>
class VECTOR3D {
public:
double x, y, z; // component along each axis (cartesian)
VECTOR3D(double xx = 0.0, double yy = 0.0, double zz = 0.0) : x(xx), y(yy), z(zz) // make a 3d vector
{
}
}