I am trying to parallelize my C++ neural network training process using OpenMP, but it does not work.
So I wrote a simple C++ program with nested loops to test OpenMP.
But it is much slower with multiple OpenMP threads than with a single thread.
Did I do something wrong that makes it slower? Or did I miss something?
MacOS 4 cores
Time functions
I used both high_resolution_clock::now() and omp_get_wtime().
single thread cost time: 0.00000000000000
2 threads cost time: 0.00010013580322
4 threads cost time: 0.00016403198242
6 threads cost time: 0.00017309188843
8 threads cost time: 0.00112605094910
10 threads cost time: 0.00013613700867
12 threads cost time: 0.00082898139954
single thread cost time: 0.00000005900000
2 threads cost time: 0.00009907600000
4 threads cost time: 0.00018207300000
6 threads cost time: 0.00014479500000
8 threads cost time: 0.00070604400000
10 threads cost time: 0.00057277700000
12 threads cost time: 0.00074358000000
#include <iostream>
#include <omp.h>
#include <chrono>
#include <iomanip>
using namespace std;
// Busy-work kernel used as the per-call workload of the OpenMP benchmark.
//
// The counter is volatile on purpose: a loop that only updates a plain local
// has no observable effect, so an optimizing build (-O3) may delete the whole
// loop, and the benchmark then times nothing but OpenMP's thread start-up.
//
// NOTE(review): the original loop body was lost in the paste; a plain
// increment is assumed here -- confirm against the real workload. Every call
// performs 100000 volatile increments, so lower the bound if a caller that
// invokes this many times runs for too long.
void test() {
    volatile int j = 0;
    for (int i = 0; i < 100000; i++) {
        // do something to kill time...
        j = j + 1;
    }
}
int main()
auto startTime = chrono::high_resolution_clock::now();
auto endTime = chrono::high_resolution_clock::now();
// without openMp
startTime = chrono::high_resolution_clock::now();
for (int i = 0; i < 100000; i++) {
endTime = chrono::high_resolution_clock::now();
chrono::duration<double> diff = endTime - startTime;
cout << setprecision(14) << fixed;
cout << "single thread cost time: " << diff.count() << endl;
// 2 threads
startTime = chrono::high_resolution_clock::now();
#pragma omp parallel for num_threads(2)
for (int i = 0; i < 100000; i++) {
endTime = chrono::high_resolution_clock::now();
diff = endTime - startTime;
cout << "2 threads cost time: " << diff.count() << endl;
// 4 threads
startTime = chrono::high_resolution_clock::now();
#pragma omp parallel for num_threads(4)
for (int i = 0; i < 100000; i++) {
endTime = chrono::high_resolution_clock::now();
diff = endTime - startTime;
cout << "4 threads cost time: " << diff.count() << endl;
// 6 threads
startTime = chrono::high_resolution_clock::now();
#pragma omp parallel for num_threads(6)
for (int i = 0; i < 100000; i++) {
endTime = chrono::high_resolution_clock::now();
diff = endTime - startTime;
cout << "6 threads cost time: " << diff.count() << endl;
startTime = chrono::high_resolution_clock::now();
#pragma omp parallel for num_threads(8)
for (int i = 0; i < 100000; i++) {
endTime = chrono::high_resolution_clock::now();
diff = endTime - startTime;
cout << "8 threads cost time: " << diff.count() << endl;
startTime = chrono::high_resolution_clock::now();
#pragma omp parallel for num_threads(10)
for (int i = 0; i < 100000; i++) {
endTime = chrono::high_resolution_clock::now();
diff = endTime - startTime;
cout << "10 threads cost time: " << diff.count() << endl;
startTime = chrono::high_resolution_clock::now();
#pragma omp parallel for num_threads(12)
for (int i = 0; i < 100000; i++) {
endTime = chrono::high_resolution_clock::now();
diff = endTime - startTime;
cout << "12 threads cost time: " << diff.count() << endl;
// system("pause");
return 0;
How I compile the code
clang++ -std=c++11 -Xpreprocessor -fopenmp parallel.cpp -O3 -o parallel -lomp
Hi guys, the previous problem has been solved; I think I should not use num_threads.
But when I use OpenMP to accelerate my neural network, it takes more time, not less.
Data size
MNIST dataset, 60000 each epoch
Time Function
Single thread result
***** train epoch 1.
Batch count: 6000.
batch size: 10.
Progress: 5999/6000.
train time is ... 64.7082.
Accuracy: 97.72% 9772/10000.
predict time is ... 3.51836.
Releasing Data Samples...
Releasing Neural Network...
Result with OpenMP
***** train epoch 1.
Batch count: 6000.
batch size: 10.
Progress: 5999/6000.
train time is: 247.615.
Accuracy: 97.72% 9772/10000.
predict time is: 30.739.
Code using parallel for
// For every k in [0, size): add the bias b of map i to map_common[k], apply
// activation_func::tan_h, and store the result in map[i].data[k].
// Each iteration writes a different data[k], and the iterations are divided
// among OpenMP threads by the pragma below.
// NOTE(review): a parallel region is entered every time this loop is reached.
// If `size` is small or this runs once per sample, the fork/join overhead can
// outweigh the arithmetic -- measure, or move the parallelism to an outer loop.
#pragma omp parallel for
for (int k = 0; k < size; k++) {
layer->map[i].data[k] = activation_func::tan_h(layer->map_common[k] + layer->map[i].b);
// cout << "current thread: " << omp_get_thread_num() << endl;
Code using parallel for and omp critical
// 2x2 max pooling followed by tan_h: for every map k and every output cell
// (i, j), take the maximum of the 2x2 window of prev_layer->map[k] whose
// top-left corner is (2*i, 2*j), and store tan_h(max) in layer->map[k].
//
// Parallelism is applied to the outermost (per-map) loop only. The previous
// version opened a parallel region around the 2-iteration innermost loop and
// wrapped its whole body in a critical section, which paid the fork/join cost
// once per window row and still executed the comparisons one at a time.
// Here every (k, i, j) writes a different output element and max_value is
// declared inside the parallel loop (so it is private to the executing
// thread); no critical section is needed.
// NOTE(review): if map_count or the maps are small, even this single region
// may cost more than it saves -- measure, and drop the pragma if so.
#pragma omp parallel for
for (int k = 0; k < layer->map_count; k++) {
    for (int i = 0; i < map_h; i++) {
        for (int j = 0; j < map_w; j++) {
            // Seed the maximum with the window's top-left element.
            double max_value = prev_layer->map[k].data[2*i*upmap_w + 2*j];
            for (int n = 2*i; n < 2*(i + 1); n++) {
                for (int m = 2*j; m < 2*(j + 1); m++) {
                    max_value = MAX(max_value, prev_layer->map[k].data[n*upmap_w + m]);
                }
            }
            layer->map[k].data[i*map_w + j] = activation_func::tan_h(max_value);
        }
    }
}
and not num_threads(...). – Jérôme Richard