OpenACC bad performance with PGI Compiler

Question

I'm working on my final paper, using OpenACC to improve the performance of a code. However, the optimized code is slower than the serial code. I have tried many things, but none of them worked. If anyone can help me, I'll be very grateful.

1332    void SOR()
1333    {
            // Repeats an in-place relaxation sweep over Po3_lin until the mean of
            // vetErro drops below tol or iteracoes_solver reaches 10001.
            // Cells are addressed by a 1-based linear index n = nx*(j-1) + i, so
            // n-1/n+1 are the neighbours along i and n-nx/n+nx the neighbours along j.
            // Every variable used here is declared outside this listing.
            // Presumably successive over-relaxation, per the function name and the
            // omega blend below.
1334        iteracoes_solver = 0;
1335        
1336        
1337        {
1338    
                // NOTE(review): AC_lin, Tgx and Tgy are listed in both the copy and the
                // copyin clause. OpenACC C subarrays are written [start:length], so
                // [2:nx*ny] begins at index 2, while the code below reads and writes
                // index 1 - confirm the intended sections.
1339            #pragma acc data copy(AC_lin[2:nx*ny], Po1_lin[2:nx*ny], AUXW1[2:nx*ny], Qo_lin[2:nx*ny], Tgx[2:nx*ny], Po3_lin[2:nx*ny], Tgy[2:nx*ny], P_auxi[2:nx*ny], vetErro[2:nx*ny]) \
1340            copyin(AC_lin[2:nx*ny], Tgx[2:nx*ny], Tgy[2:nx*ny])
1341            while (iteracoes_solver < 10001)
1342            {
1343    
1344                
                    // Snapshot Po3_lin into P_auxi for n = 1..nxy; P_auxi[n] is the
                    // "previous" value used in the omega blend of every update below.
1345                //#pragma acc kernels loop private(n) copyin(P_auxi[0:nxy]) vector(512)
1346                for (n = 1; n <= nxy; n++)    
1347                    P_auxi[n] = Po3_lin[n];
1348    
1349                
1350                {
1351    
                        // Reset the accumulated error for this sweep.
1352                    erro = 0.0;
1353    
                        // NOTE(review): total_iter is not modified anywhere in this
                        // listing; the counter advanced by this loop is
                        // iteracoes_solver - confirm which one is meant here.
1354                    if (total_iter == 10000)
1355                    {
1356                        printf("---> TEMPO=%g\n---> Numero maximo de iteracoes atingido no solver\n---> Corrigir parametros\n", tempo_atual);//getch();
1357                        exit_tool();
1358                    }
1359    
                        // Every update below follows the same pattern:
                        //   auxf1 = weighted sum of the source terms and the available neighbours,
                        //   Paux  = auxf1 / (sum of the matching coefficients),
                        //   Paux  = omega*Paux + (1 - omega)*P_auxi[n],
                        //   vetErro[n] = |Po3_lin[n] - Paux|, then Po3_lin[n] = Paux.
                        // Only the set of neighbours differs from case to case.

                        // Index n = 1: neighbours n+1 and n+nx only.
1360                    n = 1;
1361                    auxf1 = (AC_lin[n] * Po1_lin[n]) + (AUXW1[n] * Qo_lin[n]) + (Tgx[n] * Po3_lin[n + 1]) + (Tgy[n] * Po3_lin[n + nx]) + ((1 - AUXW1[n]));
1362                    Paux = auxf1 / (AC_lin[n] + Tgx[n] + Tgy[n] + (1 - AUXW1[n]));
1363                    Paux = (omega * Paux) + ((1.0 - omega) * P_auxi[n]);
1364                    vetErro[n] = fabs(Po3_lin[n] - Paux);
1365                    Po3_lin[n] = Paux;
1366    
                        // Index n = nx: neighbours n-1 and n+nx only.
1367                    n = nx;
1368                    auxf1 = (AC_lin[n] * Po1_lin[n]) + (AUXW1[n] * Qo_lin[n]) + (Tgx[n - 1] * Po3_lin[n - 1]) + (Tgy[n] * Po3_lin[n + nx]) + ((1 - AUXW1[n]));
1369                    Paux = auxf1 / (AC_lin[n] + Tgx[n - 1] + Tgy[n] + ((1 - AUXW1[n])));
1370                    Paux = (omega * Paux) + ((1.0 - omega) * P_auxi[n]);
1371                    vetErro[n] = fabs(Po3_lin[n] - Paux);
1372                    Po3_lin[n] = Paux;
1373    
1374                    auxi11 = nxy - nx;
1375    
                        // Index n = nxy - nx + 1: neighbours n+1 and n-nx only.
1376                    n = auxi11 + 1;
1377                    auxf1 = (AC_lin[n] * Po1_lin[n]) + (AUXW1[n] * Qo_lin[n]) + (Tgx[n] * Po3_lin[n + 1]) + (Tgy[n - nx] * Po3_lin[n - nx]) + ((1 - AUXW1[n]));
1378                    Paux = auxf1 / (AC_lin[n] + Tgx[n] + Tgy[n - nx] + ((1 - AUXW1[n])));
1379                    Paux = (omega * Paux) + ((1.0 - omega) * P_auxi[n]);
1380                    vetErro[n] = fabs(Po3_lin[n] - Paux);
1381                    Po3_lin[n] = Paux;
1382    
                        // Index n = nxy: neighbours n-1 and n-nx only.
1383                    n = nxy;
1384                    auxf1 = (AC_lin[n] * Po1_lin[n]) + (AUXW1[n] * Qo_lin[n]) + (Tgx[n - 1] * Po3_lin[n - 1]) + (Tgy[n - nx] * Po3_lin[n - nx]) + ((1 - AUXW1[n]));
1385                    Paux = auxf1 / (AC_lin[n] + Tgx[n - 1] + Tgy[n - nx] + ((1 - AUXW1[n])));
1386                    Paux = (omega * Paux) + ((1.0 - omega) * P_auxi[n]);
1387                    vetErro[n] = fabs(Po3_lin[n] - Paux);
1388                    Po3_lin[n] = Paux;
1389    
1390    
1391                   
                        // Indices n = 2..nx-1: neighbours n-1, n+1 and n+nx.
                        // NOTE(review): each iteration reads Po3_lin[n-1] and
                        // Po3_lin[n+1], which the adjacent iterations write, and only n
                        // is listed in private() although auxf1 and Paux are also
                        // assigned per iteration. Running this loop in parallel may not
                        // reproduce the sequential result - confirm.
1392                    #pragma acc parallel loop private(n)
1393                    for (n = 2; n < nx; n++)
1394                    {
1395                        auxf1 = (AC_lin[n] * Po1_lin[n]) + (AUXW1[n] * Qo_lin[n]) + (Tgx[n] * Po3_lin[n + 1]) + (Tgx[n - 1] * Po3_lin[n - 1]) + (Tgy[n] * Po3_lin[n + nx]) + ((1 - AUXW1[n]));
1396                        Paux = auxf1 / (AC_lin[n] + Tgx[n] + Tgx[n - 1] + Tgy[n] + ((1 - AUXW1[n])));
1397                        Paux = (omega * Paux) + ((1.0 - omega) * P_auxi[n]);
1398                        vetErro[n] = fabs(Po3_lin[n] - Paux);
1399                        Po3_lin[n] = Paux;
1400                    }
1401    
1402    
1403                    
                        // Indices n = nx*(j-1) + 1 for j = 2..ny-1: neighbours n+1,
                        // n-nx and n+nx.
                        // NOTE(review): the loop variable here is j, yet the pragma
                        // lists private(n); auxi11, auxf1 and Paux are also assigned in
                        // every iteration. Iterations read Po3_lin[n-nx] and
                        // Po3_lin[n+nx], which are written by the iterations for j-1
                        // and j+1 - confirm this loop is safe to run in parallel.
1404                    #pragma acc parallel loop private(n)
1405                    for (j = 2; j < ny; j++)
1406                    {
1407                        auxi11 = nx * (j - 1);
1408    
1409                        n = auxi11 + 1;
1410                        auxf1 = (AC_lin[n] * Po1_lin[n]) + (AUXW1[n] * Qo_lin[n]) + (Tgx[n] * Po3_lin[n + 1]) + (Tgy[n] * Po3_lin[n + nx]) + (Tgy[n - nx] * Po3_lin[n - nx]) + ((1 - AUXW1[n]));
1411                        Paux = auxf1 / (AC_lin[n] + Tgx[n] + Tgy[n] + Tgy[n - nx] + ((1 - AUXW1[n])));
1412                        Paux = (omega * Paux) + ((1.0 - omega) * P_auxi[n]);
1413                        vetErro[n] = fabs(Po3_lin[n] - Paux);
1414                        Po3_lin[n] = Paux;
1415                    }
1416    
1417    
1418                    
                        // Indices n = nx*(j-1) + nx for j = 2..ny-1: neighbours n-1,
                        // n-nx and n+nx. No active pragma, so this loop runs
                        // sequentially.
1419                    //#pragma acc kernels
1420                    for (j = 2; j < ny; j++)
1421                    {
1422                        auxi11 = nx * (j - 1);
1423    
1424                        n = auxi11 + nx;
1425                        auxf1 = (AC_lin[n] * Po1_lin[n]) + (AUXW1[n] * Qo_lin[n]) + (Tgx[n - 1] * Po3_lin[n - 1]) + (Tgy[n] * Po3_lin[n + nx]) + (Tgy[n - nx] * Po3_lin[n - nx]) + ((1 - AUXW1[n]));
1426                        Paux = auxf1 / (AC_lin[n] + Tgx[n - 1] + Tgy[n] + Tgy[n - nx] + ((1 - AUXW1[n])));
1427                        Paux = (omega * Paux) + ((1.0 - omega) * P_auxi[n]);
1428                        vetErro[n] = fabs(Po3_lin[n] - Paux);
1429                        Po3_lin[n] = Paux;
1430    
1431                    }
1432    
1433                   
1434                    auxi11 = nxy - nx;
1435    
1436    
1437                   
                        // Indices n = (nxy - nx) + i for i = 2..nx-1: neighbours n-1,
                        // n+1 and n-nx. No active pragma, so this loop runs
                        // sequentially.
1438                    //#pragma acc kernels
1439                    for (i = 2; i < nx; i++)
1440                    {
1441                        n = auxi11 + i;
1442                        auxf1 = (AC_lin[n] * Po1_lin[n]) + (AUXW1[n] * Qo_lin[n]) + (Tgx[n] * Po3_lin[n + 1]) + (Tgx[n - 1] * Po3_lin[n - 1]) + (Tgy[n - nx] * Po3_lin[n - nx]) + ((1 - AUXW1[n]));
1443                        Paux = auxf1 / (AC_lin[n] + Tgx[n] + Tgx[n - 1] + Tgy[n - nx] + ((1 - AUXW1[n])));
1444                        Paux = (omega * Paux) + ((1.0 - omega) * P_auxi[n]);
1445                        vetErro[n] = fabs(Po3_lin[n] - Paux);
1446                        Po3_lin[n] = Paux;
1447                    }
1448                }
1449                
1450    
1451                
1452                
                    // First pass over i, j in 2..nx-1, 2..ny-1: the start offset
                    // 2 + (j % 2) with stride 2 visits exactly the points where
                    // (i + j) is even. All four neighbours read (n-1, n+1, n-nx, n+nx)
                    // have odd (i + j), so no point in this pass reads a value written
                    // by another point of the same pass.
1453                //#pragma acc kernels loop private(auxf1,Paux,i,j,n) vector(512) independent
1454                for (j = 2; j < ny; j++)
1455                {
1456                    for (i = 2 + (j % 2); i < nx; i += 2)
1457                    {
1458                        n = (nx * (j - 1)) + i;
1459                        auxf1 = (AC_lin[n] * Po1_lin[n]) + (AUXW1[n] * Qo_lin[n]) + (Tgx[n] * Po3_lin[n + 1]) + (Tgx[n - 1] * Po3_lin[n - 1]) + (Tgy[n] * Po3_lin[n + nx]) + (Tgy[n - nx] * Po3_lin[n - nx]) + ((1 - AUXW1[n]));
1460                        Paux = auxf1 / (AC_lin[n] + Tgx[n] + Tgx[n - 1] + Tgy[n] + Tgy[n - nx] + ((1 - AUXW1[n])));
1461                        Paux = (omega * Paux) + ((1.0 - omega) * P_auxi[n]);
1462                        vetErro[n] = fabs(Po3_lin[n] - Paux);
1463                        Po3_lin[n] = Paux;
1464                    }
1465                }
1466                
1467                
                    // Second pass over the same ranges: the start offset 3 - (j % 2)
                    // visits the remaining points, where (i + j) is odd, reading the
                    // values the first pass has just written.
1468                //#pragma acc kernels loop private(auxf1,Paux,i,j,n) vector(512) independent
1469                for (j = 2; j < ny; j++)
1470                {
1471                    for (i = 3 - (j % 2); i < nx; i += 2)
1472                    {
1473                        n = (nx * (j - 1)) + i;
1474                        auxf1 = (AC_lin[n] * Po1_lin[n]) + (AUXW1[n] * Qo_lin[n]) + (Tgx[n] * Po3_lin[n + 1]) + (Tgx[n - 1] * Po3_lin[n - 1]) + (Tgy[n] * Po3_lin[n + nx]) + (Tgy[n - nx] * Po3_lin[n - nx]) + ((1 - AUXW1[n]));
1475                        Paux = auxf1 / (AC_lin[n] + Tgx[n] + Tgx[n - 1] + Tgy[n] + Tgy[n - nx] + ((1 - AUXW1[n])));
1476                        Paux = (omega * Paux) + ((1.0 - omega) * P_auxi[n]);
1477                        vetErro[n] = fabs(Po3_lin[n] - Paux);
1478                        Po3_lin[n] = Paux;
1479                    }
1480                }
1481                
1482                
                    // Accumulate the per-point changes into erro.
                    // NOTE(review): this sums indices 0..nxy-1, whereas every write to
                    // vetErro above uses indices 1..nxy, so index 0 is included and
                    // index nxy is left out - confirm the intended range. The pragma
                    // carries no explicit reduction clause for erro.
1483                #pragma acc parallel loop private(n)
1484                for (n = 0; n < nxy; n++) {
1485                    erro += vetErro[n];
1486                }
1487    
1488                
                    // Stop once the mean change per point falls below tol; otherwise
                    // count the sweep and go round again.
1489                if ((erro / nxy < tol)) {
1490                    //#pragma omp cancel parallel
1491                    break;
1492                }
1493                iteracoes_solver += 1;
1494            }
1495    
1496        }
1497    }

I can't post all of the code, so I have included only part of it. I commented out some pragmas for testing. When nx = ny >= 1000, the serial code is faster than the optimized code. Another thing I don't understand: I removed all the pragmas from the code and used the PGI compiler to optimize it, and the serial code was still faster, and I don't know why. Finally, I have two questions:

1 - What have I done wrong? 2 - Why was the code without the pragmas so much worse than the serial code?

Best regards, Breno.

Update: I forgot to include the compiler messages. I compiled the code with

pgcc -ta=multicore -Minfo=all file.c -o file.exe

And generated

 1360, FMA (fused multiply-add) instruction(s) generated
   1392, Generating Multicore code
       1393, #pragma acc loop gang
   1393, FMA (fused multiply-add) instruction(s) generated
   1404, Generating Multicore code
       1405, #pragma acc loop gang
   1405, FMA (fused multiply-add) instruction(s) generated
   1420, FMA (fused multiply-add) instruction(s) generated
   1439, FMA (fused multiply-add) instruction(s) generated
   1456, FMA (fused multiply-add) instruction(s) generated
   1471, FMA (fused multiply-add) instruction(s) generated
   1483, Generating Multicore code
       1484, #pragma acc loop gang
   1485, Generating implicit reduction(+:erro)

Since the code is incomplete, in error (as noted, the while loop wouldn't terminate), and you've commented out needed pragmas, it's very difficult to offer any specific suggestions. Instead, what are the compiler feedback messages (-Minfo=accel) telling you? Are the loops being successfully parallelized or being run serially on the device? What does a profiler show? You can do a quick profile by setting the environment variable "PGI_ACC_TIME=1", or use pgprof or Nsight-Systems depending on which version of the compiler you're using. — Mat Colgrove
As Mat said, -Minfo=accel is your friend. Also note that a typical approach is to progressively parallelize the code and check performance and correctness as you go, rather than doing a mass conversion. If you time individual loops before and after parallelizing that'll give you good info. — Richard
@MatColgrove, thanks for the answer. I updated the question with the compiler messages. I tried to set the environment variable PGI_ACC_TIME, but I couldn't. I added "export PGI_ACC_TIME=1" to several files, but none of them worked :/ — BRENO Luís Dutra
@Richard, thanks for the answer too. And thanks for the hint. I'm trying this now. :) — BRENO Luís Dutra
OK, from the update I see you're targeting multicore. PGI_ACC_TIME will only profile device execution, which explains why no output is given. For multicore, check your binding and how many cores are being used. Are you running through a batch job or locally on a system? If locally, open a new shell and monitor the run via 'top' to see the binding. By default, all cores will be used, including hyper-threads, with one possibility being over-subscription. You can explicitly set the number of cores to use via the environment variable ACC_NUM_CORES. — Mat Colgrove

user3629249 user3629249 · Accepted Answer · 2020-07-02T05:49:54

In general, using the OMP features results in numerous context switches, and that actually slows the execution of the code. So OMP is great for I/O-bound code but NOT for CPU-bound code.

repetitive calculations should be moved before the loop:

what is: 2:nx*ny?

The 'scope' of index variables should be as limited as reasonable. Therefore, when writing a for() statement, the first parameter should also declare the index variable, which for most of these for() statements would be similar to:

for( size_t i = 0; .,.. )

regarding: 1.0 this is a double literal, but the code is calling fabs() which expects a float, Suggest the literal be: 1.0f

As with several computations inside the loops, the computation (1.0 - omega) does not change, so it should be done only once, before the loop.

How can we say why the code, without the pragmas, is slower than the serial code when we have not seen the serial code?

regarding:

while (iteracoes_solver < 10001)

the variable: iteracoes_solver is initialized to 0 and not incremented anywhere in the posted code.

OpenACC bad performance with PGI Compiler

1 Answers