MPI Block matrix multiplication

Question

I am trying to generate two matrices A&B of size n, partition them into s*s sub-matrices and after scattering them through the processors, perform a multiplication between the block matrices. I have been able to successfully generate and scatter the sub-matrices through the processors; however, I am stuck in performing multiplication on the sub-matrices at each processor. My code is pretty much similar to the one in the following post (the code in the answer section) but I modified it for two matrices: MPI partition matrix into blocks

Could you please tell me how I can modify this to perform multiplication?

I stayed with the same tags to be easier to follow up.

    #include <stdio.h>
    #include <stdlib.h>
    #include <mpi.h>
    #include <time.h>

    #define COLSa 10
    #define ROWSa 10

    #define COLSb 10
    #define ROWSb 10
    #define s 2

   int main(int argc, char **argv) {

    MPI_Init(&argc, &argv);
    int p, rank;
    MPI_Comm_size(MPI_COMM_WORLD, &p);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    char i;
    char j;

    char a[ROWSa*COLSa];
    char b[ROWSb*COLSb];
    char c[ROWSa*COLSb];  // c=a*b

    const int NPROWS=s;  /* number of rows in _decomposition_ */
    const int NPCOLS=s;  /* number of cols in _decomposition_ */

    const int BLOCKROWSa = ROWSa/NPROWS;  /* number of rows in _block_ */
    const int BLOCKCOLSa = COLSa/NPCOLS; /* number of cols in _block_ */

    const int BLOCKROWSb = ROWSb/NPROWS;  /* number of rows in _block_ */
    const int BLOCKCOLSb= COLSb/NPCOLS; /* number of cols in _block_ */

    if (rank == 0) {

    for (int ii=0; ii<ROWSa*COLSa; ii++) {
         a[ii]=rand() %10 ;
    }

    for (int ii=0; ii<ROWSb*COLSb; ii++) {
         b[ii]=rand() %10 ;

       }
    }

    char BLa[BLOCKROWSa*BLOCKCOLSa];
    for (int ii=0; ii<BLOCKROWSa*BLOCKCOLSa; ii++)
    BLa[ii] = 0;

    char BLb[BLOCKROWSb*BLOCKCOLSb];
    for (int ii=0; ii<BLOCKROWSb*BLOCKCOLSb; ii++)
    BLb[ii] = 0;  

    char BLc[BLOCKROWSa*BLOCKCOLSb];
    for (int ii=0; ii<BLOCKROWSa*BLOCKCOLSb; ii++)
    BLc[ii] = 0; 

    MPI_Datatype blocktype;
    MPI_Datatype blocktype2;

    MPI_Type_vector(BLOCKROWSa, BLOCKCOLSa, COLSa, MPI_CHAR, &blocktype2);
    MPI_Type_vector(BLOCKROWSb, BLOCKCOLSb, COLSb, MPI_CHAR, &blocktype2);

    MPI_Type_create_resized( blocktype2, 0, sizeof(char), &blocktype);
    MPI_Type_commit(&blocktype);

    int dispsa[NPROWS*NPCOLS];
    int countsa[NPROWS*NPCOLS];
    int dispsb[NPROWS*NPCOLS];
    int countsb[NPROWS*NPCOLS];

    //*******************************Start Time Record****************//

    clock_t t;
    t=clock();

    for (int ii=0; ii<NPROWS; ii++) {
    for (int jj=0; jj<NPCOLS; jj++) {
    dispsa[ii*NPCOLS+jj] = ii*COLSa*BLOCKROWSa+jj*BLOCKCOLSa;
    countsa [ii*NPCOLS+jj] = 1;
        }
    }

    MPI_Scatterv(a, countsa, dispsa, blocktype, BLa, BLOCKROWSa*BLOCKCOLSa, MPI_CHAR, 0,   MPI_COMM_WORLD);


    for (int ii=0; ii<NPROWS; ii++) {
    for (int jj=0; jj<NPCOLS; jj++) {
    dispsb[ii*NPCOLS+jj] = ii*COLSb*BLOCKROWSb+jj*BLOCKCOLSb;
    countsb [ii*NPCOLS+jj] = 1;
         }
    }

    MPI_Scatterv(b, countsb, dispsb, blocktype, BLb, BLOCKROWSb*BLOCKCOLSb, MPI_CHAR, 0, MPI_COMM_WORLD);




     for (int proc=0; proc<p; proc++) {
        if (proc == rank) {

          printf("Rank = %d\n", rank);

                if (rank == 0) {
                  printf("Global matrix A : \n");

                   for (int ii=0; ii<ROWSa; ii++) {
                     for (int jj=0; jj<COLSa; jj++) {
                       printf("%3d ",(int)a[ii*COLSa+jj]);
                }
                        printf("\n");
            }
                 printf("\n");
            printf("Global matrix B : \n");

           for (int ii=0; ii<ROWSb; ii++) {
             for (int jj=0; jj<COLSb; jj++) {
              printf("%3d ",(int)b[ii*COLSb+jj]);
                }
         printf("\n");
            }
        printf("\n");
                  printf("Local Matrix A:\n");
              for (int ii=0; ii<BLOCKROWSa; ii++) {
                for (int jj=0; jj<BLOCKCOLSa; jj++) {
            printf("%3d ",(int)BLa[ii*BLOCKCOLSa+jj]);

                }

             printf("\n");
            }

           printf("\n");
              printf("Local Matrix B:\n");
                for (int ii=0; ii<BLOCKROWSb; ii++) {
                   for (int jj=0; jj<BLOCKCOLSb; jj++) {
                       printf("%3d ",(int)BLb[ii*BLOCKCOLSb+jj]);

                }

          printf("\n");
            }
                }


            printf("Local Matrix A:\n");
                    for (int ii=0; ii<BLOCKROWSa; ii++) {
                   for (int jj=0; jj<BLOCKCOLSa; jj++) {
                       printf("%3d ",(int)BLa[ii*BLOCKCOLSa+jj]);
                  }

             printf("\n");
            }

          printf("Local Matrix B:\n");
            for (int ii=0; ii<BLOCKROWSb; ii++) {
                for (int jj=0; jj<BLOCKCOLSb; jj++) {
                   printf("%3d ",(int)BLb[ii*BLOCKCOLSb+jj]);
                }

        printf("\n");
            }

  //**********************Multiplication***********************//

       for (int i = 0; i < BLOCKROWSa; i++) {
          for (j = 0; j < BLOCKCOLSb; j++) {

        for (k = 0; k < BLOCKCOLSb; k++) {  //I am considering square matrices with the same sizes
               BLc[i + j*BLOCKROWSa] += BLa[i + k*BLOCKROWSa]*BLb[k + BLOCKCOLb*j];
                  printf("%3d ",(int)BLc[i+j*BLOCKROWSa]);
                     }
    printf("\n");

                 }

      printf("\n");

             }

       }

      MPI_Barrier(MPI_COMM_WORLD);
    }

   MPI_Finalize();

   //**********************End Time Record************************//

    t=clock()-t;
     printf("It took %f seconds (%d clicks).\n",t,((float)t)/CLOCKS_PER_SEC);


       return 0;
     }

Except a missing int k; and a BLc[i + j*BLOCKROWSa] += BLa[i + k*BLOCKROWSa]*BLb[k + BLOCKCOLb*j]; to turn into BLc[i + j*BLOCKROWSa] += BLa[i + k*BLOCKROWSa]*BLb[k + BLOCKCOLSb*j]; (one S more), there is nothing particularly weird with your code, as long as you wish to perform multiplications between block matrices. Why do you think you are stuck ? Why are you unsatisfied with your code ? It worked with mpicc main.c -o main -std=c99 and mpirun -np 4 main. — francis
Hi Francis. Thanks for your comment and corrections. However, by this code I am not able to get one single matrix as a result of multiplication at each processors, for some reason I get 5 of them at each processor!! — THTH
OK I managed to fix that part and now I get one single product outcome. But the multiplications are not correct! Something is wrong with the math! — THTH
A minor addition, unrelated to your issue. You're using rand(), but you haven't initiated the seed with srand. So each run you're actually using a same matrix. You can add a ''srand(time(NULL)) '' to fix that one. — Hrishikesh

francis francis · Accepted Answer · 2014-03-16T15:04:44

To get the blocks back into the matrix on proc 0, you may use the "opposite" of MPI_Scatterv() which is called MPI_Gatherv() http://www.mpich.org/static/docs/latest/www3/MPI_Gatherv.html :

MPI_Gatherv(BLc, BLOCKROWSb*BLOCKCOLSb,MPI_CHAR, c, countsb, dispsb,blocktype, 0, MPI_COMM_WORLD);

if (rank == 0) {
    printf("Global matrix C : \n");

    for (int ii=0; ii<ROWSa; ii++) {
        for (int jj=0; jj<COLSa; jj++) {
            printf("%3d ",(int)c[ii*COLSa+jj]);
        }
        printf("\n");
    }
}

Keep in mind that you are performing blockwise multiplications, that is different from a matrix multiplication.

Bye,

Francis

MPI Block matrix multiplication

1 Answers