Matrix divide and Conquer memory optimization

Question

I am currently implementing Strassen's Matrix Multiplication algorithm on C and got it to work correctly. However, the way I am passing around the sub matrices on each recursion is far from optimal as I make a new array and copy the elements from the original matrix. Here is my implementation

// this algorithm assumes that both matrices have the same size and are square matrices.
int **matrix_multiply_divide_conquer(const int **A, const int **B, int length, int threshold)
{
  int **result;
  int i, j, k;
  // allocate result space
  result = (int **)malloc(sizeof(int *) * length);
  for (i = 0; i < length; i++)
  {
    result[i] = (int *)malloc(sizeof(int) * length);
  }

  if (length == threshold)
  {
    // compute the dot product
    matrix_multiply((const int **)A, (const int **)B, result, length, length, length, false);
    return result;
  }
  else
  {
    // stored in an int given that the matrix length is guaranteed to be a power of 2
    int nLength = length / 2;
    int i;
    int **a11;
    int **a12;
    int **a21;
    int **a22;

    int **b11;
    int **b12;
    int **b21;
    int **b22;

    // allocate new sub matrices
    a11 = (int **)malloc(sizeof(int *) * nLength);
    a12 = (int **)malloc(sizeof(int *) * nLength);
    a21 = (int **)malloc(sizeof(int *) * nLength);
    a22 = (int **)malloc(sizeof(int *) * nLength);

    b11 = (int **)malloc(sizeof(int *) * nLength);
    b12 = (int **)malloc(sizeof(int *) * nLength);
    b21 = (int **)malloc(sizeof(int *) * nLength);
    b22 = (int **)malloc(sizeof(int *) * nLength);

    for (i = 0; i < nLength; i++)
    {
      a11[i] = (int *)malloc(sizeof(int) * nLength);
      a12[i] = (int *)malloc(sizeof(int) * nLength);
      a21[i] = (int *)malloc(sizeof(int) * nLength);
      a22[i] = (int *)malloc(sizeof(int) * nLength);

      b11[i] = (int *)malloc(sizeof(int) * nLength);
      b12[i] = (int *)malloc(sizeof(int) * nLength);
      b21[i] = (int *)malloc(sizeof(int) * nLength);
      b22[i] = (int *)malloc(sizeof(int) * nLength);
    }

    a11 = get_sub_matrix((const int **)A, 0, 0, nLength);
    a12 = get_sub_matrix((const int **)A, 0, nLength, nLength);
    a21 = get_sub_matrix((const int **)A, nLength, 0, nLength);
    a22 = get_sub_matrix((const int **)A, nLength, nLength, nLength);


    b11 = get_sub_matrix((const int **)B, 0, 0, nLength);
    b12 = get_sub_matrix((const int **)B, 0, nLength, nLength);
    b21 = get_sub_matrix((const int **)B, nLength, 0, nLength);
    b22 = get_sub_matrix((const int **)B, nLength, nLength, nLength);

    // combine the results
    int **c11 = matrix_addition((const int **)matrix_multiply_divide_conquer((const int **)a11, (const int **)b11, nLength, 4), (const int **)matrix_multiply_divide_conquer((const int **)a12, (const int **)b21, nLength, 4), nLength, nLength);
    int **c12 = matrix_addition((const int **)matrix_multiply_divide_conquer((const int **)a11, (const int **)b12, nLength, 4), (const int **)matrix_multiply_divide_conquer((const int **)a12, (const int **)b22, nLength, 4), nLength, nLength);
    int **c21 = matrix_addition((const int **)matrix_multiply_divide_conquer((const int **)a21, (const int **)b11, nLength, 4), (const int **)matrix_multiply_divide_conquer((const int **)a22, (const int **)b21, nLength, 4), nLength, nLength);
    int **c22 = matrix_addition((const int **)matrix_multiply_divide_conquer((const int **)a21, (const int **)b12, nLength, 4), (const int **)matrix_multiply_divide_conquer((const int **)a22, (const int **)b22, nLength, 4), nLength, nLength);

    // combine result quarters
    combine_sub_matrix((const int **)c11, result, nLength, 0, 0);
    combine_sub_matrix((const int **)c12, result, nLength, 0, nLength);
    combine_sub_matrix((const int **)c21, result, nLength, nLength, 0);
    combine_sub_matrix((const int **)c22, result, nLength, nLength, nLength);
    return result;
  }
}

this is the get_sub_matrix method:

int **get_sub_matrix(const int **A, int verticalOffset, int horizontalOffset, int size)
{
  int **sub_matrix;
  sub_matrix = (int **)malloc(sizeof(int *) * size);
  int i, j;
  for (i = 0; i < size; i++)
  {
    //each row will contain a number of columns size
    sub_matrix[i] = (int *)malloc(sizeof(int) * size);
  }
  for (i = 0; i < size; i++)
  {
    for (j = 0; j < size; j++)
    {
      sub_matrix[i][j] = A[i + verticalOffset][j + horizontalOffset];
    }
  }
  return sub_matrix;
}

I am sure this is not the optimal way of passing the submatrices around as they are only read and never modified, so I would like to ask you if using pointers for this problem is possible and how to do it (as I am still wrapping my head around them).

I have tried the following but it did not work:

int **a11 = &A[0][0];
int **a12 = &A[0][nLength];
int **a21 = &A[nLength][0] ;
int **a22 = &A[nLength][nLength];

int **b11 = &B[0][0];
int **b12 = &B[0][nLength];
int **b21 = &B[nLength][0] ;
int **b22 = &B[nLength][nLength];

Thanks for helping me out.

Paul Ogilvie Paul Ogilvie · Accepted Answer · 2017-11-05T12:26:02

You are now working with an array of rows,

 int **x = malloc(sizeof(int *) * size);

and then you allocate the row memory for each row.

for (i = 0; i < length; i++)
{
    x[i] = malloc(sizeof(int) * length);
}

Instead, you could work with a single block of memory that you treat as a 2 dimensional array:

int *x = malloc(sizeof(int) * size * size);

This allows you to point to the beginning of a sub-matrix and pass that to your other functions. But note how you must do your matrix indexation. Because the compiler now doesn't know that you treat the single block as a 2 dimensinal aray, you must do the indexing explicitly yourself. So given a matrix of size length by length, you address x[2][3] (the fourth element of the third row, as C is zero-based) as:

x [ (2*length) + 3 ];

Also, when passing pointers to sub matrices to your functions, you must develop a convention to inform your functions on which sub matrix it operates. For example:

void my_matrix_op(int *X, int quadrant, int length)

I have not gone through all your code to determine if this approach is really feasible for your algorithm but I hope this helps you.

Matrix divide and Conquer memory optimization

1 Answers