16
votes

I am trying to find an optimized C or Assembler implementation of a function that multiplies two 4x4 matrices with each other. The platform is an ARM6 or ARM7 based iPhone or iPod.

Currently, I am using a fairly standard approach - just a little loop-unrolled.

#define O(y,x) (y + (x<<2))

static inline void Matrix4x4MultiplyBy4x4 (float *src1, float *src2, float *dest)
{
    *(dest+O(0,0)) = (*(src1+O(0,0)) * *(src2+O(0,0))) + (*(src1+O(0,1)) * *(src2+O(1,0))) + (*(src1+O(0,2)) * *(src2+O(2,0))) + (*(src1+O(0,3)) * *(src2+O(3,0))); 
    *(dest+O(0,1)) = (*(src1+O(0,0)) * *(src2+O(0,1))) + (*(src1+O(0,1)) * *(src2+O(1,1))) + (*(src1+O(0,2)) * *(src2+O(2,1))) + (*(src1+O(0,3)) * *(src2+O(3,1))); 
    *(dest+O(0,2)) = (*(src1+O(0,0)) * *(src2+O(0,2))) + (*(src1+O(0,1)) * *(src2+O(1,2))) + (*(src1+O(0,2)) * *(src2+O(2,2))) + (*(src1+O(0,3)) * *(src2+O(3,2))); 
    *(dest+O(0,3)) = (*(src1+O(0,0)) * *(src2+O(0,3))) + (*(src1+O(0,1)) * *(src2+O(1,3))) + (*(src1+O(0,2)) * *(src2+O(2,3))) + (*(src1+O(0,3)) * *(src2+O(3,3))); 
    *(dest+O(1,0)) = (*(src1+O(1,0)) * *(src2+O(0,0))) + (*(src1+O(1,1)) * *(src2+O(1,0))) + (*(src1+O(1,2)) * *(src2+O(2,0))) + (*(src1+O(1,3)) * *(src2+O(3,0))); 
    *(dest+O(1,1)) = (*(src1+O(1,0)) * *(src2+O(0,1))) + (*(src1+O(1,1)) * *(src2+O(1,1))) + (*(src1+O(1,2)) * *(src2+O(2,1))) + (*(src1+O(1,3)) * *(src2+O(3,1))); 
    *(dest+O(1,2)) = (*(src1+O(1,0)) * *(src2+O(0,2))) + (*(src1+O(1,1)) * *(src2+O(1,2))) + (*(src1+O(1,2)) * *(src2+O(2,2))) + (*(src1+O(1,3)) * *(src2+O(3,2))); 
    *(dest+O(1,3)) = (*(src1+O(1,0)) * *(src2+O(0,3))) + (*(src1+O(1,1)) * *(src2+O(1,3))) + (*(src1+O(1,2)) * *(src2+O(2,3))) + (*(src1+O(1,3)) * *(src2+O(3,3))); 
    *(dest+O(2,0)) = (*(src1+O(2,0)) * *(src2+O(0,0))) + (*(src1+O(2,1)) * *(src2+O(1,0))) + (*(src1+O(2,2)) * *(src2+O(2,0))) + (*(src1+O(2,3)) * *(src2+O(3,0))); 
    *(dest+O(2,1)) = (*(src1+O(2,0)) * *(src2+O(0,1))) + (*(src1+O(2,1)) * *(src2+O(1,1))) + (*(src1+O(2,2)) * *(src2+O(2,1))) + (*(src1+O(2,3)) * *(src2+O(3,1))); 
    *(dest+O(2,2)) = (*(src1+O(2,0)) * *(src2+O(0,2))) + (*(src1+O(2,1)) * *(src2+O(1,2))) + (*(src1+O(2,2)) * *(src2+O(2,2))) + (*(src1+O(2,3)) * *(src2+O(3,2))); 
    *(dest+O(2,3)) = (*(src1+O(2,0)) * *(src2+O(0,3))) + (*(src1+O(2,1)) * *(src2+O(1,3))) + (*(src1+O(2,2)) * *(src2+O(2,3))) + (*(src1+O(2,3)) * *(src2+O(3,3))); 
    *(dest+O(3,0)) = (*(src1+O(3,0)) * *(src2+O(0,0))) + (*(src1+O(3,1)) * *(src2+O(1,0))) + (*(src1+O(3,2)) * *(src2+O(2,0))) + (*(src1+O(3,3)) * *(src2+O(3,0))); 
    *(dest+O(3,1)) = (*(src1+O(3,0)) * *(src2+O(0,1))) + (*(src1+O(3,1)) * *(src2+O(1,1))) + (*(src1+O(3,2)) * *(src2+O(2,1))) + (*(src1+O(3,3)) * *(src2+O(3,1))); 
    *(dest+O(3,2)) = (*(src1+O(3,0)) * *(src2+O(0,2))) + (*(src1+O(3,1)) * *(src2+O(1,2))) + (*(src1+O(3,2)) * *(src2+O(2,2))) + (*(src1+O(3,3)) * *(src2+O(3,2))); 
    *(dest+O(3,3)) = (*(src1+O(3,0)) * *(src2+O(0,3))) + (*(src1+O(3,1)) * *(src2+O(1,3))) + (*(src1+O(3,2)) * *(src2+O(2,3))) + (*(src1+O(3,3)) * *(src2+O(3,3))); 
};

Would I benefit from using the Strassen- or the Coppersmith–Winograd algorithm?

5
does cpu involved have fpu operations?Jack
If you would like to use a nice little iOS framework to do the timing have a look at my blog post modejong.com/blog/post10_arm_timing_framework/index.htmlMoDJ

5 Answers

36
votes

No, the Strassen or Coppersmith-Winograd algorithm wouldn't make much difference here. They start to pay off for larger matrices only.

If your matrix-multiplication is really a bottleneck you could rewrite the algorithm using NEON SIMD instructions. That would only help for ARMv7 as ARMv6 does not has this extension though.

I'd expect a factor 3 speedup over the compiled C-code for your case.

EDIT: You can find a nice implementation in ARM-NEON here: http://code.google.com/p/math-neon/

For your C-code there are two things you could do to speed up the code:

  1. Don't inline the function. Your matrix multiplication generates quite a bit of code as it's unrolled, and the ARM only has a very tiny instruction cache. Excessive inlining can make your code slower because the CPU will be busy loading code into the cache instead of executing it.

  2. Use the restrict keyword to tell the compiler that the source- and destination pointers don't overlap in memory. Currently the compiler is forced to reload every source value from memory whenever a result is written because it has to assume that source and destination may overlap or even point to the same memory.

20
votes

Just nitpicking. I wonder why people still obfuscate their code voluntarly? C is already difficult to read, no need to add to it.

static inline void Matrix4x4MultiplyBy4x4 (float src1[4][4], float src2[4][4], float dest[4][4])
{
dest[0][0] = src1[0][0] * src2[0][0] + src1[0][1] * src2[1][0] + src1[0][2] * src2[2][0] + src1[0][3] * src2[3][0]; 
dest[0][1] = src1[0][0] * src2[0][1] + src1[0][1] * src2[1][1] + src1[0][2] * src2[2][1] + src1[0][3] * src2[3][1]; 
dest[0][2] = src1[0][0] * src2[0][2] + src1[0][1] * src2[1][2] + src1[0][2] * src2[2][2] + src1[0][3] * src2[3][2]; 
dest[0][3] = src1[0][0] * src2[0][3] + src1[0][1] * src2[1][3] + src1[0][2] * src2[2][3] + src1[0][3] * src2[3][3]; 
dest[1][0] = src1[1][0] * src2[0][0] + src1[1][1] * src2[1][0] + src1[1][2] * src2[2][0] + src1[1][3] * src2[3][0]; 
dest[1][1] = src1[1][0] * src2[0][1] + src1[1][1] * src2[1][1] + src1[1][2] * src2[2][1] + src1[1][3] * src2[3][1]; 
dest[1][2] = src1[1][0] * src2[0][2] + src1[1][1] * src2[1][2] + src1[1][2] * src2[2][2] + src1[1][3] * src2[3][2]; 
dest[1][3] = src1[1][0] * src2[0][3] + src1[1][1] * src2[1][3] + src1[1][2] * src2[2][3] + src1[1][3] * src2[3][3]; 
dest[2][0] = src1[2][0] * src2[0][0] + src1[2][1] * src2[1][0] + src1[2][2] * src2[2][0] + src1[2][3] * src2[3][0]; 
dest[2][1] = src1[2][0] * src2[0][1] + src1[2][1] * src2[1][1] + src1[2][2] * src2[2][1] + src1[2][3] * src2[3][1]; 
dest[2][2] = src1[2][0] * src2[0][2] + src1[2][1] * src2[1][2] + src1[2][2] * src2[2][2] + src1[2][3] * src2[3][2]; 
dest[2][3] = src1[2][0] * src2[0][3] + src1[2][1] * src2[1][3] + src1[2][2] * src2[2][3] + src1[2][3] * src2[3][3]; 
dest[3][0] = src1[3][0] * src2[0][0] + src1[3][1] * src2[1][0] + src1[3][2] * src2[2][0] + src1[3][3] * src2[3][0]; 
dest[3][1] = src1[3][0] * src2[0][1] + src1[3][1] * src2[1][1] + src1[3][2] * src2[2][1] + src1[3][3] * src2[3][1]; 
dest[3][2] = src1[3][0] * src2[0][2] + src1[3][1] * src2[1][2] + src1[3][2] * src2[2][2] + src1[3][3] * src2[3][2]; 
dest[3][3] = src1[3][0] * src2[0][3] + src1[3][1] * src2[1][3] + src1[3][2] * src2[2][3] + src1[3][3] * src2[3][3]; 
};
3
votes

Are you sure that your unrolled code is faster than the explicit loop based approach? Mind that the compilers are usually better than humans performing optimizations!

In fact, I'd bet there are more chances for a compiler to emit automatically SIMD instructions from a well written loop than from a series of "unrelated" statements...

You could also specify the matrices sizes in the argument declaration. Then you could use the normal bracket syntax to access the elements, and it could also be a good hint for the compiler to make its optimisations too.

2
votes

Are these arbitrary matrices or do they have any symmetries? If so, those symmetries can often to exploited for improved performance (for example in rotation matrices).

Also, I agree with fortran above, and would run some timing tests to verify that your hand unrolled code is faster than an optimizing compiler can create. At the least, you may be able to simplify your code.

Paul

2
votes

Your completely unrolled traditional product is likely pretty fast.

Your matrix is too small to overcome the overheard of managing a Strassen multiplication in its traditional form with explicit indexes and partitioning code; you'd likely lose any effect on optimization to that overhead.

But if you want fast, I'd use SIMD instructions if they are available. I'd be surprised if the ARM chips these days don't have them. If they do, you can manage all the products in row/colum in a single instruction; if the SIMD is 8 wide, you might manage 2 row/column multiplies in a single instruction. Setting the operands up to do that instruction might require some dancing around; SIMD instructions will easily pick up your rows (adjacent values), but will not pick up the columns (non-contiguous). And it may take some effort to compute the sum of the products in a row/column.