I'm doing dense matrix multiplication on 1024x1024 matrices. I do this with loop blocking/tiling, using 64x64 tiles. I have created a highly optimized 64x64 matrix multiplication function (see the end of my question for the code) with the signature
gemm64(float *a, float *b, float *c, int stride).
Here is the code which loops over the tiles. A 1024x1024 matrix has a 16x16 grid of 64x64 tiles.
for(int i=0; i<16; i++) {
    for(int j=0; j<16; j++) {
        for(int k=0; k<16; k++) {
            gemm64(&a[64*(i*1024 + k)], &b[64*(k*1024 + j)], &c[64*(i*1024 + j)], 1024);
        }
    }
}
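For reference, this tiled loop computes the same thing as a plain triple loop doing c += a*b on the full matrices. A minimal scalar sketch (hypothetical reference code, not my benchmark):

// Hypothetical reference: straightforward c += a*b for row-major 1024x1024 matrices.
// The blocked loop above produces the same result, just with better locality.
void gemm1024_ref(const float *a, const float *b, float *c) {
    for(int i=0; i<1024; i++) {
        for(int k=0; k<1024; k++) {
            float aik = a[1024*i + k];
            for(int j=0; j<1024; j++) {
                c[1024*i + j] += aik * b[1024*k + j];
            }
        }
    }
}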
However, as a guess, I tried rearranging the tiles of the b matrix into a new matrix b2 (see the end of this question for the code) so that the stride of each tile is 64 instead of 1024. This effectively turns b2 into an array of contiguous 64x64 matrices, each with stride 64.
float *b2 = (float*)_mm_malloc(1024*1024*sizeof(float), 64);
reorder(b, b2, 1024);
for(int i=0; i<16; i++) {
    for(int j=0; j<16; j++) {
        for(int k=0; k<16; k++) {
            gemm64_v2(&a[64*(i*1024 + k)], &b2[64*64*(k*16 + j)], &c[64*(i*1024 + j)], 64);
        }
    }
}
_mm_free(b2);
Notice how the offset for b changed from &b[64*(k*1024 + j)] to &b2[64*64*(k*16 + j)], and that the stride passed to gemm64 changed from 1024 to 64.
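In other words, tile (k, j) of b used to be a 64x64 window into a matrix with a 1024-float row stride, and is now one contiguous 64x64 block (16 KB). A small sketch of the two address computations (the helper names are just for illustration):

// Illustration only (hypothetical helpers). Tile (k, j) in the original b:
// consecutive rows of the tile are 1024 floats apart.
float* tile_in_b (float *b,  int k, int j) { return &b [64*(k*1024 + j)]; }
// The same tile in the reordered b2: one contiguous 64x64 block,
// so consecutive rows of the tile are only 64 floats apart.
float* tile_in_b2(float *b2, int k, int j) { return &b2[64*64*(k*16 + j)]; }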
The performance of my code jumps from less than 20% to 70% of the peak flops on my Sandy Bridge system!
Why does rearranging the tiles in the b matrix in this way make such a huge difference?
The arrays a, b, b2, and c are all 64-byte aligned.
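The allocations of a, b, and c are not shown here; they can be assumed to look like the b2 allocation above (a sketch of my setup):

// Assumption: the other arrays are allocated like b2, 64-byte aligned so the
// 32-byte AVX loads in the kernel never cross a cache-line boundary.
float *a = (float*)_mm_malloc(1024*1024*sizeof(float), 64);
float *b = (float*)_mm_malloc(1024*1024*sizeof(float), 64);
float *c = (float*)_mm_malloc(1024*1024*sizeof(float), 64);
// ... fill a and b, zero c, run the tiled loops ...
_mm_free(a); _mm_free(b); _mm_free(c);

Here is gemm64 and its inner kernel: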
extern "C" void gemm64(float *a, float*b, float*c, int stride) {
for(int i=0; i<64; i++) {
row_m64x64(&a[1024*i], b, &c[1024*i], stride);
}
}
void row_m64x64(const float *a, const float *b, float *c, int stride) {
    // Computes one row of the c tile: c[0..63] += sum over i of a[i] * b[stride*i + 0..63].
    // The 64 floats of the c row are kept in eight 256-bit accumulators for the whole loop.
    __m256 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    tmp0 = _mm256_loadu_ps(&c[ 0]);
    tmp1 = _mm256_loadu_ps(&c[ 8]);
    tmp2 = _mm256_loadu_ps(&c[16]);
    tmp3 = _mm256_loadu_ps(&c[24]);
    tmp4 = _mm256_loadu_ps(&c[32]);
    tmp5 = _mm256_loadu_ps(&c[40]);
    tmp6 = _mm256_loadu_ps(&c[48]);
    tmp7 = _mm256_loadu_ps(&c[56]);
    for(int i=0; i<64; i++) {
        // Broadcast a[i] and multiply it with all 64 floats of row i of the b tile.
        __m256 areg0 = _mm256_broadcast_ss(&a[i]);
        __m256 breg0 = _mm256_loadu_ps(&b[stride*i +  0]);
        tmp0 = _mm256_add_ps(_mm256_mul_ps(areg0,breg0), tmp0);
        __m256 breg1 = _mm256_loadu_ps(&b[stride*i +  8]);
        tmp1 = _mm256_add_ps(_mm256_mul_ps(areg0,breg1), tmp1);
        __m256 breg2 = _mm256_loadu_ps(&b[stride*i + 16]);
        tmp2 = _mm256_add_ps(_mm256_mul_ps(areg0,breg2), tmp2);
        __m256 breg3 = _mm256_loadu_ps(&b[stride*i + 24]);
        tmp3 = _mm256_add_ps(_mm256_mul_ps(areg0,breg3), tmp3);
        __m256 breg4 = _mm256_loadu_ps(&b[stride*i + 32]);
        tmp4 = _mm256_add_ps(_mm256_mul_ps(areg0,breg4), tmp4);
        __m256 breg5 = _mm256_loadu_ps(&b[stride*i + 40]);
        tmp5 = _mm256_add_ps(_mm256_mul_ps(areg0,breg5), tmp5);
        __m256 breg6 = _mm256_loadu_ps(&b[stride*i + 48]);
        tmp6 = _mm256_add_ps(_mm256_mul_ps(areg0,breg6), tmp6);
        __m256 breg7 = _mm256_loadu_ps(&b[stride*i + 56]);
        tmp7 = _mm256_add_ps(_mm256_mul_ps(areg0,breg7), tmp7);
    }
    _mm256_storeu_ps(&c[ 0], tmp0);
    _mm256_storeu_ps(&c[ 8], tmp1);
    _mm256_storeu_ps(&c[16], tmp2);
    _mm256_storeu_ps(&c[24], tmp3);
    _mm256_storeu_ps(&c[32], tmp4);
    _mm256_storeu_ps(&c[40], tmp5);
    _mm256_storeu_ps(&c[48], tmp6);
    _mm256_storeu_ps(&c[56], tmp7);
}
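To make the kernel easier to follow, here is a scalar equivalent of row_m64x64 (just a sketch for clarity, not code I actually run):

// Scalar equivalent of row_m64x64: accumulate one 64-float row of the c tile,
// where consecutive rows of the b tile are `stride` floats apart.
void row_m64x64_scalar(const float *a, const float *b, float *c, int stride) {
    for(int i=0; i<64; i++) {
        for(int j=0; j<64; j++) {
            c[j] += a[i] * b[stride*i + j];
        }
    }
}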
The code to reorder the b matrix:
void reorder(float *b, float *b2, int stride) {
    // Copy each 64x64 tile of b (row stride 1024) into its own contiguous
    // 64x64 block of b2 (row stride 64), with tiles stored in row-major tile order.
    //int k = 0;
    for(int i=0; i<16; i++) {
        for(int j=0; j<16; j++) {
            for(int i2=0; i2<64; i2++) {
                for(int j2=0; j2<64; j2++) {
                    //b2[k++] = b[1024*(64*i+i2) + 64*j + j2];
                    b2[64*64*(i*16 + j) + 64*i2 + j2] = b[1024*(64*i+i2) + 64*j + j2];
                }
            }
        }
    }
}
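A quick way to sanity-check the reordered layout is to compare the two layouts element by element (hypothetical helper, not part of my benchmark):

// Hypothetical check: element (i2, j2) of tile (i, j) must be the same whether
// it is read from b (row stride 1024) or from b2 (row stride 64).
int reorder_matches(const float *b, const float *b2) {
    for(int i=0; i<16; i++)
        for(int j=0; j<16; j++)
            for(int i2=0; i2<64; i2++)
                for(int j2=0; j2<64; j2++)
                    if(b[1024*(64*i+i2) + 64*j + j2] != b2[64*64*(i*16 + j) + 64*i2 + j2])
                        return 0;
    return 1;
}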