I'm playing around with the vector instrinsics in GCC, particularly for AVX, and I'm tempted to write something like this to do a vector multiply between two arrays:
#include <unistd.h>
void __attribute__((target("avx"))) vmul(float* __restrict__ cc, const float* __restrict__ aa, const float* __restrict__ bb, ssize_t size) {
const ssize_t VECSIZE=8;
typedef float vfloat __attribute__((vector_size(sizeof(float)*VECSIZE)));
// duff's device, process any remainder up front
ssize_t rem = size % VECSIZE;
switch (rem) {
case 7: cc[6] = aa[6]*bb[6]; /* FALLTHRU */
case 6: cc[5] = aa[5]*bb[5]; /* FALLTHRU */
case 5: cc[4] = aa[4]*bb[4]; /* FALLTHRU */
case 4: cc[3] = aa[3]*bb[3]; /* FALLTHRU */
case 3: cc[2] = aa[2]*bb[2]; /* FALLTHRU */
case 2: cc[1] = aa[1]*bb[1]; /* FALLTHRU */
case 1: cc[0] = aa[0]*bb[0]; /* FALLTHRU */
case 0: break;
}
size -= rem;
// process rest of array
const vfloat *va = (const vfloat*)(aa+rem);
const vfloat *vb = (const vfloat*)(bb+rem);
vfloat *vc = (vfloat*)(cc+rem);
for (ssize_t ii=0; ii < size; ii++) {
vc[ii] = va[ii]*vb[ii];
}
}
int main() {
}
The problem is the pointer-aliasing required to get data into the vector type. GCC happily lets you do it (no warning with -Wall -Wextra -ansi -pedantic), but then assumes the underlying memory alignment is appropriate. So it generates vmovaps instructions in the inner loop:
0x0000000000400660 <+176>: vmovaps (%rsi,%rax,1),%ymm0
0x0000000000400665 <+181>: vmulps (%rdx,%rax,1),%ymm0,%ymm0
0x000000000040066a <+186>: vmovaps %ymm0,(%rdi,%rax,1)
0x000000000040066f <+191>: add $0x20,%rax
0x0000000000400673 <+195>: cmp %r8,%rax
0x0000000000400676 <+198>: jne 0x400660 <_Z4vmulPfPKfS1_l+176>
Which is fine, until you pass in some non-aligned memory (or a size not a multiple of 8 in my case), and then it happily segfaults your program trying to load unaligned memory with an aligned instruction.
Is there a proper way to do this with the vector extensions?