If you want 128-bit multiplication, then this should work. The inline assembly is in AT&T format.
/*
 * FASTMUL128 - multiply two unsigned 128-bit integers.
 *
 * TA, TB: the two factors.
 *
 * Returns the low 128 bits of TA * TB (i.e. the product modulo 2^128).
 *
 * Each operand is split into 64-bit halves:
 *
 *   TA * TB mod 2^128 = a_lo*b_lo + ((a_lo*b_hi + a_hi*b_lo) << 64)
 *
 * Only a_lo*b_lo needs a full 64x64 -> 128 bit product; the cross terms
 * contribute just their low 64 bits to the upper half of the result, and
 * a_hi*b_hi lies entirely above bit 127.
 */
__uint128_t FASTMUL128(const __uint128_t TA, const __uint128_t TB)
{
    /* Shifts (rather than a union overlay) keep the split independent of
     * byte order. */
    const unsigned long long a_lo = (unsigned long long)TA;
    const unsigned long long a_hi = (unsigned long long)(TA >> 64);
    const unsigned long long b_lo = (unsigned long long)TB;
    const unsigned long long b_hi = (unsigned long long)(TB >> 64);
    unsigned long long lo;
    unsigned long long hi;

#if defined(__x86_64__) && defined(__GNUC__)
    /*
     * One-operand MUL (AT&T syntax): RDX:RAX = RAX * operand.
     * "+a" ties `lo` to RAX as both input and output, "=d" receives the
     * high half from RDX, and the other factor may live in any register
     * or in memory.  No other register is touched; only the flags are
     * clobbered.  The asm is a pure function of its inputs, so it does
     * not need to be volatile.
     */
    lo = a_lo;
    __asm__("mulq %2"
            : "+a"(lo), "=d"(hi)
            : "rm"(b_lo)
            : "cc");
#else
    /* Portable fallback: let the compiler form the widening product. */
    const __uint128_t low_product = (__uint128_t)a_lo * b_lo;

    lo = (unsigned long long)low_product;
    hi = (unsigned long long)(low_product >> 64);
#endif

    /* Unsigned arithmetic wraps modulo 2^64, which is exactly the
     * truncation the upper half of the result requires. */
    hi += a_lo * b_hi + a_hi * b_lo;

    return ((__uint128_t)hi << 64) | lo;
}
`pmuludq` for 2 of the 4 partial products, and scalar `mul` for the other two. – Peter Cordes