2
votes

I'm working with the Algorand contract code which has a very limited scope of possible operations in their assembly code - e.g., it is not possible to control flow of the code. Basic 64 bit arithmetic operations are available.

What I need to do is to divide two 64-bit segments (as a whole 128-bit number) of a certain 128-bit integer by another 64-bit integer knowing that the result will fit within 64-bit integer. Is it possible without controlling flow of the code and only with 64-bit arithmetics?

1
Division is a fundamental arithmetic operation. What's wrong with / (opcode 0x0A)? (With a little stack juggling)Thomas Jager
It's 64-bit and I have 128-bit integer in two parts.Sebastian
@Sebastian Then you need to use textbook long division with 32 bit “digits.” It's possible under the constraints you mentioned.fuz
@Sebastian In your question, you said that you needed to divide the two 64-bit segments, not that you want to divide the whole 128-bit number.Thomas Jager
@Sebastian Does the Algorand programming language provide any constructs commonly used to avoid branches such as: (1) conditional moves (2) predicated execution (3) a select-type operation (akin to C's ternary operator, or a 2:1 multiplexer in hardware)?njuffa

1 Answers

3
votes

I am not familiar with the Algorand language. It is certainly possible to create branch-free code for dividing an 128-bit operand by a 64-bit operand using 64-bit operations throughout. The easiest way to do this is to use long-hand division and multiplication algorithms we all learned in elementary school, but using 232 as the base, not 10. Generally speaking, this requires shifts and logical operations in addition to simple arithmetic, and this is what I assume is available in ISO-C99 code below.

One can use the existing 64-bit division to generate a close estimate of the next quotient "digit" by dividing the leading two "digits" of the dividend by the leading "digit" of the divisor, where "digit" is to be understood base 232. In TAOCP Knuth shows that the error is positive and at most 2, provided the divisor is normalized, that is, its most significant bit is set. We compute the remainder via a back-multiply and subtraction and then can check whether it is negative. If so, we lower the (partial) quotient by one and recompute the remainder.

As branching is not allowed, a mechanism is needed for selecting between two possible conditionally selecting from two operands. This can be done by computing a predicate into an integer which takes the value 0 or 1, and passing that to a 2:1 multiplexer function that multiplies one source operand with the predicate, the other one with the complement of the predicate and adds the results. In the code below this function is called mux_64().

I have tested the division code in function my_div_128_64() by comparing it to the DIV instruction of my 64-bit Intel CPU, accessing that instruction via inline assembly. The code was built with the Intel compiler version 13. I have implemented two test modes, one based on purely random operands and the other one is based on simple patterns that are useful for hitting various boundary cases. No mismatches have been found so far. More elaborate tests are possible, but this should be sufficient for a decent "smoke" test.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define PATTERN_TEST  (1)
#define MSVC_STYLE    (1)
#define GCC_STYLE     (2)
#define ASM_STYLE     (GCC_STYLE)

// our own 128-bit unsigned integer type
typedef struct
{
    uint64_t x;
    uint64_t y;
} ulonglong2;

// (a < b) ? 1 : 0
uint64_t ltu_64 (uint64_t a, uint64_t b)
{
    return ((~a & b) + ((~a ^ b) >> 1)) >> 63;
}

// (a != b) ? 1 : 0
uint64_t neq_64 (uint64_t a, uint64_t b)
{
    return ((((a ^ b) | (1ULL << 63)) - 1ULL) | (a ^ b)) >> 63;
}

// (a >= b) ? 1 : 0
uint64_t geu_64 (uint64_t a, uint64_t b)
{
    return ((a | ~b) - ((a ^ ~b) >> 1)) >> 63;
}

//#define ADDcc(a,b,cy,t0,t1) (t0=(b), t1=(a), t0=t0+t1, cy=t0<t1, t0=t0)
#define ADDcc(a,b,cy,t0,t1) (t0=(b), t1=(a), t0=t0+t1, cy=ltu_64(t0,t1), t0=t0)
#define ADDC(a,b,cy,t0,t1) (t0=(b)+cy, t1=(a), t0+t1)

//#define SUBcc(a,b,cy,t0,t1) (t0=(b), t1=(a), cy=t1<t0, t1-t0)
#define SUBcc(a,b,cy,t0,t1) (t0=(b), t1=(a), cy=ltu_64(t1,t0), t1-t0)
#define SUBC(a,b,cy,t0,t1)  (t0=(b)+cy, t1=(a), t1-t0)

// 128-bit subtraction
ulonglong2 sub_128 (ulonglong2 a, ulonglong2 b)
{
    ulonglong2 res;
    uint64_t cy, t0, t1;
    res.x = SUBcc (a.x, b.x, cy, t0, t1);
    res.y = SUBC  (a.y, b.y, cy, t0, t1);
    return res;
}

// 128-bit addition
ulonglong2 add_128 (ulonglong2 a, ulonglong2 b)
{
    ulonglong2 res;
    uint64_t cy, t0, t1;
    res.x = ADDcc (a.x, b.x, cy, t0, t1);
    res.y = ADDC  (a.y, b.y, cy, t0, t1);
    return res;
}

// branch free implementation of ((cond) ? a : b). cond must be in {0,1}
uint64_t mux_64 (uint64_t cond, uint64_t a, int64_t b)
{
    return cond * a + (1 - cond) * b;
}

// count leading zeros
uint64_t clz_64 (uint64_t x)
{
    uint64_t n = 64;
    uint64_t y;

    y = x >> 32; n = mux_64 (neq_64 (y, 0), n - 32, n); x = mux_64 (neq_64 (y, 0), y, x);
    y = x >> 16; n = mux_64 (neq_64 (y, 0), n - 16, n); x = mux_64 (neq_64 (y, 0), y, x);
    y = x >>  8; n = mux_64 (neq_64 (y, 0), n -  8, n); x = mux_64 (neq_64 (y, 0), y, x);
    y = x >>  4; n = mux_64 (neq_64 (y, 0), n -  4, n); x = mux_64 (neq_64 (y, 0), y, x);
    y = x >>  2; n = mux_64 (neq_64 (y, 0), n -  2, n); x = mux_64 (neq_64 (y, 0), y, x);
    y = x >>  1; n = mux_64 (neq_64 (y, 0), n -  2, n - x);
    return n;
}

// 64x64->128 bit unsigned multiplication
ulonglong2 my_mul_64_128 (uint64_t a, uint64_t b)
{
    ulonglong2 res;
    const uint64_t mask_lo32 = 0xffffffffULL;

    uint64_t a_lo = a & mask_lo32;
    uint64_t a_hi = a >> 32;
    uint64_t b_lo = b & mask_lo32;
    uint64_t b_hi = b >> 32;
    uint64_t p0 = a_lo * b_lo;
    uint64_t p1 = a_lo * b_hi;
    uint64_t p2 = a_hi * b_lo;
    uint64_t p3 = a_hi * b_hi;
    uint64_t cy = ((p0 >> 32) + (p1 & mask_lo32) + (p2 & mask_lo32)) >> 32;
    res.x = p0 + (p1 << 32) + (p2 << 32);
    res.y = p3 + (p1 >> 32) + (p2 >> 32) + cy;
    return res;
}

// 128/64->64 bit unsigned division
uint64_t my_div_128_64 (ulonglong2 dvnd, uint64_t dvsr)
{
    ulonglong2 prod, rem;
    uint64_t lz, q, qh, ql, rem_msb, nrm_dvsr, cy, t0, t1;

    // normalize divisor; adjust dividend accordingly (initial partial remainder)
    lz = clz_64 (dvsr);
    nrm_dvsr = dvsr << lz;
    rem.y = (dvnd.y << lz) | mux_64 (neq_64 (lz, 0), dvnd.x >> (64 - lz), 0);
    rem.x = dvnd.x << lz;

    // compute most significant quotient "digit"; TAOCP: may be off by 0, +1, +2
    qh = mux_64 (geu_64 (rem.y >> 32, nrm_dvsr >> 32), 0xffffffff, rem.y / (nrm_dvsr >> 32));

    // compute remainder; correct quotient "digit" if remainder negative
    prod   = my_mul_64_128 (qh << 32, nrm_dvsr);
    rem    = sub_128 (rem, prod);
    rem_msb= rem.y >> 63;
    qh     = mux_64 (rem_msb, qh - 1, qh);
    rem.x  = mux_64 (rem_msb, ADDcc (rem.x, nrm_dvsr << 32, cy, t0, t1), rem.x);
    rem.y  = mux_64 (rem_msb, ADDC  (rem.y, nrm_dvsr >> 32, cy, t0, t1), rem.y);
    rem_msb= rem.y >> 63;
    qh     = mux_64 (rem_msb, qh - 1, qh);
    rem.x  = mux_64 (rem_msb, ADDcc (rem.x, nrm_dvsr << 32, cy, t0, t1), rem.x);
    rem.y  = mux_64 (rem_msb, ADDC  (rem.y, nrm_dvsr >> 32, cy, t0, t1), rem.y);

    // pull down next dividend "digit"
    rem.y = (rem.y << 32) | (rem.x >> 32);
    rem.x = rem.x << 32;

    // compute least significant quotient "digit"; TAOCP: may be off by 0, +1, +2
    ql = mux_64 (geu_64 (rem.y >> 32, nrm_dvsr >> 32), 0xffffffff, rem.y / (nrm_dvsr >> 32));

    // combine partial results into complete quotient
    q = (qh << 32) + ql;

    // compute remainder; correct quotient "digit" if remainder negative
    prod   = my_mul_64_128 (q, dvsr);
    rem    = sub_128 (dvnd, prod);
    rem_msb= rem.y >> 63;
    q      = mux_64 (rem_msb, q - 1, q);
    rem.x  = mux_64 (rem_msb, ADDcc (rem.x, dvsr, cy, t0, t1), rem.x);
    rem.y  = mux_64 (rem_msb, ADDC  (rem.y,    0, cy, t0, t1), rem.y);
    rem_msb= rem.y >> 63;
    q      = mux_64 (rem_msb, q - 1, q);
    
    return q;
}

uint64_t ref_div_128_64 (ulonglong2 dvnd, uint64_t dvsr)
{
    uint64_t quot;
#if ASM_STYLE == MSVC_STYLE
    __asm mov rax, qword ptr [dvnd.x];
    __asm mov rdx, qword ptr [dvnd.y];
    __asm div qword ptr [dvsr];
    __asm mov qword ptr [quot], rax;
#else // ASM_STYLE
    __asm__ (
        "movq %1, %%rax\n\t"
        "movq %2, %%rdx\n\t"
        "divq %3\n\t"
        "movq %%rax, %0\n\t"
        : "=r" (quot)
        : "r" (dvnd.x), "r" (dvnd.y), "r" (dvsr)
        : "rax", "rdx");
#endif // ASM_STYLE
   return quot;
}

ulonglong2 ref_mul_64_128 (uint64_t a, uint64_t b)
{
    ulonglong2 prod;
#if ASM_STYLE == MSVC_STYLE
    __asm mov rax, qword ptr [a];
    __asm mul qword ptr [b];
    __asm mov qword ptr [prod.x], rax;
    __asm mov qword ptr [prod.y], rdx;
#else // ASM_STYLE
    __asm__(
        "movq %2, %%rax\n\t"
        "mulq %3\n\t"
        "movq %%rax, %0\n\t"
        "movq %%rdx, %1\n\t"
        : "=r" (prod.x), "=r" (prod.y) 
        : "r" (a), "r" (b) 
        : "rax", "rdx");
#endif // ASM_STYLE
   return prod;
}

/*
  https://groups.google.com/forum/#!original/comp.lang.c/qFv18ql_WlU/IK8KGZZFJx4J
  From: geo <[email protected]>
  Newsgroups: sci.math,comp.lang.c,comp.lang.fortran
  Subject: 64-bit KISS RNGs
  Date: Sat, 28 Feb 2009 04:30:48 -0800 (PST)

  This 64-bit KISS RNG has three components, each nearly
  good enough to serve alone.    The components are:
  Multiply-With-Carry (MWC), period (2^121+2^63-1)
  Xorshift (XSH), period 2^64-1
  Congruential (CNG), period 2^64
*/
static uint64_t kiss64_x = 1234567890987654321ULL;
static uint64_t kiss64_c = 123456123456123456ULL;
static uint64_t kiss64_y = 362436362436362436ULL;
static uint64_t kiss64_z = 1066149217761810ULL;
static uint64_t kiss64_t;

#define MWC64  (kiss64_t = (kiss64_x << 58) + kiss64_c, \
                kiss64_c = (kiss64_x >> 6), kiss64_x += kiss64_t, \
                kiss64_c += (kiss64_x < kiss64_t), kiss64_x)
#define XSH64  (kiss64_y ^= (kiss64_y << 13), kiss64_y ^= (kiss64_y >> 17), \
                kiss64_y ^= (kiss64_y << 43))
#define CNG64  (kiss64_z = 6906969069ULL * kiss64_z + 1234567ULL)
#define KISS64 (MWC64 + XSH64 + CNG64)

#define PATTERN  (1)

int main (void) 
{
    ulonglong2 dvnd, prod, diff;
    uint64_t dvsr, ref, res, count = 0;

    do {
        count++;
        do {
#if PATTERN_TEST
            ulonglong2 temp1, temp2;
            uint64_t shift, shift1, shift2;
            shift = KISS64 & 127;
            temp1.y = (shift > 63) ? (1ULL << (shift - 64)) : 0;
            temp1.x = (shift > 63) ? 0ULL : (1ULL << shift);
            shift = KISS64 & 127;
            temp2.y = (shift > 63) ? (1ULL << (shift - 64)) : 0;
            temp2.x = (shift > 63) ? 0ULL : (1ULL << shift);
            dvnd = (KISS64 & 1) ? add_128 (temp1, temp2) : sub_128 (temp1, temp2);
            shift1 = KISS64 & 63;
            shift2 = KISS64 & 63;
            dvsr = (KISS64 & 1) ? ((1ULL << shift1) + (1ULL << shift2)) :
                                  ((1ULL << shift1) - (1ULL << shift2));
#else // PATTERN_TEST
            dvnd.y = KISS64;
            dvnd.x = KISS64;
            dvsr = KISS64;
#endif // PATTERN_TEST
        } while ((dvsr == 0ULL) || (dvnd.y >= dvsr)); // guard against overflow
        
        ref = ref_div_128_64 (dvnd, dvsr);
        res = my_div_128_64 (dvnd, dvsr);

        prod = ref_mul_64_128 (res, dvsr);
        diff = sub_128 (dvnd, prod);

        if ((diff.y != 0) || (diff.x >= dvsr)) {
            printf ("error @ chk1: dvnd= %016llx_%016llx  dvsr=%016llx  res=%016llx ref=%016llx diff=%016llx_%016llx\n", 
                    dvnd.y, dvnd.x, dvsr, res, ref, diff.y, diff.x);
            return EXIT_FAILURE;
        }
        if (res != ref) {
            printf ("error @ chk2: dvnd= %016llx_%016llx  dvsr=%016llx  res=%016llx ref=%016llx diff=%016llx_%016llx\n", 
                    dvnd.y, dvnd.x, dvsr, res, ref, diff.y, diff.x);
            return EXIT_FAILURE;
        }           
        if ((count & 0xffffff) == 0) printf ("\r%llu", count);
    } while (dvsr);
    return EXIT_SUCCESS;
}

If the programming language is more severely restricted, all of shifts and all logical operations, except those used inside the predicate functions neq_64(), ltu_64(), and geu_64(), can be replaced with arithmetic. Examplary ISO-C99 code is shown below. There may be programming language specific ways to implement these predicates without the use of logical operations.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define PATTERN_TEST  (0)

#define MSVC_STYLE    (1)
#define GCC_STYLE     (2)
#define ASM_STYLE     (GCC_STYLE)

#define POW2_63       (0x8000000000000000ULL)
#define POW2_32       (0x0000000100000000ULL)
#define POW2_16       (0x0000000000010000ULL)
#define POW2_8        (0x0000000000000100ULL)
#define POW2_5        (0x0000000000000020ULL)
#define POW2_4        (0x0000000000000010ULL)
#define POW2_3        (0x0000000000000008ULL)
#define POW2_2        (0x0000000000000004ULL)
#define POW2_1        (0x0000000000000002ULL)
#define POW2_0        (0x0000000000000001ULL)

// our own 128-bit unsigned integer type
typedef struct
{
    uint64_t x;
    uint64_t y;
} ulonglong2;

// (a < b) ? 1 : 0
uint64_t ltu_64 (uint64_t a, uint64_t b)
{
    return ((~a & b) + ((~a ^ b) / POW2_1)) / POW2_63;
}

// (a >= b) ? 1 : 0
uint64_t geu_64 (uint64_t a, uint64_t b)
{
    return ((a | ~b) - ((a ^ ~b) / POW2_1)) / POW2_63;
}

// (a != b) ? 1 : 0
uint64_t neq_64 (uint64_t a, uint64_t b)
{
    return ((a - b) | (b - a)) / POW2_63;
}

//#define ADDcc(a,b,cy,t0,t1) (t0=(b), t1=(a), t0=t0+t1, cy=t0<t1, t0=t0)
#define ADDcc(a,b,cy,t0,t1) (t0=(b), t1=(a), t0=t0+t1, cy=ltu_64(t0,t1), t0=t0)
#define ADDC(a,b,cy,t0,t1) (t0=(b)+cy, t1=(a), t0+t1)

//#define SUBcc(a,b,cy,t0,t1) (t0=(b), t1=(a), cy=t1<t0, t1-t0)
#define SUBcc(a,b,cy,t0,t1) (t0=(b), t1=(a), cy=ltu_64(t1,t0), t1-t0)
#define SUBC(a,b,cy,t0,t1)  (t0=(b)+cy, t1=(a), t1-t0)

// 128-bit subtraction
ulonglong2 sub_128 (ulonglong2 a, ulonglong2 b)
{
    ulonglong2 res;
    uint64_t cy, t0, t1;
    res.x = SUBcc (a.x, b.x, cy, t0, t1);
    res.y = SUBC  (a.y, b.y, cy, t0, t1);
    return res;
}

// 128-bit addition
ulonglong2 add_128 (ulonglong2 a, ulonglong2 b)
{
    ulonglong2 res;
    uint64_t cy, t0, t1;
    res.x = ADDcc (a.x, b.x, cy, t0, t1);
    res.y = ADDC  (a.y, b.y, cy, t0, t1);
    return res;
}

// branch free implementation of ((cond) ? a : b). cond must be in {0,1}
uint64_t mux_64 (uint64_t cond, uint64_t a, int64_t b)
{
    return cond * a + (1 - cond) * b;
}

// count leading zeros
uint64_t clz_64 (uint64_t x)
{
    uint64_t n = 64;
    uint64_t c, y;

    y = x / POW2_32; c = neq_64 (y, 0); n = mux_64 (c, n - 32, n); x = mux_64 (c, y, x);
    y = x / POW2_16; c = neq_64 (y, 0); n = mux_64 (c, n - 16, n); x = mux_64 (c, y, x);
    y = x / POW2_8;  c = neq_64 (y, 0); n = mux_64 (c, n -  8, n); x = mux_64 (c, y, x);
    y = x / POW2_4;  c = neq_64 (y, 0); n = mux_64 (c, n -  4, n); x = mux_64 (c, y, x);
    y = x / POW2_2;  c = neq_64 (y, 0); n = mux_64 (c, n -  2, n); x = mux_64 (c, y, x);
    y = x / POW2_1;  c = neq_64 (y, 0); n = mux_64 (c, n -  2, n - x);
    return n;
}

// compute 2**i, 0 <= i <=63 
uint64_t pow2i (uint64_t i)
{
    uint64_t r = 1ULL;

    r = mux_64 ((i / POW2_5) % 2, r * POW2_32, r);
    r = mux_64 ((i / POW2_4) % 2, r * POW2_16, r);
    r = mux_64 ((i / POW2_3) % 2, r * POW2_8,  r);
    r = mux_64 ((i / POW2_2) % 2, r * POW2_4,  r);
    r = mux_64 ((i / POW2_1) % 2, r * POW2_2,  r);
    r = mux_64 ((i / POW2_0) % 2, r * POW2_1,  r);
    return r;
}

// 64x64->128 bit unsigned multiplication
ulonglong2 my_mul_64_128 (uint64_t a, uint64_t b)
{
    ulonglong2 res;

    uint64_t a_hi = a / POW2_32;
    uint64_t a_lo = a % POW2_32; 
    uint64_t b_hi = b / POW2_32;
    uint64_t b_lo = b % POW2_32;
    uint64_t p0 = a_lo * b_lo;
    uint64_t p1 = a_lo * b_hi;
    uint64_t p2 = a_hi * b_lo;
    uint64_t p3 = a_hi * b_hi;
    uint64_t cy = ((p0 / POW2_32) + (p1 % POW2_32) + (p2 % POW2_32)) / POW2_32;
    res.x = p0 + (p1 * POW2_32) + (p2 * POW2_32);
    res.y = p3 + (p1 / POW2_32) + (p2 / POW2_32) + cy;
    return res;
}

// 128/64->64 bit unsigned division
uint64_t my_div_128_64 (ulonglong2 dvnd, uint64_t dvsr)
{
    ulonglong2 prod, rem;
    uint64_t lz, lz_nz, scal, shft, q, qh, ql, rem_msb, nrm_dvsr, nrm_dvsr_hi, cy, t0, t1;

    // normalize divisor; adjust dividend accordingly (initial partial remainder)
    lz = clz_64 (dvsr);
    lz_nz = neq_64 (lz, 0);
    scal = pow2i (lz);
    shft = mux_64 (lz_nz, 64 - lz, 0);
    nrm_dvsr = dvsr * scal;
    rem.y = dvnd.y * scal + mux_64 (lz_nz, dvnd.x / pow2i (shft), 0);
    rem.x = dvnd.x * scal;
    nrm_dvsr_hi = nrm_dvsr / POW2_32;

    // compute most significant quotient "digit"; TAOCP: may be off by 0, +1, +2
    qh = mux_64 (geu_64 (rem.y / POW2_32, nrm_dvsr_hi), 0xffffffff, rem.y / (nrm_dvsr_hi));

    // compute remainder; correct quotient "digit" if remainder negative
    prod   = my_mul_64_128 (qh * POW2_32, nrm_dvsr);
    rem    = sub_128 (rem, prod);
    rem_msb= rem.y / POW2_63;
    qh     = mux_64 (rem_msb, qh - 1, qh);
    rem.x  = mux_64 (rem_msb, ADDcc (rem.x, nrm_dvsr * POW2_32, cy, t0, t1), rem.x);
    rem.y  = mux_64 (rem_msb, ADDC  (rem.y, nrm_dvsr / POW2_32, cy, t0, t1), rem.y);
    rem_msb= rem.y / POW2_63;
    qh     = mux_64 (rem_msb, qh - 1, qh);
    rem.x  = mux_64 (rem_msb, ADDcc (rem.x, nrm_dvsr * POW2_32, cy, t0, t1), rem.x);
    rem.y  = mux_64 (rem_msb, ADDC  (rem.y, nrm_dvsr / POW2_32, cy, t0, t1), rem.y);

    // pull down next dividend "digit"
    rem.y = rem.y * POW2_32 + rem.x / POW2_32;
    rem.x = rem.x * POW2_32;

    // compute least significant quotient "digit"; TAOCP: may be off by 0, +1, +2
    ql = mux_64 (geu_64 (rem.y / POW2_32, nrm_dvsr_hi), 0xffffffff, rem.y / (nrm_dvsr_hi));

    // combine partial results into complete quotient
    q = qh * POW2_32 + ql;

    // compute remainder; correct quotient "digit" if remainder negative
    prod   = my_mul_64_128 (q, dvsr);
    rem    = sub_128 (dvnd, prod);
    rem_msb= rem.y / POW2_63;
    q      = mux_64 (rem_msb, q - 1, q);
    rem.x  = mux_64 (rem_msb, ADDcc (rem.x, dvsr, cy, t0, t1), rem.x);
    rem.y  = mux_64 (rem_msb, ADDC  (rem.y,    0, cy, t0, t1), rem.y);
    rem_msb= rem.y / POW2_63;
    q      = mux_64 (rem_msb, q - 1, q);
    
    return q;
}

uint64_t ref_div_128_64 (ulonglong2 dvnd, uint64_t dvsr)
{
    uint64_t quot;
#if ASM_STYLE == MSVC_STYLE
    __asm mov rax, qword ptr [dvnd.x];
    __asm mov rdx, qword ptr [dvnd.y];
    __asm div qword ptr [dvsr];
    __asm mov qword ptr [quot], rax;
#else // ASM_STYLE
    __asm__ (
        "movq %1, %%rax\n\t"
        "movq %2, %%rdx\n\t"
        "divq %3\n\t"
        "movq %%rax, %0\n\t"
        : "=r" (quot)
        : "r" (dvnd.x), "r" (dvnd.y), "r" (dvsr)
        : "rax", "rdx");
#endif // ASM_STYLE
   return quot;
}

ulonglong2 ref_mul_64_128 (uint64_t a, uint64_t b)
{
    ulonglong2 prod;
#if ASM_STYLE == MSVC_STYLE
    __asm mov rax, qword ptr [a];
    __asm mul qword ptr [b];
    __asm mov qword ptr [prod.x], rax;
    __asm mov qword ptr [prod.y], rdx;
#else // ASM_STYLE
    __asm__(
        "movq %2, %%rax\n\t"
        "mulq %3\n\t"
        "movq %%rax, %0\n\t"
        "movq %%rdx, %1\n\t"
        : "=r" (prod.x), "=r" (prod.y) 
        : "r" (a), "r" (b) 
        : "rax", "rdx");
#endif // ASM_STYLE
   return prod;
}

/*
  https://groups.google.com/forum/#!original/comp.lang.c/qFv18ql_WlU/IK8KGZZFJx4J
  From: geo <[email protected]>
  Newsgroups: sci.math,comp.lang.c,comp.lang.fortran
  Subject: 64-bit KISS RNGs
  Date: Sat, 28 Feb 2009 04:30:48 -0800 (PST)

  This 64-bit KISS RNG has three components, each nearly
  good enough to serve alone.    The components are:
  Multiply-With-Carry (MWC), period (2^121+2^63-1)
  Xorshift (XSH), period 2^64-1
  Congruential (CNG), period 2^64
*/
static uint64_t kiss64_x = 1234567890987654321ULL;
static uint64_t kiss64_c = 123456123456123456ULL;
static uint64_t kiss64_y = 362436362436362436ULL;
static uint64_t kiss64_z = 1066149217761810ULL;
static uint64_t kiss64_t;

#define MWC64  (kiss64_t = (kiss64_x << 58) + kiss64_c, \
                kiss64_c = (kiss64_x >> 6), kiss64_x += kiss64_t, \
                kiss64_c += (kiss64_x < kiss64_t), kiss64_x)
#define XSH64  (kiss64_y ^= (kiss64_y << 13), kiss64_y ^= (kiss64_y >> 17), \
                kiss64_y ^= (kiss64_y << 43))
#define CNG64  (kiss64_z = 6906969069ULL * kiss64_z + 1234567ULL)
#define KISS64 (MWC64 + XSH64 + CNG64)

#define PATTERN  (1)

int main (void) 
{
    ulonglong2 dvnd, prod, diff;
    uint64_t dvsr, ref, res, count = 0;

    do {
        count++;
        do {
#if PATTERN_TEST
            ulonglong2 temp1, temp2;
            uint64_t shift, shift1, shift2;
            shift = KISS64 & 127;
            temp1.y = (shift > 63) ? (1ULL << (shift - 64)) : 0;
            temp1.x = (shift > 63) ? 0ULL : (1ULL << shift);
            shift = KISS64 & 127;
            temp2.y = (shift > 63) ? (1ULL << (shift - 64)) : 0;
            temp2.x = (shift > 63) ? 0ULL : (1ULL << shift);
            dvnd = (KISS64 & 1) ? add_128 (temp1, temp2) : sub_128 (temp1, temp2);
            shift1 = KISS64 & 63;
            shift2 = KISS64 & 63;
            dvsr = (KISS64 & 1) ? ((1ULL << shift1) + (1ULL << shift2)) :
                                  ((1ULL << shift1) - (1ULL << shift2));
#else // PATTERN_TEST
            dvnd.y = KISS64;
            dvnd.x = KISS64;
            dvsr = KISS64;
#endif // PATTERN_TEST
        } while ((dvsr == 0ULL) || (dvnd.y >= dvsr)); // guard against overflow
        
        ref = ref_div_128_64 (dvnd, dvsr);
        res = my_div_128_64 (dvnd, dvsr);

        prod = ref_mul_64_128 (res, dvsr);
        diff = sub_128 (dvnd, prod);

        if ((diff.y != 0) || (diff.x >= dvsr)) {
            printf ("error @ chk1: dvnd= %016llx_%016llx  dvsr=%016llx  res=%016llx ref=%016llx diff=%016llx_%016llx\n", 
                    dvnd.y, dvnd.x, dvsr, res, ref, diff.y, diff.x);
            return EXIT_FAILURE;
        }
        if (res != ref) {
            printf ("error @ chk2: dvnd= %016llx_%016llx  dvsr=%016llx  res=%016llx ref=%016llx diff=%016llx_%016llx\n", 
                    dvnd.y, dvnd.x, dvsr, res, ref, diff.y, diff.x);
            return EXIT_FAILURE;
        }           
        if ((count & 0xffffff) == 0) printf ("\r%llu", count);
    } while (dvsr);
    return EXIT_SUCCESS;
}