Dear Assembly/C++ dev,
The question is: Does propagate the carry (or any flag) between two ASM block is realistic or totally insane, even if it works ?
A few years ago I developed an integer library for large arithmetic lower than 512 bits (at compile time). I did not use GMP at this time because for this scale, GMP become slower due to the memory allocation and the model choose for the binary representation bench.
I must confess I created my ASM ( the string block) using BOOST_PP, it is not very glorious (for curious have a look on it vli). The library was working well.
However I remark at this time it was impossible to propagate the carry flag of the state register between two ASM inline block. This is logical because for any mnemonic generated by the compiler between two blocks, the register is reset ( except for the mov instruction (from my assembly knowledge)).
Yesterday I get an idea to propagate the carry between two ASM block a bit tricky (using a recursive algo). It is working but I think I am lucky.
#include <iostream>
#include <array>
#include <cassert>
#include <algorithm>
//forward declaration
template<std::size_t NumBits>
struct integer;
//helper using object function, partial specialization is forbiden on functions
template <std::size_t NumBits, std::size_t W, bool K = W == integer<NumBits>::numwords>
struct helper {
static inline void add(integer<NumBits> &a, const integer<NumBits> &b){
helper<NumBits, integer<NumBits>::numwords>::add(a,b);
}
};
// first addition (call first)
template<std::size_t NumBits, std::size_t W>
struct helper<NumBits, W, 1> {
static inline void add(integer<NumBits> &a, const integer<NumBits> &b){
__asm__ (
"movq %1, %%rax \n"
"addq %%rax, %0 \n"
: "+m"(a[0]) // output
: "m" (b[0]) // input only
: "rax", "cc", "memory");
helper<NumBits,W-1>::add(a,b);
}
};
//second and more propagate the carry (call next)
template<std::size_t NumBits, std::size_t W>
struct helper<NumBits, W, 0> {
static inline void add(integer<NumBits> &a, const integer<NumBits> &b){
__asm__ (
"movq %1, %%rax \n"
"adcq %%rax, %0 \n"
: "+m"(a[integer<NumBits>::numwords-W])
: "m" (b[integer<NumBits>::numwords-W])
: "rax", "cc", "memory");
helper<NumBits,W-1>::add(a,b);
}
};
//nothing end reccursive process (call last)
template<std::size_t NumBits>
struct helper<NumBits, 0, 0> {
static inline void add(integer<NumBits> &a, const integer<NumBits> &b){};
};
// tiny integer class
template<std::size_t NumBits>
struct integer{
typedef uint64_t value_type;
static const std::size_t numbits = NumBits;
static const std::size_t numwords = (NumBits+std::numeric_limits<value_type>::digits-1)/std::numeric_limits<value_type>::digits;
using container = std::array<uint64_t, numwords>;
typedef typename container::iterator iterator;
iterator begin() { return data_.begin();}
iterator end() { return data_.end();}
explicit integer(value_type num = value_type()){
assert( -1l >> 1 == -1l );
std::fill(begin(),end(),value_type());
data_[0] = num;
}
inline value_type& operator[](std::size_t n){ return data_[n];}
inline const value_type& operator[](std::size_t n) const { return data_[n];}
integer& operator+=(const integer& a){
helper<numbits,numwords>::add(*this,a);
return *this;
}
integer& operator~(){
std::transform(begin(),end(),begin(),std::bit_not<value_type>());
return *this;
}
void print_raw(std::ostream& os) const{
os << "(" ;
for(std::size_t i = numwords-1; i > 0; --i)
os << data_[i]<<" ";
os << data_[0];
os << ")";
}
void print(std::ostream& os) const{
assert(false && " TO DO ! \n");
}
private:
container data_;
};
template <std::size_t NumBits>
std::ostream& operator<< (std::ostream& os, integer<NumBits> const& i){
if(os.flags() & std::ios_base::hex)
i.print_raw(os);
else
i.print(os);
return os;
}
int main(int argc, const char * argv[]) {
integer<256> a; // 0
integer<256> b(1);
~a; //all the 0 become 1
std::cout << " a: " << std::hex << a << std::endl;
std::cout << " ref: (ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff) " << std::endl;
a += b; // should propagate the carry
std::cout << " a+=b: " << a << std::endl;
std::cout << " ref: (0 0 0 0) " << std::endl; // it works but ...
return 0;
}
I get a correct result (it must be compiled in release -O2 or -O3!) and the ASM is right (on my Mac with clang++: Apple LLVM version 9.0.0 (clang-900.0.39.2))
movq -96(%rbp), %rax
addq %rax, -64(%rbp)
## InlineAsm End
## InlineAsm Start
movq -88(%rbp), %rax
adcq %rax, -56(%rbp)
## InlineAsm End
## InlineAsm Start
movq -80(%rbp), %rax
adcq %rax, -48(%rbp)
## InlineAsm End
## InlineAsm Start
movq -72(%rbp), %rax
adcq %rax, -40(%rbp)
I am conscient it is working because during the optimization the compiler remove all useless instruction between the ASM block (in debug mode it failed).
What do you think ? Definitively unsafe ? Does a compiler guy know how much it will be stable ?
To conclude: I am just doing that for fun :) Yes, GMP is the solution for large arithmetic !