1
votes

Hullo, i ve got some c procedure :

    inline float intersectRaySphere(float3* rayO, float3* rayV, float3* sO, float sR)
    {
    static float3 Q;

    Q = sub(sO,rayO);

    float cc = dot(&Q,&Q);
    float v = dot(&Q,rayV);
    float d = sR*sR - (cc - v*v);

    // If there was no intersection, return -1
    if (d < 0.0) return (-1.0f);
    // Return the distance to the [first] intersecting point
    return (v - sqrt(d));
    }

I was trying to rewrite it in x86 fpu asm and create such one

    _asm_intersectRaySphere:; Function begin
    push    ebp                                     ; 0000 _ 55
    mov     ebp, esp                                ; 0001 _ 89. E5
    add     esp, -20                                ; 0003 _ 83. C4, EC
    mov     eax, dword [ebp+8H]                     ; 0006 _ 8B. 45, 08
    mov     ecx, dword [ebp+0CH]                    ; 0009 _ 8B. 4D, 0C
    mov     edx, dword [ebp+10H]                    ; 000C _ 8B. 55, 10
    fld     dword [edx]                             ; 000F _ D9. 02
    fsub    dword [eax]                             ; 0011 _ D8. 20
    fld     dword [edx+4H]                          ; 0013 _ D9. 42, 04
    fsub    dword [eax+4H]                          ; 0016 _ D8. 60, 04
    fld     dword [edx+8H]                          ; 0019 _ D9. 42, 08
    fsub    dword [eax+8H]                          ; 001C _ D8. 60, 08
    fld     st2                                     ; 001F _ D9. C2
    fmul    st0, st(0)                              ; 0021 _ DC. C8
    fld     st2                                     ; 0023 _ D9. C2
    fmul    st0, st(0)                              ; 0025 _ DC. C8
    fld     st2                                     ; 0027 _ D9. C2
    fmul    st0, st(0)                              ; 0029 _ DC. C8
    faddp   st1, st(0)                              ; 002B _ DE. C1
    faddp   st1, st(0)                              ; 002D _ DE. C1
    fld     dword [ecx]                             ; 002F _ D9. 01
    fmul    st(0), st4                              ; 0031 _ D8. CC
    fld     dword [ecx+4H]                          ; 0033 _ D9. 41, 04
    fmul    st(0), st4                              ; 0036 _ D8. CC
    fld     dword [ecx+8H]                          ; 0038 _ D9. 41, 08
    fmul    st(0), st4                              ; 003B _ D8. CC
    faddp   st1, st(0)                              ; 003D _ DE. C1
    faddp   st1, st(0)                              ; 003F _ DE. C1
    fst     dword [ebp-4H]                          ; 0041 _ D9. 55, FC
    fmul    st0, st(0)                              ; 0044 _ DC. C8
    fld     dword [ebp+14H]                         ; 0046 _ D9. 45, 14
    fmul    st0, st(0)                              ; 0049 _ DC. C8
    faddp   st1, st(0)                              ; 004B _ DE. C1
    fsubrp  st1, st(0)                              ; 004D _ DE. E1
    fxch    st3                                     ; 004F _ D9. CB
    fstp    st0                                     ; 0051 _ DD. D8
    fstp    st0                                     ; 0053 _ DD. D8
    fstp    st0                                     ; 0055 _ DD. D8
    ftst                                            ; 0057 _ D9. E4
    fwait                                           ; 0059 _ 9B
    fnstsw  ax                                      ; 005A _ DF. E0
    fwait                                           ; 005C _ 9B
    sahf                                            ; 005D _ 9E
    jc      ?_001                                   ; 005E _ 72, 07
    fsqrt                                           ; 0060 _ D9. FA
    fsubr   dword [ebp-4H]                          ; 0062 _ D8. 6D, FC
    jmp     ?_002                                   ; 0065 _ EB, 06

     ?_001:  
    fstp    st0                                     ; 0067 _ DD. D8
    fld1                                            ; 0069 _ D9. E8
    fchs                                            ; 006B _ D9. E0
     ?_002:  
    mov     esp, ebp                                ; 006D _ 89. EC
    pop     ebp                                     ; 006F _ 5D
    ret                                             ; 0070 _ C3
    ; _asm_intersectRaySphere End of function

tested, and it is working ok, c routine takes about 150 cycles (on my 6 or 7 years old old pentium 4), my asm routine takes about 66 cycles (*) - so it is good improvement, but maybe it can be also yet improved a little ?

tnx

(*) i was testing with not much care, on random input data so possibly it was a 'non intersection' cause - with no sqrt involved

1

1 Answers

1
votes

I would replace this:

fstp st0
fstp st0
fstp st0
ftst
fwait
fnstsw  ax
fwait
sahf
jc ?__001

By this:

fcompp
fstp st0
fldz
fcomip st0, st1
ja ?__001

fnstsw isn't fast, and sahf isn't great either, especially not on P4's. If you can't use fcomi (ie if it has to work on P1 or PMMX), you can still skip the sahf by testing a bit in ax directly.