Computing Square Root of A Vector of Fixed-Point Numbers

August 9, 2011 Coded in ASM for the TI C64x

Occasionally, we have to do nonlinear computations on fixed-point DSPs. If they are required only once, say, during initialization, then floating-point emulation is a good option. However, when they happen in the main iteration and are performed on a vector of numbers, floating-point emulation will pose a serious burden on run time. 

If you need to compute square roots of a vector of Q.15 numbers on C64x or C64x+, the following hand tuned linear assembly code might help. (Q.15, as a storage type, is equivalent to 16-bit short, but arithmetically, it is a fractional number within [-1, 1), or the fractional point right after the MSB.)

To compute 

    y[i] = sqrt(x[i]),  for i = 0,1,...,N-1, and 0 ≤ x[i] < 1, 

in C, simply call

    DSP_vsqrt_q15(x, y, N);      

The runtime cost is 4 cycles per number (compiler option -O2) and the maximum error is 2-15

The code is based on 4th order minimum square error (MSE) polynomial fitting of the function f(x)=x^.5 for x in [.5,1). Input numbers are first scaled to [.5,1) for higher precision. It can be adapted to other nonlinear computations by using appropriate fitting polynomials as long as there is no overflow at each stage of polynomial evaluation. (By Horner's scheme, a 4th order polynomial is evaluated as (((C4*x+C3)*x+C2)*x+C1)*x+C0.)

* =========================================================================== *
*                                                                             *
*  Compute square root of a Q.15 vector by 4th order polynomial fitting       *
*      y[i] = sqrt(x[i]), 0 <= x[i] < 1;                                      *
*                                                                             * 
*  C prototype:                                                               *
*      void DSP_vsqrt_q15(short* x, short* y, int N);                         *
*                                                                             * 
*  Performance:                                                               *
*      O(4*N) or 4 cycles per number (software pipelining enabled by -O2)     * 
*                                                                             * 
*  Error:                                                                     *
*      (-2^-15, 2^-15)                                                        *
*                                                                             *
* =========================================================================== *  

; chebfun, min max error        
; 0xFFF1024A = Q31(-0.0005)      
; 0x0046E0AB = Q31(0.0022)       
; 0xFE764EE1 = Q31(-0.0120)      
; 0x1277E288 = Q31(0.1443)       
; 0x6ED9EBA1 = Q31(0.8660)       

; polyfit, min sqr error   
; 0xFFF1203D = Q31(-0.0005)
; 0x004581E7 = Q31(0.0021) 
; 0xFE7645AF = Q31(-0.0120)
; 0x1278CF97 = Q31(0.1443) 
; 0x6ED9E687 = Q31(0.8660) 

SQRT_C4 .set 0xFFF1203D  
SQRT_C3 .set 0x004581E7  
SQRT_C2 .set 0xFE7645AF  
SQRT_C1 .set 0x1278CF97  
SQRT_C0 .set 0x6ED9E687  
SQRT_S  .set 0x5A82799A ;  Q31(0.7071)

        .sect ".text: _DSP_vsqrt_q15"
        .global _DSP_vsqrt_q15

_DSP_vsqrt_q15: .cproc  A_X, B_Y, A_n

        .rega A_C4, A_C3, A_C2, A_C1, A_C0, A_xx0, A_rnd, A_y0
        .rega A_y0c4, A_y0c3, A_y0c2, A_y0c1, A_x0c3, A_x0c2, A_x0c1, A_x0c0
        .rega A_S, A_e0, A_x0x, A_xm, A_y0l, A_y0h, A_y0s
        .regb B_C4, B_C3, B_C2, B_C1, B_C0, B_xx1, B_rnd, B_y1, B_y10
        .regb B_y1c4, B_y1c3, B_y1c2, B_y1c1, B_x1c3, B_x1c2, B_x1c1, B_x1c0
        .regb B_S, B_e1, B_x1x, B_xm, B_y1l, B_y1h, B_y1s, B_X
        .reg  B_i, C0, C1
            ADD         2,              A_X,            B_X
            MVK         0x1,            A_rnd
            SHL         A_rnd,          15,             A_rnd
            MV          A_rnd,          B_rnd
            MVKL        SQRT_C4,        A_C4
            MVKH        SQRT_C4,        A_C4
            MVKL        SQRT_C3,        B_C3
            MVKH        SQRT_C3,        B_C3

            MV          A_C4,           B_C4
            MV          B_C3,           A_C3
            MVKL        SQRT_C2,        A_C2
            MVKH        SQRT_C2,        A_C2
            MVKL        SQRT_C1,        B_C1
            MVKH        SQRT_C1,        B_C1

            MV          A_C2,           B_C2
            MV          B_C1,           A_C1
            MVKL        SQRT_C0,        A_C0
            MVKH        SQRT_C0,        A_C0
            MVKL        SQRT_S,         B_S
            MVKH        SQRT_S,         B_S
            MV          B_S,            A_S
            MV          A_C0,           B_C0

            MVKL        0x6000,         A_xm
            SHL         A_xm,           16,             A_xm
            MV          A_xm,           B_xm

            SHR         A_n,            1,              B_i
            SUB         B_i,            2,              B_i
LOOP_vsqrt: .trip 8
            LDH.D1T1    *A_X++[2],      A_xx0
            LDH.D2T2    *B_X++[2],      B_xx1
            NORM.L1     A_xx0,          A_e0
            NORM.L2     B_xx1,          B_e1          
            SSHVL.M1    A_xx0,          A_e0,           A_x0x
            SSHVL.M2    B_xx1,          B_e1,           B_x1x

            SUB.D1      A_e0,           16,             A_e0
            SUB.D2      B_e1,           16,             B_e1
            AND.D1      0x1,            A_e0,           C0
            AND.D2      0x1,            B_e1,           C1
            SHR.S1      A_e0,           1,              A_e0
            SHR.S2      B_e1,           1,              B_e1

            SUB.D1      A_x0x,          A_xm,           A_x0x
            SUB.D2      B_x1x,          B_xm,           B_x1x
            SHL.S1      A_x0x,          2,              A_x0x
            SHL.S2      B_x1x,          2,              B_x1x

            MPYHIR.M1   A_x0x,          A_C4,           A_y0c4
            MPYHIR.M2   B_x1x,          B_C4,           B_y1c4
            SADD.L1     A_y0c4,         A_C3,           A_x0c3
            SADD.L2     B_y1c4,         B_C3,           B_x1c3
            MPYHIR.M1   A_x0x,          A_x0c3,         A_y0c3
            MPYHIR.M2   B_x1x,          B_x1c3,         B_y1c3
            SADD.L1     A_y0c3,         A_C2,           A_x0c2
            SADD.L2     B_y1c3,         B_C2,           B_x1c2

            MPYHIR.M1   A_x0x,          A_x0c2,         A_y0c2
            MPYHIR.M2   B_x1x,          B_x1c2,         B_y1c2
            SADD.L1     A_y0c2,         A_C1,           A_x0c1
            SADD.L2     B_y1c2,         B_C1,           B_x1c1

            MPYHIR.M1   A_x0x,          A_x0c1,         A_y0c1
            MPYHIR.M2   B_x1x,          B_x1c1,         B_y1c1
            SADD.L1     A_y0c1,         A_C0,           A_x0c0
            SADD.L2     B_y1c1,         B_C0,           B_x1c0

            ; A_S = B_S = 0x5A82799A ~= 0x5A820000 + 0x00008000
     [C0]   MPYHIR.M1   A_S,            A_x0c0,         A_y0h
     [C1]   MPYHIR.M2   B_S,            B_x1c0,         B_y1h
     [C0]   SHR         A_x0c0,         16,             A_y0l
     [C1]   SHR         B_x1c0,         16,             B_y1l 
     [C0]   SADD.L1     A_y0h,          A_y0l,          A_x0c0
     [C1]   SADD.L2     B_y1h,          B_y1l,          B_x1c0

            SHR.S1      A_x0c0,         A_e0,           A_y0s
            SHR.S2      B_x1c0,         B_e1,           B_y1s

            SADD.L1     A_y0s,          A_rnd,          A_y0
            SADD.L2     B_y1s,          B_rnd,          B_y1
            PACKH2.S2X  B_y1,           A_y0,           B_y10                       

            STW.D2T2    B_y10,          *B_Y++

            BDEC        LOOP_vsqrt,     B_i