DSPRelated.com
Code

Computing Reciprocals of Fixed-Point Numbers by the Newton-Raphson Method — Division by Multiplication

August 11, 20115 comments Coded in ASM for the TI C64x
* =========================================================================== *
*                                                                             *
*  Compute reciprocals of a Q.15 vector by Newton-Raphson method              *
*      y[i] = ym[i] * 2^ye[i] = 1 / x[i], -1 <= x[i] < 1 and x[i] != 0        *
*      where ym is the mantissa vector and ye is the exponent vector          *
*                                                                             *
*  C prototype:                                                               *
*       void DSP_vrecip16n(short* x,    // Input, Q.15 vector                 *
*                          short* ym,   // Output, Q.15 vector                *
*                          short* ye,   // Output, int16 vector               *
*                          int    N);   // Input, vector length               *
*                                                                             *
*  Restriction:                                                               *
*       (N % 4) == 0 and N >= 24                                              *
*                                                                             *
*  Performance:                                                               *
*       55+2.5*N (software pipelining enabled by -O2, CCS5.1)                 *
*                                                                             *
*  Relative error:                                                            *
*       (-2^-16, 2^-16)                                                       *
*                                                                             *
*  Algorithm:                                                                 *
*       Input: V, abs(x[i]) normalized to [.5, 1)                             *
*       Initialization: U0 = (V-.75)^2 + 1.4256-V, ~1/(2*V), 5.9233 bits      *
*       Iteration 1:    U1 = U0*(1-V*U0),          ~1/(4*V), 11.481 bits      *
*       Iteration 2:    U2 = 8*U1*(.5-V*U1),       ~1/(2*V), 22.585 bits      *
*       Output in the format of Fl16 = Q.15(mant)|Int16(expt)                 *
*                                                                             *
* =========================================================================== *
    
        .sect ".text: _DSP_vrecip16n"
        .global _DSP_vrecip16n

_DSP_vrecip16n: .cproc  A_X, B_YM, A_YE, B_n   
    
        .no_mdep
        .rega   A_x0, A_x1, A_rr, A_nx0, A_nx1, A_nx10, A_ny10, A_v10, A_vc10  
        .rega   A_vs1:A_vs0, A_vs10, A_u10, A_vu0, A_vu1, A_vu10, A_u0, A_u1  
        .rega   A_y0, A_y1, A_y32:A_y10, A_vp10, A_x32:A_x10 
        .rega   A_ss, A_cc, A_mm, A_ww, A_w, A_rnd
        .regb   B_x2, B_x3, B_rr, B_nx2, B_nx3, B_nx32, B_y32, B_v32, B_vc32 
        .regb   B_vs3:B_vs2, B_vs32, B_u32, B_vu2, B_vu3, B_vu32, B_u2, B_u3    
        .regb   B_y2, B_y3, B_ny32:B_ny10, B_vp32 
        .regb   B_ss, B_cc, B_mm, B_ww, B_w, B_rnd, B_nn, B_X 
        .reg    B_i, C10, C32, Cl10, Cl32

            ADD         4,              A_X,            B_X
            
            MVKL        0xB67BB67B,     B_mm                    ; Q15(1.4256)                  
            MVKH        0xB67BB67B,     B_mm                    ; Q15(1.4256)
            MV          B_mm,           A_mm
            MVKL        0x60006000,     A_cc
            MVKH        0x60006000,     A_cc
            MV          A_cc,           B_cc       
            MVKL        0xFFF1FFF1,     B_rr
            MVKH        0xFFF1FFF1,     B_rr
            MV          B_rr,           A_rr
            MVKL        0x80008000,     A_ww
            MVKH        0x80008000,     A_ww
            MV          A_ww,           B_ww
            SHL         A_ww,           15,             A_w     ; 0x40000000  
            SHL         B_ww,           15,             B_w     ; 0x40000000           
            ROTL        A_w,            17,             A_rnd   ; 0x00008000
            ROTL        B_w,            17,             B_rnd   ; 0x00008000
            MVK         15,             A_ss
            MVK         15,             B_ss

            SHR         B_n,            2,              B_i
            SUB         B_i,            2,              B_i
    
LOOP_vrecip: .trip 16 
            LDH         *A_X++,         A_x0
            LDH         *A_X++[3],      A_x1  
            LDH         *B_X++,         B_x2
            LDH         *B_X++[3],      B_x3
 
            NORM        A_x0,           A_nx0
            NORM        A_x1,           A_nx1
            NORM        B_x2,           B_nx2
            NORM        B_x3,           B_nx3
            PACK2       A_nx1,          A_nx0,          A_nx10
            PACK2       B_nx3,          B_nx2,          B_nx32
            ADD2        A_rr,           A_nx10,         A_ny10
            ADD2        B_rr,           B_nx32,         B_ny32

            SSHVL       A_x0,           A_nx0,          A_x0
            SSHVL       A_x1,           A_nx1,          A_x1
            SSHVL       B_x2,           B_nx2,          B_x2
            SSHVL       B_x3,           B_nx3,          B_x3
            PACKH2      A_x1,           A_x0,           A_v10
            PACKH2      B_x3,           B_x2,           B_v32
            
            ; Initial value, U0=(V-.75)^2 + 1.4256-V, ~1/(2*V), 5.9233 bits
            ABS2        A_v10,          A_vp10
            ABS2        B_v32,          B_vp32
            SUB2        A_vp10,         A_cc,           A_vc10
            SUB2        B_vp32,         B_cc,           B_vc32
            SUB2        A_mm,           A_vp10,         A_u10
            SUB2        B_mm,           B_vp32,         B_u32
            SMPY2       A_vc10,         A_vc10,         A_vs1:A_vs0
            SMPY2       B_vc32,         B_vc32,         B_vs3:B_vs2
            PACKH2      A_vs1,          A_vs0,          A_vs10
            PACKH2      B_vs3,          B_vs2,          B_vs32
            ADD2        A_vs10,         A_u10,          A_u10
            ADD2        B_vs32,         B_u32,          B_u32
            SHR2        A_v10,          A_ss,           C10
            SHR2        B_v32,          B_ss,           C32
            XOR         C10,            A_u10,          A_u10
            XOR         C32,            B_u32,          B_u32
                       
            ; Iteration 1, U1=U0*(1-V*U0), ~1/(4*V), 11.481 bits
            SMPY2       A_v10,          A_u10,          A_vu1:A_vu0
            SMPY2       B_v32,          B_u32,          B_vu3:B_vu2
            PACKH2      A_vu1,          A_vu0,          A_vu10
            PACKH2      B_vu3,          B_vu2,          B_vu32
            SUB2        A_ww,           A_vu10,         A_vu10
            SUB2        B_ww,           B_vu32,         B_vu32
            SMPY2       A_u10,          A_vu10,         A_u1:A_u0
            SMPY2       B_u32,          B_vu32,         B_u3:B_u2
            PACKH2      A_u1,           A_u0,           A_u10
            PACKH2      B_u3,           B_u2,           B_u32

            ; Iteration 2, U2=8*U1*(1/2-V*U1), ~1/(2*V), 22.585 bits
            SMPY2       A_v10,          A_u10,          A_vu1:A_vu0
            SMPY2       B_v32,          B_u32,          B_vu3:B_vu2
            SUB         A_w,            A_vu0,          A_vu0
            SUB         A_w,            A_vu1,          A_vu1
            SUB         B_w,            B_vu2,          B_vu2
            SUB         B_w,            B_vu3,          B_vu3
            MPYHIR      A_u0,           A_vu0,          A_u0
            MPYHIR      A_u1,           A_vu1,          A_u1
            MPYHIR      B_u2,           B_vu2,          B_u2
            MPYHIR      B_u3,           B_vu3,          B_u3
            SSHL        A_u0,           3,              A_u0
            SSHL        A_u1,           3,              A_u1
            SSHL        B_u2,           3,              B_u2
            SSHL        B_u3,           3,              B_u3          

            ; Fl16 = Q15(mant)|int16(expt)
            SADD        A_u0,           A_rnd,          A_y0
            SADD        A_u1,           A_rnd,          A_y1
            SADD        B_u2,           B_rnd,          B_y2
            SADD        B_u3,           B_rnd,          B_y3
            PACKH2      A_y1,           A_y0,           A_y10
            PACKH2      B_y3,           B_y2,           B_y32
  
            MV          B_y32,          A_y32
            MV          A_ny10,         B_ny10

            STDW        A_y32:A_y10,    *B_YM++
            STDW        B_ny32:B_ny10,  *A_YE++

            BDEC        LOOP_vrecip,    B_i
            
            .endproc

Computing Square Root of A Vector of Fixed-Point Numbers

August 9, 2011 Coded in ASM for the TI C64x
* =========================================================================== *
*                                                                             *
*  Compute square root of a Q.15 vector by 4th order polynomial fitting       *
*      y[i] = sqrt(x[i]), 0 <= x[i] < 1;                                      *
*                                                                             * 
*  C prototype:                                                               *
*      void DSP_vsqrt_q15(short* x, short* y, int N);                         *
*                                                                             * 
*  Performance:                                                               *
*      O(4*N) or 4 cycles per number (software pipelining enabled by -O2)     * 
*                                                                             * 
*  Error:                                                                     *
*      (-2^-15, 2^-15)                                                        *
*                                                                             *
* =========================================================================== *  

; chebfun, min max error        
; 0xFFF1024A = Q31(-0.0005)      
; 0x0046E0AB = Q31(0.0022)       
; 0xFE764EE1 = Q31(-0.0120)      
; 0x1277E288 = Q31(0.1443)       
; 0x6ED9EBA1 = Q31(0.8660)       

; polyfit, min sqr error   
; 0xFFF1203D = Q31(-0.0005)
; 0x004581E7 = Q31(0.0021) 
; 0xFE7645AF = Q31(-0.0120)
; 0x1278CF97 = Q31(0.1443) 
; 0x6ED9E687 = Q31(0.8660) 

SQRT_C4 .set 0xFFF1203D  
SQRT_C3 .set 0x004581E7  
SQRT_C2 .set 0xFE7645AF  
SQRT_C1 .set 0x1278CF97  
SQRT_C0 .set 0x6ED9E687  
SQRT_S  .set 0x5A82799A ;  Q31(0.7071)

        .sect ".text: _DSP_vsqrt_q15"
        .global _DSP_vsqrt_q15

_DSP_vsqrt_q15: .cproc  A_X, B_Y, A_n

        .no_mdep
        .rega A_C4, A_C3, A_C2, A_C1, A_C0, A_xx0, A_rnd, A_y0
        .rega A_y0c4, A_y0c3, A_y0c2, A_y0c1, A_x0c3, A_x0c2, A_x0c1, A_x0c0
        .rega A_S, A_e0, A_x0x, A_xm, A_y0l, A_y0h, A_y0s
        .regb B_C4, B_C3, B_C2, B_C1, B_C0, B_xx1, B_rnd, B_y1, B_y10
        .regb B_y1c4, B_y1c3, B_y1c2, B_y1c1, B_x1c3, B_x1c2, B_x1c1, B_x1c0
        .regb B_S, B_e1, B_x1x, B_xm, B_y1l, B_y1h, B_y1s, B_X
        .reg  B_i, C0, C1
        
            ADD         2,              A_X,            B_X
            MVK         0x1,            A_rnd
            SHL         A_rnd,          15,             A_rnd
            MV          A_rnd,          B_rnd
            
            MVKL        SQRT_C4,        A_C4
            MVKH        SQRT_C4,        A_C4
            MVKL        SQRT_C3,        B_C3
            MVKH        SQRT_C3,        B_C3

            MV          A_C4,           B_C4
            MV          B_C3,           A_C3
            MVKL        SQRT_C2,        A_C2
            MVKH        SQRT_C2,        A_C2
            MVKL        SQRT_C1,        B_C1
            MVKH        SQRT_C1,        B_C1

            MV          A_C2,           B_C2
            MV          B_C1,           A_C1
            MVKL        SQRT_C0,        A_C0
            MVKH        SQRT_C0,        A_C0
            MVKL        SQRT_S,         B_S
            MVKH        SQRT_S,         B_S
            
            MV          B_S,            A_S
            MV          A_C0,           B_C0

            MVKL        0x6000,         A_xm
            SHL         A_xm,           16,             A_xm
            MV          A_xm,           B_xm

            SHR         A_n,            1,              B_i
            SUB         B_i,            2,              B_i
                  
LOOP_vsqrt: .trip 8
            LDH.D1T1    *A_X++[2],      A_xx0
            LDH.D2T2    *B_X++[2],      B_xx1
            
            NORM.L1     A_xx0,          A_e0
            NORM.L2     B_xx1,          B_e1          
            SSHVL.M1    A_xx0,          A_e0,           A_x0x
            SSHVL.M2    B_xx1,          B_e1,           B_x1x

            SUB.D1      A_e0,           16,             A_e0
            SUB.D2      B_e1,           16,             B_e1
            AND.D1      0x1,            A_e0,           C0
            AND.D2      0x1,            B_e1,           C1
            SHR.S1      A_e0,           1,              A_e0
            SHR.S2      B_e1,           1,              B_e1

            SUB.D1      A_x0x,          A_xm,           A_x0x
            SUB.D2      B_x1x,          B_xm,           B_x1x
            SHL.S1      A_x0x,          2,              A_x0x
            SHL.S2      B_x1x,          2,              B_x1x

            MPYHIR.M1   A_x0x,          A_C4,           A_y0c4
            MPYHIR.M2   B_x1x,          B_C4,           B_y1c4
            SADD.L1     A_y0c4,         A_C3,           A_x0c3
            SADD.L2     B_y1c4,         B_C3,           B_x1c3
    
            MPYHIR.M1   A_x0x,          A_x0c3,         A_y0c3
            MPYHIR.M2   B_x1x,          B_x1c3,         B_y1c3
            SADD.L1     A_y0c3,         A_C2,           A_x0c2
            SADD.L2     B_y1c3,         B_C2,           B_x1c2

            MPYHIR.M1   A_x0x,          A_x0c2,         A_y0c2
            MPYHIR.M2   B_x1x,          B_x1c2,         B_y1c2
            SADD.L1     A_y0c2,         A_C1,           A_x0c1
            SADD.L2     B_y1c2,         B_C1,           B_x1c1

            MPYHIR.M1   A_x0x,          A_x0c1,         A_y0c1
            MPYHIR.M2   B_x1x,          B_x1c1,         B_y1c1
            SADD.L1     A_y0c1,         A_C0,           A_x0c0
            SADD.L2     B_y1c1,         B_C0,           B_x1c0

            ; A_S = B_S = 0x5A82799A ~= 0x5A820000 + 0x00008000
     [C0]   MPYHIR.M1   A_S,            A_x0c0,         A_y0h
     [C1]   MPYHIR.M2   B_S,            B_x1c0,         B_y1h
     [C0]   SHR         A_x0c0,         16,             A_y0l
     [C1]   SHR         B_x1c0,         16,             B_y1l 
     [C0]   SADD.L1     A_y0h,          A_y0l,          A_x0c0
     [C1]   SADD.L2     B_y1h,          B_y1l,          B_x1c0

            SHR.S1      A_x0c0,         A_e0,           A_y0s
            SHR.S2      B_x1c0,         B_e1,           B_y1s

            SADD.L1     A_y0s,          A_rnd,          A_y0
            SADD.L2     B_y1s,          B_rnd,          B_y1
            PACKH2.S2X  B_y1,           A_y0,           B_y10                       

            STW.D2T2    B_y10,          *B_Y++

            BDEC        LOOP_vsqrt,     B_i
            
            .endproc

Fast MDCT/IMDCT Based on Forward FFT

August 6, 2011 Coded in C
/******** begin of mdct.h ******** */
#ifndef __MDCT_H
#define __MDCT_H

#include <fftw3.h>

#ifdef __cplusplus
extern "C" {
#endif

#ifdef SINGLE_PRECISION
typedef float         FLOAT;
typedef fftwf_complex FFTW_COMPLEX;
typedef fftwf_plan    FFTW_PLAN;
#else // DOUBLE_PRECISION
typedef double        FLOAT;
typedef fftw_complex  FFTW_COMPLEX;
typedef fftw_plan     FFTW_PLAN;
#endif  // SINGLE_PRECISION

typedef struct {
    int           M;            // MDCT spectrum size (number of bins)
    FLOAT*        twiddle;      // twiddle factor
    FFTW_COMPLEX* fft_in;       // fft workspace, input
    FFTW_COMPLEX* fft_out;      // fft workspace, output
    FFTW_PLAN     fft_plan;     // fft configuration
} mdct_plan; 

mdct_plan* mdct_init(int M);    // MDCT spectrum size (number of bins)

void mdct_free(mdct_plan* m_plan);

void mdct(FLOAT* mdct_line, FLOAT* time_signal, mdct_plan* m_plan);

void imdct(FLOAT* time_signal, FLOAT* mdct_line, mdct_plan* m_plan);

#ifdef __cplusplus
}
#endif

#endif // __MDCT_H
/******** end of mdct.h ******** */

/******** begin of mdct.c ******** */
#ifdef SINGLE_PRECISION

#define FFTW_MALLOC   fftwf_malloc
#define FFTW_FREE     fftwf_free
#define FFTW_PLAN_1D  fftwf_plan_dft_1d
#define FFTW_DESTROY  fftwf_destroy_plan 
#define FFTW_EXECUTE  fftwf_execute

#else // DOUBLE_PRECISION

#define FFTW_MALLOC   fftw_malloc
#define FFTW_FREE     fftw_free
#define FFTW_PLAN_1D  fftw_plan_dft_1d
#define FFTW_DESTROY  fftw_destroy_plan 
#define FFTW_EXECUTE  fftw_execute

#endif // SINGLE_PRECISION

void mdct_free(mdct_plan* m_plan)
{
    if(m_plan) 
    {
        FFTW_DESTROY(m_plan->fft_plan);
        FFTW_FREE(m_plan->fft_in);
        FFTW_FREE(m_plan->fft_out);

        if(m_plan->twiddle)
            free(m_plan->twiddle);

        free(m_plan);
    }
}

#define MDCT_CLEAUP(msg, ...) \
    {fprintf(stderr, msg", %s(), %s:%d \n", \
            __VA_ARGS__, __func__, __FILE__, __LINE__); \
     mdct_free(m_plan); return NULL;}

mdct_plan* mdct_init(int M)
{
    int        n;
    FLOAT      alpha, omega, scale;
    mdct_plan* m_plan = NULL;

    if(0x00 != (M & 0x01)) 
        MDCT_CLEAUP(" Expect an even number of MDCT coeffs, but meet %d", M);

    m_plan = (mdct_plan*) malloc(sizeof(mdct_plan));
    if(NULL == m_plan)
        MDCT_CLEAUP(" malloc error: %s", "m_plan");
    memset(m_plan, 0, sizeof(m_plan[0]));

    m_plan->M = M;

    m_plan->twiddle = (FLOAT*) malloc(sizeof(FLOAT) * M);
    if(NULL == m_plan->twiddle)
        MDCT_CLEAUP(" malloc error: %s", "m_plan->twiddle");
    alpha = M_PI / (8.f * M);
    omega = M_PI / M;
    scale = sqrt(sqrt(2.f / M));    
    for(n = 0; n < (M >> 1); n++)
    {    
        m_plan->twiddle[2*n+0] = (FLOAT) (scale * cos(omega * n + alpha));
        m_plan->twiddle[2*n+1] = (FLOAT) (scale * sin(omega * n + alpha));
    }    
	
    m_plan->fft_in   
        = (FFTW_COMPLEX*) FFTW_MALLOC(sizeof(FFTW_COMPLEX) * M >> 1);    
    if(NULL == m_plan->fft_in)
        MDCT_CLEAUP(" malloc error: %s", "m_plan->fft_in");

    m_plan->fft_out  
        = (FFTW_COMPLEX*) FFTW_MALLOC(sizeof(FFTW_COMPLEX) * M >> 1);    
    if(NULL == m_plan->fft_out)
        MDCT_CLEAUP(" malloc error: %s", "m_plan->fft_out");

    m_plan->fft_plan = FFTW_PLAN_1D(M >> 1, 
                                    m_plan->fft_in, 
                                    m_plan->fft_out,    
                                    FFTW_FORWARD,
                                    FFTW_MEASURE);
    if(NULL == m_plan->fft_plan)
        MDCT_CLEAUP(" malloc error: %s", "m_plan->fft_plan");

    return m_plan;
}

void mdct(FLOAT* mdct_line, FLOAT* time_signal, mdct_plan* m_plan)
{
    FLOAT *xr, *xi, r0, i0;
    FLOAT *cos_tw, *sin_tw, c, s;
    int    M, M2, M32, M52, n;

    M   = m_plan->M;
    M2  = M >> 1;
    M32 = 3 * M2;
    M52 = 5 * M2;

    cos_tw = m_plan->twiddle;
    sin_tw = cos_tw + 1; 
    
    /* odd/even folding and pre-twiddle */
    xr = (FLOAT*) m_plan->fft_in;
    xi = xr + 1;
    for(n = 0; n < M2; n += 2) 
    {
        r0 = time_signal[M32-1-n] + time_signal[M32+n];    
        i0 = time_signal[M2+n]    - time_signal[M2-1-n];    
        
        c = cos_tw[n];
        s = sin_tw[n];

        xr[n] = r0 * c + i0 * s;
        xi[n] = i0 * c - r0 * s;
    }

    for(; n < M; n += 2) 
    {
        r0 = time_signal[M32-1-n] - time_signal[-M2+n];    
        i0 = time_signal[M2+n]    + time_signal[M52-1-n];    
        
        c = cos_tw[n];
        s = sin_tw[n];

        xr[n] = r0 * c + i0 * s;
        xi[n] = i0 * c - r0 * s;
    }

    /* complex FFT of size M/2 */
    FFTW_EXECUTE(m_plan->fft_plan);

    /* post-twiddle */
    xr = (FLOAT*) m_plan->fft_out;
    xi = xr + 1;
    for(n = 0; n < M; n += 2)
    {
        r0 = xr[n];
        i0 = xi[n];
        
        c = cos_tw[n];
        s = sin_tw[n];    

        mdct_line[n]     = - r0 * c - i0 * s;
        mdct_line[M-1-n] = - r0 * s + i0 * c;
    }
}

void imdct(FLOAT* time_signal, FLOAT* mdct_line, mdct_plan* m_plan)
{
    FLOAT *xr, *xi, r0, i0, r1, i1;
    FLOAT *cos_tw, *sin_tw, c, s;
    int    M, M2, M32, M52, n;

    M   = m_plan->M;
    M2  = M >> 1;
    M32 = 3 * M2;
    M52 = 5 * M2;

    cos_tw = m_plan->twiddle;
    sin_tw = cos_tw + 1; 
    
    /* pre-twiddle */
    xr = (FLOAT*) m_plan->fft_in;
    xi = xr + 1;
    for(n = 0; n < M; n += 2)
    {
        r0 =  mdct_line[n];	
        i0 =  mdct_line[M-1-n];
        
        c = cos_tw[n];
        s = sin_tw[n];    
        
        xr[n] = -i0 * s - r0 * c;
        xi[n] = -i0 * c + r0 * s;
    } 
    
    /* complex FFT of size M/2 */
    FFTW_EXECUTE(m_plan->fft_plan);

    /* odd/even expanding and post-twiddle */
    xr = (FLOAT*) m_plan->fft_out;
    xi = xr + 1;
    for(n = 0; n < M2; n += 2) 
    {
        r0 = xr[n]; 
        i0 = xi[n];    
        
        c = cos_tw[n]; 
        s = sin_tw[n];

        r1 = r0 * c + i0 * s;
        i1 = r0 * s - i0 * c;

        time_signal[M32-1-n] =  r1;
        time_signal[M32+n]   =  r1;
        time_signal[M2+n]    =  i1;
        time_signal[M2-1-n]  = -i1;
    }

    for(; n < M; n += 2) 
    {
        r0 = xr[n]; 
        i0 = xi[n];    
        
        c = cos_tw[n]; 
        s = sin_tw[n];
        
        r1 = r0 * c + i0 * s;
        i1 = r0 * s - i0 * c;
        
        time_signal[M32-1-n] =  r1;
        time_signal[-M2+n]   = -r1;
        time_signal[M2+n]    =  i1;
        time_signal[M52-1-n] =  i1;
    }
}
/******** end of mdct.c ******** */

/******** begin of mdct_test.c ******** */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#include <time.h>
#include "mdct.h"

int main(int argc, char* argv[])
{
    int        M, r, i;
    FLOAT*     time = NULL;
    FLOAT*     freq = NULL;
    mdct_plan* m_plan = NULL;
    char*      precision = NULL;
    struct timeval t0, t1;
    long long elps;

    if(3 != argc)
    {
        fprintf(stderr, " Usage: %s <MDCT_SPECTRUM_SIZE> <run_times> \n", argv[0]);
        return -1;
    }

    sscanf(argv[1], "%d", &M);
    sscanf(argv[2], "%d", &r);    
    if(NULL == (m_plan = mdct_init(M)))
        return -1;
    if(NULL == (time = (FLOAT*) malloc(2 * M * sizeof(FLOAT))))
        return -1;
    if(NULL == (freq = (FLOAT*) malloc(M * sizeof(FLOAT))))
        return -1;

    for(i = 0; i < 2 * M; i++)
        time[i] = 2.f * rand() / RAND_MAX - 1.f;        
    for(i = 0; i < M; i++)
        freq[i] = 2.f * rand() / RAND_MAX - 1.f;        

    precision = (sizeof(float) == sizeof(FLOAT))? 
                "single precision" : "double precision";

#if 1
    gettimeofday(&t0, NULL);
    for(i = 0; i < r; i++)
        mdct(freq, time, m_plan);
    gettimeofday(&t1, NULL);

    elps = (t1.tv_sec - t0.tv_sec) * 1000000 + (t1.tv_usec - t0.tv_usec);    
    fprintf(stdout, "MDCT size of %d, %s, running %d times, average %.3f ms\n", 
            M, precision, r, (FLOAT) elps / r / 1000.f); 
#endif // 0

#if 1    
    gettimeofday(&t0, NULL);
    for(i = 0; i < r; i++)
        imdct(time, freq, m_plan);
    gettimeofday(&t1, NULL);

    elps = (t1.tv_sec - t0.tv_sec) * 1000000 + (t1.tv_usec - t0.tv_usec);    
    fprintf(stdout, "IMDCT size of %d, %s, running %d times, average %.3f ms\n", 
            M, precision, r, (FLOAT) elps / r / 1000.f); 
#endif //0

#if 0
    for(i = 0; i < 2 * M; i++)
        fprintf(stdout, "%f    ", time[i]);
    fprintf(stdout, "\n");

    for(i = 0; i < M; i++)
        fprintf(stdout, "%f    ", freq[i]);
    fprintf(stdout, "\n");
#endif // 0

    free(time);
    free(freq);
    mdct_free(m_plan);    

    return 0;
}
/******** end of mdct_test.c ******** */