Code Snippets Submitted by shuhua.zhang
Computing Reciprocals of Fixed-Point Numbers by the Newton-Raphson Method — Division by Multiplication
* =========================================================================== *
* *
* Compute reciprocals of a Q.15 vector by Newton-Raphson method *
* y[i] = ym[i] * 2^ye[i] = 1 / x[i], -1 <= x[i] < 1 and x[i] != 0 *
* where ym is the mantissa vector and ye is the exponent vector *
* *
* C prototype: *
* void DSP_vrecip16n(short* x, // Input, Q.15 vector *
* short* ym, // Output, Q.15 vector *
* short* ye, // Output, int16 vector *
* int N); // Input, vector length *
* *
* Restriction: *
* (N % 4) == 0 and N >= 24 *
* *
* Performance: *
* 55+2.5*N (software pipelining enabled by -O2, CCS5.1) *
* *
* Relative error: *
* (-2^-16, 2^-16) *
* *
* Algorithm: *
* Input: V, abs(x[i]) normalized to [.5, 1) *
* Initialization: U0 = (V-.75)^2 + 1.4256-V, ~1/(2*V), 5.9233 bits *
* Iteration 1: U1 = U0*(1-V*U0), ~1/(4*V), 11.481 bits *
* Iteration 2: U2 = 8*U1*(.5-V*U1), ~1/(2*V), 22.585 bits *
* Output in the format of Fl16 = Q.15(mant)|Int16(expt) *
* *
* =========================================================================== *
.sect ".text: _DSP_vrecip16n"
.global _DSP_vrecip16n
_DSP_vrecip16n: .cproc A_X, B_YM, A_YE, B_n
.no_mdep
.rega A_x0, A_x1, A_rr, A_nx0, A_nx1, A_nx10, A_ny10, A_v10, A_vc10
.rega A_vs1:A_vs0, A_vs10, A_u10, A_vu0, A_vu1, A_vu10, A_u0, A_u1
.rega A_y0, A_y1, A_y32:A_y10, A_vp10, A_x32:A_x10
.rega A_ss, A_cc, A_mm, A_ww, A_w, A_rnd
.regb B_x2, B_x3, B_rr, B_nx2, B_nx3, B_nx32, B_y32, B_v32, B_vc32
.regb B_vs3:B_vs2, B_vs32, B_u32, B_vu2, B_vu3, B_vu32, B_u2, B_u3
.regb B_y2, B_y3, B_ny32:B_ny10, B_vp32
.regb B_ss, B_cc, B_mm, B_ww, B_w, B_rnd, B_nn, B_X
.reg B_i, C10, C32, Cl10, Cl32
ADD 4, A_X, B_X
MVKL 0xB67BB67B, B_mm ; Q15(1.4256)
MVKH 0xB67BB67B, B_mm ; Q15(1.4256)
MV B_mm, A_mm
MVKL 0x60006000, A_cc
MVKH 0x60006000, A_cc
MV A_cc, B_cc
MVKL 0xFFF1FFF1, B_rr
MVKH 0xFFF1FFF1, B_rr
MV B_rr, A_rr
MVKL 0x80008000, A_ww
MVKH 0x80008000, A_ww
MV A_ww, B_ww
SHL A_ww, 15, A_w ; 0x40000000
SHL B_ww, 15, B_w ; 0x40000000
ROTL A_w, 17, A_rnd ; 0x00008000
ROTL B_w, 17, B_rnd ; 0x00008000
MVK 15, A_ss
MVK 15, B_ss
SHR B_n, 2, B_i
SUB B_i, 2, B_i
LOOP_vrecip: .trip 16
LDH *A_X++, A_x0
LDH *A_X++[3], A_x1
LDH *B_X++, B_x2
LDH *B_X++[3], B_x3
NORM A_x0, A_nx0
NORM A_x1, A_nx1
NORM B_x2, B_nx2
NORM B_x3, B_nx3
PACK2 A_nx1, A_nx0, A_nx10
PACK2 B_nx3, B_nx2, B_nx32
ADD2 A_rr, A_nx10, A_ny10
ADD2 B_rr, B_nx32, B_ny32
SSHVL A_x0, A_nx0, A_x0
SSHVL A_x1, A_nx1, A_x1
SSHVL B_x2, B_nx2, B_x2
SSHVL B_x3, B_nx3, B_x3
PACKH2 A_x1, A_x0, A_v10
PACKH2 B_x3, B_x2, B_v32
; Initial value, U0=(V-.75)^2 + 1.4256-V, ~1/(2*V), 5.9233 bits
ABS2 A_v10, A_vp10
ABS2 B_v32, B_vp32
SUB2 A_vp10, A_cc, A_vc10
SUB2 B_vp32, B_cc, B_vc32
SUB2 A_mm, A_vp10, A_u10
SUB2 B_mm, B_vp32, B_u32
SMPY2 A_vc10, A_vc10, A_vs1:A_vs0
SMPY2 B_vc32, B_vc32, B_vs3:B_vs2
PACKH2 A_vs1, A_vs0, A_vs10
PACKH2 B_vs3, B_vs2, B_vs32
ADD2 A_vs10, A_u10, A_u10
ADD2 B_vs32, B_u32, B_u32
SHR2 A_v10, A_ss, C10
SHR2 B_v32, B_ss, C32
XOR C10, A_u10, A_u10
XOR C32, B_u32, B_u32
; Iteration 1, U1=U0*(1-V*U0), ~1/(4*V), 11.481 bits
SMPY2 A_v10, A_u10, A_vu1:A_vu0
SMPY2 B_v32, B_u32, B_vu3:B_vu2
PACKH2 A_vu1, A_vu0, A_vu10
PACKH2 B_vu3, B_vu2, B_vu32
SUB2 A_ww, A_vu10, A_vu10
SUB2 B_ww, B_vu32, B_vu32
SMPY2 A_u10, A_vu10, A_u1:A_u0
SMPY2 B_u32, B_vu32, B_u3:B_u2
PACKH2 A_u1, A_u0, A_u10
PACKH2 B_u3, B_u2, B_u32
; Iteration 2, U2=8*U1*(1/2-V*U1), ~1/(2*V), 22.585 bits
SMPY2 A_v10, A_u10, A_vu1:A_vu0
SMPY2 B_v32, B_u32, B_vu3:B_vu2
SUB A_w, A_vu0, A_vu0
SUB A_w, A_vu1, A_vu1
SUB B_w, B_vu2, B_vu2
SUB B_w, B_vu3, B_vu3
MPYHIR A_u0, A_vu0, A_u0
MPYHIR A_u1, A_vu1, A_u1
MPYHIR B_u2, B_vu2, B_u2
MPYHIR B_u3, B_vu3, B_u3
SSHL A_u0, 3, A_u0
SSHL A_u1, 3, A_u1
SSHL B_u2, 3, B_u2
SSHL B_u3, 3, B_u3
; Fl16 = Q15(mant)|int16(expt)
SADD A_u0, A_rnd, A_y0
SADD A_u1, A_rnd, A_y1
SADD B_u2, B_rnd, B_y2
SADD B_u3, B_rnd, B_y3
PACKH2 A_y1, A_y0, A_y10
PACKH2 B_y3, B_y2, B_y32
MV B_y32, A_y32
MV A_ny10, B_ny10
STDW A_y32:A_y10, *B_YM++
STDW B_ny32:B_ny10, *A_YE++
BDEC LOOP_vrecip, B_i
.endproc
Fast MDCT/IMDCT Based on Forward FFT
/******** begin of mdct.h ******** */
#ifndef __MDCT_H
#define __MDCT_H
#include <fftw3.h>
#ifdef __cplusplus
extern "C" {
#endif
#ifdef SINGLE_PRECISION
typedef float FLOAT;
typedef fftwf_complex FFTW_COMPLEX;
typedef fftwf_plan FFTW_PLAN;
#else // DOUBLE_PRECISION
typedef double FLOAT;
typedef fftw_complex FFTW_COMPLEX;
typedef fftw_plan FFTW_PLAN;
#endif // SINGLE_PRECISION
typedef struct {
int M; // MDCT spectrum size (number of bins)
FLOAT* twiddle; // twiddle factor
FFTW_COMPLEX* fft_in; // fft workspace, input
FFTW_COMPLEX* fft_out; // fft workspace, output
FFTW_PLAN fft_plan; // fft configuration
} mdct_plan;
mdct_plan* mdct_init(int M); // MDCT spectrum size (number of bins)
void mdct_free(mdct_plan* m_plan);
void mdct(FLOAT* mdct_line, FLOAT* time_signal, mdct_plan* m_plan);
void imdct(FLOAT* time_signal, FLOAT* mdct_line, mdct_plan* m_plan);
#ifdef __cplusplus
}
#endif
#endif // __MDCT_H
/******** end of mdct.h ******** */
/******** begin of mdct.c ******** */
#ifdef SINGLE_PRECISION
#define FFTW_MALLOC fftwf_malloc
#define FFTW_FREE fftwf_free
#define FFTW_PLAN_1D fftwf_plan_dft_1d
#define FFTW_DESTROY fftwf_destroy_plan
#define FFTW_EXECUTE fftwf_execute
#else // DOUBLE_PRECISION
#define FFTW_MALLOC fftw_malloc
#define FFTW_FREE fftw_free
#define FFTW_PLAN_1D fftw_plan_dft_1d
#define FFTW_DESTROY fftw_destroy_plan
#define FFTW_EXECUTE fftw_execute
#endif // SINGLE_PRECISION
void mdct_free(mdct_plan* m_plan)
{
if(m_plan)
{
FFTW_DESTROY(m_plan->fft_plan);
FFTW_FREE(m_plan->fft_in);
FFTW_FREE(m_plan->fft_out);
if(m_plan->twiddle)
free(m_plan->twiddle);
free(m_plan);
}
}
#define MDCT_CLEAUP(msg, ...) \
{fprintf(stderr, msg", %s(), %s:%d \n", \
__VA_ARGS__, __func__, __FILE__, __LINE__); \
mdct_free(m_plan); return NULL;}
mdct_plan* mdct_init(int M)
{
int n;
FLOAT alpha, omega, scale;
mdct_plan* m_plan = NULL;
if(0x00 != (M & 0x01))
MDCT_CLEAUP(" Expect an even number of MDCT coeffs, but meet %d", M);
m_plan = (mdct_plan*) malloc(sizeof(mdct_plan));
if(NULL == m_plan)
MDCT_CLEAUP(" malloc error: %s", "m_plan");
memset(m_plan, 0, sizeof(m_plan[0]));
m_plan->M = M;
m_plan->twiddle = (FLOAT*) malloc(sizeof(FLOAT) * M);
if(NULL == m_plan->twiddle)
MDCT_CLEAUP(" malloc error: %s", "m_plan->twiddle");
alpha = M_PI / (8.f * M);
omega = M_PI / M;
scale = sqrt(sqrt(2.f / M));
for(n = 0; n < (M >> 1); n++)
{
m_plan->twiddle[2*n+0] = (FLOAT) (scale * cos(omega * n + alpha));
m_plan->twiddle[2*n+1] = (FLOAT) (scale * sin(omega * n + alpha));
}
m_plan->fft_in
= (FFTW_COMPLEX*) FFTW_MALLOC(sizeof(FFTW_COMPLEX) * M >> 1);
if(NULL == m_plan->fft_in)
MDCT_CLEAUP(" malloc error: %s", "m_plan->fft_in");
m_plan->fft_out
= (FFTW_COMPLEX*) FFTW_MALLOC(sizeof(FFTW_COMPLEX) * M >> 1);
if(NULL == m_plan->fft_out)
MDCT_CLEAUP(" malloc error: %s", "m_plan->fft_out");
m_plan->fft_plan = FFTW_PLAN_1D(M >> 1,
m_plan->fft_in,
m_plan->fft_out,
FFTW_FORWARD,
FFTW_MEASURE);
if(NULL == m_plan->fft_plan)
MDCT_CLEAUP(" malloc error: %s", "m_plan->fft_plan");
return m_plan;
}
void mdct(FLOAT* mdct_line, FLOAT* time_signal, mdct_plan* m_plan)
{
FLOAT *xr, *xi, r0, i0;
FLOAT *cos_tw, *sin_tw, c, s;
int M, M2, M32, M52, n;
M = m_plan->M;
M2 = M >> 1;
M32 = 3 * M2;
M52 = 5 * M2;
cos_tw = m_plan->twiddle;
sin_tw = cos_tw + 1;
/* odd/even folding and pre-twiddle */
xr = (FLOAT*) m_plan->fft_in;
xi = xr + 1;
for(n = 0; n < M2; n += 2)
{
r0 = time_signal[M32-1-n] + time_signal[M32+n];
i0 = time_signal[M2+n] - time_signal[M2-1-n];
c = cos_tw[n];
s = sin_tw[n];
xr[n] = r0 * c + i0 * s;
xi[n] = i0 * c - r0 * s;
}
for(; n < M; n += 2)
{
r0 = time_signal[M32-1-n] - time_signal[-M2+n];
i0 = time_signal[M2+n] + time_signal[M52-1-n];
c = cos_tw[n];
s = sin_tw[n];
xr[n] = r0 * c + i0 * s;
xi[n] = i0 * c - r0 * s;
}
/* complex FFT of size M/2 */
FFTW_EXECUTE(m_plan->fft_plan);
/* post-twiddle */
xr = (FLOAT*) m_plan->fft_out;
xi = xr + 1;
for(n = 0; n < M; n += 2)
{
r0 = xr[n];
i0 = xi[n];
c = cos_tw[n];
s = sin_tw[n];
mdct_line[n] = - r0 * c - i0 * s;
mdct_line[M-1-n] = - r0 * s + i0 * c;
}
}
void imdct(FLOAT* time_signal, FLOAT* mdct_line, mdct_plan* m_plan)
{
FLOAT *xr, *xi, r0, i0, r1, i1;
FLOAT *cos_tw, *sin_tw, c, s;
int M, M2, M32, M52, n;
M = m_plan->M;
M2 = M >> 1;
M32 = 3 * M2;
M52 = 5 * M2;
cos_tw = m_plan->twiddle;
sin_tw = cos_tw + 1;
/* pre-twiddle */
xr = (FLOAT*) m_plan->fft_in;
xi = xr + 1;
for(n = 0; n < M; n += 2)
{
r0 = mdct_line[n];
i0 = mdct_line[M-1-n];
c = cos_tw[n];
s = sin_tw[n];
xr[n] = -i0 * s - r0 * c;
xi[n] = -i0 * c + r0 * s;
}
/* complex FFT of size M/2 */
FFTW_EXECUTE(m_plan->fft_plan);
/* odd/even expanding and post-twiddle */
xr = (FLOAT*) m_plan->fft_out;
xi = xr + 1;
for(n = 0; n < M2; n += 2)
{
r0 = xr[n];
i0 = xi[n];
c = cos_tw[n];
s = sin_tw[n];
r1 = r0 * c + i0 * s;
i1 = r0 * s - i0 * c;
time_signal[M32-1-n] = r1;
time_signal[M32+n] = r1;
time_signal[M2+n] = i1;
time_signal[M2-1-n] = -i1;
}
for(; n < M; n += 2)
{
r0 = xr[n];
i0 = xi[n];
c = cos_tw[n];
s = sin_tw[n];
r1 = r0 * c + i0 * s;
i1 = r0 * s - i0 * c;
time_signal[M32-1-n] = r1;
time_signal[-M2+n] = -r1;
time_signal[M2+n] = i1;
time_signal[M52-1-n] = i1;
}
}
/******** end of mdct.c ******** */
/******** begin of mdct_test.c ******** */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#include <time.h>
#include "mdct.h"
int main(int argc, char* argv[])
{
int M, r, i;
FLOAT* time = NULL;
FLOAT* freq = NULL;
mdct_plan* m_plan = NULL;
char* precision = NULL;
struct timeval t0, t1;
long long elps;
if(3 != argc)
{
fprintf(stderr, " Usage: %s <MDCT_SPECTRUM_SIZE> <run_times> \n", argv[0]);
return -1;
}
sscanf(argv[1], "%d", &M);
sscanf(argv[2], "%d", &r);
if(NULL == (m_plan = mdct_init(M)))
return -1;
if(NULL == (time = (FLOAT*) malloc(2 * M * sizeof(FLOAT))))
return -1;
if(NULL == (freq = (FLOAT*) malloc(M * sizeof(FLOAT))))
return -1;
for(i = 0; i < 2 * M; i++)
time[i] = 2.f * rand() / RAND_MAX - 1.f;
for(i = 0; i < M; i++)
freq[i] = 2.f * rand() / RAND_MAX - 1.f;
precision = (sizeof(float) == sizeof(FLOAT))?
"single precision" : "double precision";
#if 1
gettimeofday(&t0, NULL);
for(i = 0; i < r; i++)
mdct(freq, time, m_plan);
gettimeofday(&t1, NULL);
elps = (t1.tv_sec - t0.tv_sec) * 1000000 + (t1.tv_usec - t0.tv_usec);
fprintf(stdout, "MDCT size of %d, %s, running %d times, average %.3f ms\n",
M, precision, r, (FLOAT) elps / r / 1000.f);
#endif // 0
#if 1
gettimeofday(&t0, NULL);
for(i = 0; i < r; i++)
imdct(time, freq, m_plan);
gettimeofday(&t1, NULL);
elps = (t1.tv_sec - t0.tv_sec) * 1000000 + (t1.tv_usec - t0.tv_usec);
fprintf(stdout, "IMDCT size of %d, %s, running %d times, average %.3f ms\n",
M, precision, r, (FLOAT) elps / r / 1000.f);
#endif //0
#if 0
for(i = 0; i < 2 * M; i++)
fprintf(stdout, "%f ", time[i]);
fprintf(stdout, "\n");
for(i = 0; i < M; i++)
fprintf(stdout, "%f ", freq[i]);
fprintf(stdout, "\n");
#endif // 0
free(time);
free(freq);
mdct_free(m_plan);
return 0;
}
/******** end of mdct_test.c ******** */
Computing Square Root of A Vector of Fixed-Point Numbers
* =========================================================================== *
* *
* Compute square root of a Q.15 vector by 4th order polynomial fitting *
* y[i] = sqrt(x[i]), 0 <= x[i] < 1; *
* *
* C prototype: *
* void DSP_vsqrt_q15(short* x, short* y, int N); *
* *
* Performance: *
* O(4*N) or 4 cycles per number (software pipelining enabled by -O2) *
* *
* Error: *
* (-2^-15, 2^-15) *
* *
* =========================================================================== *
; chebfun, min max error
; 0xFFF1024A = Q31(-0.0005)
; 0x0046E0AB = Q31(0.0022)
; 0xFE764EE1 = Q31(-0.0120)
; 0x1277E288 = Q31(0.1443)
; 0x6ED9EBA1 = Q31(0.8660)
; polyfit, min sqr error
; 0xFFF1203D = Q31(-0.0005)
; 0x004581E7 = Q31(0.0021)
; 0xFE7645AF = Q31(-0.0120)
; 0x1278CF97 = Q31(0.1443)
; 0x6ED9E687 = Q31(0.8660)
SQRT_C4 .set 0xFFF1203D
SQRT_C3 .set 0x004581E7
SQRT_C2 .set 0xFE7645AF
SQRT_C1 .set 0x1278CF97
SQRT_C0 .set 0x6ED9E687
SQRT_S .set 0x5A82799A ; Q31(0.7071)
.sect ".text: _DSP_vsqrt_q15"
.global _DSP_vsqrt_q15
_DSP_vsqrt_q15: .cproc A_X, B_Y, A_n
.no_mdep
.rega A_C4, A_C3, A_C2, A_C1, A_C0, A_xx0, A_rnd, A_y0
.rega A_y0c4, A_y0c3, A_y0c2, A_y0c1, A_x0c3, A_x0c2, A_x0c1, A_x0c0
.rega A_S, A_e0, A_x0x, A_xm, A_y0l, A_y0h, A_y0s
.regb B_C4, B_C3, B_C2, B_C1, B_C0, B_xx1, B_rnd, B_y1, B_y10
.regb B_y1c4, B_y1c3, B_y1c2, B_y1c1, B_x1c3, B_x1c2, B_x1c1, B_x1c0
.regb B_S, B_e1, B_x1x, B_xm, B_y1l, B_y1h, B_y1s, B_X
.reg B_i, C0, C1
ADD 2, A_X, B_X
MVK 0x1, A_rnd
SHL A_rnd, 15, A_rnd
MV A_rnd, B_rnd
MVKL SQRT_C4, A_C4
MVKH SQRT_C4, A_C4
MVKL SQRT_C3, B_C3
MVKH SQRT_C3, B_C3
MV A_C4, B_C4
MV B_C3, A_C3
MVKL SQRT_C2, A_C2
MVKH SQRT_C2, A_C2
MVKL SQRT_C1, B_C1
MVKH SQRT_C1, B_C1
MV A_C2, B_C2
MV B_C1, A_C1
MVKL SQRT_C0, A_C0
MVKH SQRT_C0, A_C0
MVKL SQRT_S, B_S
MVKH SQRT_S, B_S
MV B_S, A_S
MV A_C0, B_C0
MVKL 0x6000, A_xm
SHL A_xm, 16, A_xm
MV A_xm, B_xm
SHR A_n, 1, B_i
SUB B_i, 2, B_i
LOOP_vsqrt: .trip 8
LDH.D1T1 *A_X++[2], A_xx0
LDH.D2T2 *B_X++[2], B_xx1
NORM.L1 A_xx0, A_e0
NORM.L2 B_xx1, B_e1
SSHVL.M1 A_xx0, A_e0, A_x0x
SSHVL.M2 B_xx1, B_e1, B_x1x
SUB.D1 A_e0, 16, A_e0
SUB.D2 B_e1, 16, B_e1
AND.D1 0x1, A_e0, C0
AND.D2 0x1, B_e1, C1
SHR.S1 A_e0, 1, A_e0
SHR.S2 B_e1, 1, B_e1
SUB.D1 A_x0x, A_xm, A_x0x
SUB.D2 B_x1x, B_xm, B_x1x
SHL.S1 A_x0x, 2, A_x0x
SHL.S2 B_x1x, 2, B_x1x
MPYHIR.M1 A_x0x, A_C4, A_y0c4
MPYHIR.M2 B_x1x, B_C4, B_y1c4
SADD.L1 A_y0c4, A_C3, A_x0c3
SADD.L2 B_y1c4, B_C3, B_x1c3
MPYHIR.M1 A_x0x, A_x0c3, A_y0c3
MPYHIR.M2 B_x1x, B_x1c3, B_y1c3
SADD.L1 A_y0c3, A_C2, A_x0c2
SADD.L2 B_y1c3, B_C2, B_x1c2
MPYHIR.M1 A_x0x, A_x0c2, A_y0c2
MPYHIR.M2 B_x1x, B_x1c2, B_y1c2
SADD.L1 A_y0c2, A_C1, A_x0c1
SADD.L2 B_y1c2, B_C1, B_x1c1
MPYHIR.M1 A_x0x, A_x0c1, A_y0c1
MPYHIR.M2 B_x1x, B_x1c1, B_y1c1
SADD.L1 A_y0c1, A_C0, A_x0c0
SADD.L2 B_y1c1, B_C0, B_x1c0
; A_S = B_S = 0x5A82799A ~= 0x5A820000 + 0x00008000
[C0] MPYHIR.M1 A_S, A_x0c0, A_y0h
[C1] MPYHIR.M2 B_S, B_x1c0, B_y1h
[C0] SHR A_x0c0, 16, A_y0l
[C1] SHR B_x1c0, 16, B_y1l
[C0] SADD.L1 A_y0h, A_y0l, A_x0c0
[C1] SADD.L2 B_y1h, B_y1l, B_x1c0
SHR.S1 A_x0c0, A_e0, A_y0s
SHR.S2 B_x1c0, B_e1, B_y1s
SADD.L1 A_y0s, A_rnd, A_y0
SADD.L2 B_y1s, B_rnd, B_y1
PACKH2.S2X B_y1, A_y0, B_y10
STW.D2T2 B_y10, *B_Y++
BDEC LOOP_vsqrt, B_i
.endproc
Computing Reciprocals of Fixed-Point Numbers by the Newton-Raphson Method — Division by Multiplication
* =========================================================================== *
* *
* Compute reciprocals of a Q.15 vector by Newton-Raphson method *
* y[i] = ym[i] * 2^ye[i] = 1 / x[i], -1 <= x[i] < 1 and x[i] != 0 *
* where ym is the mantissa vector and ye is the exponent vector *
* *
* C prototype: *
* void DSP_vrecip16n(short* x, // Input, Q.15 vector *
* short* ym, // Output, Q.15 vector *
* short* ye, // Output, int16 vector *
* int N); // Input, vector length *
* *
* Restriction: *
* (N % 4) == 0 and N >= 24 *
* *
* Performance: *
* 55+2.5*N (software pipelining enabled by -O2, CCS5.1) *
* *
* Relative error: *
* (-2^-16, 2^-16) *
* *
* Algorithm: *
* Input: V, abs(x[i]) normalized to [.5, 1) *
* Initialization: U0 = (V-.75)^2 + 1.4256-V, ~1/(2*V), 5.9233 bits *
* Iteration 1: U1 = U0*(1-V*U0), ~1/(4*V), 11.481 bits *
* Iteration 2: U2 = 8*U1*(.5-V*U1), ~1/(2*V), 22.585 bits *
* Output in the format of Fl16 = Q.15(mant)|Int16(expt) *
* *
* =========================================================================== *
.sect ".text: _DSP_vrecip16n"
.global _DSP_vrecip16n
_DSP_vrecip16n: .cproc A_X, B_YM, A_YE, B_n
.no_mdep
.rega A_x0, A_x1, A_rr, A_nx0, A_nx1, A_nx10, A_ny10, A_v10, A_vc10
.rega A_vs1:A_vs0, A_vs10, A_u10, A_vu0, A_vu1, A_vu10, A_u0, A_u1
.rega A_y0, A_y1, A_y32:A_y10, A_vp10, A_x32:A_x10
.rega A_ss, A_cc, A_mm, A_ww, A_w, A_rnd
.regb B_x2, B_x3, B_rr, B_nx2, B_nx3, B_nx32, B_y32, B_v32, B_vc32
.regb B_vs3:B_vs2, B_vs32, B_u32, B_vu2, B_vu3, B_vu32, B_u2, B_u3
.regb B_y2, B_y3, B_ny32:B_ny10, B_vp32
.regb B_ss, B_cc, B_mm, B_ww, B_w, B_rnd, B_nn, B_X
.reg B_i, C10, C32, Cl10, Cl32
ADD 4, A_X, B_X
MVKL 0xB67BB67B, B_mm ; Q15(1.4256)
MVKH 0xB67BB67B, B_mm ; Q15(1.4256)
MV B_mm, A_mm
MVKL 0x60006000, A_cc
MVKH 0x60006000, A_cc
MV A_cc, B_cc
MVKL 0xFFF1FFF1, B_rr
MVKH 0xFFF1FFF1, B_rr
MV B_rr, A_rr
MVKL 0x80008000, A_ww
MVKH 0x80008000, A_ww
MV A_ww, B_ww
SHL A_ww, 15, A_w ; 0x40000000
SHL B_ww, 15, B_w ; 0x40000000
ROTL A_w, 17, A_rnd ; 0x00008000
ROTL B_w, 17, B_rnd ; 0x00008000
MVK 15, A_ss
MVK 15, B_ss
SHR B_n, 2, B_i
SUB B_i, 2, B_i
LOOP_vrecip: .trip 16
LDH *A_X++, A_x0
LDH *A_X++[3], A_x1
LDH *B_X++, B_x2
LDH *B_X++[3], B_x3
NORM A_x0, A_nx0
NORM A_x1, A_nx1
NORM B_x2, B_nx2
NORM B_x3, B_nx3
PACK2 A_nx1, A_nx0, A_nx10
PACK2 B_nx3, B_nx2, B_nx32
ADD2 A_rr, A_nx10, A_ny10
ADD2 B_rr, B_nx32, B_ny32
SSHVL A_x0, A_nx0, A_x0
SSHVL A_x1, A_nx1, A_x1
SSHVL B_x2, B_nx2, B_x2
SSHVL B_x3, B_nx3, B_x3
PACKH2 A_x1, A_x0, A_v10
PACKH2 B_x3, B_x2, B_v32
; Initial value, U0=(V-.75)^2 + 1.4256-V, ~1/(2*V), 5.9233 bits
ABS2 A_v10, A_vp10
ABS2 B_v32, B_vp32
SUB2 A_vp10, A_cc, A_vc10
SUB2 B_vp32, B_cc, B_vc32
SUB2 A_mm, A_vp10, A_u10
SUB2 B_mm, B_vp32, B_u32
SMPY2 A_vc10, A_vc10, A_vs1:A_vs0
SMPY2 B_vc32, B_vc32, B_vs3:B_vs2
PACKH2 A_vs1, A_vs0, A_vs10
PACKH2 B_vs3, B_vs2, B_vs32
ADD2 A_vs10, A_u10, A_u10
ADD2 B_vs32, B_u32, B_u32
SHR2 A_v10, A_ss, C10
SHR2 B_v32, B_ss, C32
XOR C10, A_u10, A_u10
XOR C32, B_u32, B_u32
; Iteration 1, U1=U0*(1-V*U0), ~1/(4*V), 11.481 bits
SMPY2 A_v10, A_u10, A_vu1:A_vu0
SMPY2 B_v32, B_u32, B_vu3:B_vu2
PACKH2 A_vu1, A_vu0, A_vu10
PACKH2 B_vu3, B_vu2, B_vu32
SUB2 A_ww, A_vu10, A_vu10
SUB2 B_ww, B_vu32, B_vu32
SMPY2 A_u10, A_vu10, A_u1:A_u0
SMPY2 B_u32, B_vu32, B_u3:B_u2
PACKH2 A_u1, A_u0, A_u10
PACKH2 B_u3, B_u2, B_u32
; Iteration 2, U2=8*U1*(1/2-V*U1), ~1/(2*V), 22.585 bits
SMPY2 A_v10, A_u10, A_vu1:A_vu0
SMPY2 B_v32, B_u32, B_vu3:B_vu2
SUB A_w, A_vu0, A_vu0
SUB A_w, A_vu1, A_vu1
SUB B_w, B_vu2, B_vu2
SUB B_w, B_vu3, B_vu3
MPYHIR A_u0, A_vu0, A_u0
MPYHIR A_u1, A_vu1, A_u1
MPYHIR B_u2, B_vu2, B_u2
MPYHIR B_u3, B_vu3, B_u3
SSHL A_u0, 3, A_u0
SSHL A_u1, 3, A_u1
SSHL B_u2, 3, B_u2
SSHL B_u3, 3, B_u3
; Fl16 = Q15(mant)|int16(expt)
SADD A_u0, A_rnd, A_y0
SADD A_u1, A_rnd, A_y1
SADD B_u2, B_rnd, B_y2
SADD B_u3, B_rnd, B_y3
PACKH2 A_y1, A_y0, A_y10
PACKH2 B_y3, B_y2, B_y32
MV B_y32, A_y32
MV A_ny10, B_ny10
STDW A_y32:A_y10, *B_YM++
STDW B_ny32:B_ny10, *A_YE++
BDEC LOOP_vrecip, B_i
.endproc
Computing Square Root of A Vector of Fixed-Point Numbers
* =========================================================================== *
* *
* Compute square root of a Q.15 vector by 4th order polynomial fitting *
* y[i] = sqrt(x[i]), 0 <= x[i] < 1; *
* *
* C prototype: *
* void DSP_vsqrt_q15(short* x, short* y, int N); *
* *
* Performance: *
* O(4*N) or 4 cycles per number (software pipelining enabled by -O2) *
* *
* Error: *
* (-2^-15, 2^-15) *
* *
* =========================================================================== *
; chebfun, min max error
; 0xFFF1024A = Q31(-0.0005)
; 0x0046E0AB = Q31(0.0022)
; 0xFE764EE1 = Q31(-0.0120)
; 0x1277E288 = Q31(0.1443)
; 0x6ED9EBA1 = Q31(0.8660)
; polyfit, min sqr error
; 0xFFF1203D = Q31(-0.0005)
; 0x004581E7 = Q31(0.0021)
; 0xFE7645AF = Q31(-0.0120)
; 0x1278CF97 = Q31(0.1443)
; 0x6ED9E687 = Q31(0.8660)
SQRT_C4 .set 0xFFF1203D
SQRT_C3 .set 0x004581E7
SQRT_C2 .set 0xFE7645AF
SQRT_C1 .set 0x1278CF97
SQRT_C0 .set 0x6ED9E687
SQRT_S .set 0x5A82799A ; Q31(0.7071)
.sect ".text: _DSP_vsqrt_q15"
.global _DSP_vsqrt_q15
_DSP_vsqrt_q15: .cproc A_X, B_Y, A_n
.no_mdep
.rega A_C4, A_C3, A_C2, A_C1, A_C0, A_xx0, A_rnd, A_y0
.rega A_y0c4, A_y0c3, A_y0c2, A_y0c1, A_x0c3, A_x0c2, A_x0c1, A_x0c0
.rega A_S, A_e0, A_x0x, A_xm, A_y0l, A_y0h, A_y0s
.regb B_C4, B_C3, B_C2, B_C1, B_C0, B_xx1, B_rnd, B_y1, B_y10
.regb B_y1c4, B_y1c3, B_y1c2, B_y1c1, B_x1c3, B_x1c2, B_x1c1, B_x1c0
.regb B_S, B_e1, B_x1x, B_xm, B_y1l, B_y1h, B_y1s, B_X
.reg B_i, C0, C1
ADD 2, A_X, B_X
MVK 0x1, A_rnd
SHL A_rnd, 15, A_rnd
MV A_rnd, B_rnd
MVKL SQRT_C4, A_C4
MVKH SQRT_C4, A_C4
MVKL SQRT_C3, B_C3
MVKH SQRT_C3, B_C3
MV A_C4, B_C4
MV B_C3, A_C3
MVKL SQRT_C2, A_C2
MVKH SQRT_C2, A_C2
MVKL SQRT_C1, B_C1
MVKH SQRT_C1, B_C1
MV A_C2, B_C2
MV B_C1, A_C1
MVKL SQRT_C0, A_C0
MVKH SQRT_C0, A_C0
MVKL SQRT_S, B_S
MVKH SQRT_S, B_S
MV B_S, A_S
MV A_C0, B_C0
MVKL 0x6000, A_xm
SHL A_xm, 16, A_xm
MV A_xm, B_xm
SHR A_n, 1, B_i
SUB B_i, 2, B_i
LOOP_vsqrt: .trip 8
LDH.D1T1 *A_X++[2], A_xx0
LDH.D2T2 *B_X++[2], B_xx1
NORM.L1 A_xx0, A_e0
NORM.L2 B_xx1, B_e1
SSHVL.M1 A_xx0, A_e0, A_x0x
SSHVL.M2 B_xx1, B_e1, B_x1x
SUB.D1 A_e0, 16, A_e0
SUB.D2 B_e1, 16, B_e1
AND.D1 0x1, A_e0, C0
AND.D2 0x1, B_e1, C1
SHR.S1 A_e0, 1, A_e0
SHR.S2 B_e1, 1, B_e1
SUB.D1 A_x0x, A_xm, A_x0x
SUB.D2 B_x1x, B_xm, B_x1x
SHL.S1 A_x0x, 2, A_x0x
SHL.S2 B_x1x, 2, B_x1x
MPYHIR.M1 A_x0x, A_C4, A_y0c4
MPYHIR.M2 B_x1x, B_C4, B_y1c4
SADD.L1 A_y0c4, A_C3, A_x0c3
SADD.L2 B_y1c4, B_C3, B_x1c3
MPYHIR.M1 A_x0x, A_x0c3, A_y0c3
MPYHIR.M2 B_x1x, B_x1c3, B_y1c3
SADD.L1 A_y0c3, A_C2, A_x0c2
SADD.L2 B_y1c3, B_C2, B_x1c2
MPYHIR.M1 A_x0x, A_x0c2, A_y0c2
MPYHIR.M2 B_x1x, B_x1c2, B_y1c2
SADD.L1 A_y0c2, A_C1, A_x0c1
SADD.L2 B_y1c2, B_C1, B_x1c1
MPYHIR.M1 A_x0x, A_x0c1, A_y0c1
MPYHIR.M2 B_x1x, B_x1c1, B_y1c1
SADD.L1 A_y0c1, A_C0, A_x0c0
SADD.L2 B_y1c1, B_C0, B_x1c0
; A_S = B_S = 0x5A82799A ~= 0x5A820000 + 0x00008000
[C0] MPYHIR.M1 A_S, A_x0c0, A_y0h
[C1] MPYHIR.M2 B_S, B_x1c0, B_y1h
[C0] SHR A_x0c0, 16, A_y0l
[C1] SHR B_x1c0, 16, B_y1l
[C0] SADD.L1 A_y0h, A_y0l, A_x0c0
[C1] SADD.L2 B_y1h, B_y1l, B_x1c0
SHR.S1 A_x0c0, A_e0, A_y0s
SHR.S2 B_x1c0, B_e1, B_y1s
SADD.L1 A_y0s, A_rnd, A_y0
SADD.L2 B_y1s, B_rnd, B_y1
PACKH2.S2X B_y1, A_y0, B_y10
STW.D2T2 B_y10, *B_Y++
BDEC LOOP_vsqrt, B_i
.endproc
Fast MDCT/IMDCT Based on Forward FFT
/******** begin of mdct.h ******** */
#ifndef __MDCT_H
#define __MDCT_H
#include <fftw3.h>
#ifdef __cplusplus
extern "C" {
#endif
#ifdef SINGLE_PRECISION
typedef float FLOAT;
typedef fftwf_complex FFTW_COMPLEX;
typedef fftwf_plan FFTW_PLAN;
#else // DOUBLE_PRECISION
typedef double FLOAT;
typedef fftw_complex FFTW_COMPLEX;
typedef fftw_plan FFTW_PLAN;
#endif // SINGLE_PRECISION
typedef struct {
int M; // MDCT spectrum size (number of bins)
FLOAT* twiddle; // twiddle factor
FFTW_COMPLEX* fft_in; // fft workspace, input
FFTW_COMPLEX* fft_out; // fft workspace, output
FFTW_PLAN fft_plan; // fft configuration
} mdct_plan;
mdct_plan* mdct_init(int M); // MDCT spectrum size (number of bins)
void mdct_free(mdct_plan* m_plan);
void mdct(FLOAT* mdct_line, FLOAT* time_signal, mdct_plan* m_plan);
void imdct(FLOAT* time_signal, FLOAT* mdct_line, mdct_plan* m_plan);
#ifdef __cplusplus
}
#endif
#endif // __MDCT_H
/******** end of mdct.h ******** */
/******** begin of mdct.c ******** */
#ifdef SINGLE_PRECISION
#define FFTW_MALLOC fftwf_malloc
#define FFTW_FREE fftwf_free
#define FFTW_PLAN_1D fftwf_plan_dft_1d
#define FFTW_DESTROY fftwf_destroy_plan
#define FFTW_EXECUTE fftwf_execute
#else // DOUBLE_PRECISION
#define FFTW_MALLOC fftw_malloc
#define FFTW_FREE fftw_free
#define FFTW_PLAN_1D fftw_plan_dft_1d
#define FFTW_DESTROY fftw_destroy_plan
#define FFTW_EXECUTE fftw_execute
#endif // SINGLE_PRECISION
void mdct_free(mdct_plan* m_plan)
{
if(m_plan)
{
FFTW_DESTROY(m_plan->fft_plan);
FFTW_FREE(m_plan->fft_in);
FFTW_FREE(m_plan->fft_out);
if(m_plan->twiddle)
free(m_plan->twiddle);
free(m_plan);
}
}
#define MDCT_CLEAUP(msg, ...) \
{fprintf(stderr, msg", %s(), %s:%d \n", \
__VA_ARGS__, __func__, __FILE__, __LINE__); \
mdct_free(m_plan); return NULL;}
mdct_plan* mdct_init(int M)
{
int n;
FLOAT alpha, omega, scale;
mdct_plan* m_plan = NULL;
if(0x00 != (M & 0x01))
MDCT_CLEAUP(" Expect an even number of MDCT coeffs, but meet %d", M);
m_plan = (mdct_plan*) malloc(sizeof(mdct_plan));
if(NULL == m_plan)
MDCT_CLEAUP(" malloc error: %s", "m_plan");
memset(m_plan, 0, sizeof(m_plan[0]));
m_plan->M = M;
m_plan->twiddle = (FLOAT*) malloc(sizeof(FLOAT) * M);
if(NULL == m_plan->twiddle)
MDCT_CLEAUP(" malloc error: %s", "m_plan->twiddle");
alpha = M_PI / (8.f * M);
omega = M_PI / M;
scale = sqrt(sqrt(2.f / M));
for(n = 0; n < (M >> 1); n++)
{
m_plan->twiddle[2*n+0] = (FLOAT) (scale * cos(omega * n + alpha));
m_plan->twiddle[2*n+1] = (FLOAT) (scale * sin(omega * n + alpha));
}
m_plan->fft_in
= (FFTW_COMPLEX*) FFTW_MALLOC(sizeof(FFTW_COMPLEX) * M >> 1);
if(NULL == m_plan->fft_in)
MDCT_CLEAUP(" malloc error: %s", "m_plan->fft_in");
m_plan->fft_out
= (FFTW_COMPLEX*) FFTW_MALLOC(sizeof(FFTW_COMPLEX) * M >> 1);
if(NULL == m_plan->fft_out)
MDCT_CLEAUP(" malloc error: %s", "m_plan->fft_out");
m_plan->fft_plan = FFTW_PLAN_1D(M >> 1,
m_plan->fft_in,
m_plan->fft_out,
FFTW_FORWARD,
FFTW_MEASURE);
if(NULL == m_plan->fft_plan)
MDCT_CLEAUP(" malloc error: %s", "m_plan->fft_plan");
return m_plan;
}
void mdct(FLOAT* mdct_line, FLOAT* time_signal, mdct_plan* m_plan)
{
FLOAT *xr, *xi, r0, i0;
FLOAT *cos_tw, *sin_tw, c, s;
int M, M2, M32, M52, n;
M = m_plan->M;
M2 = M >> 1;
M32 = 3 * M2;
M52 = 5 * M2;
cos_tw = m_plan->twiddle;
sin_tw = cos_tw + 1;
/* odd/even folding and pre-twiddle */
xr = (FLOAT*) m_plan->fft_in;
xi = xr + 1;
for(n = 0; n < M2; n += 2)
{
r0 = time_signal[M32-1-n] + time_signal[M32+n];
i0 = time_signal[M2+n] - time_signal[M2-1-n];
c = cos_tw[n];
s = sin_tw[n];
xr[n] = r0 * c + i0 * s;
xi[n] = i0 * c - r0 * s;
}
for(; n < M; n += 2)
{
r0 = time_signal[M32-1-n] - time_signal[-M2+n];
i0 = time_signal[M2+n] + time_signal[M52-1-n];
c = cos_tw[n];
s = sin_tw[n];
xr[n] = r0 * c + i0 * s;
xi[n] = i0 * c - r0 * s;
}
/* complex FFT of size M/2 */
FFTW_EXECUTE(m_plan->fft_plan);
/* post-twiddle */
xr = (FLOAT*) m_plan->fft_out;
xi = xr + 1;
for(n = 0; n < M; n += 2)
{
r0 = xr[n];
i0 = xi[n];
c = cos_tw[n];
s = sin_tw[n];
mdct_line[n] = - r0 * c - i0 * s;
mdct_line[M-1-n] = - r0 * s + i0 * c;
}
}
void imdct(FLOAT* time_signal, FLOAT* mdct_line, mdct_plan* m_plan)
{
FLOAT *xr, *xi, r0, i0, r1, i1;
FLOAT *cos_tw, *sin_tw, c, s;
int M, M2, M32, M52, n;
M = m_plan->M;
M2 = M >> 1;
M32 = 3 * M2;
M52 = 5 * M2;
cos_tw = m_plan->twiddle;
sin_tw = cos_tw + 1;
/* pre-twiddle */
xr = (FLOAT*) m_plan->fft_in;
xi = xr + 1;
for(n = 0; n < M; n += 2)
{
r0 = mdct_line[n];
i0 = mdct_line[M-1-n];
c = cos_tw[n];
s = sin_tw[n];
xr[n] = -i0 * s - r0 * c;
xi[n] = -i0 * c + r0 * s;
}
/* complex FFT of size M/2 */
FFTW_EXECUTE(m_plan->fft_plan);
/* odd/even expanding and post-twiddle */
xr = (FLOAT*) m_plan->fft_out;
xi = xr + 1;
for(n = 0; n < M2; n += 2)
{
r0 = xr[n];
i0 = xi[n];
c = cos_tw[n];
s = sin_tw[n];
r1 = r0 * c + i0 * s;
i1 = r0 * s - i0 * c;
time_signal[M32-1-n] = r1;
time_signal[M32+n] = r1;
time_signal[M2+n] = i1;
time_signal[M2-1-n] = -i1;
}
for(; n < M; n += 2)
{
r0 = xr[n];
i0 = xi[n];
c = cos_tw[n];
s = sin_tw[n];
r1 = r0 * c + i0 * s;
i1 = r0 * s - i0 * c;
time_signal[M32-1-n] = r1;
time_signal[-M2+n] = -r1;
time_signal[M2+n] = i1;
time_signal[M52-1-n] = i1;
}
}
/******** end of mdct.c ******** */
/******** begin of mdct_test.c ******** */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#include <time.h>
#include "mdct.h"
int main(int argc, char* argv[])
{
int M, r, i;
FLOAT* time = NULL;
FLOAT* freq = NULL;
mdct_plan* m_plan = NULL;
char* precision = NULL;
struct timeval t0, t1;
long long elps;
if(3 != argc)
{
fprintf(stderr, " Usage: %s <MDCT_SPECTRUM_SIZE> <run_times> \n", argv[0]);
return -1;
}
sscanf(argv[1], "%d", &M);
sscanf(argv[2], "%d", &r);
if(NULL == (m_plan = mdct_init(M)))
return -1;
if(NULL == (time = (FLOAT*) malloc(2 * M * sizeof(FLOAT))))
return -1;
if(NULL == (freq = (FLOAT*) malloc(M * sizeof(FLOAT))))
return -1;
for(i = 0; i < 2 * M; i++)
time[i] = 2.f * rand() / RAND_MAX - 1.f;
for(i = 0; i < M; i++)
freq[i] = 2.f * rand() / RAND_MAX - 1.f;
precision = (sizeof(float) == sizeof(FLOAT))?
"single precision" : "double precision";
#if 1
gettimeofday(&t0, NULL);
for(i = 0; i < r; i++)
mdct(freq, time, m_plan);
gettimeofday(&t1, NULL);
elps = (t1.tv_sec - t0.tv_sec) * 1000000 + (t1.tv_usec - t0.tv_usec);
fprintf(stdout, "MDCT size of %d, %s, running %d times, average %.3f ms\n",
M, precision, r, (FLOAT) elps / r / 1000.f);
#endif // 0
#if 1
gettimeofday(&t0, NULL);
for(i = 0; i < r; i++)
imdct(time, freq, m_plan);
gettimeofday(&t1, NULL);
elps = (t1.tv_sec - t0.tv_sec) * 1000000 + (t1.tv_usec - t0.tv_usec);
fprintf(stdout, "IMDCT size of %d, %s, running %d times, average %.3f ms\n",
M, precision, r, (FLOAT) elps / r / 1000.f);
#endif //0
#if 0
for(i = 0; i < 2 * M; i++)
fprintf(stdout, "%f ", time[i]);
fprintf(stdout, "\n");
for(i = 0; i < M; i++)
fprintf(stdout, "%f ", freq[i]);
fprintf(stdout, "\n");
#endif // 0
free(time);
free(freq);
mdct_free(m_plan);
return 0;
}
/******** end of mdct_test.c ******** */