Yet Another FIR design algorithm

Markus Nentwig August 21, 20113 comments Coded in Matlab
% ********************************************
% least-mean-squares FIR design algorithm
% Markus Nentwig, 2010-2011
% release 2011/8/22
% ********************************************
function LMSFIR()
    close all;
    
    h1 = demo1('basic'); compareDemo1WithRemez(h1);    
    % h1 = demo1('basicLMS'); disp('demo: convergence failure is on purpose to show LMS solution');
    % demo1('allpass');
    % demo1('stopband');
    % demo1('equalize');
    % demo1('nominalResponse');
    % demo1('multipassband');
    % demo1('complex');
    % demo1('rootRaisedCosineUpsampler');
    % demo1('componentModel');
    % demo1('componentModel2');
end

function h = demo1(nameOfDemo)
    dpar = struct();
    % parameters for a basic FIR lowpass filter design.
    % kept in a struct(), so that individual examples
    % can easily change them.    
    
    % sampling rate at the input of the filter
    dpar.inRate_Hz = 104e6;

    % number of physical FIR taps 
    % in a polyphase decimator, the number of internal
    % coefficients will be fDecim * fStages
    dpar.mStages = 36; 
    
    % end of passband. A single value will be internally
    % expanded to [-9e6, 9e6]. 
    % Asymmetric designs require
    % the complexValued = true option.
    % This 'default' passband can be omitted entirely, if passbands
    % are declared individually later
    dpar.req_passbandF_Hz = 9e6;
    
    % defines the maximum allowed ripple in the passband.
    dpar.req_passbandRipple_dB = 0.083;
    
    % as alternative to ripple, the in-band error
    % vector magnitude (EVM) can be limited
    % dpar.req_passbandEVM_dB = -44;
    
    % the passband specification may use multiple points
    % dpar.req_passbandF_Hz = [-1, 5e6, 6e6, 9e6];
    % dpar.req_passbandEVM_dB = [-44, -44, -34, -34];
    
    % start of default stopband. 
    % as with the passband, the default stopband can be omitted, 
    % if individual bands are placed later.
    dpar.req_stopbandF_Hz = 14e6;  
    dpar.req_stopbandMaxGain_dB = -30;

    % dpar.req_stopbandF_Hz = [14e6, 34e6];  
    % dpar.req_stopbandGainMax_dB = [-30, -20];
    
    % ********************************************
    % create a filter design object "design"
    % * access with LMSFIR_stage2 functions
    % * evaluate with LMSFIR_stage3 function    
    % ********************************************    

    switch nameOfDemo
      case 'basic'
        % ********************************************
        % simple filter using the parameters above
        % ********************************************
        design = LMSFIR_stage1_setup(dpar);
      
      case 'basicLMS'
        % ********************************************
        % LMS design for comparison:
        % Iterations are disabled
        % ********************************************
        dpar.nIter = 1;

        % balance in-band / out-of-band performance as needed
        dpar.inbandWeight_initValue = 5;
        dpar.outOfBandWeight_initValue = 1;
        
        design = LMSFIR_stage1_setup(dpar);
        
      case 'allpass'
        % ********************************************
        % allpass design Offset the nominal delay by 1/3
        % of a sample, compared to the "basic" example
        % (compare the impulse responses)
        % ********************************************
        dpar.delayOffset = 1/3; % signal arrives now earlier
        design = LMSFIR_stage1_setup(dpar);
        
      case 'stopband'        
        % ********************************************
        % Filter with added stopbands
        % ********************************************
        % the following features require more taps
        dpar.mStages = 48;
        
        % create filter design object
        design = LMSFIR_stage1_setup(dpar);
        
        % place a stopband from 14 to 16 MHz with -50 dB
        design = LMSFIR_stage2_placeStopband(...
            design, ...
            'f_Hz', [14e6, 16e6], ...
            'g_dB', [-50, -50]);
        
        % place another stopband from 16 to 28 MHz with 
        % -50 dB, linearly relaxing to -40 dB
        design = LMSFIR_stage2_placeStopband(...
            design, ...
            'f_Hz', [16e6, 28e6], ...
            'g_dB', [-50, -40]);

      case 'equalize'
        % ********************************************
        % Equalize the frequency response of another 
        % filter in the passband(s)
        % ********************************************
        % As an equalizer, this is rather inefficient with so much 
        % unused bandwidth. Should operate at the smallest possible BW instead.
        dpar.mStages = 52; 
        
        [ffilter_Hz, H] = getExampleLaplaceDomainFilter();

        % set the frequency points...
        dpar.equalizeFreq_Hz = ffilter_Hz;

        % ... and the filter response. The design routine will
        % use linear interpolation, therefore provide a sufficiently
        % dense grid.
        % Equalizing an asymmetric response requires 
        % the complexValued=true option, leading to a complex-valued
        % FIR filter.
        % The equalization function needs to be normalized. 
        % Otherwise, pass- and stopband targets will be offset 
        % by the gain mismatch.
        dpar.equalizeComplexGain = H;

        % as alternative to the complex gain, a magnitude response
        % can be given via an equalizeGain_dB argument.
        % dpar.equalizeGain_dB = 20*log10(abs(H));        

        % an asymmetric (non-linear-phase) impulse response may 
        % require a group delay that is not centered in the
        % FIR length.
        dpar.delayOffset = 2;
        
        design = LMSFIR_stage1_setup(dpar);
      case 'componentModel'
        % ********************************************
        % Create a FIR filter that approximates the passband behavior of 
        % the analog filter accurately, and gives a similar stopband rejection
        % 
        % The most straightforward way to model an infinite-impulse-response
        % lowpass is to simply sample the impulse response. However, it needs to be
        % cut to size (since the FIR filter has only finite length)
        % => Chopping it off destroys the out-of-band performance (=rectangular window)
        % => use a window function that trades off between passband accuracy and 
        %    stopband rejection
        % => or use the design example below.
        % ********************************************
        dpar.mStages = 52; 
        
        [ffilter_Hz, H] = getExampleLaplaceDomainFilter();

        % set the frequency points...
        dpar.nominalFreq_Hz = ffilter_Hz;
        dpar.nominalComplexGain = H;
        
        dpar.req_stopbandF_Hz = [15e6, 30e6, 55e6];
        dpar.req_stopbandMaxGain_dB = [-38, -80, -115];

        dpar.req_passbandF_Hz = 9e6;        

        % offset the impulse response, it is not centered
        dpar.delayOffset = 18;

        design = LMSFIR_stage1_setup(dpar);
      
      case 'componentModel2'
        % ********************************************
        % an extension of "componentModel1"
        % stopband rejection does not matter, but we need
        % phase-accurate modeling on a region of the stopband edge
        % ********************************************
        dpar.mStages = 80; % this won't be cheap...
      
        [ffilter_Hz, H] = getExampleLaplaceDomainFilter();
        dpar.nominalFreq_Hz = ffilter_Hz;
        dpar.nominalComplexGain = H;
        
        dpar.req_stopbandF_Hz = [ 16e6, 100e6];
        dpar.req_stopbandMaxGain_dB = [ -30, -30];
        
        dpar.req_passbandF_Hz = 9e6;        
        
        % offset the impulse response, it is not centered
        dpar.delayOffset = dpar.mStages / 2 - 8;
        
        design = LMSFIR_stage1_setup(dpar);
            
        % place a passband in the area on the slope that is to be modeled accurately
        design = LMSFIR_stage2_placePassband(...
            design, ...
            'f_Hz', [12e6, 16e6], ...
            'EVM_dB', [-40, -50] - 30); % nominal gain -40..-50 dB, -30 dBc EVM
        
      case 'nominalResponse'
        % ********************************************
        % Design a filter to a given frequency response
        % ********************************************    
        dpar.mStages = 50;
        % the frequency response is approximated in any
        % declared passband, but must be valid for any 
        % frequency to allow plotting.

        dpar.nominalFreq_Hz = [0, 3e6, 9e6, 1e9];
        dpar.nominalGain_dB = [0, 0, -6, -6]; 
        
        % instead, nominalComplexGain can be used
        % g = [0, 0, -3, -3]; 
        % dpar.nominalComplexGain = 10 .^ (g/20);
        design = LMSFIR_stage1_setup(dpar);
        
      case 'multipassband'
        % ********************************************
        % Design a filter with three passbands
        % ********************************************    
        dpar.mStages = 50;
        dpar = rmfield(dpar, 'req_passbandF_Hz');
        dpar = rmfield(dpar, 'req_passbandRipple_dB');
        
        design = LMSFIR_stage1_setup(dpar);

        design = LMSFIR_stage2_placePassband(...
            design, ...
            'f_Hz', [-2e6, 2e6], ...
            'EVM_dB', -45);
        
        design = LMSFIR_stage2_placePassband(...
            design, ...
            'f_Hz', [3e6, 7e6], ...
            'EVM_dB', [-45, -40]);

        design = LMSFIR_stage2_placeStopband(...
            design, ...
            'f_Hz', [11.8e6, 12.4e6], ...
            'g_dB', -70);

      case 'complex'
        % ********************************************
        % Design a complex-valued filter
        % ********************************************    
        % this is also an example for what can go wrong: 
        % In the unconstrained section around -40 MHz, the
        % frequency response is allowed to go berserk. Which
        % it does.
        % Solution: Place a "soft" stopband (for example at -5 dB)
        % in the "don't-care" regions and add a couple of taps.
        
        % remove passband from default parameters
        dpar = rmfield(dpar, 'req_passbandF_Hz');
        dpar = rmfield(dpar, 'req_passbandRipple_dB');

        % remove stopband from default parameters
        dpar = rmfield(dpar, 'req_stopbandF_Hz');
        dpar = rmfield(dpar, 'req_stopbandMaxGain_dB');
        
        dpar.complexValued = true;        
        design = LMSFIR_stage1_setup(dpar);

        design = LMSFIR_stage2_placeStopband(...
            design, ...
            'f_Hz', [-30e6, -16e6], ...
            'g_dB', -50);

        design = LMSFIR_stage2_placePassband(...
            design, ...
            'f_Hz', [-8e6, -2e6], ...
            'EVM_dB', -45);
        
        design = LMSFIR_stage2_placeStopband(...
            design, ...
            'f_Hz', [3e6, 40e6], ...
            'g_dB', [-30, -50]);

      case 'rootRaisedCosineUpsampler'
        % ********************************************
        % root-raised cosine upsampling filter for WCDMA transmitter
        % The input chip stream arrives at 3.84 Msps, using the 
        % full bandwidth. 
        % Before the filter, it is upsampled (zero insertion) to 
        % 7.68 Msps. 
        % The filter applies RRC-filtering with 1.22 rolloff.
        % ********************************************
        % calculate nominal RRC response for lookup table / linear
        % interpolation
        f_Hz = logspace(1, 8, 10000); f_Hz(1) = -1;
        c = abs(f_Hz * 2 / 3.84e6);
        c = (c-1)/(0.22); % -1..1 in the transition region
        c=min(c, 1);
        c=max(c, -1);
        RRC_h = sqrt(1/2+cos(pi/2*(c+1))/2);
        
        % ********************************************
        % once the targets are achieved, use the remaining
        % 'degrees of freedom' for least-squares optimization.
        % The LMS solver will improve, where it is 'cheapest'
        % The parameters are not a real-world design 
        % (0.5 percent EVM => -46 dB)
        % ********************************************    
        ci = 0;
        % ci = 10; % for comparison: force equiripple 
        
        dpar = struct(...
            'inRate_Hz', 3.84e6, ...
            'fInterp', 2, ...
            'mStages', 45, ...
            'req_passbandF_Hz', 3.84e6 * 1.22 / 2, ...
            'req_passbandEVM_dB', -46, ...
            'req_stopbandF_Hz', 2.46e6, ...
            'req_stopbandMaxGain_dB', -50, ...
            'nominalFreq_Hz', f_Hz, ...
            'nominalGain_dB', 20*log10(RRC_h + 1e-19), ...
            'convergedIterations', ci);
        
        design = LMSFIR_stage1_setup(dpar);
        [h, status] = LMSFIR_stage3_run(design); 
        % save('hRRC_upsampler.txt', 'h', '-ascii');
        disp(status);        
        
      otherwise
        assert(false);
    end % switch nameOfDemo
    
    % ********************************************
    % Design the filter
    % ********************************************    
    % h is the impulse response (FIR tap coefficients).
    [h, status] = LMSFIR_stage3_run(design); 
    disp(status);    
end

function compareDemo1WithRemez(hLMS)
% identical target settings to demo1 "basic".
% note, the demo uses targets that are exactly "on the edge"
% what the algorithm can achieve. This results in an equiripple-
% design that can be compared with remez().
% If the targets are too loosely set, pass- and stopband quality
% start to "sag" in the band middle (LMS solution => lowest overall
% error, the optimizer improves where it's "cheapest").
    
    r_Hz = 104e6;
    m = 35; % definition differs by 1
    f = [0 9e6 14e6 r_Hz/2] / (r_Hz/2);
    a = [1 1 0 0];
    ripple_dB = 0.1;
    att_dB = 30;
    err1 = 1 - 10 ^ (-ripple_dB / 20);
    err2 = 10 ^ (-att_dB / 20);
    w = [1/err1 1/err2];

    % get remez design impulse response
    hRemez = remez(m, f, a, w);
    
    figure(); hold on;
    handle = plot(hLMS, 'b+'); set(handle, 'lineWidth', 3);
    plot(hRemez, 'k+'); set(handle, 'lineWidth', 2);
    legend('this algorithm', 'Remez');
    title('comparison with Remez design (optimum equiripple)');
end

% Gets the frequency response of an "analog" (Laplace-domain) filter.
% => Chebyshev response
% => 6th order
% => 1.2 dB ripple
% => cutoff frequency at 10 MHz
% returns
% f_Hz: list of frequencies
% H: complex-valued H(f_Hz)
function [f_Hz, H] = getExampleLaplaceDomainFilter()
    [IIR_b, IIR_a] = cheby1(6, 1.2, 1, 's');
    
    % evaluate it on a wide enough frequency range
    f_Hz = logspace(1, 10, 1000); f_Hz(1) = -1;
    
    % Laplace domain operator for normalized frequency
    fCutoff_Hz = 10e6;
    s = 1i * f_Hz / fCutoff_Hz;
    
    % polynomial in s
    H_num = polyval(IIR_b, s);
    H_denom = polyval(IIR_a, s);
    H = H_num ./ H_denom;
end

% === LMSFIR_xyz "API" functions ===

% ********************************************
% LMSFIR_stagex_... functions to interact with design 'object'
% to be executed in 'stage'-order
% ********************************************
function d = LMSFIR_stage1_setup(varargin)
    p = varargin2struct(varargin);
    d = struct();

    % number of frequency points. Increase to improve accuracy.
    % Frequencies are quantized to +/- rate / (2 * nSamples) 
    d.nSamples = 1024;
    
    % default polyphase interpolation: none 
    d.fInterp = 1;
    
    % default polyphase decimation: none
    d.fDecim = 1;

    % max. number of iterations
    d.nIter = 100; 

    % for pure LMS solution, set nIter to 1 and change the weights below as needed
    d.inbandWeight_initValue = 1;
    d.outOfBandWeight_initValue = 1;
    
    % abort when the iteration weights grow too large. 
    % This happens when targets are impossible.
    % The result may still be meaningful, though.
    d.abortWeight = 1e12;

    % keep iterating, if the targets are reached. 
    % Once the "equi"-ripple iteration has brought all peaks to an acceptable level, 
    % the LMS solver will use the remaining "degrees of freedom" for a LMS optimization.
    % The solver improves "where it is easy / cheap". This results in sloped
    % stopbands and "drooping" EVM in passbands.
    % Often, LMS is the best choice => set converged iterations to 0.
    d.convergedIterations = 10;

    % for a complex-valued filter, use "true".
    % With a real-valued design, user input is only evaluated for positive frequencies!
    d.complexValued = false;
    
    % by default, the basis waveforms given to the optimizer are
    % within a delay range of +/- half the FIR length.
    % For nonlinear phase types (equalization / nominal frequency 
    % response), this may be suboptimal.
    % Meaningful values shouldn't exceed +/- half the number of
    % coefficients in the impulse response.
    d.delayOffset = 0;

    % copy parameters
    fn = fieldnames(p);
    for ix = 1:size(fn, 1)
        key = fn{ix};
        d.(key) = p.(key);           
    end

    % frequency base over FFT range
    fb = 0:(d.nSamples - 1);
    fb = fb + floor(d.nSamples / 2);
    fb = mod(fb, d.nSamples);
    fb = fb - floor(d.nSamples / 2);
    fb = fb / d.nSamples; % now [0..0.5[, [-0.5..0[
    fb = fb * d.inRate_Hz * d.fInterp;
    d.fb = fb;
    
    % in real-valued mode, negative frequencies are treated as 
    % positive, when 'user input' is evaluated
    if d.complexValued
        d.fbUser = fb;
    else
        d.fbUser = abs(fb);
    end
    
    % ********************************************
    % target settings. Those will be modified by 
    % LMSFIR_stage2_xyz()
    % ********************************************
    % initial value of NaN indicates: all entries are unset
    d.errorSpecBinVal_inband_dB = zeros(size(d.fb)) + NaN;
    d.errorSpecBinVal_outOfBand_dB = zeros(size(d.fb)) + NaN;
    
    % ********************************************
    % process req_passband requirement
    % needs to be done at stage 1, because it is 
    % used for 'gating' with the tightenExisting / 
    % relaxExisting options
    % ********************************************
    if isfield(d, 'req_passbandF_Hz')  

        par = struct('onOverlap', 'error');
        if isfield(d, 'req_passbandRipple_dB')
            par.ripple_dB = d.req_passbandRipple_dB;
        end
        if isfield(d, 'req_passbandEVM_dB')
            par.EVM_dB = d.req_passbandEVM_dB;
        end
        par.f_Hz = d.req_passbandF_Hz;

        d = LMSFIR_stage2_placePassband(d, par);
    end % if req_passbandF_Hz

    % ********************************************
    % process req_stopband requirement
    % needs to be done at stage 1, because it is 
    % used for 'gating' with the tightenExisting / 
    % relaxExisting options
    % ********************************************
    if isfield(d, 'req_stopbandF_Hz')  
        f_Hz = d.req_stopbandF_Hz;
        g_dB = d.req_stopbandMaxGain_dB;
        
        % extend to infinity
        if isscalar(f_Hz)
            f_Hz = [f_Hz 9e19]; 
            g_dB = [g_dB g_dB(end)];
        end
        
        d = placeBand...
            (d, ...
             'f_Hz', f_Hz, 'g_dB', g_dB, ...
             'type', 'stopband', ...
             'onOverlap', 'tighten');
    end    
    
    % ********************************************
    % plot management
    % ********************************************
    d.nextPlotIx = 700;
end

function d = LMSFIR_stage2_placeStopband(d, varargin)
    p = varargin2struct(varargin);
    
    % shorthand notation g_dB = -30; f_Hz = 9e6;
    % extend fixed passband to positive infinity
    if isscalar(p.f_Hz)
        assert(p.f_Hz > 0);
        p.f_Hz = [p.f_Hz 9e99];
    end

    if isscalar(p.g_dB)
        p.g_dB = ones(size(p.f_Hz)) * p.g_dB;
    end
    
    % default action is to use the stricter requirement
    if ~isfield(p, 'onOverlap')
        p.onOverlap = 'tighten';
    end        
    
    d = placeBand(d, 'type', 'stopband', p);
end

function d = LMSFIR_stage2_placePassband(d, varargin)
    p = varargin2struct(varargin);
    
    % default action is to use the stricter requirement
    if ~isfield(p, 'onOverlap')
        p.onOverlap = 'tighten';
    end        
    
    % translate ripple spec to error
    if isfield(p, 'ripple_dB')
        assert(p.ripple_dB > 0);
        eSamplescale = 10 ^ (p.ripple_dB / 20) - 1;            
        EVM_dB = 20*log10(eSamplescale);
    end
    
    if isfield(p, 'EVM_dB')
        EVM_dB = p.EVM_dB;
    end

    % convert scalar to two-element vector
    if isscalar(EVM_dB)
        EVM_dB = [EVM_dB EVM_dB];
    end
    
    % *** handle f_Hz ***
    f_Hz = p.f_Hz;
    
    % extend to 0 Hz
    if isscalar(f_Hz)
        f_Hz = [0 f_Hz];
    end
    
    % *** create the passband ***
    d = placeBand(d, ...
                  'type', 'passband', ...
                  'f_Hz', f_Hz, ...
                  'g_dB', EVM_dB, ...
                  'onOverlap', p.onOverlap);
end

% ********************************************
% the filter design algorithm
% h: impulse response
% status: converged or not
% note that even if convergence was not reached, 
% the resulting impulse response is "the best we
% can do" and often meaningful.
% ********************************************
function [h, status] = LMSFIR_stage3_run(d)
    1;    
    
    % mTaps is number of physical FIR stages
    % m is number of polyphase coefficients
    d.m = d.mStages * d.fInterp;
    
    % masks flagging pass-/stopband frequencies
    mask_inband = NaN_to_0_else_1(d.errorSpecBinVal_inband_dB);
    mask_outOfBand = NaN_to_0_else_1(d.errorSpecBinVal_outOfBand_dB);
    
    % sanity check... (orthogonality of wanted and unwanted component)
    assert(sum(mask_inband) > 0, 'passband is empty');
    assert(sum(mask_inband .* mask_outOfBand) == 0, ...
           'passband and stopband overlap');
    
    % ********************************************
    % start with flat passband signals at input and output of filter
    % those will become the input to the LMS solver.
    % ********************************************
    sigSolverAtInput_fd = mask_inband;
    sigSolverAtOutput_fd = sigSolverAtInput_fd;

    % ********************************************
    % for even-sized FFT length, there is one bin at the
    % Nyquist limit that gives a [-1, 1, -1, 1] time domain
    % waveform. It has no counterpart with opposite frequency
    % sign and is therefore problematic (time domain component
    % cannot be delayed). 
    % Don't assign any input power here.
    % ********************************************
    if mod(d.nSamples, 2) == 0
        ixNyquistBin = floor(d.nSamples/2) + 1;
        sigSolverAtInput_fd(ixNyquistBin) = 0;
        sigSolverAtOutput_fd(ixNyquistBin) = 0;
    end
    
    if isfield(d, 'equalizeFreq_Hz')
        % ********************************************
        % Filter equalizes a given passband frequency response
        % ********************************************
        if isfield(d, 'equalizeGain_dB')
            cgain = 10 .^ (equalizeGain_dB / 20);
        else
            cgain = d.equalizeComplexGain;
        end
        d.Heq = evaluateFilter(d.fb, d.equalizeFreq_Hz, cgain, d.complexValued);
        assert(isempty(find(isnan(d.Heq), 1)), ...
               ['equalizer frequency response interpolation failed. ' ...
                'Please provide full range data for plotting, even if it does not ', ...
                'affect the design']);
        
        % ********************************************
        % apply frequency response to input signal. 
        % The LMS solver will invert this response
        % ********************************************
        sigSolverAtInput_fd = sigSolverAtInput_fd .* d.Heq;
    end

    if isfield(d, 'nominalFreq_Hz')
        % ********************************************
        % (equalized) filter matches a given passband frequency response
        % ********************************************
        if isfield(d, 'nominalGain_dB')
            cgain = 10 .^ (d.nominalGain_dB / 20);
        else
            cgain = d.nominalComplexGain;
        end
        
        d.Hnom = evaluateFilter(d.fb, d.nominalFreq_Hz, cgain, d.complexValued);
        assert(isempty(find(isnan(d.Hnom), 1)), ...
               ['nominal frequency response interpolation failed. ' ...
                'Please provide full range data for plotting, even if it does not ', ...
                'affect the design']);
        
        % ********************************************
        % apply frequency response to output signal. 
        % The LMS solver will adapt this response
        % ********************************************
        sigSolverAtOutput_fd = sigSolverAtOutput_fd .* d.Hnom;
    end
    
    % ********************************************
    % compensate constant group delay from equalizer and nominal
    % frequency response. This isn't optimal, but it is usually
    % a good starting point (use delayOffset parameter)
    % ********************************************
    [coeff, ref_shiftedAndScaled, deltaN] = fitSignal_FFT(...
        ifft(sigSolverAtInput_fd), ifft(sigSolverAtOutput_fd));

    % the above function also scales for best fit. This is not desired here, instead
    % let the LMS solver match the gain. Simply scale it back:
    ref_shifted = ref_shiftedAndScaled / coeff;
    sigSolverAtOutput_fd = fft(ref_shifted);
    
    if false
        % ********************************************
        % plot time domain waveforms (debug)
        % ********************************************
        figure(76); hold on;
        plot(fftshift(abs(ifft(sigSolverAtOutput_fd))), 'k');
        plot(fftshift(abs(ifft(sigSolverAtInput_fd))), 'b');
        title('time domain signals');
        legend('reference (shifted)', 'input signal');
    end
    
    % ********************************************
    % main loop of the design algorithm
    % => initialize weights
    % => loop
    % =>   design optimum LMS filter that transforms weighted input 
    %      into weighted output
    % =>   adapt weights 
    % => iterate
    % ********************************************
    % at this stage, the input to the algorithm is as follows:
    % => errorSpec for in-band and out-of-band frequencies
    %    (masks are redundant, can be derived from above)
    % => LMS_in_fd and
    % => LMS_out_fd: Signals that are given to the LMS solver.
    %   Its task is: "design a FIR filter that transforms LMS_in_fd into LMS_out_fd".
    
    % initialize weights
    outOfBandWeight = mask_outOfBand * d.outOfBandWeight_initValue;
    inbandWeight = mask_inband * d.inbandWeight_initValue;
    
    status = '? invalid ?';
    hConv = [];
    remConvIter = d.convergedIterations;
    for iter=1:d.nIter
        % inband weight is applied equally to both sides to shape the error
        % out-of-band weight is applied to the unwanted signal
        LMS_in_fd = sigSolverAtInput_fd .* inbandWeight...
            + mask_outOfBand .* outOfBandWeight; 
        
        LMS_out_fd = sigSolverAtOutput_fd .* inbandWeight;
        
        % ********************************************
        % cyclic time domain waveforms from complex spectrum
        % ********************************************
        LMS_in_td = ifft(LMS_in_fd); 
        LMS_out_td = ifft(LMS_out_fd);
        
        % ********************************************
        % construct FIR basis (output per coeffient)
        % time domain waveforms, shifted according to each FIR tap
        % ********************************************
        basis = zeros(d.m, d.nSamples);

        % introduce group delay target
        ix1 = -d.m/2+0.5 + d.delayOffset;
        ix2 = ix1 + d.m - 1;
        rowIx = 1;
        for ix = ix1:ix2 % index 1 appears at ix1                         
            basis(rowIx, :) = FFT_delay(LMS_in_td, ix);
            rowIx = rowIx + 1;
        end
        
        if d.complexValued
            rightHandSide_td = LMS_out_td;
        else
            % use real part only
            basis = [real(basis)];
            rightHandSide_td = [real(LMS_out_td)]; 

            pRp = real(rightHandSide_td) * real(rightHandSide_td)' + eps;
            pIp = imag(rightHandSide_td) * imag(rightHandSide_td)';
            assert(pIp / pRp < 1e-16, ...
                   ['got an imaginary part where there should be none. ', ...
                    'uncomment the following lines, if needed']);
            % if designing a real-valued equalizer for a complex-valued frequency response,
            % use the following to solve LMS over the average:
            % basis = [real(basis) imag(basis)];
            % rightHandSide_td = [real(LMS_out_td), imag(LMS_out_td)]; 
        end
        
        % ********************************************
        % LMS solver
        % find a set of coefficients that scale the 
        % waveforms in "basis", so that their sum matches
        % "rightHandSide_td" LMS-optimally
        % ********************************************
        pbasis = pinv(basis .');
        h = transpose(pbasis * rightHandSide_td .');
        
        % pad impulse response to n
        irIter = [h, zeros(1, d.nSamples-d.m)];

        % undo the nominal group delay
        irIter = FFT_delay(irIter, ix1);
        HIter = fft(irIter);
        
        % ********************************************
        % filter test signal
        % ********************************************
        eq_fd = sigSolverAtInput_fd .* HIter; 

        % ********************************************
        % subtract actual output from targeted output
        % results in error spectrum
        % ********************************************
        err_fd = sigSolverAtOutput_fd - eq_fd;
        err_fd = err_fd .* mask_inband; % only in-band matters
        EVM_dB = 20*log10(abs(err_fd)+1e-15);
        
        % ********************************************
        % out-of-band leakage
        % ********************************************
        leakage_dB = 20*log10(abs(HIter .* mask_outOfBand + 1e-15));

        % ********************************************
        % compare achieved and targeted performance 
        % ********************************************
        deltaLeakage_dB = leakage_dB - d.errorSpecBinVal_outOfBand_dB;        
        deltaEVM_dB = EVM_dB - d.errorSpecBinVal_inband_dB;
        
        % ********************************************
        % find bins where performance should be improved
        % or relaxed
        % ********************************************
        ixImprLeakage = find(deltaLeakage_dB > 0);
        ixImprEVM = find(deltaEVM_dB > 0);
        ixRelLeakage = find(deltaLeakage_dB < -3);
        ixRelEVM = find(deltaEVM_dB < -3);        
        
        status = 'iteration limit reached';
        if isempty(ixImprLeakage) && isempty(ixImprEVM)
            % both targets met. Convergence!
            if remConvIter > 0
                remConvIter = remConvIter - 1;
                status = 'converged once, now trying to improve';
                hConv = h;
            else
                status = 'converged';
                break;
            end
        end
        
        % ********************************************
        % improve / relax in-band and out-of-band
        % ********************************************
        if ~isempty(ixImprLeakage)
            % tighten out-of-band
            outOfBandWeight(ixImprLeakage) = outOfBandWeight(ixImprLeakage)...
                .* 10 .^ ((deltaLeakage_dB(ixImprLeakage) + 0.1) / 20);
        end       
        
        if ~isempty(ixRelLeakage)
            % relax out-of-band
            outOfBandWeight(ixRelLeakage) = outOfBandWeight(ixRelLeakage)...
                .* 10 .^ (deltaLeakage_dB(ixRelLeakage) / 3 / 20); 
        end
        
        if ~isempty(ixImprEVM)
            % tighten in-band
            inbandWeight(ixImprEVM) = inbandWeight(ixImprEVM)...
                .* 10 .^ ((deltaEVM_dB(ixImprEVM) + 0.01) / 20); 
        end
        if ~isempty(ixRelEVM)
            % relax in-band
            inbandWeight(ixRelEVM) = inbandWeight(ixRelEVM)...
                .* 10 .^ (deltaEVM_dB(ixRelEVM) / 2 / 20); 
        end
        
        if max([inbandWeight, outOfBandWeight] > d.abortWeight)
            status = 'weight vector is diverging';
            break;
        end
    end % for iter
    
    % ********************************************
    % recover from convergence failure after convergence
    % during improvement phase
    % ********************************************
    if ~strcmp(status, 'converged')
        if ~isempty(hConv)
            h = hConv;
            status = 'converged';
        end
    end
    
    if true
        % ********************************************
        % plot impulse response
        % ********************************************
        if d.complexValued
            figure(); hold on; 
            stem(real(h), 'k');
            stem(imag(h), 'b');
            legend('real(h)', 'imag(h)');
        else
            figure(); hold on; 
            stem(h);
            legend('h');
        end
        title('impulse response');
    end
    
    
    % ********************************************
    % plot frequency response
    % ********************************************
    inbandBins = find(mask_inband);
    outOfBandBins = find(mask_outOfBand);
    d=doPlotStart(d, ['Frequency response (Status:', status, ')']);
    d=doPlotH(d, HIter, 'b', '|H_{design}(f)|', 2);
    
    handle = plot(d.fb(outOfBandBins), d.errorSpecBinVal_outOfBand_dB(outOfBandBins), 'b+');
    set(handle, 'markersize', 2);
    d=addLegend(d, 'req. stopband');

    d = doPlot_dB(d, EVM_dB, 'r', 'error');
    
    handle = plot(d.fb(inbandBins), d.errorSpecBinVal_inband_dB(inbandBins), 'r+');
    set(handle, 'markersize', 2);
    d=addLegend(d, 'req. passband error');
    
    d=doPlotEnd(d);
    ylim([-100, 10]);
    
    if false
        % ********************************************
        % plot constraint signal and weights
        % ********************************************
        figure(31); grid on; hold on;
        handle = plot(fftshift(d.fb), fftshift(20*log10(mask_outOfBand))); 
        set(handle, 'lineWidth', 3);
        
        x = d.fb; y = 20*log10(inbandWeight / max(inbandWeight));
        handle = plot(x(inbandBins), y(inbandBins), 'k+'); set(handle, 'lineWidth', 3);
        
        x = d.fb;
        y = 20*log10(outOfBandWeight / max(outOfBandWeight));
        handle = plot(x(outOfBandBins), y(outOfBandBins), 'b+'); set(handle, 'lineWidth', 3); 
        
        xlabel('f/Hz'); ylabel('dB');
        ylim([-80, 40]);
        legend('constraint signal', 'in-band weight', 'out-of-band weight');
        title('weighting factor (normalized to 0 dB)');
    end
    
    hasEq = isfield(d, 'Heq');
    hasNom = isfield(d, 'Hnom');
    if hasEq || hasNom
        % ********************************************
        % plot equalization / nominal target
        % ********************************************
        d=doPlotStart(d, 'equalization / nominal target');
        d=doPlotH(d, HIter, 'b', '|H_{design}(f)|', 2);
        if hasEq
            d=doPlotH(d, d.Heq, 'k', '|H_{eq}(f)| to equalize (invert)');
            eqR = HIter .* d.Heq;
            d=doPlotH(d, eqR, 'c', '|H_{design}(f)H_{eq}(f)|', 2);
            handle = plot(d.fb(inbandBins), ...
                          20*log10(abs(eqR(inbandBins)) + 1e-15), 'c*');
            set(handle, 'markersize', 3);
            d=addLegend(d, '|H_{eq}(in-band)');
        end
        if hasNom
            d = doPlotH(d, d.Hnom, 'g', '|H_{nom}|', 2);
            handle = plot(d.fb(inbandBins), ...
                          20*log10(abs(HIter(inbandBins)) + 1e-15), 'b*');
            set(handle, 'markersize', 3);
            d=addLegend(d, '|H_{design}(f)H_{eq}(f) in-band');
        end
        d=doPlotEnd(d);
        % set y-range
        ymax = 20*log10(max(abs(HIter)));
        ylim([-50, ymax+3]);
    end
end

% === LMSFIR helper functions ===

% evaluates frequency response f_dB; g_Hz at fb
% the return value will contain NaN for out-of-range entries
% in fb
function binVal = buildBinVal(varargin)
    p = varargin2struct(varargin);
    
    f_Hz = p.f_Hz;
    g_dB = p.g_dB;

    % shorthand notation f = [f1, f2]; g = -30;
    if isscalar(g_dB)
        g_dB = ones(size(f_Hz)) * g_dB;
    end
    
    % tolerate sloppy two-argument definition
    if size(f_Hz, 2) == 2 && f_Hz(1) > f_Hz(2)
        f_Hz = fliplr(f_Hz);
        g_dB = fliplr(g_dB);
    end
    
    binVal = interp1(f_Hz, g_dB, p.fbUser, 'linear');
end

function d = placeBand(d, varargin)
    p = varargin2struct(varargin);
    
    % create requirements vector
    binVal = buildBinVal('f_Hz', p.f_Hz, ...
                         'g_dB', p.g_dB, ...
                         'fbUser', d.fbUser);
    
    % look up requirements vector from design object
    switch p.type
      case 'passband'
        fn = 'errorSpecBinVal_inband_dB';
      case 'stopband'
        fn = 'errorSpecBinVal_outOfBand_dB';
      otherwise
        assert(false);
    end
    designObject_binVal = d.(fn);

    % check overlap
    if strcmp(p.onOverlap, 'error')
        m1 = NaN_to_0_else_1(designObject_binVal);
        m2 = NaN_to_0_else_1(binVal);
        assert(isempty(find(m1 .* m2, 1)), ...
               ['newly declared band overlaps existing band, '...
                'which was explicitly forbidden by onOverlap=error']);
        p.onOverlap = 'tighten'; % there won't be overlap,
                                 % merging is dummy operation
    end
    
    % merging rules
    switch p.onOverlap
      case 'tighten'
        logicOp = 'or';
        valueOp = 'min';
      case 'relax'
        logicOp = 'or';
        valueOp = 'max';
      case 'tightenExisting'
        logicOp = 'and';
        valueOp = 'min';
      case 'relaxExisting'
        logicOp = 'and';
        valueOp = 'max';
      otherwise
        assert(false);
    end
    
    % merge requirements tables
    binValMerged = mergeBinVal(...
        'binVal1', designObject_binVal, ...
        'binVal2', binVal, ...
        'logicalOperator', logicOp, ...
        'valueOperator', valueOp);

    % assign new requirements table
    d.(fn) = binValMerged;    
end

function r = NaN_to_0_else_1(vec)
    r = zeros(size(vec));
    % logical indexing, instead of r(find(~isnan(vec))) = 1;
    r(~isnan(vec)) = 1; 
end

function binVal = mergeBinVal(varargin)
    p = varargin2struct(varargin);
    
    % region where first argument is defined
    mask1 = NaN_to_0_else_1(p.binVal1);

    % region where second argument is defined    
    mask2 = NaN_to_0_else_1(p.binVal2);

    % region where result will be defined    
    switch(p.logicalOperator)
      case 'or'
        mask = mask1 + mask2;
      case 'and'
        mask = mask1 .* mask2;
      otherwise
        assert(false);
    end 
    ix = find(mask);

    % merge into result
    binVal = zeros(size(p.binVal1)) + NaN;
    switch(p.valueOperator)
      case 'min'
        % note: The function min/max ignore NaNs (see "min" man 
        % page in Matlab)
        % if one entry is NaN, the other entry will be returned
        binVal(ix) = min(p.binVal1(ix), p.binVal2(ix));
      case 'max'
        binVal(ix) = max(p.binVal1(ix), p.binVal2(ix));
      otherwise
        assert(false);
    end
end        

% evaluates [f / gain] filter specification on the frequency grid
function H = evaluateFilter(f_eval, modelF, modelH, complexValued)

    oneSided = false;
    if ~complexValued
        oneSided = true;
    else
        if min(modelF) > min(f_eval)
            disp(['Warning: Filter model does not contain (enough) negative frequencies. ', ...
                  'assuming symmetric H(f) / real-valued h(t)']);
            oneSided = true;
        end
    end

    if oneSided
        f_evalOrig = f_eval;
        f_eval = abs(f_eval);
    end
    
    H = interp1(modelF, modelH, f_eval, 'linear');

    if oneSided
        % enforce symmetry (=> real-valued impulse response)
        logicalIndex = (f_evalOrig < 0);
        H(logicalIndex) = conj(H(logicalIndex));
    end
end

function [d, handle] = doPlotH(d, H, spec, legEntry, linewidth)
    handle = plot(fftshift(d.fb), fftshift(20*log10(abs(H)+1e-15)), spec);
    d = addLegend(d, legEntry);
    if exist('linewidth', 'var')
        set(handle, 'lineWidth', linewidth);
    end
end

function [d, handle] = doPlot_dB(d, H, spec, legEntry, linewidth)
    handle = plot(fftshift(d.fb), fftshift(H), spec);
    d.legList{size(d.legList, 2) + 1} = legEntry;
    if exist('linewidth', 'var')
        set(handle, 'lineWidth', linewidth);
    end
end

function d = doPlotStart(d, plotTitle)
    figure(d.nextPlotIx); 
    title(plotTitle);
    grid on; hold on;
    d.nextPlotIx = d.nextPlotIx + 1;
    d.legList = {};
end

function d = doPlotEnd(d)
    legend(d.legList);
    xlabel('f/Hz');
    ylabel('dB');
end

function d = addLegend(d, legEntry)
    d.legList{size(d.legList, 2) + 1} = legEntry;
end

% === general-purpose library functions ===

% handling of function arguments
% someFun('one', 1, 'two', 2, 'three', 3) => struct('one', 1, 'two', 2, 'three', 3)
% a struct() may appear in place of a key ('one') and gets merged into the output.
function r = varargin2struct(arg)
    assert(iscell(arg));
    
    switch(size(arg, 2))
      case 0 % varargin was empty
        r=struct();    
      case 1 % single argument, wrapped by varargin into a cell list 
        r=arg{1}; % unwrap
        assert(isstruct(r));
      otherwise
        r=struct();
        % iterate through cell elements
        ix=1;
        ixMax=size(arg, 2);
        while ix <= ixMax
            e=arg{ix};
            if ischar(e)
                % string => key/value. The next field is a value
                ix = ix + 1;
                v = arg{ix};
                r.(e) = v;
            elseif isstruct(e)
                names = fieldnames(e);
                assert(size(names, 2)==1); % column
                for ix2 = 1:size(names, 1) 
                    k = names{ix2};
                    v = e.(k);
                    r.(k) = v;
                end
            else 
                disp('invalid token in vararg handling. Expecting key or struct. Got:');
                disp(e);
                assert(false)
            end
            ix=ix+1;
        end % while
    end % switch
end

function sig = FFT_delay(sig, nDelay)
    sig = fft(sig); % to frequency domain
    nSigSamples = size(sig, 2);
    binFreq=(mod(((0:nSigSamples-1)+floor(nSigSamples/2)), nSigSamples)-floor(nSigSamples/2));
    phase = -2*pi*nDelay / nSigSamples .* binFreq;
    rot = exp(1i*phase);
    if mod(nSigSamples, 2)==0
        % even length - bin at Nyquist limit
        rot(nSigSamples/2+1)=cos(phase(nSigSamples/2+1));
    end
    sig = sig .* rot;
    sig = ifft(sig); % to time domain
end

% *******************************************************
% delay-matching between two signals (complex/real-valued)
%
% * matches the continuous-time equivalent waveforms
%   of the signal vectors (reconstruction at Nyquist limit =>
%   ideal lowpass filter)
% * Signals are considered cyclic. Use arbitrary-length 
%   zero-padding to turn a one-shot signal into a cyclic one.
%
% * output:
%   => coeff: complex scaling factor that scales 'ref' into 'signal'
%   => delay 'deltaN' in units of samples (subsample resolution)
%      apply both to minimize the least-square residual      
%   => 'shiftedRef': a shifted and scaled version of 'ref' that 
%      matches 'signal' 
%   => (signal - shiftedRef) gives the residual (vector error)
%
% Example application
% - with a full-duplex soundcard, transmit an arbitrary cyclic test signal 'ref'
% - record 'signal' at the same time
% - extract one arbitrary cycle
% - run fitSignal
% - deltaN gives the delay between both with subsample precision
% - 'shiftedRef' is the reference signal fractionally resampled 
%   and scaled to optimally match 'signal'
% - to resample 'signal' instead, exchange the input arguments
% *******************************************************
function [coeff, shiftedRef, deltaN] = fitSignal_FFT(signal, ref)
    n=length(signal);
    % xyz_FD: Frequency Domain
    % xyz_TD: Time Domain
    % all references to 'time' and 'frequency' are for illustration only

    forceReal = isreal(signal) && isreal(ref);
    
    % *******************************************************
    % Calculate the frequency that corresponds to each FFT bin
    % [-0.5..0.5[
    % *******************************************************
    binFreq=(mod(((0:n-1)+floor(n/2)), n)-floor(n/2))/n;

    % *******************************************************
    % Delay calculation starts:
    % Convert to frequency domain...
    % *******************************************************
    sig_FD = fft(signal);
    ref_FD = fft(ref, n);

    % *******************************************************
    % ... calculate crosscorrelation between 
    % signal and reference...
    % *******************************************************
    u=sig_FD .* conj(ref_FD);
    if mod(n, 2) == 0
        % for an even sized FFT the center bin represents a signal
        % [-1 1 -1 1 ...] (subject to interpretation). It cannot be delayed. 
        % The frequency component is therefore excluded from the calculation.
        u(length(u)/2+1)=0;
    end
    Xcor=abs(ifft(u));

    %  figure(); plot(abs(Xcor));
    
    % *******************************************************
    % Each bin in Xcor corresponds to a given delay in samples.
    % The bin with the highest absolute value corresponds to
    % the delay where maximum correlation occurs.
    % *******************************************************
    integerDelay = find(Xcor==max(Xcor));
    
    % (1): in case there are several bitwise identical peaks, use the first one
    % Minus one: Delay 0 appears in bin 1
    integerDelay=integerDelay(1)-1;

    % Fourier transform of a pulse shifted by one sample
    rotN = exp(2i*pi*integerDelay .* binFreq);

    uDelayPhase = -2*pi*binFreq;
    
    % *******************************************************
    % Since the signal was multiplied with the conjugate of the
    % reference, the phase is rotated back to 0 degrees in case
    % of no delay. Delay appears as linear increase in phase, but
    % it has discontinuities.
    % Use the known phase (with +/- 1/2 sample accuracy) to 
    % rotate back the phase. This removes the discontinuities.
    % *******************************************************
    %  figure(); plot(angle(u)); title('phase before rotation');
    u=u .* rotN;
    
    % figure(); plot(angle(u)); title('phase after rotation');
    
    % *******************************************************
    % Obtain the delay using linear least mean squares fit
    % The phase is weighted according to the amplitude.
    % This suppresses the error caused by frequencies with
    % little power, that may have radically different phase.
    % *******************************************************
    weight = abs(u); 
    constRotPhase = 1 .* weight;
    uDelayPhase = uDelayPhase .* weight;
    ang = angle(u) .* weight;
    r = [constRotPhase; uDelayPhase] .' \ ang.'; %linear mean square
    
    %rotPhase=r(1); % constant phase rotation, not used.
    % the same will be obtained via the phase of 'coeff' further down
    fractionalDelay=r(2);
    
    % *******************************************************
    % Finally, the total delay is the sum of integer part and
    % fractional part.
    % *******************************************************
    deltaN = integerDelay + fractionalDelay;

    % *******************************************************
    % provide shifted and scaled 'ref' signal
    % *******************************************************
    % this is effectively time-convolution with a unit pulse shifted by deltaN
    rotN = exp(-2i*pi*deltaN .* binFreq);
    ref_FD = ref_FD .* rotN;
    shiftedRef = ifft(ref_FD);
    
    % *******************************************************
    % Again, crosscorrelation with the now time-aligned signal
    % *******************************************************
    coeff=sum(signal .* conj(shiftedRef)) / sum(shiftedRef .* conj(shiftedRef));
    shiftedRef=shiftedRef * coeff;

    if forceReal
        shiftedRef = real(shiftedRef);
    end
end

Remez (FIR design) weights from requirements

Markus Nentwig August 19, 20111 comment Coded in Matlab
% *********************************************
% Weighting factors for Remez equiripple FIR design
% M. Nentwig
% *********************************************
close all; clear all;

f = [0:9]/10; % normalized frequency 0..1

% a = nominal |H(f)| on a linear scale (sample amplitude)
% 1 1     : passband from frequency 0 to 0.1 
% 0 0     : stopband from frequency 0.2 to 0.3 
% 0.5 0.5 : passband with -6 dB (0.5 amplitude) from frequency 0.4 to 0.5 
% 0 0     : stopband from frequency 0.6 to 0.7
% 1 1     : passband from frequency 0.8 to 0.9
% other frequency ranges are "don't care" areas.
a = [1 1 0 0 0.5 0.5 0 0 1 1];

% *********************************************
% design specification for each band
% *********************************************
ripple1_dB = 0.3;
att2_dB = 60;
ripple3_dB = 0.2;
att4_dB = 70;
ripple5_dB = 0.1;

% *********************************************
% error for each band, on a linear scale
% *********************************************
% note: passband 3 has a nominal gain of 0.5 in 'a'.
% the allowed error of |H(f)| scales accordingly.
err1 = 1 - 10 ^ (-ripple1_dB / 20);
err2 = 10 ^ (-att2_dB / 20);
err3 = (1 - 10 ^ (-ripple3_dB / 20)) * 0.5; 
err4 = 10 ^ (-att4_dB / 20);
err5 = 1 - 10 ^ (-ripple5_dB / 20);

% the weight of each band is chosen as the inverse of the targeted error
% (stricter design target => higher weight).
% we could normalize each entry in w to (err1+err2+err3+err4+err5)
% but it would appear as common factor in all entries and therefore make no difference.
w = [1/err1 1/err2 1/err3 1/err4 1/err5];

% filter order
% this design problem is 'tweaked' so that the resulting filter is (quite) exactly on target
% for the given n, which can be changed only in steps of 1.
% Note that increasing order by 1 can make the filter worse due to even / odd 
% number of points (impulse response symmetry)
n = 52;

% *********************************************
% Run Remez / Parks McClellan filter design
% *********************************************
h = remez(n, f, a, w);
% force row vector (OctaveForge and Matlab's "remez" differ)
h = reshape(h, 1, prod(size(h))); 
% *********************************************
% Plot the results
% *********************************************
figure(1); hold on;

% zero-pad the impulse response to set frequency resolution
% of the FFT
h = [h, zeros(1, 1000)];

% frequency base
npts = size(h, 2);
fbase = (0:npts-1)/npts; 

plot(fbase, 20*log10(abs((fft(h)+1e-15))), 'b');
xlim([0, 0.5]);

% *********************************************
% sketch the requirements
% *********************************************
e = [1 1];
plot([f(1), f(2)]/2, e * ripple1_dB, 'k');
plot([f(1), f(2)]/2, e * -ripple1_dB, 'k');
plot([f(3), f(4)]/2, e * -att2_dB, 'k');
plot([f(5), f(6)]/2, e * ripple3_dB - 6.02, 'k');
plot([f(5), f(6)]/2, e * -ripple3_dB - 6.02, 'k');
plot([f(7), f(8)]/2, e * -att4_dB, 'k');
plot([f(9), f(10)]/2, e * ripple5_dB, 'k');
plot([f(9), f(10)]/2, e * -ripple5_dB, 'k');
xlabel('normalized frequency 0..1');
ylabel('dB');

Resampling by Lagrange-polynomial interpolation

Markus Nentwig August 17, 2011 Coded in Matlab
% Lagrange interpolation for resampling
% References:
% [1] A digital signal processing approach to Interpolation
%     Ronald W. Schafer and Lawrence R. Rabiner
%     Proc. IEEE vol 61, pp.692-702, June 1973
% [2] https://ccrma.stanford.edu/~jos/Interpolation/Lagrange_Interpolation.html
% [3] Splitting the Unit delay: Tools for fractional delay filter design
%     T. I. Laakso, V. Valimaki, M. Karjalainen, and U. K. Laine
%     IEEE Signal Processing Magazine, vol. 13, no. 1, pp. 30..60, January 1996
function lagrangeResamplingDemo()
    originDefinition = 0; % see comment for LagrangeBasisPolynomial() below
   
    % Regarding order: From [1]
    % "Indeed, it is easy to show that whenever Q is odd, none of the
    % impulse responses corresponding to Lagrange interpolation can have
    % linear phase"
    % Here, order = Q+1 => odd orders are preferable
    order = 3;
   
    % *******************************************************************
    % Set up signals
    % *******************************************************************    
    nIn = order + 1;
    nOut = nIn * 5 * 20;

    % Reference data: from [1] fig. 8, linear-phase type
    ref = [-0.032, -0.056, -0.064, -0.048, 0, ...
           0.216, 0.448, 0.672, 0.864, 1, ...
           0.864, 0.672, 0.448, 0.216, 0, ...
           -0.048, -0.064, -0.056, -0.032, 0];
    tRef = (1:size(ref, 2)) / size(ref, 2);
   
    % impulse input to obtain impulse response
    inData = zeros(1, nIn);
    inData(1) = 1;
    outData = zeros(1, nOut);
   
    outTime = 0:(nOut-1);
    outTimeAtInput = outTime / nOut * nIn;
    outTimeAtInputInteger = floor(outTimeAtInput);
    outTimeAtInputFractional = outTimeAtInput - outTimeAtInputInteger;
    evalFracTime = outTimeAtInputFractional - 0.5 + originDefinition;
 
    % odd-order modification
    if mod(order, 2) == 0
        % Continuity of the impulse response is achieved when support points are located at
        % the intersections between adjacent segments "at +/- 0.5"
        % For an even order polynomial (odd number of points), this is only possible with 
        % an asymmetric impulse response
       
        offset = 0.5;
        %offset = -0.5; % alternatively, its mirror image
    else
        offset = 0;
    end
   
    % *******************************************************************
    % Apply Lagrange interpolation to input data
    % *******************************************************************    
    for ixTap = 0:order
        % ixInSample is the input sample that appears at FIR tap 'ixTap' to contribute 
        % to the output sample
        % Row vector, for all output samples in parallel
        ixInSample = mod(outTimeAtInputInteger + ixTap - order, nIn) + 1;

        % the value of said input sample, for all output samples in parallel
        d = inData(ixInSample);

        % Get Lagrange polynomial coefficients of this tap
        c = lagrangeBasisPolynomial(order, ixTap, originDefinition + offset);

        % Evaluate the Lagrange polynomial, resulting in the time-varying FIR tap weight
        cTap = polyval(c, evalFracTime);

        % FIR operation: multiply FIR tap weight with input sample and add to 
        % output sample (all outputs in parallel)
        outData = outData + d .* cTap;
    end

    % *******************************************************************
    % Plot
    % *******************************************************************    
    figure(); hold on;
    h = plot((0:nOut-1) / nOut, outData, 'b-'); set(h, 'lineWidth', 3);
    stem(tRef, ref, 'r'); set(h, 'lineWidth', 3);
    legend('impulse response', 'reference result');
    title('Lagrange interpolation, impulse response');
end

% Returns the coefficients of a Lagrange basis polynomial
% 1 <= order: Polynomial order
% 0 <= evalIx <= order: index of basis function.
%
% At the set of support points, the basis polynomials evaluate as follows:
% evalIx = 1: [1 0 0 ...]
% evalIx = 2: [0 1 0 ...]
% evalIx = 3: [0 0 1 ...]
%
% The support point are equally spaced.
% Example, using originDefinition=0:
% order = 1: [-0.5 0.5]
% order = 2: [-1 0 1]
% order = 3: [-1.5 -0.5 0.5 1.5]
%
% The area around the mid-point corresponds to -0.5 <= x <= 0.5.
% If a resampler implementation uses by convention 0 <= x <= 1 instead, set 
% originDefinition=0.5 to offset
% the polynomial.
function polyCoeff = lagrangeBasisPolynomial(order, evalIx, originDefinition)
    assert(evalIx >= 0);
    assert(evalIx <= order);
   
    tapLocations = -0.5*(order) + (0:order) + originDefinition;

    polyCoeff = [1];
    for loopIx = 0:order
        if loopIx ~= evalIx        
            % numerator: places a zero in the polynomial via (x-xTap(k)), with k != evalIx
            % denominator: scales to 1 amplitude at x=xTap(evalIx)
            term = [1 -tapLocations(loopIx+1)] / (tapLocations(evalIx+1)-tapLocations(loopIx+1));

            % multiply polynomials => convolve coefficients
            polyCoeff = conv(polyCoeff, term);
        end
    end

    % TEST:

    % The Lagrange polynomial evaluates to 1 at the location of the tap
    % corresponding to evalIx
    thisTapLocation = tapLocations(evalIx+1);
    pEval = polyval(polyCoeff, thisTapLocation);
    assert(max(abs(pEval) - 1) < 1e6*eps);
    
    % The Lagrange polynomial evaluates to 0 at all other tap locations
    tapLocations(evalIx+1) = [];
    pEval = polyval(polyCoeff, tapLocations);
    assert(max(abs(pEval)) < 1e6*eps);
end

Signal fitting with subsample resolution

Markus Nentwig August 16, 20112 comments Coded in Matlab
% *******************************************************
% delay-matching between two signals (complex/real-valued)
% M. Nentwig
%
% * matches the continuous-time equivalent waveforms
%   of the signal vectors (reconstruction at Nyquist limit =>
%   ideal lowpass filter)
% * Signals are considered cyclic. Use arbitrary-length 
%   zero-padding to turn a one-shot signal into a cyclic one.
%
% * output:
%   => coeff: complex scaling factor that scales 'ref' into 'signal'
%   => delay 'deltaN' in units of samples (subsample resolution)
%      apply both to minimize the least-square residual      
%   => 'shiftedRef': a shifted and scaled version of 'ref' that 
%      matches 'signal' 
%   => (signal - shiftedRef) gives the residual (vector error)
%
% *******************************************************
function [coeff, shiftedRef, deltaN] = fitSignal_FFT(signal, ref)
    n=length(signal);
    % xyz_FD: Frequency Domain
    % xyz_TD: Time Domain
    % all references to 'time' and 'frequency' are for illustration only

    forceReal = isreal(signal) && isreal(ref);
    
    % *******************************************************
    % Calculate the frequency that corresponds to each FFT bin
    % [-0.5..0.5[
    % *******************************************************
    binFreq=(mod(((0:n-1)+floor(n/2)), n)-floor(n/2))/n;

    % *******************************************************
    % Delay calculation starts:
    % Convert to frequency domain...
    % *******************************************************
    sig_FD = fft(signal);
    ref_FD = fft(ref, n);

    % *******************************************************
    % ... calculate crosscorrelation between 
    % signal and reference...
    % *******************************************************
    u=sig_FD .* conj(ref_FD);
    if mod(n, 2) == 0
        % for an even sized FFT the center bin represents a signal
        % [-1 1 -1 1 ...] (subject to interpretation). It cannot be delayed. 
        % The frequency component is therefore excluded from the calculation.
        u(length(u)/2+1)=0;
    end
    Xcor=abs(ifft(u));

    %  figure(); plot(abs(Xcor));
    
    % *******************************************************
    % Each bin in Xcor corresponds to a given delay in samples.
    % The bin with the highest absolute value corresponds to
    % the delay where maximum correlation occurs.
    % *******************************************************
    integerDelay = find(Xcor==max(Xcor));
    
    % (1): in case there are several bitwise identical peaks, use the first one
    % Minus one: Delay 0 appears in bin 1
    integerDelay=integerDelay(1)-1;

    % Fourier transform of a pulse shifted by one sample
    rotN = exp(2i*pi*integerDelay .* binFreq);

    uDelayPhase = -2*pi*binFreq;
    
    % *******************************************************
    % Since the signal was multiplied with the conjugate of the
    % reference, the phase is rotated back to 0 degrees in case
    % of no delay. Delay appears as linear increase in phase, but
    % it has discontinuities.
    % Use the known phase (with +/- 1/2 sample accuracy) to 
    % rotate back the phase. This removes the discontinuities.
    % *******************************************************
    %  figure(); plot(angle(u)); title('phase before rotation');
    u=u .* rotN;
    
    % figure(); plot(angle(u)); title('phase after rotation');
    
    % *******************************************************
    % Obtain the delay using linear least mean squares fit
    % The phase is weighted according to the amplitude.
    % This suppresses the error caused by frequencies with
    % little power, that may have radically different phase.
    % *******************************************************
    weight = abs(u); 
    constRotPhase = 1 .* weight;
    uDelayPhase = uDelayPhase .* weight;
    ang = angle(u) .* weight;
    r = [constRotPhase; uDelayPhase] .' \ ang.'; %linear mean square
    
    %rotPhase=r(1); % constant phase rotation, not used.
    % the same will be obtained via the phase of 'coeff' further down
    fractionalDelay=r(2);
    
    % *******************************************************
    % Finally, the total delay is the sum of integer part and
    % fractional part.
    % *******************************************************
    deltaN = integerDelay + fractionalDelay;

    % *******************************************************
    % provide shifted and scaled 'ref' signal
    % *******************************************************
    % this is effectively time-convolution with a unit pulse shifted by deltaN
    rotN = exp(-2i*pi*deltaN .* binFreq);
    ref_FD = ref_FD .* rotN;
    shiftedRef = ifft(ref_FD);
    
    % *******************************************************
    % Again, crosscorrelation with the now time-aligned signal
    % *******************************************************
    coeff=sum(signal .* conj(shiftedRef)) / sum(shiftedRef .* conj(shiftedRef));
    shiftedRef=shiftedRef * coeff;

    if forceReal
        shiftedRef = real(shiftedRef);
    end
end

Resampling on arbitrary grid (vectorized)

Markus Nentwig August 14, 20111 comment Coded in Matlab
% **************************************************************
% Efficient resampling of a cyclic signal on an arbitrary grid
% M. Nentwig, 2011
% => calculates Fourier series coefficients
% => Evaluates the series on an arbitrary grid
% => takes energy exactly on the Nyquist limit into account (even-
%    size special case)
% => The required matrix calculation is vectorized, but conceptually
%    much more CPU-intensive than an IFFT reconstruction on a regular grid
% **************************************************************
close all; clear all;

% **************************************************************
% example signals
% **************************************************************
sig = [1 2 3 4 5 6 7 8 7 6 5 4 3 2 1 0];
%sig = repmat([1 -1], 1, 8); % highlights the special case at the Nyquist limit
%sig = rand(1, 16);

nIn = size(sig, 2);

% **************************************************************
% example resampling grid
% **************************************************************
evalGrid = rand(1, 500); 
evalGrid = cumsum(evalGrid);
evalGrid = evalGrid / max(evalGrid) * nIn;
nOut = size(evalGrid, 2);

% **************************************************************
% determine Fourier series coefficients of signal
% **************************************************************
fCoeff = fft(sig);
nCoeff = 0;
if mod(nIn, 2) == 0
    % **************************************************************
    % special case for even-sized length: There is ambiguity, whether
    % the bin at the Nyquist limit corresponds to a positive or negative
    % frequency. Both give a -1, 1, -1, 1, ... waveform on the
    % regular sampling grid.
    % This coefficient will be treated separately. Effectively, it will
    % be interpreted as being half positive, half negative frequency.
    % **************************************************************
    bin = floor(nIn / 2) + 1;
    nCoeff = fCoeff(bin);
    fCoeff(bin) = 0; % remove it from the matrix-based evaluation
end

% **************************************************************
% indices for Fourier series
% since evaluation does not take place on a regular grid, 
% one needs to distinguish between negative and positive frequencies
% **************************************************************
o = floor(nIn/2);
k = 0:nIn-1;
k = mod(k + o, nIn) - o;

% **************************************************************
% m(yi, xi) = exp(2i * pi * evalGrid(xi) * k(yi))
% each column of m evaluates the series for one requested output location
% **************************************************************
m = exp(2i * pi * transpose(repmat(transpose(evalGrid / nIn), 1, nIn) ...
                            * diag(k))) / nIn;
% each output point is the dot product between the Fourier series 
% coefficients and the column in m that corresponds to the location of
% the output point
out = fCoeff * m;
out = out + nCoeff * cos(pi*evalGrid) / nIn;

% **************************************************************
% plot
% **************************************************************
out = real(out); % chop off roundoff error for plotting
figure(); grid on; hold on; 
h = stem((0:nIn-1), sig, 'k'); set(h, 'lineWidth', 3);
h = plot(evalGrid, out, '+'); set(h, 'markersize', 2);
legend('input', 'output');
title('resampling of cyclic signal on arbitrary grid');

Farrow resampler implementation

Markus Nentwig August 13, 20111 comment Coded in C
/* ****************************************************
 * Farrow resampler (with optional bank switching)
 * M. Nentwig, 2011
 * Input stream is taken from stdin
 * Output stream goes to stdout
 * Main target was readability, is not optimized for efficiency.
 * ****************************************************/
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <assert.h>

#if 1
/* **************************************************************
 * example coefficients. 
 * Each column [c0; c1; c2; ...] describes a polynomial for one tap coefficent in fractional time ft [0, 1]:
 * tapCoeff = c0 + c1 * ft + c2 * ft ^ 2 + ...
 * Each column corresponds to one tap. 
 * The example filters is based on a 6th order Chebyshev Laplace-domain prototype.
 * This version uses three sub-segments per tap (NBANKS = 3)
 * **************************************************************/
const double cMatrix[] = {
  2.87810386e-4, 4.70096244e-3, 7.93412570e-2, 4.39824536e-1, 1.31192924e+000, 2.67892232e+000, 4.16465421e+000, 5.16499621e+000, 5.15592605e+000, 3.99000369e+000, 2.00785470e+000, -7.42377060e-2, -1.52569354e+000, -1.94402804e+000, -1.40915797e+000, -3.86484652e-1, 5.44712939e-1, 9.77559688e-1, 8.32191447e-1, 3.22691788e-1, -2.13133045e-1, -5.08501962e-1, -4.82928807e-1, -2.36313854e-1, 4.76034568e-2, 2.16891966e-1, 2.20894063e-1, 1.08361553e-1, -2.63421832e-2, -1.06276015e-1, -1.07491548e-1, -5.53793711e-2, 4.86314061e-3, 3.94357182e-2, 4.06217506e-2, 2.17199064e-2, 1.60318761e-3, -8.40370106e-3, -8.10525279e-3, -3.62112499e-3, -4.13413072e-4, 2.33101911e-4, 
  -3.26760325e-3, -6.46028234e-3, 1.46793247e-1, 5.90235537e-1, 1.18931309e+000, 1.57853546e+000, 1.40402774e+000, 5.76506323e-1, -6.33522788e-1, -1.74564700e+000, -2.24153717e+000, -1.91309453e+000, -9.55568978e-1, 1.58239169e-1, 9.36193787e-1, 1.10969783e+000, 7.33284446e-1, 1.06542194e-1, -4.15412084e-1, -6.06616434e-1, -4.54898908e-1, -1.20841199e-1, 1.82941623e-1, 3.12543429e-1, 2.49935829e-1, 8.05376898e-2, -7.83213666e-2, -1.47769751e-1, -1.18735248e-1, -3.70656555e-2, 3.72608374e-2, 6.71425397e-2, 5.17812605e-2, 1.55564930e-2, -1.40896327e-2, -2.35058137e-2, -1.59635057e-2, -3.44701792e-3, 4.14108065e-3, 4.56234829e-3, 1.59503132e-3, -3.17301882e-4,
  5.64310141e-3, 7.74786707e-2, 2.11791763e-1, 2.84703201e-1, 1.85158633e-1, -8.41118142e-2, -3.98497442e-1, -5.86821615e-1, -5.40397941e-1, -2.47558080e-1, 1.50864737e-1, 4.59312895e-1, 5.41539400e-1, 3.84673917e-1, 9.39576331e-2, -1.74932542e-1, -3.01635463e-1, -2.56239225e-1, -9.87146864e-2, 6.82216764e-2, 1.59795852e-1, 1.48668245e-1, 6.62563431e-2, -2.71234898e-2, -8.07045577e-2, -7.76841351e-2, -3.55333136e-2, 1.23206602e-2, 3.88535040e-2, 3.64199073e-2, 1.54608563e-2, -6.59814558e-3, -1.72735099e-2, -1.46307777e-2, -5.04363288e-3, 3.31049461e-3, 6.01267607e-3, 3.83904192e-3, 3.92549958e-4, -1.36315264e-3, -9.76017430e-4, 7.46699178e-5};
#define NTAPS (14)
#define NBANKS (3)
#define ORDER (2)
#else
/* Alternative example
 * Similar impulse response as above
 * A conventional Farrow design (no subdivisions => NBANKS = 1), order 3
 */ 
const double cMatrix[] = {
  -8.57738278e-3, 7.82989032e-1, 7.19303539e+000, 6.90955718e+000, -2.62377450e+000, -6.85327127e-1, 1.44681608e+000, -8.79147907e-1, 7.82633997e-2, 1.91318985e-1, -1.88573400e-1, 6.91790782e-2, 3.07723786e-3, -6.74800912e-3,
  2.32448021e-1, 2.52624309e+000, 7.67543936e+000, -8.83951796e+000, -5.49838636e+000, 6.07298348e+000, -2.16053205e+000, -7.59142947e-1, 1.41269409e+000, -8.17735712e-1, 1.98119464e-1, 9.15904145e-2, -9.18092030e-2, 2.74136108e-2,
  -1.14183319e+000, 6.86126458e+000, -6.86015957e+000, -6.35135894e+000, 1.10745051e+001, -3.34847578e+000, -2.22405694e+000, 3.14374725e+000, -1.68249886e+000, 2.54083065e-1, 3.22275037e-1, -3.04794927e-1, 1.29393976e-1, -3.32026332e-2,
  1.67363115e+000, -2.93090391e+000, -1.13549165e+000, 5.65274939e+000, -3.60291782e+000, -6.20715544e-1, 2.06619782e+000, -1.42159644e+000, 3.75075865e-1, 1.88433333e-1, -2.64135123e-1, 1.47117661e-1, -4.71871047e-2, 1.24921920e-2};
#define NTAPS (14)
#define NBANKS (1)
#define ORDER (3)
#endif

/* Set here the ratio between output and input sample rate.
 * It could be changed even during runtime! */
#define INCR (1.0 / 6.28 / 3)

/* delay line storage */
double delayLine[NTAPS];

/* Coefficient lookup "table" */
static double c(int ixTap, int ixBank, int ixPower){
  return cMatrix[ixPower * (NTAPS * NBANKS) + ixTap * NBANKS + ixBank];
}

int main (void){

  /* clear delay line */
  int ix; 
  for (ix = 0; ix < NTAPS; ++ix)
    delayLine[ix] = 0;
  
  /* Index of last input sample that was read 
   * First valid sample starts at 0 */
  int sampleIndexInput = -1;
  
  /* Position of next output sample in input stream */
  double sampleIndexOutput = 0.0;

  /* loop forever. Terminate, once out of input data. */
  while (1){
    /* Split output sample location into integer and fractional part:
     * Integer part corresponds to sample index in input stream
     * fractional part [0, 1[ spans one tap (covering NBANKS segments) */
    int sio_int = floor(sampleIndexOutput);
    double sio_fracInTap = sampleIndexOutput - (double)sio_int;
    assert((sio_fracInTap >= 0) && (sio_fracInTap < 1));
    
    /* Further split the fractional part into 
     * - bank index
     * - fractional part within the bank */
    int sio_intBank = floor(sio_fracInTap * (double) NBANKS);
    double sio_fracInBank = sio_fracInTap * (double) NBANKS - (double)sio_intBank;
    assert((sio_fracInBank >= 0) && (sio_fracInBank < 1));
    
    /* ****************************************************
     * load new samples into the delay line, until the last 
     * processed input sample (sampleIndexInput) catches
     * up with the integer part of the output stream position (sio_int)
     * ***************************************************/
    while (sampleIndexInput < sio_int){
      /* Advance the delay line one step */
      ++sampleIndexInput;
      for (ix = NTAPS-1; ix > 0; --ix)
	delayLine[ix] = delayLine[ix-1];      
      
      /* Read one input sample */    
      int flag = scanf("%lf", &delayLine[0]);
      if (flag != 1) 
	goto done; /* there's nothing wrong with "goto" as "break" for multiple loops ... */
      
    } /* while delay line behind output data */

    /* ****************************************************
     * Calculate one output sample:
     * "out" sums up the contribution of each tap
     * ***************************************************/
    double out = 0;
    int ixTap; 

    for (ixTap = 0; ixTap < NTAPS; ++ixTap){
      /* ****************************************************
       * Contribution of tap "ixTap" to output: 
       * ***************************************************/
      /* Evaluate polynomial in sio_fracInBank:
       * c(ixTap, sio_intBank, 0) is the constant coefficient 
       * c(ixTap, sio_intBank, 1) is the linear coefficient etc
       */
      double hornerSum = c(ixTap, sio_intBank, ORDER);
      int ixPower;
      for (ixPower = ORDER-1; ixPower >= 0; --ixPower){
	hornerSum *= sio_fracInBank;
	hornerSum += c(ixTap, sio_intBank, ixPower);
      } /* for ixPower */

      /* ****************************************************
       * Weigh the delay line sample of this tap with the 
       * polynomial result and add to output
       * ***************************************************/
      out += hornerSum * delayLine[ixTap];
    } /* for ixTap */
    
    /* ****************************************************
     * Generate output sample and advance the position of
     * the next output sample in the input data stream 
     * ***************************************************/
    printf("%1.12le\n", out);
    sampleIndexOutput += INCR;
  } /* loop until out of input data */
  
 done: /* out of input data => break loops and continue here */
  return 0;
} /* main */

Bank-switched Farrow resampler

Markus Nentwig August 13, 2011 Coded in Matlab
% **************************************************************
% bank-switched Farrow resampler
% M. Nentwig, 2011
% Note: Uses cyclic signals (wraps around)
% **************************************************************
close all; clear all;

% inData contains input to the resampling process (instead of function arguments)
inData = struct();

% **************************************************************
% example coefficients. 
% Each column [c0; c1; c2; ...] describes a polynomial for one tap coefficent in fractional time ft [0, 1]:
% tapCoeff = c0 + c1 * ft + c2 * ft ^ 2 + ...
% Each column corresponds to one tap. 
% the matrix size may be changed arbitrarily.
% 
% The example filter is based on a 6th order Chebyshev Laplace-domain prototype.
% **************************************************************
if false
    % for comparison, this is a conventional design (no bank switching)
    inData.cMatrix =[ -8.57738278e-3 7.82989032e-1 7.19303539e+000 6.90955718e+000 -2.62377450e+000 -6.85327127e-1 1.44681608e+000 -8.79147907e-1 7.82633997e-2 1.91318985e-1 -1.88573400e-1 6.91790782e-2 3.07723786e-3 -6.74800912e-3
2.32448021e-1 2.52624309e+000 7.67543936e+000 -8.83951796e+000 -5.49838636e+000 6.07298348e+000 -2.16053205e+000 -7.59142947e-1 1.41269409e+000 -8.17735712e-1 1.98119464e-1 9.15904145e-2 -9.18092030e-2 2.74136108e-2
-1.14183319e+000 6.86126458e+000 -6.86015957e+000 -6.35135894e+000 1.10745051e+001 -3.34847578e+000 -2.22405694e+000 3.14374725e+000 -1.68249886e+000 2.54083065e-1 3.22275037e-1 -3.04794927e-1 1.29393976e-1 -3.32026332e-2
1.67363115e+000 -2.93090391e+000 -1.13549165e+000 5.65274939e+000 -3.60291782e+000 -6.20715544e-1 2.06619782e+000 -1.42159644e+000 3.75075865e-1 1.88433333e-1 -2.64135123e-1 1.47117661e-1 -4.71871047e-2 1.24921920e-2] / 231.46 * 20;
    inData.nBanks = 1;
else
    % same example filter as above, but now the matrix contains three alternative coefficient banks for each tap. 
    % The order was reduced from cubic to quadratic.
    % column 1: first bank, tap 1
    % column 2: second bank, tap 1
    % column 3: third bank, tap 1
    % column 4: first bank, tap 2
    % and so on    
    inData.cMatrix =[  2.87810386e-4 4.70096244e-3 7.93412570e-2 4.39824536e-1 1.31192924e+000 2.67892232e+000 4.16465421e+000 5.16499621e+000 5.15592605e+000 3.99000369e+000 2.00785470e+000 -7.42377060e-2 -1.52569354e+000 -1.94402804e+000 -1.40915797e+000 -3.86484652e-1 5.44712939e-1 9.77559688e-1 8.32191447e-1 3.22691788e-1 -2.13133045e-1 -5.08501962e-1 -4.82928807e-1 -2.36313854e-1 4.76034568e-2 2.16891966e-1 2.20894063e-1 1.08361553e-1 -2.63421832e-2 -1.06276015e-1 -1.07491548e-1 -5.53793711e-2 4.86314061e-3 3.94357182e-2 4.06217506e-2 2.17199064e-2 1.60318761e-3 -8.40370106e-3 -8.10525279e-3 -3.62112499e-3 -4.13413072e-4 2.33101911e-4
 -3.26760325e-3 -6.46028234e-3 1.46793247e-1 5.90235537e-1 1.18931309e+000 1.57853546e+000 1.40402774e+000 5.76506323e-1 -6.33522788e-1 -1.74564700e+000 -2.24153717e+000 -1.91309453e+000 -9.55568978e-1 1.58239169e-1 9.36193787e-1 1.10969783e+000 7.33284446e-1 1.06542194e-1 -4.15412084e-1 -6.06616434e-1 -4.54898908e-1 -1.20841199e-1 1.82941623e-1 3.12543429e-1 2.49935829e-1 8.05376898e-2 -7.83213666e-2 -1.47769751e-1 -1.18735248e-1 -3.70656555e-2 3.72608374e-2 6.71425397e-2 5.17812605e-2 1.55564930e-2 -1.40896327e-2 -2.35058137e-2 -1.59635057e-2 -3.44701792e-3 4.14108065e-3 4.56234829e-3 1.59503132e-3 -3.17301882e-4
 5.64310141e-3 7.74786707e-2 2.11791763e-1 2.84703201e-1 1.85158633e-1 -8.41118142e-2 -3.98497442e-1 -5.86821615e-1 -5.40397941e-1 -2.47558080e-1 1.50864737e-1 4.59312895e-1 5.41539400e-1 3.84673917e-1 9.39576331e-2 -1.74932542e-1 -3.01635463e-1 -2.56239225e-1 -9.87146864e-2 6.82216764e-2 1.59795852e-1 1.48668245e-1 6.62563431e-2 -2.71234898e-2 -8.07045577e-2 -7.76841351e-2 -3.55333136e-2 1.23206602e-2 3.88535040e-2 3.64199073e-2 1.54608563e-2 -6.59814558e-3 -1.72735099e-2 -1.46307777e-2 -5.04363288e-3 3.31049461e-3 6.01267607e-3 3.83904192e-3 3.92549958e-4 -1.36315264e-3 -9.76017430e-4 7.46699178e-5] / 133.64 * 20;
    inData.nBanks = 3;
end          

% **************************************************************
% Create example signal
% **************************************************************
nIn = 50; % used for test signal generation only
if false
    % complex signal
    inData.signal = cos(2*pi*(0:(nIn-1)) / nIn) + cos(2*2*pi*(0:(nIn-1)) / nIn) + 1.5;
else
    % impulse response
    inData.signal = zeros(1, nIn); inData.signal(1) = 1; % 

    % inData.cMatrix = 0 * inData.cMatrix; inData.cMatrix(1, 1) = 1; % enable to show constant c in first tap, first bank
    % inData.cMatrix = 0 * inData.cMatrix; inData.cMatrix(2, 1) = 1; % enable to show linear c in first tap, first bank
    % inData.cMatrix = 0 * inData.cMatrix; inData.cMatrix(3, 1) = 1; % enable to show quadratic c in first tap, first bank
    % inData.cMatrix = 0 * inData.cMatrix; inData.cMatrix(1, 2) = 1; % enable to show constant c in first tap, second bank
end

% **************************************************************
% Resample to the following number of output samples
% must be integer, otherwise arbitrary
% **************************************************************
inData.nSamplesOut = floor(nIn * 3 * 6.28);

% **************************************************************
% Set up Farrow resampling
% **************************************************************
nSamplesIn = size(inData.signal, 2);
nSamplesOut = inData.nSamplesOut;
order = size(inData.cMatrix, 1) - 1; % polynomial order

% number of input samples that contribute to one output sample (FIR size)
nTaps = size(inData.cMatrix, 2); 
assert(mod(nTaps, inData.nBanks) == 0);
nTaps = nTaps / inData.nBanks; % only one out of nBanks coefficients contributes at any time

% pointer to the position in the input stream for each output sample (row vector, real numbers), starting at 0
inputIndex = (0:nSamplesOut-1) / nSamplesOut * nSamplesIn;

% split into integer part (0..nSamplesIn - 1) ...
inputIndexIntegerPart = floor(inputIndex);
% ... and fractional part [0, 1[
inputIndexFractionalPart = inputIndex - inputIndexIntegerPart;

% bank switching
% the fractional part is again split into an integer and fractional part
inputIndexFractionalPart = inputIndexFractionalPart * inData.nBanks;
inputIndexFractionalPart_int = floor(inputIndexFractionalPart); % coefficient bank index
inputIndexFractionalPart_frac = inputIndexFractionalPart - inputIndexFractionalPart_int; % fractional time 0..1 within each bank

% **************************************************************
% Calculate output stream
% First constant term (conventional FIR), then linear, quadratic, cubic, ... 
% **************************************************************
outStream = zeros(1, inData.nSamplesOut);
for ixOrder = 0 : order
    % note: fractional time is now defined for each sub-segment [0, 1[
    x = inputIndexFractionalPart_frac .^ ixOrder;
    
    % **************************************************************
    % Add the contribution of each tap one-by-one
    % **************************************************************
    for ixTap = 0 : nTaps - 1
        % coefficient bank switching: There are inData.nBanks alternative coefficients for each tap
        c = inData.cMatrix(ixOrder+1, ixTap * inData.nBanks + inputIndexFractionalPart_int + 1);

        % index of input sample that contributes to output via the current tap
        % higher tap index => longer delay => older input sample => smaller data index
        dataIx = inputIndexIntegerPart - ixTap;
        % wrap around
        dataIx = mod(dataIx, nSamplesIn);
        % array indexing starts at 1
        dataIx = dataIx + 1;
        delayed = inData.signal(dataIx);
        % for each individual output sample (index in row vector), 
        % - evaluate f = c(order, tapindex) * fracPart .^ order 
        % - scale the delayed input sample with f
        % - accumulate the contribution of all taps
        % this implementation performs the operation for all output samples in parallel (row vector)
        outStream = outStream + c .* delayed .* x;
    end % for ixTap
end % for ixOrder

% **************************************************************
% plot
% **************************************************************
xIn = linspace(0, 1, nSamplesIn + 1); xIn = xIn(1:end-1);
xOut = linspace(0, 1, nSamplesOut + 1); xOut = xOut(1:end-1);

figure(); grid on; hold on;
stem(xIn, inData.signal, 'k+-');
plot(xOut, outStream, 'b+-');
legend('input', 'output');
title('bank-switched Farrow resampling. Signals are cyclic.');

Farrow resampler

Markus Nentwig August 12, 20114 comments Coded in Matlab
% **************************************************************
% Vectorized Farrow resampler
% M. Nentwig, 2011
% Note: Uses cyclic signals (wraps around)
% **************************************************************
close all; clear all;

% inData contains input to the resampling process (instead of function arguments)
inData = struct();

% **************************************************************
% example coefficients. 
% Each column [c0; c1; c2; c3] describes a polynomial for one tap coefficent in fractional time ft [0, 1]:
% tapCoeff = c0 + c1 * ft + c2 * ft ^ 2 + c3 * ft ^ 3
% Each column corresponds to one tap. 
% the matrix size may be changed arbitrarily.
% 
% The example filter is based on a 6th order Chebyshev Laplace-domain prototype.
% **************************************************************
inData.cMatrix =[ -8.57738278e-3 7.82989032e-1 7.19303539e+000 6.90955718e+000 -2.62377450e+000 -6.85327127e-1 1.44681608e+000 -8.79147907e-1 7.82633997e-2 1.91318985e-1 -1.88573400e-1 6.91790782e-2 3.07723786e-3 -6.74800912e-3
 2.32448021e-1 2.52624309e+000 7.67543936e+000 -8.83951796e+000 -5.49838636e+000 6.07298348e+000 -2.16053205e+000 -7.59142947e-1 1.41269409e+000 -8.17735712e-1 1.98119464e-1 9.15904145e-2 -9.18092030e-2 2.74136108e-2
 -1.14183319e+000 6.86126458e+000 -6.86015957e+000 -6.35135894e+000 1.10745051e+001 -3.34847578e+000 -2.22405694e+000 3.14374725e+000 -1.68249886e+000 2.54083065e-1 3.22275037e-1 -3.04794927e-1 1.29393976e-1 -3.32026332e-2
 1.67363115e+000 -2.93090391e+000 -1.13549165e+000 5.65274939e+000 -3.60291782e+000 -6.20715544e-1 2.06619782e+000 -1.42159644e+000 3.75075865e-1 1.88433333e-1 -2.64135123e-1 1.47117661e-1 -4.71871047e-2 1.24921920e-2
] / 12.28;

% **************************************************************
% Create example signal
% **************************************************************
nIn = 50; % used for test signal generation only
if true
    % complex signal
    inData.signal = cos(2*pi*(0:(nIn-1)) / nIn) + cos(2*2*pi*(0:(nIn-1)) / nIn) + 1.5;
else
    % impulse response
    inData.signal = zeros(1, nIn); inData.signal(1) = 1; % 

    % inData.cMatrix = 0 * inData.cMatrix; inData.cMatrix(1, 1) = 1; % enable to show constant c in first tap
    % inData.cMatrix = 0 * inData.cMatrix; inData.cMatrix(2, 1) = 1; % enable to show linear c in first tap
    % inData.cMatrix = 0 * inData.cMatrix; inData.cMatrix(3, 1) = 1; % enable to show quadratic c in first tap
    % inData.cMatrix = 0 * inData.cMatrix; inData.cMatrix(1, 2) = 1; % enable to show constant c in second tap
end

% **************************************************************
% Resample to the following number of output samples
% must be integer, otherwise arbitrary
% **************************************************************
inData.nSamplesOut = floor(nIn * 6.28);

% **************************************************************
% Set up Farrow resampling
% **************************************************************
nSamplesIn = size(inData.signal, 2);
nSamplesOut = inData.nSamplesOut;
order = size(inData.cMatrix, 1) - 1; % polynomial order
nTaps = size(inData.cMatrix, 2); % number of input samples that contribute to one output sample (FIR size)

% pointer to the position in the input stream for each output sample (row vector, real numbers), starting at 0
inputIndex = (0:nSamplesOut-1) / nSamplesOut * nSamplesIn;

% split into integer part (0..nSamplesIn - 1) ...
inputIndexIntegerPart = floor(inputIndex);
% ... and fractional part [0, 1[
inputIndexFractionalPart = inputIndex - inputIndexIntegerPart;

% **************************************************************
% Calculate output stream
% First constant term (conventional FIR), then linear, quadratic, cubic, ... 
% **************************************************************
outStream = zeros(1, inData.nSamplesOut);
for ixOrder = 0 : order
    x = inputIndexFractionalPart .^ ixOrder;
    
    % **************************************************************
    % Add the contribution of each tap one-by-one
    % **************************************************************
    for ixTap = 0 : nTaps - 1
        c = inData.cMatrix(ixOrder+1, ixTap+1);
        % index of input sample that contributes to output via the current tap
        % higher tap index => longer delay => older input sample => smaller data index
        dataIx = inputIndexIntegerPart - ixTap;
        % wrap around
        dataIx = mod(dataIx, nSamplesIn);
        % array indexing starts at 1
        dataIx = dataIx + 1;
        delayed = inData.signal(dataIx);
        % for each individual output sample (index in row vector), 
        % - evaluate f = c(order, tapindex) * fracPart .^ order 
        % - scale the delayed input sample with f
        % - accumulate the contribution of all taps
        % this implementation performs the operation for all output samples in parallel (row vector)
        outStream = outStream + c * delayed .* x;
    end % for ixTap
end % for ixOrder

% **************************************************************
% plot
% **************************************************************
xIn = linspace(0, 1, nSamplesIn + 1); xIn = xIn(1:end-1);
xOut = linspace(0, 1, nSamplesOut + 1); xOut = xOut(1:end-1);

figure(); grid on; hold on;
stem(xIn, inData.signal, 'k+-');
plot(xOut, outStream, 'b+-');
legend('input', 'output');
title('Farrow resampling. Signals are cyclic');

Continuous-time equivalent of a discrete-time impulse response

Markus Nentwig August 12, 2011 Coded in Matlab
% ************************************************************
% * Construct a continuous-time impulse response based on a discrete-time filter
% ************************************************************
close all; clear all;

% ************************************************************
% * example filter
% * f = [0 0.7 0.8 1]; a = [1 1 0 0];
% * b = remez(45, f, a);
% * h = b .';
% Illustrates rather clearly the difficulty of "interpolating" (in a geometric sense, 
% via polynomials, splines etc) between impulse response samples
% ************************************************************    
h = [2.64186706e-003 2.50796828e-003 -4.25450455e-003 4.82982358e-003 -2.51252769e-003 -2.52706568e-003 7.73569146e-003 -9.30398382e-003 4.65100223e-003 5.17152459e-003 -1.49409856e-002 1.76001904e-002 -8.65545521e-003 -9.78478603e-003 2.82103205e-002 -3.36173643e-002 1.68597129e-002 2.01914744e-002 -6.17486493e-002 8.13362871e-002 -4.80981494e-002 -8.05143565e-002 5.87677665e-001 5.87677665e-001 -8.05143565e-002 -4.80981494e-002 8.13362871e-002 -6.17486493e-002 2.01914744e-002 1.68597129e-002 -3.36173643e-002 2.82103205e-002 -9.78478603e-003 -8.65545521e-003 1.76001904e-002 -1.49409856e-002 5.17152459e-003 4.65100223e-003 -9.30398382e-003 7.73569146e-003 -2.52706568e-003 -2.51252769e-003 4.82982358e-003 -4.25450455e-003 2.50796828e-003 2.64186706e-003];

% ************************************************************
% zero-pad the impulse response of the FIR filter to 5x its original length
% (time domain)
% The filter remains functionally identical, since appending zero taps changes nothing
% ************************************************************
timePaddingFactor = 5;
n1 = timePaddingFactor * size(h, 2);
nh = size(h, 2);
nPad = n1 - nh;
nPad1 = floor(nPad / 2);
nPad2 = nPad - nPad1;
h = [zeros(1, nPad1), h, zeros(1, nPad2)];
hOrig = h;

% ************************************************************
% Determine equivalent Fourier series coefficients (frequency domain)
% assume that the impulse response is bandlimited (time-discrete signal by definition) 
% and periodic (period length see above, can be extended arbitrarily)
% ************************************************************
h = fft(h); % Fourier transform time domain to frequency domain

% ************************************************************
% Evaluate the Fourier series on an arbitrarily dense grid 
% this allows to resample the impulse response on an arbitrarily dense grid
% ************************************************************
% zero-pad Fourier transform
% ideal band-limited oversampling of the impulse response to n2 samples
n2 = 10 * n1;

h = [h(1:ceil(n1/2)), zeros(1, n2-n1), h(ceil(n1/2)+1:end)];
h = ifft(h); % back to time domain
% Note: One may write out the definition of ifft() and evaluate the exp(...) term at an 
% arbitrary time to acquire a true continuous-time equation.

% numerical inaccuracy in (i)fft causes some imaginary part ~1e-15
assert(max(abs(imag(h))) / max(abs(real(h))) < 1e-14);
h = real(h);

% scale to maintain amplitude level
h = h * n2 / n1;

% construct x-axis [0, 1[
xOrig = linspace(-timePaddingFactor/2, timePaddingFactor/2, size(hOrig, 2) + 1); xOrig = xOrig(1:end-1);
x = linspace(-timePaddingFactor/2, timePaddingFactor/2, size(h, 2) + 1); x = x(1:end-1);

% ************************************************************
% Plot 
% ************************************************************
% ... on a linear scale

% find points where original impulse response is defined (for illustration)
ixOrig = find(abs(hOrig) > 0);

figure(); grid on; hold on;
stem(xOrig(ixOrig), hOrig(ixOrig), 'k');
plot(x, h, 'b'); 
legend('discrete-time FIR filter', 'continuous-time equivalent');
title('equivalent discrete- and continuous-time filters');
xlabel('time (relative to discrete time IR duration)'); ylabel('h(t)');

% ... and again on a logarithmic scale
myEps = 1e-15;
figure(); grid on; hold on;
u = plot(xOrig(ixOrig), 20*log10(abs(hOrig(ixOrig)) + myEps), 'k+'); set(u, 'lineWidth', 3);
plot(x, 20*log10(abs(h) + myEps), 'b');
legend('discrete-time FIR filter', 'continuous-time equivalent');
title('equivalent discrete- and continuous-time filters');
xlabel('time (relative to discrete time IR duration)'); ylabel('|h(t)| / dB');

Computing Reciprocals of Fixed-Point Numbers by the Newton-Raphson Method — Division by Multiplication

August 11, 20115 comments Coded in ASM for the TI C64x
* =========================================================================== *
*                                                                             *
*  Compute reciprocals of a Q.15 vector by Newton-Raphson method              *
*      y[i] = ym[i] * 2^ye[i] = 1 / x[i], -1 <= x[i] < 1 and x[i] != 0        *
*      where ym is the mantissa vector and ye is the exponent vector          *
*                                                                             *
*  C prototype:                                                               *
*       void DSP_vrecip16n(short* x,    // Input, Q.15 vector                 *
*                          short* ym,   // Output, Q.15 vector                *
*                          short* ye,   // Output, int16 vector               *
*                          int    N);   // Input, vector length               *
*                                                                             *
*  Restriction:                                                               *
*       (N % 4) == 0 and N >= 24                                              *
*                                                                             *
*  Performance:                                                               *
*       55+2.5*N (software pipelining enabled by -O2, CCS5.1)                 *
*                                                                             *
*  Relative error:                                                            *
*       (-2^-16, 2^-16)                                                       *
*                                                                             *
*  Algorithm:                                                                 *
*       Input: V, abs(x[i]) normalized to [.5, 1)                             *
*       Initialization: U0 = (V-.75)^2 + 1.4256-V, ~1/(2*V), 5.9233 bits      *
*       Iteration 1:    U1 = U0*(1-V*U0),          ~1/(4*V), 11.481 bits      *
*       Iteration 2:    U2 = 8*U1*(.5-V*U1),       ~1/(2*V), 22.585 bits      *
*       Output in the format of Fl16 = Q.15(mant)|Int16(expt)                 *
*                                                                             *
* =========================================================================== *
    
        .sect ".text: _DSP_vrecip16n"
        .global _DSP_vrecip16n

_DSP_vrecip16n: .cproc  A_X, B_YM, A_YE, B_n   
    
        .no_mdep
        .rega   A_x0, A_x1, A_rr, A_nx0, A_nx1, A_nx10, A_ny10, A_v10, A_vc10  
        .rega   A_vs1:A_vs0, A_vs10, A_u10, A_vu0, A_vu1, A_vu10, A_u0, A_u1  
        .rega   A_y0, A_y1, A_y32:A_y10, A_vp10, A_x32:A_x10 
        .rega   A_ss, A_cc, A_mm, A_ww, A_w, A_rnd
        .regb   B_x2, B_x3, B_rr, B_nx2, B_nx3, B_nx32, B_y32, B_v32, B_vc32 
        .regb   B_vs3:B_vs2, B_vs32, B_u32, B_vu2, B_vu3, B_vu32, B_u2, B_u3    
        .regb   B_y2, B_y3, B_ny32:B_ny10, B_vp32 
        .regb   B_ss, B_cc, B_mm, B_ww, B_w, B_rnd, B_nn, B_X 
        .reg    B_i, C10, C32, Cl10, Cl32

            ADD         4,              A_X,            B_X
            
            MVKL        0xB67BB67B,     B_mm                    ; Q15(1.4256)                  
            MVKH        0xB67BB67B,     B_mm                    ; Q15(1.4256)
            MV          B_mm,           A_mm
            MVKL        0x60006000,     A_cc
            MVKH        0x60006000,     A_cc
            MV          A_cc,           B_cc       
            MVKL        0xFFF1FFF1,     B_rr
            MVKH        0xFFF1FFF1,     B_rr
            MV          B_rr,           A_rr
            MVKL        0x80008000,     A_ww
            MVKH        0x80008000,     A_ww
            MV          A_ww,           B_ww
            SHL         A_ww,           15,             A_w     ; 0x40000000  
            SHL         B_ww,           15,             B_w     ; 0x40000000           
            ROTL        A_w,            17,             A_rnd   ; 0x00008000
            ROTL        B_w,            17,             B_rnd   ; 0x00008000
            MVK         15,             A_ss
            MVK         15,             B_ss

            SHR         B_n,            2,              B_i
            SUB         B_i,            2,              B_i
    
LOOP_vrecip: .trip 16 
            LDH         *A_X++,         A_x0
            LDH         *A_X++[3],      A_x1  
            LDH         *B_X++,         B_x2
            LDH         *B_X++[3],      B_x3
 
            NORM        A_x0,           A_nx0
            NORM        A_x1,           A_nx1
            NORM        B_x2,           B_nx2
            NORM        B_x3,           B_nx3
            PACK2       A_nx1,          A_nx0,          A_nx10
            PACK2       B_nx3,          B_nx2,          B_nx32
            ADD2        A_rr,           A_nx10,         A_ny10
            ADD2        B_rr,           B_nx32,         B_ny32

            SSHVL       A_x0,           A_nx0,          A_x0
            SSHVL       A_x1,           A_nx1,          A_x1
            SSHVL       B_x2,           B_nx2,          B_x2
            SSHVL       B_x3,           B_nx3,          B_x3
            PACKH2      A_x1,           A_x0,           A_v10
            PACKH2      B_x3,           B_x2,           B_v32
            
            ; Initial value, U0=(V-.75)^2 + 1.4256-V, ~1/(2*V), 5.9233 bits
            ABS2        A_v10,          A_vp10
            ABS2        B_v32,          B_vp32
            SUB2        A_vp10,         A_cc,           A_vc10
            SUB2        B_vp32,         B_cc,           B_vc32
            SUB2        A_mm,           A_vp10,         A_u10
            SUB2        B_mm,           B_vp32,         B_u32
            SMPY2       A_vc10,         A_vc10,         A_vs1:A_vs0
            SMPY2       B_vc32,         B_vc32,         B_vs3:B_vs2
            PACKH2      A_vs1,          A_vs0,          A_vs10
            PACKH2      B_vs3,          B_vs2,          B_vs32
            ADD2        A_vs10,         A_u10,          A_u10
            ADD2        B_vs32,         B_u32,          B_u32
            SHR2        A_v10,          A_ss,           C10
            SHR2        B_v32,          B_ss,           C32
            XOR         C10,            A_u10,          A_u10
            XOR         C32,            B_u32,          B_u32
                       
            ; Iteration 1, U1=U0*(1-V*U0), ~1/(4*V), 11.481 bits
            SMPY2       A_v10,          A_u10,          A_vu1:A_vu0
            SMPY2       B_v32,          B_u32,          B_vu3:B_vu2
            PACKH2      A_vu1,          A_vu0,          A_vu10
            PACKH2      B_vu3,          B_vu2,          B_vu32
            SUB2        A_ww,           A_vu10,         A_vu10
            SUB2        B_ww,           B_vu32,         B_vu32
            SMPY2       A_u10,          A_vu10,         A_u1:A_u0
            SMPY2       B_u32,          B_vu32,         B_u3:B_u2
            PACKH2      A_u1,           A_u0,           A_u10
            PACKH2      B_u3,           B_u2,           B_u32

            ; Iteration 2, U2=8*U1*(1/2-V*U1), ~1/(2*V), 22.585 bits
            SMPY2       A_v10,          A_u10,          A_vu1:A_vu0
            SMPY2       B_v32,          B_u32,          B_vu3:B_vu2
            SUB         A_w,            A_vu0,          A_vu0
            SUB         A_w,            A_vu1,          A_vu1
            SUB         B_w,            B_vu2,          B_vu2
            SUB         B_w,            B_vu3,          B_vu3
            MPYHIR      A_u0,           A_vu0,          A_u0
            MPYHIR      A_u1,           A_vu1,          A_u1
            MPYHIR      B_u2,           B_vu2,          B_u2
            MPYHIR      B_u3,           B_vu3,          B_u3
            SSHL        A_u0,           3,              A_u0
            SSHL        A_u1,           3,              A_u1
            SSHL        B_u2,           3,              B_u2
            SSHL        B_u3,           3,              B_u3          

            ; Fl16 = Q15(mant)|int16(expt)
            SADD        A_u0,           A_rnd,          A_y0
            SADD        A_u1,           A_rnd,          A_y1
            SADD        B_u2,           B_rnd,          B_y2
            SADD        B_u3,           B_rnd,          B_y3
            PACKH2      A_y1,           A_y0,           A_y10
            PACKH2      B_y3,           B_y2,           B_y32
  
            MV          B_y32,          A_y32
            MV          A_ny10,         B_ny10

            STDW        A_y32:A_y10,    *B_YM++
            STDW        B_ny32:B_ny10,  *A_YE++

            BDEC        LOOP_vrecip,    B_i
            
            .endproc