Sign in

username:

password:



Not a member?

Search c6x



Search tips

Subscribe to c6x



c6x by Keywords

AD535 | BIOS | Booting | Bootloader | C621 | C6211 | C6415 | C671 | C6711 | C6711DSK | C6713 | CCS | Chassaing | COFF | DAT | DM64 | DM642 | DMA | DSK671 | DSK6711 | EDM | EDMA | EMIF | Emulator | EVM | EVM620 | FFT | FIR | GPIO | Halting | HPI | HWI | IDK | JTAG | LDB | LDH | LDW | Linker | LMS | LOG_printf | Matlab | McBSP | MEM_alloc | MIPS | PCI | PCM3003 | Pipeline | Profiling | QDM | Reset | ROM | RTDX | Sampling | SDRAM | Stack | TEB | THS1206 | TMS320C621 | TMS320C6416 | TMS320C6711 | TMS320C6713 | UART | Vector Table | XBUS | XDS560

Sponsor

Industry's highest performing at the lowest power DSPs now as low as $5.00*
Start development today!
*volume pricing for 10ku

Discussion Groups

See Also

Embedded SystemsFPGAElectronics

Discussion Groups | TMS320C6x | optimizing C code for c6701

Technical discussions about the TI C6000 DSPs (including the c62x, c64x and c67x DSPs).

  

Post a new Thread

optimizing C code for c6701 - faysal basci - Apr 7 20:24:00 2002

Hi,
I'm implementing the fx-LMS algorithm on a c6701. However, the system does not
seem to be able to meet its real-time deadlines. I've seen in the generated
assembly code that my LMS update functions and filtering functions are not
software pipelined. The disqualification comment says "Disqualified loop: loop
contains a call". However, there is no call to any external function in the
loop. Below are both the C code and the generated assembly code for one of
the functions.

One more question is about efficient circular addressing in C: is
there a better way of doing circular addressing, ideally one that is also
supported by the hardware? Thanks in advance.

Faysal

THE C-CODE

/*
 * filter_1d - real-time filtering over a buffer to which data is written
 * circularly.
 *
 * W             : the filter
 * X             : the buffer
 * filt_length   : the length of the filter
 * buffer_length : the length of the buffer
 * pointer_pos   : the index of the newest element written to the buffer
 *
 * Returns the sum, for i in [0, filt_length), of
 *   W[i] * X[(pointer_pos + buffer_length - i) % buffer_length].
 */
float filter_1d(const float *W, const float *X, int filt_length, int
buffer_length, int pointer_pos){
int i;
float retval=0;
/* Start from pointer_pos biased by one full buffer length; i is subtracted
 * from this value on every iteration below. */
int intermadiate_index = pointer_pos+buffer_length;

for (i=0; i<filt_length; ++i){
/* In the CCS listing posted with this code, the % on this line is compiled
 * into a call to the run-time routine __remi, which is the call reported by
 * the "Disqualified loop: loop contains a call" message. */
retval+=W[i]*X[(intermadiate_index - i)%buffer_length];
}
return retval;
}
ASSEMBLY CODE PRODUCED by CCS V.2.0

;******************************************************************************
;* FUNCTION NAME: _filter_1d *
;* *
;* Regs Modified : A0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,B0,B1,B2,B3,B4,B5, *
;* SP *
;* Regs Used : A0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,B0,B1,B2,B3,B4,B5, *
;* B6,SP *
;* Local Frame Size : 0 Args + 0 Auto + 8 Save = 8 byte *
;* *
;* C prototype (from the accompanying source): *
;* float filter_1d(const float *W, const float *X, int filt_length, *
;* int buffer_length, int pointer_pos) *
;* *
;* Register use, as indicated by the .sym records and the interleaved *
;* source comments below (COFF register numbering assumed - TODO confirm): *
;* A4 = W, B4 = X, A6 = filt_length, B6 = buffer_length, A8 = pointer_pos *
;* A7 = intermadiate_index, A3 = retval, A0 = i, A8 = L$1 inside the loop *
;* The result is copied to A4 and the function returns through B3. *
;* *
;* NOTE: comment lines that e-mail wrapping had split in two are rejoined *
;* here; every instruction line is unchanged. *
;******************************************************************************
_filter_1d:
;** --------------------------------------------------------------------------*
.line 2
.sym _W,4, 22, 17, 32
.sym _X,20, 22, 17, 32
.sym _filt_length,6, 4, 17, 32
.sym _buffer_length,22, 4, 17, 32
.sym _pointer_pos,8, 4, 17, 32
.sym _intermadiate_index,7, 4, 4, 32
.sym _retval,3, 6, 4, 32
.sym _i,0, 4, 4, 32
.sym _W,9, 22, 4, 32
.sym _X,10, 22, 4, 32
.sym _filt_length,20, 4, 4, 32
.sym _buffer_length,22, 4, 4, 32
.sym _pointer_pos,8, 4, 4, 32
.sym L$1,8, 4, 4, 32
;** 94 ----------------------- intermadiate_index = pointer_pos+buffer_length;
;** 93 ----------------------- retval = 0.0F;
;** 97 ----------------------- if ( filt_length <= 0 ) goto g4;
; Save the return address (B3) and A10 in the 8-byte frame.
STW .D2T2 B3,*SP--(8) ; |91|
STW .D2T1 A10,*+SP(4) ; |91|

; Copy W, filt_length and X into the registers used by the loop.
MV .D1 A4,A9
|| MV .S2X A6,B4
|| MV .S1X B4,A10

.line 5
ADD .S1X B6,A8,A7 ; |94|
.line 4
ZERO .D1 A3 ; |93|
.line 8
CMPGT .L2 B4,0,B0 ; |97|
[!B0] B .S1 L5 ; |97|
NOP 5
; BRANCH OCCURS ; |97|
;** --------------------------------------------------------------------------*
;** 98 ----------------------- L$1 = filt_length;
;** 97 ----------------------- i = 0;
;** ----------------------- #pragma MUST_ITERATE(1, 1099511627775, 1)
.line 9
MV .S1X B4,A8 ; |98|
.line 8
ZERO .D1 A0 ; |97|
;*----------------------------------------------------------------------------*
;* SOFTWARE PIPELINE INFORMATION
;* Disqualified loop: loop contains a call
;*----------------------------------------------------------------------------*
L4:
;** -----------------------g3:
;** 98 ----------------------- retval += W[i]*X[(intermadiate_index-i)%buffer_length];
;** 98 ----------------------- ++i;
;** 98 ----------------------- if ( --L$1 ) goto g3;
.line 9
; Call __remi with A4 = intermadiate_index - i and B4 = buffer_length; the
; return address RL4 is placed in B3. This branch is the "call" named in the
; disqualification message above.
MVKL .S2 __remi,B5 ; |98|
MVKH .S2 __remi,B5 ; |98|
B .S2 B5 ; |98|
MVKL .S2 RL4,B3 ; |98|
MVKH .S2 RL4,B3 ; |98|
MV .D2 B6,B4 ; |98|
SUB .D1 A7,A0,A4
NOP 1
RL4: ; CALL OCCURS ; |98|
; A4 is used as the index into X after the call returns.
LDW .D1T1 *+A9[A0],A5 ; |98|
LDW .D1T1 *+A10[A4],A4 ; |98|
ADD .D1 1,A0,A0 ; |98|
NOP 3
MPYSP .M1 A4,A5,A4 ; |98|
SUB .D1 A8,1,A1
[ A1] B .S1 L4 ; |98|
SUB .S1 A8,1,A8
ADDSP .L1 A4,A3,A3 ; |98|
NOP 3
; BRANCH OCCURS ; |98|
;** --------------------------------------------------------------------------*
L5:
;** -----------------------g4:
;** 99 ----------------------- return retval;
.line 10
MV .D1 A3,A4 ; |99|
.line 11
; Restore A10 and the return address, release the frame and return.
LDW .D2T1 *+SP(4),A10 ; |100|
LDW .D2T2 *++SP(8),B3 ; |100|
NOP 4
B .S2 B3 ; |100|
NOP 5
; BRANCH OCCURS ; |100|
.endfunc 100,000080400h,8
.sect ".text"
.global _readInput
.file "e:\anc_nobios\snap.c"
.sym _readInput,_readInput, 38, 2, 0
.func 341




______________________________
Start your Android Ice Cream Sandwich development on TI's AM35x Sitara ARM Cortex-A8 processor today.



(You need to be a member of c6x -- send a blank email to c6x-subscribe@yahoogroups.com )

Re: optimizing C code for c6701 - Jagadeesh Sankaran - Apr 8 13:52:00 2002


>THE C-CODE
>
>/*
>* This function does real time filtering over a buffer to which data is
>written circularly
>* W :the filter
>* X :the buffer
>* filt_length : the length of the filter
>* buffer_length : the length of the buffer
>* pointer_pos : the index of the newest element writen to the buffer
>*
> */
>float filter_1d(const float *W, const float *X, int filt_length, int
>buffer_length, int pointer_pos){
> int i;
> float retval=0;
> int intermadiate_index = pointer_pos+buffer_length;
>
> for (i=0; i<filt_length; ++i){
> retval+=W[i]*X[(intermadiate_index - i)%buffer_length];
> }
> return retval;
>}

This is a case where the modulus operation is being evaluated by a call to remi,
the remainder upon integer division. Modulus is an expensive operation, but
fortunately, if buffer_length is a power of 2, one can obtain the
same result as the modulus by computing & (buffer_length - 1). AND is
an atomic operation and hence will not result in the loop being
disqualified from software pipelining. The buffer_length in your case
has to be a power of two anyway, since you were trying to implement a circular
buffer using the hardware, which only works for powers of 2.

To the best of my knowledge there is no direct support for circular
addressing from C; there is, however, support from SA (linear assembly), where
you can program the AMR and CSR control registers. Refer to the C6000
documentation for this. The other approach is to use block-based processing,
where you maintain the context once in N blocks. This results in a memcpy of
the context from the (N-1)th block to the front of the Nth block once in every
N blocks, but simplifies the software development a whole lot.

Along with this e-mail I am attaching the C code shown here and the resulting
assembly. With some simple tweaks I was able to get 4 filter taps
evaluated in 5 cycles. Another option is to use the FIR benchmark
on TI's web page.

/*
 * filter_1d - variant of the circular-buffer filter in which the index into X
 * is wrapped with a bitwise AND instead of the % operator.
 *
 * Returns the sum, for i in [0, filt_length), of
 *   W[i] * X[(pointer_pos + buffer_length - i) & (buffer_length - 1)].
 *
 * As stated in the message that accompanies this code, the AND gives the same
 * result as the modulus only when buffer_length is a power of 2.
 */
float filter_1d(const float *W, const float *X, int filt_length, int
buffer_length, int pointer_pos)
{
int i;
float retval=0;
int intermadiate_index = pointer_pos+buffer_length;

/* Compiler hint (described in this post as an aid to optimization): the
 * caller is expected to pass a filt_length that is a multiple of 4. */
_nassert((int)(filt_length)%4 == 0);
for (i = 0; i < filt_length; ++i)
{
/* Mask with (buffer_length - 1) in place of % buffer_length. */
retval+=W[i]*X[(intermadiate_index - i) & (buffer_length - 1)];
}

return retval;

}

I have also added the following _nassert's to help the compiler with
optimization, and compiled with -o2 -mwtx -mv6700.

_nassert((int)(filt_length)%4 == 0);
_nassert((int)(buffer_length)%2 == 0);
_nassert((int)(filt_length) >= 16);

I got the resulting code from TOOLS ver 4.20:

L1: ; PIPED LOOP PROLOG

ZERO .L2 B4
|| STW .D2T2 B10,*+SP(24) ; |3|
|| AND .S1 A4,A7,A8 ; (P) |14|
|| AND .L1 A4,A6,A7 ; (P) |14|
|| SUB .S2X A6,1,B2 ; (P) |14|
|| LDW .D1T1 *+A5[A8],A3 ; (P) |14|

ZERO .S1 A6
|| MVKH .S2 0x10000,B1 ; init prolog collapse predicate
|| ADD .L1 4,A3,A11 ; (P) Define a twin register
|| LDW .D2T1 *+B9[B5],A3 ; (P) |14|
|| MV .L2X A7,B10 ; (P) Define a twin register
|| LDW .D1T2 *+A9[A3],B3 ; (P) |14|

SET .S1 A0,0xf,0xf,A1 ; init prolog collapse predicate
|| SUB .L2 B0,2,B0
|| LDW .D1T1 *+A5[A8],A8 ; (P) |14|
|| B .S2 L2 ; (P) |15|
|| SUB .L1 A10,A11,A0 ; (P) @|14|
|| LDW .D2T2 *+B6[B5],B11 ; (P) |14|

;** --------------------------------------------------------------------------*
L2: ; PIPED LOOP KERNEL

[!A1] ADDSP .L1 A7,A2,A2 ; ^ |14|
|| MPYSP .M1X A8,B11,A7 ; @|14|
|| LDW .D2T1 *+B7[B10],A8 ; @@|14|
|| AND .S2X A4,B2,B11 ; @@|14|
|| SUB .S1 A0,2,A8 ; @@@|14|
|| LDW .D1T2 *+A12[A11],B3 ; @@@|14|

[!A1] ADDSP .L2 B8,B4,B4 ; ^ |14|
|| [!B1] ADDSP .L1 A7,A6,A6 ; @ ^ |14|
|| [!B1] MPYSP .M2X B2,A3,B8 ; @|14|
|| LDW .D2T2 *+B7[B11],B2 ; @@|14|
|| SUB .D1 A0,3,A7 ; @@@|14|
|| AND .S1 A4,A8,A8 ; @@@|14|

[ B1] MPYSU .M2 2,B1,B1 ;
|| [ B0] SUB .D2 B0,1,B0 ; @|15|
|| ADD .L2 4,B5,B5 ; @@|15|
|| MPYSP .M1X A3,B3,A7 ; @@|14|
|| AND .S1 A4,A7,A0 ; @@@|14|
|| AND .L1 A4,A0,A7 ; @@@|14|
|| LDW .D1T1 *+A5[A8],A3 ; @@@|14|
|| SUB .S2X A0,1,B2 ; @@@|14|

[ A1] MPYSU .M1 2,A1,A1 ;
|| [!B1] ADDSP .L2 B11,B8,B8 ; @ ^ |14|
|| ADD .S1 4,A11,A11 ; @@@Define a twin register
|| LDW .D2T1 *+B9[B5],A3 ; @@@|14|
|| MV .S2X A7,B10 ; @@@Define a twin register
|| LDW .D1T2 *+A9[A11],B3 ; @@@|14|

[ B0] B .S2 L2 ; @|15|
|| MPYSP .M2X A8,B3,B11 ; @@|14|
|| LDW .D2T2 *+B6[B5],B11 ; @@@|14|
|| LDW .D1T1 *+A5[A0],A8 ; @@@|14|
|| SUB .S1 A10,A11,A0 ; @@@@|14|

;** --------------------------------------------------------------------------*

Regards
Jagadeesh Sankaran


;******************************************************************************
;* TMS320C6x ANSI C Codegen Version 4.20 *
;* Date/Time created: Mon Apr 8 08:48:55 2002 *
;******************************************************************************

;******************************************************************************
;* GLOBAL FILE PARAMETERS *
;* *
;* Architecture : TMS320C670x *
;* Optimization : Enabled at level 2 *
;* Optimizing for : Speed *
;* Based on options: -o2, no -ms *
;* Endian : Little *
;* Interrupt Thrshld : Disabled *
;* Memory Model : Small *
;* Calls to RTS : Near *
;* Pipelining : Enabled *
;* Speculative Load : Disabled *
;* Memory Aliases : Presume not aliases (optimistic) *
;* Debug Info : No Debug Info *
;* *
;******************************************************************************
.asg A15, FP
.asg B14, DP
.asg B15, SP
.global $bss

; opt6x -t -v6700 -O2 /var/tmp/aaaa003TO /var/tmp/daaa003TO
.sect ".text"
.global _filter_1d

;******************************************************************************
;* FUNCTION NAME: _filter_1d *
;* *
;* Regs Modified : A0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,B0,B1,B2, *
;* B3,B4,B5,B6,B7,B8,B9,B10,B11,B12,SP *
;* Regs Used : A0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,B0,B1,B2, *
;* B3,B4,B5,B6,B7,B8,B9,B10,B11,B12,SP *
;* Local Frame Size : 0 Args + 0 Auto + 28 Save = 28 byte *
;* *
;* Software-pipelined build of filter_1d (AND-masked index version). *
;* Interrupts are disabled around the pipelined loop by saving CSR in B12, *
;* clearing its bit 0, and writing B12 back to CSR in the epilog. *
;* The function returns through B3; the last ADDSP writes A4. *
;* *
;* NOTE: "< br />" markup residue had fused the labels _filter_1d: and L1: *
;* into comment lines; they are restored to their own lines here. Every *
;* instruction line is unchanged. *
;******************************************************************************
_filter_1d:
;** --------------------------------------------------------------------------*
; Open a 32-byte frame and start saving the registers restored in the epilog.
STW .D2T2 B12,*SP--(32) ; |3|
STW .D2T2 B11,*+SP(28) ; |3|

ZERO .D1 A2
|| ZERO .S2 B8
|| ZERO .S1 A0
|| MV .L1 A6,A3
|| STW .D2T2 B3,*+SP(20) ; |3|

; NOTE(review): B0 = A3 >> 2 looks like the trip count of the 4x-unrolled
; loop, and A10 = B6 + A8 matches pointer_pos + buffer_length - confirm.
ZERO .L2 B1
|| ADD .D1 12,A4,A9
|| ZERO .L1 A3
|| SHR .S2X A3,2,B0 ; |14|
|| STW .D2T1 A10,*+SP(8) ; |3|
|| ADD .S1X B6,A8,A10

; A5 = B6 - 1 is the AND mask used by the loop (moved to A4 below).
ADD .L2X 4,A4,B9
|| SUB .S1X B6,1,A5
|| MVC .S2 CSR,B12
|| STW .D2T1 A12,*+SP(16) ; |3|
|| ADD .D1 8,A4,A12
|| SUB .L1 A10,A3,A6 ; (P) |14|

MV .D2 B4,B7
|| AND .S2 -2,B12,B5
|| MV .L2X A4,B6
|| LDW .D1T2 *+A12[A3],B3 ; (P) |14|
|| MV .S1 A5,A4
|| SUB .L1 A6,2,A5 ; (P) |14|

ZERO .L2 B5
|| MV .S1X B4,A5
|| STW .D2T1 A11,*+SP(12) ; |3|
|| MVC .S2 B5,CSR ; interrupts off
|| SUB .D1 A6,3,A7 ; (P) |14|
|| AND .L1 A4,A5,A8 ; (P) |14|

;*----------------------------------------------------------------------------*
;* SOFTWARE PIPELINE INFORMATION
;*
;* Loop source line : 12
;* Loop opening brace source line : 13
;* Loop closing brace source line : 15
;* Loop Unroll Multiple : 4x
;* Known Minimum Trip Count : 4
;* Known Max Trip Count Factor : 1
;* Loop Carried Dependency Bound(^) : 4
;* Unpartitioned Resource Bound : 4
;* Partitioned Resource Bound(*) : 5
;* Resource Partition:
;* A-side B-side
;* .L units 2 2
;* .S units 0 1
;* .D units 4 4
;* .M units 2 2
;* .X cross paths 2 5*
;* .T address paths 4 4
;* Long read paths 0 0
;* Long write paths 0 0
;* Logical ops (.LS) 3 3 (.L or .S unit)
;* Addition ops (.LSD) 4 2 (.L or .S or .D unit)
;* Bound(.L .S .LS) 3 3
;* Bound(.L .S .D .LS .LSD) 5* 4
;*
;* Searching for software pipeline schedule at ...
;* ii = 5 Schedule found with 5 iterations in parallel
;* done
;*
;* Epilog not entirely removed
;* Collapsed epilog stages : 1
;*
;* Prolog not entirely removed
;* Collapsed prolog stages : 2
;*
;* Minimum required memory pad : 0 bytes
;*
;* For further improvement on this loop, try option -mh8
;*
;* Minimum safe trip count : 3 (after unrolling)
;*----------------------------------------------------------------------------*
;* SINGLE SCHEDULED ITERATION
;*
;* C38:
;* SUB .S1 A10,A11,A0 ; |14|
;* LDW .D1T2 *+A12[A11],B3 ; |14|
;* || SUB .S1 A0,2,A8 ; |14|
;* AND .S1 A4,A8,A8 ; |14|
;* || SUB .D1 A0,3,A7 ; |14|
;* AND .L1 A4,A0,A7 ; |14|
;* || SUB .S2X A0,1,B2 ; |14|
;* || LDW .D1T1 *+A5[A8],A3 ; |14|
;* || AND .S1 A4,A7,A0 ; |14|
;* MV .S2X A7,B10 ; Define a twin register
;* || LDW .D2T1 *+B9[B5],A3 ; |14|
;* || LDW .D1T2 *+A9[A11],B3 ; |14|
;* || ADD .S1 4,A11,A11 ; Define a twin register
;* LDW .D2T2 *+B6[B5],B11 ; |14|
;* || LDW .D1T1 *+A5[A0],A8 ; |14|
;* LDW .D2T1 *+B7[B10],A8 ; |14|
;* || AND .S2X A4,B2,B11 ; |14|
;* LDW .D2T2 *+B7[B11],B2 ; |14|
;* MPYSP .M1X A3,B3,A7 ; |14|
;* || ADD .L2 4,B5,B5 ; |15|
;* NOP 1
;* MPYSP .M2X A8,B3,B11 ; |14|
;* MPYSP .M1X A8,B11,A7 ; |14|
;* MPYSP .M2X B2,A3,B8 ; |14|
;* || ADDSP .L1 A7,A6,A6 ; ^ |14|
;* [ B0] SUB .D2 B0,1,B0 ; |15|
;* ADDSP .L2 B11,B8,B8 ; ^ |14|
;* [ B0] B .S2 C38 ; |15|
;* ADDSP .L1 A7,A2,A2 ; ^ |14|
;* ADDSP .L2 B8,B4,B4 ; ^ |14|
;* NOP 3
;* ; BRANCH OCCURS ; |15|
;*----------------------------------------------------------------------------*
L1: ; PIPED LOOP PROLOG

ZERO .L2 B4
|| STW .D2T2 B10,*+SP(24) ; |3|
|| AND .S1 A4,A7,A8 ; (P) |14|
|| AND .L1 A4,A6,A7 ; (P) |14|
|| SUB .S2X A6,1,B2 ; (P) |14|
|| LDW .D1T1 *+A5[A8],A3 ; (P) |14|

ZERO .S1 A6
|| MVKH .S2 0x10000,B1 ; init prolog collapse predicate
|| ADD .L1 4,A3,A11 ; (P) Define a twin register
|| LDW .D2T1 *+B9[B5],A3 ; (P) |14|
|| MV .L2X A7,B10 ; (P) Define a twin register
|| LDW .D1T2 *+A9[A3],B3 ; (P) |14|

SET .S1 A0,0xf,0xf,A1 ; init prolog collapse predicate
|| SUB .L2 B0,2,B0
|| LDW .D1T1 *+A5[A8],A8 ; (P) |14|
|| B .S2 L2 ; (P) |15|
|| SUB .L1 A10,A11,A0 ; (P) @|14|
|| LDW .D2T2 *+B6[B5],B11 ; (P) |14|

;** --------------------------------------------------------------------------*
L2: ; PIPED LOOP KERNEL

[!A1] ADDSP .L1 A7,A2,A2 ; ^ |14|
|| MPYSP .M1X A8,B11,A7 ; @|14|
|| LDW .D2T1 *+B7[B10],A8 ; @@|14|
|| AND .S2X A4,B2,B11 ; @@|14|
|| SUB .S1 A0,2,A8 ; @@@|14|
|| LDW .D1T2 *+A12[A11],B3 ; @@@|14|

[!A1] ADDSP .L2 B8,B4,B4 ; ^ |14|
|| [!B1] ADDSP .L1 A7,A6,A6 ; @ ^ |14|
|| [!B1] MPYSP .M2X B2,A3,B8 ; @|14|
|| LDW .D2T2 *+B7[B11],B2 ; @@|14|
|| SUB .D1 A0,3,A7 ; @@@|14|
|| AND .S1 A4,A8,A8 ; @@@|14|

[ B1] MPYSU .M2 2,B1,B1 ;
|| [ B0] SUB .D2 B0,1,B0 ; @|15|
|| ADD .L2 4,B5,B5 ; @@|15|
|| MPYSP .M1X A3,B3,A7 ; @@|14|
|| AND .S1 A4,A7,A0 ; @@@|14|
|| AND .L1 A4,A0,A7 ; @@@|14|
|| LDW .D1T1 *+A5[A8],A3 ; @@@|14|
|| SUB .S2X A0,1,B2 ; @@@|14|

[ A1] MPYSU .M1 2,A1,A1 ;
|| [!B1] ADDSP .L2 B11,B8,B8 ; @ ^ |14|
|| ADD .S1 4,A11,A11 ; @@@Define a twin register
|| LDW .D2T1 *+B9[B5],A3 ; @@@|14|
|| MV .S2X A7,B10 ; @@@Define a twin register
|| LDW .D1T2 *+A9[A11],B3 ; @@@|14|

[ B0] B .S2 L2 ; @|15|
|| MPYSP .M2X A8,B3,B11 ; @@|14|
|| LDW .D2T2 *+B6[B5],B11 ; @@@|14|
|| LDW .D1T1 *+A5[A0],A8 ; @@@|14|
|| SUB .S1 A10,A11,A0 ; @@@@|14|

;** --------------------------------------------------------------------------*
L3: ; PIPED LOOP EPILOG

MPYSP .M1X A8,B11,A0 ; (E) @@@|14|
|| LDW .D2T1 *+B7[B10],A8 ; (E) @@@@|14|
|| AND .S2X A4,B2,B11 ; (E) @@@@|14|
|| ADDSP .L1 A7,A2,A4 ; (E) @@ ^ |14|

ADDSP .L1 A7,A6,A5 ; (E) @@@ ^ |14|
|| MPYSP .M2X B2,A3,B5 ; (E) @@@|14|
|| LDW .D2T2 *+B7[B11],B2 ; (E) @@@@|14|
|| ADDSP .L2 B8,B4,B4 ; (E) @@ ^ |14|

; The saved registers are reloaded from the frame while the loop drains.
LDW .D2T1 *+SP(16),A12 ; |19|
|| ADD .S2 4,B5,B5 ; (E) @@@@|15|
|| MPYSP .M1X A3,B3,A0 ; (E) @@@@|14|

LDW .D2T1 *+SP(12),A11 ; |19|
|| ADDSP .L2 B11,B8,B4 ; (E) @@@ ^ |14|

LDW .D2T2 *+SP(20),B3 ; |19|
|| MPYSP .M2X A8,B3,B5 ; (E) @@@@|14|

LDW .D2T2 *+SP(28),B11 ; |19|
|| MPYSP .M1X A8,B11,A0 ; (E) @@@@|14|
|| ADDSP .L1 A0,A4,A4 ; (E) @@@ ^ |14|

LDW .D2T2 *+SP(24),B10 ; |19|
|| ADDSP .L1 A0,A5,A3 ; (E) @@@@ ^ |14|
|| MPYSP .M2X B2,A3,B5 ; (E) @@@@|14|
|| ADDSP .L2 B5,B4,B4 ; (E) @@@ ^ |14|

LDW .D2T1 *+SP(8),A10 ; |19|
ADDSP .L2 B5,B4,B4 ; (E) @@@@ ^ |14|
NOP 1
ADDSP .L1 A0,A4,A0 ; (E) @@@@ ^ |14|
ADDSP .L2 B5,B4,B5 ; (E) @@@@ ^ |14|
NOP 2

; Release the frame and write the saved CSR value back.
LDW .D2T2 *++SP(32),B12 ; |19|
|| MVC .S2 B12,CSR ; interrupts on

; NOTE(review): the remaining ADDSPs appear to combine the partial sums
; kept in separate registers by the unrolled loop into A4 - confirm.
ADDSP .L1X B5,A0,A0
NOP 3
ADDSP .L1 A3,A0,A0 ; |17|
NOP 1
B .S2 B3 ; |19|
NOP 1
ADDSP .L1X B4,A0,A4
NOP 3
; BRANCH OCCURS ; |19|


Attachment (not stored)
filter.c
Type: TEXT/x-sun-c-file


______________________________
New Code Sharing Section now Live on DSPRelated.com. Learn about the Reward Program for Contributors here.



(You need to be a member of c6x -- send a blank email to c6x-subscribe@yahoogroups.com )

Re: optimizing C code for c6701 - Mahesh Patil - Apr 9 3:45:00 2002

This function contains a modulo operation,
i.e. (%), which gives the remainder. For this the compiler calls a function called
_remi; that is why the loop is disqualified.
You can remove this call.
----- Original Message -----
From: "faysal basci" <>
To: <>
Sent: Monday, April 08, 2002 1:54 AM
Subject: [c6x] optimizing C code for c6701 > Hi,
> I'm implementing fx-LMS algorithm on c6701. However, system seems not
be
> able to catch up the real time deadlines. I've seen in assembly codes
> generated that my LMS update functions and filtering functions are not
> pipelined. The comment for disqualification says "Disqualified loop: loop
> contains a call" . However, there is no callto any external functions in
the
> loop. Below is the both C - code and the produced assembly code fo one of
> the functions.
>
> One more question would be about efficient circular addressing in C. Is
> there a better way of doing circular addressing which is also supported(?)
> by hardware. > Thanks in advance.
>
> Faysal, > THE C-CODE
>
> /*
> * This function does real time filtering over a buffer to which data is
> written circularly
> * W :the filter
> * X :the buffer
> * filt_length : the length of the filter
> * buffer_length : the length of the buffer
> * pointer_pos : the index of the newest element writen to the buffer
> *
> */
> float filter_1d(const float *W, const float *X, int filt_length, int
> buffer_length, int pointer_pos){
> int i;
> float retval=0;
> int intermadiate_index = pointer_pos+buffer_length;
>
> for (i=0; i<filt_length; ++i){
> retval+=W[i]*X[(intermadiate_index - i)%buffer_length];
> }
> return retval;
> } >
> ASSEMBLY CODE PRODUCED by CCS V.2.0 ;***************************************************************************
> ***
> ;* FUNCTION NAME: _filter_1d
> *
> ;*
> *
> ;* Regs Modified :
> A0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,B0,B1,B2,B3,B4,B5, *
> ;* SP
> *
> ;* Regs Used :
> A0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,B0,B1,B2,B3,B4,B5, *
> ;* B6,SP
> *
> ;* Local Frame Size : 0 Args + 0 Auto + 8 Save = 8 byte
> *
>
;***************************************************************************
> ***
> _filter_1d:
>
;** ------------------------------------------------------------------------
> --*
> .line 2
> .sym _W,4, 22, 17, 32
> .sym _X,20, 22, 17, 32
> .sym _filt_length,6, 4, 17, 32
> .sym _buffer_length,22, 4, 17, 32
> .sym _pointer_pos,8, 4, 17, 32
> .sym _intermadiate_index,7, 4, 4, 32
> .sym _retval,3, 6, 4, 32
> .sym _i,0, 4, 4, 32
> .sym _W,9, 22, 4, 32
> .sym _X,10, 22, 4, 32
> .sym _filt_length,20, 4, 4, 32
> .sym _buffer_length,22, 4, 4, 32
> .sym _pointer_pos,8, 4, 4, 32
> .sym L$1,8, 4, 4, 32
> ;** 94 ----------------------- intermadiate_index =
> pointer_pos+buffer_length;
> ;** 93 ----------------------- retval = 0.0F;
> ;** 97 ----------------------- if ( filt_length <= 0 ) goto g4;
> STW .D2T2 B3,*SP--(8) ; |91|
> STW .D2T1 A10,*+SP(4) ; |91|
>
> MV .D1 A4,A9
> || MV .S2X A6,B4
> || MV .S1X B4,A10
>
> .line 5
> ADD .S1X B6,A8,A7 ; |94|
> .line 4
> ZERO .D1 A3 ; |93|
> .line 8
> CMPGT .L2 B4,0,B0 ; |97|
> [!B0] B .S1 L5 ; |97|
> NOP 5
> ; BRANCH OCCURS ; |97|
>
;** ------------------------------------------------------------------------
> --*
> ;** 98 ----------------------- L$1 = filt_length;
> ;** 97 ----------------------- i = 0;
> ;** ----------------------- #pragma MUST_ITERATE(1, 1099511627775, 1)
> .line 9
> MV .S1X B4,A8 ; |98|
> .line 8
> ZERO .D1 A0 ; |97|
>
;*--------------------------------------------------------------------------
> --*
> ;* SOFTWARE PIPELINE INFORMATION
> ;* Disqualified loop: loop contains a call
>
;*--------------------------------------------------------------------------
> --*
> L4:
> ;** -----------------------g3:
> ;** 98 ----------------------- retval +=
> W[i]*X[(intermadiate_index-i)%buffer_length];
> ;** 98 ----------------------- ++i;
> ;** 98 ----------------------- if ( --L$1 ) goto g3;
> .line 9
> MVKL .S2 __remi,B5 ; |98|
> MVKH .S2 __remi,B5 ; |98|
> B .S2 B5 ; |98|
> MVKL .S2 RL4,B3 ; |98|
> MVKH .S2 RL4,B3 ; |98|
> MV .D2 B6,B4 ; |98|
> SUB .D1 A7,A0,A4
> NOP 1
> RL4: ; CALL OCCURS ; |98|
> LDW .D1T1 *+A9[A0],A5 ; |98|
> LDW .D1T1 *+A10[A4],A4 ; |98|
> ADD .D1 1,A0,A0 ; |98|
> NOP 3
> MPYSP .M1 A4,A5,A4 ; |98|
> SUB .D1 A8,1,A1
> [ A1] B .S1 L4 ; |98|
> SUB .S1 A8,1,A8
> ADDSP .L1 A4,A3,A3 ; |98|
> NOP 3
> ; BRANCH OCCURS ; |98|
>
;** ------------------------------------------------------------------------
> --*
> L5:
> ;** -----------------------g4:
> ;** 99 ----------------------- return retval;
> .line 10
> MV .D1 A3,A4 ; |99|
> .line 11
> LDW .D2T1 *+SP(4),A10 ; |100|
> LDW .D2T2 *++SP(8),B3 ; |100|
> NOP 4
> B .S2 B3 ; |100|
> NOP 5
> ; BRANCH OCCURS ; |100|
> .endfunc 100,000080400h,8 > .sect ".text"
> .global _readInput
> .file "e:\anc_nobios\snap.c"
> .sym _readInput,_readInput, 38, 2, 0
> .func 341 >
>
> _____________________________________


______________________________
Start your Android Ice Cream Sandwich development on TI's AM35x Sitara ARM Cortex-A8 processor today.



(You need to be a member of c6x -- send a blank email to c6x-subscribe@yahoogroups.com )

Re: optimizing C code for c6701 - Mahesh Patil - Apr 9 11:16:00 2002

Here you have given a solution for a buffer length which is constant.
But if it is a variable, i.e. it may or may not be a power of 2, then your
code will again be disqualified.
So please give a solution where we get maximum optimization.
I am attaching our code here.

----- Original Message -----
From: "Jagadeesh Sankaran" <>
To: <>; <>
Cc: <>; <>
Sent: Monday, April 08, 2002 7:22 PM
Subject: Re: [c6x] optimizing C code for c6701 >
> >THE C-CODE
> >
> >/*
> >* This function does real time filtering over a buffer to which data is
> >written circularly
> >* W :the filter
> >* X :the buffer
> >* filt_length : the length of the filter
> >* buffer_length : the length of the buffer
> >* pointer_pos : the index of the newest element writen to the buffer
> >*
> > */
> >float filter_1d(const float *W, const float *X, int filt_length, int
> >buffer_length, int pointer_pos){
> > int i;
> > float retval=0;
> > int intermadiate_index = pointer_pos+buffer_length;
> >
> > for (i=0; i<filt_length; ++i){
> > retval+=W[i]*X[(intermadiate_index - i)%buffer_length];
> > }
> > return retval;
> >}
> >
>
> This is a case where the modulus function is being eveluated by a call to
remi,
> remainder upon integer division. Modulus is an expensive function, but
> fortunately if buffer_length is a power of 2, then one can obtain the
> same results as modulus by implementing & (buffer_length - 1). AND is
> an atomic operation and hence will not results in the code being
> disqualified from software pipeling. The buffer_length in your case
> has got to be a power of two as you were trying to implement circular
> buffer using the hardware which only works for powers of 2, anyways.
>
> To the best of my knowledge there is no direct support for circular
> addressing from C, there is however support from SA where you can program
> AMR and CSR control registers. Refer C6000 documentation for this. The
> other approach is to use block based processing, where you maintain
> the context once in N blocks. This will result in a memcpy of the context
> from the N-1 th block to front of the Nth block once in every N blocks,
> but will simplify the software development a whole lot.
>
> Along with this e-mail I am attaching the C code shown here and the
resulting
> assembly. With some simple tweaks I was able to get 4 filter-taps to be
> eveluated in 5 cycles. Another way you could go is to use the fir
benchmark
> on TI's web-page.
>
> float filter_1d(const float *W, const float *X, int filt_length, int
> buffer_length, int pointer_pos)
> {
> int i;
> float retval=0;
> int intermadiate_index = pointer_pos+buffer_length;
>
> _nassert((int)(filt_length)%4 == 0);
> for (i = 0; i < filt_length; ++i)
> {
> retval+=W[i]*X[(intermadiate_index - i) & (buffer_length - 1)];
> }
>
> return retval;
>
> }
>
> I have also added the following _nassert's to help the compiler with
> optimization, used -o2 -mwtx -mv6700.
>
> _nassert((int)(filt_length)%4 == 0);
> _nassert((int)(buffer_length)%2 == 0);
> _nassert((int)(filt_length) >= 16);
>
> I got the resulting code from TOOLS ver 4.20:
>
> L1: ; PIPED LOOP PROLOG
>
> ZERO .L2 B4
> || STW .D2T2 B10,*+SP(24) ; |3|
> || AND .S1 A4,A7,A8 ; (P) |14|
> || AND .L1 A4,A6,A7 ; (P) |14|
> || SUB .S2X A6,1,B2 ; (P) |14|
> || LDW .D1T1 *+A5[A8],A3 ; (P) |14|
>
> ZERO .S1 A6
> || MVKH .S2 0x10000,B1 ; init prolog collapse
predicate
> || ADD .L1 4,A3,A11 ; (P) Define a twin register
> || LDW .D2T1 *+B9[B5],A3 ; (P) |14|
> || MV .L2X A7,B10 ; (P) Define a twin register
> || LDW .D1T2 *+A9[A3],B3 ; (P) |14|
>
> SET .S1 A0,0xf,0xf,A1 ; init prolog collapse
predicate
> || SUB .L2 B0,2,B0
> || LDW .D1T1 *+A5[A8],A8 ; (P) |14|
> || B .S2 L2 ; (P) |15|
> || SUB .L1 A10,A11,A0 ; (P) @|14|
> || LDW .D2T2 *+B6[B5],B11 ; (P) |14| ;** ------------------------------------------------------------------------
--*
> L2: ; PIPED LOOP KERNEL
>
> [!A1] ADDSP .L1 A7,A2,A2 ; ^ |14|
> || MPYSP .M1X A8,B11,A7 ; @|14|
> || LDW .D2T1 *+B7[B10],A8 ; @@|14|
> || AND .S2X A4,B2,B11 ; @@|14|
> || SUB .S1 A0,2,A8 ; @@@|14|
> || LDW .D1T2 *+A12[A11],B3 ; @@@|14|
>
> [!A1] ADDSP .L2 B8,B4,B4 ; ^ |14|
> || [!B1] ADDSP .L1 A7,A6,A6 ; @ ^ |14|
> || [!B1] MPYSP .M2X B2,A3,B8 ; @|14|
> || LDW .D2T2 *+B7[B11],B2 ; @@|14|
> || SUB .D1 A0,3,A7 ; @@@|14|
> || AND .S1 A4,A8,A8 ; @@@|14|
>
> [ B1] MPYSU .M2 2,B1,B1 ;
> || [ B0] SUB .D2 B0,1,B0 ; @|15|
> || ADD .L2 4,B5,B5 ; @@|15|
> || MPYSP .M1X A3,B3,A7 ; @@|14|
> || AND .S1 A4,A7,A0 ; @@@|14|
> || AND .L1 A4,A0,A7 ; @@@|14|
> || LDW .D1T1 *+A5[A8],A3 ; @@@|14|
> || SUB .S2X A0,1,B2 ; @@@|14|
>
> [ A1] MPYSU .M1 2,A1,A1 ;
> || [!B1] ADDSP .L2 B11,B8,B8 ; @ ^ |14|
> || ADD .S1 4,A11,A11 ; @@@Define a twin register
> || LDW .D2T1 *+B9[B5],A3 ; @@@|14|
> || MV .S2X A7,B10 ; @@@Define a twin register
> || LDW .D1T2 *+A9[A11],B3 ; @@@|14|
>
> [ B0] B .S2 L2 ; @|15|
> || MPYSP .M2X A8,B3,B11 ; @@|14|
> || LDW .D2T2 *+B6[B5],B11 ; @@@|14|
> || LDW .D1T1 *+A5[A0],A8 ; @@@|14|
> || SUB .S1 A10,A11,A0 ; @@@@|14| ;** ------------------------------------------------------------------------
--*
>
> Regards
> Jagadeesh Sankaran >
> _____________________________________ ----------------------------------------------------------------------------
---- >
;***************************************************************************
***
> ;* TMS320C6x ANSI C Codegen Version
4.20 *
> ;* Date/Time created: Mon Apr 8 08:48:55 2002
*
>
;***************************************************************************
*** ;***************************************************************************
***
> ;* GLOBAL FILE PARAMETERS
*
> ;*
*
> ;* Architecture : TMS320C670x
*
> ;* Optimization : Enabled at level 2
*
> ;* Optimizing for : Speed
*
> ;* Based on options: -o2, no -ms
*
> ;* Endian : Little
*
> ;* Interrupt Thrshld : Disabled
*
> ;* Memory Model : Small
*
> ;* Calls to RTS : Near
*
> ;* Pipelining : Enabled
*
> ;* Speculative Load : Disabled
*
> ;* Memory Aliases : Presume not aliases (optimistic)
*
> ;* Debug Info : No Debug Info
*
> ;*
*
>
;***************************************************************************
***
>
> .asg A15, FP
> .asg B14, DP
> .asg B15, SP
> .global $bss
>
> ; opt6x -t -v6700 -O2 /var/tmp/aaaa003TO /var/tmp/daaa003TO
> .sect ".text"
> .global _filter_1d ;***************************************************************************
***
> ;* FUNCTION NAME: _filter_1d
*
> ;*
*
> ;* Regs Modified :
A0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,B0,B1,B2, *
> ;* B3,B4,B5,B6,B7,B8,B9,B10,B11,B12,SP
*
> ;* Regs Used :
A0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,B0,B1,B2, *
> ;* B3,B4,B5,B6,B7,B8,B9,B10,B11,B12,SP
*
> ;* Local Frame Size : 0 Args + 0 Auto + 28 Save = 28 byte
*
>
;***************************************************************************
***
> _filter_1d:
>
;** ------------------------------------------------------------------------
--*
> STW .D2T2 B12,*SP--(32) ; |3|
> STW .D2T2 B11,*+SP(28) ; |3|
>
> ZERO .D1 A2
> || ZERO .S2 B8
> || ZERO .S1 A0
> || MV .L1 A6,A3
> || STW .D2T2 B3,*+SP(20) ; |3|
>
> ZERO .L2 B1
> || ADD .D1 12,A4,A9
> || ZERO .L1 A3
> || SHR .S2X A3,2,B0 ; |14|
> || STW .D2T1 A10,*+SP(8) ; |3|
> || ADD .S1X B6,A8,A10
>
> ADD .L2X 4,A4,B9
> || SUB .S1X B6,1,A5
> || MVC .S2 CSR,B12
> || STW .D2T1 A12,*+SP(16) ; |3|
> || ADD .D1 8,A4,A12
> || SUB .L1 A10,A3,A6 ; (P) |14|
>
> MV .D2 B4,B7
> || AND .S2 -2,B12,B5
> || MV .L2X A4,B6
> || LDW .D1T2 *+A12[A3],B3 ; (P) |14|
> || MV .S1 A5,A4
> || SUB .L1 A6,2,A5 ; (P) |14|
>
> ZERO .L2 B5
> || MV .S1X B4,A5
> || STW .D2T1 A11,*+SP(12) ; |3|
> || MVC .S2 B5,CSR ; interrupts off
> || SUB .D1 A6,3,A7 ; (P) |14|
> || AND .L1 A4,A5,A8 ; (P) |14| ;*--------------------------------------------------------------------------
--*
> ;* SOFTWARE PIPELINE INFORMATION
> ;*
> ;* Loop source line : 12
> ;* Loop opening brace source line : 13
> ;* Loop closing brace source line : 15
> ;* Loop Unroll Multiple : 4x
> ;* Known Minimum Trip Count : 4
> ;* Known Max Trip Count Factor : 1
> ;* Loop Carried Dependency Bound(^) : 4
> ;* Unpartitioned Resource Bound : 4
> ;* Partitioned Resource Bound(*) : 5
> ;* Resource Partition:
> ;* A-side B-side
> ;* .L units 2 2
> ;* .S units 0 1
> ;* .D units 4 4
> ;* .M units 2 2
> ;* .X cross paths 2 5*
> ;* .T address paths 4 4
> ;* Long read paths 0 0
> ;* Long write paths 0 0
> ;* Logical ops (.LS) 3 3 (.L or .S unit)
> ;* Addition ops (.LSD) 4 2 (.L or .S or .D unit)
> ;* Bound(.L .S .LS) 3 3
> ;* Bound(.L .S .D .LS .LSD) 5* 4
> ;*
> ;* Searching for software pipeline schedule at ...
> ;* ii = 5 Schedule found with 5 iterations in parallel
> ;* done
> ;*
> ;* Epilog not entirely removed
> ;* Collapsed epilog stages : 1
> ;*
> ;* Prolog not entirely removed
> ;* Collapsed prolog stages : 2
> ;*
> ;* Minimum required memory pad : 0 bytes
> ;*
> ;* For further improvement on this loop, try option -mh8
> ;*
> ;* Minimum safe trip count : 3 (after unrolling)
>
;*--------------------------------------------------------------------------
--*
> ;* SINGLE SCHEDULED ITERATION
> ;*
> ;* C38:
> ;* SUB .S1 A10,A11,A0 ; |14|
> ;* LDW .D1T2 *+A12[A11],B3 ; |14|
> ;* || SUB .S1 A0,2,A8 ; |14|
> ;* AND .S1 A4,A8,A8 ; |14|
> ;* || SUB .D1 A0,3,A7 ; |14|
> ;* AND .L1 A4,A0,A7 ; |14|
> ;* || SUB .S2X A0,1,B2 ; |14|
> ;* || LDW .D1T1 *+A5[A8],A3 ; |14|
> ;* || AND .S1 A4,A7,A0 ; |14|
> ;* MV .S2X A7,B10 ; Define a twin register
> ;* || LDW .D2T1 *+B9[B5],A3 ; |14|
> ;* || LDW .D1T2 *+A9[A11],B3 ; |14|
> ;* || ADD .S1 4,A11,A11 ; Define a twin register
> ;* LDW .D2T2 *+B6[B5],B11 ; |14|
> ;* || LDW .D1T1 *+A5[A0],A8 ; |14|
> ;* LDW .D2T1 *+B7[B10],A8 ; |14|
> ;* || AND .S2X A4,B2,B11 ; |14|
> ;* LDW .D2T2 *+B7[B11],B2 ; |14|
> ;* MPYSP .M1X A3,B3,A7 ; |14|
> ;* || ADD .L2 4,B5,B5 ; |15|
> ;* NOP 1
> ;* MPYSP .M2X A8,B3,B11 ; |14|
> ;* MPYSP .M1X A8,B11,A7 ; |14|
> ;* MPYSP .M2X B2,A3,B8 ; |14|
> ;* || ADDSP .L1 A7,A6,A6 ; ^ |14|
> ;* [ B0] SUB .D2 B0,1,B0 ; |15|
> ;* ADDSP .L2 B11,B8,B8 ; ^ |14|
> ;* [ B0] B .S2 C38 ; |15|
> ;* ADDSP .L1 A7,A2,A2 ; ^ |14|
> ;* ADDSP .L2 B8,B4,B4 ; ^ |14|
> ;* NOP 3
> ;* ; BRANCH OCCURS ; |15|
>
;*--------------------------------------------------------------------------
--*
> L1: ; PIPED LOOP PROLOG
>
> ZERO .L2 B4
> || STW .D2T2 B10,*+SP(24) ; |3|
> || AND .S1 A4,A7,A8 ; (P) |14|
> || AND .L1 A4,A6,A7 ; (P) |14|
> || SUB .S2X A6,1,B2 ; (P) |14|
> || LDW .D1T1 *+A5[A8],A3 ; (P) |14|
>
> ZERO .S1 A6
> || MVKH .S2 0x10000,B1 ; init prolog collapse
predicate
> || ADD .L1 4,A3,A11 ; (P) Define a twin register
> || LDW .D2T1 *+B9[B5],A3 ; (P) |14|
> || MV .L2X A7,B10 ; (P) Define a twin register
> || LDW .D1T2 *+A9[A3],B3 ; (P) |14|
>
> SET .S1 A0,0xf,0xf,A1 ; init prolog collapse
predicate
> || SUB .L2 B0,2,B0
> || LDW .D1T1 *+A5[A8],A8 ; (P) |14|
> || B .S2 L2 ; (P) |15|
> || SUB .L1 A10,A11,A0 ; (P) @|14|
> || LDW .D2T2 *+B6[B5],B11 ; (P) |14| ;** ------------------------------------------------------------------------
--*
> L2: ; PIPED LOOP KERNEL
>
> [!A1] ADDSP .L1 A7,A2,A2 ; ^ |14|
> || MPYSP .M1X A8,B11,A7 ; @|14|
> || LDW .D2T1 *+B7[B10],A8 ; @@|14|
> || AND .S2X A4,B2,B11 ; @@|14|
> || SUB .S1 A0,2,A8 ; @@@|14|
> || LDW .D1T2 *+A12[A11],B3 ; @@@|14|
>
> [!A1] ADDSP .L2 B8,B4,B4 ; ^ |14|
> || [!B1] ADDSP .L1 A7,A6,A6 ; @ ^ |14|
> || [!B1] MPYSP .M2X B2,A3,B8 ; @|14|
> || LDW .D2T2 *+B7[B11],B2 ; @@|14|
> || SUB .D1 A0,3,A7 ; @@@|14|
> || AND .S1 A4,A8,A8 ; @@@|14|
>
> [ B1] MPYSU .M2 2,B1,B1 ;
> || [ B0] SUB .D2 B0,1,B0 ; @|15|
> || ADD .L2 4,B5,B5 ; @@|15|
> || MPYSP .M1X A3,B3,A7 ; @@|14|
> || AND .S1 A4,A7,A0 ; @@@|14|
> || AND .L1 A4,A0,A7 ; @@@|14|
> || LDW .D1T1 *+A5[A8],A3 ; @@@|14|
> || SUB .S2X A0,1,B2 ; @@@|14|
>
> [ A1] MPYSU .M1 2,A1,A1 ;
> || [!B1] ADDSP .L2 B11,B8,B8 ; @ ^ |14|
> || ADD .S1 4,A11,A11 ; @@@Define a twin register
> || LDW .D2T1 *+B9[B5],A3 ; @@@|14|
> || MV .S2X A7,B10 ; @@@Define a twin register
> || LDW .D1T2 *+A9[A11],B3 ; @@@|14|
>
> [ B0] B .S2 L2 ; @|15|
> || MPYSP .M2X A8,B3,B11 ; @@|14|
> || LDW .D2T2 *+B6[B5],B11 ; @@@|14|
> || LDW .D1T1 *+A5[A0],A8 ; @@@|14|
> || SUB .S1 A10,A11,A0 ; @@@@|14| ;** ------------------------------------------------------------------------
--*
> L3: ; PIPED LOOP EPILOG
>
> MPYSP .M1X A8,B11,A0 ; (E) @@@|14|
> || LDW .D2T1 *+B7[B10],A8 ; (E) @@@@|14|
> || AND .S2X A4,B2,B11 ; (E) @@@@|14|
> || ADDSP .L1 A7,A2,A4 ; (E) @@ ^ |14|
>
> ADDSP .L1 A7,A6,A5 ; (E) @@@ ^ |14|
> || MPYSP .M2X B2,A3,B5 ; (E) @@@|14|
> || LDW .D2T2 *+B7[B11],B2 ; (E) @@@@|14|
> || ADDSP .L2 B8,B4,B4 ; (E) @@ ^ |14|
>
> LDW .D2T1 *+SP(16),A12 ; |19|
> || ADD .S2 4,B5,B5 ; (E) @@@@|15|
> || MPYSP .M1X A3,B3,A0 ; (E) @@@@|14|
>
> LDW .D2T1 *+SP(12),A11 ; |19|
> || ADDSP .L2 B11,B8,B4 ; (E) @@@ ^ |14|
>
> LDW .D2T2 *+SP(20),B3 ; |19|
> || MPYSP .M2X A8,B3,B5 ; (E) @@@@|14|
>
> LDW .D2T2 *+SP(28),B11 ; |19|
> || MPYSP .M1X A8,B11,A0 ; (E) @@@@|14|
> || ADDSP .L1 A0,A4,A4 ; (E) @@@ ^ |14|
>
> LDW .D2T2 *+SP(24),B10 ; |19|
> || ADDSP .L1 A0,A5,A3 ; (E) @@@@ ^ |14|
> || MPYSP .M2X B2,A3,B5 ; (E) @@@@|14|
> || ADDSP .L2 B5,B4,B4 ; (E) @@@ ^ |14|
>
> LDW .D2T1 *+SP(8),A10 ; |19|
> ADDSP .L2 B5,B4,B4 ; (E) @@@@ ^ |14|
> NOP 1
> ADDSP .L1 A0,A4,A0 ; (E) @@@@ ^ |14|
> ADDSP .L2 B5,B4,B5 ; (E) @@@@ ^ |14|
> NOP 2
>
> LDW .D2T2 *++SP(32),B12 ; |19|
> || MVC .S2 B12,CSR ; interrupts on
>
> ADDSP .L1X B5,A0,A0
> NOP 3
> ADDSP .L1 A3,A0,A0 ; |17|
> NOP 1
> B .S2 B3 ; |19|
> NOP 1
> ADDSP .L1X B4,A0,A4
> NOP 3
> ; BRANCH OCCURS ; |19| ----------------------------------------------------------------------------
---- > float filter_1d(const float *W, const float *X, int filt_length, int
> buffer_length, int pointer_pos)
> {
> int i;
> float retval=0;
> int intermadiate_index = pointer_pos+buffer_length;
>
> _nassert((int)(filt_length)%4 == 0);
> _nassert((int)(buffer_length)%2 == 0);
> _nassert((int)(filt_length) >= 16);
>
> for (i = 0; i < filt_length; ++i)
> {
> retval+=W[i]*X[(intermadiate_index - i) & (buffer_length - 1)];
> }
>
> return retval;
>
> }
>



/*
Function: Get_Rez()

Description: Gets delayed contribution from the previous excitation
vector.

The first ClPitchOrd/2 output samples are copied from the samples
that immediately precede PrevExc[PitchMax - Lag]. The remaining
SubFrLen + ClPitchOrd/2 output samples repeat, with period Lag, the
samples that start at PrevExc[PitchMax - Lag].

Links to text: Sections 2.14, 2.18 & 3.4

Arguments:

Word16 *Tv delayed excitation (written; SubFrLen + 2*(ClPitchOrd/2)
entries are stored)
Word16 *PrevExc Previous excitation vector (read only)
Word16 Lag Closed loop pitch lag; a run-time value, must be non-zero

Outputs:

Word16 *Tv delayed excitation

Return value: None

Note: the periodic index is kept in a wrap-around counter instead of
being computed as i % Lag. An integer remainder by a run-time value is
compiled to a run-time-support call on C6000, and a call inside the
loop prevents the loop from being software pipelined. */

void Get_Rez( Word16 * restrict Tv, Word16 * restrict PrevExc, Word16 Lag )
{
    const int lag = (int) Lag ;
    /* For i >= 0, i % lag equals i % |lag|, so the counter wraps at |lag|. */
    const int period = ( lag < 0 ) ? -lag : lag ;
    const int half = ClPitchOrd/2 ;
    const int base = PitchMax - lag ;   /* start of the repeated segment */
    int i ;
    int k ;                             /* always equals i % period */

    /* Samples just before the repeated segment. */
    for ( i = 0 ; i < half ; i ++ )
        Tv[i] = PrevExc[base - half + i] ;

    /* Periodic repetition of PrevExc[base .. base + period - 1]. */
    k = 0 ;
    for ( i = 0 ; i < SubFrLen + half ; i ++ ) {
        Tv[half + i] = PrevExc[base + k] ;
        if ( ++k == period )
            k = 0 ;
    }

    return;
}


______________________________
New Code Sharing Section now Live on DSPRelated.com. Learn about the Reward Program for Contributors here.



(You need to be a member of c6x -- send a blank email to c6x-subscribe@yahoogroups.com )

optimizing C code for c6701 - One more issue - faysal basci - Apr 10 5:30:00 2002

Thanks, everyone, for the very valuable advice. The code works much more
smoothly this time thanks to it. There is one more thing I want
to ask:
--> I am using a C struct to centralize all the common variables used
throughout my application. I'm doing this because my functions generally
require several variables. However, I wonder whether that causes any
performance bottlenecks. To better visualize this, assume I have a struct
that contains 10 variables. These include pointers and arrays. And I have a
function that manipulates an array in the struct using other variables
and arrays in the struct, and possibly one or two variables not
included in the struct. I prefer not to (or should I?) pass tens
of variables to the function; instead I just pass a pointer to the
struct and I'm done. However, I am unsure whether this scheme causes a
performance drawback. Below is what I mean. The question is: should I
choose the first scheme or the second, and why? (Don't worry about the
syntax.) Thanks a lot for the help.

1.
void myFunction1 (var1, var2, var3, var4...... varN){
.
.
.
}
2.
_MyStruct{
var1;
var2;
var3;
.
.
varN;
} myStruct

void myFunction2(*myStruct){
.
.

} Faysal,
-------------------
>
>>THE C-CODE
>>
>>/*
>>* This function does real time filtering over a buffer to which data
is
>>written circularly
>>* W :the filter
>>* X :the buffer
>>* filt_length : the length of the filter
>>* buffer_length : the length of the buffer
>>* pointer_pos : the index of the newest element writen to the
buffer
>>*
>> */
>>float filter_1d(const float *W, const float *X, int filt_length, int
>>buffer_length, int pointer_pos){
>> int i;
>> float retval=0;
>> int intermadiate_index = pointer_pos+buffer_length;
>>
>> for (i=0; i<filt_length; ++i){
>> retval+=W[i]*X[(intermadiate_index - i)%buffer_length];
>> }
>> return retval;
>>}
>>
>
>This is a case where the modulus operator is being evaluated by a
>call to remi, the remainder upon integer division. Modulus is an
>expensive operation, but fortunately, if buffer_length is a power of 2,
>one can obtain the same result as the modulus by using
>& (buffer_length - 1). AND is an atomic operation and hence will not
>result in the code being disqualified from software pipelining. The
>buffer_length in your case has to be a power of two anyway, as you
>were trying to implement a circular buffer using the hardware, which
>only works for powers of 2.
>
>To the best of my knowledge there is no direct support for circular
>addressing from C; there is, however, support from SA, where you can
>program the AMR and CSR control registers. Refer to the C6000
>documentation for this. The other approach is to use block-based
>processing, where you maintain the context once in N blocks. This will
>result in a memcpy of the context from the (N-1)th block to the front
>of the Nth block once in every N blocks, but will simplify the
>software development a whole lot.
>
>Along with this e-mail I am attaching the C code shown here and the
>resulting assembly. With some simple tweaks I was able to get 4
>filter taps to be evaluated in 5 cycles. Another way you could go is
>to use the FIR benchmark on TI's web page.
>
>float filter_1d(const float *W, const float *X, int filt_length, int
>buffer_length, int pointer_pos)
>{
> int i;
> float retval=0;
> int intermadiate_index = pointer_pos+buffer_length;
>
> _nassert((int)(filt_length)%4 == 0);
> for (i = 0; i < filt_length; ++i)
> {
> retval+=W[i]*X[(intermadiate_index - i) & (buffer_length -
1)];
> }
>
> return retval;
>
>}
>
>I have also added the following _nassert's to help the compiler with
>optimization, used -o2 -mwtx -mv6700.
>
> _nassert((int)(filt_length)%4 == 0);
> _nassert((int)(buffer_length)%2 == 0);
> _nassert((int)(filt_length) >= 16);
>
>I got the resulting code from TOOLS ver 4.20:
>
>L1: ; PIPED LOOP PROLOG
>
> ZERO .L2 B4
>|| STW .D2T2 B10,*+SP(24) ; |3|
>|| AND .S1 A4,A7,A8 ; (P) |14|
>|| AND .L1 A4,A6,A7 ; (P) |14|
>|| SUB .S2X A6,1,B2 ; (P) |14|
>|| LDW .D1T1 *+A5[A8],A3 ; (P) |14|
>
> ZERO .S1 A6
>|| MVKH .S2 0x10000,B1 ; init prolog collapse
predicate
>|| ADD .L1 4,A3,A11 ; (P) Define a twin
register
>|| LDW .D2T1 *+B9[B5],A3 ; (P) |14|
>|| MV .L2X A7,B10 ; (P) Define a twin
register
>|| LDW .D1T2 *+A9[A3],B3 ; (P) |14|
>
> SET .S1 A0,0xf,0xf,A1 ; init prolog collapse
predicate
>|| SUB .L2 B0,2,B0
>|| LDW .D1T1 *+A5[A8],A8 ; (P) |14|
>|| B .S2 L2 ; (P) |15|
>|| SUB .L1 A10,A11,A0 ; (P) @|14|
>|| LDW .D2T2 *+B6[B5],B11 ; (P) |14|
>
>;**
----------------------------------------------------------------------
----*
>L2: ; PIPED LOOP KERNEL
>
> [!A1] ADDSP .L1 A7,A2,A2 ; ^ |14|
>|| MPYSP .M1X A8,B11,A7 ; @|14|
>|| LDW .D2T1 *+B7[B10],A8 ; @@|14|
>|| AND .S2X A4,B2,B11 ; @@|14|
>|| SUB .S1 A0,2,A8 ; @@@|14|
>|| LDW .D1T2 *+A12[A11],B3 ; @@@|14|
>
> [!A1] ADDSP .L2 B8,B4,B4 ; ^ |14|
>|| [!B1] ADDSP .L1 A7,A6,A6 ; @ ^ |14|
>|| [!B1] MPYSP .M2X B2,A3,B8 ; @|14|
>|| LDW .D2T2 *+B7[B11],B2 ; @@|14|
>|| SUB .D1 A0,3,A7 ; @@@|14|
>|| AND .S1 A4,A8,A8 ; @@@|14|
>
> [ B1] MPYSU .M2 2,B1,B1 ;
>|| [ B0] SUB .D2 B0,1,B0 ; @|15|
>|| ADD .L2 4,B5,B5 ; @@|15|
>|| MPYSP .M1X A3,B3,A7 ; @@|14|
>|| AND .S1 A4,A7,A0 ; @@@|14|
>|| AND .L1 A4,A0,A7 ; @@@|14|
>|| LDW .D1T1 *+A5[A8],A3 ; @@@|14|
>|| SUB .S2X A0,1,B2 ; @@@|14|
>
> [ A1] MPYSU .M1 2,A1,A1 ;
>|| [!B1] ADDSP .L2 B11,B8,B8 ; @ ^ |14|
>|| ADD .S1 4,A11,A11 ; @@@Define a twin
register
>|| LDW .D2T1 *+B9[B5],A3 ; @@@|14|
>|| MV .S2X A7,B10 ; @@@Define a twin
register
>|| LDW .D1T2 *+A9[A11],B3 ; @@@|14|
>
> [ B0] B .S2 L2 ; @|15|
>|| MPYSP .M2X A8,B3,B11 ; @@|14|
>|| LDW .D2T2 *+B6[B5],B11 ; @@@|14|
>|| LDW .D1T1 *+A5[A0],A8 ; @@@|14|
>|| SUB .S1 A10,A11,A0 ; @@@@|14|
>
>;**
----------------------------------------------------------------------
----*
>
>Regards
>Jagadeesh Sankaran >
>_____________________________________ >
Faysal Basci
METU EEE Dept.


______________________________
Start your Android Ice Cream Sandwich development on TI's AM35x Sitara ARM Cortex-A8 processor today.



(You need to be a member of c6x -- send a blank email to c6x-subscribe@yahoogroups.com )