Technical discussions about the TI C6000 DSPs (including the c62x, c64x and c67x DSPs).
|
Hi, I'm implementing the fx-LMS algorithm on a c6701. However, the system does not seem to be able to meet its real-time deadlines. I've seen in the generated assembly code that my LMS update functions and filtering functions are not software pipelined. The comment for disqualification says "Disqualified loop: loop contains a call". However, there is no call to any external function in the loop. Below are both the C code and the generated assembly code for one of the functions. One more question would be about efficient circular addressing in C. Is there a better way of doing circular addressing which is also supported(?) by hardware? Thanks in advance. Faysal, THE C-CODE /* * This function does real time filtering over a buffer to which data is written circularly * W :the filter * X :the buffer * filt_length : the length of the filter * buffer_length : the length of the buffer * pointer_pos : the index of the newest element writen to the buffer * */ float filter_1d(const float *W, const float *X, int filt_length, int buffer_length, int pointer_pos){ int i; float retval=0; int intermadiate_index = pointer_pos+buffer_length; for (i=0; i<filt_length; ++i){ retval+=W[i]*X[(intermadiate_index - i)%buffer_length]; } return retval; } ASSEMBLY CODE PRODUCED by CCS V.2.0 ;*************************************************************************** *** ;* FUNCTION NAME: _filter_1d * ;* * ;* Regs Modified : A0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,B0,B1,B2,B3,B4,B5, * ;* SP * ;* Regs Used : A0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,B0,B1,B2,B3,B4,B5, * ;* B6,SP * ;* Local Frame Size : 0 Args + 0 Auto + 8 Save = 8 byte * ;*************************************************************************** *** _filter_1d: ;** ------------------------------------------------------------------------ --* .line 2 .sym _W,4, 22, 17, 32 .sym _X,20, 22, 17, 32 .sym _filt_length,6, 4, 17, 32 .sym _buffer_length,22, 4, 17, 32 .sym _pointer_pos,8, 4, 17, 32 .sym _intermadiate_index,7, 4, 4, 32 .sym _retval,3, 6, 4, 32 .sym _i,0, 4, 4, 
32 .sym _W,9, 22, 4, 32 .sym _X,10, 22, 4, 32 .sym _filt_length,20, 4, 4, 32 .sym _buffer_length,22, 4, 4, 32 .sym _pointer_pos,8, 4, 4, 32 .sym L$1,8, 4, 4, 32 ;** 94 ----------------------- intermadiate_index = pointer_pos+buffer_length; ;** 93 ----------------------- retval = 0.0F; ;** 97 ----------------------- if ( filt_length <= 0 ) goto g4; STW .D2T2 B3,*SP--(8) ; |91| STW .D2T1 A10,*+SP(4) ; |91| MV .D1 A4,A9 || MV .S2X A6,B4 || MV .S1X B4,A10 .line 5 ADD .S1X B6,A8,A7 ; |94| .line 4 ZERO .D1 A3 ; |93| .line 8 CMPGT .L2 B4,0,B0 ; |97| [!B0] B .S1 L5 ; |97| NOP 5 ; BRANCH OCCURS ; |97| ;** ------------------------------------------------------------------------ --* ;** 98 ----------------------- L$1 = filt_length; ;** 97 ----------------------- i = 0; ;** ----------------------- #pragma MUST_ITERATE(1, 1099511627775, 1) .line 9 MV .S1X B4,A8 ; |98| .line 8 ZERO .D1 A0 ; |97| ;*-------------------------------------------------------------------------- --* ;* SOFTWARE PIPELINE INFORMATION ;* Disqualified loop: loop contains a call ;*-------------------------------------------------------------------------- --* L4: ;** -----------------------g3: ;** 98 ----------------------- retval += W[i]*X[(intermadiate_index-i)%buffer_length]; ;** 98 ----------------------- ++i; ;** 98 ----------------------- if ( --L$1 ) goto g3; .line 9 MVKL .S2 __remi,B5 ; |98| MVKH .S2 __remi,B5 ; |98| B .S2 B5 ; |98| MVKL .S2 RL4,B3 ; |98| MVKH .S2 RL4,B3 ; |98| MV .D2 B6,B4 ; |98| SUB .D1 A7,A0,A4 NOP 1 RL4: ; CALL OCCURS ; |98| LDW .D1T1 *+A9[A0],A5 ; |98| LDW .D1T1 *+A10[A4],A4 ; |98| ADD .D1 1,A0,A0 ; |98| NOP 3 MPYSP .M1 A4,A5,A4 ; |98| SUB .D1 A8,1,A1 [ A1] B .S1 L4 ; |98| SUB .S1 A8,1,A8 ADDSP .L1 A4,A3,A3 ; |98| NOP 3 ; BRANCH OCCURS ; |98| ;** ------------------------------------------------------------------------ --* L5: ;** -----------------------g4: ;** 99 ----------------------- return retval; .line 10 MV .D1 A3,A4 ; |99| .line 11 LDW .D2T1 *+SP(4),A10 ; |100| LDW .D2T2 
*++SP(8),B3 ; |100| NOP 4 B .S2 B3 ; |100| NOP 5 ; BRANCH OCCURS ; |100| .endfunc 100,000080400h,8 .sect ".text" .global _readInput .file "e:\anc_nobios\snap.c" .sym _readInput,_readInput, 38, 2, 0 .func 341 |
|
|
|
>THE C-CODE > >/* >* This function does real time filtering over a buffer to which data is >written circularly >* W :the filter >* X :the buffer >* filt_length : the length of the filter >* buffer_length : the length of the buffer >* pointer_pos : the index of the newest element writen to the buffer >* > */ >float filter_1d(const float *W, const float *X, int filt_length, int >buffer_length, int pointer_pos){ > int i; > float retval=0; > int intermadiate_index = pointer_pos+buffer_length; > > for (i=0; i<filt_length; ++i){ > retval+=W[i]*X[(intermadiate_index - i)%buffer_length]; > } > return retval; >} This is a case where the modulus operation is being evaluated by a call to __remi, the run-time routine for the remainder of an integer division. Modulus is an expensive operation, but fortunately, if buffer_length is a power of 2, one can obtain the same result as the modulus by using & (buffer_length - 1). AND is a single instruction rather than a function call, and hence will not result in the loop being disqualified from software pipelining. The buffer_length in your case has to be a power of two anyway, since you were trying to implement a circular buffer using the hardware, which only works for powers of 2. To the best of my knowledge there is no direct support for circular addressing from C; there is, however, support from linear assembly (SA), where you can program the AMR and CSR control registers. Refer to the C6000 documentation for this. The other approach is to use block-based processing, where you maintain the context once in N blocks. This will result in a memcpy of the context from the (N-1)th block to the front of the Nth block once in every N blocks, but will simplify the software development a whole lot. Along with this e-mail I am attaching the C code shown here and the resulting assembly. With some simple tweaks I was able to get 4 filter taps to be evaluated in 5 cycles. Another way you could go is to use the FIR benchmark on TI's web page. 
float filter_1d(const float *W, const float *X, int filt_length, int buffer_length, int pointer_pos) { int i; float retval=0; int intermadiate_index = pointer_pos+buffer_length; _nassert((int)(filt_length)%4 == 0); for (i = 0; i < filt_length; ++i) { retval+=W[i]*X[(intermadiate_index - i) & (buffer_length - 1)]; } return retval; } I have also added the following _nassert's to help the compiler with optimization, used -o2 -mwtx -mv6700. _nassert((int)(filt_length)%4 == 0); _nassert((int)(buffer_length)%2 == 0); _nassert((int)(filt_length) >= 16); I got the resulting code from TOOLS ver 4.20: L1: ; PIPED LOOP PROLOG ZERO .L2 B4 || STW .D2T2 B10,*+SP(24) ; |3| || AND .S1 A4,A7,A8 ; (P) |14| || AND .L1 A4,A6,A7 ; (P) |14| || SUB .S2X A6,1,B2 ; (P) |14| || LDW .D1T1 *+A5[A8],A3 ; (P) |14| ZERO .S1 A6 || MVKH .S2 0x10000,B1 ; init prolog collapse predicate || ADD .L1 4,A3,A11 ; (P) Define a twin register || LDW .D2T1 *+B9[B5],A3 ; (P) |14| || MV .L2X A7,B10 ; (P) Define a twin register || LDW .D1T2 *+A9[A3],B3 ; (P) |14| SET .S1 A0,0xf,0xf,A1 ; init prolog collapse predicate || SUB .L2 B0,2,B0 || LDW .D1T1 *+A5[A8],A8 ; (P) |14| || B .S2 L2 ; (P) |15| || SUB .L1 A10,A11,A0 ; (P) @|14| || LDW .D2T2 *+B6[B5],B11 ; (P) |14| ;** --------------------------------------------------------------------------* L2: ; PIPED LOOP KERNEL [!A1] ADDSP .L1 A7,A2,A2 ; ^ |14| || MPYSP .M1X A8,B11,A7 ; @|14| || LDW .D2T1 *+B7[B10],A8 ; @@|14| || AND .S2X A4,B2,B11 ; @@|14| || SUB .S1 A0,2,A8 ; @@@|14| || LDW .D1T2 *+A12[A11],B3 ; @@@|14| [!A1] ADDSP .L2 B8,B4,B4 ; ^ |14| || [!B1] ADDSP .L1 A7,A6,A6 ; @ ^ |14| || [!B1] MPYSP .M2X B2,A3,B8 ; @|14| || LDW .D2T2 *+B7[B11],B2 ; @@|14| || SUB .D1 A0,3,A7 ; @@@|14| || AND .S1 A4,A8,A8 ; @@@|14| [ B1] MPYSU .M2 2,B1,B1 ; || [ B0] SUB .D2 B0,1,B0 ; @|15| || ADD .L2 4,B5,B5 ; @@|15| || MPYSP .M1X A3,B3,A7 ; @@|14| || AND .S1 A4,A7,A0 ; @@@|14| || AND .L1 A4,A0,A7 ; @@@|14| || LDW .D1T1 *+A5[A8],A3 ; @@@|14| || SUB .S2X A0,1,B2 ; @@@|14| [ A1] MPYSU 
.M1 2,A1,A1 ; || [!B1] ADDSP .L2 B11,B8,B8 ; @ ^ |14| || ADD .S1 4,A11,A11 ; @@@Define a twin register || LDW .D2T1 *+B9[B5],A3 ; @@@|14| || MV .S2X A7,B10 ; @@@Define a twin register || LDW .D1T2 *+A9[A11],B3 ; @@@|14| [ B0] B .S2 L2 ; @|15| || MPYSP .M2X A8,B3,B11 ; @@|14| || LDW .D2T2 *+B6[B5],B11 ; @@@|14| || LDW .D1T1 *+A5[A0],A8 ; @@@|14| || SUB .S1 A10,A11,A0 ; @@@@|14| ;** --------------------------------------------------------------------------* Regards Jagadeesh Sankaran | |||
|
;***************************************************************************
*** ;* TMS320C6x ANSI C Codegen Version 4.20 * ;* Date/Time created: Mon Apr 8 08:48:55 2002 * ;******************************************************************************< br /> ;******************************************************************************< br /> ;* GLOBAL FILE PARAMETERS * ;* * ;* Architecture : TMS320C670x * ;* Optimization : Enabled at level 2 * ;* Optimizing for : Speed * ;* Based on options: -o2, no -ms * ;* Endian : Little * ;* Interrupt Thrshld : Disabled * ;* Memory Model : Small * ;* Calls to RTS : Near * ;* Pipelining : Enabled * ;* Speculative Load : Disabled * ;* Memory Aliases : Presume not aliases (optimistic) * ;* Debug Info : No Debug Info * ;* * ;******************************************************************************< br /> .asg A15, FP .asg B14, DP .asg B15, SP .global $bss ; opt6x -t -v6700 -O2 /var/tmp/aaaa003TO /var/tmp/daaa003TO .sect ".text" .global _filter_1d ;******************************************************************************< br /> ;* FUNCTION NAME: _filter_1d * ;* * ;* Regs Modified : A0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,B0,B1,B2, * ;* B3,B4,B5,B6,B7,B8,B9,B10,B11,B12,SP * ;* Regs Used : A0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,B0,B1,B2, * ;* B3,B4,B5,B6,B7,B8,B9,B10,B11,B12,SP * ;* Local Frame Size : 0 Args + 0 Auto + 28 Save = 28 byte * ;******************************************************************************< br /> _filter_1d: ;** --------------------------------------------------------------------------* STW .D2T2 B12,*SP--(32) ; |3| STW .D2T2 B11,*+SP(28) ; |3| ZERO .D1 A2 || ZERO .S2 B8 || ZERO .S1 A0 || MV .L1 A6,A3 || STW .D2T2 B3,*+SP(20) ; |3| ZERO .L2 B1 || ADD .D1 12,A4,A9 || ZERO .L1 A3 || SHR .S2X A3,2,B0 ; |14| || STW .D2T1 A10,*+SP(8) ; |3| || ADD .S1X B6,A8,A10 ADD .L2X 4,A4,B9 || SUB .S1X B6,1,A5 || MVC .S2 CSR,B12 || STW .D2T1 A12,*+SP(16) ; |3| || ADD .D1 8,A4,A12 || SUB .L1 A10,A3,A6 ; (P) |14| MV .D2 B4,B7 || AND .S2 -2,B12,B5 || MV .L2X A4,B6 || LDW .D1T2 
*+A12[A3],B3 ; (P) |14| || MV .S1 A5,A4 || SUB .L1 A6,2,A5 ; (P) |14| ZERO .L2 B5 || MV .S1X B4,A5 || STW .D2T1 A11,*+SP(12) ; |3| || MVC .S2 B5,CSR ; interrupts off || SUB .D1 A6,3,A7 ; (P) |14| || AND .L1 A4,A5,A8 ; (P) |14| ;*----------------------------------------------------------------------------*< br /> ;* SOFTWARE PIPELINE INFORMATION ;* ;* Loop source line : 12 ;* Loop opening brace source line : 13 ;* Loop closing brace source line : 15 ;* Loop Unroll Multiple : 4x ;* Known Minimum Trip Count : 4 ;* Known Max Trip Count Factor : 1 ;* Loop Carried Dependency Bound(^) : 4 ;* Unpartitioned Resource Bound : 4 ;* Partitioned Resource Bound(*) : 5 ;* Resource Partition: ;* A-side B-side ;* .L units 2 2 ;* .S units 0 1 ;* .D units 4 4 ;* .M units 2 2 ;* .X cross paths 2 5* ;* .T address paths 4 4 ;* Long read paths 0 0 ;* Long write paths 0 0 ;* Logical ops (.LS) 3 3 (.L or .S unit) ;* Addition ops (.LSD) 4 2 (.L or .S or .D unit) ;* Bound(.L .S .LS) 3 3 ;* Bound(.L .S .D .LS .LSD) 5* 4 ;* ;* Searching for software pipeline schedule at ... 
;* ii = 5 Schedule found with 5 iterations in parallel ;* done ;* ;* Epilog not entirely removed ;* Collapsed epilog stages : 1 ;* ;* Prolog not entirely removed ;* Collapsed prolog stages : 2 ;* ;* Minimum required memory pad : 0 bytes ;* ;* For further improvement on this loop, try option -mh8 ;* ;* Minimum safe trip count : 3 (after unrolling) ;*----------------------------------------------------------------------------*< br /> ;* SINGLE SCHEDULED ITERATION ;* ;* C38: ;* SUB .S1 A10,A11,A0 ; |14| ;* LDW .D1T2 *+A12[A11],B3 ; |14| ;* || SUB .S1 A0,2,A8 ; |14| ;* AND .S1 A4,A8,A8 ; |14| ;* || SUB .D1 A0,3,A7 ; |14| ;* AND .L1 A4,A0,A7 ; |14| ;* || SUB .S2X A0,1,B2 ; |14| ;* || LDW .D1T1 *+A5[A8],A3 ; |14| ;* || AND .S1 A4,A7,A0 ; |14| ;* MV .S2X A7,B10 ; Define a twin register ;* || LDW .D2T1 *+B9[B5],A3 ; |14| ;* || LDW .D1T2 *+A9[A11],B3 ; |14| ;* || ADD .S1 4,A11,A11 ; Define a twin register ;* LDW .D2T2 *+B6[B5],B11 ; |14| ;* || LDW .D1T1 *+A5[A0],A8 ; |14| ;* LDW .D2T1 *+B7[B10],A8 ; |14| ;* || AND .S2X A4,B2,B11 ; |14| ;* LDW .D2T2 *+B7[B11],B2 ; |14| ;* MPYSP .M1X A3,B3,A7 ; |14| ;* || ADD .L2 4,B5,B5 ; |15| ;* NOP 1 ;* MPYSP .M2X A8,B3,B11 ; |14| ;* MPYSP .M1X A8,B11,A7 ; |14| ;* MPYSP .M2X B2,A3,B8 ; |14| ;* || ADDSP .L1 A7,A6,A6 ; ^ |14| ;* [ B0] SUB .D2 B0,1,B0 ; |15| ;* ADDSP .L2 B11,B8,B8 ; ^ |14| ;* [ B0] B .S2 C38 ; |15| ;* ADDSP .L1 A7,A2,A2 ; ^ |14| ;* ADDSP .L2 B8,B4,B4 ; ^ |14| ;* NOP 3 ;* ; BRANCH OCCURS ; |15| ;*----------------------------------------------------------------------------*< br /> L1: ; PIPED LOOP PROLOG ZERO .L2 B4 || STW .D2T2 B10,*+SP(24) ; |3| || AND .S1 A4,A7,A8 ; (P) |14| || AND .L1 A4,A6,A7 ; (P) |14| || SUB .S2X A6,1,B2 ; (P) |14| || LDW .D1T1 *+A5[A8],A3 ; (P) |14| ZERO .S1 A6 || MVKH .S2 0x10000,B1 ; init prolog collapse predicate || ADD .L1 4,A3,A11 ; (P) Define a twin register || LDW .D2T1 *+B9[B5],A3 ; (P) |14| || MV .L2X A7,B10 ; (P) Define a twin register || LDW .D1T2 *+A9[A3],B3 ; (P) |14| SET .S1 A0,0xf,0xf,A1 
; init prolog collapse predicate || SUB .L2 B0,2,B0 || LDW .D1T1 *+A5[A8],A8 ; (P) |14| || B .S2 L2 ; (P) |15| || SUB .L1 A10,A11,A0 ; (P) @|14| || LDW .D2T2 *+B6[B5],B11 ; (P) |14| ;** --------------------------------------------------------------------------* L2: ; PIPED LOOP KERNEL [!A1] ADDSP .L1 A7,A2,A2 ; ^ |14| || MPYSP .M1X A8,B11,A7 ; @|14| || LDW .D2T1 *+B7[B10],A8 ; @@|14| || AND .S2X A4,B2,B11 ; @@|14| || SUB .S1 A0,2,A8 ; @@@|14| || LDW .D1T2 *+A12[A11],B3 ; @@@|14| [!A1] ADDSP .L2 B8,B4,B4 ; ^ |14| || [!B1] ADDSP .L1 A7,A6,A6 ; @ ^ |14| || [!B1] MPYSP .M2X B2,A3,B8 ; @|14| || LDW .D2T2 *+B7[B11],B2 ; @@|14| || SUB .D1 A0,3,A7 ; @@@|14| || AND .S1 A4,A8,A8 ; @@@|14| [ B1] MPYSU .M2 2,B1,B1 ; || [ B0] SUB .D2 B0,1,B0 ; @|15| || ADD .L2 4,B5,B5 ; @@|15| || MPYSP .M1X A3,B3,A7 ; @@|14| || AND .S1 A4,A7,A0 ; @@@|14| || AND .L1 A4,A0,A7 ; @@@|14| || LDW .D1T1 *+A5[A8],A3 ; @@@|14| || SUB .S2X A0,1,B2 ; @@@|14| [ A1] MPYSU .M1 2,A1,A1 ; || [!B1] ADDSP .L2 B11,B8,B8 ; @ ^ |14| || ADD .S1 4,A11,A11 ; @@@Define a twin register || LDW .D2T1 *+B9[B5],A3 ; @@@|14| || MV .S2X A7,B10 ; @@@Define a twin register || LDW .D1T2 *+A9[A11],B3 ; @@@|14| [ B0] B .S2 L2 ; @|15| || MPYSP .M2X A8,B3,B11 ; @@|14| || LDW .D2T2 *+B6[B5],B11 ; @@@|14| || LDW .D1T1 *+A5[A0],A8 ; @@@|14| || SUB .S1 A10,A11,A0 ; @@@@|14| ;** --------------------------------------------------------------------------* L3: ; PIPED LOOP EPILOG MPYSP .M1X A8,B11,A0 ; (E) @@@|14| || LDW .D2T1 *+B7[B10],A8 ; (E) @@@@|14| || AND .S2X A4,B2,B11 ; (E) @@@@|14| || ADDSP .L1 A7,A2,A4 ; (E) @@ ^ |14| ADDSP .L1 A7,A6,A5 ; (E) @@@ ^ |14| || MPYSP .M2X B2,A3,B5 ; (E) @@@|14| || LDW .D2T2 *+B7[B11],B2 ; (E) @@@@|14| || ADDSP .L2 B8,B4,B4 ; (E) @@ ^ |14| LDW .D2T1 *+SP(16),A12 ; |19| || ADD .S2 4,B5,B5 ; (E) @@@@|15| || MPYSP .M1X A3,B3,A0 ; (E) @@@@|14| LDW .D2T1 *+SP(12),A11 ; |19| || ADDSP .L2 B11,B8,B4 ; (E) @@@ ^ |14| LDW .D2T2 *+SP(20),B3 ; |19| || MPYSP .M2X A8,B3,B5 ; (E) @@@@|14| LDW .D2T2 *+SP(28),B11 ; |19| 
|| MPYSP .M1X A8,B11,A0 ; (E) @@@@|14| || ADDSP .L1 A0,A4,A4 ; (E) @@@ ^ |14| LDW .D2T2 *+SP(24),B10 ; |19| || ADDSP .L1 A0,A5,A3 ; (E) @@@@ ^ |14| || MPYSP .M2X B2,A3,B5 ; (E) @@@@|14| || ADDSP .L2 B5,B4,B4 ; (E) @@@ ^ |14| LDW .D2T1 *+SP(8),A10 ; |19| ADDSP .L2 B5,B4,B4 ; (E) @@@@ ^ |14| NOP 1 ADDSP .L1 A0,A4,A0 ; (E) @@@@ ^ |14| ADDSP .L2 B5,B4,B5 ; (E) @@@@ ^ |14| NOP 2 LDW .D2T2 *++SP(32),B12 ; |19| || MVC .S2 B12,CSR ; interrupts on ADDSP .L1X B5,A0,A0 NOP 3 ADDSP .L1 A3,A0,A0 ; |17| NOP 1 B .S2 B3 ; |19| NOP 1 ADDSP .L1X B4,A0,A4 NOP 3 ; BRANCH OCCURS ; |19| | |||
| |||
|
|
|
This function contains a modulo operation (i.e. %), which gives the remainder. To evaluate it, the compiler calls a run-time function called __remi; that is why the loop is disqualified. You can remove this call. ----- Original Message ----- From: "faysal basci" <> To: <> Sent: Monday, April 08, 2002 1:54 AM Subject: [c6x] optimizing C code for c6701 > Hi, > I'm implementing fx-LMS algorithm on c6701. However, system seems not be > able to catch up the real time deadlines. I've seen in assembly codes > generated that my LMS update functions and filtering functions are not > pipelined. The comment for disqualification says "Disqualified loop: loop > contains a call" . However, there is no callto any external functions in the > loop. Below is the both C - code and the produced assembly code fo one of > the functions. > > One more question would be about efficient circular addressing in C. Is > there a better way of doing circular addressing which is also supported(?) > by hardware. > Thanks in advance. > > Faysal, > THE C-CODE > > /* > * This function does real time filtering over a buffer to which data is > written circularly > * W :the filter > * X :the buffer > * filt_length : the length of the filter > * buffer_length : the length of the buffer > * pointer_pos : the index of the newest element writen to the buffer > * > */ > float filter_1d(const float *W, const float *X, int filt_length, int > buffer_length, int pointer_pos){ > int i; > float retval=0; > int intermadiate_index = pointer_pos+buffer_length; > > for (i=0; i<filt_length; ++i){ > retval+=W[i]*X[(intermadiate_index - i)%buffer_length]; > } > return retval; > } > > ASSEMBLY CODE PRODUCED by CCS V.2.0 ;*************************************************************************** > *** > ;* FUNCTION NAME: _filter_1d > * > ;* > * > ;* Regs Modified : > A0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,B0,B1,B2,B3,B4,B5, * > ;* SP > * > ;* Regs Used : > A0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,B0,B1,B2,B3,B4,B5, * > ;* B6,SP > * > ;* Local Frame Size : 0 Args 
+ 0 Auto + 8 Save = 8 byte > * > ;*************************************************************************** > *** > _filter_1d: > ;** ------------------------------------------------------------------------ > --* > .line 2 > .sym _W,4, 22, 17, 32 > .sym _X,20, 22, 17, 32 > .sym _filt_length,6, 4, 17, 32 > .sym _buffer_length,22, 4, 17, 32 > .sym _pointer_pos,8, 4, 17, 32 > .sym _intermadiate_index,7, 4, 4, 32 > .sym _retval,3, 6, 4, 32 > .sym _i,0, 4, 4, 32 > .sym _W,9, 22, 4, 32 > .sym _X,10, 22, 4, 32 > .sym _filt_length,20, 4, 4, 32 > .sym _buffer_length,22, 4, 4, 32 > .sym _pointer_pos,8, 4, 4, 32 > .sym L$1,8, 4, 4, 32 > ;** 94 ----------------------- intermadiate_index = > pointer_pos+buffer_length; > ;** 93 ----------------------- retval = 0.0F; > ;** 97 ----------------------- if ( filt_length <= 0 ) goto g4; > STW .D2T2 B3,*SP--(8) ; |91| > STW .D2T1 A10,*+SP(4) ; |91| > > MV .D1 A4,A9 > || MV .S2X A6,B4 > || MV .S1X B4,A10 > > .line 5 > ADD .S1X B6,A8,A7 ; |94| > .line 4 > ZERO .D1 A3 ; |93| > .line 8 > CMPGT .L2 B4,0,B0 ; |97| > [!B0] B .S1 L5 ; |97| > NOP 5 > ; BRANCH OCCURS ; |97| > ;** ------------------------------------------------------------------------ > --* > ;** 98 ----------------------- L$1 = filt_length; > ;** 97 ----------------------- i = 0; > ;** ----------------------- #pragma MUST_ITERATE(1, 1099511627775, 1) > .line 9 > MV .S1X B4,A8 ; |98| > .line 8 > ZERO .D1 A0 ; |97| > ;*-------------------------------------------------------------------------- > --* > ;* SOFTWARE PIPELINE INFORMATION > ;* Disqualified loop: loop contains a call > ;*-------------------------------------------------------------------------- > --* > L4: > ;** -----------------------g3: > ;** 98 ----------------------- retval += > W[i]*X[(intermadiate_index-i)%buffer_length]; > ;** 98 ----------------------- ++i; > ;** 98 ----------------------- if ( --L$1 ) goto g3; > .line 9 > MVKL .S2 __remi,B5 ; |98| > MVKH .S2 __remi,B5 ; |98| > B .S2 B5 ; |98| > MVKL .S2 
RL4,B3 ; |98| > MVKH .S2 RL4,B3 ; |98| > MV .D2 B6,B4 ; |98| > SUB .D1 A7,A0,A4 > NOP 1 > RL4: ; CALL OCCURS ; |98| > LDW .D1T1 *+A9[A0],A5 ; |98| > LDW .D1T1 *+A10[A4],A4 ; |98| > ADD .D1 1,A0,A0 ; |98| > NOP 3 > MPYSP .M1 A4,A5,A4 ; |98| > SUB .D1 A8,1,A1 > [ A1] B .S1 L4 ; |98| > SUB .S1 A8,1,A8 > ADDSP .L1 A4,A3,A3 ; |98| > NOP 3 > ; BRANCH OCCURS ; |98| > ;** ------------------------------------------------------------------------ > --* > L5: > ;** -----------------------g4: > ;** 99 ----------------------- return retval; > .line 10 > MV .D1 A3,A4 ; |99| > .line 11 > LDW .D2T1 *+SP(4),A10 ; |100| > LDW .D2T2 *++SP(8),B3 ; |100| > NOP 4 > B .S2 B3 ; |100| > NOP 5 > ; BRANCH OCCURS ; |100| > .endfunc 100,000080400h,8 > .sect ".text" > .global _readInput > .file "e:\anc_nobios\snap.c" > .sym _readInput,_readInput, 38, 2, 0 > .func 341 > > > _____________________________________ |
|
Here you have given a solution for a buffer length that is constant. But if it is a variable, i.e. it may or may not be a power of 2, then your code will again be disqualified. So please give a solution where we get maximum optimization. I am attaching our code here. ----- Original Message ----- From: "Jagadeesh Sankaran" <> To: <>; <> Cc: <>; <> Sent: Monday, April 08, 2002 7:22 PM Subject: Re: [c6x] optimizing C code for c6701 > > >THE C-CODE > > > >/* > >* This function does real time filtering over a buffer to which data is > >written circularly > >* W :the filter > >* X :the buffer > >* filt_length : the length of the filter > >* buffer_length : the length of the buffer > >* pointer_pos : the index of the newest element writen to the buffer > >* > > */ > >float filter_1d(const float *W, const float *X, int filt_length, int > >buffer_length, int pointer_pos){ > > int i; > > float retval=0; > > int intermadiate_index = pointer_pos+buffer_length; > > > > for (i=0; i<filt_length; ++i){ > > retval+=W[i]*X[(intermadiate_index - i)%buffer_length]; > > } > > return retval; > >} > > > > This is a case where the modulus function is being eveluated by a call to remi, > remainder upon integer division. Modulus is an expensive function, but > fortunately if buffer_length is a power of 2, then one can obtain the > same results as modulus by implementing & (buffer_length - 1). AND is > an atomic operation and hence will not results in the code being > disqualified from software pipeling. The buffer_length in your case > has got to be a power of two as you were trying to implement circular > buffer using the hardware which only works for powers of 2, anyways. > > To the best of my knowledge there is no direct support for circular > addressing from C, there is however support from SA where you can program > AMR and CSR control registers. Refer C6000 documentation for this. The > other approach is to use block based processing, where you maintain > the context once in N blocks. 
This will result in a memcpy of the context > from the N-1 th block to front of the Nth block once in every N blocks, > but will simplify the software development a whole lot. > > Along with this e-mail I am attaching the C code shown here and the resulting > assembly. With some simple tweaks I was able to get 4 filter-taps to be > eveluated in 5 cycles. Another way you could go is to use the fir benchmark > on TI's web-page. > > float filter_1d(const float *W, const float *X, int filt_length, int > buffer_length, int pointer_pos) > { > int i; > float retval=0; > int intermadiate_index = pointer_pos+buffer_length; > > _nassert((int)(filt_length)%4 == 0); > for (i = 0; i < filt_length; ++i) > { > retval+=W[i]*X[(intermadiate_index - i) & (buffer_length - 1)]; > } > > return retval; > > } > > I have also added the following _nassert's to help the compiler with > optimization, used -o2 -mwtx -mv6700. > > _nassert((int)(filt_length)%4 == 0); > _nassert((int)(buffer_length)%2 == 0); > _nassert((int)(filt_length) >= 16); > > I got the resulting code from TOOLS ver 4.20: > > L1: ; PIPED LOOP PROLOG > > ZERO .L2 B4 > || STW .D2T2 B10,*+SP(24) ; |3| > || AND .S1 A4,A7,A8 ; (P) |14| > || AND .L1 A4,A6,A7 ; (P) |14| > || SUB .S2X A6,1,B2 ; (P) |14| > || LDW .D1T1 *+A5[A8],A3 ; (P) |14| > > ZERO .S1 A6 > || MVKH .S2 0x10000,B1 ; init prolog collapse predicate > || ADD .L1 4,A3,A11 ; (P) Define a twin register > || LDW .D2T1 *+B9[B5],A3 ; (P) |14| > || MV .L2X A7,B10 ; (P) Define a twin register > || LDW .D1T2 *+A9[A3],B3 ; (P) |14| > > SET .S1 A0,0xf,0xf,A1 ; init prolog collapse predicate > || SUB .L2 B0,2,B0 > || LDW .D1T1 *+A5[A8],A8 ; (P) |14| > || B .S2 L2 ; (P) |15| > || SUB .L1 A10,A11,A0 ; (P) @|14| > || LDW .D2T2 *+B6[B5],B11 ; (P) |14| ;** ------------------------------------------------------------------------ --* > L2: ; PIPED LOOP KERNEL > > [!A1] ADDSP .L1 A7,A2,A2 ; ^ |14| > || MPYSP .M1X A8,B11,A7 ; @|14| > || LDW .D2T1 *+B7[B10],A8 ; @@|14| > || AND .S2X 
A4,B2,B11 ; @@|14| > || SUB .S1 A0,2,A8 ; @@@|14| > || LDW .D1T2 *+A12[A11],B3 ; @@@|14| > > [!A1] ADDSP .L2 B8,B4,B4 ; ^ |14| > || [!B1] ADDSP .L1 A7,A6,A6 ; @ ^ |14| > || [!B1] MPYSP .M2X B2,A3,B8 ; @|14| > || LDW .D2T2 *+B7[B11],B2 ; @@|14| > || SUB .D1 A0,3,A7 ; @@@|14| > || AND .S1 A4,A8,A8 ; @@@|14| > > [ B1] MPYSU .M2 2,B1,B1 ; > || [ B0] SUB .D2 B0,1,B0 ; @|15| > || ADD .L2 4,B5,B5 ; @@|15| > || MPYSP .M1X A3,B3,A7 ; @@|14| > || AND .S1 A4,A7,A0 ; @@@|14| > || AND .L1 A4,A0,A7 ; @@@|14| > || LDW .D1T1 *+A5[A8],A3 ; @@@|14| > || SUB .S2X A0,1,B2 ; @@@|14| > > [ A1] MPYSU .M1 2,A1,A1 ; > || [!B1] ADDSP .L2 B11,B8,B8 ; @ ^ |14| > || ADD .S1 4,A11,A11 ; @@@Define a twin register > || LDW .D2T1 *+B9[B5],A3 ; @@@|14| > || MV .S2X A7,B10 ; @@@Define a twin register > || LDW .D1T2 *+A9[A11],B3 ; @@@|14| > > [ B0] B .S2 L2 ; @|15| > || MPYSP .M2X A8,B3,B11 ; @@|14| > || LDW .D2T2 *+B6[B5],B11 ; @@@|14| > || LDW .D1T1 *+A5[A0],A8 ; @@@|14| > || SUB .S1 A10,A11,A0 ; @@@@|14| ;** ------------------------------------------------------------------------ --* > > Regards > Jagadeesh Sankaran > > _____________________________________ ---------------------------------------------------------------------------- ---- > ;*************************************************************************** *** > ;* TMS320C6x ANSI C Codegen Version 4.20 * > ;* Date/Time created: Mon Apr 8 08:48:55 2002 * > ;*************************************************************************** *** ;*************************************************************************** *** > ;* GLOBAL FILE PARAMETERS * > ;* * > ;* Architecture : TMS320C670x * > ;* Optimization : Enabled at level 2 * > ;* Optimizing for : Speed * > ;* Based on options: -o2, no -ms * > ;* Endian : Little * > ;* Interrupt Thrshld : Disabled * > ;* Memory Model : Small * > ;* Calls to RTS : Near * > ;* Pipelining : Enabled * > ;* Speculative Load : Disabled * > ;* Memory Aliases : Presume not aliases (optimistic) * > ;* Debug Info : 
No Debug Info * > ;* * > ;*************************************************************************** *** > > .asg A15, FP > .asg B14, DP > .asg B15, SP > .global $bss > > ; opt6x -t -v6700 -O2 /var/tmp/aaaa003TO /var/tmp/daaa003TO > .sect ".text" > .global _filter_1d ;*************************************************************************** *** > ;* FUNCTION NAME: _filter_1d * > ;* * > ;* Regs Modified : A0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,B0,B1,B2, * > ;* B3,B4,B5,B6,B7,B8,B9,B10,B11,B12,SP * > ;* Regs Used : A0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,B0,B1,B2, * > ;* B3,B4,B5,B6,B7,B8,B9,B10,B11,B12,SP * > ;* Local Frame Size : 0 Args + 0 Auto + 28 Save = 28 byte * > ;*************************************************************************** *** > _filter_1d: > ;** ------------------------------------------------------------------------ --* > STW .D2T2 B12,*SP--(32) ; |3| > STW .D2T2 B11,*+SP(28) ; |3| > > ZERO .D1 A2 > || ZERO .S2 B8 > || ZERO .S1 A0 > || MV .L1 A6,A3 > || STW .D2T2 B3,*+SP(20) ; |3| > > ZERO .L2 B1 > || ADD .D1 12,A4,A9 > || ZERO .L1 A3 > || SHR .S2X A3,2,B0 ; |14| > || STW .D2T1 A10,*+SP(8) ; |3| > || ADD .S1X B6,A8,A10 > > ADD .L2X 4,A4,B9 > || SUB .S1X B6,1,A5 > || MVC .S2 CSR,B12 > || STW .D2T1 A12,*+SP(16) ; |3| > || ADD .D1 8,A4,A12 > || SUB .L1 A10,A3,A6 ; (P) |14| > > MV .D2 B4,B7 > || AND .S2 -2,B12,B5 > || MV .L2X A4,B6 > || LDW .D1T2 *+A12[A3],B3 ; (P) |14| > || MV .S1 A5,A4 > || SUB .L1 A6,2,A5 ; (P) |14| > > ZERO .L2 B5 > || MV .S1X B4,A5 > || STW .D2T1 A11,*+SP(12) ; |3| > || MVC .S2 B5,CSR ; interrupts off > || SUB .D1 A6,3,A7 ; (P) |14| > || AND .L1 A4,A5,A8 ; (P) |14| ;*-------------------------------------------------------------------------- --* > ;* SOFTWARE PIPELINE INFORMATION > ;* > ;* Loop source line : 12 > ;* Loop opening brace source line : 13 > ;* Loop closing brace source line : 15 > ;* Loop Unroll Multiple : 4x > ;* Known Minimum Trip Count : 4 > ;* Known Max Trip Count Factor : 1 > ;* Loop Carried 
Dependency Bound(^) : 4 > ;* Unpartitioned Resource Bound : 4 > ;* Partitioned Resource Bound(*) : 5 > ;* Resource Partition: > ;* A-side B-side > ;* .L units 2 2 > ;* .S units 0 1 > ;* .D units 4 4 > ;* .M units 2 2 > ;* .X cross paths 2 5* > ;* .T address paths 4 4 > ;* Long read paths 0 0 > ;* Long write paths 0 0 > ;* Logical ops (.LS) 3 3 (.L or .S unit) > ;* Addition ops (.LSD) 4 2 (.L or .S or .D unit) > ;* Bound(.L .S .LS) 3 3 > ;* Bound(.L .S .D .LS .LSD) 5* 4 > ;* > ;* Searching for software pipeline schedule at ... > ;* ii = 5 Schedule found with 5 iterations in parallel > ;* done > ;* > ;* Epilog not entirely removed > ;* Collapsed epilog stages : 1 > ;* > ;* Prolog not entirely removed > ;* Collapsed prolog stages : 2 > ;* > ;* Minimum required memory pad : 0 bytes > ;* > ;* For further improvement on this loop, try option -mh8 > ;* > ;* Minimum safe trip count : 3 (after unrolling) > ;*-------------------------------------------------------------------------- --* > ;* SINGLE SCHEDULED ITERATION > ;* > ;* C38: > ;* SUB .S1 A10,A11,A0 ; |14| > ;* LDW .D1T2 *+A12[A11],B3 ; |14| > ;* || SUB .S1 A0,2,A8 ; |14| > ;* AND .S1 A4,A8,A8 ; |14| > ;* || SUB .D1 A0,3,A7 ; |14| > ;* AND .L1 A4,A0,A7 ; |14| > ;* || SUB .S2X A0,1,B2 ; |14| > ;* || LDW .D1T1 *+A5[A8],A3 ; |14| > ;* || AND .S1 A4,A7,A0 ; |14| > ;* MV .S2X A7,B10 ; Define a twin register > ;* || LDW .D2T1 *+B9[B5],A3 ; |14| > ;* || LDW .D1T2 *+A9[A11],B3 ; |14| > ;* || ADD .S1 4,A11,A11 ; Define a twin register > ;* LDW .D2T2 *+B6[B5],B11 ; |14| > ;* || LDW .D1T1 *+A5[A0],A8 ; |14| > ;* LDW .D2T1 *+B7[B10],A8 ; |14| > ;* || AND .S2X A4,B2,B11 ; |14| > ;* LDW .D2T2 *+B7[B11],B2 ; |14| > ;* MPYSP .M1X A3,B3,A7 ; |14| > ;* || ADD .L2 4,B5,B5 ; |15| > ;* NOP 1 > ;* MPYSP .M2X A8,B3,B11 ; |14| > ;* MPYSP .M1X A8,B11,A7 ; |14| > ;* MPYSP .M2X B2,A3,B8 ; |14| > ;* || ADDSP .L1 A7,A6,A6 ; ^ |14| > ;* [ B0] SUB .D2 B0,1,B0 ; |15| > ;* ADDSP .L2 B11,B8,B8 ; ^ |14| > ;* [ B0] B .S2 C38 ; |15| > ;* ADDSP .L1 
A7,A2,A2 ; ^ |14| > ;* ADDSP .L2 B8,B4,B4 ; ^ |14| > ;* NOP 3 > ;* ; BRANCH OCCURS ; |15| > ;*-------------------------------------------------------------------------- --* > L1: ; PIPED LOOP PROLOG > > ZERO .L2 B4 > || STW .D2T2 B10,*+SP(24) ; |3| > || AND .S1 A4,A7,A8 ; (P) |14| > || AND .L1 A4,A6,A7 ; (P) |14| > || SUB .S2X A6,1,B2 ; (P) |14| > || LDW .D1T1 *+A5[A8],A3 ; (P) |14| > > ZERO .S1 A6 > || MVKH .S2 0x10000,B1 ; init prolog collapse predicate > || ADD .L1 4,A3,A11 ; (P) Define a twin register > || LDW .D2T1 *+B9[B5],A3 ; (P) |14| > || MV .L2X A7,B10 ; (P) Define a twin register > || LDW .D1T2 *+A9[A3],B3 ; (P) |14| > > SET .S1 A0,0xf,0xf,A1 ; init prolog collapse predicate > || SUB .L2 B0,2,B0 > || LDW .D1T1 *+A5[A8],A8 ; (P) |14| > || B .S2 L2 ; (P) |15| > || SUB .L1 A10,A11,A0 ; (P) @|14| > || LDW .D2T2 *+B6[B5],B11 ; (P) |14| ;** ------------------------------------------------------------------------ --* > L2: ; PIPED LOOP KERNEL > > [!A1] ADDSP .L1 A7,A2,A2 ; ^ |14| > || MPYSP .M1X A8,B11,A7 ; @|14| > || LDW .D2T1 *+B7[B10],A8 ; @@|14| > || AND .S2X A4,B2,B11 ; @@|14| > || SUB .S1 A0,2,A8 ; @@@|14| > || LDW .D1T2 *+A12[A11],B3 ; @@@|14| > > [!A1] ADDSP .L2 B8,B4,B4 ; ^ |14| > || [!B1] ADDSP .L1 A7,A6,A6 ; @ ^ |14| > || [!B1] MPYSP .M2X B2,A3,B8 ; @|14| > || LDW .D2T2 *+B7[B11],B2 ; @@|14| > || SUB .D1 A0,3,A7 ; @@@|14| > || AND .S1 A4,A8,A8 ; @@@|14| > > [ B1] MPYSU .M2 2,B1,B1 ; > || [ B0] SUB .D2 B0,1,B0 ; @|15| > || ADD .L2 4,B5,B5 ; @@|15| > || MPYSP .M1X A3,B3,A7 ; @@|14| > || AND .S1 A4,A7,A0 ; @@@|14| > || AND .L1 A4,A0,A7 ; @@@|14| > || LDW .D1T1 *+A5[A8],A3 ; @@@|14| > || SUB .S2X A0,1,B2 ; @@@|14| > > [ A1] MPYSU .M1 2,A1,A1 ; > || [!B1] ADDSP .L2 B11,B8,B8 ; @ ^ |14| > || ADD .S1 4,A11,A11 ; @@@Define a twin register > || LDW .D2T1 *+B9[B5],A3 ; @@@|14| > || MV .S2X A7,B10 ; @@@Define a twin register > || LDW .D1T2 *+A9[A11],B3 ; @@@|14| > > [ B0] B .S2 L2 ; @|15| > || MPYSP .M2X A8,B3,B11 ; @@|14| > || LDW .D2T2 *+B6[B5],B11 ; @@@|14| 
> || LDW .D1T1 *+A5[A0],A8 ; @@@|14| > || SUB .S1 A10,A11,A0 ; @@@@|14| ;** ------------------------------------------------------------------------ --* > L3: ; PIPED LOOP EPILOG > > MPYSP .M1X A8,B11,A0 ; (E) @@@|14| > || LDW .D2T1 *+B7[B10],A8 ; (E) @@@@|14| > || AND .S2X A4,B2,B11 ; (E) @@@@|14| > || ADDSP .L1 A7,A2,A4 ; (E) @@ ^ |14| > > ADDSP .L1 A7,A6,A5 ; (E) @@@ ^ |14| > || MPYSP .M2X B2,A3,B5 ; (E) @@@|14| > || LDW .D2T2 *+B7[B11],B2 ; (E) @@@@|14| > || ADDSP .L2 B8,B4,B4 ; (E) @@ ^ |14| > > LDW .D2T1 *+SP(16),A12 ; |19| > || ADD .S2 4,B5,B5 ; (E) @@@@|15| > || MPYSP .M1X A3,B3,A0 ; (E) @@@@|14| > > LDW .D2T1 *+SP(12),A11 ; |19| > || ADDSP .L2 B11,B8,B4 ; (E) @@@ ^ |14| > > LDW .D2T2 *+SP(20),B3 ; |19| > || MPYSP .M2X A8,B3,B5 ; (E) @@@@|14| > > LDW .D2T2 *+SP(28),B11 ; |19| > || MPYSP .M1X A8,B11,A0 ; (E) @@@@|14| > || ADDSP .L1 A0,A4,A4 ; (E) @@@ ^ |14| > > LDW .D2T2 *+SP(24),B10 ; |19| > || ADDSP .L1 A0,A5,A3 ; (E) @@@@ ^ |14| > || MPYSP .M2X B2,A3,B5 ; (E) @@@@|14| > || ADDSP .L2 B5,B4,B4 ; (E) @@@ ^ |14| > > LDW .D2T1 *+SP(8),A10 ; |19| > ADDSP .L2 B5,B4,B4 ; (E) @@@@ ^ |14| > NOP 1 > ADDSP .L1 A0,A4,A0 ; (E) @@@@ ^ |14| > ADDSP .L2 B5,B4,B5 ; (E) @@@@ ^ |14| > NOP 2 > > LDW .D2T2 *++SP(32),B12 ; |19| > || MVC .S2 B12,CSR ; interrupts on > > ADDSP .L1X B5,A0,A0 > NOP 3 > ADDSP .L1 A3,A0,A0 ; |17| > NOP 1 > B .S2 B3 ; |19| > NOP 1 > ADDSP .L1X B4,A0,A4 > NOP 3 > ; BRANCH OCCURS ; |19| ---------------------------------------------------------------------------- ---- > float filter_1d(const float *W, const float *X, int filt_length, int > buffer_length, int pointer_pos) > { > int i; > float retval=0; > int intermadiate_index = pointer_pos+buffer_length; > > _nassert((int)(filt_length)%4 == 0); > _nassert((int)(buffer_length)%2 == 0); > _nassert((int)(filt_length) >= 16); > > for (i = 0; i < filt_length; ++i) > { > retval+=W[i]*X[(intermadiate_index - i) & (buffer_length - 1)]; > } > > return retval; > > } > |
|
/*
 * Get_Rez()
 *
 * Builds the delayed excitation Tv from the previous excitation vector
 * (text sections 2.14, 2.18 & 3.4).
 *
 * Arguments:
 *   Tv       output buffer; ClPitchOrd/2 + SubFrLen + ClPitchOrd/2 entries
 *            are written
 *   PrevExc  previous excitation vector (read only by this routine)
 *   Lag      closed-loop pitch lag; varies from call to call and is used as
 *            a modulus, so it must be non-zero
 *
 * Return value: none
 */
void Get_Rez( Word16 * restrict Tv, Word16 * restrict PrevExc, Word16 Lag )
{
    const int lag      = (int) Lag;
    const int half_ord = ClPitchOrd/2;
    const int tail_len = SubFrLen+ClPitchOrd/2;
    int src;
    int k;

    /* Leading samples: the ClPitchOrd/2 entries that sit immediately
     * before position PitchMax - Lag in the previous excitation. */
    src = PitchMax - lag - ClPitchOrd/2;
    for ( k = 0 ; k < half_ord ; ++k, ++src ) {
        Tv[k] = PrevExc[src];
    }

    /* Remaining samples: start at PitchMax - Lag and wrap around every
     * Lag samples, so the last Lag entries are repeated periodically. */
    k = 0;
    while ( k < tail_len ) {
        Tv[half_ord + k] = PrevExc[PitchMax - lag + k % lag];
        ++k;
    }
}
|
Thanks, everyone, for the very valuable advice. The code works much more smoothly this time with your advice. There is one more thing I want to ask: --> I am using a C struct to centralize all common variables used throughout my application. I'm doing this because my functions generally require several variables. However, I wonder if that causes any performance bottlenecks. To better visualize, assume I have a struct that contains 10 variables. These include pointers and arrays. And I have a function that manipulates an array in the struct using other variables and arrays in the struct and possibly one or two variables not included in the struct. I would prefer not (or should I?) to pass tens of variables to the function; instead I just pass a pointer to the struct and am done. However, I am unsure whether this scheme causes a performance drawback. Below is what I mean; the question is: should I choose the first scheme or the second, and why? (Do not bother with the syntax.) Thanks a lot for the help. 1. void myFunction1 (var1, var2, var3, var4...... varN){ . . . } 2. _MyStruct{ var1; var2; var3; . . varN; } myStruct void myFunction2(*myStruct){ . . } Faysal, ------------------- > >>THE C-CODE >> >>/* >>* This function does real time filtering over a buffer to which data is >>written circularly >>* W :the filter >>* X :the buffer >>* filt_length : the length of the filter >>* buffer_length : the length of the buffer >>* pointer_pos : the index of the newest element writen to the buffer >>* >> */ >>float filter_1d(const float *W, const float *X, int filt_length, int >>buffer_length, int pointer_pos){ >> int i; >> float retval=0; >> int intermadiate_index = pointer_pos+buffer_length; >> >> for (i=0; i<filt_length; ++i){ >> retval+=W[i]*X[(intermadiate_index - i)%buffer_length]; >> } >> return retval; >>} >> > >This is a case where the modulus function is being evaluated by a call to remi, >remainder upon integer division. 
Modulus is an expensive function, but >fortunately if buffer_length is a power of 2, then one can obtain the >same results as modulus by implementing & (buffer_length - 1). AND is >an atomic operation and hence will not result in the code being >disqualified from software pipelining. The buffer_length in your case >has got to be a power of two as you were trying to implement circular >buffer using the hardware which only works for powers of 2, anyways. > >To the best of my knowledge there is no direct support for circular >addressing from C, there is however support from SA where you can program >AMR and CSR control registers. Refer to the C6000 documentation for this. The >other approach is to use block based processing, where you maintain >the context once in N blocks. This will result in a memcpy of the context >from the N-1 th block to front of the Nth block once in every N blocks, >but will simplify the software development a whole lot. > >Along with this e-mail I am attaching the C code shown here and the resulting >assembly. With some simple tweaks I was able to get 4 filter-taps to be >evaluated in 5 cycles. Another way you could go is to use the fir benchmark >on TI's web-page. > >float filter_1d(const float *W, const float *X, int filt_length, int >buffer_length, int pointer_pos) >{ > int i; > float retval=0; > int intermadiate_index = pointer_pos+buffer_length; > > _nassert((int)(filt_length)%4 == 0); > for (i = 0; i < filt_length; ++i) > { > retval+=W[i]*X[(intermadiate_index - i) & (buffer_length - 1)]; > } > > return retval; > >} > >I have also added the following _nassert's to help the compiler with >optimization, used -o2 -mwtx -mv6700. 
> > _nassert((int)(filt_length)%4 == 0); > _nassert((int)(buffer_length)%2 == 0); > _nassert((int)(filt_length) >= 16); > >I got the resulting code from TOOLS ver 4.20: > >L1: ; PIPED LOOP PROLOG > > ZERO .L2 B4 >|| STW .D2T2 B10,*+SP(24) ; |3| >|| AND .S1 A4,A7,A8 ; (P) |14| >|| AND .L1 A4,A6,A7 ; (P) |14| >|| SUB .S2X A6,1,B2 ; (P) |14| >|| LDW .D1T1 *+A5[A8],A3 ; (P) |14| > > ZERO .S1 A6 >|| MVKH .S2 0x10000,B1 ; init prolog collapse predicate >|| ADD .L1 4,A3,A11 ; (P) Define a twin register >|| LDW .D2T1 *+B9[B5],A3 ; (P) |14| >|| MV .L2X A7,B10 ; (P) Define a twin register >|| LDW .D1T2 *+A9[A3],B3 ; (P) |14| > > SET .S1 A0,0xf,0xf,A1 ; init prolog collapse predicate >|| SUB .L2 B0,2,B0 >|| LDW .D1T1 *+A5[A8],A8 ; (P) |14| >|| B .S2 L2 ; (P) |15| >|| SUB .L1 A10,A11,A0 ; (P) @|14| >|| LDW .D2T2 *+B6[B5],B11 ; (P) |14| > >;** ---------------------------------------------------------------------- ----* >L2: ; PIPED LOOP KERNEL > > [!A1] ADDSP .L1 A7,A2,A2 ; ^ |14| >|| MPYSP .M1X A8,B11,A7 ; @|14| >|| LDW .D2T1 *+B7[B10],A8 ; @@|14| >|| AND .S2X A4,B2,B11 ; @@|14| >|| SUB .S1 A0,2,A8 ; @@@|14| >|| LDW .D1T2 *+A12[A11],B3 ; @@@|14| > > [!A1] ADDSP .L2 B8,B4,B4 ; ^ |14| >|| [!B1] ADDSP .L1 A7,A6,A6 ; @ ^ |14| >|| [!B1] MPYSP .M2X B2,A3,B8 ; @|14| >|| LDW .D2T2 *+B7[B11],B2 ; @@|14| >|| SUB .D1 A0,3,A7 ; @@@|14| >|| AND .S1 A4,A8,A8 ; @@@|14| > > [ B1] MPYSU .M2 2,B1,B1 ; >|| [ B0] SUB .D2 B0,1,B0 ; @|15| >|| ADD .L2 4,B5,B5 ; @@|15| >|| MPYSP .M1X A3,B3,A7 ; @@|14| >|| AND .S1 A4,A7,A0 ; @@@|14| >|| AND .L1 A4,A0,A7 ; @@@|14| >|| LDW .D1T1 *+A5[A8],A3 ; @@@|14| >|| SUB .S2X A0,1,B2 ; @@@|14| > > [ A1] MPYSU .M1 2,A1,A1 ; >|| [!B1] ADDSP .L2 B11,B8,B8 ; @ ^ |14| >|| ADD .S1 4,A11,A11 ; @@@Define a twin register >|| LDW .D2T1 *+B9[B5],A3 ; @@@|14| >|| MV .S2X A7,B10 ; @@@Define a twin register >|| LDW .D1T2 *+A9[A11],B3 ; @@@|14| > > [ B0] B .S2 L2 ; @|15| >|| MPYSP .M2X A8,B3,B11 ; @@|14| >|| LDW .D2T2 *+B6[B5],B11 ; @@@|14| >|| LDW .D1T1 *+A5[A0],A8 ; @@@|14| >|| 
SUB .S1 A10,A11,A0 ; @@@@|14| > >;** ---------------------------------------------------------------------- ----* > >Regards >Jagadeesh Sankaran > >_____________________________________ > Faysal Basci METU EEE Dept. |