Hi all, I put my head down and on a hunch put together some basic functions for a 40b library, but it's not the one that the compiler uses, nor is it the other one from Gary Sitton (GasLight) that was mentioned a day or two ago. Nor is this from SPRA114, 'Doublelength Floating-Point Arithmetic on the TMS320C30' by Al Lovrich. What I surmised was that if a 40b float could be represented as an upper 32b float and a lower 'error' 32b, also as a float, some of the math might get easier. Basically it makes the add/subtract operations a little more difficult but greatly improves the multiply. Anyhow... below you will find implemented MPY, ADD, SUB, INV and SQRT, plus a converter from long double to this new format (LDBL for lack of a better name). To *see* the output you will want to download the latest tools. This is because the newest DSK3DW version now has STDOUT functionality. If interested, these functions are for lean and mean char IO (what you want in embedded systems) and can be found in DSKIO.C, but the important thing is to open a DSK_STDOUT window from the 'Window' pulldown menu. The test itself is fairly simple (I think it is called a SAVAGE benchmark) and may not yet catch all problems (this lib is a day old). It is basically a loop where a math function is followed by its inverse. For example, a=sqrt(a*a) should return the same value, but typically has some errors. Do this enough and you can get a statistical idea of how well you are doing with your functions. To split this file up, look for <FILE> and cut the file out as needed. You will find five files: LDBL40.C, LDBLASM.ASM, LDBL40.CMD, LDBL40.BAT and LDBL40.MAK. The MAK file is only needed if you like to use a make utility. This one is for a Borland make.
Enjoy, Best regards Keith Larson - <FILE> LDBL40.C //================================================================ // LDBL40.CPP // Keith Larson // TMS320 DSP Applications // (C) Copyright 1996-2003 // Texas Instruments Incorporated // // LDBL40 is an experimental method for computing 40b float multiply // and adds that would be optimal for the TMS320C3x //================================================================= #include "math.h" #include "dskio.h" #include "ldbl40.h" asm(" .global BREAKPOINT_0"); asm(" .global BREAKPOINT_1"); asm(" .global BREAKPOINT_2"); asm(" .global BREAKPOINT_3"); #define BP0 asm("BREAKPOINT_0"); #define BP1 asm("BREAKPOINT_1"); #define BP2 asm("BREAKPOINT_2"); #define BP3 asm("BREAKPOINT_3"); char OUT[200]; void main(void) { char *o; long double a,b,c,d; float f; int i; LDBL A,B,C,D; dsk_clrscr(); a = 1.1; b=1.0; for(i=1;i<500;i++) { o=OUT; o+=mf_sprintf(o,a,15,10,1); o+=SPACE(o,2); c = a*a; o+=mf_sprintf(o,c,15,10,1); o+=SPACE(o,2); // c = 1.0/c; d = sqrt(c); o+=mf_sprintf(o,d,15,10,1); o+=CRLF(o); dsk_puts(OUT); // d = 1.0/d; a = d+b; } BP1 dsk_puts("\r\n"); //------ A=FL(1.1); B=FL(1.0); for(i=1;i<500;i++) { o=OUT; o+=mf_sprintf(o,A.hi,15,10,1); o+=SPACE(o,2); //BP2 C=MPY40 (A,A); o+=mf_sprintf(o,C.hi,15,10,1); o+=SPACE(o,2); // C=INV40(C); D=SQRT40(C); o+=mf_sprintf(o,D.hi,15,10,1); o+=CRLF(o); dsk_puts(OUT); // D=INV40(D); AD40 (D,B); // A=SUB40 (A,B); // does it remove? 
} BP0 //------- A=FL(355.1); B=FL(113.0); D=FL(40126); C=MPY40(A,B); C=SUB40(C,D); o=OUT; o+=mf_sprintf(o,C.hi,15,10,1); o+=SPACE(o,3); o+=mf_sprintf(o,C.lo,15,10,1); o+=CRLF(o); dsk_puts(OUT); CD40(A,B); o=OUT; o+=mf_sprintf(o,C.hi,15,10,1); o+=SPACE(o,3); o+=mf_sprintf(o,C.lo,15,10,1); o+=CRLF(o); dsk_puts(OUT); C=SUB40(C,A); o=OUT; o+=mf_sprintf(o,C.hi,15,10,1); o+=SPACE(o,3); o+=mf_sprintf(o,C.lo,15,10,1); o+=CRLF(o); dsk_puts(OUT); dsk_puts(OUT); } <FILE> LDBLASM.ASM ;================================================================ ; LDBLASM.CPP ; Keith Larson ; TMS320 DSP Applications ; (C) Copyright 1996-2003 ; Texas Instruments Incorporated ; ; LDBL40 is an experimental method for computing 40b float multiply ; and adds that would be optimal for the TMS320C3x ;================================================================= .global _FL ; .global _MPY40 ; .global _ADD40 ; .global _SUB40 ; .global _INV40 ; .global _SQRT40 ; ;--------------- ;LDBL FL1(ldouble in) ;--------------- _FL pop r1 ; ldi sp,ar1 ; LDFU *-AR1(1),R0 ; 32b float LDIU *-AR1(0),R0 ; 40b precise bud r1 ; STF R0,*AR0 ; save hi bits SUBF *AR0,R0 ; residual STF R0,*+AR0(1) ; save lo bits ;------------------- ;LDBL MPY40( LDBL A,LDBL B) 12 cycles ;------------------- _MPY40 pop r1 ; ldi sp,ar1 ; ldi *-AR1(0),AR2 ; load ptrs ldi *-AR1(1),AR1 ; MPYF3 *+AR1(0),*+AR2(1),R2 ; hi*lo MPYF3 *+AR1(1),*+AR2(0),R0 ; lo*hi MPYF3 *+AR1(0),*+AR2(0),R0 ; hi*hi || ADDF3 R0,R2,R2 ; ADDF3 R0,R2,R2 ; bud r1 ; STF R2,*AR0 ; save hi bits SUBF *AR0,R2 ; residual STF R2,*+AR0(1) ; save lo bits ;------------------- ;LDBL ADD40(LDBL A,LDBL B); 11 cycles ;------------------- _ADD40 pop r1 ; ldi sp,ar1 ; ldi *-AR1(0),AR2 ; load ptrs ldi *-AR1(1),AR1 ; addf *+AR1(0),*+AR1(1),R2 ; lo+lo addf *+AR2(0),*+AR2(1),R0 ; hi+hi addf3 R0,R2,R2 ; bud r1 ; STF R2,*AR0 ; save hi bits SUBF *AR0,R2 ; residual STF R2,*+AR0(1) ; save lo bits ;---------------------- ;LDBL SUB40(LDBL A,LDBL B) ;----------------------- _SUB40 pop r1 ; ldi 
sp,ar1 ; ldi *-AR1(0),AR2 ; load ptrs ldi *-AR1(1),AR1 ; addf *+AR1(0),*+AR1(1),R2 ; lo+lo addf *+AR2(0),*+AR2(1),R0 ; hi+hi subf3 R2,R0,R2 ; bud r1 ; STF R2,*AR0 ; save hi bits SUBF *AR0,R2 ; residual STF R2,*+AR0(1) ; save lo bits ;---------------------- ; LDBL INV40(LDBL V) ;------------------------ _INV40 pop R3 ; ldi sp,ar1 ; ldi *-AR1(0),AR1 ; load ptr ldf *+AR1(0),R0 ; hi addf *+AR1(1),R0 ; lo (full prec) ldf R0,R1 ; lsh 1,R1 ; fast inverse pushf R1 ; pop R1 ; not R1,R1 ; 1's comp inverse push R1 ; popf R1 ; 3b lsh -1,R1 ; mpyf R1,R0,R2 ; 6b subrf 2.0,R2 ; mpyf R2,R1 ; mpyf R1,R0,R2 ; 12b subrf 2.0,R2 ; mpyf R2,R1 ; mpyf R1,R0,R2 ; 24b subrf 2.0,R2 ; mpyf R2,R1 ; R1=1/R0 ;------- ; R1 is 1/R0 precise to 24b, so only need to ; do partial full precision mpyf ;------- mpyf3 *+AR1(0),R1,R2 ; hi*hi mpyf3 *+AR1(1),R1,R0 ; hi*lo addf3 R2,R0,R2 ; subrf 2.0,R2 ; stf R2,*+AR0(0) ; Use destination subf *+AR0(0),R2 ; for temp store stf R2,*+AR0(1) ; mpyf3 *+AR0(0),R1,R2 ; hi*hi mpyf3 *+AR0(1),R1,R0 ; hi*lo addf3 R2,R0,R2 ; ;-------------------------- Exit splits bud R3 ; value STF R2,*AR0 ; save hi bits SUBF *AR0,R2 ; residual STF R2,*+AR0(1) ; save lo bits ;---------------------- ;LDBL SQRT40(LDBL V) ;---------------------- _SQRT40 pop AR2 ; ldi sp,ar1 ; ldi *-AR1(0),AR1 ; load ptr ldf *+AR1(0),R0 ; hi addf *+AR1(1),R0 ; lo (full prec) ;- - - - - - - - - - - - - ; ldf R0,R1 ; lsh 1,R1 ; fast sqrt pushf R1 ; using log2(exp) pop R1 ; ash -1,R1 ; not R1,R1 ; 1's comp inverse push R1 ; popf R1 ; 3b lsh -1,R1 ; mpyf 0.5,R0 ; ;- - - - - - - - - - - - - ; mpyf R1,R1,R2 ; 6b mpyf R0,R2 ; subrf 1.5,R2 ; mpyf R2,R1 ; ;- - - - - - - - - - - - - ; mpyf R1,R1,R2 ; 12b mpyf R0,R2 ; subrf 1.5,R2 ; mpyf R2,R1 ; ;- - - - - - - - - - - - - ; mpyf R1,R1,R2 ; 22-24b mpyf R0,R2 ; subrf 1.5,R2 ; mpyf R2,R1 ; ;-------- mpyf R1,R1,R2 ; 24x24->32b ;-------- pushf R2 ; convert to popf R0 ; LDBL(R0,R2) subf R0,R2 ; mpyf3 R2,*+AR1(0),R2 ; lo*hi mpyf3 R0,*+AR1(1),R3 ; hi*lo mpyf3 R0,*+AR1(0),R0 ; 
hi*hi || addf3 R3,R2,R2 ; addf R0,R2 ; ;-------- mpyf 0.5,R2 ;R0=IN/2 subrf 1.5,R2 ; ;-------- pushf R2 ; convert to popf R0 ; LDBL(R0,R2) subf R0,R2 ; mpyf R1,R2 ; hi*hi mpyf R1,R0 ; hi*lo addf R0,R2 ; sqrt(1/X) @40b ;-------- ; exit here for 1/sqrt(x) ; continue for x/sqrt(x) ;-------- pushf R2 ; final 40bx40b popf R0 ; subf R0,R2 ; mpyf3 R2 ,*+AR1(0),R2 ; lo*hi 40b mpy mpyf3 R0 ,*+AR1(1),R3 ; hi*lo mpyf3 R0 ,*+AR1(0),R0 ; hi*hi || addf3 R3,R2,R2 ; addf R0,R2 ; ;-------- bud AR2 ; STF R2,*AR0 ; save hi bits SUBF *AR0,R2 ; residual STF R2,*+AR0(1) ; save lo bits ;========================================================= ;====== END ASSEMBLY FUNCTIONS ====================== ;========================================================= .if 0 /*================================================== Approximate C equivelent functions ==================================================*/ LDBL FL1(ldouble in) { LDBL d; d.hi = in ; // upper as float d.lo = in - d.hi; // residual as float return d; } LDBL MPY40( LDBL A,LDBL B) { LDBL C; float f1,f2,f3; float i; i = A.hi *B.hi; // 4 cycles f2 = A.hi *B.lo; f3 = A.lo *B.hi; i+= f2+f3; C = FL(i); return C; } LDBL ADD40(LDBL A,LDBL B); { LDBL C; ldouble i; float f1; i = A.hi + A.lo + B.hi + B.lo; f1 = i; C.hi = f1; f1 = i - f1; C.lo = f1; return C; } LDBL SUB40(LDBL A,LDBL B) { LDBL C; ldouble i; i = (A.hi - B.hi); i += (A.lo - B.lo); C = FL(i); return C; } LDBL INV40(LDBL V) { LDBL C; LDBL G,T; G.hi = 1/(V.hi); G.lo = 0; T=MPY40(G,V); T.hi = 2.0 - T.hi; C=MPY40(T,G); return C; } LDBL SQRT40(LDBL V) { LDBL C; float g; LDBL G, K, T; g = 1.0/sqrt(V.hi); C.hi = g; C.lo=0; K.hi = 1.5; K.lo=0; T=MPY40(C,C); T=MPY40(T,V); T.hi *= 0.5; T.lo *= 0.5; T.hi = 1.5-T.hi; C=MPY40(T,C); C=MPY40(C,V); return C; } .endif <FILE> LDBL40.CMD ldbl40.obj dskio.obj ldblasm.obj -cr -o ldbl40.out -l rts30.lib -heap 1024 /* large size is for stdio functions */ -stack 256 /* large size is for stdio functions */ /* -m ldbl40.map */ /* create map to see 
allocations */ /*-e _c_int00 do not need for ver 5.0 */ MEMORY { BOOTRSRV : org=0x809800, len=0x0002 /* Dont load here if bootloading */ EXTLOW : org=0x000000, len=0x4000 /* External RAM for EVM */ RAM0 : org=0x809802, len=0x06fd /* INTERNAL BLK 0 */ /*RAM1 : org=0x809C00, len=0x0300 *//* INTERNAL BLK 1 */ KERNEL : org=0x809F00, len=0x00C0 /* INTERNAL BLK 1 */ BRNCHTBL : org=0x809FC5, len=0x0002 /* INTERNAL BLK 1 */ BRNCHTBL : org=0x809FC9, len=0x0002 /* INTERNAL BLK 1 */ RAM2 : org=0x800000, len=0x8000 /* INTERNAL BLK 2 */ /*RAM3a : org=0x804000, len=0x0200 *//* INTERNAL BLK 3 */ /*RAM3b : org=0x804200, len=0x3E00 *//* INTERNAL BLK 3 */ } SECTIONS { .text : {} > RAM2 .cinit : {} > RAM2 .data : {} > RAM2 .bss : {} > RAM2 .sysmem : {} > RAM2 .cio : {} > RAM2 /* needed for stdout functions */ .const : {} > RAM2 .stack : {} > RAM2 BRTBL1 : {} > BRNCHTBL1 BRTBL2 : {} > BRNCHTBL2 VECTS : {} > EXTLOW } <FILE> LDBL40.BAT erase ldbl40.out cl30 dskio.c -o3 -g cl30 ldbl40.c -o3 -mc -mm -mp -tp -ms -g asm30 ldblasm.asm -g lnk30 ldbl40.cmd <FILE> LDBL40.MAK (if you have Borland Make utility) ldbl40.out: dskio.obj ldbl40.obj ldbl40.cmd ldbl40.mak ldblasm.obj lnk30 ldbl40.cmd dskio.obj: dskio.c ldbl40.mak cl30 dskio.c -o3 -g ldbl40.obj: ldbl40.c ldbl40.mak cl30 ldbl40.c -o3 -mc -mm -mp -tp -ms -g ldblasm.obj: ldblasm.asm ldbl40.mak asm30 ldblasm.asm -g +-----------+ |Keith Larson | |Member Group Technical Staff | |Texas Instruments Incorporated | | | | 281-274-3288 | | | | www.micro.ti.com/~klarson | |-----------+ | TMS320C3x/C4x/VC33 Applications | | | | TMS320VC33 | | The lowest cost and lowest power 500 w/Mflop | | floating point DSP on the planet! | +-----------+