Reply by Keith E. Larson, November 13, 2003 (2003-11-13)
Hi all

I put my head down and on a hunch put together some basic functions for a
40b library, but it's not the one that the compiler uses, nor is it the other
one from Gary Sitton (GasLight) that was mentioned a day or two ago. Nor is
this from SPRA114, 'Doublelength Floating-Point Arithmetic on the TMS320C30'
by Al Lovrich.

What I surmised was that if a 40b float could be represented as an upper 32b
float and lower 'error' 32b, also as a float, some of the math might get
easier. Basically it makes the add/subtract operations a little more
difficult but greatly improves the multiply.

Anyhow... below you will find implemented MPY, ADD, SUB, INV and SQRT, plus
a long double to this new format (LDBL for lack of a better name) converter.

To *see* the output you will want to download the latest tools. This is
because the newest DSK3DW version now has STDOUT functionality. If
interested these functions are for lean and mean char IO (what you want in
embedded systems) and can be found in DSKIO.C, but the important thing is to
open a DSK_STDOUT window from the 'Window' pulldown menu.

The test itself is fairly simple (I think it is called a SAVAGE benchmark)
and may not yet catch all problems (this lib is a day old). It is basically
a loop where a math function is followed by its inverse. For example,
a=sqrt(a*a) should return the same, but typically has some errors. Do this
enough and you can get a statistical idea of how well you are doing with
your functions.

To split this file up, look for <FILE> and cut the file out as needed. You
will find five files, LDBL40.C, LDBLASM.ASM, LDBL40.CMD, LDBL40.BAT and
LDBL40.MAK. The MAK file is only needed if you like to use a make utility.
This one is for a Borland make.

Enjoy, Best regards
Keith Larson
-

<FILE> LDBL40.C

//================================================================
// LDBL40.C
// Keith Larson
// TMS320 DSP Applications
// (C) Copyright 1996-2003
// Texas Instruments Incorporated
//
// LDBL40 is an experimental method for computing 40b float multiply
// and adds that would be optimal for the TMS320C3x
//=================================================================
#include "math.h"
#include "dskio.h"
#include "ldbl40.h"

// Export four symbols so they are visible outside this object file.
// Each string is passed verbatim to the assembler as a .global directive.
asm(" .global BREAKPOINT_0");
asm(" .global BREAKPOINT_1");
asm(" .global BREAKPOINT_2");
asm(" .global BREAKPOINT_3");
// BPn expands to an asm() statement whose text is just the symbol name,
// starting in the first column (no leading space, unlike the .global
// strings above) - presumably so the assembler treats it as a label at
// the point of use; TODO confirm. The macros carry their own trailing
// ';', so they are written without one where they are used in main().
#define BP0 asm("BREAKPOINT_0");
#define BP1 asm("BREAKPOINT_1");
#define BP2 asm("BREAKPOINT_2");
#define BP3 asm("BREAKPOINT_3");
// Line buffer: main() formats each output line here before dsk_puts().
char OUT[200];

/*
 * main - exercise the LDBL (hi/lo float pair) routines against native
 * long double arithmetic.
 *
 * Pass 1 runs x -> sqrt(x*x) + 1 for 499 iterations in long double and
 * prints x, x*x and sqrt(x*x) on each line.
 * Pass 2 runs the same recurrence with MPY40 / SQRT40 / ADD40 and prints
 * the .hi word of each intermediate, so the two listings can be compared
 * line by line.
 * Pass 3 prints hi and lo of a few one-off MPY40 / SUB40 / ADD40 results.
 *
 * Each line is built in OUT through the cursor 'o'; mf_sprintf(), SPACE()
 * and CRLF() each return the number of characters they appended
 * (presumably declared in dskio.h - TODO confirm).
 */
void main(void)
{
    char *o;                    /* write cursor into OUT               */
    long double a,b,c,d;        /* reference values                    */
    int i;
    LDBL A,B,C,D;               /* same values in hi/lo representation */

    dsk_clrscr();

    /*------ pass 1: native long double reference ------*/
    a = 1.1; b=1.0;
    for(i=1;i<500;i++)
    {
        o=OUT; o+=mf_sprintf(o,a,15,10,1); o+=SPACE(o,2);
        c = a*a; o+=mf_sprintf(o,c,15,10,1); o+=SPACE(o,2);
        // c = 1.0/c;
        d = sqrt(c); o+=mf_sprintf(o,d,15,10,1); o+=CRLF(o); dsk_puts(OUT);
        // d = 1.0/d;
        a = d+b;
    }
    BP1
    dsk_puts("\r\n");

    /*------ pass 2: same recurrence with the LDBL routines ------*/
    A=FL(1.1);
    B=FL(1.0);
    for(i=1;i<500;i++)
    {
        o=OUT; o+=mf_sprintf(o,A.hi,15,10,1); o+=SPACE(o,2);
        //BP2
        C=MPY40 (A,A); o+=mf_sprintf(o,C.hi,15,10,1); o+=SPACE(o,2);
        // C=INV40(C);
        D=SQRT40(C); o+=mf_sprintf(o,D.hi,15,10,1); o+=CRLF(o); dsk_puts(OUT);
        // D=INV40(D);
        /* Mirrors 'a = d+b' above. The text had degraded to "AD40 (D,B);",
           which named no function and never advanced A. */
        A=ADD40 (D,B);
        // A=SUB40 (A,B); // does it remove?
    }
    BP0

    /*------ pass 3: spot checks, printing both hi and lo ------*/
    A=FL(355.1);
    B=FL(113.0);
    D=FL(40126);
    C=MPY40(A,B);
    C=SUB40(C,D);
    o=OUT;
    o+=mf_sprintf(o,C.hi,15,10,1); o+=SPACE(o,3);
    o+=mf_sprintf(o,C.lo,15,10,1); o+=CRLF(o); dsk_puts(OUT);

    /* Was "CD40(A,B);" - same lost "=AD" as in pass 2. */
    C=ADD40(A,B); o=OUT;
    o+=mf_sprintf(o,C.hi,15,10,1); o+=SPACE(o,3);
    o+=mf_sprintf(o,C.lo,15,10,1); o+=CRLF(o); dsk_puts(OUT);

    C=SUB40(C,A); o=OUT;
    o+=mf_sprintf(o,C.hi,15,10,1); o+=SPACE(o,3);
    o+=mf_sprintf(o,C.lo,15,10,1); o+=CRLF(o); dsk_puts(OUT);

    /* NOTE(review): OUT is unchanged since the previous dsk_puts(), so
       this prints the last line a second time. Kept as in the original;
       confirm whether it is intentional. */
    dsk_puts(OUT);
}

<FILE> LDBLASM.ASM

;================================================================
; LDBLASM.ASM
; Keith Larson
; TMS320 DSP Applications
; (C) Copyright 1996-2003
; Texas Instruments Incorporated
;
; LDBL40 is an experimental method for computing 40b float multiply
; and adds that would be optimal for the TMS320C3x
;=================================================================
.global _FL ;
.global _MPY40 ;
.global _ADD40 ;
.global _SUB40 ;
.global _INV40 ;
.global _SQRT40 ;
;---------------
;LDBL FL1(ldouble in)
;
; Entry point _FL: split a long double into the LDBL {hi, lo} pair.
;   In : two stack words, read at *-AR1(1) and *-AR1(0) once the return
;        address has been popped (the two halves of the argument).
;   Out: *AR0     = the value stored as a 32b float (hi)
;        *+AR0(1) = the value minus hi, stored as a float (lo)
;   Uses: R0, r1, AR1. AR0 is never loaded here; presumably the caller
;        supplies it as the address of the returned struct - TODO confirm.
; NOTE: this header names the routine FL1, but the exported label is _FL.
;---------------
_FL pop r1 ; r1 = return address
ldi sp,ar1 ; ar1 = sp, used to index the argument words
LDFU *-AR1(1),R0 ; 32b float
LDIU *-AR1(0),R0 ; 40b precise (integer load of the other word into R0)
bud r1 ; delayed return: the three instructions below still execute
STF R0,*AR0 ; save hi bits
SUBF *AR0,R0 ; residual = R0 - hi just stored
STF R0,*+AR0(1) ; save lo bits
;-------------------
;LDBL MPY40( LDBL A,LDBL B) 12 cycles
;
; Multiply two hi/lo pairs. Forms hi*lo + lo*hi + hi*hi; the lo*lo term is
; never computed. The sum is split back into hi/lo through AR0.
;   In : two pointers on the stack, each to a {hi, lo} pair of floats.
;        AR2 <- top stack word, AR1 <- the word below it.
;   Out: *AR0 = hi, *+AR0(1) = lo. AR0 is not loaded here; presumably
;        set by the caller - TODO confirm.
;   Uses: R0, R2, r1, AR1, AR2
;-------------------
_MPY40 pop r1 ; r1 = return address
ldi sp,ar1 ; ar1 = sp, used to index the argument words
ldi *-AR1(0),AR2 ; load ptrs
ldi *-AR1(1),AR1 ; (ar1 is overwritten by the second pointer)
MPYF3 *+AR1(0),*+AR2(1),R2 ; hi*lo
MPYF3 *+AR1(1),*+AR2(0),R0 ; lo*hi
MPYF3 *+AR1(0),*+AR2(0),R0 ; hi*hi
|| ADDF3 R0,R2,R2 ; in parallel: R2 = hi*lo + lo*hi (uses R0 from before this pair)
ADDF3 R0,R2,R2 ; R2 += hi*hi
bud r1 ; delayed return: the three instructions below still execute
STF R2,*AR0 ; save hi bits
SUBF *AR0,R2 ; residual = R2 - hi just stored
STF R2,*+AR0(1) ; save lo bits
;-------------------
;LDBL ADD40(LDBL A,LDBL B); 11 cycles
;
; Add two hi/lo pairs: each operand is first recombined as hi+lo, the two
; sums are added, and the result is split back into hi/lo through AR0.
;   In : two pointers on the stack, each to a {hi, lo} pair of floats.
;        AR2 <- top stack word, AR1 <- the word below it.
;   Out: *AR0 = hi, *+AR0(1) = lo. AR0 is not loaded here; presumably
;        set by the caller - TODO confirm.
;   Uses: R0, R2, r1, AR1, AR2
;-------------------
_ADD40 pop r1 ; r1 = return address
ldi sp,ar1 ; ar1 = sp, used to index the argument words
ldi *-AR1(0),AR2 ; load ptrs
ldi *-AR1(1),AR1 ; (ar1 is overwritten by the second pointer)
addf *+AR1(0),*+AR1(1),R2 ; R2 = hi+lo of the AR1 operand (note used to read "lo+lo")
addf *+AR2(0),*+AR2(1),R0 ; R0 = hi+lo of the AR2 operand (note used to read "hi+hi")
addf3 R0,R2,R2 ; R2 = sum of both operands
bud r1 ; delayed return: the three instructions below still execute
STF R2,*AR0 ; save hi bits
SUBF *AR0,R2 ; residual = R2 - hi just stored
STF R2,*+AR0(1) ; save lo bits
;----------------------
;LDBL SUB40(LDBL A,LDBL B)
;
; Subtract two hi/lo pairs: each operand is first recombined as hi+lo,
; then (AR2 operand) - (AR1 operand) is split back into hi/lo through AR0.
;   In : two pointers on the stack, each to a {hi, lo} pair of floats.
;        AR2 <- top stack word, AR1 <- the word below it. For the result
;        to be A-B the top word must be A - presumably so, TODO confirm
;        against the compiler's argument order.
;   Out: *AR0 = hi, *+AR0(1) = lo. AR0 is not loaded here; presumably
;        set by the caller - TODO confirm.
;   Uses: R0, R2, r1, AR1, AR2
;-----------------------
_SUB40 pop r1 ; r1 = return address
ldi sp,ar1 ; ar1 = sp, used to index the argument words
ldi *-AR1(0),AR2 ; load ptrs
ldi *-AR1(1),AR1 ; (ar1 is overwritten by the second pointer)
addf *+AR1(0),*+AR1(1),R2 ; R2 = hi+lo of the AR1 operand (note used to read "lo+lo")
addf *+AR2(0),*+AR2(1),R0 ; R0 = hi+lo of the AR2 operand (note used to read "hi+hi")
subf3 R2,R0,R2 ; R2 = R0 - R2
bud r1 ; delayed return: the three instructions below still execute
STF R2,*AR0 ; save hi bits
SUBF *AR0,R2 ; residual = R2 - hi just stored
STF R2,*+AR0(1) ; save lo bits
;----------------------
; LDBL INV40(LDBL V)
;
; Reciprocal of a hi/lo pair.
;   1. Recombine V = hi + lo in R0.
;   2. Build a rough seed for 1/V in R1 by manipulating the bit pattern.
;   3. Three steps of  R1 = R1 * (2 - R0*R1); the 6b/12b/24b notes are the
;      author's estimate of the precision after each step.
;   4. One more step using both words of V and of (2 - V*R1), with the
;      destination words serving as scratch for the hi/lo split.
;   In : one pointer on the stack to a {hi, lo} pair of floats.
;   Out: *AR0 = hi, *+AR0(1) = lo. AR0 is not loaded here; presumably
;        set by the caller - TODO confirm.
;   Uses: R0, R1, R2, R3 (return address), AR1
;------------------------
_INV40 pop R3 ; R3 = return address
ldi sp,ar1 ; ar1 = sp, used to index the argument word
ldi *-AR1(0),AR1 ; load ptr
ldf *+AR1(0),R0 ; hi
addf *+AR1(1),R0 ; lo (full prec)
ldf R0,R1 ; R1 = working copy for the seed
lsh 1,R1 ; fast inverse
pushf R1 ; float -> stack ...
pop R1 ; ... -> integer: R1 now holds the raw bit pattern
not R1,R1 ; 1's comp inverse
push R1 ; integer -> stack ...
popf R1 ; 3b (seed reinterpreted as a float)
lsh -1,R1 ; undo the earlier shift by 1 (purpose of the shift pair: TODO confirm)
mpyf R1,R0,R2 ; 6b   R2 = R0*R1
subrf 2.0,R2 ; R2 = 2.0 - R2
mpyf R2,R1 ; R1 = R1*R2
mpyf R1,R0,R2 ; 12b  R2 = R0*R1
subrf 2.0,R2 ; R2 = 2.0 - R2
mpyf R2,R1 ; R1 = R1*R2
mpyf R1,R0,R2 ; 24b  R2 = R0*R1
subrf 2.0,R2 ; R2 = 2.0 - R2
mpyf R2,R1 ; R1=1/R0
;-------
; R1 is 1/R0 precise to 24b, so only need to
; do partial full precision mpyf
;-------
mpyf3 *+AR1(0),R1,R2 ; hi*hi  R2 = V.hi * R1
mpyf3 *+AR1(1),R1,R0 ; hi*lo  R0 = V.lo * R1
addf3 R2,R0,R2 ; R2 = V * R1
subrf 2.0,R2 ; R2 = 2.0 - V*R1
stf R2,*+AR0(0) ; Use destination
subf *+AR0(0),R2 ; for temp store
stf R2,*+AR0(1) ; destination now holds (2 - V*R1) as hi/lo
mpyf3 *+AR0(0),R1,R2 ; hi*hi  R2 = hi(2 - V*R1) * R1
mpyf3 *+AR0(1),R1,R0 ; hi*lo  R0 = lo(2 - V*R1) * R1
addf3 R2,R0,R2 ; R2 = R1 * (2 - V*R1)
;-------------------------- Exit splits
bud R3 ; value (delayed return: the three instructions below still execute)
STF R2,*AR0 ; save hi bits
SUBF *AR0,R2 ; residual = R2 - hi just stored
STF R2,*+AR0(1) ; save lo bits
;----------------------
;LDBL SQRT40(LDBL V)
;
; Square root of a hi/lo pair, computed as V * (1/sqrt(V)).
;   1. Recombine V = hi + lo in R0 and build a rough seed for 1/sqrt(V)
;      in R1 by manipulating the bit pattern.
;   2. R0 = V/2, then three steps of  R1 = R1 * (1.5 - R0*R1*R1).
;   3. One further step in which R1*R1 is split into hi/lo and multiplied
;      by both words of V, giving 1/sqrt(V) in R2.
;   4. Split that estimate into hi/lo and multiply by both words of V.
;   In : one pointer on the stack to a {hi, lo} pair of floats.
;   Out: *AR0 = hi, *+AR0(1) = lo. AR0 is not loaded here; presumably
;        set by the caller - TODO confirm.
;   Uses: R0, R1, R2, R3, AR1, AR2 (return address)
;----------------------
_SQRT40 pop AR2 ; AR2 = return address
ldi sp,ar1 ; ar1 = sp, used to index the argument word
ldi *-AR1(0),AR1 ; load ptr
ldf *+AR1(0),R0 ; hi
addf *+AR1(1),R0 ; lo (full prec)
;- - - - - - - - - - - - - ; seed for 1/sqrt(V)
ldf R0,R1 ; R1 = working copy for the seed
lsh 1,R1 ; fast sqrt
pushf R1 ; using log2(exp)
pop R1 ; R1 now holds the raw bit pattern as an integer
ash -1,R1 ; arithmetic shift right by 1 of the bit pattern
not R1,R1 ; 1's comp inverse
push R1 ; integer -> stack ...
popf R1 ; 3b (seed reinterpreted as a float)
lsh -1,R1 ; undo the earlier lsh 1 (purpose of the shift pair: TODO confirm)
mpyf 0.5,R0 ; R0 = V/2, reused by the three steps below
;- - - - - - - - - - - - - ; step 1
mpyf R1,R1,R2 ; 6b   R2 = R1*R1
mpyf R0,R2 ; R2 = (V/2)*R1*R1
subrf 1.5,R2 ; R2 = 1.5 - R2
mpyf R2,R1 ; R1 = R1*R2
;- - - - - - - - - - - - - ; step 2
mpyf R1,R1,R2 ; 12b  R2 = R1*R1
mpyf R0,R2 ; R2 = (V/2)*R1*R1
subrf 1.5,R2 ; R2 = 1.5 - R2
mpyf R2,R1 ; R1 = R1*R2
;- - - - - - - - - - - - - ; step 3
mpyf R1,R1,R2 ; 22-24b  R2 = R1*R1
mpyf R0,R2 ; R2 = (V/2)*R1*R1
subrf 1.5,R2 ; R2 = 1.5 - R2
mpyf R2,R1 ; R1 = R1*R2
;-------- final step, using both words of V
mpyf R1,R1,R2 ; 24x24->32b
;--------
pushf R2 ; convert to
popf R0 ; LDBL(R0,R2)
subf R0,R2 ; R2 = R2 - R0: R0 = hi part of R1*R1, R2 = lo part
mpyf3 R2,*+AR1(0),R2 ; lo*hi  R2 = lo(R1*R1) * V.hi
mpyf3 R0,*+AR1(1),R3 ; hi*lo  R3 = hi(R1*R1) * V.lo
mpyf3 R0,*+AR1(0),R0 ; hi*hi  R0 = hi(R1*R1) * V.hi
|| addf3 R3,R2,R2 ; in parallel: R2 = sum of the two cross terms
addf R0,R2 ; R2 = V * R1*R1
;--------
mpyf 0.5,R2 ; R2 = V*R1*R1/2 (note used to read "R0=IN/2")
subrf 1.5,R2 ; R2 = 1.5 - V*R1*R1/2
;--------
pushf R2 ; convert to
popf R0 ; LDBL(R0,R2)
subf R0,R2 ; R0 = hi part, R2 = lo part of the correction factor
mpyf R1,R2 ; R2 = lo * R1 (note used to read "hi*hi")
mpyf R1,R0 ; R0 = hi * R1 (note used to read "hi*lo")
addf R0,R2 ; sqrt(1/X) @40b
;--------
; exit here for 1/sqrt(x)
; continue for x/sqrt(x)
;--------
pushf R2 ; final 40bx40b
popf R0 ; R0 = hi part of 1/sqrt(V)
subf R0,R2 ; R2 = lo part of 1/sqrt(V)
mpyf3 R2 ,*+AR1(0),R2 ; lo*hi 40b mpy
mpyf3 R0 ,*+AR1(1),R3 ; hi*lo
mpyf3 R0 ,*+AR1(0),R0 ; hi*hi
|| addf3 R3,R2,R2 ; in parallel: R2 = sum of the two cross terms
addf R0,R2 ; R2 = V * (1/sqrt(V))
;--------
bud AR2 ; delayed return: the three instructions below still execute
STF R2,*AR0 ; save hi bits
SUBF *AR0,R2 ; residual = R2 - hi just stored
STF R2,*+AR0(1) ; save lo bits
;=========================================================
;====== END ASSEMBLY FUNCTIONS ======================
;=========================================================

.if 0
/*==================================================
Approximate C equivalent functions
==================================================*/
/* Split a long double into the hi/lo pair: hi is the value narrowed to
   float, lo is what the narrowing discarded (also stored as a float). */
LDBL FL1(ldouble in)
{
    LDBL split;
    split.hi = in;              // upper part, narrowed to float
    split.lo = in - split.hi;   // what the narrowing lost
    return split;
}

/* Product of two hi/lo pairs: hi*hi + hi*lo + lo*hi (lo*lo is dropped,
   as in the assembly routine), split back into hi/lo by FL().

   The accumulator is an ldouble, matching ADD40/SUB40. The earlier sketch
   declared it float, so the value handed to FL() had already been
   narrowed and the returned lo word was always zero. */
LDBL MPY40( LDBL A,LDBL B)
{
    ldouble cross;
    ldouble acc;
    cross  = (ldouble)A.hi * B.lo;      // hi*lo
    cross += (ldouble)A.lo * B.hi;      // lo*hi
    acc    = (ldouble)A.hi * B.hi;      // hi*hi
    acc   += cross;                     // small terms are summed first
    return FL(acc);
}

/* Sum of two hi/lo pairs: add all four words in an ldouble, then split
   the total into hi (narrowed to float) and lo (the remainder).

   The signature line used to end in ';', which made it a prototype
   followed by a stray block instead of a function definition. */
LDBL ADD40(LDBL A,LDBL B)
{
    LDBL C;
    ldouble i;
    float f1;
    i = A.hi + A.lo + B.hi + B.lo;
    f1 = i;             // hi: total narrowed to float
    C.hi = f1;
    f1 = i - f1;        // lo: what the narrowing lost
    C.lo = f1;
    return C;
}

/* Difference A - B of two hi/lo pairs: subtract word by word, accumulate
   in an ldouble, and let FL() split the result into hi/lo. */
LDBL SUB40(LDBL A,LDBL B)
{
    ldouble diff;
    diff  = (A.hi - B.hi);      // difference of the upper words
    diff += (A.lo - B.lo);      // plus difference of the residuals
    return FL(diff);
}

/* Reciprocal of a hi/lo pair: one Newton step  C = G*(2 - G*V)  starting
   from the float estimate G = 1/V.hi.

   2 - T is formed word by word: 2 - (hi + lo) = (2 - hi) + (-lo).
   The earlier sketch updated only T.hi, leaving T.lo with the wrong sign. */
LDBL INV40(LDBL V)
{
    LDBL C;
    LDBL G,T;
    G.hi = 1/(V.hi); G.lo = 0;  // float-precision seed
    T=MPY40(G,V);               // T = G*V, close to 1
    T.hi = 2.0 - T.hi;          // T = 2 - G*V ...
    T.lo = -T.lo;               // ... including the residual word
    C=MPY40(T,G);               // C = G*(2 - G*V)
    return C;
}

/* Square root of a hi/lo pair, computed as V * (1/sqrt(V)): one Newton
   step  C = C*(1.5 - 0.5*V*C*C)  from the float estimate C = 1/sqrt(V.hi),
   followed by a multiply by V.

   1.5 - T is formed word by word: 1.5 - (hi + lo) = (1.5 - hi) + (-lo).
   The earlier sketch updated only T.hi, leaving T.lo with the wrong sign.
   The unused locals G and K have been removed. */
LDBL SQRT40(LDBL V)
{
    LDBL C;
    float g;
    LDBL T;
    g = 1.0/sqrt(V.hi);         // float-precision seed for 1/sqrt(V)
    C.hi = g; C.lo=0;
    T=MPY40(C,C);               // T = C*C
    T=MPY40(T,V);               // T = V*C*C, close to 1
    T.hi *= 0.5;                // T = 0.5*V*C*C
    T.lo *= 0.5;
    T.hi = 1.5-T.hi;            // T = 1.5 - 0.5*V*C*C ...
    T.lo = -T.lo;               // ... including the residual word
    C=MPY40(T,C);               // refined 1/sqrt(V)
    C=MPY40(C,V);               // V/sqrt(V) = sqrt(V)
    return C;
}

.endif

<FILE> LDBL40.CMD

/* LDBL40.CMD - linker command file for the LDBL40 test program.        */
/* Links the C driver, the DSK I/O helpers and the assembly routines    */
/* against rts30.lib and places every output section in RAM2.           */
ldbl40.obj dskio.obj ldblasm.obj
-cr
-o ldbl40.out
-l rts30.lib
-heap 1024 /* large size is for stdio functions */
-stack 256 /* large size is for stdio functions */
/* -m ldbl40.map */ /* create map to see allocations */
/*-e _c_int00 do not need for ver 5.0 */

MEMORY
{
    BOOTRSRV  : org=0x809800, len=0x0002 /* Dont load here if bootloading */
    EXTLOW    : org=0x000000, len=0x4000 /* External RAM for EVM */
    RAM0      : org=0x809802, len=0x06fd /* INTERNAL BLK 0 */
  /*RAM1      : org=0x809C00, len=0x0300 *//* INTERNAL BLK 1 */
    KERNEL    : org=0x809F00, len=0x00C0 /* INTERNAL BLK 1 */
    /* The two branch-table ranges were both named BRNCHTBL, while       */
    /* SECTIONS refers to BRNCHTBL1 and BRNCHTBL2. Distinct names make   */
    /* those references resolve.                                         */
    BRNCHTBL1 : org=0x809FC5, len=0x0002 /* INTERNAL BLK 1 */
    BRNCHTBL2 : org=0x809FC9, len=0x0002 /* INTERNAL BLK 1 */
    /* RAM2 spans 0x800000-0x807FFF, which also covers the address range */
    /* of the commented-out RAM3a/RAM3b entries below.                   */
    RAM2      : org=0x800000, len=0x8000 /* INTERNAL BLK 2 */
  /*RAM3a     : org=0x804000, len=0x0200 *//* INTERNAL BLK 3 */
  /*RAM3b     : org=0x804200, len=0x3E00 *//* INTERNAL BLK 3 */
}

SECTIONS
{
    .text   : {} > RAM2
    .cinit  : {} > RAM2
    .data   : {} > RAM2
    .bss    : {} > RAM2
    .sysmem : {} > RAM2
    .cio    : {} > RAM2 /* needed for stdout functions */
    .const  : {} > RAM2
    .stack  : {} > RAM2
    BRTBL1  : {} > BRNCHTBL1
    BRTBL2  : {} > BRNCHTBL2
    VECTS   : {} > EXTLOW
}

<FILE> LDBL40.BAT

rem Build LDBL40.OUT: compile the two C files, assemble the math
rem routines, then link with LDBL40.CMD.
rem Remove a stale image first; the guard avoids a "file not found"
rem complaint on a clean tree.
if exist ldbl40.out erase ldbl40.out
cl30 dskio.c -o3 -g
cl30 ldbl40.c -o3 -mc -mm -mp -tp -ms -g
asm30 ldblasm.asm -g
lnk30 ldbl40.cmd

<FILE> LDBL40.MAK (if you have Borland Make utility)

# LDBL40.MAK - rebuild LDBL40.OUT with the same commands as LDBL40.BAT.
# Every target also depends on this makefile, so editing it forces a
# rebuild. Command lines are indented so make treats them as the rule's
# commands rather than as new rules.

# Final link; memory layout and link options come from LDBL40.CMD.
ldbl40.out: dskio.obj ldbl40.obj ldbl40.cmd ldbl40.mak ldblasm.obj
	lnk30 ldbl40.cmd

# DSK character I/O helpers.
dskio.obj: dskio.c ldbl40.mak
	cl30 dskio.c -o3 -g

# C test driver.
ldbl40.obj: ldbl40.c ldbl40.mak
	cl30 ldbl40.c -o3 -mc -mm -mp -tp -ms -g

# Hand-written 40b math routines.
ldblasm.obj: ldblasm.asm ldbl40.mak
	asm30 ldblasm.asm -g

+-----------+
|Keith Larson |
|Member Group Technical Staff |
|Texas Instruments Incorporated |
| |
| 281-274-3288 |
| |
| www.micro.ti.com/~klarson |
|-----------+
| TMS320C3x/C4x/VC33 Applications |
| |
| TMS320VC33 |
| The lowest cost and lowest power 500 w/Mflop |
| floating point DSP on the planet! |
+-----------+