Technical discussions about the TI C6000 DSPs (including the c62x, c64x and c67x DSPs).
Can I do a 2D FFT using the TI dsp library fft routines (DSP_fft16x16t)? I can't find any examples for implementing this function even for the 1D case let alone the 2D case. It would be great if someone has some code for this. Thanks Peter Bone
Peter- > Can I do a 2D FFT using the TI dsp library fft routines > (DSP_fft16x16t)? I can't find any examples for implementing this > function even for the 1D case let alone the 2D case. It would be great > if someone has some code for this. Maybe these guys have some C64x 2D FFT code available: http://adsabs.harvard.edu/abs/2002SPIE.4681..602M They make the valid point that careful, optimized handling of DMA is crucial. -Jeff
Peter- > > > Can I do a 2D FFT using the TI dsp library fft routines > > > (DSP_fft16x16t)? I can't find any examples for implementing this > > > function even for the 1D case let alone the 2D case. It would be great > > > if someone has some code for this. > > > > Maybe these guys have some C64x 2D FFT code available: > > > > http://adsabs.harvard.edu/abs/2002SPIE.4681..602M > > > > They make the valid point that careful, optimized handling of DMA is > crucial. > > > > -Jeff > > Thanks. Unfortunately I'm not subscribed to that publisher. Would the > DMA be used to transfer from external to internal memory? > My aim is 2D image cross correlation for tracking. Would it be easier > / faster to use repeated calls to the image processing library > IMG_corr_gen function for this instead of the 2D fft method, which now > seems to be a lot more complex than I'd anticipated? With C64x devices DMA can be useful for both int-ext and int-int mem transfers. On any DSP, 2D FFT is not going to be difficult -- DSP devices are built for FFTs from the ground up, and FFT-related code examples abound for these devices. My point in mentioning DMA is that it's really well-developed on DSP devices as opposed to x86, and optimized use of it can speed up algorithms that move around a lot of block data. -Jeff PS. Please post to the group, not to me.
> With C64x devices DMA can be useful for both int-ext and int-int mem
transfers.
>
> On any DSP, 2D FFT is not going to be difficult -- DSP devices are
built for FFTs
> from the ground up, and FFT-related code examples abound for these
devices.
>
> My point in mentioning DMA is that it's really well-developed on DSP
devices as
> opposed to x86, and optimized use of it can speed up algorithms that
move around a
> lot of block data.
>
> -Jeff
>
> PS. Please post to the group, not to me.
Thanks for that. Maybe it will help if I show the code I've written
already and people can comment on it and suggest ways I can get it to
work. What does the code do that I've put in #if 1? This is not in the
ti example for generating twiddle factors but is in an example I found
on this site. I'm doing fft's on sub-sections of a larger array - is
it ok to do this or should I copy each line into a separate array
(internal memory), fft it and copy the result into the fft image using
DMA?
I've not included populating the input array but in my implementation
the values range from -127 to 127.
Thanks.
#include <syshdr.h>
#include <math.h>
#include <dsp_fft16x16t.h>
#define PI 3.141592654f
int16 *twiddle;
int16 *inpu;
int16 *outpu
int16 n = 128U;
static int16
d2s(
DOUBLE d
) {
if (d >= 32767.0f) {
return 32767;
}
if (d <= -32768.0f) {
return (int16) -32768;
}
return (int16) d;
}
void
gen_twiddle(
int16 *w, /* Pointer to twiddle-factor array */
uint16 n, /* Size of FFT */
DOUBLE scale /* Scale factor to apply to values */
) {
uint16 i;
uint16 j;
uint16 k;
for ((j = 1U), (k = 0U); j < (n >> 2); j = (uint16) (j << 2)) {
for (i = 0U; i < (n >> 2); i += (uint16) (j << 1)) {
DOUBLE tmp_ij = (PI * ((DOUBLE) i + (DOUBLE) j)) / (DOUBLE) n;
DOUBLE tmp_i = (PI * (DOUBLE) i) / (DOUBLE) n;
w[k + 11U] = d2s(scale * cos(6.0f * tmp_ij));
w[k + 10U] = d2s(scale * sin(6.0f * tmp_ij));
w[k + 9U ] = d2s(scale * cos(6.0f * tmp_i ));
w[k + 8U ] = d2s(scale * sin(6.0f * tmp_i ));
w[k + 7U ] = d2s(scale * cos(4.0f * tmp_ij));
w[k + 6U ] = d2s(scale * sin(4.0f * tmp_ij));
w[k + 5U ] = d2s(scale * cos(4.0f * tmp_i ));
w[k + 4U ] = d2s(scale * sin(4.0f * tmp_i ));
w[k + 3U ] = d2s(scale * cos(2.0f * tmp_ij));
w[k + 2U ] = d2s(scale * sin(2.0f * tmp_ij));
w[k + 1U ] = d2s(scale * cos(2.0f * tmp_i ));
w[k + 0U ] = d2s(scale * sin(2.0f * tmp_i ));
k += 12U;
}
}
#if 1
n *= 2U;
w[n - 1U] = 32767;
w[n - 3U] = 32767;
w[n - 5U] = 32767;
w[n - 2U] = 0;
w[n - 4U] = 0;
w[n - 6U] = 0;
#endif
}
void
fft2(
int16 *image_in,
int16 *image_out,
const int16 *twiddle,
uint16 dim
) {
uint16 x;
uint16 y;
uint16 dim2 = (uint16) (dim * 2U);
uint16 dim2m1 = dim2 - 1U;
/*
** FFT the rows
*/
uint32 row_start = 0U;
for (y = 0U; y < dim; ++y) {
DSP_fft16x16t(twiddle, (int32) dim, &image_in[row_start],
&image_out[row_start]);
row_start += dim2;
}
/*
** Transpose to FFT the columns
*/
for (y = 2U; y < dim2; y += 2U) {
uint32 t = (uint32) y;
uint32 i = (uint32) y * dim;
for (x = 0U; x < y; x += 2U) {
/*
** Swap real parts
*/
int16 tmp = image_out[i];
image_out[i] = image_out[t];
image_out[t] = tmp;
/*
** Swap imaginary parts
*/
++i;
++t;
tmp = image_out[i];
image_out[i] = image_out[t];
image_out[t] = tmp;
t += dim2m1;
++i;
}
}
row_start = 0U;
for (y = 0U; y < dim; ++y) {
DSP_fft16x16t(twiddle, (int32) dim, &image_in[row_start],
&image_out[row_start]);
row_start += dim2;
}
}
int main( ) {
uint32 complex_image_size = 2U * sizeof(int16) * n * n;
twiddle = (int16*) MEM_alloc(EXTERNALHEAP, 2U * sizeof(int16) * n, 8U);
{
DOUBLE scale = 32767.0f;
gen_twiddle(twiddle, n, scale);
}
inpu = (int16*) MEM_alloc(EXTERNALHEAP, complex_image_size, 8U);
outpu = (int16*) MEM_alloc(EXTERNALHEAP, complex_image_size, 8U);
fft2(inpu, outpu, twiddle, n);
return(0);
}
Here's my current fft2 code and an image showing the original image
(values 0 to 255), the Fourier transform and the inverse transform of
the Fourier transform (should look like the original). I'm shifting
the result of the fft down, which gives more sensible results but
clearly still wrong.
http://img.photobucket.com/albums/v424/peterbone/FFT-IFFT2.png
void
fft2(
const int16 *image_in, /* 2D complex array to be transformed */
int16 *image_out, /* Transformed 2D complex array */
const int16 *twiddle, /* Precomputed twiddle factors for fft */
uint16 dim, /* Size of the square 2D arrays */
bool inverse /* Forward or inverse transform */
) {
uint16 x;
uint16 y;
uint16 dim2 = (uint16) (dim * 2U);
uint32 row_start = 0U;
_nassert((dim2 % 32U) == 0U);
/* FFT the rows */
for (y = 0U; y < dim; ++y) {
/* Copy image row to internal memory buffer
Use DMA instead of memcpy - double buffers */
/*lint -e{426} memcpy for four bytes or less. */
memcpy(Odt_fft_line_in, &image_in[row_start], (uint32) dim2 *
sizeof(int16));
/* Inverse fft can be calculated using forward fft by conjugating
before and after
Faster to do the conjugates in the internal line buffers */
if (inverse) {
for (x = 1U; x < dim2; x += 2U) {
Odt_fft_line_in[x] = -Odt_fft_line_in[x];
}
}
DSP_fft16x16t(twiddle, (int32) dim, Odt_fft_line_in,
Odt_fft_line_out);
/* Scale values down ? */
for (x = 0U; x < dim2; x += 2U) {
ARITH_RIGHT_SHIFTINPLACE(Odt_fft_line_out[x], 3);
ARITH_RIGHT_SHIFTINPLACE(Odt_fft_line_out[x+1U], 3);
}
/*lint -e{426} memcpy for four bytes or less. */
memcpy(&image_out[row_start], Odt_fft_line_out, (uint32) dim2 *
sizeof(int16));
row_start += dim2;
}
/* Transpose to FFT the columns - tested and works*/
transpose(image_out, dim);
row_start = 0U;
for (y = 0U; y < dim; ++y) {
/*lint -e{426} memcpy for four bytes or less. */
memcpy(Odt_fft_line_in, &image_out[row_start], (uint32) dim2 *
sizeof(int16));
DSP_fft16x16t(twiddle, (int32) dim, Odt_fft_line_in,
Odt_fft_line_out);
if (inverse) {
for (x = 1U; x < dim2; x += 2U) {
Odt_fft_line_out[x] = -Odt_fft_line_out[x];
}
}
for (x = 0U; x < dim2; x += 2U) {
ARITH_RIGHT_SHIFTINPLACE(Odt_fft_line_out[x], 3);
ARITH_RIGHT_SHIFTINPLACE(Odt_fft_line_out[x+1U], 3);
}
/*lint -e{426} memcpy for four bytes or less. */
memcpy(&image_out[row_start], Odt_fft_line_out, (uint32) dim2 *
sizeof(int16));
row_start += dim2;
}
}
Peter-
> Here's my current fft2 code and an image showing the original image
> (values 0 to 255), the Fourier transform and the inverse transform of
> the Fourier transform (should look like the original). I'm shifting
> the result of the fft down, which gives more sensible results but
> clearly still wrong.
>
> http://img.photobucket.com/albums/v424/peterbone/FFT-IFFT2.png
Have you tried a very basic test such as doing FFT on just one row, then iFFT and
compare? I know that result is not meaningful, but for debug purposes.
Also your comment below mentions "Inverse fft can be calculated using forward fft by
conjugating before and after". Can you clarify? iFFT is normally calculated by
conjugating twiddle values, not results (specifically, negating sine table lookup
values). iFFT doesn't happen by conjugating input or output values, if that's what
you mean.
-Jeff
> void
> fft2(
> const int16 *image_in, /* 2D complex array to be transformed */
> int16 *image_out, /* Transformed 2D complex array */
> const int16 *twiddle, /* Precomputed twiddle factors for fft */
> uint16 dim, /* Size of the square 2D arrays */
> bool inverse /* Forward or inverse transform */
> ) {
> uint16 x;
> uint16 y;
> uint16 dim2 = (uint16) (dim * 2U);
> uint32 row_start = 0U;
>
> _nassert((dim2 % 32U) == 0U);
>
> /* FFT the rows */
> for (y = 0U; y < dim; ++y) {
> /* Copy image row to internal memory buffer
> Use DMA instead of memcpy - double buffers */
> /*lint -e{426} memcpy for four bytes or less. */
> memcpy(Odt_fft_line_in, &image_in[row_start], (uint32) dim2 *
> sizeof(int16));
>
> /* Inverse fft can be calculated using forward fft by conjugating
> before and after
> Faster to do the conjugates in the internal line buffers */
> if (inverse) {
> for (x = 1U; x < dim2; x += 2U) {
> Odt_fft_line_in[x] = -Odt_fft_line_in[x];
> }
> }
>
> DSP_fft16x16t(twiddle, (int32) dim, Odt_fft_line_in,
> Odt_fft_line_out);
>
> /* Scale values down ? */
> for (x = 0U; x < dim2; x += 2U) {
> ARITH_RIGHT_SHIFTINPLACE(Odt_fft_line_out[x], 3);
> ARITH_RIGHT_SHIFTINPLACE(Odt_fft_line_out[x+1U], 3);
> }
>
> /*lint -e{426} memcpy for four bytes or less. */
> memcpy(&image_out[row_start], Odt_fft_line_out, (uint32) dim2 *
> sizeof(int16));
> row_start += dim2;
> }
>
> /* Transpose to FFT the columns - tested and works*/
> transpose(image_out, dim);
>
> row_start = 0U;
> for (y = 0U; y < dim; ++y) {
> /*lint -e{426} memcpy for four bytes or less. */
> memcpy(Odt_fft_line_in, &image_out[row_start], (uint32) dim2 *
> sizeof(int16));
> DSP_fft16x16t(twiddle, (int32) dim, Odt_fft_line_in,
> Odt_fft_line_out);
>
> if (inverse) {
> for (x = 1U; x < dim2; x += 2U) {
> Odt_fft_line_out[x] = -Odt_fft_line_out[x];
> }
> }
>
> for (x = 0U; x < dim2; x += 2U) {
> ARITH_RIGHT_SHIFTINPLACE(Odt_fft_line_out[x], 3);
> ARITH_RIGHT_SHIFTINPLACE(Odt_fft_line_out[x+1U], 3);
> }
>
> /*lint -e{426} memcpy for four bytes or less. */
> memcpy(&image_out[row_start], Odt_fft_line_out, (uint32) dim2 *
> sizeof(int16));
> row_start += dim2;
> }
> }
--- In c...@yahoogroups.com, Jeff Brower <jbrower@...> wrote: > > Peter- > > > Here's my current fft2 code and an image showing the original image > > (values 0 to 255), the Fourier transform and the inverse transform of > > the Fourier transform (should look like the original). I'm shifting > > the result of the fft down, which gives more sensible results but > > clearly still wrong. > > > > http://img.photobucket.com/albums/v424/peterbone/FFT-IFFT2.png > > Have you tried a very basic test such as doing FFT on just one row, then iFFT and > compare? I know that result is not meaningful, but for debug purposes. > > Also your comment below mentions "Inverse fft can be calculated using forward fft by > conjugating before and after". Can you clarify? iFFT is normally calculated by > conjugating twiddle values, not results (specifically, negating sine table lookup > values). iFFT doesn't happen by conjugating input or output values, if that's what > you mean. > > -Jeff I appreciate your help. I'm new to dsp programming and I'm finding the TI documentation very limited. I have now got the 2d fft code to work. It seemed that the main problem was that I was trying to do a 128x128 fft and the fft dsplib function I'm using is radix 4 and therefore requires the dimension to be a power of 4. 64x64 now works but I'm not sure why 128 doesn't work because the documentation says that the function performs radix 4 followed by radix 2 if required. I'm confused by the scaling of the fft output. for a 64 point fft the result should be scaled up by 6 bits (1 bit per stage), however, I only get the correct result when I scale down by 4 bits after the fft. I also have to scale my input data to between -63 and +63 in order to prevent overflow but I should be able to afford 10 bit precision using a 64 point 16 bit fft. Also, I'm conscious of the fact that I may need to do bit reversal, but the documentation doesn't mention this at all. The documentation says that an ifft can be done by conjugating the twiddle factors but that this can cause problems due to hard coded additions and subtractions. It then suggests that you should conjugate the input data before the fft and then conjugate the output data after - which seems to work but I don't understand the theory behind it. I think that ultimately I'll do the ifft using a normal forward fft function because it just seems to flip the resulting image which doesn't cause a problem for me. Here's the document that mentions it: http://focus.ti.com/lit/an/spra884a/spra884a.pdf Some more questions - there are matrix multiplication and transpose functions in the dsplib. Is there a way to use them with complex data? It seems strange that they provide them without supporting the complex format when it would obviously be very useful in conjunction with the fft functions (cross-correlation, ect). Thanks Peter
Peter- > I appreciate your help. I'm new to dsp programming and I'm finding the > TI documentation very limited. > I have now got the 2d fft code to work. It seemed that the main > problem was that I was trying to do a 128x128 fft and the fft dsplib > function I'm using is radix 4 and therefore requires the dimension to > be a power of 4. 64x64 now works but I'm not sure why 128 doesn't work > because the documentation says that the function performs radix 4 > followed by radix 2 if required. > I'm confused by the scaling of the fft output. for a 64 point fft the > result should be scaled up by 6 bits (1 bit per stage), however, I > only get the correct result when I scale down by 4 bits after the fft. > I also have to scale my input data to between -63 and +63 in order to > prevent overflow but I should be able to afford 10 bit precision using > a 64 point 16 bit fft. Also, I'm conscious of the fact that I may need > to do bit reversal, but the documentation doesn't mention this at all. > The documentation says that an ifft can be done by conjugating the > twiddle factors but that this can cause problems due to hard coded > additions and subtractions. It then suggests that you should conjugate > the input data before the fft and then conjugate the output data after > - which seems to work but I don't understand the theory behind it. I > think that ultimately I'll do the ifft using a normal forward fft > function because it just seems to flip the resulting image which > doesn't cause a problem for me. Here's the document that mentions it: > http://focus.ti.com/lit/an/spra884a/spra884a.pdf > Some more questions - there are matrix multiplication and transpose > functions in the dsplib. Is there a way to use them with complex data? > It seems strange that they provide them without supporting the complex > format when it would obviously be very useful in conjunction with the > fft functions (cross-correlation, ect). A decent radix 4 based FFT function shouldn't require FFT size to be power of 4. It should combine radix 2 and radix 4 code as needed to get the correct result. Unless the function parameter definition actually says something about being limited to power of 4... Yes bit reversal is required somewhere, either before the FFT or after (depending on type of algorithm, decimation in time of freq). To get iFFT using forward FFT algorithm by conjugating data before and after is Ok. I understand now why the TI function is asking you do that, I didn't now about the hard-coded twiddle values before. For your scaling issue, how do you know what is the correct result? For example, if you have perfect sine wave in a 1-D FFT with amplitude +/-1 and with frequency an integer division of the FFT size (i.e. so an integral number of periods fit into N), with no windowing, then your result should show linear magnitude amplitude of N at a single bin in the freq domain (N = FFT size). Have you checked that with your code? As for complex matrix operations in dsplib, I don't know. I would think you could take the dsplib source and add some functions to create a my_dsplib to support what you need. -Jeff
> A decent radix 4 based FFT function shouldn't require FFT size to be
power of 4. It
> should combine radix 2 and radix 4 code as needed to get the correct
result. Unless
> the function parameter definition actually says something about
being limited to
> power of 4...
>
> Yes bit reversal is required somewhere, either before the FFT or
after (depending on
> type of algorithm, decimation in time of freq). To get iFFT using
forward FFT
> algorithm by conjugating data before and after is Ok. I understand
now why the TI
> function is asking you do that, I didn't now about the hard-coded
twiddle values
> before.
>
> For your scaling issue, how do you know what is the correct result?
For example, if
> you have perfect sine wave in a 1-D FFT with amplitude +/-1 and with
frequency an
> integer division of the FFT size (i.e. so an integral number of
periods fit into N),
> with no windowing, then your result should show linear magnitude
amplitude of N at a
> single bin in the freq domain (N = FFT size). Have you checked that
with your code?
>
> As for complex matrix operations in dsplib, I don't know. I would
think you could
> take the dsplib source and add some functions to create a my_dsplib
to support what
> you need.
>
> -Jeff
Yes, the radix 4 function should be able to do powers of 2 as well as
powers of 4. Maybe I just got the scaling wrong when I tried the 128
point fft.
I've come to the conclusion that bit reversal isn't required since I'm
not doing it and I'm now getting what seem like correct results. The
documentation for the fft function says that it does 'digit reversal'
in the function itself (this is why the fft can't be done in-place).
My test for finding if my 2D fft function is working correctly is to
fft an image array and then inverse fft it. If the result looks the
same as the original image then I can assume that everything is
working correctly and I've got the scaling correct. I only get the
correct result when I scale the result of each 1D fft down by 3 bits
and keep my input data within the range of -62 to +62. I'm still
confused about this.
My transpose and complex multiplication functions are working ok and
my whole cross-correlation operation is well within the time
restrictions so I won't bother with any further optimization for this
at the moment.
I've included my latest 2D FFT function that now works for 64x64 point
arrays scaled between -62 and +62.
Thanks
Peter
static void
odt_fft2(
const int16 *image_in, /* 2D complex array to be transformed */
int16 *image_out, /* Transformed 2D complex array */
const int16 *twiddle, /* Precomputed twiddle factors for fft */
int16 *line_in, /* Internal fft line buffer in */
int16 *line_out, /* Internal fft line buffer out */
uint16 dim, /* Size of the square 2D arrays */
bool inverse /* Forward or Inverse Transform */
) {
uint16 x;
uint16 y;
uint16 dim2 = (uint16) (dim << 1);
uint32 row_start = 0U;
_nassert((dim2 % 32U) == 0U);
/*
** FFT the rows
*/
for (y = 0U; y < dim; ++y) {
/*
** Copy image row to internal memory buffer
** Use DMA instead - double buffers?
*/
DSP_blk_move(&image_in[row_start], line_in, (int) dim2);
#if 0
/*
** Inverse fft can be calculated using forward fft by conjugating
before and after
** or by using conjugated twiddle factors
** Faster to do the conjugates in the internal line buffers
** Ommiting this just flips the result so it's not needed
*/
if (inverse) {
for (x = 1U; x < dim2; x += 2U) {
line_in[x] = -line_in[x];
}
}
#endif
DSP_fft16x16t(twiddle, (int32) dim, line_in, line_out);
/*
** FFT scales up by 1 bit per stage
** Scale back down to prevent overflow in column FFT's
*/
for (x = 0U; x < dim2; ++x) {
ARITH_RIGHT_SHIFTINPLACE(line_out[x], 3);
}
/*
** Copy back to external image
*/
DSP_blk_move(line_out, &image_out[row_start], (int) dim2);
row_start += dim2;
}
/*
** FFT the columns
*/
for (y = 0U; y < dim2; y += 2U) {
uint16 col;
/*
** Copy columns into internal buffers - non contiguous
** This is quicker than transposing the image to move blocks of memory
** Speed up with LDDW?
*/
for ((x = 0U), (col = y); x < dim2; (x += 2U), (col += dim2)) {
line_in[x ] = image_out[col ];
line_in[x+1U] = image_out[col+1U];
}
DSP_fft16x16t(twiddle, (int32) dim, line_in, line_out);
for (x = 0U; x < dim2; ++x) {
ARITH_RIGHT_SHIFTINPLACE(line_out[x], 3);
}
if (inverse) {
#if 0
for (x = 1U; x < dim2; x += 2U) {
line_out[x] = -line_out[x];
}
#endif
}
for ((x = 0U), (col = y); x < dim2; (x += 2U), (col += dim2)) {
image_out[col ] = line_out[x ];
image_out[col+1U] = line_out[x+1U];
}
}
}