2 ** FFT and FHT routines
3 ** Copyright 1988, 1993; Ron Mayer
6 ** Does a hartley transform of "n" points in the array "fz".
8 ** NOTE: This routine uses at least 2 patented algorithms, and may be
9 ** under the restrictions of a bunch of different organizations.
10 ** Although I wrote it completely myself; it is kind of a derivative
11 ** of a routine I once authored and released under the GPL, so it
12 ** may fall under the free software foundation's restrictions;
13 ** it was worked on as a Stanford Univ project, so they claim
14 ** some rights to it; it was further optimized at work here, so
15 ** I think this company claims parts of it. The patents are
16 ** held by R. Bracewell (the FHT algorithm) and O. Buneman (the
17 ** trig generator), both at Stanford Univ.
18 ** If it were up to me, I'd say go do whatever you want with it;
19 ** but it would be polite to give credit to the following people
20 ** if you use this anywhere:
21 ** Euler - probable inventor of the fourier transform.
22 ** Gauss - probable inventor of the FFT.
23 ** Hartley - probable inventor of the hartley transform.
24 ** Buneman - for a really cool trig generator
25 ** Mayer(me) - for authoring this particular version and
26 ** including all the optimizations in one package.
28 ** Ron Mayer; mayer@acuson.com
29 ** and added some optimization by
30 ** Mather - idea of using lookup table
31 ** Takehiro - some dirty hack for speed up
34 /* $Id: fft.c,v 1.2 2006/02/09 16:56:23 kramm Exp $ */
37 #include "config_static.h"
50 #define TRI_SIZE (5-1) /* 1024 = 4**5 */
52 static const FLOAT costab
[TRI_SIZE
*2] = {
53 9.238795325112867e-01, 3.826834323650898e-01,
54 9.951847266721969e-01, 9.801714032956060e-02,
55 9.996988186962042e-01, 2.454122852291229e-02,
56 9.999811752826011e-01, 6.135884649154475e-03
59 static void fht(FLOAT
*fz
, int n
)
61 const FLOAT
*tri
= costab
;
65 n
<<= 1; /* to get BLKSIZE, because of 3DNow! ASM routine */
70 int i
, k1
, k2
, k3
, kx
;
101 for (i
= 1; i
< kx
; i
++) {
108 FLOAT a
,b
,g0
,f0
,f1
,g1
,f2
,g2
,f3
,g3
;
109 b
= s2
*fi
[k1
] - c2
*gi
[k1
];
110 a
= c2
*fi
[k1
] + s2
*gi
[k1
];
115 b
= s2
*fi
[k3
] - c2
*gi
[k3
];
116 a
= c2
*fi
[k3
] + s2
*gi
[k3
];
137 c1
= c2
* tri
[0] - s1
* tri
[1];
138 s1
= c2
* tri
[1] + s1
* tri
[0];
144 static const unsigned char rv_tbl
[] = {
145 0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0,
146 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0,
147 0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8,
148 0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8,
149 0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4,
150 0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4,
151 0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec,
152 0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc,
153 0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2,
154 0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2,
155 0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea,
156 0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa,
157 0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6,
158 0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6,
159 0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee,
160 0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe
163 #define ch01(index) (buffer[chn][index])
165 #define ml00(f) (window[i ] * f(i))
166 #define ml10(f) (window[i + 0x200] * f(i + 0x200))
167 #define ml20(f) (window[i + 0x100] * f(i + 0x100))
168 #define ml30(f) (window[i + 0x300] * f(i + 0x300))
170 #define ml01(f) (window[i + 0x001] * f(i + 0x001))
171 #define ml11(f) (window[i + 0x201] * f(i + 0x201))
172 #define ml21(f) (window[i + 0x101] * f(i + 0x101))
173 #define ml31(f) (window[i + 0x301] * f(i + 0x301))
175 #define ms00(f) (window_s[i ] * f(i + k))
176 #define ms10(f) (window_s[0x7f - i] * f(i + k + 0x80))
177 #define ms20(f) (window_s[i + 0x40] * f(i + k + 0x40))
178 #define ms30(f) (window_s[0x3f - i] * f(i + k + 0xc0))
180 #define ms01(f) (window_s[i + 0x01] * f(i + k + 0x01))
181 #define ms11(f) (window_s[0x7e - i] * f(i + k + 0x81))
182 #define ms21(f) (window_s[i + 0x41] * f(i + k + 0x41))
183 #define ms31(f) (window_s[0x3e - i] * f(i + k + 0xc1))
186 void fft_short(lame_internal_flags
* const gfc
,
187 FLOAT x_real
[3][BLKSIZE_s
], int chn
, const sample_t
*buffer
[2])
189 const FLOAT
* window_s
= (const FLOAT
*)&gfc
->window_s
[0];
194 for (b
= 0; b
< 3; b
++) {
195 FLOAT
*x
= &x_real
[b
][BLKSIZE_s
/ 2];
196 short k
= (576 / 3) * (b
+ 1);
197 j
= BLKSIZE_s
/ 8 - 1;
199 FLOAT f0
,f1
,f2
,f3
, w
;
203 f0
= ms00(ch01
); w
= ms10(ch01
); f1
= f0
- w
; f0
= f0
+ w
;
204 f2
= ms20(ch01
); w
= ms30(ch01
); f3
= f2
- w
; f2
= f2
+ w
;
212 f0
= ms01(ch01
); w
= ms11(ch01
); f1
= f0
- w
; f0
= f0
+ w
;
213 f2
= ms21(ch01
); w
= ms31(ch01
); f3
= f2
- w
; f2
= f2
+ w
;
215 x
[BLKSIZE_s
/ 2 + 0] = f0
+ f2
;
216 x
[BLKSIZE_s
/ 2 + 2] = f0
- f2
;
217 x
[BLKSIZE_s
/ 2 + 1] = f1
+ f3
;
218 x
[BLKSIZE_s
/ 2 + 3] = f1
- f3
;
221 gfc
->fft_fht(x
, BLKSIZE_s
/2);
222 /* BLKSIZE_s/2 because of 3DNow! ASM routine */
226 void fft_long(lame_internal_flags
* const gfc
,
227 FLOAT x
[BLKSIZE
], int chn
, const sample_t
*buffer
[2] )
229 const FLOAT
* window
= (const FLOAT
*)&gfc
->window
[0];
231 int jj
= BLKSIZE
/ 8 - 1;
235 FLOAT f0
,f1
,f2
,f3
, w
;
238 f0
= ml00(ch01
); w
= ml10(ch01
); f1
= f0
- w
; f0
= f0
+ w
;
239 f2
= ml20(ch01
); w
= ml30(ch01
); f3
= f2
- w
; f2
= f2
+ w
;
247 f0
= ml01(ch01
); w
= ml11(ch01
); f1
= f0
- w
; f0
= f0
+ w
;
248 f2
= ml21(ch01
); w
= ml31(ch01
); f3
= f2
- w
; f2
= f2
+ w
;
250 x
[BLKSIZE
/ 2 + 0] = f0
+ f2
;
251 x
[BLKSIZE
/ 2 + 2] = f0
- f2
;
252 x
[BLKSIZE
/ 2 + 1] = f1
+ f3
;
253 x
[BLKSIZE
/ 2 + 3] = f1
- f3
;
256 gfc
->fft_fht(x
, BLKSIZE
/2);
257 /* BLKSIZE/2 because of 3DNow! ASM routine */
261 void init_fft(lame_internal_flags
* const gfc
)
263 FLOAT
*window
= &gfc
->window
[0];
264 FLOAT
*window_s
= &gfc
->window_s
[0];
268 if (gfc
->nsPsy
.use
) {
269 for (i
= 0; i
< BLKSIZE
; i
++)
270 /* blackman window */
271 window
[i
] = 0.42-0.5*cos(2*PI
*i
/(BLKSIZE
-1))+0.08*cos(4*PI
*i
/(BLKSIZE
-1));
274 * calculate HANN window coefficients
276 for (i
= 0; i
< BLKSIZE
; i
++)
277 window
[i
] = 0.5 * (1.0 - cos(2.0 * PI
* (i
+ 0.5) / BLKSIZE
));
281 // The type of window used here will make no real difference, but
282 // in the interest of merging nspsytune stuff - switch to blackman window
283 for (i
= 0; i
< BLKSIZE
; i
++)
284 /* blackman window */
285 window
[i
] = 0.42-0.5*cos(2*PI
*(i
+.5)/BLKSIZE
)+
286 0.08*cos(4*PI
*(i
+.5)/BLKSIZE
);
288 for (i
= 0; i
< BLKSIZE_s
/2 ; i
++)
289 window_s
[i
] = 0.5 * (1.0 - cos(2.0 * PI
* (i
+ 0.5) / BLKSIZE_s
));
292 if (gfc
->CPU_features
.AMD_3DNow
) {
293 extern void fht_3DN(FLOAT
*fz
, int n
);
294 gfc
->fft_fht
= fht_3DN
;
298 if (gfc
->CPU_features
.SIMD
) {
299 extern void fht_SSE(FLOAT
*fz
, int n
);
300 gfc
->fft_fht
= fht_SSE
;
304 if (gfc
->CPU_features
.i387
) {
305 extern void fht_FPU(FLOAT
*fz
, int n
);
306 gfc
->fft_fht
= fht_FPU
;