before merging master
[inav.git] / lib / main / CMSIS / DSP / Source / TransformFunctions / arm_cfft_radix4_q31.c
blob95292e46047c50810d9856828bf18b9a4740cde1
1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_cfft_radix4_q31.c
4 * Description: This file has function definition of Radix-4 FFT & IFFT function and
5 * In-place bit reversal using bit reversal table
7 * $Date: 27. January 2017
8 * $Revision: V.1.5.1
10 * Target Processor: Cortex-M cores
11 * -------------------------------------------------------------------- */
13 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
15 * SPDX-License-Identifier: Apache-2.0
17 * Licensed under the Apache License, Version 2.0 (the License); you may
18 * not use this file except in compliance with the License.
19 * You may obtain a copy of the License at
21 * www.apache.org/licenses/LICENSE-2.0
23 * Unless required by applicable law or agreed to in writing, software
24 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
25 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
26 * See the License for the specific language governing permissions and
27 * limitations under the License.
30 #include "arm_math.h"
32 void arm_radix4_butterfly_inverse_q31(
33 q31_t * pSrc,
34 uint32_t fftLen,
35 q31_t * pCoef,
36 uint32_t twidCoefModifier);
38 void arm_radix4_butterfly_q31(
39 q31_t * pSrc,
40 uint32_t fftLen,
41 q31_t * pCoef,
42 uint32_t twidCoefModifier);
44 void arm_bitreversal_q31(
45 q31_t * pSrc,
46 uint32_t fftLen,
47 uint16_t bitRevFactor,
48 uint16_t * pBitRevTab);
50 /**
51 * @ingroup groupTransforms
54 /**
55 * @addtogroup ComplexFFT
56 * @{
59 /**
60 * @details
61 * @brief Processing function for the Q31 CFFT/CIFFT.
62 * @deprecated Do not use this function. It has been superseded by \ref arm_cfft_q31 and will be removed
63 * @param[in] *S points to an instance of the Q31 CFFT/CIFFT structure.
64 * @param[in, out] *pSrc points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place.
65 * @return none.
67 * \par Input and output formats:
68 * \par
69 * Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
70 * Hence the output format is different for different FFT sizes.
71 * The input and output formats for different FFT sizes and number of bits to upscale are mentioned in the tables below for CFFT and CIFFT:
72 * \par
73 * \image html CFFTQ31.gif "Input and Output Formats for Q31 CFFT"
74 * \image html CIFFTQ31.gif "Input and Output Formats for Q31 CIFFT"
78 void arm_cfft_radix4_q31(
79 const arm_cfft_radix4_instance_q31 * S,
80 q31_t * pSrc)
82 if (S->ifftFlag == 1U)
84 /* Complex IFFT radix-4 */
85 arm_radix4_butterfly_inverse_q31(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
87 else
89 /* Complex FFT radix-4 */
90 arm_radix4_butterfly_q31(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
93 if (S->bitReverseFlag == 1U)
95 /* Bit Reversal */
96 arm_bitreversal_q31(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
102 * @} end of ComplexFFT group
106 * Radix-4 FFT algorithm used is :
108 * Input real and imaginary data:
109 * x(n) = xa + j * ya
110 * x(n+N/4 ) = xb + j * yb
111 * x(n+N/2 ) = xc + j * yc
112 * x(n+3N 4) = xd + j * yd
115 * Output real and imaginary data:
116 * x(4r) = xa'+ j * ya'
117 * x(4r+1) = xb'+ j * yb'
118 * x(4r+2) = xc'+ j * yc'
119 * x(4r+3) = xd'+ j * yd'
122 * Twiddle factors for radix-4 FFT:
123 * Wn = co1 + j * (- si1)
124 * W2n = co2 + j * (- si2)
125 * W3n = co3 + j * (- si3)
127 * Butterfly implementation:
128 * xa' = xa + xb + xc + xd
129 * ya' = ya + yb + yc + yd
130 * xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1)
131 * yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1)
132 * xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2)
133 * yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2)
134 * xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3)
135 * yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3)
140 * @brief Core function for the Q31 CFFT butterfly process.
141 * @param[in, out] *pSrc points to the in-place buffer of Q31 data type.
142 * @param[in] fftLen length of the FFT.
143 * @param[in] *pCoef points to twiddle coefficient buffer.
144 * @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
145 * @return none.
148 void arm_radix4_butterfly_q31(
149 q31_t * pSrc,
150 uint32_t fftLen,
151 q31_t * pCoef,
152 uint32_t twidCoefModifier)
154 #if defined(ARM_MATH_CM7)
155 uint32_t n1, n2, ia1, ia2, ia3, i0, i1, i2, i3, j, k;
156 q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
158 q31_t xa, xb, xc, xd;
159 q31_t ya, yb, yc, yd;
160 q31_t xa_out, xb_out, xc_out, xd_out;
161 q31_t ya_out, yb_out, yc_out, yd_out;
163 q31_t *ptr1;
164 q63_t xaya, xbyb, xcyc, xdyd;
165 /* Total process is divided into three stages */
167 /* process first stage, middle stages, & last stage */
170 /* start of first stage process */
172 /* Initializations for the first stage */
173 n2 = fftLen;
174 n1 = n2;
175 /* n2 = fftLen/4 */
176 n2 >>= 2U;
177 i0 = 0U;
178 ia1 = 0U;
180 j = n2;
182 /* Calculation of first stage */
185 /* index calculation for the input as, */
186 /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
187 i1 = i0 + n2;
188 i2 = i1 + n2;
189 i3 = i2 + n2;
191 /* input is in 1.31(q31) format and provide 4 guard bits for the input */
193 /* Butterfly implementation */
194 /* xa + xc */
195 r1 = (pSrc[(2U * i0)] >> 4U) + (pSrc[(2U * i2)] >> 4U);
196 /* xa - xc */
197 r2 = (pSrc[2U * i0] >> 4U) - (pSrc[2U * i2] >> 4U);
199 /* xb + xd */
200 t1 = (pSrc[2U * i1] >> 4U) + (pSrc[2U * i3] >> 4U);
202 /* ya + yc */
203 s1 = (pSrc[(2U * i0) + 1U] >> 4U) + (pSrc[(2U * i2) + 1U] >> 4U);
204 /* ya - yc */
205 s2 = (pSrc[(2U * i0) + 1U] >> 4U) - (pSrc[(2U * i2) + 1U] >> 4U);
207 /* xa' = xa + xb + xc + xd */
208 pSrc[2U * i0] = (r1 + t1);
209 /* (xa + xc) - (xb + xd) */
210 r1 = r1 - t1;
211 /* yb + yd */
212 t2 = (pSrc[(2U * i1) + 1U] >> 4U) + (pSrc[(2U * i3) + 1U] >> 4U);
214 /* ya' = ya + yb + yc + yd */
215 pSrc[(2U * i0) + 1U] = (s1 + t2);
217 /* (ya + yc) - (yb + yd) */
218 s1 = s1 - t2;
220 /* yb - yd */
221 t1 = (pSrc[(2U * i1) + 1U] >> 4U) - (pSrc[(2U * i3) + 1U] >> 4U);
222 /* xb - xd */
223 t2 = (pSrc[2U * i1] >> 4U) - (pSrc[2U * i3] >> 4U);
225 /* index calculation for the coefficients */
226 ia2 = 2U * ia1;
227 co2 = pCoef[ia2 * 2U];
228 si2 = pCoef[(ia2 * 2U) + 1U];
230 /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
231 pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
232 ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1U;
234 /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
235 pSrc[(2U * i1) + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
236 ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1U;
238 /* (xa - xc) + (yb - yd) */
239 r1 = r2 + t1;
240 /* (xa - xc) - (yb - yd) */
241 r2 = r2 - t1;
243 /* (ya - yc) - (xb - xd) */
244 s1 = s2 - t2;
245 /* (ya - yc) + (xb - xd) */
246 s2 = s2 + t2;
248 co1 = pCoef[ia1 * 2U];
249 si1 = pCoef[(ia1 * 2U) + 1U];
251 /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
252 pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
253 ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1U;
255 /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
256 pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
257 ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1U;
259 /* index calculation for the coefficients */
260 ia3 = 3U * ia1;
261 co3 = pCoef[ia3 * 2U];
262 si3 = pCoef[(ia3 * 2U) + 1U];
264 /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
265 pSrc[2U * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
266 ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1U;
268 /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
269 pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
270 ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1U;
272 /* Twiddle coefficients index modifier */
273 ia1 = ia1 + twidCoefModifier;
275 /* Updating input index */
276 i0 = i0 + 1U;
278 } while (--j);
280 /* end of first stage process */
282 /* data is in 5.27(q27) format */
285 /* start of Middle stages process */
288 /* each stage in middle stages provides two down scaling of the input */
290 twidCoefModifier <<= 2U;
293 for (k = fftLen / 4U; k > 4U; k >>= 2U)
295 /* Initializations for the first stage */
296 n1 = n2;
297 n2 >>= 2U;
298 ia1 = 0U;
300 /* Calculation of first stage */
301 for (j = 0U; j <= (n2 - 1U); j++)
303 /* index calculation for the coefficients */
304 ia2 = ia1 + ia1;
305 ia3 = ia2 + ia1;
306 co1 = pCoef[ia1 * 2U];
307 si1 = pCoef[(ia1 * 2U) + 1U];
308 co2 = pCoef[ia2 * 2U];
309 si2 = pCoef[(ia2 * 2U) + 1U];
310 co3 = pCoef[ia3 * 2U];
311 si3 = pCoef[(ia3 * 2U) + 1U];
312 /* Twiddle coefficients index modifier */
313 ia1 = ia1 + twidCoefModifier;
315 for (i0 = j; i0 < fftLen; i0 += n1)
317 /* index calculation for the input as, */
318 /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
319 i1 = i0 + n2;
320 i2 = i1 + n2;
321 i3 = i2 + n2;
323 /* Butterfly implementation */
324 /* xa + xc */
325 r1 = pSrc[2U * i0] + pSrc[2U * i2];
326 /* xa - xc */
327 r2 = pSrc[2U * i0] - pSrc[2U * i2];
329 /* ya + yc */
330 s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U];
331 /* ya - yc */
332 s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U];
334 /* xb + xd */
335 t1 = pSrc[2U * i1] + pSrc[2U * i3];
337 /* xa' = xa + xb + xc + xd */
338 pSrc[2U * i0] = (r1 + t1) >> 2U;
339 /* xa + xc -(xb + xd) */
340 r1 = r1 - t1;
342 /* yb + yd */
343 t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U];
344 /* ya' = ya + yb + yc + yd */
345 pSrc[(2U * i0) + 1U] = (s1 + t2) >> 2U;
347 /* (ya + yc) - (yb + yd) */
348 s1 = s1 - t2;
350 /* (yb - yd) */
351 t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U];
352 /* (xb - xd) */
353 t2 = pSrc[2U * i1] - pSrc[2U * i3];
355 /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
356 pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
357 ((int32_t) (((q63_t) s1 * si2) >> 32))) >> 1U;
359 /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
360 pSrc[(2U * i1) + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
361 ((int32_t) (((q63_t) r1 * si2) >> 32))) >> 1U;
363 /* (xa - xc) + (yb - yd) */
364 r1 = r2 + t1;
365 /* (xa - xc) - (yb - yd) */
366 r2 = r2 - t1;
368 /* (ya - yc) - (xb - xd) */
369 s1 = s2 - t2;
370 /* (ya - yc) + (xb - xd) */
371 s2 = s2 + t2;
373 /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
374 pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
375 ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1U;
377 /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
378 pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
379 ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1U;
381 /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
382 pSrc[2U * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
383 ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1U;
385 /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
386 pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
387 ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1U;
390 twidCoefModifier <<= 2U;
392 #else
393 uint32_t n1, n2, ia1, ia2, ia3, i0, j, k;
394 q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
396 q31_t xa, xb, xc, xd;
397 q31_t ya, yb, yc, yd;
398 q31_t xa_out, xb_out, xc_out, xd_out;
399 q31_t ya_out, yb_out, yc_out, yd_out;
401 q31_t *ptr1;
402 q31_t *pSi0;
403 q31_t *pSi1;
404 q31_t *pSi2;
405 q31_t *pSi3;
406 q63_t xaya, xbyb, xcyc, xdyd;
407 /* Total process is divided into three stages */
409 /* process first stage, middle stages, & last stage */
412 /* start of first stage process */
414 /* Initializations for the first stage */
415 n2 = fftLen;
416 n1 = n2;
417 /* n2 = fftLen/4 */
418 n2 >>= 2U;
420 ia1 = 0U;
422 j = n2;
424 pSi0 = pSrc;
425 pSi1 = pSi0 + 2 * n2;
426 pSi2 = pSi1 + 2 * n2;
427 pSi3 = pSi2 + 2 * n2;
429 /* Calculation of first stage */
432 /* input is in 1.31(q31) format and provide 4 guard bits for the input */
434 /* Butterfly implementation */
435 /* xa + xc */
436 r1 = (pSi0[0] >> 4U) + (pSi2[0] >> 4U);
437 /* xa - xc */
438 r2 = (pSi0[0] >> 4U) - (pSi2[0] >> 4U);
440 /* xb + xd */
441 t1 = (pSi1[0] >> 4U) + (pSi3[0] >> 4U);
443 /* ya + yc */
444 s1 = (pSi0[1] >> 4U) + (pSi2[1] >> 4U);
445 /* ya - yc */
446 s2 = (pSi0[1] >> 4U) - (pSi2[1] >> 4U);
448 /* xa' = xa + xb + xc + xd */
449 *pSi0++ = (r1 + t1);
450 /* (xa + xc) - (xb + xd) */
451 r1 = r1 - t1;
452 /* yb + yd */
453 t2 = (pSi1[1] >> 4U) + (pSi3[1] >> 4U);
455 /* ya' = ya + yb + yc + yd */
456 *pSi0++ = (s1 + t2);
458 /* (ya + yc) - (yb + yd) */
459 s1 = s1 - t2;
461 /* yb - yd */
462 t1 = (pSi1[1] >> 4U) - (pSi3[1] >> 4U);
463 /* xb - xd */
464 t2 = (pSi1[0] >> 4U) - (pSi3[0] >> 4U);
466 /* index calculation for the coefficients */
467 ia2 = 2U * ia1;
468 co2 = pCoef[ia2 * 2U];
469 si2 = pCoef[(ia2 * 2U) + 1U];
471 /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
472 *pSi1++ = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
473 ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1U;
475 /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
476 *pSi1++ = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
477 ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1U;
479 /* (xa - xc) + (yb - yd) */
480 r1 = r2 + t1;
481 /* (xa - xc) - (yb - yd) */
482 r2 = r2 - t1;
484 /* (ya - yc) - (xb - xd) */
485 s1 = s2 - t2;
486 /* (ya - yc) + (xb - xd) */
487 s2 = s2 + t2;
489 co1 = pCoef[ia1 * 2U];
490 si1 = pCoef[(ia1 * 2U) + 1U];
492 /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
493 *pSi2++ = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
494 ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1U;
496 /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
497 *pSi2++ = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
498 ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1U;
500 /* index calculation for the coefficients */
501 ia3 = 3U * ia1;
502 co3 = pCoef[ia3 * 2U];
503 si3 = pCoef[(ia3 * 2U) + 1U];
505 /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
506 *pSi3++ = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
507 ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1U;
509 /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
510 *pSi3++ = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
511 ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1U;
513 /* Twiddle coefficients index modifier */
514 ia1 = ia1 + twidCoefModifier;
516 } while (--j);
518 /* end of first stage process */
520 /* data is in 5.27(q27) format */
523 /* start of Middle stages process */
526 /* each stage in middle stages provides two down scaling of the input */
528 twidCoefModifier <<= 2U;
531 for (k = fftLen / 4U; k > 4U; k >>= 2U)
533 /* Initializations for the first stage */
534 n1 = n2;
535 n2 >>= 2U;
536 ia1 = 0U;
538 /* Calculation of first stage */
539 for (j = 0U; j <= (n2 - 1U); j++)
541 /* index calculation for the coefficients */
542 ia2 = ia1 + ia1;
543 ia3 = ia2 + ia1;
544 co1 = pCoef[ia1 * 2U];
545 si1 = pCoef[(ia1 * 2U) + 1U];
546 co2 = pCoef[ia2 * 2U];
547 si2 = pCoef[(ia2 * 2U) + 1U];
548 co3 = pCoef[ia3 * 2U];
549 si3 = pCoef[(ia3 * 2U) + 1U];
550 /* Twiddle coefficients index modifier */
551 ia1 = ia1 + twidCoefModifier;
553 pSi0 = pSrc + 2 * j;
554 pSi1 = pSi0 + 2 * n2;
555 pSi2 = pSi1 + 2 * n2;
556 pSi3 = pSi2 + 2 * n2;
558 for (i0 = j; i0 < fftLen; i0 += n1)
560 /* Butterfly implementation */
561 /* xa + xc */
562 r1 = pSi0[0] + pSi2[0];
564 /* xa - xc */
565 r2 = pSi0[0] - pSi2[0];
568 /* ya + yc */
569 s1 = pSi0[1] + pSi2[1];
571 /* ya - yc */
572 s2 = pSi0[1] - pSi2[1];
575 /* xb + xd */
576 t1 = pSi1[0] + pSi3[0];
579 /* xa' = xa + xb + xc + xd */
580 pSi0[0] = (r1 + t1) >> 2U;
581 /* xa + xc -(xb + xd) */
582 r1 = r1 - t1;
584 /* yb + yd */
585 t2 = pSi1[1] + pSi3[1];
587 /* ya' = ya + yb + yc + yd */
588 pSi0[1] = (s1 + t2) >> 2U;
589 pSi0 += 2 * n1;
591 /* (ya + yc) - (yb + yd) */
592 s1 = s1 - t2;
594 /* (yb - yd) */
595 t1 = pSi1[1] - pSi3[1];
597 /* (xb - xd) */
598 t2 = pSi1[0] - pSi3[0];
601 /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
602 pSi1[0] = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
603 ((int32_t) (((q63_t) s1 * si2) >> 32))) >> 1U;
605 /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
606 pSi1[1] = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
607 ((int32_t) (((q63_t) r1 * si2) >> 32))) >> 1U;
608 pSi1 += 2 * n1;
610 /* (xa - xc) + (yb - yd) */
611 r1 = r2 + t1;
612 /* (xa - xc) - (yb - yd) */
613 r2 = r2 - t1;
615 /* (ya - yc) - (xb - xd) */
616 s1 = s2 - t2;
617 /* (ya - yc) + (xb - xd) */
618 s2 = s2 + t2;
620 /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
621 pSi2[0] = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
622 ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1U;
624 /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
625 pSi2[1] = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
626 ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1U;
627 pSi2 += 2 * n1;
629 /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
630 pSi3[0] = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
631 ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1U;
633 /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
634 pSi3[1] = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
635 ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1U;
636 pSi3 += 2 * n1;
639 twidCoefModifier <<= 2U;
641 #endif
643 /* End of Middle stages process */
645 /* data is in 11.21(q21) format for the 1024 point as there are 3 middle stages */
646 /* data is in 9.23(q23) format for the 256 point as there are 2 middle stages */
647 /* data is in 7.25(q25) format for the 64 point as there are 1 middle stage */
648 /* data is in 5.27(q27) format for the 16 point as there are no middle stages */
651 /* start of Last stage process */
652 /* Initializations for the last stage */
653 j = fftLen >> 2;
654 ptr1 = &pSrc[0];
656 /* Calculations of last stage */
660 #ifndef ARM_MATH_BIG_ENDIAN
662 /* Read xa (real), ya(imag) input */
663 xaya = *__SIMD64(ptr1)++;
664 xa = (q31_t) xaya;
665 ya = (q31_t) (xaya >> 32);
667 /* Read xb (real), yb(imag) input */
668 xbyb = *__SIMD64(ptr1)++;
669 xb = (q31_t) xbyb;
670 yb = (q31_t) (xbyb >> 32);
672 /* Read xc (real), yc(imag) input */
673 xcyc = *__SIMD64(ptr1)++;
674 xc = (q31_t) xcyc;
675 yc = (q31_t) (xcyc >> 32);
677 /* Read xc (real), yc(imag) input */
678 xdyd = *__SIMD64(ptr1)++;
679 xd = (q31_t) xdyd;
680 yd = (q31_t) (xdyd >> 32);
682 #else
684 /* Read xa (real), ya(imag) input */
685 xaya = *__SIMD64(ptr1)++;
686 ya = (q31_t) xaya;
687 xa = (q31_t) (xaya >> 32);
689 /* Read xb (real), yb(imag) input */
690 xbyb = *__SIMD64(ptr1)++;
691 yb = (q31_t) xbyb;
692 xb = (q31_t) (xbyb >> 32);
694 /* Read xc (real), yc(imag) input */
695 xcyc = *__SIMD64(ptr1)++;
696 yc = (q31_t) xcyc;
697 xc = (q31_t) (xcyc >> 32);
699 /* Read xc (real), yc(imag) input */
700 xdyd = *__SIMD64(ptr1)++;
701 yd = (q31_t) xdyd;
702 xd = (q31_t) (xdyd >> 32);
705 #endif
707 /* xa' = xa + xb + xc + xd */
708 xa_out = xa + xb + xc + xd;
710 /* ya' = ya + yb + yc + yd */
711 ya_out = ya + yb + yc + yd;
713 /* pointer updation for writing */
714 ptr1 = ptr1 - 8U;
716 /* writing xa' and ya' */
717 *ptr1++ = xa_out;
718 *ptr1++ = ya_out;
720 xc_out = (xa - xb + xc - xd);
721 yc_out = (ya - yb + yc - yd);
723 /* writing xc' and yc' */
724 *ptr1++ = xc_out;
725 *ptr1++ = yc_out;
727 xb_out = (xa + yb - xc - yd);
728 yb_out = (ya - xb - yc + xd);
730 /* writing xb' and yb' */
731 *ptr1++ = xb_out;
732 *ptr1++ = yb_out;
734 xd_out = (xa - yb - xc + yd);
735 yd_out = (ya + xb - yc - xd);
737 /* writing xd' and yd' */
738 *ptr1++ = xd_out;
739 *ptr1++ = yd_out;
742 } while (--j);
744 /* output is in 11.21(q21) format for the 1024 point */
745 /* output is in 9.23(q23) format for the 256 point */
746 /* output is in 7.25(q25) format for the 64 point */
747 /* output is in 5.27(q27) format for the 16 point */
749 /* End of last stage process */
755 * @brief Core function for the Q31 CIFFT butterfly process.
756 * @param[in, out] *pSrc points to the in-place buffer of Q31 data type.
757 * @param[in] fftLen length of the FFT.
758 * @param[in] *pCoef points to twiddle coefficient buffer.
759 * @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
760 * @return none.
765 * Radix-4 IFFT algorithm used is :
767 * CIFFT uses same twiddle coefficients as CFFT Function
768 * x[k] = x[n] + (j)k * x[n + fftLen/4] + (-1)k * x[n+fftLen/2] + (-j)k * x[n+3*fftLen/4]
771 * IFFT is implemented with following changes in equations from FFT
773 * Input real and imaginary data:
774 * x(n) = xa + j * ya
775 * x(n+N/4 ) = xb + j * yb
776 * x(n+N/2 ) = xc + j * yc
777 * x(n+3N 4) = xd + j * yd
780 * Output real and imaginary data:
781 * x(4r) = xa'+ j * ya'
782 * x(4r+1) = xb'+ j * yb'
783 * x(4r+2) = xc'+ j * yc'
784 * x(4r+3) = xd'+ j * yd'
787 * Twiddle factors for radix-4 IFFT:
788 * Wn = co1 + j * (si1)
789 * W2n = co2 + j * (si2)
790 * W3n = co3 + j * (si3)
792 * The real and imaginary output values for the radix-4 butterfly are
793 * xa' = xa + xb + xc + xd
794 * ya' = ya + yb + yc + yd
795 * xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)
796 * yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)
797 * xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)
798 * yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)
799 * xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)
800 * yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)
804 void arm_radix4_butterfly_inverse_q31(
805 q31_t * pSrc,
806 uint32_t fftLen,
807 q31_t * pCoef,
808 uint32_t twidCoefModifier)
810 #if defined(ARM_MATH_CM7)
811 uint32_t n1, n2, ia1, ia2, ia3, i0, i1, i2, i3, j, k;
812 q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
813 q31_t xa, xb, xc, xd;
814 q31_t ya, yb, yc, yd;
815 q31_t xa_out, xb_out, xc_out, xd_out;
816 q31_t ya_out, yb_out, yc_out, yd_out;
818 q31_t *ptr1;
819 q63_t xaya, xbyb, xcyc, xdyd;
821 /* input is be 1.31(q31) format for all FFT sizes */
822 /* Total process is divided into three stages */
823 /* process first stage, middle stages, & last stage */
825 /* Start of first stage process */
827 /* Initializations for the first stage */
828 n2 = fftLen;
829 n1 = n2;
830 /* n2 = fftLen/4 */
831 n2 >>= 2U;
832 i0 = 0U;
833 ia1 = 0U;
835 j = n2;
840 /* input is in 1.31(q31) format and provide 4 guard bits for the input */
842 /* index calculation for the input as, */
843 /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
844 i1 = i0 + n2;
845 i2 = i1 + n2;
846 i3 = i2 + n2;
848 /* Butterfly implementation */
849 /* xa + xc */
850 r1 = (pSrc[2U * i0] >> 4U) + (pSrc[2U * i2] >> 4U);
851 /* xa - xc */
852 r2 = (pSrc[2U * i0] >> 4U) - (pSrc[2U * i2] >> 4U);
854 /* xb + xd */
855 t1 = (pSrc[2U * i1] >> 4U) + (pSrc[2U * i3] >> 4U);
857 /* ya + yc */
858 s1 = (pSrc[(2U * i0) + 1U] >> 4U) + (pSrc[(2U * i2) + 1U] >> 4U);
859 /* ya - yc */
860 s2 = (pSrc[(2U * i0) + 1U] >> 4U) - (pSrc[(2U * i2) + 1U] >> 4U);
862 /* xa' = xa + xb + xc + xd */
863 pSrc[2U * i0] = (r1 + t1);
864 /* (xa + xc) - (xb + xd) */
865 r1 = r1 - t1;
866 /* yb + yd */
867 t2 = (pSrc[(2U * i1) + 1U] >> 4U) + (pSrc[(2U * i3) + 1U] >> 4U);
868 /* ya' = ya + yb + yc + yd */
869 pSrc[(2U * i0) + 1U] = (s1 + t2);
871 /* (ya + yc) - (yb + yd) */
872 s1 = s1 - t2;
874 /* yb - yd */
875 t1 = (pSrc[(2U * i1) + 1U] >> 4U) - (pSrc[(2U * i3) + 1U] >> 4U);
876 /* xb - xd */
877 t2 = (pSrc[2U * i1] >> 4U) - (pSrc[2U * i3] >> 4U);
879 /* index calculation for the coefficients */
880 ia2 = 2U * ia1;
881 co2 = pCoef[ia2 * 2U];
882 si2 = pCoef[(ia2 * 2U) + 1U];
884 /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
885 pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) -
886 ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1U;
888 /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
889 pSrc[2U * i1 + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32)) +
890 ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1U;
892 /* (xa - xc) - (yb - yd) */
893 r1 = r2 - t1;
894 /* (xa - xc) + (yb - yd) */
895 r2 = r2 + t1;
897 /* (ya - yc) + (xb - xd) */
898 s1 = s2 + t2;
899 /* (ya - yc) - (xb - xd) */
900 s2 = s2 - t2;
902 co1 = pCoef[ia1 * 2U];
903 si1 = pCoef[(ia1 * 2U) + 1U];
905 /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
906 pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
907 ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1U;
909 /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
910 pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
911 ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1U;
913 /* index calculation for the coefficients */
914 ia3 = 3U * ia1;
915 co3 = pCoef[ia3 * 2U];
916 si3 = pCoef[(ia3 * 2U) + 1U];
918 /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
919 pSrc[2U * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
920 ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1U;
922 /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
923 pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
924 ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1U;
926 /* Twiddle coefficients index modifier */
927 ia1 = ia1 + twidCoefModifier;
929 /* Updating input index */
930 i0 = i0 + 1U;
932 } while (--j);
934 /* data is in 5.27(q27) format */
935 /* each stage provides two down scaling of the input */
938 /* Start of Middle stages process */
940 twidCoefModifier <<= 2U;
942 /* Calculation of second stage to excluding last stage */
943 for (k = fftLen / 4U; k > 4U; k >>= 2U)
945 /* Initializations for the first stage */
946 n1 = n2;
947 n2 >>= 2U;
948 ia1 = 0U;
950 for (j = 0; j <= (n2 - 1U); j++)
952 /* index calculation for the coefficients */
953 ia2 = ia1 + ia1;
954 ia3 = ia2 + ia1;
955 co1 = pCoef[ia1 * 2U];
956 si1 = pCoef[(ia1 * 2U) + 1U];
957 co2 = pCoef[ia2 * 2U];
958 si2 = pCoef[(ia2 * 2U) + 1U];
959 co3 = pCoef[ia3 * 2U];
960 si3 = pCoef[(ia3 * 2U) + 1U];
961 /* Twiddle coefficients index modifier */
962 ia1 = ia1 + twidCoefModifier;
964 for (i0 = j; i0 < fftLen; i0 += n1)
966 /* index calculation for the input as, */
967 /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
968 i1 = i0 + n2;
969 i2 = i1 + n2;
970 i3 = i2 + n2;
972 /* Butterfly implementation */
973 /* xa + xc */
974 r1 = pSrc[2U * i0] + pSrc[2U * i2];
975 /* xa - xc */
976 r2 = pSrc[2U * i0] - pSrc[2U * i2];
978 /* ya + yc */
979 s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U];
980 /* ya - yc */
981 s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U];
983 /* xb + xd */
984 t1 = pSrc[2U * i1] + pSrc[2U * i3];
986 /* xa' = xa + xb + xc + xd */
987 pSrc[2U * i0] = (r1 + t1) >> 2U;
988 /* xa + xc -(xb + xd) */
989 r1 = r1 - t1;
990 /* yb + yd */
991 t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U];
992 /* ya' = ya + yb + yc + yd */
993 pSrc[(2U * i0) + 1U] = (s1 + t2) >> 2U;
995 /* (ya + yc) - (yb + yd) */
996 s1 = s1 - t2;
998 /* (yb - yd) */
999 t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U];
1000 /* (xb - xd) */
1001 t2 = pSrc[2U * i1] - pSrc[2U * i3];
1003 /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
1004 pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32U)) -
1005 ((int32_t) (((q63_t) s1 * si2) >> 32U))) >> 1U;
1007 /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
1008 pSrc[(2U * i1) + 1U] =
1009 (((int32_t) (((q63_t) s1 * co2) >> 32U)) +
1010 ((int32_t) (((q63_t) r1 * si2) >> 32U))) >> 1U;
1012 /* (xa - xc) - (yb - yd) */
1013 r1 = r2 - t1;
1014 /* (xa - xc) + (yb - yd) */
1015 r2 = r2 + t1;
1017 /* (ya - yc) + (xb - xd) */
1018 s1 = s2 + t2;
1019 /* (ya - yc) - (xb - xd) */
1020 s2 = s2 - t2;
1022 /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
1023 pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
1024 ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1U;
1026 /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
1027 pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
1028 ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1U;
1030 /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
1031 pSrc[(2U * i3)] = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
1032 ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1U;
1034 /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
1035 pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
1036 ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1U;
1039 twidCoefModifier <<= 2U;
1041 #else
1042 uint32_t n1, n2, ia1, ia2, ia3, i0, j, k;
1043 q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
1044 q31_t xa, xb, xc, xd;
1045 q31_t ya, yb, yc, yd;
1046 q31_t xa_out, xb_out, xc_out, xd_out;
1047 q31_t ya_out, yb_out, yc_out, yd_out;
1049 q31_t *ptr1;
1050 q31_t *pSi0;
1051 q31_t *pSi1;
1052 q31_t *pSi2;
1053 q31_t *pSi3;
1054 q63_t xaya, xbyb, xcyc, xdyd;
1056 /* input is be 1.31(q31) format for all FFT sizes */
1057 /* Total process is divided into three stages */
1058 /* process first stage, middle stages, & last stage */
1060 /* Start of first stage process */
1062 /* Initializations for the first stage */
1063 n2 = fftLen;
1064 n1 = n2;
1065 /* n2 = fftLen/4 */
1066 n2 >>= 2U;
1068 ia1 = 0U;
1070 j = n2;
1072 pSi0 = pSrc;
1073 pSi1 = pSi0 + 2 * n2;
1074 pSi2 = pSi1 + 2 * n2;
1075 pSi3 = pSi2 + 2 * n2;
1079 /* Butterfly implementation */
1080 /* xa + xc */
1081 r1 = (pSi0[0] >> 4U) + (pSi2[0] >> 4U);
1082 /* xa - xc */
1083 r2 = (pSi0[0] >> 4U) - (pSi2[0] >> 4U);
1085 /* xb + xd */
1086 t1 = (pSi1[0] >> 4U) + (pSi3[0] >> 4U);
1088 /* ya + yc */
1089 s1 = (pSi0[1] >> 4U) + (pSi2[1] >> 4U);
1090 /* ya - yc */
1091 s2 = (pSi0[1] >> 4U) - (pSi2[1] >> 4U);
1093 /* xa' = xa + xb + xc + xd */
1094 *pSi0++ = (r1 + t1);
1095 /* (xa + xc) - (xb + xd) */
1096 r1 = r1 - t1;
1097 /* yb + yd */
1098 t2 = (pSi1[1] >> 4U) + (pSi3[1] >> 4U);
1099 /* ya' = ya + yb + yc + yd */
1100 *pSi0++ = (s1 + t2);
1102 /* (ya + yc) - (yb + yd) */
1103 s1 = s1 - t2;
1105 /* yb - yd */
1106 t1 = (pSi1[1] >> 4U) - (pSi3[1] >> 4U);
1107 /* xb - xd */
1108 t2 = (pSi1[0] >> 4U) - (pSi3[0] >> 4U);
1110 /* index calculation for the coefficients */
1111 ia2 = 2U * ia1;
1112 co2 = pCoef[ia2 * 2U];
1113 si2 = pCoef[(ia2 * 2U) + 1U];
1115 /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
1116 *pSi1++ = (((int32_t) (((q63_t) r1 * co2) >> 32)) -
1117 ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1U;
1119 /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
1120 *pSi1++ = (((int32_t) (((q63_t) s1 * co2) >> 32)) +
1121 ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1U;
1123 /* (xa - xc) - (yb - yd) */
1124 r1 = r2 - t1;
1125 /* (xa - xc) + (yb - yd) */
1126 r2 = r2 + t1;
1128 /* (ya - yc) + (xb - xd) */
1129 s1 = s2 + t2;
1130 /* (ya - yc) - (xb - xd) */
1131 s2 = s2 - t2;
1133 co1 = pCoef[ia1 * 2U];
1134 si1 = pCoef[(ia1 * 2U) + 1U];
1136 /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
1137 *pSi2++ = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
1138 ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1U;
1140 /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
1141 *pSi2++ = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
1142 ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1U;
1144 /* index calculation for the coefficients */
1145 ia3 = 3U * ia1;
1146 co3 = pCoef[ia3 * 2U];
1147 si3 = pCoef[(ia3 * 2U) + 1U];
1149 /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
1150 *pSi3++ = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
1151 ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1U;
1153 /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
1154 *pSi3++ = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
1155 ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1U;
1157 /* Twiddle coefficients index modifier */
1158 ia1 = ia1 + twidCoefModifier;
1160 } while (--j);
1162 /* data is in 5.27(q27) format */
1163 /* each stage provides two down scaling of the input */
1166 /* Start of Middle stages process */
1168 twidCoefModifier <<= 2U;
1170 /* Calculation of second stage to excluding last stage */
1171 for (k = fftLen / 4U; k > 4U; k >>= 2U)
1173 /* Initializations for the first stage */
1174 n1 = n2;
1175 n2 >>= 2U;
1176 ia1 = 0U;
1178 for (j = 0; j <= (n2 - 1U); j++)
1180 /* index calculation for the coefficients */
1181 ia2 = ia1 + ia1;
1182 ia3 = ia2 + ia1;
1183 co1 = pCoef[ia1 * 2U];
1184 si1 = pCoef[(ia1 * 2U) + 1U];
1185 co2 = pCoef[ia2 * 2U];
1186 si2 = pCoef[(ia2 * 2U) + 1U];
1187 co3 = pCoef[ia3 * 2U];
1188 si3 = pCoef[(ia3 * 2U) + 1U];
1189 /* Twiddle coefficients index modifier */
1190 ia1 = ia1 + twidCoefModifier;
1192 pSi0 = pSrc + 2 * j;
1193 pSi1 = pSi0 + 2 * n2;
1194 pSi2 = pSi1 + 2 * n2;
1195 pSi3 = pSi2 + 2 * n2;
1197 for (i0 = j; i0 < fftLen; i0 += n1)
1199 /* Butterfly implementation */
1200 /* xa + xc */
1201 r1 = pSi0[0] + pSi2[0];
1203 /* xa - xc */
1204 r2 = pSi0[0] - pSi2[0];
1207 /* ya + yc */
1208 s1 = pSi0[1] + pSi2[1];
1210 /* ya - yc */
1211 s2 = pSi0[1] - pSi2[1];
1214 /* xb + xd */
1215 t1 = pSi1[0] + pSi3[0];
1218 /* xa' = xa + xb + xc + xd */
1219 pSi0[0] = (r1 + t1) >> 2U;
1220 /* xa + xc -(xb + xd) */
1221 r1 = r1 - t1;
1222 /* yb + yd */
1223 t2 = pSi1[1] + pSi3[1];
1225 /* ya' = ya + yb + yc + yd */
1226 pSi0[1] = (s1 + t2) >> 2U;
1227 pSi0 += 2 * n1;
1229 /* (ya + yc) - (yb + yd) */
1230 s1 = s1 - t2;
1232 /* (yb - yd) */
1233 t1 = pSi1[1] - pSi3[1];
1235 /* (xb - xd) */
1236 t2 = pSi1[0] - pSi3[0];
1239 /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
1240 pSi1[0] = (((int32_t) (((q63_t) r1 * co2) >> 32U)) -
1241 ((int32_t) (((q63_t) s1 * si2) >> 32U))) >> 1U;
1243 /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
1244 pSi1[1] =
1246 (((int32_t) (((q63_t) s1 * co2) >> 32U)) +
1247 ((int32_t) (((q63_t) r1 * si2) >> 32U))) >> 1U;
1248 pSi1 += 2 * n1;
1250 /* (xa - xc) - (yb - yd) */
1251 r1 = r2 - t1;
1252 /* (xa - xc) + (yb - yd) */
1253 r2 = r2 + t1;
1255 /* (ya - yc) + (xb - xd) */
1256 s1 = s2 + t2;
1257 /* (ya - yc) - (xb - xd) */
1258 s2 = s2 - t2;
1260 /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
1261 pSi2[0] = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
1262 ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1U;
1264 /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
1265 pSi2[1] = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
1266 ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1U;
1267 pSi2 += 2 * n1;
1269 /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
1270 pSi3[0] = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
1271 ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1U;
1273 /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
1274 pSi3[1] = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
1275 ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1U;
1276 pSi3 += 2 * n1;
1279 twidCoefModifier <<= 2U;
1281 #endif
1283 /* End of Middle stages process */
1285 /* data is in 11.21(q21) format for the 1024 point as there are 3 middle stages */
1286 /* data is in 9.23(q23) format for the 256 point as there are 2 middle stages */
1287 /* data is in 7.25(q25) format for the 64 point as there are 1 middle stage */
1288 /* data is in 5.27(q27) format for the 16 point as there are no middle stages */
1291 /* Start of last stage process */
1294 /* Initializations for the last stage */
1295 j = fftLen >> 2;
1296 ptr1 = &pSrc[0];
1298 /* Calculations of last stage */
1301 #ifndef ARM_MATH_BIG_ENDIAN
1302 /* Read xa (real), ya(imag) input */
1303 xaya = *__SIMD64(ptr1)++;
1304 xa = (q31_t) xaya;
1305 ya = (q31_t) (xaya >> 32);
1307 /* Read xb (real), yb(imag) input */
1308 xbyb = *__SIMD64(ptr1)++;
1309 xb = (q31_t) xbyb;
1310 yb = (q31_t) (xbyb >> 32);
1312 /* Read xc (real), yc(imag) input */
1313 xcyc = *__SIMD64(ptr1)++;
1314 xc = (q31_t) xcyc;
1315 yc = (q31_t) (xcyc >> 32);
1317 /* Read xc (real), yc(imag) input */
1318 xdyd = *__SIMD64(ptr1)++;
1319 xd = (q31_t) xdyd;
1320 yd = (q31_t) (xdyd >> 32);
1322 #else
1324 /* Read xa (real), ya(imag) input */
1325 xaya = *__SIMD64(ptr1)++;
1326 ya = (q31_t) xaya;
1327 xa = (q31_t) (xaya >> 32);
1329 /* Read xb (real), yb(imag) input */
1330 xbyb = *__SIMD64(ptr1)++;
1331 yb = (q31_t) xbyb;
1332 xb = (q31_t) (xbyb >> 32);
1334 /* Read xc (real), yc(imag) input */
1335 xcyc = *__SIMD64(ptr1)++;
1336 yc = (q31_t) xcyc;
1337 xc = (q31_t) (xcyc >> 32);
1339 /* Read xc (real), yc(imag) input */
1340 xdyd = *__SIMD64(ptr1)++;
1341 yd = (q31_t) xdyd;
1342 xd = (q31_t) (xdyd >> 32);
1345 #endif
1347 /* xa' = xa + xb + xc + xd */
1348 xa_out = xa + xb + xc + xd;
1350 /* ya' = ya + yb + yc + yd */
1351 ya_out = ya + yb + yc + yd;
1353 /* pointer updation for writing */
1354 ptr1 = ptr1 - 8U;
1356 /* writing xa' and ya' */
1357 *ptr1++ = xa_out;
1358 *ptr1++ = ya_out;
1360 xc_out = (xa - xb + xc - xd);
1361 yc_out = (ya - yb + yc - yd);
1363 /* writing xc' and yc' */
1364 *ptr1++ = xc_out;
1365 *ptr1++ = yc_out;
1367 xb_out = (xa - yb - xc + yd);
1368 yb_out = (ya + xb - yc - xd);
1370 /* writing xb' and yb' */
1371 *ptr1++ = xb_out;
1372 *ptr1++ = yb_out;
1374 xd_out = (xa + yb - xc - yd);
1375 yd_out = (ya - xb - yc + xd);
1377 /* writing xd' and yd' */
1378 *ptr1++ = xd_out;
1379 *ptr1++ = yd_out;
1381 } while (--j);
1383 /* output is in 11.21(q21) format for the 1024 point */
1384 /* output is in 9.23(q23) format for the 256 point */
1385 /* output is in 7.25(q25) format for the 64 point */
1386 /* output is in 5.27(q27) format for the 16 point */
1388 /* End of last stage process */