before merging master
[inav.git] / lib / main / CMSIS / DSP / Source / TransformFunctions / arm_cfft_radix4_q15.c
blobf3451f740c757ccadf305a65139df710dfc3d888
1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_cfft_radix4_q15.c
4 * Description: This file has function definition of Radix-4 FFT & IFFT function and
5 * In-place bit reversal using bit reversal table
7 * $Date: 27. January 2017
8 * $Revision: V.1.5.1
10 * Target Processor: Cortex-M cores
11 * -------------------------------------------------------------------- */
13 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
15 * SPDX-License-Identifier: Apache-2.0
17 * Licensed under the Apache License, Version 2.0 (the License); you may
18 * not use this file except in compliance with the License.
19 * You may obtain a copy of the License at
21 * www.apache.org/licenses/LICENSE-2.0
23 * Unless required by applicable law or agreed to in writing, software
24 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
25 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
26 * See the License for the specific language governing permissions and
27 * limitations under the License.
30 #include "arm_math.h"
33 void arm_radix4_butterfly_q15(
34 q15_t * pSrc16,
35 uint32_t fftLen,
36 q15_t * pCoef16,
37 uint32_t twidCoefModifier);
39 void arm_radix4_butterfly_inverse_q15(
40 q15_t * pSrc16,
41 uint32_t fftLen,
42 q15_t * pCoef16,
43 uint32_t twidCoefModifier);
45 void arm_bitreversal_q15(
46 q15_t * pSrc,
47 uint32_t fftLen,
48 uint16_t bitRevFactor,
49 uint16_t * pBitRevTab);
51 /**
52 * @ingroup groupTransforms
55 /**
56 * @addtogroup ComplexFFT
57 * @{
61 /**
62 * @details
63 * @brief Processing function for the Q15 CFFT/CIFFT.
64 * @deprecated Do not use this function. It has been superseded by \ref arm_cfft_q15 and will be removed
65 * @param[in] *S points to an instance of the Q15 CFFT/CIFFT structure.
66 * @param[in, out] *pSrc points to the complex data buffer. Processing occurs in-place.
67 * @return none.
69 * \par Input and output formats:
70 * \par
71 * Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
72 * Hence the output format is different for different FFT sizes.
73 * The input and output formats for different FFT sizes and number of bits to upscale are mentioned in the tables below for CFFT and CIFFT:
74 * \par
75 * \image html CFFTQ15.gif "Input and Output Formats for Q15 CFFT"
76 * \image html CIFFTQ15.gif "Input and Output Formats for Q15 CIFFT"
79 void arm_cfft_radix4_q15(
80 const arm_cfft_radix4_instance_q15 * S,
81 q15_t * pSrc)
83 if (S->ifftFlag == 1U)
85 /* Complex IFFT radix-4 */
86 arm_radix4_butterfly_inverse_q15(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
88 else
90 /* Complex FFT radix-4 */
91 arm_radix4_butterfly_q15(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
94 if (S->bitReverseFlag == 1U)
96 /* Bit Reversal */
97 arm_bitreversal_q15(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
103 * @} end of ComplexFFT group
107 * Radix-4 FFT algorithm used is :
109 * Input real and imaginary data:
110 * x(n) = xa + j * ya
111 * x(n+N/4 ) = xb + j * yb
112 * x(n+N/2 ) = xc + j * yc
113 * x(n+3N 4) = xd + j * yd
116 * Output real and imaginary data:
117 * x(4r) = xa'+ j * ya'
118 * x(4r+1) = xb'+ j * yb'
119 * x(4r+2) = xc'+ j * yc'
120 * x(4r+3) = xd'+ j * yd'
123 * Twiddle factors for radix-4 FFT:
124 * Wn = co1 + j * (- si1)
125 * W2n = co2 + j * (- si2)
126 * W3n = co3 + j * (- si3)
128 * The real and imaginary output values for the radix-4 butterfly are
129 * xa' = xa + xb + xc + xd
130 * ya' = ya + yb + yc + yd
131 * xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1)
132 * yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1)
133 * xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2)
134 * yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2)
135 * xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3)
136 * yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3)
141 * @brief Core function for the Q15 CFFT butterfly process.
142 * @param[in, out] *pSrc16 points to the in-place buffer of Q15 data type.
143 * @param[in] fftLen length of the FFT.
144 * @param[in] *pCoef16 points to twiddle coefficient buffer.
145 * @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
146 * @return none.
149 void arm_radix4_butterfly_q15(
150 q15_t * pSrc16,
151 uint32_t fftLen,
152 q15_t * pCoef16,
153 uint32_t twidCoefModifier)
156 #if defined (ARM_MATH_DSP)
158 /* Run the below code for Cortex-M4 and Cortex-M3 */
160 q31_t R, S, T, U;
161 q31_t C1, C2, C3, out1, out2;
162 uint32_t n1, n2, ic, i0, j, k;
164 q15_t *ptr1;
165 q15_t *pSi0;
166 q15_t *pSi1;
167 q15_t *pSi2;
168 q15_t *pSi3;
170 q31_t xaya, xbyb, xcyc, xdyd;
172 /* Total process is divided into three stages */
174 /* process first stage, middle stages, & last stage */
176 /* Initializations for the first stage */
177 n2 = fftLen;
178 n1 = n2;
180 /* n2 = fftLen/4 */
181 n2 >>= 2U;
183 /* Index for twiddle coefficient */
184 ic = 0U;
186 /* Index for input read and output write */
187 j = n2;
189 pSi0 = pSrc16;
190 pSi1 = pSi0 + 2 * n2;
191 pSi2 = pSi1 + 2 * n2;
192 pSi3 = pSi2 + 2 * n2;
194 /* Input is in 1.15(q15) format */
196 /* start of first stage process */
199 /* Butterfly implementation */
201 /* Reading i0, i0+fftLen/2 inputs */
202 /* Read ya (real), xa(imag) input */
203 T = _SIMD32_OFFSET(pSi0);
204 T = __SHADD16(T, 0); // this is just a SIMD arithmetic shift right by 1
205 T = __SHADD16(T, 0); // it turns out doing this twice is 2 cycles, the alternative takes 3 cycles
206 //in = ((int16_t) (T & 0xFFFF)) >> 2; // alternative code that takes 3 cycles
207 //T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF);
209 /* Read yc (real), xc(imag) input */
210 S = _SIMD32_OFFSET(pSi2);
211 S = __SHADD16(S, 0);
212 S = __SHADD16(S, 0);
214 /* R = packed((ya + yc), (xa + xc) ) */
215 R = __QADD16(T, S);
217 /* S = packed((ya - yc), (xa - xc) ) */
218 S = __QSUB16(T, S);
220 /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
221 /* Read yb (real), xb(imag) input */
222 T = _SIMD32_OFFSET(pSi1);
223 T = __SHADD16(T, 0);
224 T = __SHADD16(T, 0);
226 /* Read yd (real), xd(imag) input */
227 U = _SIMD32_OFFSET(pSi3);
228 U = __SHADD16(U, 0);
229 U = __SHADD16(U, 0);
231 /* T = packed((yb + yd), (xb + xd) ) */
232 T = __QADD16(T, U);
234 /* writing the butterfly processed i0 sample */
235 /* xa' = xa + xb + xc + xd */
236 /* ya' = ya + yb + yc + yd */
237 _SIMD32_OFFSET(pSi0) = __SHADD16(R, T);
238 pSi0 += 2;
240 /* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
241 R = __QSUB16(R, T);
243 /* co2 & si2 are read from SIMD Coefficient pointer */
244 C2 = _SIMD32_OFFSET(pCoef16 + (4U * ic));
246 #ifndef ARM_MATH_BIG_ENDIAN
248 /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
249 out1 = __SMUAD(C2, R) >> 16U;
250 /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
251 out2 = __SMUSDX(C2, R);
253 #else
255 /* xc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
256 out1 = __SMUSDX(R, C2) >> 16U;
257 /* yc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
258 out2 = __SMUAD(C2, R);
260 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
262 /* Reading i0+fftLen/4 */
263 /* T = packed(yb, xb) */
264 T = _SIMD32_OFFSET(pSi1);
265 T = __SHADD16(T, 0);
266 T = __SHADD16(T, 0);
268 /* writing the butterfly processed i0 + fftLen/4 sample */
269 /* writing output(xc', yc') in little endian format */
270 _SIMD32_OFFSET(pSi1) =
271 (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
272 pSi1 += 2;
274 /* Butterfly calculations */
275 /* U = packed(yd, xd) */
276 U = _SIMD32_OFFSET(pSi3);
277 U = __SHADD16(U, 0);
278 U = __SHADD16(U, 0);
280 /* T = packed(yb-yd, xb-xd) */
281 T = __QSUB16(T, U);
283 #ifndef ARM_MATH_BIG_ENDIAN
285 /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
286 R = __QASX(S, T);
287 /* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
288 S = __QSAX(S, T);
290 #else
292 /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
293 R = __QSAX(S, T);
294 /* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
295 S = __QASX(S, T);
297 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
299 /* co1 & si1 are read from SIMD Coefficient pointer */
300 C1 = _SIMD32_OFFSET(pCoef16 + (2U * ic));
301 /* Butterfly process for the i0+fftLen/2 sample */
303 #ifndef ARM_MATH_BIG_ENDIAN
305 /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
306 out1 = __SMUAD(C1, S) >> 16U;
307 /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
308 out2 = __SMUSDX(C1, S);
310 #else
312 /* xb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
313 out1 = __SMUSDX(S, C1) >> 16U;
314 /* yb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
315 out2 = __SMUAD(C1, S);
317 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
319 /* writing output(xb', yb') in little endian format */
320 _SIMD32_OFFSET(pSi2) =
321 ((out2) & 0xFFFF0000) | ((out1) & 0x0000FFFF);
322 pSi2 += 2;
325 /* co3 & si3 are read from SIMD Coefficient pointer */
326 C3 = _SIMD32_OFFSET(pCoef16 + (6U * ic));
327 /* Butterfly process for the i0+3fftLen/4 sample */
329 #ifndef ARM_MATH_BIG_ENDIAN
331 /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
332 out1 = __SMUAD(C3, R) >> 16U;
333 /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
334 out2 = __SMUSDX(C3, R);
336 #else
338 /* xd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
339 out1 = __SMUSDX(R, C3) >> 16U;
340 /* yd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
341 out2 = __SMUAD(C3, R);
343 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
345 /* writing output(xd', yd') in little endian format */
346 _SIMD32_OFFSET(pSi3) =
347 ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
348 pSi3 += 2;
350 /* Twiddle coefficients index modifier */
351 ic = ic + twidCoefModifier;
353 } while (--j);
354 /* data is in 4.11(q11) format */
356 /* end of first stage process */
359 /* start of middle stage process */
361 /* Twiddle coefficients index modifier */
362 twidCoefModifier <<= 2U;
364 /* Calculation of Middle stage */
365 for (k = fftLen / 4U; k > 4U; k >>= 2U)
367 /* Initializations for the middle stage */
368 n1 = n2;
369 n2 >>= 2U;
370 ic = 0U;
372 for (j = 0U; j <= (n2 - 1U); j++)
374 /* index calculation for the coefficients */
375 C1 = _SIMD32_OFFSET(pCoef16 + (2U * ic));
376 C2 = _SIMD32_OFFSET(pCoef16 + (4U * ic));
377 C3 = _SIMD32_OFFSET(pCoef16 + (6U * ic));
379 /* Twiddle coefficients index modifier */
380 ic = ic + twidCoefModifier;
382 pSi0 = pSrc16 + 2 * j;
383 pSi1 = pSi0 + 2 * n2;
384 pSi2 = pSi1 + 2 * n2;
385 pSi3 = pSi2 + 2 * n2;
387 /* Butterfly implementation */
388 for (i0 = j; i0 < fftLen; i0 += n1)
390 /* Reading i0, i0+fftLen/2 inputs */
391 /* Read ya (real), xa(imag) input */
392 T = _SIMD32_OFFSET(pSi0);
394 /* Read yc (real), xc(imag) input */
395 S = _SIMD32_OFFSET(pSi2);
397 /* R = packed( (ya + yc), (xa + xc)) */
398 R = __QADD16(T, S);
400 /* S = packed((ya - yc), (xa - xc)) */
401 S = __QSUB16(T, S);
403 /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
404 /* Read yb (real), xb(imag) input */
405 T = _SIMD32_OFFSET(pSi1);
407 /* Read yd (real), xd(imag) input */
408 U = _SIMD32_OFFSET(pSi3);
410 /* T = packed( (yb + yd), (xb + xd)) */
411 T = __QADD16(T, U);
413 /* writing the butterfly processed i0 sample */
415 /* xa' = xa + xb + xc + xd */
416 /* ya' = ya + yb + yc + yd */
417 out1 = __SHADD16(R, T);
418 out1 = __SHADD16(out1, 0);
419 _SIMD32_OFFSET(pSi0) = out1;
420 pSi0 += 2 * n1;
422 /* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */
423 R = __SHSUB16(R, T);
425 #ifndef ARM_MATH_BIG_ENDIAN
427 /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
428 out1 = __SMUAD(C2, R) >> 16U;
430 /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
431 out2 = __SMUSDX(C2, R);
433 #else
435 /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
436 out1 = __SMUSDX(R, C2) >> 16U;
438 /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
439 out2 = __SMUAD(C2, R);
441 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
443 /* Reading i0+3fftLen/4 */
444 /* Read yb (real), xb(imag) input */
445 T = _SIMD32_OFFSET(pSi1);
447 /* writing the butterfly processed i0 + fftLen/4 sample */
448 /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
449 /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
450 _SIMD32_OFFSET(pSi1) =
451 ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
452 pSi1 += 2 * n1;
454 /* Butterfly calculations */
456 /* Read yd (real), xd(imag) input */
457 U = _SIMD32_OFFSET(pSi3);
459 /* T = packed(yb-yd, xb-xd) */
460 T = __QSUB16(T, U);
462 #ifndef ARM_MATH_BIG_ENDIAN
464 /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
465 R = __SHASX(S, T);
467 /* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
468 S = __SHSAX(S, T);
471 /* Butterfly process for the i0+fftLen/2 sample */
472 out1 = __SMUAD(C1, S) >> 16U;
473 out2 = __SMUSDX(C1, S);
475 #else
477 /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
478 R = __SHSAX(S, T);
480 /* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
481 S = __SHASX(S, T);
484 /* Butterfly process for the i0+fftLen/2 sample */
485 out1 = __SMUSDX(S, C1) >> 16U;
486 out2 = __SMUAD(C1, S);
488 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
490 /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
491 /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
492 _SIMD32_OFFSET(pSi2) =
493 ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
494 pSi2 += 2 * n1;
496 /* Butterfly process for the i0+3fftLen/4 sample */
498 #ifndef ARM_MATH_BIG_ENDIAN
500 out1 = __SMUAD(C3, R) >> 16U;
501 out2 = __SMUSDX(C3, R);
503 #else
505 out1 = __SMUSDX(R, C3) >> 16U;
506 out2 = __SMUAD(C3, R);
508 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
510 /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
511 /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
512 _SIMD32_OFFSET(pSi3) =
513 ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
514 pSi3 += 2 * n1;
517 /* Twiddle coefficients index modifier */
518 twidCoefModifier <<= 2U;
520 /* end of middle stage process */
523 /* data is in 10.6(q6) format for the 1024 point */
524 /* data is in 8.8(q8) format for the 256 point */
525 /* data is in 6.10(q10) format for the 64 point */
526 /* data is in 4.12(q12) format for the 16 point */
528 /* Initializations for the last stage */
529 j = fftLen >> 2;
531 ptr1 = &pSrc16[0];
533 /* start of last stage process */
535 /* Butterfly implementation */
538 /* Read xa (real), ya(imag) input */
539 xaya = *__SIMD32(ptr1)++;
541 /* Read xb (real), yb(imag) input */
542 xbyb = *__SIMD32(ptr1)++;
544 /* Read xc (real), yc(imag) input */
545 xcyc = *__SIMD32(ptr1)++;
547 /* Read xd (real), yd(imag) input */
548 xdyd = *__SIMD32(ptr1)++;
550 /* R = packed((ya + yc), (xa + xc)) */
551 R = __QADD16(xaya, xcyc);
553 /* T = packed((yb + yd), (xb + xd)) */
554 T = __QADD16(xbyb, xdyd);
556 /* pointer updation for writing */
557 ptr1 = ptr1 - 8U;
560 /* xa' = xa + xb + xc + xd */
561 /* ya' = ya + yb + yc + yd */
562 *__SIMD32(ptr1)++ = __SHADD16(R, T);
564 /* T = packed((yb + yd), (xb + xd)) */
565 T = __QADD16(xbyb, xdyd);
567 /* xc' = (xa-xb+xc-xd) */
568 /* yc' = (ya-yb+yc-yd) */
569 *__SIMD32(ptr1)++ = __SHSUB16(R, T);
571 /* S = packed((ya - yc), (xa - xc)) */
572 S = __QSUB16(xaya, xcyc);
574 /* Read yd (real), xd(imag) input */
575 /* T = packed( (yb - yd), (xb - xd)) */
576 U = __QSUB16(xbyb, xdyd);
578 #ifndef ARM_MATH_BIG_ENDIAN
580 /* xb' = (xa+yb-xc-yd) */
581 /* yb' = (ya-xb-yc+xd) */
582 *__SIMD32(ptr1)++ = __SHSAX(S, U);
585 /* xd' = (xa-yb-xc+yd) */
586 /* yd' = (ya+xb-yc-xd) */
587 *__SIMD32(ptr1)++ = __SHASX(S, U);
589 #else
591 /* xb' = (xa+yb-xc-yd) */
592 /* yb' = (ya-xb-yc+xd) */
593 *__SIMD32(ptr1)++ = __SHASX(S, U);
596 /* xd' = (xa-yb-xc+yd) */
597 /* yd' = (ya+xb-yc-xd) */
598 *__SIMD32(ptr1)++ = __SHSAX(S, U);
600 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
602 } while (--j);
604 /* end of last stage process */
606 /* output is in 11.5(q5) format for the 1024 point */
607 /* output is in 9.7(q7) format for the 256 point */
608 /* output is in 7.9(q9) format for the 64 point */
609 /* output is in 5.11(q11) format for the 16 point */
612 #else
614 /* Run the below code for Cortex-M0 */
616 q15_t R0, R1, S0, S1, T0, T1, U0, U1;
617 q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
618 uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
620 /* Total process is divided into three stages */
622 /* process first stage, middle stages, & last stage */
624 /* Initializations for the first stage */
625 n2 = fftLen;
626 n1 = n2;
628 /* n2 = fftLen/4 */
629 n2 >>= 2U;
631 /* Index for twiddle coefficient */
632 ic = 0U;
634 /* Index for input read and output write */
635 i0 = 0U;
636 j = n2;
638 /* Input is in 1.15(q15) format */
640 /* start of first stage process */
643 /* Butterfly implementation */
645 /* index calculation for the input as, */
646 /* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
647 i1 = i0 + n2;
648 i2 = i1 + n2;
649 i3 = i2 + n2;
651 /* Reading i0, i0+fftLen/2 inputs */
653 /* input is down scale by 4 to avoid overflow */
654 /* Read ya (real), xa(imag) input */
655 T0 = pSrc16[i0 * 2U] >> 2U;
656 T1 = pSrc16[(i0 * 2U) + 1U] >> 2U;
658 /* input is down scale by 4 to avoid overflow */
659 /* Read yc (real), xc(imag) input */
660 S0 = pSrc16[i2 * 2U] >> 2U;
661 S1 = pSrc16[(i2 * 2U) + 1U] >> 2U;
663 /* R0 = (ya + yc) */
664 R0 = __SSAT(T0 + S0, 16U);
665 /* R1 = (xa + xc) */
666 R1 = __SSAT(T1 + S1, 16U);
668 /* S0 = (ya - yc) */
669 S0 = __SSAT(T0 - S0, 16);
670 /* S1 = (xa - xc) */
671 S1 = __SSAT(T1 - S1, 16);
673 /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
674 /* input is down scale by 4 to avoid overflow */
675 /* Read yb (real), xb(imag) input */
676 T0 = pSrc16[i1 * 2U] >> 2U;
677 T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;
679 /* input is down scale by 4 to avoid overflow */
680 /* Read yd (real), xd(imag) input */
681 U0 = pSrc16[i3 * 2U] >> 2U;
682 U1 = pSrc16[(i3 * 2U) + 1] >> 2U;
684 /* T0 = (yb + yd) */
685 T0 = __SSAT(T0 + U0, 16U);
686 /* T1 = (xb + xd) */
687 T1 = __SSAT(T1 + U1, 16U);
689 /* writing the butterfly processed i0 sample */
690 /* ya' = ya + yb + yc + yd */
691 /* xa' = xa + xb + xc + xd */
692 pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
693 pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
695 /* R0 = (ya + yc) - (yb + yd) */
696 /* R1 = (xa + xc) - (xb + xd) */
697 R0 = __SSAT(R0 - T0, 16U);
698 R1 = __SSAT(R1 - T1, 16U);
700 /* co2 & si2 are read from Coefficient pointer */
701 Co2 = pCoef16[2U * ic * 2U];
702 Si2 = pCoef16[(2U * ic * 2U) + 1];
704 /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
705 out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16U);
706 /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
707 out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16U);
709 /* Reading i0+fftLen/4 */
710 /* input is down scale by 4 to avoid overflow */
711 /* T0 = yb, T1 = xb */
712 T0 = pSrc16[i1 * 2U] >> 2;
713 T1 = pSrc16[(i1 * 2U) + 1] >> 2;
715 /* writing the butterfly processed i0 + fftLen/4 sample */
716 /* writing output(xc', yc') in little endian format */
717 pSrc16[i1 * 2U] = out1;
718 pSrc16[(i1 * 2U) + 1] = out2;
720 /* Butterfly calculations */
721 /* input is down scale by 4 to avoid overflow */
722 /* U0 = yd, U1 = xd */
723 U0 = pSrc16[i3 * 2U] >> 2;
724 U1 = pSrc16[(i3 * 2U) + 1] >> 2;
725 /* T0 = yb-yd */
726 T0 = __SSAT(T0 - U0, 16);
727 /* T1 = xb-xd */
728 T1 = __SSAT(T1 - U1, 16);
730 /* R1 = (ya-yc) + (xb- xd), R0 = (xa-xc) - (yb-yd)) */
731 R0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16);
732 R1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16);
734 /* S1 = (ya-yc) - (xb- xd), S0 = (xa-xc) + (yb-yd)) */
735 S0 = (q15_t) __SSAT(((q31_t) S0 + T1), 16U);
736 S1 = (q15_t) __SSAT(((q31_t) S1 - T0), 16U);
738 /* co1 & si1 are read from Coefficient pointer */
739 Co1 = pCoef16[ic * 2U];
740 Si1 = pCoef16[(ic * 2U) + 1];
741 /* Butterfly process for the i0+fftLen/2 sample */
742 /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
743 out1 = (q15_t) ((Si1 * S1 + Co1 * S0) >> 16);
744 /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
745 out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16);
747 /* writing output(xb', yb') in little endian format */
748 pSrc16[i2 * 2U] = out1;
749 pSrc16[(i2 * 2U) + 1] = out2;
751 /* Co3 & si3 are read from Coefficient pointer */
752 Co3 = pCoef16[3U * (ic * 2U)];
753 Si3 = pCoef16[(3U * (ic * 2U)) + 1];
754 /* Butterfly process for the i0+3fftLen/4 sample */
755 /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
756 out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16U);
757 /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
758 out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16U);
759 /* writing output(xd', yd') in little endian format */
760 pSrc16[i3 * 2U] = out1;
761 pSrc16[(i3 * 2U) + 1] = out2;
763 /* Twiddle coefficients index modifier */
764 ic = ic + twidCoefModifier;
766 /* Updating input index */
767 i0 = i0 + 1U;
769 } while (--j);
770 /* data is in 4.11(q11) format */
772 /* end of first stage process */
775 /* start of middle stage process */
777 /* Twiddle coefficients index modifier */
778 twidCoefModifier <<= 2U;
780 /* Calculation of Middle stage */
781 for (k = fftLen / 4U; k > 4U; k >>= 2U)
783 /* Initializations for the middle stage */
784 n1 = n2;
785 n2 >>= 2U;
786 ic = 0U;
788 for (j = 0U; j <= (n2 - 1U); j++)
790 /* index calculation for the coefficients */
791 Co1 = pCoef16[ic * 2U];
792 Si1 = pCoef16[(ic * 2U) + 1U];
793 Co2 = pCoef16[2U * (ic * 2U)];
794 Si2 = pCoef16[(2U * (ic * 2U)) + 1U];
795 Co3 = pCoef16[3U * (ic * 2U)];
796 Si3 = pCoef16[(3U * (ic * 2U)) + 1U];
798 /* Twiddle coefficients index modifier */
799 ic = ic + twidCoefModifier;
801 /* Butterfly implementation */
802 for (i0 = j; i0 < fftLen; i0 += n1)
804 /* index calculation for the input as, */
805 /* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
806 i1 = i0 + n2;
807 i2 = i1 + n2;
808 i3 = i2 + n2;
810 /* Reading i0, i0+fftLen/2 inputs */
811 /* Read ya (real), xa(imag) input */
812 T0 = pSrc16[i0 * 2U];
813 T1 = pSrc16[(i0 * 2U) + 1U];
815 /* Read yc (real), xc(imag) input */
816 S0 = pSrc16[i2 * 2U];
817 S1 = pSrc16[(i2 * 2U) + 1U];
819 /* R0 = (ya + yc), R1 = (xa + xc) */
820 R0 = __SSAT(T0 + S0, 16);
821 R1 = __SSAT(T1 + S1, 16);
823 /* S0 = (ya - yc), S1 =(xa - xc) */
824 S0 = __SSAT(T0 - S0, 16);
825 S1 = __SSAT(T1 - S1, 16);
827 /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
828 /* Read yb (real), xb(imag) input */
829 T0 = pSrc16[i1 * 2U];
830 T1 = pSrc16[(i1 * 2U) + 1U];
832 /* Read yd (real), xd(imag) input */
833 U0 = pSrc16[i3 * 2U];
834 U1 = pSrc16[(i3 * 2U) + 1U];
837 /* T0 = (yb + yd), T1 = (xb + xd) */
838 T0 = __SSAT(T0 + U0, 16);
839 T1 = __SSAT(T1 + U1, 16);
841 /* writing the butterfly processed i0 sample */
843 /* xa' = xa + xb + xc + xd */
844 /* ya' = ya + yb + yc + yd */
845 out1 = ((R0 >> 1U) + (T0 >> 1U)) >> 1U;
846 out2 = ((R1 >> 1U) + (T1 >> 1U)) >> 1U;
848 pSrc16[i0 * 2U] = out1;
849 pSrc16[(2U * i0) + 1U] = out2;
851 /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
852 R0 = (R0 >> 1U) - (T0 >> 1U);
853 R1 = (R1 >> 1U) - (T1 >> 1U);
855 /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
856 out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16U);
858 /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
859 out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16U);
861 /* Reading i0+3fftLen/4 */
862 /* Read yb (real), xb(imag) input */
863 T0 = pSrc16[i1 * 2U];
864 T1 = pSrc16[(i1 * 2U) + 1U];
866 /* writing the butterfly processed i0 + fftLen/4 sample */
867 /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
868 /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
869 pSrc16[i1 * 2U] = out1;
870 pSrc16[(i1 * 2U) + 1U] = out2;
872 /* Butterfly calculations */
874 /* Read yd (real), xd(imag) input */
875 U0 = pSrc16[i3 * 2U];
876 U1 = pSrc16[(i3 * 2U) + 1U];
878 /* T0 = yb-yd, T1 = xb-xd */
879 T0 = __SSAT(T0 - U0, 16);
880 T1 = __SSAT(T1 - U1, 16);
882 /* R0 = (ya-yc) + (xb- xd), R1 = (xa-xc) - (yb-yd)) */
883 R0 = (S0 >> 1U) - (T1 >> 1U);
884 R1 = (S1 >> 1U) + (T0 >> 1U);
886 /* S0 = (ya-yc) - (xb- xd), S1 = (xa-xc) + (yb-yd)) */
887 S0 = (S0 >> 1U) + (T1 >> 1U);
888 S1 = (S1 >> 1U) - (T0 >> 1U);
890 /* Butterfly process for the i0+fftLen/2 sample */
891 out1 = (q15_t) ((Co1 * S0 + Si1 * S1) >> 16U);
893 out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16U);
895 /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
896 /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
897 pSrc16[i2 * 2U] = out1;
898 pSrc16[(i2 * 2U) + 1U] = out2;
900 /* Butterfly process for the i0+3fftLen/4 sample */
901 out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16U);
903 out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16U);
904 /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
905 /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
906 pSrc16[i3 * 2U] = out1;
907 pSrc16[(i3 * 2U) + 1U] = out2;
910 /* Twiddle coefficients index modifier */
911 twidCoefModifier <<= 2U;
913 /* end of middle stage process */
916 /* data is in 10.6(q6) format for the 1024 point */
917 /* data is in 8.8(q8) format for the 256 point */
918 /* data is in 6.10(q10) format for the 64 point */
919 /* data is in 4.12(q12) format for the 16 point */
921 /* Initializations for the last stage */
922 n1 = n2;
923 n2 >>= 2U;
925 /* start of last stage process */
927 /* Butterfly implementation */
928 for (i0 = 0U; i0 <= (fftLen - n1); i0 += n1)
930 /* index calculation for the input as, */
931 /* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
932 i1 = i0 + n2;
933 i2 = i1 + n2;
934 i3 = i2 + n2;
936 /* Reading i0, i0+fftLen/2 inputs */
937 /* Read ya (real), xa(imag) input */
938 T0 = pSrc16[i0 * 2U];
939 T1 = pSrc16[(i0 * 2U) + 1U];
941 /* Read yc (real), xc(imag) input */
942 S0 = pSrc16[i2 * 2U];
943 S1 = pSrc16[(i2 * 2U) + 1U];
945 /* R0 = (ya + yc), R1 = (xa + xc) */
946 R0 = __SSAT(T0 + S0, 16U);
947 R1 = __SSAT(T1 + S1, 16U);
949 /* S0 = (ya - yc), S1 = (xa - xc) */
950 S0 = __SSAT(T0 - S0, 16U);
951 S1 = __SSAT(T1 - S1, 16U);
953 /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
954 /* Read yb (real), xb(imag) input */
955 T0 = pSrc16[i1 * 2U];
956 T1 = pSrc16[(i1 * 2U) + 1U];
957 /* Read yd (real), xd(imag) input */
958 U0 = pSrc16[i3 * 2U];
959 U1 = pSrc16[(i3 * 2U) + 1U];
961 /* T0 = (yb + yd), T1 = (xb + xd)) */
962 T0 = __SSAT(T0 + U0, 16U);
963 T1 = __SSAT(T1 + U1, 16U);
965 /* writing the butterfly processed i0 sample */
966 /* xa' = xa + xb + xc + xd */
967 /* ya' = ya + yb + yc + yd */
968 pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
969 pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
971 /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
972 R0 = (R0 >> 1U) - (T0 >> 1U);
973 R1 = (R1 >> 1U) - (T1 >> 1U);
974 /* Read yb (real), xb(imag) input */
975 T0 = pSrc16[i1 * 2U];
976 T1 = pSrc16[(i1 * 2U) + 1U];
978 /* writing the butterfly processed i0 + fftLen/4 sample */
979 /* xc' = (xa-xb+xc-xd) */
980 /* yc' = (ya-yb+yc-yd) */
981 pSrc16[i1 * 2U] = R0;
982 pSrc16[(i1 * 2U) + 1U] = R1;
984 /* Read yd (real), xd(imag) input */
985 U0 = pSrc16[i3 * 2U];
986 U1 = pSrc16[(i3 * 2U) + 1U];
987 /* T0 = (yb - yd), T1 = (xb - xd) */
988 T0 = __SSAT(T0 - U0, 16U);
989 T1 = __SSAT(T1 - U1, 16U);
991 /* writing the butterfly processed i0 + fftLen/2 sample */
992 /* xb' = (xa+yb-xc-yd) */
993 /* yb' = (ya-xb-yc+xd) */
994 pSrc16[i2 * 2U] = (S0 >> 1U) + (T1 >> 1U);
995 pSrc16[(i2 * 2U) + 1U] = (S1 >> 1U) - (T0 >> 1U);
997 /* writing the butterfly processed i0 + 3fftLen/4 sample */
998 /* xd' = (xa-yb-xc+yd) */
999 /* yd' = (ya+xb-yc-xd) */
1000 pSrc16[i3 * 2U] = (S0 >> 1U) - (T1 >> 1U);
1001 pSrc16[(i3 * 2U) + 1U] = (S1 >> 1U) + (T0 >> 1U);
1005 /* end of last stage process */
1007 /* output is in 11.5(q5) format for the 1024 point */
1008 /* output is in 9.7(q7) format for the 256 point */
1009 /* output is in 7.9(q9) format for the 64 point */
1010 /* output is in 5.11(q11) format for the 16 point */
1012 #endif /* #if defined (ARM_MATH_DSP) */
/**
 * @brief  Core function for the Q15 CIFFT butterfly process.
 * @param[in, out] *pSrc16          points to the in-place buffer of Q15 data type.
 * @param[in]      fftLen           length of the FFT.
 * @param[in]      *pCoef16         points to twiddle coefficient buffer.
 * @param[in]      twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
 * @return none.
 */
/*
 * Radix-4 IFFT algorithm used is :
 *
 * CIFFT uses same twiddle coefficients as CFFT function
 *  x[k] = x[n] + (j)^k * x[n + fftLen/4] + (-1)^k * x[n+fftLen/2] + (-j)^k * x[n+3*fftLen/4]
 *
 *
 * IFFT is implemented with following changes in equations from FFT
 *
 * Input real and imaginary data:
 * x(n) = xa + j * ya
 * x(n+N/4 ) = xb + j * yb
 * x(n+N/2 ) = xc + j * yc
 * x(n+3N/4) = xd + j * yd
 *
 *
 * Output real and imaginary data:
 * x(4r) = xa'+ j * ya'
 * x(4r+1) = xb'+ j * yb'
 * x(4r+2) = xc'+ j * yc'
 * x(4r+3) = xd'+ j * yd'
 *
 *
 * Twiddle factors for radix-4 IFFT:
 * Wn = co1 + j * (si1)
 * W2n = co2 + j * (si2)
 * W3n = co3 + j * (si3)
 *
 * The real and imaginary output values for the radix-4 butterfly are
 * xa' = xa + xb + xc + xd
 * ya' = ya + yb + yc + yd
 * xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)
 * yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)
 * xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)
 * yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)
 * xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)
 * yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)
 *
 */
1066 void arm_radix4_butterfly_inverse_q15(
1067 q15_t * pSrc16,
1068 uint32_t fftLen,
1069 q15_t * pCoef16,
1070 uint32_t twidCoefModifier)
1073 #if defined (ARM_MATH_DSP)
1075 /* Run the below code for Cortex-M4 and Cortex-M3 */
1077 q31_t R, S, T, U;
1078 q31_t C1, C2, C3, out1, out2;
1079 uint32_t n1, n2, ic, i0, j, k;
1081 q15_t *ptr1;
1082 q15_t *pSi0;
1083 q15_t *pSi1;
1084 q15_t *pSi2;
1085 q15_t *pSi3;
1087 q31_t xaya, xbyb, xcyc, xdyd;
1089 /* Total process is divided into three stages */
1091 /* process first stage, middle stages, & last stage */
1093 /* Initializations for the first stage */
1094 n2 = fftLen;
1095 n1 = n2;
1097 /* n2 = fftLen/4 */
1098 n2 >>= 2U;
1100 /* Index for twiddle coefficient */
1101 ic = 0U;
1103 /* Index for input read and output write */
1104 j = n2;
1106 pSi0 = pSrc16;
1107 pSi1 = pSi0 + 2 * n2;
1108 pSi2 = pSi1 + 2 * n2;
1109 pSi3 = pSi2 + 2 * n2;
1111 /* Input is in 1.15(q15) format */
1113 /* start of first stage process */
1116 /* Butterfly implementation */
1118 /* Reading i0, i0+fftLen/2 inputs */
1119 /* Read ya (real), xa(imag) input */
1120 T = _SIMD32_OFFSET(pSi0);
1121 T = __SHADD16(T, 0);
1122 T = __SHADD16(T, 0);
1124 /* Read yc (real), xc(imag) input */
1125 S = _SIMD32_OFFSET(pSi2);
1126 S = __SHADD16(S, 0);
1127 S = __SHADD16(S, 0);
1129 /* R = packed((ya + yc), (xa + xc) ) */
1130 R = __QADD16(T, S);
1132 /* S = packed((ya - yc), (xa - xc) ) */
1133 S = __QSUB16(T, S);
1135 /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
1136 /* Read yb (real), xb(imag) input */
1137 T = _SIMD32_OFFSET(pSi1);
1138 T = __SHADD16(T, 0);
1139 T = __SHADD16(T, 0);
1141 /* Read yd (real), xd(imag) input */
1142 U = _SIMD32_OFFSET(pSi3);
1143 U = __SHADD16(U, 0);
1144 U = __SHADD16(U, 0);
1146 /* T = packed((yb + yd), (xb + xd) ) */
1147 T = __QADD16(T, U);
1149 /* writing the butterfly processed i0 sample */
1150 /* xa' = xa + xb + xc + xd */
1151 /* ya' = ya + yb + yc + yd */
1152 _SIMD32_OFFSET(pSi0) = __SHADD16(R, T);
1153 pSi0 += 2;
1155 /* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
1156 R = __QSUB16(R, T);
1158 /* co2 & si2 are read from SIMD Coefficient pointer */
1159 C2 = _SIMD32_OFFSET(pCoef16 + (4U * ic));
1161 #ifndef ARM_MATH_BIG_ENDIAN
1163 /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
1164 out1 = __SMUSD(C2, R) >> 16U;
1165 /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
1166 out2 = __SMUADX(C2, R);
1168 #else
1170 /* xc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
1171 out1 = __SMUADX(C2, R) >> 16U;
1172 /* yc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
1173 out2 = __SMUSD(__QSUB16(0, C2), R);
1175 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1177 /* Reading i0+fftLen/4 */
1178 /* T = packed(yb, xb) */
1179 T = _SIMD32_OFFSET(pSi1);
1180 T = __SHADD16(T, 0);
1181 T = __SHADD16(T, 0);
1183 /* writing the butterfly processed i0 + fftLen/4 sample */
1184 /* writing output(xc', yc') in little endian format */
1185 _SIMD32_OFFSET(pSi1) =
1186 (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
1187 pSi1 += 2;
1189 /* Butterfly calculations */
1190 /* U = packed(yd, xd) */
1191 U = _SIMD32_OFFSET(pSi3);
1192 U = __SHADD16(U, 0);
1193 U = __SHADD16(U, 0);
1195 /* T = packed(yb-yd, xb-xd) */
1196 T = __QSUB16(T, U);
1198 #ifndef ARM_MATH_BIG_ENDIAN
1200 /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
1201 R = __QSAX(S, T);
1202 /* S = packed((ya-yc) + (xb- xd), (xa-xc) - (yb-yd)) */
1203 S = __QASX(S, T);
1205 #else
1207 /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
1208 R = __QASX(S, T);
1209 /* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
1210 S = __QSAX(S, T);
1212 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1214 /* co1 & si1 are read from SIMD Coefficient pointer */
1215 C1 = _SIMD32_OFFSET(pCoef16 + (2U * ic));
1216 /* Butterfly process for the i0+fftLen/2 sample */
1218 #ifndef ARM_MATH_BIG_ENDIAN
1220 /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
1221 out1 = __SMUSD(C1, S) >> 16U;
1222 /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
1223 out2 = __SMUADX(C1, S);
1225 #else
1227 /* xb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
1228 out1 = __SMUADX(C1, S) >> 16U;
1229 /* yb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
1230 out2 = __SMUSD(__QSUB16(0, C1), S);
1232 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1234 /* writing output(xb', yb') in little endian format */
1235 _SIMD32_OFFSET(pSi2) =
1236 ((out2) & 0xFFFF0000) | ((out1) & 0x0000FFFF);
1237 pSi2 += 2;
1240 /* co3 & si3 are read from SIMD Coefficient pointer */
1241 C3 = _SIMD32_OFFSET(pCoef16 + (6U * ic));
1242 /* Butterfly process for the i0+3fftLen/4 sample */
1244 #ifndef ARM_MATH_BIG_ENDIAN
1246 /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
1247 out1 = __SMUSD(C3, R) >> 16U;
1248 /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
1249 out2 = __SMUADX(C3, R);
1251 #else
1253 /* xd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
1254 out1 = __SMUADX(C3, R) >> 16U;
1255 /* yd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
1256 out2 = __SMUSD(__QSUB16(0, C3), R);
1258 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1260 /* writing output(xd', yd') in little endian format */
1261 _SIMD32_OFFSET(pSi3) =
1262 ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
1263 pSi3 += 2;
1265 /* Twiddle coefficients index modifier */
1266 ic = ic + twidCoefModifier;
1268 } while (--j);
1269 /* data is in 4.11(q11) format */
1271 /* end of first stage process */
1274 /* start of middle stage process */
1276 /* Twiddle coefficients index modifier */
1277 twidCoefModifier <<= 2U;
1279 /* Calculation of Middle stage */
1280 for (k = fftLen / 4U; k > 4U; k >>= 2U)
1282 /* Initializations for the middle stage */
1283 n1 = n2;
1284 n2 >>= 2U;
1285 ic = 0U;
1287 for (j = 0U; j <= (n2 - 1U); j++)
1289 /* index calculation for the coefficients */
1290 C1 = _SIMD32_OFFSET(pCoef16 + (2U * ic));
1291 C2 = _SIMD32_OFFSET(pCoef16 + (4U * ic));
1292 C3 = _SIMD32_OFFSET(pCoef16 + (6U * ic));
1294 /* Twiddle coefficients index modifier */
1295 ic = ic + twidCoefModifier;
1297 pSi0 = pSrc16 + 2 * j;
1298 pSi1 = pSi0 + 2 * n2;
1299 pSi2 = pSi1 + 2 * n2;
1300 pSi3 = pSi2 + 2 * n2;
1302 /* Butterfly implementation */
1303 for (i0 = j; i0 < fftLen; i0 += n1)
1305 /* Reading i0, i0+fftLen/2 inputs */
1306 /* Read ya (real), xa(imag) input */
1307 T = _SIMD32_OFFSET(pSi0);
1309 /* Read yc (real), xc(imag) input */
1310 S = _SIMD32_OFFSET(pSi2);
1312 /* R = packed( (ya + yc), (xa + xc)) */
1313 R = __QADD16(T, S);
1315 /* S = packed((ya - yc), (xa - xc)) */
1316 S = __QSUB16(T, S);
1318 /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
1319 /* Read yb (real), xb(imag) input */
1320 T = _SIMD32_OFFSET(pSi1);
1322 /* Read yd (real), xd(imag) input */
1323 U = _SIMD32_OFFSET(pSi3);
1325 /* T = packed( (yb + yd), (xb + xd)) */
1326 T = __QADD16(T, U);
1328 /* writing the butterfly processed i0 sample */
1330 /* xa' = xa + xb + xc + xd */
1331 /* ya' = ya + yb + yc + yd */
1332 out1 = __SHADD16(R, T);
1333 out1 = __SHADD16(out1, 0);
1334 _SIMD32_OFFSET(pSi0) = out1;
1335 pSi0 += 2 * n1;
1337 /* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */
1338 R = __SHSUB16(R, T);
1340 #ifndef ARM_MATH_BIG_ENDIAN
1342 /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
1343 out1 = __SMUSD(C2, R) >> 16U;
1345 /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
1346 out2 = __SMUADX(C2, R);
1348 #else
1350 /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
1351 out1 = __SMUADX(R, C2) >> 16U;
1353 /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
1354 out2 = __SMUSD(__QSUB16(0, C2), R);
1356 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1358 /* Reading i0+fftLen/4 */
1359 /* Read yb (real), xb(imag) input */
1360 T = _SIMD32_OFFSET(pSi1);
1362 /* writing the butterfly processed i0 + fftLen/4 sample */
1363 /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
1364 /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
1365 _SIMD32_OFFSET(pSi1) =
1366 ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
1367 pSi1 += 2 * n1;
1369 /* Butterfly calculations */
1371 /* Read yd (real), xd(imag) input */
1372 U = _SIMD32_OFFSET(pSi3);
1374 /* T = packed(yb-yd, xb-xd) */
1375 T = __QSUB16(T, U);
1377 #ifndef ARM_MATH_BIG_ENDIAN
1379 /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
1380 R = __SHSAX(S, T);
1382 /* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
1383 S = __SHASX(S, T);
1386 /* Butterfly process for the i0+fftLen/2 sample */
1387 out1 = __SMUSD(C1, S) >> 16U;
1388 out2 = __SMUADX(C1, S);
1390 #else
1392 /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
1393 R = __SHASX(S, T);
1395 /* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
1396 S = __SHSAX(S, T);
1399 /* Butterfly process for the i0+fftLen/2 sample */
1400 out1 = __SMUADX(S, C1) >> 16U;
1401 out2 = __SMUSD(__QSUB16(0, C1), S);
1403 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1405 /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
1406 /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
1407 _SIMD32_OFFSET(pSi2) =
1408 ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
1409 pSi2 += 2 * n1;
1411 /* Butterfly process for the i0+3fftLen/4 sample */
1413 #ifndef ARM_MATH_BIG_ENDIAN
1415 out1 = __SMUSD(C3, R) >> 16U;
1416 out2 = __SMUADX(C3, R);
1418 #else
1420 out1 = __SMUADX(C3, R) >> 16U;
1421 out2 = __SMUSD(__QSUB16(0, C3), R);
1423 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1425 /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
1426 /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
1427 _SIMD32_OFFSET(pSi3) =
1428 ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
1429 pSi3 += 2 * n1;
1432 /* Twiddle coefficients index modifier */
1433 twidCoefModifier <<= 2U;
1435 /* end of middle stage process */
1437 /* data is in 10.6(q6) format for the 1024 point */
1438 /* data is in 8.8(q8) format for the 256 point */
1439 /* data is in 6.10(q10) format for the 64 point */
1440 /* data is in 4.12(q12) format for the 16 point */
1442 /* Initializations for the last stage */
1443 j = fftLen >> 2;
1445 ptr1 = &pSrc16[0];
1447 /* start of last stage process */
1449 /* Butterfly implementation */
1452 /* Read xa (real), ya(imag) input */
1453 xaya = *__SIMD32(ptr1)++;
1455 /* Read xb (real), yb(imag) input */
1456 xbyb = *__SIMD32(ptr1)++;
1458 /* Read xc (real), yc(imag) input */
1459 xcyc = *__SIMD32(ptr1)++;
1461 /* Read xd (real), yd(imag) input */
1462 xdyd = *__SIMD32(ptr1)++;
1464 /* R = packed((ya + yc), (xa + xc)) */
1465 R = __QADD16(xaya, xcyc);
1467 /* T = packed((yb + yd), (xb + xd)) */
1468 T = __QADD16(xbyb, xdyd);
1470 /* rewind the pointer to the first sample of this butterfly for writing */
1471 ptr1 = ptr1 - 8U;
1474 /* xa' = xa + xb + xc + xd */
1475 /* ya' = ya + yb + yc + yd */
1476 *__SIMD32(ptr1)++ = __SHADD16(R, T);
1478 /* T = packed((yb + yd), (xb + xd)) */
1479 T = __QADD16(xbyb, xdyd);
1481 /* xc' = (xa-xb+xc-xd) */
1482 /* yc' = (ya-yb+yc-yd) */
1483 *__SIMD32(ptr1)++ = __SHSUB16(R, T);
1485 /* S = packed((ya - yc), (xa - xc)) */
1486 S = __QSUB16(xaya, xcyc);
1488 /* Read yd (real), xd(imag) input */
1489 /* U = packed( (yb - yd), (xb - xd)) */
1490 U = __QSUB16(xbyb, xdyd);
1492 #ifndef ARM_MATH_BIG_ENDIAN
1494 /* xb' = (xa+yb-xc-yd) */
1495 /* yb' = (ya-xb-yc+xd) */
1496 *__SIMD32(ptr1)++ = __SHASX(S, U);
1499 /* xd' = (xa-yb-xc+yd) */
1500 /* yd' = (ya+xb-yc-xd) */
1501 *__SIMD32(ptr1)++ = __SHSAX(S, U);
1503 #else
1505 /* xb' = (xa+yb-xc-yd) */
1506 /* yb' = (ya-xb-yc+xd) */
1507 *__SIMD32(ptr1)++ = __SHSAX(S, U);
1510 /* xd' = (xa-yb-xc+yd) */
1511 /* yd' = (ya+xb-yc-xd) */
1512 *__SIMD32(ptr1)++ = __SHASX(S, U);
1515 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1517 } while (--j);
1519 /* end of last stage process */
1521 /* output is in 11.5(q5) format for the 1024 point */
1522 /* output is in 9.7(q7) format for the 256 point */
1523 /* output is in 7.9(q9) format for the 64 point */
1524 /* output is in 5.11(q11) format for the 16 point */
1527 #else
1529 /* Run the below code for Cortex-M0 */
1531 q15_t R0, R1, S0, S1, T0, T1, U0, U1;
1532 q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
1533 uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
1535 /* Total process is divided into three stages */
1537 /* process first stage, middle stages, & last stage */
1539 /* Initializations for the first stage */
1540 n2 = fftLen;
1541 n1 = n2;
1543 /* n2 = fftLen/4 */
1544 n2 >>= 2U;
1546 /* Index for twiddle coefficient */
1547 ic = 0U;
1549 /* Index for input read and output write */
1550 i0 = 0U;
1552 j = n2;
1554 /* Input is in 1.15(q15) format */
1556 /* Start of first stage process */
1559 /* Butterfly implementation */
1561 /* index calculation for the input as, */
1562 /* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
1563 i1 = i0 + n2;
1564 i2 = i1 + n2;
1565 i3 = i2 + n2;
1567 /* Reading i0, i0+fftLen/2 inputs */
1568 /* input is down scale by 4 to avoid overflow */
1569 /* Read ya (real), xa(imag) input */
1570 T0 = pSrc16[i0 * 2U] >> 2U;
1571 T1 = pSrc16[(i0 * 2U) + 1U] >> 2U;
1572 /* input is down scale by 4 to avoid overflow */
1573 /* Read yc (real), xc(imag) input */
1574 S0 = pSrc16[i2 * 2U] >> 2U;
1575 S1 = pSrc16[(i2 * 2U) + 1U] >> 2U;
1577 /* R0 = (ya + yc), R1 = (xa + xc) */
1578 R0 = __SSAT(T0 + S0, 16U);
1579 R1 = __SSAT(T1 + S1, 16U);
1580 /* S0 = (ya - yc), S1 = (xa - xc) */
1581 S0 = __SSAT(T0 - S0, 16U);
1582 S1 = __SSAT(T1 - S1, 16U);
1584 /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
1585 /* input is down scale by 4 to avoid overflow */
1586 /* Read yb (real), xb(imag) input */
1587 T0 = pSrc16[i1 * 2U] >> 2U;
1588 T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;
1589 /* Read yd (real), xd(imag) input */
1590 /* input is down scale by 4 to avoid overflow */
1591 U0 = pSrc16[i3 * 2U] >> 2U;
1592 U1 = pSrc16[(i3 * 2U) + 1U] >> 2U;
1594 /* T0 = (yb + yd), T1 = (xb + xd) */
1595 T0 = __SSAT(T0 + U0, 16U);
1596 T1 = __SSAT(T1 + U1, 16U);
1598 /* writing the butterfly processed i0 sample */
1599 /* xa' = xa + xb + xc + xd */
1600 /* ya' = ya + yb + yc + yd */
1601 pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
1602 pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
1604 /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc)- (xb + xd) */
1605 R0 = __SSAT(R0 - T0, 16U);
1606 R1 = __SSAT(R1 - T1, 16U);
1607 /* co2 & si2 are read from Coefficient pointer */
1608 Co2 = pCoef16[2U * ic * 2U];
1609 Si2 = pCoef16[(2U * ic * 2U) + 1U];
1610 /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
1611 out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16U);
1612 /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
1613 out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16U);
1615 /* Reading i0+fftLen/4 */
1616 /* input is down scale by 4 to avoid overflow */
1617 /* T0 = yb, T1 = xb */
1618 T0 = pSrc16[i1 * 2U] >> 2U;
1619 T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;
1621 /* writing the butterfly processed i0 + fftLen/4 sample */
1622 /* writing output(xc', yc') in little endian format */
1623 pSrc16[i1 * 2U] = out1;
1624 pSrc16[(i1 * 2U) + 1U] = out2;
1626 /* Butterfly calculations */
1627 /* input is down scale by 4 to avoid overflow */
1628 /* U0 = yd, U1 = xd) */
1629 U0 = pSrc16[i3 * 2U] >> 2U;
1630 U1 = pSrc16[(i3 * 2U) + 1U] >> 2U;
1632 /* T0 = yb-yd, T1 = xb-xd) */
1633 T0 = __SSAT(T0 - U0, 16U);
1634 T1 = __SSAT(T1 - U1, 16U);
1635 /* R0 = (ya-yc) + (xb- xd) , R1 = (xa-xc) - (yb-yd) */
1636 R0 = (q15_t) __SSAT((q31_t) (S0 + T1), 16);
1637 R1 = (q15_t) __SSAT((q31_t) (S1 - T0), 16);
1638 /* S0 = (ya-yc) - (xb- xd), S1 = (xa-xc) + (yb-yd) */
1639 S0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16);
1640 S1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16);
1642 /* co1 & si1 are read from Coefficient pointer */
1643 Co1 = pCoef16[ic * 2U];
1644 Si1 = pCoef16[(ic * 2U) + 1U];
1645 /* Butterfly process for the i0+fftLen/2 sample */
1646 /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
1647 out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16U);
1648 /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
1649 out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16U);
1650 /* writing output(xb', yb') in little endian format */
1651 pSrc16[i2 * 2U] = out1;
1652 pSrc16[(i2 * 2U) + 1U] = out2;
1654 /* Co3 & si3 are read from Coefficient pointer */
1655 Co3 = pCoef16[3U * ic * 2U];
1656 Si3 = pCoef16[(3U * ic * 2U) + 1U];
1657 /* Butterfly process for the i0+3fftLen/4 sample */
1658 /* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
1659 out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16U);
1660 /* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
1661 out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16U);
1662 /* writing output(xd', yd') in little endian format */
1663 pSrc16[i3 * 2U] = out1;
1664 pSrc16[(i3 * 2U) + 1U] = out2;
1666 /* Twiddle coefficients index modifier */
1667 ic = ic + twidCoefModifier;
1669 /* Updating input index */
1670 i0 = i0 + 1U;
1672 } while (--j);
1674 /* End of first stage process */
1676 /* data is in 4.11(q11) format */
1679 /* Start of Middle stage process */
1681 /* Twiddle coefficients index modifier */
1682 twidCoefModifier <<= 2U;
1684 /* Calculation of Middle stage */
1685 for (k = fftLen / 4U; k > 4U; k >>= 2U)
1687 /* Initializations for the middle stage */
1688 n1 = n2;
1689 n2 >>= 2U;
1690 ic = 0U;
1692 for (j = 0U; j <= (n2 - 1U); j++)
1694 /* index calculation for the coefficients */
1695 Co1 = pCoef16[ic * 2U];
1696 Si1 = pCoef16[(ic * 2U) + 1U];
1697 Co2 = pCoef16[2U * ic * 2U];
1698 Si2 = pCoef16[2U * ic * 2U + 1U];
1699 Co3 = pCoef16[3U * ic * 2U];
1700 Si3 = pCoef16[(3U * ic * 2U) + 1U];
1702 /* Twiddle coefficients index modifier */
1703 ic = ic + twidCoefModifier;
1705 /* Butterfly implementation */
1706 for (i0 = j; i0 < fftLen; i0 += n1)
1708 /* index calculation for the input as, */
1709 /* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
1710 i1 = i0 + n2;
1711 i2 = i1 + n2;
1712 i3 = i2 + n2;
1714 /* Reading i0, i0+fftLen/2 inputs */
1715 /* Read ya (real), xa(imag) input */
1716 T0 = pSrc16[i0 * 2U];
1717 T1 = pSrc16[(i0 * 2U) + 1U];
1719 /* Read yc (real), xc(imag) input */
1720 S0 = pSrc16[i2 * 2U];
1721 S1 = pSrc16[(i2 * 2U) + 1U];
1724 /* R0 = (ya + yc), R1 = (xa + xc) */
1725 R0 = __SSAT(T0 + S0, 16U);
1726 R1 = __SSAT(T1 + S1, 16U);
1727 /* S0 = (ya - yc), S1 = (xa - xc) */
1728 S0 = __SSAT(T0 - S0, 16U);
1729 S1 = __SSAT(T1 - S1, 16U);
1731 /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
1732 /* Read yb (real), xb(imag) input */
1733 T0 = pSrc16[i1 * 2U];
1734 T1 = pSrc16[(i1 * 2U) + 1U];
1736 /* Read yd (real), xd(imag) input */
1737 U0 = pSrc16[i3 * 2U];
1738 U1 = pSrc16[(i3 * 2U) + 1U];
1740 /* T0 = (yb + yd), T1 = (xb + xd) */
1741 T0 = __SSAT(T0 + U0, 16U);
1742 T1 = __SSAT(T1 + U1, 16U);
1744 /* writing the butterfly processed i0 sample */
1745 /* xa' = xa + xb + xc + xd */
1746 /* ya' = ya + yb + yc + yd */
1747 pSrc16[i0 * 2U] = ((R0 >> 1U) + (T0 >> 1U)) >> 1U;
1748 pSrc16[(i0 * 2U) + 1U] = ((R1 >> 1U) + (T1 >> 1U)) >> 1U;
1750 /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
1751 R0 = (R0 >> 1U) - (T0 >> 1U);
1752 R1 = (R1 >> 1U) - (T1 >> 1U);
1754 /* (ya-yb+yc-yd)* (si2) - (xa-xb+xc-xd)* co2 */
1755 out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16);
1756 /* (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
1757 out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16);
1759 /* Reading i0+fftLen/4 */
1760 /* Read yb (real), xb(imag) input */
1761 T0 = pSrc16[i1 * 2U];
1762 T1 = pSrc16[(i1 * 2U) + 1U];
1764 /* writing the butterfly processed i0 + fftLen/4 sample */
1765 /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
1766 /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
1767 pSrc16[i1 * 2U] = out1;
1768 pSrc16[(i1 * 2U) + 1U] = out2;
1770 /* Butterfly calculations */
1771 /* Read yd (real), xd(imag) input */
1772 U0 = pSrc16[i3 * 2U];
1773 U1 = pSrc16[(i3 * 2U) + 1U];
1775 /* T0 = yb-yd, T1 = xb-xd) */
1776 T0 = __SSAT(T0 - U0, 16U);
1777 T1 = __SSAT(T1 - U1, 16U);
1779 /* R0 = (ya-yc) + (xb- xd) , R1 = (xa-xc) - (yb-yd) */
1780 R0 = (S0 >> 1U) + (T1 >> 1U);
1781 R1 = (S1 >> 1U) - (T0 >> 1U);
1783 /* S0 = (ya-yc) - (xb- xd), S1 = (xa-xc) + (yb-yd) */
1784 S0 = (S0 >> 1U) - (T1 >> 1U);
1785 S1 = (S1 >> 1U) + (T0 >> 1U);
1787 /* Butterfly process for the i0+fftLen/2 sample */
1788 out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16U);
1789 out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16U);
1790 /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
1791 /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
1792 pSrc16[i2 * 2U] = out1;
1793 pSrc16[(i2 * 2U) + 1U] = out2;
1795 /* Butterfly process for the i0+3fftLen/4 sample */
1796 out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16U);
1798 out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16U);
1799 /* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
1800 /* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
1801 pSrc16[i3 * 2U] = out1;
1802 pSrc16[(i3 * 2U) + 1U] = out2;
1807 /* Twiddle coefficients index modifier */
1808 twidCoefModifier <<= 2U;
1810 /* End of Middle stages process */
1813 /* data is in 10.6(q6) format for the 1024 point */
1814 /* data is in 8.8(q8) format for the 256 point */
1815 /* data is in 6.10(q10) format for the 64 point */
1816 /* data is in 4.12(q12) format for the 16 point */
1818 /* start of last stage process */
1821 /* Initializations for the last stage */
1822 n1 = n2;
1823 n2 >>= 2U;
1825 /* Butterfly implementation */
1826 for (i0 = 0U; i0 <= (fftLen - n1); i0 += n1)
1828 /* index calculation for the input as, */
1829 /* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
1830 i1 = i0 + n2;
1831 i2 = i1 + n2;
1832 i3 = i2 + n2;
1834 /* Reading i0, i0+fftLen/2 inputs */
1835 /* Read ya (real), xa(imag) input */
1836 T0 = pSrc16[i0 * 2U];
1837 T1 = pSrc16[(i0 * 2U) + 1U];
1838 /* Read yc (real), xc(imag) input */
1839 S0 = pSrc16[i2 * 2U];
1840 S1 = pSrc16[(i2 * 2U) + 1U];
1842 /* R0 = (ya + yc), R1 = (xa + xc) */
1843 R0 = __SSAT(T0 + S0, 16U);
1844 R1 = __SSAT(T1 + S1, 16U);
1845 /* S0 = (ya - yc), S1 = (xa - xc) */
1846 S0 = __SSAT(T0 - S0, 16U);
1847 S1 = __SSAT(T1 - S1, 16U);
1849 /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
1850 /* Read yb (real), xb(imag) input */
1851 T0 = pSrc16[i1 * 2U];
1852 T1 = pSrc16[(i1 * 2U) + 1U];
1853 /* Read yd (real), xd(imag) input */
1854 U0 = pSrc16[i3 * 2U];
1855 U1 = pSrc16[(i3 * 2U) + 1U];
1857 /* T0 = (yb + yd), T1 = (xb + xd) */
1858 T0 = __SSAT(T0 + U0, 16U);
1859 T1 = __SSAT(T1 + U1, 16U);
1861 /* writing the butterfly processed i0 sample */
1862 /* xa' = xa + xb + xc + xd */
1863 /* ya' = ya + yb + yc + yd */
1864 pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
1865 pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
1867 /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
1868 R0 = (R0 >> 1U) - (T0 >> 1U);
1869 R1 = (R1 >> 1U) - (T1 >> 1U);
1871 /* Read yb (real), xb(imag) input */
1872 T0 = pSrc16[i1 * 2U];
1873 T1 = pSrc16[(i1 * 2U) + 1U];
1875 /* writing the butterfly processed i0 + fftLen/4 sample */
1876 /* xc' = (xa-xb+xc-xd) */
1877 /* yc' = (ya-yb+yc-yd) */
1878 pSrc16[i1 * 2U] = R0;
1879 pSrc16[(i1 * 2U) + 1U] = R1;
1881 /* Read yd (real), xd(imag) input */
1882 U0 = pSrc16[i3 * 2U];
1883 U1 = pSrc16[(i3 * 2U) + 1U];
1884 /* T0 = (yb - yd), T1 = (xb - xd) */
1885 T0 = __SSAT(T0 - U0, 16U);
1886 T1 = __SSAT(T1 - U1, 16U);
1888 /* writing the butterfly processed i0 + fftLen/2 sample */
1889 /* xb' = (xa-yb-xc+yd) */
1890 /* yb' = (ya+xb-yc-xd) */
1891 pSrc16[i2 * 2U] = (S0 >> 1U) - (T1 >> 1U);
1892 pSrc16[(i2 * 2U) + 1U] = (S1 >> 1U) + (T0 >> 1U);
1895 /* writing the butterfly processed i0 + 3fftLen/4 sample */
1896 /* xd' = (xa+yb-xc-yd) */
1897 /* yd' = (ya-xb-yc+xd) */
1898 pSrc16[i3 * 2U] = (S0 >> 1U) + (T1 >> 1U);
1899 pSrc16[(i3 * 2U) + 1U] = (S1 >> 1U) - (T0 >> 1U);
1901 /* end of last stage process */
1903 /* output is in 11.5(q5) format for the 1024 point */
1904 /* output is in 9.7(q7) format for the 256 point */
1905 /* output is in 7.9(q9) format for the 64 point */
1906 /* output is in 5.11(q11) format for the 16 point */
1908 #endif /* #if defined (ARM_MATH_DSP) */