lib/main/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q15.c

   1 /* ----------------------------------------------------------------------
   2  * Project:      CMSIS DSP Library
   3  * Title:        arm_cfft_radix4_q15.c
   4  * Description:  This file has function definition of Radix-4 FFT & IFFT function and
   5  *               In-place bit reversal using bit reversal table
   6  *
   7  * $Date:        27. January 2017
   8  * $Revision:    V.1.5.1
   9  *
  10  * Target Processor: Cortex-M cores
  11  * -------------------------------------------------------------------- */
  12 /*
  13  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
  14  *
  15  * SPDX-License-Identifier: Apache-2.0
  16  *
  17  * Licensed under the Apache License, Version 2.0 (the License); you may
  18  * not use this file except in compliance with the License.
  19  * You may obtain a copy of the License at
  20  *
  21  * www.apache.org/licenses/LICENSE-2.0
  22  *
  23  * Unless required by applicable law or agreed to in writing, software
  24  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  25  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  26  * See the License for the specific language governing permissions and
  27  * limitations under the License.
  28  */
  29
  30 #include "arm_math.h"
  31
  32 /* Forward (CFFT) radix-4 butterfly core; defined later in this file and called by arm_cfft_radix4_q15(). */
  33 void arm_radix4_butterfly_q15(
  34   q15_t * pSrc16,
  35   uint32_t fftLen,
  36   q15_t * pCoef16,
  37   uint32_t twidCoefModifier);
  38 /* Inverse (CIFFT) radix-4 butterfly core; arm_cfft_radix4_q15() calls it when S->ifftFlag == 1. */
  39 void arm_radix4_butterfly_inverse_q15(
  40   q15_t * pSrc16,
  41   uint32_t fftLen,
  42   q15_t * pCoef16,
  43   uint32_t twidCoefModifier);
  44 /* Bit-reversal helper; arm_cfft_radix4_q15() calls it when S->bitReverseFlag == 1. */
  45 void arm_bitreversal_q15(
  46   q15_t * pSrc,
  47   uint32_t fftLen,
  48   uint16_t bitRevFactor,
  49   uint16_t * pBitRevTab);
  50
  51 /**
  52  * @ingroup groupTransforms
  53  */
  54
  55 /**
  56  * @addtogroup ComplexFFT
  57  * @{
  58  */
  59
  60
  61 /**
  62  * @details
  63  * @brief In-place radix-4 complex FFT / inverse FFT for Q15 data, with optional bit reversal.
  64  * @deprecated Do not use this function.  It has been superseded by \ref arm_cfft_q15 and will be removed in a future release.
  65  * @param[in]      *S    points to an instance of the Q15 CFFT/CIFFT structure; its ifftFlag and bitReverseFlag fields select the processing steps.
  66  * @param[in, out] *pSrc points to the complex data buffer. Processing occurs in-place.
  67  * @return none.
  68  *
  69  * \par Input and output formats:
  70  * \par
  71  * The data is downscaled by 2 at every stage so that the CFFT/CIFFT arithmetic does not saturate.
  72  * The output format therefore depends on the FFT size.
  73  * The tables below list, for CFFT and CIFFT, the input and output formats and the number of bits to upscale for each FFT size:
  74  * \par
  75  * \image html CFFTQ15.gif "Input and Output Formats for Q15 CFFT"
  76  * \image html CIFFTQ15.gif "Input and Output Formats for Q15 CIFFT"
  77  */
  78
  79 void arm_cfft_radix4_q15(
  80   const arm_cfft_radix4_instance_q15 * S,
  81   q15_t * pSrc)
  82 {
  83   /* Any ifftFlag value other than 1 selects the forward transform */
  84   if (S->ifftFlag != 1U)
  85   {
  86     arm_radix4_butterfly_q15(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
  87   }
  88   else
  89   {
  90     arm_radix4_butterfly_inverse_q15(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
  91   }
  92
  93   /* Bit reversal is applied only when bitReverseFlag is exactly 1 */
  94   if (S->bitReverseFlag != 1U)
  95   {
  96     return;
  97   }
  98
  99   arm_bitreversal_q15(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
 100 }
 101
 102 /**
 103  * @} end of ComplexFFT group
 104  */
 105
 106 /*
 107 * Radix-4 FFT algorithm used is :
 108 *
 109 * Input real and imaginary data:
 110 * x(n) = xa + j * ya
 111 * x(n+N/4 ) = xb + j * yb
 112 * x(n+N/2 ) = xc + j * yc
 113 * x(n+3N/4) = xd + j * yd
 114 *
 115 *
 116 * Output real and imaginary data:
 117 * x(4r) = xa'+ j * ya'
 118 * x(4r+1) = xb'+ j * yb'
 119 * x(4r+2) = xc'+ j * yc'
 120 * x(4r+3) = xd'+ j * yd'
 121 *
 122 *
 123 * Twiddle factors for radix-4 FFT:
 124 * Wn = co1 + j * (- si1)
 125 * W2n = co2 + j * (- si2)
 126 * W3n = co3 + j * (- si3)
 127 *
 128 * The real and imaginary output values for the radix-4 butterfly are
 129 * xa' = xa + xb + xc + xd
 130 * ya' = ya + yb + yc + yd
 131 * xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1)
 132 * yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1)
 133 * xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2)
 134 * yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2)
 135 * xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3)
 136 * yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3)
 137 *
 138 */
 139
 140 /**
 141  * @brief  Core function for the Q15 CFFT butterfly process: in-place radix-4 butterflies run as a first stage, zero or more middle stages and a last stage.
 142  * @param[in, out] *pSrc16          points to the in-place buffer of Q15 data type.
 143  * @param[in]      fftLen           length of the FFT.
 144  * @param[in]      *pCoef16         points to twiddle coefficient buffer.
 145  * @param[in]      twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
 146  * @return none. No reordering is done here; arm_cfft_radix4_q15() calls arm_bitreversal_q15() separately when requested.
 147  */
 148
 149 void arm_radix4_butterfly_q15(
 150   q15_t * pSrc16,
 151   uint32_t fftLen,
 152   q15_t * pCoef16,
 153   uint32_t twidCoefModifier)
 154 {
 155
 156 #if defined (ARM_MATH_DSP)
 157
 158   /* SIMD path: compiled when ARM_MATH_DSP is defined; works on packed pairs of 16-bit values */
 159
 160   q31_t R, S, T, U;
 161   q31_t C1, C2, C3, out1, out2;
 162   uint32_t n1, n2, ic, i0, j, k;
 163
 164   q15_t *ptr1;
 165   q15_t *pSi0;
 166   q15_t *pSi1;
 167   q15_t *pSi2;
 168   q15_t *pSi3;
 169
 170   q31_t xaya, xbyb, xcyc, xdyd;
 171
 172   /* Total process is divided into three stages */
 173
 174   /* process first stage, middle stages, & last stage */
 175
 176   /*  Initializations for the first stage */
 177   n2 = fftLen;
 178   n1 = n2;
 179
 180   /* n2 = fftLen/4 */
 181   n2 >>= 2U;
 182
 183   /* Index for twiddle coefficient */
 184   ic = 0U;
 185
 186   /* First-stage butterfly counter (fftLen/4); the do/while below assumes it is non-zero */
 187   j = n2;
 188
 189   pSi0 = pSrc16;
 190   pSi1 = pSi0 + 2 * n2;
 191   pSi2 = pSi1 + 2 * n2;
 192   pSi3 = pSi2 + 2 * n2;
 193
 194   /* Input is in 1.15(q15) format */
 195
 196   /*  start of first stage process */
 197   do
 198   {
 199     /*  Butterfly implementation */
 200
 201     /*  Reading i0, i0+fftLen/2 inputs */
 202     /* Read xa (real), ya (imag) input */
 203     T = _SIMD32_OFFSET(pSi0);
 204     T = __SHADD16(T, 0); // this is just a SIMD arithmetic shift right by 1
 205     T = __SHADD16(T, 0); // it turns out doing this twice is 2 cycles, the alternative takes 3 cycles
 206     //in = ((int16_t) (T & 0xFFFF)) >> 2;       // alternative code that takes 3 cycles
 207     //T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF);
 208
 209     /* Read xc (real), yc (imag) input, scaled down the same way */
 210     S = _SIMD32_OFFSET(pSi2);
 211     S = __SHADD16(S, 0);
 212     S = __SHADD16(S, 0);
 213
 214     /* R = packed((ya + yc), (xa + xc) ) */
 215     R = __QADD16(T, S);
 216
 217     /* S = packed((ya - yc), (xa - xc) ) */
 218     S = __QSUB16(T, S);
 219
 220     /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
 221     /* Read xb (real), yb (imag) input */
 222     T = _SIMD32_OFFSET(pSi1);
 223     T = __SHADD16(T, 0);
 224     T = __SHADD16(T, 0);
 225
 226     /* Read xd (real), yd (imag) input */
 227     U = _SIMD32_OFFSET(pSi3);
 228     U = __SHADD16(U, 0);
 229     U = __SHADD16(U, 0);
 230
 231     /* T = packed((yb + yd), (xb + xd) ) */
 232     T = __QADD16(T, U);
 233
 234     /*  writing the butterfly processed i0 sample */
 235     /* xa' = xa + xb + xc + xd */
 236     /* ya' = ya + yb + yc + yd */
 237     _SIMD32_OFFSET(pSi0) = __SHADD16(R, T);
 238     pSi0 += 2;
 239
 240     /* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
 241     R = __QSUB16(R, T);
 242
 243     /* co2 & si2 are read from SIMD Coefficient pointer */
 244     C2 = _SIMD32_OFFSET(pCoef16 + (4U * ic));
 245
 246 #ifndef ARM_MATH_BIG_ENDIAN
 247
 248     /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
 249     out1 = __SMUAD(C2, R) >> 16U;
 250     /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
 251     out2 = __SMUSDX(C2, R);
 252
 253 #else
 254
 255     /* xc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
 256     out1 = __SMUSDX(R, C2) >> 16U;
 257     /* yc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
 258     out2 = __SMUAD(C2, R);
 259
 260 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
 261
 262     /*  Reading i0+fftLen/4 again: T was overwritten by the (b + d) sum above */
 263     /* T = packed(yb, xb) */
 264     T = _SIMD32_OFFSET(pSi1);
 265     T = __SHADD16(T, 0);
 266     T = __SHADD16(T, 0);
 267
 268     /* writing the butterfly processed i0 + fftLen/4 sample */
 269     /* writing output(xc', yc') in little endian format */
 270     _SIMD32_OFFSET(pSi1) =
 271       (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
 272     pSi1 += 2;
 273
 274     /*  Butterfly calculations */
 275     /* U = packed(yd, xd) */
 276     U = _SIMD32_OFFSET(pSi3);
 277     U = __SHADD16(U, 0);
 278     U = __SHADD16(U, 0);
 279
 280     /* T = packed(yb-yd, xb-xd) */
 281     T = __QSUB16(T, U);
 282
 283 #ifndef ARM_MATH_BIG_ENDIAN
 284
 285     /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
 286     R = __QASX(S, T);
 287     /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
 288     S = __QSAX(S, T);
 289
 290 #else
 291
 292     /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
 293     R = __QSAX(S, T);
 294     /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
 295     S = __QASX(S, T);
 296
 297 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
 298
 299     /* co1 & si1 are read from SIMD Coefficient pointer */
 300     C1 = _SIMD32_OFFSET(pCoef16 + (2U * ic));
 301     /*  Butterfly process for the i0+fftLen/2 sample */
 302
 303 #ifndef ARM_MATH_BIG_ENDIAN
 304
 305     /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
 306     out1 = __SMUAD(C1, S) >> 16U;
 307     /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
 308     out2 = __SMUSDX(C1, S);
 309
 310 #else
 311
 312     /* xb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
 313     out1 = __SMUSDX(S, C1) >> 16U;
 314     /* yb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
 315     out2 = __SMUAD(C1, S);
 316
 317 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
 318
 319     /* writing output(xb', yb') in little endian format */
 320     _SIMD32_OFFSET(pSi2) =
 321       ((out2) & 0xFFFF0000) | ((out1) & 0x0000FFFF);
 322     pSi2 += 2;
 323
 324
 325     /* co3 & si3 are read from SIMD Coefficient pointer */
 326     C3 = _SIMD32_OFFSET(pCoef16 + (6U * ic));
 327     /*  Butterfly process for the i0+3fftLen/4 sample */
 328
 329 #ifndef ARM_MATH_BIG_ENDIAN
 330
 331     /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
 332     out1 = __SMUAD(C3, R) >> 16U;
 333     /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
 334     out2 = __SMUSDX(C3, R);
 335
 336 #else
 337
 338     /* xd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
 339     out1 = __SMUSDX(R, C3) >> 16U;
 340     /* yd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
 341     out2 = __SMUAD(C3, R);
 342
 343 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
 344
 345     /* writing output(xd', yd') in little endian format */
 346     _SIMD32_OFFSET(pSi3) =
 347       ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
 348     pSi3 += 2;
 349
 350     /*  Twiddle coefficients index modifier */
 351     ic = ic + twidCoefModifier;
 352
 353   } while (--j);
 354   /* data is in 4.11(q11) format */
 355
 356   /* end of first stage process */
 357
 358
 359   /* start of middle stage process */
 360
 361   /*  Twiddle coefficients index modifier */
 362   twidCoefModifier <<= 2U;
 363
 364   /*  Calculation of Middle stage: the body runs only while k > 4, so it is skipped when fftLen/4 <= 4 */
 365   for (k = fftLen / 4U; k > 4U; k >>= 2U)
 366   {
 367     /*  Initializations for the middle stage */
 368     n1 = n2;
 369     n2 >>= 2U;
 370     ic = 0U;
 371
 372     for (j = 0U; j <= (n2 - 1U); j++)
 373     {
 374       /*  index calculation for the coefficients */
 375       C1 = _SIMD32_OFFSET(pCoef16 + (2U * ic));
 376       C2 = _SIMD32_OFFSET(pCoef16 + (4U * ic));
 377       C3 = _SIMD32_OFFSET(pCoef16 + (6U * ic));
 378
 379       /*  Twiddle coefficients index modifier */
 380       ic = ic + twidCoefModifier;
 381
 382       pSi0 = pSrc16 + 2 * j;
 383       pSi1 = pSi0 + 2 * n2;
 384       pSi2 = pSi1 + 2 * n2;
 385       pSi3 = pSi2 + 2 * n2;
 386
 387       /*  Butterfly implementation */
 388       for (i0 = j; i0 < fftLen; i0 += n1)
 389       {
 390         /*  Reading i0, i0+2*n2 inputs */
 391         /* Read xa (real), ya (imag) input */
 392         T = _SIMD32_OFFSET(pSi0);
 393
 394         /* Read xc (real), yc (imag) input */
 395         S = _SIMD32_OFFSET(pSi2);
 396
 397         /* R = packed( (ya + yc), (xa + xc)) */
 398         R = __QADD16(T, S);
 399
 400         /* S = packed((ya - yc), (xa - xc)) */
 401         S = __QSUB16(T, S);
 402
 403         /*  Reading i0+n2 , i0+3*n2 inputs */
 404         /* Read xb (real), yb (imag) input */
 405         T = _SIMD32_OFFSET(pSi1);
 406
 407         /* Read xd (real), yd (imag) input */
 408         U = _SIMD32_OFFSET(pSi3);
 409
 410         /* T = packed( (yb + yd), (xb + xd)) */
 411         T = __QADD16(T, U);
 412
 413         /*  writing the butterfly processed i0 sample */
 414
 415         /* xa' = xa + xb + xc + xd */
 416         /* ya' = ya + yb + yc + yd */
 417         out1 = __SHADD16(R, T);
 418         out1 = __SHADD16(out1, 0);
 419         _SIMD32_OFFSET(pSi0) = out1;
 420         pSi0 += 2 * n1;
 421
 422         /* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */
 423         R = __SHSUB16(R, T);
 424
 425 #ifndef ARM_MATH_BIG_ENDIAN
 426
 427         /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
 428         out1 = __SMUAD(C2, R) >> 16U;
 429
 430         /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
 431         out2 = __SMUSDX(C2, R);
 432
 433 #else
 434
 435         /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
 436         out1 = __SMUSDX(R, C2) >> 16U;
 437
 438         /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
 439         out2 = __SMUAD(C2, R);
 440
 441 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
 442
 443         /*  Reading i0+n2 again: T was overwritten by the (b + d) sum above */
 444         /* Read xb (real), yb (imag) input */
 445         T = _SIMD32_OFFSET(pSi1);
 446
 447         /*  writing the butterfly processed i0 + n2 sample */
 448         /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
 449         /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
 450         _SIMD32_OFFSET(pSi1) =
 451           ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
 452         pSi1 += 2 * n1;
 453
 454         /*  Butterfly calculations */
 455
 456         /* Read xd (real), yd (imag) input */
 457         U = _SIMD32_OFFSET(pSi3);
 458
 459         /* T = packed(yb-yd, xb-xd) */
 460         T = __QSUB16(T, U);
 461
 462 #ifndef ARM_MATH_BIG_ENDIAN
 463
 464         /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
 465         R = __SHASX(S, T);
 466
 467         /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
 468         S = __SHSAX(S, T);
 469
 470
 471         /*  Butterfly process for the i0+2*n2 sample */
 472         out1 = __SMUAD(C1, S) >> 16U;
 473         out2 = __SMUSDX(C1, S);
 474
 475 #else
 476
 477         /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
 478         R = __SHSAX(S, T);
 479
 480         /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
 481         S = __SHASX(S, T);
 482
 483
 484         /*  Butterfly process for the i0+2*n2 sample */
 485         out1 = __SMUSDX(S, C1) >> 16U;
 486         out2 = __SMUAD(C1, S);
 487
 488 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
 489
 490         /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
 491         /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
 492         _SIMD32_OFFSET(pSi2) =
 493           ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
 494         pSi2 += 2 * n1;
 495
 496         /*  Butterfly process for the i0+3*n2 sample */
 497
 498 #ifndef ARM_MATH_BIG_ENDIAN
 499
 500         out1 = __SMUAD(C3, R) >> 16U;
 501         out2 = __SMUSDX(C3, R);
 502
 503 #else
 504
 505         out1 = __SMUSDX(R, C3) >> 16U;
 506         out2 = __SMUAD(C3, R);
 507
 508 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
 509
 510         /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
 511         /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
 512         _SIMD32_OFFSET(pSi3) =
 513           ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
 514         pSi3 += 2 * n1;
 515       }
 516     }
 517     /*  Twiddle coefficients index modifier */
 518     twidCoefModifier <<= 2U;
 519   }
 520   /* end of middle stage process */
 521
 522
 523   /* data is in 10.6(q6) format for the 1024 point */
 524   /* data is in 8.8(q8) format for the 256 point */
 525   /* data is in 6.10(q10) format for the 64 point */
 526   /* data is in 4.12(q12) format for the 16 point */
 527
 528   /*  Initializations for the last stage: fftLen/4 groups of four adjacent samples */
 529   j = fftLen >> 2;
 530
 531   ptr1 = &pSrc16[0];
 532
 533   /* start of last stage process (no twiddle multiplications in this stage) */
 534
 535   /*  Butterfly implementation */
 536   do
 537   {
 538     /* Read xa (real), ya(imag) input */
 539     xaya = *__SIMD32(ptr1)++;
 540
 541     /* Read xb (real), yb(imag) input */
 542     xbyb = *__SIMD32(ptr1)++;
 543
 544     /* Read xc (real), yc(imag) input */
 545     xcyc = *__SIMD32(ptr1)++;
 546
 547     /* Read xd (real), yd(imag) input */
 548     xdyd = *__SIMD32(ptr1)++;
 549
 550     /* R = packed((ya + yc), (xa + xc)) */
 551     R = __QADD16(xaya, xcyc);
 552
 553     /* T = packed((yb + yd), (xb + xd)) */
 554     T = __QADD16(xbyb, xdyd);
 555
 556     /* step ptr1 back by 8 q15 values so the writes below start at the first of the four reads above */
 557     ptr1 = ptr1 - 8U;
 558
 559
 560     /* xa' = xa + xb + xc + xd */
 561     /* ya' = ya + yb + yc + yd */
 562     *__SIMD32(ptr1)++ = __SHADD16(R, T);
 563
 564     /* T = packed((yb + yd), (xb + xd)) -- recomputed from the same operands as above */
 565     T = __QADD16(xbyb, xdyd);
 566
 567     /* xc' = (xa-xb+xc-xd) */
 568     /* yc' = (ya-yb+yc-yd) */
 569     *__SIMD32(ptr1)++ = __SHSUB16(R, T);
 570
 571     /* S = packed((ya - yc), (xa - xc)) */
 572     S = __QSUB16(xaya, xcyc);
 573
 574     /* Difference of the b and d inputs (held in U, not T) */
 575     /* U = packed( (yb - yd), (xb - xd))  */
 576     U = __QSUB16(xbyb, xdyd);
 577
 578 #ifndef ARM_MATH_BIG_ENDIAN
 579
 580     /* xb' = (xa+yb-xc-yd) */
 581     /* yb' = (ya-xb-yc+xd) */
 582     *__SIMD32(ptr1)++ = __SHSAX(S, U);
 583
 584
 585     /* xd' = (xa-yb-xc+yd) */
 586     /* yd' = (ya+xb-yc-xd) */
 587     *__SIMD32(ptr1)++ = __SHASX(S, U);
 588
 589 #else
 590
 591     /* xb' = (xa+yb-xc-yd) */
 592     /* yb' = (ya-xb-yc+xd) */
 593     *__SIMD32(ptr1)++ = __SHASX(S, U);
 594
 595
 596     /* xd' = (xa-yb-xc+yd) */
 597     /* yd' = (ya+xb-yc-xd) */
 598     *__SIMD32(ptr1)++ = __SHSAX(S, U);
 599
 600 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
 601
 602   } while (--j);
 603
 604   /* end of last stage process */
 605
 606   /* output is in 11.5(q5) format for the 1024 point */
 607   /* output is in 9.7(q7) format for the 256 point   */
 608   /* output is in 7.9(q9) format for the 64 point  */
 609   /* output is in 5.11(q11) format for the 16 point  */
 610
 611
 612 #else
 613
 614   /* Scalar path: compiled when ARM_MATH_DSP is not defined; real and imaginary parts are handled separately */
 615
 616   q15_t R0, R1, S0, S1, T0, T1, U0, U1;
 617   q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
 618   uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
 619
 620   /* Total process is divided into three stages */
 621
 622   /* process first stage, middle stages, & last stage */
 623
 624   /*  Initializations for the first stage */
 625   n2 = fftLen;
 626   n1 = n2;
 627
 628   /* n2 = fftLen/4 */
 629   n2 >>= 2U;
 630
 631   /* Index for twiddle coefficient */
 632   ic = 0U;
 633
 634   /* i0: sample index for read/write; j: butterfly counter (fftLen/4), assumed non-zero by the do/while below */
 635   i0 = 0U;
 636   j = n2;
 637
 638   /* Input is in 1.15(q15) format */
 639
 640   /*  start of first stage process */
 641   do
 642   {
 643     /*  Butterfly implementation */
 644
 645     /*  index calculation for the input as, */
 646     /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
 647     i1 = i0 + n2;
 648     i2 = i1 + n2;
 649     i3 = i2 + n2;
 650
 651     /*  Reading i0, i0+fftLen/2 inputs */
 652
 653     /* input is down scale by 4 to avoid overflow */
 654     /* Read xa (real) into T0, ya (imag) into T1 */
 655     T0 = pSrc16[i0 * 2U] >> 2U;
 656     T1 = pSrc16[(i0 * 2U) + 1U] >> 2U;
 657
 658     /* input is down scale by 4 to avoid overflow */
 659     /* Read xc (real) into S0, yc (imag) into S1 */
 660     S0 = pSrc16[i2 * 2U] >> 2U;
 661     S1 = pSrc16[(i2 * 2U) + 1U] >> 2U;
 662
 663     /* R0 = (xa + xc) */
 664     R0 = __SSAT(T0 + S0, 16U);
 665     /* R1 = (ya + yc) */
 666     R1 = __SSAT(T1 + S1, 16U);
 667
 668     /* S0 = (xa - xc) */
 669     S0 = __SSAT(T0 - S0, 16);
 670     /* S1 = (ya - yc) */
 671     S1 = __SSAT(T1 - S1, 16);
 672
 673     /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
 674     /* input is down scale by 4 to avoid overflow */
 675     /* Read xb (real) into T0, yb (imag) into T1 */
 676     T0 = pSrc16[i1 * 2U] >> 2U;
 677     T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;
 678
 679     /* input is down scale by 4 to avoid overflow */
 680     /* Read xd (real) into U0, yd (imag) into U1 */
 681     U0 = pSrc16[i3 * 2U] >> 2U;
 682     U1 = pSrc16[(i3 * 2U) + 1] >> 2U;
 683
 684     /* T0 = (xb + xd) */
 685     T0 = __SSAT(T0 + U0, 16U);
 686     /* T1 = (yb + yd) */
 687     T1 = __SSAT(T1 + U1, 16U);
 688
 689     /*  writing the butterfly processed i0 sample */
 690     /* xa' = xa + xb + xc + xd */
 691     /* ya' = ya + yb + yc + yd */
 692     pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
 693     pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
 694
 695     /* R0 = (xa + xc) - (xb + xd) */
 696     /* R1 = (ya + yc) - (yb + yd) */
 697     R0 = __SSAT(R0 - T0, 16U);
 698     R1 = __SSAT(R1 - T1, 16U);
 699
 700     /* co2 & si2 are read from Coefficient pointer */
 701     Co2 = pCoef16[2U * ic * 2U];
 702     Si2 = pCoef16[(2U * ic * 2U) + 1];
 703
 704     /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
 705     out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16U);
 706     /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
 707     out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16U);
 708
 709     /*  Reading i0+fftLen/4 again: T0/T1 were overwritten by the (b + d) sums above */
 710     /* input is down scale by 4 to avoid overflow */
 711     /* T0 = xb, T1 = yb */
 712     T0 = pSrc16[i1 * 2U] >> 2;
 713     T1 = pSrc16[(i1 * 2U) + 1] >> 2;
 714
 715     /* writing the butterfly processed i0 + fftLen/4 sample */
 716     /* writing output(xc', yc') in little endian format */
 717     pSrc16[i1 * 2U] = out1;
 718     pSrc16[(i1 * 2U) + 1] = out2;
 719
 720     /*  Butterfly calculations */
 721     /* input is down scale by 4 to avoid overflow */
 722     /* U0 = xd, U1 = yd */
 723     U0 = pSrc16[i3 * 2U] >> 2;
 724     U1 = pSrc16[(i3 * 2U) + 1] >> 2;
 725     /* T0 = xb-xd */
 726     T0 = __SSAT(T0 - U0, 16);
 727     /* T1 = yb-yd */
 728     T1 = __SSAT(T1 - U1, 16);
 729
 730     /* R1 = (ya-yc) + (xb- xd),  R0 = (xa-xc) - (yb-yd)) */
 731     R0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16);
 732     R1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16);
 733
 734     /* S1 = (ya-yc) - (xb- xd), S0 = (xa-xc) + (yb-yd)) */
 735     S0 = (q15_t) __SSAT(((q31_t) S0 + T1), 16U);
 736     S1 = (q15_t) __SSAT(((q31_t) S1 - T0), 16U);
 737
 738     /* co1 & si1 are read from Coefficient pointer */
 739     Co1 = pCoef16[ic * 2U];
 740     Si1 = pCoef16[(ic * 2U) + 1];
 741     /*  Butterfly process for the i0+fftLen/2 sample */
 742     /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
 743     out1 = (q15_t) ((Si1 * S1 + Co1 * S0) >> 16);
 744     /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
 745     out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16);
 746
 747     /* writing output(xb', yb') in little endian format */
 748     pSrc16[i2 * 2U] = out1;
 749     pSrc16[(i2 * 2U) + 1] = out2;
 750
 751     /* Co3 & si3 are read from Coefficient pointer */
 752     Co3 = pCoef16[3U * (ic * 2U)];
 753     Si3 = pCoef16[(3U * (ic * 2U)) + 1];
 754     /*  Butterfly process for the i0+3fftLen/4 sample */
 755     /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
 756     out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16U);
 757     /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
 758     out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16U);
 759     /* writing output(xd', yd') in little endian format */
 760     pSrc16[i3 * 2U] = out1;
 761     pSrc16[(i3 * 2U) + 1] = out2;
 762
 763     /*  Twiddle coefficients index modifier */
 764     ic = ic + twidCoefModifier;
 765
 766     /*  Updating input index */
 767     i0 = i0 + 1U;
 768
 769   } while (--j);
 770   /* data is in 4.11(q11) format */
 771
 772   /* end of first stage process */
 773
 774
 775   /* start of middle stage process */
 776
 777   /*  Twiddle coefficients index modifier */
 778   twidCoefModifier <<= 2U;
 779
 780   /*  Calculation of Middle stage: the body runs only while k > 4, so it is skipped when fftLen/4 <= 4 */
 781   for (k = fftLen / 4U; k > 4U; k >>= 2U)
 782   {
 783     /*  Initializations for the middle stage */
 784     n1 = n2;
 785     n2 >>= 2U;
 786     ic = 0U;
 787
 788     for (j = 0U; j <= (n2 - 1U); j++)
 789     {
 790       /*  index calculation for the coefficients */
 791       Co1 = pCoef16[ic * 2U];
 792       Si1 = pCoef16[(ic * 2U) + 1U];
 793       Co2 = pCoef16[2U * (ic * 2U)];
 794       Si2 = pCoef16[(2U * (ic * 2U)) + 1U];
 795       Co3 = pCoef16[3U * (ic * 2U)];
 796       Si3 = pCoef16[(3U * (ic * 2U)) + 1U];
 797
 798       /*  Twiddle coefficients index modifier */
 799       ic = ic + twidCoefModifier;
 800
 801       /*  Butterfly implementation */
 802       for (i0 = j; i0 < fftLen; i0 += n1)
 803       {
 804         /*  index calculation for the input as, */
 805         /*  pSrc16[i0 + 0], pSrc16[i0 + n2], pSrc16[i0 + 2*n2], pSrc16[i0 + 3*n2] */
 806         i1 = i0 + n2;
 807         i2 = i1 + n2;
 808         i3 = i2 + n2;
 809
 810         /*  Reading i0, i0+2*n2 inputs */
 811         /* Read xa (real) into T0, ya (imag) into T1 */
 812         T0 = pSrc16[i0 * 2U];
 813         T1 = pSrc16[(i0 * 2U) + 1U];
 814
 815         /* Read xc (real) into S0, yc (imag) into S1 */
 816         S0 = pSrc16[i2 * 2U];
 817         S1 = pSrc16[(i2 * 2U) + 1U];
 818
 819         /* R0 = (xa + xc), R1 = (ya + yc) */
 820         R0 = __SSAT(T0 + S0, 16);
 821         R1 = __SSAT(T1 + S1, 16);
 822
 823         /* S0 = (xa - xc), S1 = (ya - yc) */
 824         S0 = __SSAT(T0 - S0, 16);
 825         S1 = __SSAT(T1 - S1, 16);
 826
 827         /*  Reading i0+n2 , i0+3*n2 inputs */
 828         /* Read xb (real) into T0, yb (imag) into T1 */
 829         T0 = pSrc16[i1 * 2U];
 830         T1 = pSrc16[(i1 * 2U) + 1U];
 831
 832         /* Read xd (real) into U0, yd (imag) into U1 */
 833         U0 = pSrc16[i3 * 2U];
 834         U1 = pSrc16[(i3 * 2U) + 1U];
 835
 836
 837         /* T0 = (xb + xd), T1 = (yb + yd) */
 838         T0 = __SSAT(T0 + U0, 16);
 839         T1 = __SSAT(T1 + U1, 16);
 840
 841         /*  writing the butterfly processed i0 sample */
 842
 843         /* xa' = xa + xb + xc + xd */
 844         /* ya' = ya + yb + yc + yd */
 845         out1 = ((R0 >> 1U) + (T0 >> 1U)) >> 1U;
 846         out2 = ((R1 >> 1U) + (T1 >> 1U)) >> 1U;
 847
 848         pSrc16[i0 * 2U] = out1;
 849         pSrc16[(2U * i0) + 1U] = out2;
 850
 851         /* R0 = (xa + xc) - (xb + xd), R1 = (ya + yc) - (yb + yd), each operand halved first */
 852         R0 = (R0 >> 1U) - (T0 >> 1U);
 853         R1 = (R1 >> 1U) - (T1 >> 1U);
 854
 855         /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
 856         out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16U);
 857
 858         /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
 859         out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16U);
 860
 861         /*  Reading i0+n2 again: T0/T1 were overwritten by the (b + d) sums above */
 862         /* Read xb (real) into T0, yb (imag) into T1 */
 863         T0 = pSrc16[i1 * 2U];
 864         T1 = pSrc16[(i1 * 2U) + 1U];
 865
 866         /*  writing the butterfly processed i0 + n2 sample */
 867         /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
 868         /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
 869         pSrc16[i1 * 2U] = out1;
 870         pSrc16[(i1 * 2U) + 1U] = out2;
 871
 872         /*  Butterfly calculations */
 873
 874         /* Read xd (real) into U0, yd (imag) into U1 */
 875         U0 = pSrc16[i3 * 2U];
 876         U1 = pSrc16[(i3 * 2U) + 1U];
 877
 878         /* T0 = xb-xd, T1 = yb-yd */
 879         T0 = __SSAT(T0 - U0, 16);
 880         T1 = __SSAT(T1 - U1, 16);
 881
 882         /* R0 = (xa-xc) - (yb-yd), R1 = (ya-yc) + (xb-xd), each operand halved first */
 883         R0 = (S0 >> 1U) - (T1 >> 1U);
 884         R1 = (S1 >> 1U) + (T0 >> 1U);
 885
 886         /* S0 = (xa-xc) + (yb-yd), S1 = (ya-yc) - (xb-xd), each operand halved first */
 887         S0 = (S0 >> 1U) + (T1 >> 1U);
 888         S1 = (S1 >> 1U) - (T0 >> 1U);
 889
 890         /*  Butterfly process for the i0+2*n2 sample */
 891         out1 = (q15_t) ((Co1 * S0 + Si1 * S1) >> 16U);
 892
 893         out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16U);
 894
 895         /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
 896         /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
 897         pSrc16[i2 * 2U] = out1;
 898         pSrc16[(i2 * 2U) + 1U] = out2;
 899
 900         /*  Butterfly process for the i0+3*n2 sample */
 901         out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16U);
 902
 903         out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16U);
 904         /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
 905         /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
 906         pSrc16[i3 * 2U] = out1;
 907         pSrc16[(i3 * 2U) + 1U] = out2;
 908       }
 909     }
 910     /*  Twiddle coefficients index modifier */
 911     twidCoefModifier <<= 2U;
 912   }
 913   /* end of middle stage process */
 914
 915
 916   /* data is in 10.6(q6) format for the 1024 point */
 917   /* data is in 8.8(q8) format for the 256 point */
 918   /* data is in 6.10(q10) format for the 64 point */
 919   /* data is in 4.12(q12) format for the 16 point */
 920
 921   /*  Initializations for the last stage */
 922   n1 = n2;
 923   n2 >>= 2U;
 924
 925   /* start of last stage process (no twiddle multiplications in this stage) */
 926
 927   /*  Butterfly implementation */
 928   for (i0 = 0U; i0 <= (fftLen - n1); i0 += n1)
 929   {
 930     /*  index calculation for the input as, */
 931     /*  pSrc16[i0 + 0], pSrc16[i0 + n2], pSrc16[i0 + 2*n2], pSrc16[i0 + 3*n2] */
 932     i1 = i0 + n2;
 933     i2 = i1 + n2;
 934     i3 = i2 + n2;
 935
 936     /*  Reading i0, i0+2*n2 inputs */
 937     /* Read xa (real) into T0, ya (imag) into T1 */
 938     T0 = pSrc16[i0 * 2U];
 939     T1 = pSrc16[(i0 * 2U) + 1U];
 940
 941     /* Read xc (real) into S0, yc (imag) into S1 */
 942     S0 = pSrc16[i2 * 2U];
 943     S1 = pSrc16[(i2 * 2U) + 1U];
 944
 945     /* R0 = (xa + xc), R1 = (ya + yc) */
 946     R0 = __SSAT(T0 + S0, 16U);
 947     R1 = __SSAT(T1 + S1, 16U);
 948
 949     /* S0 = (xa - xc), S1 = (ya - yc) */
 950     S0 = __SSAT(T0 - S0, 16U);
 951     S1 = __SSAT(T1 - S1, 16U);
 952
 953     /*  Reading i0+n2 , i0+3*n2 inputs */
 954     /* Read xb (real) into T0, yb (imag) into T1 */
 955     T0 = pSrc16[i1 * 2U];
 956     T1 = pSrc16[(i1 * 2U) + 1U];
 957     /* Read xd (real) into U0, yd (imag) into U1 */
 958     U0 = pSrc16[i3 * 2U];
 959     U1 = pSrc16[(i3 * 2U) + 1U];
 960
 961     /* T0 = (xb + xd), T1 = (yb + yd)) */
 962     T0 = __SSAT(T0 + U0, 16U);
 963     T1 = __SSAT(T1 + U1, 16U);
 964
 965     /*  writing the butterfly processed i0 sample */
 966     /* xa' = xa + xb + xc + xd */
 967     /* ya' = ya + yb + yc + yd */
 968     pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
 969     pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
 970
 971     /* R0 = (xa + xc) - (xb + xd), R1 = (ya + yc) - (yb + yd), each operand halved first */
 972     R0 = (R0 >> 1U) - (T0 >> 1U);
 973     R1 = (R1 >> 1U) - (T1 >> 1U);
 974     /* Re-read xb into T0 and yb into T1: both were overwritten by the (b + d) sums above */
 975     T0 = pSrc16[i1 * 2U];
 976     T1 = pSrc16[(i1 * 2U) + 1U];
 977
 978     /*  writing the butterfly processed i0 + n2 sample */
 979     /* xc' = (xa-xb+xc-xd) */
 980     /* yc' = (ya-yb+yc-yd) */
 981     pSrc16[i1 * 2U] = R0;
 982     pSrc16[(i1 * 2U) + 1U] = R1;
 983
 984     /* Read xd (real) into U0, yd (imag) into U1 */
 985     U0 = pSrc16[i3 * 2U];
 986     U1 = pSrc16[(i3 * 2U) + 1U];
 987     /* T0 = (xb - xd), T1 = (yb - yd)  */
 988     T0 = __SSAT(T0 - U0, 16U);
 989     T1 = __SSAT(T1 - U1, 16U);
 990
 991     /*  writing the butterfly processed i0 + 2*n2 sample */
 992     /* xb' = (xa+yb-xc-yd) */
 993     /* yb' = (ya-xb-yc+xd) */
 994     pSrc16[i2 * 2U] = (S0 >> 1U) + (T1 >> 1U);
 995     pSrc16[(i2 * 2U) + 1U] = (S1 >> 1U) - (T0 >> 1U);
 996
 997     /*  writing the butterfly processed i0 + 3*n2 sample */
 998     /* xd' = (xa-yb-xc+yd) */
 999     /* yd' = (ya+xb-yc-xd) */
1000     pSrc16[i3 * 2U] = (S0 >> 1U) - (T1 >> 1U);
1001     pSrc16[(i3 * 2U) + 1U] = (S1 >> 1U) + (T0 >> 1U);
1002
1003   }
1004
1005   /* end of last stage process */
1006
1007   /* output is in 11.5(q5) format for the 1024 point */
1008   /* output is in 9.7(q7) format for the 256 point   */
1009   /* output is in 7.9(q9) format for the 64 point  */
1010   /* output is in 5.11(q11) format for the 16 point  */
1011
1012 #endif /* #if defined (ARM_MATH_DSP) */
1013
1014 }
1015
1016
1017 /**
1018  * @brief  Core function for the Q15 CIFFT butterfly process.
1019  * @param[in, out] *pSrc16          points to the in-place buffer of Q15 data type.
1020  * @param[in]      fftLen           length of the FFT.
1021  * @param[in]      *pCoef16         points to twiddle coefficient buffer.
1022  * @param[in]      twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
1023  * @return none.
1024  */
1025
1026 /*
1027 * Radix-4 IFFT algorithm used is :
1028 *
1029 * CIFFT uses same twiddle coefficients as CFFT function
1030 *  x[k] = x[n] + (j)^k * x[n + fftLen/4] + (-1)^k * x[n+fftLen/2] + (-j)^k * x[n+3*fftLen/4]
1031 *
1032 *
1033 * IFFT is implemented with following changes in equations from FFT
1034 *
1035 * Input real and imaginary data:
1036 * x(n) = xa + j * ya
1037 * x(n+N/4 ) = xb + j * yb
1038 * x(n+N/2 ) = xc + j * yc
1039 * x(n+3N/4) = xd + j * yd
1040 *
1041 *
1042 * Output real and imaginary data:
1043 * x(4r) = xa'+ j * ya'
1044 * x(4r+1) = xb'+ j * yb'
1045 * x(4r+2) = xc'+ j * yc'
1046 * x(4r+3) = xd'+ j * yd'
1047 *
1048 *
1049 * Twiddle factors for radix-4 IFFT:
1050 * Wn = co1 + j * (si1)
1051 * W2n = co2 + j * (si2)
1052 * W3n = co3 + j * (si3)
1053
1054 * The real and imaginary output values for the radix-4 butterfly are
1055 * xa' = xa + xb + xc + xd
1056 * ya' = ya + yb + yc + yd
1057 * xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)
1058 * yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)
1059 * xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)
1060 * yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)
1061 * xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)
1062 * yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)
1063 *
1064 */
1065
1066 void arm_radix4_butterfly_inverse_q15(
1067   q15_t * pSrc16,
1068   uint32_t fftLen,
1069   q15_t * pCoef16,
1070   uint32_t twidCoefModifier)
1071 {
1072   /* In-place radix-4 inverse butterflies on pSrc16: first stage, middle stages, then a last stage that uses no twiddle factors */
1073 #if defined (ARM_MATH_DSP)
1074   /* This branch works on 32-bit words that each pack two q15 values (one complex sample) */
1075   /* Run the below code for Cortex-M4 and Cortex-M3 */
1076
1077   q31_t R, S, T, U;
1078   q31_t C1, C2, C3, out1, out2;
1079   uint32_t n1, n2, ic, i0, j, k;
1080
1081   q15_t *ptr1;
1082   q15_t *pSi0;
1083   q15_t *pSi1;
1084   q15_t *pSi2;
1085   q15_t *pSi3;
1086
1087   q31_t xaya, xbyb, xcyc, xdyd;
1088
1089   /* Total process is divided into three stages */
1090
1091   /* process first stage, middle stages, & last stage */
1092
1093   /*  Initializations for the first stage */
1094   n2 = fftLen;
1095   n1 = n2;
1096
1097   /* n2 = fftLen/4 */
1098   n2 >>= 2U;
1099
1100   /* Index for twiddle coefficient */
1101   ic = 0U;
1102
1103   /* Number of first-stage butterflies (loop counter, counted down to zero) */
1104   j = n2;
1105
1106   pSi0 = pSrc16;
1107   pSi1 = pSi0 + 2 * n2;
1108   pSi2 = pSi1 + 2 * n2;
1109   pSi3 = pSi2 + 2 * n2;
1110
1111   /* Input is in 1.15(q15) format */
1112
1113   /*  start of first stage process */
1114   do
1115   {
1116     /*  Butterfly implementation */
1117     /* Every input word goes twice through __SHADD16(x, 0) (halving add), i.e. both halves are scaled down by 4 */
1118     /*  Reading i0, i0+fftLen/2 inputs */
1119     /* Read xa (real), ya (imag) input */
1120     T = _SIMD32_OFFSET(pSi0);
1121     T = __SHADD16(T, 0);
1122     T = __SHADD16(T, 0);
1123
1124     /* Read xc (real), yc (imag) input */
1125     S = _SIMD32_OFFSET(pSi2);
1126     S = __SHADD16(S, 0);
1127     S = __SHADD16(S, 0);
1128
1129     /* R = packed((ya + yc), (xa + xc) ) */
1130     R = __QADD16(T, S);
1131
1132     /* S = packed((ya - yc), (xa - xc) ) */
1133     S = __QSUB16(T, S);
1134
1135     /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
1136     /* Read xb (real), yb (imag) input */
1137     T = _SIMD32_OFFSET(pSi1);
1138     T = __SHADD16(T, 0);
1139     T = __SHADD16(T, 0);
1140
1141     /* Read xd (real), yd (imag) input */
1142     U = _SIMD32_OFFSET(pSi3);
1143     U = __SHADD16(U, 0);
1144     U = __SHADD16(U, 0);
1145
1146     /* T = packed((yb + yd), (xb + xd) ) */
1147     T = __QADD16(T, U);
1148
1149     /*  writing the butterfly processed i0 sample */
1150     /* xa' = xa + xb + xc + xd */
1151     /* ya' = ya + yb + yc + yd */
1152     _SIMD32_OFFSET(pSi0) = __SHADD16(R, T);
1153     pSi0 += 2;
1154
1155     /* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
1156     R = __QSUB16(R, T);
1157
1158     /* co2 & si2 are read from SIMD Coefficient pointer */
1159     C2 = _SIMD32_OFFSET(pCoef16 + (4U * ic));
1160
1161 #ifndef ARM_MATH_BIG_ENDIAN
1162
1163     /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
1164     out1 = __SMUSD(C2, R) >> 16U;
1165     /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
1166     out2 = __SMUADX(C2, R);
1167
1168 #else
1169
1170     /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
1171     out1 = __SMUADX(C2, R) >> 16U;
1172     /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
1173     out2 = __SMUSD(__QSUB16(0, C2), R);
1174
1175 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1176
1177     /*  Reading i0+fftLen/4 */
1178     /* T = packed(yb, xb), re-read because pSi1 is overwritten just below */
1179     T = _SIMD32_OFFSET(pSi1);
1180     T = __SHADD16(T, 0);
1181     T = __SHADD16(T, 0);
1182
1183     /* writing the butterfly processed i0 + fftLen/4 sample */
1184     /* writing output(xc', yc'): out2 supplies the upper halfword, out1 the lower halfword */
1185     _SIMD32_OFFSET(pSi1) =
1186       (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
1187     pSi1 += 2;
1188
1189     /*  Butterfly calculations */
1190     /* U = packed(yd, xd) */
1191     U = _SIMD32_OFFSET(pSi3);
1192     U = __SHADD16(U, 0);
1193     U = __SHADD16(U, 0);
1194
1195     /* T = packed(yb-yd, xb-xd) */
1196     T = __QSUB16(T, U);
1197
1198 #ifndef ARM_MATH_BIG_ENDIAN
1199
1200     /* R = packed((ya-yc) - (xb- xd) , (xa-xc) + (yb-yd)) */
1201     R = __QSAX(S, T);
1202     /* S = packed((ya-yc) + (xb- xd),  (xa-xc) - (yb-yd)) */
1203     S = __QASX(S, T);
1204
1205 #else
1206
1207     /* R holds (ya-yc) - (xb- xd) and (xa-xc) + (yb-yd) */
1208     R = __QASX(S, T);
1209     /* S holds (ya-yc) + (xb- xd) and (xa-xc) - (yb-yd) */
1210     S = __QSAX(S, T);
1211
1212 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1213
1214     /* co1 & si1 are read from SIMD Coefficient pointer */
1215     C1 = _SIMD32_OFFSET(pCoef16 + (2U * ic));
1216     /*  Butterfly process for the i0+fftLen/2 sample */
1217
1218 #ifndef ARM_MATH_BIG_ENDIAN
1219
1220     /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
1221     out1 = __SMUSD(C1, S) >> 16U;
1222     /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
1223     out2 = __SMUADX(C1, S);
1224
1225 #else
1226
1227     /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
1228     out1 = __SMUADX(C1, S) >> 16U;
1229     /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
1230     out2 = __SMUSD(__QSUB16(0, C1), S);
1231
1232 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1233
1234     /* writing output(xb', yb'): out2 supplies the upper halfword, out1 the lower halfword */
1235     _SIMD32_OFFSET(pSi2) =
1236       ((out2) & 0xFFFF0000) | ((out1) & 0x0000FFFF);
1237     pSi2 += 2;
1238
1239
1240     /* co3 & si3 are read from SIMD Coefficient pointer */
1241     C3 = _SIMD32_OFFSET(pCoef16 + (6U * ic));
1242     /*  Butterfly process for the i0+3fftLen/4 sample */
1243
1244 #ifndef ARM_MATH_BIG_ENDIAN
1245
1246     /* xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3) */
1247     out1 = __SMUSD(C3, R) >> 16U;
1248     /* yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3) */
1249     out2 = __SMUADX(C3, R);
1250
1251 #else
1252
1253     /* yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3) */
1254     out1 = __SMUADX(C3, R) >> 16U;
1255     /* xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3) */
1256     out2 = __SMUSD(__QSUB16(0, C3), R);
1257
1258 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1259
1260     /* writing output(xd', yd'): out2 supplies the upper halfword, out1 the lower halfword */
1261     _SIMD32_OFFSET(pSi3) =
1262       ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
1263     pSi3 += 2;
1264
1265     /*  Twiddle coefficients index modifier */
1266     ic = ic + twidCoefModifier;
1267
1268   } while (--j);
1269   /* data is in 4.11(q11) format */
1270
1271   /* end of first stage process */
1272
1273
1274   /* start of middle stage process */
1275
1276   /*  Twiddle coefficients index modifier */
1277   twidCoefModifier <<= 2U;
1278
1279   /*  Calculation of Middle stage */
1280   for (k = fftLen / 4U; k > 4U; k >>= 2U)
1281   {
1282     /*  Initializations for the middle stage */
1283     n1 = n2;
1284     n2 >>= 2U;
1285     ic = 0U;
1286
1287     for (j = 0U; j <= (n2 - 1U); j++)
1288     {
1289       /*  Load the three twiddle factors shared by every butterfly of this j */
1290       C1 = _SIMD32_OFFSET(pCoef16 + (2U * ic));
1291       C2 = _SIMD32_OFFSET(pCoef16 + (4U * ic));
1292       C3 = _SIMD32_OFFSET(pCoef16 + (6U * ic));
1293
1294       /*  Twiddle coefficients index modifier */
1295       ic = ic + twidCoefModifier;
1296
1297       pSi0 = pSrc16 + 2 * j;
1298       pSi1 = pSi0 + 2 * n2;
1299       pSi2 = pSi1 + 2 * n2;
1300       pSi3 = pSi2 + 2 * n2;
1301
1302       /*  Butterfly implementation */
1303       for (i0 = j; i0 < fftLen; i0 += n1)
1304       {
1305         /*  Reading the pSi0 and pSi2 inputs */
1306         /* Read xa (real), ya (imag) input */
1307         T = _SIMD32_OFFSET(pSi0);
1308
1309         /* Read xc (real), yc (imag) input */
1310         S = _SIMD32_OFFSET(pSi2);
1311
1312         /* R = packed( (ya + yc), (xa + xc)) */
1313         R = __QADD16(T, S);
1314
1315         /* S = packed((ya - yc), (xa - xc)) */
1316         S = __QSUB16(T, S);
1317
1318         /*  Reading the pSi1 and pSi3 inputs */
1319         /* Read xb (real), yb (imag) input */
1320         T = _SIMD32_OFFSET(pSi1);
1321
1322         /* Read xd (real), yd (imag) input */
1323         U = _SIMD32_OFFSET(pSi3);
1324
1325         /* T = packed( (yb + yd), (xb + xd)) */
1326         T = __QADD16(T, U);
1327
1328         /*  writing the butterfly processed i0 sample */
1329         /* the sum is halved twice (two __SHADD16 steps) before it is stored */
1330         /* xa' = xa + xb + xc + xd */
1331         /* ya' = ya + yb + yc + yd */
1332         out1 = __SHADD16(R, T);
1333         out1 = __SHADD16(out1, 0);
1334         _SIMD32_OFFSET(pSi0) = out1;
1335         pSi0 += 2 * n1;
1336
1337         /* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)), halved by __SHSUB16 */
1338         R = __SHSUB16(R, T);
1339
1340 #ifndef ARM_MATH_BIG_ENDIAN
1341
1342         /* (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
1343         out1 = __SMUSD(C2, R) >> 16U;
1344
1345         /* (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
1346         out2 = __SMUADX(C2, R);
1347
1348 #else
1349
1350         /* (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
1351         out1 = __SMUADX(R, C2) >> 16U;
1352
1353         /* (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
1354         out2 = __SMUSD(__QSUB16(0, C2), R);
1355
1356 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1357
1358         /*  Re-reading the pSi1 sample before it is overwritten below */
1359         /* Read xb (real), yb (imag) input */
1360         T = _SIMD32_OFFSET(pSi1);
1361
1362         /*  writing the butterfly processed pSi1 sample */
1363         /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
1364         /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
1365         _SIMD32_OFFSET(pSi1) =
1366           ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
1367         pSi1 += 2 * n1;
1368
1369         /*  Butterfly calculations */
1370
1371         /* Read xd (real), yd (imag) input */
1372         U = _SIMD32_OFFSET(pSi3);
1373
1374         /* T = packed(yb-yd, xb-xd) */
1375         T = __QSUB16(T, U);
1376
1377 #ifndef ARM_MATH_BIG_ENDIAN
1378
1379         /* R = packed((ya-yc) - (xb- xd) , (xa-xc) + (yb-yd)), each halved */
1380         R = __SHSAX(S, T);
1381
1382         /* S = packed((ya-yc) + (xb- xd),  (xa-xc) - (yb-yd)), each halved */
1383         S = __SHASX(S, T);
1384
1385
1386         /*  Butterfly process for the pSi2 sample */
1387         out1 = __SMUSD(C1, S) >> 16U;
1388         out2 = __SMUADX(C1, S);
1389
1390 #else
1391
1392         /* R holds (ya-yc) - (xb- xd) and (xa-xc) + (yb-yd), each halved */
1393         R = __SHASX(S, T);
1394
1395         /* S holds (ya-yc) + (xb- xd) and (xa-xc) - (yb-yd), each halved */
1396         S = __SHSAX(S, T);
1397
1398
1399         /*  Butterfly process for the pSi2 sample */
1400         out1 = __SMUADX(S, C1) >> 16U;
1401         out2 = __SMUSD(__QSUB16(0, C1), S);
1402
1403 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1404
1405         /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
1406         /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
1407         _SIMD32_OFFSET(pSi2) =
1408           ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
1409         pSi2 += 2 * n1;
1410
1411         /*  Butterfly process for the pSi3 sample */
1412
1413 #ifndef ARM_MATH_BIG_ENDIAN
1414
1415         out1 = __SMUSD(C3, R) >> 16U;
1416         out2 = __SMUADX(C3, R);
1417
1418 #else
1419
1420         out1 = __SMUADX(C3, R) >> 16U;
1421         out2 = __SMUSD(__QSUB16(0, C3), R);
1422
1423 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1424
1425         /* xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3) */
1426         /* yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3) */
1427         _SIMD32_OFFSET(pSi3) =
1428           ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
1429         pSi3 += 2 * n1;
1430       }
1431     }
1432     /*  Twiddle coefficients index modifier */
1433     twidCoefModifier <<= 2U;
1434   }
1435   /* end of middle stage process */
1436
1437   /* data is in 10.6(q6) format for the 1024 point */
1438   /* data is in 8.8(q8) format for the 256 point */
1439   /* data is in 6.10(q10) format for the 64 point */
1440   /* data is in 4.12(q12) format for the 16 point */
1441
1442   /*  Initializations for the last stage: fftLen/4 butterflies over consecutive samples */
1443   j = fftLen >> 2;
1444
1445   ptr1 = &pSrc16[0];
1446
1447   /* start of last stage process */
1448
1449   /*  Butterfly implementation */
1450   do
1451   {
1452     /* Read xa (real), ya(imag) input */
1453     xaya = *__SIMD32(ptr1)++;
1454
1455     /* Read xb (real), yb(imag) input */
1456     xbyb = *__SIMD32(ptr1)++;
1457
1458     /* Read xc (real), yc(imag) input */
1459     xcyc = *__SIMD32(ptr1)++;
1460
1461     /* Read xd (real), yd(imag) input */
1462     xdyd = *__SIMD32(ptr1)++;
1463
1464     /* R = packed((ya + yc), (xa + xc)) */
1465     R = __QADD16(xaya, xcyc);
1466
1467     /* T = packed((yb + yd), (xb + xd)) */
1468     T = __QADD16(xbyb, xdyd);
1469
1470     /* rewind by the 8 q15 values (4 complex samples) just read so the results overwrite them */
1471     ptr1 = ptr1 - 8U;
1472
1473
1474     /* xa' = xa + xb + xc + xd */
1475     /* ya' = ya + yb + yc + yd */
1476     *__SIMD32(ptr1)++ = __SHADD16(R, T);
1477
1478     /* T = packed((yb + yd), (xb + xd)) -- recomputed; same value as the T computed above */
1479     T = __QADD16(xbyb, xdyd);
1480
1481     /* xc' = (xa-xb+xc-xd) */
1482     /* yc' = (ya-yb+yc-yd) */
1483     *__SIMD32(ptr1)++ = __SHSUB16(R, T);
1484
1485     /* S = packed((ya - yc), (xa - xc)) */
1486     S = __QSUB16(xaya, xcyc);
1487
1488     /* No memory read here: the xbyb / xdyd values loaded above are reused */
1489     /* U = packed( (yb - yd), (xb - xd))  */
1490     U = __QSUB16(xbyb, xdyd);
1491
1492 #ifndef ARM_MATH_BIG_ENDIAN
1493
1494     /* xb' = (xa-yb-xc+yd) */
1495     /* yb' = (ya+xb-yc-xd) */
1496     *__SIMD32(ptr1)++ = __SHASX(S, U);
1497
1498
1499     /* xd' = (xa+yb-xc-yd) */
1500     /* yd' = (ya-xb-yc+xd) */
1501     *__SIMD32(ptr1)++ = __SHSAX(S, U);
1502
1503 #else
1504
1505     /* xb' = (xa-yb-xc+yd) */
1506     /* yb' = (ya+xb-yc-xd) */
1507     *__SIMD32(ptr1)++ = __SHSAX(S, U);
1508
1509
1510     /* xd' = (xa+yb-xc-yd) */
1511     /* yd' = (ya-xb-yc+xd) */
1512     *__SIMD32(ptr1)++ = __SHASX(S, U);
1513
1514
1515 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1516
1517   } while (--j);
1518
1519   /* end of last stage  process */
1520
1521   /* output is in 11.5(q5) format for the 1024 point */
1522   /* output is in 9.7(q7) format for the 256 point   */
1523   /* output is in 7.9(q9) format for the 64 point  */
1524   /* output is in 5.11(q11) format for the 16 point  */
1525
1526
1527 #else
1528   /* Scalar branch: the same three stages, one q15 value at a time (even index = x/real, odd index = y/imag) */
1529   /* Run the below code for Cortex-M0 */
1530
1531   q15_t R0, R1, S0, S1, T0, T1, U0, U1;
1532   q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
1533   uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
1534
1535   /* Total process is divided into three stages */
1536
1537   /* process first stage, middle stages, & last stage */
1538
1539   /*  Initializations for the first stage */
1540   n2 = fftLen;
1541   n1 = n2;
1542
1543   /* n2 = fftLen/4 */
1544   n2 >>= 2U;
1545
1546   /* Index for twiddle coefficient */
1547   ic = 0U;
1548
1549   /* Index for input read and output write */
1550   i0 = 0U;
1551
1552   j = n2;
1553
1554   /* Input is in 1.15(q15) format */
1555
1556   /*  Start of first stage process */
1557   do
1558   {
1559     /*  Butterfly implementation */
1560
1561     /*  index calculation for the input as, */
1562     /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
1563     i1 = i0 + n2;
1564     i2 = i1 + n2;
1565     i3 = i2 + n2;
1566
1567     /*  Reading i0, i0+fftLen/2 inputs */
1568     /* input is down scale by 4 to avoid overflow */
1569     /* Read xa (real), ya (imag) input */
1570     T0 = pSrc16[i0 * 2U] >> 2U;
1571     T1 = pSrc16[(i0 * 2U) + 1U] >> 2U;
1572     /* input is down scale by 4 to avoid overflow */
1573     /* Read xc (real), yc (imag) input */
1574     S0 = pSrc16[i2 * 2U] >> 2U;
1575     S1 = pSrc16[(i2 * 2U) + 1U] >> 2U;
1576
1577     /* R0 = (xa + xc), R1 = (ya + yc) */
1578     R0 = __SSAT(T0 + S0, 16U);
1579     R1 = __SSAT(T1 + S1, 16U);
1580     /* S0 = (xa - xc), S1 = (ya - yc) */
1581     S0 = __SSAT(T0 - S0, 16U);
1582     S1 = __SSAT(T1 - S1, 16U);
1583
1584     /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
1585     /* input is down scale by 4 to avoid overflow */
1586     /* Read xb (real), yb (imag) input */
1587     T0 = pSrc16[i1 * 2U] >> 2U;
1588     T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;
1589     /* Read xd (real), yd (imag) input */
1590     /* input is down scale by 4 to avoid overflow */
1591     U0 = pSrc16[i3 * 2U] >> 2U;
1592     U1 = pSrc16[(i3 * 2U) + 1U] >> 2U;
1593
1594     /* T0 = (xb + xd), T1 = (yb + yd) */
1595     T0 = __SSAT(T0 + U0, 16U);
1596     T1 = __SSAT(T1 + U1, 16U);
1597
1598     /*  writing the butterfly processed i0 sample */
1599     /* xa' = xa + xb + xc + xd */
1600     /* ya' = ya + yb + yc + yd */
1601     pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
1602     pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
1603
1604     /* R0 = (xa + xc) - (xb + xd), R1 = (ya + yc) - (yb + yd) */
1605     R0 = __SSAT(R0 - T0, 16U);
1606     R1 = __SSAT(R1 - T1, 16U);
1607     /* co2 & si2 are read from Coefficient pointer */
1608     Co2 = pCoef16[2U * ic * 2U];
1609     Si2 = pCoef16[(2U * ic * 2U) + 1U];
1610     /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
1611     out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16U);
1612     /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
1613     out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16U);
1614
1615     /*  Reading i0+fftLen/4 again, before it is overwritten below */
1616     /* input is down scale by 4 to avoid overflow */
1617     /* T0 = xb, T1 = yb */
1618     T0 = pSrc16[i1 * 2U] >> 2U;
1619     T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;
1620
1621     /* writing the butterfly processed i0 + fftLen/4 sample */
1622     /* writing output(xc', yc') */
1623     pSrc16[i1 * 2U] = out1;
1624     pSrc16[(i1 * 2U) + 1U] = out2;
1625
1626     /*  Butterfly calculations */
1627     /* input is down scale by 4 to avoid overflow */
1628     /* U0 = xd, U1 = yd */
1629     U0 = pSrc16[i3 * 2U] >> 2U;
1630     U1 = pSrc16[(i3 * 2U) + 1U] >> 2U;
1631
1632     /* T0 = xb-xd, T1 = yb-yd */
1633     T0 = __SSAT(T0 - U0, 16U);
1634     T1 = __SSAT(T1 - U1, 16U);
1635     /* R0 = (xa-xc) + (yb-yd), R1 = (ya-yc) - (xb-xd) */
1636     R0 = (q15_t) __SSAT((q31_t) (S0 + T1), 16);
1637     R1 = (q15_t) __SSAT((q31_t) (S1 - T0), 16);
1638     /* S0 = (xa-xc) - (yb-yd), S1 = (ya-yc) + (xb-xd) */
1639     S0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16);
1640     S1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16);
1641
1642     /* co1 & si1 are read from Coefficient pointer */
1643     Co1 = pCoef16[ic * 2U];
1644     Si1 = pCoef16[(ic * 2U) + 1U];
1645     /*  Butterfly process for the i0+fftLen/2 sample */
1646     /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
1647     out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16U);
1648     /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
1649     out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16U);
1650     /* writing output(xb', yb') */
1651     pSrc16[i2 * 2U] = out1;
1652     pSrc16[(i2 * 2U) + 1U] = out2;
1653
1654     /* Co3 & si3 are read from Coefficient pointer */
1655     Co3 = pCoef16[3U * ic * 2U];
1656     Si3 = pCoef16[(3U * ic * 2U) + 1U];
1657     /*  Butterfly process for the i0+3fftLen/4 sample */
1658     /* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
1659     out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16U);
1660     /* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
1661     out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16U);
1662     /* writing output(xd', yd') */
1663     pSrc16[i3 * 2U] = out1;
1664     pSrc16[(i3 * 2U) + 1U] = out2;
1665
1666     /*  Twiddle coefficients index modifier */
1667     ic = ic + twidCoefModifier;
1668
1669     /*  Updating input index */
1670     i0 = i0 + 1U;
1671
1672   } while (--j);
1673
1674   /*  End of first stage process */
1675
1676   /* data is in 4.11(q11) format */
1677
1678
1679   /*  Start of Middle stage process */
1680
1681   /*  Twiddle coefficients index modifier */
1682   twidCoefModifier <<= 2U;
1683
1684   /*  Calculation of Middle stage */
1685   for (k = fftLen / 4U; k > 4U; k >>= 2U)
1686   {
1687     /*  Initializations for the middle stage */
1688     n1 = n2;
1689     n2 >>= 2U;
1690     ic = 0U;
1691
1692     for (j = 0U; j <= (n2 - 1U); j++)
1693     {
1694       /*  Load the three twiddle factors shared by every butterfly of this j */
1695       Co1 = pCoef16[ic * 2U];
1696       Si1 = pCoef16[(ic * 2U) + 1U];
1697       Co2 = pCoef16[2U * ic * 2U];
1698       Si2 = pCoef16[2U * ic * 2U + 1U];
1699       Co3 = pCoef16[3U * ic * 2U];
1700       Si3 = pCoef16[(3U * ic * 2U) + 1U];
1701
1702       /*  Twiddle coefficients index modifier */
1703       ic = ic + twidCoefModifier;
1704
1705       /*  Butterfly implementation */
1706       for (i0 = j; i0 < fftLen; i0 += n1)
1707       {
1708         /*  index calculation for the input as, */
1709         /*  pSrc16[i0 + 0], pSrc16[i0 + n2], pSrc16[i0 + 2*n2], pSrc16[i0 + 3*n2] */
1710         i1 = i0 + n2;
1711         i2 = i1 + n2;
1712         i3 = i2 + n2;
1713
1714         /*  Reading i0, i2 inputs */
1715         /* Read xa (real), ya (imag) input */
1716         T0 = pSrc16[i0 * 2U];
1717         T1 = pSrc16[(i0 * 2U) + 1U];
1718
1719         /* Read xc (real), yc (imag) input */
1720         S0 = pSrc16[i2 * 2U];
1721         S1 = pSrc16[(i2 * 2U) + 1U];
1722
1723
1724         /* R0 = (xa + xc), R1 = (ya + yc) */
1725         R0 = __SSAT(T0 + S0, 16U);
1726         R1 = __SSAT(T1 + S1, 16U);
1727         /* S0 = (xa - xc), S1 = (ya - yc) */
1728         S0 = __SSAT(T0 - S0, 16U);
1729         S1 = __SSAT(T1 - S1, 16U);
1730
1731         /*  Reading i1, i3 inputs */
1732         /* Read xb (real), yb (imag) input */
1733         T0 = pSrc16[i1 * 2U];
1734         T1 = pSrc16[(i1 * 2U) + 1U];
1735
1736         /* Read xd (real), yd (imag) input */
1737         U0 = pSrc16[i3 * 2U];
1738         U1 = pSrc16[(i3 * 2U) + 1U];
1739
1740         /* T0 = (xb + xd), T1 = (yb + yd) */
1741         T0 = __SSAT(T0 + U0, 16U);
1742         T1 = __SSAT(T1 + U1, 16U);
1743
1744         /*  writing the butterfly processed i0 sample */
1745         /* xa' = xa + xb + xc + xd */
1746         /* ya' = ya + yb + yc + yd */
1747         pSrc16[i0 * 2U] = ((R0 >> 1U) + (T0 >> 1U)) >> 1U;
1748         pSrc16[(i0 * 2U) + 1U] = ((R1 >> 1U) + (T1 >> 1U)) >> 1U;
1749
1750         /* R0 = (xa + xc) - (xb + xd), R1 = (ya + yc) - (yb + yd), operands halved first */
1751         R0 = (R0 >> 1U) - (T0 >> 1U);
1752         R1 = (R1 >> 1U) - (T1 >> 1U);
1753
1754         /* (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
1755         out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16);
1756         /* (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
1757         out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16);
1758
1759         /*  Re-reading the i1 sample before it is overwritten below */
1760         /* Read xb (real), yb (imag) input */
1761         T0 = pSrc16[i1 * 2U];
1762         T1 = pSrc16[(i1 * 2U) + 1U];
1763
1764         /*  writing the butterfly processed i1 sample */
1765         /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
1766         /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
1767         pSrc16[i1 * 2U] = out1;
1768         pSrc16[(i1 * 2U) + 1U] = out2;
1769
1770         /*  Butterfly calculations */
1771         /* Read xd (real), yd (imag) input */
1772         U0 = pSrc16[i3 * 2U];
1773         U1 = pSrc16[(i3 * 2U) + 1U];
1774
1775         /* T0 = xb-xd, T1 = yb-yd */
1776         T0 = __SSAT(T0 - U0, 16U);
1777         T1 = __SSAT(T1 - U1, 16U);
1778
1779         /* R0 = (xa-xc) + (yb-yd), R1 = (ya-yc) - (xb-xd), operands halved first */
1780         R0 = (S0 >> 1U) + (T1 >> 1U);
1781         R1 = (S1 >> 1U) - (T0 >> 1U);
1782
1783         /* S0 = (xa-xc) - (yb-yd), S1 = (ya-yc) + (xb-xd), operands halved first */
1784         S0 = (S0 >> 1U) - (T1 >> 1U);
1785         S1 = (S1 >> 1U) + (T0 >> 1U);
1786
1787         /*  Butterfly process for the i2 sample */
1788         out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16U);
1789         out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16U);
1790         /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
1791         /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
1792         pSrc16[i2 * 2U] = out1;
1793         pSrc16[(i2 * 2U) + 1U] = out2;
1794
1795         /*  Butterfly process for the i3 sample */
1796         out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16U);
1797
1798         out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16U);
1799         /* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
1800         /* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
1801         pSrc16[i3 * 2U] = out1;
1802         pSrc16[(i3 * 2U) + 1U] = out2;
1803
1804
1805       }
1806     }
1807     /*  Twiddle coefficients index modifier */
1808     twidCoefModifier <<= 2U;
1809   }
1810   /*  End of Middle stages process */
1811
1812
1813   /* data is in 10.6(q6) format for the 1024 point */
1814   /* data is in 8.8(q8) format for the 256 point   */
1815   /* data is in 6.10(q10) format for the 64 point  */
1816   /* data is in 4.12(q12) format for the 16 point  */
1817
1818   /* start of last stage process */
1819
1820
1821   /*  Initializations for the last stage */
1822   n1 = n2;
1823   n2 >>= 2U;
1824
1825   /*  Butterfly implementation (no twiddle multiplication in this stage) */
1826   for (i0 = 0U; i0 <= (fftLen - n1); i0 += n1)
1827   {
1828     /*  index calculation for the input as, */
1829     /*  pSrc16[i0 + 0], pSrc16[i0 + n2], pSrc16[i0 + 2*n2], pSrc16[i0 + 3*n2] */
1830     i1 = i0 + n2;
1831     i2 = i1 + n2;
1832     i3 = i2 + n2;
1833
1834     /*  Reading i0, i2 inputs */
1835     /* Read xa (real), ya (imag) input */
1836     T0 = pSrc16[i0 * 2U];
1837     T1 = pSrc16[(i0 * 2U) + 1U];
1838     /* Read xc (real), yc (imag) input */
1839     S0 = pSrc16[i2 * 2U];
1840     S1 = pSrc16[(i2 * 2U) + 1U];
1841
1842     /* R0 = (xa + xc), R1 = (ya + yc) */
1843     R0 = __SSAT(T0 + S0, 16U);
1844     R1 = __SSAT(T1 + S1, 16U);
1845     /* S0 = (xa - xc), S1 = (ya - yc) */
1846     S0 = __SSAT(T0 - S0, 16U);
1847     S1 = __SSAT(T1 - S1, 16U);
1848
1849     /*  Reading i1, i3 inputs */
1850     /* Read xb (real), yb (imag) input */
1851     T0 = pSrc16[i1 * 2U];
1852     T1 = pSrc16[(i1 * 2U) + 1U];
1853     /* Read xd (real), yd (imag) input */
1854     U0 = pSrc16[i3 * 2U];
1855     U1 = pSrc16[(i3 * 2U) + 1U];
1856
1857     /* T0 = (xb + xd), T1 = (yb + yd) */
1858     T0 = __SSAT(T0 + U0, 16U);
1859     T1 = __SSAT(T1 + U1, 16U);
1860
1861     /*  writing the butterfly processed i0 sample */
1862     /* xa' = xa + xb + xc + xd */
1863     /* ya' = ya + yb + yc + yd */
1864     pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
1865     pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
1866
1867     /* R0 = (xa + xc) - (xb + xd), R1 = (ya + yc) - (yb + yd), operands halved first */
1868     R0 = (R0 >> 1U) - (T0 >> 1U);
1869     R1 = (R1 >> 1U) - (T1 >> 1U);
1870
1871     /* Read xb (real), yb (imag) input again, before i1 is overwritten below */
1872     T0 = pSrc16[i1 * 2U];
1873     T1 = pSrc16[(i1 * 2U) + 1U];
1874
1875     /*  writing the butterfly processed i1 sample */
1876     /* xc' = (xa-xb+xc-xd) */
1877     /* yc' = (ya-yb+yc-yd) */
1878     pSrc16[i1 * 2U] = R0;
1879     pSrc16[(i1 * 2U) + 1U] = R1;
1880
1881     /* Read xd (real), yd (imag) input */
1882     U0 = pSrc16[i3 * 2U];
1883     U1 = pSrc16[(i3 * 2U) + 1U];
1884     /* T0 = (xb - xd), T1 = (yb - yd) */
1885     T0 = __SSAT(T0 - U0, 16U);
1886     T1 = __SSAT(T1 - U1, 16U);
1887
1888     /*  writing the butterfly processed i2 sample */
1889     /* xb' = (xa-yb-xc+yd) */
1890     /* yb' = (ya+xb-yc-xd) */
1891     pSrc16[i2 * 2U] = (S0 >> 1U) - (T1 >> 1U);
1892     pSrc16[(i2 * 2U) + 1U] = (S1 >> 1U) + (T0 >> 1U);
1893
1894
1895     /*  writing the butterfly processed i3 sample */
1896     /* xd' = (xa+yb-xc-yd) */
1897     /* yd' = (ya-xb-yc+xd) */
1898     pSrc16[i3 * 2U] = (S0 >> 1U) + (T1 >> 1U);
1899     pSrc16[(i3 * 2U) + 1U] = (S1 >> 1U) - (T0 >> 1U);
1900   }
1901   /* end of last stage  process */
1902
1903   /* output is in 11.5(q5) format for the 1024 point */
1904   /* output is in 9.7(q7) format for the 256 point   */
1905   /* output is in 7.9(q9) format for the 64 point  */
1906   /* output is in 5.11(q11) format for the 16 point  */
1907
1908 #endif /* #if defined (ARM_MATH_DSP) */
1909
1910 }