lib/main/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q31.c

   1 /* ----------------------------------------------------------------------
   2  * Project:      CMSIS DSP Library
   3  * Title:        arm_cfft_radix4_q31.c
   4  * Description:  This file has function definition of Radix-4 FFT & IFFT function and
   5  *               In-place bit reversal using bit reversal table
   6  *
   7  * $Date:        27. January 2017
   8  * $Revision:    V.1.5.1
   9  *
  10  * Target Processor: Cortex-M cores
  11  * -------------------------------------------------------------------- */
  12 /*
  13  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
  14  *
  15  * SPDX-License-Identifier: Apache-2.0
  16  *
  17  * Licensed under the Apache License, Version 2.0 (the License); you may
  18  * not use this file except in compliance with the License.
  19  * You may obtain a copy of the License at
  20  *
  21  * www.apache.org/licenses/LICENSE-2.0
  22  *
  23  * Unless required by applicable law or agreed to in writing, software
  24  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  25  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  26  * See the License for the specific language governing permissions and
  27  * limitations under the License.
  28  */
  29
  30 #include "arm_math.h"
  31
  32 void arm_radix4_butterfly_inverse_q31(
  33 q31_t * pSrc,
  34 uint32_t fftLen,
  35 q31_t * pCoef,
  36 uint32_t twidCoefModifier);
  37
  38 void arm_radix4_butterfly_q31(
  39 q31_t * pSrc,
  40 uint32_t fftLen,
  41 q31_t * pCoef,
  42 uint32_t twidCoefModifier);
  43
  44 void arm_bitreversal_q31(
  45 q31_t * pSrc,
  46 uint32_t fftLen,
  47 uint16_t bitRevFactor,
  48 uint16_t * pBitRevTab);
  49
  50 /**
  51  * @ingroup groupTransforms
  52  */
  53
  54 /**
  55  * @addtogroup ComplexFFT
  56  * @{
  57  */
  58
  59 /**
  60  * @details
  61  * @brief Processing function for the Q31 CFFT/CIFFT.
  62  * @deprecated Do not use this function.  It has been superseded by \ref arm_cfft_q31 and will be removed
  63  * @param[in]      *S    points to an instance of the Q31 CFFT/CIFFT structure.
  64  * @param[in, out] *pSrc points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place.
  65  * @return none.
  66  *
  67  * \par Input and output formats:
  68  * \par
  69  * Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
  70  * Hence the output format is different for different FFT sizes.
  71  * The input and output formats for different FFT sizes and number of bits to upscale are mentioned in the tables below for CFFT and CIFFT:
  72  * \par
  73  * \image html CFFTQ31.gif "Input and Output Formats for Q31 CFFT"
  74  * \image html CIFFTQ31.gif "Input and Output Formats for Q31 CIFFT"
  75  *
  76  */
  77
  78 void arm_cfft_radix4_q31(
  79   const arm_cfft_radix4_instance_q31 * S,
  80   q31_t * pSrc)
  81 {
  82   if (S->ifftFlag == 1U)
  83   {
  84     /* Complex IFFT radix-4 */
  85     arm_radix4_butterfly_inverse_q31(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
  86   }
  87   else
  88   {
  89     /* Complex FFT radix-4 */
  90     arm_radix4_butterfly_q31(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
  91   }
  92
  93   if (S->bitReverseFlag == 1U)
  94   {
  95     /*  Bit Reversal */
  96     arm_bitreversal_q31(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
  97   }
  98
  99 }
 100
 101 /**
 102  * @} end of ComplexFFT group
 103  */
 104
 105 /*
 106 * Radix-4 FFT algorithm used is :
 107 *
 108 * Input real and imaginary data:
 109 * x(n) = xa + j * ya
 110 * x(n+N/4 ) = xb + j * yb
 111 * x(n+N/2 ) = xc + j * yc
 112 * x(n+3N 4) = xd + j * yd
 113 *
 114 *
 115 * Output real and imaginary data:
 116 * x(4r) = xa'+ j * ya'
 117 * x(4r+1) = xb'+ j * yb'
 118 * x(4r+2) = xc'+ j * yc'
 119 * x(4r+3) = xd'+ j * yd'
 120 *
 121 *
 122 * Twiddle factors for radix-4 FFT:
 123 * Wn = co1 + j * (- si1)
 124 * W2n = co2 + j * (- si2)
 125 * W3n = co3 + j * (- si3)
 126 *
 127 *  Butterfly implementation:
 128 * xa' = xa + xb + xc + xd
 129 * ya' = ya + yb + yc + yd
 130 * xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1)
 131 * yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1)
 132 * xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2)
 133 * yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2)
 134 * xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3)
 135 * yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3)
 136 *
 137 */
 138
 139 /**
 140  * @brief  Core function for the Q31 CFFT butterfly process.
 141  * @param[in, out] *pSrc            points to the in-place buffer of Q31 data type.
 142  * @param[in]      fftLen           length of the FFT.
 143  * @param[in]      *pCoef           points to twiddle coefficient buffer.
 144  * @param[in]      twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
 145  * @return none.
 146  */
 147
 148 void arm_radix4_butterfly_q31(
 149   q31_t * pSrc,
 150   uint32_t fftLen,
 151   q31_t * pCoef,
 152   uint32_t twidCoefModifier)
 153 {
 154 #if defined(ARM_MATH_CM7)
 155   uint32_t n1, n2, ia1, ia2, ia3, i0, i1, i2, i3, j, k;
 156   q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
 157
 158   q31_t xa, xb, xc, xd;
 159   q31_t ya, yb, yc, yd;
 160   q31_t xa_out, xb_out, xc_out, xd_out;
 161   q31_t ya_out, yb_out, yc_out, yd_out;
 162
 163   q31_t *ptr1;
 164   q63_t xaya, xbyb, xcyc, xdyd;
 165   /* Total process is divided into three stages */
 166
 167   /* process first stage, middle stages, & last stage */
 168
 169
 170   /* start of first stage process */
 171
 172   /*  Initializations for the first stage */
 173   n2 = fftLen;
 174   n1 = n2;
 175   /* n2 = fftLen/4 */
 176   n2 >>= 2U;
 177   i0 = 0U;
 178   ia1 = 0U;
 179
 180   j = n2;
 181
 182   /*  Calculation of first stage */
 183   do
 184   {
 185     /*  index calculation for the input as, */
 186     /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
 187     i1 = i0 + n2;
 188     i2 = i1 + n2;
 189     i3 = i2 + n2;
 190
 191     /* input is in 1.31(q31) format and provide 4 guard bits for the input */
 192
 193     /*  Butterfly implementation */
 194     /* xa + xc */
 195     r1 = (pSrc[(2U * i0)] >> 4U) + (pSrc[(2U * i2)] >> 4U);
 196     /* xa - xc */
 197     r2 = (pSrc[2U * i0] >> 4U) - (pSrc[2U * i2] >> 4U);
 198
 199     /* xb + xd */
 200     t1 = (pSrc[2U * i1] >> 4U) + (pSrc[2U * i3] >> 4U);
 201
 202     /* ya + yc */
 203     s1 = (pSrc[(2U * i0) + 1U] >> 4U) + (pSrc[(2U * i2) + 1U] >> 4U);
 204     /* ya - yc */
 205     s2 = (pSrc[(2U * i0) + 1U] >> 4U) - (pSrc[(2U * i2) + 1U] >> 4U);
 206
 207     /* xa' = xa + xb + xc + xd */
 208     pSrc[2U * i0] = (r1 + t1);
 209     /* (xa + xc) - (xb + xd) */
 210     r1 = r1 - t1;
 211     /* yb + yd */
 212     t2 = (pSrc[(2U * i1) + 1U] >> 4U) + (pSrc[(2U * i3) + 1U] >> 4U);
 213
 214     /* ya' = ya + yb + yc + yd */
 215     pSrc[(2U * i0) + 1U] = (s1 + t2);
 216
 217     /* (ya + yc) - (yb + yd) */
 218     s1 = s1 - t2;
 219
 220     /* yb - yd */
 221     t1 = (pSrc[(2U * i1) + 1U] >> 4U) - (pSrc[(2U * i3) + 1U] >> 4U);
 222     /* xb - xd */
 223     t2 = (pSrc[2U * i1] >> 4U) - (pSrc[2U * i3] >> 4U);
 224
 225     /*  index calculation for the coefficients */
 226     ia2 = 2U * ia1;
 227     co2 = pCoef[ia2 * 2U];
 228     si2 = pCoef[(ia2 * 2U) + 1U];
 229
 230     /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
 231     pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
 232                      ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1U;
 233
 234     /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
 235     pSrc[(2U * i1) + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
 236                             ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1U;
 237
 238     /* (xa - xc) + (yb - yd) */
 239     r1 = r2 + t1;
 240     /* (xa - xc) - (yb - yd) */
 241     r2 = r2 - t1;
 242
 243     /* (ya - yc) - (xb - xd) */
 244     s1 = s2 - t2;
 245     /* (ya - yc) + (xb - xd) */
 246     s2 = s2 + t2;
 247
 248     co1 = pCoef[ia1 * 2U];
 249     si1 = pCoef[(ia1 * 2U) + 1U];
 250
 251     /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
 252     pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
 253                      ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1U;
 254
 255     /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
 256     pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
 257                             ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1U;
 258
 259     /*  index calculation for the coefficients */
 260     ia3 = 3U * ia1;
 261     co3 = pCoef[ia3 * 2U];
 262     si3 = pCoef[(ia3 * 2U) + 1U];
 263
 264     /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
 265     pSrc[2U * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
 266                      ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1U;
 267
 268     /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
 269     pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
 270                             ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1U;
 271
 272     /*  Twiddle coefficients index modifier */
 273     ia1 = ia1 + twidCoefModifier;
 274
 275     /*  Updating input index */
 276     i0 = i0 + 1U;
 277
 278   } while (--j);
 279
 280   /* end of first stage process */
 281
 282   /* data is in 5.27(q27) format */
 283
 284
 285   /* start of Middle stages process */
 286
 287
 288   /* each stage in middle stages provides two down scaling of the input */
 289
 290   twidCoefModifier <<= 2U;
 291
 292
 293   for (k = fftLen / 4U; k > 4U; k >>= 2U)
 294   {
 295     /*  Initializations for the first stage */
 296     n1 = n2;
 297     n2 >>= 2U;
 298     ia1 = 0U;
 299
 300     /*  Calculation of first stage */
 301     for (j = 0U; j <= (n2 - 1U); j++)
 302     {
 303       /*  index calculation for the coefficients */
 304       ia2 = ia1 + ia1;
 305       ia3 = ia2 + ia1;
 306       co1 = pCoef[ia1 * 2U];
 307       si1 = pCoef[(ia1 * 2U) + 1U];
 308       co2 = pCoef[ia2 * 2U];
 309       si2 = pCoef[(ia2 * 2U) + 1U];
 310       co3 = pCoef[ia3 * 2U];
 311       si3 = pCoef[(ia3 * 2U) + 1U];
 312       /*  Twiddle coefficients index modifier */
 313       ia1 = ia1 + twidCoefModifier;
 314
 315       for (i0 = j; i0 < fftLen; i0 += n1)
 316       {
 317         /*  index calculation for the input as, */
 318         /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
 319         i1 = i0 + n2;
 320         i2 = i1 + n2;
 321         i3 = i2 + n2;
 322
 323         /*  Butterfly implementation */
 324         /* xa + xc */
 325         r1 = pSrc[2U * i0] + pSrc[2U * i2];
 326         /* xa - xc */
 327         r2 = pSrc[2U * i0] - pSrc[2U * i2];
 328
 329         /* ya + yc */
 330         s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U];
 331         /* ya - yc */
 332         s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U];
 333
 334         /* xb + xd */
 335         t1 = pSrc[2U * i1] + pSrc[2U * i3];
 336
 337         /* xa' = xa + xb + xc + xd */
 338         pSrc[2U * i0] = (r1 + t1) >> 2U;
 339         /* xa + xc -(xb + xd) */
 340         r1 = r1 - t1;
 341
 342         /* yb + yd */
 343         t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U];
 344         /* ya' = ya + yb + yc + yd */
 345         pSrc[(2U * i0) + 1U] = (s1 + t2) >> 2U;
 346
 347         /* (ya + yc) - (yb + yd) */
 348         s1 = s1 - t2;
 349
 350         /* (yb - yd) */
 351         t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U];
 352         /* (xb - xd) */
 353         t2 = pSrc[2U * i1] - pSrc[2U * i3];
 354
 355         /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
 356         pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
 357                          ((int32_t) (((q63_t) s1 * si2) >> 32))) >> 1U;
 358
 359         /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
 360         pSrc[(2U * i1) + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
 361                                 ((int32_t) (((q63_t) r1 * si2) >> 32))) >> 1U;
 362
 363         /* (xa - xc) + (yb - yd) */
 364         r1 = r2 + t1;
 365         /* (xa - xc) - (yb - yd) */
 366         r2 = r2 - t1;
 367
 368         /* (ya - yc) -  (xb - xd) */
 369         s1 = s2 - t2;
 370         /* (ya - yc) +  (xb - xd) */
 371         s2 = s2 + t2;
 372
 373         /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
 374         pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
 375                          ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1U;
 376
 377         /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
 378         pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
 379                                 ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1U;
 380
 381         /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
 382         pSrc[2U * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
 383                          ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1U;
 384
 385         /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
 386         pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
 387                                 ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1U;
 388       }
 389     }
 390     twidCoefModifier <<= 2U;
 391   }
 392 #else
 393   uint32_t n1, n2, ia1, ia2, ia3, i0, j, k;
 394   q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
 395
 396   q31_t xa, xb, xc, xd;
 397   q31_t ya, yb, yc, yd;
 398   q31_t xa_out, xb_out, xc_out, xd_out;
 399   q31_t ya_out, yb_out, yc_out, yd_out;
 400
 401   q31_t *ptr1;
 402   q31_t *pSi0;
 403   q31_t *pSi1;
 404   q31_t *pSi2;
 405   q31_t *pSi3;
 406   q63_t xaya, xbyb, xcyc, xdyd;
 407   /* Total process is divided into three stages */
 408
 409   /* process first stage, middle stages, & last stage */
 410
 411
 412   /* start of first stage process */
 413
 414   /*  Initializations for the first stage */
 415   n2 = fftLen;
 416   n1 = n2;
 417   /* n2 = fftLen/4 */
 418   n2 >>= 2U;
 419
 420   ia1 = 0U;
 421
 422   j = n2;
 423
 424   pSi0 = pSrc;
 425   pSi1 = pSi0 + 2 * n2;
 426   pSi2 = pSi1 + 2 * n2;
 427   pSi3 = pSi2 + 2 * n2;
 428
 429   /*  Calculation of first stage */
 430   do
 431   {
 432     /* input is in 1.31(q31) format and provide 4 guard bits for the input */
 433
 434     /*  Butterfly implementation */
 435     /* xa + xc */
 436     r1 = (pSi0[0] >> 4U) + (pSi2[0] >> 4U);
 437     /* xa - xc */
 438     r2 = (pSi0[0] >> 4U) - (pSi2[0] >> 4U);
 439
 440     /* xb + xd */
 441     t1 = (pSi1[0] >> 4U) + (pSi3[0] >> 4U);
 442
 443     /* ya + yc */
 444     s1 = (pSi0[1] >> 4U) + (pSi2[1] >> 4U);
 445     /* ya - yc */
 446     s2 = (pSi0[1] >> 4U) - (pSi2[1] >> 4U);
 447
 448     /* xa' = xa + xb + xc + xd */
 449     *pSi0++ = (r1 + t1);
 450     /* (xa + xc) - (xb + xd) */
 451     r1 = r1 - t1;
 452     /* yb + yd */
 453     t2 = (pSi1[1] >> 4U) + (pSi3[1] >> 4U);
 454
 455     /* ya' = ya + yb + yc + yd */
 456     *pSi0++ = (s1 + t2);
 457
 458     /* (ya + yc) - (yb + yd) */
 459     s1 = s1 - t2;
 460
 461     /* yb - yd */
 462     t1 = (pSi1[1] >> 4U) - (pSi3[1] >> 4U);
 463     /* xb - xd */
 464     t2 = (pSi1[0] >> 4U) - (pSi3[0] >> 4U);
 465
 466     /*  index calculation for the coefficients */
 467     ia2 = 2U * ia1;
 468     co2 = pCoef[ia2 * 2U];
 469     si2 = pCoef[(ia2 * 2U) + 1U];
 470
 471     /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
 472     *pSi1++ = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
 473                      ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1U;
 474
 475     /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
 476     *pSi1++ = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
 477                             ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1U;
 478
 479     /* (xa - xc) + (yb - yd) */
 480     r1 = r2 + t1;
 481     /* (xa - xc) - (yb - yd) */
 482     r2 = r2 - t1;
 483
 484     /* (ya - yc) - (xb - xd) */
 485     s1 = s2 - t2;
 486     /* (ya - yc) + (xb - xd) */
 487     s2 = s2 + t2;
 488
 489     co1 = pCoef[ia1 * 2U];
 490     si1 = pCoef[(ia1 * 2U) + 1U];
 491
 492     /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
 493     *pSi2++ = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
 494                      ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1U;
 495
 496     /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
 497     *pSi2++ = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
 498                             ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1U;
 499
 500     /*  index calculation for the coefficients */
 501     ia3 = 3U * ia1;
 502     co3 = pCoef[ia3 * 2U];
 503     si3 = pCoef[(ia3 * 2U) + 1U];
 504
 505     /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
 506     *pSi3++ = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
 507                      ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1U;
 508
 509     /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
 510     *pSi3++ = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
 511                             ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1U;
 512
 513     /*  Twiddle coefficients index modifier */
 514     ia1 = ia1 + twidCoefModifier;
 515
 516   } while (--j);
 517
 518   /* end of first stage process */
 519
 520   /* data is in 5.27(q27) format */
 521
 522
 523   /* start of Middle stages process */
 524
 525
 526   /* each stage in middle stages provides two down scaling of the input */
 527
 528   twidCoefModifier <<= 2U;
 529
 530
 531   for (k = fftLen / 4U; k > 4U; k >>= 2U)
 532   {
 533     /*  Initializations for the first stage */
 534     n1 = n2;
 535     n2 >>= 2U;
 536     ia1 = 0U;
 537
 538     /*  Calculation of first stage */
 539     for (j = 0U; j <= (n2 - 1U); j++)
 540     {
 541       /*  index calculation for the coefficients */
 542       ia2 = ia1 + ia1;
 543       ia3 = ia2 + ia1;
 544       co1 = pCoef[ia1 * 2U];
 545       si1 = pCoef[(ia1 * 2U) + 1U];
 546       co2 = pCoef[ia2 * 2U];
 547       si2 = pCoef[(ia2 * 2U) + 1U];
 548       co3 = pCoef[ia3 * 2U];
 549       si3 = pCoef[(ia3 * 2U) + 1U];
 550       /*  Twiddle coefficients index modifier */
 551       ia1 = ia1 + twidCoefModifier;
 552
 553       pSi0 = pSrc + 2 * j;
 554       pSi1 = pSi0 + 2 * n2;
 555       pSi2 = pSi1 + 2 * n2;
 556       pSi3 = pSi2 + 2 * n2;
 557
 558       for (i0 = j; i0 < fftLen; i0 += n1)
 559       {
 560         /*  Butterfly implementation */
 561         /* xa + xc */
 562         r1 = pSi0[0] + pSi2[0];
 563
 564         /* xa - xc */
 565         r2 = pSi0[0] - pSi2[0];
 566
 567
 568         /* ya + yc */
 569         s1 = pSi0[1] + pSi2[1];
 570
 571         /* ya - yc */
 572         s2 = pSi0[1] - pSi2[1];
 573
 574
 575         /* xb + xd */
 576         t1 = pSi1[0] + pSi3[0];
 577
 578
 579         /* xa' = xa + xb + xc + xd */
 580         pSi0[0] = (r1 + t1) >> 2U;
 581         /* xa + xc -(xb + xd) */
 582         r1 = r1 - t1;
 583
 584         /* yb + yd */
 585         t2 = pSi1[1] + pSi3[1];
 586
 587         /* ya' = ya + yb + yc + yd */
 588         pSi0[1] = (s1 + t2) >> 2U;
 589         pSi0 += 2 * n1;
 590
 591         /* (ya + yc) - (yb + yd) */
 592         s1 = s1 - t2;
 593
 594         /* (yb - yd) */
 595         t1 = pSi1[1] - pSi3[1];
 596
 597         /* (xb - xd) */
 598         t2 = pSi1[0] - pSi3[0];
 599
 600
 601         /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
 602         pSi1[0] = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
 603                          ((int32_t) (((q63_t) s1 * si2) >> 32))) >> 1U;
 604
 605         /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
 606         pSi1[1] = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
 607                                 ((int32_t) (((q63_t) r1 * si2) >> 32))) >> 1U;
 608         pSi1 += 2 * n1;
 609
 610         /* (xa - xc) + (yb - yd) */
 611         r1 = r2 + t1;
 612         /* (xa - xc) - (yb - yd) */
 613         r2 = r2 - t1;
 614
 615         /* (ya - yc) -  (xb - xd) */
 616         s1 = s2 - t2;
 617         /* (ya - yc) +  (xb - xd) */
 618         s2 = s2 + t2;
 619
 620         /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
 621         pSi2[0] = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
 622                          ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1U;
 623
 624         /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
 625         pSi2[1] = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
 626                                 ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1U;
 627         pSi2 += 2 * n1;
 628
 629         /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
 630         pSi3[0] = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
 631                          ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1U;
 632
 633         /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
 634         pSi3[1] = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
 635                                 ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1U;
 636         pSi3 += 2 * n1;
 637       }
 638     }
 639     twidCoefModifier <<= 2U;
 640   }
 641 #endif
 642
 643   /* End of Middle stages process */
 644
 645   /* data is in 11.21(q21) format for the 1024 point as there are 3 middle stages */
 646   /* data is in 9.23(q23) format for the 256 point as there are 2 middle stages */
 647   /* data is in 7.25(q25) format for the 64 point as there are 1 middle stage */
 648   /* data is in 5.27(q27) format for the 16 point as there are no middle stages */
 649
 650
 651   /* start of Last stage process */
 652   /*  Initializations for the last stage */
 653   j = fftLen >> 2;
 654   ptr1 = &pSrc[0];
 655
 656   /*  Calculations of last stage */
 657   do
 658   {
 659
 660 #ifndef ARM_MATH_BIG_ENDIAN
 661
 662     /* Read xa (real), ya(imag) input */
 663     xaya = *__SIMD64(ptr1)++;
 664     xa = (q31_t) xaya;
 665     ya = (q31_t) (xaya >> 32);
 666
 667     /* Read xb (real), yb(imag) input */
 668     xbyb = *__SIMD64(ptr1)++;
 669     xb = (q31_t) xbyb;
 670     yb = (q31_t) (xbyb >> 32);
 671
 672     /* Read xc (real), yc(imag) input */
 673     xcyc = *__SIMD64(ptr1)++;
 674     xc = (q31_t) xcyc;
 675     yc = (q31_t) (xcyc >> 32);
 676
 677     /* Read xc (real), yc(imag) input */
 678     xdyd = *__SIMD64(ptr1)++;
 679     xd = (q31_t) xdyd;
 680     yd = (q31_t) (xdyd >> 32);
 681
 682 #else
 683
 684     /* Read xa (real), ya(imag) input */
 685     xaya = *__SIMD64(ptr1)++;
 686     ya = (q31_t) xaya;
 687     xa = (q31_t) (xaya >> 32);
 688
 689     /* Read xb (real), yb(imag) input */
 690     xbyb = *__SIMD64(ptr1)++;
 691     yb = (q31_t) xbyb;
 692     xb = (q31_t) (xbyb >> 32);
 693
 694     /* Read xc (real), yc(imag) input */
 695     xcyc = *__SIMD64(ptr1)++;
 696     yc = (q31_t) xcyc;
 697     xc = (q31_t) (xcyc >> 32);
 698
 699     /* Read xc (real), yc(imag) input */
 700     xdyd = *__SIMD64(ptr1)++;
 701     yd = (q31_t) xdyd;
 702     xd = (q31_t) (xdyd >> 32);
 703
 704
 705 #endif
 706
 707     /* xa' = xa + xb + xc + xd */
 708     xa_out = xa + xb + xc + xd;
 709
 710     /* ya' = ya + yb + yc + yd */
 711     ya_out = ya + yb + yc + yd;
 712
 713     /* pointer updation for writing */
 714     ptr1 = ptr1 - 8U;
 715
 716     /* writing xa' and ya' */
 717     *ptr1++ = xa_out;
 718     *ptr1++ = ya_out;
 719
 720     xc_out = (xa - xb + xc - xd);
 721     yc_out = (ya - yb + yc - yd);
 722
 723     /* writing xc' and yc' */
 724     *ptr1++ = xc_out;
 725     *ptr1++ = yc_out;
 726
 727     xb_out = (xa + yb - xc - yd);
 728     yb_out = (ya - xb - yc + xd);
 729
 730     /* writing xb' and yb' */
 731     *ptr1++ = xb_out;
 732     *ptr1++ = yb_out;
 733
 734     xd_out = (xa - yb - xc + yd);
 735     yd_out = (ya + xb - yc - xd);
 736
 737     /* writing xd' and yd' */
 738     *ptr1++ = xd_out;
 739     *ptr1++ = yd_out;
 740
 741
 742   } while (--j);
 743
 744   /* output is in 11.21(q21) format for the 1024 point */
 745   /* output is in 9.23(q23) format for the 256 point */
 746   /* output is in 7.25(q25) format for the 64 point */
 747   /* output is in 5.27(q27) format for the 16 point */
 748
 749   /* End of last stage process */
 750
 751 }
 752
 753
 754 /**
 755  * @brief  Core function for the Q31 CIFFT butterfly process.
 756  * @param[in, out] *pSrc            points to the in-place buffer of Q31 data type.
 757  * @param[in]      fftLen           length of the FFT.
 758  * @param[in]      *pCoef           points to twiddle coefficient buffer.
 759  * @param[in]      twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
 760  * @return none.
 761  */
 762
 763
 764 /*
 765 * Radix-4 IFFT algorithm used is :
 766 *
 767 * CIFFT uses same twiddle coefficients as CFFT Function
 768 *  x[k] = x[n] + (j)k * x[n + fftLen/4] + (-1)k * x[n+fftLen/2] + (-j)k * x[n+3*fftLen/4]
 769 *
 770 *
 771 * IFFT is implemented with following changes in equations from FFT
 772 *
 773 * Input real and imaginary data:
 774 * x(n) = xa + j * ya
 775 * x(n+N/4 ) = xb + j * yb
 776 * x(n+N/2 ) = xc + j * yc
 777 * x(n+3N 4) = xd + j * yd
 778 *
 779 *
 780 * Output real and imaginary data:
 781 * x(4r) = xa'+ j * ya'
 782 * x(4r+1) = xb'+ j * yb'
 783 * x(4r+2) = xc'+ j * yc'
 784 * x(4r+3) = xd'+ j * yd'
 785 *
 786 *
 787 * Twiddle factors for radix-4 IFFT:
 788 * Wn = co1 + j * (si1)
 789 * W2n = co2 + j * (si2)
 790 * W3n = co3 + j * (si3)
 791
 792 * The real and imaginary output values for the radix-4 butterfly are
 793 * xa' = xa + xb + xc + xd
 794 * ya' = ya + yb + yc + yd
 795 * xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)
 796 * yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)
 797 * xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)
 798 * yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)
 799 * xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)
 800 * yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)
 801 *
 802 */
 803
 804 void arm_radix4_butterfly_inverse_q31(
 805   q31_t * pSrc,
 806   uint32_t fftLen,
 807   q31_t * pCoef,
 808   uint32_t twidCoefModifier)
 809 {
 810 #if defined(ARM_MATH_CM7)
 811   uint32_t n1, n2, ia1, ia2, ia3, i0, i1, i2, i3, j, k;
 812   q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
 813   q31_t xa, xb, xc, xd;
 814   q31_t ya, yb, yc, yd;
 815   q31_t xa_out, xb_out, xc_out, xd_out;
 816   q31_t ya_out, yb_out, yc_out, yd_out;
 817
 818   q31_t *ptr1;
 819   q63_t xaya, xbyb, xcyc, xdyd;
 820
 821   /* input is be 1.31(q31) format for all FFT sizes */
 822   /* Total process is divided into three stages */
 823   /* process first stage, middle stages, & last stage */
 824
 825   /* Start of first stage process */
 826
 827   /* Initializations for the first stage */
 828   n2 = fftLen;
 829   n1 = n2;
 830   /* n2 = fftLen/4 */
 831   n2 >>= 2U;
 832   i0 = 0U;
 833   ia1 = 0U;
 834
 835   j = n2;
 836
 837   do
 838   {
 839
 840     /* input is in 1.31(q31) format and provide 4 guard bits for the input */
 841
 842     /*  index calculation for the input as, */
 843     /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
 844     i1 = i0 + n2;
 845     i2 = i1 + n2;
 846     i3 = i2 + n2;
 847
 848     /*  Butterfly implementation */
 849     /* xa + xc */
 850     r1 = (pSrc[2U * i0] >> 4U) + (pSrc[2U * i2] >> 4U);
 851     /* xa - xc */
 852     r2 = (pSrc[2U * i0] >> 4U) - (pSrc[2U * i2] >> 4U);
 853
 854     /* xb + xd */
 855     t1 = (pSrc[2U * i1] >> 4U) + (pSrc[2U * i3] >> 4U);
 856
 857     /* ya + yc */
 858     s1 = (pSrc[(2U * i0) + 1U] >> 4U) + (pSrc[(2U * i2) + 1U] >> 4U);
 859     /* ya - yc */
 860     s2 = (pSrc[(2U * i0) + 1U] >> 4U) - (pSrc[(2U * i2) + 1U] >> 4U);
 861
 862     /* xa' = xa + xb + xc + xd */
 863     pSrc[2U * i0] = (r1 + t1);
 864     /* (xa + xc) - (xb + xd) */
 865     r1 = r1 - t1;
 866     /* yb + yd */
 867     t2 = (pSrc[(2U * i1) + 1U] >> 4U) + (pSrc[(2U * i3) + 1U] >> 4U);
 868     /* ya' = ya + yb + yc + yd */
 869     pSrc[(2U * i0) + 1U] = (s1 + t2);
 870
 871     /* (ya + yc) - (yb + yd) */
 872     s1 = s1 - t2;
 873
 874     /* yb - yd */
 875     t1 = (pSrc[(2U * i1) + 1U] >> 4U) - (pSrc[(2U * i3) + 1U] >> 4U);
 876     /* xb - xd */
 877     t2 = (pSrc[2U * i1] >> 4U) - (pSrc[2U * i3] >> 4U);
 878
 879     /*  index calculation for the coefficients */
 880     ia2 = 2U * ia1;
 881     co2 = pCoef[ia2 * 2U];
 882     si2 = pCoef[(ia2 * 2U) + 1U];
 883
 884     /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
 885     pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) -
 886                      ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1U;
 887
 888     /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
 889     pSrc[2U * i1 + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32)) +
 890                           ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1U;
 891
 892     /* (xa - xc) - (yb - yd) */
 893     r1 = r2 - t1;
 894     /* (xa - xc) + (yb - yd) */
 895     r2 = r2 + t1;
 896
 897     /* (ya - yc) + (xb - xd) */
 898     s1 = s2 + t2;
 899     /* (ya - yc) - (xb - xd) */
 900     s2 = s2 - t2;
 901
 902     co1 = pCoef[ia1 * 2U];
 903     si1 = pCoef[(ia1 * 2U) + 1U];
 904
 905     /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
 906     pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
 907                      ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1U;
 908
 909     /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
 910     pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
 911                             ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1U;
 912
 913     /*  index calculation for the coefficients */
 914     ia3 = 3U * ia1;
 915     co3 = pCoef[ia3 * 2U];
 916     si3 = pCoef[(ia3 * 2U) + 1U];
 917
 918     /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
 919     pSrc[2U * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
 920                      ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1U;
 921
 922     /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
 923     pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
 924                             ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1U;
 925
 926     /*  Twiddle coefficients index modifier */
 927     ia1 = ia1 + twidCoefModifier;
 928
 929     /*  Updating input index */
 930     i0 = i0 + 1U;
 931
 932   } while (--j);
 933
 934   /* data is in 5.27(q27) format */
 935   /* each stage provides two down scaling of the input */
 936
 937
 938   /* Start of Middle stages process */
 939
 940   twidCoefModifier <<= 2U;
 941
 942   /*  Calculation of second stage to excluding last stage */
 943   for (k = fftLen / 4U; k > 4U; k >>= 2U)
 944   {
 945     /*  Initializations for the first stage */
 946     n1 = n2;
 947     n2 >>= 2U;
 948     ia1 = 0U;
 949
 950     for (j = 0; j <= (n2 - 1U); j++)
 951     {
 952       /*  index calculation for the coefficients */
 953       ia2 = ia1 + ia1;
 954       ia3 = ia2 + ia1;
 955       co1 = pCoef[ia1 * 2U];
 956       si1 = pCoef[(ia1 * 2U) + 1U];
 957       co2 = pCoef[ia2 * 2U];
 958       si2 = pCoef[(ia2 * 2U) + 1U];
 959       co3 = pCoef[ia3 * 2U];
 960       si3 = pCoef[(ia3 * 2U) + 1U];
 961       /*  Twiddle coefficients index modifier */
 962       ia1 = ia1 + twidCoefModifier;
 963
 964       for (i0 = j; i0 < fftLen; i0 += n1)
 965       {
 966         /*  index calculation for the input as, */
 967         /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
 968         i1 = i0 + n2;
 969         i2 = i1 + n2;
 970         i3 = i2 + n2;
 971
 972         /*  Butterfly implementation */
 973         /* xa + xc */
 974         r1 = pSrc[2U * i0] + pSrc[2U * i2];
 975         /* xa - xc */
 976         r2 = pSrc[2U * i0] - pSrc[2U * i2];
 977
 978         /* ya + yc */
 979         s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U];
 980         /* ya - yc */
 981         s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U];
 982
 983         /* xb + xd */
 984         t1 = pSrc[2U * i1] + pSrc[2U * i3];
 985
 986         /* xa' = xa + xb + xc + xd */
 987         pSrc[2U * i0] = (r1 + t1) >> 2U;
 988         /* xa + xc -(xb + xd) */
 989         r1 = r1 - t1;
 990         /* yb + yd */
 991         t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U];
 992         /* ya' = ya + yb + yc + yd */
 993         pSrc[(2U * i0) + 1U] = (s1 + t2) >> 2U;
 994
 995         /* (ya + yc) - (yb + yd) */
 996         s1 = s1 - t2;
 997
 998         /* (yb - yd) */
 999         t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U];
1000         /* (xb - xd) */
1001         t2 = pSrc[2U * i1] - pSrc[2U * i3];
1002
1003         /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
1004         pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32U)) -
1005                          ((int32_t) (((q63_t) s1 * si2) >> 32U))) >> 1U;
1006
1007         /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
1008         pSrc[(2U * i1) + 1U] =
1009           (((int32_t) (((q63_t) s1 * co2) >> 32U)) +
1010            ((int32_t) (((q63_t) r1 * si2) >> 32U))) >> 1U;
1011
1012         /* (xa - xc) - (yb - yd) */
1013         r1 = r2 - t1;
1014         /* (xa - xc) + (yb - yd) */
1015         r2 = r2 + t1;
1016
1017         /* (ya - yc) +  (xb - xd) */
1018         s1 = s2 + t2;
1019         /* (ya - yc) -  (xb - xd) */
1020         s2 = s2 - t2;
1021
1022         /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
1023         pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
1024                          ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1U;
1025
1026         /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
1027         pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
1028                                 ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1U;
1029
1030         /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
1031         pSrc[(2U * i3)] = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
1032                            ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1U;
1033
1034         /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
1035         pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
1036                                 ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1U;
1037       }
1038     }
1039     twidCoefModifier <<= 2U;
1040   }
1041 #else
1042   uint32_t n1, n2, ia1, ia2, ia3, i0, j, k;
1043   q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
1044   q31_t xa, xb, xc, xd;
1045   q31_t ya, yb, yc, yd;
1046   q31_t xa_out, xb_out, xc_out, xd_out;
1047   q31_t ya_out, yb_out, yc_out, yd_out;
1048
1049   q31_t *ptr1;
1050   q31_t *pSi0;
1051   q31_t *pSi1;
1052   q31_t *pSi2;
1053   q31_t *pSi3;
1054   q63_t xaya, xbyb, xcyc, xdyd;
1055
1056   /* input is be 1.31(q31) format for all FFT sizes */
1057   /* Total process is divided into three stages */
1058   /* process first stage, middle stages, & last stage */
1059
1060   /* Start of first stage process */
1061
1062   /* Initializations for the first stage */
1063   n2 = fftLen;
1064   n1 = n2;
1065   /* n2 = fftLen/4 */
1066   n2 >>= 2U;
1067
1068   ia1 = 0U;
1069
1070   j = n2;
1071
1072   pSi0 = pSrc;
1073   pSi1 = pSi0 + 2 * n2;
1074   pSi2 = pSi1 + 2 * n2;
1075   pSi3 = pSi2 + 2 * n2;
1076
1077   do
1078   {
1079     /*  Butterfly implementation */
1080     /* xa + xc */
1081     r1 = (pSi0[0] >> 4U) + (pSi2[0] >> 4U);
1082     /* xa - xc */
1083     r2 = (pSi0[0] >> 4U) - (pSi2[0] >> 4U);
1084
1085     /* xb + xd */
1086     t1 = (pSi1[0] >> 4U) + (pSi3[0] >> 4U);
1087
1088     /* ya + yc */
1089     s1 = (pSi0[1] >> 4U) + (pSi2[1] >> 4U);
1090     /* ya - yc */
1091     s2 = (pSi0[1] >> 4U) - (pSi2[1] >> 4U);
1092
1093     /* xa' = xa + xb + xc + xd */
1094     *pSi0++ = (r1 + t1);
1095     /* (xa + xc) - (xb + xd) */
1096     r1 = r1 - t1;
1097     /* yb + yd */
1098     t2 = (pSi1[1] >> 4U) + (pSi3[1] >> 4U);
1099     /* ya' = ya + yb + yc + yd */
1100     *pSi0++ = (s1 + t2);
1101
1102     /* (ya + yc) - (yb + yd) */
1103     s1 = s1 - t2;
1104
1105     /* yb - yd */
1106     t1 = (pSi1[1] >> 4U) - (pSi3[1] >> 4U);
1107     /* xb - xd */
1108     t2 = (pSi1[0] >> 4U) - (pSi3[0] >> 4U);
1109
1110     /*  index calculation for the coefficients */
1111     ia2 = 2U * ia1;
1112     co2 = pCoef[ia2 * 2U];
1113     si2 = pCoef[(ia2 * 2U) + 1U];
1114
1115     /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
1116     *pSi1++ = (((int32_t) (((q63_t) r1 * co2) >> 32)) -
1117                      ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1U;
1118
1119     /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
1120     *pSi1++ = (((int32_t) (((q63_t) s1 * co2) >> 32)) +
1121                           ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1U;
1122
1123     /* (xa - xc) - (yb - yd) */
1124     r1 = r2 - t1;
1125     /* (xa - xc) + (yb - yd) */
1126     r2 = r2 + t1;
1127
1128     /* (ya - yc) + (xb - xd) */
1129     s1 = s2 + t2;
1130     /* (ya - yc) - (xb - xd) */
1131     s2 = s2 - t2;
1132
1133     co1 = pCoef[ia1 * 2U];
1134     si1 = pCoef[(ia1 * 2U) + 1U];
1135
1136     /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
1137     *pSi2++ = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
1138                      ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1U;
1139
1140     /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
1141     *pSi2++ = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
1142                             ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1U;
1143
1144     /*  index calculation for the coefficients */
1145     ia3 = 3U * ia1;
1146     co3 = pCoef[ia3 * 2U];
1147     si3 = pCoef[(ia3 * 2U) + 1U];
1148
1149     /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
1150     *pSi3++ = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
1151                      ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1U;
1152
1153     /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
1154     *pSi3++ = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
1155                             ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1U;
1156
1157     /*  Twiddle coefficients index modifier */
1158     ia1 = ia1 + twidCoefModifier;
1159
1160   } while (--j);
1161
1162   /* data is in 5.27(q27) format */
1163   /* each stage provides two down scaling of the input */
1164
1165
1166   /* Start of Middle stages process */
1167
1168   twidCoefModifier <<= 2U;
1169
1170   /*  Calculation of second stage to excluding last stage */
1171   for (k = fftLen / 4U; k > 4U; k >>= 2U)
1172   {
1173     /*  Initializations for the first stage */
1174     n1 = n2;
1175     n2 >>= 2U;
1176     ia1 = 0U;
1177
1178     for (j = 0; j <= (n2 - 1U); j++)
1179     {
1180       /*  index calculation for the coefficients */
1181       ia2 = ia1 + ia1;
1182       ia3 = ia2 + ia1;
1183       co1 = pCoef[ia1 * 2U];
1184       si1 = pCoef[(ia1 * 2U) + 1U];
1185       co2 = pCoef[ia2 * 2U];
1186       si2 = pCoef[(ia2 * 2U) + 1U];
1187       co3 = pCoef[ia3 * 2U];
1188       si3 = pCoef[(ia3 * 2U) + 1U];
1189       /*  Twiddle coefficients index modifier */
1190       ia1 = ia1 + twidCoefModifier;
1191
1192       pSi0 = pSrc + 2 * j;
1193       pSi1 = pSi0 + 2 * n2;
1194       pSi2 = pSi1 + 2 * n2;
1195       pSi3 = pSi2 + 2 * n2;
1196
1197       for (i0 = j; i0 < fftLen; i0 += n1)
1198       {
1199         /*  Butterfly implementation */
1200         /* xa + xc */
1201         r1 = pSi0[0] + pSi2[0];
1202
1203         /* xa - xc */
1204         r2 = pSi0[0] - pSi2[0];
1205
1206
1207         /* ya + yc */
1208         s1 = pSi0[1] + pSi2[1];
1209
1210         /* ya - yc */
1211         s2 = pSi0[1] - pSi2[1];
1212
1213
1214         /* xb + xd */
1215         t1 = pSi1[0] + pSi3[0];
1216
1217
1218         /* xa' = xa + xb + xc + xd */
1219         pSi0[0] = (r1 + t1) >> 2U;
1220         /* xa + xc -(xb + xd) */
1221         r1 = r1 - t1;
1222         /* yb + yd */
1223         t2 = pSi1[1] + pSi3[1];
1224
1225         /* ya' = ya + yb + yc + yd */
1226         pSi0[1] = (s1 + t2) >> 2U;
1227         pSi0 += 2 * n1;
1228
1229         /* (ya + yc) - (yb + yd) */
1230         s1 = s1 - t2;
1231
1232         /* (yb - yd) */
1233         t1 = pSi1[1] - pSi3[1];
1234
1235         /* (xb - xd) */
1236         t2 = pSi1[0] - pSi3[0];
1237
1238
1239         /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
1240         pSi1[0] = (((int32_t) (((q63_t) r1 * co2) >> 32U)) -
1241                          ((int32_t) (((q63_t) s1 * si2) >> 32U))) >> 1U;
1242
1243         /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
1244         pSi1[1] =
1245
1246           (((int32_t) (((q63_t) s1 * co2) >> 32U)) +
1247            ((int32_t) (((q63_t) r1 * si2) >> 32U))) >> 1U;
1248         pSi1 += 2 * n1;
1249
1250         /* (xa - xc) - (yb - yd) */
1251         r1 = r2 - t1;
1252         /* (xa - xc) + (yb - yd) */
1253         r2 = r2 + t1;
1254
1255         /* (ya - yc) +  (xb - xd) */
1256         s1 = s2 + t2;
1257         /* (ya - yc) -  (xb - xd) */
1258         s2 = s2 - t2;
1259
1260         /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
1261         pSi2[0] = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
1262                          ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1U;
1263
1264         /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
1265         pSi2[1] = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
1266                                 ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1U;
1267         pSi2 += 2 * n1;
1268
1269         /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
1270         pSi3[0] = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
1271                            ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1U;
1272
1273         /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
1274         pSi3[1] = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
1275                                 ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1U;
1276         pSi3 += 2 * n1;
1277       }
1278     }
1279     twidCoefModifier <<= 2U;
1280   }
1281 #endif
1282
1283   /* End of Middle stages process */
1284
1285   /* data is in 11.21(q21) format for the 1024 point as there are 3 middle stages */
1286   /* data is in 9.23(q23) format for the 256 point as there are 2 middle stages */
1287   /* data is in 7.25(q25) format for the 64 point as there are 1 middle stage */
1288   /* data is in 5.27(q27) format for the 16 point as there are no middle stages */
1289
1290
1291   /* Start of last stage process */
1292
1293
1294   /*  Initializations for the last stage */
1295   j = fftLen >> 2;
1296   ptr1 = &pSrc[0];
1297
1298   /*  Calculations of last stage */
1299   do
1300   {
1301 #ifndef ARM_MATH_BIG_ENDIAN
1302     /* Read xa (real), ya(imag) input */
1303     xaya = *__SIMD64(ptr1)++;
1304     xa = (q31_t) xaya;
1305     ya = (q31_t) (xaya >> 32);
1306
1307     /* Read xb (real), yb(imag) input */
1308     xbyb = *__SIMD64(ptr1)++;
1309     xb = (q31_t) xbyb;
1310     yb = (q31_t) (xbyb >> 32);
1311
1312     /* Read xc (real), yc(imag) input */
1313     xcyc = *__SIMD64(ptr1)++;
1314     xc = (q31_t) xcyc;
1315     yc = (q31_t) (xcyc >> 32);
1316
1317     /* Read xc (real), yc(imag) input */
1318     xdyd = *__SIMD64(ptr1)++;
1319     xd = (q31_t) xdyd;
1320     yd = (q31_t) (xdyd >> 32);
1321
1322 #else
1323
1324     /* Read xa (real), ya(imag) input */
1325     xaya = *__SIMD64(ptr1)++;
1326     ya = (q31_t) xaya;
1327     xa = (q31_t) (xaya >> 32);
1328
1329     /* Read xb (real), yb(imag) input */
1330     xbyb = *__SIMD64(ptr1)++;
1331     yb = (q31_t) xbyb;
1332     xb = (q31_t) (xbyb >> 32);
1333
1334     /* Read xc (real), yc(imag) input */
1335     xcyc = *__SIMD64(ptr1)++;
1336     yc = (q31_t) xcyc;
1337     xc = (q31_t) (xcyc >> 32);
1338
1339     /* Read xc (real), yc(imag) input */
1340     xdyd = *__SIMD64(ptr1)++;
1341     yd = (q31_t) xdyd;
1342     xd = (q31_t) (xdyd >> 32);
1343
1344
1345 #endif
1346
1347     /* xa' = xa + xb + xc + xd */
1348     xa_out = xa + xb + xc + xd;
1349
1350     /* ya' = ya + yb + yc + yd */
1351     ya_out = ya + yb + yc + yd;
1352
1353     /* pointer updation for writing */
1354     ptr1 = ptr1 - 8U;
1355
1356     /* writing xa' and ya' */
1357     *ptr1++ = xa_out;
1358     *ptr1++ = ya_out;
1359
1360     xc_out = (xa - xb + xc - xd);
1361     yc_out = (ya - yb + yc - yd);
1362
1363     /* writing xc' and yc' */
1364     *ptr1++ = xc_out;
1365     *ptr1++ = yc_out;
1366
1367     xb_out = (xa - yb - xc + yd);
1368     yb_out = (ya + xb - yc - xd);
1369
1370     /* writing xb' and yb' */
1371     *ptr1++ = xb_out;
1372     *ptr1++ = yb_out;
1373
1374     xd_out = (xa + yb - xc - yd);
1375     yd_out = (ya - xb - yc + xd);
1376
1377     /* writing xd' and yd' */
1378     *ptr1++ = xd_out;
1379     *ptr1++ = yd_out;
1380
1381   } while (--j);
1382
1383   /* output is in 11.21(q21) format for the 1024 point */
1384   /* output is in 9.23(q23) format for the 256 point */
1385   /* output is in 7.25(q25) format for the 64 point */
1386   /* output is in 5.27(q27) format for the 16 point */
1387
1388   /* End of last stage process */
1389 }