lib/main/CMSIS/DSP/Source/TransformFunctions/arm_dct4_q15.c

   1 /* ----------------------------------------------------------------------
   2  * Project:      CMSIS DSP Library
   3  * Title:        arm_dct4_q15.c
   4  * Description:  Processing function of DCT4 & IDCT4 Q15
   5  *
   6  * $Date:        27. January 2017
   7  * $Revision:    V.1.5.1
   8  *
   9  * Target Processor: Cortex-M cores
  10  * -------------------------------------------------------------------- */
  11 /*
  12  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
  13  *
  14  * SPDX-License-Identifier: Apache-2.0
  15  *
  16  * Licensed under the Apache License, Version 2.0 (the License); you may
  17  * not use this file except in compliance with the License.
  18  * You may obtain a copy of the License at
  19  *
  20  * www.apache.org/licenses/LICENSE-2.0
  21  *
  22  * Unless required by applicable law or agreed to in writing, software
  23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  25  * See the License for the specific language governing permissions and
  26  * limitations under the License.
  27  */
  28
  29 #include "arm_math.h"
  30
  31 /**
  32  * @addtogroup DCT4_IDCT4
  33  * @{
  34  */
  35
  36 /**
  37  * @brief Processing function for the Q15 DCT4/IDCT4.
  38  * @param[in]       *S             points to an instance of the Q15 DCT4 structure.
  39  * @param[in]       *pState        points to state buffer.
  40  * @param[in,out]   *pInlineBuffer points to the in-place input and output buffer.
  41  * @return none.
  42  *
  43  * \par Input and output formats:
  44  * Internally inputs are downscaled in the RFFT process function to avoid overflows.
  45  * The number of bits downscaled depends on the size of the transform.
  46  * The input and output formats for different DCT sizes and number of bits to upscale are mentioned in the table below:
  47  *
  48  * \image html dct4FormatsQ15Table.gif
  49  */
  50
  51 void arm_dct4_q15(
  52   const arm_dct4_instance_q15 * S,
  53   q15_t * pState,
  54   q15_t * pInlineBuffer)
  55 {
  56   uint32_t i;                                    /* Loop counter */
  57   q15_t *weights = S->pTwiddle;                  /* Pointer to the Weights table */
  58   q15_t *cosFact = S->pCosFactor;                /* Pointer to the cos factors table */
  59   q15_t *pS1, *pS2, *pbuff;                      /* Temporary pointers for input buffer and pState buffer */
  60   q15_t in;                                      /* Temporary variable */
  61
  62
  63   /* DCT4 computation involves DCT2 (which is calculated using RFFT)
  64    * along with some pre-processing and post-processing.
  65    * Computational procedure is explained as follows:
  66    * (a) Pre-processing involves multiplying input with cos factor,
  67    *     r(n) = 2 * u(n) * cos(pi*(2*n+1)/(4*n))
  68    *              where,
  69    *                 r(n) -- output of preprocessing
  70    *                 u(n) -- input to preprocessing(actual Source buffer)
  71    * (b) Calculation of DCT2 using FFT is divided into three steps:
  72    *                  Step1: Re-ordering of even and odd elements of input.
  73    *                  Step2: Calculating FFT of the re-ordered input.
  74    *                  Step3: Taking the real part of the product of FFT output and weights.
  75    * (c) Post-processing - DCT4 can be obtained from DCT2 output using the following equation:
  76    *                   Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
  77    *                        where,
  78    *                           Y4 -- DCT4 output,   Y2 -- DCT2 output
  79    * (d) Multiplying the output with the normalizing factor sqrt(2/N).
  80    */
  81
  82         /*-------- Pre-processing ------------*/
  83   /* Multiplying input with cos factor i.e. r(n) = 2 * x(n) * cos(pi*(2*n+1)/(4*n)) */
  84   arm_mult_q15(pInlineBuffer, cosFact, pInlineBuffer, S->N);
  85   arm_shift_q15(pInlineBuffer, 1, pInlineBuffer, S->N);
  86
  87   /* ----------------------------------------------------------------
  88    * Step1: Re-ordering of even and odd elements as
  89    *             pState[i] =  pInlineBuffer[2*i] and
  90    *             pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2
  91    ---------------------------------------------------------------------*/
  92
  93   /* pS1 initialized to pState */
  94   pS1 = pState;
  95
  96   /* pS2 initialized to pState+N-1, so that it points to the end of the state buffer */
  97   pS2 = pState + (S->N - 1U);
  98
  99   /* pbuff initialized to input buffer */
 100   pbuff = pInlineBuffer;
 101
 102
 103 #if defined (ARM_MATH_DSP)
 104
 105   /* Run the below code for Cortex-M4 and Cortex-M3 */
 106
 107   /* Initializing the loop counter to N/2 >> 2 for loop unrolling by 4 */
 108   i = (uint32_t) S->Nby2 >> 2U;
 109
 110   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
 111    ** a second loop below computes the remaining 1 to 3 samples. */
 112   do
 113   {
 114     /* Re-ordering of even and odd elements */
 115     /* pState[i] =  pInlineBuffer[2*i] */
 116     *pS1++ = *pbuff++;
 117     /* pState[N-i-1] = pInlineBuffer[2*i+1] */
 118     *pS2-- = *pbuff++;
 119
 120     *pS1++ = *pbuff++;
 121     *pS2-- = *pbuff++;
 122
 123     *pS1++ = *pbuff++;
 124     *pS2-- = *pbuff++;
 125
 126     *pS1++ = *pbuff++;
 127     *pS2-- = *pbuff++;
 128
 129     /* Decrement the loop counter */
 130     i--;
 131   } while (i > 0U);
 132
 133   /* pbuff initialized to input buffer */
 134   pbuff = pInlineBuffer;
 135
 136   /* pS1 initialized to pState */
 137   pS1 = pState;
 138
 139   /* Initializing the loop counter to N/4 instead of N for loop unrolling */
 140   i = (uint32_t) S->N >> 2U;
 141
 142   /* Processing with loop unrolling 4 times as N is always multiple of 4.
 143    * Compute 4 outputs at a time */
 144   do
 145   {
 146     /* Writing the re-ordered output back to inplace input buffer */
 147     *pbuff++ = *pS1++;
 148     *pbuff++ = *pS1++;
 149     *pbuff++ = *pS1++;
 150     *pbuff++ = *pS1++;
 151
 152     /* Decrement the loop counter */
 153     i--;
 154   } while (i > 0U);
 155
 156
 157   /* ---------------------------------------------------------
 158    *     Step2: Calculate RFFT for N-point input
 159    * ---------------------------------------------------------- */
 160   /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
 161   arm_rfft_q15(S->pRfft, pInlineBuffer, pState);
 162
 163  /*----------------------------------------------------------------------
 164   *  Step3: Multiply the FFT output with the weights.
 165   *----------------------------------------------------------------------*/
 166   arm_cmplx_mult_cmplx_q15(pState, weights, pState, S->N);
 167
 168   /* The output of complex multiplication is in 3.13 format.
 169    * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.15 format by shifting left by 2 bits. */
 170   arm_shift_q15(pState, 2, pState, S->N * 2);
 171
 172   /* ----------- Post-processing ---------- */
 173   /* DCT-IV can be obtained from DCT-II by the equation,
 174    *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
 175    *       Hence, Y4(0) = Y2(0)/2  */
 176   /* Getting only real part from the output and Converting to DCT-IV */
 177
 178   /* Initializing the loop counter to N >> 2 for loop unrolling by 4 */
 179   i = ((uint32_t) S->N - 1U) >> 2U;
 180
 181   /* pbuff initialized to input buffer. */
 182   pbuff = pInlineBuffer;
 183
 184   /* pS1 initialized to pState */
 185   pS1 = pState;
 186
 187   /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
 188   in = *pS1++ >> 1U;
 189   /* input buffer acts as inplace, so output values are stored in the input itself. */
 190   *pbuff++ = in;
 191
 192   /* pState pointer is incremented twice as the real values are located alternatively in the array */
 193   pS1++;
 194
 195   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
 196    ** a second loop below computes the remaining 1 to 3 samples. */
 197   do
 198   {
 199     /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
 200     /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
 201     in = *pS1++ - in;
 202     *pbuff++ = in;
 203     /* points to the next real value */
 204     pS1++;
 205
 206     in = *pS1++ - in;
 207     *pbuff++ = in;
 208     pS1++;
 209
 210     in = *pS1++ - in;
 211     *pbuff++ = in;
 212     pS1++;
 213
 214     in = *pS1++ - in;
 215     *pbuff++ = in;
 216     pS1++;
 217
 218     /* Decrement the loop counter */
 219     i--;
 220   } while (i > 0U);
 221
 222   /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
 223    ** No loop unrolling is used. */
 224   i = ((uint32_t) S->N - 1U) % 0x4U;
 225
 226   while (i > 0U)
 227   {
 228     /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
 229     /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
 230     in = *pS1++ - in;
 231     *pbuff++ = in;
 232     /* points to the next real value */
 233     pS1++;
 234
 235     /* Decrement the loop counter */
 236     i--;
 237   }
 238
 239
 240    /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
 241
 242   /* Initializing the loop counter to N/4 instead of N for loop unrolling */
 243   i = (uint32_t) S->N >> 2U;
 244
 245   /* pbuff initialized to the pInlineBuffer(now contains the output values) */
 246   pbuff = pInlineBuffer;
 247
 248   /* Processing with loop unrolling 4 times as N is always multiple of 4.  Compute 4 outputs at a time */
 249   do
 250   {
 251     /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
 252     in = *pbuff;
 253     *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
 254
 255     in = *pbuff;
 256     *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
 257
 258     in = *pbuff;
 259     *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
 260
 261     in = *pbuff;
 262     *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
 263
 264     /* Decrement the loop counter */
 265     i--;
 266   } while (i > 0U);
 267
 268
 269 #else
 270
 271   /* Run the below code for Cortex-M0 */
 272
 273   /* Initializing the loop counter to N/2 */
 274   i = (uint32_t) S->Nby2;
 275
 276   do
 277   {
 278     /* Re-ordering of even and odd elements */
 279     /* pState[i] =  pInlineBuffer[2*i] */
 280     *pS1++ = *pbuff++;
 281     /* pState[N-i-1] = pInlineBuffer[2*i+1] */
 282     *pS2-- = *pbuff++;
 283
 284     /* Decrement the loop counter */
 285     i--;
 286   } while (i > 0U);
 287
 288   /* pbuff initialized to input buffer */
 289   pbuff = pInlineBuffer;
 290
 291   /* pS1 initialized to pState */
 292   pS1 = pState;
 293
 294   /* Initializing the loop counter */
 295   i = (uint32_t) S->N;
 296
 297   do
 298   {
 299     /* Writing the re-ordered output back to inplace input buffer */
 300     *pbuff++ = *pS1++;
 301
 302     /* Decrement the loop counter */
 303     i--;
 304   } while (i > 0U);
 305
 306
 307   /* ---------------------------------------------------------
 308    *     Step2: Calculate RFFT for N-point input
 309    * ---------------------------------------------------------- */
 310   /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
 311   arm_rfft_q15(S->pRfft, pInlineBuffer, pState);
 312
 313  /*----------------------------------------------------------------------
 314   *  Step3: Multiply the FFT output with the weights.
 315   *----------------------------------------------------------------------*/
 316   arm_cmplx_mult_cmplx_q15(pState, weights, pState, S->N);
 317
 318   /* The output of complex multiplication is in 3.13 format.
 319    * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.15 format by shifting left by 2 bits. */
 320   arm_shift_q15(pState, 2, pState, S->N * 2);
 321
 322   /* ----------- Post-processing ---------- */
 323   /* DCT-IV can be obtained from DCT-II by the equation,
 324    *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
 325    *       Hence, Y4(0) = Y2(0)/2  */
 326   /* Getting only real part from the output and Converting to DCT-IV */
 327
 328   /* Initializing the loop counter */
 329   i = ((uint32_t) S->N - 1U);
 330
 331   /* pbuff initialized to input buffer. */
 332   pbuff = pInlineBuffer;
 333
 334   /* pS1 initialized to pState */
 335   pS1 = pState;
 336
 337   /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
 338   in = *pS1++ >> 1U;
 339   /* input buffer acts as inplace, so output values are stored in the input itself. */
 340   *pbuff++ = in;
 341
 342   /* pState pointer is incremented twice as the real values are located alternatively in the array */
 343   pS1++;
 344
 345   do
 346   {
 347     /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
 348     /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
 349     in = *pS1++ - in;
 350     *pbuff++ = in;
 351     /* points to the next real value */
 352     pS1++;
 353
 354     /* Decrement the loop counter */
 355     i--;
 356   } while (i > 0U);
 357
 358    /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
 359
 360   /* Initializing the loop counter */
 361   i = (uint32_t) S->N;
 362
 363   /* pbuff initialized to the pInlineBuffer(now contains the output values) */
 364   pbuff = pInlineBuffer;
 365
 366   do
 367   {
 368     /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
 369     in = *pbuff;
 370     *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
 371
 372     /* Decrement the loop counter */
 373     i--;
 374   } while (i > 0U);
 375
 376 #endif /* #if defined (ARM_MATH_DSP) */
 377
 378 }
 379
 380 /**
 381    * @} end of DCT4_IDCT4 group
 382    */