lib/main/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_q31.c

   1 /* ----------------------------------------------------------------------
   2  * Project:      CMSIS DSP Library
   3  * Title:        arm_mat_cmplx_mult_q31.c
   4  * Description:  Q31 complex matrix multiplication
   5  *
   6  * $Date:        27. January 2017
   7  * $Revision:    V.1.5.1
   8  *
   9  * Target Processor: Cortex-M cores
  10  * -------------------------------------------------------------------- */
  11 /*
  12  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
  13  *
  14  * SPDX-License-Identifier: Apache-2.0
  15  *
  16  * Licensed under the Apache License, Version 2.0 (the License); you may
  17  * not use this file except in compliance with the License.
  18  * You may obtain a copy of the License at
  19  *
  20  * www.apache.org/licenses/LICENSE-2.0
  21  *
  22  * Unless required by applicable law or agreed to in writing, software
  23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  25  * See the License for the specific language governing permissions and
  26  * limitations under the License.
  27  */
  28
  29 #include "arm_math.h"
  30
  31 /**
  32  * @ingroup groupMatrix
  33  */
  34
  35 /**
  36  * @addtogroup CmplxMatrixMult
  37  * @{
  38  */
  39
  40 /**
  41  * @brief Q31 Complex matrix multiplication
  42  * @param[in]       *pSrcA points to the first input complex matrix structure
  43  * @param[in]       *pSrcB points to the second input complex matrix structure
  44  * @param[out]      *pDst points to output complex matrix structure
  45  * @return              The function returns either
  46  * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
  47  *
  48  * @details
  49  * <b>Scaling and Overflow Behavior:</b>
  50  *
  51  * \par
  52  * The function is implemented using an internal 64-bit accumulator.
  53  * The accumulator has a 2.62 format and maintains full precision of the intermediate
  54  * multiplication results but provides only a single guard bit. There is no saturation
  55  * on intermediate additions. Thus, if the accumulator overflows it wraps around and
  56  * distorts the result. The input signals should be scaled down to avoid intermediate
  57  * overflows. The input is thus scaled down by log2(numColsA) bits
  58  * to avoid overflows, as a total of numColsA additions are performed internally.
  59  * The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
  60  *
  61  *
  62  */
  63
  64 arm_status arm_mat_cmplx_mult_q31(
  65   const arm_matrix_instance_q31 * pSrcA,
  66   const arm_matrix_instance_q31 * pSrcB,
  67   arm_matrix_instance_q31 * pDst)
  68 {
  69   q31_t *pIn1 = pSrcA->pData;                    /* input data matrix pointer A */
  70   q31_t *pIn2 = pSrcB->pData;                    /* input data matrix pointer B */
  71   q31_t *pInA = pSrcA->pData;                    /* input data matrix pointer A  */
  72   q31_t *pOut = pDst->pData;                     /* output data matrix pointer */
  73   q31_t *px;                                     /* Temporary output data matrix pointer */
  74   uint16_t numRowsA = pSrcA->numRows;            /* number of rows of input matrix A */
  75   uint16_t numColsB = pSrcB->numCols;            /* number of columns of input matrix B */
  76   uint16_t numColsA = pSrcA->numCols;            /* number of columns of input matrix A */
  77   q63_t sumReal1, sumImag1;                      /* accumulator */
  78   q31_t a0, b0, c0, d0;
  79   q31_t a1, b1, c1, d1;
  80
  81
  82   /* Run the below code for Cortex-M4 and Cortex-M3 */
  83
  84   uint16_t col, i = 0U, j, row = numRowsA, colCnt;      /* loop counters */
  85   arm_status status;                             /* status of matrix multiplication */
  86
  87 #ifdef ARM_MATH_MATRIX_CHECK
  88
  89
  90   /* Check for matrix mismatch condition */
  91   if ((pSrcA->numCols != pSrcB->numRows) ||
  92      (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
  93   {
  94
  95     /* Set status as ARM_MATH_SIZE_MISMATCH */
  96     status = ARM_MATH_SIZE_MISMATCH;
  97   }
  98   else
  99 #endif /*      #ifdef ARM_MATH_MATRIX_CHECK    */
 100
 101   {
 102     /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
 103     /* row loop */
 104     do
 105     {
 106       /* Output pointer is set to starting address of the row being processed */
 107       px = pOut + 2 * i;
 108
 109       /* For every row wise process, the column loop counter is to be initiated */
 110       col = numColsB;
 111
 112       /* For every row wise process, the pIn2 pointer is set
 113        ** to the starting address of the pSrcB data */
 114       pIn2 = pSrcB->pData;
 115
 116       j = 0U;
 117
 118       /* column loop */
 119       do
 120       {
 121         /* Set the variable sum, that acts as accumulator, to zero */
 122         sumReal1 = 0.0;
 123         sumImag1 = 0.0;
 124
 125         /* Initiate the pointer pIn1 to point to the starting address of the column being processed */
 126         pIn1 = pInA;
 127
 128         /* Apply loop unrolling and compute 4 MACs simultaneously. */
 129         colCnt = numColsA >> 2;
 130
 131         /* matrix multiplication        */
 132         while (colCnt > 0U)
 133         {
 134
 135           /* Reading real part of complex matrix A */
 136           a0 = *pIn1;
 137
 138           /* Reading real part of complex matrix B */
 139           c0 = *pIn2;
 140
 141           /* Reading imaginary part of complex matrix A */
 142           b0 = *(pIn1 + 1U);
 143
 144           /* Reading imaginary part of complex matrix B */
 145           d0 = *(pIn2 + 1U);
 146
 147           /* Multiply and Accumlates */
 148           sumReal1 += (q63_t) a0 *c0;
 149           sumImag1 += (q63_t) b0 *c0;
 150
 151           /* update pointers */
 152           pIn1 += 2U;
 153           pIn2 += 2 * numColsB;
 154
 155           /* Multiply and Accumlates */
 156           sumReal1 -= (q63_t) b0 *d0;
 157           sumImag1 += (q63_t) a0 *d0;
 158
 159           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
 160
 161           /* read real and imag values from pSrcA and pSrcB buffer */
 162           a1 = *pIn1;
 163           c1 = *pIn2;
 164           b1 = *(pIn1 + 1U);
 165           d1 = *(pIn2 + 1U);
 166
 167           /* Multiply and Accumlates */
 168           sumReal1 += (q63_t) a1 *c1;
 169           sumImag1 += (q63_t) b1 *c1;
 170
 171           /* update pointers */
 172           pIn1 += 2U;
 173           pIn2 += 2 * numColsB;
 174
 175           /* Multiply and Accumlates */
 176           sumReal1 -= (q63_t) b1 *d1;
 177           sumImag1 += (q63_t) a1 *d1;
 178
 179           a0 = *pIn1;
 180           c0 = *pIn2;
 181
 182           b0 = *(pIn1 + 1U);
 183           d0 = *(pIn2 + 1U);
 184
 185           /* Multiply and Accumlates */
 186           sumReal1 += (q63_t) a0 *c0;
 187           sumImag1 += (q63_t) b0 *c0;
 188
 189           /* update pointers */
 190           pIn1 += 2U;
 191           pIn2 += 2 * numColsB;
 192
 193           /* Multiply and Accumlates */
 194           sumReal1 -= (q63_t) b0 *d0;
 195           sumImag1 += (q63_t) a0 *d0;
 196
 197           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
 198
 199           a1 = *pIn1;
 200           c1 = *pIn2;
 201
 202           b1 = *(pIn1 + 1U);
 203           d1 = *(pIn2 + 1U);
 204
 205           /* Multiply and Accumlates */
 206           sumReal1 += (q63_t) a1 *c1;
 207           sumImag1 += (q63_t) b1 *c1;
 208
 209           /* update pointers */
 210           pIn1 += 2U;
 211           pIn2 += 2 * numColsB;
 212
 213           /* Multiply and Accumlates */
 214           sumReal1 -= (q63_t) b1 *d1;
 215           sumImag1 += (q63_t) a1 *d1;
 216
 217           /* Decrement the loop count */
 218           colCnt--;
 219         }
 220
 221         /* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
 222          ** No loop unrolling is used. */
 223         colCnt = numColsA % 0x4U;
 224
 225         while (colCnt > 0U)
 226         {
 227           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
 228           a1 = *pIn1;
 229           c1 = *pIn2;
 230
 231           b1 = *(pIn1 + 1U);
 232           d1 = *(pIn2 + 1U);
 233
 234           /* Multiply and Accumlates */
 235           sumReal1 += (q63_t) a1 *c1;
 236           sumImag1 += (q63_t) b1 *c1;
 237
 238           /* update pointers */
 239           pIn1 += 2U;
 240           pIn2 += 2 * numColsB;
 241
 242           /* Multiply and Accumlates */
 243           sumReal1 -= (q63_t) b1 *d1;
 244           sumImag1 += (q63_t) a1 *d1;
 245
 246           /* Decrement the loop counter */
 247           colCnt--;
 248         }
 249
 250         /* Store the result in the destination buffer */
 251         *px++ = (q31_t) clip_q63_to_q31(sumReal1 >> 31);
 252         *px++ = (q31_t) clip_q63_to_q31(sumImag1 >> 31);
 253
 254         /* Update the pointer pIn2 to point to the  starting address of the next column */
 255         j++;
 256         pIn2 = pSrcB->pData + 2U * j;
 257
 258         /* Decrement the column loop counter */
 259         col--;
 260
 261       } while (col > 0U);
 262
 263       /* Update the pointer pInA to point to the  starting address of the next row */
 264       i = i + numColsB;
 265       pInA = pInA + 2 * numColsA;
 266
 267       /* Decrement the row loop counter */
 268       row--;
 269
 270     } while (row > 0U);
 271
 272     /* Set status as ARM_MATH_SUCCESS */
 273     status = ARM_MATH_SUCCESS;
 274   }
 275
 276   /* Return to application */
 277   return (status);
 278 }
 279
 280 /**
 281  * @} end of CmplxMatrixMult group
 282  */