lib/main/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_q15.c

   1 /* ----------------------------------------------------------------------
   2  * Project:      CMSIS DSP Library
   3  * Title:        arm_mat_mult_q15.c
   4  * Description:  Q15 matrix multiplication
   5  *
   6  * $Date:        27. January 2017
   7  * $Revision:    V.1.5.1
   8  *
   9  * Target Processor: Cortex-M cores
  10  * -------------------------------------------------------------------- */
  11 /*
  12  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
  13  *
  14  * SPDX-License-Identifier: Apache-2.0
  15  *
  16  * Licensed under the Apache License, Version 2.0 (the License); you may
  17  * not use this file except in compliance with the License.
  18  * You may obtain a copy of the License at
  19  *
  20  * www.apache.org/licenses/LICENSE-2.0
  21  *
  22  * Unless required by applicable law or agreed to in writing, software
  23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  25  * See the License for the specific language governing permissions and
  26  * limitations under the License.
  27  */
  28
  29 #include "arm_math.h"
  30
  31 /**
  32  * @ingroup groupMatrix
  33  */
  34
  35 /**
  36  * @addtogroup MatrixMult
  37  * @{
  38  */
  39
  40
  41 /**
  42  * @brief Q15 matrix multiplication
  43  * @param[in]       *pSrcA points to the first input matrix structure
  44  * @param[in]       *pSrcB points to the second input matrix structure
  45  * @param[out]      *pDst points to output matrix structure
  46  * @param[in]       *pState points to the scratch array that receives the transpose of pSrcB (used only when ARM_MATH_DSP is defined, unused otherwise)
  47  * @return          The function returns either
  48  * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
  49  *
  50  * @details
  51  * <b>Scaling and Overflow Behavior:</b>
  52  *
  53  * \par
  54  * The function is implemented using a 64-bit internal accumulator. The inputs to the
  55  * multiplications are in 1.15 format and multiplications yield a 2.30 result.
  56  * The 2.30 intermediate
  57  * results are accumulated in a 64-bit accumulator in 34.30 format. This approach
  58  * provides 33 guard bits and there is no risk of overflow. The 34.30 result is then
  59  * truncated to 34.15 format by discarding the low 15 bits and then saturated to
  60  * 1.15 format.
  61  *
  62  * \par
  63  * Refer to <code>arm_mat_mult_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
  64  *
  65  */
  66
  67 arm_status arm_mat_mult_q15(
  68   const arm_matrix_instance_q15 * pSrcA,
  69   const arm_matrix_instance_q15 * pSrcB,
  70   arm_matrix_instance_q15 * pDst,
  71   q15_t * pState)
  72 {
  73   q63_t sum;                                     /* 64-bit accumulator for one output element */
  74
  75 #if defined (ARM_MATH_DSP)
  76
  77   /* DSP path (ARM_MATH_DSP defined): pSrcB is first transposed into pState, then every row of pSrcA is multiplied with every row of that transpose */
  78
  79   q15_t *pSrcBT = pState;                        /* scratch buffer that receives the transpose of pSrcB */
  80   q15_t *pInA = pSrcA->pData;                    /* input data matrix pointer A of Q15 type */
  81   q15_t *pInB = pSrcB->pData;                    /* input data matrix pointer B of Q15 type */
  82   q15_t *px;                                     /* Temporary output data matrix pointer */
  83   uint16_t numRowsA = pSrcA->numRows;            /* number of rows of input matrix A    */
  84   uint16_t numColsB = pSrcB->numCols;            /* number of columns of input matrix B */
  85   uint16_t numColsA = pSrcA->numCols;            /* number of columns of input matrix A */
  86   uint16_t numRowsB = pSrcB->numRows;            /* number of rows of input matrix B    */
  87   uint16_t col, i = 0U, row = numRowsB, colCnt;  /* loop counters */
  88   arm_status status;                             /* status of matrix multiplication */
  89
  90 #ifndef UNALIGNED_SUPPORT_DISABLE
  91
  92   q31_t in;                                      /* Temporary variable to hold two packed q15 input values */
  93   q31_t pSourceA1, pSourceB1, pSourceA2, pSourceB2;
  94
  95 #else
  96
  97   q15_t in;                                      /* Temporary variable to hold the input value */
  98   q15_t inA1, inB1, inA2, inB2;
  99
 100 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
 101
 102 #ifdef ARM_MATH_MATRIX_CHECK
 103   /* Check for matrix mismatch condition */
 104   if ((pSrcA->numCols != pSrcB->numRows) ||
 105      (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
 106   {
 107     /* Set status as ARM_MATH_SIZE_MISMATCH */
 108     status = ARM_MATH_SIZE_MISMATCH;
 109   }
 110   else
 111 #endif /*    #ifdef ARM_MATH_MATRIX_CHECK    */
 112   {
 113     /* Matrix transpose: writes numRowsB * numColsB values into pSrcBT, one row of pSrcB per outer iteration */
 114     do
 115     {
 116       /* Apply loop unrolling and exchange the columns with row elements */
 117       col = numColsB >> 2;
 118
 119       /* The pointer px is set to starting address of the column being processed */
 120       px = pSrcBT + i;
 121
 122       /* First part of the processing with loop unrolling.  Copy 4 elements per iteration.
 123        ** a second loop below copies the remaining 1 to 3 elements. */
 124       while (col > 0U)
 125       {
 126 #ifndef UNALIGNED_SUPPORT_DISABLE
 127
 128         /* Read two elements from the row with a single 32-bit access */
 129         in = *__SIMD32(pInB)++;
 130
 131         /* Unpack and store one element in the destination */
 132 #ifndef ARM_MATH_BIG_ENDIAN
 133
 134         *px = (q15_t) in;
 135
 136 #else
 137
 138         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
 139
 140 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
 141
 142         /* Update the pointer px to point to the next row of the transposed matrix */
 143         px += numRowsB;
 144
 145         /* Unpack and store the second element in the destination */
 146 #ifndef ARM_MATH_BIG_ENDIAN
 147
 148         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
 149
 150 #else
 151
 152         *px = (q15_t) in;
 153
 154 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
 155
 156         /* Update the pointer px to point to the next row of the transposed matrix */
 157         px += numRowsB;
 158
 159         /* Read two elements from the row with a single 32-bit access */
 160         in = *__SIMD32(pInB)++;
 161
 162         /* Unpack and store one element in the destination */
 163 #ifndef ARM_MATH_BIG_ENDIAN
 164
 165         *px = (q15_t) in;
 166
 167 #else
 168
 169         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
 170
 171 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
 172
 173         /* Update the pointer px to point to the next row of the transposed matrix */
 174         px += numRowsB;
 175
 176         /* Unpack and store the second element in the destination */
 177
 178 #ifndef ARM_MATH_BIG_ENDIAN
 179
 180         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
 181
 182 #else
 183
 184         *px = (q15_t) in;
 185
 186 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
 187
 188         /* Update the pointer px to point to the next row of the transposed matrix */
 189         px += numRowsB;
 190
 191 #else
 192
 193         /* Read one element from the row */
 194         in = *pInB++;
 195
 196         /* Store one element in the destination */
 197         *px = in;
 198
 199         /* Update the pointer px to point to the next row of the transposed matrix */
 200         px += numRowsB;
 201
 202         /* Read one element from the row */
 203         in = *pInB++;
 204
 205         /* Store one element in the destination */
 206         *px = in;
 207
 208         /* Update the pointer px to point to the next row of the transposed matrix */
 209         px += numRowsB;
 210
 211         /* Read one element from the row */
 212         in = *pInB++;
 213
 214         /* Store one element in the destination */
 215         *px = in;
 216
 217         /* Update the pointer px to point to the next row of the transposed matrix */
 218         px += numRowsB;
 219
 220         /* Read one element from the row */
 221         in = *pInB++;
 222
 223         /* Store one element in the destination */
 224         *px = in;
 225
 226         /* Update the pointer px to point to the next row of the transposed matrix */
 227         px += numRowsB;
 228
 229 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
 230
 231        /* Decrement the column loop counter */
 232         col--;
 233       }
 234
 235       /* If the number of columns of pSrcB is not a multiple of 4, copy the remaining elements here.
 236        ** No loop unrolling is used. */
 237       col = numColsB % 0x4U;
 238
 239       while (col > 0U)
 240       {
 241         /* Read and store the input element in the destination */
 242         *px = *pInB++;
 243
 244         /* Update the pointer px to point to the next row of the transposed matrix */
 245         px += numRowsB;
 246
 247         /* Decrement the column loop counter */
 248         col--;
 249       }
 250
 251       i++;  /* the next row of pSrcB becomes the next column of the transpose */
 252
 253       /* Decrement the row loop counter */
 254       row--;
 255
 256     } while (row > 0U);
 257
 258     /* Reset the variables for the usage in the following multiplication process */
 259     row = numRowsA;
 260     i = 0U;
 261     px = pDst->pData;
 262
 263     /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
 264     /* row loop */
 265     do
 266     {
 267       /* For every row wise process, the column loop counter is to be initiated */
 268       col = numColsB;
 269
 270       /* For every row wise process, the pInB pointer is set
 271        ** to the starting address of the transposed pSrcB data */
 272       pInB = pSrcBT;
 273
 274       /* column loop */
 275       do
 276       {
 277         /* Set the variable sum, that acts as accumulator, to zero */
 278         sum = 0;
 279
 280         /* Apply loop unrolling and compute 4 MACs per loop iteration. */
 281         colCnt = numColsA >> 2;
 282
 283         /* Set the pointer pInA to the starting address of the row of pSrcA being processed */
 284         pInA = pSrcA->pData + i;
 285
 286
 287         /* matrix multiplication */
 288         while (colCnt > 0U)
 289         {
 290           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
 291 #ifndef UNALIGNED_SUPPORT_DISABLE
 292
 293           /* read two packed q15 values at a time from the pSrcA row and from the transposed pSrcB buffer */
 294           pSourceA1 = *__SIMD32(pInA)++;
 295           pSourceB1 = *__SIMD32(pInB)++;
 296
 297           pSourceA2 = *__SIMD32(pInA)++;
 298           pSourceB2 = *__SIMD32(pInB)++;
 299
 300           /* Multiply and accumulate */
 301           sum = __SMLALD(pSourceA1, pSourceB1, sum);
 302           sum = __SMLALD(pSourceA2, pSourceB2, sum);
 303
 304 #else
 305           /* read single q15 values from the pSrcA row and from the transposed pSrcB buffer */
 306           inA1 = *pInA++;
 307           inB1 = *pInB++;
 308           inA2 = *pInA++;
 309           /* Multiply and accumulate */
 310           sum += inA1 * inB1;
 311           inB2 = *pInB++;
 312
 313           inA1 = *pInA++;
 314           inB1 = *pInB++;
 315           /* Multiply and accumulate */
 316           sum += inA2 * inB2;
 317           inA2 = *pInA++;
 318           inB2 = *pInB++;
 319
 320           /* Multiply and accumulate */
 321           sum += inA1 * inB1;
 322           sum += inA2 * inB2;
 323
 324 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
 325
 326           /* Decrement the loop counter */
 327           colCnt--;
 328         }
 329
 330         /* process the remaining 0 to 3 column samples */
 331         colCnt = numColsA & 3U;
 332
 333         while (colCnt > 0U)
 334         {
 335           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
 336           sum += *pInA++ * *pInB++;
 337
 338           /* Decrement the loop counter */
 339           colCnt--;
 340         }
 341
 342         /* Drop the low 15 bits, saturate to 16 bits and store the result in the destination buffer */
 343         *px = (q15_t) (__SSAT((sum >> 15), 16));
 344         px++;
 345
 346         /* Decrement the column loop counter */
 347         col--;
 348
 349       } while (col > 0U);
 350
 351       i = i + numColsA;  /* offset of the next row of pSrcA */
 352
 353       /* Decrement the row loop counter */
 354       row--;
 355
 356     } while (row > 0U);
 357
 358 #else
 359
 360   /* Generic path (ARM_MATH_DSP not defined): plain C loops, pState is not referenced */
 361
 362   q15_t *pIn1 = pSrcA->pData;                    /* input data matrix pointer A */
 363   q15_t *pIn2 = pSrcB->pData;                    /* input data matrix pointer B */
 364   q15_t *pInA = pSrcA->pData;                    /* input data matrix pointer A of Q15 type */
 365   q15_t *pInB = pSrcB->pData;                    /* input data matrix pointer B of Q15 type */
 366   q15_t *pOut = pDst->pData;                     /* output data matrix pointer */
 367   q15_t *px;                                     /* Temporary output data matrix pointer */
 368   uint16_t numColsB = pSrcB->numCols;            /* number of columns of input matrix B */
 369   uint16_t numColsA = pSrcA->numCols;            /* number of columns of input matrix A */
 370   uint16_t numRowsA = pSrcA->numRows;            /* number of rows of input matrix A    */
 371   uint16_t col, i = 0U, row = numRowsA, colCnt;  /* loop counters */
 372   arm_status status;                             /* status of matrix multiplication */
 373
 374 #ifdef ARM_MATH_MATRIX_CHECK
 375
 376   /* Check for matrix mismatch condition */
 377   if ((pSrcA->numCols != pSrcB->numRows) ||
 378      (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
 379   {
 380     /* Set status as ARM_MATH_SIZE_MISMATCH */
 381     status = ARM_MATH_SIZE_MISMATCH;
 382   }
 383   else
 384 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
 385
 386   {
 387     /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
 388     /* row loop */
 389     do
 390     {
 391       /* Output pointer is set to starting address of the row being processed */
 392       px = pOut + i;
 393
 394       /* For every row wise process, the column loop counter is to be initiated */
 395       col = numColsB;
 396
 397       /* For every row wise process, the pIn2 pointer is set
 398        ** to the starting address of the pSrcB data */
 399       pIn2 = pSrcB->pData;
 400
 401       /* column loop */
 402       do
 403       {
 404         /* Set the variable sum, that acts as accumulator, to zero */
 405         sum = 0;
 406
 407         /* Set the pointer pIn1 to the starting address of the current row of pSrcA */
 408         pIn1 = pInA;
 409
 410         /* Matrix A columns number of MAC operations are to be performed */
 411         colCnt = numColsA;
 412
 413         /* matrix multiplication */
 414         while (colCnt > 0U)
 415         {
 416           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
 417           /* Perform the multiply-accumulate, then step pIn2 down one row within the same column of pSrcB */
 418           sum += (q31_t) * pIn1++ * *pIn2;
 419           pIn2 += numColsB;
 420
 421           /* Decrement the loop counter */
 422           colCnt--;
 423         }
 424
 425         /* Convert the result from 34.30 to 1.15 format and store the saturated value in destination buffer */
 426         /* Saturate and store the result in the destination buffer */
 427         *px++ = (q15_t) __SSAT((sum >> 15), 16);
 428
 429         /* Decrement the column loop counter */
 430         col--;
 431
 432         /* Update the pointer pIn2 to point to the starting address of the next column */
 433         pIn2 = pInB + (numColsB - col);
 434
 435       } while (col > 0U);
 436
 437       /* Advance the output offset i and the pointer pInA to the starting address of the next row */
 438       i = i + numColsB;
 439       pInA = pInA + numColsA;
 440
 441       /* Decrement the row loop counter */
 442       row--;
 443
 444     } while (row > 0U);
 445
 446 #endif /* #if defined (ARM_MATH_DSP) */
 447     /* set status as ARM_MATH_SUCCESS */
 448     status = ARM_MATH_SUCCESS;
 449   }
 450
 451   /* Return to application */
 452   return (status);
 453 }
 454
 455 /**
 456  * @} end of MatrixMult group
 457  */