lib/main/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_fast_q15.c

   1 /* ----------------------------------------------------------------------
   2  * Project:      CMSIS DSP Library
   3  * Title:        arm_mat_mult_fast_q15.c
   4  * Description:  Q15 matrix multiplication (fast variant)
   5  *
   6  * $Date:        27. January 2017
   7  * $Revision:    V.1.5.1
   8  *
   9  * Target Processor: Cortex-M cores
  10  * -------------------------------------------------------------------- */
  11 /*
  12  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
  13  *
  14  * SPDX-License-Identifier: Apache-2.0
  15  *
  16  * Licensed under the Apache License, Version 2.0 (the License); you may
  17  * not use this file except in compliance with the License.
  18  * You may obtain a copy of the License at
  19  *
  20  * www.apache.org/licenses/LICENSE-2.0
  21  *
  22  * Unless required by applicable law or agreed to in writing, software
  23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  25  * See the License for the specific language governing permissions and
  26  * limitations under the License.
  27  */
  28
  29 #include "arm_math.h"
  30
  31 /**
  32  * @ingroup groupMatrix
  33  */
  34
  35 /**
  36  * @addtogroup MatrixMult
  37  * @{
  38  */
  39
  40
  41 /**
  42  * @brief Q15 matrix multiplication (fast variant) for Cortex-M3 and Cortex-M4
  43  * @param[in]       *pSrcA points to the first input matrix structure
  44  * @param[in]       *pSrcB points to the second input matrix structure
  45  * @param[out]      *pDst points to output matrix structure
  46  * @param[in]       *pState points to the scratch array that receives the transposed copy of pSrcB (numRows x numCols of pSrcB elements are written)
  47  * @return          The function returns either
  48  * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
  49  * The size check is compiled in only when ARM_MATH_MATRIX_CHECK is defined; otherwise ARM_MATH_SUCCESS is always returned.
  50  * @details
  51  * <b>Scaling and Overflow Behavior:</b>
  52  *
  53  * \par
  54  * The difference between the function arm_mat_mult_q15() and this fast variant is that
  55  * the fast variant uses a 32-bit rather than a 64-bit accumulator.
  56  * The result of each 1.15 x 1.15 multiplication is in 2.30 format, and these intermediate
  57  * results are accumulated in a 32-bit register in 2.30 format. Finally, the accumulator is
  58  * shifted right by 15 bits and narrowed to a 1.15 result; no explicit saturation is applied at that step here.
  59  *
  60  * \par
  61  * NOTE(review): this text previously stated that the fast version has the same overflow
  62  * behavior as the standard version and discards the low 16 bits of each product; the scalar
  63  * code paths below accumulate the full products, so confirm both statements against arm_mat_mult_q15().
  64  * In order to avoid overflows completely the input signals must be scaled down: scale down
  65  * one of the input matrices by log2(numColsA) bits, as a total of numColsA additions are
  66  * computed internally for each output element.
  67  *
  68  * \par
  69  * See <code>arm_mat_mult_q15()</code> for a slower implementation of this function
  70  * which uses 64-bit accumulation to provide higher precision.
  71  */
  72
  73 arm_status arm_mat_mult_fast_q15(
  74   const arm_matrix_instance_q15 * pSrcA,
  75   const arm_matrix_instance_q15 * pSrcB,
  76   arm_matrix_instance_q15 * pDst,
  77   q15_t * pState)
  78 {
  79   q31_t sum;                                     /* accumulator */
  80   q15_t *pSrcBT = pState;                        /* the transposed copy of matrix B is built in pState */
  81   q15_t *pInA = pSrcA->pData;                    /* input data matrix pointer A of Q15 type */
  82   q15_t *pInB = pSrcB->pData;                    /* input data matrix pointer B of Q15 type */
  83   q15_t *px;                                     /* Temporary output data matrix pointer */
  84   uint16_t numRowsA = pSrcA->numRows;            /* number of rows of input matrix A    */
  85   uint16_t numColsB = pSrcB->numCols;            /* number of columns of input matrix B */
  86   uint16_t numColsA = pSrcA->numCols;            /* number of columns of input matrix A */
  87   uint16_t numRowsB = pSrcB->numRows;            /* number of rows of input matrix B    */
  88   uint32_t col, i = 0U, row = numRowsB, colCnt;  /* loop counters */
  89   arm_status status;                             /* status of matrix multiplication */
  90
  91 #ifndef UNALIGNED_SUPPORT_DISABLE
  92
  93   q31_t in;                                      /* Temporary variable to hold the input value */
  94   q31_t inA1, inA2, inB1, inB2;                  /* 32-bit reads from A and B; a single element in the odd-column tail */
  95   q31_t sum2, sum3, sum4;                        /* accumulators for the other three outputs of a 2x2 output tile */
  96   q15_t *pInA2, *pInB2, *px2;                    /* second row of A, second transposed row of B, second output row */
  97   uint32_t j = 0;                                /* offset into pSrcBT of the current pair of transposed rows */
  98
  99 #else
 100
 101   q15_t in;                                      /* Temporary variable to hold the input value */
 102   q15_t inA1, inA2, inB1, inB2;
 103
 104 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
 105
 106 #ifdef ARM_MATH_MATRIX_CHECK
 107   /* Check for matrix mismatch condition */
 108   if ((pSrcA->numCols != pSrcB->numRows) ||
 109      (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
 110   {
 111     /* Set status as ARM_MATH_SIZE_MISMATCH */
 112     status = ARM_MATH_SIZE_MISMATCH;
 113   }
 114   else
 115 #endif
 116   {
 117     /* Matrix transpose: copy pSrcB into pSrcBT with rows and columns exchanged */
 118     do
 119     {
 120       /* Apply loop unrolling and exchange the columns with row elements */
 121       col = numColsB >> 2;
 122
 123       /* The pointer px is set to starting address of the column being processed */
 124       px = pSrcBT + i;
 125
 126       /* First part of the processing with loop unrolling.  Move 4 elements at a time.
 127        ** a second loop below moves the remaining 1 to 3 elements. */
 128       while (col > 0U)
 129       {
 130 #ifndef UNALIGNED_SUPPORT_DISABLE
 131         /* Read two elements from the row */
 132         in = *__SIMD32(pInB)++;
 133
 134         /* Unpack and store one element in the destination */
 135 #ifndef ARM_MATH_BIG_ENDIAN
 136
 137         *px = (q15_t) in;
 138
 139 #else
 140
 141         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
 142
 143 #endif /*    #ifndef ARM_MATH_BIG_ENDIAN    */
 144
 145         /* Update the pointer px to point to the next row of the transposed matrix */
 146         px += numRowsB;
 147
 148         /* Unpack and store the second element in the destination */
 149 #ifndef ARM_MATH_BIG_ENDIAN
 150
 151         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
 152
 153 #else
 154
 155         *px = (q15_t) in;
 156
 157 #endif /*    #ifndef ARM_MATH_BIG_ENDIAN    */
 158
 159         /* Update the pointer px to point to the next row of the transposed matrix */
 160         px += numRowsB;
 161
 162         /* Read two elements from the row */
 163         in = *__SIMD32(pInB)++;
 164
 165         /* Unpack and store one element in the destination */
 166 #ifndef ARM_MATH_BIG_ENDIAN
 167
 168         *px = (q15_t) in;
 169
 170 #else
 171
 172         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
 173
 174 #endif /*    #ifndef ARM_MATH_BIG_ENDIAN    */
 175
 176         /* Update the pointer px to point to the next row of the transposed matrix */
 177         px += numRowsB;
 178
 179         /* Unpack and store the second element in the destination */
 180
 181 #ifndef ARM_MATH_BIG_ENDIAN
 182
 183         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
 184
 185 #else
 186
 187         *px = (q15_t) in;
 188
 189 #endif /*    #ifndef ARM_MATH_BIG_ENDIAN    */
 190
 191 #else
 192
 193         /* Read one element from the row */
 194         in = *pInB++;
 195
 196         /* Store one element in the destination */
 197         *px = in;
 198
 199         /* Update the pointer px to point to the next row of the transposed matrix */
 200         px += numRowsB;
 201
 202         /* Read one element from the row */
 203         in = *pInB++;
 204
 205         /* Store one element in the destination */
 206         *px = in;
 207
 208         /* Update the pointer px to point to the next row of the transposed matrix */
 209         px += numRowsB;
 210
 211         /* Read one element from the row */
 212         in = *pInB++;
 213
 214         /* Store one element in the destination */
 215         *px = in;
 216
 217         /* Update the pointer px to point to the next row of the transposed matrix */
 218         px += numRowsB;
 219
 220         /* Read one element from the row */
 221         in = *pInB++;
 222
 223         /* Store one element in the destination */
 224         *px = in;
 225
 226 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
 227
 228         /* Update the pointer px to point to the next row of the transposed matrix */
 229         px += numRowsB;
 230
 231         /* Decrement the column loop counter */
 232         col--;
 233       }
 234
 235       /* If the number of columns of pSrcB is not a multiple of 4, move the remaining elements here.
 236        ** No loop unrolling is used. */
 237       col = numColsB % 0x4U;
 238
 239       while (col > 0U)
 240       {
 241         /* Read and store the input element in the destination */
 242         *px = *pInB++;
 243
 244         /* Update the pointer px to point to the next row of the transposed matrix */
 245         px += numRowsB;
 246
 247         /* Decrement the column loop counter */
 248         col--;
 249       }
 250
 251       i++;                                         /* the next row of B lands in the next column of the transposed copy */
 252
 253       /* Decrement the row loop counter */
 254       row--;
 255
 256     } while (row > 0U);
 257
 258     /* Reset the variables for the usage in the following multiplication process */
 259     row = numRowsA;
 260     i = 0U;
 261     px = pDst->pData;
 262
 263 #ifndef UNALIGNED_SUPPORT_DISABLE
 264     /* Process two rows from matrix A at a time and output two rows at a time */
 265     row = row >> 1;
 266     px2 = px + numColsB;                           /* px2 tracks the output row directly below px */
 267 #endif
 268
 269     /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
 270     /* row loop */
 271     while (row > 0U)
 272     {
 273       /* For every row wise process, the column loop counter is to be initiated */
 274       col = numColsB;
 275
 276       /* For every row wise process, the pInB pointer is set
 277        ** to the starting address of the transposed pSrcB data */
 278       pInB = pSrcBT;
 279
 280 #ifndef UNALIGNED_SUPPORT_DISABLE
 281       /* Process two (transposed) columns from matrix B at a time */
 282       col = col >> 1;
 283       j = 0;
 284 #endif
 285
 286       /* column loop */
 287       while (col > 0U)
 288       {
 289         /* Set the variable sum, that acts as accumulator, to zero */
 290         sum = 0;
 291
 292         /* Initiate the pointer pInA to point to the starting address of the row of A being processed */
 293         pInA = pSrcA->pData + i;
 294
 295 #ifndef UNALIGNED_SUPPORT_DISABLE
 296         sum2 = 0;
 297         sum3 = 0;
 298         sum4 = 0;
 299         pInB  = pSrcBT + j;
 300         pInA2 = pInA + numColsA;
 301         pInB2 = pInB + numRowsB;
 302
 303         /* Read in two elements at once - allows dual MAC instruction */
 304         colCnt = numColsA >> 1;
 305 #else
 306         colCnt = numColsA >> 2;                    /* this path is unrolled by four elements per iteration */
 307 #endif
 308
 309         /* matrix multiplication */
 310         while (colCnt > 0U)
 311         {
 312           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
 313 #ifndef UNALIGNED_SUPPORT_DISABLE
 314
 315           inA1 = *__SIMD32(pInA)++;
 316           inB1 = *__SIMD32(pInB)++;
 317           inA2 = *__SIMD32(pInA2)++;
 318           inB2 = *__SIMD32(pInB2)++;
 319
 320           sum  = __SMLAD(inA1, inB1, sum);
 321           sum2 = __SMLAD(inA1, inB2, sum2);
 322           sum3 = __SMLAD(inA2, inB1, sum3);
 323           sum4 = __SMLAD(inA2, inB2, sum4);
 324
 325 #else
 326
 327           inA1 = *pInA;
 328           inB1 = *pInB;
 329           sum += inA1 * inB1;
 330
 331           inA2 = pInA[1];
 332           inB2 = pInB[1];
 333           sum += inA2 * inB2;
 334
 335           inA1 = pInA[2];
 336           inB1 = pInB[2];
 337           sum += inA1 * inB1;
 338
 339           inA2 = pInA[3];
 340           inB2 = pInB[3];
 341           sum += inA2 * inB2;
 342
 343           pInA += 4;
 344           pInB += 4;
 345
 346 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
 347
 348           /* Decrement the loop counter */
 349           colCnt--;
 350         }
 351
 352         /* process odd column samples */
 353 #ifndef UNALIGNED_SUPPORT_DISABLE
 354         if (numColsA & 1U) {
 355           inA1 = *pInA++;
 356           inB1 = *pInB++;
 357           inA2 = *pInA2++;
 358           inB2 = *pInB2++;
 359           sum  += inA1 * inB1;
 360           sum2 += inA1 * inB2;
 361           sum3 += inA2 * inB1;
 362           sum4 += inA2 * inB2;
 363         }
 364 #else
 365         colCnt = numColsA % 0x4U;
 366
 367         while (colCnt > 0U)
 368         {
 369           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
 370           sum += (q31_t) (*pInA++) * (*pInB++);
 371
 372           colCnt--;
 373         }
 374 #endif
 375
 376         /* Shift the accumulator right by 15 bits and store it; the cast narrows without explicit saturation */
 377         *px++  = (q15_t) (sum >> 15);
 378
 379 #ifndef UNALIGNED_SUPPORT_DISABLE
 380         *px++  = (q15_t) (sum2 >> 15);
 381         *px2++ = (q15_t) (sum3 >> 15);
 382         *px2++ = (q15_t) (sum4 >> 15);
 383         j += numRowsB * 2;                         /* step to the next pair of rows in the transposed copy of B */
 384 #endif
 385
 386         /* Decrement the column loop counter */
 387         col--;
 388
 389       }
 390
 391       i = i + numColsA;                            /* advance to the next row of A */
 392
 393 #ifndef UNALIGNED_SUPPORT_DISABLE
 394       i = i + numColsA;                            /* two rows of A were consumed in this pass */
 395       px = px2 + (numColsB & 1U);                  /* step over the last column when numColsB is odd; it is filled in below */
 396       px2 = px + numColsB;
 397 #endif
 398
 399       /* Decrement the row loop counter */
 400       row--;
 401
 402     }
 403
 404     /* Compute any remaining odd row/column below */
 405
 406 #ifndef UNALIGNED_SUPPORT_DISABLE
 407
 408     /* Compute remaining output column */
 409     if (numColsB & 1U) {
 410
 411       /* Cover only the rows handled by the paired-row loop above; an odd last row is computed in full further below */
 412       row = numRowsA & (~0x1);
 413
 414       /* Point to remaining unfilled column in output matrix */
 415       px = pDst->pData+numColsB-1;
 416       pInA = pSrcA->pData;
 417
 418       /* row loop */
 419       while (row > 0)
 420       {
 421
 422         /* point to last column in matrix B (stored as the last row of the transposed copy) */
 423         pInB  = pSrcBT + numRowsB*(numColsB-1);
 424
 425         /* Set the variable sum, that acts as accumulator, to zero */
 426         sum  = 0;
 427
 428         /* Compute 4 columns at once */
 429         colCnt = numColsA >> 2;
 430
 431         /* matrix multiplication */
 432         while (colCnt > 0U)
 433         {
 434           inA1 = *__SIMD32(pInA)++;
 435           inA2 = *__SIMD32(pInA)++;
 436           inB1 = *__SIMD32(pInB)++;
 437           inB2 = *__SIMD32(pInB)++;
 438
 439           sum  = __SMLAD(inA1, inB1, sum);
 440           sum  = __SMLAD(inA2, inB2, sum);
 441
 442           /* Decrement the loop counter */
 443           colCnt--;
 444         }
 445
 446         colCnt = numColsA & 3U;
 447         while (colCnt > 0U) {
 448           sum += (q31_t) (*pInA++) * (*pInB++);
 449           colCnt--;
 450         }
 451
 452         /* Store the result in the destination buffer */
 453         *px  = (q15_t) (sum  >> 15);
 454         px += numColsB;                            /* same column, next output row */
 455
 456         /* Decrement the row loop counter */
 457         row--;
 458       }
 459     }
 460
 461     /* Compute remaining output row */
 462     if (numRowsA & 1U) {
 463
 464       /* point to last row in output matrix */
 465       px = pDst->pData+(numColsB)*(numRowsA-1);
 466
 467       pInB  = pSrcBT;
 468       col = numColsB;
 469       i = 0U;
 470
 471       /* col loop */
 472       while (col > 0)
 473       {
 474
 475         /* point to last row in matrix A */
 476         pInA = pSrcA->pData + (numRowsA-1)*numColsA;
 477
 478         /* Set the variable sum, that acts as accumulator, to zero */
 479         sum  = 0;
 480
 481         /* Compute 4 columns at once */
 482         colCnt = numColsA >> 2;
 483
 484         /* matrix multiplication */
 485         while (colCnt > 0U)
 486         {
 487           inA1 = *__SIMD32(pInA)++;
 488           inA2 = *__SIMD32(pInA)++;
 489           inB1 = *__SIMD32(pInB)++;
 490           inB2 = *__SIMD32(pInB)++;
 491
 492           sum  = __SMLAD(inA1, inB1, sum);
 493           sum  = __SMLAD(inA2, inB2, sum);
 494
 495           /* Decrement the loop counter */
 496           colCnt--;
 497         }
 498
 499         colCnt = numColsA & 3U;
 500         while (colCnt > 0U) {
 501           sum += (q31_t) (*pInA++) * (*pInB++);
 502           colCnt--;
 503         }
 504
 505         /* Store the result in the destination buffer */
 506         *px++  = (q15_t) (sum  >> 15);
 507
 508         /* Decrement the col loop counter */
 509         col--;
 510       }
 511     }
 512
 513 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
 514
 515     /* set status as ARM_MATH_SUCCESS */
 516     status = ARM_MATH_SUCCESS;
 517   }
 518
 519   /* Return to application */
 520   return (status);
 521 }
 522
 523 /**
 524  * @} end of MatrixMult group
 525  */