1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_mat_mult_fast_q15.c
4 * Description: Q15 matrix multiplication (fast variant)
6 * $Date: 27. January 2017
9 * Target Processor: Cortex-M cores
10 * -------------------------------------------------------------------- */
12 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
14 * SPDX-License-Identifier: Apache-2.0
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
20 * www.apache.org/licenses/LICENSE-2.0
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
32 * @ingroup groupMatrix
36 * @addtogroup MatrixMult
42 * @brief Q15 matrix multiplication (fast variant) for Cortex-M3 and Cortex-M4
43 * @param[in] *pSrcA points to the first input matrix structure
44 * @param[in] *pSrcB points to the second input matrix structure
45 * @param[out] *pDst points to output matrix structure
46 * @param[in] *pState points to the array for storing intermediate results
47 * @return The function returns either
48 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
51 * <b>Scaling and Overflow Behavior:</b>
54 * The difference between the function arm_mat_mult_q15() and this fast variant is that
55 * the fast variant uses a 32-bit rather than a 64-bit accumulator.
56 * The result of each 1.15 x 1.15 multiplication is truncated to
57 * 2.30 format. These intermediate results are accumulated in a 32-bit register in 2.30
58 * format. Finally, the accumulator is saturated and converted to a 1.15 result.
61 * The fast version has the same overflow behavior as the standard version but provides
62 * less precision since it discards the low 16 bits of each multiplication result.
63 * In order to avoid overflows completely the input signals must be scaled down.
64 * Scale down one of the input matrices by log2(numColsA) bits to
65 * avoid overflows, as a total of numColsA additions are computed internally for each output element.
69 * See <code>arm_mat_mult_q15()</code> for a slower implementation of this function
70 * which uses 64-bit accumulation to provide higher precision.
/* Q15 matrix multiplication (fast variant): multiplies the matrix in pSrcA by
 * the matrix in pSrcB and writes Q15 results through pDst->pData, accumulating
 * products in 32-bit (q31_t) variables. The pState buffer is used through
 * pSrcBT to hold the transposed data of matrix B.
 * status is set to ARM_MATH_SIZE_MISMATCH when ARM_MATH_MATRIX_CHECK is defined
 * and the dimensions disagree, and to ARM_MATH_SUCCESS at the end of the
 * computation.
 * NOTE(review): pState is read below, but its parameter declaration is not
 * visible in this excerpt. */
73 arm_status
arm_mat_mult_fast_q15(
74 const arm_matrix_instance_q15
* pSrcA
,
75 const arm_matrix_instance_q15
* pSrcB
,
76 arm_matrix_instance_q15
* pDst
,
79 q31_t sum
; /* 32-bit accumulator for one output element */
80 q15_t
*pSrcBT
= pState
; /* pointer into the pState buffer, used for the transposed matrix data */
81 q15_t
*pInA
= pSrcA
->pData
; /* input data matrix pointer A of Q15 type */
82 q15_t
*pInB
= pSrcB
->pData
; /* input data matrix pointer B of Q15 type */
83 q15_t
*px
; /* Temporary output data matrix pointer */
84 uint16_t numRowsA
= pSrcA
->numRows
; /* number of rows of input matrix A */
85 uint16_t numColsB
= pSrcB
->numCols
; /* number of columns of input matrix B */
86 uint16_t numColsA
= pSrcA
->numCols
; /* number of columns of input matrix A */
87 uint16_t numRowsB
= pSrcB
->numRows
; /* number of rows of input matrix B */
88 uint32_t col
, i
= 0U, row
= numRowsB
, colCnt
; /* loop counters; row starts at numRowsB and i starts at 0 */
89 arm_status status
; /* status of matrix multiplication */
91 #ifndef UNALIGNED_SUPPORT_DISABLE
93 q31_t in
; /* Temporary variable to hold the input value (loaded through __SIMD32) */
94 q31_t inA1
, inA2
, inB1
, inB2
;
95 q31_t sum2
, sum3
, sum4
; /* additional accumulators used together with sum in the __SMLAD block below */
96 q15_t
*pInA2
, *pInB2
, *px2
; /* pInA2 = pInA + numColsA, pInB2 = pInB + numRowsB, px2 = second output pointer */
101 q15_t in
; /* Temporary variable to hold the input value */
102 q15_t inA1
, inA2
, inB1
, inB2
;
104 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
106 #ifdef ARM_MATH_MATRIX_CHECK
107 /* Check for matrix mismatch condition: A.cols must equal B.rows, and the
   destination must be A.rows x B.cols */
108 if ((pSrcA
->numCols
!= pSrcB
->numRows
) ||
109 (pSrcA
->numRows
!= pDst
->numRows
) || (pSrcB
->numCols
!= pDst
->numCols
))
111 /* Set status as ARM_MATH_SIZE_MISMATCH */
112 status
= ARM_MATH_SIZE_MISMATCH
;
117 /* Matrix transpose */
120 /* Apply loop unrolling and exchange the columns with row elements */
123 /* The pointer px is set to starting address of the column being processed */
126 /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
127 ** A second loop below computes the remaining 1 to 3 samples. */
130 #ifndef UNALIGNED_SUPPORT_DISABLE
131 /* Read two elements from the row (one 32-bit load through __SIMD32) */
132 in
= *__SIMD32(pInB
)++;
134 /* Unpack and store one element in the destination */
135 #ifndef ARM_MATH_BIG_ENDIAN
/* The store below extracts the upper 16 bits of the packed 32-bit word */
141 *px
= (q15_t
) ((in
& (q31_t
) 0xffff0000) >> 16);
143 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
145 /* Update the pointer px to point to the next row of the transposed matrix */
148 /* Unpack and store the second element in the destination */
149 #ifndef ARM_MATH_BIG_ENDIAN
151 *px
= (q15_t
) ((in
& (q31_t
) 0xffff0000) >> 16);
157 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
159 /* Update the pointer px to point to the next row of the transposed matrix */
162 /* Read two elements from the row */
163 in
= *__SIMD32(pInB
)++;
165 /* Unpack and store one element in the destination */
166 #ifndef ARM_MATH_BIG_ENDIAN
172 *px
= (q15_t
) ((in
& (q31_t
) 0xffff0000) >> 16);
174 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
176 /* Update the pointer px to point to the next row of the transposed matrix */
179 /* Unpack and store the second element in the destination */
181 #ifndef ARM_MATH_BIG_ENDIAN
183 *px
= (q15_t
) ((in
& (q31_t
) 0xffff0000) >> 16);
189 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
193 /* Read one element from the row */
196 /* Store one element in the destination */
199 /* Update the pointer px to point to the next row of the transposed matrix */
202 /* Read one element from the row */
205 /* Store one element in the destination */
208 /* Update the pointer px to point to the next row of the transposed matrix */
211 /* Read one element from the row */
214 /* Store one element in the destination */
217 /* Update the pointer px to point to the next row of the transposed matrix */
220 /* Read one element from the row */
223 /* Store one element in the destination */
226 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
228 /* Update the pointer px to point to the next row of the transposed matrix */
231 /* Decrement the column loop counter */
235 /* If the number of columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
236 ** No loop unrolling is used. */
237 col
= numColsB
% 0x4U
;
241 /* Read and store the input element in the destination */
244 /* Update the pointer px to point to the next row of the transposed matrix */
247 /* Decrement the column loop counter */
253 /* Decrement the row loop counter */
258 /* Reset the variables for the usage in the following multiplication process */
263 #ifndef UNALIGNED_SUPPORT_DISABLE
264 /* Process two rows from matrix A at a time and output two rows at a time */
269 /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
273 /* For every row wise process, the column loop counter is to be initiated */
276 /* For every row wise process, the pIn2 pointer is set
277 ** to the starting address of the transposed pSrcB data.
   NOTE(review): no variable named pIn2 is declared in the visible code;
   presumably this refers to pInB - confirm. */
280 #ifndef UNALIGNED_SUPPORT_DISABLE
281 /* Process two (transposed) columns from matrix B at a time */
289 /* Set the variable sum, that acts as accumulator, to zero */
292 /* Point pInA at element offset i within the data of pSrcA.
   NOTE(review): presumably the start of the row of A being processed; the
   update of i is not visible in this excerpt. */
293 pInA
= pSrcA
->pData
+ i
;
295 #ifndef UNALIGNED_SUPPORT_DISABLE
/* pInA2 is numColsA elements after pInA; pInB2 is numRowsB elements after pInB */
300 pInA2
= pInA
+ numColsA
;
301 pInB2
= pInB
+ numRowsB
;
303 /* Read in two elements at once - allows dual MAC instruction */
304 colCnt
= numColsA
>> 1;
306 colCnt
= numColsA
>> 2;
309 /* matrix multiplication */
312 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
313 #ifndef UNALIGNED_SUPPORT_DISABLE
/* Load one packed pair of Q15 values from each of the four streams */
315 inA1
= *__SIMD32(pInA
)++;
316 inB1
= *__SIMD32(pInB
)++;
317 inA2
= *__SIMD32(pInA2
)++;
318 inB2
= *__SIMD32(pInB2
)++;
/* Accumulate all four A/B combinations: sum (A1,B1), sum2 (A1,B2),
   sum3 (A2,B1), sum4 (A2,B2) */
320 sum
= __SMLAD(inA1
, inB1
, sum
);
321 sum2
= __SMLAD(inA1
, inB2
, sum2
);
322 sum3
= __SMLAD(inA2
, inB1
, sum3
);
323 sum4
= __SMLAD(inA2
, inB2
, sum4
);
346 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
348 /* Decrement the loop counter */
352 /* process odd column samples */
353 #ifndef UNALIGNED_SUPPORT_DISABLE
365 colCnt
= numColsA
% 0x4U
;
369 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
370 sum
+= (q31_t
) (*pInA
++) * (*pInB
++);
376 /* Shift the accumulator right by 15 bits and store it as q15_t in the destination buffer.
   NOTE(review): no explicit saturation step is visible on these stores. */
377 *px
++ = (q15_t
) (sum
>> 15);
379 #ifndef UNALIGNED_SUPPORT_DISABLE
/* sum2 goes next to sum through px; sum3 and sum4 are written through px2 */
380 *px
++ = (q15_t
) (sum2
>> 15);
381 *px2
++ = (q15_t
) (sum3
>> 15);
382 *px2
++ = (q15_t
) (sum4
>> 15);
386 /* Decrement the column loop counter */
393 #ifndef UNALIGNED_SUPPORT_DISABLE
/* Continue from px2, skipping one extra element when numColsB is odd */
395 px
= px2
+ (numColsB
& 1U);
399 /* Decrement the row loop counter */
404 /* Compute any remaining odd row/column below */
406 #ifndef UNALIGNED_SUPPORT_DISABLE
408 /* Compute remaining output column */
411 /* Avoid redundant computation of last element: row is numRowsA rounded down to an even count */
412 row
= numRowsA
& (~0x1);
414 /* Point to remaining unfilled column in output matrix */
415 px
= pDst
->pData
+numColsB
-1;
422 /* point to last column in matrix B (offset numRowsB * (numColsB - 1) in the transposed buffer) */
423 pInB
= pSrcBT
+ numRowsB
*(numColsB
-1);
425 /* Set the variable sum, that acts as accumulator, to zero */
428 /* Compute 4 columns at once */
429 colCnt
= numColsA
>> 2;
431 /* matrix multiplication */
434 inA1
= *__SIMD32(pInA
)++;
435 inA2
= *__SIMD32(pInA
)++;
436 inB1
= *__SIMD32(pInB
)++;
437 inB2
= *__SIMD32(pInB
)++;
439 sum
= __SMLAD(inA1
, inB1
, sum
);
440 sum
= __SMLAD(inA2
, inB2
, sum
);
442 /* Decrement the loop counter */
/* Handle the numColsA % 4 leftover products one element at a time */
446 colCnt
= numColsA
& 3U;
447 while (colCnt
> 0U) {
448 sum
+= (q31_t
) (*pInA
++) * (*pInB
++);
452 /* Store the result in the destination buffer */
453 *px
= (q15_t
) (sum
>> 15);
456 /* Decrement the row loop counter */
461 /* Compute remaining output row */
464 /* point to last row in output matrix */
465 px
= pDst
->pData
+(numColsB
)*(numRowsA
-1);
475 /* point to last row in matrix A */
476 pInA
= pSrcA
->pData
+ (numRowsA
-1)*numColsA
;
478 /* Set the variable sum, that acts as accumulator, to zero */
481 /* Compute 4 columns at once */
482 colCnt
= numColsA
>> 2;
484 /* matrix multiplication */
487 inA1
= *__SIMD32(pInA
)++;
488 inA2
= *__SIMD32(pInA
)++;
489 inB1
= *__SIMD32(pInB
)++;
490 inB2
= *__SIMD32(pInB
)++;
492 sum
= __SMLAD(inA1
, inB1
, sum
);
493 sum
= __SMLAD(inA2
, inB2
, sum
);
495 /* Decrement the loop counter */
/* Handle the numColsA % 4 leftover products one element at a time */
499 colCnt
= numColsA
& 3U;
500 while (colCnt
> 0U) {
501 sum
+= (q31_t
) (*pInA
++) * (*pInB
++);
505 /* Store the result in the destination buffer */
506 *px
++ = (q15_t
) (sum
>> 15);
508 /* Decrement the col loop counter */
513 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
515 /* set status as ARM_MATH_SUCCESS */
516 status
= ARM_MATH_SUCCESS
;
519 /* Return to application */
524 * @} end of MatrixMult group