1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_mat_mult_q31.c
4 * Description: Q31 matrix multiplication
6 * $Date: 27. January 2017
9 * Target Processor: Cortex-M cores
10 * -------------------------------------------------------------------- */
12 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
14 * SPDX-License-Identifier: Apache-2.0
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
20 * www.apache.org/licenses/LICENSE-2.0
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
32 * @ingroup groupMatrix
36 * @addtogroup MatrixMult
41 * @brief Q31 matrix multiplication
42 * @param[in] *pSrcA points to the first input matrix structure
43 * @param[in] *pSrcB points to the second input matrix structure
44 * @param[out] *pDst points to output matrix structure
45 * @return The function returns either
46 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
49 * <b>Scaling and Overflow Behavior:</b>
52 * The function is implemented using an internal 64-bit accumulator.
53 * The accumulator has a 2.62 format and maintains full precision of the intermediate
54 * multiplication results but provides only a single guard bit. There is no saturation
55 * on intermediate additions. Thus, if the accumulator overflows it wraps around and
56 * distorts the result. The input signals should be scaled down to avoid intermediate
57 * overflows. The input is thus scaled down by log2(numColsA) bits
58 * to avoid overflows, as a total of numColsA additions are performed internally.
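59 * The 2.62 accumulator is right shifted by 31 bits to yield the final 1.31 result. The non-DSP code path saturates the shifted value with clip_q63_to_q31(); the ARM_MATH_DSP code path only casts the shifted value to q31_t.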
62 * See <code>arm_mat_mult_fast_q31()</code> for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4.
/*
 * arm_mat_mult_q31: Q31 matrix product of *pSrcA and *pSrcB, stored through
 * pDst->pData.
 *
 * pSrcA  first input matrix instance; pData, numRows and numCols are read
 * pSrcB  second input matrix instance; pData and numCols are read, numRows
 *        only by the optional size check
 * pDst   output matrix instance; pData is written, numRows/numCols are read
 *        only by the optional size check
 *
 * status is assigned ARM_MATH_SIZE_MISMATCH by the ARM_MATH_MATRIX_CHECK
 * test and ARM_MATH_SUCCESS near the end of the body.
 *
 * Two variants are selected by ARM_MATH_DSP: the first groups the
 * multiply-accumulates by four, the second performs them one at a time.
 *
 * NOTE(review): this listing appears to have lost lines (braces, loop
 * headers, the a0..a3 / b0..b3 loads, #else / #endif and the return
 * statement). The comments below describe only the statements that are
 * present; restore the missing lines from the upstream file before building.
 */
66 arm_status
arm_mat_mult_q31(
67 const arm_matrix_instance_q31
* pSrcA
,
68 const arm_matrix_instance_q31
* pSrcB
,
69 arm_matrix_instance_q31
* pDst
)
71 q31_t
*pIn1
= pSrcA
->pData
; /* read cursor into the data of A, post-incremented by the multiply-accumulates */
72 q31_t
*pIn2
= pSrcB
->pData
; /* read cursor into the data of B */
73 q31_t
*pInA
= pSrcA
->pData
; /* pointer into the data of A, advanced by numColsA per row */
74 q31_t
*pOut
= pDst
->pData
; /* base of the output matrix data */
75 q31_t
*px
; /* write cursor into the output data */
76 q63_t sum
; /* 64-bit accumulator for one output element */
77 uint16_t numRowsA
= pSrcA
->numRows
; /* number of rows of input matrix A */
78 uint16_t numColsB
= pSrcB
->numCols
; /* number of columns of input matrix B */
79 uint16_t numColsA
= pSrcA
->numCols
; /* number of columns of input matrix A (length of each dot product) */
81 #if defined (ARM_MATH_DSP)
83 /* Variant used when ARM_MATH_DSP is defined (original note: Cortex-M4 and Cortex-M3) */
85 uint16_t col
, i
= 0U, j
, row
= numRowsA
, colCnt
; /* loop counters; row starts at the row count of A */
86 arm_status status
; /* result code of the multiplication */
87 q31_t a0
, a1
, a2
, a3
, b0
, b1
, b2
, b3
; /* operands for one group of four multiply-accumulates */
89 #ifdef ARM_MATH_MATRIX_CHECK
92 /* Size check: A.numCols must equal B.numRows, and pDst must be A.numRows x B.numCols */
93 if ((pSrcA
->numCols
!= pSrcB
->numRows
) ||
94 (pSrcA
->numRows
!= pDst
->numRows
) || (pSrcB
->numCols
!= pDst
->numCols
))
96 /* Dimensions are inconsistent: report ARM_MATH_SIZE_MISMATCH */
97 status
= ARM_MATH_SIZE_MISMATCH
;
100 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
103 /* The following loop computes the dot product of each row of pSrcA with each column of pSrcB */
107 /* The output pointer is set to the start of the output row being processed */
110 /* For every row, the column loop counter is re-initialised */
113 /* For every row, the pIn2 pointer is reset
** to the start of the pSrcB data */
122 /* Clear the accumulator sum before each output element */
125 /* Point pIn1 at the address held in pInA (start of the current row of A) */
128 /* Number of groups of four multiply-accumulates: numColsA / 4 */
129 colCnt
= numColsA
>> 2;
132 /* matrix multiplication */
135 /* c(m,n) = a(m,1)*b(1,n) + a(m,2)*b(2,n) + .... + a(m,p)*b(p,n) */
136 /* Four multiply-accumulates per group; each product is widened to q63_t before the add */
148 sum
+= (q63_t
) a0
*b0
;
149 sum
+= (q63_t
) a1
*b1
;
157 sum
+= (q63_t
) a2
*b2
;
158 sum
+= (q63_t
) a3
*b3
;
160 /* Decrement the loop counter */
164 /* Remainder count: the numColsA % 4 multiply-accumulates left over when numColsA
** is not a multiple of 4. No unrolling is used for these. */
166 colCnt
= numColsA
% 0x4U
;
170 /* c(m,n) = a(m,1)*b(1,n) + a(m,2)*b(2,n) + .... + a(m,p)*b(p,n) */
171 /* One multiply-accumulate; pIn1 is post-incremented, pIn2 is only dereferenced here */
172 sum
+= (q63_t
) * pIn1
++ * *pIn2
;
175 /* Decrement the loop counter */
179 /* Convert the result from 2.62 to 1.31 format and store it in the destination buffer.
** NOTE(review): unlike the non-DSP variant below, this store does not call
** clip_q63_to_q31, so the shifted value is narrowed by the cast rather than
** saturated - confirm this is intended. */
180 *px
++ = (q31_t
) (sum
>> 31);
182 /* Reposition pIn2 at offset j from the start of B's data (original note: start of the next column) */
184 pIn2
= (pSrcB
->pData
) + j
;
186 /* Decrement the column loop counter */
193 /* Variant used when ARM_MATH_DSP is not defined (original note: Cortex-M0) */
195 q31_t
*pInB
= pSrcB
->pData
; /* base of the data of B, used to reposition pIn2 */
196 uint16_t col
, i
= 0U, row
= numRowsA
, colCnt
; /* loop counters; row starts at the row count of A */
197 arm_status status
; /* result code of the multiplication */
200 #ifdef ARM_MATH_MATRIX_CHECK
202 /* Size check: A.numCols must equal B.numRows, and pDst must be A.numRows x B.numCols */
203 if ((pSrcA
->numCols
!= pSrcB
->numRows
) ||
204 (pSrcA
->numRows
!= pDst
->numRows
) || (pSrcB
->numCols
!= pDst
->numCols
))
206 /* Dimensions are inconsistent: report ARM_MATH_SIZE_MISMATCH */
207 status
= ARM_MATH_SIZE_MISMATCH
;
210 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
213 /* The following loop computes the dot product of each row of pSrcA with each column of pSrcB */
217 /* The output pointer is set to the start of the output row being processed */
220 /* For every row, the column loop counter is re-initialised */
223 /* For every row, the pIn2 pointer is reset
** to the start of the pSrcB data */
230 /* Clear the accumulator sum before each output element */
233 /* Point pIn1 at the address held in pInA (start of the current row of A) */
236 /* numColsA multiply-accumulates are performed per output element */
239 /* matrix multiplication */
242 /* c(m,n) = a(m,1)*b(1,n) + a(m,2)*b(2,n) + .... + a(m,p)*b(p,n) */
243 /* One multiply-accumulate; pIn1 is post-incremented, pIn2 is only dereferenced here */
244 sum
+= (q63_t
) * pIn1
++ * *pIn2
;
247 /* Decrement the loop counter */
251 /* Convert the result from 2.62 to 1.31 format, passing it through clip_q63_to_q31, and store it */
252 *px
++ = (q31_t
) clip_q63_to_q31(sum
>> 31);
254 /* Decrement the column loop counter */
257 /* Reposition pIn2 at offset (numColsB - col) from the start of B's data (original note: start of the next column) */
258 pIn2
= pInB
+ (numColsB
- col
);
264 /* Advance pInA by numColsA elements, to the start of the next row of A */
266 pInA
= pInA
+ numColsA
;
268 /* Decrement the row loop counter */
273 /* The multiplication completed: report ARM_MATH_SUCCESS */
274 status
= ARM_MATH_SUCCESS
;
281 * @} end of MatrixMult group