1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_mat_mult_q15.c
4 * Description: Q15 matrix multiplication
6 * $Date: 27. January 2017
9 * Target Processor: Cortex-M cores
10 * -------------------------------------------------------------------- */
12 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
14 * SPDX-License-Identifier: Apache-2.0
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
20 * www.apache.org/licenses/LICENSE-2.0
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
32 * @ingroup groupMatrix
36 * @addtogroup MatrixMult
42 * @brief Q15 matrix multiplication
43 * @param[in] *pSrcA points to the first input matrix structure
44 * @param[in] *pSrcB points to the second input matrix structure
45 * @param[out] *pDst points to output matrix structure
46 * @param[in] *pState points to the array for storing intermediate results (used as the transpose buffer for pSrcB when ARM_MATH_DSP is defined)
47 * @return The function returns either
48 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
51 * <b>Scaling and Overflow Behavior:</b>
54 * The function is implemented using a 64-bit internal accumulator. The inputs to the
55 * multiplications are in 1.15 format and multiplications yield a 2.30 result.
56 * The 2.30 intermediate
57 * results are accumulated in a 64-bit accumulator in 34.30 format. This approach
58 * provides 33 guard bits and there is no risk of overflow. The 34.30 result is then
59 * truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
63 * Refer to <code>arm_mat_mult_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
67 arm_status
arm_mat_mult_q15(
68 const arm_matrix_instance_q15
* pSrcA
,
69 const arm_matrix_instance_q15
* pSrcB
,
70 arm_matrix_instance_q15
* pDst
,
73 q63_t sum
; /* accumulator */
75 #if defined (ARM_MATH_DSP)
77 /* Run the below code for Cortex-M4 and Cortex-M3 */
79 q15_t
*pSrcBT
= pState
; /* input data matrix pointer for transpose */
80 q15_t
*pInA
= pSrcA
->pData
; /* input data matrix pointer A of Q15 type */
81 q15_t
*pInB
= pSrcB
->pData
; /* input data matrix pointer B of Q15 type */
82 q15_t
*px
; /* Temporary output data matrix pointer */
83 uint16_t numRowsA
= pSrcA
->numRows
; /* number of rows of input matrix A */
84 uint16_t numColsB
= pSrcB
->numCols
; /* number of columns of input matrix B */
85 uint16_t numColsA
= pSrcA
->numCols
; /* number of columns of input matrix A */
86 uint16_t numRowsB
= pSrcB
->numRows
; /* number of rows of input matrix B */
87 uint16_t col
, i
= 0U, row
= numRowsB
, colCnt
; /* loop counters */
88 arm_status status
; /* status of matrix multiplication */
90 #ifndef UNALIGNED_SUPPORT_DISABLE
92 q31_t in
; /* Temporary variable to hold the input value */
93 q31_t pSourceA1
, pSourceB1
, pSourceA2
, pSourceB2
;
97 q15_t in
; /* Temporary variable to hold the input value */
98 q15_t inA1
, inB1
, inA2
, inB2
;
100 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
102 #ifdef ARM_MATH_MATRIX_CHECK
103 /* Check for matrix mismatch condition */
104 if ((pSrcA
->numCols
!= pSrcB
->numRows
) ||
105 (pSrcA
->numRows
!= pDst
->numRows
) || (pSrcB
->numCols
!= pDst
->numCols
))
107 /* Set status as ARM_MATH_SIZE_MISMATCH */
108 status
= ARM_MATH_SIZE_MISMATCH
;
111 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
113 /* Matrix transpose */
116 /* Apply loop unrolling and exchange the columns with row elements */
119 /* The pointer px is set to starting address of the column being processed */
122 /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
123 ** a second loop below computes the remaining 1 to 3 samples. */
126 #ifndef UNALIGNED_SUPPORT_DISABLE
128 /* Read two elements from the row */
129 in
= *__SIMD32(pInB
)++;
131 /* Unpack and store one element in the destination */
132 #ifndef ARM_MATH_BIG_ENDIAN
138 *px
= (q15_t
) ((in
& (q31_t
) 0xffff0000) >> 16);
140 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
142 /* Update the pointer px to point to the next row of the transposed matrix */
145 /* Unpack and store the second element in the destination */
146 #ifndef ARM_MATH_BIG_ENDIAN
148 *px
= (q15_t
) ((in
& (q31_t
) 0xffff0000) >> 16);
154 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
156 /* Update the pointer px to point to the next row of the transposed matrix */
159 /* Read two elements from the row */
160 in
= *__SIMD32(pInB
)++;
162 /* Unpack and store one element in the destination */
163 #ifndef ARM_MATH_BIG_ENDIAN
169 *px
= (q15_t
) ((in
& (q31_t
) 0xffff0000) >> 16);
171 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
173 /* Update the pointer px to point to the next row of the transposed matrix */
176 /* Unpack and store the second element in the destination */
178 #ifndef ARM_MATH_BIG_ENDIAN
180 *px
= (q15_t
) ((in
& (q31_t
) 0xffff0000) >> 16);
186 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
188 /* Update the pointer px to point to the next row of the transposed matrix */
193 /* Read one element from the row */
196 /* Store one element in the destination */
199 /* Update the pointer px to point to the next row of the transposed matrix */
202 /* Read one element from the row */
205 /* Store one element in the destination */
208 /* Update the pointer px to point to the next row of the transposed matrix */
211 /* Read one element from the row */
214 /* Store one element in the destination */
217 /* Update the pointer px to point to the next row of the transposed matrix */
220 /* Read one element from the row */
223 /* Store one element in the destination */
226 /* Update the pointer px to point to the next row of the transposed matrix */
229 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
231 /* Decrement the column loop counter */
235 /* If the number of columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
236 ** No loop unrolling is used. */
237 col
= numColsB
% 0x4U
;
241 /* Read and store the input element in the destination */
244 /* Update the pointer px to point to the next row of the transposed matrix */
247 /* Decrement the column loop counter */
253 /* Decrement the row loop counter */
258 /* Reset the variables for the usage in the following multiplication process */
263 /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
267 /* For every row wise process, the column loop counter is to be initiated */
270 /* For every row wise process, the pIn2 pointer is set
271 ** to the starting address of the transposed pSrcB data */
277 /* Set the variable sum, that acts as accumulator, to zero */
280 /* Apply loop unrolling and compute 2 MACs simultaneously. */
281 colCnt
= numColsA
>> 2;
283 /* Initiate the pointer pIn1 to point to the starting address of the column being processed */
284 pInA
= pSrcA
->pData
+ i
;
287 /* matrix multiplication */
290 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
291 #ifndef UNALIGNED_SUPPORT_DISABLE
293 /* Read two 32-bit words (each packing two Q15 values) from pInA and from pInB */
294 pSourceA1
= *__SIMD32(pInA
)++;
295 pSourceB1
= *__SIMD32(pInB
)++;
297 pSourceA2
= *__SIMD32(pInA
)++;
298 pSourceB2
= *__SIMD32(pInB
)++;
300 /* Multiply and accumulate */
301 sum
= __SMLALD(pSourceA1
, pSourceB1
, sum
);
302 sum
= __SMLALD(pSourceA2
, pSourceB2
, sum
);
305 /* Read input values from pSrcA and pSrcB buffer */
309 /* Multiply and accumulate */
315 /* Multiply and accumulate */
320 /* Multiply and accumulate */
324 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
326 /* Decrement the loop counter */
330 /* process remaining column samples */
331 colCnt
= numColsA
& 3U;
335 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
336 sum
+= *pInA
++ * *pInB
++;
338 /* Decrement the loop counter */
342 /* Saturate and store the result in the destination buffer */
343 *px
= (q15_t
) (__SSAT((sum
>> 15), 16));
346 /* Decrement the column loop counter */
353 /* Decrement the row loop counter */
360 /* Run the below code for Cortex-M0 */
362 q15_t
*pIn1
= pSrcA
->pData
; /* input data matrix pointer A */
363 q15_t
*pIn2
= pSrcB
->pData
; /* input data matrix pointer B */
364 q15_t
*pInA
= pSrcA
->pData
; /* input data matrix pointer A of Q15 type */
365 q15_t
*pInB
= pSrcB
->pData
; /* input data matrix pointer B of Q15 type */
366 q15_t
*pOut
= pDst
->pData
; /* output data matrix pointer */
367 q15_t
*px
; /* Temporary output data matrix pointer */
368 uint16_t numColsB
= pSrcB
->numCols
; /* number of columns of input matrix B */
369 uint16_t numColsA
= pSrcA
->numCols
; /* number of columns of input matrix A */
370 uint16_t numRowsA
= pSrcA
->numRows
; /* number of rows of input matrix A */
371 uint16_t col
, i
= 0U, row
= numRowsA
, colCnt
; /* loop counters */
372 arm_status status
; /* status of matrix multiplication */
374 #ifdef ARM_MATH_MATRIX_CHECK
376 /* Check for matrix mismatch condition */
377 if ((pSrcA
->numCols
!= pSrcB
->numRows
) ||
378 (pSrcA
->numRows
!= pDst
->numRows
) || (pSrcB
->numCols
!= pDst
->numCols
))
380 /* Set status as ARM_MATH_SIZE_MISMATCH */
381 status
= ARM_MATH_SIZE_MISMATCH
;
384 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
387 /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
391 /* Output pointer is set to starting address of the row being processed */
394 /* For every row wise process, the column loop counter is to be initiated */
397 /* For every row wise process, the pIn2 pointer is set
398 ** to the starting address of the pSrcB data */
404 /* Set the variable sum, that acts as accumulator, to zero */
407 /* Initiate the pointer pIn1 to point to the starting address of pSrcA */
410 /* Matrix A columns number of MAC operations are to be performed */
413 /* matrix multiplication */
416 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
417 /* Perform the multiply-accumulates */
418 sum
+= (q31_t
) * pIn1
++ * *pIn2
;
421 /* Decrement the loop counter */
425 /* Convert the result from 34.30 to 1.15 format and store the saturated value in destination buffer */
426 /* Saturate and store the result in the destination buffer */
427 *px
++ = (q15_t
) __SSAT((sum
>> 15), 16);
429 /* Decrement the column loop counter */
432 /* Update the pointer pIn2 to point to the starting address of the next column */
433 pIn2
= pInB
+ (numColsB
- col
);
437 /* Update the pointer pSrcA to point to the starting address of the next row */
439 pInA
= pInA
+ numColsA
;
441 /* Decrement the row loop counter */
446 #endif /* #if defined (ARM_MATH_DSP) */
447 /* set status as ARM_MATH_SUCCESS */
448 status
= ARM_MATH_SUCCESS
;
451 /* Return to application */
456 * @} end of MatrixMult group