before merging master
[inav.git] / lib / main / CMSIS / DSP / Source / MatrixFunctions / arm_mat_mult_q15.c
blob3244f471c320112c96a5c5bf3451444d04efebf9
1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_mat_mult_q15.c
4 * Description: Q15 matrix multiplication
6 * $Date: 27. January 2017
7 * $Revision: V.1.5.1
9 * Target Processor: Cortex-M cores
10 * -------------------------------------------------------------------- */
12 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
14 * SPDX-License-Identifier: Apache-2.0
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
20 * www.apache.org/licenses/LICENSE-2.0
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
29 #include "arm_math.h"
31 /**
32 * @ingroup groupMatrix
35 /**
36 * @addtogroup MatrixMult
37 * @{
41 /**
42 * @brief Q15 matrix multiplication
43 * @param[in] *pSrcA points to the first input matrix structure
44 * @param[in] *pSrcB points to the second input matrix structure
45 * @param[out] *pDst points to output matrix structure
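46 * @param[in] *pState points to a scratch array for intermediate results: when ARM_MATH_DSP is defined it receives the transpose of pSrcB (one q15_t per element of pSrcB); otherwise it is unused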
47 * @return The function returns either
48 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. Size checking is compiled in only when <code>ARM_MATH_MATRIX_CHECK</code> is defined.
50 * @details
51 * <b>Scaling and Overflow Behavior:</b>
53 * \par
54 * The function is implemented using a 64-bit internal accumulator. The inputs to the
55 * multiplications are in 1.15 format and multiplications yield a 2.30 result.
56 * The 2.30 intermediate
57 * results are accumulated in a 64-bit accumulator in 34.30 format. This approach
58 * provides 33 guard bits and there is no risk of overflow. The 34.30 result is then
59 * truncated to 34.15 format by discarding the low 15 bits and then saturated to
60 * 1.15 format.
62 * \par
63 * Refer to <code>arm_mat_mult_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
/* Q15 matrix product: every element of pDst is the dot product of one row of
 * pSrcA with one column of pSrcB, accumulated in a q63_t, shifted right by 15
 * bits and passed through __SSAT(..., 16) before being stored.
 * With ARM_MATH_DSP defined, pSrcB is first transposed into pState so that both
 * operands are read sequentially; otherwise pSrcB is walked with a stride of
 * numColsB and pState is not referenced.
 * NOTE(review): all loop counters are uint16_t and the row/column loops test
 * "> 0U" only after decrementing, so a zero dimension would wrap the counter
 * - confirm callers never pass empty matrices. */
67 arm_status arm_mat_mult_q15(
68 const arm_matrix_instance_q15 * pSrcA,
69 const arm_matrix_instance_q15 * pSrcB,
70 arm_matrix_instance_q15 * pDst,
71 q15_t * pState)
73 q63_t sum; /* 64-bit accumulator for the dot product of one output element */
75 #if defined (ARM_MATH_DSP)
77 /* Variant built when ARM_MATH_DSP is defined (original note: Cortex-M4 and Cortex-M3) */
79 q15_t *pSrcBT = pState; /* scratch buffer that receives the transpose of matrix B */
80 q15_t *pInA = pSrcA->pData; /* read pointer into matrix A */
81 q15_t *pInB = pSrcB->pData; /* read pointer into matrix B, later into its transpose */
82 q15_t *px; /* write pointer: transpose buffer first, then the output matrix */
83 uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
84 uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
85 uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
86 uint16_t numRowsB = pSrcB->numRows; /* number of rows of input matrix B */
87 uint16_t col, i = 0U, row = numRowsB, colCnt; /* loop counters; row starts at numRowsB for the transpose pass */
88 arm_status status; /* status of matrix multiplication */
90 #ifndef UNALIGNED_SUPPORT_DISABLE
92 q31_t in; /* holds two q15 elements read as one 32-bit word */
93 q31_t pSourceA1, pSourceB1, pSourceA2, pSourceB2; /* packed element pairs from A and from the transposed B */
95 #else
97 q15_t in; /* holds one q15 element being moved into the transpose */
98 q15_t inA1, inB1, inA2, inB2; /* individual elements from A and from the transposed B */
100 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
102 #ifdef ARM_MATH_MATRIX_CHECK
103 /* Size check: A.cols must equal B.rows, and pDst must be A.rows x B.cols */
104 if ((pSrcA->numCols != pSrcB->numRows) ||
105 (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
107 /* Set status as ARM_MATH_SIZE_MISMATCH */
108 status = ARM_MATH_SIZE_MISMATCH;
110 else
111 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
113 /* Matrix transpose: copy B row by row into pSrcBT so that row i of B becomes column i */
116 /* Number of 4-element groups in the current row of B */
117 col = numColsB >> 2;
119 /* Start at offset i of the transpose buffer; successive writes are numRowsB elements apart */
120 px = pSrcBT + i;
122 /* Unrolled part: moves 4 elements per iteration.
123 ** A second loop below moves the remaining 0 to 3 elements. */
124 while (col > 0U)
126 #ifndef UNALIGNED_SUPPORT_DISABLE
128 /* Read two elements from the row as one 32-bit word */
129 in = *__SIMD32(pInB)++;
131 /* Store the first element: low half-word, or high half-word when ARM_MATH_BIG_ENDIAN is defined */
132 #ifndef ARM_MATH_BIG_ENDIAN
134 *px = (q15_t) in;
136 #else
138 *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
140 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
142 /* Update the pointer px to point to the next row of the transposed matrix */
143 px += numRowsB;
145 /* Store the second element: the other half-word of the same 32-bit read */
146 #ifndef ARM_MATH_BIG_ENDIAN
148 *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
150 #else
152 *px = (q15_t) in;
154 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
156 /* Update the pointer px to point to the next row of the transposed matrix */
157 px += numRowsB;
159 /* Read the next two elements from the row as one 32-bit word */
160 in = *__SIMD32(pInB)++;
162 /* Store the third element of the group (same half-word selection as above) */
163 #ifndef ARM_MATH_BIG_ENDIAN
165 *px = (q15_t) in;
167 #else
169 *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
171 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
173 /* Update the pointer px to point to the next row of the transposed matrix */
174 px += numRowsB;
176 /* Store the fourth element of the group */
178 #ifndef ARM_MATH_BIG_ENDIAN
180 *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
182 #else
184 *px = (q15_t) in;
186 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
188 /* Update the pointer px to point to the next row of the transposed matrix */
189 px += numRowsB;
191 #else
193 /* Read one element from the row */
194 in = *pInB++;
196 /* Store one element in the destination */
197 *px = in;
199 /* Update the pointer px to point to the next row of the transposed matrix */
200 px += numRowsB;
202 /* Read one element from the row */
203 in = *pInB++;
205 /* Store one element in the destination */
206 *px = in;
208 /* Update the pointer px to point to the next row of the transposed matrix */
209 px += numRowsB;
211 /* Read one element from the row */
212 in = *pInB++;
214 /* Store one element in the destination */
215 *px = in;
217 /* Update the pointer px to point to the next row of the transposed matrix */
218 px += numRowsB;
220 /* Read one element from the row */
221 in = *pInB++;
223 /* Store one element in the destination */
224 *px = in;
226 /* Update the pointer px to point to the next row of the transposed matrix */
227 px += numRowsB;
229 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
231 /* Decrement the column loop counter */
232 col--;
235 /* If the column count of pSrcB is not a multiple of 4, move the remaining elements here.
236 ** No loop unrolling is used. */
237 col = numColsB % 0x4U;
239 while (col > 0U)
241 /* Read and store the input element in the destination */
242 *px = *pInB++;
244 /* Update the pointer px to point to the next row of the transposed matrix */
245 px += numRowsB;
247 /* Decrement the column loop counter */
248 col--;
251 i++; /* the next row of B is written starting one element further into the transpose buffer */
253 /* Decrement the row loop counter */
254 row--;
256 } while (row > 0U);
258 /* Reset the variables for the usage in the following multiplication process */
259 row = numRowsA;
260 i = 0U;
261 px = pDst->pData;
263 /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
264 /* row loop */
267 /* For every row wise process, the column loop counter is to be initiated */
268 col = numColsB;
270 /* For every row of A, pInB is rewound
271 ** to the starting address of the transposed pSrcB data */
272 pInB = pSrcBT;
274 /* column loop */
277 /* Set the variable sum, that acts as accumulator, to zero */
278 sum = 0;
280 /* Loop unrolled by 4: each iteration consumes four elements of A and four of the transposed B */
281 colCnt = numColsA >> 2;
283 /* Point pInA at the first element of the current row of A (offset i into pSrcA->pData) */
284 pInA = pSrcA->pData + i;
287 /* matrix multiplication */
288 while (colCnt > 0U)
290 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
291 #ifndef UNALIGNED_SUPPORT_DISABLE
293 /* Read two 32-bit words (two q15 elements each) from A and from the transposed B */
294 pSourceA1 = *__SIMD32(pInA)++;
295 pSourceB1 = *__SIMD32(pInB)++;
297 pSourceA2 = *__SIMD32(pInA)++;
298 pSourceB2 = *__SIMD32(pInB)++;
300 /* Multiply and accumulate into the 64-bit sum */
301 sum = __SMLALD(pSourceA1, pSourceB1, sum);
302 sum = __SMLALD(pSourceA2, pSourceB2, sum);
304 #else
305 /* Read individual q15 elements from A and from the transposed B; loads are interleaved with the accumulation */
306 inA1 = *pInA++;
307 inB1 = *pInB++;
308 inA2 = *pInA++;
309 /* Multiply and accumulate */
310 sum += inA1 * inB1;
311 inB2 = *pInB++;
313 inA1 = *pInA++;
314 inB1 = *pInB++;
315 /* Multiply and accumulate */
316 sum += inA2 * inB2;
317 inA2 = *pInA++;
318 inB2 = *pInB++;
320 /* Multiply and accumulate */
321 sum += inA1 * inB1;
322 sum += inA2 * inB2;
324 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
326 /* Decrement the loop counter */
327 colCnt--;
330 /* Process the remaining 0 to 3 elements of the row, one at a time */
331 colCnt = numColsA & 3U;
333 while (colCnt > 0U)
335 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
336 sum += *pInA++ * *pInB++;
338 /* Decrement the loop counter */
339 colCnt--;
342 /* Shift the accumulator right by 15 bits, saturate to 16 bits and store the result */
343 *px = (q15_t) (__SSAT((sum >> 15), 16));
344 px++;
346 /* Decrement the column loop counter */
347 col--;
349 } while (col > 0U);
351 i = i + numColsA; /* offset of the next row of A */
353 /* Decrement the row loop counter */
354 row--;
356 } while (row > 0U);
358 #else
360 /* Variant built when ARM_MATH_DSP is not defined (original note: Cortex-M0); pState is not referenced here */
362 q15_t *pIn1 = pSrcA->pData; /* running read pointer into the current row of A */
363 q15_t *pIn2 = pSrcB->pData; /* running read pointer into the current column of B */
364 q15_t *pInA = pSrcA->pData; /* start of the current row of A */
365 q15_t *pInB = pSrcB->pData; /* start of matrix B */
366 q15_t *pOut = pDst->pData; /* output data matrix pointer */
367 q15_t *px; /* Temporary output data matrix pointer */
368 uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
369 uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
370 uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
371 uint16_t col, i = 0U, row = numRowsA, colCnt; /* loop counters */
372 arm_status status; /* status of matrix multiplication */
374 #ifdef ARM_MATH_MATRIX_CHECK
376 /* Size check: A.cols must equal B.rows, and pDst must be A.rows x B.cols */
377 if ((pSrcA->numCols != pSrcB->numRows) ||
378 (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
380 /* Set status as ARM_MATH_SIZE_MISMATCH */
381 status = ARM_MATH_SIZE_MISMATCH;
383 else
384 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
387 /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
388 /* row loop */
391 /* Output pointer is set to starting address of the row being processed */
392 px = pOut + i;
394 /* For every row wise process, the column loop counter is to be initiated */
395 col = numColsB;
397 /* For every row wise process, the pIn2 pointer is set
398 ** to the starting address of the pSrcB data */
399 pIn2 = pSrcB->pData;
401 /* column loop */
404 /* Set the variable sum, that acts as accumulator, to zero */
405 sum = 0;
407 /* Rewind pIn1 to the start of the current row of A */
408 pIn1 = pInA;
410 /* Matrix A columns number of MAC operations are to be performed */
411 colCnt = numColsA;
413 /* matrix multiplication */
414 while (colCnt > 0U)
416 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
417 /* Perform the multiply-accumulates */
418 sum += (q31_t) * pIn1++ * *pIn2;
419 pIn2 += numColsB; /* step to the next row of B within the same column */
421 /* Decrement the loop counter */
422 colCnt--;
425 /* Shift the accumulator right by 15 bits and saturate to 16 bits */
426 /* Saturate and store the result in the destination buffer */
427 *px++ = (q15_t) __SSAT((sum >> 15), 16);
429 /* Decrement the column loop counter */
430 col--;
432 /* Update the pointer pIn2 to point to the starting address of the next column */
433 pIn2 = pInB + (numColsB - col);
435 } while (col > 0U);
437 /* Advance i to the next output row and pInA to the next row of A */
438 i = i + numColsB;
439 pInA = pInA + numColsA;
441 /* Decrement the row loop counter */
442 row--;
444 } while (row > 0U);
446 #endif /* #if defined (ARM_MATH_DSP) */
447 /* set status as ARM_MATH_SUCCESS */
448 status = ARM_MATH_SUCCESS;
451 /* Return to application */
452 return (status);
456 * @} end of MatrixMult group