before merging master
[inav.git] / lib / main / CMSIS / DSP / Source / MatrixFunctions / arm_mat_cmplx_mult_q15.c
blob5dee79c9eb614f078f4bc36d563af961a28a3ec1
1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_mat_cmplx_mult_q15.c
4 * Description: Q15 complex matrix multiplication
6 * $Date: 27. January 2017
7 * $Revision: V.1.5.1
9 * Target Processor: Cortex-M cores
10 * -------------------------------------------------------------------- */
12 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
14 * SPDX-License-Identifier: Apache-2.0
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
20 * www.apache.org/licenses/LICENSE-2.0
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
29 #include "arm_math.h"
31 /**
32 * @ingroup groupMatrix
35 /**
36 * @addtogroup CmplxMatrixMult
37 * @{
41 /**
42 * @brief Q15 Complex matrix multiplication
43 * @param[in] *pSrcA points to the first input complex matrix structure
44 * @param[in] *pSrcB points to the second input complex matrix structure
45 * @param[out] *pDst points to output complex matrix structure
46 * @param[in] *pScratch points to the array for storing intermediate results
47 * @return The function returns either
48 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
50 * \par Conditions for optimum performance
51 * Input, output and scratch buffers should be 32-bit aligned.
53 * \par Restrictions
54 * If the silicon does not support unaligned memory access, enable the macro UNALIGNED_SUPPORT_DISABLE.
55 * In this case the input, output and scratch buffers should be 32-bit aligned.
57 * @details
58 * <b>Scaling and Overflow Behavior:</b>
60 * \par
61 * The function is implemented using a 64-bit internal accumulator. The inputs to the
62 * multiplications are in 1.15 format and multiplications yield a 2.30 result.
63 * The 2.30 intermediate
64 * results are accumulated in a 64-bit accumulator in 34.30 format. This approach
65 * provides 33 guard bits and there is no risk of overflow. The 34.30 result is then
66 * truncated to 34.15 format by discarding the low 15 bits and then saturated to
67 * 1.15 format.
69 * \par
70 * Refer to <code>arm_mat_mult_fast_q15()</code> for a faster but less precise version of this function.
77 arm_status arm_mat_cmplx_mult_q15(
78 const arm_matrix_instance_q15 * pSrcA,
79 const arm_matrix_instance_q15 * pSrcB,
80 arm_matrix_instance_q15 * pDst,
81 q15_t * pScratch)
83 /* accumulator */
84 q15_t *pSrcBT = pScratch; /* input data matrix pointer for transpose */
85 q15_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q15 type */
86 q15_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q15 type */
87 q15_t *px; /* Temporary output data matrix pointer */
88 uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
89 uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
90 uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
91 uint16_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */
92 uint16_t col, i = 0U, row = numRowsB, colCnt; /* loop counters */
93 arm_status status; /* status of matrix multiplication */
94 q63_t sumReal, sumImag;
96 #ifdef UNALIGNED_SUPPORT_DISABLE
97 q15_t in; /* Temporary variable to hold the input value */
98 q15_t a, b, c, d;
99 #else
100 q31_t in; /* Temporary variable to hold the input value */
101 q31_t prod1, prod2;
102 q31_t pSourceA, pSourceB;
103 #endif
105 #ifdef ARM_MATH_MATRIX_CHECK
106 /* Check for matrix mismatch condition */
107 if ((pSrcA->numCols != pSrcB->numRows) ||
108 (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
110 /* Set status as ARM_MATH_SIZE_MISMATCH */
111 status = ARM_MATH_SIZE_MISMATCH;
113 else
114 #endif
116 /* Matrix transpose */
119 /* Apply loop unrolling and exchange the columns with row elements */
120 col = numColsB >> 2;
122 /* The pointer px is set to starting address of the column being processed */
123 px = pSrcBT + i;
125 /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
126 ** a second loop below computes the remaining 1 to 3 samples. */
127 while (col > 0U)
129 #ifdef UNALIGNED_SUPPORT_DISABLE
130 /* Read two elements from the row */
131 in = *pInB++;
132 *px = in;
133 in = *pInB++;
134 px[1] = in;
136 /* Update the pointer px to point to the next row of the transposed matrix */
137 px += numRowsB * 2;
139 /* Read two elements from the row */
140 in = *pInB++;
141 *px = in;
142 in = *pInB++;
143 px[1] = in;
145 /* Update the pointer px to point to the next row of the transposed matrix */
146 px += numRowsB * 2;
148 /* Read two elements from the row */
149 in = *pInB++;
150 *px = in;
151 in = *pInB++;
152 px[1] = in;
154 /* Update the pointer px to point to the next row of the transposed matrix */
155 px += numRowsB * 2;
157 /* Read two elements from the row */
158 in = *pInB++;
159 *px = in;
160 in = *pInB++;
161 px[1] = in;
163 /* Update the pointer px to point to the next row of the transposed matrix */
164 px += numRowsB * 2;
166 /* Decrement the column loop counter */
167 col--;
170 /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
171 ** No loop unrolling is used. */
172 col = numColsB % 0x4U;
174 while (col > 0U)
176 /* Read two elements from the row */
177 in = *pInB++;
178 *px = in;
179 in = *pInB++;
180 px[1] = in;
181 #else
183 /* Read two elements from the row */
184 in = *__SIMD32(pInB)++;
186 *__SIMD32(px) = in;
188 /* Update the pointer px to point to the next row of the transposed matrix */
189 px += numRowsB * 2;
192 /* Read two elements from the row */
193 in = *__SIMD32(pInB)++;
195 *__SIMD32(px) = in;
197 /* Update the pointer px to point to the next row of the transposed matrix */
198 px += numRowsB * 2;
200 /* Read two elements from the row */
201 in = *__SIMD32(pInB)++;
203 *__SIMD32(px) = in;
205 /* Update the pointer px to point to the next row of the transposed matrix */
206 px += numRowsB * 2;
208 /* Read two elements from the row */
209 in = *__SIMD32(pInB)++;
211 *__SIMD32(px) = in;
213 /* Update the pointer px to point to the next row of the transposed matrix */
214 px += numRowsB * 2;
216 /* Decrement the column loop counter */
217 col--;
220 /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
221 ** No loop unrolling is used. */
222 col = numColsB % 0x4U;
224 while (col > 0U)
226 /* Read two elements from the row */
227 in = *__SIMD32(pInB)++;
229 *__SIMD32(px) = in;
230 #endif
232 /* Update the pointer px to point to the next row of the transposed matrix */
233 px += numRowsB * 2;
235 /* Decrement the column loop counter */
236 col--;
239 i = i + 2U;
241 /* Decrement the row loop counter */
242 row--;
244 } while (row > 0U);
246 /* Reset the variables for the usage in the following multiplication process */
247 row = numRowsA;
248 i = 0U;
249 px = pDst->pData;
251 /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
252 /* row loop */
255 /* For every row wise process, the column loop counter is to be initiated */
256 col = numColsB;
258 /* For every row wise process, the pIn2 pointer is set
259 ** to the starting address of the transposed pSrcB data */
260 pInB = pSrcBT;
262 /* column loop */
265 /* Set the variable sum, that acts as accumulator, to zero */
266 sumReal = 0;
267 sumImag = 0;
269 /* Apply loop unrolling and compute 2 MACs simultaneously. */
270 colCnt = numColsA >> 1;
272 /* Initiate the pointer pIn1 to point to the starting address of the column being processed */
273 pInA = pSrcA->pData + i * 2;
276 /* matrix multiplication */
277 while (colCnt > 0U)
279 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
281 #ifdef UNALIGNED_SUPPORT_DISABLE
283 /* read real and imag values from pSrcA buffer */
284 a = *pInA;
285 b = *(pInA + 1U);
286 /* read real and imag values from pSrcB buffer */
287 c = *pInB;
288 d = *(pInB + 1U);
290 /* Multiply and Accumlates */
291 sumReal += (q31_t) a *c;
292 sumImag += (q31_t) a *d;
293 sumReal -= (q31_t) b *d;
294 sumImag += (q31_t) b *c;
296 /* read next real and imag values from pSrcA buffer */
297 a = *(pInA + 2U);
298 b = *(pInA + 3U);
299 /* read next real and imag values from pSrcB buffer */
300 c = *(pInB + 2U);
301 d = *(pInB + 3U);
303 /* update pointer */
304 pInA += 4U;
306 /* Multiply and Accumlates */
307 sumReal += (q31_t) a *c;
308 sumImag += (q31_t) a *d;
309 sumReal -= (q31_t) b *d;
310 sumImag += (q31_t) b *c;
311 /* update pointer */
312 pInB += 4U;
313 #else
314 /* read real and imag values from pSrcA and pSrcB buffer */
315 pSourceA = *__SIMD32(pInA)++;
316 pSourceB = *__SIMD32(pInB)++;
318 /* Multiply and Accumlates */
319 #ifdef ARM_MATH_BIG_ENDIAN
320 prod1 = -__SMUSD(pSourceA, pSourceB);
321 #else
322 prod1 = __SMUSD(pSourceA, pSourceB);
323 #endif
324 prod2 = __SMUADX(pSourceA, pSourceB);
325 sumReal += (q63_t) prod1;
326 sumImag += (q63_t) prod2;
328 /* read real and imag values from pSrcA and pSrcB buffer */
329 pSourceA = *__SIMD32(pInA)++;
330 pSourceB = *__SIMD32(pInB)++;
332 /* Multiply and Accumlates */
333 #ifdef ARM_MATH_BIG_ENDIAN
334 prod1 = -__SMUSD(pSourceA, pSourceB);
335 #else
336 prod1 = __SMUSD(pSourceA, pSourceB);
337 #endif
338 prod2 = __SMUADX(pSourceA, pSourceB);
339 sumReal += (q63_t) prod1;
340 sumImag += (q63_t) prod2;
342 #endif /* #ifdef UNALIGNED_SUPPORT_DISABLE */
344 /* Decrement the loop counter */
345 colCnt--;
348 /* process odd column samples */
349 if ((numColsA & 0x1U) > 0U)
351 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
353 #ifdef UNALIGNED_SUPPORT_DISABLE
355 /* read real and imag values from pSrcA and pSrcB buffer */
356 a = *pInA++;
357 b = *pInA++;
358 c = *pInB++;
359 d = *pInB++;
361 /* Multiply and Accumlates */
362 sumReal += (q31_t) a *c;
363 sumImag += (q31_t) a *d;
364 sumReal -= (q31_t) b *d;
365 sumImag += (q31_t) b *c;
367 #else
368 /* read real and imag values from pSrcA and pSrcB buffer */
369 pSourceA = *__SIMD32(pInA)++;
370 pSourceB = *__SIMD32(pInB)++;
372 /* Multiply and Accumlates */
373 #ifdef ARM_MATH_BIG_ENDIAN
374 prod1 = -__SMUSD(pSourceA, pSourceB);
375 #else
376 prod1 = __SMUSD(pSourceA, pSourceB);
377 #endif
378 prod2 = __SMUADX(pSourceA, pSourceB);
379 sumReal += (q63_t) prod1;
380 sumImag += (q63_t) prod2;
382 #endif /* #ifdef UNALIGNED_SUPPORT_DISABLE */
386 /* Saturate and store the result in the destination buffer */
388 *px++ = (q15_t) (__SSAT(sumReal >> 15, 16));
389 *px++ = (q15_t) (__SSAT(sumImag >> 15, 16));
391 /* Decrement the column loop counter */
392 col--;
394 } while (col > 0U);
396 i = i + numColsA;
398 /* Decrement the row loop counter */
399 row--;
401 } while (row > 0U);
403 /* set status as ARM_MATH_SUCCESS */
404 status = ARM_MATH_SUCCESS;
407 /* Return to application */
408 return (status);
412 * @} end of CmplxMatrixMult group