1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_mat_cmplx_mult_q31.c
4 * Description: Q31 complex matrix multiplication
6 * $Date: 27. January 2017
9 * Target Processor: Cortex-M cores
10 * -------------------------------------------------------------------- */
12 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
14 * SPDX-License-Identifier: Apache-2.0
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
20 * www.apache.org/licenses/LICENSE-2.0
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
32 * @ingroup groupMatrix
36 * @addtogroup CmplxMatrixMult
41 * @brief Q31 Complex matrix multiplication
42 * @param[in] *pSrcA points to the first input complex matrix structure
43 * @param[in] *pSrcB points to the second input complex matrix structure
44 * @param[out] *pDst points to output complex matrix structure
45 * @return The function returns either
46 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
49 * <b>Scaling and Overflow Behavior:</b>
52 * The function is implemented using an internal 64-bit accumulator.
53 * The accumulator has a 2.62 format and maintains full precision of the intermediate
54 * multiplication results but provides only a single guard bit. There is no saturation
55 * on intermediate additions. Thus, if the accumulator overflows it wraps around and
56 * distorts the result. The input signals should be scaled down to avoid intermediate
57 * overflows. The inputs should therefore be scaled down by log2(numColsA) bits
58 * before calling the function, as a total of numColsA additions are performed internally.
59 * The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
/* Q31 complex matrix multiplication: accumulates complex products of elements
 * of pSrcA and pSrcB in 64-bit accumulators and stores each result, shifted
 * right by 31 bits and clipped to q31, as a real value followed by an
 * imaginary value in pDst.
 * NOTE(review): the braces, loop headers, operand loads (a0/b0/c0/d0, a1/b1/c1/d1)
 * and the return statement are not visible in this view; comments below describe
 * only the statements that are shown. */
64 arm_status
arm_mat_cmplx_mult_q31(
65 const arm_matrix_instance_q31
* pSrcA
,
66 const arm_matrix_instance_q31
* pSrcB
,
67 arm_matrix_instance_q31
* pDst
)
69 q31_t
*pIn1
= pSrcA
->pData
; /* working pointer into the data of matrix A */
70 q31_t
*pIn2
= pSrcB
->pData
; /* working pointer into the data of matrix B; stepped by 2 * numColsB after each product */
71 q31_t
*pInA
= pSrcA
->pData
; /* pointer into the data of matrix A that is advanced by 2 * numColsA per row */
72 q31_t
*pOut
= pDst
->pData
; /* output data matrix pointer */
73 q31_t
*px
; /* Temporary output data matrix pointer; results are written through it */
74 uint16_t numRowsA
= pSrcA
->numRows
; /* number of rows of input matrix A */
75 uint16_t numColsB
= pSrcB
->numCols
; /* number of columns of input matrix B */
76 uint16_t numColsA
= pSrcA
->numCols
; /* number of columns of input matrix A */
77 q63_t sumReal1
, sumImag1
; /* 64-bit accumulators for the real and imaginary parts of one output element */
82 /* Run the below code for Cortex-M4 and Cortex-M3 */
84 uint16_t col
, i
= 0U, j
, row
= numRowsA
, colCnt
; /* loop counters; row starts at the number of rows of A */
85 arm_status status
; /* status of matrix multiplication */
87 #ifdef ARM_MATH_MATRIX_CHECK
90 /* Check for matrix mismatch condition: the inner dimensions of A and B must
   agree, and the destination must be numRows(A) x numCols(B) */
91 if ((pSrcA
->numCols
!= pSrcB
->numRows
) ||
92 (pSrcA
->numRows
!= pDst
->numRows
) || (pSrcB
->numCols
!= pDst
->numCols
))
95 /* Set status as ARM_MATH_SIZE_MISMATCH */
96 status
= ARM_MATH_SIZE_MISMATCH
;
99 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
102 /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
106 /* Output pointer is set to starting address of the row being processed */
109 /* For every row wise process, the column loop counter is to be initiated */
112 /* For every row wise process, the pIn2 pointer is set
113 ** to the starting address of the pSrcB data */
121 /* Set the accumulators (real and imaginary sums) to zero */
125 /* Initiate the pointer pIn1 to point to the starting address of the column being processed */
128 /* Apply loop unrolling: colCnt is the number of groups of four columns of A;
   the leftover numColsA % 4 columns are handled separately further down. */
129 colCnt
= numColsA
>> 2;
131 /* matrix multiplication */
135 /* Reading real part of complex matrix A */
138 /* Reading real part of complex matrix B */
141 /* Reading imaginary part of complex matrix A */
144 /* Reading imaginary part of complex matrix B */
147 /* Multiply and accumulate the terms involving c0. Together with the d0 terms
   below this adds (a0*c0 - b0*d0) to the real sum and (b0*c0 + a0*d0) to the
   imaginary sum, i.e. the complex product (a0 + j*b0) * (c0 + j*d0).
   The (q63_t) cast widens the left operand so the product is formed in 64 bits. */
148 sumReal1
+= (q63_t
) a0
*c0
;
149 sumImag1
+= (q63_t
) b0
*c0
;
151 /* Advance pIn2 by 2 * numColsB values
   (presumably one row of B with interleaved real/imaginary storage - confirm) */
153 pIn2
+= 2 * numColsB
;
155 /* Multiply and accumulate the terms involving d0 (subtracted from the real sum) */
156 sumReal1
-= (q63_t
) b0
*d0
;
157 sumImag1
+= (q63_t
) a0
*d0
;
159 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
161 /* read real and imag values from pSrcA and pSrcB buffer */
167 /* Multiply and accumulate the terms involving c1 */
168 sumReal1
+= (q63_t
) a1
*c1
;
169 sumImag1
+= (q63_t
) b1
*c1
;
171 /* Advance pIn2 by 2 * numColsB values */
173 pIn2
+= 2 * numColsB
;
175 /* Multiply and accumulate the terms involving d1 (subtracted from the real sum) */
176 sumReal1
-= (q63_t
) b1
*d1
;
177 sumImag1
+= (q63_t
) a1
*d1
;
185 /* Multiply and accumulate the terms involving c0 (third product of the unrolled group) */
186 sumReal1
+= (q63_t
) a0
*c0
;
187 sumImag1
+= (q63_t
) b0
*c0
;
189 /* Advance pIn2 by 2 * numColsB values */
191 pIn2
+= 2 * numColsB
;
193 /* Multiply and accumulate the terms involving d0 (subtracted from the real sum) */
194 sumReal1
-= (q63_t
) b0
*d0
;
195 sumImag1
+= (q63_t
) a0
*d0
;
197 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
205 /* Multiply and accumulate the terms involving c1 (fourth product of the unrolled group) */
206 sumReal1
+= (q63_t
) a1
*c1
;
207 sumImag1
+= (q63_t
) b1
*c1
;
209 /* Advance pIn2 by 2 * numColsB values */
211 pIn2
+= 2 * numColsB
;
213 /* Multiply and accumulate the terms involving d1 (subtracted from the real sum) */
214 sumReal1
-= (q63_t
) b1
*d1
;
215 sumImag1
+= (q63_t
) a1
*d1
;
217 /* Decrement the loop count */
221 /* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
222 ** No loop unrolling is used. */
223 colCnt
= numColsA
% 0x4U
;
227 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
234 /* Multiply and accumulate the terms involving c1 (remainder section, one product at a time) */
235 sumReal1
+= (q63_t
) a1
*c1
;
236 sumImag1
+= (q63_t
) b1
*c1
;
238 /* Advance pIn2 by 2 * numColsB values */
240 pIn2
+= 2 * numColsB
;
242 /* Multiply and accumulate the terms involving d1 (subtracted from the real sum) */
243 sumReal1
-= (q63_t
) b1
*d1
;
244 sumImag1
+= (q63_t
) a1
*d1
;
246 /* Decrement the loop counter */
250 /* Store the result in the destination buffer: each 64-bit sum is shifted right
   by 31 bits and passed through clip_q63_to_q31; the real value is written
   first, then the imaginary value. */
251 *px
++ = (q31_t
) clip_q63_to_q31(sumReal1
>> 31);
252 *px
++ = (q31_t
) clip_q63_to_q31(sumImag1
>> 31);
254 /* Update the pointer pIn2 to point to the starting address of the next column:
   offset 2 * j values from the start of the pSrcB data */
256 pIn2
= pSrcB
->pData
+ 2U * j
;
258 /* Decrement the column loop counter */
263 /* Update the pointer pInA to point to the starting address of the next row:
   advance by 2 * numColsA values */
265 pInA
= pInA
+ 2 * numColsA
;
267 /* Decrement the row loop counter */
272 /* Set status as ARM_MATH_SUCCESS */
273 status
= ARM_MATH_SUCCESS
;
276 /* Return to application */
281 * @} end of CmplxMatrixMult group