lib/main/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_f32.c

   1 /* ----------------------------------------------------------------------
   2  * Project:      CMSIS DSP Library
   3  * Title:        arm_mat_mult_f32.c
   4  * Description:  Floating-point matrix multiplication
   5  *
   6  * $Date:        27. January 2017
   7  * $Revision:    V.1.5.1
   8  *
   9  * Target Processor: Cortex-M cores
  10  * -------------------------------------------------------------------- */
  11 /*
  12  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
  13  *
  14  * SPDX-License-Identifier: Apache-2.0
  15  *
  16  * Licensed under the Apache License, Version 2.0 (the License); you may
  17  * not use this file except in compliance with the License.
  18  * You may obtain a copy of the License at
  19  *
  20  * www.apache.org/licenses/LICENSE-2.0
  21  *
  22  * Unless required by applicable law or agreed to in writing, software
  23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  25  * See the License for the specific language governing permissions and
  26  * limitations under the License.
  27  */
  28
  29 #include "arm_math.h"
  30
  31 /**
  32  * @ingroup groupMatrix
  33  */
  34
  35 /**
  36  * @defgroup MatrixMult Matrix Multiplication
  37  *
  38  * Multiplies two matrices.
  39  *
  40  * \image html MatrixMultiplication.gif "Multiplication of two 3 x 3 matrices"
  41
  42  * Matrix multiplication is only defined if the number of columns of the
  43  * first matrix equals the number of rows of the second matrix.
  44  * Multiplying an <code>M x N</code> matrix with an <code>N x P</code> matrix results
  45  * in an <code>M x P</code> matrix.
  46  * When matrix size checking is enabled, the functions check: (1) that the inner dimensions of
  47  * <code>pSrcA</code> and <code>pSrcB</code> are equal; and (2) that the size of the output
  48  * matrix equals the outer dimensions of <code>pSrcA</code> and <code>pSrcB</code>.
  49  */
  50
  51
  52 /**
  53  * @addtogroup MatrixMult
  54  * @{
  55  */
  56
  57 /**
  58  * @brief Floating-point matrix multiplication.
  59  * @param[in]       *pSrcA points to the first input matrix structure
  60  * @param[in]       *pSrcB points to the second input matrix structure
  61  * @param[out]      *pDst points to output matrix structure
  62  * @return              The function returns either
  63  * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
  64  */
  65
  66 arm_status arm_mat_mult_f32(
  67   const arm_matrix_instance_f32 * pSrcA,
  68   const arm_matrix_instance_f32 * pSrcB,
  69   arm_matrix_instance_f32 * pDst)
  70 {
  71   float32_t *pIn1 = pSrcA->pData;                /* input data matrix pointer A */
  72   float32_t *pIn2 = pSrcB->pData;                /* input data matrix pointer B */
  73   float32_t *pInA = pSrcA->pData;                /* input data matrix pointer A  */
  74   float32_t *pOut = pDst->pData;                 /* output data matrix pointer */
  75   float32_t *px;                                 /* Temporary output data matrix pointer */
  76   float32_t sum;                                 /* Accumulator */
  77   uint16_t numRowsA = pSrcA->numRows;            /* number of rows of input matrix A */
  78   uint16_t numColsB = pSrcB->numCols;            /* number of columns of input matrix B */
  79   uint16_t numColsA = pSrcA->numCols;            /* number of columns of input matrix A */
  80
  81 #if defined (ARM_MATH_DSP)
  82
  83   /* Run the below code for Cortex-M4 and Cortex-M3 */
  84
  85   float32_t in1, in2, in3, in4;
  86   uint16_t col, i = 0U, j, row = numRowsA, colCnt;      /* loop counters */
  87   arm_status status;                             /* status of matrix multiplication */
  88
  89 #ifdef ARM_MATH_MATRIX_CHECK
  90
  91
  92   /* Check for matrix mismatch condition */
  93   if ((pSrcA->numCols != pSrcB->numRows) ||
  94      (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
  95   {
  96
  97     /* Set status as ARM_MATH_SIZE_MISMATCH */
  98     status = ARM_MATH_SIZE_MISMATCH;
  99   }
 100   else
 101 #endif /*      #ifdef ARM_MATH_MATRIX_CHECK    */
 102
 103   {
 104     /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
 105     /* row loop */
 106     do
 107     {
 108       /* Output pointer is set to starting address of the row being processed */
 109       px = pOut + i;
 110
 111       /* For every row wise process, the column loop counter is to be initiated */
 112       col = numColsB;
 113
 114       /* For every row wise process, the pIn2 pointer is set
 115        ** to the starting address of the pSrcB data */
 116       pIn2 = pSrcB->pData;
 117
 118       j = 0U;
 119
 120       /* column loop */
 121       do
 122       {
 123         /* Set the variable sum, that acts as accumulator, to zero */
 124         sum = 0.0f;
 125
 126         /* Initiate the pointer pIn1 to point to the starting address of the column being processed */
 127         pIn1 = pInA;
 128
 129         /* Apply loop unrolling and compute 4 MACs simultaneously. */
 130         colCnt = numColsA >> 2U;
 131
 132         /* matrix multiplication        */
 133         while (colCnt > 0U)
 134         {
 135           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
 136           in3 = *pIn2;
 137           pIn2 += numColsB;
 138           in1 = pIn1[0];
 139           in2 = pIn1[1];
 140           sum += in1 * in3;
 141           in4 = *pIn2;
 142           pIn2 += numColsB;
 143           sum += in2 * in4;
 144
 145           in3 = *pIn2;
 146           pIn2 += numColsB;
 147           in1 = pIn1[2];
 148           in2 = pIn1[3];
 149           sum += in1 * in3;
 150           in4 = *pIn2;
 151           pIn2 += numColsB;
 152           sum += in2 * in4;
 153           pIn1 += 4U;
 154
 155           /* Decrement the loop count */
 156           colCnt--;
 157         }
 158
 159         /* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
 160          ** No loop unrolling is used. */
 161         colCnt = numColsA % 0x4U;
 162
 163         while (colCnt > 0U)
 164         {
 165           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
 166           sum += *pIn1++ * (*pIn2);
 167           pIn2 += numColsB;
 168
 169           /* Decrement the loop counter */
 170           colCnt--;
 171         }
 172
 173         /* Store the result in the destination buffer */
 174         *px++ = sum;
 175
 176         /* Update the pointer pIn2 to point to the  starting address of the next column */
 177         j++;
 178         pIn2 = pSrcB->pData + j;
 179
 180         /* Decrement the column loop counter */
 181         col--;
 182
 183       } while (col > 0U);
 184
 185 #else
 186
 187   /* Run the below code for Cortex-M0 */
 188
 189   float32_t *pInB = pSrcB->pData;                /* input data matrix pointer B */
 190   uint16_t col, i = 0U, row = numRowsA, colCnt;  /* loop counters */
 191   arm_status status;                             /* status of matrix multiplication */
 192
 193 #ifdef ARM_MATH_MATRIX_CHECK
 194
 195   /* Check for matrix mismatch condition */
 196   if ((pSrcA->numCols != pSrcB->numRows) ||
 197      (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
 198   {
 199
 200     /* Set status as ARM_MATH_SIZE_MISMATCH */
 201     status = ARM_MATH_SIZE_MISMATCH;
 202   }
 203   else
 204 #endif /*      #ifdef ARM_MATH_MATRIX_CHECK    */
 205
 206   {
 207     /* The following loop performs the dot-product of each row in pInA with each column in pInB */
 208     /* row loop */
 209     do
 210     {
 211       /* Output pointer is set to starting address of the row being processed */
 212       px = pOut + i;
 213
 214       /* For every row wise process, the column loop counter is to be initiated */
 215       col = numColsB;
 216
 217       /* For every row wise process, the pIn2 pointer is set
 218        ** to the starting address of the pSrcB data */
 219       pIn2 = pSrcB->pData;
 220
 221       /* column loop */
 222       do
 223       {
 224         /* Set the variable sum, that acts as accumulator, to zero */
 225         sum = 0.0f;
 226
 227         /* Initialize the pointer pIn1 to point to the starting address of the row being processed */
 228         pIn1 = pInA;
 229
 230         /* Matrix A columns number of MAC operations are to be performed */
 231         colCnt = numColsA;
 232
 233         while (colCnt > 0U)
 234         {
 235           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
 236           sum += *pIn1++ * (*pIn2);
 237           pIn2 += numColsB;
 238
 239           /* Decrement the loop counter */
 240           colCnt--;
 241         }
 242
 243         /* Store the result in the destination buffer */
 244         *px++ = sum;
 245
 246         /* Decrement the column loop counter */
 247         col--;
 248
 249         /* Update the pointer pIn2 to point to the  starting address of the next column */
 250         pIn2 = pInB + (numColsB - col);
 251
 252       } while (col > 0U);
 253
 254 #endif /* #if defined (ARM_MATH_DSP) */
 255
 256       /* Update the pointer pInA to point to the  starting address of the next row */
 257       i = i + numColsB;
 258       pInA = pInA + numColsA;
 259
 260       /* Decrement the row loop counter */
 261       row--;
 262
 263     } while (row > 0U);
 264     /* Set status as ARM_MATH_SUCCESS */
 265     status = ARM_MATH_SUCCESS;
 266   }
 267
 268   /* Return to application */
 269   return (status);
 270 }
 271
 272 /**
 273  * @} end of MatrixMult group
 274  */