flang/runtime/matmul-transpose.cpp

   1 //===-- runtime/matmul-transpose.cpp --------------------------------------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8
   9 // Implements a fused matmul-transpose operation
  10 //
  11 // There are two main entry points; one establishes a descriptor for the
  12 // result and allocates it, and the other expects a result descriptor that
  13 // points to existing storage.
  14 //
  15 // This implementation must handle all combinations of numeric types and
  16 // kinds (100 - 165 cases depending on the target), plus all combinations
  17 // of logical kinds (16).  A single template undergoes many instantiations
  18 // to cover all of the valid possibilities.
  19 //
  20 // The usefulness of this optimization should be reviewed once Matmul is swapped
  21 // to use the faster BLAS routines.
  22
  23 #include "flang/Runtime/matmul-transpose.h"
  24 #include "terminator.h"
  25 #include "tools.h"
  26 #include "flang/Runtime/c-or-cpp.h"
  27 #include "flang/Runtime/cpp-type.h"
  28 #include "flang/Runtime/descriptor.h"
  29 #include <cstring>
  30
  31 namespace {
  32 using namespace Fortran::runtime;
  33
  34 // Contiguous numeric TRANSPOSE(matrix)*matrix multiplication
  35 //   TRANSPOSE(matrix(n, rows)) * matrix(n,cols) ->
  36 //             matrix(rows, n)  * matrix(n,cols) -> matrix(rows,cols)
  37 // The transpose is implemented by swapping the indices of accesses into the LHS
  38 //
  39 // Straightforward algorithm:
  40 //   DO 1 I = 1, NROWS
  41 //    DO 1 J = 1, NCOLS
  42 //     RES(I,J) = 0
  43 //     DO 1 K = 1, N
  44 //   1  RES(I,J) = RES(I,J) + X(K,I)*Y(K,J)
  45 //
  46 // With loop distribution and transposition to avoid the inner sum
  47 // reduction and to avoid non-unit strides:
  48 //   DO 1 I = 1, NROWS
  49 //    DO 1 J = 1, NCOLS
  50 //   1 RES(I,J) = 0
  51 //   DO 2 J = 1, NCOLS
  52 //    DO 2 I = 1, NROWS
  53 //     DO 2 K = 1, N
  54 //   2  RES(I,J) = RES(I,J) + X(K,I)*Y(K,J) ! loop-invariant last term
  55 template <TypeCategory RCAT, int RKIND, typename XT, typename YT,
  56     bool X_HAS_STRIDED_COLUMNS, bool Y_HAS_STRIDED_COLUMNS>
  57 inline static void MatrixTransposedTimesMatrix(
  58     CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue rows,
  59     SubscriptValue cols, const XT *RESTRICT x, const YT *RESTRICT y,
  60     SubscriptValue n, std::size_t xColumnByteStride = 0,
  61     std::size_t yColumnByteStride = 0) {
  62   using ResultType = CppTypeFor<RCAT, RKIND>;
  63
  64   std::memset(product, 0, rows * cols * sizeof *product);
  65   for (SubscriptValue j{0}; j < cols; ++j) {
  66     for (SubscriptValue i{0}; i < rows; ++i) {
  67       for (SubscriptValue k{0}; k < n; ++k) {
  68         ResultType x_ki;
  69         if constexpr (!X_HAS_STRIDED_COLUMNS) {
  70           x_ki = static_cast<ResultType>(x[i * n + k]);
  71         } else {
  72           x_ki = static_cast<ResultType>(reinterpret_cast<const XT *>(
  73               reinterpret_cast<const char *>(x) + i * xColumnByteStride)[k]);
  74         }
  75         ResultType y_kj;
  76         if constexpr (!Y_HAS_STRIDED_COLUMNS) {
  77           y_kj = static_cast<ResultType>(y[j * n + k]);
  78         } else {
  79           y_kj = static_cast<ResultType>(reinterpret_cast<const YT *>(
  80               reinterpret_cast<const char *>(y) + j * yColumnByteStride)[k]);
  81         }
  82         product[j * rows + i] += x_ki * y_kj;
  83       }
  84     }
  85   }
  86 }
  87
  88 template <TypeCategory RCAT, int RKIND, typename XT, typename YT>
  89 inline static void MatrixTransposedTimesMatrixHelper(
  90     CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue rows,
  91     SubscriptValue cols, const XT *RESTRICT x, const YT *RESTRICT y,
  92     SubscriptValue n, std::optional<std::size_t> xColumnByteStride,
  93     std::optional<std::size_t> yColumnByteStride) {
  94   if (!xColumnByteStride) {
  95     if (!yColumnByteStride) {
  96       MatrixTransposedTimesMatrix<RCAT, RKIND, XT, YT, false, false>(
  97           product, rows, cols, x, y, n);
  98     } else {
  99       MatrixTransposedTimesMatrix<RCAT, RKIND, XT, YT, false, true>(
 100           product, rows, cols, x, y, n, 0, *yColumnByteStride);
 101     }
 102   } else {
 103     if (!yColumnByteStride) {
 104       MatrixTransposedTimesMatrix<RCAT, RKIND, XT, YT, true, false>(
 105           product, rows, cols, x, y, n, *xColumnByteStride);
 106     } else {
 107       MatrixTransposedTimesMatrix<RCAT, RKIND, XT, YT, true, true>(
 108           product, rows, cols, x, y, n, *xColumnByteStride, *yColumnByteStride);
 109     }
 110   }
 111 }
 112
 113 // Contiguous numeric matrix*vector multiplication
 114 //   matrix(rows,n) * column vector(n) -> column vector(rows)
 115 // Straightforward algorithm:
 116 //   DO 1 I = 1, NROWS
 117 //    RES(I) = 0
 118 //    DO 1 K = 1, N
 119 //   1 RES(I) = RES(I) + X(K,I)*Y(K)
 120 // With loop distribution and transposition to avoid the inner
 121 // sum reduction and to avoid non-unit strides:
 122 //   DO 1 I = 1, NROWS
 123 //   1 RES(I) = 0
 124 //   DO 2 I = 1, NROWS
 125 //    DO 2 K = 1, N
 126 //   2 RES(I) = RES(I) + X(K,I)*Y(K)
 127 template <TypeCategory RCAT, int RKIND, typename XT, typename YT,
 128     bool X_HAS_STRIDED_COLUMNS>
 129 inline static void MatrixTransposedTimesVector(
 130     CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue rows,
 131     SubscriptValue n, const XT *RESTRICT x, const YT *RESTRICT y,
 132     std::size_t xColumnByteStride = 0) {
 133   using ResultType = CppTypeFor<RCAT, RKIND>;
 134   std::memset(product, 0, rows * sizeof *product);
 135   for (SubscriptValue i{0}; i < rows; ++i) {
 136     for (SubscriptValue k{0}; k < n; ++k) {
 137       ResultType x_ki;
 138       if constexpr (!X_HAS_STRIDED_COLUMNS) {
 139         x_ki = static_cast<ResultType>(x[i * n + k]);
 140       } else {
 141         x_ki = static_cast<ResultType>(reinterpret_cast<const XT *>(
 142             reinterpret_cast<const char *>(x) + i * xColumnByteStride)[k]);
 143       }
 144       ResultType y_k = static_cast<ResultType>(y[k]);
 145       product[i] += x_ki * y_k;
 146     }
 147   }
 148 }
 149
 150 template <TypeCategory RCAT, int RKIND, typename XT, typename YT>
 151 inline static void MatrixTransposedTimesVectorHelper(
 152     CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue rows,
 153     SubscriptValue n, const XT *RESTRICT x, const YT *RESTRICT y,
 154     std::optional<std::size_t> xColumnByteStride) {
 155   if (!xColumnByteStride) {
 156     MatrixTransposedTimesVector<RCAT, RKIND, XT, YT, false>(
 157         product, rows, n, x, y);
 158   } else {
 159     MatrixTransposedTimesVector<RCAT, RKIND, XT, YT, true>(
 160         product, rows, n, x, y, *xColumnByteStride);
 161   }
 162 }
 163
 164 // Implements an instance of MATMUL for given argument types.
 165 template <bool IS_ALLOCATING, TypeCategory RCAT, int RKIND, typename XT,
 166     typename YT>
 167 inline static void DoMatmulTranspose(
 168     std::conditional_t<IS_ALLOCATING, Descriptor, const Descriptor> &result,
 169     const Descriptor &x, const Descriptor &y, Terminator &terminator) {
 170   int xRank{x.rank()};
 171   int yRank{y.rank()};
 172   int resRank{xRank + yRank - 2};
 173   if (xRank * yRank != 2 * resRank) {
 174     terminator.Crash(
 175         "MATMUL-TRANSPOSE: bad argument ranks (%d * %d)", xRank, yRank);
 176   }
 177   SubscriptValue extent[2]{x.GetDimension(1).Extent(),
 178       resRank == 2 ? y.GetDimension(1).Extent() : 0};
 179   if constexpr (IS_ALLOCATING) {
 180     result.Establish(
 181         RCAT, RKIND, nullptr, resRank, extent, CFI_attribute_allocatable);
 182     for (int j{0}; j < resRank; ++j) {
 183       result.GetDimension(j).SetBounds(1, extent[j]);
 184     }
 185     if (int stat{result.Allocate()}) {
 186       terminator.Crash(
 187           "MATMUL-TRANSPOSE: could not allocate memory for result; STAT=%d",
 188           stat);
 189     }
 190   } else {
 191     RUNTIME_CHECK(terminator, resRank == result.rank());
 192     RUNTIME_CHECK(
 193         terminator, result.ElementBytes() == static_cast<std::size_t>(RKIND));
 194     RUNTIME_CHECK(terminator, result.GetDimension(0).Extent() == extent[0]);
 195     RUNTIME_CHECK(terminator,
 196         resRank == 1 || result.GetDimension(1).Extent() == extent[1]);
 197   }
 198   SubscriptValue n{x.GetDimension(0).Extent()};
 199   if (n != y.GetDimension(0).Extent()) {
 200     terminator.Crash(
 201         "MATMUL-TRANSPOSE: unacceptable operand shapes (%jdx%jd, %jdx%jd)",
 202         static_cast<std::intmax_t>(x.GetDimension(0).Extent()),
 203         static_cast<std::intmax_t>(x.GetDimension(1).Extent()),
 204         static_cast<std::intmax_t>(y.GetDimension(0).Extent()),
 205         static_cast<std::intmax_t>(y.GetDimension(1).Extent()));
 206   }
 207   using WriteResult =
 208       CppTypeFor<RCAT == TypeCategory::Logical ? TypeCategory::Integer : RCAT,
 209           RKIND>;
 210   const SubscriptValue rows{extent[0]};
 211   const SubscriptValue cols{extent[1]};
 212   if constexpr (RCAT != TypeCategory::Logical) {
 213     if (x.IsContiguous(1) && y.IsContiguous(1) &&
 214         (IS_ALLOCATING || result.IsContiguous())) {
 215       // Contiguous numeric matrices (maybe with columns
 216       // separated by a stride).
 217       std::optional<std::size_t> xColumnByteStride;
 218       if (!x.IsContiguous()) {
 219         // X's columns are strided.
 220         SubscriptValue xAt[2]{};
 221         x.GetLowerBounds(xAt);
 222         xAt[1]++;
 223         xColumnByteStride = x.SubscriptsToByteOffset(xAt);
 224       }
 225       std::optional<std::size_t> yColumnByteStride;
 226       if (!y.IsContiguous()) {
 227         // Y's columns are strided.
 228         SubscriptValue yAt[2]{};
 229         y.GetLowerBounds(yAt);
 230         yAt[1]++;
 231         yColumnByteStride = y.SubscriptsToByteOffset(yAt);
 232       }
 233       if (resRank == 2) { // M*M -> M
 234         // TODO: use BLAS-3 GEMM for supported types.
 235         MatrixTransposedTimesMatrixHelper<RCAT, RKIND, XT, YT>(
 236             result.template OffsetElement<WriteResult>(), rows, cols,
 237             x.OffsetElement<XT>(), y.OffsetElement<YT>(), n, xColumnByteStride,
 238             yColumnByteStride);
 239         return;
 240       }
 241       if (xRank == 2) { // M*V -> V
 242         // TODO: use BLAS-2 GEMM for supported types.
 243         MatrixTransposedTimesVectorHelper<RCAT, RKIND, XT, YT>(
 244             result.template OffsetElement<WriteResult>(), rows, n,
 245             x.OffsetElement<XT>(), y.OffsetElement<YT>(), xColumnByteStride);
 246         return;
 247       }
 248       // else V*M -> V (not allowed because TRANSPOSE() is only defined for rank
 249       // 1 matrices
 250       terminator.Crash(
 251           "MATMUL-TRANSPOSE: unacceptable operand shapes (%jdx%jd, %jdx%jd)",
 252           static_cast<std::intmax_t>(x.GetDimension(0).Extent()),
 253           static_cast<std::intmax_t>(n),
 254           static_cast<std::intmax_t>(y.GetDimension(0).Extent()),
 255           static_cast<std::intmax_t>(y.GetDimension(1).Extent()));
 256       return;
 257     }
 258   }
 259   // General algorithms for LOGICAL and noncontiguity
 260   SubscriptValue xLB[2], yLB[2], resLB[2];
 261   x.GetLowerBounds(xLB);
 262   y.GetLowerBounds(yLB);
 263   result.GetLowerBounds(resLB);
 264   using ResultType = CppTypeFor<RCAT, RKIND>;
 265   if (resRank == 2) { // M*M -> M
 266     for (SubscriptValue i{0}; i < rows; ++i) {
 267       for (SubscriptValue j{0}; j < cols; ++j) {
 268         ResultType res_ij;
 269         if constexpr (RCAT == TypeCategory::Logical) {
 270           res_ij = false;
 271         } else {
 272           res_ij = 0;
 273         }
 274
 275         for (SubscriptValue k{0}; k < n; ++k) {
 276           SubscriptValue xAt[2]{k + xLB[0], i + xLB[1]};
 277           SubscriptValue yAt[2]{k + yLB[0], j + yLB[1]};
 278           if constexpr (RCAT == TypeCategory::Logical) {
 279             ResultType x_ki = IsLogicalElementTrue(x, xAt);
 280             ResultType y_kj = IsLogicalElementTrue(y, yAt);
 281             res_ij = res_ij || (x_ki && y_kj);
 282           } else {
 283             ResultType x_ki = static_cast<ResultType>(*x.Element<XT>(xAt));
 284             ResultType y_kj = static_cast<ResultType>(*y.Element<YT>(yAt));
 285             res_ij += x_ki * y_kj;
 286           }
 287         }
 288         SubscriptValue resAt[2]{i + resLB[0], j + resLB[1]};
 289         *result.template Element<WriteResult>(resAt) = res_ij;
 290       }
 291     }
 292   } else if (xRank == 2) { // M*V -> V
 293     for (SubscriptValue i{0}; i < rows; ++i) {
 294       ResultType res_i;
 295       if constexpr (RCAT == TypeCategory::Logical) {
 296         res_i = false;
 297       } else {
 298         res_i = 0;
 299       }
 300
 301       for (SubscriptValue k{0}; k < n; ++k) {
 302         SubscriptValue xAt[2]{k + xLB[0], i + xLB[1]};
 303         SubscriptValue yAt[1]{k + yLB[0]};
 304         if constexpr (RCAT == TypeCategory::Logical) {
 305           ResultType x_ki = IsLogicalElementTrue(x, xAt);
 306           ResultType y_k = IsLogicalElementTrue(y, yAt);
 307           res_i = res_i || (x_ki && y_k);
 308         } else {
 309           ResultType x_ki = static_cast<ResultType>(*x.Element<XT>(xAt));
 310           ResultType y_k = static_cast<ResultType>(*y.Element<YT>(yAt));
 311           res_i += x_ki * y_k;
 312         }
 313       }
 314       SubscriptValue resAt[1]{i + resLB[0]};
 315       *result.template Element<WriteResult>(resAt) = res_i;
 316     }
 317   } else { // V*M -> V
 318     // TRANSPOSE(V) not allowed by fortran standard
 319     terminator.Crash(
 320         "MATMUL-TRANSPOSE: unacceptable operand shapes (%jdx%jd, %jdx%jd)",
 321         static_cast<std::intmax_t>(x.GetDimension(0).Extent()),
 322         static_cast<std::intmax_t>(n),
 323         static_cast<std::intmax_t>(y.GetDimension(0).Extent()),
 324         static_cast<std::intmax_t>(y.GetDimension(1).Extent()));
 325   }
 326 }
 327
 328 // Maps the dynamic type information from the arguments' descriptors
 329 // to the right instantiation of DoMatmul() for valid combinations of
 330 // types.
 331 template <bool IS_ALLOCATING> struct MatmulTranspose {
 332   using ResultDescriptor =
 333       std::conditional_t<IS_ALLOCATING, Descriptor, const Descriptor>;
 334   template <TypeCategory XCAT, int XKIND> struct MM1 {
 335     template <TypeCategory YCAT, int YKIND> struct MM2 {
 336       void operator()(ResultDescriptor &result, const Descriptor &x,
 337           const Descriptor &y, Terminator &terminator) const {
 338         if constexpr (constexpr auto resultType{
 339                           GetResultType(XCAT, XKIND, YCAT, YKIND)}) {
 340           if constexpr (Fortran::common::IsNumericTypeCategory(
 341                             resultType->first) ||
 342               resultType->first == TypeCategory::Logical) {
 343             return DoMatmulTranspose<IS_ALLOCATING, resultType->first,
 344                 resultType->second, CppTypeFor<XCAT, XKIND>,
 345                 CppTypeFor<YCAT, YKIND>>(result, x, y, terminator);
 346           }
 347         }
 348         terminator.Crash("MATMUL-TRANSPOSE: bad operand types (%d(%d), %d(%d))",
 349             static_cast<int>(XCAT), XKIND, static_cast<int>(YCAT), YKIND);
 350       }
 351     };
 352     void operator()(ResultDescriptor &result, const Descriptor &x,
 353         const Descriptor &y, Terminator &terminator, TypeCategory yCat,
 354         int yKind) const {
 355       ApplyType<MM2, void>(yCat, yKind, terminator, result, x, y, terminator);
 356     }
 357   };
 358   void operator()(ResultDescriptor &result, const Descriptor &x,
 359       const Descriptor &y, const char *sourceFile, int line) const {
 360     Terminator terminator{sourceFile, line};
 361     auto xCatKind{x.type().GetCategoryAndKind()};
 362     auto yCatKind{y.type().GetCategoryAndKind()};
 363     RUNTIME_CHECK(terminator, xCatKind.has_value() && yCatKind.has_value());
 364     ApplyType<MM1, void>(xCatKind->first, xCatKind->second, terminator, result,
 365         x, y, terminator, yCatKind->first, yCatKind->second);
 366   }
 367 };
 368 } // namespace
 369
 370 namespace Fortran::runtime {
 371 extern "C" {
 372 void RTNAME(MatmulTranspose)(Descriptor &result, const Descriptor &x,
 373     const Descriptor &y, const char *sourceFile, int line) {
 374   MatmulTranspose<true>{}(result, x, y, sourceFile, line);
 375 }
 376 void RTNAME(MatmulTransposeDirect)(const Descriptor &result,
 377     const Descriptor &x, const Descriptor &y, const char *sourceFile,
 378     int line) {
 379   MatmulTranspose<false>{}(result, x, y, sourceFile, line);
 380 }
 381 } // extern "C"
 382 } // namespace Fortran::runtime