flang/runtime/matmul.cpp

   1 //===-- runtime/matmul.cpp ------------------------------------------------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8
   9 // Implements all forms of MATMUL (Fortran 2018 16.9.124)
  10 //
  11 // There are two main entry points; one establishes a descriptor for the
  12 // result and allocates it, and the other expects a result descriptor that
  13 // points to existing storage.
  14 //
  15 // This implementation must handle all combinations of numeric types and
  16 // kinds (100 - 165 cases depending on the target), plus all combinations
  17 // of logical kinds (16).  A single template undergoes many instantiations
  18 // to cover all of the valid possibilities.
  19 //
  20 // Places where BLAS routines could be called are marked as TODO items.
  21
  22 #include "flang/Runtime/matmul.h"
  23 #include "terminator.h"
  24 #include "tools.h"
  25 #include "flang/Runtime/c-or-cpp.h"
  26 #include "flang/Runtime/cpp-type.h"
  27 #include "flang/Runtime/descriptor.h"
  28 #include <cstring>
  29
  30 namespace Fortran::runtime {
  31
  32 // General accumulator for any type and stride; this is not used for
  33 // contiguous numeric cases.
  34 template <TypeCategory RCAT, int RKIND, typename XT, typename YT>
  35 class Accumulator {
  36 public:
  37   using Result = AccumulationType<RCAT, RKIND>;
  38   Accumulator(const Descriptor &x, const Descriptor &y) : x_{x}, y_{y} {}
  39   void Accumulate(const SubscriptValue xAt[], const SubscriptValue yAt[]) {
  40     if constexpr (RCAT == TypeCategory::Logical) {
  41       sum_ = sum_ ||
  42           (IsLogicalElementTrue(x_, xAt) && IsLogicalElementTrue(y_, yAt));
  43     } else {
  44       sum_ += static_cast<Result>(*x_.Element<XT>(xAt)) *
  45           static_cast<Result>(*y_.Element<YT>(yAt));
  46     }
  47   }
  48   Result GetResult() const { return sum_; }
  49
  50 private:
  51   const Descriptor &x_, &y_;
  52   Result sum_{};
  53 };
  54
  55 // Contiguous numeric matrix*matrix multiplication
  56 //   matrix(rows,n) * matrix(n,cols) -> matrix(rows,cols)
  57 // Straightforward algorithm:
  58 //   DO 1 I = 1, NROWS
  59 //    DO 1 J = 1, NCOLS
  60 //     RES(I,J) = 0
  61 //     DO 1 K = 1, N
  62 //   1  RES(I,J) = RES(I,J) + X(I,K)*Y(K,J)
  63 // With loop distribution and transposition to avoid the inner sum
  64 // reduction and to avoid non-unit strides:
  65 //   DO 1 I = 1, NROWS
  66 //    DO 1 J = 1, NCOLS
  67 //   1 RES(I,J) = 0
  68 //   DO 2 K = 1, N
  69 //    DO 2 J = 1, NCOLS
  70 //     DO 2 I = 1, NROWS
  71 //   2  RES(I,J) = RES(I,J) + X(I,K)*Y(K,J) ! loop-invariant last term
  72 template <TypeCategory RCAT, int RKIND, typename XT, typename YT,
  73     bool X_HAS_STRIDED_COLUMNS, bool Y_HAS_STRIDED_COLUMNS>
  74 inline void MatrixTimesMatrix(CppTypeFor<RCAT, RKIND> *RESTRICT product,
  75     SubscriptValue rows, SubscriptValue cols, const XT *RESTRICT x,
  76     const YT *RESTRICT y, SubscriptValue n, std::size_t xColumnByteStride = 0,
  77     std::size_t yColumnByteStride = 0) {
  78   using ResultType = CppTypeFor<RCAT, RKIND>;
  79   std::memset(product, 0, rows * cols * sizeof *product);
  80   const XT *RESTRICT xp0{x};
  81   for (SubscriptValue k{0}; k < n; ++k) {
  82     ResultType *RESTRICT p{product};
  83     for (SubscriptValue j{0}; j < cols; ++j) {
  84       const XT *RESTRICT xp{xp0};
  85       ResultType yv;
  86       if constexpr (!Y_HAS_STRIDED_COLUMNS) {
  87         yv = static_cast<ResultType>(y[k + j * n]);
  88       } else {
  89         yv = static_cast<ResultType>(reinterpret_cast<const YT *>(
  90             reinterpret_cast<const char *>(y) + j * yColumnByteStride)[k]);
  91       }
  92       for (SubscriptValue i{0}; i < rows; ++i) {
  93         *p++ += static_cast<ResultType>(*xp++) * yv;
  94       }
  95     }
  96     if constexpr (!X_HAS_STRIDED_COLUMNS) {
  97       xp0 += rows;
  98     } else {
  99       xp0 = reinterpret_cast<const XT *>(
 100           reinterpret_cast<const char *>(xp0) + xColumnByteStride);
 101     }
 102   }
 103 }
 104
 105 template <TypeCategory RCAT, int RKIND, typename XT, typename YT>
 106 inline void MatrixTimesMatrixHelper(CppTypeFor<RCAT, RKIND> *RESTRICT product,
 107     SubscriptValue rows, SubscriptValue cols, const XT *RESTRICT x,
 108     const YT *RESTRICT y, SubscriptValue n,
 109     std::optional<std::size_t> xColumnByteStride,
 110     std::optional<std::size_t> yColumnByteStride) {
 111   if (!xColumnByteStride) {
 112     if (!yColumnByteStride) {
 113       MatrixTimesMatrix<RCAT, RKIND, XT, YT, false, false>(
 114           product, rows, cols, x, y, n);
 115     } else {
 116       MatrixTimesMatrix<RCAT, RKIND, XT, YT, false, true>(
 117           product, rows, cols, x, y, n, 0, *yColumnByteStride);
 118     }
 119   } else {
 120     if (!yColumnByteStride) {
 121       MatrixTimesMatrix<RCAT, RKIND, XT, YT, true, false>(
 122           product, rows, cols, x, y, n, *xColumnByteStride);
 123     } else {
 124       MatrixTimesMatrix<RCAT, RKIND, XT, YT, true, true>(
 125           product, rows, cols, x, y, n, *xColumnByteStride, *yColumnByteStride);
 126     }
 127   }
 128 }
 129
 130 // Contiguous numeric matrix*vector multiplication
 131 //   matrix(rows,n) * column vector(n) -> column vector(rows)
 132 // Straightforward algorithm:
 133 //   DO 1 J = 1, NROWS
 134 //    RES(J) = 0
 135 //    DO 1 K = 1, N
 136 //   1 RES(J) = RES(J) + X(J,K)*Y(K)
 137 // With loop distribution and transposition to avoid the inner
 138 // sum reduction and to avoid non-unit strides:
 139 //   DO 1 J = 1, NROWS
 140 //   1 RES(J) = 0
 141 //   DO 2 K = 1, N
 142 //    DO 2 J = 1, NROWS
 143 //   2 RES(J) = RES(J) + X(J,K)*Y(K)
 144 template <TypeCategory RCAT, int RKIND, typename XT, typename YT,
 145     bool X_HAS_STRIDED_COLUMNS>
 146 inline void MatrixTimesVector(CppTypeFor<RCAT, RKIND> *RESTRICT product,
 147     SubscriptValue rows, SubscriptValue n, const XT *RESTRICT x,
 148     const YT *RESTRICT y, std::size_t xColumnByteStride = 0) {
 149   using ResultType = CppTypeFor<RCAT, RKIND>;
 150   std::memset(product, 0, rows * sizeof *product);
 151   [[maybe_unused]] const XT *RESTRICT xp0{x};
 152   for (SubscriptValue k{0}; k < n; ++k) {
 153     ResultType *RESTRICT p{product};
 154     auto yv{static_cast<ResultType>(*y++)};
 155     for (SubscriptValue j{0}; j < rows; ++j) {
 156       *p++ += static_cast<ResultType>(*x++) * yv;
 157     }
 158     if constexpr (X_HAS_STRIDED_COLUMNS) {
 159       xp0 = reinterpret_cast<const XT *>(
 160           reinterpret_cast<const char *>(xp0) + xColumnByteStride);
 161       x = xp0;
 162     }
 163   }
 164 }
 165
 166 template <TypeCategory RCAT, int RKIND, typename XT, typename YT>
 167 inline void MatrixTimesVectorHelper(CppTypeFor<RCAT, RKIND> *RESTRICT product,
 168     SubscriptValue rows, SubscriptValue n, const XT *RESTRICT x,
 169     const YT *RESTRICT y, std::optional<std::size_t> xColumnByteStride) {
 170   if (!xColumnByteStride) {
 171     MatrixTimesVector<RCAT, RKIND, XT, YT, false>(product, rows, n, x, y);
 172   } else {
 173     MatrixTimesVector<RCAT, RKIND, XT, YT, true>(
 174         product, rows, n, x, y, *xColumnByteStride);
 175   }
 176 }
 177
 178 // Contiguous numeric vector*matrix multiplication
 179 //   row vector(n) * matrix(n,cols) -> row vector(cols)
 180 // Straightforward algorithm:
 181 //   DO 1 J = 1, NCOLS
 182 //    RES(J) = 0
 183 //    DO 1 K = 1, N
 184 //   1 RES(J) = RES(J) + X(K)*Y(K,J)
 185 // With loop distribution and transposition to avoid the inner
 186 // sum reduction and one non-unit stride (the other remains):
 187 //   DO 1 J = 1, NCOLS
 188 //   1 RES(J) = 0
 189 //   DO 2 K = 1, N
 190 //    DO 2 J = 1, NCOLS
 191 //   2 RES(J) = RES(J) + X(K)*Y(K,J)
 192 template <TypeCategory RCAT, int RKIND, typename XT, typename YT,
 193     bool Y_HAS_STRIDED_COLUMNS>
 194 inline void VectorTimesMatrix(CppTypeFor<RCAT, RKIND> *RESTRICT product,
 195     SubscriptValue n, SubscriptValue cols, const XT *RESTRICT x,
 196     const YT *RESTRICT y, std::size_t yColumnByteStride = 0) {
 197   using ResultType = CppTypeFor<RCAT, RKIND>;
 198   std::memset(product, 0, cols * sizeof *product);
 199   for (SubscriptValue k{0}; k < n; ++k) {
 200     ResultType *RESTRICT p{product};
 201     auto xv{static_cast<ResultType>(*x++)};
 202     const YT *RESTRICT yp{&y[k]};
 203     for (SubscriptValue j{0}; j < cols; ++j) {
 204       *p++ += xv * static_cast<ResultType>(*yp);
 205       if constexpr (!Y_HAS_STRIDED_COLUMNS) {
 206         yp += n;
 207       } else {
 208         yp = reinterpret_cast<const YT *>(
 209             reinterpret_cast<const char *>(yp) + yColumnByteStride);
 210       }
 211     }
 212   }
 213 }
 214
 215 template <TypeCategory RCAT, int RKIND, typename XT, typename YT,
 216     bool SPARSE_COLUMNS = false>
 217 inline void VectorTimesMatrixHelper(CppTypeFor<RCAT, RKIND> *RESTRICT product,
 218     SubscriptValue n, SubscriptValue cols, const XT *RESTRICT x,
 219     const YT *RESTRICT y, std::optional<std::size_t> yColumnByteStride) {
 220   if (!yColumnByteStride) {
 221     VectorTimesMatrix<RCAT, RKIND, XT, YT, false>(product, n, cols, x, y);
 222   } else {
 223     VectorTimesMatrix<RCAT, RKIND, XT, YT, true>(
 224         product, n, cols, x, y, *yColumnByteStride);
 225   }
 226 }
 227
 228 // Implements an instance of MATMUL for given argument types.
 229 template <bool IS_ALLOCATING, TypeCategory RCAT, int RKIND, typename XT,
 230     typename YT>
 231 static inline void DoMatmul(
 232     std::conditional_t<IS_ALLOCATING, Descriptor, const Descriptor> &result,
 233     const Descriptor &x, const Descriptor &y, Terminator &terminator) {
 234   int xRank{x.rank()};
 235   int yRank{y.rank()};
 236   int resRank{xRank + yRank - 2};
 237   if (xRank * yRank != 2 * resRank) {
 238     terminator.Crash("MATMUL: bad argument ranks (%d * %d)", xRank, yRank);
 239   }
 240   SubscriptValue extent[2]{
 241       xRank == 2 ? x.GetDimension(0).Extent() : y.GetDimension(1).Extent(),
 242       resRank == 2 ? y.GetDimension(1).Extent() : 0};
 243   if constexpr (IS_ALLOCATING) {
 244     result.Establish(
 245         RCAT, RKIND, nullptr, resRank, extent, CFI_attribute_allocatable);
 246     for (int j{0}; j < resRank; ++j) {
 247       result.GetDimension(j).SetBounds(1, extent[j]);
 248     }
 249     if (int stat{result.Allocate()}) {
 250       terminator.Crash(
 251           "MATMUL: could not allocate memory for result; STAT=%d", stat);
 252     }
 253   } else {
 254     RUNTIME_CHECK(terminator, resRank == result.rank());
 255     RUNTIME_CHECK(
 256         terminator, result.ElementBytes() == static_cast<std::size_t>(RKIND));
 257     RUNTIME_CHECK(terminator, result.GetDimension(0).Extent() == extent[0]);
 258     RUNTIME_CHECK(terminator,
 259         resRank == 1 || result.GetDimension(1).Extent() == extent[1]);
 260   }
 261   SubscriptValue n{x.GetDimension(xRank - 1).Extent()};
 262   if (n != y.GetDimension(0).Extent()) {
 263     terminator.Crash("MATMUL: unacceptable operand shapes (%jdx%jd, %jdx%jd)",
 264         static_cast<std::intmax_t>(x.GetDimension(0).Extent()),
 265         static_cast<std::intmax_t>(n),
 266         static_cast<std::intmax_t>(y.GetDimension(0).Extent()),
 267         static_cast<std::intmax_t>(y.GetDimension(1).Extent()));
 268   }
 269   using WriteResult =
 270       CppTypeFor<RCAT == TypeCategory::Logical ? TypeCategory::Integer : RCAT,
 271           RKIND>;
 272   if constexpr (RCAT != TypeCategory::Logical) {
 273     if (x.IsContiguous(1) && y.IsContiguous(1) &&
 274         (IS_ALLOCATING || result.IsContiguous())) {
 275       // Contiguous numeric matrices (maybe with columns
 276       // separated by a stride).
 277       std::optional<std::size_t> xColumnByteStride;
 278       if (!x.IsContiguous()) {
 279         // X's columns are strided.
 280         SubscriptValue xAt[2]{};
 281         x.GetLowerBounds(xAt);
 282         xAt[1]++;
 283         xColumnByteStride = x.SubscriptsToByteOffset(xAt);
 284       }
 285       std::optional<std::size_t> yColumnByteStride;
 286       if (!y.IsContiguous()) {
 287         // Y's columns are strided.
 288         SubscriptValue yAt[2]{};
 289         y.GetLowerBounds(yAt);
 290         yAt[1]++;
 291         yColumnByteStride = y.SubscriptsToByteOffset(yAt);
 292       }
 293       // Note that BLAS GEMM can be used for the strided
 294       // columns by setting proper leading dimension size.
 295       // This implies that the column stride is divisible
 296       // by the element size, which is usually true.
 297       if (resRank == 2) { // M*M -> M
 298         if (std::is_same_v<XT, YT>) {
 299           if constexpr (std::is_same_v<XT, float>) {
 300             // TODO: call BLAS-3 SGEMM
 301             // TODO: try using CUTLASS for device.
 302           } else if constexpr (std::is_same_v<XT, double>) {
 303             // TODO: call BLAS-3 DGEMM
 304           } else if constexpr (std::is_same_v<XT, std::complex<float>>) {
 305             // TODO: call BLAS-3 CGEMM
 306           } else if constexpr (std::is_same_v<XT, std::complex<double>>) {
 307             // TODO: call BLAS-3 ZGEMM
 308           }
 309         }
 310         MatrixTimesMatrixHelper<RCAT, RKIND, XT, YT>(
 311             result.template OffsetElement<WriteResult>(), extent[0], extent[1],
 312             x.OffsetElement<XT>(), y.OffsetElement<YT>(), n, xColumnByteStride,
 313             yColumnByteStride);
 314         return;
 315       } else if (xRank == 2) { // M*V -> V
 316         if (std::is_same_v<XT, YT>) {
 317           if constexpr (std::is_same_v<XT, float>) {
 318             // TODO: call BLAS-2 SGEMV(x,y)
 319           } else if constexpr (std::is_same_v<XT, double>) {
 320             // TODO: call BLAS-2 DGEMV(x,y)
 321           } else if constexpr (std::is_same_v<XT, std::complex<float>>) {
 322             // TODO: call BLAS-2 CGEMV(x,y)
 323           } else if constexpr (std::is_same_v<XT, std::complex<double>>) {
 324             // TODO: call BLAS-2 ZGEMV(x,y)
 325           }
 326         }
 327         MatrixTimesVectorHelper<RCAT, RKIND, XT, YT>(
 328             result.template OffsetElement<WriteResult>(), extent[0], n,
 329             x.OffsetElement<XT>(), y.OffsetElement<YT>(), xColumnByteStride);
 330         return;
 331       } else { // V*M -> V
 332         if (std::is_same_v<XT, YT>) {
 333           if constexpr (std::is_same_v<XT, float>) {
 334             // TODO: call BLAS-2 SGEMV(y,x)
 335           } else if constexpr (std::is_same_v<XT, double>) {
 336             // TODO: call BLAS-2 DGEMV(y,x)
 337           } else if constexpr (std::is_same_v<XT, std::complex<float>>) {
 338             // TODO: call BLAS-2 CGEMV(y,x)
 339           } else if constexpr (std::is_same_v<XT, std::complex<double>>) {
 340             // TODO: call BLAS-2 ZGEMV(y,x)
 341           }
 342         }
 343         VectorTimesMatrixHelper<RCAT, RKIND, XT, YT>(
 344             result.template OffsetElement<WriteResult>(), n, extent[0],
 345             x.OffsetElement<XT>(), y.OffsetElement<YT>(), yColumnByteStride);
 346         return;
 347       }
 348     }
 349   }
 350   // General algorithms for LOGICAL and noncontiguity
 351   SubscriptValue xAt[2], yAt[2], resAt[2];
 352   x.GetLowerBounds(xAt);
 353   y.GetLowerBounds(yAt);
 354   result.GetLowerBounds(resAt);
 355   if (resRank == 2) { // M*M -> M
 356     SubscriptValue x1{xAt[1]}, y0{yAt[0]}, y1{yAt[1]}, res1{resAt[1]};
 357     for (SubscriptValue i{0}; i < extent[0]; ++i) {
 358       for (SubscriptValue j{0}; j < extent[1]; ++j) {
 359         Accumulator<RCAT, RKIND, XT, YT> accumulator{x, y};
 360         yAt[1] = y1 + j;
 361         for (SubscriptValue k{0}; k < n; ++k) {
 362           xAt[1] = x1 + k;
 363           yAt[0] = y0 + k;
 364           accumulator.Accumulate(xAt, yAt);
 365         }
 366         resAt[1] = res1 + j;
 367         *result.template Element<WriteResult>(resAt) = accumulator.GetResult();
 368       }
 369       ++resAt[0];
 370       ++xAt[0];
 371     }
 372   } else if (xRank == 2) { // M*V -> V
 373     SubscriptValue x1{xAt[1]}, y0{yAt[0]};
 374     for (SubscriptValue j{0}; j < extent[0]; ++j) {
 375       Accumulator<RCAT, RKIND, XT, YT> accumulator{x, y};
 376       for (SubscriptValue k{0}; k < n; ++k) {
 377         xAt[1] = x1 + k;
 378         yAt[0] = y0 + k;
 379         accumulator.Accumulate(xAt, yAt);
 380       }
 381       *result.template Element<WriteResult>(resAt) = accumulator.GetResult();
 382       ++resAt[0];
 383       ++xAt[0];
 384     }
 385   } else { // V*M -> V
 386     SubscriptValue x0{xAt[0]}, y0{yAt[0]};
 387     for (SubscriptValue j{0}; j < extent[0]; ++j) {
 388       Accumulator<RCAT, RKIND, XT, YT> accumulator{x, y};
 389       for (SubscriptValue k{0}; k < n; ++k) {
 390         xAt[0] = x0 + k;
 391         yAt[0] = y0 + k;
 392         accumulator.Accumulate(xAt, yAt);
 393       }
 394       *result.template Element<WriteResult>(resAt) = accumulator.GetResult();
 395       ++resAt[0];
 396       ++yAt[1];
 397     }
 398   }
 399 }
 400
 401 // Maps the dynamic type information from the arguments' descriptors
 402 // to the right instantiation of DoMatmul() for valid combinations of
 403 // types.
 404 template <bool IS_ALLOCATING> struct Matmul {
 405   using ResultDescriptor =
 406       std::conditional_t<IS_ALLOCATING, Descriptor, const Descriptor>;
 407   template <TypeCategory XCAT, int XKIND> struct MM1 {
 408     template <TypeCategory YCAT, int YKIND> struct MM2 {
 409       void operator()(ResultDescriptor &result, const Descriptor &x,
 410           const Descriptor &y, Terminator &terminator) const {
 411         if constexpr (constexpr auto resultType{
 412                           GetResultType(XCAT, XKIND, YCAT, YKIND)}) {
 413           if constexpr (common::IsNumericTypeCategory(resultType->first) ||
 414               resultType->first == TypeCategory::Logical) {
 415             return DoMatmul<IS_ALLOCATING, resultType->first,
 416                 resultType->second, CppTypeFor<XCAT, XKIND>,
 417                 CppTypeFor<YCAT, YKIND>>(result, x, y, terminator);
 418           }
 419         }
 420         terminator.Crash("MATMUL: bad operand types (%d(%d), %d(%d))",
 421             static_cast<int>(XCAT), XKIND, static_cast<int>(YCAT), YKIND);
 422       }
 423     };
 424     void operator()(ResultDescriptor &result, const Descriptor &x,
 425         const Descriptor &y, Terminator &terminator, TypeCategory yCat,
 426         int yKind) const {
 427       ApplyType<MM2, void>(yCat, yKind, terminator, result, x, y, terminator);
 428     }
 429   };
 430   void operator()(ResultDescriptor &result, const Descriptor &x,
 431       const Descriptor &y, const char *sourceFile, int line) const {
 432     Terminator terminator{sourceFile, line};
 433     auto xCatKind{x.type().GetCategoryAndKind()};
 434     auto yCatKind{y.type().GetCategoryAndKind()};
 435     RUNTIME_CHECK(terminator, xCatKind.has_value() && yCatKind.has_value());
 436     ApplyType<MM1, void>(xCatKind->first, xCatKind->second, terminator, result,
 437         x, y, terminator, yCatKind->first, yCatKind->second);
 438   }
 439 };
 440
 441 extern "C" {
 442 void RTNAME(Matmul)(Descriptor &result, const Descriptor &x,
 443     const Descriptor &y, const char *sourceFile, int line) {
 444   Matmul<true>{}(result, x, y, sourceFile, line);
 445 }
 446 void RTNAME(MatmulDirect)(const Descriptor &result, const Descriptor &x,
 447     const Descriptor &y, const char *sourceFile, int line) {
 448   Matmul<false>{}(result, x, y, sourceFile, line);
 449 }
 450 } // extern "C"
 451 } // namespace Fortran::runtime