//===-- runtime/matmul.cpp ------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// Implements all forms of MATMUL (Fortran 2018 16.9.124)
//
// There are two main entry points; one establishes a descriptor for the
// result and allocates it, and the other expects a result descriptor that
// points to existing storage.
//
// This implementation must handle all combinations of numeric types and
// kinds (100 - 165 cases depending on the target), plus all combinations
// of logical kinds (16). A single template undergoes many instantiations
// to cover all of the valid possibilities.
//
// Places where BLAS routines could be called are marked as TODO items.
#include "flang/Runtime/matmul.h"
#include "terminator.h"
#include "flang/Runtime/c-or-cpp.h"
#include "flang/Runtime/cpp-type.h"
#include "flang/Runtime/descriptor.h"
#include <complex>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <optional>
#include <type_traits>
30 namespace Fortran::runtime
{
32 // General accumulator for any type and stride; this is not used for
33 // contiguous numeric cases.
34 template <TypeCategory RCAT
, int RKIND
, typename XT
, typename YT
>
37 using Result
= AccumulationType
<RCAT
, RKIND
>;
38 Accumulator(const Descriptor
&x
, const Descriptor
&y
) : x_
{x
}, y_
{y
} {}
39 void Accumulate(const SubscriptValue xAt
[], const SubscriptValue yAt
[]) {
40 if constexpr (RCAT
== TypeCategory::Logical
) {
42 (IsLogicalElementTrue(x_
, xAt
) && IsLogicalElementTrue(y_
, yAt
));
44 sum_
+= static_cast<Result
>(*x_
.Element
<XT
>(xAt
)) *
45 static_cast<Result
>(*y_
.Element
<YT
>(yAt
));
48 Result
GetResult() const { return sum_
; }
51 const Descriptor
&x_
, &y_
;
55 // Contiguous numeric matrix*matrix multiplication
56 // matrix(rows,n) * matrix(n,cols) -> matrix(rows,cols)
57 // Straightforward algorithm:
62 // 1 RES(I,J) = RES(I,J) + X(I,K)*Y(K,J)
63 // With loop distribution and transposition to avoid the inner sum
64 // reduction and to avoid non-unit strides:
71 // 2 RES(I,J) = RES(I,J) + X(I,K)*Y(K,J) ! loop-invariant last term
72 template <TypeCategory RCAT
, int RKIND
, typename XT
, typename YT
,
73 bool X_HAS_STRIDED_COLUMNS
, bool Y_HAS_STRIDED_COLUMNS
>
74 inline void MatrixTimesMatrix(CppTypeFor
<RCAT
, RKIND
> *RESTRICT product
,
75 SubscriptValue rows
, SubscriptValue cols
, const XT
*RESTRICT x
,
76 const YT
*RESTRICT y
, SubscriptValue n
, std::size_t xColumnByteStride
= 0,
77 std::size_t yColumnByteStride
= 0) {
78 using ResultType
= CppTypeFor
<RCAT
, RKIND
>;
79 std::memset(product
, 0, rows
* cols
* sizeof *product
);
80 const XT
*RESTRICT xp0
{x
};
81 for (SubscriptValue k
{0}; k
< n
; ++k
) {
82 ResultType
*RESTRICT p
{product
};
83 for (SubscriptValue j
{0}; j
< cols
; ++j
) {
84 const XT
*RESTRICT xp
{xp0
};
86 if constexpr (!Y_HAS_STRIDED_COLUMNS
) {
87 yv
= static_cast<ResultType
>(y
[k
+ j
* n
]);
89 yv
= static_cast<ResultType
>(reinterpret_cast<const YT
*>(
90 reinterpret_cast<const char *>(y
) + j
* yColumnByteStride
)[k
]);
92 for (SubscriptValue i
{0}; i
< rows
; ++i
) {
93 *p
++ += static_cast<ResultType
>(*xp
++) * yv
;
96 if constexpr (!X_HAS_STRIDED_COLUMNS
) {
99 xp0
= reinterpret_cast<const XT
*>(
100 reinterpret_cast<const char *>(xp0
) + xColumnByteStride
);
105 template <TypeCategory RCAT
, int RKIND
, typename XT
, typename YT
>
106 inline void MatrixTimesMatrixHelper(CppTypeFor
<RCAT
, RKIND
> *RESTRICT product
,
107 SubscriptValue rows
, SubscriptValue cols
, const XT
*RESTRICT x
,
108 const YT
*RESTRICT y
, SubscriptValue n
,
109 std::optional
<std::size_t> xColumnByteStride
,
110 std::optional
<std::size_t> yColumnByteStride
) {
111 if (!xColumnByteStride
) {
112 if (!yColumnByteStride
) {
113 MatrixTimesMatrix
<RCAT
, RKIND
, XT
, YT
, false, false>(
114 product
, rows
, cols
, x
, y
, n
);
116 MatrixTimesMatrix
<RCAT
, RKIND
, XT
, YT
, false, true>(
117 product
, rows
, cols
, x
, y
, n
, 0, *yColumnByteStride
);
120 if (!yColumnByteStride
) {
121 MatrixTimesMatrix
<RCAT
, RKIND
, XT
, YT
, true, false>(
122 product
, rows
, cols
, x
, y
, n
, *xColumnByteStride
);
124 MatrixTimesMatrix
<RCAT
, RKIND
, XT
, YT
, true, true>(
125 product
, rows
, cols
, x
, y
, n
, *xColumnByteStride
, *yColumnByteStride
);
130 // Contiguous numeric matrix*vector multiplication
131 // matrix(rows,n) * column vector(n) -> column vector(rows)
132 // Straightforward algorithm:
136 // 1 RES(J) = RES(J) + X(J,K)*Y(K)
137 // With loop distribution and transposition to avoid the inner
138 // sum reduction and to avoid non-unit strides:
143 // 2 RES(J) = RES(J) + X(J,K)*Y(K)
144 template <TypeCategory RCAT
, int RKIND
, typename XT
, typename YT
,
145 bool X_HAS_STRIDED_COLUMNS
>
146 inline void MatrixTimesVector(CppTypeFor
<RCAT
, RKIND
> *RESTRICT product
,
147 SubscriptValue rows
, SubscriptValue n
, const XT
*RESTRICT x
,
148 const YT
*RESTRICT y
, std::size_t xColumnByteStride
= 0) {
149 using ResultType
= CppTypeFor
<RCAT
, RKIND
>;
150 std::memset(product
, 0, rows
* sizeof *product
);
151 [[maybe_unused
]] const XT
*RESTRICT xp0
{x
};
152 for (SubscriptValue k
{0}; k
< n
; ++k
) {
153 ResultType
*RESTRICT p
{product
};
154 auto yv
{static_cast<ResultType
>(*y
++)};
155 for (SubscriptValue j
{0}; j
< rows
; ++j
) {
156 *p
++ += static_cast<ResultType
>(*x
++) * yv
;
158 if constexpr (X_HAS_STRIDED_COLUMNS
) {
159 xp0
= reinterpret_cast<const XT
*>(
160 reinterpret_cast<const char *>(xp0
) + xColumnByteStride
);
166 template <TypeCategory RCAT
, int RKIND
, typename XT
, typename YT
>
167 inline void MatrixTimesVectorHelper(CppTypeFor
<RCAT
, RKIND
> *RESTRICT product
,
168 SubscriptValue rows
, SubscriptValue n
, const XT
*RESTRICT x
,
169 const YT
*RESTRICT y
, std::optional
<std::size_t> xColumnByteStride
) {
170 if (!xColumnByteStride
) {
171 MatrixTimesVector
<RCAT
, RKIND
, XT
, YT
, false>(product
, rows
, n
, x
, y
);
173 MatrixTimesVector
<RCAT
, RKIND
, XT
, YT
, true>(
174 product
, rows
, n
, x
, y
, *xColumnByteStride
);
178 // Contiguous numeric vector*matrix multiplication
179 // row vector(n) * matrix(n,cols) -> row vector(cols)
180 // Straightforward algorithm:
184 // 1 RES(J) = RES(J) + X(K)*Y(K,J)
185 // With loop distribution and transposition to avoid the inner
186 // sum reduction and one non-unit stride (the other remains):
191 // 2 RES(J) = RES(J) + X(K)*Y(K,J)
192 template <TypeCategory RCAT
, int RKIND
, typename XT
, typename YT
,
193 bool Y_HAS_STRIDED_COLUMNS
>
194 inline void VectorTimesMatrix(CppTypeFor
<RCAT
, RKIND
> *RESTRICT product
,
195 SubscriptValue n
, SubscriptValue cols
, const XT
*RESTRICT x
,
196 const YT
*RESTRICT y
, std::size_t yColumnByteStride
= 0) {
197 using ResultType
= CppTypeFor
<RCAT
, RKIND
>;
198 std::memset(product
, 0, cols
* sizeof *product
);
199 for (SubscriptValue k
{0}; k
< n
; ++k
) {
200 ResultType
*RESTRICT p
{product
};
201 auto xv
{static_cast<ResultType
>(*x
++)};
202 const YT
*RESTRICT yp
{&y
[k
]};
203 for (SubscriptValue j
{0}; j
< cols
; ++j
) {
204 *p
++ += xv
* static_cast<ResultType
>(*yp
);
205 if constexpr (!Y_HAS_STRIDED_COLUMNS
) {
208 yp
= reinterpret_cast<const YT
*>(
209 reinterpret_cast<const char *>(yp
) + yColumnByteStride
);
215 template <TypeCategory RCAT
, int RKIND
, typename XT
, typename YT
,
216 bool SPARSE_COLUMNS
= false>
217 inline void VectorTimesMatrixHelper(CppTypeFor
<RCAT
, RKIND
> *RESTRICT product
,
218 SubscriptValue n
, SubscriptValue cols
, const XT
*RESTRICT x
,
219 const YT
*RESTRICT y
, std::optional
<std::size_t> yColumnByteStride
) {
220 if (!yColumnByteStride
) {
221 VectorTimesMatrix
<RCAT
, RKIND
, XT
, YT
, false>(product
, n
, cols
, x
, y
);
223 VectorTimesMatrix
<RCAT
, RKIND
, XT
, YT
, true>(
224 product
, n
, cols
, x
, y
, *yColumnByteStride
);
228 // Implements an instance of MATMUL for given argument types.
229 template <bool IS_ALLOCATING
, TypeCategory RCAT
, int RKIND
, typename XT
,
231 static inline void DoMatmul(
232 std::conditional_t
<IS_ALLOCATING
, Descriptor
, const Descriptor
> &result
,
233 const Descriptor
&x
, const Descriptor
&y
, Terminator
&terminator
) {
236 int resRank
{xRank
+ yRank
- 2};
237 if (xRank
* yRank
!= 2 * resRank
) {
238 terminator
.Crash("MATMUL: bad argument ranks (%d * %d)", xRank
, yRank
);
240 SubscriptValue extent
[2]{
241 xRank
== 2 ? x
.GetDimension(0).Extent() : y
.GetDimension(1).Extent(),
242 resRank
== 2 ? y
.GetDimension(1).Extent() : 0};
243 if constexpr (IS_ALLOCATING
) {
245 RCAT
, RKIND
, nullptr, resRank
, extent
, CFI_attribute_allocatable
);
246 for (int j
{0}; j
< resRank
; ++j
) {
247 result
.GetDimension(j
).SetBounds(1, extent
[j
]);
249 if (int stat
{result
.Allocate()}) {
251 "MATMUL: could not allocate memory for result; STAT=%d", stat
);
254 RUNTIME_CHECK(terminator
, resRank
== result
.rank());
256 terminator
, result
.ElementBytes() == static_cast<std::size_t>(RKIND
));
257 RUNTIME_CHECK(terminator
, result
.GetDimension(0).Extent() == extent
[0]);
258 RUNTIME_CHECK(terminator
,
259 resRank
== 1 || result
.GetDimension(1).Extent() == extent
[1]);
261 SubscriptValue n
{x
.GetDimension(xRank
- 1).Extent()};
262 if (n
!= y
.GetDimension(0).Extent()) {
263 terminator
.Crash("MATMUL: unacceptable operand shapes (%jdx%jd, %jdx%jd)",
264 static_cast<std::intmax_t>(x
.GetDimension(0).Extent()),
265 static_cast<std::intmax_t>(n
),
266 static_cast<std::intmax_t>(y
.GetDimension(0).Extent()),
267 static_cast<std::intmax_t>(y
.GetDimension(1).Extent()));
270 CppTypeFor
<RCAT
== TypeCategory::Logical
? TypeCategory::Integer
: RCAT
,
272 if constexpr (RCAT
!= TypeCategory::Logical
) {
273 if (x
.IsContiguous(1) && y
.IsContiguous(1) &&
274 (IS_ALLOCATING
|| result
.IsContiguous())) {
275 // Contiguous numeric matrices (maybe with columns
276 // separated by a stride).
277 std::optional
<std::size_t> xColumnByteStride
;
278 if (!x
.IsContiguous()) {
279 // X's columns are strided.
280 SubscriptValue xAt
[2]{};
281 x
.GetLowerBounds(xAt
);
283 xColumnByteStride
= x
.SubscriptsToByteOffset(xAt
);
285 std::optional
<std::size_t> yColumnByteStride
;
286 if (!y
.IsContiguous()) {
287 // Y's columns are strided.
288 SubscriptValue yAt
[2]{};
289 y
.GetLowerBounds(yAt
);
291 yColumnByteStride
= y
.SubscriptsToByteOffset(yAt
);
293 // Note that BLAS GEMM can be used for the strided
294 // columns by setting proper leading dimension size.
295 // This implies that the column stride is divisible
296 // by the element size, which is usually true.
297 if (resRank
== 2) { // M*M -> M
298 if (std::is_same_v
<XT
, YT
>) {
299 if constexpr (std::is_same_v
<XT
, float>) {
300 // TODO: call BLAS-3 SGEMM
301 // TODO: try using CUTLASS for device.
302 } else if constexpr (std::is_same_v
<XT
, double>) {
303 // TODO: call BLAS-3 DGEMM
304 } else if constexpr (std::is_same_v
<XT
, std::complex<float>>) {
305 // TODO: call BLAS-3 CGEMM
306 } else if constexpr (std::is_same_v
<XT
, std::complex<double>>) {
307 // TODO: call BLAS-3 ZGEMM
310 MatrixTimesMatrixHelper
<RCAT
, RKIND
, XT
, YT
>(
311 result
.template OffsetElement
<WriteResult
>(), extent
[0], extent
[1],
312 x
.OffsetElement
<XT
>(), y
.OffsetElement
<YT
>(), n
, xColumnByteStride
,
315 } else if (xRank
== 2) { // M*V -> V
316 if (std::is_same_v
<XT
, YT
>) {
317 if constexpr (std::is_same_v
<XT
, float>) {
318 // TODO: call BLAS-2 SGEMV(x,y)
319 } else if constexpr (std::is_same_v
<XT
, double>) {
320 // TODO: call BLAS-2 DGEMV(x,y)
321 } else if constexpr (std::is_same_v
<XT
, std::complex<float>>) {
322 // TODO: call BLAS-2 CGEMV(x,y)
323 } else if constexpr (std::is_same_v
<XT
, std::complex<double>>) {
324 // TODO: call BLAS-2 ZGEMV(x,y)
327 MatrixTimesVectorHelper
<RCAT
, RKIND
, XT
, YT
>(
328 result
.template OffsetElement
<WriteResult
>(), extent
[0], n
,
329 x
.OffsetElement
<XT
>(), y
.OffsetElement
<YT
>(), xColumnByteStride
);
332 if (std::is_same_v
<XT
, YT
>) {
333 if constexpr (std::is_same_v
<XT
, float>) {
334 // TODO: call BLAS-2 SGEMV(y,x)
335 } else if constexpr (std::is_same_v
<XT
, double>) {
336 // TODO: call BLAS-2 DGEMV(y,x)
337 } else if constexpr (std::is_same_v
<XT
, std::complex<float>>) {
338 // TODO: call BLAS-2 CGEMV(y,x)
339 } else if constexpr (std::is_same_v
<XT
, std::complex<double>>) {
340 // TODO: call BLAS-2 ZGEMV(y,x)
343 VectorTimesMatrixHelper
<RCAT
, RKIND
, XT
, YT
>(
344 result
.template OffsetElement
<WriteResult
>(), n
, extent
[0],
345 x
.OffsetElement
<XT
>(), y
.OffsetElement
<YT
>(), yColumnByteStride
);
350 // General algorithms for LOGICAL and noncontiguity
351 SubscriptValue xAt
[2], yAt
[2], resAt
[2];
352 x
.GetLowerBounds(xAt
);
353 y
.GetLowerBounds(yAt
);
354 result
.GetLowerBounds(resAt
);
355 if (resRank
== 2) { // M*M -> M
356 SubscriptValue x1
{xAt
[1]}, y0
{yAt
[0]}, y1
{yAt
[1]}, res1
{resAt
[1]};
357 for (SubscriptValue i
{0}; i
< extent
[0]; ++i
) {
358 for (SubscriptValue j
{0}; j
< extent
[1]; ++j
) {
359 Accumulator
<RCAT
, RKIND
, XT
, YT
> accumulator
{x
, y
};
361 for (SubscriptValue k
{0}; k
< n
; ++k
) {
364 accumulator
.Accumulate(xAt
, yAt
);
367 *result
.template Element
<WriteResult
>(resAt
) = accumulator
.GetResult();
372 } else if (xRank
== 2) { // M*V -> V
373 SubscriptValue x1
{xAt
[1]}, y0
{yAt
[0]};
374 for (SubscriptValue j
{0}; j
< extent
[0]; ++j
) {
375 Accumulator
<RCAT
, RKIND
, XT
, YT
> accumulator
{x
, y
};
376 for (SubscriptValue k
{0}; k
< n
; ++k
) {
379 accumulator
.Accumulate(xAt
, yAt
);
381 *result
.template Element
<WriteResult
>(resAt
) = accumulator
.GetResult();
386 SubscriptValue x0
{xAt
[0]}, y0
{yAt
[0]};
387 for (SubscriptValue j
{0}; j
< extent
[0]; ++j
) {
388 Accumulator
<RCAT
, RKIND
, XT
, YT
> accumulator
{x
, y
};
389 for (SubscriptValue k
{0}; k
< n
; ++k
) {
392 accumulator
.Accumulate(xAt
, yAt
);
394 *result
.template Element
<WriteResult
>(resAt
) = accumulator
.GetResult();
401 // Maps the dynamic type information from the arguments' descriptors
402 // to the right instantiation of DoMatmul() for valid combinations of
404 template <bool IS_ALLOCATING
> struct Matmul
{
405 using ResultDescriptor
=
406 std::conditional_t
<IS_ALLOCATING
, Descriptor
, const Descriptor
>;
407 template <TypeCategory XCAT
, int XKIND
> struct MM1
{
408 template <TypeCategory YCAT
, int YKIND
> struct MM2
{
409 void operator()(ResultDescriptor
&result
, const Descriptor
&x
,
410 const Descriptor
&y
, Terminator
&terminator
) const {
411 if constexpr (constexpr auto resultType
{
412 GetResultType(XCAT
, XKIND
, YCAT
, YKIND
)}) {
413 if constexpr (common::IsNumericTypeCategory(resultType
->first
) ||
414 resultType
->first
== TypeCategory::Logical
) {
415 return DoMatmul
<IS_ALLOCATING
, resultType
->first
,
416 resultType
->second
, CppTypeFor
<XCAT
, XKIND
>,
417 CppTypeFor
<YCAT
, YKIND
>>(result
, x
, y
, terminator
);
420 terminator
.Crash("MATMUL: bad operand types (%d(%d), %d(%d))",
421 static_cast<int>(XCAT
), XKIND
, static_cast<int>(YCAT
), YKIND
);
424 void operator()(ResultDescriptor
&result
, const Descriptor
&x
,
425 const Descriptor
&y
, Terminator
&terminator
, TypeCategory yCat
,
427 ApplyType
<MM2
, void>(yCat
, yKind
, terminator
, result
, x
, y
, terminator
);
430 void operator()(ResultDescriptor
&result
, const Descriptor
&x
,
431 const Descriptor
&y
, const char *sourceFile
, int line
) const {
432 Terminator terminator
{sourceFile
, line
};
433 auto xCatKind
{x
.type().GetCategoryAndKind()};
434 auto yCatKind
{y
.type().GetCategoryAndKind()};
435 RUNTIME_CHECK(terminator
, xCatKind
.has_value() && yCatKind
.has_value());
436 ApplyType
<MM1
, void>(xCatKind
->first
, xCatKind
->second
, terminator
, result
,
437 x
, y
, terminator
, yCatKind
->first
, yCatKind
->second
);
442 void RTNAME(Matmul
)(Descriptor
&result
, const Descriptor
&x
,
443 const Descriptor
&y
, const char *sourceFile
, int line
) {
444 Matmul
<true>{}(result
, x
, y
, sourceFile
, line
);
446 void RTNAME(MatmulDirect
)(const Descriptor
&result
, const Descriptor
&x
,
447 const Descriptor
&y
, const char *sourceFile
, int line
) {
448 Matmul
<false>{}(result
, x
, y
, sourceFile
, line
);
451 } // namespace Fortran::runtime