[AArch64,ELF] Restrict MOVZ/MOVK to non-PIC large code model (#70178)
[llvm-project.git] / flang / runtime / matmul.cpp
blobb46a94de01cedaf7c99a84eaede7660c449e6f64
1 //===-- runtime/matmul.cpp ------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
9 // Implements all forms of MATMUL (Fortran 2018 16.9.124)
11 // There are two main entry points; one establishes a descriptor for the
12 // result and allocates it, and the other expects a result descriptor that
13 // points to existing storage.
15 // This implementation must handle all combinations of numeric types and
16 // kinds (100 - 165 cases depending on the target), plus all combinations
17 // of logical kinds (16). A single template undergoes many instantiations
18 // to cover all of the valid possibilities.
20 // Places where BLAS routines could be called are marked as TODO items.
22 #include "flang/Runtime/matmul.h"
23 #include "terminator.h"
24 #include "tools.h"
25 #include "flang/Runtime/c-or-cpp.h"
26 #include "flang/Runtime/cpp-type.h"
27 #include "flang/Runtime/descriptor.h"
28 #include <cstring>
30 namespace Fortran::runtime {
32 // General accumulator for any type and stride; this is not used for
33 // contiguous numeric cases.
34 template <TypeCategory RCAT, int RKIND, typename XT, typename YT>
35 class Accumulator {
36 public:
37 using Result = AccumulationType<RCAT, RKIND>;
38 Accumulator(const Descriptor &x, const Descriptor &y) : x_{x}, y_{y} {}
39 void Accumulate(const SubscriptValue xAt[], const SubscriptValue yAt[]) {
40 if constexpr (RCAT == TypeCategory::Logical) {
41 sum_ = sum_ ||
42 (IsLogicalElementTrue(x_, xAt) && IsLogicalElementTrue(y_, yAt));
43 } else {
44 sum_ += static_cast<Result>(*x_.Element<XT>(xAt)) *
45 static_cast<Result>(*y_.Element<YT>(yAt));
48 Result GetResult() const { return sum_; }
50 private:
51 const Descriptor &x_, &y_;
52 Result sum_{};
55 // Contiguous numeric matrix*matrix multiplication
56 // matrix(rows,n) * matrix(n,cols) -> matrix(rows,cols)
57 // Straightforward algorithm:
58 // DO 1 I = 1, NROWS
59 // DO 1 J = 1, NCOLS
60 // RES(I,J) = 0
61 // DO 1 K = 1, N
62 // 1 RES(I,J) = RES(I,J) + X(I,K)*Y(K,J)
63 // With loop distribution and transposition to avoid the inner sum
64 // reduction and to avoid non-unit strides:
65 // DO 1 I = 1, NROWS
66 // DO 1 J = 1, NCOLS
67 // 1 RES(I,J) = 0
68 // DO 2 K = 1, N
69 // DO 2 J = 1, NCOLS
70 // DO 2 I = 1, NROWS
71 // 2 RES(I,J) = RES(I,J) + X(I,K)*Y(K,J) ! loop-invariant last term
72 template <TypeCategory RCAT, int RKIND, typename XT, typename YT,
73 bool X_HAS_STRIDED_COLUMNS, bool Y_HAS_STRIDED_COLUMNS>
74 inline void MatrixTimesMatrix(CppTypeFor<RCAT, RKIND> *RESTRICT product,
75 SubscriptValue rows, SubscriptValue cols, const XT *RESTRICT x,
76 const YT *RESTRICT y, SubscriptValue n, std::size_t xColumnByteStride = 0,
77 std::size_t yColumnByteStride = 0) {
78 using ResultType = CppTypeFor<RCAT, RKIND>;
79 std::memset(product, 0, rows * cols * sizeof *product);
80 const XT *RESTRICT xp0{x};
81 for (SubscriptValue k{0}; k < n; ++k) {
82 ResultType *RESTRICT p{product};
83 for (SubscriptValue j{0}; j < cols; ++j) {
84 const XT *RESTRICT xp{xp0};
85 ResultType yv;
86 if constexpr (!Y_HAS_STRIDED_COLUMNS) {
87 yv = static_cast<ResultType>(y[k + j * n]);
88 } else {
89 yv = static_cast<ResultType>(reinterpret_cast<const YT *>(
90 reinterpret_cast<const char *>(y) + j * yColumnByteStride)[k]);
92 for (SubscriptValue i{0}; i < rows; ++i) {
93 *p++ += static_cast<ResultType>(*xp++) * yv;
96 if constexpr (!X_HAS_STRIDED_COLUMNS) {
97 xp0 += rows;
98 } else {
99 xp0 = reinterpret_cast<const XT *>(
100 reinterpret_cast<const char *>(xp0) + xColumnByteStride);
105 template <TypeCategory RCAT, int RKIND, typename XT, typename YT>
106 inline void MatrixTimesMatrixHelper(CppTypeFor<RCAT, RKIND> *RESTRICT product,
107 SubscriptValue rows, SubscriptValue cols, const XT *RESTRICT x,
108 const YT *RESTRICT y, SubscriptValue n,
109 std::optional<std::size_t> xColumnByteStride,
110 std::optional<std::size_t> yColumnByteStride) {
111 if (!xColumnByteStride) {
112 if (!yColumnByteStride) {
113 MatrixTimesMatrix<RCAT, RKIND, XT, YT, false, false>(
114 product, rows, cols, x, y, n);
115 } else {
116 MatrixTimesMatrix<RCAT, RKIND, XT, YT, false, true>(
117 product, rows, cols, x, y, n, 0, *yColumnByteStride);
119 } else {
120 if (!yColumnByteStride) {
121 MatrixTimesMatrix<RCAT, RKIND, XT, YT, true, false>(
122 product, rows, cols, x, y, n, *xColumnByteStride);
123 } else {
124 MatrixTimesMatrix<RCAT, RKIND, XT, YT, true, true>(
125 product, rows, cols, x, y, n, *xColumnByteStride, *yColumnByteStride);
130 // Contiguous numeric matrix*vector multiplication
131 // matrix(rows,n) * column vector(n) -> column vector(rows)
132 // Straightforward algorithm:
133 // DO 1 J = 1, NROWS
134 // RES(J) = 0
135 // DO 1 K = 1, N
136 // 1 RES(J) = RES(J) + X(J,K)*Y(K)
137 // With loop distribution and transposition to avoid the inner
138 // sum reduction and to avoid non-unit strides:
139 // DO 1 J = 1, NROWS
140 // 1 RES(J) = 0
141 // DO 2 K = 1, N
142 // DO 2 J = 1, NROWS
143 // 2 RES(J) = RES(J) + X(J,K)*Y(K)
144 template <TypeCategory RCAT, int RKIND, typename XT, typename YT,
145 bool X_HAS_STRIDED_COLUMNS>
146 inline void MatrixTimesVector(CppTypeFor<RCAT, RKIND> *RESTRICT product,
147 SubscriptValue rows, SubscriptValue n, const XT *RESTRICT x,
148 const YT *RESTRICT y, std::size_t xColumnByteStride = 0) {
149 using ResultType = CppTypeFor<RCAT, RKIND>;
150 std::memset(product, 0, rows * sizeof *product);
151 [[maybe_unused]] const XT *RESTRICT xp0{x};
152 for (SubscriptValue k{0}; k < n; ++k) {
153 ResultType *RESTRICT p{product};
154 auto yv{static_cast<ResultType>(*y++)};
155 for (SubscriptValue j{0}; j < rows; ++j) {
156 *p++ += static_cast<ResultType>(*x++) * yv;
158 if constexpr (X_HAS_STRIDED_COLUMNS) {
159 xp0 = reinterpret_cast<const XT *>(
160 reinterpret_cast<const char *>(xp0) + xColumnByteStride);
161 x = xp0;
166 template <TypeCategory RCAT, int RKIND, typename XT, typename YT>
167 inline void MatrixTimesVectorHelper(CppTypeFor<RCAT, RKIND> *RESTRICT product,
168 SubscriptValue rows, SubscriptValue n, const XT *RESTRICT x,
169 const YT *RESTRICT y, std::optional<std::size_t> xColumnByteStride) {
170 if (!xColumnByteStride) {
171 MatrixTimesVector<RCAT, RKIND, XT, YT, false>(product, rows, n, x, y);
172 } else {
173 MatrixTimesVector<RCAT, RKIND, XT, YT, true>(
174 product, rows, n, x, y, *xColumnByteStride);
178 // Contiguous numeric vector*matrix multiplication
179 // row vector(n) * matrix(n,cols) -> row vector(cols)
180 // Straightforward algorithm:
181 // DO 1 J = 1, NCOLS
182 // RES(J) = 0
183 // DO 1 K = 1, N
184 // 1 RES(J) = RES(J) + X(K)*Y(K,J)
185 // With loop distribution and transposition to avoid the inner
186 // sum reduction and one non-unit stride (the other remains):
187 // DO 1 J = 1, NCOLS
188 // 1 RES(J) = 0
189 // DO 2 K = 1, N
190 // DO 2 J = 1, NCOLS
191 // 2 RES(J) = RES(J) + X(K)*Y(K,J)
192 template <TypeCategory RCAT, int RKIND, typename XT, typename YT,
193 bool Y_HAS_STRIDED_COLUMNS>
194 inline void VectorTimesMatrix(CppTypeFor<RCAT, RKIND> *RESTRICT product,
195 SubscriptValue n, SubscriptValue cols, const XT *RESTRICT x,
196 const YT *RESTRICT y, std::size_t yColumnByteStride = 0) {
197 using ResultType = CppTypeFor<RCAT, RKIND>;
198 std::memset(product, 0, cols * sizeof *product);
199 for (SubscriptValue k{0}; k < n; ++k) {
200 ResultType *RESTRICT p{product};
201 auto xv{static_cast<ResultType>(*x++)};
202 const YT *RESTRICT yp{&y[k]};
203 for (SubscriptValue j{0}; j < cols; ++j) {
204 *p++ += xv * static_cast<ResultType>(*yp);
205 if constexpr (!Y_HAS_STRIDED_COLUMNS) {
206 yp += n;
207 } else {
208 yp = reinterpret_cast<const YT *>(
209 reinterpret_cast<const char *>(yp) + yColumnByteStride);
215 template <TypeCategory RCAT, int RKIND, typename XT, typename YT,
216 bool SPARSE_COLUMNS = false>
217 inline void VectorTimesMatrixHelper(CppTypeFor<RCAT, RKIND> *RESTRICT product,
218 SubscriptValue n, SubscriptValue cols, const XT *RESTRICT x,
219 const YT *RESTRICT y, std::optional<std::size_t> yColumnByteStride) {
220 if (!yColumnByteStride) {
221 VectorTimesMatrix<RCAT, RKIND, XT, YT, false>(product, n, cols, x, y);
222 } else {
223 VectorTimesMatrix<RCAT, RKIND, XT, YT, true>(
224 product, n, cols, x, y, *yColumnByteStride);
228 // Implements an instance of MATMUL for given argument types.
229 template <bool IS_ALLOCATING, TypeCategory RCAT, int RKIND, typename XT,
230 typename YT>
231 static inline void DoMatmul(
232 std::conditional_t<IS_ALLOCATING, Descriptor, const Descriptor> &result,
233 const Descriptor &x, const Descriptor &y, Terminator &terminator) {
234 int xRank{x.rank()};
235 int yRank{y.rank()};
236 int resRank{xRank + yRank - 2};
237 if (xRank * yRank != 2 * resRank) {
238 terminator.Crash("MATMUL: bad argument ranks (%d * %d)", xRank, yRank);
240 SubscriptValue extent[2]{
241 xRank == 2 ? x.GetDimension(0).Extent() : y.GetDimension(1).Extent(),
242 resRank == 2 ? y.GetDimension(1).Extent() : 0};
243 if constexpr (IS_ALLOCATING) {
244 result.Establish(
245 RCAT, RKIND, nullptr, resRank, extent, CFI_attribute_allocatable);
246 for (int j{0}; j < resRank; ++j) {
247 result.GetDimension(j).SetBounds(1, extent[j]);
249 if (int stat{result.Allocate()}) {
250 terminator.Crash(
251 "MATMUL: could not allocate memory for result; STAT=%d", stat);
253 } else {
254 RUNTIME_CHECK(terminator, resRank == result.rank());
255 RUNTIME_CHECK(
256 terminator, result.ElementBytes() == static_cast<std::size_t>(RKIND));
257 RUNTIME_CHECK(terminator, result.GetDimension(0).Extent() == extent[0]);
258 RUNTIME_CHECK(terminator,
259 resRank == 1 || result.GetDimension(1).Extent() == extent[1]);
261 SubscriptValue n{x.GetDimension(xRank - 1).Extent()};
262 if (n != y.GetDimension(0).Extent()) {
263 terminator.Crash("MATMUL: unacceptable operand shapes (%jdx%jd, %jdx%jd)",
264 static_cast<std::intmax_t>(x.GetDimension(0).Extent()),
265 static_cast<std::intmax_t>(n),
266 static_cast<std::intmax_t>(y.GetDimension(0).Extent()),
267 static_cast<std::intmax_t>(y.GetDimension(1).Extent()));
269 using WriteResult =
270 CppTypeFor<RCAT == TypeCategory::Logical ? TypeCategory::Integer : RCAT,
271 RKIND>;
272 if constexpr (RCAT != TypeCategory::Logical) {
273 if (x.IsContiguous(1) && y.IsContiguous(1) &&
274 (IS_ALLOCATING || result.IsContiguous())) {
275 // Contiguous numeric matrices (maybe with columns
276 // separated by a stride).
277 std::optional<std::size_t> xColumnByteStride;
278 if (!x.IsContiguous()) {
279 // X's columns are strided.
280 SubscriptValue xAt[2]{};
281 x.GetLowerBounds(xAt);
282 xAt[1]++;
283 xColumnByteStride = x.SubscriptsToByteOffset(xAt);
285 std::optional<std::size_t> yColumnByteStride;
286 if (!y.IsContiguous()) {
287 // Y's columns are strided.
288 SubscriptValue yAt[2]{};
289 y.GetLowerBounds(yAt);
290 yAt[1]++;
291 yColumnByteStride = y.SubscriptsToByteOffset(yAt);
293 // Note that BLAS GEMM can be used for the strided
294 // columns by setting proper leading dimension size.
295 // This implies that the column stride is divisible
296 // by the element size, which is usually true.
297 if (resRank == 2) { // M*M -> M
298 if (std::is_same_v<XT, YT>) {
299 if constexpr (std::is_same_v<XT, float>) {
300 // TODO: call BLAS-3 SGEMM
301 // TODO: try using CUTLASS for device.
302 } else if constexpr (std::is_same_v<XT, double>) {
303 // TODO: call BLAS-3 DGEMM
304 } else if constexpr (std::is_same_v<XT, std::complex<float>>) {
305 // TODO: call BLAS-3 CGEMM
306 } else if constexpr (std::is_same_v<XT, std::complex<double>>) {
307 // TODO: call BLAS-3 ZGEMM
310 MatrixTimesMatrixHelper<RCAT, RKIND, XT, YT>(
311 result.template OffsetElement<WriteResult>(), extent[0], extent[1],
312 x.OffsetElement<XT>(), y.OffsetElement<YT>(), n, xColumnByteStride,
313 yColumnByteStride);
314 return;
315 } else if (xRank == 2) { // M*V -> V
316 if (std::is_same_v<XT, YT>) {
317 if constexpr (std::is_same_v<XT, float>) {
318 // TODO: call BLAS-2 SGEMV(x,y)
319 } else if constexpr (std::is_same_v<XT, double>) {
320 // TODO: call BLAS-2 DGEMV(x,y)
321 } else if constexpr (std::is_same_v<XT, std::complex<float>>) {
322 // TODO: call BLAS-2 CGEMV(x,y)
323 } else if constexpr (std::is_same_v<XT, std::complex<double>>) {
324 // TODO: call BLAS-2 ZGEMV(x,y)
327 MatrixTimesVectorHelper<RCAT, RKIND, XT, YT>(
328 result.template OffsetElement<WriteResult>(), extent[0], n,
329 x.OffsetElement<XT>(), y.OffsetElement<YT>(), xColumnByteStride);
330 return;
331 } else { // V*M -> V
332 if (std::is_same_v<XT, YT>) {
333 if constexpr (std::is_same_v<XT, float>) {
334 // TODO: call BLAS-2 SGEMV(y,x)
335 } else if constexpr (std::is_same_v<XT, double>) {
336 // TODO: call BLAS-2 DGEMV(y,x)
337 } else if constexpr (std::is_same_v<XT, std::complex<float>>) {
338 // TODO: call BLAS-2 CGEMV(y,x)
339 } else if constexpr (std::is_same_v<XT, std::complex<double>>) {
340 // TODO: call BLAS-2 ZGEMV(y,x)
343 VectorTimesMatrixHelper<RCAT, RKIND, XT, YT>(
344 result.template OffsetElement<WriteResult>(), n, extent[0],
345 x.OffsetElement<XT>(), y.OffsetElement<YT>(), yColumnByteStride);
346 return;
350 // General algorithms for LOGICAL and noncontiguity
351 SubscriptValue xAt[2], yAt[2], resAt[2];
352 x.GetLowerBounds(xAt);
353 y.GetLowerBounds(yAt);
354 result.GetLowerBounds(resAt);
355 if (resRank == 2) { // M*M -> M
356 SubscriptValue x1{xAt[1]}, y0{yAt[0]}, y1{yAt[1]}, res1{resAt[1]};
357 for (SubscriptValue i{0}; i < extent[0]; ++i) {
358 for (SubscriptValue j{0}; j < extent[1]; ++j) {
359 Accumulator<RCAT, RKIND, XT, YT> accumulator{x, y};
360 yAt[1] = y1 + j;
361 for (SubscriptValue k{0}; k < n; ++k) {
362 xAt[1] = x1 + k;
363 yAt[0] = y0 + k;
364 accumulator.Accumulate(xAt, yAt);
366 resAt[1] = res1 + j;
367 *result.template Element<WriteResult>(resAt) = accumulator.GetResult();
369 ++resAt[0];
370 ++xAt[0];
372 } else if (xRank == 2) { // M*V -> V
373 SubscriptValue x1{xAt[1]}, y0{yAt[0]};
374 for (SubscriptValue j{0}; j < extent[0]; ++j) {
375 Accumulator<RCAT, RKIND, XT, YT> accumulator{x, y};
376 for (SubscriptValue k{0}; k < n; ++k) {
377 xAt[1] = x1 + k;
378 yAt[0] = y0 + k;
379 accumulator.Accumulate(xAt, yAt);
381 *result.template Element<WriteResult>(resAt) = accumulator.GetResult();
382 ++resAt[0];
383 ++xAt[0];
385 } else { // V*M -> V
386 SubscriptValue x0{xAt[0]}, y0{yAt[0]};
387 for (SubscriptValue j{0}; j < extent[0]; ++j) {
388 Accumulator<RCAT, RKIND, XT, YT> accumulator{x, y};
389 for (SubscriptValue k{0}; k < n; ++k) {
390 xAt[0] = x0 + k;
391 yAt[0] = y0 + k;
392 accumulator.Accumulate(xAt, yAt);
394 *result.template Element<WriteResult>(resAt) = accumulator.GetResult();
395 ++resAt[0];
396 ++yAt[1];
401 // Maps the dynamic type information from the arguments' descriptors
402 // to the right instantiation of DoMatmul() for valid combinations of
403 // types.
404 template <bool IS_ALLOCATING> struct Matmul {
405 using ResultDescriptor =
406 std::conditional_t<IS_ALLOCATING, Descriptor, const Descriptor>;
407 template <TypeCategory XCAT, int XKIND> struct MM1 {
408 template <TypeCategory YCAT, int YKIND> struct MM2 {
409 void operator()(ResultDescriptor &result, const Descriptor &x,
410 const Descriptor &y, Terminator &terminator) const {
411 if constexpr (constexpr auto resultType{
412 GetResultType(XCAT, XKIND, YCAT, YKIND)}) {
413 if constexpr (common::IsNumericTypeCategory(resultType->first) ||
414 resultType->first == TypeCategory::Logical) {
415 return DoMatmul<IS_ALLOCATING, resultType->first,
416 resultType->second, CppTypeFor<XCAT, XKIND>,
417 CppTypeFor<YCAT, YKIND>>(result, x, y, terminator);
420 terminator.Crash("MATMUL: bad operand types (%d(%d), %d(%d))",
421 static_cast<int>(XCAT), XKIND, static_cast<int>(YCAT), YKIND);
424 void operator()(ResultDescriptor &result, const Descriptor &x,
425 const Descriptor &y, Terminator &terminator, TypeCategory yCat,
426 int yKind) const {
427 ApplyType<MM2, void>(yCat, yKind, terminator, result, x, y, terminator);
430 void operator()(ResultDescriptor &result, const Descriptor &x,
431 const Descriptor &y, const char *sourceFile, int line) const {
432 Terminator terminator{sourceFile, line};
433 auto xCatKind{x.type().GetCategoryAndKind()};
434 auto yCatKind{y.type().GetCategoryAndKind()};
435 RUNTIME_CHECK(terminator, xCatKind.has_value() && yCatKind.has_value());
436 ApplyType<MM1, void>(xCatKind->first, xCatKind->second, terminator, result,
437 x, y, terminator, yCatKind->first, yCatKind->second);
441 extern "C" {
442 void RTNAME(Matmul)(Descriptor &result, const Descriptor &x,
443 const Descriptor &y, const char *sourceFile, int line) {
444 Matmul<true>{}(result, x, y, sourceFile, line);
446 void RTNAME(MatmulDirect)(const Descriptor &result, const Descriptor &x,
447 const Descriptor &y, const char *sourceFile, int line) {
448 Matmul<false>{}(result, x, y, sourceFile, line);
450 } // extern "C"
451 } // namespace Fortran::runtime