/* ===--- amxtransposeintrin.h - AMX_TRANSPOSE intrinsics -*- C++ -*---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 * ===-----------------------------------------------------------------------===
 */
11 #error "Never use <amxtransposeintrin.h> directly; use <immintrin.h> instead."
12 #endif /* __IMMINTRIN_H */
14 #ifndef __AMX_TRANSPOSEINTRIN_H
15 #define __AMX_TRANSPOSEINTRIN_H
/* Attribute set attached to every function defined in this header: always
 * inline, no debug info, and the "amx-transpose" target feature enabled. */
#define __DEFAULT_FN_ATTRS_TRANSPOSE                                           \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-transpose")))
/* Macro forms of the four tile-pair load variants. Each one expands to the
 * builtin of the same name and forwards its three arguments unchanged.
 * NOTE(review): base/stride presumably mean the same as in the
 * __tile_2rpntlvw* functions in this file (base address and row stride), and
 * tdst presumably designates the destination tile -- confirm against the
 * builtin definitions. */
#define _tile_2rpntlvwz0(tdst, base, stride)                                   \
  __builtin_ia32_t2rpntlvwz0(tdst, base, stride)
#define _tile_2rpntlvwz0t1(tdst, base, stride)                                 \
  __builtin_ia32_t2rpntlvwz0t1(tdst, base, stride)
#define _tile_2rpntlvwz1(tdst, base, stride)                                   \
  __builtin_ia32_t2rpntlvwz1(tdst, base, stride)
#define _tile_2rpntlvwz1t1(tdst, base, stride)                                 \
  __builtin_ia32_t2rpntlvwz1t1(tdst, base, stride)
/// Transpose 32-bit elements from \a src and write the result to \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void _tile_transposed(__tile dst, __tile src);
/// \endcode
///
/// This intrinsic corresponds to the <c> TTRANSPOSED </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src
///    The source tile. Max size is 1024 Bytes.
///
/// \code{.operation}
///
/// FOR i := 0 TO (dst.rows-1)
///   tmp[511:0] := 0
///   FOR j := 0 TO (dst.colsb/4-1)
///     tmp.dword[j] := src.row[j].dword[i]
///   ENDFOR
///   dst.row[i] := tmp
/// ENDFOR
///
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
#define _tile_transposed(dst, src) __builtin_ia32_ttransposed(dst, src)
60 static __inline__
void __DEFAULT_FN_ATTRS_TRANSPOSE
_tile_2rpntlvwz0_internal(
61 unsigned short row
, unsigned short col0
, unsigned short col1
,
62 _tile1024i
*dst0
, _tile1024i
*dst1
, const void *base
,
63 __SIZE_TYPE__ stride
) {
64 // Use __tile1024i_1024a* to escape the alignment check in
65 // clang/test/Headers/x86-intrinsics-headers-clean.cpp
66 __builtin_ia32_t2rpntlvwz0_internal(row
, col0
, col1
, (_tile1024i_1024a
*)dst0
,
67 (_tile1024i_1024a
*)dst1
, base
,
68 (__SIZE_TYPE__
)(stride
));
71 static __inline__
void __DEFAULT_FN_ATTRS_TRANSPOSE
_tile_2rpntlvwz0t1_internal(
72 unsigned short row
, unsigned short col0
, unsigned short col1
,
73 _tile1024i
*dst0
, _tile1024i
*dst1
, const void *base
,
74 __SIZE_TYPE__ stride
) {
75 __builtin_ia32_t2rpntlvwz0t1_internal(
76 row
, col0
, col1
, (_tile1024i_1024a
*)dst0
, (_tile1024i_1024a
*)dst1
, base
,
77 (__SIZE_TYPE__
)(stride
));
80 static __inline__
void __DEFAULT_FN_ATTRS_TRANSPOSE
_tile_2rpntlvwz1_internal(
81 unsigned short row
, unsigned short col0
, unsigned short col1
,
82 _tile1024i
*dst0
, _tile1024i
*dst1
, const void *base
,
83 __SIZE_TYPE__ stride
) {
84 __builtin_ia32_t2rpntlvwz1_internal(row
, col0
, col1
, (_tile1024i_1024a
*)dst0
,
85 (_tile1024i_1024a
*)dst1
, base
,
86 (__SIZE_TYPE__
)(stride
));
89 static __inline__
void __DEFAULT_FN_ATTRS_TRANSPOSE
_tile_2rpntlvwz1t1_internal(
90 unsigned short row
, unsigned short col0
, unsigned short col1
,
91 _tile1024i
*dst0
, _tile1024i
*dst1
, const void *base
,
92 __SIZE_TYPE__ stride
) {
93 __builtin_ia32_t2rpntlvwz1t1_internal(
94 row
, col0
, col1
, (_tile1024i_1024a
*)dst0
, (_tile1024i_1024a
*)dst1
, base
,
95 (__SIZE_TYPE__
)(stride
));
98 // This is internal intrinsic. C/C++ user should avoid calling it directly.
99 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TRANSPOSE
100 _tile_transposed_internal(unsigned short m
, unsigned short n
, _tile1024i src
) {
101 return __builtin_ia32_ttransposed_internal(m
, n
, src
);
104 /// Converts a pair of tiles from memory into VNNI format, and places the
105 /// results in a pair of destinations specified by dst. The pair of tiles
106 /// in memory is specified via a tsib; the second tile is after the first
107 /// one, separated by the same stride that separates each row.
108 /// The tile configuration for the destination tiles indicates the amount
109 /// of data to read from memory. The instruction will load a number of rows
110 /// that is equal to twice the number of rows in tmm1. The size of each row
111 /// is equal to the average width of the destination tiles. If the second
112 /// tile is configured with zero rows and columns, only the first tile will
114 /// Provides a hint to the implementation that the data will likely not be
115 /// reused in the near future and the data caching can be optimized.
117 /// \headerfile <immintrin.h>
119 /// This intrinsic corresponds to the <c> T2RPNTLVWZ0 </c> instruction.
122 /// First tile of destination tile pair. Max size is 1024i*2 Bytes.
124 /// Second tile of destination tile pair. Max size is 1024i*2 Bytes.
126 /// A pointer to base address.
128 /// The stride between the rows' data to be loaded in memory.
129 __DEFAULT_FN_ATTRS_TRANSPOSE
130 static void __tile_2rpntlvwz0(__tile1024i
*dst0
, __tile1024i
*dst1
,
131 const void *base
, __SIZE_TYPE__ stride
) {
132 _tile_2rpntlvwz0_internal(dst0
->row
, dst0
->col
, dst1
->col
, &dst0
->tile
,
133 &dst1
->tile
, base
, stride
);
136 /// Converts a pair of tiles from memory into VNNI format, and places the
137 /// results in a pair of destinations specified by dst. The pair of tiles
138 /// in memory is specified via a tsib; the second tile is after the first
139 /// one, separated by the same stride that separates each row.
140 /// The tile configuration for the destination tiles indicates the amount
141 /// of data to read from memory. The instruction will load a number of rows
142 /// that is equal to twice the number of rows in tmm1. The size of each row
143 /// is equal to the average width of the destination tiles. If the second
144 /// tile is configured with zero rows and columns, only the first tile will
147 /// \headerfile <immintrin.h>
149 /// This intrinsic corresponds to the <c> T2RPNTLVWZ0T1 </c> instruction.
152 /// First tile of destination tile pair. Max size is 1024i*2 Bytes.
154 /// Second tile of destination tile pair. Max size is 1024i*2 Bytes.
156 /// A pointer to base address.
158 /// The stride between the rows' data to be loaded in memory.
159 __DEFAULT_FN_ATTRS_TRANSPOSE
160 static void __tile_2rpntlvwz0t1(__tile1024i
*dst0
, __tile1024i
*dst1
,
161 const void *base
, __SIZE_TYPE__ stride
) {
162 _tile_2rpntlvwz0t1_internal(dst0
->row
, dst0
->col
, dst1
->col
, &dst0
->tile
,
163 &dst1
->tile
, base
, stride
);
166 /// Converts a pair of tiles from memory into VNNI format, and places the
167 /// results in a pair of destinations specified by dst. The pair of tiles
168 /// in memory is specified via a tsib; the second tile is after the first
169 /// one, separated by the same stride that separates each row.
170 /// The tile configuration for the destination tiles indicates the amount
171 /// of data to read from memory. The instruction will load a number of rows
172 /// that is equal to twice the number of rows in tmm1. The size of each row
173 /// is equal to the average width of the destination tiles. If the second
174 /// tile is configured with zero rows and columns, only the first tile will
175 /// be written. The last row will be not be read from memory but instead
176 /// filled with zeros.
177 /// Provides a hint to the implementation that the data will likely not be
178 /// reused in the near future and the data caching can be optimized.
180 /// \headerfile <immintrin.h>
182 /// This intrinsic corresponds to the <c> T2RPNTLVWZ1 </c> instruction.
185 /// First tile of destination tile pair. Max size is 1024i*2 Bytes.
187 /// Second tile of destination tile pair. Max size is 1024i*2 Bytes.
189 /// A pointer to base address.
191 /// The stride between the rows' data to be loaded in memory.
192 __DEFAULT_FN_ATTRS_TRANSPOSE
193 static void __tile_2rpntlvwz1(__tile1024i
*dst0
, __tile1024i
*dst1
,
194 const void *base
, __SIZE_TYPE__ stride
) {
195 _tile_2rpntlvwz1_internal(dst0
->row
, dst0
->col
, dst1
->col
, &dst0
->tile
,
196 &dst1
->tile
, base
, stride
);
199 /// Converts a pair of tiles from memory into VNNI format, and places the
200 /// results in a pair of destinations specified by dst. The pair of tiles
201 /// in memory is specified via a tsib; the second tile is after the first
202 /// one, separated by the same stride that separates each row.
203 /// The tile configuration for the destination tiles indicates the amount
204 /// of data to read from memory. The instruction will load a number of rows
205 /// that is equal to twice the number of rows in tmm1. The size of each row
206 /// is equal to the average width of the destination tiles. If the second
207 /// tile is configured with zero rows and columns, only the first tile will
208 /// be written. The last row will be not be read from memory but instead
209 /// filled with zeros.
210 /// Provides a hint to the implementation that the data will likely not be
211 /// reused in the near future and the data caching can be optimized.
213 /// \headerfile <immintrin.h>
215 /// This intrinsic corresponds to the <c> T2RPNTLVWZ1T1 </c> instruction.
218 /// First tile of destination tile pair. Max size is 1024i*2 Bytes.
220 /// Second tile of destination tile pair. Max size is 1024i*2 Bytes.
222 /// A pointer to base address.
224 /// The stride between the rows' data to be loaded in memory.
225 __DEFAULT_FN_ATTRS_TRANSPOSE
226 static void __tile_2rpntlvwz1t1(__tile1024i
*dst0
, __tile1024i
*dst1
,
227 const void *base
, __SIZE_TYPE__ stride
) {
228 _tile_2rpntlvwz1t1_internal(dst0
->row
, dst0
->col
, dst1
->col
, &dst0
->tile
,
229 &dst1
->tile
, base
, stride
);
232 /// Transpose 32-bit elements from src and write the result to dst.
234 /// \headerfile <immintrin.h>
236 /// This intrinsic corresponds to the <c> TTRANSPOSED </c> instruction.
239 /// The destination tile. Max size is 1024 Bytes.
241 /// The source tile. Max size is 1024 Bytes.
242 __DEFAULT_FN_ATTRS_TRANSPOSE
243 static void __tile_transposed(__tile1024i
*dst
, __tile1024i src
) {
244 dst
->tile
= _tile_transposed_internal(dst
->row
, dst
->col
, src
.tile
);
247 #endif /* __x86_64__ */
248 #endif /* __AMX_TRANSPOSEINTRIN_H */