clang/lib/Headers/amxintrin.h

   1 /*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------===
   2  *
   3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4  * See https://llvm.org/LICENSE.txt for license information.
   5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6  *
   7  *===------------------------------------------------------------------------===
   8  */
   9
  10 #ifndef __IMMINTRIN_H
  11 #error "Never use <amxintrin.h> directly; include <immintrin.h> instead."
  12 #endif /* __IMMINTRIN_H */
  13
  14 #ifndef __AMXINTRIN_H
  15 #define __AMXINTRIN_H
  16 #ifdef __x86_64__
  17
  18 /* Define the default attributes for the functions in this file. */
  19 #define __DEFAULT_FN_ATTRS_TILE                                                \
  20   __attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
  21 #define __DEFAULT_FN_ATTRS_INT8                                                \
  22   __attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
  23 #define __DEFAULT_FN_ATTRS_BF16                                                \
  24   __attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))
  25
  26 /// Load tile configuration from a 64-byte memory location specified by
  27 /// "mem_addr". The tile configuration includes the tile type palette, the
  28 /// number of bytes per row, and the number of rows. If the specified
  29 /// palette_id is zero, that signifies the init state for both the tile
  30 /// config and the tile data, and the tiles are zeroed. Any invalid
  31 /// configurations will result in #GP fault.
  32 ///
  33 /// \headerfile <immintrin.h>
  34 ///
  35 /// This intrinsic corresponds to the <c> LDTILECFG </c> instruction.
  36 ///
  37 /// \param __config
  38 ///    A pointer to 512-bits configuration
  39 static __inline__ void __DEFAULT_FN_ATTRS_TILE
  40 _tile_loadconfig(const void *__config) {
  41   __builtin_ia32_tile_loadconfig(__config);
  42 }
  43
  44 /// Stores the current tile configuration to a 64-byte memory location
  45 /// specified by "mem_addr". The tile configuration includes the tile type
  46 /// palette, the number of bytes per row, and the number of rows. If tiles
  47 /// are not configured, all zeroes will be stored to memory.
  48 ///
  49 /// \headerfile <immintrin.h>
  50 ///
  51 /// This intrinsic corresponds to the <c> STTILECFG </c> instruction.
  52 ///
  53 /// \param __config
  54 ///    A pointer to 512-bits configuration
  55 static __inline__ void __DEFAULT_FN_ATTRS_TILE
  56 _tile_storeconfig(void *__config) {
  57   __builtin_ia32_tile_storeconfig(__config);
  58 }
  59
  60 /// Release the tile configuration to return to the init state, which
  61 /// releases all storage it currently holds.
  62 ///
  63 /// \headerfile <immintrin.h>
  64 ///
  65 /// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
  66 static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
  67   __builtin_ia32_tilerelease();
  68 }
  69
  70 /// Load tile rows from memory specifieid by "base" address and "stride" into
  71 /// destination tile "dst" using the tile configuration previously configured
  72 /// via "_tile_loadconfig".
  73 ///
  74 /// \headerfile <immintrin.h>
  75 ///
  76 /// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
  77 ///
  78 /// \param dst
  79 ///    A destination tile. Max size is 1024 Bytes.
  80 /// \param base
  81 ///    A pointer to base address.
  82 /// \param stride
  83 ///    The stride between the rows' data to be loaded in memory.
  84 #define _tile_loadd(dst, base, stride)                                         \
  85   __builtin_ia32_tileloadd64((dst), ((const void *)(base)),                    \
  86                              (__SIZE_TYPE__)(stride))
  87
  88 /// Load tile rows from memory specifieid by "base" address and "stride" into
  89 /// destination tile "dst" using the tile configuration previously configured
  90 /// via "_tile_loadconfig". This intrinsic provides a hint to the implementation
  91 /// that the data will likely not be reused in the near future and the data
  92 /// caching can be optimized accordingly.
  93 ///
  94 /// \headerfile <immintrin.h>
  95 ///
  96 /// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
  97 ///
  98 /// \param dst
  99 ///    A destination tile. Max size is 1024 Bytes.
 100 /// \param base
 101 ///    A pointer to base address.
 102 /// \param stride
 103 ///    The stride between the rows' data to be loaded in memory.
 104 #define _tile_stream_loadd(dst, base, stride)                                  \
 105   __builtin_ia32_tileloaddt164((dst), ((const void *)(base)),                  \
 106                                (__SIZE_TYPE__)(stride))
 107
 108 /// Store the tile specified by "src" to memory specifieid by "base" address and
 109 /// "stride" using the tile configuration previously configured via
 110 /// "_tile_loadconfig".
 111 ///
 112 /// \headerfile <immintrin.h>
 113 ///
 114 /// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
 115 ///
 116 /// \param dst
 117 ///    A destination tile. Max size is 1024 Bytes.
 118 /// \param base
 119 ///    A pointer to base address.
 120 /// \param stride
 121 ///    The stride between the rows' data to be stored in memory.
 122 #define _tile_stored(dst, base, stride)                                        \
 123   __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))
 124
 125 /// Zero the tile specified by "tdest".
 126 ///
 127 /// \headerfile <immintrin.h>
 128 ///
 129 /// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
 130 ///
 131 /// \param tile
 132 ///    The destination tile to be zero. Max size is 1024 Bytes.
 133 #define _tile_zero(tile) __builtin_ia32_tilezero((tile))
 134
 135 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
 136 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
 137 /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
 138 /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
 139 /// and store the 32-bit result back to tile "dst".
 140 ///
 141 /// \headerfile <immintrin.h>
 142 ///
 143 /// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
 144 ///
 145 /// \param dst
 146 ///    The destination tile. Max size is 1024 Bytes.
 147 /// \param src0
 148 ///    The 1st source tile. Max size is 1024 Bytes.
 149 /// \param src1
 150 ///    The 2nd source tile. Max size is 1024 Bytes.
 151 #define _tile_dpbssd(dst, src0, src1)                                          \
 152   __builtin_ia32_tdpbssd((dst), (src0), (src1))
 153
 154 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
 155 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
 156 /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
 157 /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
 158 /// in "dst", and store the 32-bit result back to tile "dst".
 159 ///
 160 /// \headerfile <immintrin.h>
 161 ///
 162 /// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
 163 ///
 164 /// \param dst
 165 ///    The destination tile. Max size is 1024 Bytes.
 166 /// \param src0
 167 ///    The 1st source tile. Max size is 1024 Bytes.
 168 /// \param src1
 169 ///    The 2nd source tile. Max size is 1024 Bytes.
 170 #define _tile_dpbsud(dst, src0, src1)                                          \
 171   __builtin_ia32_tdpbsud((dst), (src0), (src1))
 172
 173 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
 174 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
 175 /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
 176 /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
 177 /// and store the 32-bit result back to tile "dst".
 178 ///
 179 /// \headerfile <immintrin.h>
 180 ///
 181 /// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
 182 ///
 183 /// \param dst
 184 ///    The destination tile. Max size is 1024 Bytes.
 185 /// \param src0
 186 ///    The 1st source tile. Max size is 1024 Bytes.
 187 /// \param src1
 188 ///    The 2nd source tile. Max size is 1024 Bytes.
 189 #define _tile_dpbusd(dst, src0, src1)                                          \
 190   __builtin_ia32_tdpbusd((dst), (src0), (src1))
 191
 192 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
 193 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
 194 /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
 195 /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
 196 /// "dst", and store the 32-bit result back to tile "dst".
 197 ///
 198 /// \headerfile <immintrin.h>
 199 ///
 200 /// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
 201 ///
 202 /// \param dst
 203 ///    The destination tile. Max size is 1024 Bytes.
 204 /// \param src0
 205 ///    The 1st source tile. Max size is 1024 Bytes.
 206 /// \param src1
 207 ///    The 2nd source tile. Max size is 1024 Bytes.
 208 #define _tile_dpbuud(dst, src0, src1)                                          \
 209   __builtin_ia32_tdpbuud((dst), (src0), (src1))
 210
 211 /// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
 212 /// src1, accumulating the intermediate single-precision (32-bit) floating-point
 213 /// elements with elements in "dst", and store the 32-bit result back to tile
 214 /// "dst".
 215 ///
 216 /// \headerfile <immintrin.h>
 217 ///
 218 /// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
 219 ///
 220 /// \param dst
 221 ///    The destination tile. Max size is 1024 Bytes.
 222 /// \param src0
 223 ///    The 1st source tile. Max size is 1024 Bytes.
 224 /// \param src1
 225 ///    The 2nd source tile. Max size is 1024 Bytes.
 226 #define _tile_dpbf16ps(dst, src0, src1)                                        \
 227   __builtin_ia32_tdpbf16ps((dst), (src0), (src1))
 228
 229 /// AMX tile register size can be configured, the maximum size is 16x64=1024
 230 /// bytes. Since there is no 2D type in llvm IR, we use vector type to
 231 /// represent 2D tile and the fixed size is maximum amx tile register size.
 232 typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
 233 typedef int _tile1024i_1024a
 234     __attribute__((__vector_size__(1024), __aligned__(1024)));
 235
 236 /// This is internal intrinsic. C/C++ user should avoid calling it directly.
 237 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
 238 _tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
 239                      __SIZE_TYPE__ stride) {
 240   return __builtin_ia32_tileloadd64_internal(m, n, base,
 241                                              (__SIZE_TYPE__)(stride));
 242 }
 243
 244 /// This is internal intrinsic. C/C++ user should avoid calling it directly.
 245 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
 246 _tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base,
 247                        __SIZE_TYPE__ stride) {
 248   return __builtin_ia32_tileloaddt164_internal(m, n, base,
 249                                                (__SIZE_TYPE__)(stride));
 250 }
 251
 252 /// This is internal intrinsic. C/C++ user should avoid calling it directly.
 253 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
 254 _tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
 255                       _tile1024i dst, _tile1024i src1, _tile1024i src2) {
 256   return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
 257 }
 258
 259 /// This is internal intrinsic. C/C++ user should avoid calling it directly.
 260 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
 261 _tile_dpbsud_internal(unsigned short m, unsigned short n, unsigned short k,
 262                       _tile1024i dst, _tile1024i src1, _tile1024i src2) {
 263   return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2);
 264 }
 265
 266 /// This is internal intrinsic. C/C++ user should avoid calling it directly.
 267 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
 268 _tile_dpbusd_internal(unsigned short m, unsigned short n, unsigned short k,
 269                       _tile1024i dst, _tile1024i src1, _tile1024i src2) {
 270   return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2);
 271 }
 272
 273 /// This is internal intrinsic. C/C++ user should avoid calling it directly.
 274 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
 275 _tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k,
 276                       _tile1024i dst, _tile1024i src1, _tile1024i src2) {
 277   return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2);
 278 }
 279
 280 /// This is internal intrinsic. C/C++ user should avoid calling it directly.
 281 static __inline__ void __DEFAULT_FN_ATTRS_INT8
 282 _tile_stored_internal(unsigned short m, unsigned short n, void *base,
 283                       __SIZE_TYPE__ stride, _tile1024i tile) {
 284   return __builtin_ia32_tilestored64_internal(m, n, base,
 285                                               (__SIZE_TYPE__)(stride), tile);
 286 }
 287
 288 /// This is internal intrinsic. C/C++ user should avoid calling it directly.
 289 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16
 290 _tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
 291                         _tile1024i dst, _tile1024i src1, _tile1024i src2) {
 292   return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
 293 }
 294
 295 /// This struct pack the shape and tile data together for user. We suggest
 296 /// initializing the struct as early as possible, because compiler depends
 297 /// on the shape information to do configure. The constant value is preferred
 298 /// for optimization by compiler.
 299 typedef struct __tile1024i_str {
 300   const unsigned short row;
 301   const unsigned short col;
 302   _tile1024i tile;
 303 } __tile1024i;
 304
 305 /// Load tile rows from memory specifieid by "base" address and "stride" into
 306 /// destination tile "dst".
 307 ///
 308 /// \headerfile <immintrin.h>
 309 ///
 310 /// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
 311 ///
 312 /// \param dst
 313 ///    A destination tile. Max size is 1024 Bytes.
 314 /// \param base
 315 ///    A pointer to base address.
 316 /// \param stride
 317 ///    The stride between the rows' data to be loaded in memory.
 318 __DEFAULT_FN_ATTRS_TILE
 319 static __inline__ void __tile_loadd(__tile1024i *dst, const void *base,
 320                                     __SIZE_TYPE__ stride) {
 321   dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
 322 }
 323
 324 /// Load tile rows from memory specifieid by "base" address and "stride" into
 325 /// destination tile "dst". This intrinsic provides a hint to the implementation
 326 /// that the data will likely not be reused in the near future and the data
 327 /// caching can be optimized accordingly.
 328 ///
 329 /// \headerfile <immintrin.h>
 330 ///
 331 /// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
 332 ///
 333 /// \param dst
 334 ///    A destination tile. Max size is 1024 Bytes.
 335 /// \param base
 336 ///    A pointer to base address.
 337 /// \param stride
 338 ///    The stride between the rows' data to be loaded in memory.
 339 __DEFAULT_FN_ATTRS_TILE
 340 static __inline__ void __tile_stream_loadd(__tile1024i *dst, const void *base,
 341                                            __SIZE_TYPE__ stride) {
 342   dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride);
 343 }
 344
 345 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
 346 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
 347 /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
 348 /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
 349 /// and store the 32-bit result back to tile "dst".
 350 ///
 351 /// \headerfile <immintrin.h>
 352 ///
 353 /// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
 354 ///
 355 /// \param dst
 356 ///    The destination tile. Max size is 1024 Bytes.
 357 /// \param src0
 358 ///    The 1st source tile. Max size is 1024 Bytes.
 359 /// \param src1
 360 ///    The 2nd source tile. Max size is 1024 Bytes.
 361 __DEFAULT_FN_ATTRS_INT8
 362 static __inline__ void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
 363                                      __tile1024i src1) {
 364   dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile,
 365                                     src0.tile, src1.tile);
 366 }
 367
 368 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
 369 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
 370 /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
 371 /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
 372 /// in "dst", and store the 32-bit result back to tile "dst".
 373 ///
 374 /// \headerfile <immintrin.h>
 375 ///
 376 /// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
 377 ///
 378 /// \param dst
 379 ///    The destination tile. Max size is 1024 Bytes.
 380 /// \param src0
 381 ///    The 1st source tile. Max size is 1024 Bytes.
 382 /// \param src1
 383 ///    The 2nd source tile. Max size is 1024 Bytes.
 384 __DEFAULT_FN_ATTRS_INT8
 385 static __inline__ void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
 386                                      __tile1024i src1) {
 387   dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile,
 388                                     src0.tile, src1.tile);
 389 }
 390
 391 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
 392 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
 393 /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
 394 /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
 395 /// and store the 32-bit result back to tile "dst".
 396 ///
 397 /// \headerfile <immintrin.h>
 398 ///
 399 /// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
 400 ///
 401 /// \param dst
 402 ///    The destination tile. Max size is 1024 Bytes.
 403 /// \param src0
 404 ///    The 1st source tile. Max size is 1024 Bytes.
 405 /// \param src1
 406 ///    The 2nd source tile. Max size is 1024 Bytes.
 407 __DEFAULT_FN_ATTRS_INT8
 408 static __inline__ void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
 409                                      __tile1024i src1) {
 410   dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile,
 411                                     src0.tile, src1.tile);
 412 }
 413
 414 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
 415 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
 416 /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
 417 /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
 418 /// "dst", and store the 32-bit result back to tile "dst".
 419 ///
 420 /// \headerfile <immintrin.h>
 421 ///
 422 /// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
 423 ///
 424 /// \param dst
 425 ///    The destination tile. Max size is 1024 Bytes.
 426 /// \param src0
 427 ///    The 1st source tile. Max size is 1024 Bytes.
 428 /// \param src1
 429 ///    The 2nd source tile. Max size is 1024 Bytes.
 430 __DEFAULT_FN_ATTRS_INT8
 431 static __inline__ void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
 432                                      __tile1024i src1) {
 433   dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile,
 434                                     src0.tile, src1.tile);
 435 }
 436
 437 /// Store the tile specified by "src" to memory specifieid by "base" address and
 438 /// "stride".
 439 ///
 440 /// \headerfile <immintrin.h>
 441 ///
 442 /// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
 443 ///
 444 /// \param base
 445 ///    A pointer to base address.
 446 /// \param stride
 447 ///    The stride between the rows' data to be stored in memory.
 448 __DEFAULT_FN_ATTRS_TILE
 449 static __inline__ void __tile_stored(void *base, __SIZE_TYPE__ stride,
 450                                      __tile1024i src) {
 451   _tile_stored_internal(src.row, src.col, base, stride, src.tile);
 452 }
 453
 454 /// Zero the tile specified by "dst".
 455 ///
 456 /// \headerfile <immintrin.h>
 457 ///
 458 /// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
 459 ///
 460 /// \param dst
 461 ///    The destination tile to be zero. Max size is 1024 Bytes.
 462 __DEFAULT_FN_ATTRS_TILE
 463 static __inline__ void __tile_zero(__tile1024i *dst) {
 464   dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
 465 }
 466
 467 /// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
 468 /// src1, accumulating the intermediate single-precision (32-bit) floating-point
 469 /// elements with elements in "dst", and store the 32-bit result back to tile
 470 /// "dst".
 471 ///
 472 /// \headerfile <immintrin.h>
 473 ///
 474 /// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
 475 ///
 476 /// \param dst
 477 ///    The destination tile. Max size is 1024 Bytes.
 478 /// \param src0
 479 ///    The 1st source tile. Max size is 1024 Bytes.
 480 /// \param src1
 481 ///    The 2nd source tile. Max size is 1024 Bytes.
 482 __DEFAULT_FN_ATTRS_BF16
 483 static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
 484                                        __tile1024i src1) {
 485   dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
 486                                       src0.tile, src1.tile);
 487 }
 488
 489 #undef __DEFAULT_FN_ATTRS_TILE
 490 #undef __DEFAULT_FN_ATTRS_INT8
 491 #undef __DEFAULT_FN_ATTRS_BF16
 492
 493 #endif /* __x86_64__ */
 494 #endif /* __AMXINTRIN_H */