clang/lib/Headers/amxintrin.h

   1 /*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------===
   2  *
   3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4  * See https://llvm.org/LICENSE.txt for license information.
   5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6  *
   7  *===------------------------------------------------------------------------===
   8  */
   9
  10 #ifndef __IMMINTRIN_H
  11 #error "Never use <amxintrin.h> directly; include <immintrin.h> instead."
  12 #endif /* __IMMINTRIN_H */
  13
  14 #ifndef __AMXINTRIN_H
  15 #define __AMXINTRIN_H
  16 #ifdef __x86_64__
  17
  18 /* Define the default attributes for the functions in this file. */
  19 #define __DEFAULT_FN_ATTRS_TILE                                                \
  20   __attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
  21 #define __DEFAULT_FN_ATTRS_INT8                                                \
  22   __attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
  23 #define __DEFAULT_FN_ATTRS_BF16                                                \
  24   __attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))
  25
  26 /// Load tile configuration from a 64-byte memory location specified by
  27 /// "mem_addr". The tile configuration includes the tile type palette, the
  28 /// number of bytes per row, and the number of rows. If the specified
  29 /// palette_id is zero, that signifies the init state for both the tile
  30 /// config and the tile data, and the tiles are zeroed. Any invalid
  31 /// configurations will result in #GP fault.
  32 ///
  33 /// \headerfile <immintrin.h>
  34 ///
  35 /// This intrinsic corresponds to the <c> LDTILECFG </c> instruction.
  36 ///
  37 /// \param __config
  38 ///    A pointer to 512-bits configuration
  39 static __inline__ void __DEFAULT_FN_ATTRS_TILE
  40 _tile_loadconfig(const void *__config) {
  41   __builtin_ia32_tile_loadconfig(__config);
  42 }
  43
  44 /// Stores the current tile configuration to a 64-byte memory location
  45 /// specified by "mem_addr". The tile configuration includes the tile type
  46 /// palette, the number of bytes per row, and the number of rows. If tiles
  47 /// are not configured, all zeroes will be stored to memory.
  48 ///
  49 /// \headerfile <immintrin.h>
  50 ///
  51 /// This intrinsic corresponds to the <c> STTILECFG </c> instruction.
  52 ///
  53 /// \param __config
  54 ///    A pointer to 512-bits configuration
  55 static __inline__ void __DEFAULT_FN_ATTRS_TILE
  56 _tile_storeconfig(void *__config) {
  57   __builtin_ia32_tile_storeconfig(__config);
  58 }
  59
  60 /// Release the tile configuration to return to the init state, which
  61 /// releases all storage it currently holds.
  62 ///
  63 /// \headerfile <immintrin.h>
  64 ///
  65 /// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
  66 static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
  67   __builtin_ia32_tilerelease();
  68 }
  69
  70 /// Load tile rows from memory specifieid by "base" address and "stride" into
  71 /// destination tile "dst" using the tile configuration previously configured
  72 /// via "_tile_loadconfig".
  73 ///
  74 /// \headerfile <immintrin.h>
  75 ///
  76 /// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
  77 ///
  78 /// \param dst
  79 ///    A destination tile. Max size is 1024 Bytes.
  80 /// \param base
  81 ///    A pointer to base address.
  82 /// \param stride
  83 ///    The stride between the rows' data to be loaded in memory.
  84 #define _tile_loadd(dst, base, stride)                                         \
  85   __builtin_ia32_tileloadd64((dst), ((const void *)(base)),                    \
  86                              (__SIZE_TYPE__)(stride))
  87
  88 /// Load tile rows from memory specifieid by "base" address and "stride" into
  89 /// destination tile "dst" using the tile configuration previously configured
  90 /// via "_tile_loadconfig". This intrinsic provides a hint to the implementation
  91 /// that the data will likely not be reused in the near future and the data
  92 /// caching can be optimized accordingly.
  93 ///
  94 /// \headerfile <immintrin.h>
  95 ///
  96 /// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
  97 ///
  98 /// \param dst
  99 ///    A destination tile. Max size is 1024 Bytes.
 100 /// \param base
 101 ///    A pointer to base address.
 102 /// \param stride
 103 ///    The stride between the rows' data to be loaded in memory.
 104 #define _tile_stream_loadd(dst, base, stride)                                  \
 105   __builtin_ia32_tileloaddt164((dst), ((const void *)(base)),                  \
 106                                (__SIZE_TYPE__)(stride))
 107
 108 /// Store the tile specified by "src" to memory specifieid by "base" address and
 109 /// "stride" using the tile configuration previously configured via
 110 /// "_tile_loadconfig".
 111 ///
 112 /// \headerfile <immintrin.h>
 113 ///
 114 /// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
 115 ///
 116 /// \param dst
 117 ///    A destination tile. Max size is 1024 Bytes.
 118 /// \param base
 119 ///    A pointer to base address.
 120 /// \param stride
 121 ///    The stride between the rows' data to be stored in memory.
 122 #define _tile_stored(dst, base, stride)                                        \
 123   __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))
 124
 125 /// Zero the tile specified by "tdest".
 126 ///
 127 /// \headerfile <immintrin.h>
 128 ///
 129 /// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
 130 ///
 131 /// \param tile
 132 ///    The destination tile to be zero. Max size is 1024 Bytes.
 133 #define _tile_zero(tile) __builtin_ia32_tilezero((tile))
 134
 135 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
 136 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
 137 /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
 138 /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
 139 /// and store the 32-bit result back to tile "dst".
 140 ///
 141 /// \headerfile <immintrin.h>
 142 ///
 143 /// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
 144 ///
 145 /// \param dst
 146 ///    The destination tile. Max size is 1024 Bytes.
 147 /// \param src0
 148 ///    The 1st source tile. Max size is 1024 Bytes.
 149 /// \param src1
 150 ///    The 2nd source tile. Max size is 1024 Bytes.
 151 #define _tile_dpbssd(dst, src0, src1)                                          \
 152   __builtin_ia32_tdpbssd((dst), (src0), (src1))
 153
 154 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
 155 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
 156 /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
 157 /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
 158 /// in "dst", and store the 32-bit result back to tile "dst".
 159 ///
 160 /// \headerfile <immintrin.h>
 161 ///
 162 /// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
 163 ///
 164 /// \param dst
 165 ///    The destination tile. Max size is 1024 Bytes.
 166 /// \param src0
 167 ///    The 1st source tile. Max size is 1024 Bytes.
 168 /// \param src1
 169 ///    The 2nd source tile. Max size is 1024 Bytes.
 170 #define _tile_dpbsud(dst, src0, src1)                                          \
 171   __builtin_ia32_tdpbsud((dst), (src0), (src1))
 172
 173 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
 174 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
 175 /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
 176 /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
 177 /// and store the 32-bit result back to tile "dst".
 178 ///
 179 /// \headerfile <immintrin.h>
 180 ///
 181 /// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
 182 ///
 183 /// \param dst
 184 ///    The destination tile. Max size is 1024 Bytes.
 185 /// \param src0
 186 ///    The 1st source tile. Max size is 1024 Bytes.
 187 /// \param src1
 188 ///    The 2nd source tile. Max size is 1024 Bytes.
 189 #define _tile_dpbusd(dst, src0, src1)                                          \
 190   __builtin_ia32_tdpbusd((dst), (src0), (src1))
 191
 192 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
 193 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
 194 /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
 195 /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
 196 /// "dst", and store the 32-bit result back to tile "dst".
 197 ///
 198 /// \headerfile <immintrin.h>
 199 ///
 200 /// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
 201 ///
 202 /// \param dst
 203 ///    The destination tile. Max size is 1024 Bytes.
 204 /// \param src0
 205 ///    The 1st source tile. Max size is 1024 Bytes.
 206 /// \param src1
 207 ///    The 2nd source tile. Max size is 1024 Bytes.
 208 #define _tile_dpbuud(dst, src0, src1)                                          \
 209   __builtin_ia32_tdpbuud((dst), (src0), (src1))
 210
 211 /// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
 212 /// src1, accumulating the intermediate single-precision (32-bit) floating-point
 213 /// elements with elements in "dst", and store the 32-bit result back to tile
 214 /// "dst".
 215 ///
 216 /// \headerfile <immintrin.h>
 217 ///
 218 /// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
 219 ///
 220 /// \param dst
 221 ///    The destination tile. Max size is 1024 Bytes.
 222 /// \param src0
 223 ///    The 1st source tile. Max size is 1024 Bytes.
 224 /// \param src1
 225 ///    The 2nd source tile. Max size is 1024 Bytes.
 226 #define _tile_dpbf16ps(dst, src0, src1)                                        \
 227   __builtin_ia32_tdpbf16ps((dst), (src0), (src1))
 228
 229 /// AMX tile register size can be configured, the maximum size is 16x64=1024
 230 /// bytes. Since there is no 2D type in llvm IR, we use vector type to
 231 /// represent 2D tile and the fixed size is maximum amx tile register size.
 232 typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
 233
 234 /// This is internal intrinsic. C/C++ user should avoid calling it directly.
 235 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
 236 _tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
 237                      __SIZE_TYPE__ stride) {
 238   return __builtin_ia32_tileloadd64_internal(m, n, base,
 239                                              (__SIZE_TYPE__)(stride));
 240 }
 241
 242 /// This is internal intrinsic. C/C++ user should avoid calling it directly.
 243 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
 244 _tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base,
 245                        __SIZE_TYPE__ stride) {
 246   return __builtin_ia32_tileloaddt164_internal(m, n, base,
 247                                                (__SIZE_TYPE__)(stride));
 248 }
 249
 250 /// This is internal intrinsic. C/C++ user should avoid calling it directly.
 251 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
 252 _tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
 253                       _tile1024i dst, _tile1024i src1, _tile1024i src2) {
 254   return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
 255 }
 256
 257 /// This is internal intrinsic. C/C++ user should avoid calling it directly.
 258 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
 259 _tile_dpbsud_internal(unsigned short m, unsigned short n, unsigned short k,
 260                       _tile1024i dst, _tile1024i src1, _tile1024i src2) {
 261   return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2);
 262 }
 263
 264 /// This is internal intrinsic. C/C++ user should avoid calling it directly.
 265 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
 266 _tile_dpbusd_internal(unsigned short m, unsigned short n, unsigned short k,
 267                       _tile1024i dst, _tile1024i src1, _tile1024i src2) {
 268   return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2);
 269 }
 270
 271 /// This is internal intrinsic. C/C++ user should avoid calling it directly.
 272 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
 273 _tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k,
 274                       _tile1024i dst, _tile1024i src1, _tile1024i src2) {
 275   return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2);
 276 }
 277
 278 /// This is internal intrinsic. C/C++ user should avoid calling it directly.
 279 static __inline__ void __DEFAULT_FN_ATTRS_INT8
 280 _tile_stored_internal(unsigned short m, unsigned short n, void *base,
 281                       __SIZE_TYPE__ stride, _tile1024i tile) {
 282   return __builtin_ia32_tilestored64_internal(m, n, base,
 283                                               (__SIZE_TYPE__)(stride), tile);
 284 }
 285
 286 /// This is internal intrinsic. C/C++ user should avoid calling it directly.
 287 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16
 288 _tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
 289                         _tile1024i dst, _tile1024i src1, _tile1024i src2) {
 290   return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
 291 }
 292
 293 /// This struct pack the shape and tile data together for user. We suggest
 294 /// initializing the struct as early as possible, because compiler depends
 295 /// on the shape information to do configure. The constant value is preferred
 296 /// for optimization by compiler.
 297 typedef struct __tile1024i_str {
 298   const unsigned short row;
 299   const unsigned short col;
 300   _tile1024i tile;
 301 } __tile1024i;
 302
 303 /// Load tile rows from memory specifieid by "base" address and "stride" into
 304 /// destination tile "dst".
 305 ///
 306 /// \headerfile <immintrin.h>
 307 ///
 308 /// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
 309 ///
 310 /// \param dst
 311 ///    A destination tile. Max size is 1024 Bytes.
 312 /// \param base
 313 ///    A pointer to base address.
 314 /// \param stride
 315 ///    The stride between the rows' data to be loaded in memory.
 316 __DEFAULT_FN_ATTRS_TILE
 317 static __inline__ void __tile_loadd(__tile1024i *dst, const void *base,
 318                                     __SIZE_TYPE__ stride) {
 319   dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
 320 }
 321
 322 /// Load tile rows from memory specifieid by "base" address and "stride" into
 323 /// destination tile "dst". This intrinsic provides a hint to the implementation
 324 /// that the data will likely not be reused in the near future and the data
 325 /// caching can be optimized accordingly.
 326 ///
 327 /// \headerfile <immintrin.h>
 328 ///
 329 /// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
 330 ///
 331 /// \param dst
 332 ///    A destination tile. Max size is 1024 Bytes.
 333 /// \param base
 334 ///    A pointer to base address.
 335 /// \param stride
 336 ///    The stride between the rows' data to be loaded in memory.
 337 __DEFAULT_FN_ATTRS_TILE
 338 static __inline__ void __tile_stream_loadd(__tile1024i *dst, const void *base,
 339                                            __SIZE_TYPE__ stride) {
 340   dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride);
 341 }
 342
 343 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
 344 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
 345 /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
 346 /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
 347 /// and store the 32-bit result back to tile "dst".
 348 ///
 349 /// \headerfile <immintrin.h>
 350 ///
 351 /// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
 352 ///
 353 /// \param dst
 354 ///    The destination tile. Max size is 1024 Bytes.
 355 /// \param src0
 356 ///    The 1st source tile. Max size is 1024 Bytes.
 357 /// \param src1
 358 ///    The 2nd source tile. Max size is 1024 Bytes.
 359 __DEFAULT_FN_ATTRS_INT8
 360 static __inline__ void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
 361                                      __tile1024i src1) {
 362   dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile,
 363                                     src0.tile, src1.tile);
 364 }
 365
 366 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
 367 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
 368 /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
 369 /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
 370 /// in "dst", and store the 32-bit result back to tile "dst".
 371 ///
 372 /// \headerfile <immintrin.h>
 373 ///
 374 /// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
 375 ///
 376 /// \param dst
 377 ///    The destination tile. Max size is 1024 Bytes.
 378 /// \param src0
 379 ///    The 1st source tile. Max size is 1024 Bytes.
 380 /// \param src1
 381 ///    The 2nd source tile. Max size is 1024 Bytes.
 382 __DEFAULT_FN_ATTRS_INT8
 383 static __inline__ void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
 384                                      __tile1024i src1) {
 385   dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile,
 386                                     src0.tile, src1.tile);
 387 }
 388
 389 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
 390 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
 391 /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
 392 /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
 393 /// and store the 32-bit result back to tile "dst".
 394 ///
 395 /// \headerfile <immintrin.h>
 396 ///
 397 /// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
 398 ///
 399 /// \param dst
 400 ///    The destination tile. Max size is 1024 Bytes.
 401 /// \param src0
 402 ///    The 1st source tile. Max size is 1024 Bytes.
 403 /// \param src1
 404 ///    The 2nd source tile. Max size is 1024 Bytes.
 405 __DEFAULT_FN_ATTRS_INT8
 406 static __inline__ void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
 407                                      __tile1024i src1) {
 408   dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile,
 409                                     src0.tile, src1.tile);
 410 }
 411
 412 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
 413 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
 414 /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
 415 /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
 416 /// "dst", and store the 32-bit result back to tile "dst".
 417 ///
 418 /// \headerfile <immintrin.h>
 419 ///
 420 /// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
 421 ///
 422 /// \param dst
 423 ///    The destination tile. Max size is 1024 Bytes.
 424 /// \param src0
 425 ///    The 1st source tile. Max size is 1024 Bytes.
 426 /// \param src1
 427 ///    The 2nd source tile. Max size is 1024 Bytes.
 428 __DEFAULT_FN_ATTRS_INT8
 429 static __inline__ void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
 430                                      __tile1024i src1) {
 431   dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile,
 432                                     src0.tile, src1.tile);
 433 }
 434
 435 /// Store the tile specified by "src" to memory specifieid by "base" address and
 436 /// "stride".
 437 ///
 438 /// \headerfile <immintrin.h>
 439 ///
 440 /// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
 441 ///
 442 /// \param base
 443 ///    A pointer to base address.
 444 /// \param stride
 445 ///    The stride between the rows' data to be stored in memory.
 446 __DEFAULT_FN_ATTRS_TILE
 447 static __inline__ void __tile_stored(void *base, __SIZE_TYPE__ stride,
 448                                      __tile1024i src) {
 449   _tile_stored_internal(src.row, src.col, base, stride, src.tile);
 450 }
 451
 452 /// Zero the tile specified by "dst".
 453 ///
 454 /// \headerfile <immintrin.h>
 455 ///
 456 /// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
 457 ///
 458 /// \param dst
 459 ///    The destination tile to be zero. Max size is 1024 Bytes.
 460 __DEFAULT_FN_ATTRS_TILE
 461 static __inline__ void __tile_zero(__tile1024i *dst) {
 462   dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
 463 }
 464
 465 /// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
 466 /// src1, accumulating the intermediate single-precision (32-bit) floating-point
 467 /// elements with elements in "dst", and store the 32-bit result back to tile
 468 /// "dst".
 469 ///
 470 /// \headerfile <immintrin.h>
 471 ///
 472 /// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
 473 ///
 474 /// \param dst
 475 ///    The destination tile. Max size is 1024 Bytes.
 476 /// \param src0
 477 ///    The 1st source tile. Max size is 1024 Bytes.
 478 /// \param src1
 479 ///    The 2nd source tile. Max size is 1024 Bytes.
 480 __DEFAULT_FN_ATTRS_BF16
 481 static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
 482                                        __tile1024i src1) {
 483   dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
 484                                       src0.tile, src1.tile);
 485 }
 486
 487 #undef __DEFAULT_FN_ATTRS_TILE
 488 #undef __DEFAULT_FN_ATTRS_INT8
 489 #undef __DEFAULT_FN_ATTRS_BF16
 490
 491 #endif /* __x86_64__ */
 492 #endif /* __AMXINTRIN_H */