clang/lib/Headers/amxintrin.h

   1 /*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------===
   2  *
   3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4  * See https://llvm.org/LICENSE.txt for license information.
   5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6  *
   7  *===------------------------------------------------------------------------===
   8  */
   9
  10 #ifndef __IMMINTRIN_H
  11 #error "Never use <amxintrin.h> directly; include <immintrin.h> instead."
  12 #endif /* __IMMINTRIN_H */
  13
  14 #ifndef __AMXINTRIN_H
  15 #define __AMXINTRIN_H
  16 #ifdef __x86_64__
  17
  /* Attribute sets applied to the functions in this file. Every set marks the
   * function always-inline and nodebug, and enables the single AMX target
   * feature named in its __target__ string. */

  /* For functions that need the "amx-tile" target feature. */
  #define __DEFAULT_FN_ATTRS_TILE \
    __attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))

  /* For functions that need the "amx-int8" target feature. */
  #define __DEFAULT_FN_ATTRS_INT8 \
    __attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))

  /* For functions that need the "amx-bf16" target feature. */
  #define __DEFAULT_FN_ATTRS_BF16 \
    __attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))

  /* For functions that need the "amx-fp16" target feature. */
  #define __DEFAULT_FN_ATTRS_FP16 \
    __attribute__((__always_inline__, __nodebug__, __target__("amx-fp16")))
  27
  28 /// Load tile configuration from a 64-byte memory location specified by
  29 /// "mem_addr". The tile configuration includes the tile type palette, the
  30 /// number of bytes per row, and the number of rows. If the specified
  31 /// palette_id is zero, that signifies the init state for both the tile
  32 /// config and the tile data, and the tiles are zeroed. Any invalid
  33 /// configurations will result in #GP fault.
  34 ///
  35 /// \headerfile <immintrin.h>
  36 ///
  37 /// This intrinsic corresponds to the <c> LDTILECFG </c> instruction.
  38 ///
  39 /// \param __config
  40 ///    A pointer to 512-bits configuration
  41 static __inline__ void __DEFAULT_FN_ATTRS_TILE
  42 _tile_loadconfig(const void *__config) {
  43   __builtin_ia32_tile_loadconfig(__config);
  44 }
  45
  46 /// Stores the current tile configuration to a 64-byte memory location
  47 /// specified by "mem_addr". The tile configuration includes the tile type
  48 /// palette, the number of bytes per row, and the number of rows. If tiles
  49 /// are not configured, all zeroes will be stored to memory.
  50 ///
  51 /// \headerfile <immintrin.h>
  52 ///
  53 /// This intrinsic corresponds to the <c> STTILECFG </c> instruction.
  54 ///
  55 /// \param __config
  56 ///    A pointer to 512-bits configuration
  57 static __inline__ void __DEFAULT_FN_ATTRS_TILE
  58 _tile_storeconfig(void *__config) {
  59   __builtin_ia32_tile_storeconfig(__config);
  60 }
  61
  62 /// Release the tile configuration to return to the init state, which
  63 /// releases all storage it currently holds.
  64 ///
  65 /// \headerfile <immintrin.h>
  66 ///
  67 /// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
  68 static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
  69   __builtin_ia32_tilerelease();
  70 }
  71
  /// Load tile rows from the memory specified by the "base" address and
  /// "stride" into destination tile "dst", using the tile configuration
  /// previously configured via "_tile_loadconfig".
  ///
  /// \headerfile <immintrin.h>
  ///
  /// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
  ///
  /// \param dst
  ///    A destination tile. Max size is 1024 Bytes.
  /// \param base
  ///    A pointer to base address. The macro converts it to "const void *".
  /// \param stride
  ///    The stride between the rows' data to be loaded in memory. The macro
  ///    converts it to __SIZE_TYPE__.
  #define _tile_loadd(dst, base, stride) \
    __builtin_ia32_tileloadd64(          \
        (dst), ((const void *)(base)), (__SIZE_TYPE__)(stride))
  89
  /// Load tile rows from the memory specified by the "base" address and
  /// "stride" into destination tile "dst", using the tile configuration
  /// previously configured via "_tile_loadconfig". This intrinsic hints to the
  /// implementation that the data will likely not be reused in the near
  /// future, so that data caching can be optimized accordingly.
  ///
  /// \headerfile <immintrin.h>
  ///
  /// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
  ///
  /// \param dst
  ///    A destination tile. Max size is 1024 Bytes.
  /// \param base
  ///    A pointer to base address. The macro converts it to "const void *".
  /// \param stride
  ///    The stride between the rows' data to be loaded in memory. The macro
  ///    converts it to __SIZE_TYPE__.
  #define _tile_stream_loadd(dst, base, stride) \
    __builtin_ia32_tileloaddt164(               \
        (dst), ((const void *)(base)), (__SIZE_TYPE__)(stride))
 109
 /// Store the tile specified by "src" to the memory specified by the "base"
 /// address and "stride", using the tile configuration previously configured
 /// via "_tile_loadconfig".
 ///
 /// \headerfile <immintrin.h>
 ///
 /// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
 ///
 /// \param src
 ///    The source tile to be stored. Max size is 1024 Bytes.
 /// \param base
 ///    A pointer to base address. The macro converts it to "void *".
 /// \param stride
 ///    The stride between the rows' data to be stored in memory. The macro
 ///    converts it to __SIZE_TYPE__.
 #define _tile_stored(src, base, stride) \
   __builtin_ia32_tilestored64((src), ((void *)(base)), (__SIZE_TYPE__)(stride))
 126
 /// Zero the tile specified by "tile".
 ///
 /// \headerfile <immintrin.h>
 ///
 /// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
 ///
 /// \param tile
 ///    The destination tile to be zeroed. Max size is 1024 Bytes.
 #define _tile_zero(tile) \
   __builtin_ia32_tilezero((tile))
 136
 /// Dot-product of bytes in tiles, accumulated into a source/destination
 /// tile. Groups of 4 adjacent signed 8-bit integers in "src0" are multiplied
 /// with the corresponding signed 8-bit integers in "src1", producing 4
 /// intermediate 32-bit results. These 4 results are summed with the
 /// corresponding 32-bit integer in "dst", and the 32-bit result is stored
 /// back to tile "dst".
 ///
 /// \headerfile <immintrin.h>
 ///
 /// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
 ///
 /// \param dst
 ///    The destination (accumulator) tile. Max size is 1024 Bytes.
 /// \param src0
 ///    The 1st source tile. Max size is 1024 Bytes.
 /// \param src1
 ///    The 2nd source tile. Max size is 1024 Bytes.
 #define _tile_dpbssd(dst, src0, src1) \
   __builtin_ia32_tdpbssd((dst), (src0), (src1))
 155
 /// Dot-product of bytes in tiles, accumulated into a source/destination
 /// tile. Groups of 4 adjacent signed 8-bit integers in "src0" are multiplied
 /// with the corresponding unsigned 8-bit integers in "src1", producing 4
 /// intermediate 32-bit results. These 4 results are summed with the
 /// corresponding 32-bit integer in "dst", and the 32-bit result is stored
 /// back to tile "dst".
 ///
 /// \headerfile <immintrin.h>
 ///
 /// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
 ///
 /// \param dst
 ///    The destination (accumulator) tile. Max size is 1024 Bytes.
 /// \param src0
 ///    The 1st source tile. Max size is 1024 Bytes.
 /// \param src1
 ///    The 2nd source tile. Max size is 1024 Bytes.
 #define _tile_dpbsud(dst, src0, src1) \
   __builtin_ia32_tdpbsud((dst), (src0), (src1))
 174
 /// Dot-product of bytes in tiles, accumulated into a source/destination
 /// tile. Groups of 4 adjacent unsigned 8-bit integers in "src0" are
 /// multiplied with the corresponding signed 8-bit integers in "src1",
 /// producing 4 intermediate 32-bit results. These 4 results are summed with
 /// the corresponding 32-bit integer in "dst", and the 32-bit result is
 /// stored back to tile "dst".
 ///
 /// \headerfile <immintrin.h>
 ///
 /// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
 ///
 /// \param dst
 ///    The destination (accumulator) tile. Max size is 1024 Bytes.
 /// \param src0
 ///    The 1st source tile. Max size is 1024 Bytes.
 /// \param src1
 ///    The 2nd source tile. Max size is 1024 Bytes.
 #define _tile_dpbusd(dst, src0, src1) \
   __builtin_ia32_tdpbusd((dst), (src0), (src1))
 193
 /// Dot-product of bytes in tiles, accumulated into a source/destination
 /// tile. Groups of 4 adjacent unsigned 8-bit integers in "src0" are
 /// multiplied with the corresponding unsigned 8-bit integers in "src1",
 /// producing 4 intermediate 32-bit results. These 4 results are summed with
 /// the corresponding 32-bit integer in "dst", and the 32-bit result is
 /// stored back to tile "dst".
 ///
 /// \headerfile <immintrin.h>
 ///
 /// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
 ///
 /// \param dst
 ///    The destination (accumulator) tile. Max size is 1024 Bytes.
 /// \param src0
 ///    The 1st source tile. Max size is 1024 Bytes.
 /// \param src1
 ///    The 2nd source tile. Max size is 1024 Bytes.
 #define _tile_dpbuud(dst, src0, src1) \
   __builtin_ia32_tdpbuud((dst), (src0), (src1))
 212
 /// Dot-product of BF16 (16-bit) floating-point pairs in tiles "src0" and
 /// "src1". The intermediate single-precision (32-bit) floating-point
 /// elements are accumulated with the elements in "dst", and the 32-bit
 /// result is stored back to tile "dst".
 ///
 /// \headerfile <immintrin.h>
 ///
 /// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
 ///
 /// \param dst
 ///    The destination (accumulator) tile. Max size is 1024 Bytes.
 /// \param src0
 ///    The 1st source tile. Max size is 1024 Bytes.
 /// \param src1
 ///    The 2nd source tile. Max size is 1024 Bytes.
 #define _tile_dpbf16ps(dst, src0, src1) \
   __builtin_ia32_tdpbf16ps((dst), (src0), (src1))
 230
 /// Vector type used to represent a 2D AMX tile. The size of an AMX tile
 /// register is configurable, with a maximum of 16x64=1024 bytes. LLVM IR has
 /// no 2D type, so a vector of that fixed maximum size (1024 bytes, 64-byte
 /// aligned) stands in for the tile.
 typedef int _tile1024i
     __attribute__((__vector_size__(1024), __aligned__(64)));
 235
 236 /// This is internal intrinsic. C/C++ user should avoid calling it directly.
 237 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
 238 _tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
 239                      __SIZE_TYPE__ stride) {
 240   return __builtin_ia32_tileloadd64_internal(m, n, base,
 241                                              (__SIZE_TYPE__)(stride));
 242 }
 243
 244 /// This is internal intrinsic. C/C++ user should avoid calling it directly.
 245 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
 246 _tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base,
 247                        __SIZE_TYPE__ stride) {
 248   return __builtin_ia32_tileloaddt164_internal(m, n, base,
 249                                                (__SIZE_TYPE__)(stride));
 250 }
 251
 252 /// This is internal intrinsic. C/C++ user should avoid calling it directly.
 253 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
 254 _tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
 255                       _tile1024i dst, _tile1024i src1, _tile1024i src2) {
 256   return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
 257 }
 258
 259 /// This is internal intrinsic. C/C++ user should avoid calling it directly.
 260 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
 261 _tile_dpbsud_internal(unsigned short m, unsigned short n, unsigned short k,
 262                       _tile1024i dst, _tile1024i src1, _tile1024i src2) {
 263   return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2);
 264 }
 265
 266 /// This is internal intrinsic. C/C++ user should avoid calling it directly.
 267 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
 268 _tile_dpbusd_internal(unsigned short m, unsigned short n, unsigned short k,
 269                       _tile1024i dst, _tile1024i src1, _tile1024i src2) {
 270   return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2);
 271 }
 272
 273 /// This is internal intrinsic. C/C++ user should avoid calling it directly.
 274 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
 275 _tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k,
 276                       _tile1024i dst, _tile1024i src1, _tile1024i src2) {
 277   return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2);
 278 }
 279
 280 /// This is internal intrinsic. C/C++ user should avoid calling it directly.
 281 static __inline__ void __DEFAULT_FN_ATTRS_INT8
 282 _tile_stored_internal(unsigned short m, unsigned short n, void *base,
 283                       __SIZE_TYPE__ stride, _tile1024i tile) {
 284   return __builtin_ia32_tilestored64_internal(m, n, base,
 285                                               (__SIZE_TYPE__)(stride), tile);
 286 }
 287
 288 /// This is internal intrinsic. C/C++ user should avoid calling it directly.
 289 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16
 290 _tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
 291                         _tile1024i dst, _tile1024i src1, _tile1024i src2) {
 292   return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
 293 }
 294
 295 /// This is internal intrinsic. C/C++ user should avoid calling it directly.
 296 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP16
 297 _tile_dpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
 298                         _tile1024i dst, _tile1024i src1, _tile1024i src2) {
 299   return __builtin_ia32_tdpfp16ps_internal(m, n, k, dst, src1, src2);
 300 }
 301
 302 /// This struct pack the shape and tile data together for user. We suggest
 303 /// initializing the struct as early as possible, because compiler depends
 304 /// on the shape information to do configure. The constant value is preferred
 305 /// for optimization by compiler.
 306 typedef struct __tile1024i_str {
 307   const unsigned short row;
 308   const unsigned short col;
 309   _tile1024i tile;
 310 } __tile1024i;
 311
 312 /// Load tile rows from memory specifieid by "base" address and "stride" into
 313 /// destination tile "dst".
 314 ///
 315 /// \headerfile <immintrin.h>
 316 ///
 317 /// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
 318 ///
 319 /// \param dst
 320 ///    A destination tile. Max size is 1024 Bytes.
 321 /// \param base
 322 ///    A pointer to base address.
 323 /// \param stride
 324 ///    The stride between the rows' data to be loaded in memory.
 325 __DEFAULT_FN_ATTRS_TILE
 326 static __inline__ void __tile_loadd(__tile1024i *dst, const void *base,
 327                                     __SIZE_TYPE__ stride) {
 328   dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
 329 }
 330
 331 /// Load tile rows from memory specifieid by "base" address and "stride" into
 332 /// destination tile "dst". This intrinsic provides a hint to the implementation
 333 /// that the data will likely not be reused in the near future and the data
 334 /// caching can be optimized accordingly.
 335 ///
 336 /// \headerfile <immintrin.h>
 337 ///
 338 /// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
 339 ///
 340 /// \param dst
 341 ///    A destination tile. Max size is 1024 Bytes.
 342 /// \param base
 343 ///    A pointer to base address.
 344 /// \param stride
 345 ///    The stride between the rows' data to be loaded in memory.
 346 __DEFAULT_FN_ATTRS_TILE
 347 static __inline__ void __tile_stream_loadd(__tile1024i *dst, const void *base,
 348                                            __SIZE_TYPE__ stride) {
 349   dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride);
 350 }
 351
 352 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
 353 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
 354 /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
 355 /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
 356 /// and store the 32-bit result back to tile "dst".
 357 ///
 358 /// \headerfile <immintrin.h>
 359 ///
 360 /// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
 361 ///
 362 /// \param dst
 363 ///    The destination tile. Max size is 1024 Bytes.
 364 /// \param src0
 365 ///    The 1st source tile. Max size is 1024 Bytes.
 366 /// \param src1
 367 ///    The 2nd source tile. Max size is 1024 Bytes.
 368 __DEFAULT_FN_ATTRS_INT8
 369 static __inline__ void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
 370                                      __tile1024i src1) {
 371   dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile,
 372                                     src0.tile, src1.tile);
 373 }
 374
 375 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
 376 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
 377 /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
 378 /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
 379 /// in "dst", and store the 32-bit result back to tile "dst".
 380 ///
 381 /// \headerfile <immintrin.h>
 382 ///
 383 /// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
 384 ///
 385 /// \param dst
 386 ///    The destination tile. Max size is 1024 Bytes.
 387 /// \param src0
 388 ///    The 1st source tile. Max size is 1024 Bytes.
 389 /// \param src1
 390 ///    The 2nd source tile. Max size is 1024 Bytes.
 391 __DEFAULT_FN_ATTRS_INT8
 392 static __inline__ void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
 393                                      __tile1024i src1) {
 394   dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile,
 395                                     src0.tile, src1.tile);
 396 }
 397
 398 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
 399 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
 400 /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
 401 /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
 402 /// and store the 32-bit result back to tile "dst".
 403 ///
 404 /// \headerfile <immintrin.h>
 405 ///
 406 /// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
 407 ///
 408 /// \param dst
 409 ///    The destination tile. Max size is 1024 Bytes.
 410 /// \param src0
 411 ///    The 1st source tile. Max size is 1024 Bytes.
 412 /// \param src1
 413 ///    The 2nd source tile. Max size is 1024 Bytes.
 414 __DEFAULT_FN_ATTRS_INT8
 415 static __inline__ void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
 416                                      __tile1024i src1) {
 417   dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile,
 418                                     src0.tile, src1.tile);
 419 }
 420
 421 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
 422 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
 423 /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
 424 /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
 425 /// "dst", and store the 32-bit result back to tile "dst".
 426 ///
 427 /// \headerfile <immintrin.h>
 428 ///
 429 /// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
 430 ///
 431 /// \param dst
 432 ///    The destination tile. Max size is 1024 Bytes.
 433 /// \param src0
 434 ///    The 1st source tile. Max size is 1024 Bytes.
 435 /// \param src1
 436 ///    The 2nd source tile. Max size is 1024 Bytes.
 437 __DEFAULT_FN_ATTRS_INT8
 438 static __inline__ void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
 439                                      __tile1024i src1) {
 440   dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile,
 441                                     src0.tile, src1.tile);
 442 }
 443
 444 /// Store the tile specified by "src" to memory specifieid by "base" address and
 445 /// "stride".
 446 ///
 447 /// \headerfile <immintrin.h>
 448 ///
 449 /// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
 450 ///
 451 /// \param base
 452 ///    A pointer to base address.
 453 /// \param stride
 454 ///    The stride between the rows' data to be stored in memory.
 455 __DEFAULT_FN_ATTRS_TILE
 456 static __inline__ void __tile_stored(void *base, __SIZE_TYPE__ stride,
 457                                      __tile1024i src) {
 458   _tile_stored_internal(src.row, src.col, base, stride, src.tile);
 459 }
 460
 461 /// Zero the tile specified by "dst".
 462 ///
 463 /// \headerfile <immintrin.h>
 464 ///
 465 /// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
 466 ///
 467 /// \param dst
 468 ///    The destination tile to be zero. Max size is 1024 Bytes.
 469 __DEFAULT_FN_ATTRS_TILE
 470 static __inline__ void __tile_zero(__tile1024i *dst) {
 471   dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
 472 }
 473
 474 /// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
 475 /// src1, accumulating the intermediate single-precision (32-bit) floating-point
 476 /// elements with elements in "dst", and store the 32-bit result back to tile
 477 /// "dst".
 478 ///
 479 /// \headerfile <immintrin.h>
 480 ///
 481 /// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
 482 ///
 483 /// \param dst
 484 ///    The destination tile. Max size is 1024 Bytes.
 485 /// \param src0
 486 ///    The 1st source tile. Max size is 1024 Bytes.
 487 /// \param src1
 488 ///    The 2nd source tile. Max size is 1024 Bytes.
 489 __DEFAULT_FN_ATTRS_BF16
 490 static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
 491                                        __tile1024i src1) {
 492   dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
 493                                       src0.tile, src1.tile);
 494 }
 495
 496 /// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles src0 and
 497 /// src1, accumulating the intermediate single-precision (32-bit) floating-point
 498 /// elements with elements in "dst", and store the 32-bit result back to tile
 499 /// "dst".
 500 ///
 501 /// \headerfile <immintrin.h>
 502 ///
 503 /// This intrinsic corresponds to the <c> TDPFP16PS </c> instruction.
 504 ///
 505 /// \param dst
 506 ///    The destination tile. Max size is 1024 Bytes.
 507 /// \param src0
 508 ///    The 1st source tile. Max size is 1024 Bytes.
 509 /// \param src1
 510 ///    The 2nd source tile. Max size is 1024 Bytes.
 511 __DEFAULT_FN_ATTRS_FP16
 512 static __inline__ void __tile_dpfp16ps(__tile1024i *dst, __tile1024i src0,
 513                                        __tile1024i src1) {
 514   dst->tile = _tile_dpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile,
 515                                       src0.tile, src1.tile);
 516 }
 517
 518 #undef __DEFAULT_FN_ATTRS_TILE
 519 #undef __DEFAULT_FN_ATTRS_INT8
 520 #undef __DEFAULT_FN_ATTRS_BF16
 521 #undef __DEFAULT_FN_ATTRS_FP16
 522
 523 #endif /* __x86_64__ */
 524 #endif /* __AMXINTRIN_H */