clang/lib/Headers/avx512vlbf16intrin.h

   1 /*===--------- avx512vlbf16intrin.h - AVX512_BF16 intrinsics ---------------===
   2  *
   3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4  * See https://llvm.org/LICENSE.txt for license information.
   5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6  *
   7  *===-----------------------------------------------------------------------===
   8  */
   9 #ifndef __IMMINTRIN_H
  10 #error "Never use <avx512vlbf16intrin.h> directly; include <immintrin.h> instead."
  11 #endif
  12
  13 #ifdef __SSE2__
  14
  15 #ifndef __AVX512VLBF16INTRIN_H
  16 #define __AVX512VLBF16INTRIN_H
  17
/* Attribute sets shared by the intrinsics defined in this header. Both force
 * inlining, suppress debug info, and enable the avx512vl and avx512bf16
 * target features for the function; they differ only in the declared minimum
 * vector width (128 vs. 256 bits). Both macros are #undef'd at the end of the
 * header. */
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512vl, avx512bf16"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512vl, avx512bf16"), __min_vector_width__(256)))
  24
  25 /// Convert Two Packed Single Data to One Packed BF16 Data.
  26 ///
  27 /// \headerfile <x86intrin.h>
  28 ///
  29 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
  30 ///
  31 /// \param __A
  32 ///    A 128-bit vector of [4 x float].
  33 /// \param __B
  34 ///    A 128-bit vector of [4 x float].
  35 /// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
  36 ///    conversion of __B, and higher 64 bits come from conversion of __A.
  37 static __inline__ __m128bh __DEFAULT_FN_ATTRS128
  38 _mm_cvtne2ps_pbh(__m128 __A, __m128 __B) {
  39   return (__m128bh)__builtin_ia32_cvtne2ps2bf16_128((__v4sf) __A,
  40                                                     (__v4sf) __B);
  41 }
  42
  43 /// Convert Two Packed Single Data to One Packed BF16 Data.
  44 ///
  45 /// \headerfile <x86intrin.h>
  46 ///
  47 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
  48 ///
  49 /// \param __A
  50 ///    A 128-bit vector of [4 x float].
  51 /// \param __B
  52 ///    A 128-bit vector of [4 x float].
  53 /// \param __W
  54 ///    A 128-bit vector of [8 x bfloat].
  55 /// \param __U
  56 ///    A 8-bit mask value specifying what is chosen for each element.
  57 ///    A 1 means conversion of __A or __B. A 0 means element from __W.
  58 /// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
  59 ///    conversion of __B, and higher 64 bits come from conversion of __A.
  60 static __inline__ __m128bh __DEFAULT_FN_ATTRS128
  61 _mm_mask_cvtne2ps_pbh(__m128bh __W, __mmask8 __U, __m128 __A, __m128 __B) {
  62   return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U,
  63                                              (__v8bf)_mm_cvtne2ps_pbh(__A, __B),
  64                                              (__v8bf)__W);
  65 }
  66
  67 /// Convert Two Packed Single Data to One Packed BF16 Data.
  68 ///
  69 /// \headerfile <x86intrin.h>
  70 ///
  71 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
  72 ///
  73 /// \param __A
  74 ///    A 128-bit vector of [4 x float].
  75 /// \param __B
  76 ///    A 128-bit vector of [4 x float].
  77 /// \param __U
  78 ///    A 8-bit mask value specifying what is chosen for each element.
  79 ///    A 1 means conversion of __A or __B. A 0 means element is zero.
  80 /// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
  81 ///    conversion of __B, and higher 64 bits come from conversion of __A.
  82 static __inline__ __m128bh __DEFAULT_FN_ATTRS128
  83 _mm_maskz_cvtne2ps_pbh(__mmask8 __U, __m128 __A, __m128 __B) {
  84   return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U,
  85                                              (__v8bf)_mm_cvtne2ps_pbh(__A, __B),
  86                                              (__v8bf)_mm_setzero_si128());
  87 }
  88
  89 /// Convert Two Packed Single Data to One Packed BF16 Data.
  90 ///
  91 /// \headerfile <x86intrin.h>
  92 ///
  93 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
  94 ///
  95 /// \param __A
  96 ///    A 256-bit vector of [8 x float].
  97 /// \param __B
  98 ///    A 256-bit vector of [8 x float].
  99 /// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
 100 ///    conversion of __B, and higher 128 bits come from conversion of __A.
 101 static __inline__ __m256bh __DEFAULT_FN_ATTRS256
 102 _mm256_cvtne2ps_pbh(__m256 __A, __m256 __B) {
 103   return (__m256bh)__builtin_ia32_cvtne2ps2bf16_256((__v8sf) __A,
 104                                                     (__v8sf) __B);
 105 }
 106
 107 /// Convert Two Packed Single Data to One Packed BF16 Data.
 108 ///
 109 /// \headerfile <x86intrin.h>
 110 ///
 111 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
 112 ///
 113 /// \param __A
 114 ///    A 256-bit vector of [8 x float].
 115 /// \param __B
 116 ///    A 256-bit vector of [8 x float].
 117 /// \param __W
 118 ///    A 256-bit vector of [16 x bfloat].
 119 /// \param __U
 120 ///    A 16-bit mask value specifying what is chosen for each element.
 121 ///    A 1 means conversion of __A or __B. A 0 means element from __W.
 122 /// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
 123 ///    conversion of __B, and higher 128 bits come from conversion of __A.
 124 static __inline__ __m256bh __DEFAULT_FN_ATTRS256
 125 _mm256_mask_cvtne2ps_pbh(__m256bh __W, __mmask16 __U, __m256 __A, __m256 __B) {
 126   return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U,
 127                                          (__v16bf)_mm256_cvtne2ps_pbh(__A, __B),
 128                                          (__v16bf)__W);
 129 }
 130
 131 /// Convert Two Packed Single Data to One Packed BF16 Data.
 132 ///
 133 /// \headerfile <x86intrin.h>
 134 ///
 135 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
 136 ///
 137 /// \param __A
 138 ///    A 256-bit vector of [8 x float].
 139 /// \param __B
 140 ///    A 256-bit vector of [8 x float].
 141 /// \param __U
 142 ///    A 16-bit mask value specifying what is chosen for each element.
 143 ///    A 1 means conversion of __A or __B. A 0 means element is zero.
 144 /// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
 145 ///    conversion of __B, and higher 128 bits come from conversion of __A.
 146 static __inline__ __m256bh __DEFAULT_FN_ATTRS256
 147 _mm256_maskz_cvtne2ps_pbh(__mmask16 __U, __m256 __A, __m256 __B) {
 148   return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U,
 149                                          (__v16bf)_mm256_cvtne2ps_pbh(__A, __B),
 150                                          (__v16bf)_mm256_setzero_si256());
 151 }
 152
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// Implemented as a macro: the argument is cast to a vector of [4 x float]
/// and the builtin's result is cast to __m128bh.
///
/// \param A
///    A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
///    conversion of A, and higher 64 bits are 0.
#define _mm_cvtneps_pbh(A)                                                     \
  ((__m128bh)__builtin_ia32_vcvtneps2bf16128((__v4sf)(A)))
 165
 166 /// Convert Packed Single Data to Packed BF16 Data.
 167 ///
 168 /// \headerfile <x86intrin.h>
 169 ///
 170 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
 171 ///
 172 /// \param __A
 173 ///    A 128-bit vector of [4 x float].
 174 /// \param __W
 175 ///    A 128-bit vector of [8 x bfloat].
 176 /// \param __U
 177 ///    A 4-bit mask value specifying what is chosen for each element.
 178 ///    A 1 means conversion of __A. A 0 means element from __W.
 179 /// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
 180 ///    conversion of __A, and higher 64 bits are 0.
 181 static __inline__ __m128bh __DEFAULT_FN_ATTRS128
 182 _mm_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m128 __A) {
 183   return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
 184                                                         (__v8bf)__W,
 185                                                         (__mmask8)__U);
 186 }
 187
 188 /// Convert Packed Single Data to Packed BF16 Data.
 189 ///
 190 /// \headerfile <x86intrin.h>
 191 ///
 192 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
 193 ///
 194 /// \param __A
 195 ///    A 128-bit vector of [4 x float].
 196 /// \param __U
 197 ///    A 4-bit mask value specifying what is chosen for each element.
 198 ///    A 1 means conversion of __A. A 0 means element is zero.
 199 /// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
 200 ///    conversion of __A, and higher 64 bits are 0.
 201 static __inline__ __m128bh __DEFAULT_FN_ATTRS128
 202 _mm_maskz_cvtneps_pbh(__mmask8 __U, __m128 __A) {
 203   return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
 204                                                     (__v8bf)_mm_setzero_si128(),
 205                                                     (__mmask8)__U);
 206 }
 207
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// Implemented as a macro: the argument is cast to a vector of [8 x float]
/// and the builtin's result is cast to __m128bh.
///
/// \param A
///    A 256-bit vector of [8 x float].
/// \returns A 128-bit vector of [8 x bfloat] that comes from conversion of A.
#define _mm256_cvtneps_pbh(A)                                                  \
  ((__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)(A)))
 219
 220 /// Convert Packed Single Data to Packed BF16 Data.
 221 ///
 222 /// \headerfile <x86intrin.h>
 223 ///
 224 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
 225 ///
 226 /// \param __A
 227 ///    A 256-bit vector of [8 x float].
 228 /// \param __W
 229 ///    A 256-bit vector of [8 x bfloat].
 230 /// \param __U
 231 ///    A 8-bit mask value specifying what is chosen for each element.
 232 ///    A 1 means conversion of __A. A 0 means element from __W.
 233 /// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A.
 234 static __inline__ __m128bh __DEFAULT_FN_ATTRS256
 235 _mm256_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m256 __A) {
 236   return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
 237                                                         (__v8bf)__W,
 238                                                         (__mmask8)__U);
 239 }
 240
 241 /// Convert Packed Single Data to Packed BF16 Data.
 242 ///
 243 /// \headerfile <x86intrin.h>
 244 ///
 245 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
 246 ///
 247 /// \param __A
 248 ///    A 256-bit vector of [8 x float].
 249 /// \param __U
 250 ///    A 8-bit mask value specifying what is chosen for each element.
 251 ///    A 1 means conversion of __A. A 0 means element is zero.
 252 /// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A.
 253 static __inline__ __m128bh __DEFAULT_FN_ATTRS256
 254 _mm256_maskz_cvtneps_pbh(__mmask8 __U, __m256 __A) {
 255   return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
 256                                                     (__v8bf)_mm_setzero_si128(),
 257                                                     (__mmask8)__U);
 258 }
 259
 260 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
 261 ///
 262 /// \headerfile <x86intrin.h>
 263 ///
 264 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
 265 ///
 266 /// \param __A
 267 ///    A 128-bit vector of [8 x bfloat].
 268 /// \param __B
 269 ///    A 128-bit vector of [8 x bfloat].
 270 /// \param __D
 271 ///    A 128-bit vector of [4 x float].
 272 /// \returns A 128-bit vector of [4 x float] comes from  Dot Product of
 273 ///  __A, __B and __D
 274 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 275 _mm_dpbf16_ps(__m128 __D, __m128bh __A, __m128bh __B) {
 276   return (__m128)__builtin_ia32_dpbf16ps_128((__v4sf)__D,
 277                                              (__v8bf)__A,
 278                                              (__v8bf)__B);
 279 }
 280
 281 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
 282 ///
 283 /// \headerfile <x86intrin.h>
 284 ///
 285 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
 286 ///
 287 /// \param __A
 288 ///    A 128-bit vector of [8 x bfloat].
 289 /// \param __B
 290 ///    A 128-bit vector of [8 x bfloat].
 291 /// \param __D
 292 ///    A 128-bit vector of [4 x float].
 293 /// \param __U
 294 ///    A 8-bit mask value specifying what is chosen for each element.
 295 ///    A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
 296 /// \returns A 128-bit vector of [4 x float] comes from  Dot Product of
 297 ///  __A, __B and __D
 298 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 299 _mm_mask_dpbf16_ps(__m128 __D, __mmask8 __U, __m128bh __A, __m128bh __B) {
 300   return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
 301                                            (__v4sf)_mm_dpbf16_ps(__D, __A, __B),
 302                                            (__v4sf)__D);
 303 }
 304
 305 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
 306 ///
 307 /// \headerfile <x86intrin.h>
 308 ///
 309 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
 310 ///
 311 /// \param __A
 312 ///    A 128-bit vector of [8 x bfloat].
 313 /// \param __B
 314 ///    A 128-bit vector of [8 x bfloat].
 315 /// \param __D
 316 ///    A 128-bit vector of [4 x float].
 317 /// \param __U
 318 ///    A 8-bit mask value specifying what is chosen for each element.
 319 ///    A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
 320 /// \returns A 128-bit vector of [4 x float] comes from  Dot Product of
 321 ///  __A, __B and __D
 322 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 323 _mm_maskz_dpbf16_ps(__mmask8 __U, __m128 __D, __m128bh __A, __m128bh __B) {
 324   return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
 325                                            (__v4sf)_mm_dpbf16_ps(__D, __A, __B),
 326                                            (__v4sf)_mm_setzero_si128());
 327 }
 328
 329 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
 330 ///
 331 /// \headerfile <x86intrin.h>
 332 ///
 333 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
 334 ///
 335 /// \param __A
 336 ///    A 256-bit vector of [16 x bfloat].
 337 /// \param __B
 338 ///    A 256-bit vector of [16 x bfloat].
 339 /// \param __D
 340 ///    A 256-bit vector of [8 x float].
 341 /// \returns A 256-bit vector of [8 x float] comes from  Dot Product of
 342 ///  __A, __B and __D
 343 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 344 _mm256_dpbf16_ps(__m256 __D, __m256bh __A, __m256bh __B) {
 345   return (__m256)__builtin_ia32_dpbf16ps_256((__v8sf)__D,
 346                                              (__v16bf)__A,
 347                                              (__v16bf)__B);
 348 }
 349
 350 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
 351 ///
 352 /// \headerfile <x86intrin.h>
 353 ///
 354 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
 355 ///
 356 /// \param __A
 357 ///    A 256-bit vector of [16 x bfloat].
 358 /// \param __B
 359 ///    A 256-bit vector of [16 x bfloat].
 360 /// \param __D
 361 ///    A 256-bit vector of [8 x float].
 362 /// \param __U
 363 ///    A 16-bit mask value specifying what is chosen for each element.
 364 ///    A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
 365 /// \returns A 256-bit vector of [8 x float] comes from  Dot Product of
 366 ///  __A, __B and __D
 367 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 368 _mm256_mask_dpbf16_ps(__m256 __D, __mmask8 __U, __m256bh __A, __m256bh __B) {
 369   return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
 370                                         (__v8sf)_mm256_dpbf16_ps(__D, __A, __B),
 371                                         (__v8sf)__D);
 372 }
 373
 374 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
 375 ///
 376 /// \headerfile <x86intrin.h>
 377 ///
 378 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
 379 ///
 380 /// \param __A
 381 ///    A 256-bit vector of [16 x bfloat].
 382 /// \param __B
 383 ///    A 256-bit vector of [16 x bfloat].
 384 /// \param __D
 385 ///    A 256-bit vector of [8 x float].
 386 /// \param __U
 387 ///    A 8-bit mask value specifying what is chosen for each element.
 388 ///    A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
 389 /// \returns A 256-bit vector of [8 x float] comes from  Dot Product of
 390 ///  __A, __B and __D
 391 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 392 _mm256_maskz_dpbf16_ps(__mmask8 __U, __m256 __D, __m256bh __A, __m256bh __B) {
 393   return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
 394                                         (__v8sf)_mm256_dpbf16_ps(__D, __A, __B),
 395                                         (__v8sf)_mm256_setzero_si256());
 396 }
 397
 398 /// Convert One Single float Data to One BF16 Data.
 399 ///
 400 /// \headerfile <x86intrin.h>
 401 ///
 402 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
 403 ///
 404 /// \param __A
 405 ///    A float data.
 406 /// \returns A bf16 data whose sign field and exponent field keep unchanged,
 407 ///    and fraction field is truncated to 7 bits.
 408 static __inline__ __bf16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {
 409   __v4sf __V = {__A, 0, 0, 0};
 410   __v8bf __R = __builtin_ia32_cvtneps2bf16_128_mask(
 411       (__v4sf)__V, (__v8bf)_mm_undefined_si128(), (__mmask8)-1);
 412   return (__bf16)__R[0];
 413 }
 414
 415 /// Convert Packed BF16 Data to Packed float Data.
 416 ///
 417 /// \headerfile <x86intrin.h>
 418 ///
 419 /// \param __A
 420 ///    A 128-bit vector of [4 x bfloat].
 421 /// \returns A 128-bit vector of [4 x float] come from conversion of __A
 422 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) {
 423   return _mm_castsi128_ps(
 424       (__m128i)_mm_slli_epi32((__m128i)_mm_cvtepi16_epi32((__m128i)__A), 16));
 425 }
 426
 427 /// Convert Packed BF16 Data to Packed float Data.
 428 ///
 429 /// \headerfile <x86intrin.h>
 430 ///
 431 /// \param __A
 432 ///    A 128-bit vector of [8 x bfloat].
 433 /// \returns A 256-bit vector of [8 x float] come from conversion of __A
 434 static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) {
 435   return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
 436       (__m256i)_mm256_cvtepi16_epi32((__m128i)__A), 16));
 437 }
 438
 439 /// Convert Packed BF16 Data to Packed float Data using zeroing mask.
 440 ///
 441 /// \headerfile <x86intrin.h>
 442 ///
 443 /// \param __U
 444 ///    A 4-bit mask. Elements are zeroed out when the corresponding mask
 445 ///    bit is not set.
 446 /// \param __A
 447 ///    A 128-bit vector of [4 x bfloat].
 448 /// \returns A 128-bit vector of [4 x float] come from conversion of __A
 449 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 450 _mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
 451   return _mm_castsi128_ps((__m128i)_mm_slli_epi32(
 452       (__m128i)_mm_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
 453 }
 454
 455 /// Convert Packed BF16 Data to Packed float Data using zeroing mask.
 456 ///
 457 /// \headerfile <x86intrin.h>
 458 ///
 459 /// \param __U
 460 ///    A 8-bit mask. Elements are zeroed out when the corresponding mask
 461 ///    bit is not set.
 462 /// \param __A
 463 ///    A 128-bit vector of [8 x bfloat].
 464 /// \returns A 256-bit vector of [8 x float] come from conversion of __A
 465 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 466 _mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
 467   return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
 468       (__m256i)_mm256_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
 469 }
 470
 471 /// Convert Packed BF16 Data to Packed float Data using merging mask.
 472 ///
 473 /// \headerfile <x86intrin.h>
 474 ///
 475 /// \param __S
 476 ///    A 128-bit vector of [4 x float]. Elements are copied from __S when
 477 ///     the corresponding mask bit is not set.
 478 /// \param __U
 479 ///    A 4-bit mask. Elements are zeroed out when the corresponding mask
 480 ///    bit is not set.
 481 /// \param __A
 482 ///    A 128-bit vector of [4 x bfloat].
 483 /// \returns A 128-bit vector of [4 x float] come from conversion of __A
 484 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 485 _mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) {
 486   return _mm_castsi128_ps((__m128i)_mm_mask_slli_epi32(
 487       (__m128i)__S, (__mmask8)__U, (__m128i)_mm_cvtepi16_epi32((__m128i)__A),
 488       16));
 489 }
 490
 491 /// Convert Packed BF16 Data to Packed float Data using merging mask.
 492 ///
 493 /// \headerfile <x86intrin.h>
 494 ///
 495 /// \param __S
 496 ///    A 256-bit vector of [8 x float]. Elements are copied from __S when
 497 ///     the corresponding mask bit is not set.
 498 /// \param __U
 499 ///    A 8-bit mask. Elements are zeroed out when the corresponding mask
 500 ///    bit is not set.
 501 /// \param __A
 502 ///    A 128-bit vector of [8 x bfloat].
 503 /// \returns A 256-bit vector of [8 x float] come from conversion of __A
 504 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 505 _mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) {
 506   return _mm256_castsi256_ps((__m256i)_mm256_mask_slli_epi32(
 507       (__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32((__m128i)__A),
 508       16));
 509 }
 510
 511 #undef __DEFAULT_FN_ATTRS128
 512 #undef __DEFAULT_FN_ATTRS256
 513
 514 #endif
 515 #endif