/*===--------- avx512vlbf16intrin.h - AVX512_BF16 intrinsics ---------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
10 #error "Never use <avx512vlbf16intrin.h> directly; include <immintrin.h> instead."
15 #ifndef __AVX512VLBF16INTRIN_H
16 #define __AVX512VLBF16INTRIN_H
// Attribute set applied to every intrinsic in this header that is declared
// with a minimum vector width of 128: forced inlining, no debug info, and the
// avx512vl + avx512bf16 target features.
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512vl, avx512bf16"),                           \
                 __min_vector_width__(128)))
// Attribute set applied to every intrinsic in this header that is declared
// with a minimum vector width of 256: forced inlining, no debug info, and the
// avx512vl + avx512bf16 target features.
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512vl, avx512bf16"),                           \
                 __min_vector_width__(256)))
25 /// Convert Two Packed Single Data to One Packed BF16 Data.
27 /// \headerfile <x86intrin.h>
29 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
32 /// A 128-bit vector of [4 x float].
34 /// A 128-bit vector of [4 x float].
35 /// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
36 /// conversion of __B, and higher 64 bits come from conversion of __A.
37 static __inline__ __m128bh __DEFAULT_FN_ATTRS128
38 _mm_cvtne2ps_pbh(__m128 __A
, __m128 __B
) {
39 return (__m128bh
)__builtin_ia32_cvtne2ps2bf16_128((__v4sf
) __A
,
43 /// Convert Two Packed Single Data to One Packed BF16 Data.
45 /// \headerfile <x86intrin.h>
47 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
50 /// A 128-bit vector of [4 x float].
52 /// A 128-bit vector of [4 x float].
54 /// A 128-bit vector of [8 x bfloat].
56 /// A 8-bit mask value specifying what is chosen for each element.
57 /// A 1 means conversion of __A or __B. A 0 means element from __W.
58 /// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
59 /// conversion of __B, and higher 64 bits come from conversion of __A.
60 static __inline__ __m128bh __DEFAULT_FN_ATTRS128
61 _mm_mask_cvtne2ps_pbh(__m128bh __W
, __mmask8 __U
, __m128 __A
, __m128 __B
) {
62 return (__m128bh
)__builtin_ia32_selectpbf_128((__mmask8
)__U
,
63 (__v8bf
)_mm_cvtne2ps_pbh(__A
, __B
),
67 /// Convert Two Packed Single Data to One Packed BF16 Data.
69 /// \headerfile <x86intrin.h>
71 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
74 /// A 128-bit vector of [4 x float].
76 /// A 128-bit vector of [4 x float].
78 /// A 8-bit mask value specifying what is chosen for each element.
79 /// A 1 means conversion of __A or __B. A 0 means element is zero.
80 /// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
81 /// conversion of __B, and higher 64 bits come from conversion of __A.
82 static __inline__ __m128bh __DEFAULT_FN_ATTRS128
83 _mm_maskz_cvtne2ps_pbh(__mmask8 __U
, __m128 __A
, __m128 __B
) {
84 return (__m128bh
)__builtin_ia32_selectpbf_128((__mmask8
)__U
,
85 (__v8bf
)_mm_cvtne2ps_pbh(__A
, __B
),
86 (__v8bf
)_mm_setzero_si128());
89 /// Convert Two Packed Single Data to One Packed BF16 Data.
91 /// \headerfile <x86intrin.h>
93 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
96 /// A 256-bit vector of [8 x float].
98 /// A 256-bit vector of [8 x float].
99 /// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
100 /// conversion of __B, and higher 128 bits come from conversion of __A.
101 static __inline__ __m256bh __DEFAULT_FN_ATTRS256
102 _mm256_cvtne2ps_pbh(__m256 __A
, __m256 __B
) {
103 return (__m256bh
)__builtin_ia32_cvtne2ps2bf16_256((__v8sf
) __A
,
107 /// Convert Two Packed Single Data to One Packed BF16 Data.
109 /// \headerfile <x86intrin.h>
111 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
114 /// A 256-bit vector of [8 x float].
116 /// A 256-bit vector of [8 x float].
118 /// A 256-bit vector of [16 x bfloat].
120 /// A 16-bit mask value specifying what is chosen for each element.
121 /// A 1 means conversion of __A or __B. A 0 means element from __W.
122 /// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
123 /// conversion of __B, and higher 128 bits come from conversion of __A.
124 static __inline__ __m256bh __DEFAULT_FN_ATTRS256
125 _mm256_mask_cvtne2ps_pbh(__m256bh __W
, __mmask16 __U
, __m256 __A
, __m256 __B
) {
126 return (__m256bh
)__builtin_ia32_selectpbf_256((__mmask16
)__U
,
127 (__v16bf
)_mm256_cvtne2ps_pbh(__A
, __B
),
131 /// Convert Two Packed Single Data to One Packed BF16 Data.
133 /// \headerfile <x86intrin.h>
135 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
138 /// A 256-bit vector of [8 x float].
140 /// A 256-bit vector of [8 x float].
142 /// A 16-bit mask value specifying what is chosen for each element.
143 /// A 1 means conversion of __A or __B. A 0 means element is zero.
144 /// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
145 /// conversion of __B, and higher 128 bits come from conversion of __A.
146 static __inline__ __m256bh __DEFAULT_FN_ATTRS256
147 _mm256_maskz_cvtne2ps_pbh(__mmask16 __U
, __m256 __A
, __m256 __B
) {
148 return (__m256bh
)__builtin_ia32_selectpbf_256((__mmask16
)__U
,
149 (__v16bf
)_mm256_cvtne2ps_pbh(__A
, __B
),
150 (__v16bf
)_mm256_setzero_si256());
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param A
///    A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
///    conversion of A, and higher 64 bits are 0.
#define _mm_cvtneps_pbh(A)                                                     \
  ((__m128bh)__builtin_ia32_vcvtneps2bf16128((__v4sf)(A)))
166 /// Convert Packed Single Data to Packed BF16 Data.
168 /// \headerfile <x86intrin.h>
170 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
173 /// A 128-bit vector of [4 x float].
175 /// A 128-bit vector of [8 x bfloat].
177 /// A 4-bit mask value specifying what is chosen for each element.
178 /// A 1 means conversion of __A. A 0 means element from __W.
179 /// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
180 /// conversion of __A, and higher 64 bits are 0.
181 static __inline__ __m128bh __DEFAULT_FN_ATTRS128
182 _mm_mask_cvtneps_pbh(__m128bh __W
, __mmask8 __U
, __m128 __A
) {
183 return (__m128bh
)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf
) __A
,
188 /// Convert Packed Single Data to Packed BF16 Data.
190 /// \headerfile <x86intrin.h>
192 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
195 /// A 128-bit vector of [4 x float].
197 /// A 4-bit mask value specifying what is chosen for each element.
198 /// A 1 means conversion of __A. A 0 means element is zero.
199 /// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
200 /// conversion of __A, and higher 64 bits are 0.
201 static __inline__ __m128bh __DEFAULT_FN_ATTRS128
202 _mm_maskz_cvtneps_pbh(__mmask8 __U
, __m128 __A
) {
203 return (__m128bh
)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf
) __A
,
204 (__v8bf
)_mm_setzero_si128(),
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param A
///    A 256-bit vector of [8 x float].
/// \returns A 128-bit vector of [8 x bfloat] that comes from conversion of A.
#define _mm256_cvtneps_pbh(A)                                                  \
  ((__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)(A)))
220 /// Convert Packed Single Data to Packed BF16 Data.
222 /// \headerfile <x86intrin.h>
224 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
227 /// A 256-bit vector of [8 x float].
229 /// A 256-bit vector of [8 x bfloat].
231 /// A 8-bit mask value specifying what is chosen for each element.
232 /// A 1 means conversion of __A. A 0 means element from __W.
233 /// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A.
234 static __inline__ __m128bh __DEFAULT_FN_ATTRS256
235 _mm256_mask_cvtneps_pbh(__m128bh __W
, __mmask8 __U
, __m256 __A
) {
236 return (__m128bh
)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf
)__A
,
241 /// Convert Packed Single Data to Packed BF16 Data.
243 /// \headerfile <x86intrin.h>
245 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
248 /// A 256-bit vector of [8 x float].
250 /// A 8-bit mask value specifying what is chosen for each element.
251 /// A 1 means conversion of __A. A 0 means element is zero.
252 /// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A.
253 static __inline__ __m128bh __DEFAULT_FN_ATTRS256
254 _mm256_maskz_cvtneps_pbh(__mmask8 __U
, __m256 __A
) {
255 return (__m128bh
)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf
)__A
,
256 (__v8bf
)_mm_setzero_si128(),
260 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
262 /// \headerfile <x86intrin.h>
264 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
267 /// A 128-bit vector of [8 x bfloat].
269 /// A 128-bit vector of [8 x bfloat].
271 /// A 128-bit vector of [4 x float].
272 /// \returns A 128-bit vector of [4 x float] comes from Dot Product of
274 static __inline__ __m128 __DEFAULT_FN_ATTRS128
275 _mm_dpbf16_ps(__m128 __D
, __m128bh __A
, __m128bh __B
) {
276 return (__m128
)__builtin_ia32_dpbf16ps_128((__v4sf
)__D
,
281 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
283 /// \headerfile <x86intrin.h>
285 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
288 /// A 128-bit vector of [8 x bfloat].
290 /// A 128-bit vector of [8 x bfloat].
292 /// A 128-bit vector of [4 x float].
294 /// A 8-bit mask value specifying what is chosen for each element.
295 /// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
296 /// \returns A 128-bit vector of [4 x float] comes from Dot Product of
298 static __inline__ __m128 __DEFAULT_FN_ATTRS128
299 _mm_mask_dpbf16_ps(__m128 __D
, __mmask8 __U
, __m128bh __A
, __m128bh __B
) {
300 return (__m128
)__builtin_ia32_selectps_128((__mmask8
)__U
,
301 (__v4sf
)_mm_dpbf16_ps(__D
, __A
, __B
),
305 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
307 /// \headerfile <x86intrin.h>
309 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
312 /// A 128-bit vector of [8 x bfloat].
314 /// A 128-bit vector of [8 x bfloat].
316 /// A 128-bit vector of [4 x float].
318 /// A 8-bit mask value specifying what is chosen for each element.
319 /// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
320 /// \returns A 128-bit vector of [4 x float] comes from Dot Product of
322 static __inline__ __m128 __DEFAULT_FN_ATTRS128
323 _mm_maskz_dpbf16_ps(__mmask8 __U
, __m128 __D
, __m128bh __A
, __m128bh __B
) {
324 return (__m128
)__builtin_ia32_selectps_128((__mmask8
)__U
,
325 (__v4sf
)_mm_dpbf16_ps(__D
, __A
, __B
),
326 (__v4sf
)_mm_setzero_si128());
329 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
331 /// \headerfile <x86intrin.h>
333 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
336 /// A 256-bit vector of [16 x bfloat].
338 /// A 256-bit vector of [16 x bfloat].
340 /// A 256-bit vector of [8 x float].
341 /// \returns A 256-bit vector of [8 x float] comes from Dot Product of
343 static __inline__ __m256 __DEFAULT_FN_ATTRS256
344 _mm256_dpbf16_ps(__m256 __D
, __m256bh __A
, __m256bh __B
) {
345 return (__m256
)__builtin_ia32_dpbf16ps_256((__v8sf
)__D
,
350 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
352 /// \headerfile <x86intrin.h>
354 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
357 /// A 256-bit vector of [16 x bfloat].
359 /// A 256-bit vector of [16 x bfloat].
361 /// A 256-bit vector of [8 x float].
363 /// A 16-bit mask value specifying what is chosen for each element.
364 /// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
365 /// \returns A 256-bit vector of [8 x float] comes from Dot Product of
367 static __inline__ __m256 __DEFAULT_FN_ATTRS256
368 _mm256_mask_dpbf16_ps(__m256 __D
, __mmask8 __U
, __m256bh __A
, __m256bh __B
) {
369 return (__m256
)__builtin_ia32_selectps_256((__mmask8
)__U
,
370 (__v8sf
)_mm256_dpbf16_ps(__D
, __A
, __B
),
374 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
376 /// \headerfile <x86intrin.h>
378 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
381 /// A 256-bit vector of [16 x bfloat].
383 /// A 256-bit vector of [16 x bfloat].
385 /// A 256-bit vector of [8 x float].
387 /// A 8-bit mask value specifying what is chosen for each element.
388 /// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
389 /// \returns A 256-bit vector of [8 x float] comes from Dot Product of
391 static __inline__ __m256 __DEFAULT_FN_ATTRS256
392 _mm256_maskz_dpbf16_ps(__mmask8 __U
, __m256 __D
, __m256bh __A
, __m256bh __B
) {
393 return (__m256
)__builtin_ia32_selectps_256((__mmask8
)__U
,
394 (__v8sf
)_mm256_dpbf16_ps(__D
, __A
, __B
),
395 (__v8sf
)_mm256_setzero_si256());
398 /// Convert One Single float Data to One BF16 Data.
400 /// \headerfile <x86intrin.h>
402 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
406 /// \returns A bf16 data whose sign field and exponent field keep unchanged,
407 /// and fraction field is truncated to 7 bits.
408 static __inline__ __bf16 __DEFAULT_FN_ATTRS128
_mm_cvtness_sbh(float __A
) {
409 __v4sf __V
= {__A
, 0, 0, 0};
410 __v8bf __R
= __builtin_ia32_cvtneps2bf16_128_mask(
411 (__v4sf
)__V
, (__v8bf
)_mm_undefined_si128(), (__mmask8
)-1);
412 return (__bf16
)__R
[0];
415 /// Convert Packed BF16 Data to Packed float Data.
417 /// \headerfile <x86intrin.h>
420 /// A 128-bit vector of [4 x bfloat].
421 /// \returns A 128-bit vector of [4 x float] come from conversion of __A
422 static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtpbh_ps(__m128bh __A
) {
423 return _mm_castsi128_ps(
424 (__m128i
)_mm_slli_epi32((__m128i
)_mm_cvtepi16_epi32((__m128i
)__A
), 16));
427 /// Convert Packed BF16 Data to Packed float Data.
429 /// \headerfile <x86intrin.h>
432 /// A 128-bit vector of [8 x bfloat].
433 /// \returns A 256-bit vector of [8 x float] come from conversion of __A
434 static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtpbh_ps(__m128bh __A
) {
435 return _mm256_castsi256_ps((__m256i
)_mm256_slli_epi32(
436 (__m256i
)_mm256_cvtepi16_epi32((__m128i
)__A
), 16));
439 /// Convert Packed BF16 Data to Packed float Data using zeroing mask.
441 /// \headerfile <x86intrin.h>
444 /// A 4-bit mask. Elements are zeroed out when the corresponding mask
447 /// A 128-bit vector of [4 x bfloat].
448 /// \returns A 128-bit vector of [4 x float] come from conversion of __A
449 static __inline__ __m128 __DEFAULT_FN_ATTRS128
450 _mm_maskz_cvtpbh_ps(__mmask8 __U
, __m128bh __A
) {
451 return _mm_castsi128_ps((__m128i
)_mm_slli_epi32(
452 (__m128i
)_mm_maskz_cvtepi16_epi32((__mmask8
)__U
, (__m128i
)__A
), 16));
455 /// Convert Packed BF16 Data to Packed float Data using zeroing mask.
457 /// \headerfile <x86intrin.h>
460 /// A 8-bit mask. Elements are zeroed out when the corresponding mask
463 /// A 128-bit vector of [8 x bfloat].
464 /// \returns A 256-bit vector of [8 x float] come from conversion of __A
465 static __inline__ __m256 __DEFAULT_FN_ATTRS256
466 _mm256_maskz_cvtpbh_ps(__mmask8 __U
, __m128bh __A
) {
467 return _mm256_castsi256_ps((__m256i
)_mm256_slli_epi32(
468 (__m256i
)_mm256_maskz_cvtepi16_epi32((__mmask8
)__U
, (__m128i
)__A
), 16));
471 /// Convert Packed BF16 Data to Packed float Data using merging mask.
473 /// \headerfile <x86intrin.h>
476 /// A 128-bit vector of [4 x float]. Elements are copied from __S when
477 /// the corresponding mask bit is not set.
479 /// A 4-bit mask. Elements are zeroed out when the corresponding mask
482 /// A 128-bit vector of [4 x bfloat].
483 /// \returns A 128-bit vector of [4 x float] come from conversion of __A
484 static __inline__ __m128 __DEFAULT_FN_ATTRS128
485 _mm_mask_cvtpbh_ps(__m128 __S
, __mmask8 __U
, __m128bh __A
) {
486 return _mm_castsi128_ps((__m128i
)_mm_mask_slli_epi32(
487 (__m128i
)__S
, (__mmask8
)__U
, (__m128i
)_mm_cvtepi16_epi32((__m128i
)__A
),
491 /// Convert Packed BF16 Data to Packed float Data using merging mask.
493 /// \headerfile <x86intrin.h>
496 /// A 256-bit vector of [8 x float]. Elements are copied from __S when
497 /// the corresponding mask bit is not set.
499 /// A 8-bit mask. Elements are zeroed out when the corresponding mask
502 /// A 128-bit vector of [8 x bfloat].
503 /// \returns A 256-bit vector of [8 x float] come from conversion of __A
504 static __inline__ __m256 __DEFAULT_FN_ATTRS256
505 _mm256_mask_cvtpbh_ps(__m256 __S
, __mmask8 __U
, __m128bh __A
) {
506 return _mm256_castsi256_ps((__m256i
)_mm256_mask_slli_epi32(
507 (__m256i
)__S
, (__mmask8
)__U
, (__m256i
)_mm256_cvtepi16_epi32((__m128i
)__A
),
// Drop the helper attribute macros so they do not leak past this header.
#undef __DEFAULT_FN_ATTRS256
#undef __DEFAULT_FN_ATTRS128