1 /*===------------ avx512bf16intrin.h - AVX512_BF16 intrinsics --------------===
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 *===-----------------------------------------------------------------------===
10 #error "Never use <avx512bf16intrin.h> directly; include <immintrin.h> instead."
15 #ifndef __AVX512BF16INTRIN_H
16 #define __AVX512BF16INTRIN_H
18 typedef __bf16 __v32bf
__attribute__((__vector_size__(64), __aligned__(64)));
19 typedef __bf16 __m512bh
__attribute__((__vector_size__(64), __aligned__(64)));
20 typedef __bf16 __bfloat16
__attribute__((deprecated("use __bf16 instead")));
/* Attributes for the 512-bit intrinsics: always inlined, no debug info,
 * compiled for the "avx512bf16,evex512" target, minimum vector width 512. */
#define __DEFAULT_FN_ATTRS512                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512bf16,evex512"),                             \
                 __min_vector_width__(512)))
/* Attributes for the scalar intrinsic: always inlined, no debug info,
 * compiled for the "avx512bf16,no-evex512" target. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512bf16,no-evex512")))
29 /// Convert One BF16 Data to One Single Float Data.
31 /// \headerfile <x86intrin.h>
33 /// This intrinsic does not correspond to a specific instruction.
37 /// \returns A float data whose sign field and exponent field keep unchanged,
38 /// and fraction field is extended to 23 bits.
39 static __inline__
float __DEFAULT_FN_ATTRS
_mm_cvtsbh_ss(__bf16 __A
) {
40 return __builtin_ia32_cvtsbf162ss_32(__A
);
43 /// Convert Two Packed Single Data to One Packed BF16 Data.
45 /// \headerfile <x86intrin.h>
47 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
50 /// A 512-bit vector of [16 x float].
52 /// A 512-bit vector of [16 x float].
53 /// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
54 /// conversion of __B, and higher 256 bits come from conversion of __A.
55 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
56 _mm512_cvtne2ps_pbh(__m512 __A
, __m512 __B
) {
57 return (__m512bh
)__builtin_ia32_cvtne2ps2bf16_512((__v16sf
) __A
,
61 /// Convert Two Packed Single Data to One Packed BF16 Data.
63 /// \headerfile <x86intrin.h>
65 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
68 /// A 512-bit vector of [16 x float].
70 /// A 512-bit vector of [16 x float].
72 /// A 512-bit vector of [32 x bfloat].
74 /// A 32-bit mask value specifying what is chosen for each element.
75 /// A 1 means conversion of __A or __B. A 0 means element from __W.
76 /// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
77 /// conversion of __B, and higher 256 bits come from conversion of __A.
78 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
79 _mm512_mask_cvtne2ps_pbh(__m512bh __W
, __mmask32 __U
, __m512 __A
, __m512 __B
) {
80 return (__m512bh
)__builtin_ia32_selectpbf_512((__mmask32
)__U
,
81 (__v32bf
)_mm512_cvtne2ps_pbh(__A
, __B
),
85 /// Convert Two Packed Single Data to One Packed BF16 Data.
87 /// \headerfile <x86intrin.h>
89 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
92 /// A 512-bit vector of [16 x float].
94 /// A 512-bit vector of [16 x float].
96 /// A 32-bit mask value specifying what is chosen for each element.
97 /// A 1 means conversion of __A or __B. A 0 means element is zero.
98 /// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
99 /// conversion of __B, and higher 256 bits come from conversion of __A.
100 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
101 _mm512_maskz_cvtne2ps_pbh(__mmask32 __U
, __m512 __A
, __m512 __B
) {
102 return (__m512bh
)__builtin_ia32_selectpbf_512((__mmask32
)__U
,
103 (__v32bf
)_mm512_cvtne2ps_pbh(__A
, __B
),
104 (__v32bf
)_mm512_setzero_si512());
107 /// Convert Packed Single Data to Packed BF16 Data.
109 /// \headerfile <x86intrin.h>
111 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
114 /// A 512-bit vector of [16 x float].
/// \returns A 256-bit vector of [16 x bfloat] resulting from the conversion of __A.
116 static __inline__ __m256bh __DEFAULT_FN_ATTRS512
117 _mm512_cvtneps_pbh(__m512 __A
) {
118 return (__m256bh
)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf
)__A
,
119 (__v16bf
)_mm256_undefined_si256(),
123 /// Convert Packed Single Data to Packed BF16 Data.
125 /// \headerfile <x86intrin.h>
127 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
130 /// A 512-bit vector of [16 x float].
132 /// A 256-bit vector of [16 x bfloat].
134 /// A 16-bit mask value specifying what is chosen for each element.
135 /// A 1 means conversion of __A. A 0 means element from __W.
/// \returns A 256-bit vector of [16 x bfloat] resulting from the conversion of __A.
137 static __inline__ __m256bh __DEFAULT_FN_ATTRS512
138 _mm512_mask_cvtneps_pbh(__m256bh __W
, __mmask16 __U
, __m512 __A
) {
139 return (__m256bh
)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf
)__A
,
144 /// Convert Packed Single Data to Packed BF16 Data.
146 /// \headerfile <x86intrin.h>
148 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
151 /// A 512-bit vector of [16 x float].
153 /// A 16-bit mask value specifying what is chosen for each element.
154 /// A 1 means conversion of __A. A 0 means element is zero.
/// \returns A 256-bit vector of [16 x bfloat] resulting from the conversion of __A.
156 static __inline__ __m256bh __DEFAULT_FN_ATTRS512
157 _mm512_maskz_cvtneps_pbh(__mmask16 __U
, __m512 __A
) {
158 return (__m256bh
)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf
)__A
,
159 (__v16bf
)_mm256_setzero_si256(),
163 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
165 /// \headerfile <x86intrin.h>
167 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
170 /// A 512-bit vector of [32 x bfloat].
172 /// A 512-bit vector of [32 x bfloat].
174 /// A 512-bit vector of [16 x float].
175 /// \returns A 512-bit vector of [16 x float] comes from Dot Product of
177 static __inline__ __m512 __DEFAULT_FN_ATTRS512
178 _mm512_dpbf16_ps(__m512 __D
, __m512bh __A
, __m512bh __B
) {
179 return (__m512
)__builtin_ia32_dpbf16ps_512((__v16sf
) __D
,
184 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
186 /// \headerfile <x86intrin.h>
188 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
191 /// A 512-bit vector of [32 x bfloat].
193 /// A 512-bit vector of [32 x bfloat].
195 /// A 512-bit vector of [16 x float].
197 /// A 16-bit mask value specifying what is chosen for each element.
198 /// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
199 /// \returns A 512-bit vector of [16 x float] comes from Dot Product of
201 static __inline__ __m512 __DEFAULT_FN_ATTRS512
202 _mm512_mask_dpbf16_ps(__m512 __D
, __mmask16 __U
, __m512bh __A
, __m512bh __B
) {
203 return (__m512
)__builtin_ia32_selectps_512((__mmask16
)__U
,
204 (__v16sf
)_mm512_dpbf16_ps(__D
, __A
, __B
),
208 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
210 /// \headerfile <x86intrin.h>
212 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
215 /// A 512-bit vector of [32 x bfloat].
217 /// A 512-bit vector of [32 x bfloat].
219 /// A 512-bit vector of [16 x float].
221 /// A 16-bit mask value specifying what is chosen for each element.
222 /// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
223 /// \returns A 512-bit vector of [16 x float] comes from Dot Product of
225 static __inline__ __m512 __DEFAULT_FN_ATTRS512
226 _mm512_maskz_dpbf16_ps(__mmask16 __U
, __m512 __D
, __m512bh __A
, __m512bh __B
) {
227 return (__m512
)__builtin_ia32_selectps_512((__mmask16
)__U
,
228 (__v16sf
)_mm512_dpbf16_ps(__D
, __A
, __B
),
229 (__v16sf
)_mm512_setzero_si512());
232 /// Convert Packed BF16 Data to Packed float Data.
234 /// \headerfile <x86intrin.h>
237 /// A 256-bit vector of [16 x bfloat].
238 /// \returns A 512-bit vector of [16 x float] come from conversion of __A
239 static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_cvtpbh_ps(__m256bh __A
) {
240 return _mm512_castsi512_ps((__m512i
)_mm512_slli_epi32(
241 (__m512i
)_mm512_cvtepi16_epi32((__m256i
)__A
), 16));
244 /// Convert Packed BF16 Data to Packed float Data using zeroing mask.
246 /// \headerfile <x86intrin.h>
249 /// A 16-bit mask. Elements are zeroed out when the corresponding mask
252 /// A 256-bit vector of [16 x bfloat].
253 /// \returns A 512-bit vector of [16 x float] come from conversion of __A
254 static __inline__ __m512 __DEFAULT_FN_ATTRS512
255 _mm512_maskz_cvtpbh_ps(__mmask16 __U
, __m256bh __A
) {
256 return _mm512_castsi512_ps((__m512i
)_mm512_slli_epi32(
257 (__m512i
)_mm512_maskz_cvtepi16_epi32((__mmask16
)__U
, (__m256i
)__A
), 16));
260 /// Convert Packed BF16 Data to Packed float Data using merging mask.
262 /// \headerfile <x86intrin.h>
265 /// A 512-bit vector of [16 x float]. Elements are copied from __S when
266 /// the corresponding mask bit is not set.
270 /// A 256-bit vector of [16 x bfloat].
271 /// \returns A 512-bit vector of [16 x float] come from conversion of __A
272 static __inline__ __m512 __DEFAULT_FN_ATTRS512
273 _mm512_mask_cvtpbh_ps(__m512 __S
, __mmask16 __U
, __m256bh __A
) {
274 return _mm512_castsi512_ps((__m512i
)_mm512_mask_slli_epi32(
275 (__m512i
)__S
, (__mmask16
)__U
,
276 (__m512i
)_mm512_cvtepi16_epi32((__m256i
)__A
), 16));
279 #undef __DEFAULT_FN_ATTRS
280 #undef __DEFAULT_FN_ATTRS512