/*===------------ avx512bf16intrin.h - AVX512_BF16 intrinsics --------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
10 #error "Never use <avx512bf16intrin.h> directly; include <immintrin.h> instead."
13 #ifndef __AVX512BF16INTRIN_H
14 #define __AVX512BF16INTRIN_H
16 typedef short __m512bh
__attribute__((__vector_size__(64), __aligned__(64)));
17 typedef short __m256bh
__attribute__((__vector_size__(32), __aligned__(32)));
18 typedef unsigned short __bfloat16
;
20 #define __DEFAULT_FN_ATTRS512 \
21 __attribute__((__always_inline__, __nodebug__, __target__("avx512bf16"), \
22 __min_vector_width__(512)))
23 #define __DEFAULT_FN_ATTRS \
24 __attribute__((__always_inline__, __nodebug__, __target__("avx512bf16")))
26 /// Convert One BF16 Data to One Single Float Data.
28 /// \headerfile <x86intrin.h>
30 /// This intrinsic does not correspond to a specific instruction.
34 /// \returns A float data whose sign field and exponent field keep unchanged,
35 /// and fraction field is extended to 23 bits.
36 static __inline__
float __DEFAULT_FN_ATTRS
_mm_cvtsbh_ss(__bfloat16 __A
) {
37 return __builtin_ia32_cvtsbf162ss_32(__A
);
40 /// Convert Two Packed Single Data to One Packed BF16 Data.
42 /// \headerfile <x86intrin.h>
44 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
47 /// A 512-bit vector of [16 x float].
49 /// A 512-bit vector of [16 x float].
50 /// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
51 /// conversion of __B, and higher 256 bits come from conversion of __A.
52 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
53 _mm512_cvtne2ps_pbh(__m512 __A
, __m512 __B
) {
54 return (__m512bh
)__builtin_ia32_cvtne2ps2bf16_512((__v16sf
) __A
,
58 /// Convert Two Packed Single Data to One Packed BF16 Data.
60 /// \headerfile <x86intrin.h>
62 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
65 /// A 512-bit vector of [16 x float].
67 /// A 512-bit vector of [16 x float].
69 /// A 512-bit vector of [32 x bfloat].
71 /// A 32-bit mask value specifying what is chosen for each element.
72 /// A 1 means conversion of __A or __B. A 0 means element from __W.
73 /// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
74 /// conversion of __B, and higher 256 bits come from conversion of __A.
75 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
76 _mm512_mask_cvtne2ps_pbh(__m512bh __W
, __mmask32 __U
, __m512 __A
, __m512 __B
) {
77 return (__m512bh
)__builtin_ia32_selectw_512((__mmask32
)__U
,
78 (__v32hi
)_mm512_cvtne2ps_pbh(__A
, __B
),
82 /// Convert Two Packed Single Data to One Packed BF16 Data.
84 /// \headerfile <x86intrin.h>
86 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
89 /// A 512-bit vector of [16 x float].
91 /// A 512-bit vector of [16 x float].
93 /// A 32-bit mask value specifying what is chosen for each element.
94 /// A 1 means conversion of __A or __B. A 0 means element is zero.
95 /// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
96 /// conversion of __B, and higher 256 bits come from conversion of __A.
97 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
98 _mm512_maskz_cvtne2ps_pbh(__mmask32 __U
, __m512 __A
, __m512 __B
) {
99 return (__m512bh
)__builtin_ia32_selectw_512((__mmask32
)__U
,
100 (__v32hi
)_mm512_cvtne2ps_pbh(__A
, __B
),
101 (__v32hi
)_mm512_setzero_si512());
104 /// Convert Packed Single Data to Packed BF16 Data.
106 /// \headerfile <x86intrin.h>
108 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
111 /// A 512-bit vector of [16 x float].
112 /// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
113 static __inline__ __m256bh __DEFAULT_FN_ATTRS512
114 _mm512_cvtneps_pbh(__m512 __A
) {
115 return (__m256bh
)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf
)__A
,
116 (__v16hi
)_mm256_undefined_si256(),
120 /// Convert Packed Single Data to Packed BF16 Data.
122 /// \headerfile <x86intrin.h>
124 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
127 /// A 512-bit vector of [16 x float].
129 /// A 256-bit vector of [16 x bfloat].
131 /// A 16-bit mask value specifying what is chosen for each element.
132 /// A 1 means conversion of __A. A 0 means element from __W.
133 /// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
134 static __inline__ __m256bh __DEFAULT_FN_ATTRS512
135 _mm512_mask_cvtneps_pbh(__m256bh __W
, __mmask16 __U
, __m512 __A
) {
136 return (__m256bh
)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf
)__A
,
141 /// Convert Packed Single Data to Packed BF16 Data.
143 /// \headerfile <x86intrin.h>
145 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
148 /// A 512-bit vector of [16 x float].
150 /// A 16-bit mask value specifying what is chosen for each element.
151 /// A 1 means conversion of __A. A 0 means element is zero.
152 /// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
153 static __inline__ __m256bh __DEFAULT_FN_ATTRS512
154 _mm512_maskz_cvtneps_pbh(__mmask16 __U
, __m512 __A
) {
155 return (__m256bh
)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf
)__A
,
156 (__v16hi
)_mm256_setzero_si256(),
160 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
162 /// \headerfile <x86intrin.h>
164 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
167 /// A 512-bit vector of [32 x bfloat].
169 /// A 512-bit vector of [32 x bfloat].
171 /// A 512-bit vector of [16 x float].
172 /// \returns A 512-bit vector of [16 x float] comes from Dot Product of
174 static __inline__ __m512 __DEFAULT_FN_ATTRS512
175 _mm512_dpbf16_ps(__m512 __D
, __m512bh __A
, __m512bh __B
) {
176 return (__m512
)__builtin_ia32_dpbf16ps_512((__v16sf
) __D
,
181 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
183 /// \headerfile <x86intrin.h>
185 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
188 /// A 512-bit vector of [32 x bfloat].
190 /// A 512-bit vector of [32 x bfloat].
192 /// A 512-bit vector of [16 x float].
194 /// A 16-bit mask value specifying what is chosen for each element.
195 /// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
196 /// \returns A 512-bit vector of [16 x float] comes from Dot Product of
198 static __inline__ __m512 __DEFAULT_FN_ATTRS512
199 _mm512_mask_dpbf16_ps(__m512 __D
, __mmask16 __U
, __m512bh __A
, __m512bh __B
) {
200 return (__m512
)__builtin_ia32_selectps_512((__mmask16
)__U
,
201 (__v16sf
)_mm512_dpbf16_ps(__D
, __A
, __B
),
205 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
207 /// \headerfile <x86intrin.h>
209 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
212 /// A 512-bit vector of [32 x bfloat].
214 /// A 512-bit vector of [32 x bfloat].
216 /// A 512-bit vector of [16 x float].
218 /// A 16-bit mask value specifying what is chosen for each element.
219 /// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
220 /// \returns A 512-bit vector of [16 x float] comes from Dot Product of
222 static __inline__ __m512 __DEFAULT_FN_ATTRS512
223 _mm512_maskz_dpbf16_ps(__mmask16 __U
, __m512 __D
, __m512bh __A
, __m512bh __B
) {
224 return (__m512
)__builtin_ia32_selectps_512((__mmask16
)__U
,
225 (__v16sf
)_mm512_dpbf16_ps(__D
, __A
, __B
),
226 (__v16sf
)_mm512_setzero_si512());
229 /// Convert Packed BF16 Data to Packed float Data.
231 /// \headerfile <x86intrin.h>
234 /// A 256-bit vector of [16 x bfloat].
235 /// \returns A 512-bit vector of [16 x float] come from conversion of __A
236 static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_cvtpbh_ps(__m256bh __A
) {
237 return _mm512_castsi512_ps((__m512i
)_mm512_slli_epi32(
238 (__m512i
)_mm512_cvtepi16_epi32((__m256i
)__A
), 16));
241 /// Convert Packed BF16 Data to Packed float Data using zeroing mask.
243 /// \headerfile <x86intrin.h>
246 /// A 16-bit mask. Elements are zeroed out when the corresponding mask
249 /// A 256-bit vector of [16 x bfloat].
250 /// \returns A 512-bit vector of [16 x float] come from conversion of __A
251 static __inline__ __m512 __DEFAULT_FN_ATTRS512
252 _mm512_maskz_cvtpbh_ps(__mmask16 __U
, __m256bh __A
) {
253 return _mm512_castsi512_ps((__m512i
)_mm512_slli_epi32(
254 (__m512i
)_mm512_maskz_cvtepi16_epi32((__mmask16
)__U
, (__m256i
)__A
), 16));
257 /// Convert Packed BF16 Data to Packed float Data using merging mask.
259 /// \headerfile <x86intrin.h>
262 /// A 512-bit vector of [16 x float]. Elements are copied from __S when
263 /// the corresponding mask bit is not set.
267 /// A 256-bit vector of [16 x bfloat].
268 /// \returns A 512-bit vector of [16 x float] come from conversion of __A
269 static __inline__ __m512 __DEFAULT_FN_ATTRS512
270 _mm512_mask_cvtpbh_ps(__m512 __S
, __mmask16 __U
, __m256bh __A
) {
271 return _mm512_castsi512_ps((__m512i
)_mm512_mask_slli_epi32(
272 (__m512i
)__S
, (__mmask16
)__U
,
273 (__m512i
)_mm512_cvtepi16_epi32((__m256i
)__A
), 16));
276 #undef __DEFAULT_FN_ATTRS
277 #undef __DEFAULT_FN_ATTRS512