[clang-format] Fix a bug in aligning comments above PPDirective (#72791)
[llvm-project.git] / clang / lib / Headers / avx512bf16intrin.h
blobb28d2e243f2cb8085259e8e170d26eba55201321
/*===------------ avx512bf16intrin.h - AVX512_BF16 intrinsics --------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
9 #ifndef __IMMINTRIN_H
10 #error "Never use <avx512bf16intrin.h> directly; include <immintrin.h> instead."
11 #endif
13 #ifdef __SSE2__
15 #ifndef __AVX512BF16INTRIN_H
16 #define __AVX512BF16INTRIN_H
18 typedef __bf16 __v32bf __attribute__((__vector_size__(64), __aligned__(64)));
19 typedef __bf16 __m512bh __attribute__((__vector_size__(64), __aligned__(64)));
20 typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead")));
22 #define __DEFAULT_FN_ATTRS512 \
23 __attribute__((__always_inline__, __nodebug__, __target__("avx512bf16,evex512"), \
24 __min_vector_width__(512)))
25 #define __DEFAULT_FN_ATTRS \
26 __attribute__((__always_inline__, __nodebug__, \
27 __target__("avx512bf16,no-evex512")))
29 /// Convert One BF16 Data to One Single Float Data.
30 ///
31 /// \headerfile <x86intrin.h>
32 ///
33 /// This intrinsic does not correspond to a specific instruction.
34 ///
35 /// \param __A
36 /// A bfloat data.
37 /// \returns A float data whose sign field and exponent field keep unchanged,
38 /// and fraction field is extended to 23 bits.
39 static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bf16 __A) {
40 return __builtin_ia32_cvtsbf162ss_32(__A);
43 /// Convert Two Packed Single Data to One Packed BF16 Data.
44 ///
45 /// \headerfile <x86intrin.h>
46 ///
47 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
48 ///
49 /// \param __A
50 /// A 512-bit vector of [16 x float].
51 /// \param __B
52 /// A 512-bit vector of [16 x float].
53 /// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
54 /// conversion of __B, and higher 256 bits come from conversion of __A.
55 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
56 _mm512_cvtne2ps_pbh(__m512 __A, __m512 __B) {
57 return (__m512bh)__builtin_ia32_cvtne2ps2bf16_512((__v16sf) __A,
58 (__v16sf) __B);
61 /// Convert Two Packed Single Data to One Packed BF16 Data.
62 ///
63 /// \headerfile <x86intrin.h>
64 ///
65 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
66 ///
67 /// \param __A
68 /// A 512-bit vector of [16 x float].
69 /// \param __B
70 /// A 512-bit vector of [16 x float].
71 /// \param __W
72 /// A 512-bit vector of [32 x bfloat].
73 /// \param __U
74 /// A 32-bit mask value specifying what is chosen for each element.
75 /// A 1 means conversion of __A or __B. A 0 means element from __W.
76 /// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
77 /// conversion of __B, and higher 256 bits come from conversion of __A.
78 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
79 _mm512_mask_cvtne2ps_pbh(__m512bh __W, __mmask32 __U, __m512 __A, __m512 __B) {
80 return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
81 (__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
82 (__v32bf)__W);
85 /// Convert Two Packed Single Data to One Packed BF16 Data.
86 ///
87 /// \headerfile <x86intrin.h>
88 ///
89 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
90 ///
91 /// \param __A
92 /// A 512-bit vector of [16 x float].
93 /// \param __B
94 /// A 512-bit vector of [16 x float].
95 /// \param __U
96 /// A 32-bit mask value specifying what is chosen for each element.
97 /// A 1 means conversion of __A or __B. A 0 means element is zero.
98 /// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
99 /// conversion of __B, and higher 256 bits come from conversion of __A.
100 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
101 _mm512_maskz_cvtne2ps_pbh(__mmask32 __U, __m512 __A, __m512 __B) {
102 return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
103 (__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
104 (__v32bf)_mm512_setzero_si512());
107 /// Convert Packed Single Data to Packed BF16 Data.
109 /// \headerfile <x86intrin.h>
111 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
113 /// \param __A
114 /// A 512-bit vector of [16 x float].
115 /// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
116 static __inline__ __m256bh __DEFAULT_FN_ATTRS512
117 _mm512_cvtneps_pbh(__m512 __A) {
118 return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
119 (__v16bf)_mm256_undefined_si256(),
120 (__mmask16)-1);
123 /// Convert Packed Single Data to Packed BF16 Data.
125 /// \headerfile <x86intrin.h>
127 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
129 /// \param __A
130 /// A 512-bit vector of [16 x float].
131 /// \param __W
132 /// A 256-bit vector of [16 x bfloat].
133 /// \param __U
134 /// A 16-bit mask value specifying what is chosen for each element.
135 /// A 1 means conversion of __A. A 0 means element from __W.
136 /// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
137 static __inline__ __m256bh __DEFAULT_FN_ATTRS512
138 _mm512_mask_cvtneps_pbh(__m256bh __W, __mmask16 __U, __m512 __A) {
139 return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
140 (__v16bf)__W,
141 (__mmask16)__U);
144 /// Convert Packed Single Data to Packed BF16 Data.
146 /// \headerfile <x86intrin.h>
148 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
150 /// \param __A
151 /// A 512-bit vector of [16 x float].
152 /// \param __U
153 /// A 16-bit mask value specifying what is chosen for each element.
154 /// A 1 means conversion of __A. A 0 means element is zero.
155 /// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
156 static __inline__ __m256bh __DEFAULT_FN_ATTRS512
157 _mm512_maskz_cvtneps_pbh(__mmask16 __U, __m512 __A) {
158 return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
159 (__v16bf)_mm256_setzero_si256(),
160 (__mmask16)__U);
163 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
165 /// \headerfile <x86intrin.h>
167 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
169 /// \param __A
170 /// A 512-bit vector of [32 x bfloat].
171 /// \param __B
172 /// A 512-bit vector of [32 x bfloat].
173 /// \param __D
174 /// A 512-bit vector of [16 x float].
175 /// \returns A 512-bit vector of [16 x float] comes from Dot Product of
176 /// __A, __B and __D
177 static __inline__ __m512 __DEFAULT_FN_ATTRS512
178 _mm512_dpbf16_ps(__m512 __D, __m512bh __A, __m512bh __B) {
179 return (__m512)__builtin_ia32_dpbf16ps_512((__v16sf) __D,
180 (__v32bf) __A,
181 (__v32bf) __B);
184 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
186 /// \headerfile <x86intrin.h>
188 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
190 /// \param __A
191 /// A 512-bit vector of [32 x bfloat].
192 /// \param __B
193 /// A 512-bit vector of [32 x bfloat].
194 /// \param __D
195 /// A 512-bit vector of [16 x float].
196 /// \param __U
197 /// A 16-bit mask value specifying what is chosen for each element.
198 /// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
199 /// \returns A 512-bit vector of [16 x float] comes from Dot Product of
200 /// __A, __B and __D
201 static __inline__ __m512 __DEFAULT_FN_ATTRS512
202 _mm512_mask_dpbf16_ps(__m512 __D, __mmask16 __U, __m512bh __A, __m512bh __B) {
203 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
204 (__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
205 (__v16sf)__D);
208 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
210 /// \headerfile <x86intrin.h>
212 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
214 /// \param __A
215 /// A 512-bit vector of [32 x bfloat].
216 /// \param __B
217 /// A 512-bit vector of [32 x bfloat].
218 /// \param __D
219 /// A 512-bit vector of [16 x float].
220 /// \param __U
221 /// A 16-bit mask value specifying what is chosen for each element.
222 /// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
223 /// \returns A 512-bit vector of [16 x float] comes from Dot Product of
224 /// __A, __B and __D
225 static __inline__ __m512 __DEFAULT_FN_ATTRS512
226 _mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) {
227 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
228 (__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
229 (__v16sf)_mm512_setzero_si512());
232 /// Convert Packed BF16 Data to Packed float Data.
234 /// \headerfile <x86intrin.h>
236 /// \param __A
237 /// A 256-bit vector of [16 x bfloat].
238 /// \returns A 512-bit vector of [16 x float] come from conversion of __A
239 static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
240 return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
241 (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
244 /// Convert Packed BF16 Data to Packed float Data using zeroing mask.
246 /// \headerfile <x86intrin.h>
248 /// \param __U
249 /// A 16-bit mask. Elements are zeroed out when the corresponding mask
250 /// bit is not set.
251 /// \param __A
252 /// A 256-bit vector of [16 x bfloat].
253 /// \returns A 512-bit vector of [16 x float] come from conversion of __A
254 static __inline__ __m512 __DEFAULT_FN_ATTRS512
255 _mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
256 return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
257 (__m512i)_mm512_maskz_cvtepi16_epi32((__mmask16)__U, (__m256i)__A), 16));
260 /// Convert Packed BF16 Data to Packed float Data using merging mask.
262 /// \headerfile <x86intrin.h>
264 /// \param __S
265 /// A 512-bit vector of [16 x float]. Elements are copied from __S when
266 /// the corresponding mask bit is not set.
267 /// \param __U
268 /// A 16-bit mask.
269 /// \param __A
270 /// A 256-bit vector of [16 x bfloat].
271 /// \returns A 512-bit vector of [16 x float] come from conversion of __A
272 static __inline__ __m512 __DEFAULT_FN_ATTRS512
273 _mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) {
274 return _mm512_castsi512_ps((__m512i)_mm512_mask_slli_epi32(
275 (__m512i)__S, (__mmask16)__U,
276 (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
279 #undef __DEFAULT_FN_ATTRS
280 #undef __DEFAULT_FN_ATTRS512
282 #endif
283 #endif