/* Web-view residue captured with this file (not part of the header itself):
 *   [AMDGPU][AsmParser][NFC] Get rid of custom default operand handlers.
 *   [llvm-project.git] / clang / lib / Headers / avx512vlbf16intrin.h
 *   blob f5b8911fac2aeb87f5428ab2b8a3256dfe8c035b
 */
/*===--------- avx512vlbf16intrin.h - AVX512_BF16 intrinsics ---------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
9 #ifndef __IMMINTRIN_H
10 #error "Never use <avx512vlbf16intrin.h> directly; include <immintrin.h> instead."
11 #endif
13 #ifdef __SSE2__
15 #ifndef __AVX512VLBF16INTRIN_H
16 #define __AVX512VLBF16INTRIN_H
/* Attribute sets applied to every intrinsic defined in this header: always
 * inlined, no debug info, targeting avx512vl + avx512bf16, with a minimum
 * vector width of 128 or 256 bits respectively. */
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512vl, avx512bf16"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512vl, avx512bf16"), __min_vector_width__(256)))
25 /// Convert Two Packed Single Data to One Packed BF16 Data.
26 ///
27 /// \headerfile <x86intrin.h>
28 ///
29 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
30 ///
31 /// \param __A
32 /// A 128-bit vector of [4 x float].
33 /// \param __B
34 /// A 128-bit vector of [4 x float].
35 /// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
36 /// conversion of __B, and higher 64 bits come from conversion of __A.
37 static __inline__ __m128bh __DEFAULT_FN_ATTRS128
38 _mm_cvtne2ps_pbh(__m128 __A, __m128 __B) {
39 return (__m128bh)__builtin_ia32_cvtne2ps2bf16_128((__v4sf) __A,
40 (__v4sf) __B);
43 /// Convert Two Packed Single Data to One Packed BF16 Data.
44 ///
45 /// \headerfile <x86intrin.h>
46 ///
47 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
48 ///
49 /// \param __A
50 /// A 128-bit vector of [4 x float].
51 /// \param __B
52 /// A 128-bit vector of [4 x float].
53 /// \param __W
54 /// A 128-bit vector of [8 x bfloat].
55 /// \param __U
56 /// A 8-bit mask value specifying what is chosen for each element.
57 /// A 1 means conversion of __A or __B. A 0 means element from __W.
58 /// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
59 /// conversion of __B, and higher 64 bits come from conversion of __A.
60 static __inline__ __m128bh __DEFAULT_FN_ATTRS128
61 _mm_mask_cvtne2ps_pbh(__m128bh __W, __mmask8 __U, __m128 __A, __m128 __B) {
62 return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U,
63 (__v8bf)_mm_cvtne2ps_pbh(__A, __B),
64 (__v8bf)__W);
67 /// Convert Two Packed Single Data to One Packed BF16 Data.
68 ///
69 /// \headerfile <x86intrin.h>
70 ///
71 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
72 ///
73 /// \param __A
74 /// A 128-bit vector of [4 x float].
75 /// \param __B
76 /// A 128-bit vector of [4 x float].
77 /// \param __U
78 /// A 8-bit mask value specifying what is chosen for each element.
79 /// A 1 means conversion of __A or __B. A 0 means element is zero.
80 /// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
81 /// conversion of __B, and higher 64 bits come from conversion of __A.
82 static __inline__ __m128bh __DEFAULT_FN_ATTRS128
83 _mm_maskz_cvtne2ps_pbh(__mmask8 __U, __m128 __A, __m128 __B) {
84 return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U,
85 (__v8bf)_mm_cvtne2ps_pbh(__A, __B),
86 (__v8bf)_mm_setzero_si128());
89 /// Convert Two Packed Single Data to One Packed BF16 Data.
90 ///
91 /// \headerfile <x86intrin.h>
92 ///
93 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
94 ///
95 /// \param __A
96 /// A 256-bit vector of [8 x float].
97 /// \param __B
98 /// A 256-bit vector of [8 x float].
99 /// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
100 /// conversion of __B, and higher 128 bits come from conversion of __A.
101 static __inline__ __m256bh __DEFAULT_FN_ATTRS256
102 _mm256_cvtne2ps_pbh(__m256 __A, __m256 __B) {
103 return (__m256bh)__builtin_ia32_cvtne2ps2bf16_256((__v8sf) __A,
104 (__v8sf) __B);
107 /// Convert Two Packed Single Data to One Packed BF16 Data.
109 /// \headerfile <x86intrin.h>
111 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
113 /// \param __A
114 /// A 256-bit vector of [8 x float].
115 /// \param __B
116 /// A 256-bit vector of [8 x float].
117 /// \param __W
118 /// A 256-bit vector of [16 x bfloat].
119 /// \param __U
120 /// A 16-bit mask value specifying what is chosen for each element.
121 /// A 1 means conversion of __A or __B. A 0 means element from __W.
122 /// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
123 /// conversion of __B, and higher 128 bits come from conversion of __A.
124 static __inline__ __m256bh __DEFAULT_FN_ATTRS256
125 _mm256_mask_cvtne2ps_pbh(__m256bh __W, __mmask16 __U, __m256 __A, __m256 __B) {
126 return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U,
127 (__v16bf)_mm256_cvtne2ps_pbh(__A, __B),
128 (__v16bf)__W);
131 /// Convert Two Packed Single Data to One Packed BF16 Data.
133 /// \headerfile <x86intrin.h>
135 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
137 /// \param __A
138 /// A 256-bit vector of [8 x float].
139 /// \param __B
140 /// A 256-bit vector of [8 x float].
141 /// \param __U
142 /// A 16-bit mask value specifying what is chosen for each element.
143 /// A 1 means conversion of __A or __B. A 0 means element is zero.
144 /// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
145 /// conversion of __B, and higher 128 bits come from conversion of __A.
146 static __inline__ __m256bh __DEFAULT_FN_ATTRS256
147 _mm256_maskz_cvtne2ps_pbh(__mmask16 __U, __m256 __A, __m256 __B) {
148 return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U,
149 (__v16bf)_mm256_cvtne2ps_pbh(__A, __B),
150 (__v16bf)_mm256_setzero_si256());
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param A
///    A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
///    conversion of A, and higher 64 bits are 0.
#define _mm_cvtneps_pbh(A) \
  ((__m128bh)__builtin_ia32_vcvtneps2bf16128((__v4sf)(A)))
166 /// Convert Packed Single Data to Packed BF16 Data.
168 /// \headerfile <x86intrin.h>
170 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
172 /// \param __A
173 /// A 128-bit vector of [4 x float].
174 /// \param __W
175 /// A 128-bit vector of [8 x bfloat].
176 /// \param __U
177 /// A 4-bit mask value specifying what is chosen for each element.
178 /// A 1 means conversion of __A. A 0 means element from __W.
179 /// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
180 /// conversion of __A, and higher 64 bits are 0.
181 static __inline__ __m128bh __DEFAULT_FN_ATTRS128
182 _mm_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m128 __A) {
183 return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
184 (__v8bf)__W,
185 (__mmask8)__U);
188 /// Convert Packed Single Data to Packed BF16 Data.
190 /// \headerfile <x86intrin.h>
192 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
194 /// \param __A
195 /// A 128-bit vector of [4 x float].
196 /// \param __U
197 /// A 4-bit mask value specifying what is chosen for each element.
198 /// A 1 means conversion of __A. A 0 means element is zero.
199 /// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
200 /// conversion of __A, and higher 64 bits are 0.
201 static __inline__ __m128bh __DEFAULT_FN_ATTRS128
202 _mm_maskz_cvtneps_pbh(__mmask8 __U, __m128 __A) {
203 return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
204 (__v8bf)_mm_setzero_si128(),
205 (__mmask8)__U);
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param A
///    A 256-bit vector of [8 x float].
/// \returns A 128-bit vector of [8 x bfloat] that comes from conversion of A.
#define _mm256_cvtneps_pbh(A) \
  ((__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)(A)))
220 /// Convert Packed Single Data to Packed BF16 Data.
222 /// \headerfile <x86intrin.h>
224 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
226 /// \param __A
227 /// A 256-bit vector of [8 x float].
228 /// \param __W
229 /// A 256-bit vector of [8 x bfloat].
230 /// \param __U
231 /// A 8-bit mask value specifying what is chosen for each element.
232 /// A 1 means conversion of __A. A 0 means element from __W.
233 /// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A.
234 static __inline__ __m128bh __DEFAULT_FN_ATTRS256
235 _mm256_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m256 __A) {
236 return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
237 (__v8bf)__W,
238 (__mmask8)__U);
241 /// Convert Packed Single Data to Packed BF16 Data.
243 /// \headerfile <x86intrin.h>
245 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
247 /// \param __A
248 /// A 256-bit vector of [8 x float].
249 /// \param __U
250 /// A 8-bit mask value specifying what is chosen for each element.
251 /// A 1 means conversion of __A. A 0 means element is zero.
252 /// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A.
253 static __inline__ __m128bh __DEFAULT_FN_ATTRS256
254 _mm256_maskz_cvtneps_pbh(__mmask8 __U, __m256 __A) {
255 return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
256 (__v8bf)_mm_setzero_si128(),
257 (__mmask8)__U);
260 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
262 /// \headerfile <x86intrin.h>
264 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
266 /// \param __A
267 /// A 128-bit vector of [8 x bfloat].
268 /// \param __B
269 /// A 128-bit vector of [8 x bfloat].
270 /// \param __D
271 /// A 128-bit vector of [4 x float].
272 /// \returns A 128-bit vector of [4 x float] comes from Dot Product of
273 /// __A, __B and __D
274 static __inline__ __m128 __DEFAULT_FN_ATTRS128
275 _mm_dpbf16_ps(__m128 __D, __m128bh __A, __m128bh __B) {
276 return (__m128)__builtin_ia32_dpbf16ps_128((__v4sf)__D,
277 (__v8bf)__A,
278 (__v8bf)__B);
281 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
283 /// \headerfile <x86intrin.h>
285 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
287 /// \param __A
288 /// A 128-bit vector of [8 x bfloat].
289 /// \param __B
290 /// A 128-bit vector of [8 x bfloat].
291 /// \param __D
292 /// A 128-bit vector of [4 x float].
293 /// \param __U
294 /// A 8-bit mask value specifying what is chosen for each element.
295 /// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
296 /// \returns A 128-bit vector of [4 x float] comes from Dot Product of
297 /// __A, __B and __D
298 static __inline__ __m128 __DEFAULT_FN_ATTRS128
299 _mm_mask_dpbf16_ps(__m128 __D, __mmask8 __U, __m128bh __A, __m128bh __B) {
300 return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
301 (__v4sf)_mm_dpbf16_ps(__D, __A, __B),
302 (__v4sf)__D);
305 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
307 /// \headerfile <x86intrin.h>
309 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
311 /// \param __A
312 /// A 128-bit vector of [8 x bfloat].
313 /// \param __B
314 /// A 128-bit vector of [8 x bfloat].
315 /// \param __D
316 /// A 128-bit vector of [4 x float].
317 /// \param __U
318 /// A 8-bit mask value specifying what is chosen for each element.
319 /// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
320 /// \returns A 128-bit vector of [4 x float] comes from Dot Product of
321 /// __A, __B and __D
322 static __inline__ __m128 __DEFAULT_FN_ATTRS128
323 _mm_maskz_dpbf16_ps(__mmask8 __U, __m128 __D, __m128bh __A, __m128bh __B) {
324 return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
325 (__v4sf)_mm_dpbf16_ps(__D, __A, __B),
326 (__v4sf)_mm_setzero_si128());
329 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
331 /// \headerfile <x86intrin.h>
333 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
335 /// \param __A
336 /// A 256-bit vector of [16 x bfloat].
337 /// \param __B
338 /// A 256-bit vector of [16 x bfloat].
339 /// \param __D
340 /// A 256-bit vector of [8 x float].
341 /// \returns A 256-bit vector of [8 x float] comes from Dot Product of
342 /// __A, __B and __D
343 static __inline__ __m256 __DEFAULT_FN_ATTRS256
344 _mm256_dpbf16_ps(__m256 __D, __m256bh __A, __m256bh __B) {
345 return (__m256)__builtin_ia32_dpbf16ps_256((__v8sf)__D,
346 (__v16bf)__A,
347 (__v16bf)__B);
350 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
352 /// \headerfile <x86intrin.h>
354 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
356 /// \param __A
357 /// A 256-bit vector of [16 x bfloat].
358 /// \param __B
359 /// A 256-bit vector of [16 x bfloat].
360 /// \param __D
361 /// A 256-bit vector of [8 x float].
362 /// \param __U
363 /// A 16-bit mask value specifying what is chosen for each element.
364 /// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
365 /// \returns A 256-bit vector of [8 x float] comes from Dot Product of
366 /// __A, __B and __D
367 static __inline__ __m256 __DEFAULT_FN_ATTRS256
368 _mm256_mask_dpbf16_ps(__m256 __D, __mmask8 __U, __m256bh __A, __m256bh __B) {
369 return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
370 (__v8sf)_mm256_dpbf16_ps(__D, __A, __B),
371 (__v8sf)__D);
374 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
376 /// \headerfile <x86intrin.h>
378 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
380 /// \param __A
381 /// A 256-bit vector of [16 x bfloat].
382 /// \param __B
383 /// A 256-bit vector of [16 x bfloat].
384 /// \param __D
385 /// A 256-bit vector of [8 x float].
386 /// \param __U
387 /// A 8-bit mask value specifying what is chosen for each element.
388 /// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
389 /// \returns A 256-bit vector of [8 x float] comes from Dot Product of
390 /// __A, __B and __D
391 static __inline__ __m256 __DEFAULT_FN_ATTRS256
392 _mm256_maskz_dpbf16_ps(__mmask8 __U, __m256 __D, __m256bh __A, __m256bh __B) {
393 return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
394 (__v8sf)_mm256_dpbf16_ps(__D, __A, __B),
395 (__v8sf)_mm256_setzero_si256());
398 /// Convert One Single float Data to One BF16 Data.
400 /// \headerfile <x86intrin.h>
402 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
404 /// \param __A
405 /// A float data.
406 /// \returns A bf16 data whose sign field and exponent field keep unchanged,
407 /// and fraction field is truncated to 7 bits.
408 static __inline__ __bf16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {
409 __v4sf __V = {__A, 0, 0, 0};
410 __v8bf __R = __builtin_ia32_cvtneps2bf16_128_mask(
411 (__v4sf)__V, (__v8bf)_mm_undefined_si128(), (__mmask8)-1);
412 return (__bf16)__R[0];
415 /// Convert Packed BF16 Data to Packed float Data.
417 /// \headerfile <x86intrin.h>
419 /// \param __A
420 /// A 128-bit vector of [4 x bfloat].
421 /// \returns A 128-bit vector of [4 x float] come from conversion of __A
422 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) {
423 return _mm_castsi128_ps(
424 (__m128i)_mm_slli_epi32((__m128i)_mm_cvtepi16_epi32((__m128i)__A), 16));
427 /// Convert Packed BF16 Data to Packed float Data.
429 /// \headerfile <x86intrin.h>
431 /// \param __A
432 /// A 128-bit vector of [8 x bfloat].
433 /// \returns A 256-bit vector of [8 x float] come from conversion of __A
434 static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) {
435 return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
436 (__m256i)_mm256_cvtepi16_epi32((__m128i)__A), 16));
439 /// Convert Packed BF16 Data to Packed float Data using zeroing mask.
441 /// \headerfile <x86intrin.h>
443 /// \param __U
444 /// A 4-bit mask. Elements are zeroed out when the corresponding mask
445 /// bit is not set.
446 /// \param __A
447 /// A 128-bit vector of [4 x bfloat].
448 /// \returns A 128-bit vector of [4 x float] come from conversion of __A
449 static __inline__ __m128 __DEFAULT_FN_ATTRS128
450 _mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
451 return _mm_castsi128_ps((__m128i)_mm_slli_epi32(
452 (__m128i)_mm_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
455 /// Convert Packed BF16 Data to Packed float Data using zeroing mask.
457 /// \headerfile <x86intrin.h>
459 /// \param __U
460 /// A 8-bit mask. Elements are zeroed out when the corresponding mask
461 /// bit is not set.
462 /// \param __A
463 /// A 128-bit vector of [8 x bfloat].
464 /// \returns A 256-bit vector of [8 x float] come from conversion of __A
465 static __inline__ __m256 __DEFAULT_FN_ATTRS256
466 _mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
467 return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
468 (__m256i)_mm256_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
471 /// Convert Packed BF16 Data to Packed float Data using merging mask.
473 /// \headerfile <x86intrin.h>
475 /// \param __S
476 /// A 128-bit vector of [4 x float]. Elements are copied from __S when
477 /// the corresponding mask bit is not set.
478 /// \param __U
479 /// A 4-bit mask. Elements are zeroed out when the corresponding mask
480 /// bit is not set.
481 /// \param __A
482 /// A 128-bit vector of [4 x bfloat].
483 /// \returns A 128-bit vector of [4 x float] come from conversion of __A
484 static __inline__ __m128 __DEFAULT_FN_ATTRS128
485 _mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) {
486 return _mm_castsi128_ps((__m128i)_mm_mask_slli_epi32(
487 (__m128i)__S, (__mmask8)__U, (__m128i)_mm_cvtepi16_epi32((__m128i)__A),
488 16));
491 /// Convert Packed BF16 Data to Packed float Data using merging mask.
493 /// \headerfile <x86intrin.h>
495 /// \param __S
496 /// A 256-bit vector of [8 x float]. Elements are copied from __S when
497 /// the corresponding mask bit is not set.
498 /// \param __U
499 /// A 8-bit mask. Elements are zeroed out when the corresponding mask
500 /// bit is not set.
501 /// \param __A
502 /// A 128-bit vector of [8 x bfloat].
503 /// \returns A 256-bit vector of [8 x float] come from conversion of __A
504 static __inline__ __m256 __DEFAULT_FN_ATTRS256
505 _mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) {
506 return _mm256_castsi256_ps((__m256i)_mm256_mask_slli_epi32(
507 (__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32((__m128i)__A),
508 16));
511 #undef __DEFAULT_FN_ATTRS128
512 #undef __DEFAULT_FN_ATTRS256
514 #endif
515 #endif