[DFAJumpThreading] Remove incoming StartBlock from all phis when unfolding select...
[llvm-project.git] / clang / lib / Headers / avxneconvertintrin.h
blob1bef1c89378795c86c8d40100bffccb2da913d90
1 /*===-------------- avxneconvertintrin.h - AVXNECONVERT --------------------===
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 *===-----------------------------------------------------------------------===
8 */
10 #ifndef __IMMINTRIN_H
11 #error \
12 "Never use <avxneconvertintrin.h> directly; include <immintrin.h> instead."
13 #endif // __IMMINTRIN_H
15 #ifdef __SSE2__
17 #ifndef __AVXNECONVERTINTRIN_H
18 #define __AVXNECONVERTINTRIN_H
/* Define the default attributes for the functions in this file.
 *
 * Both macros force inlining, suppress debug info for the wrapper, and enable
 * the "avxneconvert" target feature on a per-function basis; they differ only
 * in the minimum vector width they request (128 vs. 256 bits). They are
 * #undef'd again at the end of this header.
 */
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avxneconvert"),   \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avxneconvert"),   \
                 __min_vector_width__(256)))
28 /// Convert scalar BF16 (16-bit) floating-point element
29 /// stored at memory locations starting at location \a __A to a
30 /// single-precision (32-bit) floating-point, broadcast it to packed
31 /// single-precision (32-bit) floating-point elements, and store the results in
32 /// \a dst.
33 ///
34 /// \headerfile <x86intrin.h>
35 ///
36 /// \code
37 /// _mm_bcstnebf16_ps(const void *__A);
38 /// \endcode
39 ///
40 /// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction.
41 ///
42 /// \param __A
43 /// A pointer to a 16-bit memory location. The address of the memory
44 /// location does not have to be aligned.
45 /// \returns
46 /// A 128-bit vector of [4 x float].
47 ///
48 /// \code{.operation}
49 /// b := Convert_BF16_To_FP32(MEM[__A+15:__A])
50 /// FOR j := 0 to 3
51 /// m := j*32
52 /// dst[m+31:m] := b
53 /// ENDFOR
54 /// dst[MAX:128] := 0
55 /// \endcode
56 static __inline__ __m128 __DEFAULT_FN_ATTRS128
57 _mm_bcstnebf16_ps(const void *__A) {
58 return (__m128)__builtin_ia32_vbcstnebf162ps128((const __bf16 *)__A);
61 /// Convert scalar BF16 (16-bit) floating-point element
62 /// stored at memory locations starting at location \a __A to a
63 /// single-precision (32-bit) floating-point, broadcast it to packed
64 /// single-precision (32-bit) floating-point elements, and store the results in
65 /// \a dst.
66 ///
67 /// \headerfile <x86intrin.h>
68 ///
69 /// \code
70 /// _mm256_bcstnebf16_ps(const void *__A);
71 /// \endcode
72 ///
73 /// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction.
74 ///
75 /// \param __A
76 /// A pointer to a 16-bit memory location. The address of the memory
77 /// location does not have to be aligned.
78 /// \returns
79 /// A 256-bit vector of [8 x float].
80 ///
81 /// \code{.operation}
82 /// b := Convert_BF16_To_FP32(MEM[__A+15:__A])
83 /// FOR j := 0 to 7
84 /// m := j*32
85 /// dst[m+31:m] := b
86 /// ENDFOR
87 /// dst[MAX:256] := 0
88 /// \endcode
89 static __inline__ __m256 __DEFAULT_FN_ATTRS256
90 _mm256_bcstnebf16_ps(const void *__A) {
91 return (__m256)__builtin_ia32_vbcstnebf162ps256((const __bf16 *)__A);
94 /// Convert scalar half-precision (16-bit) floating-point element
95 /// stored at memory locations starting at location \a __A to a
96 /// single-precision (32-bit) floating-point, broadcast it to packed
97 /// single-precision (32-bit) floating-point elements, and store the results in
98 /// \a dst.
99 ///
100 /// \headerfile <x86intrin.h>
102 /// \code
103 /// _mm_bcstnesh_ps(const void *__A);
104 /// \endcode
106 /// This intrinsic corresponds to the \c VBCSTNESH2PS instruction.
108 /// \param __A
109 /// A pointer to a 16-bit memory location. The address of the memory
110 /// location does not have to be aligned.
111 /// \returns
112 /// A 128-bit vector of [4 x float].
114 /// \code{.operation}
115 /// b := Convert_FP16_To_FP32(MEM[__A+15:__A])
116 /// FOR j := 0 to 3
117 /// m := j*32
118 /// dst[m+31:m] := b
119 /// ENDFOR
120 /// dst[MAX:128] := 0
121 /// \endcode
122 static __inline__ __m128 __DEFAULT_FN_ATTRS128
123 _mm_bcstnesh_ps(const void *__A) {
124 return (__m128)__builtin_ia32_vbcstnesh2ps128((const _Float16 *)__A);
127 /// Convert scalar half-precision (16-bit) floating-point element
128 /// stored at memory locations starting at location \a __A to a
129 /// single-precision (32-bit) floating-point, broadcast it to packed
130 /// single-precision (32-bit) floating-point elements, and store the results in
131 /// \a dst.
133 /// \headerfile <x86intrin.h>
135 /// \code
136 /// _mm256_bcstnesh_ps(const void *__A);
137 /// \endcode
139 /// This intrinsic corresponds to the \c VBCSTNESH2PS instruction.
141 /// \param __A
142 /// A pointer to a 16-bit memory location. The address of the memory
143 /// location does not have to be aligned.
144 /// \returns
145 /// A 256-bit vector of [8 x float].
147 /// \code{.operation}
148 /// b := Convert_FP16_To_FP32(MEM[__A+15:__A])
149 /// FOR j := 0 to 7
150 /// m := j*32
151 /// dst[m+31:m] := b
152 /// ENDFOR
153 /// dst[MAX:256] := 0
154 /// \endcode
155 static __inline__ __m256 __DEFAULT_FN_ATTRS256
156 _mm256_bcstnesh_ps(const void *__A) {
157 return (__m256)__builtin_ia32_vbcstnesh2ps256((const _Float16 *)__A);
160 /// Convert packed BF16 (16-bit) floating-point even-indexed elements
161 /// stored at memory locations starting at location \a __A to packed
162 /// single-precision (32-bit) floating-point elements, and store the results in
163 /// \a dst.
165 /// \headerfile <x86intrin.h>
167 /// \code
168 /// _mm_cvtneebf16_ps(const __m128bh *__A);
169 /// \endcode
171 /// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction.
173 /// \param __A
174 /// A pointer to a 128-bit memory location containing 8 consecutive
175 /// BF16 (16-bit) floating-point values.
176 /// \returns
177 /// A 128-bit vector of [4 x float].
179 /// \code{.operation}
180 /// FOR j := 0 to 3
181 /// k := j*2
182 /// i := k*16
183 /// m := j*32
184 /// dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
185 /// ENDFOR
186 /// dst[MAX:128] := 0
187 /// \endcode
188 static __inline__ __m128 __DEFAULT_FN_ATTRS128
189 _mm_cvtneebf16_ps(const __m128bh *__A) {
190 return (__m128)__builtin_ia32_vcvtneebf162ps128((const __v8bf *)__A);
193 /// Convert packed BF16 (16-bit) floating-point even-indexed elements
194 /// stored at memory locations starting at location \a __A to packed
195 /// single-precision (32-bit) floating-point elements, and store the results in
196 /// \a dst.
198 /// \headerfile <x86intrin.h>
200 /// \code
201 /// _mm256_cvtneebf16_ps(const __m256bh *__A);
202 /// \endcode
204 /// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction.
206 /// \param __A
207 /// A pointer to a 256-bit memory location containing 16 consecutive
208 /// BF16 (16-bit) floating-point values.
209 /// \returns
210 /// A 256-bit vector of [8 x float].
212 /// \code{.operation}
213 /// FOR j := 0 to 7
214 /// k := j*2
215 /// i := k*16
216 /// m := j*32
217 /// dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
218 /// ENDFOR
219 /// dst[MAX:256] := 0
220 /// \endcode
221 static __inline__ __m256 __DEFAULT_FN_ATTRS256
222 _mm256_cvtneebf16_ps(const __m256bh *__A) {
223 return (__m256)__builtin_ia32_vcvtneebf162ps256((const __v16bf *)__A);
226 /// Convert packed half-precision (16-bit) floating-point even-indexed elements
227 /// stored at memory locations starting at location \a __A to packed
228 /// single-precision (32-bit) floating-point elements, and store the results in
229 /// \a dst.
231 /// \headerfile <x86intrin.h>
233 /// \code
234 /// _mm_cvtneeph_ps(const __m128h *__A);
235 /// \endcode
237 /// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction.
239 /// \param __A
240 /// A pointer to a 128-bit memory location containing 8 consecutive
241 /// half-precision (16-bit) floating-point values.
242 /// \returns
243 /// A 128-bit vector of [4 x float].
245 /// \code{.operation}
246 /// FOR j := 0 to 3
247 /// k := j*2
248 /// i := k*16
249 /// m := j*32
250 /// dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
251 /// ENDFOR
252 /// dst[MAX:128] := 0
253 /// \endcode
254 static __inline__ __m128 __DEFAULT_FN_ATTRS128
255 _mm_cvtneeph_ps(const __m128h *__A) {
256 return (__m128)__builtin_ia32_vcvtneeph2ps128((const __v8hf *)__A);
259 /// Convert packed half-precision (16-bit) floating-point even-indexed elements
260 /// stored at memory locations starting at location \a __A to packed
261 /// single-precision (32-bit) floating-point elements, and store the results in
262 /// \a dst.
264 /// \headerfile <x86intrin.h>
266 /// \code
267 /// _mm256_cvtneeph_ps(const __m256h *__A);
268 /// \endcode
270 /// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction.
272 /// \param __A
273 /// A pointer to a 256-bit memory location containing 16 consecutive
274 /// half-precision (16-bit) floating-point values.
275 /// \returns
276 /// A 256-bit vector of [8 x float].
278 /// \code{.operation}
279 /// FOR j := 0 to 7
280 /// k := j*2
281 /// i := k*16
282 /// m := j*32
283 /// dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
284 /// ENDFOR
285 /// dst[MAX:256] := 0
286 /// \endcode
287 static __inline__ __m256 __DEFAULT_FN_ATTRS256
288 _mm256_cvtneeph_ps(const __m256h *__A) {
289 return (__m256)__builtin_ia32_vcvtneeph2ps256((const __v16hf *)__A);
292 /// Convert packed BF16 (16-bit) floating-point odd-indexed elements
293 /// stored at memory locations starting at location \a __A to packed
294 /// single-precision (32-bit) floating-point elements, and store the results in
295 /// \a dst.
297 /// \headerfile <x86intrin.h>
299 /// \code
300 /// _mm_cvtneobf16_ps(const __m128bh *__A);
301 /// \endcode
303 /// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction.
305 /// \param __A
306 /// A pointer to a 128-bit memory location containing 8 consecutive
307 /// BF16 (16-bit) floating-point values.
308 /// \returns
309 /// A 128-bit vector of [4 x float].
311 /// \code{.operation}
312 /// FOR j := 0 to 3
313 /// k := j*2+1
314 /// i := k*16
315 /// m := j*32
316 /// dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
317 /// ENDFOR
318 /// dst[MAX:128] := 0
319 /// \endcode
320 static __inline__ __m128 __DEFAULT_FN_ATTRS128
321 _mm_cvtneobf16_ps(const __m128bh *__A) {
322 return (__m128)__builtin_ia32_vcvtneobf162ps128((const __v8bf *)__A);
325 /// Convert packed BF16 (16-bit) floating-point odd-indexed elements
326 /// stored at memory locations starting at location \a __A to packed
327 /// single-precision (32-bit) floating-point elements, and store the results in
328 /// \a dst.
330 /// \headerfile <x86intrin.h>
332 /// \code
333 /// _mm256_cvtneobf16_ps(const __m256bh *__A);
334 /// \endcode
336 /// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction.
338 /// \param __A
339 /// A pointer to a 256-bit memory location containing 16 consecutive
340 /// BF16 (16-bit) floating-point values.
341 /// \returns
342 /// A 256-bit vector of [8 x float].
344 /// \code{.operation}
345 /// FOR j := 0 to 7
346 /// k := j*2+1
347 /// i := k*16
348 /// m := j*32
349 /// dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
350 /// ENDFOR
351 /// dst[MAX:256] := 0
352 /// \endcode
353 static __inline__ __m256 __DEFAULT_FN_ATTRS256
354 _mm256_cvtneobf16_ps(const __m256bh *__A) {
355 return (__m256)__builtin_ia32_vcvtneobf162ps256((const __v16bf *)__A);
358 /// Convert packed half-precision (16-bit) floating-point odd-indexed elements
359 /// stored at memory locations starting at location \a __A to packed
360 /// single-precision (32-bit) floating-point elements, and store the results in
361 /// \a dst.
363 /// \headerfile <x86intrin.h>
365 /// \code
366 /// _mm_cvtneoph_ps(const __m128h *__A);
367 /// \endcode
369 /// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction.
371 /// \param __A
372 /// A pointer to a 128-bit memory location containing 8 consecutive
373 /// half-precision (16-bit) floating-point values.
374 /// \returns
375 /// A 128-bit vector of [4 x float].
377 /// \code{.operation}
378 /// FOR j := 0 to 3
379 /// k := j*2+1
380 /// i := k*16
381 /// m := j*32
382 /// dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
383 /// ENDFOR
384 /// dst[MAX:128] := 0
385 /// \endcode
386 static __inline__ __m128 __DEFAULT_FN_ATTRS128
387 _mm_cvtneoph_ps(const __m128h *__A) {
388 return (__m128)__builtin_ia32_vcvtneoph2ps128((const __v8hf *)__A);
391 /// Convert packed half-precision (16-bit) floating-point odd-indexed elements
392 /// stored at memory locations starting at location \a __A to packed
393 /// single-precision (32-bit) floating-point elements, and store the results in
394 /// \a dst.
396 /// \headerfile <x86intrin.h>
398 /// \code
399 /// _mm256_cvtneoph_ps(const __m256h *__A);
400 /// \endcode
402 /// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction.
404 /// \param __A
405 /// A pointer to a 256-bit memory location containing 16 consecutive
406 /// half-precision (16-bit) floating-point values.
407 /// \returns
408 /// A 256-bit vector of [8 x float].
410 /// \code{.operation}
411 /// FOR j := 0 to 7
412 /// k := j*2+1
413 /// i := k*16
414 /// m := j*32
415 /// dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
416 /// ENDFOR
417 /// dst[MAX:256] := 0
418 /// \endcode
419 static __inline__ __m256 __DEFAULT_FN_ATTRS256
420 _mm256_cvtneoph_ps(const __m256h *__A) {
421 return (__m256)__builtin_ia32_vcvtneoph2ps256((const __v16hf *)__A);
424 /// Convert packed single-precision (32-bit) floating-point elements in \a __A
425 /// to packed BF16 (16-bit) floating-point elements, and store the results in \a
426 /// dst.
428 /// \headerfile <x86intrin.h>
430 /// \code
431 /// _mm_cvtneps_avx_pbh(__m128 __A);
432 /// \endcode
434 /// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction.
436 /// \param __A
437 /// A 128-bit vector of [4 x float].
438 /// \returns
439 /// A 128-bit vector of [8 x bfloat].
441 /// \code{.operation}
442 /// FOR j := 0 to 3
443 /// dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j])
444 /// ENDFOR
445 /// dst[MAX:128] := 0
446 /// \endcode
447 static __inline__ __m128bh __DEFAULT_FN_ATTRS128
448 _mm_cvtneps_avx_pbh(__m128 __A) {
449 return (__m128bh)__builtin_ia32_vcvtneps2bf16128((__v4sf)__A);
452 /// Convert packed single-precision (32-bit) floating-point elements in \a __A
453 /// to packed BF16 (16-bit) floating-point elements, and store the results in \a
454 /// dst.
456 /// \headerfile <x86intrin.h>
458 /// \code
459 /// _mm256_cvtneps_avx_pbh(__m256 __A);
460 /// \endcode
462 /// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction.
464 /// \param __A
465 /// A 256-bit vector of [8 x float].
466 /// \returns
467 /// A 128-bit vector of [8 x bfloat].
469 /// \code{.operation}
470 /// FOR j := 0 to 7
471 /// dst.word[j] := Convert_FP32_To_BF16(a.fp32[j])
472 /// ENDFOR
473 /// dst[MAX:128] := 0
474 /// \endcode
475 static __inline__ __m128bh __DEFAULT_FN_ATTRS256
476 _mm256_cvtneps_avx_pbh(__m256 __A) {
477 return (__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)__A);
480 #undef __DEFAULT_FN_ATTRS128
481 #undef __DEFAULT_FN_ATTRS256
483 #endif // __AVXNECONVERTINTRIN_H
484 #endif // __SSE2__