/*===----------- avx512fp16intrin.h - AVX512-FP16 intrinsics ---------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <avx512fp16intrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVX512FP16INTRIN_H
#define __AVX512FP16INTRIN_H

/* Define the 512-bit vector types for _Float16, plus an unaligned variant. */
typedef _Float16 __v32hf __attribute__((__vector_size__(64), __aligned__(64)));
typedef _Float16 __m512h __attribute__((__vector_size__(64), __aligned__(64)));
typedef _Float16 __m512h_u __attribute__((__vector_size__(64), __aligned__(1)));

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS512 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \
                 __min_vector_width__(512)))
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \
                 __min_vector_width__(128)))

static __inline__ _Float16 __DEFAULT_FN_ATTRS512 _mm512_cvtsh_h(__m512h __a) {
  return __a[0];
}

static __inline __m128h __DEFAULT_FN_ATTRS128 _mm_setzero_ph(void) {
  return (__m128h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
}

static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_setzero_ph(void) {
  return (__m256h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_undefined_ph(void) {
  return (__m256h)__builtin_ia32_undef256();
}

static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_setzero_ph(void) {
  return (__m512h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_undefined_ph(void) {
  return (__m128h)__builtin_ia32_undef128();
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_undefined_ph(void) {
  return (__m512h)__builtin_ia32_undef512();
}

static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_set1_ph(_Float16 __h) {
  return (__m512h)(__v32hf){__h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h};
}

static __inline __m512h __DEFAULT_FN_ATTRS512
_mm512_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
              _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8,
              _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12,
              _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16,
              _Float16 __h17, _Float16 __h18, _Float16 __h19, _Float16 __h20,
              _Float16 __h21, _Float16 __h22, _Float16 __h23, _Float16 __h24,
              _Float16 __h25, _Float16 __h26, _Float16 __h27, _Float16 __h28,
              _Float16 __h29, _Float16 __h30, _Float16 __h31, _Float16 __h32) {
  return (__m512h)(__v32hf){__h32, __h31, __h30, __h29, __h28, __h27, __h26,
                            __h25, __h24, __h23, __h22, __h21, __h20, __h19,
                            __h18, __h17, __h16, __h15, __h14, __h13, __h12,
                            __h11, __h10, __h9,  __h8,  __h7,  __h6,  __h5,
                            __h4,  __h3,  __h2,  __h1};
}

#define _mm512_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, h13, \
                       h14, h15, h16, h17, h18, h19, h20, h21, h22, h23, h24, \
                       h25, h26, h27, h28, h29, h30, h31, h32) \
  _mm512_set_ph((h32), (h31), (h30), (h29), (h28), (h27), (h26), (h25), (h24), \
                (h23), (h22), (h21), (h20), (h19), (h18), (h17), (h16), (h15), \
                (h14), (h13), (h12), (h11), (h10), (h9), (h8), (h7), (h6), \
                (h5), (h4), (h3), (h2), (h1))
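
/* Usage sketch (illustrative, not part of this header): _mm512_set_ph takes
 * its 32 arguments from the highest element index down to element 0, while
 * _mm512_setr_ph takes them in memory order. So after
 *
 *   __m512h __v = _mm512_setr_ph((_Float16)1.0, (_Float16)2.0, ...);
 *
 * element 0 of __v is 1.0 and element 1 is 2.0, whereas with _mm512_set_ph
 * the first argument would land in element 31. */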

static __inline __m512h __DEFAULT_FN_ATTRS512
_mm512_set1_pch(_Float16 _Complex h) {
  return (__m512h)_mm512_set1_ps(__builtin_bit_cast(float, h));
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_castph_ps(__m128h __a) {
  return (__m128)__a;
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_castph_ps(__m256h __a) {
  return (__m256)__a;
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_castph_ps(__m512h __a) {
  return (__m512)__a;
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_castph_pd(__m128h __a) {
  return (__m128d)__a;
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_castph_pd(__m256h __a) {
  return (__m256d)__a;
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_castph_pd(__m512h __a) {
  return (__m512d)__a;
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_castph_si128(__m128h __a) {
  return (__m128i)__a;
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_castph_si256(__m256h __a) {
  return (__m256i)__a;
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_castph_si512(__m512h __a) {
  return (__m512i)__a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castps_ph(__m128 __a) {
  return (__m128h)__a;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castps_ph(__m256 __a) {
  return (__m256h)__a;
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castps_ph(__m512 __a) {
  return (__m512h)__a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castpd_ph(__m128d __a) {
  return (__m128h)__a;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castpd_ph(__m256d __a) {
  return (__m256h)__a;
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castpd_ph(__m512d __a) {
  return (__m512h)__a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castsi128_ph(__m128i __a) {
  return (__m128h)__a;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_castsi256_ph(__m256i __a) {
  return (__m256h)__a;
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castsi512_ph(__m512i __a) {
  return (__m512h)__a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_castph256_ph128(__m256h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_castph512_ph128(__m512h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_castph512_ph256(__m512h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                                 11, 12, 13, 14, 15);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_castph128_ph256(__m128h __a) {
  return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a),
                                 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                                 14, 15);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castph128_ph512(__m128h __a) {
  __m256h __b = __builtin_nondeterministic_value(__b);
  return __builtin_shufflevector(
      __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a),
                              0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                              15),
      __b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
      20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castph256_ph512(__m256h __a) {
  return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a), 0,
                                 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                                 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
                                 27, 28, 29, 30, 31);
}
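
/* Note: the widening casts above (_mm256_castph128_ph256,
 * _mm512_castph128_ph512, _mm512_castph256_ph512) leave the upper elements
 * undefined, which is what __builtin_nondeterministic_value expresses; the
 * _mm*_zextph*_ph* intrinsics below are the variants that zero the upper
 * elements instead. */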

/// Constructs a 256-bit floating-point vector of [16 x half] from a
///    128-bit floating-point vector of [8 x half]. The lower 128 bits
///    contain the value of the source vector. The upper 128 bits are set
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 128-bit vector of [8 x half].
/// \returns A 256-bit floating-point vector of [16 x half]. The lower 128 bits
///    contain the value of the parameter. The upper 128 bits are set to zero.
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_zextph128_ph256(__m128h __a) {
  return __builtin_shufflevector(__a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4,
                                 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
}

/// Constructs a 512-bit floating-point vector of [32 x half] from a
///    128-bit floating-point vector of [8 x half]. The lower 128 bits
///    contain the value of the source vector. The upper 384 bits are set
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 128-bit vector of [8 x half].
/// \returns A 512-bit floating-point vector of [32 x half]. The lower 128 bits
///    contain the value of the parameter. The upper 384 bits are set to zero.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_zextph128_ph512(__m128h __a) {
  return __builtin_shufflevector(
      __a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
      13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);
}

/// Constructs a 512-bit floating-point vector of [32 x half] from a
///    256-bit floating-point vector of [16 x half]. The lower 256 bits
///    contain the value of the source vector. The upper 256 bits are set
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 256-bit vector of [16 x half].
/// \returns A 512-bit floating-point vector of [32 x half]. The lower 256 bits
///    contain the value of the parameter. The upper 256 bits are set to zero.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_zextph256_ph512(__m256h __a) {
  return __builtin_shufflevector(__a, (__v16hf)_mm256_setzero_ph(), 0, 1, 2, 3,
                                 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                                 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
                                 29, 30, 31);
}

#define _mm_comi_round_sh(A, B, P, R) \
  __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, (int)(P), (int)(R))

#define _mm_comi_sh(A, B, pred) \
  _mm_comi_round_sh((A), (B), (pred), _MM_FROUND_CUR_DIRECTION)
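
/* Usage sketch (illustrative, not part of this header): an ordered scalar
 * comparison of the low elements, using _mm_load_sh (defined later in this
 * file):
 *
 *   _Float16 __x = (_Float16)1.0, __y = (_Float16)2.0;
 *   int __lt = _mm_comi_sh(_mm_load_sh(&__x), _mm_load_sh(&__y), _CMP_LT_OS);
 *   // __lt == 1 because 1.0 < 2.0
 */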

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comieq_sh(__m128h A,
                                                          __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_EQ_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comilt_sh(__m128h A,
                                                          __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LT_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comile_sh(__m128h A,
                                                          __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LE_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comigt_sh(__m128h A,
                                                          __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GT_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comige_sh(__m128h A,
                                                          __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GE_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comineq_sh(__m128h A,
                                                           __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_NEQ_US,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomieq_sh(__m128h A,
                                                           __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_EQ_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomilt_sh(__m128h A,
                                                           __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LT_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomile_sh(__m128h A,
                                                           __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LE_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomigt_sh(__m128h A,
                                                           __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GT_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomige_sh(__m128h A,
                                                           __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GE_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomineq_sh(__m128h A,
                                                            __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_NEQ_UQ,
                                _MM_FROUND_CUR_DIRECTION);
}
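
/* The _mm_comi*_sh intrinsics above use signaling predicates (_CMP_*_OS,
 * _CMP_NEQ_US): any NaN operand raises an invalid-operation exception. The
 * _mm_ucomi*_sh variants use the quiet predicates (_CMP_*_OQ, _CMP_NEQ_UQ),
 * which return the unordered result without signaling on quiet NaNs. */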

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_add_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)((__v32hf)__A + (__v32hf)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_add_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_add_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_add_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_add_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_add_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_addph512((__v32hf)(__m512h)(A), \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_add_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_add_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \
      (__v32hf)_mm512_setzero_ph()))
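
/* Usage sketch (illustrative, not part of this header): a masked add with an
 * explicit rounding mode; lanes whose mask bit is clear keep the value of
 * __src instead of the sum.
 *
 *   __m512h __sum = _mm512_mask_add_round_ph(
 *       __src, (__mmask32)0x0000FFFF, __a, __b,
 *       _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 */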

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sub_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)((__v32hf)__A - (__v32hf)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_sub_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_sub_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_sub_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_sub_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_sub_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_subph512((__v32hf)(__m512h)(A), \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_sub_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_sub_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mul_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)((__v32hf)__A * (__v32hf)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_mul_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_mul_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_mul_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_mul_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_mul_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_mulph512((__v32hf)(__m512h)(A), \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_mul_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_mul_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_div_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)((__v32hf)__A / (__v32hf)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_div_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_div_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_div_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_div_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_div_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_divph512((__v32hf)(__m512h)(A), \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_div_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_div_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_min_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)__builtin_ia32_minph512((__v32hf)__A, (__v32hf)__B,
                                          _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_min_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_min_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_min_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_min_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_min_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_minph512((__v32hf)(__m512h)(A), \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_min_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_min_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_max_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)__builtin_ia32_maxph512((__v32hf)__A, (__v32hf)__B,
                                          _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_max_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_max_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_max_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_max_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_max_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_maxph512((__v32hf)(__m512h)(A), \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_max_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_max_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_abs_ph(__m512h __A) {
  return (__m512h)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF), (__m512i)__A);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_conj_pch(__m512h __A) {
  return (__m512h)_mm512_xor_ps((__m512)__A, _mm512_set1_ps(-0.0f));
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_conj_pch(__m512h __W, __mmask16 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectps_512(
      (__mmask16)__U, (__v16sf)_mm512_conj_pch(__A), (__v16sf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_conj_pch(__mmask16 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectps_512((__mmask16)__U,
                                              (__v16sf)_mm512_conj_pch(__A),
                                              (__v16sf)_mm512_setzero_ps());
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_sh(__m128h __A,
                                                           __m128h __B) {
  __A[0] += __B[0];
  return __A;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  __A = _mm_add_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  __A = _mm_add_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

#define _mm_add_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_addsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_add_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_addsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_add_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_addsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_sh(__m128h __A,
                                                           __m128h __B) {
  __A[0] -= __B[0];
  return __A;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  __A = _mm_sub_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  __A = _mm_sub_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

#define _mm_sub_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_subsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_sub_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_subsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_sub_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_subsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_sh(__m128h __A,
                                                           __m128h __B) {
  __A[0] *= __B[0];
  return __A;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  __A = _mm_mul_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  __A = _mm_mul_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

#define _mm_mul_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_mulsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_mul_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_mulsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_mul_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_mulsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_sh(__m128h __A,
                                                           __m128h __B) {
  __A[0] /= __B[0];
  return __A;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  __A = _mm_div_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  __A = _mm_div_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

#define _mm_div_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_divsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_div_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_divsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_div_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_divsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_sh(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)__builtin_ia32_minsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_minsh_round_mask((__v8hf)__A, (__v8hf)__B,
                                                  (__v8hf)__W, (__mmask8)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_minsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_min_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_minsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_min_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_minsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_min_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_minsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_sh(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)__builtin_ia32_maxsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_maxsh_round_mask((__v8hf)__A, (__v8hf)__B,
                                                  (__v8hf)__W, (__mmask8)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_maxsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_max_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_maxsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_max_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_maxsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_max_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_maxsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

#define _mm512_cmp_round_ph_mask(A, B, P, R) \
  ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A), \
                                           (__v32hf)(__m512h)(B), (int)(P), \
                                           (__mmask32)-1, (int)(R)))

#define _mm512_mask_cmp_round_ph_mask(U, A, B, P, R) \
  ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A), \
                                           (__v32hf)(__m512h)(B), (int)(P), \
                                           (__mmask32)(U), (int)(R)))

#define _mm512_cmp_ph_mask(A, B, P) \
  _mm512_cmp_round_ph_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_cmp_ph_mask(U, A, B, P) \
  _mm512_mask_cmp_round_ph_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)

#define _mm_cmp_round_sh_mask(X, Y, P, R) \
  ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X), \
                                       (__v8hf)(__m128h)(Y), (int)(P), \
                                       (__mmask8)-1, (int)(R)))

#define _mm_mask_cmp_round_sh_mask(M, X, Y, P, R) \
  ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X), \
                                       (__v8hf)(__m128h)(Y), (int)(P), \
                                       (__mmask8)(M), (int)(R)))

#define _mm_cmp_sh_mask(X, Y, P) \
  ((__mmask8)__builtin_ia32_cmpsh_mask( \
      (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_cmp_sh_mask(M, X, Y, P) \
  ((__mmask8)__builtin_ia32_cmpsh_mask( \
      (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)(M), \
      _MM_FROUND_CUR_DIRECTION))
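
/* Usage sketch (illustrative, not part of this header): a lane-wise compare
 * producing a mask, then using the mask to blend. Adding a zero vector under
 * the mask selects __a where the predicate held, __b elsewhere:
 *
 *   __mmask32 __lt = _mm512_cmp_ph_mask(__a, __b, _CMP_LT_OS);
 *   __m512h __min = _mm512_mask_add_ph(__b, __lt, __a, _mm512_setzero_ph());
 */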

// loads with vmovsh:
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_sh(void const *__dp) {
  struct __mm_load_sh_struct {
    _Float16 __u;
  } __attribute__((__packed__, __may_alias__));
  _Float16 __u = ((const struct __mm_load_sh_struct *)__dp)->__u;
  return (__m128h){__u, 0, 0, 0, 0, 0, 0, 0};
}
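
/* The packed, may_alias struct above lets the low _Float16 be read from an
 * address with no alignment or aliasing assumptions; the loadu/storeu
 * intrinsics below reuse the same idiom for whole vectors. */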

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_load_sh(__m128h __W, __mmask8 __U, const void *__A) {
  __m128h src = (__v8hf)__builtin_shufflevector(
      (__v8hf)__W, (__v8hf)_mm_setzero_ph(), 0, 8, 8, 8, 8, 8, 8, 8);

  return (__m128h)__builtin_ia32_loadsh128_mask((const __v8hf *)__A, src,
                                                __U & 1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_load_sh(__mmask8 __U, const void *__A) {
  return (__m128h)__builtin_ia32_loadsh128_mask(
      (const __v8hf *)__A, (__v8hf)_mm_setzero_ph(), __U & 1);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_load_ph(void const *__p) {
  return *(const __m512h *)__p;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_load_ph(void const *__p) {
  return *(const __m256h *)__p;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_ph(void const *__p) {
  return *(const __m128h *)__p;
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_loadu_ph(void const *__p) {
  struct __loadu_ph {
    __m512h_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ph *)__p)->__v;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_loadu_ph(void const *__p) {
  struct __loadu_ph {
    __m256h_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ph *)__p)->__v;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_loadu_ph(void const *__p) {
  struct __loadu_ph {
    __m128h_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ph *)__p)->__v;
}
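
/* Usage sketch (illustrative, not part of this header): _mm512_load_ph
 * requires a 64-byte-aligned address, while _mm512_loadu_ph accepts any
 * alignment at a potential performance cost.
 *
 *   _Float16 __buf[40] __attribute__((aligned(64)));
 *   __m512h __v = _mm512_load_ph(__buf);       // aligned load
 *   __m512h __w = _mm512_loadu_ph(__buf + 1);  // unaligned load
 */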

// stores with vmovsh:
static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_sh(void *__dp,
                                                          __m128h __a) {
  struct __mm_store_sh_struct {
    _Float16 __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store_sh_struct *)__dp)->__u = __a[0];
}

static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_store_sh(void *__W,
                                                               __mmask8 __U,
                                                               __m128h __A) {
  __builtin_ia32_storesh128_mask((__v8hf *)__W, __A, __U & 1);
}

static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_store_ph(void *__P,
                                                             __m512h __A) {
  *(__m512h *)__P = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_store_ph(void *__P,
                                                             __m256h __A) {
  *(__m256h *)__P = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_ph(void *__P,
                                                          __m128h __A) {
  *(__m128h *)__P = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_storeu_ph(void *__P,
                                                              __m512h __A) {
  struct __storeu_ph {
    __m512h_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ph *)__P)->__v = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_storeu_ph(void *__P,
                                                              __m256h __A) {
  struct __storeu_ph {
    __m256h_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ph *)__P)->__v = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS128 _mm_storeu_ph(void *__P,
                                                           __m128h __A) {
  struct __storeu_ph {
    __m128h_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ph *)__P)->__v = __A;
}

// moves with vmovsh:
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_move_sh(__m128h __a,
                                                            __m128h __b) {
  __a[0] = __b[0];
  return __a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_move_sh(__m128h __W,
                                                                 __mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_move_sh(__mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128h __B) {
  return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B),
                                     _mm_setzero_ph());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsi16_si128(short __a) {
  return (__m128i)(__v8hi){__a, 0, 0, 0, 0, 0, 0, 0};
}

static __inline__ short __DEFAULT_FN_ATTRS128 _mm_cvtsi128_si16(__m128i __a) {
  __v8hi __b = (__v8hi)__a;
  return __b[0];
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rcp_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_rcpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_rcp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rcpph512_mask((__v32hf)__A, (__v32hf)__W,
                                               (__mmask32)__U);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_rcp_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rcpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rsqrt_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_rsqrtph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_rsqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rsqrtph512_mask((__v32hf)__A, (__v32hf)__W,
                                                 (__mmask32)__U);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_rsqrt_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rsqrtph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
}

#define _mm512_getmant_ph(A, B, C) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_getmant_ph(W, U, A, B, C) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W), \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_getmant_ph(U, A, B, C) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_getmant_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))

#define _mm512_mask_getmant_round_ph(W, U, A, B, C, R) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W), \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_getmant_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
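
/* In the getmant macros above, the immediate packs the two enum operands:
 * bits [1:0] carry the normalization interval B (_MM_MANT_NORM_*) and bits
 * [3:2] the sign control C (_MM_MANT_SIGN_*), hence ((C) << 2) | (B).
 * Illustrative call:
 *
 *   __m512h __m = _mm512_getmant_ph(__a, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src);
 */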

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_getexp_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_getexpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_getexp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_getexpph512_mask(
      (__v32hf)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_getexp_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_getexpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_getexp_round_ph(A, R) \
  ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \
                                            (__v32hf)_mm512_undefined_ph(), \
                                            (__mmask32)-1, (int)(R)))

#define _mm512_mask_getexp_round_ph(W, U, A, R) \
  ((__m512h)__builtin_ia32_getexpph512_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(W), (__mmask32)(U), (int)(R)))

#define _mm512_maskz_getexp_round_ph(U, A, R) \
  ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \
                                            (__v32hf)_mm512_setzero_ph(), \
                                            (__mmask32)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_scalef_ph(__m512h __A,
                                                                 __m512h __B) {
  return (__m512h)__builtin_ia32_scalefph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_scalef_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_scalefph512_mask((__v32hf)__A, (__v32hf)__B,
                                                  (__v32hf)__W, (__mmask32)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_scalef_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_scalefph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_scalef_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_scalefph512_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))

#define _mm512_mask_scalef_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_scalefph512_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(W), \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_scalef_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_scalefph512_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))

#define _mm512_roundscale_ph(A, B) \
  ((__m512h)__builtin_ia32_rndscaleph_mask( \
      (__v32hf)(__m512h)(A), (int)(B), (__v32hf)(__m512h)(A), (__mmask32)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_roundscale_ph(A, B, C, imm) \
  ((__m512h)__builtin_ia32_rndscaleph_mask( \
      (__v32hf)(__m512h)(C), (int)(imm), (__v32hf)(__m512h)(A), \
      (__mmask32)(B), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_roundscale_ph(A, B, imm) \
  ((__m512h)__builtin_ia32_rndscaleph_mask( \
      (__v32hf)(__m512h)(B), (int)(imm), (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(A), _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_roundscale_round_ph(A, B, C, imm, R) \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(C), (int)(imm), \
                                           (__v32hf)(__m512h)(A), \
                                           (__mmask32)(B), (int)(R)))

#define _mm512_maskz_roundscale_round_ph(A, B, imm, R) \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(B), (int)(imm), \
                                           (__v32hf)_mm512_setzero_ph(), \
                                           (__mmask32)(A), (int)(R)))

#define _mm512_roundscale_round_ph(A, imm, R) \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                           (__v32hf)_mm512_undefined_ph(), \
                                           (__mmask32)-1, (int)(R)))
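
/* Note on the roundscale immediates above (VRNDSCALEPH encoding): bits [7:4]
 * give the number of fraction bits M to keep, so each lane is rounded to a
 * multiple of 2^-M; bits [1:0] select the rounding mode (when bit 2 is
 * clear), and bit 3 suppresses the precision exception. An immediate of 0
 * therefore rounds to the nearest integer. */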

#define _mm512_reduce_ph(A, imm) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_reduce_ph(W, U, A, imm) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)(__m512h)(W), \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_reduce_ph(U, A, imm) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_reduce_round_ph(W, U, A, imm, R) \
  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                            (__v32hf)(__m512h)(W), \
                                            (__mmask32)(U), (int)(R)))

#define _mm512_maskz_reduce_round_ph(U, A, imm, R) \
  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                            (__v32hf)_mm512_setzero_ph(), \
                                            (__mmask32)(U), (int)(R)))

#define _mm512_reduce_round_ph(A, imm, R) \
  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                            (__v32hf)_mm512_undefined_ph(), \
                                            (__mmask32)-1, (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_sh(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)__builtin_ia32_rcpsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_rcpsh_mask((__v8hf)__A, (__v8hf)__B,
                                            (__v8hf)__W, (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_rcpsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_sh(__m128h __A,
                                                             __m128h __B) {
  return (__m128h)__builtin_ia32_rsqrtsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_sh(__m128h __W,
                                                                  __mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128h __B) {
  return (__m128h)__builtin_ia32_rsqrtsh_mask((__v8hf)__A, (__v8hf)__B,
                                              (__v8hf)__W, (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_rsqrt_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_rsqrtsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

#define _mm_getmant_round_sh(A, B, C, D, R) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
      (__v8hf)_mm_setzero_ph(), (__mmask8)-1, (int)(R)))

#define _mm_getmant_sh(A, B, C, D) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
      (__v8hf)_mm_setzero_ph(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_sh(W, U, A, B, C, D) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
      (__v8hf)(__m128h)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_round_sh(W, U, A, B, C, D, R) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
      (__v8hf)(__m128h)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_getmant_sh(U, A, B, C, D) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
      (__v8hf)_mm_setzero_ph(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_getmant_round_sh(U, A, B, C, D, R) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
      (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

#define _mm_getexp_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_sh(__m128h __A,
                                                              __m128h __B) {
  return (__m128h)__builtin_ia32_getexpsh128_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_getexp_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_getexpsh128_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)__W, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask_getexp_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_getexp_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_getexpsh128_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_getexp_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

#define _mm_scalef_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_scalefsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_sh(__m128h __A,
                                                              __m128h __B) {
  return (__m128h)__builtin_ia32_scalefsh_round_mask(
      (__v8hf)__A, (__v8hf)(__B), (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_scalef_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_scalefsh_round_mask((__v8hf)__A, (__v8hf)__B,
                                                     (__v8hf)__W, (__mmask8)__U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask_scalef_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_scalefsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_scalef_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_scalefsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_scalef_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_scalefsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

#define _mm_roundscale_round_sh(A, B, imm, R) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(imm), (int)(R)))

#define _mm_roundscale_sh(A, B, imm) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(imm), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_sh(W, U, A, B, I) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_round_sh(W, U, A, B, I, R) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(I), (int)(R)))

#define _mm_maskz_roundscale_sh(U, A, B, I) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_roundscale_round_sh(U, A, B, I, R) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(I), (int)(R)))

#define _mm_reduce_sh(A, B, C) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(C), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_reduce_sh(W, U, A, B, C) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_reduce_sh(U, A, B, C) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))

#define _mm_reduce_round_sh(A, B, C, R) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(C), (int)(R)))

#define _mm_mask_reduce_round_sh(W, U, A, B, C, R) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(C), (int)(R)))

#define _mm_maskz_reduce_round_sh(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(C), (int)(R)))

#define _mm512_sqrt_round_ph(A, R) \
  ((__m512h)__builtin_ia32_sqrtph512((__v32hf)(__m512h)(A), (int)(R)))

#define _mm512_mask_sqrt_round_ph(W, U, A, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_sqrt_round_ph(U, A, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sqrt_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_sqrtph512((__v32hf)__A,
                                           _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_sqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)(__U),
      (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)),
      (__v32hf)(__m512h)(__W));
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_sqrt_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)(__U),
      (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)),
      (__v32hf)_mm512_setzero_ph());
}

#define _mm_sqrt_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_sqrt_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_sqrt_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_sh(__m128h __A,
                                                            __m128h __B) {
  return (__m128h)__builtin_ia32_sqrtsh_round_mask(
      (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
      (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_sh(__m128h __W,
                                                                 __mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_sqrtsh_round_mask(
      (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)(__m128h)(__W),
      (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_sqrt_sh(__mmask32 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_sqrtsh_round_mask(
      (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
      (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_mask_fpclass_ph_mask(U, A, imm) \
  ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A), \
                                               (int)(imm), (__mmask32)(U)))

#define _mm512_fpclass_ph_mask(A, imm) \
  ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A), \
                                               (int)(imm), (__mmask32)-1))

#define _mm_fpclass_sh_mask(A, imm) \
  ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm), \
                                           (__mmask8)-1))

#define _mm_mask_fpclass_sh_mask(U, A, imm) \
  ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm), \
                                           (__mmask8)(U)))
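
/* For the fpclass macros above, each bit of the immediate enables one
 * category test: 0x01 QNaN, 0x02 positive zero, 0x04 negative zero,
 * 0x08 positive infinity, 0x10 negative infinity, 0x20 denormal,
 * 0x40 finite negative, 0x80 SNaN. The result mask sets a bit for every
 * lane matching any enabled category; e.g. an immediate of 0x81 tests for
 * any NaN. */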

#define _mm512_cvt_roundpd_ph(A, R) \
  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
      (__v8df)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

#define _mm512_mask_cvt_roundpd_ph(W, U, A, R) \
  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask((__v8df)(A), (__v8hf)(W), \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundpd_ph(U, A, R) \
  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
      (__v8df)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS512 _mm512_cvtpd_ph(__m512d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
      (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m512d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
      (__v8df)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtpd_ph(__mmask8 __U, __m512d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
      (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_cvt_roundph_pd(A, R) \
  ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
      (__v8hf)(A), (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), (int)(R)))

#define _mm512_mask_cvt_roundph_pd(W, U, A, R) \
  ((__m512d)__builtin_ia32_vcvtph2pd512_mask((__v8hf)(A), (__v8df)(W), \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_pd(U, A, R) \
  ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
      (__v8hf)(A), (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R)))

static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_cvtph_pd(__m128h __A) {
  return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
      (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_pd(__m512d __W, __mmask8 __U, __m128h __A) {
  return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
      (__v8hf)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
  return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
      (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_cvt_roundsh_ss(A, B, R) \
  ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \
                                               (__v4sf)_mm_undefined_ps(), \
                                               (__mmask8)(-1), (int)(R)))

#define _mm_mask_cvt_roundsh_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_vcvtsh2ss_round_mask( \
      (__v4sf)(A), (__v8hf)(B), (__v4sf)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_cvt_roundsh_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \
                                               (__v4sf)_mm_setzero_ps(), \
                                               (__mmask8)(U), (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtsh_ss(__m128 __A,
                                                            __m128h __B) {
  return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
      (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_undefined_ps(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_ss(__m128 __W,
                                                                 __mmask8 __U,
                                                                 __m128 __A,
                                                                 __m128h __B) {
  return (__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)__A, (__v8hf)__B,
                                                     (__v4sf)__W, (__mmask8)__U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_cvtsh_ss(__mmask8 __U,
                                                                  __m128 __A,
                                                                  __m128h __B) {
  return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
      (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_cvt_roundss_sh(A, B, R) \
  ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \
                                                (__v8hf)_mm_undefined_ph(), \
                                                (__mmask8)(-1), (int)(R)))

#define _mm_mask_cvt_roundss_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vcvtss2sh_round_mask( \
      (__v8hf)(A), (__v4sf)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_cvt_roundss_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \
                                                (__v8hf)_mm_setzero_ph(), \
                                                (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtss_sh(__m128h __A,
                                                             __m128 __B) {
  return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
      (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtss_sh(__m128h __W,
                                                                  __mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128 __B) {
  return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
      (__v8hf)__A, (__v4sf)__B, (__v8hf)__W, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtss_sh(__mmask8 __U, __m128h __A, __m128 __B) {
  return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
      (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_cvt_roundsd_sh(A, B, R) \
  ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \
                                                (__v8hf)_mm_undefined_ph(), \
                                                (__mmask8)(-1), (int)(R)))

#define _mm_mask_cvt_roundsd_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask( \
      (__v8hf)(A), (__v2df)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_cvt_roundsd_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \
                                                (__v8hf)_mm_setzero_ph(), \
                                                (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtsd_sh(__m128h __A,
                                                             __m128d __B) {
  return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
      (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtsd_sh(__m128h __W,
                                                                  __mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128d __B) {
  return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
      (__v8hf)__A, (__v2df)__B, (__v8hf)__W, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtsd_sh(__mmask8 __U, __m128h __A, __m128d __B) {
  return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
      (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
#define _mm_cvt_roundsh_sd(A, B, R)                                            \
  ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B),      \
                                                (__v2df)_mm_undefined_pd(),    \
                                                (__mmask8)(-1), (int)(R)))

#define _mm_mask_cvt_roundsh_sd(W, U, A, B, R)                                 \
  ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask(                               \
      (__v2df)(A), (__v8hf)(B), (__v2df)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_cvt_roundsh_sd(U, A, B, R)                                   \
  ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B),      \
                                                (__v2df)_mm_setzero_pd(),      \
                                                (__mmask8)(U), (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtsh_sd(__m128d __A,
                                                             __m128h __B) {
  return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
      (__v2df)__A, (__v8hf)__B, (__v2df)_mm_undefined_pd(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_cvtsh_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128h __B) {
  return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
      (__v2df)__A, (__v8hf)__B, (__v2df)__W, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_cvtsh_sd(__mmask8 __U, __m128d __A, __m128h __B) {
  return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
      (__v2df)__A, (__v8hf)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
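
/* Usage sketch for the scalar conversions above (illustrative only): the
 * lower element is converted, the remaining elements are taken from the
 * first vector operand.
 *
 *   __m128h a = _mm_setzero_ph();
 *   __m128d d = _mm_setzero_pd();
 *   __m128h r = _mm_cvtsd_sh(a, d); // r[0] = (_Float16)d[0], r[1..7] = a[1..7]
 *   __m128d s = _mm_cvtsh_sd(d, a); // s[0] = (double)a[0],   s[1]    = d[1]
 */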
#define _mm512_cvt_roundph_epi16(A, R)                                         \
  ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A),                      \
                                            (__v32hi)_mm512_undefined_epi32(), \
                                            (__mmask32)(-1), (int)(R)))

#define _mm512_mask_cvt_roundph_epi16(W, U, A, R)                              \
  ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), (__v32hi)(W),        \
                                            (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epi16(U, A, R)                                \
  ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A),                      \
                                            (__v32hi)_mm512_setzero_epi32(),   \
                                            (__mmask32)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epi16(__m512h __A) {
  return (__m512i)__builtin_ia32_vcvtph2w512_mask(
      (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvtph2w512_mask(
      (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epi16(__mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvtph2w512_mask(
      (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
#define _mm512_cvtt_roundph_epi16(A, R)                                        \
  ((__m512i)__builtin_ia32_vcvttph2w512_mask(                                  \
      (__v32hf)(A), (__v32hi)_mm512_undefined_epi32(), (__mmask32)(-1),        \
      (int)(R)))

#define _mm512_mask_cvtt_roundph_epi16(W, U, A, R)                             \
  ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), (__v32hi)(W),       \
                                             (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epi16(U, A, R)                               \
  ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A),                     \
                                             (__v32hi)_mm512_setzero_epi32(),  \
                                             (__mmask32)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epi16(__m512h __A) {
  return (__m512i)__builtin_ia32_vcvttph2w512_mask(
      (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvttph2w512_mask(
      (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epi16(__mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvttph2w512_mask(
      (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
#define _mm512_cvt_roundepi16_ph(A, R)                                         \
  ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A),                      \
                                            (__v32hf)_mm512_undefined_ph(),    \
                                            (__mmask32)(-1), (int)(R)))

#define _mm512_mask_cvt_roundepi16_ph(W, U, A, R)                              \
  ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), (__v32hf)(W),        \
                                            (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepi16_ph(U, A, R)                                \
  ((__m512h)__builtin_ia32_vcvtw2ph512_mask(                                   \
      (__v32hi)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_cvtepi16_ph(__m512i __A) {
  return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
      (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
  return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
      (__v32hi)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi16_ph(__mmask32 __U, __m512i __A) {
  return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
      (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
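
/* Illustrative round trip (not part of the original header): ph -> epi16 uses
 * the current rounding mode, while the _cvtt_ variants above truncate toward
 * zero.
 *
 *   __m512h v = _mm512_set1_ph((_Float16)3.75);
 *   __m512i i = _mm512_cvtph_epi16(v);  // rounds per MXCSR (3.75 -> 4)
 *   __m512h b = _mm512_cvtepi16_ph(i);  // back to half precision
 */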
#define _mm512_cvt_roundph_epu16(A, R)                                         \
  ((__m512i)__builtin_ia32_vcvtph2uw512_mask(                                  \
      (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1),        \
      (int)(R)))

#define _mm512_mask_cvt_roundph_epu16(W, U, A, R)                              \
  ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), (__v32hu)(W),       \
                                             (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epu16(U, A, R)                                \
  ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A),                     \
                                             (__v32hu)_mm512_setzero_epi32(),  \
                                             (__mmask32)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epu16(__m512h __A) {
  return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
      (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
      (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epu16(__mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
      (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
#define _mm512_cvtt_roundph_epu16(A, R)                                        \
  ((__m512i)__builtin_ia32_vcvttph2uw512_mask(                                 \
      (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1),        \
      (int)(R)))

#define _mm512_mask_cvtt_roundph_epu16(W, U, A, R)                             \
  ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), (__v32hu)(W),      \
                                              (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epu16(U, A, R)                               \
  ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A),                    \
                                              (__v32hu)_mm512_setzero_epi32(), \
                                              (__mmask32)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epu16(__m512h __A) {
  return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
      (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
      (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epu16(__mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
      (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
#define _mm512_cvt_roundepu16_ph(A, R)                                         \
  ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A),                     \
                                             (__v32hf)_mm512_undefined_ph(),   \
                                             (__mmask32)(-1), (int)(R)))

#define _mm512_mask_cvt_roundepu16_ph(W, U, A, R)                              \
  ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), (__v32hf)(W),       \
                                             (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepu16_ph(U, A, R)                                \
  ((__m512h)__builtin_ia32_vcvtuw2ph512_mask(                                  \
      (__v32hu)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_cvtepu16_ph(__m512i __A) {
  return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
      (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
  return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
      (__v32hu)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu16_ph(__mmask32 __U, __m512i __A) {
  return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
      (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
#define _mm512_cvt_roundph_epi32(A, R)                                         \
  ((__m512i)__builtin_ia32_vcvtph2dq512_mask(                                  \
      (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1),        \
      (int)(R)))

#define _mm512_mask_cvt_roundph_epi32(W, U, A, R)                              \
  ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), (__v16si)(W),       \
                                             (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epi32(U, A, R)                                \
  ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A),                     \
                                             (__v16si)_mm512_setzero_epi32(),  \
                                             (__mmask16)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epi32(__m256h __A) {
  return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
      (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
      (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epi32(__mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
      (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
#define _mm512_cvt_roundph_epu32(A, R)                                         \
  ((__m512i)__builtin_ia32_vcvtph2udq512_mask(                                 \
      (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1),        \
      (int)(R)))

#define _mm512_mask_cvt_roundph_epu32(W, U, A, R)                              \
  ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), (__v16su)(W),      \
                                              (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epu32(U, A, R)                                \
  ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A),                    \
                                              (__v16su)_mm512_setzero_epi32(), \
                                              (__mmask16)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epu32(__m256h __A) {
  return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
      (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
      (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epu32(__mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
      (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
#define _mm512_cvt_roundepi32_ph(A, R)                                         \
  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A),                     \
                                             (__v16hf)_mm256_undefined_ph(),   \
                                             (__mmask16)(-1), (int)(R)))

#define _mm512_mask_cvt_roundepi32_ph(W, U, A, R)                              \
  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), (__v16hf)(W),       \
                                             (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepi32_ph(U, A, R)                                \
  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask(                                  \
      (__v16si)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))

static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_cvtepi32_ph(__m512i __A) {
  return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
      (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
  return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
      (__v16si)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi32_ph(__mmask16 __U, __m512i __A) {
  return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
      (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
#define _mm512_cvt_roundepu32_ph(A, R)                                         \
  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A),                    \
                                              (__v16hf)_mm256_undefined_ph(),  \
                                              (__mmask16)(-1), (int)(R)))

#define _mm512_mask_cvt_roundepu32_ph(W, U, A, R)                              \
  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), (__v16hf)(W),      \
                                              (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepu32_ph(U, A, R)                                \
  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask(                                 \
      (__v16su)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))

static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_cvtepu32_ph(__m512i __A) {
  return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
      (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
  return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
      (__v16su)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu32_ph(__mmask16 __U, __m512i __A) {
  return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
      (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
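
/* Illustrative sketch (not part of the original header): the 32-bit
 * conversions change element width, so sixteen halves occupy a __m256h on
 * one side and a full __m512i on the other.
 *
 *   __m256h h = _mm256_setzero_ph();
 *   __m512i i = _mm512_cvtph_epi32(h); // 16 x FP16 -> 16 x i32
 *   __m256h r = _mm512_cvtepi32_ph(i); // 16 x i32  -> 16 x FP16
 */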
#define _mm512_cvtt_roundph_epi32(A, R)                                        \
  ((__m512i)__builtin_ia32_vcvttph2dq512_mask(                                 \
      (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1),        \
      (int)(R)))

#define _mm512_mask_cvtt_roundph_epi32(W, U, A, R)                             \
  ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), (__v16si)(W),      \
                                              (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epi32(U, A, R)                               \
  ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A),                    \
                                              (__v16si)_mm512_setzero_epi32(), \
                                              (__mmask16)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epi32(__m256h __A) {
  return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
      (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
      (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epi32(__mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
      (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
#define _mm512_cvtt_roundph_epu32(A, R)                                        \
  ((__m512i)__builtin_ia32_vcvttph2udq512_mask(                                \
      (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1),        \
      (int)(R)))

#define _mm512_mask_cvtt_roundph_epu32(W, U, A, R)                             \
  ((__m512i)__builtin_ia32_vcvttph2udq512_mask((__v16hf)(A), (__v16su)(W),     \
                                               (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epu32(U, A, R)                               \
  ((__m512i)__builtin_ia32_vcvttph2udq512_mask(                                \
      (__v16hf)(A), (__v16su)_mm512_setzero_epi32(), (__mmask16)(U),           \
      (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epu32(__m256h __A) {
  return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
      (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
      (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epu32(__mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
      (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
#define _mm512_cvt_roundepi64_ph(A, R)                                         \
  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask(                                  \
      (__v8di)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

#define _mm512_mask_cvt_roundepi64_ph(W, U, A, R)                              \
  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask((__v8di)(A), (__v8hf)(W),         \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepi64_ph(U, A, R)                                \
  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask(                                  \
      (__v8di)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_cvtepi64_ph(__m512i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
      (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
      (__v8di)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi64_ph(__mmask8 __U, __m512i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
      (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
#define _mm512_cvt_roundph_epi64(A, R)                                         \
  ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A),                      \
                                             (__v8di)_mm512_undefined_epi32(), \
                                             (__mmask8)(-1), (int)(R)))

#define _mm512_mask_cvt_roundph_epi64(W, U, A, R)                              \
  ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), (__v8di)(W),         \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epi64(U, A, R)                                \
  ((__m512i)__builtin_ia32_vcvtph2qq512_mask(                                  \
      (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epi64(__m128h __A) {
  return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
      (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
      (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
      (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
#define _mm512_cvt_roundepu64_ph(A, R)                                         \
  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask(                                 \
      (__v8du)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

#define _mm512_mask_cvt_roundepu64_ph(W, U, A, R)                              \
  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask((__v8du)(A), (__v8hf)(W),        \
                                              (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepu64_ph(U, A, R)                                \
  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask(                                 \
      (__v8du)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_cvtepu64_ph(__m512i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
      (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
      (__v8du)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu64_ph(__mmask8 __U, __m512i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
      (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
#define _mm512_cvt_roundph_epu64(A, R)                                         \
  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask(                                 \
      (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1),           \
      (int)(R)))

#define _mm512_mask_cvt_roundph_epu64(W, U, A, R)                              \
  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask((__v8hf)(A), (__v8du)(W),        \
                                              (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epu64(U, A, R)                                \
  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask(                                 \
      (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epu64(__m128h __A) {
  return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
      (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
      (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
      (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
#define _mm512_cvtt_roundph_epi64(A, R)                                        \
  ((__m512i)__builtin_ia32_vcvttph2qq512_mask(                                 \
      (__v8hf)(A), (__v8di)_mm512_undefined_epi32(), (__mmask8)(-1),           \
      (int)(R)))

#define _mm512_mask_cvtt_roundph_epi64(W, U, A, R)                             \
  ((__m512i)__builtin_ia32_vcvttph2qq512_mask((__v8hf)(A), (__v8di)(W),        \
                                              (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epi64(U, A, R)                               \
  ((__m512i)__builtin_ia32_vcvttph2qq512_mask(                                 \
      (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epi64(__m128h __A) {
  return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
      (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
      (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
      (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
#define _mm512_cvtt_roundph_epu64(A, R)                                        \
  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask(                                \
      (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1),           \
      (int)(R)))

#define _mm512_mask_cvtt_roundph_epu64(W, U, A, R)                             \
  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask((__v8hf)(A), (__v8du)(W),       \
                                               (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epu64(U, A, R)                               \
  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask(                                \
      (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epu64(__m128h __A) {
  return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
      (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
      (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
      (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
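
/* Illustrative sketch (not part of the original header): only eight FP16
 * elements (a __m128h) feed the 64-bit conversions, and the _cvtt_ forms
 * truncate toward zero regardless of the MXCSR rounding mode.
 *
 *   __m128h h = _mm_setzero_ph();
 *   __m512i q = _mm512_maskz_cvttph_epi64(0x3F, h); // lanes 6..7 zeroed
 */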
#define _mm_cvt_roundsh_i32(A, R)                                              \
  ((int)__builtin_ia32_vcvtsh2si32((__v8hf)(A), (int)(R)))

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvtsh_i32(__m128h __A) {
  return (int)__builtin_ia32_vcvtsh2si32((__v8hf)__A,
                                         _MM_FROUND_CUR_DIRECTION);
}

#define _mm_cvt_roundsh_u32(A, R)                                              \
  ((unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)(A), (int)(R)))

static __inline__ unsigned int __DEFAULT_FN_ATTRS128
_mm_cvtsh_u32(__m128h __A) {
  return (unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)__A,
                                                   _MM_FROUND_CUR_DIRECTION);
}
#ifdef __x86_64__
#define _mm_cvt_roundsh_i64(A, R)                                              \
  ((long long)__builtin_ia32_vcvtsh2si64((__v8hf)(A), (int)(R)))

static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvtsh_i64(__m128h __A) {
  return (long long)__builtin_ia32_vcvtsh2si64((__v8hf)__A,
                                               _MM_FROUND_CUR_DIRECTION);
}

#define _mm_cvt_roundsh_u64(A, R)                                              \
  ((unsigned long long)__builtin_ia32_vcvtsh2usi64((__v8hf)(A), (int)(R)))

static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
_mm_cvtsh_u64(__m128h __A) {
  return (unsigned long long)__builtin_ia32_vcvtsh2usi64(
      (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
}
#endif // __x86_64__
#define _mm_cvt_roundu32_sh(A, B, R)                                           \
  ((__m128h)__builtin_ia32_vcvtusi2sh((__v8hf)(A), (unsigned int)(B), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_cvtu32_sh(__m128h __A, unsigned int __B) {
  __A[0] = __B;
  return __A;
}

#ifdef __x86_64__
#define _mm_cvt_roundu64_sh(A, B, R)                                           \
  ((__m128h)__builtin_ia32_vcvtusi642sh((__v8hf)(A), (unsigned long long)(B),  \
                                        (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_cvtu64_sh(__m128h __A, unsigned long long __B) {
  __A[0] = __B;
  return __A;
}
#endif // __x86_64__

#define _mm_cvt_roundi32_sh(A, B, R)                                           \
  ((__m128h)__builtin_ia32_vcvtsi2sh((__v8hf)(A), (int)(B), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti32_sh(__m128h __A,
                                                              int __B) {
  __A[0] = __B;
  return __A;
}

#ifdef __x86_64__
#define _mm_cvt_roundi64_sh(A, B, R)                                           \
  ((__m128h)__builtin_ia32_vcvtsi642sh((__v8hf)(A), (long long)(B), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti64_sh(__m128h __A,
                                                              long long __B) {
  __A[0] = __B;
  return __A;
}
#endif // __x86_64__
#define _mm_cvtt_roundsh_i32(A, R)                                             \
  ((int)__builtin_ia32_vcvttsh2si32((__v8hf)(A), (int)(R)))

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvttsh_i32(__m128h __A) {
  return (int)__builtin_ia32_vcvttsh2si32((__v8hf)__A,
                                          _MM_FROUND_CUR_DIRECTION);
}

#ifdef __x86_64__
#define _mm_cvtt_roundsh_i64(A, R)                                             \
  ((long long)__builtin_ia32_vcvttsh2si64((__v8hf)(A), (int)(R)))

static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvttsh_i64(__m128h __A) {
  return (long long)__builtin_ia32_vcvttsh2si64((__v8hf)__A,
                                                _MM_FROUND_CUR_DIRECTION);
}
#endif // __x86_64__

#define _mm_cvtt_roundsh_u32(A, R)                                             \
  ((unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)(A), (int)(R)))

static __inline__ unsigned int __DEFAULT_FN_ATTRS128
_mm_cvttsh_u32(__m128h __A) {
  return (unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)__A,
                                                    _MM_FROUND_CUR_DIRECTION);
}

#ifdef __x86_64__
#define _mm_cvtt_roundsh_u64(A, R)                                             \
  ((unsigned long long)__builtin_ia32_vcvttsh2usi64((__v8hf)(A), (int)(R)))

static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
_mm_cvttsh_u64(__m128h __A) {
  return (unsigned long long)__builtin_ia32_vcvttsh2usi64(
      (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
}
#endif // __x86_64__
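
/* Usage sketch (illustrative; the 64-bit forms above are only available on
 * x86_64 targets):
 *
 *   __m128h h = _mm_setzero_ph();
 *   int i = _mm_cvtsh_i32(h);          // rounds per the current direction
 *   int t = _mm_cvttsh_i32(h);         // truncates toward zero
 *   __m128h r = _mm_cvti32_sh(h, 42);  // r[0] = (_Float16)42, rest from h
 */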
#define _mm512_cvtx_roundph_ps(A, R)                                           \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A),                     \
                                             (__v16sf)_mm512_undefined_ps(),   \
                                             (__mmask16)(-1), (int)(R)))

#define _mm512_mask_cvtx_roundph_ps(W, U, A, R)                                \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), (__v16sf)(W),       \
                                             (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvtx_roundph_ps(U, A, R)                                  \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask(                                  \
      (__v16hf)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R)))

static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtxph_ps(__m256h __A) {
  return (__m512)__builtin_ia32_vcvtph2psx512_mask(
      (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_cvtxph_ps(__m512 __W, __mmask16 __U, __m256h __A) {
  return (__m512)__builtin_ia32_vcvtph2psx512_mask(
      (__v16hf)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtxph_ps(__mmask16 __U, __m256h __A) {
  return (__m512)__builtin_ia32_vcvtph2psx512_mask(
      (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
#define _mm512_cvtx_roundps_ph(A, R)                                           \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A),                    \
                                              (__v16hf)_mm256_undefined_ph(),  \
                                              (__mmask16)(-1), (int)(R)))

#define _mm512_mask_cvtx_roundps_ph(W, U, A, R)                                \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), (__v16hf)(W),      \
                                              (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvtx_roundps_ph(U, A, R)                                  \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask(                                 \
      (__v16sf)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))

static __inline__ __m256h __DEFAULT_FN_ATTRS512 _mm512_cvtxps_ph(__m512 __A) {
  return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
      (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtxps_ph(__m256h __W, __mmask16 __U, __m512 __A) {
  return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
      (__v16sf)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtxps_ph(__mmask16 __U, __m512 __A) {
  return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
      (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
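
/* Illustrative sketch (not part of the original header): the "cvtx" forms
 * convert between FP16 and FP32 under AVX512-FP16, sixteen elements at a
 * time.
 *
 *   __m256h h = _mm256_setzero_ph();
 *   __m512  f = _mm512_cvtxph_ps(h);   // 16 x FP16 -> 16 x FP32
 *   __m256h b = _mm512_cvtxps_ph(f);   // 16 x FP32 -> 16 x FP16
 */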
#define _mm512_fmadd_round_ph(A, B, C, R)                                      \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmadd_round_ph(A, U, B, C, R)                              \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_mask3_fmadd_round_ph(A, B, C, U, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_mask3(                                  \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmadd_round_ph(U, A, B, C, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_fmsub_round_ph(A, B, C, R)                                      \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmsub_round_ph(A, U, B, C, R)                              \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmsub_round_ph(U, A, B, C, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

#define _mm512_fnmadd_round_ph(A, B, C, R)                                     \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask3_fnmadd_round_ph(A, B, C, U, R)                            \
  ((__m512h)__builtin_ia32_vfmaddph512_mask3(                                  \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fnmadd_round_ph(U, A, B, C, R)                            \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

#define _mm512_fnmsub_round_ph(A, B, C, R)                                     \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),   \
      (__mmask32)-1, (int)(R)))

#define _mm512_maskz_fnmsub_round_ph(U, A, B, C, R)                            \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),   \
      (__mmask32)(U), (int)(R)))
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_ph(__m512h __A,
                                                                __m512h __B,
                                                                __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
                                                  (__v32hf)__C, (__mmask32)-1,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
                                                  (__v32hf)__C, (__mmask32)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask3((__v32hf)__A, (__v32hf)__B,
                                                   (__v32hf)__C, (__mmask32)__U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_maskz((__v32hf)__A, (__v32hf)__B,
                                                   (__v32hf)__C, (__mmask32)__U,
                                                   _MM_FROUND_CUR_DIRECTION);
}
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmsub_ph(__m512h __A,
                                                                __m512h __B,
                                                                __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
                                                  -(__v32hf)__C, (__mmask32)-1,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
                                                  -(__v32hf)__C, (__mmask32)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_maskz(
      (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmadd_ph(__m512h __A,
                                                                 __m512h __B,
                                                                 __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
                                                  (__v32hf)__C, (__mmask32)-1,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fnmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask3(-(__v32hf)__A, (__v32hf)__B,
                                                   (__v32hf)__C, (__mmask32)__U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fnmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_maskz(-(__v32hf)__A, (__v32hf)__B,
                                                   (__v32hf)__C, (__mmask32)__U,
                                                   _MM_FROUND_CUR_DIRECTION);
}
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmsub_ph(__m512h __A,
                                                                 __m512h __B,
                                                                 __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
                                                  -(__v32hf)__C, (__mmask32)-1,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fnmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_maskz(
      -(__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
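
/* Usage sketch (illustrative): all four packed FMA flavors share one builtin;
 * the sign applied to the operands selects fmadd/fmsub/fnmadd/fnmsub, and the
 * mask variants choose where unselected lanes come from (A, C, or zero).
 *
 *   __m512h a = _mm512_set1_ph((_Float16)1.0), b = a, c = a;
 *   __m512h r = _mm512_fmadd_ph(a, b, c);              // a*b + c
 *   __m512h m = _mm512_mask_fmadd_ph(a, 0xFFFF, b, c); // low 16 lanes a*b+c,
 *                                                      // high 16 lanes keep a
 */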
#define _mm512_fmaddsub_round_ph(A, B, C, R)                                   \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmaddsub_round_ph(A, U, B, C, R)                           \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_mask3_fmaddsub_round_ph(A, B, C, U, R)                          \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask3(                               \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmaddsub_round_ph(U, A, B, C, R)                          \
  ((__m512h)__builtin_ia32_vfmaddsubph512_maskz(                               \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_fmsubadd_round_ph(A, B, C, R)                                   \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmsubadd_round_ph(A, U, B, C, R)                           \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmsubadd_round_ph(U, A, B, C, R)                          \
  ((__m512h)__builtin_ia32_vfmaddsubph512_maskz(                               \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmaddsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_mask3(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmaddsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
      (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmsubadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
      (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmsubadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
      (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
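
/* Illustrative note (not part of the original header): fmaddsub alternates
 * subtract/add across lanes (even lanes a*b - c, odd lanes a*b + c), while
 * fmsubadd does the opposite; this is the usual building block for
 * interleaved complex multiply-accumulate.
 *
 *   __m512h a = _mm512_set1_ph((_Float16)1.0), b = a, c = a;
 *   __m512h r = _mm512_fmaddsub_ph(a, b, c); // r[2k] = a*b - c, r[2k+1] = a*b + c
 */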
#define _mm512_mask3_fmsub_round_ph(A, B, C, U, R)                             \
  ((__m512h)__builtin_ia32_vfmsubph512_mask3(                                  \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
  return (__m512h)__builtin_ia32_vfmsubph512_mask3((__v32hf)__A, (__v32hf)__B,
                                                   (__v32hf)__C, (__mmask32)__U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_mask3_fmsubadd_round_ph(A, B, C, U, R)                          \
  ((__m512h)__builtin_ia32_vfmsubaddph512_mask3(                               \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
  return (__m512h)__builtin_ia32_vfmsubaddph512_mask3(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
#define _mm512_mask_fnmadd_round_ph(A, U, B, C, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fnmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
                                                  (__v32hf)__C, (__mmask32)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_mask_fnmsub_round_ph(A, U, B, C, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),   \
      (__mmask32)(U), (int)(R)))

#define _mm512_mask3_fnmsub_round_ph(A, B, C, U, R)                            \
  ((__m512h)__builtin_ia32_vfmsubph512_mask3(                                  \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fnmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
                                                  -(__v32hf)__C, (__mmask32)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
  return (__m512h)__builtin_ia32_vfmsubph512_mask3(-(__v32hf)__A, (__v32hf)__B,
                                                   (__v32hf)__C, (__mmask32)__U,
                                                   _MM_FROUND_CUR_DIRECTION);
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sh(__m128h __W,
                                                             __m128h __A,
                                                             __m128h __B) {
  return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
                                       (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmadd_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
                                       (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fmadd_round_sh(A, B, C, R)                                         \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C),        \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_fmadd_round_sh(W, U, A, B, R)                                 \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
      (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B),        \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B, (__v8hf)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_fmadd_round_sh(U, A, B, C, R)                                \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz(                                    \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C),        \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
  return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask3_fmadd_round_sh(W, X, Y, U, R)                                \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask3(                                    \
      (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y),        \
      (__mmask8)(U), (int)(R)))
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_sh(__m128h __W,
                                                             __m128h __A,
                                                             __m128h __B) {
  return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
                                                -(__v8hf)__B, (__mmask8)-1,
                                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmsub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
                                                -(__v8hf)__B, (__mmask8)__U,
                                                _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fmsub_round_sh(A, B, C, R)                                         \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C),       \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_fmsub_round_sh(W, U, A, B, R)                                 \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
      (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B),       \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B,
                                                 -(__v8hf)__C, (__mmask8)__U,
                                                 _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_fmsub_round_sh(U, A, B, C, R)                                \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz(                                    \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C),       \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
  return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask3_fmsub_round_sh(W, X, Y, U, R)                                \
  ((__m128h)__builtin_ia32_vfmsubsh3_mask3(                                    \
      (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y),        \
      (__mmask8)(U), (int)(R)))
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_sh(__m128h __W,
                                                              __m128h __A,
                                                              __m128h __B) {
  return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
                                       (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fnmadd_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
                                       (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fnmadd_round_sh(A, B, C, R)                                        \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
      (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C),       \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_fnmadd_round_sh(W, U, A, B, R)                                \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
      (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B),       \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fnmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_fnmadd_round_sh(U, A, B, C, R)                               \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz(                                    \
      (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C),       \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fnmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
  return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask3_fnmadd_round_sh(W, X, Y, U, R)                               \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask3(                                    \
      (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y),       \
      (__mmask8)(U), (int)(R)))
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_sh(__m128h __W,
                                                              __m128h __A,
                                                              __m128h __B) {
  return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
                                       (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fnmsub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
                                       (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fnmsub_round_sh(A, B, C, R)                                        \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
      (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C),      \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_fnmsub_round_sh(W, U, A, B, R)                                \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
      (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B),      \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fnmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_fnmsub_round_sh(U, A, B, C, R)                               \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz(                                    \
      (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C),      \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fnmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
  return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask3_fnmsub_round_sh(W, X, Y, U, R)                               \
  ((__m128h)__builtin_ia32_vfmsubsh3_mask3(                                    \
      (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y),       \
      (__mmask8)(U), (int)(R)))
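
/* Usage sketch (illustrative): the _sh forms operate on element 0 only and
 * pass the upper seven elements of the first operand through.
 *
 *   __m128h w = _mm_setzero_ph(), a = w, b = w;
 *   __m128h r = _mm_fmadd_sh(w, a, b); // r[0] = w[0]*a[0] + b[0],
 *                                      // r[1..7] = w[1..7]
 */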
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_sch(__m128h __A,
                                                               __m128h __B,
                                                               __m128h __C) {
  return (__m128h)__builtin_ia32_vfcmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
                                                 (__v4sf)__C, (__mmask8)-1,
                                                 _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fcmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask(
      (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fcmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfcmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
                                                  (__v4sf)__C, (__mmask8)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask3(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fcmadd_round_sch(A, B, C, R)                                       \
  ((__m128h)__builtin_ia32_vfcmaddcsh_mask(                                    \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_fcmadd_round_sch(A, U, B, C, R)                               \
  ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask(                              \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_fcmadd_round_sch(U, A, B, C, R)                              \
  ((__m128h)__builtin_ia32_vfcmaddcsh_maskz(                                   \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
      (__mmask8)(U), (int)(R)))

#define _mm_mask3_fcmadd_round_sch(A, B, C, U, R)                              \
  ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask3(                             \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
      (__mmask8)(U), (int)(R)))
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sch(__m128h __A,
                                                              __m128h __B,
                                                              __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
                                                (__v4sf)__C, (__mmask8)-1,
                                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddcsh_round_mask(
      (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
                                                 (__v4sf)__C, (__mmask8)__U,
                                                 _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_vfmaddcsh_round_mask3(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fmadd_round_sch(A, B, C, R)                                        \
  ((__m128h)__builtin_ia32_vfmaddcsh_mask(                                     \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_fmadd_round_sch(A, U, B, C, R)                                \
  ((__m128h)__builtin_ia32_vfmaddcsh_round_mask(                               \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_fmadd_round_sch(U, A, B, C, R)                               \
  ((__m128h)__builtin_ia32_vfmaddcsh_maskz(                                    \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
      (__mmask8)(U), (int)(R)))

#define _mm_mask3_fmadd_round_sch(A, B, C, U, R)                               \
  ((__m128h)__builtin_ia32_vfmaddcsh_round_mask3(                              \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
      (__mmask8)(U), (int)(R)))
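
/* Illustrative note (not part of the original header): the _sch/_pch forms
 * treat each pair of FP16 values as one complex number (real, imaginary),
 * which is why the builtins take __v4sf/__v16sf views of the same bits. Per
 * the instruction definitions, fcmadd folds in the complex conjugate of the
 * second source; fmadd_sch does not.
 *
 *   __m128h x = _mm_setzero_ph(), y = x, acc = x;
 *   __m128h r = _mm_fmadd_sch(x, y, acc);  // acc + x*y        on complex lane 0
 *   __m128h s = _mm_fcmadd_sch(x, y, acc); // acc + x*conj(y)  on complex lane 0
 */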
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_sch(__m128h __A,
                                                              __m128h __B) {
  return (__m128h)__builtin_ia32_vfcmulcsh_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fcmul_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfcmulcsh_mask((__v4sf)__A, (__v4sf)__B,
                                                (__v4sf)__W, (__mmask8)__U,
                                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fcmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfcmulcsh_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fcmul_round_sch(A, B, R)                                           \
  ((__m128h)__builtin_ia32_vfcmulcsh_mask(                                     \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),                              \
      (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))

#define _mm_mask_fcmul_round_sch(W, U, A, B, R)                                \
  ((__m128h)__builtin_ia32_vfcmulcsh_mask(                                     \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_fcmul_round_sch(U, A, B, R)                                  \
  ((__m128h)__builtin_ia32_vfcmulcsh_mask(                                     \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),                              \
      (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_fmul_sch(__m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfmulcsh_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmul_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfmulcsh_mask((__v4sf)__A, (__v4sf)__B,
                                               (__v4sf)__W, (__mmask8)__U,
                                               _MM_FROUND_CUR_DIRECTION);
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfmulcsh_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
#define _mm_fmul_round_sch(A, B, R) \
  ((__m128h)__builtin_ia32_vfmulcsh_mask( \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))

#define _mm_mask_fmul_round_sch(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vfmulcsh_mask( \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_fmul_round_sch(U, A, B, R) \
  ((__m128h)__builtin_ia32_vfmulcsh_mask( \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
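
/* Illustrative sketch (hypothetical helper, not part of the upstream header):
 * _mm_fmul_sch multiplies the low FP16 complex pairs of __A and __B, while
 * _mm_fcmul_sch multiplies __A by the complex conjugate of __B. With
 * a = 1+2i and b = 3+4i: a*b = -5+10i and a*conj(b) = 11+2i. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
__demo_fmul_vs_fcmul_sch(void) {
  __m128h __a = (__m128h){1.0f16, 2.0f16, 0.0f16, 0.0f16,
                          0.0f16, 0.0f16, 0.0f16, 0.0f16}; /* a = 1+2i */
  __m128h __b = (__m128h){3.0f16, 4.0f16, 0.0f16, 0.0f16,
                          0.0f16, 0.0f16, 0.0f16, 0.0f16}; /* b = 3+4i */
  __m128h __p = _mm_fmul_sch(__a, __b); /* low pair {-5.0, 10.0} */
  (void)__p;
  return _mm_fcmul_sch(__a, __b); /* low pair {11.0, 2.0} */
}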
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fcmul_pch(__m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_vfcmulcph512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(),
      (__mmask16)-1, _MM_FROUND_CUR_DIRECTION);
}
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fcmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_vfcmulcph512_mask((__v16sf)__A, (__v16sf)__B,
                                                   (__v16sf)__W, (__mmask16)__U,
                                                   _MM_FROUND_CUR_DIRECTION);
}
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fcmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_vfcmulcph512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
#define _mm512_fcmul_round_pch(A, B, R) \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))

#define _mm512_mask_fcmul_round_pch(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W), \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fcmul_round_pch(U, A, B, R) \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
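
/* Illustrative sketch (hypothetical helper, not part of the upstream header):
 * the _round_ variants accept an explicit rounding/exception control in place
 * of _MM_FROUND_CUR_DIRECTION; the _MM_FROUND_* flags come from the SSE
 * headers that <immintrin.h> includes before this one. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
__demo_fcmul_round_pch(__m512h __a, __m512h __b) {
  /* Round to nearest even and suppress floating-point exceptions. */
  return _mm512_fcmul_round_pch(__a, __b,
                                _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}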
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fmul_pch(__m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_vfmulcph512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(),
      (__mmask16)-1, _MM_FROUND_CUR_DIRECTION);
}
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_vfmulcph512_mask((__v16sf)__A, (__v16sf)__B,
                                                  (__v16sf)__W, (__mmask16)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_vfmulcph512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
#define _mm512_fmul_round_pch(A, B, R) \
  ((__m512h)__builtin_ia32_vfmulcph512_mask( \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))

#define _mm512_mask_fmul_round_pch(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_vfmulcph512_mask( \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W), \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fmul_round_pch(U, A, B, R) \
  ((__m512h)__builtin_ia32_vfmulcph512_mask( \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
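
/* Illustrative sketch (hypothetical helper, not part of the upstream header):
 * a __m512h holds 16 FP16 complex numbers, so the __mmask16 writemask of the
 * _pch intrinsics covers 32-bit complex elements, not individual
 * half-precision lanes; masked-off elements take the passthrough __W. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
__demo_mask_fmul_pch(__m512h __w, __m512h __a, __m512h __b) {
  /* Multiply only the even-numbered complex elements; keep __w elsewhere. */
  return _mm512_mask_fmul_pch(__w, (__mmask16)0x5555, __a, __b);
}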
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fcmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfcmaddcph512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
  return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fcmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfcmaddcph512_maskz(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
#define _mm512_fcmadd_round_pch(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
      (__mmask16)-1, (int)(R)))

#define _mm512_mask_fcmadd_round_pch(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask( \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm512_mask3_fcmadd_round_pch(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fcmadd_round_pch(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfcmaddcph512_maskz( \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
      (__mmask16)(U), (int)(R)))
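
/* Illustrative sketch (hypothetical helper, not part of the upstream header):
 * _mm512_fcmadd_pch multiplies each of the 16 complex elements of __A by the
 * complex conjugate of the corresponding element of __B and accumulates into
 * __C; the _maskz form zeroes, rather than merges, the complex elements whose
 * mask bit is clear. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
__demo_fcmadd_pch(__m512h __a, __m512h __b, __m512h __acc, __mmask16 __u) {
  __acc = _mm512_fcmadd_pch(__a, __b, __acc); /* acc += a * conj(b) */
  return _mm512_maskz_fcmadd_pch(__u, __a, __b, __acc); /* zero-masked */
}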
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fmadd_pch(__m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddcph512_mask3((__v16sf)__A, (__v16sf)__B,
                                                    (__v16sf)__C, (__mmask16)-1,
                                                    _MM_FROUND_CUR_DIRECTION);
}
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B,
                                                   (__v16sf)__C, (__mmask16)__U,
                                                   _MM_FROUND_CUR_DIRECTION);
}
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
  return (__m512h)__builtin_ia32_vfmaddcph512_mask3(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddcph512_maskz(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
#define _mm512_fmadd_round_pch(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
      (__mmask16)-1, (int)(R)))

#define _mm512_mask_fmadd_round_pch(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask( \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm512_mask3_fmadd_round_pch(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fmadd_round_pch(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddcph512_maskz( \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
      (__mmask16)(U), (int)(R)))
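
/* Illustrative sketch (hypothetical helper, not part of the upstream header):
 * the _mask and _mask3 forms of the complex FMA differ only in their
 * passthrough source for complex elements whose mask bit is clear: _mask
 * keeps the corresponding element of __A, _mask3 keeps the element of the
 * accumulator __C. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
__demo_fmadd_pch_masking(__m512h __a, __m512h __b, __m512h __c,
                         __mmask16 __u) {
  __m512h __merge_a = _mm512_mask_fmadd_pch(__a, __u, __b, __c);
  __m512h __merge_c = _mm512_mask3_fmadd_pch(__a, __b, __c, __u);
  (void)__merge_c; /* identical to __merge_a when __u is all-ones */
  return __merge_a;
}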
static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_add_ph(__m512h __W) {
  return __builtin_ia32_reduce_fadd_ph512(-0.0f16, __W);
}

static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_mul_ph(__m512h __W) {
  return __builtin_ia32_reduce_fmul_ph512(1.0f16, __W);
}

static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_max_ph(__m512h __V) {
  return __builtin_ia32_reduce_fmax_ph512(__V);
}

static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_min_ph(__m512h __V) {
  return __builtin_ia32_reduce_fmin_ph512(__V);
}
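
/* Illustrative sketch (hypothetical helper, not part of the upstream header):
 * the reduce intrinsics perform a reassociated horizontal reduction over all
 * 32 half-precision elements and return a scalar _Float16. */
static __inline__ _Float16 __DEFAULT_FN_ATTRS512 __demo_reduce_ph(void) {
  __m512h __v = _mm512_set1_ph(1.0f16);
  /* 32 elements of 1.0 sum to 32.0; the max of a splat is 1.0. */
  return _mm512_reduce_add_ph(__v) + _mm512_reduce_max_ph(__v); /* 33.0 */
}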
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, (__v32hf)__W,
                                              (__v32hf)__A);
}
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) {
  return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
                                                 (__v32hi)__B);
}
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_permutexvar_ph(__m512i __A, __m512h __B) {
  return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
}
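
/* Illustrative sketch (hypothetical helper, not part of the upstream header):
 * _mm512_mask_blend_ph takes __W where the corresponding bit of the 32-lane
 * mask is set and __A where it is clear, while _mm512_permutexvar_ph gathers
 * half-precision elements of __B using the 16-bit indices in __A. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
__demo_blend_permute_ph(__m512h __a, __m512h __w) {
  /* Interleave: odd elements from __w, even elements from __a. */
  __m512h __mixed = _mm512_mask_blend_ph((__mmask32)0xAAAAAAAA, __a, __w);
  /* Broadcast element 0 of the blended vector to all 32 positions. */
  return _mm512_permutexvar_ph(_mm512_set1_epi16(0), __mixed);
}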
// The intrinsics below are aliases for the f[c]mul variants above.
#define _mm512_mul_pch(A, B) _mm512_fmul_pch(A, B)
#define _mm512_mask_mul_pch(W, U, A, B) _mm512_mask_fmul_pch(W, U, A, B)
#define _mm512_maskz_mul_pch(U, A, B) _mm512_maskz_fmul_pch(U, A, B)
#define _mm512_mul_round_pch(A, B, R) _mm512_fmul_round_pch(A, B, R)
#define _mm512_mask_mul_round_pch(W, U, A, B, R) \
  _mm512_mask_fmul_round_pch(W, U, A, B, R)
#define _mm512_maskz_mul_round_pch(U, A, B, R) \
  _mm512_maskz_fmul_round_pch(U, A, B, R)

#define _mm512_cmul_pch(A, B) _mm512_fcmul_pch(A, B)
#define _mm512_mask_cmul_pch(W, U, A, B) _mm512_mask_fcmul_pch(W, U, A, B)
#define _mm512_maskz_cmul_pch(U, A, B) _mm512_maskz_fcmul_pch(U, A, B)
#define _mm512_cmul_round_pch(A, B, R) _mm512_fcmul_round_pch(A, B, R)
#define _mm512_mask_cmul_round_pch(W, U, A, B, R) \
  _mm512_mask_fcmul_round_pch(W, U, A, B, R)
#define _mm512_maskz_cmul_round_pch(U, A, B, R) \
  _mm512_maskz_fcmul_round_pch(U, A, B, R)

#define _mm_mul_sch(A, B) _mm_fmul_sch(A, B)
#define _mm_mask_mul_sch(W, U, A, B) _mm_mask_fmul_sch(W, U, A, B)
#define _mm_maskz_mul_sch(U, A, B) _mm_maskz_fmul_sch(U, A, B)
#define _mm_mul_round_sch(A, B, R) _mm_fmul_round_sch(A, B, R)
#define _mm_mask_mul_round_sch(W, U, A, B, R) \
  _mm_mask_fmul_round_sch(W, U, A, B, R)
#define _mm_maskz_mul_round_sch(U, A, B, R) _mm_maskz_fmul_round_sch(U, A, B, R)

#define _mm_cmul_sch(A, B) _mm_fcmul_sch(A, B)
#define _mm_mask_cmul_sch(W, U, A, B) _mm_mask_fcmul_sch(W, U, A, B)
#define _mm_maskz_cmul_sch(U, A, B) _mm_maskz_fcmul_sch(U, A, B)
#define _mm_cmul_round_sch(A, B, R) _mm_fcmul_round_sch(A, B, R)
#define _mm_mask_cmul_round_sch(W, U, A, B, R) \
  _mm_mask_fcmul_round_sch(W, U, A, B, R)
#define _mm_maskz_cmul_round_sch(U, A, B, R) \
  _mm_maskz_fcmul_round_sch(U, A, B, R)
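
/* Illustrative note (hypothetical helper, not part of the upstream header):
 * the mul/cmul names are plain macro renames of the fmul/fcmul intrinsics,
 * so the two calls below expand to exactly the same builtin. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
__demo_mul_pch_alias(__m512h __a, __m512h __b) {
  __m512h __r1 = _mm512_mul_pch(__a, __b);  /* alias */
  __m512h __r2 = _mm512_fmul_pch(__a, __b); /* spelled-out form */
  (void)__r2; /* identical to __r1 */
  return __r1;
}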
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#undef __DEFAULT_FN_ATTRS512

#endif /* __AVX512FP16INTRIN_H */