1 /*===----------- avx512fp16intrin.h - AVX512-FP16 intrinsics ---------------===
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 *===-----------------------------------------------------------------------===
10 #error "Never use <avx512fp16intrin.h> directly; include <immintrin.h> instead."
15 #ifndef __AVX512FP16INTRIN_H
16 #define __AVX512FP16INTRIN_H
18 /* Define the default attributes for the functions in this file. */
19 typedef _Float16 __v32hf
__attribute__((__vector_size__(64), __aligned__(64)));
20 typedef _Float16 __m512h
__attribute__((__vector_size__(64), __aligned__(64)));
21 typedef _Float16 __m512h_u
__attribute__((__vector_size__(64), __aligned__(1)));
/* Define the default attributes for the functions in this file: always
   inline, require the avx512fp16 target feature, and declare the vector
   width so the backend picks the right register class. */
#define __DEFAULT_FN_ATTRS512                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"),     \
                 __min_vector_width__(512)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"),     \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"),     \
                 __min_vector_width__(128)))
34 static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_cvtsh_h(__m512h __a
) {
38 static __inline __m128h __DEFAULT_FN_ATTRS128
_mm_setzero_ph(void) {
39 return (__m128h
){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
42 static __inline __m256h __DEFAULT_FN_ATTRS256
_mm256_setzero_ph(void) {
43 return (__m256h
){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
44 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
47 static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_undefined_ph(void) {
48 return (__m256h
)__builtin_ia32_undef256();
51 static __inline __m512h __DEFAULT_FN_ATTRS512
_mm512_setzero_ph(void) {
52 return (__m512h
){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
53 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
54 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
57 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_undefined_ph(void) {
58 return (__m128h
)__builtin_ia32_undef128();
61 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_undefined_ph(void) {
62 return (__m512h
)__builtin_ia32_undef512();
65 static __inline __m512h __DEFAULT_FN_ATTRS512
_mm512_set1_ph(_Float16 __h
) {
66 return (__m512h
)(__v32hf
){__h
, __h
, __h
, __h
, __h
, __h
, __h
, __h
,
67 __h
, __h
, __h
, __h
, __h
, __h
, __h
, __h
,
68 __h
, __h
, __h
, __h
, __h
, __h
, __h
, __h
,
69 __h
, __h
, __h
, __h
, __h
, __h
, __h
, __h
};
72 static __inline __m512h __DEFAULT_FN_ATTRS512
73 _mm512_set_ph(_Float16 __h1
, _Float16 __h2
, _Float16 __h3
, _Float16 __h4
,
74 _Float16 __h5
, _Float16 __h6
, _Float16 __h7
, _Float16 __h8
,
75 _Float16 __h9
, _Float16 __h10
, _Float16 __h11
, _Float16 __h12
,
76 _Float16 __h13
, _Float16 __h14
, _Float16 __h15
, _Float16 __h16
,
77 _Float16 __h17
, _Float16 __h18
, _Float16 __h19
, _Float16 __h20
,
78 _Float16 __h21
, _Float16 __h22
, _Float16 __h23
, _Float16 __h24
,
79 _Float16 __h25
, _Float16 __h26
, _Float16 __h27
, _Float16 __h28
,
80 _Float16 __h29
, _Float16 __h30
, _Float16 __h31
, _Float16 __h32
) {
81 return (__m512h
)(__v32hf
){__h32
, __h31
, __h30
, __h29
, __h28
, __h27
, __h26
,
82 __h25
, __h24
, __h23
, __h22
, __h21
, __h20
, __h19
,
83 __h18
, __h17
, __h16
, __h15
, __h14
, __h13
, __h12
,
84 __h11
, __h10
, __h9
, __h8
, __h7
, __h6
, __h5
,
85 __h4
, __h3
, __h2
, __h1
};
/* Reverse-order variant of _mm512_set_ph: h1 lands in the LOWEST lane. */
#define _mm512_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, h13, \
                       h14, h15, h16, h17, h18, h19, h20, h21, h22, h23, h24,  \
                       h25, h26, h27, h28, h29, h30, h31, h32)                 \
  _mm512_set_ph((h32), (h31), (h30), (h29), (h28), (h27), (h26), (h25), (h24), \
                (h23), (h22), (h21), (h20), (h19), (h18), (h17), (h16), (h15), \
                (h14), (h13), (h12), (h11), (h10), (h9), (h8), (h7), (h6),     \
                (h5), (h4), (h3), (h2), (h1))
96 static __inline __m512h __DEFAULT_FN_ATTRS512
97 _mm512_set1_pch(_Float16 _Complex h
) {
98 return (__m512h
)_mm512_set1_ps(__builtin_bit_cast(float, h
));
101 static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_castph_ps(__m128h __a
) {
105 static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_castph_ps(__m256h __a
) {
109 static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_castph_ps(__m512h __a
) {
113 static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_castph_pd(__m128h __a
) {
117 static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_castph_pd(__m256h __a
) {
121 static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_castph_pd(__m512h __a
) {
125 static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_castph_si128(__m128h __a
) {
129 static __inline__ __m256i __DEFAULT_FN_ATTRS256
130 _mm256_castph_si256(__m256h __a
) {
134 static __inline__ __m512i __DEFAULT_FN_ATTRS512
135 _mm512_castph_si512(__m512h __a
) {
139 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_castps_ph(__m128 __a
) {
143 static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_castps_ph(__m256 __a
) {
147 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castps_ph(__m512 __a
) {
151 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_castpd_ph(__m128d __a
) {
155 static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_castpd_ph(__m256d __a
) {
159 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castpd_ph(__m512d __a
) {
163 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_castsi128_ph(__m128i __a
) {
167 static __inline__ __m256h __DEFAULT_FN_ATTRS256
168 _mm256_castsi256_ph(__m256i __a
) {
172 static __inline__ __m512h __DEFAULT_FN_ATTRS512
173 _mm512_castsi512_ph(__m512i __a
) {
177 static __inline__ __m128h __DEFAULT_FN_ATTRS256
178 _mm256_castph256_ph128(__m256h __a
) {
179 return __builtin_shufflevector(__a
, __a
, 0, 1, 2, 3, 4, 5, 6, 7);
182 static __inline__ __m128h __DEFAULT_FN_ATTRS512
183 _mm512_castph512_ph128(__m512h __a
) {
184 return __builtin_shufflevector(__a
, __a
, 0, 1, 2, 3, 4, 5, 6, 7);
187 static __inline__ __m256h __DEFAULT_FN_ATTRS512
188 _mm512_castph512_ph256(__m512h __a
) {
189 return __builtin_shufflevector(__a
, __a
, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
193 static __inline__ __m256h __DEFAULT_FN_ATTRS256
194 _mm256_castph128_ph256(__m128h __a
) {
195 return __builtin_shufflevector(__a
, __a
, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1,
199 static __inline__ __m512h __DEFAULT_FN_ATTRS512
200 _mm512_castph128_ph512(__m128h __a
) {
201 return __builtin_shufflevector(__a
, __a
, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1,
202 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
203 -1, -1, -1, -1, -1, -1, -1, -1, -1);
206 static __inline__ __m512h __DEFAULT_FN_ATTRS512
207 _mm512_castph256_ph512(__m256h __a
) {
208 return __builtin_shufflevector(__a
, __a
, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
209 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1,
210 -1, -1, -1, -1, -1, -1, -1, -1);
213 /// Constructs a 256-bit floating-point vector of [16 x half] from a
214 /// 128-bit floating-point vector of [8 x half]. The lower 128 bits
215 /// contain the value of the source vector. The upper 384 bits are set
218 /// \headerfile <x86intrin.h>
220 /// This intrinsic has no corresponding instruction.
223 /// A 128-bit vector of [8 x half].
224 /// \returns A 512-bit floating-point vector of [16 x half]. The lower 128 bits
225 /// contain the value of the parameter. The upper 384 bits are set to zero.
226 static __inline__ __m256h __DEFAULT_FN_ATTRS256
227 _mm256_zextph128_ph256(__m128h __a
) {
228 return __builtin_shufflevector(__a
, (__v8hf
)_mm_setzero_ph(), 0, 1, 2, 3, 4,
229 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
232 /// Constructs a 512-bit floating-point vector of [32 x half] from a
233 /// 128-bit floating-point vector of [8 x half]. The lower 128 bits
234 /// contain the value of the source vector. The upper 384 bits are set
237 /// \headerfile <x86intrin.h>
239 /// This intrinsic has no corresponding instruction.
242 /// A 128-bit vector of [8 x half].
243 /// \returns A 512-bit floating-point vector of [32 x half]. The lower 128 bits
244 /// contain the value of the parameter. The upper 384 bits are set to zero.
245 static __inline__ __m512h __DEFAULT_FN_ATTRS512
246 _mm512_zextph128_ph512(__m128h __a
) {
247 return __builtin_shufflevector(
248 __a
, (__v8hf
)_mm_setzero_ph(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
249 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);
252 /// Constructs a 512-bit floating-point vector of [32 x half] from a
253 /// 256-bit floating-point vector of [16 x half]. The lower 256 bits
254 /// contain the value of the source vector. The upper 256 bits are set
257 /// \headerfile <x86intrin.h>
259 /// This intrinsic has no corresponding instruction.
262 /// A 256-bit vector of [16 x half].
263 /// \returns A 512-bit floating-point vector of [32 x half]. The lower 256 bits
264 /// contain the value of the parameter. The upper 256 bits are set to zero.
265 static __inline__ __m512h __DEFAULT_FN_ATTRS512
266 _mm512_zextph256_ph512(__m256h __a
) {
267 return __builtin_shufflevector(__a
, (__v16hf
)_mm256_setzero_ph(), 0, 1, 2, 3,
268 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
269 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
/* Scalar FP16 ordered compare returning an int (vcomish). All macro
   arguments are parenthesized to keep expansion hygienic. */
#define _mm_comi_round_sh(A, B, P, R)                                          \
  __builtin_ia32_vcomish((__v8hf)(A), (__v8hf)(B), (int)(P), (int)(R))

#define _mm_comi_sh(A, B, pred)                                                \
  _mm_comi_round_sh((A), (B), (pred), _MM_FROUND_CUR_DIRECTION)
279 static __inline__
int __DEFAULT_FN_ATTRS128
_mm_comieq_sh(__m128h A
,
281 return __builtin_ia32_vcomish((__v8hf
)A
, (__v8hf
)B
, _CMP_EQ_OS
,
282 _MM_FROUND_CUR_DIRECTION
);
285 static __inline__
int __DEFAULT_FN_ATTRS128
_mm_comilt_sh(__m128h A
,
287 return __builtin_ia32_vcomish((__v8hf
)A
, (__v8hf
)B
, _CMP_LT_OS
,
288 _MM_FROUND_CUR_DIRECTION
);
291 static __inline__
int __DEFAULT_FN_ATTRS128
_mm_comile_sh(__m128h A
,
293 return __builtin_ia32_vcomish((__v8hf
)A
, (__v8hf
)B
, _CMP_LE_OS
,
294 _MM_FROUND_CUR_DIRECTION
);
297 static __inline__
int __DEFAULT_FN_ATTRS128
_mm_comigt_sh(__m128h A
,
299 return __builtin_ia32_vcomish((__v8hf
)A
, (__v8hf
)B
, _CMP_GT_OS
,
300 _MM_FROUND_CUR_DIRECTION
);
303 static __inline__
int __DEFAULT_FN_ATTRS128
_mm_comige_sh(__m128h A
,
305 return __builtin_ia32_vcomish((__v8hf
)A
, (__v8hf
)B
, _CMP_GE_OS
,
306 _MM_FROUND_CUR_DIRECTION
);
309 static __inline__
int __DEFAULT_FN_ATTRS128
_mm_comineq_sh(__m128h A
,
311 return __builtin_ia32_vcomish((__v8hf
)A
, (__v8hf
)B
, _CMP_NEQ_US
,
312 _MM_FROUND_CUR_DIRECTION
);
315 static __inline__
int __DEFAULT_FN_ATTRS128
_mm_ucomieq_sh(__m128h A
,
317 return __builtin_ia32_vcomish((__v8hf
)A
, (__v8hf
)B
, _CMP_EQ_OQ
,
318 _MM_FROUND_CUR_DIRECTION
);
321 static __inline__
int __DEFAULT_FN_ATTRS128
_mm_ucomilt_sh(__m128h A
,
323 return __builtin_ia32_vcomish((__v8hf
)A
, (__v8hf
)B
, _CMP_LT_OQ
,
324 _MM_FROUND_CUR_DIRECTION
);
327 static __inline__
int __DEFAULT_FN_ATTRS128
_mm_ucomile_sh(__m128h A
,
329 return __builtin_ia32_vcomish((__v8hf
)A
, (__v8hf
)B
, _CMP_LE_OQ
,
330 _MM_FROUND_CUR_DIRECTION
);
333 static __inline__
int __DEFAULT_FN_ATTRS128
_mm_ucomigt_sh(__m128h A
,
335 return __builtin_ia32_vcomish((__v8hf
)A
, (__v8hf
)B
, _CMP_GT_OQ
,
336 _MM_FROUND_CUR_DIRECTION
);
339 static __inline__
int __DEFAULT_FN_ATTRS128
_mm_ucomige_sh(__m128h A
,
341 return __builtin_ia32_vcomish((__v8hf
)A
, (__v8hf
)B
, _CMP_GE_OQ
,
342 _MM_FROUND_CUR_DIRECTION
);
345 static __inline__
int __DEFAULT_FN_ATTRS128
_mm_ucomineq_sh(__m128h A
,
347 return __builtin_ia32_vcomish((__v8hf
)A
, (__v8hf
)B
, _CMP_NEQ_UQ
,
348 _MM_FROUND_CUR_DIRECTION
);
351 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_add_ph(__m512h __A
,
353 return (__m512h
)((__v32hf
)__A
+ (__v32hf
)__B
);
356 static __inline__ __m512h __DEFAULT_FN_ATTRS512
357 _mm512_mask_add_ph(__m512h __W
, __mmask32 __U
, __m512h __A
, __m512h __B
) {
358 return (__m512h
)__builtin_ia32_selectph_512(
359 (__mmask32
)__U
, (__v32hf
)_mm512_add_ph(__A
, __B
), (__v32hf
)__W
);
362 static __inline__ __m512h __DEFAULT_FN_ATTRS512
363 _mm512_maskz_add_ph(__mmask32 __U
, __m512h __A
, __m512h __B
) {
364 return (__m512h
)__builtin_ia32_selectph_512((__mmask32
)__U
,
365 (__v32hf
)_mm512_add_ph(__A
, __B
),
366 (__v32hf
)_mm512_setzero_ph());
369 #define _mm512_add_round_ph(A, B, R) \
370 ((__m512h)__builtin_ia32_addph512((__v32hf)(__m512h)(A), \
371 (__v32hf)(__m512h)(B), (int)(R)))
373 #define _mm512_mask_add_round_ph(W, U, A, B, R) \
374 ((__m512h)__builtin_ia32_selectph_512( \
375 (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \
376 (__v32hf)(__m512h)(W)))
378 #define _mm512_maskz_add_round_ph(U, A, B, R) \
379 ((__m512h)__builtin_ia32_selectph_512( \
380 (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \
381 (__v32hf)_mm512_setzero_ph()))
383 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_sub_ph(__m512h __A
,
385 return (__m512h
)((__v32hf
)__A
- (__v32hf
)__B
);
388 static __inline__ __m512h __DEFAULT_FN_ATTRS512
389 _mm512_mask_sub_ph(__m512h __W
, __mmask32 __U
, __m512h __A
, __m512h __B
) {
390 return (__m512h
)__builtin_ia32_selectph_512(
391 (__mmask32
)__U
, (__v32hf
)_mm512_sub_ph(__A
, __B
), (__v32hf
)__W
);
394 static __inline__ __m512h __DEFAULT_FN_ATTRS512
395 _mm512_maskz_sub_ph(__mmask32 __U
, __m512h __A
, __m512h __B
) {
396 return (__m512h
)__builtin_ia32_selectph_512((__mmask32
)__U
,
397 (__v32hf
)_mm512_sub_ph(__A
, __B
),
398 (__v32hf
)_mm512_setzero_ph());
401 #define _mm512_sub_round_ph(A, B, R) \
402 ((__m512h)__builtin_ia32_subph512((__v32hf)(__m512h)(A), \
403 (__v32hf)(__m512h)(B), (int)(R)))
405 #define _mm512_mask_sub_round_ph(W, U, A, B, R) \
406 ((__m512h)__builtin_ia32_selectph_512( \
407 (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \
408 (__v32hf)(__m512h)(W)))
410 #define _mm512_maskz_sub_round_ph(U, A, B, R) \
411 ((__m512h)__builtin_ia32_selectph_512( \
412 (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \
413 (__v32hf)_mm512_setzero_ph()))
415 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mul_ph(__m512h __A
,
417 return (__m512h
)((__v32hf
)__A
* (__v32hf
)__B
);
420 static __inline__ __m512h __DEFAULT_FN_ATTRS512
421 _mm512_mask_mul_ph(__m512h __W
, __mmask32 __U
, __m512h __A
, __m512h __B
) {
422 return (__m512h
)__builtin_ia32_selectph_512(
423 (__mmask32
)__U
, (__v32hf
)_mm512_mul_ph(__A
, __B
), (__v32hf
)__W
);
426 static __inline__ __m512h __DEFAULT_FN_ATTRS512
427 _mm512_maskz_mul_ph(__mmask32 __U
, __m512h __A
, __m512h __B
) {
428 return (__m512h
)__builtin_ia32_selectph_512((__mmask32
)__U
,
429 (__v32hf
)_mm512_mul_ph(__A
, __B
),
430 (__v32hf
)_mm512_setzero_ph());
433 #define _mm512_mul_round_ph(A, B, R) \
434 ((__m512h)__builtin_ia32_mulph512((__v32hf)(__m512h)(A), \
435 (__v32hf)(__m512h)(B), (int)(R)))
437 #define _mm512_mask_mul_round_ph(W, U, A, B, R) \
438 ((__m512h)__builtin_ia32_selectph_512( \
439 (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \
440 (__v32hf)(__m512h)(W)))
442 #define _mm512_maskz_mul_round_ph(U, A, B, R) \
443 ((__m512h)__builtin_ia32_selectph_512( \
444 (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \
445 (__v32hf)_mm512_setzero_ph()))
447 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_div_ph(__m512h __A
,
449 return (__m512h
)((__v32hf
)__A
/ (__v32hf
)__B
);
452 static __inline__ __m512h __DEFAULT_FN_ATTRS512
453 _mm512_mask_div_ph(__m512h __W
, __mmask32 __U
, __m512h __A
, __m512h __B
) {
454 return (__m512h
)__builtin_ia32_selectph_512(
455 (__mmask32
)__U
, (__v32hf
)_mm512_div_ph(__A
, __B
), (__v32hf
)__W
);
458 static __inline__ __m512h __DEFAULT_FN_ATTRS512
459 _mm512_maskz_div_ph(__mmask32 __U
, __m512h __A
, __m512h __B
) {
460 return (__m512h
)__builtin_ia32_selectph_512((__mmask32
)__U
,
461 (__v32hf
)_mm512_div_ph(__A
, __B
),
462 (__v32hf
)_mm512_setzero_ph());
465 #define _mm512_div_round_ph(A, B, R) \
466 ((__m512h)__builtin_ia32_divph512((__v32hf)(__m512h)(A), \
467 (__v32hf)(__m512h)(B), (int)(R)))
469 #define _mm512_mask_div_round_ph(W, U, A, B, R) \
470 ((__m512h)__builtin_ia32_selectph_512( \
471 (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \
472 (__v32hf)(__m512h)(W)))
474 #define _mm512_maskz_div_round_ph(U, A, B, R) \
475 ((__m512h)__builtin_ia32_selectph_512( \
476 (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \
477 (__v32hf)_mm512_setzero_ph()))
479 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_min_ph(__m512h __A
,
481 return (__m512h
)__builtin_ia32_minph512((__v32hf
)__A
, (__v32hf
)__B
,
482 _MM_FROUND_CUR_DIRECTION
);
485 static __inline__ __m512h __DEFAULT_FN_ATTRS512
486 _mm512_mask_min_ph(__m512h __W
, __mmask32 __U
, __m512h __A
, __m512h __B
) {
487 return (__m512h
)__builtin_ia32_selectph_512(
488 (__mmask32
)__U
, (__v32hf
)_mm512_min_ph(__A
, __B
), (__v32hf
)__W
);
491 static __inline__ __m512h __DEFAULT_FN_ATTRS512
492 _mm512_maskz_min_ph(__mmask32 __U
, __m512h __A
, __m512h __B
) {
493 return (__m512h
)__builtin_ia32_selectph_512((__mmask32
)__U
,
494 (__v32hf
)_mm512_min_ph(__A
, __B
),
495 (__v32hf
)_mm512_setzero_ph());
498 #define _mm512_min_round_ph(A, B, R) \
499 ((__m512h)__builtin_ia32_minph512((__v32hf)(__m512h)(A), \
500 (__v32hf)(__m512h)(B), (int)(R)))
502 #define _mm512_mask_min_round_ph(W, U, A, B, R) \
503 ((__m512h)__builtin_ia32_selectph_512( \
504 (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \
505 (__v32hf)(__m512h)(W)))
507 #define _mm512_maskz_min_round_ph(U, A, B, R) \
508 ((__m512h)__builtin_ia32_selectph_512( \
509 (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \
510 (__v32hf)_mm512_setzero_ph()))
512 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_max_ph(__m512h __A
,
514 return (__m512h
)__builtin_ia32_maxph512((__v32hf
)__A
, (__v32hf
)__B
,
515 _MM_FROUND_CUR_DIRECTION
);
518 static __inline__ __m512h __DEFAULT_FN_ATTRS512
519 _mm512_mask_max_ph(__m512h __W
, __mmask32 __U
, __m512h __A
, __m512h __B
) {
520 return (__m512h
)__builtin_ia32_selectph_512(
521 (__mmask32
)__U
, (__v32hf
)_mm512_max_ph(__A
, __B
), (__v32hf
)__W
);
524 static __inline__ __m512h __DEFAULT_FN_ATTRS512
525 _mm512_maskz_max_ph(__mmask32 __U
, __m512h __A
, __m512h __B
) {
526 return (__m512h
)__builtin_ia32_selectph_512((__mmask32
)__U
,
527 (__v32hf
)_mm512_max_ph(__A
, __B
),
528 (__v32hf
)_mm512_setzero_ph());
531 #define _mm512_max_round_ph(A, B, R) \
532 ((__m512h)__builtin_ia32_maxph512((__v32hf)(__m512h)(A), \
533 (__v32hf)(__m512h)(B), (int)(R)))
535 #define _mm512_mask_max_round_ph(W, U, A, B, R) \
536 ((__m512h)__builtin_ia32_selectph_512( \
537 (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \
538 (__v32hf)(__m512h)(W)))
540 #define _mm512_maskz_max_round_ph(U, A, B, R) \
541 ((__m512h)__builtin_ia32_selectph_512( \
542 (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \
543 (__v32hf)_mm512_setzero_ph()))
545 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_abs_ph(__m512h __A
) {
546 return (__m512h
)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF), (__m512i
)__A
);
549 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_conj_pch(__m512h __A
) {
550 return (__m512h
)_mm512_xor_ps((__m512
)__A
, _mm512_set1_ps(-0.0f
));
553 static __inline__ __m512h __DEFAULT_FN_ATTRS512
554 _mm512_mask_conj_pch(__m512h __W
, __mmask16 __U
, __m512h __A
) {
555 return (__m512h
)__builtin_ia32_selectps_512(
556 (__mmask16
)__U
, (__v16sf
)_mm512_conj_pch(__A
), (__v16sf
)__W
);
559 static __inline__ __m512h __DEFAULT_FN_ATTRS512
560 _mm512_maskz_conj_pch(__mmask16 __U
, __m512h __A
) {
561 return (__m512h
)__builtin_ia32_selectps_512((__mmask16
)__U
,
562 (__v16sf
)_mm512_conj_pch(__A
),
563 (__v16sf
)_mm512_setzero_ps());
566 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_add_sh(__m128h __A
,
572 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_add_sh(__m128h __W
,
576 __A
= _mm_add_sh(__A
, __B
);
577 return __builtin_ia32_selectsh_128(__U
, __A
, __W
);
580 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_add_sh(__mmask8 __U
,
583 __A
= _mm_add_sh(__A
, __B
);
584 return __builtin_ia32_selectsh_128(__U
, __A
, _mm_setzero_ph());
587 #define _mm_add_round_sh(A, B, R) \
588 ((__m128h)__builtin_ia32_addsh_round_mask( \
589 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
590 (__mmask8)-1, (int)(R)))
592 #define _mm_mask_add_round_sh(W, U, A, B, R) \
593 ((__m128h)__builtin_ia32_addsh_round_mask( \
594 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
595 (__mmask8)(U), (int)(R)))
597 #define _mm_maskz_add_round_sh(U, A, B, R) \
598 ((__m128h)__builtin_ia32_addsh_round_mask( \
599 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
600 (__mmask8)(U), (int)(R)))
602 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_sub_sh(__m128h __A
,
608 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_sub_sh(__m128h __W
,
612 __A
= _mm_sub_sh(__A
, __B
);
613 return __builtin_ia32_selectsh_128(__U
, __A
, __W
);
616 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_sub_sh(__mmask8 __U
,
619 __A
= _mm_sub_sh(__A
, __B
);
620 return __builtin_ia32_selectsh_128(__U
, __A
, _mm_setzero_ph());
623 #define _mm_sub_round_sh(A, B, R) \
624 ((__m128h)__builtin_ia32_subsh_round_mask( \
625 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
626 (__mmask8)-1, (int)(R)))
628 #define _mm_mask_sub_round_sh(W, U, A, B, R) \
629 ((__m128h)__builtin_ia32_subsh_round_mask( \
630 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
631 (__mmask8)(U), (int)(R)))
633 #define _mm_maskz_sub_round_sh(U, A, B, R) \
634 ((__m128h)__builtin_ia32_subsh_round_mask( \
635 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
636 (__mmask8)(U), (int)(R)))
638 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mul_sh(__m128h __A
,
644 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_mul_sh(__m128h __W
,
648 __A
= _mm_mul_sh(__A
, __B
);
649 return __builtin_ia32_selectsh_128(__U
, __A
, __W
);
652 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_mul_sh(__mmask8 __U
,
655 __A
= _mm_mul_sh(__A
, __B
);
656 return __builtin_ia32_selectsh_128(__U
, __A
, _mm_setzero_ph());
659 #define _mm_mul_round_sh(A, B, R) \
660 ((__m128h)__builtin_ia32_mulsh_round_mask( \
661 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
662 (__mmask8)-1, (int)(R)))
664 #define _mm_mask_mul_round_sh(W, U, A, B, R) \
665 ((__m128h)__builtin_ia32_mulsh_round_mask( \
666 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
667 (__mmask8)(U), (int)(R)))
669 #define _mm_maskz_mul_round_sh(U, A, B, R) \
670 ((__m128h)__builtin_ia32_mulsh_round_mask( \
671 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
672 (__mmask8)(U), (int)(R)))
674 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_div_sh(__m128h __A
,
680 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_div_sh(__m128h __W
,
684 __A
= _mm_div_sh(__A
, __B
);
685 return __builtin_ia32_selectsh_128(__U
, __A
, __W
);
688 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_div_sh(__mmask8 __U
,
691 __A
= _mm_div_sh(__A
, __B
);
692 return __builtin_ia32_selectsh_128(__U
, __A
, _mm_setzero_ph());
695 #define _mm_div_round_sh(A, B, R) \
696 ((__m128h)__builtin_ia32_divsh_round_mask( \
697 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
698 (__mmask8)-1, (int)(R)))
700 #define _mm_mask_div_round_sh(W, U, A, B, R) \
701 ((__m128h)__builtin_ia32_divsh_round_mask( \
702 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
703 (__mmask8)(U), (int)(R)))
705 #define _mm_maskz_div_round_sh(U, A, B, R) \
706 ((__m128h)__builtin_ia32_divsh_round_mask( \
707 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
708 (__mmask8)(U), (int)(R)))
710 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_min_sh(__m128h __A
,
712 return (__m128h
)__builtin_ia32_minsh_round_mask(
713 (__v8hf
)__A
, (__v8hf
)__B
, (__v8hf
)_mm_setzero_ph(), (__mmask8
)-1,
714 _MM_FROUND_CUR_DIRECTION
);
717 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_min_sh(__m128h __W
,
721 return (__m128h
)__builtin_ia32_minsh_round_mask((__v8hf
)__A
, (__v8hf
)__B
,
722 (__v8hf
)__W
, (__mmask8
)__U
,
723 _MM_FROUND_CUR_DIRECTION
);
726 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_min_sh(__mmask8 __U
,
729 return (__m128h
)__builtin_ia32_minsh_round_mask(
730 (__v8hf
)__A
, (__v8hf
)__B
, (__v8hf
)_mm_setzero_ph(), (__mmask8
)__U
,
731 _MM_FROUND_CUR_DIRECTION
);
734 #define _mm_min_round_sh(A, B, R) \
735 ((__m128h)__builtin_ia32_minsh_round_mask( \
736 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
737 (__mmask8)-1, (int)(R)))
739 #define _mm_mask_min_round_sh(W, U, A, B, R) \
740 ((__m128h)__builtin_ia32_minsh_round_mask( \
741 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
742 (__mmask8)(U), (int)(R)))
744 #define _mm_maskz_min_round_sh(U, A, B, R) \
745 ((__m128h)__builtin_ia32_minsh_round_mask( \
746 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
747 (__mmask8)(U), (int)(R)))
749 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_max_sh(__m128h __A
,
751 return (__m128h
)__builtin_ia32_maxsh_round_mask(
752 (__v8hf
)__A
, (__v8hf
)__B
, (__v8hf
)_mm_setzero_ph(), (__mmask8
)-1,
753 _MM_FROUND_CUR_DIRECTION
);
756 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_max_sh(__m128h __W
,
760 return (__m128h
)__builtin_ia32_maxsh_round_mask((__v8hf
)__A
, (__v8hf
)__B
,
761 (__v8hf
)__W
, (__mmask8
)__U
,
762 _MM_FROUND_CUR_DIRECTION
);
765 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_max_sh(__mmask8 __U
,
768 return (__m128h
)__builtin_ia32_maxsh_round_mask(
769 (__v8hf
)__A
, (__v8hf
)__B
, (__v8hf
)_mm_setzero_ph(), (__mmask8
)__U
,
770 _MM_FROUND_CUR_DIRECTION
);
773 #define _mm_max_round_sh(A, B, R) \
774 ((__m128h)__builtin_ia32_maxsh_round_mask( \
775 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
776 (__mmask8)-1, (int)(R)))
778 #define _mm_mask_max_round_sh(W, U, A, B, R) \
779 ((__m128h)__builtin_ia32_maxsh_round_mask( \
780 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
781 (__mmask8)(U), (int)(R)))
783 #define _mm_maskz_max_round_sh(U, A, B, R) \
784 ((__m128h)__builtin_ia32_maxsh_round_mask( \
785 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
786 (__mmask8)(U), (int)(R)))
/* Packed/scalar FP16 compare-to-mask macros; P selects the predicate, R the
   SAE/rounding behavior. */
#define _mm512_cmp_round_ph_mask(A, B, P, R)                                   \
  ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A),              \
                                           (__v32hf)(__m512h)(B), (int)(P),    \
                                           (__mmask32)-1, (int)(R)))

#define _mm512_mask_cmp_round_ph_mask(U, A, B, P, R)                           \
  ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A),              \
                                           (__v32hf)(__m512h)(B), (int)(P),    \
                                           (__mmask32)(U), (int)(R)))

#define _mm512_cmp_ph_mask(A, B, P)                                            \
  _mm512_cmp_round_ph_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_cmp_ph_mask(U, A, B, P)                                    \
  _mm512_mask_cmp_round_ph_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)

#define _mm_cmp_round_sh_mask(X, Y, P, R)                                      \
  ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X),                   \
                                       (__v8hf)(__m128h)(Y), (int)(P),         \
                                       (__mmask8)-1, (int)(R)))

#define _mm_mask_cmp_round_sh_mask(M, X, Y, P, R)                              \
  ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X),                   \
                                       (__v8hf)(__m128h)(Y), (int)(P),         \
                                       (__mmask8)(M), (int)(R)))

#define _mm_cmp_sh_mask(X, Y, P)                                               \
  ((__mmask8)__builtin_ia32_cmpsh_mask(                                        \
      (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)-1,      \
      _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_cmp_sh_mask(M, X, Y, P)                                       \
  ((__mmask8)__builtin_ia32_cmpsh_mask(                                        \
      (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)(M),     \
      _MM_FROUND_CUR_DIRECTION))
823 // loads with vmovsh:
824 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_load_sh(void const *__dp
) {
825 struct __mm_load_sh_struct
{
827 } __attribute__((__packed__
, __may_alias__
));
828 _Float16 __u
= ((const struct __mm_load_sh_struct
*)__dp
)->__u
;
829 return (__m128h
){__u
, 0, 0, 0, 0, 0, 0, 0};
832 static __inline__ __m128h __DEFAULT_FN_ATTRS128
833 _mm_mask_load_sh(__m128h __W
, __mmask8 __U
, const void *__A
) {
834 __m128h src
= (__v8hf
)__builtin_shufflevector(
835 (__v8hf
)__W
, (__v8hf
)_mm_setzero_ph(), 0, 8, 8, 8, 8, 8, 8, 8);
837 return (__m128h
)__builtin_ia32_loadsh128_mask((const __v8hf
*)__A
, src
, __U
& 1);
840 static __inline__ __m128h __DEFAULT_FN_ATTRS128
841 _mm_maskz_load_sh(__mmask8 __U
, const void *__A
) {
842 return (__m128h
)__builtin_ia32_loadsh128_mask(
843 (const __v8hf
*)__A
, (__v8hf
)_mm_setzero_ph(), __U
& 1);
846 static __inline__ __m512h __DEFAULT_FN_ATTRS512
847 _mm512_load_ph(void const *__p
) {
848 return *(const __m512h
*)__p
;
851 static __inline__ __m256h __DEFAULT_FN_ATTRS256
852 _mm256_load_ph(void const *__p
) {
853 return *(const __m256h
*)__p
;
856 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_load_ph(void const *__p
) {
857 return *(const __m128h
*)__p
;
860 static __inline__ __m512h __DEFAULT_FN_ATTRS512
861 _mm512_loadu_ph(void const *__p
) {
864 } __attribute__((__packed__
, __may_alias__
));
865 return ((const struct __loadu_ph
*)__p
)->__v
;
868 static __inline__ __m256h __DEFAULT_FN_ATTRS256
869 _mm256_loadu_ph(void const *__p
) {
872 } __attribute__((__packed__
, __may_alias__
));
873 return ((const struct __loadu_ph
*)__p
)->__v
;
876 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_loadu_ph(void const *__p
) {
879 } __attribute__((__packed__
, __may_alias__
));
880 return ((const struct __loadu_ph
*)__p
)->__v
;
883 // stores with vmovsh:
884 static __inline__
void __DEFAULT_FN_ATTRS128
_mm_store_sh(void *__dp
,
886 struct __mm_store_sh_struct
{
888 } __attribute__((__packed__
, __may_alias__
));
889 ((struct __mm_store_sh_struct
*)__dp
)->__u
= __a
[0];
892 static __inline__
void __DEFAULT_FN_ATTRS128
_mm_mask_store_sh(void *__W
,
895 __builtin_ia32_storesh128_mask((__v8hf
*)__W
, __A
, __U
& 1);
898 static __inline__
void __DEFAULT_FN_ATTRS512
_mm512_store_ph(void *__P
,
900 *(__m512h
*)__P
= __A
;
903 static __inline__
void __DEFAULT_FN_ATTRS256
_mm256_store_ph(void *__P
,
905 *(__m256h
*)__P
= __A
;
908 static __inline__
void __DEFAULT_FN_ATTRS128
_mm_store_ph(void *__P
,
910 *(__m128h
*)__P
= __A
;
913 static __inline__
void __DEFAULT_FN_ATTRS512
_mm512_storeu_ph(void *__P
,
917 } __attribute__((__packed__
, __may_alias__
));
918 ((struct __storeu_ph
*)__P
)->__v
= __A
;
921 static __inline__
void __DEFAULT_FN_ATTRS256
_mm256_storeu_ph(void *__P
,
925 } __attribute__((__packed__
, __may_alias__
));
926 ((struct __storeu_ph
*)__P
)->__v
= __A
;
929 static __inline__
void __DEFAULT_FN_ATTRS128
_mm_storeu_ph(void *__P
,
933 } __attribute__((__packed__
, __may_alias__
));
934 ((struct __storeu_ph
*)__P
)->__v
= __A
;
937 // moves with vmovsh:
938 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_move_sh(__m128h __a
,
944 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_move_sh(__m128h __W
,
948 return __builtin_ia32_selectsh_128(__U
, _mm_move_sh(__A
, __B
), __W
);
951 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_move_sh(__mmask8 __U
,
954 return __builtin_ia32_selectsh_128(__U
, _mm_move_sh(__A
, __B
),
959 static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_cvtsi16_si128(short __a
) {
960 return (__m128i
)(__v8hi
){__a
, 0, 0, 0, 0, 0, 0, 0};
963 static __inline__
short __DEFAULT_FN_ATTRS128
_mm_cvtsi128_si16(__m128i __a
) {
964 __v8hi __b
= (__v8hi
)__a
;
968 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_rcp_ph(__m512h __A
) {
969 return (__m512h
)__builtin_ia32_rcpph512_mask(
970 (__v32hf
)__A
, (__v32hf
)_mm512_undefined_ph(), (__mmask32
)-1);
973 static __inline__ __m512h __DEFAULT_FN_ATTRS512
974 _mm512_mask_rcp_ph(__m512h __W
, __mmask32 __U
, __m512h __A
) {
975 return (__m512h
)__builtin_ia32_rcpph512_mask((__v32hf
)__A
, (__v32hf
)__W
,
979 static __inline__ __m512h __DEFAULT_FN_ATTRS512
980 _mm512_maskz_rcp_ph(__mmask32 __U
, __m512h __A
) {
981 return (__m512h
)__builtin_ia32_rcpph512_mask(
982 (__v32hf
)__A
, (__v32hf
)_mm512_setzero_ph(), (__mmask32
)__U
);
985 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_rsqrt_ph(__m512h __A
) {
986 return (__m512h
)__builtin_ia32_rsqrtph512_mask(
987 (__v32hf
)__A
, (__v32hf
)_mm512_undefined_ph(), (__mmask32
)-1);
990 static __inline__ __m512h __DEFAULT_FN_ATTRS512
991 _mm512_mask_rsqrt_ph(__m512h __W
, __mmask32 __U
, __m512h __A
) {
992 return (__m512h
)__builtin_ia32_rsqrtph512_mask((__v32hf
)__A
, (__v32hf
)__W
,
996 static __inline__ __m512h __DEFAULT_FN_ATTRS512
997 _mm512_maskz_rsqrt_ph(__mmask32 __U
, __m512h __A
) {
998 return (__m512h
)__builtin_ia32_rsqrtph512_mask(
999 (__v32hf
)__A
, (__v32hf
)_mm512_setzero_ph(), (__mmask32
)__U
);
/* getmant: extract the normalized mantissa of each FP16 element. The
 * interval selector (B) occupies immediate bits [1:0] and the sign control
 * (C) bits [3:2] -- hence ((C) << 2) | (B). Non-"round" forms use the
 * current rounding direction; "_round" forms take an explicit R. */
#define _mm512_getmant_ph(A, B, C)                                             \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)),                          \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,                           \
      _MM_FROUND_CUR_DIRECTION))

/* Merge-masked: unselected lanes come from W. */
#define _mm512_mask_getmant_ph(W, U, A, B, C)                                  \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W),   \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

/* Zero-masked: unselected lanes are zeroed. */
#define _mm512_maskz_getmant_ph(U, A, B, C)                                    \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)),                          \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_getmant_round_ph(A, B, C, R)                                    \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)),                          \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))

#define _mm512_mask_getmant_round_ph(W, U, A, B, C, R)                         \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W),   \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_getmant_round_ph(U, A, B, C, R)                           \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)),                          \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1033 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_getexp_ph(__m512h __A
) {
1034 return (__m512h
)__builtin_ia32_getexpph512_mask(
1035 (__v32hf
)__A
, (__v32hf
)_mm512_undefined_ph(), (__mmask32
)-1,
1036 _MM_FROUND_CUR_DIRECTION
);
1039 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1040 _mm512_mask_getexp_ph(__m512h __W
, __mmask32 __U
, __m512h __A
) {
1041 return (__m512h
)__builtin_ia32_getexpph512_mask(
1042 (__v32hf
)__A
, (__v32hf
)__W
, (__mmask32
)__U
, _MM_FROUND_CUR_DIRECTION
);
1045 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1046 _mm512_maskz_getexp_ph(__mmask32 __U
, __m512h __A
) {
1047 return (__m512h
)__builtin_ia32_getexpph512_mask(
1048 (__v32hf
)__A
, (__v32hf
)_mm512_setzero_ph(), (__mmask32
)__U
,
1049 _MM_FROUND_CUR_DIRECTION
);
1052 #define _mm512_getexp_round_ph(A, R) \
1053 ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \
1054 (__v32hf)_mm512_undefined_ph(), \
1055 (__mmask32)-1, (int)(R)))
1057 #define _mm512_mask_getexp_round_ph(W, U, A, R) \
1058 ((__m512h)__builtin_ia32_getexpph512_mask( \
1059 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(W), (__mmask32)(U), (int)(R)))
1061 #define _mm512_maskz_getexp_round_ph(U, A, R) \
1062 ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \
1063 (__v32hf)_mm512_setzero_ph(), \
1064 (__mmask32)(U), (int)(R)))
1066 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_scalef_ph(__m512h __A
,
1068 return (__m512h
)__builtin_ia32_scalefph512_mask(
1069 (__v32hf
)__A
, (__v32hf
)__B
, (__v32hf
)_mm512_undefined_ph(), (__mmask32
)-1,
1070 _MM_FROUND_CUR_DIRECTION
);
1073 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1074 _mm512_mask_scalef_ph(__m512h __W
, __mmask32 __U
, __m512h __A
, __m512h __B
) {
1075 return (__m512h
)__builtin_ia32_scalefph512_mask((__v32hf
)__A
, (__v32hf
)__B
,
1076 (__v32hf
)__W
, (__mmask32
)__U
,
1077 _MM_FROUND_CUR_DIRECTION
);
1080 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1081 _mm512_maskz_scalef_ph(__mmask32 __U
, __m512h __A
, __m512h __B
) {
1082 return (__m512h
)__builtin_ia32_scalefph512_mask(
1083 (__v32hf
)__A
, (__v32hf
)__B
, (__v32hf
)_mm512_setzero_ph(), (__mmask32
)__U
,
1084 _MM_FROUND_CUR_DIRECTION
);
1087 #define _mm512_scalef_round_ph(A, B, R) \
1088 ((__m512h)__builtin_ia32_scalefph512_mask( \
1089 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
1090 (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))
1092 #define _mm512_mask_scalef_round_ph(W, U, A, B, R) \
1093 ((__m512h)__builtin_ia32_scalefph512_mask( \
1094 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(W), \
1095 (__mmask32)(U), (int)(R)))
1097 #define _mm512_maskz_scalef_round_ph(U, A, B, R) \
1098 ((__m512h)__builtin_ia32_scalefph512_mask( \
1099 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
1100 (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
/* roundscale: round each FP16 element to a given number of fraction bits
 * (vrndscaleph). The immediate encodes rounding mode and scale. */
#define _mm512_roundscale_ph(A, B)                                             \
  ((__m512h)__builtin_ia32_rndscaleph_mask(                                    \
      (__v32hf)(__m512h)(A), (int)(B), (__v32hf)(__m512h)(A), (__mmask32)-1,   \
      _MM_FROUND_CUR_DIRECTION))

/* Merge-masked: unselected lanes come from A (the passthru operand). */
#define _mm512_mask_roundscale_ph(A, B, C, imm)                                \
  ((__m512h)__builtin_ia32_rndscaleph_mask(                                    \
      (__v32hf)(__m512h)(C), (int)(imm), (__v32hf)(__m512h)(A),                \
      (__mmask32)(B), _MM_FROUND_CUR_DIRECTION))

/* Zero-masked: unselected lanes are zeroed. */
#define _mm512_maskz_roundscale_ph(A, B, imm)                                  \
  ((__m512h)__builtin_ia32_rndscaleph_mask(                                    \
      (__v32hf)(__m512h)(B), (int)(imm), (__v32hf)_mm512_setzero_ph(),         \
      (__mmask32)(A), _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_roundscale_round_ph(A, B, C, imm, R)                       \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(C), (int)(imm),  \
                                           (__v32hf)(__m512h)(A),              \
                                           (__mmask32)(B), (int)(R)))

#define _mm512_maskz_roundscale_round_ph(A, B, imm, R)                         \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(B), (int)(imm),  \
                                           (__v32hf)_mm512_setzero_ph(),       \
                                           (__mmask32)(A), (int)(R)))

#define _mm512_roundscale_round_ph(A, imm, R)                                  \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(A), (int)(imm),  \
                                           (__v32hf)_mm512_undefined_ph(),     \
                                           (__mmask32)-1, (int)(R)))

/* reduce: compute the reduction x - roundscale(x) per element (vreduceph). */
#define _mm512_reduce_ph(A, imm)                                               \
  ((__m512h)__builtin_ia32_reduceph512_mask(                                   \
      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_undefined_ph(),       \
      (__mmask32)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_reduce_ph(W, U, A, imm)                                    \
  ((__m512h)__builtin_ia32_reduceph512_mask(                                   \
      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)(__m512h)(W),                \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_reduce_ph(U, A, imm)                                      \
  ((__m512h)__builtin_ia32_reduceph512_mask(                                   \
      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_setzero_ph(),         \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_reduce_round_ph(W, U, A, imm, R)                           \
  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                            (__v32hf)(__m512h)(W),             \
                                            (__mmask32)(U), (int)(R)))

#define _mm512_maskz_reduce_round_ph(U, A, imm, R)                             \
  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                            (__v32hf)_mm512_setzero_ph(),      \
                                            (__mmask32)(U), (int)(R)))

#define _mm512_reduce_round_ph(A, imm, R)                                      \
  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                            (__v32hf)_mm512_undefined_ph(),    \
                                            (__mmask32)-1, (int)(R)))
1162 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_rcp_sh(__m128h __A
,
1164 return (__m128h
)__builtin_ia32_rcpsh_mask(
1165 (__v8hf
)__A
, (__v8hf
)__B
, (__v8hf
)_mm_setzero_ph(), (__mmask8
)-1);
1168 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_rcp_sh(__m128h __W
,
1172 return (__m128h
)__builtin_ia32_rcpsh_mask((__v8hf
)__A
, (__v8hf
)__B
,
1173 (__v8hf
)__W
, (__mmask8
)__U
);
1176 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_rcp_sh(__mmask8 __U
,
1179 return (__m128h
)__builtin_ia32_rcpsh_mask(
1180 (__v8hf
)__A
, (__v8hf
)__B
, (__v8hf
)_mm_setzero_ph(), (__mmask8
)__U
);
1183 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_rsqrt_sh(__m128h __A
,
1185 return (__m128h
)__builtin_ia32_rsqrtsh_mask(
1186 (__v8hf
)__A
, (__v8hf
)__B
, (__v8hf
)_mm_setzero_ph(), (__mmask8
)-1);
1189 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_rsqrt_sh(__m128h __W
,
1193 return (__m128h
)__builtin_ia32_rsqrtsh_mask((__v8hf
)__A
, (__v8hf
)__B
,
1194 (__v8hf
)__W
, (__mmask8
)__U
);
1197 static __inline__ __m128h __DEFAULT_FN_ATTRS128
1198 _mm_maskz_rsqrt_sh(__mmask8 __U
, __m128h __A
, __m128h __B
) {
1199 return (__m128h
)__builtin_ia32_rsqrtsh_mask(
1200 (__v8hf
)__A
, (__v8hf
)__B
, (__v8hf
)_mm_setzero_ph(), (__mmask8
)__U
);
/* Scalar getmant: normalized mantissa of the low FP16 element of B, upper
 * elements copied from A. C selects the interval (imm bits [1:0]) and D the
 * sign control (bits [3:2]). */
#define _mm_getmant_round_sh(A, B, C, D, R)                                    \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)_mm_setzero_ph(), (__mmask8)-1, (int)(R)))

#define _mm_getmant_sh(A, B, C, D)                                             \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)_mm_setzero_ph(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_sh(W, U, A, B, C, D)                                  \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)(__m128h)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_round_sh(W, U, A, B, C, D, R)                         \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)(__m128h)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_getmant_sh(U, A, B, C, D)                                    \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)_mm_setzero_ph(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_getmant_round_sh(U, A, B, C, D, R)                           \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

/* Scalar getexp with explicit rounding control. */
#define _mm_getexp_round_sh(A, B, R)                                           \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask(                             \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))
1238 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_getexp_sh(__m128h __A
,
1240 return (__m128h
)__builtin_ia32_getexpsh128_round_mask(
1241 (__v8hf
)__A
, (__v8hf
)__B
, (__v8hf
)_mm_setzero_ph(), (__mmask8
)-1,
1242 _MM_FROUND_CUR_DIRECTION
);
1245 static __inline__ __m128h __DEFAULT_FN_ATTRS128
1246 _mm_mask_getexp_sh(__m128h __W
, __mmask8 __U
, __m128h __A
, __m128h __B
) {
1247 return (__m128h
)__builtin_ia32_getexpsh128_round_mask(
1248 (__v8hf
)__A
, (__v8hf
)__B
, (__v8hf
)__W
, (__mmask8
)__U
,
1249 _MM_FROUND_CUR_DIRECTION
);
1252 #define _mm_mask_getexp_round_sh(W, U, A, B, R) \
1253 ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
1254 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
1255 (__mmask8)(U), (int)(R)))
1257 static __inline__ __m128h __DEFAULT_FN_ATTRS128
1258 _mm_maskz_getexp_sh(__mmask8 __U
, __m128h __A
, __m128h __B
) {
1259 return (__m128h
)__builtin_ia32_getexpsh128_round_mask(
1260 (__v8hf
)__A
, (__v8hf
)__B
, (__v8hf
)_mm_setzero_ph(), (__mmask8
)__U
,
1261 _MM_FROUND_CUR_DIRECTION
);
1264 #define _mm_maskz_getexp_round_sh(U, A, B, R) \
1265 ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
1266 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1267 (__mmask8)(U), (int)(R)))
1269 #define _mm_scalef_round_sh(A, B, R) \
1270 ((__m128h)__builtin_ia32_scalefsh_round_mask( \
1271 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1272 (__mmask8)-1, (int)(R)))
1274 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_scalef_sh(__m128h __A
,
1276 return (__m128h
)__builtin_ia32_scalefsh_round_mask(
1277 (__v8hf
)__A
, (__v8hf
)(__B
), (__v8hf
)_mm_setzero_ph(), (__mmask8
)-1,
1278 _MM_FROUND_CUR_DIRECTION
);
1281 static __inline__ __m128h __DEFAULT_FN_ATTRS128
1282 _mm_mask_scalef_sh(__m128h __W
, __mmask8 __U
, __m128h __A
, __m128h __B
) {
1283 return (__m128h
)__builtin_ia32_scalefsh_round_mask((__v8hf
)__A
, (__v8hf
)__B
,
1284 (__v8hf
)__W
, (__mmask8
)__U
,
1285 _MM_FROUND_CUR_DIRECTION
);
1288 #define _mm_mask_scalef_round_sh(W, U, A, B, R) \
1289 ((__m128h)__builtin_ia32_scalefsh_round_mask( \
1290 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
1291 (__mmask8)(U), (int)(R)))
1293 static __inline__ __m128h __DEFAULT_FN_ATTRS128
1294 _mm_maskz_scalef_sh(__mmask8 __U
, __m128h __A
, __m128h __B
) {
1295 return (__m128h
)__builtin_ia32_scalefsh_round_mask(
1296 (__v8hf
)__A
, (__v8hf
)__B
, (__v8hf
)_mm_setzero_ph(), (__mmask8
)__U
,
1297 _MM_FROUND_CUR_DIRECTION
);
1300 #define _mm_maskz_scalef_round_sh(U, A, B, R) \
1301 ((__m128h)__builtin_ia32_scalefsh_round_mask( \
1302 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1303 (__mmask8)(U), (int)(R)))
/* Scalar roundscale: round the low FP16 element of B per the immediate,
 * upper elements copied from A (vrndscalesh). */
#define _mm_roundscale_round_sh(A, B, imm, R)                                  \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(imm), (int)(R)))

#define _mm_roundscale_sh(A, B, imm)                                           \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(imm), _MM_FROUND_CUR_DIRECTION))

/* Merge-masked: low lane comes from W when mask bit 0 is clear. */
#define _mm_mask_roundscale_sh(W, U, A, B, I)                                  \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_round_sh(W, U, A, B, I, R)                         \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(I), (int)(R)))

/* Zero-masked: low lane zeroed when mask bit 0 is clear. */
#define _mm_maskz_roundscale_sh(U, A, B, I)                                    \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_roundscale_round_sh(U, A, B, I, R)                           \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(I), (int)(R)))

/* Scalar reduce: low lane gets B[0] - roundscale(B[0]) (vreducesh). */
#define _mm_reduce_sh(A, B, C)                                                 \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(C), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_reduce_sh(W, U, A, B, C)                                      \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_reduce_sh(U, A, B, C)                                        \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))

#define _mm_reduce_round_sh(A, B, C, R)                                        \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(C), (int)(R)))

#define _mm_mask_reduce_round_sh(W, U, A, B, C, R)                             \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(C), (int)(R)))

#define _mm_maskz_reduce_round_sh(U, A, B, C, R)                               \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(C), (int)(R)))
/* Packed FP16 square root with explicit rounding control (vsqrtph). The
 * masked forms wrap the unmasked builtin in a lane select. */
#define _mm512_sqrt_round_ph(A, R)                                             \
  ((__m512h)__builtin_ia32_sqrtph512((__v32hf)(__m512h)(A), (int)(R)))

#define _mm512_mask_sqrt_round_ph(W, U, A, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)),                 \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_sqrt_round_ph(U, A, R)                                    \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)),                 \
      (__v32hf)_mm512_setzero_ph()))
1378 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_sqrt_ph(__m512h __A
) {
1379 return (__m512h
)__builtin_ia32_sqrtph512((__v32hf
)__A
,
1380 _MM_FROUND_CUR_DIRECTION
);
1383 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1384 _mm512_mask_sqrt_ph(__m512h __W
, __mmask32 __U
, __m512h __A
) {
1385 return (__m512h
)__builtin_ia32_selectph_512(
1387 (__v32hf
)__builtin_ia32_sqrtph512((__A
), (_MM_FROUND_CUR_DIRECTION
)),
1388 (__v32hf
)(__m512h
)(__W
));
1391 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1392 _mm512_maskz_sqrt_ph(__mmask32 __U
, __m512h __A
) {
1393 return (__m512h
)__builtin_ia32_selectph_512(
1395 (__v32hf
)__builtin_ia32_sqrtph512((__A
), (_MM_FROUND_CUR_DIRECTION
)),
1396 (__v32hf
)_mm512_setzero_ph());
1399 #define _mm_sqrt_round_sh(A, B, R) \
1400 ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
1401 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1402 (__mmask8)-1, (int)(R)))
1404 #define _mm_mask_sqrt_round_sh(W, U, A, B, R) \
1405 ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
1406 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
1407 (__mmask8)(U), (int)(R)))
1409 #define _mm_maskz_sqrt_round_sh(U, A, B, R) \
1410 ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
1411 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1412 (__mmask8)(U), (int)(R)))
1414 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_sqrt_sh(__m128h __A
,
1416 return (__m128h
)__builtin_ia32_sqrtsh_round_mask(
1417 (__v8hf
)(__m128h
)(__A
), (__v8hf
)(__m128h
)(__B
), (__v8hf
)_mm_setzero_ph(),
1418 (__mmask8
)-1, _MM_FROUND_CUR_DIRECTION
);
1421 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_sqrt_sh(__m128h __W
,
1425 return (__m128h
)__builtin_ia32_sqrtsh_round_mask(
1426 (__v8hf
)(__m128h
)(__A
), (__v8hf
)(__m128h
)(__B
), (__v8hf
)(__m128h
)(__W
),
1427 (__mmask8
)(__U
), _MM_FROUND_CUR_DIRECTION
);
1430 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_sqrt_sh(__mmask32 __U
,
1433 return (__m128h
)__builtin_ia32_sqrtsh_round_mask(
1434 (__v8hf
)(__m128h
)(__A
), (__v8hf
)(__m128h
)(__B
), (__v8hf
)_mm_setzero_ph(),
1435 (__mmask8
)(__U
), _MM_FROUND_CUR_DIRECTION
);
/* fpclass: test each FP16 element (or the low scalar element) against the
 * categories selected by imm (QNaN, zero, denormal, infinity, negative,
 * SNaN per the vfpclass immediate encoding), producing a mask. */
#define _mm512_mask_fpclass_ph_mask(U, A, imm)                                 \
  ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A),          \
                                               (int)(imm), (__mmask32)(U)))

#define _mm512_fpclass_ph_mask(A, imm)                                         \
  ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A),          \
                                               (int)(imm), (__mmask32)-1))

#define _mm_fpclass_sh_mask(A, imm)                                            \
  ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm),   \
                                           (__mmask8)-1))

#define _mm_mask_fpclass_sh_mask(U, A, imm)                                    \
  ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm),   \
                                           (__mmask8)(U)))
1454 #define _mm512_cvt_roundpd_ph(A, R) \
1455 ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
1456 (__v8df)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
1458 #define _mm512_mask_cvt_roundpd_ph(W, U, A, R) \
1459 ((__m128h)__builtin_ia32_vcvtpd2ph512_mask((__v8df)(A), (__v8hf)(W), \
1460 (__mmask8)(U), (int)(R)))
1462 #define _mm512_maskz_cvt_roundpd_ph(U, A, R) \
1463 ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
1464 (__v8df)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
1466 static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_cvtpd_ph(__m512d __A
) {
1467 return (__m128h
)__builtin_ia32_vcvtpd2ph512_mask(
1468 (__v8df
)__A
, (__v8hf
)_mm_setzero_ph(), (__mmask8
)-1,
1469 _MM_FROUND_CUR_DIRECTION
);
1472 static __inline__ __m128h __DEFAULT_FN_ATTRS512
1473 _mm512_mask_cvtpd_ph(__m128h __W
, __mmask8 __U
, __m512d __A
) {
1474 return (__m128h
)__builtin_ia32_vcvtpd2ph512_mask(
1475 (__v8df
)__A
, (__v8hf
)__W
, (__mmask8
)__U
, _MM_FROUND_CUR_DIRECTION
);
1478 static __inline__ __m128h __DEFAULT_FN_ATTRS512
1479 _mm512_maskz_cvtpd_ph(__mmask8 __U
, __m512d __A
) {
1480 return (__m128h
)__builtin_ia32_vcvtpd2ph512_mask(
1481 (__v8df
)__A
, (__v8hf
)_mm_setzero_ph(), (__mmask8
)__U
,
1482 _MM_FROUND_CUR_DIRECTION
);
1485 #define _mm512_cvt_roundph_pd(A, R) \
1486 ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
1487 (__v8hf)(A), (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), (int)(R)))
1489 #define _mm512_mask_cvt_roundph_pd(W, U, A, R) \
1490 ((__m512d)__builtin_ia32_vcvtph2pd512_mask((__v8hf)(A), (__v8df)(W), \
1491 (__mmask8)(U), (int)(R)))
1493 #define _mm512_maskz_cvt_roundph_pd(U, A, R) \
1494 ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
1495 (__v8hf)(A), (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R)))
1497 static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_cvtph_pd(__m128h __A
) {
1498 return (__m512d
)__builtin_ia32_vcvtph2pd512_mask(
1499 (__v8hf
)__A
, (__v8df
)_mm512_setzero_pd(), (__mmask8
)-1,
1500 _MM_FROUND_CUR_DIRECTION
);
1503 static __inline__ __m512d __DEFAULT_FN_ATTRS512
1504 _mm512_mask_cvtph_pd(__m512d __W
, __mmask8 __U
, __m128h __A
) {
1505 return (__m512d
)__builtin_ia32_vcvtph2pd512_mask(
1506 (__v8hf
)__A
, (__v8df
)__W
, (__mmask8
)__U
, _MM_FROUND_CUR_DIRECTION
);
1509 static __inline__ __m512d __DEFAULT_FN_ATTRS512
1510 _mm512_maskz_cvtph_pd(__mmask8 __U
, __m128h __A
) {
1511 return (__m512d
)__builtin_ia32_vcvtph2pd512_mask(
1512 (__v8hf
)__A
, (__v8df
)_mm512_setzero_pd(), (__mmask8
)__U
,
1513 _MM_FROUND_CUR_DIRECTION
);
1516 #define _mm_cvt_roundsh_ss(A, B, R) \
1517 ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \
1518 (__v4sf)_mm_undefined_ps(), \
1519 (__mmask8)(-1), (int)(R)))
1521 #define _mm_mask_cvt_roundsh_ss(W, U, A, B, R) \
1522 ((__m128)__builtin_ia32_vcvtsh2ss_round_mask( \
1523 (__v4sf)(A), (__v8hf)(B), (__v4sf)(W), (__mmask8)(U), (int)(R)))
1525 #define _mm_maskz_cvt_roundsh_ss(U, A, B, R) \
1526 ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \
1527 (__v4sf)_mm_setzero_ps(), \
1528 (__mmask8)(U), (int)(R)))
1530 static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtsh_ss(__m128 __A
,
1532 return (__m128
)__builtin_ia32_vcvtsh2ss_round_mask(
1533 (__v4sf
)__A
, (__v8hf
)__B
, (__v4sf
)_mm_undefined_ps(), (__mmask8
)-1,
1534 _MM_FROUND_CUR_DIRECTION
);
1537 static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_cvtsh_ss(__m128 __W
,
1541 return (__m128
)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf
)__A
, (__v8hf
)__B
,
1542 (__v4sf
)__W
, (__mmask8
)__U
,
1543 _MM_FROUND_CUR_DIRECTION
);
1546 static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_cvtsh_ss(__mmask8 __U
,
1549 return (__m128
)__builtin_ia32_vcvtsh2ss_round_mask(
1550 (__v4sf
)__A
, (__v8hf
)__B
, (__v4sf
)_mm_setzero_ps(), (__mmask8
)__U
,
1551 _MM_FROUND_CUR_DIRECTION
);
1554 #define _mm_cvt_roundss_sh(A, B, R) \
1555 ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \
1556 (__v8hf)_mm_undefined_ph(), \
1557 (__mmask8)(-1), (int)(R)))
1559 #define _mm_mask_cvt_roundss_sh(W, U, A, B, R) \
1560 ((__m128h)__builtin_ia32_vcvtss2sh_round_mask( \
1561 (__v8hf)(A), (__v4sf)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))
1563 #define _mm_maskz_cvt_roundss_sh(U, A, B, R) \
1564 ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \
1565 (__v8hf)_mm_setzero_ph(), \
1566 (__mmask8)(U), (int)(R)))
1568 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_cvtss_sh(__m128h __A
,
1570 return (__m128h
)__builtin_ia32_vcvtss2sh_round_mask(
1571 (__v8hf
)__A
, (__v4sf
)__B
, (__v8hf
)_mm_undefined_ph(), (__mmask8
)-1,
1572 _MM_FROUND_CUR_DIRECTION
);
1575 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_cvtss_sh(__m128h __W
,
1579 return (__m128h
)__builtin_ia32_vcvtss2sh_round_mask(
1580 (__v8hf
)__A
, (__v4sf
)__B
, (__v8hf
)__W
, (__mmask8
)__U
,
1581 _MM_FROUND_CUR_DIRECTION
);
1584 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtss_sh(__mmask8 __U
,
1587 return (__m128h
)__builtin_ia32_vcvtss2sh_round_mask(
1588 (__v8hf
)__A
, (__v4sf
)__B
, (__v8hf
)_mm_setzero_ph(), (__mmask8
)__U
,
1589 _MM_FROUND_CUR_DIRECTION
);
1592 #define _mm_cvt_roundsd_sh(A, B, R) \
1593 ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \
1594 (__v8hf)_mm_undefined_ph(), \
1595 (__mmask8)(-1), (int)(R)))
1597 #define _mm_mask_cvt_roundsd_sh(W, U, A, B, R) \
1598 ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask( \
1599 (__v8hf)(A), (__v2df)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))
1601 #define _mm_maskz_cvt_roundsd_sh(U, A, B, R) \
1602 ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \
1603 (__v8hf)_mm_setzero_ph(), \
1604 (__mmask8)(U), (int)(R)))
1606 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_cvtsd_sh(__m128h __A
,
1608 return (__m128h
)__builtin_ia32_vcvtsd2sh_round_mask(
1609 (__v8hf
)__A
, (__v2df
)__B
, (__v8hf
)_mm_undefined_ph(), (__mmask8
)-1,
1610 _MM_FROUND_CUR_DIRECTION
);
1613 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_cvtsd_sh(__m128h __W
,
1617 return (__m128h
)__builtin_ia32_vcvtsd2sh_round_mask(
1618 (__v8hf
)__A
, (__v2df
)__B
, (__v8hf
)__W
, (__mmask8
)__U
,
1619 _MM_FROUND_CUR_DIRECTION
);
1622 static __inline__ __m128h __DEFAULT_FN_ATTRS128
1623 _mm_maskz_cvtsd_sh(__mmask8 __U
, __m128h __A
, __m128d __B
) {
1624 return (__m128h
)__builtin_ia32_vcvtsd2sh_round_mask(
1625 (__v8hf
)__A
, (__v2df
)__B
, (__v8hf
)_mm_setzero_ph(), (__mmask8
)__U
,
1626 _MM_FROUND_CUR_DIRECTION
);
1629 #define _mm_cvt_roundsh_sd(A, B, R) \
1630 ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \
1631 (__v2df)_mm_undefined_pd(), \
1632 (__mmask8)(-1), (int)(R)))
1634 #define _mm_mask_cvt_roundsh_sd(W, U, A, B, R) \
1635 ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask( \
1636 (__v2df)(A), (__v8hf)(B), (__v2df)(W), (__mmask8)(U), (int)(R)))
1638 #define _mm_maskz_cvt_roundsh_sd(U, A, B, R) \
1639 ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \
1640 (__v2df)_mm_setzero_pd(), \
1641 (__mmask8)(U), (int)(R)))
1643 static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_cvtsh_sd(__m128d __A
,
1645 return (__m128d
)__builtin_ia32_vcvtsh2sd_round_mask(
1646 (__v2df
)__A
, (__v8hf
)__B
, (__v2df
)_mm_undefined_pd(), (__mmask8
)-1,
1647 _MM_FROUND_CUR_DIRECTION
);
1650 static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_cvtsh_sd(__m128d __W
,
1654 return (__m128d
)__builtin_ia32_vcvtsh2sd_round_mask(
1655 (__v2df
)__A
, (__v8hf
)__B
, (__v2df
)__W
, (__mmask8
)__U
,
1656 _MM_FROUND_CUR_DIRECTION
);
1659 static __inline__ __m128d __DEFAULT_FN_ATTRS128
1660 _mm_maskz_cvtsh_sd(__mmask8 __U
, __m128d __A
, __m128h __B
) {
1661 return (__m128d
)__builtin_ia32_vcvtsh2sd_round_mask(
1662 (__v2df
)__A
, (__v8hf
)__B
, (__v2df
)_mm_setzero_pd(), (__mmask8
)__U
,
1663 _MM_FROUND_CUR_DIRECTION
);
1666 #define _mm512_cvt_roundph_epi16(A, R) \
1667 ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \
1668 (__v32hi)_mm512_undefined_epi32(), \
1669 (__mmask32)(-1), (int)(R)))
1671 #define _mm512_mask_cvt_roundph_epi16(W, U, A, R) \
1672 ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), (__v32hi)(W), \
1673 (__mmask32)(U), (int)(R)))
1675 #define _mm512_maskz_cvt_roundph_epi16(U, A, R) \
1676 ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \
1677 (__v32hi)_mm512_setzero_epi32(), \
1678 (__mmask32)(U), (int)(R)))
1680 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1681 _mm512_cvtph_epi16(__m512h __A
) {
1682 return (__m512i
)__builtin_ia32_vcvtph2w512_mask(
1683 (__v32hf
)__A
, (__v32hi
)_mm512_setzero_epi32(), (__mmask32
)-1,
1684 _MM_FROUND_CUR_DIRECTION
);
1687 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1688 _mm512_mask_cvtph_epi16(__m512i __W
, __mmask32 __U
, __m512h __A
) {
1689 return (__m512i
)__builtin_ia32_vcvtph2w512_mask(
1690 (__v32hf
)__A
, (__v32hi
)__W
, (__mmask32
)__U
, _MM_FROUND_CUR_DIRECTION
);
1693 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1694 _mm512_maskz_cvtph_epi16(__mmask32 __U
, __m512h __A
) {
1695 return (__m512i
)__builtin_ia32_vcvtph2w512_mask(
1696 (__v32hf
)__A
, (__v32hi
)_mm512_setzero_epi32(), (__mmask32
)__U
,
1697 _MM_FROUND_CUR_DIRECTION
);
1700 #define _mm512_cvtt_roundph_epi16(A, R) \
1701 ((__m512i)__builtin_ia32_vcvttph2w512_mask( \
1702 (__v32hf)(A), (__v32hi)_mm512_undefined_epi32(), (__mmask32)(-1), \
1705 #define _mm512_mask_cvtt_roundph_epi16(W, U, A, R) \
1706 ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), (__v32hi)(W), \
1707 (__mmask32)(U), (int)(R)))
1709 #define _mm512_maskz_cvtt_roundph_epi16(U, A, R) \
1710 ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), \
1711 (__v32hi)_mm512_setzero_epi32(), \
1712 (__mmask32)(U), (int)(R)))
1714 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1715 _mm512_cvttph_epi16(__m512h __A
) {
1716 return (__m512i
)__builtin_ia32_vcvttph2w512_mask(
1717 (__v32hf
)__A
, (__v32hi
)_mm512_setzero_epi32(), (__mmask32
)-1,
1718 _MM_FROUND_CUR_DIRECTION
);
1721 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1722 _mm512_mask_cvttph_epi16(__m512i __W
, __mmask32 __U
, __m512h __A
) {
1723 return (__m512i
)__builtin_ia32_vcvttph2w512_mask(
1724 (__v32hf
)__A
, (__v32hi
)__W
, (__mmask32
)__U
, _MM_FROUND_CUR_DIRECTION
);
1727 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1728 _mm512_maskz_cvttph_epi16(__mmask32 __U
, __m512h __A
) {
1729 return (__m512i
)__builtin_ia32_vcvttph2w512_mask(
1730 (__v32hf
)__A
, (__v32hi
)_mm512_setzero_epi32(), (__mmask32
)__U
,
1731 _MM_FROUND_CUR_DIRECTION
);
1734 #define _mm512_cvt_roundepi16_ph(A, R) \
1735 ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), \
1736 (__v32hf)_mm512_undefined_ph(), \
1737 (__mmask32)(-1), (int)(R)))
1739 #define _mm512_mask_cvt_roundepi16_ph(W, U, A, R) \
1740 ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), (__v32hf)(W), \
1741 (__mmask32)(U), (int)(R)))
1743 #define _mm512_maskz_cvt_roundepi16_ph(U, A, R) \
1744 ((__m512h)__builtin_ia32_vcvtw2ph512_mask( \
1745 (__v32hi)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1747 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1748 _mm512_cvtepi16_ph(__m512i __A
) {
1749 return (__m512h
)__builtin_ia32_vcvtw2ph512_mask(
1750 (__v32hi
)__A
, (__v32hf
)_mm512_setzero_ph(), (__mmask32
)-1,
1751 _MM_FROUND_CUR_DIRECTION
);
1754 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1755 _mm512_mask_cvtepi16_ph(__m512h __W
, __mmask32 __U
, __m512i __A
) {
1756 return (__m512h
)__builtin_ia32_vcvtw2ph512_mask(
1757 (__v32hi
)__A
, (__v32hf
)__W
, (__mmask32
)__U
, _MM_FROUND_CUR_DIRECTION
);
1760 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1761 _mm512_maskz_cvtepi16_ph(__mmask32 __U
, __m512i __A
) {
1762 return (__m512h
)__builtin_ia32_vcvtw2ph512_mask(
1763 (__v32hi
)__A
, (__v32hf
)_mm512_setzero_ph(), (__mmask32
)__U
,
1764 _MM_FROUND_CUR_DIRECTION
);
1767 #define _mm512_cvt_roundph_epu16(A, R) \
1768 ((__m512i)__builtin_ia32_vcvtph2uw512_mask( \
1769 (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \
1772 #define _mm512_mask_cvt_roundph_epu16(W, U, A, R) \
1773 ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), (__v32hu)(W), \
1774 (__mmask32)(U), (int)(R)))
1776 #define _mm512_maskz_cvt_roundph_epu16(U, A, R) \
1777 ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), \
1778 (__v32hu)_mm512_setzero_epi32(), \
1779 (__mmask32)(U), (int)(R)))
1781 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1782 _mm512_cvtph_epu16(__m512h __A
) {
1783 return (__m512i
)__builtin_ia32_vcvtph2uw512_mask(
1784 (__v32hf
)__A
, (__v32hu
)_mm512_setzero_epi32(), (__mmask32
)-1,
1785 _MM_FROUND_CUR_DIRECTION
);
1788 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1789 _mm512_mask_cvtph_epu16(__m512i __W
, __mmask32 __U
, __m512h __A
) {
1790 return (__m512i
)__builtin_ia32_vcvtph2uw512_mask(
1791 (__v32hf
)__A
, (__v32hu
)__W
, (__mmask32
)__U
, _MM_FROUND_CUR_DIRECTION
);
1794 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1795 _mm512_maskz_cvtph_epu16(__mmask32 __U
, __m512h __A
) {
1796 return (__m512i
)__builtin_ia32_vcvtph2uw512_mask(
1797 (__v32hf
)__A
, (__v32hu
)_mm512_setzero_epi32(), (__mmask32
)__U
,
1798 _MM_FROUND_CUR_DIRECTION
);
1801 #define _mm512_cvtt_roundph_epu16(A, R) \
1802 ((__m512i)__builtin_ia32_vcvttph2uw512_mask( \
1803 (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \
1806 #define _mm512_mask_cvtt_roundph_epu16(W, U, A, R) \
1807 ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), (__v32hu)(W), \
1808 (__mmask32)(U), (int)(R)))
1810 #define _mm512_maskz_cvtt_roundph_epu16(U, A, R) \
1811 ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), \
1812 (__v32hu)_mm512_setzero_epi32(), \
1813 (__mmask32)(U), (int)(R)))
1815 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1816 _mm512_cvttph_epu16(__m512h __A
) {
1817 return (__m512i
)__builtin_ia32_vcvttph2uw512_mask(
1818 (__v32hf
)__A
, (__v32hu
)_mm512_setzero_epi32(), (__mmask32
)-1,
1819 _MM_FROUND_CUR_DIRECTION
);
1822 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1823 _mm512_mask_cvttph_epu16(__m512i __W
, __mmask32 __U
, __m512h __A
) {
1824 return (__m512i
)__builtin_ia32_vcvttph2uw512_mask(
1825 (__v32hf
)__A
, (__v32hu
)__W
, (__mmask32
)__U
, _MM_FROUND_CUR_DIRECTION
);
1828 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1829 _mm512_maskz_cvttph_epu16(__mmask32 __U
, __m512h __A
) {
1830 return (__m512i
)__builtin_ia32_vcvttph2uw512_mask(
1831 (__v32hf
)__A
, (__v32hu
)_mm512_setzero_epi32(), (__mmask32
)__U
,
1832 _MM_FROUND_CUR_DIRECTION
);
1835 #define _mm512_cvt_roundepu16_ph(A, R) \
1836 ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), \
1837 (__v32hf)_mm512_undefined_ph(), \
1838 (__mmask32)(-1), (int)(R)))
1840 #define _mm512_mask_cvt_roundepu16_ph(W, U, A, R) \
1841 ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), (__v32hf)(W), \
1842 (__mmask32)(U), (int)(R)))
1844 #define _mm512_maskz_cvt_roundepu16_ph(U, A, R) \
1845 ((__m512h)__builtin_ia32_vcvtuw2ph512_mask( \
1846 (__v32hu)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1848 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1849 _mm512_cvtepu16_ph(__m512i __A
) {
1850 return (__m512h
)__builtin_ia32_vcvtuw2ph512_mask(
1851 (__v32hu
)__A
, (__v32hf
)_mm512_setzero_ph(), (__mmask32
)-1,
1852 _MM_FROUND_CUR_DIRECTION
);
1855 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1856 _mm512_mask_cvtepu16_ph(__m512h __W
, __mmask32 __U
, __m512i __A
) {
1857 return (__m512h
)__builtin_ia32_vcvtuw2ph512_mask(
1858 (__v32hu
)__A
, (__v32hf
)__W
, (__mmask32
)__U
, _MM_FROUND_CUR_DIRECTION
);
1861 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1862 _mm512_maskz_cvtepu16_ph(__mmask32 __U
, __m512i __A
) {
1863 return (__m512h
)__builtin_ia32_vcvtuw2ph512_mask(
1864 (__v32hu
)__A
, (__v32hf
)_mm512_setzero_ph(), (__mmask32
)__U
,
1865 _MM_FROUND_CUR_DIRECTION
);
1868 #define _mm512_cvt_roundph_epi32(A, R) \
1869 ((__m512i)__builtin_ia32_vcvtph2dq512_mask( \
1870 (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \
1873 #define _mm512_mask_cvt_roundph_epi32(W, U, A, R) \
1874 ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), (__v16si)(W), \
1875 (__mmask16)(U), (int)(R)))
1877 #define _mm512_maskz_cvt_roundph_epi32(U, A, R) \
1878 ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), \
1879 (__v16si)_mm512_setzero_epi32(), \
1880 (__mmask16)(U), (int)(R)))
1882 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1883 _mm512_cvtph_epi32(__m256h __A
) {
1884 return (__m512i
)__builtin_ia32_vcvtph2dq512_mask(
1885 (__v16hf
)__A
, (__v16si
)_mm512_setzero_epi32(), (__mmask16
)-1,
1886 _MM_FROUND_CUR_DIRECTION
);
1889 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1890 _mm512_mask_cvtph_epi32(__m512i __W
, __mmask16 __U
, __m256h __A
) {
1891 return (__m512i
)__builtin_ia32_vcvtph2dq512_mask(
1892 (__v16hf
)__A
, (__v16si
)__W
, (__mmask16
)__U
, _MM_FROUND_CUR_DIRECTION
);
1895 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1896 _mm512_maskz_cvtph_epi32(__mmask16 __U
, __m256h __A
) {
1897 return (__m512i
)__builtin_ia32_vcvtph2dq512_mask(
1898 (__v16hf
)__A
, (__v16si
)_mm512_setzero_epi32(), (__mmask16
)__U
,
1899 _MM_FROUND_CUR_DIRECTION
);
1902 #define _mm512_cvt_roundph_epu32(A, R) \
1903 ((__m512i)__builtin_ia32_vcvtph2udq512_mask( \
1904 (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \
1907 #define _mm512_mask_cvt_roundph_epu32(W, U, A, R) \
1908 ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), (__v16su)(W), \
1909 (__mmask16)(U), (int)(R)))
1911 #define _mm512_maskz_cvt_roundph_epu32(U, A, R) \
1912 ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), \
1913 (__v16su)_mm512_setzero_epi32(), \
1914 (__mmask16)(U), (int)(R)))
1916 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1917 _mm512_cvtph_epu32(__m256h __A
) {
1918 return (__m512i
)__builtin_ia32_vcvtph2udq512_mask(
1919 (__v16hf
)__A
, (__v16su
)_mm512_setzero_epi32(), (__mmask16
)-1,
1920 _MM_FROUND_CUR_DIRECTION
);
1923 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1924 _mm512_mask_cvtph_epu32(__m512i __W
, __mmask16 __U
, __m256h __A
) {
1925 return (__m512i
)__builtin_ia32_vcvtph2udq512_mask(
1926 (__v16hf
)__A
, (__v16su
)__W
, (__mmask16
)__U
, _MM_FROUND_CUR_DIRECTION
);
1929 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1930 _mm512_maskz_cvtph_epu32(__mmask16 __U
, __m256h __A
) {
1931 return (__m512i
)__builtin_ia32_vcvtph2udq512_mask(
1932 (__v16hf
)__A
, (__v16su
)_mm512_setzero_epi32(), (__mmask16
)__U
,
1933 _MM_FROUND_CUR_DIRECTION
);
1936 #define _mm512_cvt_roundepi32_ph(A, R) \
1937 ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), \
1938 (__v16hf)_mm256_undefined_ph(), \
1939 (__mmask16)(-1), (int)(R)))
1941 #define _mm512_mask_cvt_roundepi32_ph(W, U, A, R) \
1942 ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), (__v16hf)(W), \
1943 (__mmask16)(U), (int)(R)))
1945 #define _mm512_maskz_cvt_roundepi32_ph(U, A, R) \
1946 ((__m256h)__builtin_ia32_vcvtdq2ph512_mask( \
1947 (__v16si)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
1949 static __inline__ __m256h __DEFAULT_FN_ATTRS512
1950 _mm512_cvtepi32_ph(__m512i __A
) {
1951 return (__m256h
)__builtin_ia32_vcvtdq2ph512_mask(
1952 (__v16si
)__A
, (__v16hf
)_mm256_setzero_ph(), (__mmask16
)-1,
1953 _MM_FROUND_CUR_DIRECTION
);
1956 static __inline__ __m256h __DEFAULT_FN_ATTRS512
1957 _mm512_mask_cvtepi32_ph(__m256h __W
, __mmask16 __U
, __m512i __A
) {
1958 return (__m256h
)__builtin_ia32_vcvtdq2ph512_mask(
1959 (__v16si
)__A
, (__v16hf
)__W
, (__mmask16
)__U
, _MM_FROUND_CUR_DIRECTION
);
1962 static __inline__ __m256h __DEFAULT_FN_ATTRS512
1963 _mm512_maskz_cvtepi32_ph(__mmask16 __U
, __m512i __A
) {
1964 return (__m256h
)__builtin_ia32_vcvtdq2ph512_mask(
1965 (__v16si
)__A
, (__v16hf
)_mm256_setzero_ph(), (__mmask16
)__U
,
1966 _MM_FROUND_CUR_DIRECTION
);
1969 #define _mm512_cvt_roundepu32_ph(A, R) \
1970 ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), \
1971 (__v16hf)_mm256_undefined_ph(), \
1972 (__mmask16)(-1), (int)(R)))
1974 #define _mm512_mask_cvt_roundepu32_ph(W, U, A, R) \
1975 ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), (__v16hf)(W), \
1976 (__mmask16)(U), (int)(R)))
1978 #define _mm512_maskz_cvt_roundepu32_ph(U, A, R) \
1979 ((__m256h)__builtin_ia32_vcvtudq2ph512_mask( \
1980 (__v16su)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
1982 static __inline__ __m256h __DEFAULT_FN_ATTRS512
1983 _mm512_cvtepu32_ph(__m512i __A
) {
1984 return (__m256h
)__builtin_ia32_vcvtudq2ph512_mask(
1985 (__v16su
)__A
, (__v16hf
)_mm256_setzero_ph(), (__mmask16
)-1,
1986 _MM_FROUND_CUR_DIRECTION
);
1989 static __inline__ __m256h __DEFAULT_FN_ATTRS512
1990 _mm512_mask_cvtepu32_ph(__m256h __W
, __mmask16 __U
, __m512i __A
) {
1991 return (__m256h
)__builtin_ia32_vcvtudq2ph512_mask(
1992 (__v16su
)__A
, (__v16hf
)__W
, (__mmask16
)__U
, _MM_FROUND_CUR_DIRECTION
);
1995 static __inline__ __m256h __DEFAULT_FN_ATTRS512
1996 _mm512_maskz_cvtepu32_ph(__mmask16 __U
, __m512i __A
) {
1997 return (__m256h
)__builtin_ia32_vcvtudq2ph512_mask(
1998 (__v16su
)__A
, (__v16hf
)_mm256_setzero_ph(), (__mmask16
)__U
,
1999 _MM_FROUND_CUR_DIRECTION
);
2002 #define _mm512_cvtt_roundph_epi32(A, R) \
2003 ((__m512i)__builtin_ia32_vcvttph2dq512_mask( \
2004 (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \
2007 #define _mm512_mask_cvtt_roundph_epi32(W, U, A, R) \
2008 ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), (__v16si)(W), \
2009 (__mmask16)(U), (int)(R)))
2011 #define _mm512_maskz_cvtt_roundph_epi32(U, A, R) \
2012 ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), \
2013 (__v16si)_mm512_setzero_epi32(), \
2014 (__mmask16)(U), (int)(R)))
2016 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2017 _mm512_cvttph_epi32(__m256h __A
) {
2018 return (__m512i
)__builtin_ia32_vcvttph2dq512_mask(
2019 (__v16hf
)__A
, (__v16si
)_mm512_setzero_epi32(), (__mmask16
)-1,
2020 _MM_FROUND_CUR_DIRECTION
);
2023 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2024 _mm512_mask_cvttph_epi32(__m512i __W
, __mmask16 __U
, __m256h __A
) {
2025 return (__m512i
)__builtin_ia32_vcvttph2dq512_mask(
2026 (__v16hf
)__A
, (__v16si
)__W
, (__mmask16
)__U
, _MM_FROUND_CUR_DIRECTION
);
2029 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2030 _mm512_maskz_cvttph_epi32(__mmask16 __U
, __m256h __A
) {
2031 return (__m512i
)__builtin_ia32_vcvttph2dq512_mask(
2032 (__v16hf
)__A
, (__v16si
)_mm512_setzero_epi32(), (__mmask16
)__U
,
2033 _MM_FROUND_CUR_DIRECTION
);
2036 #define _mm512_cvtt_roundph_epu32(A, R) \
2037 ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
2038 (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \
2041 #define _mm512_mask_cvtt_roundph_epu32(W, U, A, R) \
2042 ((__m512i)__builtin_ia32_vcvttph2udq512_mask((__v16hf)(A), (__v16su)(W), \
2043 (__mmask16)(U), (int)(R)))
2045 #define _mm512_maskz_cvtt_roundph_epu32(U, A, R) \
2046 ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
2047 (__v16hf)(A), (__v16su)_mm512_setzero_epi32(), (__mmask16)(U), \
2050 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2051 _mm512_cvttph_epu32(__m256h __A
) {
2052 return (__m512i
)__builtin_ia32_vcvttph2udq512_mask(
2053 (__v16hf
)__A
, (__v16su
)_mm512_setzero_epi32(), (__mmask16
)-1,
2054 _MM_FROUND_CUR_DIRECTION
);
2057 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2058 _mm512_mask_cvttph_epu32(__m512i __W
, __mmask16 __U
, __m256h __A
) {
2059 return (__m512i
)__builtin_ia32_vcvttph2udq512_mask(
2060 (__v16hf
)__A
, (__v16su
)__W
, (__mmask16
)__U
, _MM_FROUND_CUR_DIRECTION
);
2063 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2064 _mm512_maskz_cvttph_epu32(__mmask16 __U
, __m256h __A
) {
2065 return (__m512i
)__builtin_ia32_vcvttph2udq512_mask(
2066 (__v16hf
)__A
, (__v16su
)_mm512_setzero_epi32(), (__mmask16
)__U
,
2067 _MM_FROUND_CUR_DIRECTION
);
2070 #define _mm512_cvt_roundepi64_ph(A, R) \
2071 ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
2072 (__v8di)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
2074 #define _mm512_mask_cvt_roundepi64_ph(W, U, A, R) \
2075 ((__m128h)__builtin_ia32_vcvtqq2ph512_mask((__v8di)(A), (__v8hf)(W), \
2076 (__mmask8)(U), (int)(R)))
2078 #define _mm512_maskz_cvt_roundepi64_ph(U, A, R) \
2079 ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
2080 (__v8di)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
2082 static __inline__ __m128h __DEFAULT_FN_ATTRS512
2083 _mm512_cvtepi64_ph(__m512i __A
) {
2084 return (__m128h
)__builtin_ia32_vcvtqq2ph512_mask(
2085 (__v8di
)__A
, (__v8hf
)_mm_setzero_ph(), (__mmask8
)-1,
2086 _MM_FROUND_CUR_DIRECTION
);
2089 static __inline__ __m128h __DEFAULT_FN_ATTRS512
2090 _mm512_mask_cvtepi64_ph(__m128h __W
, __mmask8 __U
, __m512i __A
) {
2091 return (__m128h
)__builtin_ia32_vcvtqq2ph512_mask(
2092 (__v8di
)__A
, (__v8hf
)__W
, (__mmask8
)__U
, _MM_FROUND_CUR_DIRECTION
);
2095 static __inline__ __m128h __DEFAULT_FN_ATTRS512
2096 _mm512_maskz_cvtepi64_ph(__mmask8 __U
, __m512i __A
) {
2097 return (__m128h
)__builtin_ia32_vcvtqq2ph512_mask(
2098 (__v8di
)__A
, (__v8hf
)_mm_setzero_ph(), (__mmask8
)__U
,
2099 _MM_FROUND_CUR_DIRECTION
);
2102 #define _mm512_cvt_roundph_epi64(A, R) \
2103 ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), \
2104 (__v8di)_mm512_undefined_epi32(), \
2105 (__mmask8)(-1), (int)(R)))
2107 #define _mm512_mask_cvt_roundph_epi64(W, U, A, R) \
2108 ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), (__v8di)(W), \
2109 (__mmask8)(U), (int)(R)))
2111 #define _mm512_maskz_cvt_roundph_epi64(U, A, R) \
2112 ((__m512i)__builtin_ia32_vcvtph2qq512_mask( \
2113 (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2115 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2116 _mm512_cvtph_epi64(__m128h __A
) {
2117 return (__m512i
)__builtin_ia32_vcvtph2qq512_mask(
2118 (__v8hf
)__A
, (__v8di
)_mm512_setzero_epi32(), (__mmask8
)-1,
2119 _MM_FROUND_CUR_DIRECTION
);
2122 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2123 _mm512_mask_cvtph_epi64(__m512i __W
, __mmask8 __U
, __m128h __A
) {
2124 return (__m512i
)__builtin_ia32_vcvtph2qq512_mask(
2125 (__v8hf
)__A
, (__v8di
)__W
, (__mmask8
)__U
, _MM_FROUND_CUR_DIRECTION
);
2128 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2129 _mm512_maskz_cvtph_epi64(__mmask8 __U
, __m128h __A
) {
2130 return (__m512i
)__builtin_ia32_vcvtph2qq512_mask(
2131 (__v8hf
)__A
, (__v8di
)_mm512_setzero_epi32(), (__mmask8
)__U
,
2132 _MM_FROUND_CUR_DIRECTION
);
2135 #define _mm512_cvt_roundepu64_ph(A, R) \
2136 ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
2137 (__v8du)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
2139 #define _mm512_mask_cvt_roundepu64_ph(W, U, A, R) \
2140 ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask((__v8du)(A), (__v8hf)(W), \
2141 (__mmask8)(U), (int)(R)))
2143 #define _mm512_maskz_cvt_roundepu64_ph(U, A, R) \
2144 ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
2145 (__v8du)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
2147 static __inline__ __m128h __DEFAULT_FN_ATTRS512
2148 _mm512_cvtepu64_ph(__m512i __A
) {
2149 return (__m128h
)__builtin_ia32_vcvtuqq2ph512_mask(
2150 (__v8du
)__A
, (__v8hf
)_mm_setzero_ph(), (__mmask8
)-1,
2151 _MM_FROUND_CUR_DIRECTION
);
2154 static __inline__ __m128h __DEFAULT_FN_ATTRS512
2155 _mm512_mask_cvtepu64_ph(__m128h __W
, __mmask8 __U
, __m512i __A
) {
2156 return (__m128h
)__builtin_ia32_vcvtuqq2ph512_mask(
2157 (__v8du
)__A
, (__v8hf
)__W
, (__mmask8
)__U
, _MM_FROUND_CUR_DIRECTION
);
2160 static __inline__ __m128h __DEFAULT_FN_ATTRS512
2161 _mm512_maskz_cvtepu64_ph(__mmask8 __U
, __m512i __A
) {
2162 return (__m128h
)__builtin_ia32_vcvtuqq2ph512_mask(
2163 (__v8du
)__A
, (__v8hf
)_mm_setzero_ph(), (__mmask8
)__U
,
2164 _MM_FROUND_CUR_DIRECTION
);
2167 #define _mm512_cvt_roundph_epu64(A, R) \
2168 ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
2169 (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \
2172 #define _mm512_mask_cvt_roundph_epu64(W, U, A, R) \
2173 ((__m512i)__builtin_ia32_vcvtph2uqq512_mask((__v8hf)(A), (__v8du)(W), \
2174 (__mmask8)(U), (int)(R)))
2176 #define _mm512_maskz_cvt_roundph_epu64(U, A, R) \
2177 ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
2178 (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2180 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2181 _mm512_cvtph_epu64(__m128h __A
) {
2182 return (__m512i
)__builtin_ia32_vcvtph2uqq512_mask(
2183 (__v8hf
)__A
, (__v8du
)_mm512_setzero_epi32(), (__mmask8
)-1,
2184 _MM_FROUND_CUR_DIRECTION
);
2187 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2188 _mm512_mask_cvtph_epu64(__m512i __W
, __mmask8 __U
, __m128h __A
) {
2189 return (__m512i
)__builtin_ia32_vcvtph2uqq512_mask(
2190 (__v8hf
)__A
, (__v8du
)__W
, (__mmask8
)__U
, _MM_FROUND_CUR_DIRECTION
);
2193 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2194 _mm512_maskz_cvtph_epu64(__mmask8 __U
, __m128h __A
) {
2195 return (__m512i
)__builtin_ia32_vcvtph2uqq512_mask(
2196 (__v8hf
)__A
, (__v8du
)_mm512_setzero_epi32(), (__mmask8
)__U
,
2197 _MM_FROUND_CUR_DIRECTION
);
2200 #define _mm512_cvtt_roundph_epi64(A, R) \
2201 ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
2202 (__v8hf)(A), (__v8di)_mm512_undefined_epi32(), (__mmask8)(-1), \
2205 #define _mm512_mask_cvtt_roundph_epi64(W, U, A, R) \
2206 ((__m512i)__builtin_ia32_vcvttph2qq512_mask((__v8hf)(A), (__v8di)(W), \
2207 (__mmask8)(U), (int)(R)))
2209 #define _mm512_maskz_cvtt_roundph_epi64(U, A, R) \
2210 ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
2211 (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2213 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2214 _mm512_cvttph_epi64(__m128h __A
) {
2215 return (__m512i
)__builtin_ia32_vcvttph2qq512_mask(
2216 (__v8hf
)__A
, (__v8di
)_mm512_setzero_epi32(), (__mmask8
)-1,
2217 _MM_FROUND_CUR_DIRECTION
);
2220 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2221 _mm512_mask_cvttph_epi64(__m512i __W
, __mmask8 __U
, __m128h __A
) {
2222 return (__m512i
)__builtin_ia32_vcvttph2qq512_mask(
2223 (__v8hf
)__A
, (__v8di
)__W
, (__mmask8
)__U
, _MM_FROUND_CUR_DIRECTION
);
2226 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2227 _mm512_maskz_cvttph_epi64(__mmask8 __U
, __m128h __A
) {
2228 return (__m512i
)__builtin_ia32_vcvttph2qq512_mask(
2229 (__v8hf
)__A
, (__v8di
)_mm512_setzero_epi32(), (__mmask8
)__U
,
2230 _MM_FROUND_CUR_DIRECTION
);
2233 #define _mm512_cvtt_roundph_epu64(A, R) \
2234 ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
2235 (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \
2238 #define _mm512_mask_cvtt_roundph_epu64(W, U, A, R) \
2239 ((__m512i)__builtin_ia32_vcvttph2uqq512_mask((__v8hf)(A), (__v8du)(W), \
2240 (__mmask8)(U), (int)(R)))
2242 #define _mm512_maskz_cvtt_roundph_epu64(U, A, R) \
2243 ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
2244 (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2246 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2247 _mm512_cvttph_epu64(__m128h __A
) {
2248 return (__m512i
)__builtin_ia32_vcvttph2uqq512_mask(
2249 (__v8hf
)__A
, (__v8du
)_mm512_setzero_epi32(), (__mmask8
)-1,
2250 _MM_FROUND_CUR_DIRECTION
);
2253 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2254 _mm512_mask_cvttph_epu64(__m512i __W
, __mmask8 __U
, __m128h __A
) {
2255 return (__m512i
)__builtin_ia32_vcvttph2uqq512_mask(
2256 (__v8hf
)__A
, (__v8du
)__W
, (__mmask8
)__U
, _MM_FROUND_CUR_DIRECTION
);
2259 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2260 _mm512_maskz_cvttph_epu64(__mmask8 __U
, __m128h __A
) {
2261 return (__m512i
)__builtin_ia32_vcvttph2uqq512_mask(
2262 (__v8hf
)__A
, (__v8du
)_mm512_setzero_epi32(), (__mmask8
)__U
,
2263 _MM_FROUND_CUR_DIRECTION
);
2266 #define _mm_cvt_roundsh_i32(A, R) \
2267 ((int)__builtin_ia32_vcvtsh2si32((__v8hf)(A), (int)(R)))
2269 static __inline__
int __DEFAULT_FN_ATTRS128
_mm_cvtsh_i32(__m128h __A
) {
2270 return (int)__builtin_ia32_vcvtsh2si32((__v8hf
)__A
, _MM_FROUND_CUR_DIRECTION
);
2273 #define _mm_cvt_roundsh_u32(A, R) \
2274 ((unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)(A), (int)(R)))
2276 static __inline__
unsigned int __DEFAULT_FN_ATTRS128
2277 _mm_cvtsh_u32(__m128h __A
) {
2278 return (unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf
)__A
,
2279 _MM_FROUND_CUR_DIRECTION
);
2283 #define _mm_cvt_roundsh_i64(A, R) \
2284 ((long long)__builtin_ia32_vcvtsh2si64((__v8hf)(A), (int)(R)))
2286 static __inline__
long long __DEFAULT_FN_ATTRS128
_mm_cvtsh_i64(__m128h __A
) {
2287 return (long long)__builtin_ia32_vcvtsh2si64((__v8hf
)__A
,
2288 _MM_FROUND_CUR_DIRECTION
);
2291 #define _mm_cvt_roundsh_u64(A, R) \
2292 ((unsigned long long)__builtin_ia32_vcvtsh2usi64((__v8hf)(A), (int)(R)))
2294 static __inline__
unsigned long long __DEFAULT_FN_ATTRS128
2295 _mm_cvtsh_u64(__m128h __A
) {
2296 return (unsigned long long)__builtin_ia32_vcvtsh2usi64(
2297 (__v8hf
)__A
, _MM_FROUND_CUR_DIRECTION
);
2299 #endif // __x86_64__
2301 #define _mm_cvt_roundu32_sh(A, B, R) \
2302 ((__m128h)__builtin_ia32_vcvtusi2sh((__v8hf)(A), (unsigned int)(B), (int)(R)))
2304 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2305 _mm_cvtu32_sh(__m128h __A
, unsigned int __B
) {
2311 #define _mm_cvt_roundu64_sh(A, B, R) \
2312 ((__m128h)__builtin_ia32_vcvtusi642sh((__v8hf)(A), (unsigned long long)(B), \
2315 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2316 _mm_cvtu64_sh(__m128h __A
, unsigned long long __B
) {
2322 #define _mm_cvt_roundi32_sh(A, B, R) \
2323 ((__m128h)__builtin_ia32_vcvtsi2sh((__v8hf)(A), (int)(B), (int)(R)))
2325 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_cvti32_sh(__m128h __A
,
2332 #define _mm_cvt_roundi64_sh(A, B, R) \
2333 ((__m128h)__builtin_ia32_vcvtsi642sh((__v8hf)(A), (long long)(B), (int)(R)))
2335 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_cvti64_sh(__m128h __A
,
2342 #define _mm_cvtt_roundsh_i32(A, R) \
2343 ((int)__builtin_ia32_vcvttsh2si32((__v8hf)(A), (int)(R)))
2345 static __inline__
int __DEFAULT_FN_ATTRS128
_mm_cvttsh_i32(__m128h __A
) {
2346 return (int)__builtin_ia32_vcvttsh2si32((__v8hf
)__A
,
2347 _MM_FROUND_CUR_DIRECTION
);
2351 #define _mm_cvtt_roundsh_i64(A, R) \
2352 ((long long)__builtin_ia32_vcvttsh2si64((__v8hf)(A), (int)(R)))
2354 static __inline__
long long __DEFAULT_FN_ATTRS128
_mm_cvttsh_i64(__m128h __A
) {
2355 return (long long)__builtin_ia32_vcvttsh2si64((__v8hf
)__A
,
2356 _MM_FROUND_CUR_DIRECTION
);
2360 #define _mm_cvtt_roundsh_u32(A, R) \
2361 ((unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)(A), (int)(R)))
2363 static __inline__
unsigned int __DEFAULT_FN_ATTRS128
2364 _mm_cvttsh_u32(__m128h __A
) {
2365 return (unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf
)__A
,
2366 _MM_FROUND_CUR_DIRECTION
);
2370 #define _mm_cvtt_roundsh_u64(A, R) \
2371 ((unsigned long long)__builtin_ia32_vcvttsh2usi64((__v8hf)(A), (int)(R)))
2373 static __inline__
unsigned long long __DEFAULT_FN_ATTRS128
2374 _mm_cvttsh_u64(__m128h __A
) {
2375 return (unsigned long long)__builtin_ia32_vcvttsh2usi64(
2376 (__v8hf
)__A
, _MM_FROUND_CUR_DIRECTION
);
2380 #define _mm512_cvtx_roundph_ps(A, R) \
2381 ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), \
2382 (__v16sf)_mm512_undefined_ps(), \
2383 (__mmask16)(-1), (int)(R)))
2385 #define _mm512_mask_cvtx_roundph_ps(W, U, A, R) \
2386 ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), (__v16sf)(W), \
2387 (__mmask16)(U), (int)(R)))
2389 #define _mm512_maskz_cvtx_roundph_ps(U, A, R) \
2390 ((__m512)__builtin_ia32_vcvtph2psx512_mask( \
2391 (__v16hf)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R)))
2393 static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_cvtxph_ps(__m256h __A
) {
2394 return (__m512
)__builtin_ia32_vcvtph2psx512_mask(
2395 (__v16hf
)__A
, (__v16sf
)_mm512_setzero_ps(), (__mmask16
)-1,
2396 _MM_FROUND_CUR_DIRECTION
);
2399 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2400 _mm512_mask_cvtxph_ps(__m512 __W
, __mmask16 __U
, __m256h __A
) {
2401 return (__m512
)__builtin_ia32_vcvtph2psx512_mask(
2402 (__v16hf
)__A
, (__v16sf
)__W
, (__mmask16
)__U
, _MM_FROUND_CUR_DIRECTION
);
2405 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2406 _mm512_maskz_cvtxph_ps(__mmask16 __U
, __m256h __A
) {
2407 return (__m512
)__builtin_ia32_vcvtph2psx512_mask(
2408 (__v16hf
)__A
, (__v16sf
)_mm512_setzero_ps(), (__mmask16
)__U
,
2409 _MM_FROUND_CUR_DIRECTION
);
2412 #define _mm512_cvtx_roundps_ph(A, R) \
2413 ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), \
2414 (__v16hf)_mm256_undefined_ph(), \
2415 (__mmask16)(-1), (int)(R)))
2417 #define _mm512_mask_cvtx_roundps_ph(W, U, A, R) \
2418 ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), (__v16hf)(W), \
2419 (__mmask16)(U), (int)(R)))
2421 #define _mm512_maskz_cvtx_roundps_ph(U, A, R) \
2422 ((__m256h)__builtin_ia32_vcvtps2phx512_mask( \
2423 (__v16sf)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
2425 static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_cvtxps_ph(__m512 __A
) {
2426 return (__m256h
)__builtin_ia32_vcvtps2phx512_mask(
2427 (__v16sf
)__A
, (__v16hf
)_mm256_setzero_ph(), (__mmask16
)-1,
2428 _MM_FROUND_CUR_DIRECTION
);
2431 static __inline__ __m256h __DEFAULT_FN_ATTRS512
2432 _mm512_mask_cvtxps_ph(__m256h __W
, __mmask16 __U
, __m512 __A
) {
2433 return (__m256h
)__builtin_ia32_vcvtps2phx512_mask(
2434 (__v16sf
)__A
, (__v16hf
)__W
, (__mmask16
)__U
, _MM_FROUND_CUR_DIRECTION
);
2437 static __inline__ __m256h __DEFAULT_FN_ATTRS512
2438 _mm512_maskz_cvtxps_ph(__mmask16 __U
, __m512 __A
) {
2439 return (__m256h
)__builtin_ia32_vcvtps2phx512_mask(
2440 (__v16sf
)__A
, (__v16hf
)_mm256_setzero_ph(), (__mmask16
)__U
,
2441 _MM_FROUND_CUR_DIRECTION
);
/* FMA rounding-control macro forms. All are expressed through the vfmaddph
   builtins; the sign placement on B/C selects fmsub / fnmadd / fnmsub. */
#define _mm512_fmadd_round_ph(A, B, C, R)                                      \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmadd_round_ph(A, U, B, C, R)                              \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_mask3_fmadd_round_ph(A, B, C, U, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_mask3(                                  \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmadd_round_ph(U, A, B, C, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_fmsub_round_ph(A, B, C, R)                                      \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmsub_round_ph(A, U, B, C, R)                              \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmsub_round_ph(U, A, B, C, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

#define _mm512_fnmadd_round_ph(A, B, C, R)                                     \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask3_fnmadd_round_ph(A, B, C, U, R)                            \
  ((__m512h)__builtin_ia32_vfmaddph512_mask3(                                  \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fnmadd_round_ph(U, A, B, C, R)                            \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

#define _mm512_fnmsub_round_ph(A, B, C, R)                                     \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),   \
      (__mmask32)-1, (int)(R)))

#define _mm512_maskz_fnmsub_round_ph(U, A, B, C, R)                            \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),   \
      (__mmask32)(U), (int)(R)))
2504 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fmadd_ph(__m512h __A
,
2507 return (__m512h
)__builtin_ia32_vfmaddph512_mask((__v32hf
)__A
, (__v32hf
)__B
,
2508 (__v32hf
)__C
, (__mmask32
)-1,
2509 _MM_FROUND_CUR_DIRECTION
);
2512 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2513 _mm512_mask_fmadd_ph(__m512h __A
, __mmask32 __U
, __m512h __B
, __m512h __C
) {
2514 return (__m512h
)__builtin_ia32_vfmaddph512_mask((__v32hf
)__A
, (__v32hf
)__B
,
2515 (__v32hf
)__C
, (__mmask32
)__U
,
2516 _MM_FROUND_CUR_DIRECTION
);
2519 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2520 _mm512_mask3_fmadd_ph(__m512h __A
, __m512h __B
, __m512h __C
, __mmask32 __U
) {
2521 return (__m512h
)__builtin_ia32_vfmaddph512_mask3((__v32hf
)__A
, (__v32hf
)__B
,
2522 (__v32hf
)__C
, (__mmask32
)__U
,
2523 _MM_FROUND_CUR_DIRECTION
);
2526 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2527 _mm512_maskz_fmadd_ph(__mmask32 __U
, __m512h __A
, __m512h __B
, __m512h __C
) {
2528 return (__m512h
)__builtin_ia32_vfmaddph512_maskz((__v32hf
)__A
, (__v32hf
)__B
,
2529 (__v32hf
)__C
, (__mmask32
)__U
,
2530 _MM_FROUND_CUR_DIRECTION
);
2533 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fmsub_ph(__m512h __A
,
2536 return (__m512h
)__builtin_ia32_vfmaddph512_mask((__v32hf
)__A
, (__v32hf
)__B
,
2537 -(__v32hf
)__C
, (__mmask32
)-1,
2538 _MM_FROUND_CUR_DIRECTION
);
2541 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2542 _mm512_mask_fmsub_ph(__m512h __A
, __mmask32 __U
, __m512h __B
, __m512h __C
) {
2543 return (__m512h
)__builtin_ia32_vfmaddph512_mask((__v32hf
)__A
, (__v32hf
)__B
,
2544 -(__v32hf
)__C
, (__mmask32
)__U
,
2545 _MM_FROUND_CUR_DIRECTION
);
2548 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2549 _mm512_maskz_fmsub_ph(__mmask32 __U
, __m512h __A
, __m512h __B
, __m512h __C
) {
2550 return (__m512h
)__builtin_ia32_vfmaddph512_maskz(
2551 (__v32hf
)__A
, (__v32hf
)__B
, -(__v32hf
)__C
, (__mmask32
)__U
,
2552 _MM_FROUND_CUR_DIRECTION
);
2555 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fnmadd_ph(__m512h __A
,
2558 return (__m512h
)__builtin_ia32_vfmaddph512_mask((__v32hf
)__A
, -(__v32hf
)__B
,
2559 (__v32hf
)__C
, (__mmask32
)-1,
2560 _MM_FROUND_CUR_DIRECTION
);
2563 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2564 _mm512_mask3_fnmadd_ph(__m512h __A
, __m512h __B
, __m512h __C
, __mmask32 __U
) {
2565 return (__m512h
)__builtin_ia32_vfmaddph512_mask3(-(__v32hf
)__A
, (__v32hf
)__B
,
2566 (__v32hf
)__C
, (__mmask32
)__U
,
2567 _MM_FROUND_CUR_DIRECTION
);
2570 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2571 _mm512_maskz_fnmadd_ph(__mmask32 __U
, __m512h __A
, __m512h __B
, __m512h __C
) {
2572 return (__m512h
)__builtin_ia32_vfmaddph512_maskz(-(__v32hf
)__A
, (__v32hf
)__B
,
2573 (__v32hf
)__C
, (__mmask32
)__U
,
2574 _MM_FROUND_CUR_DIRECTION
);
2577 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fnmsub_ph(__m512h __A
,
2580 return (__m512h
)__builtin_ia32_vfmaddph512_mask((__v32hf
)__A
, -(__v32hf
)__B
,
2581 -(__v32hf
)__C
, (__mmask32
)-1,
2582 _MM_FROUND_CUR_DIRECTION
);
2585 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2586 _mm512_maskz_fnmsub_ph(__mmask32 __U
, __m512h __A
, __m512h __B
, __m512h __C
) {
2587 return (__m512h
)__builtin_ia32_vfmaddph512_maskz(
2588 -(__v32hf
)__A
, (__v32hf
)__B
, -(__v32hf
)__C
, (__mmask32
)__U
,
2589 _MM_FROUND_CUR_DIRECTION
);
/* Rounding-control variants of packed FP16 fused multiply with alternating
   add/subtract of __C across element positions (fmaddsub), and the swapped
   pattern obtained by negating __C (fmsubadd). R selects the rounding mode
   (_MM_FROUND_*). _mask merges from A, _mask3 merges from C, _maskz zeroes
   unselected elements. */
#define _mm512_fmaddsub_round_ph(A, B, C, R)                                   \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmaddsub_round_ph(A, U, B, C, R)                           \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_mask3_fmaddsub_round_ph(A, B, C, U, R)                          \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask3(                               \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmaddsub_round_ph(U, A, B, C, R)                          \
  ((__m512h)__builtin_ia32_vfmaddsubph512_maskz(                               \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_fmsubadd_round_ph(A, B, C, R)                                   \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmsubadd_round_ph(A, U, B, C, R)                           \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmsubadd_round_ph(U, A, B, C, R)                          \
  ((__m512h)__builtin_ia32_vfmaddsubph512_maskz(                               \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))
/* Packed FP16 fused multiply with alternating add/subtract of __C across
   element positions, current rounding direction. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked fmaddsub: unselected elements are taken from __A. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmaddsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked fmaddsub (mask3 form): unselected elements come from __C. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_mask3(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked fmaddsub: unselected elements are zeroed. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmaddsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
/* Packed FP16 fused multiply with alternating subtract/add pattern; obtained
   from fmaddsub by negating __C. Current rounding direction. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
      (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked fmsubadd: unselected elements are taken from __A. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmsubadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
      (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked fmsubadd: unselected elements are zeroed. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmsubadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
      (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
/* mask3 fmsub: dedicated vfmsub builtin so unselected result elements merge
   from __C (the accumulator operand). R selects the rounding mode. */
#define _mm512_mask3_fmsub_round_ph(A, B, C, U, R)                             \
  ((__m512h)__builtin_ia32_vfmsubph512_mask3(                                  \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

/* mask3 fmsub with the current rounding direction. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
  return (__m512h)__builtin_ia32_vfmsubph512_mask3((__v32hf)__A, (__v32hf)__B,
                                                   (__v32hf)__C, (__mmask32)__U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

/* mask3 fmsubadd: dedicated vfmsubadd builtin, merging from __C. */
#define _mm512_mask3_fmsubadd_round_ph(A, B, C, U, R)                          \
  ((__m512h)__builtin_ia32_vfmsubaddph512_mask3(                               \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

/* mask3 fmsubadd with the current rounding direction. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
  return (__m512h)__builtin_ia32_vfmsubaddph512_mask3(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
/* Merge-masked fnmadd (-(A*B)+C) with explicit rounding; merges from A. */
#define _mm512_mask_fnmadd_round_ph(A, U, B, C, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

/* Merge-masked fnmadd with the current rounding direction. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fnmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
                                                  (__v32hf)__C, (__mmask32)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked fnmsub (-(A*B)-C) with explicit rounding; merges from A. */
#define _mm512_mask_fnmsub_round_ph(A, U, B, C, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),   \
      (__mmask32)(U), (int)(R)))

/* mask3 fnmsub: vfmsub builtin with negated A, so the merge source is C. */
#define _mm512_mask3_fnmsub_round_ph(A, B, C, U, R)                            \
  ((__m512h)__builtin_ia32_vfmsubph512_mask3(                                  \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))
/* Merge-masked fnmsub with the current rounding direction; unselected
   elements are taken from __A. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fnmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
                                                  -(__v32hf)__C, (__mmask32)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

/* mask3 fnmsub with the current rounding direction; unselected elements are
   taken from __C (hence the vfmsub mask3 builtin with __A negated). */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
  return (__m512h)__builtin_ia32_vfmsubph512_mask3(-(__v32hf)__A, (__v32hf)__B,
                                                   (__v32hf)__C, (__mmask32)__U,
                                                   _MM_FROUND_CUR_DIRECTION);
}
2736 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_fmadd_sh(__m128h __W
,
2739 return __builtin_ia32_vfmaddsh3_mask((__v8hf
)__W
, (__v8hf
)__A
, (__v8hf
)__B
,
2740 (__mmask8
)-1, _MM_FROUND_CUR_DIRECTION
);
2743 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmadd_sh(__m128h __W
,
2747 return __builtin_ia32_vfmaddsh3_mask((__v8hf
)__W
, (__v8hf
)__A
, (__v8hf
)__B
,
2748 (__mmask8
)__U
, _MM_FROUND_CUR_DIRECTION
);
2751 #define _mm_fmadd_round_sh(A, B, C, R) \
2752 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2753 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2754 (__mmask8)-1, (int)(R)))
2756 #define _mm_mask_fmadd_round_sh(W, U, A, B, R) \
2757 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2758 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \
2759 (__mmask8)(U), (int)(R)))
2761 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2762 _mm_maskz_fmadd_sh(__mmask8 __U
, __m128h __A
, __m128h __B
, __m128h __C
) {
2763 return __builtin_ia32_vfmaddsh3_maskz((__v8hf
)__A
, (__v8hf
)__B
, (__v8hf
)__C
,
2765 _MM_FROUND_CUR_DIRECTION
);
2768 #define _mm_maskz_fmadd_round_sh(U, A, B, C, R) \
2769 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
2770 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2771 (__mmask8)(U), (int)(R)))
2773 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2774 _mm_mask3_fmadd_sh(__m128h __W
, __m128h __X
, __m128h __Y
, __mmask8 __U
) {
2775 return __builtin_ia32_vfmaddsh3_mask3((__v8hf
)__W
, (__v8hf
)__X
, (__v8hf
)__Y
,
2777 _MM_FROUND_CUR_DIRECTION
);
2780 #define _mm_mask3_fmadd_round_sh(W, X, Y, U, R) \
2781 ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
2782 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
2783 (__mmask8)(U), (int)(R)))
2785 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_fmsub_sh(__m128h __W
,
2788 return (__m128h
)__builtin_ia32_vfmaddsh3_mask((__v8hf
)__W
, (__v8hf
)__A
,
2789 -(__v8hf
)__B
, (__mmask8
)-1,
2790 _MM_FROUND_CUR_DIRECTION
);
2793 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmsub_sh(__m128h __W
,
2797 return (__m128h
)__builtin_ia32_vfmaddsh3_mask((__v8hf
)__W
, (__v8hf
)__A
,
2798 -(__v8hf
)__B
, (__mmask8
)__U
,
2799 _MM_FROUND_CUR_DIRECTION
);
2802 #define _mm_fmsub_round_sh(A, B, C, R) \
2803 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2804 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
2805 (__mmask8)-1, (int)(R)))
2807 #define _mm_mask_fmsub_round_sh(W, U, A, B, R) \
2808 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2809 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \
2810 (__mmask8)(U), (int)(R)))
2812 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2813 _mm_maskz_fmsub_sh(__mmask8 __U
, __m128h __A
, __m128h __B
, __m128h __C
) {
2814 return (__m128h
)__builtin_ia32_vfmaddsh3_maskz((__v8hf
)__A
, (__v8hf
)__B
,
2815 -(__v8hf
)__C
, (__mmask8
)__U
,
2816 _MM_FROUND_CUR_DIRECTION
);
2819 #define _mm_maskz_fmsub_round_sh(U, A, B, C, R) \
2820 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
2821 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
2822 (__mmask8)(U), (int)R))
2824 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2825 _mm_mask3_fmsub_sh(__m128h __W
, __m128h __X
, __m128h __Y
, __mmask8 __U
) {
2826 return __builtin_ia32_vfmsubsh3_mask3((__v8hf
)__W
, (__v8hf
)__X
, (__v8hf
)__Y
,
2828 _MM_FROUND_CUR_DIRECTION
);
2831 #define _mm_mask3_fmsub_round_sh(W, X, Y, U, R) \
2832 ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \
2833 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
2834 (__mmask8)(U), (int)(R)))
2836 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_fnmadd_sh(__m128h __W
,
2839 return __builtin_ia32_vfmaddsh3_mask((__v8hf
)__W
, -(__v8hf
)__A
, (__v8hf
)__B
,
2840 (__mmask8
)-1, _MM_FROUND_CUR_DIRECTION
);
2843 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2844 _mm_mask_fnmadd_sh(__m128h __W
, __mmask8 __U
, __m128h __A
, __m128h __B
) {
2845 return __builtin_ia32_vfmaddsh3_mask((__v8hf
)__W
, -(__v8hf
)__A
, (__v8hf
)__B
,
2846 (__mmask8
)__U
, _MM_FROUND_CUR_DIRECTION
);
2849 #define _mm_fnmadd_round_sh(A, B, C, R) \
2850 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2851 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2852 (__mmask8)-1, (int)(R)))
2854 #define _mm_mask_fnmadd_round_sh(W, U, A, B, R) \
2855 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2856 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \
2857 (__mmask8)(U), (int)(R)))
2859 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2860 _mm_maskz_fnmadd_sh(__mmask8 __U
, __m128h __A
, __m128h __B
, __m128h __C
) {
2861 return __builtin_ia32_vfmaddsh3_maskz((__v8hf
)__A
, -(__v8hf
)__B
, (__v8hf
)__C
,
2863 _MM_FROUND_CUR_DIRECTION
);
2866 #define _mm_maskz_fnmadd_round_sh(U, A, B, C, R) \
2867 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
2868 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2869 (__mmask8)(U), (int)(R)))
2871 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2872 _mm_mask3_fnmadd_sh(__m128h __W
, __m128h __X
, __m128h __Y
, __mmask8 __U
) {
2873 return __builtin_ia32_vfmaddsh3_mask3((__v8hf
)__W
, -(__v8hf
)__X
, (__v8hf
)__Y
,
2875 _MM_FROUND_CUR_DIRECTION
);
2878 #define _mm_mask3_fnmadd_round_sh(W, X, Y, U, R) \
2879 ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
2880 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
2881 (__mmask8)(U), (int)(R)))
2883 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_fnmsub_sh(__m128h __W
,
2886 return __builtin_ia32_vfmaddsh3_mask((__v8hf
)__W
, -(__v8hf
)__A
, -(__v8hf
)__B
,
2887 (__mmask8
)-1, _MM_FROUND_CUR_DIRECTION
);
2890 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2891 _mm_mask_fnmsub_sh(__m128h __W
, __mmask8 __U
, __m128h __A
, __m128h __B
) {
2892 return __builtin_ia32_vfmaddsh3_mask((__v8hf
)__W
, -(__v8hf
)__A
, -(__v8hf
)__B
,
2893 (__mmask8
)__U
, _MM_FROUND_CUR_DIRECTION
);
2896 #define _mm_fnmsub_round_sh(A, B, C, R) \
2897 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2898 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
2899 (__mmask8)-1, (int)(R)))
2901 #define _mm_mask_fnmsub_round_sh(W, U, A, B, R) \
2902 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2903 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \
2904 (__mmask8)(U), (int)(R)))
2906 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2907 _mm_maskz_fnmsub_sh(__mmask8 __U
, __m128h __A
, __m128h __B
, __m128h __C
) {
2908 return __builtin_ia32_vfmaddsh3_maskz((__v8hf
)__A
, -(__v8hf
)__B
, -(__v8hf
)__C
,
2910 _MM_FROUND_CUR_DIRECTION
);
2913 #define _mm_maskz_fnmsub_round_sh(U, A, B, C, R) \
2914 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
2915 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
2916 (__mmask8)(U), (int)(R)))
2918 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2919 _mm_mask3_fnmsub_sh(__m128h __W
, __m128h __X
, __m128h __Y
, __mmask8 __U
) {
2920 return __builtin_ia32_vfmsubsh3_mask3((__v8hf
)__W
, -(__v8hf
)__X
, (__v8hf
)__Y
,
2922 _MM_FROUND_CUR_DIRECTION
);
2925 #define _mm_mask3_fnmsub_round_sh(W, X, Y, U, R) \
2926 ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \
2927 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
2928 (__mmask8)(U), (int)(R)))
2930 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_fcmadd_sch(__m128h __A
,
2933 return (__m128h
)__builtin_ia32_vfcmaddcsh_mask((__v4sf
)__A
, (__v4sf
)__B
,
2934 (__v4sf
)__C
, (__mmask8
)-1,
2935 _MM_FROUND_CUR_DIRECTION
);
2938 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2939 _mm_mask_fcmadd_sch(__m128h __A
, __mmask8 __U
, __m128h __B
, __m128h __C
) {
2940 return (__m128h
)__builtin_ia32_vfcmaddcsh_round_mask(
2941 (__v4sf
)__A
, (__v4sf
)(__B
), (__v4sf
)(__C
), __U
, _MM_FROUND_CUR_DIRECTION
);
2944 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2945 _mm_maskz_fcmadd_sch(__mmask8 __U
, __m128h __A
, __m128h __B
, __m128h __C
) {
2946 return (__m128h
)__builtin_ia32_vfcmaddcsh_maskz((__v4sf
)__A
, (__v4sf
)__B
,
2947 (__v4sf
)__C
, (__mmask8
)__U
,
2948 _MM_FROUND_CUR_DIRECTION
);
2951 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2952 _mm_mask3_fcmadd_sch(__m128h __A
, __m128h __B
, __m128h __C
, __mmask8 __U
) {
2953 return (__m128h
)__builtin_ia32_vfcmaddcsh_round_mask3(
2954 (__v4sf
)__A
, (__v4sf
)__B
, (__v4sf
)__C
, __U
, _MM_FROUND_CUR_DIRECTION
);
2957 #define _mm_fcmadd_round_sch(A, B, C, R) \
2958 ((__m128h)__builtin_ia32_vfcmaddcsh_mask( \
2959 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
2960 (__mmask8)-1, (int)(R)))
2962 #define _mm_mask_fcmadd_round_sch(A, U, B, C, R) \
2963 ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask( \
2964 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
2965 (__mmask8)(U), (int)(R)))
2967 #define _mm_maskz_fcmadd_round_sch(U, A, B, C, R) \
2968 ((__m128h)__builtin_ia32_vfcmaddcsh_maskz( \
2969 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
2970 (__mmask8)(U), (int)(R)))
2972 #define _mm_mask3_fcmadd_round_sch(A, B, C, U, R) \
2973 ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask3( \
2974 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
2975 (__mmask8)(U), (int)(R)))
2977 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_fmadd_sch(__m128h __A
,
2980 return (__m128h
)__builtin_ia32_vfmaddcsh_mask((__v4sf
)__A
, (__v4sf
)__B
,
2981 (__v4sf
)__C
, (__mmask8
)-1,
2982 _MM_FROUND_CUR_DIRECTION
);
2985 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2986 _mm_mask_fmadd_sch(__m128h __A
, __mmask8 __U
, __m128h __B
, __m128h __C
) {
2987 return (__m128h
)__builtin_ia32_vfmaddcsh_round_mask(
2988 (__v4sf
)__A
, (__v4sf
)(__B
), (__v4sf
)(__C
), __U
, _MM_FROUND_CUR_DIRECTION
);
2991 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2992 _mm_maskz_fmadd_sch(__mmask8 __U
, __m128h __A
, __m128h __B
, __m128h __C
) {
2993 return (__m128h
)__builtin_ia32_vfmaddcsh_maskz((__v4sf
)__A
, (__v4sf
)__B
,
2994 (__v4sf
)__C
, (__mmask8
)__U
,
2995 _MM_FROUND_CUR_DIRECTION
);
2998 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2999 _mm_mask3_fmadd_sch(__m128h __A
, __m128h __B
, __m128h __C
, __mmask8 __U
) {
3000 return (__m128h
)__builtin_ia32_vfmaddcsh_round_mask3(
3001 (__v4sf
)__A
, (__v4sf
)__B
, (__v4sf
)__C
, __U
, _MM_FROUND_CUR_DIRECTION
);
3004 #define _mm_fmadd_round_sch(A, B, C, R) \
3005 ((__m128h)__builtin_ia32_vfmaddcsh_mask( \
3006 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
3007 (__mmask8)-1, (int)(R)))
3009 #define _mm_mask_fmadd_round_sch(A, U, B, C, R) \
3010 ((__m128h)__builtin_ia32_vfmaddcsh_round_mask( \
3011 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
3012 (__mmask8)(U), (int)(R)))
3014 #define _mm_maskz_fmadd_round_sch(U, A, B, C, R) \
3015 ((__m128h)__builtin_ia32_vfmaddcsh_maskz( \
3016 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
3017 (__mmask8)(U), (int)(R)))
3019 #define _mm_mask3_fmadd_round_sch(A, B, C, U, R) \
3020 ((__m128h)__builtin_ia32_vfmaddcsh_round_mask3( \
3021 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
3022 (__mmask8)(U), (int)(R)))
3024 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_fcmul_sch(__m128h __A
,
3026 return (__m128h
)__builtin_ia32_vfcmulcsh_mask(
3027 (__v4sf
)__A
, (__v4sf
)__B
, (__v4sf
)_mm_undefined_ph(), (__mmask8
)-1,
3028 _MM_FROUND_CUR_DIRECTION
);
3031 static __inline__ __m128h __DEFAULT_FN_ATTRS128
3032 _mm_mask_fcmul_sch(__m128h __W
, __mmask8 __U
, __m128h __A
, __m128h __B
) {
3033 return (__m128h
)__builtin_ia32_vfcmulcsh_mask((__v4sf
)__A
, (__v4sf
)__B
,
3034 (__v4sf
)__W
, (__mmask8
)__U
,
3035 _MM_FROUND_CUR_DIRECTION
);
3038 static __inline__ __m128h __DEFAULT_FN_ATTRS128
3039 _mm_maskz_fcmul_sch(__mmask8 __U
, __m128h __A
, __m128h __B
) {
3040 return (__m128h
)__builtin_ia32_vfcmulcsh_mask(
3041 (__v4sf
)__A
, (__v4sf
)__B
, (__v4sf
)_mm_setzero_ph(), (__mmask8
)__U
,
3042 _MM_FROUND_CUR_DIRECTION
);
3045 #define _mm_fcmul_round_sch(A, B, R) \
3046 ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
3047 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
3048 (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))
3050 #define _mm_mask_fcmul_round_sch(W, U, A, B, R) \
3051 ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
3052 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W), \
3053 (__mmask8)(U), (int)(R)))
3055 #define _mm_maskz_fcmul_round_sch(U, A, B, R) \
3056 ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
3057 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
3058 (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
3060 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_fmul_sch(__m128h __A
,
3062 return (__m128h
)__builtin_ia32_vfmulcsh_mask(
3063 (__v4sf
)__A
, (__v4sf
)__B
, (__v4sf
)_mm_undefined_ph(), (__mmask8
)-1,
3064 _MM_FROUND_CUR_DIRECTION
);
3067 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmul_sch(__m128h __W
,
3071 return (__m128h
)__builtin_ia32_vfmulcsh_mask((__v4sf
)__A
, (__v4sf
)__B
,
3072 (__v4sf
)__W
, (__mmask8
)__U
,
3073 _MM_FROUND_CUR_DIRECTION
);
3076 static __inline__ __m128h __DEFAULT_FN_ATTRS128
3077 _mm_maskz_fmul_sch(__mmask8 __U
, __m128h __A
, __m128h __B
) {
3078 return (__m128h
)__builtin_ia32_vfmulcsh_mask(
3079 (__v4sf
)__A
, (__v4sf
)__B
, (__v4sf
)_mm_setzero_ph(), (__mmask8
)__U
,
3080 _MM_FROUND_CUR_DIRECTION
);
3083 #define _mm_fmul_round_sch(A, B, R) \
3084 ((__m128h)__builtin_ia32_vfmulcsh_mask( \
3085 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
3086 (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))
3088 #define _mm_mask_fmul_round_sch(W, U, A, B, R) \
3089 ((__m128h)__builtin_ia32_vfmulcsh_mask( \
3090 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W), \
3091 (__mmask8)(U), (int)(R)))
3093 #define _mm_maskz_fmul_round_sch(U, A, B, R) \
3094 ((__m128h)__builtin_ia32_vfmulcsh_mask( \
3095 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
3096 (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
3098 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fcmul_pch(__m512h __A
,
3100 return (__m512h
)__builtin_ia32_vfcmulcph512_mask(
3101 (__v16sf
)__A
, (__v16sf
)__B
, (__v16sf
)_mm512_undefined_ph(), (__mmask16
)-1,
3102 _MM_FROUND_CUR_DIRECTION
);
3105 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3106 _mm512_mask_fcmul_pch(__m512h __W
, __mmask16 __U
, __m512h __A
, __m512h __B
) {
3107 return (__m512h
)__builtin_ia32_vfcmulcph512_mask((__v16sf
)__A
, (__v16sf
)__B
,
3108 (__v16sf
)__W
, (__mmask16
)__U
,
3109 _MM_FROUND_CUR_DIRECTION
);
3112 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3113 _mm512_maskz_fcmul_pch(__mmask16 __U
, __m512h __A
, __m512h __B
) {
3114 return (__m512h
)__builtin_ia32_vfcmulcph512_mask(
3115 (__v16sf
)__A
, (__v16sf
)__B
, (__v16sf
)_mm512_setzero_ph(), (__mmask16
)__U
,
3116 _MM_FROUND_CUR_DIRECTION
);
3119 #define _mm512_fcmul_round_pch(A, B, R) \
3120 ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
3121 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
3122 (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))
3124 #define _mm512_mask_fcmul_round_pch(W, U, A, B, R) \
3125 ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
3126 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W), \
3127 (__mmask16)(U), (int)(R)))
3129 #define _mm512_maskz_fcmul_round_pch(U, A, B, R) \
3130 ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
3131 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
3132 (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
3134 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fmul_pch(__m512h __A
,
3136 return (__m512h
)__builtin_ia32_vfmulcph512_mask(
3137 (__v16sf
)__A
, (__v16sf
)__B
, (__v16sf
)_mm512_undefined_ph(), (__mmask16
)-1,
3138 _MM_FROUND_CUR_DIRECTION
);
3141 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3142 _mm512_mask_fmul_pch(__m512h __W
, __mmask16 __U
, __m512h __A
, __m512h __B
) {
3143 return (__m512h
)__builtin_ia32_vfmulcph512_mask((__v16sf
)__A
, (__v16sf
)__B
,
3144 (__v16sf
)__W
, (__mmask16
)__U
,
3145 _MM_FROUND_CUR_DIRECTION
);
3148 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3149 _mm512_maskz_fmul_pch(__mmask16 __U
, __m512h __A
, __m512h __B
) {
3150 return (__m512h
)__builtin_ia32_vfmulcph512_mask(
3151 (__v16sf
)__A
, (__v16sf
)__B
, (__v16sf
)_mm512_setzero_ph(), (__mmask16
)__U
,
3152 _MM_FROUND_CUR_DIRECTION
);
3155 #define _mm512_fmul_round_pch(A, B, R) \
3156 ((__m512h)__builtin_ia32_vfmulcph512_mask( \
3157 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
3158 (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))
3160 #define _mm512_mask_fmul_round_pch(W, U, A, B, R) \
3161 ((__m512h)__builtin_ia32_vfmulcph512_mask( \
3162 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W), \
3163 (__mmask16)(U), (int)(R)))
3165 #define _mm512_maskz_fmul_round_pch(U, A, B, R) \
3166 ((__m512h)__builtin_ia32_vfmulcph512_mask( \
3167 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
3168 (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
3170 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fcmadd_pch(__m512h __A
,
3173 return (__m512h
)__builtin_ia32_vfcmaddcph512_mask3(
3174 (__v16sf
)__A
, (__v16sf
)__B
, (__v16sf
)__C
, (__mmask16
)-1,
3175 _MM_FROUND_CUR_DIRECTION
);
3178 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3179 _mm512_mask_fcmadd_pch(__m512h __A
, __mmask16 __U
, __m512h __B
, __m512h __C
) {
3180 return (__m512h
)__builtin_ia32_vfcmaddcph512_mask(
3181 (__v16sf
)__A
, (__v16sf
)__B
, (__v16sf
)__C
, (__mmask16
)__U
,
3182 _MM_FROUND_CUR_DIRECTION
);
3185 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3186 _mm512_mask3_fcmadd_pch(__m512h __A
, __m512h __B
, __m512h __C
, __mmask16 __U
) {
3187 return (__m512h
)__builtin_ia32_vfcmaddcph512_mask3(
3188 (__v16sf
)__A
, (__v16sf
)__B
, (__v16sf
)__C
, (__mmask16
)__U
,
3189 _MM_FROUND_CUR_DIRECTION
);
3192 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3193 _mm512_maskz_fcmadd_pch(__mmask16 __U
, __m512h __A
, __m512h __B
, __m512h __C
) {
3194 return (__m512h
)__builtin_ia32_vfcmaddcph512_maskz(
3195 (__v16sf
)__A
, (__v16sf
)__B
, (__v16sf
)__C
, (__mmask16
)__U
,
3196 _MM_FROUND_CUR_DIRECTION
);
3199 #define _mm512_fcmadd_round_pch(A, B, C, R) \
3200 ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \
3201 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3202 (__mmask16)-1, (int)(R)))
3204 #define _mm512_mask_fcmadd_round_pch(A, U, B, C, R) \
3205 ((__m512h)__builtin_ia32_vfcmaddcph512_mask( \
3206 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3207 (__mmask16)(U), (int)(R)))
3209 #define _mm512_mask3_fcmadd_round_pch(A, B, C, U, R) \
3210 ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \
3211 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3212 (__mmask16)(U), (int)(R)))
3214 #define _mm512_maskz_fcmadd_round_pch(U, A, B, C, R) \
3215 ((__m512h)__builtin_ia32_vfcmaddcph512_maskz( \
3216 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3217 (__mmask16)(U), (int)(R)))
3219 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fmadd_pch(__m512h __A
,
3222 return (__m512h
)__builtin_ia32_vfmaddcph512_mask3((__v16sf
)__A
, (__v16sf
)__B
,
3223 (__v16sf
)__C
, (__mmask16
)-1,
3224 _MM_FROUND_CUR_DIRECTION
);
3227 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3228 _mm512_mask_fmadd_pch(__m512h __A
, __mmask16 __U
, __m512h __B
, __m512h __C
) {
3229 return (__m512h
)__builtin_ia32_vfmaddcph512_mask((__v16sf
)__A
, (__v16sf
)__B
,
3230 (__v16sf
)__C
, (__mmask16
)__U
,
3231 _MM_FROUND_CUR_DIRECTION
);
3234 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3235 _mm512_mask3_fmadd_pch(__m512h __A
, __m512h __B
, __m512h __C
, __mmask16 __U
) {
3236 return (__m512h
)__builtin_ia32_vfmaddcph512_mask3(
3237 (__v16sf
)__A
, (__v16sf
)__B
, (__v16sf
)__C
, (__mmask16
)__U
,
3238 _MM_FROUND_CUR_DIRECTION
);
3241 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3242 _mm512_maskz_fmadd_pch(__mmask16 __U
, __m512h __A
, __m512h __B
, __m512h __C
) {
3243 return (__m512h
)__builtin_ia32_vfmaddcph512_maskz(
3244 (__v16sf
)__A
, (__v16sf
)__B
, (__v16sf
)__C
, (__mmask16
)__U
,
3245 _MM_FROUND_CUR_DIRECTION
);
3248 #define _mm512_fmadd_round_pch(A, B, C, R) \
3249 ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \
3250 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3251 (__mmask16)-1, (int)(R)))
3253 #define _mm512_mask_fmadd_round_pch(A, U, B, C, R) \
3254 ((__m512h)__builtin_ia32_vfmaddcph512_mask( \
3255 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3256 (__mmask16)(U), (int)(R)))
3258 #define _mm512_mask3_fmadd_round_pch(A, B, C, U, R) \
3259 ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \
3260 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3261 (__mmask16)(U), (int)(R)))
3263 #define _mm512_maskz_fmadd_round_pch(U, A, B, C, R) \
3264 ((__m512h)__builtin_ia32_vfmaddcph512_maskz( \
3265 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3266 (__mmask16)(U), (int)(R)))
/* Horizontal sum of all 32 FP16 elements. -0.0f16 is the identity start
   value for floating-point addition (preserves the sign of a -0.0 sum). */
static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_add_ph(__m512h __W) {
  return __builtin_ia32_reduce_fadd_ph512(-0.0f16, __W);
}

/* Horizontal product of all 32 FP16 elements; 1.0f16 is the multiplicative
   identity start value. */
static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_mul_ph(__m512h __W) {
  return __builtin_ia32_reduce_fmul_ph512(1.0f16, __W);
}

/* Maximum of all 32 FP16 elements. */
static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_max_ph(__m512h __V) {
  return __builtin_ia32_reduce_fmax_ph512(__V);
}

/* Minimum of all 32 FP16 elements. */
static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_min_ph(__m512h __V) {
  return __builtin_ia32_reduce_fmin_ph512(__V);
}
3288 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3289 _mm512_mask_blend_ph(__mmask32 __U
, __m512h __A
, __m512h __W
) {
3290 return (__m512h
)__builtin_ia32_selectph_512((__mmask32
)__U
, (__v32hf
)__W
,
3294 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3295 _mm512_permutex2var_ph(__m512h __A
, __m512i __I
, __m512h __B
) {
3296 return (__m512h
)__builtin_ia32_vpermi2varhi512((__v32hi
)__A
, (__v32hi
)__I
,
3300 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3301 _mm512_permutexvar_ph(__m512i __A
, __m512h __B
) {
3302 return (__m512h
)__builtin_ia32_permvarhi512((__v32hi
)__B
, (__v32hi
)__A
);
// intrinsics below are alias for f*mul_*ch
// (mul == fmul, cmul == fcmul; provided for naming symmetry with the
// Intel Intrinsics Guide)
#define _mm512_mul_pch(A, B) _mm512_fmul_pch(A, B)
#define _mm512_mask_mul_pch(W, U, A, B) _mm512_mask_fmul_pch(W, U, A, B)
#define _mm512_maskz_mul_pch(U, A, B) _mm512_maskz_fmul_pch(U, A, B)
#define _mm512_mul_round_pch(A, B, R) _mm512_fmul_round_pch(A, B, R)
#define _mm512_mask_mul_round_pch(W, U, A, B, R)                               \
  _mm512_mask_fmul_round_pch(W, U, A, B, R)
#define _mm512_maskz_mul_round_pch(U, A, B, R)                                 \
  _mm512_maskz_fmul_round_pch(U, A, B, R)

#define _mm512_cmul_pch(A, B) _mm512_fcmul_pch(A, B)
#define _mm512_mask_cmul_pch(W, U, A, B) _mm512_mask_fcmul_pch(W, U, A, B)
#define _mm512_maskz_cmul_pch(U, A, B) _mm512_maskz_fcmul_pch(U, A, B)
#define _mm512_cmul_round_pch(A, B, R) _mm512_fcmul_round_pch(A, B, R)
#define _mm512_mask_cmul_round_pch(W, U, A, B, R)                              \
  _mm512_mask_fcmul_round_pch(W, U, A, B, R)
#define _mm512_maskz_cmul_round_pch(U, A, B, R)                                \
  _mm512_maskz_fcmul_round_pch(U, A, B, R)

#define _mm_mul_sch(A, B) _mm_fmul_sch(A, B)
#define _mm_mask_mul_sch(W, U, A, B) _mm_mask_fmul_sch(W, U, A, B)
#define _mm_maskz_mul_sch(U, A, B) _mm_maskz_fmul_sch(U, A, B)
#define _mm_mul_round_sch(A, B, R) _mm_fmul_round_sch(A, B, R)
#define _mm_mask_mul_round_sch(W, U, A, B, R)                                  \
  _mm_mask_fmul_round_sch(W, U, A, B, R)
#define _mm_maskz_mul_round_sch(U, A, B, R) _mm_maskz_fmul_round_sch(U, A, B, R)

#define _mm_cmul_sch(A, B) _mm_fcmul_sch(A, B)
#define _mm_mask_cmul_sch(W, U, A, B) _mm_mask_fcmul_sch(W, U, A, B)
#define _mm_maskz_cmul_sch(U, A, B) _mm_maskz_fcmul_sch(U, A, B)
#define _mm_cmul_round_sch(A, B, R) _mm_fcmul_round_sch(A, B, R)
#define _mm_mask_cmul_round_sch(W, U, A, B, R)                                 \
  _mm_mask_fcmul_round_sch(W, U, A, B, R)
#define _mm_maskz_cmul_round_sch(U, A, B, R)                                   \
  _mm_maskz_fcmul_round_sch(U, A, B, R)
/* Remove the attribute helper macros so they do not leak to other headers. */
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#undef __DEFAULT_FN_ATTRS512