/*===---------- avx512vlfp16intrin.h - AVX512-FP16 intrinsics --------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <avx512vlfp16intrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVX512VLFP16INTRIN_H
#define __AVX512VLFP16INTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__,                              \
                 __target__("avx512fp16,avx512vl"),                           \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__,                              \
                 __target__("avx512fp16,avx512vl"),                           \
                 __min_vector_width__(128)))
static __inline__ _Float16 __DEFAULT_FN_ATTRS128 _mm_cvtsh_h(__m128h __a) {
  return __a[0];
}

static __inline__ _Float16 __DEFAULT_FN_ATTRS256 _mm256_cvtsh_h(__m256h __a) {
  return __a[0];
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_set_sh(_Float16 __h) {
  return __extension__(__m128h){__h, 0, 0, 0, 0, 0, 0, 0};
}
static __inline __m128h __DEFAULT_FN_ATTRS128 _mm_set1_ph(_Float16 __h) {
  return (__m128h)(__v8hf){__h, __h, __h, __h, __h, __h, __h, __h};
}

static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_set1_ph(_Float16 __h) {
  return (__m256h)(__v16hf){__h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h};
}

static __inline __m128h __DEFAULT_FN_ATTRS128
_mm_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
           _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8) {
  return (__m128h)(__v8hf){__h8, __h7, __h6, __h5, __h4, __h3, __h2, __h1};
}

static __inline __m256h __DEFAULT_FN_ATTRS256
_mm256_set1_pch(_Float16 _Complex h) {
  return (__m256h)_mm256_set1_ps(__builtin_bit_cast(float, h));
}

static __inline __m128h __DEFAULT_FN_ATTRS128
_mm_set1_pch(_Float16 _Complex h) {
  return (__m128h)_mm_set1_ps(__builtin_bit_cast(float, h));
}
static __inline __m256h __DEFAULT_FN_ATTRS256
_mm256_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
              _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8,
              _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12,
              _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16) {
  return (__m256h)(__v16hf){__h16, __h15, __h14, __h13, __h12, __h11,
                            __h10, __h9,  __h8,  __h7,  __h6,  __h5,
                            __h4,  __h3,  __h2,  __h1};
}
#define _mm_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8)                            \
  _mm_set_ph((h8), (h7), (h6), (h5), (h4), (h3), (h2), (h1))

#define _mm256_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, h13, \
                       h14, h15, h16)                                          \
  _mm256_set_ph((h16), (h15), (h14), (h13), (h12), (h11), (h10), (h9), (h8),   \
                (h7), (h6), (h5), (h4), (h3), (h2), (h1))
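/* Illustrative usage sketch: _mm_set_ph() takes its arguments from the
 * most-significant element down (as the other _mm_set_* intrinsics do),
 * while _mm_setr_ph() takes them in memory order:
 *
 *   __m128h v = _mm_setr_ph(1, 2, 3, 4, 5, 6, 7, 8);
 *   _Float16 lo = _mm_cvtsh_h(v);   // element 0, i.e. 1.0
 */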
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_add_ph(__m256h __A, __m256h __B) {
  return (__m256h)((__v16hf)__A + (__v16hf)__B);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_add_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      __U, (__v16hf)_mm256_add_ph(__A, __B), (__v16hf)__W);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_add_ph(__mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      __U, (__v16hf)_mm256_add_ph(__A, __B), (__v16hf)_mm256_setzero_ph());
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_add_ph(__m128h __A, __m128h __B) {
  return (__m128h)((__v8hf)__A + (__v8hf)__B);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_add_ph(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_add_ph(__A, __B),
                                              (__v8hf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_add_ph(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_add_ph(__A, __B),
                                              (__v8hf)_mm_setzero_ph());
}
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_sub_ph(__m256h __A, __m256h __B) {
  return (__m256h)((__v16hf)__A - (__v16hf)__B);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_sub_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      __U, (__v16hf)_mm256_sub_ph(__A, __B), (__v16hf)__W);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_sub_ph(__mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      __U, (__v16hf)_mm256_sub_ph(__A, __B), (__v16hf)_mm256_setzero_ph());
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_sub_ph(__m128h __A, __m128h __B) {
  return (__m128h)((__v8hf)__A - (__v8hf)__B);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_sub_ph(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_sub_ph(__A, __B),
                                              (__v8hf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_sub_ph(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_sub_ph(__A, __B),
                                              (__v8hf)_mm_setzero_ph());
}
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mul_ph(__m256h __A, __m256h __B) {
  return (__m256h)((__v16hf)__A * (__v16hf)__B);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_mul_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      __U, (__v16hf)_mm256_mul_ph(__A, __B), (__v16hf)__W);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_mul_ph(__mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      __U, (__v16hf)_mm256_mul_ph(__A, __B), (__v16hf)_mm256_setzero_ph());
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mul_ph(__m128h __A, __m128h __B) {
  return (__m128h)((__v8hf)__A * (__v8hf)__B);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_mul_ph(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_mul_ph(__A, __B),
                                              (__v8hf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_mul_ph(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_mul_ph(__A, __B),
                                              (__v8hf)_mm_setzero_ph());
}
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_div_ph(__m256h __A, __m256h __B) {
  return (__m256h)((__v16hf)__A / (__v16hf)__B);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_div_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      __U, (__v16hf)_mm256_div_ph(__A, __B), (__v16hf)__W);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_div_ph(__mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      __U, (__v16hf)_mm256_div_ph(__A, __B), (__v16hf)_mm256_setzero_ph());
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_div_ph(__m128h __A, __m128h __B) {
  return (__m128h)((__v8hf)__A / (__v8hf)__B);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_div_ph(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_div_ph(__A, __B),
                                              (__v8hf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_div_ph(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_div_ph(__A, __B),
                                              (__v8hf)_mm_setzero_ph());
}
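/* Illustrative usage sketch of the masked arithmetic forms above: the _mask_
 * variants take the unmasked result where the mask bit is set and fall back to
 * the pass-through operand __W elsewhere, while the _maskz_ variants zero the
 * unselected elements:
 *
 *   __m256h a = _mm256_set1_ph(2), b = _mm256_set1_ph(3);
 *   __m256h w = _mm256_setzero_ph();
 *   __m256h r = _mm256_mask_add_ph(w, 0x00FF, a, b); // low 8 elements are 5
 */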
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_min_ph(__m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_min_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      (__v16hf)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B),
      (__v16hf)__W);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_min_ph(__mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      (__v16hf)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B),
      (__v16hf)_mm256_setzero_ph());
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_min_ph(__m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_min_ph(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B),
      (__v8hf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_min_ph(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B),
      (__v8hf)_mm_setzero_ph());
}
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_max_ph(__m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_max_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      (__v16hf)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B),
      (__v16hf)__W);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_max_ph(__mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      (__v16hf)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B),
      (__v16hf)_mm256_setzero_ph());
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_max_ph(__m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_max_ph(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B),
      (__v8hf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_max_ph(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B),
      (__v8hf)_mm_setzero_ph());
}
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_abs_ph(__m256h __A) {
  return (__m256h)_mm256_and_epi32(_mm256_set1_epi32(0x7FFF7FFF), (__m256i)__A);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_abs_ph(__m128h __A) {
  return (__m128h)_mm_and_epi32(_mm_set1_epi32(0x7FFF7FFF), (__m128i)__A);
}
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_conj_pch(__m256h __A) {
  return (__m256h)_mm256_xor_ps((__m256)__A, _mm256_set1_ps(-0.0f));
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_conj_pch(__m256h __W, __mmask8 __U, __m256h __A) {
  return (__m256h)__builtin_ia32_selectps_256(
      (__mmask8)__U, (__v8sf)_mm256_conj_pch(__A), (__v8sf)__W);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_conj_pch(__mmask8 __U, __m256h __A) {
  return (__m256h)__builtin_ia32_selectps_256(
      (__mmask8)__U, (__v8sf)_mm256_conj_pch(__A), (__v8sf)_mm256_setzero_ps());
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_conj_pch(__m128h __A) {
  return (__m128h)_mm_xor_ps((__m128)__A, _mm_set1_ps(-0.0f));
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_conj_pch(__m128h __W, __mmask8 __U, __m128h __A) {
  return (__m128h)__builtin_ia32_selectps_128(
      (__mmask8)__U, (__v4sf)_mm_conj_pch(__A), (__v4sf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_conj_pch(__mmask8 __U, __m128h __A) {
  return (__m128h)__builtin_ia32_selectps_128(
      (__mmask8)__U, (__v4sf)_mm_conj_pch(__A), (__v4sf)_mm_setzero_ps());
}
#define _mm256_cmp_ph_mask(a, b, p)                                            \
  ((__mmask16)__builtin_ia32_cmpph256_mask(                                    \
      (__v16hf)(__m256h)(a), (__v16hf)(__m256h)(b), (int)(p), (__mmask16)-1))

#define _mm256_mask_cmp_ph_mask(m, a, b, p)                                    \
  ((__mmask16)__builtin_ia32_cmpph256_mask(                                    \
      (__v16hf)(__m256h)(a), (__v16hf)(__m256h)(b), (int)(p), (__mmask16)(m)))

#define _mm_cmp_ph_mask(a, b, p)                                               \
  ((__mmask8)__builtin_ia32_cmpph128_mask(                                     \
      (__v8hf)(__m128h)(a), (__v8hf)(__m128h)(b), (int)(p), (__mmask8)-1))

#define _mm_mask_cmp_ph_mask(m, a, b, p)                                       \
  ((__mmask8)__builtin_ia32_cmpph128_mask(                                     \
      (__v8hf)(__m128h)(a), (__v8hf)(__m128h)(b), (int)(p), (__mmask8)(m)))
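/* Illustrative usage sketch: the comparison predicate is one of the _CMP_*
 * constants from <immintrin.h>, and the resulting mask can feed any masked
 * intrinsic in this file. Given __m128h a, b, keep a + b only where a < b:
 *
 *   __mmask8 m = _mm_cmp_ph_mask(a, b, _CMP_LT_OS);
 *   __m128h r = _mm_maskz_add_ph(m, a, b);
 */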
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_rcp_ph(__m256h __A) {
  return (__m256h)__builtin_ia32_rcpph256_mask(
      (__v16hf)__A, (__v16hf)_mm256_undefined_ph(), (__mmask16)-1);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_rcp_ph(__m256h __W, __mmask16 __U, __m256h __A) {
  return (__m256h)__builtin_ia32_rcpph256_mask((__v16hf)__A, (__v16hf)__W,
                                               (__mmask16)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_rcp_ph(__mmask16 __U, __m256h __A) {
  return (__m256h)__builtin_ia32_rcpph256_mask(
      (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_ph(__m128h __A) {
  return (__m128h)__builtin_ia32_rcpph128_mask(
      (__v8hf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_rcp_ph(__m128h __W, __mmask8 __U, __m128h __A) {
  return (__m128h)__builtin_ia32_rcpph128_mask((__v8hf)__A, (__v8hf)__W,
                                               (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_rcp_ph(__mmask8 __U, __m128h __A) {
  return (__m128h)__builtin_ia32_rcpph128_mask(
      (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_rsqrt_ph(__m256h __A) {
  return (__m256h)__builtin_ia32_rsqrtph256_mask(
      (__v16hf)__A, (__v16hf)_mm256_undefined_ph(), (__mmask16)-1);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_rsqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) {
  return (__m256h)__builtin_ia32_rsqrtph256_mask((__v16hf)__A, (__v16hf)__W,
                                                 (__mmask16)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_rsqrt_ph(__mmask16 __U, __m256h __A) {
  return (__m256h)__builtin_ia32_rsqrtph256_mask(
      (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_ph(__m128h __A) {
  return (__m128h)__builtin_ia32_rsqrtph128_mask(
      (__v8hf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_rsqrt_ph(__m128h __W, __mmask8 __U, __m128h __A) {
  return (__m128h)__builtin_ia32_rsqrtph128_mask((__v8hf)__A, (__v8hf)__W,
                                                 (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_rsqrt_ph(__mmask8 __U, __m128h __A) {
  return (__m128h)__builtin_ia32_rsqrtph128_mask(
      (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_ph(__m128h __A) {
  return (__m128h)__builtin_ia32_getexpph128_mask(
      (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_getexp_ph(__m128h __W, __mmask8 __U, __m128h __A) {
  return (__m128h)__builtin_ia32_getexpph128_mask((__v8hf)__A, (__v8hf)__W,
                                                  (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_getexp_ph(__mmask8 __U, __m128h __A) {
  return (__m128h)__builtin_ia32_getexpph128_mask(
      (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_getexp_ph(__m256h __A) {
  return (__m256h)__builtin_ia32_getexpph256_mask(
      (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_getexp_ph(__m256h __W, __mmask16 __U, __m256h __A) {
  return (__m256h)__builtin_ia32_getexpph256_mask((__v16hf)__A, (__v16hf)__W,
                                                  (__mmask16)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_getexp_ph(__mmask16 __U, __m256h __A) {
  return (__m256h)__builtin_ia32_getexpph256_mask(
      (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
}
#define _mm_getmant_ph(A, B, C)                                                \
  ((__m128h)__builtin_ia32_getmantph128_mask(                                  \
      (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1))

#define _mm_mask_getmant_ph(W, U, A, B, C)                                     \
  ((__m128h)__builtin_ia32_getmantph128_mask(                                  \
      (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)(__m128h)(W),     \
      (__mmask8)(U)))

#define _mm_maskz_getmant_ph(U, A, B, C)                                       \
  ((__m128h)__builtin_ia32_getmantph128_mask(                                  \
      (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U)))

#define _mm256_getmant_ph(A, B, C)                                             \
  ((__m256h)__builtin_ia32_getmantph256_mask(                                  \
      (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)),                          \
      (__v16hf)_mm256_setzero_ph(), (__mmask16)-1))

#define _mm256_mask_getmant_ph(W, U, A, B, C)                                  \
  ((__m256h)__builtin_ia32_getmantph256_mask(                                  \
      (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), (__v16hf)(__m256h)(W),   \
      (__mmask16)(U)))

#define _mm256_maskz_getmant_ph(U, A, B, C)                                    \
  ((__m256h)__builtin_ia32_getmantph256_mask(                                  \
      (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)),                          \
      (__v16hf)_mm256_setzero_ph(), (__mmask16)(U)))
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_scalef_ph(__m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_scalefph128_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_scalef_ph(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_scalefph128_mask((__v8hf)__A, (__v8hf)__B,
                                                  (__v8hf)__W, (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_scalef_ph(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_scalefph128_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_scalef_ph(__m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_scalefph256_mask(
      (__v16hf)__A, (__v16hf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_scalef_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_scalefph256_mask(
      (__v16hf)__A, (__v16hf)__B, (__v16hf)__W, (__mmask16)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_scalef_ph(__mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_scalefph256_mask(
      (__v16hf)__A, (__v16hf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
}
#define _mm_roundscale_ph(A, imm)                                              \
  ((__m128h)__builtin_ia32_rndscaleph_128_mask(                                \
      (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)_mm_setzero_ph(),              \
      (__mmask8)-1))

#define _mm_mask_roundscale_ph(W, U, A, imm)                                   \
  ((__m128h)__builtin_ia32_rndscaleph_128_mask(                                \
      (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)(__m128h)(W), (__mmask8)(U)))

#define _mm_maskz_roundscale_ph(U, A, imm)                                     \
  ((__m128h)__builtin_ia32_rndscaleph_128_mask(                                \
      (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)_mm_setzero_ph(),              \
      (__mmask8)(U)))

#define _mm256_roundscale_ph(A, imm)                                           \
  ((__m256h)__builtin_ia32_rndscaleph_256_mask(                                \
      (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(),         \
      (__mmask16)-1))

#define _mm256_mask_roundscale_ph(W, U, A, imm)                                \
  ((__m256h)__builtin_ia32_rndscaleph_256_mask(                                \
      (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)(__m256h)(W),                \
      (__mmask16)(U)))

#define _mm256_maskz_roundscale_ph(U, A, imm)                                  \
  ((__m256h)__builtin_ia32_rndscaleph_256_mask(                                \
      (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(),         \
      (__mmask16)(U)))
#define _mm_reduce_ph(A, imm)                                                  \
  ((__m128h)__builtin_ia32_reduceph128_mask((__v8hf)(__m128h)(A), (int)(imm),  \
                                            (__v8hf)_mm_setzero_ph(),          \
                                            (__mmask8)-1))

#define _mm_mask_reduce_ph(W, U, A, imm)                                       \
  ((__m128h)__builtin_ia32_reduceph128_mask(                                   \
      (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)(__m128h)(W), (__mmask8)(U)))

#define _mm_maskz_reduce_ph(U, A, imm)                                         \
  ((__m128h)__builtin_ia32_reduceph128_mask((__v8hf)(__m128h)(A), (int)(imm),  \
                                            (__v8hf)_mm_setzero_ph(),          \
                                            (__mmask8)(U)))

#define _mm256_reduce_ph(A, imm)                                               \
  ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \
                                            (__v16hf)_mm256_setzero_ph(),      \
                                            (__mmask16)-1))

#define _mm256_mask_reduce_ph(W, U, A, imm)                                    \
  ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \
                                            (__v16hf)(__m256h)(W),             \
                                            (__mmask16)(U)))

#define _mm256_maskz_reduce_ph(U, A, imm)                                      \
  ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \
                                            (__v16hf)_mm256_setzero_ph(),      \
                                            (__mmask16)(U)))
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_ph(__m128h __a) {
  return __builtin_ia32_sqrtph((__v8hf)__a);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_sqrt_ph(__m128h __W, __mmask8 __U, __m128h __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm_sqrt_ph(__A), (__v8hf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_sqrt_ph(__mmask8 __U, __m128h __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm_sqrt_ph(__A), (__v8hf)_mm_setzero_ph());
}

static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_sqrt_ph(__m256h __a) {
  return (__m256h)__builtin_ia32_sqrtph256((__v16hf)__a);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_sqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U, (__v16hf)_mm256_sqrt_ph(__A), (__v16hf)__W);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_sqrt_ph(__mmask16 __U, __m256h __A) {
  return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U,
                                              (__v16hf)_mm256_sqrt_ph(__A),
                                              (__v16hf)_mm256_setzero_ph());
}
#define _mm_mask_fpclass_ph_mask(U, A, imm)                                    \
  ((__mmask8)__builtin_ia32_fpclassph128_mask((__v8hf)(__m128h)(A),            \
                                              (int)(imm), (__mmask8)(U)))

#define _mm_fpclass_ph_mask(A, imm)                                            \
  ((__mmask8)__builtin_ia32_fpclassph128_mask((__v8hf)(__m128h)(A),            \
                                              (int)(imm), (__mmask8)-1))

#define _mm256_mask_fpclass_ph_mask(U, A, imm)                                 \
  ((__mmask16)__builtin_ia32_fpclassph256_mask((__v16hf)(__m256h)(A),          \
                                               (int)(imm), (__mmask16)(U)))

#define _mm256_fpclass_ph_mask(A, imm)                                         \
  ((__mmask16)__builtin_ia32_fpclassph256_mask((__v16hf)(__m256h)(A),          \
                                               (int)(imm), (__mmask16)-1))
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtpd_ph(__m128d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph128_mask(
      (__v2df)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m128d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph128_mask((__v2df)__A, (__v8hf)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtpd_ph(__mmask8 __U, __m128d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph128_mask(
      (__v2df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256 _mm256_cvtpd_ph(__m256d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph256_mask(
      (__v4df)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m256d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph256_mask((__v4df)__A, (__v8hf)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtpd_ph(__mmask8 __U, __m256d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph256_mask(
      (__v4df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtph_pd(__m128h __A) {
  return (__m128d)__builtin_ia32_vcvtph2pd128_mask(
      (__v8hf)__A, (__v2df)_mm_undefined_pd(), (__mmask8)-1);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_cvtph_pd(__m128d __W, __mmask8 __U, __m128h __A) {
  return (__m128d)__builtin_ia32_vcvtph2pd128_mask((__v8hf)__A, (__v2df)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
  return (__m128d)__builtin_ia32_vcvtph2pd128_mask(
      (__v8hf)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_cvtph_pd(__m128h __A) {
  return (__m256d)__builtin_ia32_vcvtph2pd256_mask(
      (__v8hf)__A, (__v4df)_mm256_undefined_pd(), (__mmask8)-1);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_mask_cvtph_pd(__m256d __W, __mmask8 __U, __m128h __A) {
  return (__m256d)__builtin_ia32_vcvtph2pd256_mask((__v8hf)__A, (__v4df)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
  return (__m256d)__builtin_ia32_vcvtph2pd256_mask(
      (__v8hf)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi16(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2w128_mask(
      (__v8hf)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtph_epi16(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2w128_mask((__v8hf)__A, (__v8hi)__W,
                                                  (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtph_epi16(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2w128_mask(
      (__v8hf)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtph_epi16(__m256h __A) {
  return (__m256i)__builtin_ia32_vcvtph2w256_mask(
      (__v16hf)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtph_epi16(__m256i __W, __mmask16 __U, __m256h __A) {
  return (__m256i)__builtin_ia32_vcvtph2w256_mask((__v16hf)__A, (__v16hi)__W,
                                                  (__mmask16)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtph_epi16(__mmask16 __U, __m256h __A) {
  return (__m256i)__builtin_ia32_vcvtph2w256_mask(
      (__v16hf)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi16(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2w128_mask(
      (__v8hf)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttph_epi16(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2w128_mask((__v8hf)__A, (__v8hi)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttph_epi16(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2w128_mask(
      (__v8hf)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttph_epi16(__m256h __A) {
  return (__m256i)__builtin_ia32_vcvttph2w256_mask(
      (__v16hf)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttph_epi16(__m256i __W, __mmask16 __U, __m256h __A) {
  return (__m256i)__builtin_ia32_vcvttph2w256_mask((__v16hf)__A, (__v16hi)__W,
                                                   (__mmask16)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttph_epi16(__mmask16 __U, __m256h __A) {
  return (__m256i)__builtin_ia32_vcvttph2w256_mask(
      (__v16hf)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U);
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi16_ph(__m128i __A) {
  return (__m128h) __builtin_convertvector((__v8hi)__A, __v8hf);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_cvtepi16_ph(__m128h __W, __mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm_cvtepi16_ph(__A), (__v8hf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtepi16_ph(__mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm_cvtepi16_ph(__A), (__v8hf)_mm_setzero_ph());
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_cvtepi16_ph(__m256i __A) {
  return (__m256h) __builtin_convertvector((__v16hi)__A, __v16hf);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtepi16_ph(__m256h __W, __mmask16 __U, __m256i __A) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U, (__v16hf)_mm256_cvtepi16_ph(__A), (__v16hf)__W);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtepi16_ph(__mmask16 __U, __m256i __A) {
  return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U,
                                              (__v16hf)_mm256_cvtepi16_ph(__A),
                                              (__v16hf)_mm256_setzero_ph());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu16(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2uw128_mask(
      (__v8hf)__A, (__v8hu)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtph_epu16(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2uw128_mask((__v8hf)__A, (__v8hu)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtph_epu16(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2uw128_mask(
      (__v8hf)__A, (__v8hu)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtph_epu16(__m256h __A) {
  return (__m256i)__builtin_ia32_vcvtph2uw256_mask(
      (__v16hf)__A, (__v16hu)_mm256_undefined_si256(), (__mmask16)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtph_epu16(__m256i __W, __mmask16 __U, __m256h __A) {
  return (__m256i)__builtin_ia32_vcvtph2uw256_mask((__v16hf)__A, (__v16hu)__W,
                                                   (__mmask16)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtph_epu16(__mmask16 __U, __m256h __A) {
  return (__m256i)__builtin_ia32_vcvtph2uw256_mask(
      (__v16hf)__A, (__v16hu)_mm256_setzero_si256(), (__mmask16)__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu16(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2uw128_mask(
      (__v8hf)__A, (__v8hu)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttph_epu16(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2uw128_mask((__v8hf)__A, (__v8hu)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttph_epu16(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2uw128_mask(
      (__v8hf)__A, (__v8hu)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttph_epu16(__m256h __A) {
  return (__m256i)__builtin_ia32_vcvttph2uw256_mask(
      (__v16hf)__A, (__v16hu)_mm256_undefined_si256(), (__mmask16)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttph_epu16(__m256i __W, __mmask16 __U, __m256h __A) {
  return (__m256i)__builtin_ia32_vcvttph2uw256_mask((__v16hf)__A, (__v16hu)__W,
                                                    (__mmask16)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttph_epu16(__mmask16 __U, __m256h __A) {
  return (__m256i)__builtin_ia32_vcvttph2uw256_mask(
      (__v16hf)__A, (__v16hu)_mm256_setzero_si256(), (__mmask16)__U);
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu16_ph(__m128i __A) {
  return (__m128h) __builtin_convertvector((__v8hu)__A, __v8hf);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_cvtepu16_ph(__m128h __W, __mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm_cvtepu16_ph(__A), (__v8hf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtepu16_ph(__mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm_cvtepu16_ph(__A), (__v8hf)_mm_setzero_ph());
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_cvtepu16_ph(__m256i __A) {
  return (__m256h) __builtin_convertvector((__v16hu)__A, __v16hf);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtepu16_ph(__m256h __W, __mmask16 __U, __m256i __A) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U, (__v16hf)_mm256_cvtepu16_ph(__A), (__v16hf)__W);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtepu16_ph(__mmask16 __U, __m256i __A) {
  return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U,
                                              (__v16hf)_mm256_cvtepu16_ph(__A),
                                              (__v16hf)_mm256_setzero_ph());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi32(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2dq128_mask(
      (__v8hf)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtph_epi32(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2dq128_mask((__v8hf)__A, (__v4si)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtph_epi32(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2dq128_mask(
      (__v8hf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtph_epi32(__m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2dq256_mask(
      (__v8hf)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtph_epi32(__m256i __W, __mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2dq256_mask((__v8hf)__A, (__v8si)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtph_epi32(__mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2dq256_mask(
      (__v8hf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu32(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2udq128_mask(
      (__v8hf)__A, (__v4su)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtph_epu32(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2udq128_mask((__v8hf)__A, (__v4su)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtph_epu32(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2udq128_mask(
      (__v8hf)__A, (__v4su)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtph_epu32(__m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2udq256_mask(
      (__v8hf)__A, (__v8su)_mm256_undefined_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtph_epu32(__m256i __W, __mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2udq256_mask((__v8hf)__A, (__v8su)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtph_epu32(__mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2udq256_mask(
      (__v8hf)__A, (__v8su)_mm256_setzero_si256(), (__mmask8)__U);
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi32_ph(__m128i __A) {
  return (__m128h)__builtin_ia32_vcvtdq2ph128_mask(
      (__v4si)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_cvtepi32_ph(__m128h __W, __mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_vcvtdq2ph128_mask((__v4si)__A, (__v8hf)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtepi32_ph(__mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_vcvtdq2ph128_mask(
      (__v4si)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_cvtepi32_ph(__m256i __A) {
  return (__m128h) __builtin_convertvector((__v8si)__A, __v8hf);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtepi32_ph(__m128h __W, __mmask8 __U, __m256i __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm256_cvtepi32_ph(__A), (__v8hf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtepi32_ph(__mmask8 __U, __m256i __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm256_cvtepi32_ph(__A), (__v8hf)_mm_setzero_ph());
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu32_ph(__m128i __A) {
  return (__m128h)__builtin_ia32_vcvtudq2ph128_mask(
      (__v4su)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_cvtepu32_ph(__m128h __W, __mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_vcvtudq2ph128_mask((__v4su)__A, (__v8hf)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtepu32_ph(__mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_vcvtudq2ph128_mask(
      (__v4su)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_cvtepu32_ph(__m256i __A) {
  return (__m128h) __builtin_convertvector((__v8su)__A, __v8hf);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtepu32_ph(__m128h __W, __mmask8 __U, __m256i __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm256_cvtepu32_ph(__A), (__v8hf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtepu32_ph(__mmask8 __U, __m256i __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm256_cvtepu32_ph(__A), (__v8hf)_mm_setzero_ph());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi32(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2dq128_mask(
      (__v8hf)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttph_epi32(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2dq128_mask((__v8hf)__A, (__v4si)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttph_epi32(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2dq128_mask(
      (__v8hf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttph_epi32(__m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2dq256_mask(
      (__v8hf)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttph_epi32(__m256i __W, __mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2dq256_mask((__v8hf)__A, (__v8si)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttph_epi32(__mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2dq256_mask(
      (__v8hf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu32(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2udq128_mask(
      (__v8hf)__A, (__v4su)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttph_epu32(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2udq128_mask((__v8hf)__A, (__v4su)__W,
                                                     (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttph_epu32(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2udq128_mask(
      (__v8hf)__A, (__v4su)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttph_epu32(__m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2udq256_mask(
      (__v8hf)__A, (__v8su)_mm256_undefined_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttph_epu32(__m256i __W, __mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2udq256_mask((__v8hf)__A, (__v8su)__W,
                                                     (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttph_epu32(__mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2udq256_mask(
      (__v8hf)__A, (__v8su)_mm256_setzero_si256(), (__mmask8)__U);
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi64_ph(__m128i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph128_mask(
      (__v2di)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph128_mask((__v2di)__A, (__v8hf)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtepi64_ph(__mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph128_mask(
      (__v2di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_cvtepi64_ph(__m256i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph256_mask(
      (__v4di)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m256i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph256_mask((__v4di)__A, (__v8hf)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtepi64_ph(__mmask8 __U, __m256i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph256_mask(
      (__v4di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi64(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2qq128_mask(
      (__v8hf)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtph_epi64(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2qq128_mask((__v8hf)__A, (__v2di)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2qq128_mask(
      (__v8hf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtph_epi64(__m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2qq256_mask(
      (__v8hf)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtph_epi64(__m256i __W, __mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2qq256_mask((__v8hf)__A, (__v4di)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2qq256_mask(
      (__v8hf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu64_ph(__m128i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask(
      (__v2du)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask((__v2du)__A, (__v8hf)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtepu64_ph(__mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask(
      (__v2du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_cvtepu64_ph(__m256i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask(
      (__v4du)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m256i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask((__v4du)__A, (__v8hf)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtepu64_ph(__mmask8 __U, __m256i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask(
      (__v4du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu64(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2uqq128_mask(
      (__v8hf)__A, (__v2du)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtph_epu64(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2uqq128_mask((__v8hf)__A, (__v2du)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2uqq128_mask(
      (__v8hf)__A, (__v2du)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtph_epu64(__m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2uqq256_mask(
      (__v8hf)__A, (__v4du)_mm256_undefined_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtph_epu64(__m256i __W, __mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2uqq256_mask((__v8hf)__A, (__v4du)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2uqq256_mask(
      (__v8hf)__A, (__v4du)_mm256_setzero_si256(), (__mmask8)__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi64(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2qq128_mask(
      (__v8hf)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttph_epi64(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2qq128_mask((__v8hf)__A, (__v2di)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2qq128_mask(
      (__v8hf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttph_epi64(__m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2qq256_mask(
      (__v8hf)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttph_epi64(__m256i __W, __mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2qq256_mask((__v8hf)__A, (__v4di)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2qq256_mask(
      (__v8hf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu64(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2uqq128_mask(
      (__v8hf)__A, (__v2du)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttph_epu64(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2uqq128_mask((__v8hf)__A, (__v2du)__W,
                                                     (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2uqq128_mask(
      (__v8hf)__A, (__v2du)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttph_epu64(__m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2uqq256_mask(
      (__v8hf)__A, (__v4du)_mm256_undefined_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttph_epu64(__m256i __W, __mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2uqq256_mask((__v8hf)__A, (__v4du)__W,
                                                     (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2uqq256_mask(
      (__v8hf)__A, (__v4du)_mm256_setzero_si256(), (__mmask8)__U);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtxph_ps(__m128h __A) {
  return (__m128)__builtin_ia32_vcvtph2psx128_mask(
      (__v8hf)__A, (__v4sf)_mm_undefined_ps(), (__mmask8)-1);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_cvtxph_ps(__m128 __W, __mmask8 __U, __m128h __A) {
  return (__m128)__builtin_ia32_vcvtph2psx128_mask((__v8hf)__A, (__v4sf)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_cvtxph_ps(__mmask8 __U, __m128h __A) {
  return (__m128)__builtin_ia32_vcvtph2psx128_mask(
      (__v8hf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtxph_ps(__m128h __A) {
  return (__m256)__builtin_ia32_vcvtph2psx256_mask(
      (__v8hf)__A, (__v8sf)_mm256_undefined_ps(), (__mmask8)-1);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_mask_cvtxph_ps(__m256 __W, __mmask8 __U, __m128h __A) {
  return (__m256)__builtin_ia32_vcvtph2psx256_mask((__v8hf)__A, (__v8sf)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtxph_ps(__mmask8 __U, __m128h __A) {
  return (__m256)__builtin_ia32_vcvtph2psx256_mask(
      (__v8hf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U);
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtxps_ph(__m128 __A) {
  return (__m128h)__builtin_ia32_vcvtps2phx128_mask(
      (__v4sf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_cvtxps_ph(__m128h __W, __mmask8 __U, __m128 __A) {
  return (__m128h)__builtin_ia32_vcvtps2phx128_mask((__v4sf)__A, (__v8hf)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtxps_ph(__mmask8 __U, __m128 __A) {
  return (__m128h)__builtin_ia32_vcvtps2phx128_mask(
      (__v4sf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256 _mm256_cvtxps_ph(__m256 __A) {
  return (__m128h)__builtin_ia32_vcvtps2phx256_mask(
      (__v8sf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtxps_ph(__m128h __W, __mmask8 __U, __m256 __A) {
  return (__m128h)__builtin_ia32_vcvtps2phx256_mask((__v8sf)__A, (__v8hf)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtxps_ph(__mmask8 __U, __m256 __A) {
  return (__m128h)__builtin_ia32_vcvtps2phx256_mask(
      (__v8sf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}
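/* Illustrative usage sketch: the cvtx pair converts between single and half
 * precision, with the half-precision side always held in a 128-bit vector
 * (eight elements for the 256-bit float form):
 *
 *   __m256 f = _mm256_set1_ps(1.5f);
 *   __m128h h = _mm256_cvtxps_ph(f);    // 8 x _Float16
 *   __m256 back = _mm256_cvtxph_ps(h);  // 8 x float again
 */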
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_fmadd_ph(__m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B,
                                          (__v8hf)__C);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)__A);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)__C);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)_mm_setzero_ph());
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_fmsub_ph(__m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B,
                                          -(__v8hf)__C);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, _mm_fmsub_ph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)__A);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, _mm_fmsub_ph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)_mm_setzero_ph());
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fnmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)__C);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fnmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)_mm_setzero_ph());
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fnmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
      (__v8hf)_mm_setzero_ph());
}
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_fmadd_ph(__m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B,
                                             (__v16hf)__C);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
      (__v16hf)__A);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask3_fmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
      (__v16hf)__C);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
      (__v16hf)_mm256_setzero_ph());
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_fmsub_ph(__m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B,
                                             -(__v16hf)__C);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
      (__v16hf)__A);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
      (__v16hf)_mm256_setzero_ph());
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask3_fnmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
      (__v16hf)__C);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fnmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
      (__v16hf)_mm256_setzero_ph());
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fnmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
      (__v16hf)_mm256_setzero_ph());
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_fmaddsub_ph(__m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B,
                                             (__v8hf)__C);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmaddsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)__A);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmaddsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)__C);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmaddsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)_mm_setzero_ph());
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_fmsubadd_ph(__m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B,
                                             -(__v8hf)__C);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmsubadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
      (__v8hf)__A);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmsubadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
      (__v8hf)_mm_setzero_ph());
}
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_fmaddsub_ph(__m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B,
                                                (__v16hf)__C);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fmaddsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
      (__v16hf)__A);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask3_fmaddsub_ph(__m256h __A, __m256h __B, __m256h __C,
                         __mmask16 __U) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
      (__v16hf)__C);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fmaddsub_ph(__mmask16 __U, __m256h __A, __m256h __B,
                         __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
      (__v16hf)_mm256_setzero_ph());
}
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B,
                                                -(__v16hf)__C);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fmsubadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
      (__v16hf)__A);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fmsubadd_ph(__mmask16 __U, __m256h __A, __m256h __B,
                         __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
      (__v16hf)_mm256_setzero_ph());
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
      (__v8hf)__C);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask3_fmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
      (__v16hf)__C);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmsubadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
      (__v8hf)__C);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask3_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C,
                         __mmask16 __U) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
      (__v16hf)__C);
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_fnmadd_ph(__m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B,
                                          (__v8hf)__C);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fnmadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C),
      (__v8hf)__A);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_fnmadd_ph(__m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B,
                                             (__v16hf)__C);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fnmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, (__v16hf)__C),
      (__v16hf)__A);
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_fnmsub_ph(__m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B,
                                          -(__v8hf)__C);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fnmsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C),
      (__v8hf)__A);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fnmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C),
      (__v8hf)__C);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_fnmsub_ph(__m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B,
                                             -(__v16hf)__C);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fnmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C),
      (__v16hf)__A);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask3_fnmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C),
      (__v16hf)__C);
}
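
/* Illustrative sketch (not compiled; guarded out): a hypothetical helper
 * showing how the masked FMA forms above differ in what masked-off lanes
 * receive (__A for mask_, __C for mask3_, zero for maskz_). Assumes the
 * translation unit enables the avx512fp16 and avx512vl features. */
#if 0
static __m256h example_masked_fma(__m256h a, __m256h b, __m256h c,
                                  __mmask16 keep) {
  __m256h r0 = _mm256_mask_fmadd_ph(a, keep, b, c);  /* masked-off lanes: a */
  __m256h r1 = _mm256_mask3_fmadd_ph(a, b, c, keep); /* masked-off lanes: c */
  __m256h r2 = _mm256_maskz_fmadd_ph(keep, a, b, c); /* masked-off lanes: 0 */
  return _mm256_add_ph(_mm256_add_ph(r0, r1), r2);
}
#endif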
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_fcmul_pch(__m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfcmulcph128_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fcmul_pch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfcmulcph128_mask((__v4sf)__A, (__v4sf)__B,
                                                   (__v4sf)__W, (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fcmul_pch(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfcmulcph128_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U);
}
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_fcmul_pch(__m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_vfcmulcph256_mask(
      (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_undefined_ph(), (__mmask8)-1);
}
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fcmul_pch(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_vfcmulcph256_mask((__v8sf)__A, (__v8sf)__B,
                                                   (__v8sf)__W, (__mmask8)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fcmul_pch(__mmask8 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_vfcmulcph256_mask(
      (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ph(), (__mmask8)__U);
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_fcmadd_pch(__m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)__B,
                                                    (__v4sf)__C, (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fcmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectps_128(
      __U,
      __builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)(__m128h)__B,
                                        (__v4sf)__C, (__mmask8)__U),
      (__v4sf)__A);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fcmadd_pch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)__B,
                                                    (__v4sf)__C, (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fcmadd_pch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfcmaddcph128_maskz(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U);
}
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_fcmadd_pch(__m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B,
                                                    (__v8sf)__C, (__mmask8)-1);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fcmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectps_256(
      __U,
      __builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C,
                                        (__mmask8)__U),
      (__v8sf)__A);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask3_fcmadd_pch(__m256h __A, __m256h __B, __m256h __C, __mmask8 __U) {
  return (__m256h)__builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B,
                                                    (__v8sf)__C, (__mmask8)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fcmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_vfcmaddcph256_maskz(
      (__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U);
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_fmul_pch(__m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfmulcph128_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmul_pch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfmulcph128_mask((__v4sf)__A, (__v4sf)__B,
                                                  (__v4sf)__W, (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmul_pch(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfmulcph128_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U);
}
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_fmul_pch(__m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_vfmulcph256_mask(
      (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fmul_pch(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_vfmulcph256_mask((__v8sf)__A, (__v8sf)__B,
                                                  (__v8sf)__W, (__mmask8)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fmul_pch(__mmask8 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_vfmulcph256_mask(
      (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ph(), (__mmask8)__U);
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_fmadd_pch(__m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B,
                                                   (__v4sf)__C, (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectps_128(
      __U,
      __builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__C,
                                       (__mmask8)__U),
      (__v4sf)__A);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmadd_pch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B,
                                                   (__v4sf)__C, (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmadd_pch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddcph128_maskz((__v4sf)__A, (__v4sf)__B,
                                                    (__v4sf)__C, (__mmask8)__U);
}
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_fmadd_pch(__m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B,
                                                   (__v8sf)__C, (__mmask8)-1);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectps_256(
      __U,
      __builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C,
                                       (__mmask8)__U),
      (__v8sf)__A);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask3_fmadd_pch(__m256h __A, __m256h __B, __m256h __C, __mmask8 __U) {
  return (__m256h)__builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B,
                                                   (__v8sf)__C, (__mmask8)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_vfmaddcph256_maskz((__v8sf)__A, (__v8sf)__B,
                                                    (__v8sf)__C, (__mmask8)__U);
}
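
/* Illustrative sketch (not compiled; guarded out): accumulating packed
 * complex FP16 products with _mm_fmadd_pch (a * b + c, where each complex
 * value occupies a pair of half-precision lanes). The fcmadd_pch variants
 * do the same with one operand conjugated. The helper name and loop bound
 * are hypothetical; assumes avx512fp16 and avx512vl are enabled. */
#if 0
static __m128h example_complex_accumulate(const __m128h *a, const __m128h *b,
                                          int n) {
  __m128h acc = _mm_setzero_ph();
  for (int i = 0; i < n; ++i)
    acc = _mm_fmadd_pch(a[i], b[i], acc); /* acc += a[i] * b[i] (complex) */
  return acc;
}
#endif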
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_blend_ph(__mmask8 __U, __m128h __A, __m128h __W) {
  return (__m128h)__builtin_ia32_selectph_128((__mmask8)__U, (__v8hf)__W,
                                              (__v8hf)__A);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_blend_ph(__mmask16 __U, __m256h __A, __m256h __W) {
  return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, (__v16hf)__W,
                                              (__v16hf)__A);
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_permutex2var_ph(__m128h __A, __m128i __I, __m128h __B) {
  return (__m128h)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I,
                                                 (__v8hi)__B);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_permutex2var_ph(__m256h __A, __m256i __I, __m256h __B) {
  return (__m256h)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I,
                                                 (__v16hi)__B);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_permutexvar_ph(__m128i __A, __m128h __B) {
  return (__m128h)__builtin_ia32_permvarhi128((__v8hi)__B, (__v8hi)__A);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_permutexvar_ph(__m256i __A, __m256h __B) {
  return (__m256h)__builtin_ia32_permvarhi256((__v16hi)__B, (__v16hi)__A);
}
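
/* Illustrative sketch (not compiled; guarded out): reversing the eight FP16
 * lanes of a vector with _mm_permutexvar_ph. Lane i of the result takes the
 * lane of the data operand selected by index element i; the index vector
 * below is a hypothetical example value. */
#if 0
static __m128h example_reverse_lanes(__m128h v) {
  __m128i idx = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); /* idx[i] = 7 - i */
  return _mm_permutexvar_ph(idx, v);
}
#endif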
static __inline__ _Float16 __DEFAULT_FN_ATTRS256
_mm256_reduce_add_ph(__m256h __W) {
  return __builtin_ia32_reduce_fadd_ph256(-0.0f16, __W);
}

static __inline__ _Float16 __DEFAULT_FN_ATTRS256
_mm256_reduce_mul_ph(__m256h __W) {
  return __builtin_ia32_reduce_fmul_ph256(1.0f16, __W);
}

static __inline__ _Float16 __DEFAULT_FN_ATTRS256
_mm256_reduce_max_ph(__m256h __V) {
  return __builtin_ia32_reduce_fmax_ph256(__V);
}

static __inline__ _Float16 __DEFAULT_FN_ATTRS256
_mm256_reduce_min_ph(__m256h __V) {
  return __builtin_ia32_reduce_fmin_ph256(__V);
}

static __inline__ _Float16 __DEFAULT_FN_ATTRS128
_mm_reduce_add_ph(__m128h __W) {
  return __builtin_ia32_reduce_fadd_ph128(-0.0f16, __W);
}

static __inline__ _Float16 __DEFAULT_FN_ATTRS128
_mm_reduce_mul_ph(__m128h __W) {
  return __builtin_ia32_reduce_fmul_ph128(1.0f16, __W);
}

static __inline__ _Float16 __DEFAULT_FN_ATTRS128
_mm_reduce_max_ph(__m128h __V) {
  return __builtin_ia32_reduce_fmax_ph128(__V);
}

static __inline__ _Float16 __DEFAULT_FN_ATTRS128
_mm_reduce_min_ph(__m128h __V) {
  return __builtin_ia32_reduce_fmin_ph128(__V);
}
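
/* Illustrative sketch (not compiled; guarded out): the reduce_* helpers above
 * fold a whole vector into one _Float16; the add and mul reductions start
 * from -0.0f16 and 1.0f16, the identities of the respective operations. The
 * helper name below is hypothetical. */
#if 0
static _Float16 example_dot8(__m128h a, __m128h b) {
  /* Elementwise product, then a horizontal add across the 8 half lanes. */
  return _mm_reduce_add_ph(_mm_mul_ph(a, b));
}
#endif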
// The intrinsics below are aliases for the f*mul_*ch intrinsics.
#define _mm_mul_pch(A, B) _mm_fmul_pch(A, B)
#define _mm_mask_mul_pch(W, U, A, B) _mm_mask_fmul_pch(W, U, A, B)
#define _mm_maskz_mul_pch(U, A, B) _mm_maskz_fmul_pch(U, A, B)
#define _mm256_mul_pch(A, B) _mm256_fmul_pch(A, B)
#define _mm256_mask_mul_pch(W, U, A, B) _mm256_mask_fmul_pch(W, U, A, B)
#define _mm256_maskz_mul_pch(U, A, B) _mm256_maskz_fmul_pch(U, A, B)

#define _mm_cmul_pch(A, B) _mm_fcmul_pch(A, B)
#define _mm_mask_cmul_pch(W, U, A, B) _mm_mask_fcmul_pch(W, U, A, B)
#define _mm_maskz_cmul_pch(U, A, B) _mm_maskz_fcmul_pch(U, A, B)
#define _mm256_cmul_pch(A, B) _mm256_fcmul_pch(A, B)
#define _mm256_mask_cmul_pch(W, U, A, B) _mm256_mask_fcmul_pch(W, U, A, B)
#define _mm256_maskz_cmul_pch(U, A, B) _mm256_maskz_fcmul_pch(U, A, B)
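
/* Illustrative sketch (not compiled; guarded out): the _pch aliases operate
 * on packed complex FP16 values, each occupying a pair of half-precision
 * lanes; _mm_cmul_pch is the conjugating variant of _mm_mul_pch. The helper
 * name is hypothetical; assumes avx512fp16 and avx512vl are enabled. */
#if 0
static __m128h example_complex_products(__m128h a, __m128h b) {
  __m128h p = _mm_mul_pch(a, b);    /* expands to _mm_fmul_pch(a, b)  */
  __m128h cp = _mm_cmul_pch(a, b);  /* expands to _mm_fcmul_pch(a, b) */
  return _mm_add_ph(p, cp);
}
#endif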
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256

#endif