/*===---- avx512fintrin.h - AVX-512F intrinsics ----------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <avx512fintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVX512FINTRIN_H
#define __AVX512FINTRIN_H

typedef double __v8df __attribute__((__vector_size__(64)));
typedef float __v16sf __attribute__((__vector_size__(64)));
typedef long long __v8di __attribute__((__vector_size__(64)));
typedef int __v16si __attribute__((__vector_size__(64)));

typedef float __m512 __attribute__((__vector_size__(64)));
typedef double __m512d __attribute__((__vector_size__(64)));
typedef long long __m512i __attribute__((__vector_size__(64)));

typedef unsigned char __mmask8;
typedef unsigned short __mmask16;

/* Rounding mode macros. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_NEG_INF     0x01
#define _MM_FROUND_TO_POS_INF     0x02
#define _MM_FROUND_TO_ZERO        0x03
#define _MM_FROUND_CUR_DIRECTION  0x04

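/* Added commentary (not part of the original header): the _MM_FROUND_* macros
 * select the rounding behaviour of the "_round" intrinsics defined below, while
 * _MM_FROUND_CUR_DIRECTION keeps whatever mode the MXCSR register currently
 * selects.  A minimal, hypothetical usage sketch:
 *
 *   __m512i __ints = _mm512_cvt_roundps_epi32(__vals, _MM_FROUND_CUR_DIRECTION);
 *
 * where __vals is a caller-provided __m512.
 */
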
/* Create vectors with repeated elements */

static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_setzero_si512(void)
{
  return (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 };
}

static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_maskz_set1_epi32(__mmask16 __M, int __A)
{
  return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A,
                     (__v16si) _mm512_setzero_si512 (),
                     __M);
}

static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_maskz_set1_epi64(__mmask8 __M, long long __A)
{
#ifdef __x86_64__
  return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A,
                     (__v8di) _mm512_setzero_si512 (),
                     __M);
#else
  return (__m512i) __builtin_ia32_pbroadcastq512_mem_mask (__A,
                     (__v8di) _mm512_setzero_si512 (),
                     __M);
#endif
}

static __inline __m512 __attribute__ ((__always_inline__, __nodebug__))
_mm512_setzero_ps(void)
{
  return (__m512){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
}

static __inline __m512d __attribute__ ((__always_inline__, __nodebug__))
_mm512_setzero_pd(void)
{
  return (__m512d){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
}

static __inline __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_set1_ps(float __w)
{
  return (__m512){ __w, __w, __w, __w, __w, __w, __w, __w,
                   __w, __w, __w, __w, __w, __w, __w, __w };
}

static __inline __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_set1_pd(double __w)
{
  return (__m512d){ __w, __w, __w, __w, __w, __w, __w, __w };
}

static __inline __m512i __attribute__((__always_inline__, __nodebug__))
_mm512_set1_epi32(int __s)
{
  return (__m512i)(__v16si){ __s, __s, __s, __s, __s, __s, __s, __s,
                             __s, __s, __s, __s, __s, __s, __s, __s };
}

static __inline __m512i __attribute__((__always_inline__, __nodebug__))
_mm512_set1_epi64(long long __d)
{
  return (__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d };
}

static __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_broadcastss_ps(__m128 __X)
{
  float __f = __X[0];
  return (__v16sf){ __f, __f, __f, __f,
                    __f, __f, __f, __f,
                    __f, __f, __f, __f,
                    __f, __f, __f, __f };
}

static __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_broadcastsd_pd(__m128d __X)
{
  double __d = __X[0];
  return (__v8df){ __d, __d, __d, __d,
                   __d, __d, __d, __d };
}

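/* Illustrative sketch (not part of the original header): _mm512_set1_* and the
 * broadcast helpers above all replicate one scalar across every lane of a
 * 512-bit vector.  Assuming hypothetical caller-side variables:
 *
 *   __m512  __all_pi  = _mm512_set1_ps(3.14159f);     // 16 identical float lanes
 *   __m512d __from_lo = _mm512_broadcastsd_pd(__x);   // lane 0 of __x, 8 times
 *
 * where __x is a caller-provided __m128d.
 */
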
/* Cast between vector types */

static __inline __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_castpd256_pd512(__m256d __a)
{
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1);
}

static __inline __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_castps256_ps512(__m256 __a)
{
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7,
                                 -1, -1, -1, -1, -1, -1, -1, -1);
}

static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm512_castpd512_pd128(__m512d __a)
{
  return __builtin_shufflevector(__a, __a, 0, 1);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm512_castps512_ps128(__m512 __a)
{
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
}

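/* Added commentary (not part of the original header): the cast helpers only
 * reinterpret register width.  Widening casts leave the upper lanes undefined
 * (the -1 shuffle indices above); narrowing casts simply drop them.  Sketch
 * with hypothetical caller-side variables:
 *
 *   __m512 __wide = _mm512_castps256_ps512(__lo256);  // upper 8 lanes undefined
 *   __m128 __low  = _mm512_castps512_ps128(__wide);   // lowest 4 lanes of __wide
 *
 * where __lo256 is a caller-provided __m256.
 */
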
static __inline __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_add_pd(__m512d __a, __m512d __b)
{
  return __a + __b;
}

static __inline __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_add_ps(__m512 __a, __m512 __b)
{
  return __a + __b;
}

static __inline __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_mul_pd(__m512d __a, __m512d __b)
{
  return __a * __b;
}

static __inline __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_mul_ps(__m512 __a, __m512 __b)
{
  return __a * __b;
}

static __inline __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_sub_pd(__m512d __a, __m512d __b)
{
  return __a - __b;
}

static __inline __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_sub_ps(__m512 __a, __m512 __b)
{
  return __a - __b;
}

static __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_max_pd(__m512d __A, __m512d __B)
{
  return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A, (__v8df) __B,
                     (__v8df) _mm512_setzero_pd (), (__mmask8) -1,
                     _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_max_ps(__m512 __A, __m512 __B)
{
  return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A, (__v16sf) __B,
                    (__v16sf) _mm512_setzero_ps (), (__mmask16) -1,
                    _MM_FROUND_CUR_DIRECTION);
}

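/* Added commentary (not part of the original header): the *_mask builtins used
 * throughout this file take a pass-through vector and a write mask after the
 * operands.  The unmasked wrappers pass a zero vector plus an all-ones mask
 * ((__mmask8/__mmask16) -1) so every result lane is written, and
 * _MM_FROUND_CUR_DIRECTION where the builtin also takes a rounding argument.
 */
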
static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_max_epi32(__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A, (__v16si) __B,
                     (__v16si) _mm512_setzero_si512 (), (__mmask16) -1);
}

static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_max_epu32(__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A, (__v16si) __B,
                     (__v16si) _mm512_setzero_si512 (), (__mmask16) -1);
}

static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_max_epi64(__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A, (__v8di) __B,
                     (__v8di) _mm512_setzero_si512 (), (__mmask8) -1);
}

static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_max_epu64(__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A, (__v8di) __B,
                     (__v8di) _mm512_setzero_si512 (), (__mmask8) -1);
}

static __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_min_pd(__m512d __A, __m512d __B)
{
  return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A, (__v8df) __B,
                     (__v8df) _mm512_setzero_pd (), (__mmask8) -1,
                     _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_min_ps(__m512 __A, __m512 __B)
{
  return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A, (__v16sf) __B,
                    (__v16sf) _mm512_setzero_ps (), (__mmask16) -1,
                    _MM_FROUND_CUR_DIRECTION);
}

static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_min_epi32(__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A, (__v16si) __B,
                     (__v16si) _mm512_setzero_si512 (), (__mmask16) -1);
}

static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_min_epu32(__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A, (__v16si) __B,
                     (__v16si) _mm512_setzero_si512 (), (__mmask16) -1);
}

static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_min_epi64(__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A, (__v8di) __B,
                     (__v8di) _mm512_setzero_si512 (), (__mmask8) -1);
}

static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_min_epu64(__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A, (__v8di) __B,
                     (__v8di) _mm512_setzero_si512 (), (__mmask8) -1);
}

static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_mul_epi32(__m512i __X, __m512i __Y)
{
  return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X, (__v16si) __Y,
                     (__v8di) _mm512_setzero_si512 (), (__mmask8) -1);
}

static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_mul_epu32(__m512i __X, __m512i __Y)
{
  return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X, (__v16si) __Y,
                     (__v8di) _mm512_setzero_si512 (), (__mmask8) -1);
}

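/* Illustrative sketch (not part of the original header): _mm512_mul_epi32 and
 * _mm512_mul_epu32 multiply the even-numbered 32-bit lanes of each operand and
 * produce eight 64-bit products, mirroring the SSE/AVX pmuldq/pmuludq behaviour.
 *
 *   __m512i __prod = _mm512_mul_epi32(_mm512_set1_epi32(3),
 *                                     _mm512_set1_epi32(-7));
 *   // each 64-bit lane of __prod holds -21
 */
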
static __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_sqrt_pd(__m512d a)
{
  return (__m512d)__builtin_ia32_sqrtpd512_mask((__v8df)a,
                    (__v8df) _mm512_setzero_pd (), (__mmask8) -1,
                    _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_sqrt_ps(__m512 a)
{
  return (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)a,
                   (__v16sf) _mm512_setzero_ps (), (__mmask16) -1,
                   _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_rsqrt14_pd(__m512d __A)
{
  return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
                     (__v8df) _mm512_setzero_pd (), (__mmask8) -1);
}

static __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_rsqrt14_ps(__m512 __A)
{
  return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
                    (__v16sf) _mm512_setzero_ps (), (__mmask16) -1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt14_ss(__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A, (__v4sf) __B,
                    (__v4sf) _mm_setzero_ps (), (__mmask8) -1);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt14_sd(__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __A, (__v2df) __B,
                     (__v2df) _mm_setzero_pd (), (__mmask8) -1);
}

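/* Illustrative sketch (not part of the original header): the rsqrt14/rcp14
 * intrinsics return approximations with a relative error bounded by about
 * 2^-14, so callers needing more precision typically add a Newton-Raphson
 * step, e.g. for the reciprocal square root:
 *
 *   __m512 __y = _mm512_rsqrt14_ps(__a);                       // ~14-bit estimate
 *   __y = _mm512_mul_ps(__y, _mm512_sub_ps(_mm512_set1_ps(1.5f),
 *           _mm512_mul_ps(_mm512_set1_ps(0.5f),
 *             _mm512_mul_ps(__a, _mm512_mul_ps(__y, __y)))));  // one refinement
 *
 * where __a is a hypothetical caller-provided __m512 of positive values.
 */
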
static __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_rcp14_pd(__m512d __A)
{
  return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
                     (__v8df) _mm512_setzero_pd (), (__mmask8) -1);
}

static __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_rcp14_ps(__m512 __A)
{
  return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
                    (__v16sf) _mm512_setzero_ps (), (__mmask16) -1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp14_ss(__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A, (__v4sf) __B,
                    (__v4sf) _mm_setzero_ps (), (__mmask8) -1);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_rcp14_sd(__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __A, (__v2df) __B,
                     (__v2df) _mm_setzero_pd (), (__mmask8) -1);
}

static __inline __m512 __attribute__ ((__always_inline__, __nodebug__))
_mm512_floor_ps(__m512 __A)
{
  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
                    _MM_FROUND_FLOOR, (__v16sf) __A, -1,
                    _MM_FROUND_CUR_DIRECTION);
}

static __inline __m512d __attribute__ ((__always_inline__, __nodebug__))
_mm512_floor_pd(__m512d __A)
{
  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
                     _MM_FROUND_FLOOR, (__v8df) __A, -1,
                     _MM_FROUND_CUR_DIRECTION);
}

static __inline __m512 __attribute__ ((__always_inline__, __nodebug__))
_mm512_ceil_ps(__m512 __A)
{
  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
                    _MM_FROUND_CEIL, (__v16sf) __A, -1,
                    _MM_FROUND_CUR_DIRECTION);
}

static __inline __m512d __attribute__ ((__always_inline__, __nodebug__))
_mm512_ceil_pd(__m512d __A)
{
  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
                     _MM_FROUND_CEIL, (__v8df) __A, -1,
                     _MM_FROUND_CUR_DIRECTION);
}

static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_abs_epi64(__m512i __A)
{
  return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A,
                     (__v8di) _mm512_setzero_si512 (), (__mmask8) -1);
}

static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_abs_epi32(__m512i __A)
{
  return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A,
                     (__v16si) _mm512_setzero_si512 (), (__mmask16) -1);
}

static __inline __m512 __attribute__ ((__always_inline__, __nodebug__))
_mm512_roundscale_ps(__m512 __A, const int __imm)
{
  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, __imm,
                    (__v16sf) __A, -1, _MM_FROUND_CUR_DIRECTION);
}

static __inline __m512d __attribute__ ((__always_inline__, __nodebug__))
_mm512_roundscale_pd(__m512d __A, const int __imm)
{
  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, __imm,
                     (__v8df) __A, -1, _MM_FROUND_CUR_DIRECTION);
}

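/* Added commentary (not part of the original header): _mm512_roundscale_ps/pd
 * round each lane as directed by the immediate; _mm512_floor_* and
 * _mm512_ceil_* above are just fixed-immediate cases that round toward
 * negative and positive infinity.  Hypothetical caller-side sketch:
 *
 *   __m512d __down = _mm512_floor_pd(__v);   // e.g. 2.7 -> 2.0, -1.2 -> -2.0
 */
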
static __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d)
    __builtin_ia32_vfmaddpd512_mask(__A, __B, __C, (__mmask8) -1,
                                    _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d)
    __builtin_ia32_vfmsubpd512_mask(__A, __B, __C, (__mmask8) -1,
                                    _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d)
    __builtin_ia32_vfnmaddpd512_mask(__A, __B, __C, (__mmask8) -1,
                                     _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C)
{
  return (__m512)
    __builtin_ia32_vfmaddps512_mask(__A, __B, __C, (__mmask16) -1,
                                    _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C)
{
  return (__m512)
    __builtin_ia32_vfmsubps512_mask(__A, __B, __C, (__mmask16) -1,
                                    _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C)
{
  return (__m512)
    __builtin_ia32_vfnmaddps512_mask(__A, __B, __C, (__mmask16) -1,
                                     _MM_FROUND_CUR_DIRECTION);
}

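/* Illustrative sketch (not part of the original header): the fused operations
 * compute fmadd = a*b + c, fmsub = a*b - c and fnmadd = -(a*b) + c with a
 * single rounding.  For example, a dot-product style accumulation:
 *
 *   __m512 __acc = _mm512_setzero_ps();
 *   __acc = _mm512_fmadd_ps(__x, __y, __acc);   // acc += x * y, lane-wise
 *
 * where __x and __y are hypothetical caller-provided __m512 values.
 */
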
/* Vector permutations */

static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B)
{
  return (__m512i) __builtin_ia32_vpermt2vard512_mask ((__v16si) __I /* idx */,
                     (__v16si) __A, (__v16si) __B, (__mmask16) -1);
}

static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B)
{
  return (__m512i) __builtin_ia32_vpermt2varq512_mask ((__v8di) __I /* idx */,
                     (__v8di) __A, (__v8di) __B, (__mmask8) -1);
}

static __inline __m512d __attribute__ ((__always_inline__, __nodebug__))
_mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B)
{
  return (__m512d) __builtin_ia32_vpermt2varpd512_mask ((__v8di) __I /* idx */,
                     (__v8df) __A, (__v8df) __B, (__mmask8) -1);
}

static __inline __m512 __attribute__ ((__always_inline__, __nodebug__))
_mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B)
{
  return (__m512) __builtin_ia32_vpermt2varps512_mask ((__v16si) __I /* idx */,
                    (__v16sf) __A, (__v16sf) __B, (__mmask16) -1);
}

static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_valign_epi64(__m512i __A, __m512i __B, const int __I)
{
  return (__m512i) __builtin_ia32_alignq512_mask((__v8di)__A, (__v8di)__B, __I,
                     (__v8di)_mm512_setzero_si512(), (__mmask8) -1);
}

static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_valign_epi32(__m512i __A, __m512i __B, const int __I)
{
  return (__m512i)__builtin_ia32_alignd512_mask((__v16si)__A, (__v16si)__B, __I,
                    (__v16si)_mm512_setzero_si512(), (__mmask16) -1);
}

static __inline __m512d __attribute__ ((__always_inline__, __nodebug__))
_mm512_mask_blend_pd(__mmask8 __U, __m512d __A, __m512d __W)
{
  return (__m512d) __builtin_ia32_blendmpd_512_mask ((__v8df) __A, (__v8df) __W,
                     (__mmask8) __U);
}

static __inline __m512 __attribute__ ((__always_inline__, __nodebug__))
_mm512_mask_blend_ps(__mmask16 __U, __m512 __A, __m512 __W)
{
  return (__m512) __builtin_ia32_blendmps_512_mask ((__v16sf) __A, (__v16sf) __W,
                    (__mmask16) __U);
}

static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_mask_blend_epi64(__mmask8 __U, __m512i __A, __m512i __W)
{
  return (__m512i) __builtin_ia32_blendmq_512_mask ((__v8di) __A, (__v8di) __W,
                     (__mmask8) __U);
}

static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W)
{
  return (__m512i) __builtin_ia32_blendmd_512_mask ((__v16si) __A, (__v16si) __W,
                     (__mmask16) __U);
}

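/* Added commentary (not part of the original header): the blend helpers select
 * per lane between the two vector operands; lanes whose mask bit is set take
 * the second vector argument, the rest keep the first.  Hypothetical sketch:
 *
 *   __m512d __r = _mm512_mask_blend_pd((__mmask8)0x0F, __a, __w);
 *   // low four lanes from __w, high four from __a (assumed semantics)
 */
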
static __inline __mmask16 __attribute__ ((__always_inline__, __nodebug__))
_mm512_cmp_ps_mask(__m512 a, __m512 b, const int p)
{
  return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) a, (__v16sf) b, p,
                       (__mmask16) -1, _MM_FROUND_CUR_DIRECTION);
}

static __inline __mmask8 __attribute__ ((__always_inline__, __nodebug__))
_mm512_cmp_pd_mask(__m512d __X, __m512d __Y, const int __P)
{
  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, (__v8df) __Y,
                      __P, (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
}

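/* Illustrative sketch (not part of the original header): the floating-point
 * compares return a bit mask, one bit per lane.  Using the _CMP_* predicates
 * that immintrin.h already provides for AVX, a caller might write:
 *
 *   __mmask16 __lt = _mm512_cmp_ps_mask(__a, __b, _CMP_LT_OS);
 *
 * where __a and __b are hypothetical caller-provided __m512 values.
 */
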
static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_cvttps_epu32(__m512 __A)
{
  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
                     (__v16si) _mm512_setzero_si512 (), (__mmask16) -1,
                     _MM_FROUND_CUR_DIRECTION);
}

static __inline __m512 __attribute__ (( __always_inline__, __nodebug__))
_mm512_cvt_roundepi32_ps(__m512i __A, const int __R)
{
  return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A,
                    (__v16sf) _mm512_setzero_ps (), (__mmask16) -1, __R);
}

static __inline __m512 __attribute__ (( __always_inline__, __nodebug__))
_mm512_cvt_roundepu32_ps(__m512i __A, const int __R)
{
  return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
                    (__v16sf) _mm512_setzero_ps (), (__mmask16) -1, __R);
}

static __inline __m512d __attribute__ (( __always_inline__, __nodebug__))
_mm512_cvtepi32_pd(__m256i __A)
{
  return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A,
                     (__v8df) _mm512_setzero_pd (), (__mmask8) -1);
}

static __inline __m512d __attribute__ (( __always_inline__, __nodebug__))
_mm512_cvtepu32_pd(__m256i __A)
{
  return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A,
                     (__v8df) _mm512_setzero_pd (), (__mmask8) -1);
}

static __inline __m256 __attribute__ (( __always_inline__, __nodebug__))
_mm512_cvt_roundpd_ps(__m512d __A, const int __R)
{
  return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
                    (__v8sf) _mm256_setzero_ps (), (__mmask8) -1, __R);
}

static __inline __m256i __attribute__ ((__always_inline__, __nodebug__))
_mm512_cvtps_ph(__m512 __A, const int __I)
{
  return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A, __I,
                     (__v16hi) _mm256_setzero_si256 (), -1);
}

static __inline __m512 __attribute__ ((__always_inline__, __nodebug__))
_mm512_cvtph_ps(__m256i __A)
{
  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
                    (__v16sf) _mm512_setzero_ps (), (__mmask16) -1,
                    _MM_FROUND_CUR_DIRECTION);
}

static __inline __m512i __attribute__((__always_inline__, __nodebug__))
_mm512_cvttps_epi32(__m512 a)
{
  return (__m512i)
    __builtin_ia32_cvttps2dq512_mask((__v16sf) a,
                                     (__v16si) _mm512_setzero_si512 (),
                                     (__mmask16) -1, _MM_FROUND_CUR_DIRECTION);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm512_cvttpd_epi32(__m512d a)
{
  return (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df) a,
                    (__v8si)_mm256_setzero_si256(), (__mmask8) -1,
                    _MM_FROUND_CUR_DIRECTION);
}

static __inline __m256i __attribute__ ((__always_inline__, __nodebug__))
_mm512_cvtt_roundpd_epi32(__m512d __A, const int __R)
{
  return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
                     (__v8si) _mm256_setzero_si256 (), (__mmask8) -1, __R);
}

static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_cvtt_roundps_epi32(__m512 __A, const int __R)
{
  return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
                     (__v16si) _mm512_setzero_si512 (), (__mmask16) -1, __R);
}

static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_cvt_roundps_epi32(__m512 __A, const int __R)
{
  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
                     (__v16si) _mm512_setzero_si512 (), (__mmask16) -1, __R);
}

static __inline __m256i __attribute__ ((__always_inline__, __nodebug__))
_mm512_cvt_roundpd_epi32(__m512d __A, const int __R)
{
  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
                     (__v8si) _mm256_setzero_si256 (), (__mmask8) -1, __R);
}

static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_cvt_roundps_epu32(__m512 __A, const int __R)
{
  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
                     (__v16si) _mm512_setzero_si512 (), (__mmask16) -1, __R);
}

static __inline __m256i __attribute__ ((__always_inline__, __nodebug__))
_mm512_cvt_roundpd_epu32(__m512d __A, const int __R)
{
  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
                     (__v8si) _mm256_setzero_si256 (), (__mmask8) -1, __R);
}

/* Unpack and Interleave */

static __inline __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_unpackhi_pd(__m512d __a, __m512d __b)
{
  return __builtin_shufflevector(__a, __b, 1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6);
}

static __inline __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_unpacklo_pd(__m512d __a, __m512d __b)
{
  return __builtin_shufflevector(__a, __b, 0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6);
}

static __inline __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_unpackhi_ps(__m512 __a, __m512 __b)
{
  return __builtin_shufflevector(__a, __b,
                                 2,    18,    3,    19,
                                 2+4,  18+4,  3+4,  19+4,
                                 2+8,  18+8,  3+8,  19+8,
                                 2+12, 18+12, 3+12, 19+12);
}

static __inline __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_unpacklo_ps(__m512 __a, __m512 __b)
{
  return __builtin_shufflevector(__a, __b,
                                 0,    16,    1,    17,
                                 0+4,  16+4,  1+4,  17+4,
                                 0+8,  16+8,  1+8,  17+8,
                                 0+12, 16+12, 1+12, 17+12);
}

static __inline __mmask16 __attribute__ ((__always_inline__, __nodebug__))
_mm512_test_epi32_mask(__m512i __A, __m512i __B)
{
  return (__mmask16) __builtin_ia32_ptestmd512 ((__v16si) __A, (__v16si) __B,
                       (__mmask16) -1);
}

static __inline __mmask8 __attribute__ ((__always_inline__, __nodebug__))
_mm512_test_epi64_mask(__m512i __A, __m512i __B)
{
  return (__mmask8) __builtin_ia32_ptestmq512 ((__v8di) __A, (__v8di) __B,
                      (__mmask8) -1);
}

static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_maskz_loadu_epi32(__mmask16 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_loaddqusi512_mask ((const __v16si *)__P,
                     (__v16si) _mm512_setzero_si512 (), (__mmask16) __U);
}

static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
_mm512_maskz_loadu_epi64(__mmask8 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_loaddqudi512_mask ((const __v8di *)__P,
                     (__v8di) _mm512_setzero_si512 (), (__mmask8) __U);
}

static __inline __m512 __attribute__ ((__always_inline__, __nodebug__))
_mm512_maskz_loadu_ps(__mmask16 __U, void const *__P)
{
  return (__m512) __builtin_ia32_loadups512_mask ((const __v16sf *)__P,
                    (__v16sf) _mm512_setzero_ps (), (__mmask16) __U);
}

static __inline __m512d __attribute__ ((__always_inline__, __nodebug__))
_mm512_maskz_loadu_pd(__mmask8 __U, void const *__P)
{
  return (__m512d) __builtin_ia32_loadupd512_mask ((const __v8df *)__P,
                     (__v8df) _mm512_setzero_pd (), (__mmask8) __U);
}

static __inline __m512d __attribute__((__always_inline__, __nodebug__))
_mm512_loadu_pd(double const *__p)
{
  struct __loadu_pd {
    __m512d __v;
  } __attribute__((packed, may_alias));
  return ((struct __loadu_pd *)__p)->__v;
}

static __inline __m512 __attribute__((__always_inline__, __nodebug__))
_mm512_loadu_ps(float const *__p)
{
  struct __loadu_ps {
    __m512 __v;
  } __attribute__((packed, may_alias));
  return ((struct __loadu_ps *)__p)->__v;
}

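/* Added commentary (not part of the original header): the packed, may_alias
 * wrapper structs above let _mm512_loadu_ps/_mm512_loadu_pd read through
 * pointers with no 64-byte alignment guarantee.  Hypothetical usage sketch:
 *
 *   float __buf[32];                           // filled elsewhere, any alignment
 *   __m512 __v = _mm512_loadu_ps(__buf + 1);   // safe: no alignment assumed
 */
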
static __inline void __attribute__ ((__always_inline__, __nodebug__))
_mm512_mask_storeu_epi64(void *__P, __mmask8 __U, __m512i __A)
{
  __builtin_ia32_storedqudi512_mask ((__v8di *)__P, (__v8di) __A, (__mmask8) __U);
}

static __inline void __attribute__ ((__always_inline__, __nodebug__))
_mm512_mask_storeu_epi32(void *__P, __mmask16 __U, __m512i __A)
{
  __builtin_ia32_storedqusi512_mask ((__v16si *)__P, (__v16si) __A,
                                     (__mmask16) __U);
}

static __inline void __attribute__ ((__always_inline__, __nodebug__))
_mm512_mask_storeu_pd(void *__P, __mmask8 __U, __m512d __A)
{
  __builtin_ia32_storeupd512_mask ((__v8df *)__P, (__v8df) __A, (__mmask8) __U);
}

static __inline void __attribute__ ((__always_inline__, __nodebug__))
_mm512_storeu_pd(void *__P, __m512d __A)
{
  __builtin_ia32_storeupd512_mask((__v8df *)__P, (__v8df)__A, (__mmask8)-1);
}

static __inline void __attribute__ ((__always_inline__, __nodebug__))
_mm512_mask_storeu_ps(void *__P, __mmask16 __U, __m512 __A)
{
  __builtin_ia32_storeups512_mask ((__v16sf *)__P, (__v16sf) __A,
                                   (__mmask16) __U);
}

static __inline void __attribute__ ((__always_inline__, __nodebug__))
_mm512_storeu_ps(void *__P, __m512 __A)
{
  __builtin_ia32_storeups512_mask((__v16sf *)__P, (__v16sf)__A, (__mmask16)-1);
}

static __inline void __attribute__ ((__always_inline__, __nodebug__))
_mm512_store_ps(void *__P, __m512 __A)
{
  *(__m512 *)__P = __A;
}

static __inline void __attribute__ ((__always_inline__, __nodebug__))
_mm512_store_pd(void *__P, __m512d __A)
{
  *(__m512d *)__P = __A;
}

static __inline __mmask16 __attribute__ ((__always_inline__, __nodebug__))
_mm512_knot(__mmask16 __M)
{
  return __builtin_ia32_knothi(__M);
}

/* Integer compare */

static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
_mm512_cmpeq_epi32_mask(__m512i __a, __m512i __b) {
  return (__mmask16)__builtin_ia32_pcmpeqd512_mask((__v16si)__a, (__v16si)__b,
                                                   (__mmask16)-1);
}

static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
_mm512_mask_cmpeq_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
  return (__mmask16)__builtin_ia32_pcmpeqd512_mask((__v16si)__a, (__v16si)__b,
                                                   __u);
}

static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
_mm512_mask_cmpeq_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
  return (__mmask8)__builtin_ia32_pcmpeqq512_mask((__v8di)__a, (__v8di)__b,
                                                  __u);
}

static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
_mm512_cmpeq_epi64_mask(__m512i __a, __m512i __b) {
  return (__mmask8)__builtin_ia32_pcmpeqq512_mask((__v8di)__a, (__v8di)__b,
                                                  (__mmask8)-1);
}

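/* Illustrative sketch (not part of the original header): combining an integer
 * compare with a masked store writes back only the lanes that matched.
 *
 *   __mmask16 __eq = _mm512_cmpeq_epi32_mask(__a, __b);
 *   _mm512_mask_storeu_epi32(__dst, __eq, __a);   // store matching lanes only
 *
 * where __a, __b and __dst are hypothetical caller-provided values.
 */
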
#endif // __AVX512FINTRIN_H