/*===---- avx10_2niintrin.h - AVX10.2 new instruction intrinsics -----------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <avx10_2niintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifdef __SSE2__

#ifndef __AVX10_2NIINTRIN_H
#define __AVX10_2NIINTRIN_H
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \
                 __min_vector_width__(256)))
/* VNNI FP16 */
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_dpph_ps(__m128 __W,
                                                            __m128h __A,
                                                            __m128h __B) {
  return (__m128)__builtin_ia32_vdpphps128((__v4sf)__W, (__v8hf)__A,
                                           (__v8hf)__B);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_dpph_ps(__m128 __W,
                                                                 __mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128)__builtin_ia32_selectps_128(
      (__mmask8)__U, (__v4sf)_mm_dpph_ps(__W, __A, __B), (__v4sf)__W);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_dpph_ps(__mmask8 __U,
                                                                  __m128 __W,
                                                                  __m128h __A,
                                                                  __m128h __B) {
  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
                                             (__v4sf)_mm_dpph_ps(__W, __A, __B),
                                             (__v4sf)_mm_setzero_ps());
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_dpph_ps(__m256 __W,
                                                              __m256h __A,
                                                              __m256h __B) {
  return (__m256)__builtin_ia32_vdpphps256((__v8sf)__W, (__v16hf)__A,
                                           (__v16hf)__B);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_mask_dpph_ps(__m256 __W, __mmask8 __U, __m256h __A, __m256h __B) {
  return (__m256)__builtin_ia32_selectps_256(
      (__mmask8)__U, (__v8sf)_mm256_dpph_ps(__W, __A, __B), (__v8sf)__W);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_maskz_dpph_ps(__mmask8 __U, __m256 __W, __m256h __A, __m256h __B) {
  return (__m256)__builtin_ia32_selectps_256(
      (__mmask8)__U, (__v8sf)_mm256_dpph_ps(__W, __A, __B),
      (__v8sf)_mm256_setzero_ps());
}
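/* Illustrative usage sketch (added for clarity, not part of the original
 * header): each 32-bit lane of the accumulator receives the sum of two FP16
 * products; the _mask form keeps the old accumulator in lanes whose mask bit
 * is clear. Assumes an AVX10.2-256 enabled target; the values are arbitrary.
 *
 *   __m128  acc = _mm_setzero_ps();
 *   __m128h a   = _mm_set1_ph((_Float16)1.0f);
 *   __m128h b   = _mm_set1_ph((_Float16)2.0f);
 *   // Lanes 0 and 1 become 2 * (1.0 * 2.0) = 4.0; lanes 2 and 3 keep 0.0.
 *   acc = _mm_mask_dpph_ps(acc, (__mmask8)0x3, a, b);
 */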
/* VMPSADBW */
#define _mm_mask_mpsadbw_epu8(W, U, A, B, imm) \
  ((__m128i)__builtin_ia32_selectw_128( \
      (__mmask8)(U), (__v8hi)_mm_mpsadbw_epu8((A), (B), (imm)), \
      (__v8hi)(__m128i)(W)))

#define _mm_maskz_mpsadbw_epu8(U, A, B, imm) \
  ((__m128i)__builtin_ia32_selectw_128( \
      (__mmask8)(U), (__v8hi)_mm_mpsadbw_epu8((A), (B), (imm)), \
      (__v8hi)_mm_setzero_si128()))

#define _mm256_mask_mpsadbw_epu8(W, U, A, B, imm) \
  ((__m256i)__builtin_ia32_selectw_256( \
      (__mmask16)(U), (__v16hi)_mm256_mpsadbw_epu8((A), (B), (imm)), \
      (__v16hi)(__m256i)(W)))

#define _mm256_maskz_mpsadbw_epu8(U, A, B, imm) \
  ((__m256i)__builtin_ia32_selectw_256( \
      (__mmask16)(U), (__v16hi)_mm256_mpsadbw_epu8((A), (B), (imm)), \
      (__v16hi)_mm256_setzero_si256()))
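/* Illustrative usage sketch (added for clarity, not part of the original
 * header): compute the eight 16-bit sums of absolute differences selected by
 * the immediate, then zero the result lanes whose mask bit is clear. The
 * immediate must be a compile-time constant; the data values are arbitrary.
 *
 *   __m128i src  = _mm_set1_epi8(7);
 *   __m128i blk  = _mm_set1_epi8(3);
 *   __m128i sads = _mm_maskz_mpsadbw_epu8((__mmask8)0x0F, src, blk, 0);
 *   // With these inputs every computed SAD is 4 * |7 - 3| = 16;
 *   // lanes 4..7 are zeroed by the mask.
 */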
/* VNNI INT8 */
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbssd_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbssd_epi32(__W, __A, __B), (__v4si)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbssd_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbssd_epi32(__W, __A, __B),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbssd_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbssd_epi32(__W, __A, __B), (__v8si)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpbssd_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbssd_epi32(__W, __A, __B),
      (__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbssds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbssds_epi32(__W, __A, __B), (__v4si)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbssds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbssds_epi32(__W, __A, __B),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbssds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbssds_epi32(__W, __A, __B), (__v8si)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpbssds_epi32(__mmask8 __U, __m256i __W, __m256i __A,
                           __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbssds_epi32(__W, __A, __B),
      (__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbsud_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbsud_epi32(__W, __A, __B), (__v4si)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbsud_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbsud_epi32(__W, __A, __B),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbsud_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbsud_epi32(__W, __A, __B), (__v8si)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpbsud_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbsud_epi32(__W, __A, __B),
      (__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbsuds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbsuds_epi32(__W, __A, __B), (__v4si)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbsuds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbsuds_epi32(__W, __A, __B),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbsuds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbsuds_epi32(__W, __A, __B), (__v8si)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpbsuds_epi32(__mmask8 __U, __m256i __W, __m256i __A,
                           __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbsuds_epi32(__W, __A, __B),
      (__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbuud_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbuud_epi32(__W, __A, __B), (__v4si)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbuud_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbuud_epi32(__W, __A, __B),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbuud_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbuud_epi32(__W, __A, __B), (__v8si)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpbuud_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbuud_epi32(__W, __A, __B),
      (__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbuuds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbuuds_epi32(__W, __A, __B), (__v4si)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbuuds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbuuds_epi32(__W, __A, __B),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbuuds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbuuds_epi32(__W, __A, __B), (__v8si)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpbuuds_epi32(__mmask8 __U, __m256i __W, __m256i __A,
                           __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbuuds_epi32(__W, __A, __B),
      (__v8si)_mm256_setzero_si256());
}
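/* Illustrative usage sketch (added for clarity, not part of the original
 * header): a masked signed-by-signed byte dot product. Each 32-bit lane of the
 * accumulator receives the sum of four int8*int8 products; lanes whose mask
 * bit is clear are zeroed by the _maskz form. The values are arbitrary.
 *
 *   __m128i acc = _mm_setzero_si128();
 *   __m128i a   = _mm_set1_epi8(-2);
 *   __m128i b   = _mm_set1_epi8(5);
 *   acc = _mm_maskz_dpbssd_epi32((__mmask8)0xF, acc, a, b);
 *   // Each selected lane now holds 4 * (-2 * 5) = -40.
 */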
/* VNNI INT16 */
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwsud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwsud_epi32(__A, __B, __C), (__v4si)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpwsud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwsud_epi32(__A, __B, __C),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwsud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwsud_epi32(__A, __B, __C), (__v8si)__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpwsud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwsud_epi32(__A, __B, __C),
      (__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwsuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwsuds_epi32(__A, __B, __C), (__v4si)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpwsuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwsuds_epi32(__A, __B, __C),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwsuds_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwsuds_epi32(__A, __B, __C), (__v8si)__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpwsuds_epi32(__m256i __A, __mmask8 __U, __m256i __B,
                           __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwsuds_epi32(__A, __B, __C),
      (__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwusd_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwusd_epi32(__A, __B, __C), (__v4si)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpwusd_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwusd_epi32(__A, __B, __C),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwusd_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwusd_epi32(__A, __B, __C), (__v8si)__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpwusd_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwusd_epi32(__A, __B, __C),
      (__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwusds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwusds_epi32(__A, __B, __C), (__v4si)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpwusds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwusds_epi32(__A, __B, __C),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwusds_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwusds_epi32(__A, __B, __C), (__v8si)__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpwusds_epi32(__m256i __A, __mmask8 __U, __m256i __B,
                           __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwusds_epi32(__A, __B, __C),
      (__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwuud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwuud_epi32(__A, __B, __C), (__v4si)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpwuud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwuud_epi32(__A, __B, __C),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwuud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwuud_epi32(__A, __B, __C), (__v8si)__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpwuud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwuud_epi32(__A, __B, __C),
      (__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwuuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwuuds_epi32(__A, __B, __C), (__v4si)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpwuuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwuuds_epi32(__A, __B, __C),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwuuds_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwuuds_epi32(__A, __B, __C), (__v8si)__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpwuuds_epi32(__m256i __A, __mmask8 __U, __m256i __B,
                           __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwuuds_epi32(__A, __B, __C),
      (__v8si)_mm256_setzero_si256());
}
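/* Illustrative usage sketch (added for clarity, not part of the original
 * header): a masked unsigned-by-unsigned word dot product. Each 32-bit lane
 * accumulates two uint16*uint16 products; unselected lanes keep the original
 * accumulator value with the _mask form. The values are arbitrary.
 *
 *   __m256i acc = _mm256_set1_epi32(100);
 *   __m256i a   = _mm256_set1_epi16(3);
 *   __m256i b   = _mm256_set1_epi16(4);
 *   acc = _mm256_mask_dpwuud_epi32(acc, (__mmask8)0xAA, a, b);
 *   // Selected lanes hold 100 + 2 * (3 * 4) = 124; the rest stay 100.
 */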
/* YMM Rounding */
#define _mm256_add_round_pd(A, B, R) \
  ((__m256d)__builtin_ia32_vaddpd256_round((__v4df)(__m256d)(A), \
                                           (__v4df)(__m256d)(B), (int)(R)))

#define _mm256_mask_add_round_pd(W, U, A, B, R) \
  ((__m256d)__builtin_ia32_selectpd_256( \
      (__mmask8)(U), (__v4df)_mm256_add_round_pd((A), (B), (R)), \
      (__v4df)(__m256d)(W)))

#define _mm256_maskz_add_round_pd(U, A, B, R) \
  ((__m256d)__builtin_ia32_selectpd_256( \
      (__mmask8)(U), (__v4df)_mm256_add_round_pd((A), (B), (R)), \
      (__v4df)_mm256_setzero_pd()))

#define _mm256_add_round_ph(A, B, R) \
  ((__m256h)__builtin_ia32_vaddph256_round((__v16hf)(__m256h)(A), \
                                           (__v16hf)(__m256h)(B), (int)(R)))

#define _mm256_mask_add_round_ph(W, U, A, B, R) \
  ((__m256h)__builtin_ia32_selectph_256( \
      (__mmask16)(U), (__v16hf)_mm256_add_round_ph((A), (B), (R)), \
      (__v16hf)(__m256h)(W)))

#define _mm256_maskz_add_round_ph(U, A, B, R) \
  ((__m256h)__builtin_ia32_selectph_256( \
      (__mmask16)(U), (__v16hf)_mm256_add_round_ph((A), (B), (R)), \
      (__v16hf)_mm256_setzero_ph()))

#define _mm256_add_round_ps(A, B, R) \
  ((__m256)__builtin_ia32_vaddps256_round((__v8sf)(__m256)(A), \
                                          (__v8sf)(__m256)(B), (int)(R)))

#define _mm256_mask_add_round_ps(W, U, A, B, R) \
  ((__m256)__builtin_ia32_selectps_256( \
      (__mmask8)(U), (__v8sf)_mm256_add_round_ps((A), (B), (R)), \
      (__v8sf)(__m256)(W)))

#define _mm256_maskz_add_round_ps(U, A, B, R) \
  ((__m256)__builtin_ia32_selectps_256( \
      (__mmask8)(U), (__v8sf)_mm256_add_round_ps((A), (B), (R)), \
      (__v8sf)_mm256_setzero_ps()))
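/* Illustrative usage sketch (added for clarity, not part of the original
 * header): the R argument selects an explicit rounding mode and must be a
 * compile-time constant, typically one of the _MM_FROUND_* values combined
 * with _MM_FROUND_NO_EXC.
 *
 *   __m256d x = _mm256_set1_pd(1.0);
 *   __m256d y = _mm256_set1_pd(3.0);
 *   __m256d q = _mm256_add_round_pd(
 *       x, y, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 */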
#define _mm256_cmp_round_pd_mask(A, B, P, R) \
  ((__mmask8)__builtin_ia32_vcmppd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(P), (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_cmp_round_pd_mask(U, A, B, P, R) \
  ((__mmask8)__builtin_ia32_vcmppd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(P), (__mmask8)(U), \
      (int)(R)))

#define _mm256_cmp_round_ph_mask(A, B, P, R) \
  ((__mmask16)__builtin_ia32_vcmpph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(P), (__mmask16)-1, \
      (int)(R)))

#define _mm256_mask_cmp_round_ph_mask(U, A, B, P, R) \
  ((__mmask16)__builtin_ia32_vcmpph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(P), (__mmask16)(U), \
      (int)(R)))

#define _mm256_cmp_round_ps_mask(A, B, P, R) \
  ((__mmask8)__builtin_ia32_vcmpps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(P), (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_cmp_round_ps_mask(U, A, B, P, R) \
  ((__mmask8)__builtin_ia32_vcmpps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(P), (__mmask8)(U), \
      (int)(R)))
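/* Illustrative usage sketch (added for clarity, not part of the original
 * header): compare packed doubles for "less than, ordered" with exceptions
 * suppressed; the result is a bitmask with one bit per 64-bit lane.
 *
 *   __m256d a = _mm256_set1_pd(1.0);
 *   __m256d b = _mm256_set1_pd(2.0);
 *   __mmask8 lt =
 *       _mm256_cmp_round_pd_mask(a, b, _CMP_LT_OS, _MM_FROUND_NO_EXC);
 *   // lt == 0x0F: all four lanes satisfy a < b.
 */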
#define _mm256_cvt_roundepi32_ph(A, R) \
  ((__m128h)__builtin_ia32_vcvtdq2ph256_round_mask( \
      (__v8si)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvt_roundepi32_ph(W, U, A, R) \
  ((__m128h)__builtin_ia32_vcvtdq2ph256_round_mask((__v8si)(A), (__v8hf)(W), \
                                                   (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundepi32_ph(U, A, R) \
  ((__m128h)__builtin_ia32_vcvtdq2ph256_round_mask( \
      (__v8si)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

#define _mm256_cvt_roundepi32_ps(A, R) \
  ((__m256)__builtin_ia32_vcvtdq2ps256_round_mask((__v8si)(__m256i)(A), \
                                                  (__v8sf)_mm256_setzero_ps(), \
                                                  (__mmask8)-1, (int)(R)))

#define _mm256_mask_cvt_roundepi32_ps(W, U, A, R) \
  ((__m256)__builtin_ia32_vcvtdq2ps256_round_mask( \
      (__v8si)(__m256i)(A), (__v8sf)(__m256)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundepi32_ps(U, A, R) \
  ((__m256)__builtin_ia32_vcvtdq2ps256_round_mask((__v8si)(__m256i)(A), \
                                                  (__v8sf)_mm256_setzero_ps(), \
                                                  (__mmask8)(U), (int)(R)))

#define _mm256_cvt_roundpd_epi32(A, R) \
  ((__m128i)__builtin_ia32_vcvtpd2dq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4si)_mm_setzero_si128(), (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_cvt_roundpd_epi32(W, U, A, R) \
  ((__m128i)__builtin_ia32_vcvtpd2dq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4si)(__m128i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundpd_epi32(U, A, R) \
  ((__m128i)__builtin_ia32_vcvtpd2dq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4si)_mm_setzero_si128(), (__mmask8)(U), \
      (int)(R)))

#define _mm256_cvt_roundpd_ph(A, R) \
  ((__m128h)__builtin_ia32_vcvtpd2ph256_round_mask( \
      (__v4df)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvt_roundpd_ph(W, U, A, R) \
  ((__m128h)__builtin_ia32_vcvtpd2ph256_round_mask((__v4df)(A), (__v8hf)(W), \
                                                   (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundpd_ph(U, A, R) \
  ((__m128h)__builtin_ia32_vcvtpd2ph256_round_mask( \
      (__v4df)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

#define _mm256_cvt_roundpd_ps(A, R) \
  ((__m128)__builtin_ia32_vcvtpd2ps256_round_mask( \
      (__v4df)(__m256d)(A), (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (int)(R)))

#define _mm256_mask_cvt_roundpd_ps(W, U, A, R) \
  ((__m128)__builtin_ia32_vcvtpd2ps256_round_mask( \
      (__v4df)(__m256d)(A), (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundpd_ps(U, A, R) \
  ((__m128)__builtin_ia32_vcvtpd2ps256_round_mask((__v4df)(__m256d)(A), \
                                                  (__v4sf)_mm_setzero_ps(), \
                                                  (__mmask8)(U), (int)(R)))
#define _mm256_cvt_roundpd_epi64(A, R) \
  ((__m256i)__builtin_ia32_vcvtpd2qq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_cvt_roundpd_epi64(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvtpd2qq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4di)(__m256i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundpd_epi64(U, A, R) \
  ((__m256i)__builtin_ia32_vcvtpd2qq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U), \
      (int)(R)))

#define _mm256_cvt_roundpd_epu32(A, R) \
  ((__m128i)__builtin_ia32_vcvtpd2udq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4su)_mm_setzero_si128(), (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_cvt_roundpd_epu32(W, U, A, R) \
  ((__m128i)__builtin_ia32_vcvtpd2udq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4su)(__m128i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundpd_epu32(U, A, R) \
  ((__m128i)__builtin_ia32_vcvtpd2udq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4su)_mm_setzero_si128(), (__mmask8)(U), \
      (int)(R)))

#define _mm256_cvt_roundpd_epu64(A, R) \
  ((__m256i)__builtin_ia32_vcvtpd2uqq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_cvt_roundpd_epu64(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvtpd2uqq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4du)(__m256i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundpd_epu64(U, A, R) \
  ((__m256i)__builtin_ia32_vcvtpd2uqq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U), \
      (int)(R)))

#define _mm256_cvt_roundph_epi32(A, R) \
  ((__m256i)__builtin_ia32_vcvtph2dq256_round_mask( \
      (__v8hf)(A), (__v8si)_mm256_undefined_si256(), (__mmask8)(-1), \
      (int)(R)))

#define _mm256_mask_cvt_roundph_epi32(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvtph2dq256_round_mask((__v8hf)(A), (__v8si)(W), \
                                                   (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundph_epi32(U, A, R) \
  ((__m256i)__builtin_ia32_vcvtph2dq256_round_mask( \
      (__v8hf)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))

#define _mm256_cvt_roundph_pd(A, R) \
  ((__m256d)__builtin_ia32_vcvtph2pd256_round_mask( \
      (__v8hf)(A), (__v4df)_mm256_undefined_pd(), (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvt_roundph_pd(W, U, A, R) \
  ((__m256d)__builtin_ia32_vcvtph2pd256_round_mask((__v8hf)(A), (__v4df)(W), \
                                                   (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundph_pd(U, A, R) \
  ((__m256d)__builtin_ia32_vcvtph2pd256_round_mask( \
      (__v8hf)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)(U), (int)(R)))

#define _mm256_cvtx_roundph_ps(A, R) \
  ((__m256)__builtin_ia32_vcvtph2psx256_round_mask( \
      (__v8hf)(A), (__v8sf)_mm256_undefined_ps(), (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvtx_roundph_ps(W, U, A, R) \
  ((__m256)__builtin_ia32_vcvtph2psx256_round_mask((__v8hf)(A), (__v8sf)(W), \
                                                   (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtx_roundph_ps(U, A, R) \
  ((__m256)__builtin_ia32_vcvtph2psx256_round_mask( \
      (__v8hf)(A), (__v8sf)_mm256_setzero_ps(), (__mmask8)(U), (int)(R)))
#define _mm256_cvt_roundph_epi64(A, R) \
  ((__m256i)__builtin_ia32_vcvtph2qq256_round_mask( \
      (__v8hf)(A), (__v4di)_mm256_undefined_si256(), (__mmask8)(-1), \
      (int)(R)))

#define _mm256_mask_cvt_roundph_epi64(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvtph2qq256_round_mask((__v8hf)(A), (__v4di)(W), \
                                                   (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundph_epi64(U, A, R) \
  ((__m256i)__builtin_ia32_vcvtph2qq256_round_mask( \
      (__v8hf)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))

#define _mm256_cvt_roundph_epu32(A, R) \
  ((__m256i)__builtin_ia32_vcvtph2udq256_round_mask( \
      (__v8hf)(A), (__v8su)_mm256_undefined_si256(), (__mmask8)(-1), \
      (int)(R)))

#define _mm256_mask_cvt_roundph_epu32(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvtph2udq256_round_mask((__v8hf)(A), (__v8su)(W), \
                                                    (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundph_epu32(U, A, R) \
  ((__m256i)__builtin_ia32_vcvtph2udq256_round_mask( \
      (__v8hf)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))

#define _mm256_cvt_roundph_epu64(A, R) \
  ((__m256i)__builtin_ia32_vcvtph2uqq256_round_mask( \
      (__v8hf)(A), (__v4du)_mm256_undefined_si256(), (__mmask8)(-1), \
      (int)(R)))

#define _mm256_mask_cvt_roundph_epu64(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvtph2uqq256_round_mask((__v8hf)(A), (__v4du)(W), \
                                                    (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundph_epu64(U, A, R) \
  ((__m256i)__builtin_ia32_vcvtph2uqq256_round_mask( \
      (__v8hf)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))

#define _mm256_cvt_roundph_epu16(A, R) \
  ((__m256i)__builtin_ia32_vcvtph2uw256_round_mask( \
      (__v16hf)(A), (__v16hu)_mm256_undefined_si256(), (__mmask16)(-1), \
      (int)(R)))

#define _mm256_mask_cvt_roundph_epu16(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvtph2uw256_round_mask((__v16hf)(A), (__v16hu)(W), \
                                                   (__mmask16)(U), (int)(R)))

#define _mm256_maskz_cvt_roundph_epu16(U, A, R) \
  ((__m256i)__builtin_ia32_vcvtph2uw256_round_mask( \
      (__v16hf)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)(U), \
      (int)(R)))

#define _mm256_cvt_roundph_epi16(A, R) \
  ((__m256i)__builtin_ia32_vcvtph2w256_round_mask( \
      (__v16hf)(A), (__v16hi)_mm256_undefined_si256(), (__mmask16)(-1), \
      (int)(R)))

#define _mm256_mask_cvt_roundph_epi16(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvtph2w256_round_mask((__v16hf)(A), (__v16hi)(W), \
                                                  (__mmask16)(U), (int)(R)))

#define _mm256_maskz_cvt_roundph_epi16(U, A, R) \
  ((__m256i)__builtin_ia32_vcvtph2w256_round_mask( \
      (__v16hf)(A), (__v16hi)_mm256_setzero_si256(), (__mmask16)(U), \
      (int)(R)))

#define _mm256_cvt_roundps_epi32(A, R) \
  ((__m256i)__builtin_ia32_vcvtps2dq256_round_mask( \
      (__v8sf)(__m256)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_cvt_roundps_epi32(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvtps2dq256_round_mask( \
      (__v8sf)(__m256)(A), (__v8si)(__m256i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundps_epi32(U, A, R) \
  ((__m256i)__builtin_ia32_vcvtps2dq256_round_mask( \
      (__v8sf)(__m256)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)(U), \
      (int)(R)))

#define _mm256_cvt_roundps_pd(A, R) \
  ((__m256d)__builtin_ia32_vcvtps2pd256_round_mask( \
      (__v4sf)(__m128)(A), (__v4df)_mm256_undefined_pd(), (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_cvt_roundps_pd(W, U, A, R) \
  ((__m256d)__builtin_ia32_vcvtps2pd256_round_mask( \
      (__v4sf)(__m128)(A), (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundps_pd(U, A, R) \
  ((__m256d)__builtin_ia32_vcvtps2pd256_round_mask( \
      (__v4sf)(__m128)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)(U), \
      (int)(R)))

#define _mm256_cvt_roundps_ph(A, I) \
  ((__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)(__m256)(A), (int)(I), \
                                             (__v8hi)_mm_undefined_si128(), \
                                             (__mmask8)-1))

/* FIXME: We may use these in the future.
#define _mm256_cvt_roundps_ph(A, I) \
  ((__m128i)__builtin_ia32_vcvtps2ph256_round_mask( \
      (__v8sf)(__m256)(A), (int)(I), (__v8hi)_mm_undefined_si128(), \
      (__mmask8)-1))
#define _mm256_mask_cvt_roundps_ph(U, W, A, I) \
  ((__m128i)__builtin_ia32_vcvtps2ph256_round_mask( \
      (__v8sf)(__m256)(A), (int)(I), (__v8hi)(__m128i)(U), (__mmask8)(W)))
#define _mm256_maskz_cvt_roundps_ph(W, A, I) \
  ((__m128i)__builtin_ia32_vcvtps2ph256_round_mask( \
      (__v8sf)(__m256)(A), (int)(I), (__v8hi)_mm_setzero_si128(), \
      (__mmask8)(W))) */
#define _mm256_cvtx_roundps_ph(A, R) \
  ((__m128h)__builtin_ia32_vcvtps2phx256_round_mask( \
      (__v8sf)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvtx_roundps_ph(W, U, A, R) \
  ((__m128h)__builtin_ia32_vcvtps2phx256_round_mask((__v8sf)(A), (__v8hf)(W), \
                                                    (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtx_roundps_ph(U, A, R) \
  ((__m128h)__builtin_ia32_vcvtps2phx256_round_mask( \
      (__v8sf)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

#define _mm256_cvt_roundps_epi64(A, R) \
  ((__m256i)__builtin_ia32_vcvtps2qq256_round_mask( \
      (__v4sf)(__m128)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_cvt_roundps_epi64(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvtps2qq256_round_mask( \
      (__v4sf)(__m128)(A), (__v4di)(__m256i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundps_epi64(U, A, R) \
  ((__m256i)__builtin_ia32_vcvtps2qq256_round_mask( \
      (__v4sf)(__m128)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U), \
      (int)(R)))

#define _mm256_cvt_roundps_epu32(A, R) \
  ((__m256i)__builtin_ia32_vcvtps2udq256_round_mask( \
      (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_cvt_roundps_epu32(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvtps2udq256_round_mask( \
      (__v8sf)(__m256)(A), (__v8su)(__m256i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundps_epu32(U, A, R) \
  ((__m256i)__builtin_ia32_vcvtps2udq256_round_mask( \
      (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)(U), \
      (int)(R)))

#define _mm256_cvt_roundps_epu64(A, R) \
  ((__m256i)__builtin_ia32_vcvtps2uqq256_round_mask( \
      (__v4sf)(__m128)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_cvt_roundps_epu64(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvtps2uqq256_round_mask( \
      (__v4sf)(__m128)(A), (__v4du)(__m256i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundps_epu64(U, A, R) \
  ((__m256i)__builtin_ia32_vcvtps2uqq256_round_mask( \
      (__v4sf)(__m128)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U), \
      (int)(R)))

#define _mm256_cvt_roundepi64_pd(A, R) \
  ((__m256d)__builtin_ia32_vcvtqq2pd256_round_mask( \
      (__v4di)(__m256i)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_cvt_roundepi64_pd(W, U, A, R) \
  ((__m256d)__builtin_ia32_vcvtqq2pd256_round_mask( \
      (__v4di)(__m256i)(A), (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundepi64_pd(U, A, R) \
  ((__m256d)__builtin_ia32_vcvtqq2pd256_round_mask( \
      (__v4di)(__m256i)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)(U), \
      (int)(R)))

#define _mm256_cvt_roundepi64_ph(A, R) \
  ((__m128h)__builtin_ia32_vcvtqq2ph256_round_mask( \
      (__v4di)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvt_roundepi64_ph(W, U, A, R) \
  ((__m128h)__builtin_ia32_vcvtqq2ph256_round_mask((__v4di)(A), (__v8hf)(W), \
                                                   (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundepi64_ph(U, A, R) \
  ((__m128h)__builtin_ia32_vcvtqq2ph256_round_mask( \
      (__v4di)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

#define _mm256_cvt_roundepi64_ps(A, R) \
  ((__m128)__builtin_ia32_vcvtqq2ps256_round_mask( \
      (__v4di)(__m256i)(A), (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (int)(R)))

#define _mm256_mask_cvt_roundepi64_ps(W, U, A, R) \
  ((__m128)__builtin_ia32_vcvtqq2ps256_round_mask( \
      (__v4di)(__m256i)(A), (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundepi64_ps(U, A, R) \
  ((__m128)__builtin_ia32_vcvtqq2ps256_round_mask((__v4di)(__m256i)(A), \
                                                  (__v4sf)_mm_setzero_ps(), \
                                                  (__mmask8)(U), (int)(R)))
#define _mm256_cvtt_roundpd_epi32(A, R) \
  ((__m128i)__builtin_ia32_vcvttpd2dq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4si)_mm_setzero_si128(), (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_cvtt_roundpd_epi32(W, U, A, R) \
  ((__m128i)__builtin_ia32_vcvttpd2dq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4si)(__m128i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundpd_epi32(U, A, R) \
  ((__m128i)__builtin_ia32_vcvttpd2dq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4si)_mm_setzero_si128(), (__mmask8)(U), \
      (int)(R)))

#define _mm256_cvtt_roundpd_epi64(A, R) \
  ((__m256i)__builtin_ia32_vcvttpd2qq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_cvtt_roundpd_epi64(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvttpd2qq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4di)(__m256i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundpd_epi64(U, A, R) \
  ((__m256i)__builtin_ia32_vcvttpd2qq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U), \
      (int)(R)))

#define _mm256_cvtt_roundpd_epu32(A, R) \
  ((__m128i)__builtin_ia32_vcvttpd2udq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4su)_mm_setzero_si128(), (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_cvtt_roundpd_epu32(W, U, A, R) \
  ((__m128i)__builtin_ia32_vcvttpd2udq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4su)(__m128i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundpd_epu32(U, A, R) \
  ((__m128i)__builtin_ia32_vcvttpd2udq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4su)_mm_setzero_si128(), (__mmask8)(U), \
      (int)(R)))

#define _mm256_cvtt_roundpd_epu64(A, R) \
  ((__m256i)__builtin_ia32_vcvttpd2uqq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_cvtt_roundpd_epu64(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvttpd2uqq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4du)(__m256i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundpd_epu64(U, A, R) \
  ((__m256i)__builtin_ia32_vcvttpd2uqq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U), \
      (int)(R)))

#define _mm256_cvtt_roundph_epi32(A, R) \
  ((__m256i)__builtin_ia32_vcvttph2dq256_round_mask( \
      (__v8hf)(A), (__v8si)_mm256_undefined_si256(), (__mmask8)(-1), \
      (int)(R)))

#define _mm256_mask_cvtt_roundph_epi32(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvttph2dq256_round_mask((__v8hf)(A), (__v8si)(W), \
                                                    (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundph_epi32(U, A, R) \
  ((__m256i)__builtin_ia32_vcvttph2dq256_round_mask( \
      (__v8hf)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))

#define _mm256_cvtt_roundph_epi64(A, R) \
  ((__m256i)__builtin_ia32_vcvttph2qq256_round_mask( \
      (__v8hf)(A), (__v4di)_mm256_undefined_si256(), (__mmask8)(-1), \
      (int)(R)))

#define _mm256_mask_cvtt_roundph_epi64(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvttph2qq256_round_mask((__v8hf)(A), (__v4di)(W), \
                                                    (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundph_epi64(U, A, R) \
  ((__m256i)__builtin_ia32_vcvttph2qq256_round_mask( \
      (__v8hf)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))

#define _mm256_cvtt_roundph_epu32(A, R) \
  ((__m256i)__builtin_ia32_vcvttph2udq256_round_mask( \
      (__v8hf)(A), (__v8su)_mm256_undefined_si256(), (__mmask8)(-1), \
      (int)(R)))

#define _mm256_mask_cvtt_roundph_epu32(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvttph2udq256_round_mask((__v8hf)(A), (__v8su)(W), \
                                                     (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundph_epu32(U, A, R) \
  ((__m256i)__builtin_ia32_vcvttph2udq256_round_mask( \
      (__v8hf)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))
#define _mm256_cvtt_roundph_epu64(A, R) \
  ((__m256i)__builtin_ia32_vcvttph2uqq256_round_mask( \
      (__v8hf)(A), (__v4du)_mm256_undefined_si256(), (__mmask8)(-1), \
      (int)(R)))

#define _mm256_mask_cvtt_roundph_epu64(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvttph2uqq256_round_mask((__v8hf)(A), (__v4du)(W), \
                                                     (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundph_epu64(U, A, R) \
  ((__m256i)__builtin_ia32_vcvttph2uqq256_round_mask( \
      (__v8hf)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))

#define _mm256_cvtt_roundph_epu16(A, R) \
  ((__m256i)__builtin_ia32_vcvttph2uw256_round_mask( \
      (__v16hf)(A), (__v16hu)_mm256_undefined_si256(), (__mmask16)(-1), \
      (int)(R)))

#define _mm256_mask_cvtt_roundph_epu16(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvttph2uw256_round_mask( \
      (__v16hf)(A), (__v16hu)(W), (__mmask16)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundph_epu16(U, A, R) \
  ((__m256i)__builtin_ia32_vcvttph2uw256_round_mask( \
      (__v16hf)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)(U), \
      (int)(R)))

#define _mm256_cvtt_roundph_epi16(A, R) \
  ((__m256i)__builtin_ia32_vcvttph2w256_round_mask( \
      (__v16hf)(A), (__v16hi)_mm256_undefined_si256(), (__mmask16)(-1), \
      (int)(R)))

#define _mm256_mask_cvtt_roundph_epi16(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvttph2w256_round_mask((__v16hf)(A), (__v16hi)(W), \
                                                   (__mmask16)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundph_epi16(U, A, R) \
  ((__m256i)__builtin_ia32_vcvttph2w256_round_mask( \
      (__v16hf)(A), (__v16hi)_mm256_setzero_si256(), (__mmask16)(U), \
      (int)(R)))

#define _mm256_cvtt_roundps_epi32(A, R) \
  ((__m256i)__builtin_ia32_vcvttps2dq256_round_mask( \
      (__v8sf)(__m256)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_cvtt_roundps_epi32(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvttps2dq256_round_mask( \
      (__v8sf)(__m256)(A), (__v8si)(__m256i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundps_epi32(U, A, R) \
  ((__m256i)__builtin_ia32_vcvttps2dq256_round_mask( \
      (__v8sf)(__m256)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)(U), \
      (int)(R)))

#define _mm256_cvtt_roundps_epi64(A, R) \
  ((__m256i)__builtin_ia32_vcvttps2qq256_round_mask( \
      (__v4sf)(__m128)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_cvtt_roundps_epi64(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvttps2qq256_round_mask( \
      (__v4sf)(__m128)(A), (__v4di)(__m256i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundps_epi64(U, A, R) \
  ((__m256i)__builtin_ia32_vcvttps2qq256_round_mask( \
      (__v4sf)(__m128)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U), \
      (int)(R)))

#define _mm256_cvtt_roundps_epu32(A, R) \
  ((__m256i)__builtin_ia32_vcvttps2udq256_round_mask( \
      (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_cvtt_roundps_epu32(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvttps2udq256_round_mask( \
      (__v8sf)(__m256)(A), (__v8su)(__m256i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundps_epu32(U, A, R) \
  ((__m256i)__builtin_ia32_vcvttps2udq256_round_mask( \
      (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)(U), \
      (int)(R)))

#define _mm256_cvtt_roundps_epu64(A, R) \
  ((__m256i)__builtin_ia32_vcvttps2uqq256_round_mask( \
      (__v4sf)(__m128)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_cvtt_roundps_epu64(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvttps2uqq256_round_mask( \
      (__v4sf)(__m128)(A), (__v4du)(__m256i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundps_epu64(U, A, R) \
  ((__m256i)__builtin_ia32_vcvttps2uqq256_round_mask( \
      (__v4sf)(__m128)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U), \
      (int)(R)))
#define _mm256_cvt_roundepu32_ph(A, R) \
  ((__m128h)__builtin_ia32_vcvtudq2ph256_round_mask( \
      (__v8su)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvt_roundepu32_ph(W, U, A, R) \
  ((__m128h)__builtin_ia32_vcvtudq2ph256_round_mask((__v8su)(A), (__v8hf)(W), \
                                                    (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundepu32_ph(U, A, R) \
  ((__m128h)__builtin_ia32_vcvtudq2ph256_round_mask( \
      (__v8su)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

#define _mm256_cvt_roundepu32_ps(A, R) \
  ((__m256)__builtin_ia32_vcvtudq2ps256_round_mask( \
      (__v8su)(__m256i)(A), (__v8sf)_mm256_setzero_ps(), (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_cvt_roundepu32_ps(W, U, A, R) \
  ((__m256)__builtin_ia32_vcvtudq2ps256_round_mask( \
      (__v8su)(__m256i)(A), (__v8sf)(__m256)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundepu32_ps(U, A, R) \
  ((__m256)__builtin_ia32_vcvtudq2ps256_round_mask( \
      (__v8su)(__m256i)(A), (__v8sf)_mm256_setzero_ps(), (__mmask8)(U), \
      (int)(R)))

#define _mm256_cvt_roundepu64_pd(A, R) \
  ((__m256d)__builtin_ia32_vcvtuqq2pd256_round_mask( \
      (__v4du)(__m256i)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_cvt_roundepu64_pd(W, U, A, R) \
  ((__m256d)__builtin_ia32_vcvtuqq2pd256_round_mask( \
      (__v4du)(__m256i)(A), (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundepu64_pd(U, A, R) \
  ((__m256d)__builtin_ia32_vcvtuqq2pd256_round_mask( \
      (__v4du)(__m256i)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)(U), \
      (int)(R)))

#define _mm256_cvt_roundepu64_ph(A, R) \
  ((__m128h)__builtin_ia32_vcvtuqq2ph256_round_mask( \
      (__v4du)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvt_roundepu64_ph(W, U, A, R) \
  ((__m128h)__builtin_ia32_vcvtuqq2ph256_round_mask((__v4du)(A), (__v8hf)(W), \
                                                    (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundepu64_ph(U, A, R) \
  ((__m128h)__builtin_ia32_vcvtuqq2ph256_round_mask( \
      (__v4du)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

#define _mm256_cvt_roundepu64_ps(A, R) \
  ((__m128)__builtin_ia32_vcvtuqq2ps256_round_mask( \
      (__v4du)(__m256i)(A), (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (int)(R)))

#define _mm256_mask_cvt_roundepu64_ps(W, U, A, R) \
  ((__m128)__builtin_ia32_vcvtuqq2ps256_round_mask( \
      (__v4du)(__m256i)(A), (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundepu64_ps(U, A, R) \
  ((__m128)__builtin_ia32_vcvtuqq2ps256_round_mask((__v4du)(__m256i)(A), \
                                                   (__v4sf)_mm_setzero_ps(), \
                                                   (__mmask8)(U), (int)(R)))

#define _mm256_cvt_roundepu16_ph(A, R) \
  ((__m256h)__builtin_ia32_vcvtuw2ph256_round_mask( \
      (__v16hu)(A), (__v16hf)_mm256_undefined_ph(), (__mmask16)(-1), \
      (int)(R)))

#define _mm256_mask_cvt_roundepu16_ph(W, U, A, R) \
  ((__m256h)__builtin_ia32_vcvtuw2ph256_round_mask((__v16hu)(A), (__v16hf)(W), \
                                                   (__mmask16)(U), (int)(R)))

#define _mm256_maskz_cvt_roundepu16_ph(U, A, R) \
  ((__m256h)__builtin_ia32_vcvtuw2ph256_round_mask( \
      (__v16hu)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))

#define _mm256_cvt_roundepi16_ph(A, R) \
  ((__m256h)__builtin_ia32_vcvtw2ph256_round_mask( \
      (__v16hi)(A), (__v16hf)_mm256_undefined_ph(), (__mmask16)(-1), \
      (int)(R)))

#define _mm256_mask_cvt_roundepi16_ph(W, U, A, R) \
  ((__m256h)__builtin_ia32_vcvtw2ph256_round_mask((__v16hi)(A), (__v16hf)(W), \
                                                  (__mmask16)(U), (int)(R)))

#define _mm256_maskz_cvt_roundepi16_ph(U, A, R) \
  ((__m256h)__builtin_ia32_vcvtw2ph256_round_mask( \
      (__v16hi)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
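/* Illustrative usage sketch (added for clarity, not part of the original
 * header): convert packed doubles to 32-bit integers with an explicit
 * rounding mode instead of the current MXCSR setting.
 *
 *   __m256d v = _mm256_set1_pd(2.5);
 *   __m128i i = _mm256_cvt_roundpd_epi32(
 *       v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
 *   // Every lane of i holds 2, since 2.5 is rounded toward negative infinity.
 */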
#define _mm256_div_round_pd(A, B, R) \
  ((__m256d)__builtin_ia32_vdivpd256_round((__v4df)(__m256d)(A), \
                                           (__v4df)(__m256d)(B), (int)(R)))

#define _mm256_mask_div_round_pd(W, U, A, B, R) \
  ((__m256d)__builtin_ia32_selectpd_256( \
      (__mmask8)(U), (__v4df)_mm256_div_round_pd((A), (B), (R)), \
      (__v4df)(__m256d)(W)))

#define _mm256_maskz_div_round_pd(U, A, B, R) \
  ((__m256d)__builtin_ia32_selectpd_256( \
      (__mmask8)(U), (__v4df)_mm256_div_round_pd((A), (B), (R)), \
      (__v4df)_mm256_setzero_pd()))

#define _mm256_div_round_ph(A, B, R) \
  ((__m256h)__builtin_ia32_vdivph256_round((__v16hf)(__m256h)(A), \
                                           (__v16hf)(__m256h)(B), (int)(R)))

#define _mm256_mask_div_round_ph(W, U, A, B, R) \
  ((__m256h)__builtin_ia32_selectph_256( \
      (__mmask16)(U), (__v16hf)_mm256_div_round_ph((A), (B), (R)), \
      (__v16hf)(__m256h)(W)))

#define _mm256_maskz_div_round_ph(U, A, B, R) \
  ((__m256h)__builtin_ia32_selectph_256( \
      (__mmask16)(U), (__v16hf)_mm256_div_round_ph((A), (B), (R)), \
      (__v16hf)_mm256_setzero_ph()))

#define _mm256_div_round_ps(A, B, R) \
  ((__m256)__builtin_ia32_vdivps256_round((__v8sf)(__m256)(A), \
                                          (__v8sf)(__m256)(B), (int)(R)))

#define _mm256_mask_div_round_ps(W, U, A, B, R) \
  ((__m256)__builtin_ia32_selectps_256( \
      (__mmask8)(U), (__v8sf)_mm256_div_round_ps((A), (B), (R)), \
      (__v8sf)(__m256)(W)))

#define _mm256_maskz_div_round_ps(U, A, B, R) \
  ((__m256)__builtin_ia32_selectps_256( \
      (__mmask8)(U), (__v8sf)_mm256_div_round_ps((A), (B), (R)), \
      (__v8sf)_mm256_setzero_ps()))
#define _mm256_fcmadd_round_pch(A, B, C, R) \
  ((__m256h)__builtin_ia32_vfcmaddcph256_round_mask3( \
      (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm256_mask_fcmadd_round_pch(A, U, B, C, R) \
  ((__m256h)__builtin_ia32_vfcmaddcph256_round_mask( \
      (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_mask3_fcmadd_round_pch(A, B, C, U, R) \
  ((__m256h)__builtin_ia32_vfcmaddcph256_round_mask3( \
      (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_fcmadd_round_pch(U, A, B, C, R) \
  ((__m256h)__builtin_ia32_vfcmaddcph256_round_maskz( \
      (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_cmul_round_pch(A, B, R) \
  ((__m256h)__builtin_ia32_vfcmulcph256_round_mask( \
      (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), \
      (__v8sf)(__m256h)_mm256_undefined_ph(), (__mmask8)-1, (int)(R)))

#define _mm256_mask_cmul_round_pch(W, U, A, B, R) \
  ((__m256h)__builtin_ia32_vfcmulcph256_round_mask( \
      (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cmul_round_pch(U, A, B, R) \
  ((__m256h)__builtin_ia32_vfcmulcph256_round_mask( \
      (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), \
      (__v8sf)(__m256h)_mm256_setzero_ph(), (__mmask8)(U), (int)(R)))
#define _mm256_fixupimm_round_pd(A, B, C, imm, R) \
  ((__m256d)__builtin_ia32_vfixupimmpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4di)(__m256i)(C), \
      (int)(imm), (__mmask8)-1, (int)(R)))

#define _mm256_mask_fixupimm_round_pd(A, U, B, C, imm, R) \
  ((__m256d)__builtin_ia32_vfixupimmpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4di)(__m256i)(C), \
      (int)(imm), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_fixupimm_round_pd(U, A, B, C, imm, R) \
  ((__m256d)__builtin_ia32_vfixupimmpd256_round_maskz( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4di)(__m256i)(C), \
      (int)(imm), (__mmask8)(U), (int)(R)))

#define _mm256_fixupimm_round_ps(A, B, C, imm, R) \
  ((__m256)__builtin_ia32_vfixupimmps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8si)(__m256i)(C), \
      (int)(imm), (__mmask8)-1, (int)(R)))

#define _mm256_mask_fixupimm_round_ps(A, U, B, C, imm, R) \
  ((__m256)__builtin_ia32_vfixupimmps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8si)(__m256i)(C), \
      (int)(imm), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_fixupimm_round_ps(U, A, B, C, imm, R) \
  ((__m256)__builtin_ia32_vfixupimmps256_round_maskz( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8si)(__m256i)(C), \
      (int)(imm), (__mmask8)(U), (int)(R)))
#define _mm256_fmadd_round_pd(A, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm256_mask_fmadd_round_pd(A, U, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_mask3_fmadd_round_pd(A, B, C, U, R) \
  ((__m256d)__builtin_ia32_vfmaddpd256_round_mask3( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_fmadd_round_pd(U, A, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddpd256_round_maskz( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_fmsub_round_pd(A, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm256_mask_fmsub_round_pd(A, U, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_fmsub_round_pd(U, A, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddpd256_round_maskz( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_fnmadd_round_pd(A, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
      -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm256_mask3_fnmadd_round_pd(A, B, C, U, R) \
  ((__m256d)__builtin_ia32_vfmaddpd256_round_mask3( \
      -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_fnmadd_round_pd(U, A, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddpd256_round_maskz( \
      -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_fnmsub_round_pd(A, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
      -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm256_maskz_fnmsub_round_pd(U, A, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddpd256_round_maskz( \
      -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_fmadd_round_ph(A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)-1, (int)(R)))

#define _mm256_mask_fmadd_round_ph(A, U, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_mask3_fmadd_round_ph(A, B, C, U, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_mask3( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_maskz_fmadd_round_ph(U, A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_maskz( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_fmsub_round_ph(A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
      (__mmask16)-1, (int)(R)))

#define _mm256_mask_fmsub_round_ph(A, U, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_maskz_fmsub_round_ph(U, A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_maskz( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_fnmadd_round_ph(A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
      (__v16hf)(__m256h)(A), -(__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)-1, (int)(R)))

#define _mm256_mask3_fnmadd_round_ph(A, B, C, U, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_mask3( \
      -(__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_maskz_fnmadd_round_ph(U, A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_maskz( \
      -(__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_fnmsub_round_ph(A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
      (__v16hf)(__m256h)(A), -(__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
      (__mmask16)-1, (int)(R)))

#define _mm256_maskz_fnmsub_round_ph(U, A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_maskz( \
      -(__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))
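/* Illustrative usage sketch (added for clarity, not part of the original
 * header): fused multiply-add on packed doubles with round-toward-zero and
 * exceptions suppressed; with the _mask form, masked-off lanes would keep A.
 *
 *   __m256d a = _mm256_set1_pd(1.5);
 *   __m256d b = _mm256_set1_pd(2.0);
 *   __m256d c = _mm256_set1_pd(0.25);
 *   __m256d r = _mm256_fmadd_round_pd(
 *       a, b, c, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 *   // Each lane holds 1.5 * 2.0 + 0.25 = 3.25.
 */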
1338 #define _mm256_fmadd_round_ps(A, B, C, R) \
1339 ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
1340 (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
1341 (__mmask8)-1, (int)(R)))
1343 #define _mm256_mask_fmadd_round_ps(A, U, B, C, R) \
1344 ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
1345 (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
1346 (__mmask8)(U), (int)(R)))
1348 #define _mm256_mask3_fmadd_round_ps(A, B, C, U, R) \
1349 ((__m256)__builtin_ia32_vfmaddps256_round_mask3( \
1350 (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
1351 (__mmask8)(U), (int)(R)))
1353 #define _mm256_maskz_fmadd_round_ps(U, A, B, C, R) \
1354 ((__m256)__builtin_ia32_vfmaddps256_round_maskz( \
1355 (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
1356 (__mmask8)(U), (int)(R)))
1358 #define _mm256_fmsub_round_ps(A, B, C, R) \
1359 ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
1360 (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
1361 (__mmask8)-1, (int)(R)))
1363 #define _mm256_mask_fmsub_round_ps(A, U, B, C, R) \
1364 ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
1365 (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
1366 (__mmask8)(U), (int)(R)))
1368 #define _mm256_maskz_fmsub_round_ps(U, A, B, C, R) \
1369 ((__m256)__builtin_ia32_vfmaddps256_round_maskz( \
1370 (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
1371 (__mmask8)(U), (int)(R)))
1373 #define _mm256_fnmadd_round_ps(A, B, C, R) \
1374 ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
1375 (__v8sf)(__m256)(A), -(__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
1376 (__mmask8)-1, (int)(R)))
1378 #define _mm256_mask3_fnmadd_round_ps(A, B, C, U, R) \
1379 ((__m256)__builtin_ia32_vfmaddps256_round_mask3( \
1380 -(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
1381 (__mmask8)(U), (int)(R)))
1383 #define _mm256_maskz_fnmadd_round_ps(U, A, B, C, R) \
1384 ((__m256)__builtin_ia32_vfmaddps256_round_maskz( \
1385 -(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
1386 (__mmask8)(U), (int)(R)))
1388 #define _mm256_fnmsub_round_ps(A, B, C, R) \
1389 ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
1390 (__v8sf)(__m256)(A), -(__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
1391 (__mmask8)-1, (int)(R)))
1393 #define _mm256_maskz_fnmsub_round_ps(U, A, B, C, R) \
1394 ((__m256)__builtin_ia32_vfmaddps256_round_maskz( \
1395 -(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
1396 (__mmask8)(U), (int)(R)))
#define _mm256_fmadd_round_pch(A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddcph256_round_mask3( \
      (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm256_mask_fmadd_round_pch(A, U, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddcph256_round_mask( \
      (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_mask3_fmadd_round_pch(A, B, C, U, R) \
  ((__m256h)__builtin_ia32_vfmaddcph256_round_mask3( \
      (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_fmadd_round_pch(U, A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddcph256_round_maskz( \
      (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \
      (__mmask8)(U), (int)(R)))
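
/* Fused multiply with alternating subtract/add across even/odd elements
   (fmaddsub) and the reverse interleave (fmsubadd), with rounding control. */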
#define _mm256_fmaddsub_round_pd(A, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm256_mask_fmaddsub_round_pd(A, U, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_mask3_fmaddsub_round_pd(A, B, C, U, R) \
  ((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask3( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_fmaddsub_round_pd(U, A, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddsubpd256_round_maskz( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_fmsubadd_round_pd(A, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm256_mask_fmsubadd_round_pd(A, U, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_fmsubadd_round_pd(U, A, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddsubpd256_round_maskz( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_fmaddsub_round_ph(A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddsubph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)-1, (int)(R)))

#define _mm256_mask_fmaddsub_round_ph(A, U, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddsubph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_mask3_fmaddsub_round_ph(A, B, C, U, R) \
  ((__m256h)__builtin_ia32_vfmaddsubph256_round_mask3( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_maskz_fmaddsub_round_ph(U, A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddsubph256_round_maskz( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_fmsubadd_round_ph(A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddsubph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
      (__mmask16)-1, (int)(R)))

#define _mm256_mask_fmsubadd_round_ph(A, U, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddsubph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_maskz_fmsubadd_round_ph(U, A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddsubph256_round_maskz( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_fmaddsub_round_ps(A, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddsubps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm256_mask_fmaddsub_round_ps(A, U, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddsubps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_mask3_fmaddsub_round_ps(A, B, C, U, R) \
  ((__m256)__builtin_ia32_vfmaddsubps256_round_mask3( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_fmaddsub_round_ps(U, A, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddsubps256_round_maskz( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_fmsubadd_round_ps(A, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddsubps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm256_mask_fmsubadd_round_ps(A, U, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddsubps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_fmsubadd_round_ps(U, A, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddsubps256_round_maskz( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))
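
/* Remaining merge-masked (mask/mask3) forms of fmsub, fmsubadd, fnmadd and
   fnmsub with rounding control. */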
#define _mm256_mask3_fmsub_round_pd(A, B, C, U, R) \
  ((__m256d)__builtin_ia32_vfmsubpd256_round_mask3( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_mask3_fmsubadd_round_pd(A, B, C, U, R) \
  ((__m256d)__builtin_ia32_vfmsubaddpd256_round_mask3( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_mask_fnmadd_round_pd(A, U, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
      (__v4df)(__m256d)(A), -(__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_mask_fnmsub_round_pd(A, U, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
      (__v4df)(__m256d)(A), -(__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_mask3_fnmsub_round_pd(A, B, C, U, R) \
  ((__m256d)__builtin_ia32_vfmsubpd256_round_mask3( \
      -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_mask3_fmsub_round_ph(A, B, C, U, R) \
  ((__m256h)__builtin_ia32_vfmsubph256_round_mask3( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_mask3_fmsubadd_round_ph(A, B, C, U, R) \
  ((__m256h)__builtin_ia32_vfmsubaddph256_round_mask3( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_mask_fnmadd_round_ph(A, U, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
      (__v16hf)(__m256h)(A), -(__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_mask_fnmsub_round_ph(A, U, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
      (__v16hf)(__m256h)(A), -(__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_mask3_fnmsub_round_ph(A, B, C, U, R) \
  ((__m256h)__builtin_ia32_vfmsubph256_round_mask3( \
      -(__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_mask3_fmsub_round_ps(A, B, C, U, R) \
  ((__m256)__builtin_ia32_vfmsubps256_round_mask3( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_mask3_fmsubadd_round_ps(A, B, C, U, R) \
  ((__m256)__builtin_ia32_vfmsubaddps256_round_mask3( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_mask_fnmadd_round_ps(A, U, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
      (__v8sf)(__m256)(A), -(__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_mask_fnmsub_round_ps(A, U, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
      (__v8sf)(__m256)(A), -(__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_mask3_fnmsub_round_ps(A, B, C, U, R) \
  ((__m256)__builtin_ia32_vfmsubps256_round_mask3( \
      -(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))
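
/* Complex half-precision multiply with rounding control. */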
#define _mm256_mul_round_pch(A, B, R) \
  ((__m256h)__builtin_ia32_vfmulcph256_round_mask( \
      (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), \
      (__v8sf)(__m256h)_mm256_undefined_ph(), (__mmask8)-1, (int)(R)))

#define _mm256_mask_mul_round_pch(W, U, A, B, R) \
  ((__m256h)__builtin_ia32_vfmulcph256_round_mask( \
      (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_mul_round_pch(U, A, B, R) \
  ((__m256h)__builtin_ia32_vfmulcph256_round_mask( \
      (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), \
      (__v8sf)(__m256h)_mm256_setzero_ph(), (__mmask8)(U), (int)(R)))
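
/* Extract the exponent of each element as a floating-point value
   (effectively floor(log2(|x|))), with rounding/SAE control. */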
#define _mm256_getexp_round_pd(A, R) \
  ((__m256d)__builtin_ia32_vgetexppd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)_mm256_undefined_pd(), (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_getexp_round_pd(W, U, A, R) \
  ((__m256d)__builtin_ia32_vgetexppd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_getexp_round_pd(U, A, R) \
  ((__m256d)__builtin_ia32_vgetexppd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)(U), \
      (int)(R)))

#define _mm256_getexp_round_ph(A, R) \
  ((__m256h)__builtin_ia32_vgetexpph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)_mm256_undefined_ph(), (__mmask16)-1, \
      (int)(R)))

#define _mm256_mask_getexp_round_ph(W, U, A, R) \
  ((__m256h)__builtin_ia32_vgetexpph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(W), (__mmask16)(U), (int)(R)))

#define _mm256_maskz_getexp_round_ph(U, A, R) \
  ((__m256h)__builtin_ia32_vgetexpph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), \
      (int)(R)))

#define _mm256_getexp_round_ps(A, R) \
  ((__m256)__builtin_ia32_vgetexpps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)_mm256_undefined_ps(), (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_getexp_round_ps(W, U, A, R) \
  ((__m256)__builtin_ia32_vgetexpps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_getexp_round_ps(U, A, R) \
  ((__m256)__builtin_ia32_vgetexpps256_round_mask((__v8sf)(__m256)(A), \
      (__v8sf)_mm256_setzero_ps(), \
      (__mmask8)(U), (int)(R)))
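
/* Extract the normalized mantissa of each element; B selects the
   normalization interval and C the sign control. */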
#define _mm256_getmant_round_pd(A, B, C, R) \
  ((__m256d)__builtin_ia32_vgetmantpd256_round_mask( \
      (__v4df)(__m256d)(A), (int)(((C) << 2) | (B)), \
      (__v4df)_mm256_undefined_pd(), (__mmask8)-1, (int)(R)))

#define _mm256_mask_getmant_round_pd(W, U, A, B, C, R) \
  ((__m256d)__builtin_ia32_vgetmantpd256_round_mask( \
      (__v4df)(__m256d)(A), (int)(((C) << 2) | (B)), (__v4df)(__m256d)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_getmant_round_pd(U, A, B, C, R) \
  ((__m256d)__builtin_ia32_vgetmantpd256_round_mask( \
      (__v4df)(__m256d)(A), (int)(((C) << 2) | (B)), \
      (__v4df)_mm256_setzero_pd(), (__mmask8)(U), (int)(R)))

#define _mm256_getmant_round_ph(A, B, C, R) \
  ((__m256h)__builtin_ia32_vgetmantph256_round_mask( \
      (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), \
      (__v16hf)_mm256_undefined_ph(), (__mmask16)-1, (int)(R)))

#define _mm256_mask_getmant_round_ph(W, U, A, B, C, R) \
  ((__m256h)__builtin_ia32_vgetmantph256_round_mask( \
      (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), (__v16hf)(__m256h)(W), \
      (__mmask16)(U), (int)(R)))

#define _mm256_maskz_getmant_round_ph(U, A, B, C, R) \
  ((__m256h)__builtin_ia32_vgetmantph256_round_mask( \
      (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), \
      (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))

#define _mm256_getmant_round_ps(A, B, C, R) \
  ((__m256)__builtin_ia32_vgetmantps256_round_mask( \
      (__v8sf)(__m256)(A), (int)(((C) << 2) | (B)), \
      (__v8sf)_mm256_undefined_ps(), (__mmask8)-1, (int)(R)))

#define _mm256_mask_getmant_round_ps(W, U, A, B, C, R) \
  ((__m256)__builtin_ia32_vgetmantps256_round_mask( \
      (__v8sf)(__m256)(A), (int)(((C) << 2) | (B)), (__v8sf)(__m256)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_getmant_round_ps(U, A, B, C, R) \
  ((__m256)__builtin_ia32_vgetmantps256_round_mask( \
      (__v8sf)(__m256)(A), (int)(((C) << 2) | (B)), \
      (__v8sf)_mm256_setzero_ps(), (__mmask8)(U), (int)(R)))
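
/* Packed maximum and minimum with explicit rounding/SAE control. */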
#define _mm256_max_round_pd(A, B, R) \
  ((__m256d)__builtin_ia32_vmaxpd256_round((__v4df)(__m256d)(A), \
      (__v4df)(__m256d)(B), (int)(R)))

#define _mm256_mask_max_round_pd(W, U, A, B, R) \
  ((__m256d)__builtin_ia32_selectpd_256( \
      (__mmask8)(U), (__v4df)_mm256_max_round_pd((A), (B), (R)), \
      (__v4df)(__m256d)(W)))

#define _mm256_maskz_max_round_pd(U, A, B, R) \
  ((__m256d)__builtin_ia32_selectpd_256( \
      (__mmask8)(U), (__v4df)_mm256_max_round_pd((A), (B), (R)), \
      (__v4df)_mm256_setzero_pd()))

#define _mm256_max_round_ph(A, B, R) \
  ((__m256h)__builtin_ia32_vmaxph256_round((__v16hf)(__m256h)(A), \
      (__v16hf)(__m256h)(B), (int)(R)))

#define _mm256_mask_max_round_ph(W, U, A, B, R) \
  ((__m256h)__builtin_ia32_selectph_256( \
      (__mmask16)(U), (__v16hf)_mm256_max_round_ph((A), (B), (R)), \
      (__v16hf)(__m256h)(W)))

#define _mm256_maskz_max_round_ph(U, A, B, R) \
  ((__m256h)__builtin_ia32_selectph_256( \
      (__mmask16)(U), (__v16hf)_mm256_max_round_ph((A), (B), (R)), \
      (__v16hf)_mm256_setzero_ph()))

#define _mm256_max_round_ps(A, B, R) \
  ((__m256)__builtin_ia32_vmaxps256_round((__v8sf)(__m256)(A), \
      (__v8sf)(__m256)(B), (int)(R)))

#define _mm256_mask_max_round_ps(W, U, A, B, R) \
  ((__m256)__builtin_ia32_selectps_256( \
      (__mmask8)(U), (__v8sf)_mm256_max_round_ps((A), (B), (R)), \
      (__v8sf)(__m256)(W)))

#define _mm256_maskz_max_round_ps(U, A, B, R) \
  ((__m256)__builtin_ia32_selectps_256( \
      (__mmask8)(U), (__v8sf)_mm256_max_round_ps((A), (B), (R)), \
      (__v8sf)_mm256_setzero_ps()))

#define _mm256_min_round_pd(A, B, R) \
  ((__m256d)__builtin_ia32_vminpd256_round((__v4df)(__m256d)(A), \
      (__v4df)(__m256d)(B), (int)(R)))

#define _mm256_mask_min_round_pd(W, U, A, B, R) \
  ((__m256d)__builtin_ia32_selectpd_256( \
      (__mmask8)(U), (__v4df)_mm256_min_round_pd((A), (B), (R)), \
      (__v4df)(__m256d)(W)))

#define _mm256_maskz_min_round_pd(U, A, B, R) \
  ((__m256d)__builtin_ia32_selectpd_256( \
      (__mmask8)(U), (__v4df)_mm256_min_round_pd((A), (B), (R)), \
      (__v4df)_mm256_setzero_pd()))

#define _mm256_min_round_ph(A, B, R) \
  ((__m256h)__builtin_ia32_vminph256_round((__v16hf)(__m256h)(A), \
      (__v16hf)(__m256h)(B), (int)(R)))

#define _mm256_mask_min_round_ph(W, U, A, B, R) \
  ((__m256h)__builtin_ia32_selectph_256( \
      (__mmask16)(U), (__v16hf)_mm256_min_round_ph((A), (B), (R)), \
      (__v16hf)(__m256h)(W)))

#define _mm256_maskz_min_round_ph(U, A, B, R) \
  ((__m256h)__builtin_ia32_selectph_256( \
      (__mmask16)(U), (__v16hf)_mm256_min_round_ph((A), (B), (R)), \
      (__v16hf)_mm256_setzero_ph()))

#define _mm256_min_round_ps(A, B, R) \
  ((__m256)__builtin_ia32_vminps256_round((__v8sf)(__m256)(A), \
      (__v8sf)(__m256)(B), (int)(R)))

#define _mm256_mask_min_round_ps(W, U, A, B, R) \
  ((__m256)__builtin_ia32_selectps_256( \
      (__mmask8)(U), (__v8sf)_mm256_min_round_ps((A), (B), (R)), \
      (__v8sf)(__m256)(W)))

#define _mm256_maskz_min_round_ps(U, A, B, R) \
  ((__m256)__builtin_ia32_selectps_256( \
      (__mmask8)(U), (__v8sf)_mm256_min_round_ps((A), (B), (R)), \
      (__v8sf)_mm256_setzero_ps()))
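
/* Packed multiply with explicit rounding control. */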
#define _mm256_mul_round_pd(A, B, R) \
  ((__m256d)__builtin_ia32_vmulpd256_round((__v4df)(__m256d)(A), \
      (__v4df)(__m256d)(B), (int)(R)))

#define _mm256_mask_mul_round_pd(W, U, A, B, R) \
  ((__m256d)__builtin_ia32_selectpd_256( \
      (__mmask8)(U), (__v4df)_mm256_mul_round_pd((A), (B), (R)), \
      (__v4df)(__m256d)(W)))

#define _mm256_maskz_mul_round_pd(U, A, B, R) \
  ((__m256d)__builtin_ia32_selectpd_256( \
      (__mmask8)(U), (__v4df)_mm256_mul_round_pd((A), (B), (R)), \
      (__v4df)_mm256_setzero_pd()))

#define _mm256_mul_round_ph(A, B, R) \
  ((__m256h)__builtin_ia32_vmulph256_round((__v16hf)(__m256h)(A), \
      (__v16hf)(__m256h)(B), (int)(R)))

#define _mm256_mask_mul_round_ph(W, U, A, B, R) \
  ((__m256h)__builtin_ia32_selectph_256( \
      (__mmask16)(U), (__v16hf)_mm256_mul_round_ph((A), (B), (R)), \
      (__v16hf)(__m256h)(W)))

#define _mm256_maskz_mul_round_ph(U, A, B, R) \
  ((__m256h)__builtin_ia32_selectph_256( \
      (__mmask16)(U), (__v16hf)_mm256_mul_round_ph((A), (B), (R)), \
      (__v16hf)_mm256_setzero_ph()))

#define _mm256_mul_round_ps(A, B, R) \
  ((__m256)__builtin_ia32_vmulps256_round((__v8sf)(__m256)(A), \
      (__v8sf)(__m256)(B), (int)(R)))

#define _mm256_mask_mul_round_ps(W, U, A, B, R) \
  ((__m256)__builtin_ia32_selectps_256( \
      (__mmask8)(U), (__v8sf)_mm256_mul_round_ps((A), (B), (R)), \
      (__v8sf)(__m256)(W)))

#define _mm256_maskz_mul_round_ps(U, A, B, R) \
  ((__m256)__builtin_ia32_selectps_256( \
      (__mmask8)(U), (__v8sf)_mm256_mul_round_ps((A), (B), (R)), \
      (__v8sf)_mm256_setzero_ps()))
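
/* Range operation: per-element min/max selection controlled by the
   immediate C, with rounding control. */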
#define _mm256_range_round_pd(A, B, C, R) \
  ((__m256d)__builtin_ia32_vrangepd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \
      (__v4df)_mm256_setzero_pd(), (__mmask8)-1, (int)(R)))

#define _mm256_mask_range_round_pd(W, U, A, B, C, R) \
  ((__m256d)__builtin_ia32_vrangepd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \
      (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_range_round_pd(U, A, B, C, R) \
  ((__m256d)__builtin_ia32_vrangepd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \
      (__v4df)_mm256_setzero_pd(), (__mmask8)(U), (int)(R)))

#define _mm256_range_round_ps(A, B, C, R) \
  ((__m256)__builtin_ia32_vrangeps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), \
      (__v8sf)_mm256_setzero_ps(), (__mmask8)-1, (int)(R)))

#define _mm256_mask_range_round_ps(W, U, A, B, C, R) \
  ((__m256)__builtin_ia32_vrangeps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), (__v8sf)(__m256)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_range_round_ps(U, A, B, C, R) \
  ((__m256)__builtin_ia32_vrangeps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), \
      (__v8sf)_mm256_setzero_ps(), (__mmask8)(U), (int)(R)))
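
/* Reduction transformation: per-element remainder after rounding to the
   precision given by the immediate, with rounding control. */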
#define _mm256_reduce_round_pd(A, B, R) \
  ((__m256d)__builtin_ia32_vreducepd256_round_mask( \
      (__v4df)(__m256d)(A), (int)(B), (__v4df)_mm256_setzero_pd(), \
      (__mmask8)-1, (int)(R)))

#define _mm256_mask_reduce_round_pd(W, U, A, B, R) \
  ((__m256d)__builtin_ia32_vreducepd256_round_mask( \
      (__v4df)(__m256d)(A), (int)(B), (__v4df)(__m256d)(W), (__mmask8)(U), \
      (int)(R)))

#define _mm256_maskz_reduce_round_pd(U, A, B, R) \
  ((__m256d)__builtin_ia32_vreducepd256_round_mask( \
      (__v4df)(__m256d)(A), (int)(B), (__v4df)_mm256_setzero_pd(), \
      (__mmask8)(U), (int)(R)))

#define _mm256_mask_reduce_round_ph(W, U, A, imm, R) \
  ((__m256h)__builtin_ia32_vreduceph256_round_mask( \
      (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)(__m256h)(W), \
      (__mmask16)(U), (int)(R)))

#define _mm256_maskz_reduce_round_ph(U, A, imm, R) \
  ((__m256h)__builtin_ia32_vreduceph256_round_mask( \
      (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(), \
      (__mmask16)(U), (int)(R)))

#define _mm256_reduce_round_ph(A, imm, R) \
  ((__m256h)__builtin_ia32_vreduceph256_round_mask( \
      (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_undefined_ph(), \
      (__mmask16)-1, (int)(R)))

#define _mm256_reduce_round_ps(A, B, R) \
  ((__m256)__builtin_ia32_vreduceps256_round_mask( \
      (__v8sf)(__m256)(A), (int)(B), (__v8sf)_mm256_setzero_ps(), \
      (__mmask8)-1, (int)(R)))

#define _mm256_mask_reduce_round_ps(W, U, A, B, R) \
  ((__m256)__builtin_ia32_vreduceps256_round_mask( \
      (__v8sf)(__m256)(A), (int)(B), (__v8sf)(__m256)(W), (__mmask8)(U), \
      (int)(R)))

#define _mm256_maskz_reduce_round_ps(U, A, B, R) \
  ((__m256)__builtin_ia32_vreduceps256_round_mask( \
      (__v8sf)(__m256)(A), (int)(B), (__v8sf)_mm256_setzero_ps(), \
      (__mmask8)(U), (int)(R)))
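
/* Round each element to the number of fraction bits given by the immediate,
   with rounding/SAE control. */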
#define _mm256_roundscale_round_pd(A, imm, R) \
  ((__m256d)__builtin_ia32_vrndscalepd256_round_mask( \
      (__v4df)(__m256d)(A), (int)(imm), (__v4df)_mm256_undefined_pd(), \
      (__mmask8)-1, (int)(R)))

#define _mm256_mask_roundscale_round_pd(A, B, C, imm, R) \
  ((__m256d)__builtin_ia32_vrndscalepd256_round_mask( \
      (__v4df)(__m256d)(C), (int)(imm), (__v4df)(__m256d)(A), (__mmask8)(B), \
      (int)(R)))

#define _mm256_maskz_roundscale_round_pd(A, B, imm, R) \
  ((__m256d)__builtin_ia32_vrndscalepd256_round_mask( \
      (__v4df)(__m256d)(B), (int)(imm), (__v4df)_mm256_setzero_pd(), \
      (__mmask8)(A), (int)(R)))

#define _mm256_roundscale_round_ph(A, imm, R) \
  ((__m256h)__builtin_ia32_vrndscaleph256_round_mask( \
      (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_undefined_ph(), \
      (__mmask16)-1, (int)(R)))

#define _mm256_mask_roundscale_round_ph(A, B, C, imm, R) \
  ((__m256h)__builtin_ia32_vrndscaleph256_round_mask( \
      (__v16hf)(__m256h)(C), (int)(imm), (__v16hf)(__m256h)(A), \
      (__mmask16)(B), (int)(R)))

#define _mm256_maskz_roundscale_round_ph(A, B, imm, R) \
  ((__m256h)__builtin_ia32_vrndscaleph256_round_mask( \
      (__v16hf)(__m256h)(B), (int)(imm), (__v16hf)_mm256_setzero_ph(), \
      (__mmask16)(A), (int)(R)))

#define _mm256_roundscale_round_ps(A, imm, R) \
  ((__m256)__builtin_ia32_vrndscaleps256_round_mask( \
      (__v8sf)(__m256)(A), (int)(imm), (__v8sf)_mm256_undefined_ps(), \
      (__mmask8)-1, (int)(R)))

#define _mm256_mask_roundscale_round_ps(A, B, C, imm, R) \
  ((__m256)__builtin_ia32_vrndscaleps256_round_mask( \
      (__v8sf)(__m256)(C), (int)(imm), (__v8sf)(__m256)(A), (__mmask8)(B), \
      (int)(R)))

#define _mm256_maskz_roundscale_round_ps(A, B, imm, R) \
  ((__m256)__builtin_ia32_vrndscaleps256_round_mask( \
      (__v8sf)(__m256)(B), (int)(imm), (__v8sf)_mm256_setzero_ps(), \
      (__mmask8)(A), (int)(R)))
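
/* Scale each element of the first operand by 2^floor of the corresponding
   element of the second operand, with rounding control. */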
#define _mm256_scalef_round_pd(A, B, R) \
  ((__m256d)__builtin_ia32_vscalefpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), \
      (__v4df)_mm256_undefined_pd(), (__mmask8)-1, (int)(R)))

#define _mm256_mask_scalef_round_pd(W, U, A, B, R) \
  ((__m256d)__builtin_ia32_vscalefpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_scalef_round_pd(U, A, B, R) \
  ((__m256d)__builtin_ia32_vscalefpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)_mm256_setzero_pd(), \
      (__mmask8)(U), (int)(R)))

#define _mm256_scalef_round_ph(A, B, R) \
  ((__m256h)__builtin_ia32_vscalefph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), \
      (__v16hf)_mm256_undefined_ph(), (__mmask16)-1, (int)(R)))

#define _mm256_mask_scalef_round_ph(W, U, A, B, R) \
  ((__m256h)__builtin_ia32_vscalefph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(W), \
      (__mmask16)(U), (int)(R)))

#define _mm256_maskz_scalef_round_ph(U, A, B, R) \
  ((__m256h)__builtin_ia32_vscalefph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), \
      (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))

#define _mm256_scalef_round_ps(A, B, R) \
  ((__m256)__builtin_ia32_vscalefps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)_mm256_undefined_ps(), \
      (__mmask8)-1, (int)(R)))

#define _mm256_mask_scalef_round_ps(W, U, A, B, R) \
  ((__m256)__builtin_ia32_vscalefps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_scalef_round_ps(U, A, B, R) \
  ((__m256)__builtin_ia32_vscalefps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)_mm256_setzero_ps(), \
      (__mmask8)(U), (int)(R)))
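
/* Packed square root with explicit rounding control. */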
#define _mm256_sqrt_round_pd(A, R) \
  ((__m256d)__builtin_ia32_vsqrtpd256_round((__v4df)(__m256d)(A), (int)(R)))

#define _mm256_mask_sqrt_round_pd(W, U, A, R) \
  ((__m256d)__builtin_ia32_selectpd_256( \
      (__mmask8)(U), (__v4df)_mm256_sqrt_round_pd((A), (R)), \
      (__v4df)(__m256d)(W)))

#define _mm256_maskz_sqrt_round_pd(U, A, R) \
  ((__m256d)__builtin_ia32_selectpd_256( \
      (__mmask8)(U), (__v4df)_mm256_sqrt_round_pd((A), (R)), \
      (__v4df)_mm256_setzero_pd()))

#define _mm256_sqrt_round_ph(A, R) \
  ((__m256h)__builtin_ia32_vsqrtph256_round((__v16hf)(__m256h)(A), (int)(R)))

#define _mm256_mask_sqrt_round_ph(W, U, A, R) \
  ((__m256h)__builtin_ia32_selectph_256( \
      (__mmask16)(U), (__v16hf)_mm256_sqrt_round_ph((A), (R)), \
      (__v16hf)(__m256h)(W)))

#define _mm256_maskz_sqrt_round_ph(U, A, R) \
  ((__m256h)__builtin_ia32_selectph_256( \
      (__mmask16)(U), (__v16hf)_mm256_sqrt_round_ph((A), (R)), \
      (__v16hf)_mm256_setzero_ph()))

#define _mm256_sqrt_round_ps(A, R) \
  ((__m256)__builtin_ia32_vsqrtps256_round((__v8sf)(__m256)(A), (int)(R)))

#define _mm256_mask_sqrt_round_ps(W, U, A, R) \
  ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
      (__v8sf)_mm256_sqrt_round_ps((A), (R)), \
      (__v8sf)(__m256)(W)))

#define _mm256_maskz_sqrt_round_ps(U, A, R) \
  ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
      (__v8sf)_mm256_sqrt_round_ps((A), (R)), \
      (__v8sf)_mm256_setzero_ps()))
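
/* Packed subtraction with explicit rounding control. */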
#define _mm256_sub_round_pd(A, B, R) \
  ((__m256d)__builtin_ia32_vsubpd256_round((__v4df)(__m256d)(A), \
      (__v4df)(__m256d)(B), (int)(R)))

#define _mm256_mask_sub_round_pd(W, U, A, B, R) \
  ((__m256d)__builtin_ia32_selectpd_256( \
      (__mmask8)(U), (__v4df)_mm256_sub_round_pd((A), (B), (R)), \
      (__v4df)(__m256d)(W)))

#define _mm256_maskz_sub_round_pd(U, A, B, R) \
  ((__m256d)__builtin_ia32_selectpd_256( \
      (__mmask8)(U), (__v4df)_mm256_sub_round_pd((A), (B), (R)), \
      (__v4df)_mm256_setzero_pd()))

#define _mm256_sub_round_ph(A, B, R) \
  ((__m256h)__builtin_ia32_vsubph256_round((__v16hf)(__m256h)(A), \
      (__v16hf)(__m256h)(B), (int)(R)))

#define _mm256_mask_sub_round_ph(W, U, A, B, R) \
  ((__m256h)__builtin_ia32_selectph_256( \
      (__mmask16)(U), (__v16hf)_mm256_sub_round_ph((A), (B), (R)), \
      (__v16hf)(__m256h)(W)))

#define _mm256_maskz_sub_round_ph(U, A, B, R) \
  ((__m256h)__builtin_ia32_selectph_256( \
      (__mmask16)(U), (__v16hf)_mm256_sub_round_ph((A), (B), (R)), \
      (__v16hf)_mm256_setzero_ph()))

#define _mm256_sub_round_ps(A, B, R) \
  ((__m256)__builtin_ia32_vsubps256_round((__v8sf)(__m256)(A), \
      (__v8sf)(__m256)(B), (int)(R)))

#define _mm256_mask_sub_round_ps(W, U, A, B, R) \
  ((__m256)__builtin_ia32_selectps_256( \
      (__mmask8)(U), (__v8sf)_mm256_sub_round_ps((A), (B), (R)), \
      (__v8sf)(__m256)(W)))

#define _mm256_maskz_sub_round_ps(U, A, B, R) \
  ((__m256)__builtin_ia32_selectps_256( \
      (__mmask8)(U), (__v8sf)_mm256_sub_round_ps((A), (B), (R)), \
      (__v8sf)_mm256_setzero_ps()))

#undef __DEFAULT_FN_ATTRS256
#undef __DEFAULT_FN_ATTRS128

#endif /* __AVX10_2NIINTRIN_H */
#endif /* __SSE2__ */