[AMDGPU][AsmParser][NFC] Get rid of custom default operand handlers.
[llvm-project.git] / clang / lib / Headers / avx2intrin.h
blob5346a0209928e05e4f22cb93adafae662284449a
1 /*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 *===-----------------------------------------------------------------------===
8 */
10 #ifndef __IMMINTRIN_H
11 #error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
12 #endif
14 #ifndef __AVX2INTRIN_H
15 #define __AVX2INTRIN_H
/* Define the default attributes for the functions in this file: each
 * intrinsic is always inlined, emits no debug info, is compiled with the
 * "avx2" target feature enabled, and declares the minimum vector width (in
 * bits) that it requires. */
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx2"), __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx2"), __min_vector_width__(128)))
/* SSE4 Multiple Packed Sums of Absolute Difference.
 *
 * X and Y are converted to 256-bit integer vectors and viewed as byte
 * vectors; M is converted to int and forwarded as the control operand of
 * the mpsadbw256 builtin. */
#define _mm256_mpsadbw_epu8(X, Y, M) \
  ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \
                                      (__v32qi)(__m256i)(Y), (int)(M)))
26 static __inline__ __m256i __DEFAULT_FN_ATTRS256
27 _mm256_abs_epi8(__m256i __a)
29 return (__m256i)__builtin_elementwise_abs((__v32qs)__a);
32 static __inline__ __m256i __DEFAULT_FN_ATTRS256
33 _mm256_abs_epi16(__m256i __a)
35 return (__m256i)__builtin_elementwise_abs((__v16hi)__a);
38 static __inline__ __m256i __DEFAULT_FN_ATTRS256
39 _mm256_abs_epi32(__m256i __a)
41 return (__m256i)__builtin_elementwise_abs((__v8si)__a);
44 /// Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit
45 /// integers using signed saturation, and returns the 256-bit result.
46 ///
47 /// \code{.operation}
48 /// FOR i := 0 TO 7
49 /// j := i*16
50 /// k := i*8
51 /// result[7+k:k] := SATURATE8(__a[15+j:j])
52 /// result[71+k:64+k] := SATURATE8(__b[15+j:j])
53 /// result[135+k:128+k] := SATURATE8(__a[143+j:128+j])
54 /// result[199+k:192+k] := SATURATE8(__b[143+j:128+j])
55 /// ENDFOR
56 /// \endcode
57 ///
58 /// \headerfile <immintrin.h>
59 ///
60 /// This intrinsic corresponds to the \c VPACKSSWB instruction.
61 ///
62 /// \param __a
63 /// A 256-bit vector of [16 x i16] used to generate result[63:0] and
64 /// result[191:128].
65 /// \param __b
66 /// A 256-bit vector of [16 x i16] used to generate result[127:64] and
67 /// result[255:192].
68 /// \returns A 256-bit integer vector containing the result.
69 static __inline__ __m256i __DEFAULT_FN_ATTRS256
70 _mm256_packs_epi16(__m256i __a, __m256i __b)
72 return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
75 /// Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit
76 /// integers using signed saturation, and returns the resulting 256-bit
77 /// vector of [16 x i16].
78 ///
79 /// \code{.operation}
80 /// FOR i := 0 TO 3
81 /// j := i*32
82 /// k := i*16
83 /// result[15+k:k] := SATURATE16(__a[31+j:j])
84 /// result[79+k:64+k] := SATURATE16(__b[31+j:j])
85 /// result[143+k:128+k] := SATURATE16(__a[159+j:128+j])
86 /// result[207+k:192+k] := SATURATE16(__b[159+j:128+j])
87 /// ENDFOR
88 /// \endcode
89 ///
90 /// \headerfile <immintrin.h>
91 ///
92 /// This intrinsic corresponds to the \c VPACKSSDW instruction.
93 ///
94 /// \param __a
95 /// A 256-bit vector of [8 x i32] used to generate result[63:0] and
96 /// result[191:128].
97 /// \param __b
98 /// A 256-bit vector of [8 x i32] used to generate result[127:64] and
99 /// result[255:192].
100 /// \returns A 256-bit vector of [16 x i16] containing the result.
101 static __inline__ __m256i __DEFAULT_FN_ATTRS256
102 _mm256_packs_epi32(__m256i __a, __m256i __b)
104 return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
107 /// Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers
108 /// using unsigned saturation, and returns the 256-bit result.
110 /// \code{.operation}
111 /// FOR i := 0 TO 7
112 /// j := i*16
113 /// k := i*8
114 /// result[7+k:k] := SATURATE8U(__a[15+j:j])
115 /// result[71+k:64+k] := SATURATE8U(__b[15+j:j])
116 /// result[135+k:128+k] := SATURATE8U(__a[143+j:128+j])
117 /// result[199+k:192+k] := SATURATE8U(__b[143+j:128+j])
118 /// ENDFOR
119 /// \endcode
121 /// \headerfile <immintrin.h>
123 /// This intrinsic corresponds to the \c VPACKUSWB instruction.
125 /// \param __a
126 /// A 256-bit vector of [16 x i16] used to generate result[63:0] and
127 /// result[191:128].
128 /// \param __b
129 /// A 256-bit vector of [16 x i16] used to generate result[127:64] and
130 /// result[255:192].
131 /// \returns A 256-bit integer vector containing the result.
132 static __inline__ __m256i __DEFAULT_FN_ATTRS256
133 _mm256_packus_epi16(__m256i __a, __m256i __b)
135 return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
138 /// Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers
139 /// using unsigned saturation, and returns the resulting 256-bit vector of
140 /// [16 x i16].
142 /// \code{.operation}
143 /// FOR i := 0 TO 3
144 /// j := i*32
145 /// k := i*16
146 /// result[15+k:k] := SATURATE16U(__V1[31+j:j])
147 /// result[79+k:64+k] := SATURATE16U(__V2[31+j:j])
148 /// result[143+k:128+k] := SATURATE16U(__V1[159+j:128+j])
149 /// result[207+k:192+k] := SATURATE16U(__V2[159+j:128+j])
150 /// ENDFOR
151 /// \endcode
153 /// \headerfile <immintrin.h>
155 /// This intrinsic corresponds to the \c VPACKUSDW instruction.
157 /// \param __V1
158 /// A 256-bit vector of [8 x i32] used to generate result[63:0] and
159 /// result[191:128].
160 /// \param __V2
161 /// A 256-bit vector of [8 x i32] used to generate result[127:64] and
162 /// result[255:192].
163 /// \returns A 256-bit vector of [16 x i16] containing the result.
164 static __inline__ __m256i __DEFAULT_FN_ATTRS256
165 _mm256_packus_epi32(__m256i __V1, __m256i __V2)
167 return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
170 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
171 /// vectors and returns the lower 8 bits of each sum in the corresponding
172 /// byte of the 256-bit integer vector result (overflow is ignored).
174 /// \headerfile <immintrin.h>
176 /// This intrinsic corresponds to the \c VPADDB instruction.
178 /// \param __a
179 /// A 256-bit integer vector containing one of the source operands.
180 /// \param __b
181 /// A 256-bit integer vector containing one of the source operands.
182 /// \returns A 256-bit integer vector containing the sums.
183 static __inline__ __m256i __DEFAULT_FN_ATTRS256
184 _mm256_add_epi8(__m256i __a, __m256i __b)
186 return (__m256i)((__v32qu)__a + (__v32qu)__b);
189 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
190 /// [16 x i16] and returns the lower 16 bits of each sum in the
191 /// corresponding element of the [16 x i16] result (overflow is ignored).
193 /// \headerfile <immintrin.h>
195 /// This intrinsic corresponds to the \c VPADDW instruction.
197 /// \param __a
198 /// A 256-bit vector of [16 x i16] containing one of the source operands.
199 /// \param __b
200 /// A 256-bit vector of [16 x i16] containing one of the source operands.
201 /// \returns A 256-bit vector of [16 x i16] containing the sums.
202 static __inline__ __m256i __DEFAULT_FN_ATTRS256
203 _mm256_add_epi16(__m256i __a, __m256i __b)
205 return (__m256i)((__v16hu)__a + (__v16hu)__b);
208 /// Adds 32-bit integers from corresponding elements of two 256-bit vectors of
209 /// [8 x i32] and returns the lower 32 bits of each sum in the corresponding
210 /// element of the [8 x i32] result (overflow is ignored).
212 /// \headerfile <immintrin.h>
214 /// This intrinsic corresponds to the \c VPADDD instruction.
216 /// \param __a
217 /// A 256-bit vector of [8 x i32] containing one of the source operands.
218 /// \param __b
219 /// A 256-bit vector of [8 x i32] containing one of the source operands.
220 /// \returns A 256-bit vector of [8 x i32] containing the sums.
221 static __inline__ __m256i __DEFAULT_FN_ATTRS256
222 _mm256_add_epi32(__m256i __a, __m256i __b)
224 return (__m256i)((__v8su)__a + (__v8su)__b);
227 /// Adds 64-bit integers from corresponding elements of two 256-bit vectors of
228 /// [4 x i64] and returns the lower 64 bits of each sum in the corresponding
229 /// element of the [4 x i64] result (overflow is ignored).
231 /// \headerfile <immintrin.h>
233 /// This intrinsic corresponds to the \c VPADDQ instruction.
235 /// \param __a
236 /// A 256-bit vector of [4 x i64] containing one of the source operands.
237 /// \param __b
238 /// A 256-bit vector of [4 x i64] containing one of the source operands.
239 /// \returns A 256-bit vector of [4 x i64] containing the sums.
240 static __inline__ __m256i __DEFAULT_FN_ATTRS256
241 _mm256_add_epi64(__m256i __a, __m256i __b)
243 return (__m256i)((__v4du)__a + (__v4du)__b);
246 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
247 /// vectors using signed saturation, and returns each sum in the
248 /// corresponding byte of the 256-bit integer vector result.
250 /// \headerfile <immintrin.h>
252 /// This intrinsic corresponds to the \c VPADDSB instruction.
254 /// \param __a
255 /// A 256-bit integer vector containing one of the source operands.
256 /// \param __b
257 /// A 256-bit integer vector containing one of the source operands.
258 /// \returns A 256-bit integer vector containing the sums.
259 static __inline__ __m256i __DEFAULT_FN_ATTRS256
260 _mm256_adds_epi8(__m256i __a, __m256i __b)
262 return (__m256i)__builtin_elementwise_add_sat((__v32qs)__a, (__v32qs)__b);
265 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
266 /// [16 x i16] using signed saturation, and returns the [16 x i16] result.
268 /// \headerfile <immintrin.h>
270 /// This intrinsic corresponds to the \c VPADDSW instruction.
272 /// \param __a
273 /// A 256-bit vector of [16 x i16] containing one of the source operands.
274 /// \param __b
275 /// A 256-bit vector of [16 x i16] containing one of the source operands.
276 /// \returns A 256-bit vector of [16 x i16] containing the sums.
277 static __inline__ __m256i __DEFAULT_FN_ATTRS256
278 _mm256_adds_epi16(__m256i __a, __m256i __b)
280 return (__m256i)__builtin_elementwise_add_sat((__v16hi)__a, (__v16hi)__b);
283 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
284 /// vectors using unsigned saturation, and returns each sum in the
285 /// corresponding byte of the 256-bit integer vector result.
287 /// \headerfile <immintrin.h>
289 /// This intrinsic corresponds to the \c VPADDUSB instruction.
291 /// \param __a
292 /// A 256-bit integer vector containing one of the source operands.
293 /// \param __b
294 /// A 256-bit integer vector containing one of the source operands.
295 /// \returns A 256-bit integer vector containing the sums.
296 static __inline__ __m256i __DEFAULT_FN_ATTRS256
297 _mm256_adds_epu8(__m256i __a, __m256i __b)
299 return (__m256i)__builtin_elementwise_add_sat((__v32qu)__a, (__v32qu)__b);
302 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
303 /// [16 x i16] using unsigned saturation, and returns the [16 x i16] result.
305 /// \headerfile <immintrin.h>
307 /// This intrinsic corresponds to the \c VPADDUSW instruction.
309 /// \param __a
310 /// A 256-bit vector of [16 x i16] containing one of the source operands.
311 /// \param __b
312 /// A 256-bit vector of [16 x i16] containing one of the source operands.
313 /// \returns A 256-bit vector of [16 x i16] containing the sums.
314 static __inline__ __m256i __DEFAULT_FN_ATTRS256
315 _mm256_adds_epu16(__m256i __a, __m256i __b)
317 return (__m256i)__builtin_elementwise_add_sat((__v16hu)__a, (__v16hu)__b);
320 /// Uses the lower half of the 256-bit vector \a a as the upper half of a
321 /// temporary 256-bit value, and the lower half of the 256-bit vector \a b
322 /// as the lower half of the temporary value. Right-shifts the temporary
323 /// value by \a n bytes, and uses the lower 16 bytes of the shifted value
324 /// as the lower 16 bytes of the result. Uses the upper halves of \a a and
325 /// \a b to make another temporary value, right shifts by \a n, and uses
326 /// the lower 16 bytes of the shifted value as the upper 16 bytes of the
327 /// result.
329 /// \headerfile <immintrin.h>
331 /// \code
332 /// __m256i _mm256_alignr_epi8(__m256i a, __m256i b, const int n);
333 /// \endcode
335 /// This intrinsic corresponds to the \c VPALIGNR instruction.
337 /// \param a
338 /// A 256-bit integer vector containing source values.
339 /// \param b
340 /// A 256-bit integer vector containing source values.
341 /// \param n
342 /// An immediate value specifying the number of bytes to shift.
343 /// \returns A 256-bit integer vector containing the result.
#define _mm256_alignr_epi8(a, b, n) \
  ((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
                                      (__v32qi)(__m256i)(b), (n)))
348 static __inline__ __m256i __DEFAULT_FN_ATTRS256
349 _mm256_and_si256(__m256i __a, __m256i __b)
351 return (__m256i)((__v4du)__a & (__v4du)__b);
354 static __inline__ __m256i __DEFAULT_FN_ATTRS256
355 _mm256_andnot_si256(__m256i __a, __m256i __b)
357 return (__m256i)(~(__v4du)__a & (__v4du)__b);
360 static __inline__ __m256i __DEFAULT_FN_ATTRS256
361 _mm256_avg_epu8(__m256i __a, __m256i __b)
363 return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b);
366 static __inline__ __m256i __DEFAULT_FN_ATTRS256
367 _mm256_avg_epu16(__m256i __a, __m256i __b)
369 return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b);
372 /// Merges 8-bit integer values from either of the two 256-bit vectors
373 /// \a __V1 or \a __V2, as specified by the 256-bit mask \a __M and returns
374 /// the resulting 256-bit integer vector.
376 /// \code{.operation}
377 /// FOR i := 0 TO 31
378 /// j := i*8
379 /// IF __M[7+j] == 0
380 /// result[7+j:j] := __V1[7+j:j]
381 /// ELSE
382 /// result[7+j:j] := __V2[7+j:j]
383 /// FI
384 /// ENDFOR
385 /// \endcode
387 /// \headerfile <immintrin.h>
389 /// This intrinsic corresponds to the \c VPBLENDVB instruction.
391 /// \param __V1
392 /// A 256-bit integer vector containing source values.
393 /// \param __V2
394 /// A 256-bit integer vector containing source values.
395 /// \param __M
396 /// A 256-bit integer vector, with bit [7] of each byte specifying the
397 /// source for each corresponding byte of the result. When the mask bit
398 /// is 0, the byte is copied from \a __V1; otherwise, it is copied from
399 /// \a __V2.
400 /// \returns A 256-bit integer vector containing the result.
401 static __inline__ __m256i __DEFAULT_FN_ATTRS256
402 _mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
404 return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,
405 (__v32qi)__M);
408 /// Merges 16-bit integer values from either of the two 256-bit vectors
409 /// \a V1 or \a V2, as specified by the immediate integer operand \a M,
410 /// and returns the resulting 256-bit vector of [16 x i16].
412 /// \code{.operation}
413 /// FOR i := 0 TO 7
414 /// j := i*16
415 /// IF M[i] == 0
416 /// result[15+j:j] := V1[15+j:j]
417 /// result[143+j:128+j] := V1[143+j:128+j]
418 /// ELSE
419 /// result[15+j:j] := V2[15+j:j]
420 /// result[143+j:128+j] := V2[143+j:128+j]
421 /// FI
422 /// ENDFOR
423 /// \endcode
425 /// \headerfile <immintrin.h>
427 /// \code
428 /// __m256i _mm256_blend_epi16(__m256i V1, __m256i V2, const int M);
429 /// \endcode
431 /// This intrinsic corresponds to the \c VPBLENDW instruction.
433 /// \param V1
434 /// A 256-bit vector of [16 x i16] containing source values.
435 /// \param V2
436 /// A 256-bit vector of [16 x i16] containing source values.
437 /// \param M
438 /// An immediate 8-bit integer operand, with bits [7:0] specifying the
439 /// source for each element of the result. The position of the mask bit
440 /// corresponds to the index of a copied value. When a mask bit is 0, the
441 /// element is copied from \a V1; otherwise, it is copied from \a V2.
442 /// \a M[0] determines the source for elements 0 and 8, \a M[1] for
443 /// elements 1 and 9, and so forth.
444 /// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_blend_epi16(V1, V2, M) \
  ((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \
                                      (__v16hi)(__m256i)(V2), (int)(M)))
449 static __inline__ __m256i __DEFAULT_FN_ATTRS256
450 _mm256_cmpeq_epi8(__m256i __a, __m256i __b)
452 return (__m256i)((__v32qi)__a == (__v32qi)__b);
455 static __inline__ __m256i __DEFAULT_FN_ATTRS256
456 _mm256_cmpeq_epi16(__m256i __a, __m256i __b)
458 return (__m256i)((__v16hi)__a == (__v16hi)__b);
461 static __inline__ __m256i __DEFAULT_FN_ATTRS256
462 _mm256_cmpeq_epi32(__m256i __a, __m256i __b)
464 return (__m256i)((__v8si)__a == (__v8si)__b);
467 static __inline__ __m256i __DEFAULT_FN_ATTRS256
468 _mm256_cmpeq_epi64(__m256i __a, __m256i __b)
470 return (__m256i)((__v4di)__a == (__v4di)__b);
473 static __inline__ __m256i __DEFAULT_FN_ATTRS256
474 _mm256_cmpgt_epi8(__m256i __a, __m256i __b)
476 /* This function always performs a signed comparison, but __v32qi is a char
477 which may be signed or unsigned, so use __v32qs. */
478 return (__m256i)((__v32qs)__a > (__v32qs)__b);
481 static __inline__ __m256i __DEFAULT_FN_ATTRS256
482 _mm256_cmpgt_epi16(__m256i __a, __m256i __b)
484 return (__m256i)((__v16hi)__a > (__v16hi)__b);
487 static __inline__ __m256i __DEFAULT_FN_ATTRS256
488 _mm256_cmpgt_epi32(__m256i __a, __m256i __b)
490 return (__m256i)((__v8si)__a > (__v8si)__b);
493 static __inline__ __m256i __DEFAULT_FN_ATTRS256
494 _mm256_cmpgt_epi64(__m256i __a, __m256i __b)
496 return (__m256i)((__v4di)__a > (__v4di)__b);
499 /// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
500 /// vectors of [16 x i16] and returns the lower 16 bits of each sum in an
501 /// element of the [16 x i16] result (overflow is ignored). Sums from
502 /// \a __a are returned in the lower 64 bits of each 128-bit half of the
503 /// result; sums from \a __b are returned in the upper 64 bits of each
504 /// 128-bit half of the result.
506 /// \code{.operation}
507 /// FOR i := 0 TO 1
508 /// j := i*128
509 /// result[j+15:j] := __a[j+15:j] + __a[j+31:j+16]
510 /// result[j+31:j+16] := __a[j+47:j+32] + __a[j+63:j+48]
511 /// result[j+47:j+32] := __a[j+79:j+64] + __a[j+95:j+80]
512 /// result[j+63:j+48] := __a[j+111:j+96] + __a[j+127:j+112]
513 /// result[j+79:j+64] := __b[j+15:j] + __b[j+31:j+16]
514 /// result[j+95:j+80] := __b[j+47:j+32] + __b[j+63:j+48]
515 /// result[j+111:j+96] := __b[j+79:j+64] + __b[j+95:j+80]
516 /// result[j+127:j+112] := __b[j+111:j+96] + __b[j+127:j+112]
517 /// ENDFOR
518 /// \endcode
520 /// \headerfile <immintrin.h>
522 /// This intrinsic corresponds to the \c VPHADDW instruction.
524 /// \param __a
525 /// A 256-bit vector of [16 x i16] containing one of the source operands.
526 /// \param __b
527 /// A 256-bit vector of [16 x i16] containing one of the source operands.
528 /// \returns A 256-bit vector of [16 x i16] containing the sums.
529 static __inline__ __m256i __DEFAULT_FN_ATTRS256
530 _mm256_hadd_epi16(__m256i __a, __m256i __b)
532 return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
535 /// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit
536 /// vectors of [8 x i32] and returns the lower 32 bits of each sum in an
537 /// element of the [8 x i32] result (overflow is ignored). Sums from \a __a
538 /// are returned in the lower 64 bits of each 128-bit half of the result;
539 /// sums from \a __b are returned in the upper 64 bits of each 128-bit half
540 /// of the result.
542 /// \code{.operation}
543 /// FOR i := 0 TO 1
544 /// j := i*128
545 /// result[j+31:j] := __a[j+31:j] + __a[j+63:j+32]
546 /// result[j+63:j+32] := __a[j+95:j+64] + __a[j+127:j+96]
547 /// result[j+95:j+64] := __b[j+31:j] + __b[j+63:j+32]
548 /// result[j+127:j+96] := __b[j+95:j+64] + __b[j+127:j+96]
549 /// ENDFOR
550 /// \endcode
552 /// \headerfile <immintrin.h>
554 /// This intrinsic corresponds to the \c VPHADDD instruction.
556 /// \param __a
557 /// A 256-bit vector of [8 x i32] containing one of the source operands.
558 /// \param __b
559 /// A 256-bit vector of [8 x i32] containing one of the source operands.
560 /// \returns A 256-bit vector of [8 x i32] containing the sums.
561 static __inline__ __m256i __DEFAULT_FN_ATTRS256
562 _mm256_hadd_epi32(__m256i __a, __m256i __b)
564 return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
567 /// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
568 /// vectors of [16 x i16] using signed saturation and returns each sum in
569 /// an element of the [16 x i16] result. Sums from \a __a are returned in
570 /// the lower 64 bits of each 128-bit half of the result; sums from \a __b
571 /// are returned in the upper 64 bits of each 128-bit half of the result.
573 /// \code{.operation}
574 /// FOR i := 0 TO 1
575 /// j := i*128
576 /// result[j+15:j] := SATURATE16(__a[j+15:j] + __a[j+31:j+16])
577 /// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] + __a[j+63:j+48])
578 /// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] + __a[j+95:j+80])
579 /// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] + __a[j+127:j+112])
580 /// result[j+79:j+64] := SATURATE16(__b[j+15:j] + __b[j+31:j+16])
581 /// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] + __b[j+63:j+48])
582 /// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] + __b[j+95:j+80])
583 /// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] + __b[j+127:j+112])
584 /// ENDFOR
585 /// \endcode
587 /// \headerfile <immintrin.h>
589 /// This intrinsic corresponds to the \c VPHADDSW instruction.
591 /// \param __a
592 /// A 256-bit vector of [16 x i16] containing one of the source operands.
593 /// \param __b
594 /// A 256-bit vector of [16 x i16] containing one of the source operands.
595 /// \returns A 256-bit vector of [16 x i16] containing the sums.
596 static __inline__ __m256i __DEFAULT_FN_ATTRS256
597 _mm256_hadds_epi16(__m256i __a, __m256i __b)
599 return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
602 /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
603 /// vectors of [16 x i16] and returns the lower 16 bits of each difference
604 /// in an element of the [16 x i16] result (overflow is ignored).
605 /// Differences from \a __a are returned in the lower 64 bits of each
606 /// 128-bit half of the result; differences from \a __b are returned in the
607 /// upper 64 bits of each 128-bit half of the result.
609 /// \code{.operation}
610 /// FOR i := 0 TO 1
611 /// j := i*128
612 /// result[j+15:j] := __a[j+15:j] - __a[j+31:j+16]
613 /// result[j+31:j+16] := __a[j+47:j+32] - __a[j+63:j+48]
614 /// result[j+47:j+32] := __a[j+79:j+64] - __a[j+95:j+80]
615 /// result[j+63:j+48] := __a[j+111:j+96] - __a[j+127:j+112]
616 /// result[j+79:j+64] := __b[j+15:j] - __b[j+31:j+16]
617 /// result[j+95:j+80] := __b[j+47:j+32] - __b[j+63:j+48]
618 /// result[j+111:j+96] := __b[j+79:j+64] - __b[j+95:j+80]
619 /// result[j+127:j+112] := __b[j+111:j+96] - __b[j+127:j+112]
620 /// ENDFOR
621 /// \endcode
623 /// \headerfile <immintrin.h>
625 /// This intrinsic corresponds to the \c VPHSUBW instruction.
627 /// \param __a
628 /// A 256-bit vector of [16 x i16] containing one of the source operands.
629 /// \param __b
630 /// A 256-bit vector of [16 x i16] containing one of the source operands.
631 /// \returns A 256-bit vector of [16 x i16] containing the differences.
632 static __inline__ __m256i __DEFAULT_FN_ATTRS256
633 _mm256_hsub_epi16(__m256i __a, __m256i __b)
635 return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
638 /// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit
639 /// vectors of [8 x i32] and returns the lower 32 bits of each difference in
640 /// an element of the [8 x i32] result (overflow is ignored). Differences
641 /// from \a __a are returned in the lower 64 bits of each 128-bit half of
642 /// the result; differences from \a __b are returned in the upper 64 bits
643 /// of each 128-bit half of the result.
645 /// \code{.operation}
646 /// FOR i := 0 TO 1
647 /// j := i*128
648 /// result[j+31:j] := __a[j+31:j] - __a[j+63:j+32]
649 /// result[j+63:j+32] := __a[j+95:j+64] - __a[j+127:j+96]
650 /// result[j+95:j+64] := __b[j+31:j] - __b[j+63:j+32]
651 /// result[j+127:j+96] := __b[j+95:j+64] - __b[j+127:j+96]
652 /// ENDFOR
653 /// \endcode
655 /// \headerfile <immintrin.h>
657 /// This intrinsic corresponds to the \c VPHSUBD instruction.
659 /// \param __a
660 /// A 256-bit vector of [8 x i32] containing one of the source operands.
661 /// \param __b
662 /// A 256-bit vector of [8 x i32] containing one of the source operands.
663 /// \returns A 256-bit vector of [8 x i32] containing the differences.
664 static __inline__ __m256i __DEFAULT_FN_ATTRS256
665 _mm256_hsub_epi32(__m256i __a, __m256i __b)
667 return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
670 /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
671 /// vectors of [16 x i16] using signed saturation and returns each difference in
672 /// an element of the [16 x i16] result. Differences from \a __a are
673 /// returned in the lower 64 bits of each 128-bit half of the result;
674 /// differences from \a __b are returned in the upper 64 bits of each
675 /// 128-bit half of the result.
677 /// \code{.operation}
678 /// FOR i := 0 TO 1
679 /// j := i*128
680 /// result[j+15:j] := SATURATE16(__a[j+15:j] - __a[j+31:j+16])
681 /// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] - __a[j+63:j+48])
682 /// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] - __a[j+95:j+80])
683 /// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] - __a[j+127:j+112])
684 /// result[j+79:j+64] := SATURATE16(__b[j+15:j] - __b[j+31:j+16])
685 /// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] - __b[j+63:j+48])
686 /// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] - __b[j+95:j+80])
687 /// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] - __b[j+127:j+112])
688 /// ENDFOR
689 /// \endcode
691 /// \headerfile <immintrin.h>
693 /// This intrinsic corresponds to the \c VPHSUBSW instruction.
695 /// \param __a
696 /// A 256-bit vector of [16 x i16] containing one of the source operands.
697 /// \param __b
698 /// A 256-bit vector of [16 x i16] containing one of the source operands.
699 /// \returns A 256-bit vector of [16 x i16] containing the differences.
700 static __inline__ __m256i __DEFAULT_FN_ATTRS256
701 _mm256_hsubs_epi16(__m256i __a, __m256i __b)
703 return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
706 /// Multiplies each unsigned byte from the 256-bit integer vector in \a __a
707 /// with the corresponding signed byte from the 256-bit integer vector in
708 /// \a __b, forming signed 16-bit intermediate products. Adds adjacent
709 /// pairs of those products using signed saturation to form 16-bit sums
710 /// returned as elements of the [16 x i16] result.
712 /// \code{.operation}
713 /// FOR i := 0 TO 15
714 /// j := i*16
715 /// temp1 := __a[j+7:j] * __b[j+7:j]
716 /// temp2 := __a[j+15:j+8] * __b[j+15:j+8]
717 /// result[j+15:j] := SATURATE16(temp1 + temp2)
718 /// ENDFOR
719 /// \endcode
721 /// \headerfile <immintrin.h>
723 /// This intrinsic corresponds to the \c VPMADDUBSW instruction.
725 /// \param __a
726 /// A 256-bit vector containing one of the source operands.
727 /// \param __b
728 /// A 256-bit vector containing one of the source operands.
729 /// \returns A 256-bit vector of [16 x i16] containing the result.
730 static __inline__ __m256i __DEFAULT_FN_ATTRS256
731 _mm256_maddubs_epi16(__m256i __a, __m256i __b)
733 return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
736 /// Multiplies corresponding 16-bit elements of two 256-bit vectors of
737 /// [16 x i16], forming 32-bit intermediate products, and adds pairs of
738 /// those products to form 32-bit sums returned as elements of the
739 /// [8 x i32] result.
741 /// There is only one wraparound case: when all four of the 16-bit sources
742 /// are \c 0x8000, the result will be \c 0x80000000.
744 /// \code{.operation}
745 /// FOR i := 0 TO 7
746 /// j := i*32
747 /// temp1 := __a[j+15:j] * __b[j+15:j]
748 /// temp2 := __a[j+31:j+16] * __b[j+31:j+16]
749 /// result[j+31:j] := temp1 + temp2
750 /// ENDFOR
751 /// \endcode
753 /// \headerfile <immintrin.h>
755 /// This intrinsic corresponds to the \c VPMADDWD instruction.
757 /// \param __a
758 /// A 256-bit vector of [16 x i16] containing one of the source operands.
759 /// \param __b
760 /// A 256-bit vector of [16 x i16] containing one of the source operands.
761 /// \returns A 256-bit vector of [8 x i32] containing the result.
762 static __inline__ __m256i __DEFAULT_FN_ATTRS256
763 _mm256_madd_epi16(__m256i __a, __m256i __b)
765 return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
768 static __inline__ __m256i __DEFAULT_FN_ATTRS256
769 _mm256_max_epi8(__m256i __a, __m256i __b)
771 return (__m256i)__builtin_elementwise_max((__v32qs)__a, (__v32qs)__b);
774 static __inline__ __m256i __DEFAULT_FN_ATTRS256
775 _mm256_max_epi16(__m256i __a, __m256i __b)
777 return (__m256i)__builtin_elementwise_max((__v16hi)__a, (__v16hi)__b);
780 static __inline__ __m256i __DEFAULT_FN_ATTRS256
781 _mm256_max_epi32(__m256i __a, __m256i __b)
783 return (__m256i)__builtin_elementwise_max((__v8si)__a, (__v8si)__b);
786 static __inline__ __m256i __DEFAULT_FN_ATTRS256
787 _mm256_max_epu8(__m256i __a, __m256i __b)
789 return (__m256i)__builtin_elementwise_max((__v32qu)__a, (__v32qu)__b);
792 static __inline__ __m256i __DEFAULT_FN_ATTRS256
793 _mm256_max_epu16(__m256i __a, __m256i __b)
795 return (__m256i)__builtin_elementwise_max((__v16hu)__a, (__v16hu)__b);
798 static __inline__ __m256i __DEFAULT_FN_ATTRS256
799 _mm256_max_epu32(__m256i __a, __m256i __b)
801 return (__m256i)__builtin_elementwise_max((__v8su)__a, (__v8su)__b);
804 static __inline__ __m256i __DEFAULT_FN_ATTRS256
805 _mm256_min_epi8(__m256i __a, __m256i __b)
807 return (__m256i)__builtin_elementwise_min((__v32qs)__a, (__v32qs)__b);
810 static __inline__ __m256i __DEFAULT_FN_ATTRS256
811 _mm256_min_epi16(__m256i __a, __m256i __b)
813 return (__m256i)__builtin_elementwise_min((__v16hi)__a, (__v16hi)__b);
816 static __inline__ __m256i __DEFAULT_FN_ATTRS256
817 _mm256_min_epi32(__m256i __a, __m256i __b)
819 return (__m256i)__builtin_elementwise_min((__v8si)__a, (__v8si)__b);
822 static __inline__ __m256i __DEFAULT_FN_ATTRS256
823 _mm256_min_epu8(__m256i __a, __m256i __b)
825 return (__m256i)__builtin_elementwise_min((__v32qu)__a, (__v32qu)__b);
828 static __inline__ __m256i __DEFAULT_FN_ATTRS256
829 _mm256_min_epu16(__m256i __a, __m256i __b)
831 return (__m256i)__builtin_elementwise_min((__v16hu)__a, (__v16hu)__b);
834 static __inline__ __m256i __DEFAULT_FN_ATTRS256
835 _mm256_min_epu32(__m256i __a, __m256i __b)
837 return (__m256i)__builtin_elementwise_min((__v8su)__a, (__v8su)__b);
840 static __inline__ int __DEFAULT_FN_ATTRS256
841 _mm256_movemask_epi8(__m256i __a)
843 return __builtin_ia32_pmovmskb256((__v32qi)__a);
846 static __inline__ __m256i __DEFAULT_FN_ATTRS256
847 _mm256_cvtepi8_epi16(__m128i __V)
849 /* This function always performs a signed extension, but __v16qi is a char
850 which may be signed or unsigned, so use __v16qs. */
851 return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi);
854 static __inline__ __m256i __DEFAULT_FN_ATTRS256
855 _mm256_cvtepi8_epi32(__m128i __V)
857 /* This function always performs a signed extension, but __v16qi is a char
858 which may be signed or unsigned, so use __v16qs. */
859 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
862 static __inline__ __m256i __DEFAULT_FN_ATTRS256
863 _mm256_cvtepi8_epi64(__m128i __V)
865 /* This function always performs a signed extension, but __v16qi is a char
866 which may be signed or unsigned, so use __v16qs. */
867 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di);
870 static __inline__ __m256i __DEFAULT_FN_ATTRS256
871 _mm256_cvtepi16_epi32(__m128i __V)
873 return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si);
876 static __inline__ __m256i __DEFAULT_FN_ATTRS256
877 _mm256_cvtepi16_epi64(__m128i __V)
879 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di);
882 static __inline__ __m256i __DEFAULT_FN_ATTRS256
883 _mm256_cvtepi32_epi64(__m128i __V)
885 return (__m256i)__builtin_convertvector((__v4si)__V, __v4di);
888 static __inline__ __m256i __DEFAULT_FN_ATTRS256
889 _mm256_cvtepu8_epi16(__m128i __V)
891 return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi);
894 static __inline__ __m256i __DEFAULT_FN_ATTRS256
895 _mm256_cvtepu8_epi32(__m128i __V)
897 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
900 static __inline__ __m256i __DEFAULT_FN_ATTRS256
901 _mm256_cvtepu8_epi64(__m128i __V)
903 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di);
906 static __inline__ __m256i __DEFAULT_FN_ATTRS256
907 _mm256_cvtepu16_epi32(__m128i __V)
909 return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si);
912 static __inline__ __m256i __DEFAULT_FN_ATTRS256
913 _mm256_cvtepu16_epi64(__m128i __V)
915 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di);
918 static __inline__ __m256i __DEFAULT_FN_ATTRS256
919 _mm256_cvtepu32_epi64(__m128i __V)
921 return (__m256i)__builtin_convertvector((__v4su)__V, __v4di);
924 /// Multiplies signed 32-bit integers from even-numbered elements of two
925 /// 256-bit vectors of [8 x i32] and returns the 64-bit products in the
926 /// [4 x i64] result.
928 /// \code{.operation}
929 /// result[63:0] := __a[31:0] * __b[31:0]
930 /// result[127:64] := __a[95:64] * __b[95:64]
931 /// result[191:128] := __a[159:128] * __b[159:128]
932 /// result[255:192] := __a[223:192] * __b[223:192]
933 /// \endcode
935 /// \headerfile <immintrin.h>
937 /// This intrinsic corresponds to the \c VPMULDQ instruction.
939 /// \param __a
940 /// A 256-bit vector of [8 x i32] containing one of the source operands.
941 /// \param __b
942 /// A 256-bit vector of [8 x i32] containing one of the source operands.
943 /// \returns A 256-bit vector of [4 x i64] containing the products.
944 static __inline__ __m256i __DEFAULT_FN_ATTRS256
945 _mm256_mul_epi32(__m256i __a, __m256i __b)
947 return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b);
950 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
951 /// [16 x i16], truncates the 32-bit results to the most significant 18
952 /// bits, rounds by adding 1, and returns bits [16:1] of each rounded
953 /// product in the [16 x i16] result.
955 /// \code{.operation}
956 /// FOR i := 0 TO 15
957 /// j := i*16
958 /// temp := ((__a[j+15:j] * __b[j+15:j]) >> 14) + 1
/// result[j+15:j] := temp[16:1]
/// ENDFOR
960 /// \endcode
962 /// \headerfile <immintrin.h>
964 /// This intrinsic corresponds to the \c VPMULHRSW instruction.
966 /// \param __a
967 /// A 256-bit vector of [16 x i16] containing one of the source operands.
968 /// \param __b
969 /// A 256-bit vector of [16 x i16] containing one of the source operands.
970 /// \returns A 256-bit vector of [16 x i16] containing the rounded products.
971 static __inline__ __m256i __DEFAULT_FN_ATTRS256
972 _mm256_mulhrs_epi16(__m256i __a, __m256i __b)
974 return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b);
977 /// Multiplies unsigned 16-bit integer elements of two 256-bit vectors of
978 /// [16 x i16], and returns the upper 16 bits of each 32-bit product in the
979 /// [16 x i16] result.
981 /// \headerfile <immintrin.h>
983 /// This intrinsic corresponds to the \c VPMULHUW instruction.
985 /// \param __a
986 /// A 256-bit vector of [16 x i16] containing one of the source operands.
987 /// \param __b
988 /// A 256-bit vector of [16 x i16] containing one of the source operands.
989 /// \returns A 256-bit vector of [16 x i16] containing the products.
990 static __inline__ __m256i __DEFAULT_FN_ATTRS256
991 _mm256_mulhi_epu16(__m256i __a, __m256i __b)
993 return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b);
996 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
997 /// [16 x i16], and returns the upper 16 bits of each 32-bit product in the
998 /// [16 x i16] result.
1000 /// \headerfile <immintrin.h>
1002 /// This intrinsic corresponds to the \c VPMULHW instruction.
1004 /// \param __a
1005 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1006 /// \param __b
1007 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1008 /// \returns A 256-bit vector of [16 x i16] containing the products.
1009 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1010 _mm256_mulhi_epi16(__m256i __a, __m256i __b)
1012 return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b);
1015 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1016 /// [16 x i16], and returns the lower 16 bits of each 32-bit product in the
1017 /// [16 x i16] result.
1019 /// \headerfile <immintrin.h>
1021 /// This intrinsic corresponds to the \c VPMULLW instruction.
1023 /// \param __a
1024 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1025 /// \param __b
1026 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1027 /// \returns A 256-bit vector of [16 x i16] containing the products.
1028 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1029 _mm256_mullo_epi16(__m256i __a, __m256i __b)
1031 return (__m256i)((__v16hu)__a * (__v16hu)__b);
1034 /// Multiplies signed 32-bit integer elements of two 256-bit vectors of
1035 /// [8 x i32], and returns the lower 32 bits of each 64-bit product in the
1036 /// [8 x i32] result.
1038 /// \headerfile <immintrin.h>
1040 /// This intrinsic corresponds to the \c VPMULLD instruction.
1042 /// \param __a
1043 /// A 256-bit vector of [8 x i32] containing one of the source operands.
1044 /// \param __b
1045 /// A 256-bit vector of [8 x i32] containing one of the source operands.
1046 /// \returns A 256-bit vector of [8 x i32] containing the products.
1047 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1048 _mm256_mullo_epi32 (__m256i __a, __m256i __b)
1050 return (__m256i)((__v8su)__a * (__v8su)__b);
/// Multiplies unsigned 32-bit integers from even-numbered elements of two
1054 /// 256-bit vectors of [8 x i32] and returns the 64-bit products in the
1055 /// [4 x i64] result.
1057 /// \code{.operation}
1058 /// result[63:0] := __a[31:0] * __b[31:0]
1059 /// result[127:64] := __a[95:64] * __b[95:64]
1060 /// result[191:128] := __a[159:128] * __b[159:128]
1061 /// result[255:192] := __a[223:192] * __b[223:192]
1062 /// \endcode
1064 /// \headerfile <immintrin.h>
1066 /// This intrinsic corresponds to the \c VPMULUDQ instruction.
1068 /// \param __a
1069 /// A 256-bit vector of [8 x i32] containing one of the source operands.
1070 /// \param __b
1071 /// A 256-bit vector of [8 x i32] containing one of the source operands.
1072 /// \returns A 256-bit vector of [4 x i64] containing the products.
1073 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1074 _mm256_mul_epu32(__m256i __a, __m256i __b)
1076 return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b);
1079 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1080 _mm256_or_si256(__m256i __a, __m256i __b)
1082 return (__m256i)((__v4du)__a | (__v4du)__b);
1085 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1086 _mm256_sad_epu8(__m256i __a, __m256i __b)
1088 return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b);
1091 /// Shuffles 8-bit integers in the 256-bit integer vector \a __a according
1092 /// to control information in the 256-bit integer vector \a __b, and
1093 /// returns the 256-bit result. In effect there are two separate 128-bit
1094 /// shuffles in the lower and upper halves.
1096 /// \code{.operation}
1097 /// FOR i := 0 TO 31
1098 /// j := i*8
1099 /// IF __b[j+7] == 1
1100 /// result[j+7:j] := 0
1101 /// ELSE
1102 /// k := __b[j+3:j] * 8
1103 /// IF i > 15
1104 /// k := k + 128
1105 /// FI
1106 /// result[j+7:j] := __a[k+7:k]
1107 /// FI
1108 /// ENDFOR
1109 /// \endcode
1111 /// \headerfile <immintrin.h>
1113 /// This intrinsic corresponds to the \c VPSHUFB instruction.
1115 /// \param __a
1116 /// A 256-bit integer vector containing source values.
1117 /// \param __b
1118 /// A 256-bit integer vector containing control information to determine
1119 /// what goes into the corresponding byte of the result. If bit 7 of the
1120 /// control byte is 1, the result byte is 0; otherwise, bits 3:0 of the
1121 /// control byte specify the index (within the same 128-bit half) of \a __a
1122 /// to copy to the result byte.
1123 /// \returns A 256-bit integer vector containing the result.
1124 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1125 _mm256_shuffle_epi8(__m256i __a, __m256i __b)
1127 return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
1130 /// Shuffles 32-bit integers from the 256-bit vector of [8 x i32] in \a a
1131 /// according to control information in the integer literal \a imm, and
1132 /// returns the 256-bit result. In effect there are two parallel 128-bit
1133 /// shuffles in the lower and upper halves.
1135 /// \code{.operation}
/// FOR i := 0 TO 3
1137 /// j := i*32
1138 /// k := (imm >> i*2)[1:0] * 32
1139 /// result[j+31:j] := a[k+31:k]
1140 /// result[128+j+31:128+j] := a[128+k+31:128+k]
1141 /// ENDFOR
1142 /// \endcode
1144 /// \headerfile <immintrin.h>
1146 /// \code
1147 /// __m256i _mm256_shuffle_epi32(__m256i a, const int imm);
1148 /// \endcode
/// This intrinsic corresponds to the \c VPSHUFD instruction.
1152 /// \param a
1153 /// A 256-bit vector of [8 x i32] containing source values.
1154 /// \param imm
1155 /// An immediate 8-bit value specifying which elements to copy from \a a.
1156 /// \a imm[1:0] specifies the index in \a a for elements 0 and 4 of the
1157 /// result, \a imm[3:2] specifies the index for elements 1 and 5, and so
1158 /// forth.
1159 /// \returns A 256-bit vector of [8 x i32] containing the result.
#define _mm256_shuffle_epi32(a, imm) \
  ((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm)))
1163 /// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a
1164 /// according to control information in the integer literal \a imm, and
1165 /// returns the 256-bit result. The upper 64 bits of each 128-bit half
1166 /// are shuffled in parallel; the lower 64 bits of each 128-bit half are
1167 /// copied from \a a unchanged.
1169 /// \code{.operation}
1170 /// result[63:0] := a[63:0]
1171 /// result[191:128] := a[191:128]
1172 /// FOR i := 0 TO 3
1173 /// j := i * 16 + 64
1174 /// k := (imm >> i*2)[1:0] * 16 + 64
1175 /// result[j+15:j] := a[k+15:k]
1176 /// result[128+j+15:128+j] := a[128+k+15:128+k]
1177 /// ENDFOR
1178 /// \endcode
1180 /// \headerfile <immintrin.h>
1182 /// \code
1183 /// __m256i _mm256_shufflehi_epi16(__m256i a, const int imm);
1184 /// \endcode
1186 /// This intrinsic corresponds to the \c VPSHUFHW instruction.
1188 /// \param a
1189 /// A 256-bit vector of [16 x i16] containing source values.
1190 /// \param imm
1191 /// An immediate 8-bit value specifying which elements to copy from \a a.
/// \a imm[1:0] specifies the index in \a a for elements 4 and 12 of the
/// result, \a imm[3:2] specifies the index for elements 5 and 13, and so
/// forth. Indexes are relative to the same 128-bit half and offset by 4
/// (so 0 means index 4 of that half, and so forth).
1195 /// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_shufflehi_epi16(a, imm) \
  ((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm)))
1199 /// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] \a a
1200 /// according to control information in the integer literal \a imm, and
1201 /// returns the 256-bit [16 x i16] result. The lower 64 bits of each
1202 /// 128-bit half are shuffled; the upper 64 bits of each 128-bit half are
1203 /// copied from \a a unchanged.
1205 /// \code{.operation}
1206 /// result[127:64] := a[127:64]
1207 /// result[255:192] := a[255:192]
1208 /// FOR i := 0 TO 3
1209 /// j := i * 16
1210 /// k := (imm >> i*2)[1:0] * 16
1211 /// result[j+15:j] := a[k+15:k]
1212 /// result[128+j+15:128+j] := a[128+k+15:128+k]
1213 /// ENDFOR
1214 /// \endcode
1216 /// \headerfile <immintrin.h>
1218 /// \code
1219 /// __m256i _mm256_shufflelo_epi16(__m256i a, const int imm);
1220 /// \endcode
1222 /// This intrinsic corresponds to the \c VPSHUFLW instruction.
1224 /// \param a
1225 /// A 256-bit vector of [16 x i16] to use as a source of data for the
1226 /// result.
1227 /// \param imm
1228 /// An immediate 8-bit value specifying which elements to copy from \a a.
1229 /// \a imm[1:0] specifies the index in \a a for elements 0 and 8 of the
1230 /// result, \a imm[3:2] specifies the index for elements 1 and 9, and so
1231 /// forth.
1232 /// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_shufflelo_epi16(a, imm) \
  ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm)))
1236 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1237 _mm256_sign_epi8(__m256i __a, __m256i __b)
1239 return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b);
1242 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1243 _mm256_sign_epi16(__m256i __a, __m256i __b)
1245 return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);
1248 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1249 _mm256_sign_epi32(__m256i __a, __m256i __b)
1251 return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
1254 /// Shifts each 128-bit half of the 256-bit integer vector \a a left by
1255 /// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
1256 /// is greater than 15, the returned result is all zeroes.
1258 /// \headerfile <immintrin.h>
1260 /// \code
1261 /// __m256i _mm256_slli_si256(__m256i a, const int imm);
1262 /// \endcode
1264 /// This intrinsic corresponds to the \c VPSLLDQ instruction.
1266 /// \param a
1267 /// A 256-bit integer vector to be shifted.
1268 /// \param imm
1269 /// An unsigned immediate value specifying the shift count (in bytes).
1270 /// \returns A 256-bit integer vector containing the result.
#define _mm256_slli_si256(a, imm) \
  ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
1274 /// Shifts each 128-bit half of the 256-bit integer vector \a a left by
1275 /// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
1276 /// is greater than 15, the returned result is all zeroes.
1278 /// \headerfile <immintrin.h>
1280 /// \code
1281 /// __m256i _mm256_bslli_epi128(__m256i a, const int imm);
1282 /// \endcode
1284 /// This intrinsic corresponds to the \c VPSLLDQ instruction.
1286 /// \param a
1287 /// A 256-bit integer vector to be shifted.
1288 /// \param imm
1289 /// An unsigned immediate value specifying the shift count (in bytes).
1290 /// \returns A 256-bit integer vector containing the result.
#define _mm256_bslli_epi128(a, imm) \
  ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
1294 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
1295 /// left by \a __count bits, shifting in zero bits, and returns the result.
1296 /// If \a __count is greater than 15, the returned result is all zeroes.
1298 /// \headerfile <immintrin.h>
1300 /// This intrinsic corresponds to the \c VPSLLW instruction.
1302 /// \param __a
1303 /// A 256-bit vector of [16 x i16] to be shifted.
1304 /// \param __count
1305 /// An unsigned integer value specifying the shift count (in bits).
1306 /// \returns A 256-bit vector of [16 x i16] containing the result.
1307 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1308 _mm256_slli_epi16(__m256i __a, int __count)
1310 return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count);
1313 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
1314 /// left by the number of bits specified by the lower 64 bits of \a __count,
1315 /// shifting in zero bits, and returns the result. If \a __count is greater
1316 /// than 15, the returned result is all zeroes.
1318 /// \headerfile <immintrin.h>
1320 /// This intrinsic corresponds to the \c VPSLLW instruction.
1322 /// \param __a
1323 /// A 256-bit vector of [16 x i16] to be shifted.
1324 /// \param __count
1325 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
1326 /// shift count (in bits). The upper element is ignored.
1327 /// \returns A 256-bit vector of [16 x i16] containing the result.
1328 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1329 _mm256_sll_epi16(__m256i __a, __m128i __count)
1331 return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count);
1334 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
1335 /// left by \a __count bits, shifting in zero bits, and returns the result.
1336 /// If \a __count is greater than 31, the returned result is all zeroes.
1338 /// \headerfile <immintrin.h>
1340 /// This intrinsic corresponds to the \c VPSLLD instruction.
1342 /// \param __a
1343 /// A 256-bit vector of [8 x i32] to be shifted.
1344 /// \param __count
1345 /// An unsigned integer value specifying the shift count (in bits).
1346 /// \returns A 256-bit vector of [8 x i32] containing the result.
1347 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1348 _mm256_slli_epi32(__m256i __a, int __count)
1350 return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count);
1353 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
1354 /// left by the number of bits given in the lower 64 bits of \a __count,
1355 /// shifting in zero bits, and returns the result. If \a __count is greater
1356 /// than 31, the returned result is all zeroes.
1358 /// \headerfile <immintrin.h>
1360 /// This intrinsic corresponds to the \c VPSLLD instruction.
1362 /// \param __a
1363 /// A 256-bit vector of [8 x i32] to be shifted.
1364 /// \param __count
1365 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
1366 /// shift count (in bits). The upper element is ignored.
1367 /// \returns A 256-bit vector of [8 x i32] containing the result.
1368 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1369 _mm256_sll_epi32(__m256i __a, __m128i __count)
1371 return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count);
1374 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
1375 /// left by \a __count bits, shifting in zero bits, and returns the result.
1376 /// If \a __count is greater than 63, the returned result is all zeroes.
1378 /// \headerfile <immintrin.h>
1380 /// This intrinsic corresponds to the \c VPSLLQ instruction.
1382 /// \param __a
1383 /// A 256-bit vector of [4 x i64] to be shifted.
1384 /// \param __count
1385 /// An unsigned integer value specifying the shift count (in bits).
1386 /// \returns A 256-bit vector of [4 x i64] containing the result.
1387 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1388 _mm256_slli_epi64(__m256i __a, int __count)
1390 return __builtin_ia32_psllqi256((__v4di)__a, __count);
1393 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
1394 /// left by the number of bits given in the lower 64 bits of \a __count,
1395 /// shifting in zero bits, and returns the result. If \a __count is greater
1396 /// than 63, the returned result is all zeroes.
1398 /// \headerfile <immintrin.h>
1400 /// This intrinsic corresponds to the \c VPSLLQ instruction.
1402 /// \param __a
1403 /// A 256-bit vector of [4 x i64] to be shifted.
1404 /// \param __count
1405 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
1406 /// shift count (in bits). The upper element is ignored.
1407 /// \returns A 256-bit vector of [4 x i64] containing the result.
1408 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1409 _mm256_sll_epi64(__m256i __a, __m128i __count)
1411 return __builtin_ia32_psllq256((__v4di)__a, __count);
1414 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
1415 /// right by \a __count bits, shifting in sign bits, and returns the result.
1416 /// If \a __count is greater than 15, each element of the result is either
1417 /// 0 or -1 according to the corresponding input sign bit.
1419 /// \headerfile <immintrin.h>
1421 /// This intrinsic corresponds to the \c VPSRAW instruction.
1423 /// \param __a
1424 /// A 256-bit vector of [16 x i16] to be shifted.
1425 /// \param __count
1426 /// An unsigned integer value specifying the shift count (in bits).
1427 /// \returns A 256-bit vector of [16 x i16] containing the result.
1428 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1429 _mm256_srai_epi16(__m256i __a, int __count)
1431 return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count);
1434 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
1435 /// right by the number of bits given in the lower 64 bits of \a __count,
1436 /// shifting in sign bits, and returns the result. If \a __count is greater
1437 /// than 15, each element of the result is either 0 or -1 according to the
1438 /// corresponding input sign bit.
1440 /// \headerfile <immintrin.h>
1442 /// This intrinsic corresponds to the \c VPSRAW instruction.
1444 /// \param __a
1445 /// A 256-bit vector of [16 x i16] to be shifted.
1446 /// \param __count
1447 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
1448 /// shift count (in bits). The upper element is ignored.
1449 /// \returns A 256-bit vector of [16 x i16] containing the result.
1450 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1451 _mm256_sra_epi16(__m256i __a, __m128i __count)
1453 return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count);
1456 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
1457 /// right by \a __count bits, shifting in sign bits, and returns the result.
1458 /// If \a __count is greater than 31, each element of the result is either
1459 /// 0 or -1 according to the corresponding input sign bit.
1461 /// \headerfile <immintrin.h>
1463 /// This intrinsic corresponds to the \c VPSRAD instruction.
1465 /// \param __a
1466 /// A 256-bit vector of [8 x i32] to be shifted.
1467 /// \param __count
1468 /// An unsigned integer value specifying the shift count (in bits).
1469 /// \returns A 256-bit vector of [8 x i32] containing the result.
1470 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1471 _mm256_srai_epi32(__m256i __a, int __count)
1473 return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count);
1476 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
1477 /// right by the number of bits given in the lower 64 bits of \a __count,
1478 /// shifting in sign bits, and returns the result. If \a __count is greater
1479 /// than 31, each element of the result is either 0 or -1 according to the
1480 /// corresponding input sign bit.
1482 /// \headerfile <immintrin.h>
1484 /// This intrinsic corresponds to the \c VPSRAD instruction.
1486 /// \param __a
1487 /// A 256-bit vector of [8 x i32] to be shifted.
1488 /// \param __count
1489 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
1490 /// shift count (in bits). The upper element is ignored.
1491 /// \returns A 256-bit vector of [8 x i32] containing the result.
1492 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1493 _mm256_sra_epi32(__m256i __a, __m128i __count)
1495 return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
1498 /// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
1499 /// \a imm bytes, shifting in zero bytes, and returns the result. If
1500 /// \a imm is greater than 15, the returned result is all zeroes.
1502 /// \headerfile <immintrin.h>
1504 /// \code
1505 /// __m256i _mm256_srli_si256(__m256i a, const int imm);
1506 /// \endcode
1508 /// This intrinsic corresponds to the \c VPSRLDQ instruction.
1510 /// \param a
1511 /// A 256-bit integer vector to be shifted.
1512 /// \param imm
1513 /// An unsigned immediate value specifying the shift count (in bytes).
1514 /// \returns A 256-bit integer vector containing the result.
#define _mm256_srli_si256(a, imm) \
  ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))
1518 /// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
1519 /// \a imm bytes, shifting in zero bytes, and returns the result. If
1520 /// \a imm is greater than 15, the returned result is all zeroes.
1522 /// \headerfile <immintrin.h>
1524 /// \code
1525 /// __m256i _mm256_bsrli_epi128(__m256i a, const int imm);
1526 /// \endcode
1528 /// This intrinsic corresponds to the \c VPSRLDQ instruction.
1530 /// \param a
1531 /// A 256-bit integer vector to be shifted.
1532 /// \param imm
1533 /// An unsigned immediate value specifying the shift count (in bytes).
1534 /// \returns A 256-bit integer vector containing the result.
#define _mm256_bsrli_epi128(a, imm) \
  ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))
1538 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
1539 /// right by \a __count bits, shifting in zero bits, and returns the result.
1540 /// If \a __count is greater than 15, the returned result is all zeroes.
1542 /// \headerfile <immintrin.h>
1544 /// This intrinsic corresponds to the \c VPSRLW instruction.
1546 /// \param __a
1547 /// A 256-bit vector of [16 x i16] to be shifted.
1548 /// \param __count
1549 /// An unsigned integer value specifying the shift count (in bits).
1550 /// \returns A 256-bit vector of [16 x i16] containing the result.
1551 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1552 _mm256_srli_epi16(__m256i __a, int __count)
1554 return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count);
1557 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
1558 /// right by the number of bits given in the lower 64 bits of \a __count,
1559 /// shifting in zero bits, and returns the result. If \a __count is greater
1560 /// than 15, the returned result is all zeroes.
1562 /// \headerfile <immintrin.h>
1564 /// This intrinsic corresponds to the \c VPSRLW instruction.
1566 /// \param __a
1567 /// A 256-bit vector of [16 x i16] to be shifted.
1568 /// \param __count
1569 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
1570 /// shift count (in bits). The upper element is ignored.
1571 /// \returns A 256-bit vector of [16 x i16] containing the result.
1572 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1573 _mm256_srl_epi16(__m256i __a, __m128i __count)
1575 return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count);
1578 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
1579 /// right by \a __count bits, shifting in zero bits, and returns the result.
1580 /// If \a __count is greater than 31, the returned result is all zeroes.
1582 /// \headerfile <immintrin.h>
1584 /// This intrinsic corresponds to the \c VPSRLD instruction.
1586 /// \param __a
1587 /// A 256-bit vector of [8 x i32] to be shifted.
1588 /// \param __count
1589 /// An unsigned integer value specifying the shift count (in bits).
1590 /// \returns A 256-bit vector of [8 x i32] containing the result.
1591 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1592 _mm256_srli_epi32(__m256i __a, int __count)
1594 return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count);
1597 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
1598 /// right by the number of bits given in the lower 64 bits of \a __count,
1599 /// shifting in zero bits, and returns the result. If \a __count is greater
1600 /// than 31, the returned result is all zeroes.
1602 /// \headerfile <immintrin.h>
1604 /// This intrinsic corresponds to the \c VPSRLD instruction.
1606 /// \param __a
1607 /// A 256-bit vector of [8 x i32] to be shifted.
1608 /// \param __count
1609 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
1610 /// shift count (in bits). The upper element is ignored.
1611 /// \returns A 256-bit vector of [8 x i32] containing the result.
1612 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1613 _mm256_srl_epi32(__m256i __a, __m128i __count)
1615 return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count);
1618 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
1619 /// right by \a __count bits, shifting in zero bits, and returns the result.
1620 /// If \a __count is greater than 63, the returned result is all zeroes.
1622 /// \headerfile <immintrin.h>
1624 /// This intrinsic corresponds to the \c VPSRLQ instruction.
1626 /// \param __a
1627 /// A 256-bit vector of [4 x i64] to be shifted.
1628 /// \param __count
1629 /// An unsigned integer value specifying the shift count (in bits).
1630 /// \returns A 256-bit vector of [4 x i64] containing the result.
1631 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1632 _mm256_srli_epi64(__m256i __a, int __count)
1634 return __builtin_ia32_psrlqi256((__v4di)__a, __count);
1637 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
1638 /// right by the number of bits given in the lower 64 bits of \a __count,
1639 /// shifting in zero bits, and returns the result. If \a __count is greater
1640 /// than 63, the returned result is all zeroes.
1642 /// \headerfile <immintrin.h>
1644 /// This intrinsic corresponds to the \c VPSRLQ instruction.
1646 /// \param __a
1647 /// A 256-bit vector of [4 x i64] to be shifted.
1648 /// \param __count
1649 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
1650 /// shift count (in bits). The upper element is ignored.
1651 /// \returns A 256-bit vector of [4 x i64] containing the result.
1652 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1653 _mm256_srl_epi64(__m256i __a, __m128i __count)
1655 return __builtin_ia32_psrlq256((__v4di)__a, __count);
1658 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
1659 /// vectors. Returns the lower 8 bits of each difference in the
1660 /// corresponding byte of the 256-bit integer vector result (overflow is
1661 /// ignored).
1663 /// \code{.operation}
1664 /// FOR i := 0 TO 31
1665 /// j := i*8
1666 /// result[j+7:j] := __a[j+7:j] - __b[j+7:j]
1667 /// ENDFOR
1668 /// \endcode
1670 /// \headerfile <immintrin.h>
1672 /// This intrinsic corresponds to the \c VPSUBB instruction.
1674 /// \param __a
1675 /// A 256-bit integer vector containing the minuends.
1676 /// \param __b
1677 /// A 256-bit integer vector containing the subtrahends.
1678 /// \returns A 256-bit integer vector containing the differences.
1679 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1680 _mm256_sub_epi8(__m256i __a, __m256i __b)
1682 return (__m256i)((__v32qu)__a - (__v32qu)__b);
1685 /// Subtracts 16-bit integers from corresponding elements of two 256-bit
1686 /// vectors of [16 x i16]. Returns the lower 16 bits of each difference in
1687 /// the corresponding element of the [16 x i16] result (overflow is
1688 /// ignored).
1690 /// \code{.operation}
1691 /// FOR i := 0 TO 15
1692 /// j := i*16
1693 /// result[j+15:j] := __a[j+15:j] - __b[j+15:j]
1694 /// ENDFOR
1695 /// \endcode
1697 /// \headerfile <immintrin.h>
1699 /// This intrinsic corresponds to the \c VPSUBW instruction.
1701 /// \param __a
1702 /// A 256-bit vector of [16 x i16] containing the minuends.
1703 /// \param __b
1704 /// A 256-bit vector of [16 x i16] containing the subtrahends.
1705 /// \returns A 256-bit vector of [16 x i16] containing the differences.
1706 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1707 _mm256_sub_epi16(__m256i __a, __m256i __b)
/* Subtract through the unsigned vector type: unsigned arithmetic wraps, so the
   "overflow is ignored" contract holds without relying on signed overflow. */
1709 return (__m256i)((__v16hu)__a - (__v16hu)__b);
1712 /// Subtracts 32-bit integers from corresponding elements of two 256-bit
1713 /// vectors of [8 x i32]. Returns the lower 32 bits of each difference in
1714 /// the corresponding element of the [8 x i32] result (overflow is ignored).
1716 /// \code{.operation}
1717 /// FOR i := 0 TO 7
1718 /// j := i*32
1719 /// result[j+31:j] := __a[j+31:j] - __b[j+31:j]
1720 /// ENDFOR
1721 /// \endcode
1723 /// \headerfile <immintrin.h>
1725 /// This intrinsic corresponds to the \c VPSUBD instruction.
1727 /// \param __a
1728 /// A 256-bit vector of [8 x i32] containing the minuends.
1729 /// \param __b
1730 /// A 256-bit vector of [8 x i32] containing the subtrahends.
1731 /// \returns A 256-bit vector of [8 x i32] containing the differences.
1732 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1733 _mm256_sub_epi32(__m256i __a, __m256i __b)
/* Subtract through the unsigned vector type: unsigned arithmetic wraps, so the
   "overflow is ignored" contract holds without relying on signed overflow. */
1735 return (__m256i)((__v8su)__a - (__v8su)__b);
1738 /// Subtracts 64-bit integers from corresponding elements of two 256-bit
1739 /// vectors of [4 x i64]. Returns the lower 64 bits of each difference in
1740 /// the corresponding element of the [4 x i64] result (overflow is ignored).
1742 /// \code{.operation}
1743 /// FOR i := 0 TO 3
1744 /// j := i*64
1745 /// result[j+63:j] := __a[j+63:j] - __b[j+63:j]
1746 /// ENDFOR
1747 /// \endcode
1749 /// \headerfile <immintrin.h>
1751 /// This intrinsic corresponds to the \c VPSUBQ instruction.
1753 /// \param __a
1754 /// A 256-bit vector of [4 x i64] containing the minuends.
1755 /// \param __b
1756 /// A 256-bit vector of [4 x i64] containing the subtrahends.
1757 /// \returns A 256-bit vector of [4 x i64] containing the differences.
1758 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1759 _mm256_sub_epi64(__m256i __a, __m256i __b)
/* Subtract through the unsigned vector type: unsigned arithmetic wraps, so the
   "overflow is ignored" contract holds without relying on signed overflow. */
1761 return (__m256i)((__v4du)__a - (__v4du)__b);
1764 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
1765 /// vectors using signed saturation, and returns each difference in the
1766 /// corresponding byte of the 256-bit integer vector result.
1768 /// \code{.operation}
1769 /// FOR i := 0 TO 31
1770 /// j := i*8
1771 /// result[j+7:j] := SATURATE8(__a[j+7:j] - __b[j+7:j])
1772 /// ENDFOR
1773 /// \endcode
1775 /// \headerfile <immintrin.h>
1777 /// This intrinsic corresponds to the \c VPSUBSB instruction.
1779 /// \param __a
1780 /// A 256-bit integer vector containing the minuends.
1781 /// \param __b
1782 /// A 256-bit integer vector containing the subtrahends.
1783 /// \returns A 256-bit integer vector containing the differences.
1784 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1785 _mm256_subs_epi8(__m256i __a, __m256i __b)
/* Generic element-wise saturating subtract; the saturation bounds follow the
   element type of the cast operands (__v32qs here). */
1787 return (__m256i)__builtin_elementwise_sub_sat((__v32qs)__a, (__v32qs)__b);
1790 /// Subtracts 16-bit integers from corresponding elements of two 256-bit
1791 /// vectors of [16 x i16] using signed saturation, and returns each
1792 /// difference in the corresponding element of the [16 x i16] result.
1794 /// \code{.operation}
1795 /// FOR i := 0 TO 15
1796 /// j := i*16
1797 /// result[j+15:j] := SATURATE16(__a[j+15:j] - __b[j+15:j])
1798 /// ENDFOR
1799 /// \endcode
1801 /// \headerfile <immintrin.h>
1803 /// This intrinsic corresponds to the \c VPSUBSW instruction.
1805 /// \param __a
1806 /// A 256-bit vector of [16 x i16] containing the minuends.
1807 /// \param __b
1808 /// A 256-bit vector of [16 x i16] containing the subtrahends.
1809 /// \returns A 256-bit vector of [16 x i16] containing the differences.
1810 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1811 _mm256_subs_epi16(__m256i __a, __m256i __b)
/* Generic element-wise saturating subtract; the saturation bounds follow the
   element type of the cast operands (__v16hi here). */
1813 return (__m256i)__builtin_elementwise_sub_sat((__v16hi)__a, (__v16hi)__b);
1816 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
1817 /// vectors using unsigned saturation, and returns each difference in the
1818 /// corresponding byte of the 256-bit integer vector result. For each byte,
1819 /// computes <c> result = __a - __b </c>.
1821 /// \code{.operation}
1822 /// FOR i := 0 TO 31
1823 /// j := i*8
1824 /// result[j+7:j] := SATURATE8U(__a[j+7:j] - __b[j+7:j])
1825 /// ENDFOR
1826 /// \endcode
1828 /// \headerfile <immintrin.h>
1830 /// This intrinsic corresponds to the \c VPSUBUSB instruction.
1832 /// \param __a
1833 /// A 256-bit integer vector containing the minuends.
1834 /// \param __b
1835 /// A 256-bit integer vector containing the subtrahends.
1836 /// \returns A 256-bit integer vector containing the differences.
1837 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1838 _mm256_subs_epu8(__m256i __a, __m256i __b)
/* Generic element-wise saturating subtract; the saturation bounds follow the
   element type of the cast operands (__v32qu here). */
1840 return (__m256i)__builtin_elementwise_sub_sat((__v32qu)__a, (__v32qu)__b);
1843 /// Subtracts 16-bit integers from corresponding elements of two 256-bit
1844 /// vectors of [16 x i16] using unsigned saturation, and returns each
1845 /// difference in the corresponding element of the [16 x i16] result.
1847 /// \code{.operation}
1848 /// FOR i := 0 TO 15
1849 /// j := i*16
1850 /// result[j+15:j] := SATURATE16U(__a[j+15:j] - __b[j+15:j])
1851 /// ENDFOR
1852 /// \endcode
1854 /// \headerfile <immintrin.h>
1856 /// This intrinsic corresponds to the \c VPSUBUSW instruction.
1858 /// \param __a
1859 /// A 256-bit vector of [16 x i16] containing the minuends.
1860 /// \param __b
1861 /// A 256-bit vector of [16 x i16] containing the subtrahends.
1862 /// \returns A 256-bit vector of [16 x i16] containing the differences.
1863 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1864 _mm256_subs_epu16(__m256i __a, __m256i __b)
/* Generic element-wise saturating subtract; the saturation bounds follow the
   element type of the cast operands (__v16hu here). */
1866 return (__m256i)__builtin_elementwise_sub_sat((__v16hu)__a, (__v16hu)__b);
1869 /// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
1870 /// vectors in \a __a and \a __b to form the 256-bit result. Specifically,
1871 /// uses the upper 64 bits of each 128-bit half of \a __a and \a __b as
1872 /// input; other bits in these parameters are ignored.
1874 /// \code{.operation}
1875 /// result[7:0] := __a[71:64]
1876 /// result[15:8] := __b[71:64]
1877 /// result[23:16] := __a[79:72]
1878 /// result[31:24] := __b[79:72]
1879 /// . . .
1880 /// result[127:120] := __b[127:120]
1881 /// result[135:128] := __a[199:192]
1882 /// . . .
1883 /// result[255:248] := __b[255:248]
1884 /// \endcode
1886 /// \headerfile <immintrin.h>
1888 /// This intrinsic corresponds to the \c VPUNPCKHBW instruction.
1890 /// \param __a
1891 /// A 256-bit integer vector used as the source for the even-numbered bytes
1892 /// of the result.
1893 /// \param __b
1894 /// A 256-bit integer vector used as the source for the odd-numbered bytes
1895 /// of the result.
1896 /// \returns A 256-bit integer vector containing the result.
1897 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1898 _mm256_unpackhi_epi8(__m256i __a, __m256i __b)
/* Shuffle indices 0..31 name bytes of __a and 32+n names byte n of __b.
   Bytes 8..15 and 24..31 are the upper 64 bits of each 128-bit half. */
1900 return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
1903 /// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
1904 /// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
1905 /// vector of [16 x i16]. Specifically, uses the upper 64 bits of each
1906 /// 128-bit half of \a __a and \a __b as input; other bits in these
1907 /// parameters are ignored.
1909 /// \code{.operation}
1910 /// result[15:0] := __a[79:64]
1911 /// result[31:16] := __b[79:64]
1912 /// result[47:32] := __a[95:80]
1913 /// result[63:48] := __b[95:80]
1914 /// . . .
1915 /// result[127:112] := __b[127:112]
1916 /// result[143:128] := __a[207:192]
1917 /// . . .
1918 /// result[255:240] := __b[255:240]
1919 /// \endcode
1921 /// \headerfile <immintrin.h>
1923 /// This intrinsic corresponds to the \c VPUNPCKHWD instruction.
1925 /// \param __a
1926 /// A 256-bit vector of [16 x i16] used as the source for the even-numbered
1927 /// elements of the result.
1928 /// \param __b
1929 /// A 256-bit vector of [16 x i16] used as the source for the odd-numbered
1930 /// elements of the result.
1931 /// \returns A 256-bit vector of [16 x i16] containing the result.
1932 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1933 _mm256_unpackhi_epi16(__m256i __a, __m256i __b)
/* Shuffle indices 0..15 name elements of __a and 16+n names element n of __b.
   Elements 4..7 and 12..15 are the upper 64 bits of each 128-bit half. */
1935 return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
1938 /// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
1939 /// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
1940 /// of [8 x i32]. Specifically, uses the upper 64 bits of each 128-bit half
1941 /// of \a __a and \a __b as input; other bits in these parameters are
1942 /// ignored.
1944 /// \code{.operation}
1945 /// result[31:0] := __a[95:64]
1946 /// result[63:32] := __b[95:64]
1947 /// result[95:64] := __a[127:96]
1948 /// result[127:96] := __b[127:96]
1949 /// result[159:128] := __a[223:192]
1950 /// result[191:160] := __b[223:192]
1951 /// result[223:192] := __a[255:224]
1952 /// result[255:224] := __b[255:224]
1953 /// \endcode
1955 /// \headerfile <immintrin.h>
1957 /// This intrinsic corresponds to the \c VPUNPCKHDQ instruction.
1959 /// \param __a
1960 /// A 256-bit vector of [8 x i32] used as the source for the even-numbered
1961 /// elements of the result.
1962 /// \param __b
1963 /// A 256-bit vector of [8 x i32] used as the source for the odd-numbered
1964 /// elements of the result.
1965 /// \returns A 256-bit vector of [8 x i32] containing the result.
1966 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1967 _mm256_unpackhi_epi32(__m256i __a, __m256i __b)
/* Shuffle indices 0..7 name elements of __a and 8+n names element n of __b.
   Elements 2, 3, 6 and 7 are the upper 64 bits of each 128-bit half. */
1969 return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);
1972 /// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
1973 /// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
1974 /// of [4 x i64]. Specifically, uses the upper 64 bits of each 128-bit half
1975 /// of \a __a and \a __b as input; other bits in these parameters are
1976 /// ignored.
1978 /// \code{.operation}
1979 /// result[63:0] := __a[127:64]
1980 /// result[127:64] := __b[127:64]
1981 /// result[191:128] := __a[255:192]
1982 /// result[255:192] := __b[255:192]
1983 /// \endcode
1985 /// \headerfile <immintrin.h>
1987 /// This intrinsic corresponds to the \c VPUNPCKHQDQ instruction.
1989 /// \param __a
1990 /// A 256-bit vector of [4 x i64] used as the source for the even-numbered
1991 /// elements of the result.
1992 /// \param __b
1993 /// A 256-bit vector of [4 x i64] used as the source for the odd-numbered
1994 /// elements of the result.
1995 /// \returns A 256-bit vector of [4 x i64] containing the result.
1996 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1997 _mm256_unpackhi_epi64(__m256i __a, __m256i __b)
/* Shuffle indices 0..3 name elements of __a and 4+n names element n of __b.
   Elements 1 and 3 are the upper 64 bits of each 128-bit half. */
1999 return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3);
2002 /// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2003 /// vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2004 /// uses the lower 64 bits of each 128-bit half of \a __a and \a __b as
2005 /// input; other bits in these parameters are ignored.
2007 /// \code{.operation}
2008 /// result[7:0] := __a[7:0]
2009 /// result[15:8] := __b[7:0]
2010 /// result[23:16] := __a[15:8]
2011 /// result[31:24] := __b[15:8]
2012 /// . . .
2013 /// result[127:120] := __b[63:56]
2014 /// result[135:128] := __a[135:128]
2015 /// . . .
2016 /// result[255:248] := __b[191:184]
2017 /// \endcode
2019 /// \headerfile <immintrin.h>
2021 /// This intrinsic corresponds to the \c VPUNPCKLBW instruction.
2023 /// \param __a
2024 /// A 256-bit integer vector used as the source for the even-numbered bytes
2025 /// of the result.
2026 /// \param __b
2027 /// A 256-bit integer vector used as the source for the odd-numbered bytes
2028 /// of the result.
2029 /// \returns A 256-bit integer vector containing the result.
2030 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2031 _mm256_unpacklo_epi8(__m256i __a, __m256i __b)
/* Shuffle indices 0..31 name bytes of __a and 32+n names byte n of __b.
   Bytes 0..7 and 16..23 are the lower 64 bits of each 128-bit half. */
2033 return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
2036 /// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2037 /// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2038 /// vector of [16 x i16]. Specifically, uses the lower 64 bits of each
2039 /// 128-bit half of \a __a and \a __b as input; other bits in these
2040 /// parameters are ignored.
2042 /// \code{.operation}
2043 /// result[15:0] := __a[15:0]
2044 /// result[31:16] := __b[15:0]
2045 /// result[47:32] := __a[31:16]
2046 /// result[63:48] := __b[31:16]
2047 /// . . .
2048 /// result[127:112] := __b[63:48]
2049 /// result[143:128] := __a[143:128]
2050 /// . . .
2051 /// result[255:240] := __b[191:176]
2052 /// \endcode
2054 /// \headerfile <immintrin.h>
2056 /// This intrinsic corresponds to the \c VPUNPCKLWD instruction.
2058 /// \param __a
2059 /// A 256-bit vector of [16 x i16] used as the source for the even-numbered
2060 /// elements of the result.
2061 /// \param __b
2062 /// A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2063 /// elements of the result.
2064 /// \returns A 256-bit vector of [16 x i16] containing the result.
2065 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2066 _mm256_unpacklo_epi16(__m256i __a, __m256i __b)
/* Shuffle indices 0..15 name elements of __a and 16+n names element n of __b.
   Elements 0..3 and 8..11 are the lower 64 bits of each 128-bit half. */
2068 return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
2071 /// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2072 /// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2073 /// of [8 x i32]. Specifically, uses the lower 64 bits of each 128-bit half
2074 /// of \a __a and \a __b as input; other bits in these parameters are
2075 /// ignored.
2077 /// \code{.operation}
2078 /// result[31:0] := __a[31:0]
2079 /// result[63:32] := __b[31:0]
2080 /// result[95:64] := __a[63:32]
2081 /// result[127:96] := __b[63:32]
2082 /// result[159:128] := __a[159:128]
2083 /// result[191:160] := __b[159:128]
2084 /// result[223:192] := __a[191:160]
2085 /// result[255:224] := __b[191:160]
2086 /// \endcode
2088 /// \headerfile <immintrin.h>
2090 /// This intrinsic corresponds to the \c VPUNPCKLDQ instruction.
2092 /// \param __a
2093 /// A 256-bit vector of [8 x i32] used as the source for the even-numbered
2094 /// elements of the result.
2095 /// \param __b
2096 /// A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2097 /// elements of the result.
2098 /// \returns A 256-bit vector of [8 x i32] containing the result.
2099 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2100 _mm256_unpacklo_epi32(__m256i __a, __m256i __b)
/* Shuffle indices 0..7 name elements of __a and 8+n names element n of __b.
   Elements 0, 1, 4 and 5 are the lower 64 bits of each 128-bit half. */
2102 return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
2105 /// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2106 /// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2107 /// of [4 x i64]. Specifically, uses the lower 64 bits of each 128-bit half
2108 /// of \a __a and \a __b as input; other bits in these parameters are
2109 /// ignored.
2111 /// \code{.operation}
2112 /// result[63:0] := __a[63:0]
2113 /// result[127:64] := __b[63:0]
2114 /// result[191:128] := __a[191:128]
2115 /// result[255:192] := __b[191:128]
2116 /// \endcode
2118 /// \headerfile <immintrin.h>
2120 /// This intrinsic corresponds to the \c VPUNPCKLQDQ instruction.
2122 /// \param __a
2123 /// A 256-bit vector of [4 x i64] used as the source for the even-numbered
2124 /// elements of the result.
2125 /// \param __b
2126 /// A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2127 /// elements of the result.
2128 /// \returns A 256-bit vector of [4 x i64] containing the result.
2129 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2130 _mm256_unpacklo_epi64(__m256i __a, __m256i __b)
/* Shuffle indices 0..3 name elements of __a and 4+n names element n of __b.
   Elements 0 and 2 are the lower 64 bits of each 128-bit half. */
2132 return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2);
/// Computes the bitwise XOR of the two 256-bit integer vectors in \a __a and
/// \a __b.
///
/// \headerfile <immintrin.h>
///
/// \param __a
/// A 256-bit integer vector.
/// \param __b
/// A 256-bit integer vector.
/// \returns A 256-bit integer vector containing the bitwise XOR of \a __a and
/// \a __b.
2135 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2136 _mm256_xor_si256(__m256i __a, __m256i __b)
/* Both operands are viewed through the __v4du vector type for the XOR. */
2138 return (__m256i)((__v4du)__a ^ (__v4du)__b);
/// Loads 256 bits of integer data from the address \a __V using a
/// non-temporal load (__builtin_nontemporal_load).
///
/// \headerfile <immintrin.h>
///
/// \param __V
/// A pointer to the 256-bit integer data to load. The load is performed
/// through a vector type declared with 32-byte alignment, so \a __V is
/// presumably required to be 32-byte aligned (confirm against the
/// instruction reference).
/// \returns A 256-bit integer vector containing the loaded data.
2141 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2142 _mm256_stream_load_si256(__m256i const *__V)
/* Local typedef attaches aligned(32) to the vector type used for the load. */
2144 typedef __v4di __v4di_aligned __attribute__((aligned(32)));
2145 return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);
2148 /// Broadcasts the 32-bit floating-point value from the low element of the
2149 /// 128-bit vector of [4 x float] in \a __X to all elements of the result's
2150 /// 128-bit vector of [4 x float].
2152 /// \headerfile <immintrin.h>
2154 /// This intrinsic corresponds to the \c VBROADCASTSS instruction.
2156 /// \param __X
2157 /// A 128-bit vector of [4 x float] whose low element will be broadcast.
2158 /// \returns A 128-bit vector of [4 x float] containing the result.
2159 static __inline__ __m128 __DEFAULT_FN_ATTRS128
2160 _mm_broadcastss_ps(__m128 __X)
/* Every shuffle index is 0, so element 0 of __X fills all four result lanes. */
2162 return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0);
2165 /// Broadcasts the 64-bit floating-point value from the low element of the
2166 /// 128-bit vector of [2 x double] in \a __a to both elements of the
2167 /// result's 128-bit vector of [2 x double].
2169 /// \headerfile <immintrin.h>
2171 /// This intrinsic corresponds to the \c MOVDDUP instruction.
2173 /// \param __a
2174 /// A 128-bit vector of [2 x double] whose low element will be broadcast.
2175 /// \returns A 128-bit vector of [2 x double] containing the result.
2176 static __inline__ __m128d __DEFAULT_FN_ATTRS128
2177 _mm_broadcastsd_pd(__m128d __a)
/* Both shuffle indices are 0, so the low double of __a fills both lanes. */
2179 return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
2182 /// Broadcasts the 32-bit floating-point value from the low element of the
2183 /// 128-bit vector of [4 x float] in \a __X to all elements of the
2184 /// result's 256-bit vector of [8 x float].
2186 /// \headerfile <immintrin.h>
2188 /// This intrinsic corresponds to the \c VBROADCASTSS instruction.
2190 /// \param __X
2191 /// A 128-bit vector of [4 x float] whose low element will be broadcast.
2192 /// \returns A 256-bit vector of [8 x float] containing the result.
2193 static __inline__ __m256 __DEFAULT_FN_ATTRS256
2194 _mm256_broadcastss_ps(__m128 __X)
/* Eight shuffle indices widen the 4-element source to 8 lanes; all are 0, so
   every lane receives element 0 of __X. */
2196 return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0);
2199 /// Broadcasts the 64-bit floating-point value from the low element of the
2200 /// 128-bit vector of [2 x double] in \a __X to all elements of the
2201 /// result's 256-bit vector of [4 x double].
2203 /// \headerfile <immintrin.h>
2205 /// This intrinsic corresponds to the \c VBROADCASTSD instruction.
2207 /// \param __X
2208 /// A 128-bit vector of [2 x double] whose low element will be broadcast.
2209 /// \returns A 256-bit vector of [4 x double] containing the result.
2210 static __inline__ __m256d __DEFAULT_FN_ATTRS256
2211 _mm256_broadcastsd_pd(__m128d __X)
/* Four shuffle indices widen the 2-element source to 4 lanes; all are 0, so
   every lane receives element 0 of __X. */
2213 return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0);
2216 /// Broadcasts the 128-bit integer data from \a __X to both the lower and
2217 /// upper halves of the 256-bit result.
2219 /// \headerfile <immintrin.h>
2221 /// This intrinsic corresponds to the \c VBROADCASTI128 instruction.
2223 /// \param __X
2224 /// A 128-bit integer vector to be broadcast.
2225 /// \returns A 256-bit integer vector containing the result.
2226 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2227 _mm256_broadcastsi128_si256(__m128i __X)
/* Index pattern 0, 1, 0, 1 copies both 64-bit elements of __X into the lower
   half and again into the upper half of the result. */
2229 return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1);
/// Alias for _mm256_broadcastsi128_si256(); expands to a call with the same
/// argument \a X.
2232 #define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X)
2234 /// Merges 32-bit integer elements from either of the two 128-bit vectors of
2235 /// [4 x i32] in \a V1 or \a V2 to the result's 128-bit vector of [4 x i32],
2236 /// as specified by the immediate integer operand \a M.
2238 /// \code{.operation}
2239 /// FOR i := 0 TO 3
2240 /// j := i*32
2241 /// IF M[i] == 0
2242 /// result[31+j:j] := V1[31+j:j]
2243 /// ELSE
2244 /// result[31+j:j] := V2[31+j:j]
2245 /// FI
2246 /// ENDFOR
2247 /// \endcode
2249 /// \headerfile <immintrin.h>
2251 /// \code
2252 /// __m128i _mm_blend_epi32(__m128i V1, __m128i V2, const int M);
2253 /// \endcode
2255 /// This intrinsic corresponds to the \c VPBLENDD instruction.
2257 /// \param V1
2258 /// A 128-bit vector of [4 x i32] containing source values.
2259 /// \param V2
2260 /// A 128-bit vector of [4 x i32] containing source values.
2261 /// \param M
2262 /// An immediate 8-bit integer operand, with bits [3:0] specifying the
2263 /// source for each element of the result. The position of the mask bit
2264 /// corresponds to the index of a copied value. When a mask bit is 0, the
2265 /// element is copied from \a V1; otherwise, it is copied from \a V2.
2266 /// \returns A 128-bit vector of [4 x i32] containing the result.
/* V1 and V2 are each cast to __m128i and then to __v4si; M is cast to int
   before being passed to the builtin. */
2267 #define _mm_blend_epi32(V1, V2, M) \
2268 ((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
2269 (__v4si)(__m128i)(V2), (int)(M)))
2271 /// Merges 32-bit integer elements from either of the two 256-bit vectors of
2272 /// [8 x i32] in \a V1 or \a V2 to return a 256-bit vector of [8 x i32],
2273 /// as specified by the immediate integer operand \a M.
2275 /// \code{.operation}
2276 /// FOR i := 0 TO 7
2277 /// j := i*32
2278 /// IF M[i] == 0
2279 /// result[31+j:j] := V1[31+j:j]
2280 /// ELSE
2281 /// result[31+j:j] := V2[31+j:j]
2282 /// FI
2283 /// ENDFOR
2284 /// \endcode
2286 /// \headerfile <immintrin.h>
2288 /// \code
2289 /// __m256i _mm256_blend_epi32(__m256i V1, __m256i V2, const int M);
2290 /// \endcode
2292 /// This intrinsic corresponds to the \c VPBLENDD instruction.
2294 /// \param V1
2295 /// A 256-bit vector of [8 x i32] containing source values.
2296 /// \param V2
2297 /// A 256-bit vector of [8 x i32] containing source values.
2298 /// \param M
2299 /// An immediate 8-bit integer operand, with bits [7:0] specifying the
2300 /// source for each element of the result. The position of the mask bit
2301 /// corresponds to the index of a copied value. When a mask bit is 0, the
2302 /// element is copied from \a V1; otherwise, it is copied from \a V2.
2303 /// \returns A 256-bit vector of [8 x i32] containing the result.
/* V1 and V2 are each cast to __m256i and then to __v8si; M is cast to int
   before being passed to the builtin. */
2304 #define _mm256_blend_epi32(V1, V2, M) \
2305 ((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
2306 (__v8si)(__m256i)(V2), (int)(M)))
2308 /// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
2309 /// bytes of the 256-bit result.
2311 /// \headerfile <immintrin.h>
2313 /// This intrinsic corresponds to the \c VPBROADCASTB instruction.
2315 /// \param __X
2316 /// A 128-bit integer vector whose low byte will be broadcast.
2317 /// \returns A 256-bit integer vector containing the result.
2318 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2319 _mm256_broadcastb_epi8(__m128i __X)
/* Thirty-two shuffle indices widen the 16-byte source to 32 bytes; all are 0,
   so every byte of the result is byte 0 of __X. */
2321 return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
2324 /// Broadcasts the low element from the 128-bit vector of [8 x i16] in \a __X
2325 /// to all elements of the result's 256-bit vector of [16 x i16].
2327 /// \headerfile <immintrin.h>
2329 /// This intrinsic corresponds to the \c VPBROADCASTW instruction.
2331 /// \param __X
2332 /// A 128-bit vector of [8 x i16] whose low element will be broadcast.
2333 /// \returns A 256-bit vector of [16 x i16] containing the result.
2334 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2335 _mm256_broadcastw_epi16(__m128i __X)
/* Sixteen shuffle indices widen the 8-element source to 16 lanes; all are 0,
   so every lane receives element 0 of __X. */
2337 return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
2340 /// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
2341 /// to all elements of the result's 256-bit vector of [8 x i32].
2343 /// \headerfile <immintrin.h>
2345 /// This intrinsic corresponds to the \c VPBROADCASTD instruction.
2347 /// \param __X
2348 /// A 128-bit vector of [4 x i32] whose low element will be broadcast.
2349 /// \returns A 256-bit vector of [8 x i32] containing the result.
2350 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2351 _mm256_broadcastd_epi32(__m128i __X)
/* Eight shuffle indices widen the 4-element source to 8 lanes; all are 0, so
   every lane receives element 0 of __X. */
2353 return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0);
2356 /// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
2357 /// to all elements of the result's 256-bit vector of [4 x i64].
2359 /// \headerfile <immintrin.h>
2361 /// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
2363 /// \param __X
2364 /// A 128-bit vector of [2 x i64] whose low element will be broadcast.
2365 /// \returns A 256-bit vector of [4 x i64] containing the result.
2366 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2367 _mm256_broadcastq_epi64(__m128i __X)
/* Four shuffle indices widen the 2-element source to 4 lanes; all are 0, so
   every lane receives element 0 of __X. */
2369 return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0);
2372 /// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
2373 /// bytes of the 128-bit result.
2375 /// \headerfile <immintrin.h>
2377 /// This intrinsic corresponds to the \c VPBROADCASTB instruction.
2379 /// \param __X
2380 /// A 128-bit integer vector whose low byte will be broadcast.
2381 /// \returns A 128-bit integer vector containing the result.
2382 static __inline__ __m128i __DEFAULT_FN_ATTRS128
2383 _mm_broadcastb_epi8(__m128i __X)
/* All sixteen shuffle indices are 0, so every byte of the result is byte 0 of
   __X. */
2385 return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
2388 /// Broadcasts the low element from the 128-bit vector of [8 x i16] in
2389 /// \a __X to all elements of the result's 128-bit vector of [8 x i16].
2391 /// \headerfile <immintrin.h>
2393 /// This intrinsic corresponds to the \c VPBROADCASTW instruction.
2395 /// \param __X
2396 /// A 128-bit vector of [8 x i16] whose low element will be broadcast.
2397 /// \returns A 128-bit vector of [8 x i16] containing the result.
2398 static __inline__ __m128i __DEFAULT_FN_ATTRS128
2399 _mm_broadcastw_epi16(__m128i __X)
/* All eight shuffle indices are 0, so every lane receives element 0 of __X. */
2401 return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0);
2404 /// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
2405 /// to all elements of the result's vector of [4 x i32].
2407 /// \headerfile <immintrin.h>
2409 /// This intrinsic corresponds to the \c VPBROADCASTD instruction.
2411 /// \param __X
2412 /// A 128-bit vector of [4 x i32] whose low element will be broadcast.
2413 /// \returns A 128-bit vector of [4 x i32] containing the result.
2414 static __inline__ __m128i __DEFAULT_FN_ATTRS128
2415 _mm_broadcastd_epi32(__m128i __X)
/* All four shuffle indices are 0, so every lane receives element 0 of __X. */
2417 return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0);
2420 /// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
2421 /// to both elements of the result's 128-bit vector of [2 x i64].
2423 /// \headerfile <immintrin.h>
2425 /// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
2427 /// \param __X
2428 /// A 128-bit vector of [2 x i64] whose low element will be broadcast.
2429 /// \returns A 128-bit vector of [2 x i64] containing the result.
2430 static __inline__ __m128i __DEFAULT_FN_ATTRS128
2431 _mm_broadcastq_epi64(__m128i __X)
/* Both shuffle indices are 0, so both lanes receive element 0 of __X. */
2433 return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0);
2436 /// Sets the result's 256-bit vector of [8 x i32] to copies of elements of the
2437 /// 256-bit vector of [8 x i32] in \a __a as specified by indexes in the
2438 /// elements of the 256-bit vector of [8 x i32] in \a __b.
2440 /// \code{.operation}
2441 /// FOR i := 0 TO 7
2442 /// j := i*32
2443 /// k := __b[j+2:j] * 32
2444 /// result[j+31:j] := __a[k+31:k]
2445 /// ENDFOR
2446 /// \endcode
2448 /// \headerfile <immintrin.h>
2450 /// This intrinsic corresponds to the \c VPERMD instruction.
2452 /// \param __a
2453 /// A 256-bit vector of [8 x i32] containing the source values.
2454 /// \param __b
2455 /// A 256-bit vector of [8 x i32] containing indexes of values to use from
2456 /// \a __a.
2457 /// \returns A 256-bit vector of [8 x i32] containing the result.
2458 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2459 _mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
/* Both operands are reinterpreted as __v8si and forwarded in the same order:
   source data first, index vector second. */
2461 return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b);
2464 /// Sets the result's 256-bit vector of [4 x double] to copies of elements of
2465 /// the 256-bit vector of [4 x double] in \a V as specified by the
2466 /// immediate value \a M.
2468 /// \code{.operation}
2469 /// FOR i := 0 TO 3
2470 /// j := i*64
2471 /// k := (M >> i*2)[1:0] * 64
2472 /// result[j+63:j] := V[k+63:k]
2473 /// ENDFOR
2474 /// \endcode
2476 /// \headerfile <immintrin.h>
2478 /// \code
2479 /// __m256d _mm256_permute4x64_pd(__m256d V, const int M);
2480 /// \endcode
2482 /// This intrinsic corresponds to the \c VPERMPD instruction.
2484 /// \param V
2485 /// A 256-bit vector of [4 x double] containing the source values.
2486 /// \param M
2487 /// An immediate 8-bit value specifying which elements to copy from \a V.
2488 /// \a M[1:0] specifies the index in \a V for element 0 of the result,
2489 /// \a M[3:2] specifies the index for element 1, and so forth.
2490 /// \returns A 256-bit vector of [4 x double] containing the result.
/* V is cast to __m256d and then to __v4df; M is cast to int before being
   passed to the builtin. */
2491 #define _mm256_permute4x64_pd(V, M) \
2492 ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M)))
2494 /// Sets the result's 256-bit vector of [8 x float] to copies of elements of
2495 /// the 256-bit vector of [8 x float] in \a __a as specified by indexes in
2496 /// the elements of the 256-bit vector of [8 x i32] in \a __b.
2498 /// \code{.operation}
2499 /// FOR i := 0 TO 7
2500 /// j := i*32
2501 /// k := __b[j+2:j] * 32
2502 /// result[j+31:j] := __a[k+31:k]
2503 /// ENDFOR
2504 /// \endcode
2506 /// \headerfile <immintrin.h>
2508 /// This intrinsic corresponds to the \c VPERMPS instruction.
2510 /// \param __a
2511 /// A 256-bit vector of [8 x float] containing the source values.
2512 /// \param __b
2513 /// A 256-bit vector of [8 x i32] containing indexes of values to use from
2514 /// \a __a.
2515 /// \returns A 256-bit vector of [8 x float] containing the result.
2516 static __inline__ __m256 __DEFAULT_FN_ATTRS256
2517 _mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
/* The data operand __a is viewed as __v8sf and the index operand __b as
   __v8si; they are forwarded in that order. */
2519 return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b);
2522 /// Sets the result's 256-bit vector of [4 x i64] to copies of elements
2523 /// of the 256-bit vector of [4 x i64] in \a V as specified by the
2524 /// immediate value \a M.
2526 /// \code{.operation}
2527 /// FOR i := 0 TO 3
2528 /// j := i*64
2529 /// k := (M >> i*2)[1:0] * 64
2530 /// result[j+63:j] := V[k+63:k]
2531 /// ENDFOR
2532 /// \endcode
2534 /// \headerfile <immintrin.h>
2536 /// \code
2537 /// __m256i _mm256_permute4x64_epi64(__m256i V, const int M);
2538 /// \endcode
2540 /// This intrinsic corresponds to the \c VPERMQ instruction.
2542 /// \param V
2543 /// A 256-bit vector of [4 x i64] containing the source values.
2544 /// \param M
2545 /// An immediate 8-bit value specifying which elements to copy from \a V.
2546 /// \a M[1:0] specifies the index in \a V for element 0 of the result,
2547 /// \a M[3:2] specifies the index for element 1, and so forth.
2548 /// \returns A 256-bit vector of [4 x i64] containing the result.
/* V is cast to __m256i and then to __v4di; M is cast to int before being
   passed to the builtin. */
2549 #define _mm256_permute4x64_epi64(V, M) \
2550 ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M)))
2552 /// Sets each half of the 256-bit result either to zero or to one of the
2553 /// four possible 128-bit halves of the 256-bit vectors \a V1 and \a V2,
2554 /// as specified by the immediate value \a M.
2556 /// \code{.operation}
2557 /// FOR i := 0 TO 1
2558 /// j := i*128
2559 /// k := M >> (i*4)
2560 /// IF k[3] == 0
2561 /// CASE (k[1:0]) OF
2562 /// 0: result[127+j:j] := V1[127:0]
2563 /// 1: result[127+j:j] := V1[255:128]
2564 /// 2: result[127+j:j] := V2[127:0]
2565 /// 3: result[127+j:j] := V2[255:128]
2566 /// ESAC
2567 /// ELSE
2568 /// result[127+j:j] := 0
2569 /// FI
2570 /// ENDFOR
2571 /// \endcode
2573 /// \headerfile <immintrin.h>
2575 /// \code
2576 /// __m256i _mm256_permute2x128_si256(__m256i V1, __m256i V2, const int M);
2577 /// \endcode
2579 /// This intrinsic corresponds to the \c VPERM2I128 instruction.
2581 /// \param V1
2582 /// A 256-bit integer vector containing source values.
2583 /// \param V2
2584 /// A 256-bit integer vector containing source values.
2585 /// \param M
2586 /// An immediate value specifying how to form the result. Bits [3:0]
2587 /// control the lower half of the result, bits [7:4] control the upper half.
2588 /// Within each 4-bit control value, if bit 3 is 1, the result is zero,
2589 /// otherwise bits [1:0] determine the source as follows. \n
2590 /// 0: the lower half of \a V1 \n
2591 /// 1: the upper half of \a V1 \n
2592 /// 2: the lower half of \a V2 \n
2593 /// 3: the upper half of \a V2
2594 /// \returns A 256-bit integer vector containing the result.
/* Both sources are passed as 256-bit integer vectors; M is passed as int. */
#define _mm256_permute2x128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_permti256( \
      (__m256i)(V1), \
      (__m256i)(V2), \
      (int)(M)))
2598 /// Extracts half of the 256-bit vector \a V to the 128-bit result. If bit 0
///    of the immediate \a M is zero, extracts the lower half of \a V;
2600 /// otherwise, extracts the upper half.
2602 /// \headerfile <immintrin.h>
2604 /// \code
2605 /// __m128i _mm256_extracti128_si256(__m256i V, const int M);
2606 /// \endcode
2608 /// This intrinsic corresponds to the \c VEXTRACTI128 instruction.
2610 /// \param V
2611 /// A 256-bit integer vector containing the source values.
2612 /// \param M
2613 /// An immediate value specifying which half of \a V to extract.
2614 /// \returns A 128-bit integer vector containing the result.
/* V is viewed as [4 x i64]; the builtin's result is returned as __m128i. */
#define _mm256_extracti128_si256(V, M) \
  ((__m128i)__builtin_ia32_extract128i256( \
      (__v4di)(__m256i)(V), \
      (int)(M)))
2618 /// Copies the 256-bit vector \a V1 to the result, then overwrites half of the
2619 /// result with the 128-bit vector \a V2. If bit 0 of the immediate \a M
2620 /// is zero, overwrites the lower half of the result; otherwise,
2621 /// overwrites the upper half.
2623 /// \headerfile <immintrin.h>
2625 /// \code
2626 /// __m256i _mm256_inserti128_si256(__m256i V1, __m128i V2, const int M);
2627 /// \endcode
2629 /// This intrinsic corresponds to the \c VINSERTI128 instruction.
2631 /// \param V1
2632 /// A 256-bit integer vector containing a source value.
2633 /// \param V2
2634 /// A 128-bit integer vector containing a source value.
2635 /// \param M
2636 /// An immediate value specifying where to put \a V2 in the result.
2637 /// \returns A 256-bit integer vector containing the result.
/* V1 is viewed as [4 x i64] and V2 as [2 x i64]; M is passed as int. */
#define _mm256_inserti128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_insert128i256( \
      (__v4di)(__m256i)(V1), \
      (__v2di)(__m128i)(V2), \
      (int)(M)))
2642 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2643 _mm256_maskload_epi32(int const *__X, __m256i __M)
2645 return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M);
2648 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2649 _mm256_maskload_epi64(long long const *__X, __m256i __M)
2651 return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M);
2654 static __inline__ __m128i __DEFAULT_FN_ATTRS128
2655 _mm_maskload_epi32(int const *__X, __m128i __M)
2657 return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M);
2660 static __inline__ __m128i __DEFAULT_FN_ATTRS128
2661 _mm_maskload_epi64(long long const *__X, __m128i __M)
2663 return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M);
2666 static __inline__ void __DEFAULT_FN_ATTRS256
2667 _mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
2669 __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
2672 static __inline__ void __DEFAULT_FN_ATTRS256
2673 _mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
2675 __builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
2678 static __inline__ void __DEFAULT_FN_ATTRS128
2679 _mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
2681 __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
2684 static __inline__ void __DEFAULT_FN_ATTRS128
2685 _mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
2687 __builtin_ia32_maskstoreq(( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
2690 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
2691 /// left by the number of bits given in the corresponding element of the
2692 /// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
2693 /// returns the result. If the shift count for any element is greater than
2694 /// 31, the result for that element is zero.
2696 /// \headerfile <immintrin.h>
2698 /// This intrinsic corresponds to the \c VPSLLVD instruction.
2700 /// \param __X
2701 /// A 256-bit vector of [8 x i32] to be shifted.
2702 /// \param __Y
2703 /// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
2704 /// bits).
2705 /// \returns A 256-bit vector of [8 x i32] containing the result.
2706 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2707 _mm256_sllv_epi32(__m256i __X, __m256i __Y)
2709 return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
2712 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
2713 /// left by the number of bits given in the corresponding element of the
2714 /// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
2715 /// returns the result. If the shift count for any element is greater than
2716 /// 31, the result for that element is zero.
2718 /// \headerfile <immintrin.h>
2720 /// This intrinsic corresponds to the \c VPSLLVD instruction.
2722 /// \param __X
2723 /// A 128-bit vector of [4 x i32] to be shifted.
2724 /// \param __Y
2725 /// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
2726 /// bits).
2727 /// \returns A 128-bit vector of [4 x i32] containing the result.
2728 static __inline__ __m128i __DEFAULT_FN_ATTRS128
2729 _mm_sllv_epi32(__m128i __X, __m128i __Y)
2731 return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
2734 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
2735 /// left by the number of bits given in the corresponding element of the
///    256-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
2737 /// returns the result. If the shift count for any element is greater than
2738 /// 63, the result for that element is zero.
2740 /// \headerfile <immintrin.h>
2742 /// This intrinsic corresponds to the \c VPSLLVQ instruction.
2744 /// \param __X
2745 /// A 256-bit vector of [4 x i64] to be shifted.
2746 /// \param __Y
2747 /// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
2748 /// bits).
2749 /// \returns A 256-bit vector of [4 x i64] containing the result.
2750 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2751 _mm256_sllv_epi64(__m256i __X, __m256i __Y)
2753 return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y);
2756 /// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
2757 /// left by the number of bits given in the corresponding element of the
2758 /// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
2759 /// returns the result. If the shift count for any element is greater than
2760 /// 63, the result for that element is zero.
2762 /// \headerfile <immintrin.h>
2764 /// This intrinsic corresponds to the \c VPSLLVQ instruction.
2766 /// \param __X
2767 /// A 128-bit vector of [2 x i64] to be shifted.
2768 /// \param __Y
2769 /// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
2770 /// bits).
2771 /// \returns A 128-bit vector of [2 x i64] containing the result.
2772 static __inline__ __m128i __DEFAULT_FN_ATTRS128
2773 _mm_sllv_epi64(__m128i __X, __m128i __Y)
2775 return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y);
2778 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
2779 /// right by the number of bits given in the corresponding element of the
2780 /// 256-bit vector of [8 x i32] in \a __Y, shifting in sign bits, and
2781 /// returns the result. If the shift count for any element is greater than
2782 /// 31, the result for that element is 0 or -1 according to the sign bit
2783 /// for that element.
2785 /// \headerfile <immintrin.h>
2787 /// This intrinsic corresponds to the \c VPSRAVD instruction.
2789 /// \param __X
2790 /// A 256-bit vector of [8 x i32] to be shifted.
2791 /// \param __Y
2792 /// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
2793 /// bits).
2794 /// \returns A 256-bit vector of [8 x i32] containing the result.
2795 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2796 _mm256_srav_epi32(__m256i __X, __m256i __Y)
2798 return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
2801 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
2802 /// right by the number of bits given in the corresponding element of the
2803 /// 128-bit vector of [4 x i32] in \a __Y, shifting in sign bits, and
2804 /// returns the result. If the shift count for any element is greater than
2805 /// 31, the result for that element is 0 or -1 according to the sign bit
2806 /// for that element.
2808 /// \headerfile <immintrin.h>
2810 /// This intrinsic corresponds to the \c VPSRAVD instruction.
2812 /// \param __X
2813 /// A 128-bit vector of [4 x i32] to be shifted.
2814 /// \param __Y
2815 /// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
2816 /// bits).
2817 /// \returns A 128-bit vector of [4 x i32] containing the result.
2818 static __inline__ __m128i __DEFAULT_FN_ATTRS128
2819 _mm_srav_epi32(__m128i __X, __m128i __Y)
2821 return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
2824 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
2825 /// right by the number of bits given in the corresponding element of the
2826 /// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
2827 /// returns the result. If the shift count for any element is greater than
2828 /// 31, the result for that element is zero.
2830 /// \headerfile <immintrin.h>
2832 /// This intrinsic corresponds to the \c VPSRLVD instruction.
2834 /// \param __X
2835 /// A 256-bit vector of [8 x i32] to be shifted.
2836 /// \param __Y
2837 /// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
2838 /// bits).
2839 /// \returns A 256-bit vector of [8 x i32] containing the result.
2840 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2841 _mm256_srlv_epi32(__m256i __X, __m256i __Y)
2843 return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
2846 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
2847 /// right by the number of bits given in the corresponding element of the
2848 /// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
2849 /// returns the result. If the shift count for any element is greater than
2850 /// 31, the result for that element is zero.
2852 /// \headerfile <immintrin.h>
2854 /// This intrinsic corresponds to the \c VPSRLVD instruction.
2856 /// \param __X
2857 /// A 128-bit vector of [4 x i32] to be shifted.
2858 /// \param __Y
2859 /// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
2860 /// bits).
2861 /// \returns A 128-bit vector of [4 x i32] containing the result.
2862 static __inline__ __m128i __DEFAULT_FN_ATTRS128
2863 _mm_srlv_epi32(__m128i __X, __m128i __Y)
2865 return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
2868 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
2869 /// right by the number of bits given in the corresponding element of the
///    256-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
2871 /// returns the result. If the shift count for any element is greater than
2872 /// 63, the result for that element is zero.
2874 /// \headerfile <immintrin.h>
2876 /// This intrinsic corresponds to the \c VPSRLVQ instruction.
2878 /// \param __X
2879 /// A 256-bit vector of [4 x i64] to be shifted.
2880 /// \param __Y
2881 /// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
2882 /// bits).
2883 /// \returns A 256-bit vector of [4 x i64] containing the result.
2884 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2885 _mm256_srlv_epi64(__m256i __X, __m256i __Y)
2887 return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y);
2890 /// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
2891 /// right by the number of bits given in the corresponding element of the
2892 /// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
2893 /// returns the result. If the shift count for any element is greater than
2894 /// 63, the result for that element is zero.
2896 /// \headerfile <immintrin.h>
2898 /// This intrinsic corresponds to the \c VPSRLVQ instruction.
2900 /// \param __X
2901 /// A 128-bit vector of [2 x i64] to be shifted.
2902 /// \param __Y
2903 /// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
2904 /// bits).
2905 /// \returns A 128-bit vector of [2 x i64] containing the result.
2906 static __inline__ __m128i __DEFAULT_FN_ATTRS128
2907 _mm_srlv_epi64(__m128i __X, __m128i __Y)
2909 return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);
2912 /// Conditionally gathers two 64-bit floating-point values, either from the
2913 /// 128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
2914 /// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
2915 /// of [2 x double] in \a mask determines the source for each element.
2917 /// \code{.operation}
2918 /// FOR element := 0 to 1
2919 /// j := element*64
2920 /// k := element*32
2921 /// IF mask[j+63] == 0
2922 /// result[j+63:j] := a[j+63:j]
2923 /// ELSE
2924 /// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
2925 /// FI
2926 /// ENDFOR
2927 /// \endcode
2929 /// \headerfile <immintrin.h>
2931 /// \code
2932 /// __m128d _mm_mask_i32gather_pd(__m128d a, const double *m, __m128i i,
2933 /// __m128d mask, const int s);
2934 /// \endcode
2936 /// This intrinsic corresponds to the \c VGATHERDPD instruction.
2938 /// \param a
2939 /// A 128-bit vector of [2 x double] used as the source when a mask bit is
2940 /// zero.
2941 /// \param m
2942 /// A pointer to the memory used for loading values.
2943 /// \param i
2944 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
2945 /// the first two elements are used.
2946 /// \param mask
2947 /// A 128-bit vector of [2 x double] containing the mask. The most
2948 /// significant bit of each element in the mask vector represents the mask
2949 /// bits. If a mask bit is zero, the corresponding value from vector \a a
2950 /// is gathered; otherwise the value is loaded from memory.
2951 /// \param s
2952 /// A literal constant scale factor for the indexes in \a i. Must be
2953 /// 1, 2, 4, or 8.
2954 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
/* a and mask are [2 x double] operands, so both go through __m128d before
 * being viewed as __v2df; only the index vector i is an integer vector
 * ([4 x i32]). m is read as a pointer to const double and s is forwarded
 * unchanged to the builtin. */
#define _mm_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128d)(a), \
                                      (double const *)(m), \
                                      (__v4si)(__m128i)(i), \
                                      (__v2df)(__m128d)(mask), (s)))
2961 /// Conditionally gathers four 64-bit floating-point values, either from the
2962 /// 256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
2963 /// indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
2964 /// of [4 x double] in \a mask determines the source for each element.
2966 /// \code{.operation}
2967 /// FOR element := 0 to 3
2968 /// j := element*64
2969 /// k := element*32
2970 /// IF mask[j+63] == 0
2971 /// result[j+63:j] := a[j+63:j]
2972 /// ELSE
2973 /// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
2974 /// FI
2975 /// ENDFOR
2976 /// \endcode
2978 /// \headerfile <immintrin.h>
2980 /// \code
2981 /// __m256d _mm256_mask_i32gather_pd(__m256d a, const double *m, __m128i i,
2982 /// __m256d mask, const int s);
2983 /// \endcode
2985 /// This intrinsic corresponds to the \c VGATHERDPD instruction.
2987 /// \param a
2988 /// A 256-bit vector of [4 x double] used as the source when a mask bit is
2989 /// zero.
2990 /// \param m
2991 /// A pointer to the memory used for loading values.
2992 /// \param i
2993 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
2994 /// \param mask
2995 /// A 256-bit vector of [4 x double] containing the mask. The most
2996 /// significant bit of each element in the mask vector represents the mask
2997 /// bits. If a mask bit is zero, the corresponding value from vector \a a
2998 /// is gathered; otherwise the value is loaded from memory.
2999 /// \param s
3000 /// A literal constant scale factor for the indexes in \a i. Must be
3001 /// 1, 2, 4, or 8.
3002 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
/* a and mask are viewed as [4 x double], i as [4 x i32]; s is forwarded
 * unchanged to the builtin. */
#define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m256d)__builtin_ia32_gatherd_pd256( \
      (__v4df)(__m256d)(a), (double const *)(m), (__v4si)(__m128i)(i), \
      (__v4df)(__m256d)(mask), (s)))
3009 /// Conditionally gathers two 64-bit floating-point values, either from the
3010 /// 128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
3011 /// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
3012 /// of [2 x double] in \a mask determines the source for each element.
3014 /// \code{.operation}
3015 /// FOR element := 0 to 1
3016 /// j := element*64
3017 /// k := element*64
3018 /// IF mask[j+63] == 0
3019 /// result[j+63:j] := a[j+63:j]
3020 /// ELSE
3021 /// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
3022 /// FI
3023 /// ENDFOR
3024 /// \endcode
3026 /// \headerfile <immintrin.h>
3028 /// \code
3029 /// __m128d _mm_mask_i64gather_pd(__m128d a, const double *m, __m128i i,
3030 /// __m128d mask, const int s);
3031 /// \endcode
3033 /// This intrinsic corresponds to the \c VGATHERQPD instruction.
3035 /// \param a
3036 /// A 128-bit vector of [2 x double] used as the source when a mask bit is
3037 /// zero.
3038 /// \param m
3039 /// A pointer to the memory used for loading values.
3040 /// \param i
3041 /// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
3042 /// \param mask
3043 /// A 128-bit vector of [2 x double] containing the mask. The most
3044 /// significant bit of each element in the mask vector represents the mask
3045 /// bits. If a mask bit is zero, the corresponding value from vector \a a
3046 /// is gathered; otherwise the value is loaded from memory.
3047 /// \param s
3048 /// A literal constant scale factor for the indexes in \a i. Must be
3049 /// 1, 2, 4, or 8.
3050 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
/* a and mask are viewed as [2 x double], i as [2 x i64]; s is forwarded
 * unchanged to the builtin. */
#define _mm_mask_i64gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherq_pd( \
      (__v2df)(__m128d)(a), (double const *)(m), (__v2di)(__m128i)(i), \
      (__v2df)(__m128d)(mask), (s)))
3057 /// Conditionally gathers four 64-bit floating-point values, either from the
3058 /// 256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
3059 /// indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
3060 /// of [4 x double] in \a mask determines the source for each element.
3062 /// \code{.operation}
3063 /// FOR element := 0 to 3
3064 /// j := element*64
3065 /// k := element*64
3066 /// IF mask[j+63] == 0
3067 /// result[j+63:j] := a[j+63:j]
3068 /// ELSE
3069 /// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
3070 /// FI
3071 /// ENDFOR
3072 /// \endcode
3074 /// \headerfile <immintrin.h>
3076 /// \code
3077 /// __m256d _mm256_mask_i64gather_pd(__m256d a, const double *m, __m256i i,
3078 /// __m256d mask, const int s);
3079 /// \endcode
3081 /// This intrinsic corresponds to the \c VGATHERQPD instruction.
3083 /// \param a
3084 /// A 256-bit vector of [4 x double] used as the source when a mask bit is
3085 /// zero.
3086 /// \param m
3087 /// A pointer to the memory used for loading values.
3088 /// \param i
3089 /// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
3090 /// \param mask
3091 /// A 256-bit vector of [4 x double] containing the mask. The most
3092 /// significant bit of each element in the mask vector represents the mask
3093 /// bits. If a mask bit is zero, the corresponding value from vector \a a
3094 /// is gathered; otherwise the value is loaded from memory.
3095 /// \param s
3096 /// A literal constant scale factor for the indexes in \a i. Must be
3097 /// 1, 2, 4, or 8.
3098 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
/* a and mask are viewed as [4 x double], i as [4 x i64]; s is forwarded
 * unchanged to the builtin. */
#define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
  ((__m256d)__builtin_ia32_gatherq_pd256( \
      (__v4df)(__m256d)(a), (double const *)(m), (__v4di)(__m256i)(i), \
      (__v4df)(__m256d)(mask), (s)))
3105 /// Conditionally gathers four 32-bit floating-point values, either from the
3106 /// 128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
3107 /// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
3108 /// of [4 x float] in \a mask determines the source for each element.
3110 /// \code{.operation}
3111 /// FOR element := 0 to 3
3112 /// j := element*32
3113 /// k := element*32
3114 /// IF mask[j+31] == 0
3115 /// result[j+31:j] := a[j+31:j]
3116 /// ELSE
3117 /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
3118 /// FI
3119 /// ENDFOR
3120 /// \endcode
3122 /// \headerfile <immintrin.h>
3124 /// \code
3125 /// __m128 _mm_mask_i32gather_ps(__m128 a, const float *m, __m128i i,
3126 /// __m128 mask, const int s);
3127 /// \endcode
3129 /// This intrinsic corresponds to the \c VGATHERDPS instruction.
3131 /// \param a
3132 /// A 128-bit vector of [4 x float] used as the source when a mask bit is
3133 /// zero.
3134 /// \param m
3135 /// A pointer to the memory used for loading values.
3136 /// \param i
3137 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
3138 /// \param mask
3139 /// A 128-bit vector of [4 x float] containing the mask. The most
3140 /// significant bit of each element in the mask vector represents the mask
3141 /// bits. If a mask bit is zero, the corresponding value from vector \a a
3142 /// is gathered; otherwise the value is loaded from memory.
3143 /// \param s
3144 /// A literal constant scale factor for the indexes in \a i. Must be
3145 /// 1, 2, 4, or 8.
3146 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* a and mask are viewed as [4 x float], i as [4 x i32]; s is forwarded
 * unchanged to the builtin. */
#define _mm_mask_i32gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherd_ps( \
      (__v4sf)(__m128)(a), (float const *)(m), (__v4si)(__m128i)(i), \
      (__v4sf)(__m128)(mask), (s)))
3153 /// Conditionally gathers eight 32-bit floating-point values, either from the
3154 /// 256-bit vector of [8 x float] in \a a, or from memory \a m using scaled
3155 /// indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
3156 /// of [8 x float] in \a mask determines the source for each element.
3158 /// \code{.operation}
3159 /// FOR element := 0 to 7
3160 /// j := element*32
3161 /// k := element*32
3162 /// IF mask[j+31] == 0
3163 /// result[j+31:j] := a[j+31:j]
3164 /// ELSE
3165 /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
3166 /// FI
3167 /// ENDFOR
3168 /// \endcode
3170 /// \headerfile <immintrin.h>
3172 /// \code
3173 /// __m256 _mm256_mask_i32gather_ps(__m256 a, const float *m, __m256i i,
3174 /// __m256 mask, const int s);
3175 /// \endcode
3177 /// This intrinsic corresponds to the \c VGATHERDPS instruction.
3179 /// \param a
3180 /// A 256-bit vector of [8 x float] used as the source when a mask bit is
3181 /// zero.
3182 /// \param m
3183 /// A pointer to the memory used for loading values.
3184 /// \param i
3185 /// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
3186 /// \param mask
3187 /// A 256-bit vector of [8 x float] containing the mask. The most
3188 /// significant bit of each element in the mask vector represents the mask
3189 /// bits. If a mask bit is zero, the corresponding value from vector \a a
3190 /// is gathered; otherwise the value is loaded from memory.
3191 /// \param s
3192 /// A literal constant scale factor for the indexes in \a i. Must be
3193 /// 1, 2, 4, or 8.
3194 /// \returns A 256-bit vector of [8 x float] containing the gathered values.
/* a and mask are viewed as [8 x float], i as [8 x i32]; s is forwarded
 * unchanged to the builtin. */
#define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
  ((__m256)__builtin_ia32_gatherd_ps256( \
      (__v8sf)(__m256)(a), (float const *)(m), (__v8si)(__m256i)(i), \
      (__v8sf)(__m256)(mask), (s)))
3201 /// Conditionally gathers two 32-bit floating-point values, either from the
3202 /// 128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
3203 /// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
3204 /// of [4 x float] in \a mask determines the source for the lower two
3205 /// elements. The upper two elements of the result are zeroed.
3207 /// \code{.operation}
3208 /// FOR element := 0 to 1
3209 /// j := element*32
3210 /// k := element*64
3211 /// IF mask[j+31] == 0
3212 /// result[j+31:j] := a[j+31:j]
3213 /// ELSE
3214 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
3215 /// FI
3216 /// ENDFOR
3217 /// result[127:64] := 0
3218 /// \endcode
3220 /// \headerfile <immintrin.h>
3222 /// \code
3223 /// __m128 _mm_mask_i64gather_ps(__m128 a, const float *m, __m128i i,
3224 /// __m128 mask, const int s);
3225 /// \endcode
3227 /// This intrinsic corresponds to the \c VGATHERQPS instruction.
3229 /// \param a
3230 /// A 128-bit vector of [4 x float] used as the source when a mask bit is
3231 /// zero. Only the first two elements are used.
3232 /// \param m
3233 /// A pointer to the memory used for loading values.
3234 /// \param i
3235 /// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
3236 /// \param mask
3237 /// A 128-bit vector of [4 x float] containing the mask. The most
3238 /// significant bit of each element in the mask vector represents the mask
3239 /// bits. If a mask bit is zero, the corresponding value from vector \a a
3240 /// is gathered; otherwise the value is loaded from memory. Only the first
3241 /// two elements are used.
3242 /// \param s
3243 /// A literal constant scale factor for the indexes in \a i. Must be
3244 /// 1, 2, 4, or 8.
3245 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* a and mask are viewed as [4 x float], i as [2 x i64]; s is forwarded
 * unchanged to the builtin. */
#define _mm_mask_i64gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherq_ps( \
      (__v4sf)(__m128)(a), (float const *)(m), (__v2di)(__m128i)(i), \
      (__v4sf)(__m128)(mask), (s)))
3252 /// Conditionally gathers four 32-bit floating-point values, either from the
3253 /// 128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
3254 /// indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
3255 /// of [4 x float] in \a mask determines the source for each element.
3257 /// \code{.operation}
3258 /// FOR element := 0 to 3
3259 /// j := element*32
3260 /// k := element*64
3261 /// IF mask[j+31] == 0
3262 /// result[j+31:j] := a[j+31:j]
3263 /// ELSE
3264 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
3265 /// FI
3266 /// ENDFOR
3267 /// \endcode
3269 /// \headerfile <immintrin.h>
3271 /// \code
3272 /// __m128 _mm256_mask_i64gather_ps(__m128 a, const float *m, __m256i i,
3273 /// __m128 mask, const int s);
3274 /// \endcode
3276 /// This intrinsic corresponds to the \c VGATHERQPS instruction.
3278 /// \param a
3279 /// A 128-bit vector of [4 x float] used as the source when a mask bit is
3280 /// zero.
3281 /// \param m
3282 /// A pointer to the memory used for loading values.
3283 /// \param i
3284 /// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
3285 /// \param mask
3286 /// A 128-bit vector of [4 x float] containing the mask. The most
3287 /// significant bit of each element in the mask vector represents the mask
3288 /// bits. If a mask bit is zero, the corresponding value from vector \a a
3289 /// is gathered; otherwise the value is loaded from memory.
3290 /// \param s
3291 /// A literal constant scale factor for the indexes in \a i. Must be
3292 /// 1, 2, 4, or 8.
3293 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* a and mask are viewed as [4 x float], i as [4 x i64]; s is forwarded
 * unchanged to the builtin. */
#define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherq_ps256( \
      (__v4sf)(__m128)(a), (float const *)(m), (__v4di)(__m256i)(i), \
      (__v4sf)(__m128)(mask), (s)))
3300 /// Conditionally gathers four 32-bit integer values, either from the
3301 /// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
3302 /// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
3303 /// of [4 x i32] in \a mask determines the source for each element.
3305 /// \code{.operation}
3306 /// FOR element := 0 to 3
3307 /// j := element*32
3308 /// k := element*32
3309 /// IF mask[j+31] == 0
3310 /// result[j+31:j] := a[j+31:j]
3311 /// ELSE
3312 /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
3313 /// FI
3314 /// ENDFOR
3315 /// \endcode
3317 /// \headerfile <immintrin.h>
3319 /// \code
3320 /// __m128i _mm_mask_i32gather_epi32(__m128i a, const int *m, __m128i i,
3321 /// __m128i mask, const int s);
3322 /// \endcode
3324 /// This intrinsic corresponds to the \c VPGATHERDD instruction.
3326 /// \param a
3327 /// A 128-bit vector of [4 x i32] used as the source when a mask bit is
3328 /// zero.
3329 /// \param m
3330 /// A pointer to the memory used for loading values.
3331 /// \param i
3332 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
3333 /// \param mask
3334 /// A 128-bit vector of [4 x i32] containing the mask. The most significant
3335 /// bit of each element in the mask vector represents the mask bits. If a
3336 /// mask bit is zero, the corresponding value from vector \a a is gathered;
3337 /// otherwise the value is loaded from memory.
3338 /// \param s
3339 /// A literal constant scale factor for the indexes in \a i. Must be
3340 /// 1, 2, 4, or 8.
3341 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* a, i and mask are all viewed as [4 x i32]; s is forwarded unchanged to
 * the builtin. */
#define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherd_d( \
      (__v4si)(__m128i)(a), (int const *)(m), (__v4si)(__m128i)(i), \
      (__v4si)(__m128i)(mask), (s)))
3348 /// Conditionally gathers eight 32-bit integer values, either from the
3349 /// 256-bit vector of [8 x i32] in \a a, or from memory \a m using scaled
3350 /// indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
3351 /// of [8 x i32] in \a mask determines the source for each element.
3353 /// \code{.operation}
3354 /// FOR element := 0 to 7
3355 /// j := element*32
3356 /// k := element*32
3357 /// IF mask[j+31] == 0
3358 /// result[j+31:j] := a[j+31:j]
3359 /// ELSE
3360 /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
3361 /// FI
3362 /// ENDFOR
3363 /// \endcode
3365 /// \headerfile <immintrin.h>
3367 /// \code
3368 /// __m256i _mm256_mask_i32gather_epi32(__m256i a, const int *m, __m256i i,
3369 /// __m256i mask, const int s);
3370 /// \endcode
3372 /// This intrinsic corresponds to the \c VPGATHERDD instruction.
3374 /// \param a
3375 /// A 256-bit vector of [8 x i32] used as the source when a mask bit is
3376 /// zero.
3377 /// \param m
3378 /// A pointer to the memory used for loading values.
3379 /// \param i
3380 /// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
3381 /// \param mask
3382 /// A 256-bit vector of [8 x i32] containing the mask. The most significant
3383 /// bit of each element in the mask vector represents the mask bits. If a
3384 /// mask bit is zero, the corresponding value from vector \a a is gathered;
3385 /// otherwise the value is loaded from memory.
3386 /// \param s
3387 /// A literal constant scale factor for the indexes in \a i. Must be
3388 /// 1, 2, 4, or 8.
3389 /// \returns A 256-bit vector of [8 x i32] containing the gathered values.
#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherd_d256( \
      /* src   */ (__v8si)(__m256i)(a), \
      /* base  */ (const int *)(m), \
      /* index */ (__v8si)(__m256i)(i), \
      /* mask  */ (__v8si)(__m256i)(mask), \
      /* scale */ (s)))
3396 /// Conditionally gathers two 32-bit integer values, either from the
3397 /// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
3398 /// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
3399 /// of [4 x i32] in \a mask determines the source for the lower two
3400 /// elements. The upper two elements of the result are zeroed.
3402 /// \code{.operation}
3403 /// FOR element := 0 to 1
3404 /// j := element*32
3405 /// k := element*64
3406 /// IF mask[j+31] == 0
3407 /// result[j+31:j] := a[j+31:j]
3408 /// ELSE
3409 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
3410 /// FI
3411 /// ENDFOR
3412 /// result[127:64] := 0
3413 /// \endcode
3415 /// \headerfile <immintrin.h>
3417 /// \code
3418 /// __m128i _mm_mask_i64gather_epi32(__m128i a, const int *m, __m128i i,
3419 /// __m128i mask, const int s);
3420 /// \endcode
3422 /// This intrinsic corresponds to the \c VPGATHERQD instruction.
3424 /// \param a
3425 /// A 128-bit vector of [4 x i32] used as the source when a mask bit is
3426 /// zero. Only the first two elements are used.
3427 /// \param m
3428 /// A pointer to the memory used for loading values.
3429 /// \param i
3430 /// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
3431 /// \param mask
3432 /// A 128-bit vector of [4 x i32] containing the mask. The most significant
3433 /// bit of each element in the mask vector represents the mask bits. If a
3434 /// mask bit is zero, the corresponding value from vector \a a is gathered;
3435 /// otherwise the value is loaded from memory. Only the first two elements
3436 /// are used.
3437 /// \param s
3438 /// A literal constant scale factor for the indexes in \a i. Must be
3439 /// 1, 2, 4, or 8.
3440 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_d( \
      /* src   */ (__v4si)(__m128i)(a), \
      /* base  */ (const int *)(m), \
      /* index */ (__v2di)(__m128i)(i), \
      /* mask  */ (__v4si)(__m128i)(mask), \
      /* scale */ (s)))
3447 /// Conditionally gathers four 32-bit integer values, either from the
3448 /// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
3449 /// indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
3450 /// of [4 x i32] in \a mask determines the source for each element.
3452 /// \code{.operation}
3453 /// FOR element := 0 to 3
3454 /// j := element*32
3455 /// k := element*64
3456 /// IF mask[j+31] == 0
3457 /// result[j+31:j] := a[j+31:j]
3458 /// ELSE
3459 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
3460 /// FI
3461 /// ENDFOR
3462 /// \endcode
3464 /// \headerfile <immintrin.h>
3466 /// \code
3467 /// __m128i _mm256_mask_i64gather_epi32(__m128i a, const int *m, __m256i i,
3468 /// __m128i mask, const int s);
3469 /// \endcode
3471 /// This intrinsic corresponds to the \c VPGATHERQD instruction.
3473 /// \param a
3474 /// A 128-bit vector of [4 x i32] used as the source when a mask bit is
3475 /// zero.
3476 /// \param m
3477 /// A pointer to the memory used for loading values.
3478 /// \param i
3479 /// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
3480 /// \param mask
3481 /// A 128-bit vector of [4 x i32] containing the mask. The most significant
3482 /// bit of each element in the mask vector represents the mask bits. If a
3483 /// mask bit is zero, the corresponding value from vector \a a is gathered;
3484 /// otherwise the value is loaded from memory.
3485 /// \param s
3486 /// A literal constant scale factor for the indexes in \a i. Must be
3487 /// 1, 2, 4, or 8.
3488 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_d256( \
      /* src   */ (__v4si)(__m128i)(a), \
      /* base  */ (const int *)(m), \
      /* index */ (__v4di)(__m256i)(i), \
      /* mask  */ (__v4si)(__m128i)(mask), \
      /* scale */ (s)))
3495 /// Conditionally gathers two 64-bit integer values, either from the
3496 /// 128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
3497 /// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
3498 /// of [2 x i64] in \a mask determines the source for each element.
3500 /// \code{.operation}
3501 /// FOR element := 0 to 1
3502 /// j := element*64
3503 /// k := element*32
3504 /// IF mask[j+63] == 0
3505 /// result[j+63:j] := a[j+63:j]
3506 /// ELSE
3507 /// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
3508 /// FI
3509 /// ENDFOR
3510 /// \endcode
3512 /// \headerfile <immintrin.h>
3514 /// \code
3515 /// __m128i _mm_mask_i32gather_epi64(__m128i a, const long long *m, __m128i i,
3516 /// __m128i mask, const int s);
3517 /// \endcode
3519 /// This intrinsic corresponds to the \c VPGATHERDQ instruction.
3521 /// \param a
3522 /// A 128-bit vector of [2 x i64] used as the source when a mask bit is
3523 /// zero.
3524 /// \param m
3525 /// A pointer to the memory used for loading values.
3526 /// \param i
3527 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
3528 /// the first two elements are used.
3529 /// \param mask
3530 /// A 128-bit vector of [2 x i64] containing the mask. The most significant
3531 /// bit of each element in the mask vector represents the mask bits. If a
3532 /// mask bit is zero, the corresponding value from vector \a a is gathered;
3533 /// otherwise the value is loaded from memory.
3534 /// \param s
3535 /// A literal constant scale factor for the indexes in \a i. Must be
3536 /// 1, 2, 4, or 8.
3537 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
#define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherd_q( \
      /* src   */ (__v2di)(__m128i)(a), \
      /* base  */ (const long long *)(m), \
      /* index */ (__v4si)(__m128i)(i), \
      /* mask  */ (__v2di)(__m128i)(mask), \
      /* scale */ (s)))
3544 /// Conditionally gathers four 64-bit integer values, either from the
3545 /// 256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
3546 /// indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
3547 /// of [4 x i64] in \a mask determines the source for each element.
3549 /// \code{.operation}
3550 /// FOR element := 0 to 3
3551 /// j := element*64
3552 /// k := element*32
3553 /// IF mask[j+63] == 0
3554 /// result[j+63:j] := a[j+63:j]
3555 /// ELSE
3556 /// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
3557 /// FI
3558 /// ENDFOR
3559 /// \endcode
3561 /// \headerfile <immintrin.h>
3563 /// \code
3564 /// __m256i _mm256_mask_i32gather_epi64(__m256i a, const long long *m,
3565 /// __m128i i, __m256i mask, const int s);
3566 /// \endcode
3568 /// This intrinsic corresponds to the \c VPGATHERDQ instruction.
3570 /// \param a
3571 /// A 256-bit vector of [4 x i64] used as the source when a mask bit is
3572 /// zero.
3573 /// \param m
3574 /// A pointer to the memory used for loading values.
3575 /// \param i
3576 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
3577 /// \param mask
3578 /// A 256-bit vector of [4 x i64] containing the mask. The most significant
3579 /// bit of each element in the mask vector represents the mask bits. If a
3580 /// mask bit is zero, the corresponding value from vector \a a is gathered;
3581 /// otherwise the value is loaded from memory.
3582 /// \param s
3583 /// A literal constant scale factor for the indexes in \a i. Must be
3584 /// 1, 2, 4, or 8.
3585 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherd_q256( \
      /* src   */ (__v4di)(__m256i)(a), \
      /* base  */ (const long long *)(m), \
      /* index */ (__v4si)(__m128i)(i), \
      /* mask  */ (__v4di)(__m256i)(mask), \
      /* scale */ (s)))
3592 /// Conditionally gathers two 64-bit integer values, either from the
3593 /// 128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
3594 /// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
3595 /// of [2 x i64] in \a mask determines the source for each element.
3597 /// \code{.operation}
3598 /// FOR element := 0 to 1
3599 /// j := element*64
3600 /// k := element*64
3601 /// IF mask[j+63] == 0
3602 /// result[j+63:j] := a[j+63:j]
3603 /// ELSE
3604 /// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
3605 /// FI
3606 /// ENDFOR
3607 /// \endcode
3609 /// \headerfile <immintrin.h>
3611 /// \code
3612 /// __m128i _mm_mask_i64gather_epi64(__m128i a, const long long *m, __m128i i,
3613 /// __m128i mask, const int s);
3614 /// \endcode
3616 /// This intrinsic corresponds to the \c VPGATHERQQ instruction.
3618 /// \param a
3619 /// A 128-bit vector of [2 x i64] used as the source when a mask bit is
3620 /// zero.
3621 /// \param m
3622 /// A pointer to the memory used for loading values.
3623 /// \param i
3624 /// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
3625 /// \param mask
3626 /// A 128-bit vector of [2 x i64] containing the mask. The most significant
3627 /// bit of each element in the mask vector represents the mask bits. If a
3628 /// mask bit is zero, the corresponding value from vector \a a is gathered;
3629 /// otherwise the value is loaded from memory.
3630 /// \param s
3631 /// A literal constant scale factor for the indexes in \a i. Must be
3632 /// 1, 2, 4, or 8.
3633 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
#define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_q( \
      /* src   */ (__v2di)(__m128i)(a), \
      /* base  */ (const long long *)(m), \
      /* index */ (__v2di)(__m128i)(i), \
      /* mask  */ (__v2di)(__m128i)(mask), \
      /* scale */ (s)))
3640 /// Conditionally gathers four 64-bit integer values, either from the
3641 /// 256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
3642 /// indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
3643 /// of [4 x i64] in \a mask determines the source for each element.
3645 /// \code{.operation}
3646 /// FOR element := 0 to 3
3647 /// j := element*64
3648 /// k := element*64
3649 /// IF mask[j+63] == 0
3650 /// result[j+63:j] := a[j+63:j]
3651 /// ELSE
3652 /// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
3653 /// FI
3654 /// ENDFOR
3655 /// \endcode
3657 /// \headerfile <immintrin.h>
3659 /// \code
3660 /// __m256i _mm256_mask_i64gather_epi64(__m256i a, const long long *m,
3661 /// __m256i i, __m256i mask, const int s);
3662 /// \endcode
3664 /// This intrinsic corresponds to the \c VPGATHERQQ instruction.
3666 /// \param a
3667 /// A 256-bit vector of [4 x i64] used as the source when a mask bit is
3668 /// zero.
3669 /// \param m
3670 /// A pointer to the memory used for loading values.
3671 /// \param i
3672 /// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
3673 /// \param mask
3674 /// A 256-bit vector of [4 x i64] containing the mask. The most significant
3675 /// bit of each element in the mask vector represents the mask bits. If a
3676 /// mask bit is zero, the corresponding value from vector \a a is gathered;
3677 /// otherwise the value is loaded from memory.
3678 /// \param s
3679 /// A literal constant scale factor for the indexes in \a i. Must be
3680 /// 1, 2, 4, or 8.
3681 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherq_q256( \
      /* src   */ (__v4di)(__m256i)(a), \
      /* base  */ (const long long *)(m), \
      /* index */ (__v4di)(__m256i)(i), \
      /* mask  */ (__v4di)(__m256i)(mask), \
      /* scale */ (s)))
3688 /// Gathers two 64-bit floating-point values from memory \a m using scaled
3689 /// indexes from the 128-bit vector of [4 x i32] in \a i.
3691 /// \code{.operation}
3692 /// FOR element := 0 to 1
3693 /// j := element*64
3694 /// k := element*32
3695 /// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
3696 /// ENDFOR
3697 /// \endcode
3699 /// \headerfile <immintrin.h>
3701 /// \code
3702 /// __m128d _mm_i32gather_pd(const double *m, __m128i i, const int s);
3703 /// \endcode
3705 /// This intrinsic corresponds to the \c VGATHERDPD instruction.
3707 /// \param m
3708 /// A pointer to the memory used for loading values.
3709 /// \param i
3710 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
3711 /// the first two elements are used.
3712 /// \param s
3713 /// A literal constant scale factor for the indexes in \a i. Must be
3714 /// 1, 2, 4, or 8.
3715 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
#define _mm_i32gather_pd(m, i, s) \
  ((__m128d)__builtin_ia32_gatherd_pd( \
      /* src   */ (__v2df)_mm_undefined_pd(), \
      /* base  */ (const double *)(m), \
      /* index */ (__v4si)(__m128i)(i), \
      /* mask (all lanes) */ \
      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), _mm_setzero_pd()), \
      /* scale */ (s)))
3724 /// Gathers four 64-bit floating-point values from memory \a m using scaled
3725 /// indexes from the 128-bit vector of [4 x i32] in \a i.
3727 /// \code{.operation}
3728 /// FOR element := 0 to 3
3729 /// j := element*64
3730 /// k := element*32
3731 /// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
3732 /// ENDFOR
3733 /// \endcode
3735 /// \headerfile <immintrin.h>
3737 /// \code
3738 /// __m256d _mm256_i32gather_pd(const double *m, __m128i i, const int s);
3739 /// \endcode
3741 /// This intrinsic corresponds to the \c VGATHERDPD instruction.
3743 /// \param m
3744 /// A pointer to the memory used for loading values.
3745 /// \param i
3746 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
3747 /// \param s
3748 /// A literal constant scale factor for the indexes in \a i. Must be
3749 /// 1, 2, 4, or 8.
3750 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
#define _mm256_i32gather_pd(m, i, s) \
  ((__m256d)__builtin_ia32_gatherd_pd256( \
      /* src   */ (__v4df)_mm256_undefined_pd(), \
      /* base  */ (const double *)(m), \
      /* index */ (__v4si)(__m128i)(i), \
      /* mask (all lanes) */ \
      (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), \
                            _CMP_EQ_OQ), \
      /* scale */ (s)))
3760 /// Gathers two 64-bit floating-point values from memory \a m using scaled
3761 /// indexes from the 128-bit vector of [2 x i64] in \a i.
3763 /// \code{.operation}
3764 /// FOR element := 0 to 1
3765 /// j := element*64
3766 /// k := element*64
3767 /// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
3768 /// ENDFOR
3769 /// \endcode
3771 /// \headerfile <immintrin.h>
3773 /// \code
3774 /// __m128d _mm_i64gather_pd(const double *m, __m128i i, const int s);
3775 /// \endcode
3777 /// This intrinsic corresponds to the \c VGATHERQPD instruction.
3779 /// \param m
3780 /// A pointer to the memory used for loading values.
3781 /// \param i
3782 /// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
3783 /// \param s
3784 /// A literal constant scale factor for the indexes in \a i. Must be
3785 /// 1, 2, 4, or 8.
3786 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
#define _mm_i64gather_pd(m, i, s) \
  ((__m128d)__builtin_ia32_gatherq_pd( \
      /* src   */ (__v2df)_mm_undefined_pd(), \
      /* base  */ (const double *)(m), \
      /* index */ (__v2di)(__m128i)(i), \
      /* mask (all lanes) */ \
      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), _mm_setzero_pd()), \
      /* scale */ (s)))
3795 /// Gathers four 64-bit floating-point values from memory \a m using scaled
3796 /// indexes from the 256-bit vector of [4 x i64] in \a i.
3798 /// \code{.operation}
3799 /// FOR element := 0 to 3
3800 /// j := element*64
3801 /// k := element*64
3802 /// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
3803 /// ENDFOR
3804 /// \endcode
3806 /// \headerfile <immintrin.h>
3808 /// \code
3809 /// __m256d _mm256_i64gather_pd(const double *m, __m256i i, const int s);
3810 /// \endcode
3812 /// This intrinsic corresponds to the \c VGATHERQPD instruction.
3814 /// \param m
3815 /// A pointer to the memory used for loading values.
3816 /// \param i
3817 /// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
3818 /// \param s
3819 /// A literal constant scale factor for the indexes in \a i. Must be
3820 /// 1, 2, 4, or 8.
3821 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
#define _mm256_i64gather_pd(m, i, s) \
  ((__m256d)__builtin_ia32_gatherq_pd256( \
      /* src   */ (__v4df)_mm256_undefined_pd(), \
      /* base  */ (const double *)(m), \
      /* index */ (__v4di)(__m256i)(i), \
      /* mask (all lanes) */ \
      (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), \
                            _CMP_EQ_OQ), \
      /* scale */ (s)))
3831 /// Gathers four 32-bit floating-point values from memory \a m using scaled
3832 /// indexes from the 128-bit vector of [4 x i32] in \a i.
3834 /// \code{.operation}
3835 /// FOR element := 0 to 3
3836 /// j := element*32
3837 /// k := element*32
3838 /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
3839 /// ENDFOR
3840 /// \endcode
3842 /// \headerfile <immintrin.h>
3844 /// \code
3845 /// __m128 _mm_i32gather_ps(const float *m, __m128i i, const int s);
3846 /// \endcode
3848 /// This intrinsic corresponds to the \c VGATHERDPS instruction.
3850 /// \param m
3851 /// A pointer to the memory used for loading values.
3852 /// \param i
3853 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
3854 /// \param s
3855 /// A literal constant scale factor for the indexes in \a i. Must be
3856 /// 1, 2, 4, or 8.
3857 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm_i32gather_ps(m, i, s) \
  ((__m128)__builtin_ia32_gatherd_ps( \
      /* src   */ (__v4sf)_mm_undefined_ps(), \
      /* base  */ (const float *)(m), \
      /* index */ (__v4si)(__m128i)(i), \
      /* mask (all lanes) */ \
      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()), \
      /* scale */ (s)))
3866 /// Gathers eight 32-bit floating-point values from memory \a m using scaled
3867 /// indexes from the 256-bit vector of [8 x i32] in \a i.
3869 /// \code{.operation}
3870 /// FOR element := 0 to 7
3871 /// j := element*32
3872 /// k := element*32
3873 /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
3874 /// ENDFOR
3875 /// \endcode
3877 /// \headerfile <immintrin.h>
3879 /// \code
3880 /// __m256 _mm256_i32gather_ps(const float *m, __m256i i, const int s);
3881 /// \endcode
3883 /// This intrinsic corresponds to the \c VGATHERDPS instruction.
3885 /// \param m
3886 /// A pointer to the memory used for loading values.
3887 /// \param i
3888 /// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
3889 /// \param s
3890 /// A literal constant scale factor for the indexes in \a i. Must be
3891 /// 1, 2, 4, or 8.
3892 /// \returns A 256-bit vector of [8 x float] containing the gathered values.
#define _mm256_i32gather_ps(m, i, s) \
  ((__m256)__builtin_ia32_gatherd_ps256( \
      /* src   */ (__v8sf)_mm256_undefined_ps(), \
      /* base  */ (const float *)(m), \
      /* index */ (__v8si)(__m256i)(i), \
      /* mask (all lanes) */ \
      (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), \
                            _CMP_EQ_OQ), \
      /* scale */ (s)))
3902 /// Gathers two 32-bit floating-point values from memory \a m using scaled
3903 /// indexes from the 128-bit vector of [2 x i64] in \a i. The upper two
3904 /// elements of the result are zeroed.
3906 /// \code{.operation}
3907 /// FOR element := 0 to 1
3908 /// j := element*32
3909 /// k := element*64
3910 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
3911 /// ENDFOR
3912 /// result[127:64] := 0
3913 /// \endcode
3915 /// \headerfile <immintrin.h>
3917 /// \code
3918 /// __m128 _mm_i64gather_ps(const float *m, __m128i i, const int s);
3919 /// \endcode
3921 /// This intrinsic corresponds to the \c VGATHERQPS instruction.
3923 /// \param m
3924 /// A pointer to the memory used for loading values.
3925 /// \param i
3926 /// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
3927 /// \param s
3928 /// A literal constant scale factor for the indexes in \a i. Must be
3929 /// 1, 2, 4, or 8.
3930 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm_i64gather_ps(m, i, s) \
  ((__m128)__builtin_ia32_gatherq_ps( \
      /* src   */ (__v4sf)_mm_undefined_ps(), \
      /* base  */ (const float *)(m), \
      /* index */ (__v2di)(__m128i)(i), \
      /* mask (all lanes) */ \
      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()), \
      /* scale */ (s)))
3939 /// Gathers four 32-bit floating-point values from memory \a m using scaled
3940 /// indexes from the 256-bit vector of [4 x i64] in \a i.
3942 /// \code{.operation}
3943 /// FOR element := 0 to 3
3944 /// j := element*32
3945 /// k := element*64
3946 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
3947 /// ENDFOR
3948 /// \endcode
3950 /// \headerfile <immintrin.h>
3952 /// \code
3953 /// __m128 _mm256_i64gather_ps(const float *m, __m256i i, const int s);
3954 /// \endcode
3956 /// This intrinsic corresponds to the \c VGATHERQPS instruction.
3958 /// \param m
3959 /// A pointer to the memory used for loading values.
3960 /// \param i
3961 /// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
3962 /// \param s
3963 /// A literal constant scale factor for the indexes in \a i. Must be
3964 /// 1, 2, 4, or 8.
3965 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm256_i64gather_ps(m, i, s) \
  ((__m128)__builtin_ia32_gatherq_ps256( \
      /* src   */ (__v4sf)_mm_undefined_ps(), \
      /* base  */ (const float *)(m), \
      /* index */ (__v4di)(__m256i)(i), \
      /* mask (all lanes) */ \
      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()), \
      /* scale */ (s)))
3974 /// Gathers four 32-bit integer values from memory \a m using scaled
3975 /// indexes from the 128-bit vector of [4 x i32] in \a i.
3977 /// \code{.operation}
3978 /// FOR element := 0 to 3
3979 /// j := element*32
3980 /// k := element*32
3981 /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
3982 /// ENDFOR
3983 /// \endcode
3985 /// \headerfile <immintrin.h>
3987 /// \code
3988 /// __m128i _mm_i32gather_epi32(const int *m, __m128i i, const int s);
3989 /// \endcode
3991 /// This intrinsic corresponds to the \c VPGATHERDD instruction.
3993 /// \param m
3994 /// A pointer to the memory used for loading values.
3995 /// \param i
3996 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
3997 /// \param s
3998 /// A literal constant scale factor for the indexes in \a i. Must be
3999 /// 1, 2, 4, or 8.
4000 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm_i32gather_epi32(m, i, s) \
  ((__m128i)__builtin_ia32_gatherd_d( \
      /* src   */ (__v4si)_mm_undefined_si128(), \
      /* base  */ (const int *)(m), \
      /* index */ (__v4si)(__m128i)(i), \
      /* mask (all lanes) */ (__v4si)_mm_set1_epi32(-1), \
      /* scale */ (s)))
4006 /// Gathers eight 32-bit integer values from memory \a m using scaled
4007 /// indexes from the 256-bit vector of [8 x i32] in \a i.
4009 /// \code{.operation}
4010 /// FOR element := 0 to 7
4011 /// j := element*32
4012 /// k := element*32
4013 /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4014 /// ENDFOR
4015 /// \endcode
4017 /// \headerfile <immintrin.h>
4019 /// \code
4020 /// __m256i _mm256_i32gather_epi32(const int *m, __m256i i, const int s);
4021 /// \endcode
4023 /// This intrinsic corresponds to the \c VPGATHERDD instruction.
4025 /// \param m
4026 /// A pointer to the memory used for loading values.
4027 /// \param i
4028 /// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4029 /// \param s
4030 /// A literal constant scale factor for the indexes in \a i. Must be
4031 /// 1, 2, 4, or 8.
4032 /// \returns A 256-bit vector of [8 x i32] containing the gathered values.
#define _mm256_i32gather_epi32(m, i, s) \
  ((__m256i)__builtin_ia32_gatherd_d256( \
      /* src   */ (__v8si)_mm256_undefined_si256(), \
      /* base  */ (const int *)(m), \
      /* index */ (__v8si)(__m256i)(i), \
      /* mask (all lanes) */ (__v8si)_mm256_set1_epi32(-1), \
      /* scale */ (s)))
4038 /// Gathers two 32-bit integer values from memory \a m using scaled indexes
4039 /// from the 128-bit vector of [2 x i64] in \a i. The upper two elements
4040 /// of the result are zeroed.
4042 /// \code{.operation}
4043 /// FOR element := 0 to 1
4044 /// j := element*32
4045 /// k := element*64
4046 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4047 /// ENDFOR
4048 /// result[127:64] := 0
4049 /// \endcode
4051 /// \headerfile <immintrin.h>
4053 /// \code
4054 /// __m128i _mm_i64gather_epi32(const int *m, __m128i i, const int s);
4055 /// \endcode
4057 /// This intrinsic corresponds to the \c VPGATHERQD instruction.
4059 /// \param m
4060 /// A pointer to the memory used for loading values.
4061 /// \param i
4062 /// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4063 /// \param s
4064 /// A literal constant scale factor for the indexes in \a i. Must be
4065 /// 1, 2, 4, or 8.
4066 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm_i64gather_epi32(m, i, s) \
  ((__m128i)__builtin_ia32_gatherq_d( \
      /* src   */ (__v4si)_mm_undefined_si128(), \
      /* base  */ (const int *)(m), \
      /* index */ (__v2di)(__m128i)(i), \
      /* mask (all lanes) */ (__v4si)_mm_set1_epi32(-1), \
      /* scale */ (s)))
4072 /// Gathers four 32-bit integer values from memory \a m using scaled indexes
4073 /// from the 256-bit vector of [4 x i64] in \a i.
4075 /// \code{.operation}
4076 /// FOR element := 0 to 3
4077 /// j := element*32
4078 /// k := element*64
4079 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4080 /// ENDFOR
4081 /// \endcode
4083 /// \headerfile <immintrin.h>
4085 /// \code
4086 /// __m128i _mm256_i64gather_epi32(const int *m, __m256i i, const int s);
4087 /// \endcode
4089 /// This intrinsic corresponds to the \c VPGATHERQD instruction.
4091 /// \param m
4092 /// A pointer to the memory used for loading values.
4093 /// \param i
4094 /// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4095 /// \param s
4096 /// A literal constant scale factor for the indexes in \a i. Must be
4097 /// 1, 2, 4, or 8.
4098 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm256_i64gather_epi32(m, i, s) \
  ((__m128i)__builtin_ia32_gatherq_d256( \
      /* src   */ (__v4si)_mm_undefined_si128(), \
      /* base  */ (const int *)(m), \
      /* index */ (__v4di)(__m256i)(i), \
      /* mask (all lanes) */ (__v4si)_mm_set1_epi32(-1), \
      /* scale */ (s)))
4104 /// Gathers two 64-bit integer values from memory \a m using scaled indexes
4105 /// from the 128-bit vector of [4 x i32] in \a i.
4107 /// \code{.operation}
4108 /// FOR element := 0 to 1
4109 /// j := element*64
4110 /// k := element*32
4111 /// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4112 /// ENDFOR
4113 /// \endcode
4115 /// \headerfile <immintrin.h>
4117 /// \code
4118 /// __m128i _mm_i32gather_epi64(const long long *m, __m128i i, const int s);
4119 /// \endcode
4121 /// This intrinsic corresponds to the \c VPGATHERDQ instruction.
4123 /// \param m
4124 /// A pointer to the memory used for loading values.
4125 /// \param i
4126 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
4127 /// the first two elements are used.
4128 /// \param s
4129 /// A literal constant scale factor for the indexes in \a i. Must be
4130 /// 1, 2, 4, or 8.
4131 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
#define _mm_i32gather_epi64(m, i, s) \
  ((__m128i)__builtin_ia32_gatherd_q( \
      /* src   */ (__v2di)_mm_undefined_si128(), \
      /* base  */ (const long long *)(m), \
      /* index */ (__v4si)(__m128i)(i), \
      /* mask (all lanes) */ (__v2di)_mm_set1_epi64x(-1), \
      /* scale */ (s)))
4138 /// Gathers four 64-bit integer values from memory \a m using scaled indexes
4139 /// from the 128-bit vector of [4 x i32] in \a i.
4141 /// \code{.operation}
4142 /// FOR element := 0 to 3
4143 /// j := element*64
4144 /// k := element*32
4145 /// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4146 /// ENDFOR
4147 /// \endcode
4149 /// \headerfile <immintrin.h>
4151 /// \code
4152 /// __m256i _mm256_i32gather_epi64(const long long *m, __m128i i, const int s);
4153 /// \endcode
4155 /// This intrinsic corresponds to the \c VPGATHERDQ instruction.
4157 /// \param m
4158 /// A pointer to the memory used for loading values.
4159 /// \param i
4160 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4161 /// \param s
4162 /// A literal constant scale factor for the indexes in \a i. Must be
4163 /// 1, 2, 4, or 8.
4164 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
#define _mm256_i32gather_epi64(m, i, s) \
  ((__m256i)__builtin_ia32_gatherd_q256( \
      /* src   */ (__v4di)_mm256_undefined_si256(), \
      /* base  */ (const long long *)(m), \
      /* index */ (__v4si)(__m128i)(i), \
      /* mask (all lanes) */ (__v4di)_mm256_set1_epi64x(-1), \
      /* scale */ (s)))
4171 /// Gathers two 64-bit integer values from memory \a m using scaled indexes
4172 /// from the 128-bit vector of [2 x i64] in \a i.
4174 /// \code{.operation}
4175 /// FOR element := 0 to 1
4176 /// j := element*64
4177 /// k := element*64
4178 /// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4179 /// ENDFOR
4180 /// \endcode
4182 /// \headerfile <immintrin.h>
4184 /// \code
4185 /// __m128i _mm_i64gather_epi64(const long long *m, __m128i i, const int s);
4186 /// \endcode
4188 /// This intrinsic corresponds to the \c VPGATHERQQ instruction.
4190 /// \param m
4191 /// A pointer to the memory used for loading values.
4192 /// \param i
4193 /// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4194 /// \param s
4195 /// A literal constant scale factor for the indexes in \a i. Must be
4196 /// 1, 2, 4, or 8.
4197 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
#define _mm_i64gather_epi64(m, i, s) \
  ((__m128i)__builtin_ia32_gatherq_q( \
      /* src   */ (__v2di)_mm_undefined_si128(), \
      /* base  */ (const long long *)(m), \
      /* index */ (__v2di)(__m128i)(i), \
      /* mask (all lanes) */ (__v2di)_mm_set1_epi64x(-1), \
      /* scale */ (s)))
4204 /// Gathers four 64-bit integer values from memory \a m using scaled indexes
4205 /// from the 256-bit vector of [4 x i64] in \a i.
4207 /// \code{.operation}
4208 /// FOR element := 0 to 3
4209 /// j := element*64
4210 /// k := element*64
4211 /// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4212 /// ENDFOR
4213 /// \endcode
4215 /// \headerfile <immintrin.h>
4217 /// \code
4218 /// __m256i _mm256_i64gather_epi64(const long long *m, __m256i i, const int s);
4219 /// \endcode
4221 /// This intrinsic corresponds to the \c VPGATHERQQ instruction.
4223 /// \param m
4224 /// A pointer to the memory used for loading values.
4225 /// \param i
4226 /// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4227 /// \param s
4228 /// A literal constant scale factor for the indexes in \a i. Must be
4229 /// 1, 2, 4, or 8.
4230 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
#define _mm256_i64gather_epi64(m, i, s) \
  ((__m256i)__builtin_ia32_gatherq_q256( \
      /* src   */ (__v4di)_mm256_undefined_si256(), \
      /* base  */ (const long long *)(m), \
      /* index */ (__v4di)(__m256i)(i), \
      /* mask (all lanes) */ (__v4di)_mm256_set1_epi64x(-1), \
      /* scale */ (s)))
4237 #undef __DEFAULT_FN_ATTRS256
4238 #undef __DEFAULT_FN_ATTRS128
4240 #endif /* __AVX2INTRIN_H */