[clang-format] Fix a bug in aligning comments above PPDirective (#72791)
[llvm-project.git] / clang / lib / Headers / avx2intrin.h
blob096cae01b57d019e68a1694f5f050d302ce7964e
1 /*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 *===-----------------------------------------------------------------------===
8 */
10 #ifndef __IMMINTRIN_H
11 #error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
12 #endif
14 #ifndef __AVX2INTRIN_H
15 #define __AVX2INTRIN_H
17 /* Define the default attributes for the functions in this file. */
/* Attribute set attached to the 256-bit intrinsics defined in this header:
   forced inlining, no debug info, AVX2 target without EVEX512, and a minimum
   vector width of 256 bits. */
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__,                                            \
                 __nodebug__,                                                  \
                 __target__("avx2,no-evex512"),                                \
                 __min_vector_width__(256)))
/* Same attribute set as the 256-bit variant, but with a minimum vector width
   of 128 bits. */
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__,                                            \
                 __nodebug__,                                                  \
                 __target__("avx2,no-evex512"),                                \
                 __min_vector_width__(128)))
25 /* SSE4 Multiple Packed Sums of Absolute Difference. */
26 /// Computes sixteen sum of absolute difference (SAD) operations on sets of
27 /// four unsigned 8-bit integers from the 256-bit integer vectors \a X and
28 /// \a Y.
29 ///
30 /// Eight SAD results are computed using the lower half of the input
31 /// vectors, and another eight using the upper half. These 16-bit values
32 /// are returned in the lower and upper halves of the 256-bit result,
33 /// respectively.
34 ///
35 /// A single SAD operation selects four bytes from \a X and four bytes from
36 /// \a Y as input. It computes the differences between each \a X byte and
37 /// the corresponding \a Y byte, takes the absolute value of each
38 /// difference, and sums these four values to form one 16-bit result. The
39 /// intrinsic computes 16 of these results with different sets of input
40 /// bytes.
41 ///
42 /// For each set of eight results, the SAD operations use the same four
43 /// bytes from \a Y; the starting bit position for these four bytes is
44 /// specified by \a M[1:0] times 32. The eight operations use successive
45 /// sets of four bytes from \a X; the starting bit position for the first
46 /// set of four bytes is specified by \a M[2] times 32. These bit positions
47 /// are all relative to the 128-bit lane for each set of eight operations.
48 ///
/// \code{.operation}
/// r := 0
/// FOR i := 0 TO 1
///   j := i*3
///   Ybase := M[j+1:j]*32 + i*128
///   Xbase := M[j+2]*32 + i*128
///   FOR k := 0 TO 7
///     temp0 := ABS(X[Xbase+7:Xbase] - Y[Ybase+7:Ybase])
///     temp1 := ABS(X[Xbase+15:Xbase+8] - Y[Ybase+15:Ybase+8])
///     temp2 := ABS(X[Xbase+23:Xbase+16] - Y[Ybase+23:Ybase+16])
///     temp3 := ABS(X[Xbase+31:Xbase+24] - Y[Ybase+31:Ybase+24])
///     result[r+15:r] := temp0 + temp1 + temp2 + temp3
///     Xbase := Xbase + 8
///     r := r + 16
///   ENDFOR
/// ENDFOR
/// \endcode
66 ///
67 /// \headerfile <immintrin.h>
68 ///
69 /// \code
70 /// __m256i _mm256_mpsadbw_epu8(__m256i X, __m256i Y, const int M);
71 /// \endcode
72 ///
73 /// This intrinsic corresponds to the \c VMPSADBW instruction.
74 ///
75 /// \param X
76 /// A 256-bit integer vector containing one of the inputs.
77 /// \param Y
78 /// A 256-bit integer vector containing one of the inputs.
79 /// \param M
80 /// An unsigned immediate value specifying the starting positions of the
81 /// bytes to operate on.
82 /// \returns A 256-bit vector of [16 x i16] containing the result.
/* Implemented as a macro rather than a function; M is documented above as an
   immediate operand and is forwarded to the builtin as an int. */
#define _mm256_mpsadbw_epu8(X, Y, M)                                           \
  ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X),                   \
                                      (__v32qi)(__m256i)(Y),                   \
                                      (int)(M)))
87 /// Computes the absolute value of each signed byte in the 256-bit integer
88 /// vector \a __a and returns each value in the corresponding byte of
89 /// the result.
90 ///
91 /// \headerfile <immintrin.h>
92 ///
93 /// This intrinsic corresponds to the \c VPABSB instruction.
94 ///
95 /// \param __a
96 /// A 256-bit integer vector.
97 /// \returns A 256-bit integer vector containing the result.
98 static __inline__ __m256i __DEFAULT_FN_ATTRS256
99 _mm256_abs_epi8(__m256i __a)
101 return (__m256i)__builtin_elementwise_abs((__v32qs)__a);
104 /// Computes the absolute value of each signed 16-bit element in the 256-bit
105 /// vector of [16 x i16] in \a __a and returns each value in the
106 /// corresponding element of the result.
108 /// \headerfile <immintrin.h>
110 /// This intrinsic corresponds to the \c VPABSW instruction.
112 /// \param __a
113 /// A 256-bit vector of [16 x i16].
114 /// \returns A 256-bit vector of [16 x i16] containing the result.
115 static __inline__ __m256i __DEFAULT_FN_ATTRS256
116 _mm256_abs_epi16(__m256i __a)
118 return (__m256i)__builtin_elementwise_abs((__v16hi)__a);
121 /// Computes the absolute value of each signed 32-bit element in the 256-bit
122 /// vector of [8 x i32] in \a __a and returns each value in the
123 /// corresponding element of the result.
125 /// \headerfile <immintrin.h>
127 /// This intrinsic corresponds to the \c VPABSD instruction.
129 /// \param __a
130 /// A 256-bit vector of [8 x i32].
131 /// \returns A 256-bit vector of [8 x i32] containing the result.
132 static __inline__ __m256i __DEFAULT_FN_ATTRS256
133 _mm256_abs_epi32(__m256i __a)
135 return (__m256i)__builtin_elementwise_abs((__v8si)__a);
138 /// Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit
139 /// integers using signed saturation, and returns the 256-bit result.
141 /// \code{.operation}
142 /// FOR i := 0 TO 7
143 /// j := i*16
144 /// k := i*8
145 /// result[7+k:k] := SATURATE8(__a[15+j:j])
146 /// result[71+k:64+k] := SATURATE8(__b[15+j:j])
147 /// result[135+k:128+k] := SATURATE8(__a[143+j:128+j])
148 /// result[199+k:192+k] := SATURATE8(__b[143+j:128+j])
149 /// ENDFOR
150 /// \endcode
152 /// \headerfile <immintrin.h>
154 /// This intrinsic corresponds to the \c VPACKSSWB instruction.
156 /// \param __a
157 /// A 256-bit vector of [16 x i16] used to generate result[63:0] and
158 /// result[191:128].
159 /// \param __b
160 /// A 256-bit vector of [16 x i16] used to generate result[127:64] and
161 /// result[255:192].
162 /// \returns A 256-bit integer vector containing the result.
163 static __inline__ __m256i __DEFAULT_FN_ATTRS256
164 _mm256_packs_epi16(__m256i __a, __m256i __b)
166 return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
169 /// Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit
170 /// integers using signed saturation, and returns the resulting 256-bit
171 /// vector of [16 x i16].
173 /// \code{.operation}
174 /// FOR i := 0 TO 3
175 /// j := i*32
176 /// k := i*16
177 /// result[15+k:k] := SATURATE16(__a[31+j:j])
178 /// result[79+k:64+k] := SATURATE16(__b[31+j:j])
179 /// result[143+k:128+k] := SATURATE16(__a[159+j:128+j])
180 /// result[207+k:192+k] := SATURATE16(__b[159+j:128+j])
181 /// ENDFOR
182 /// \endcode
184 /// \headerfile <immintrin.h>
186 /// This intrinsic corresponds to the \c VPACKSSDW instruction.
188 /// \param __a
189 /// A 256-bit vector of [8 x i32] used to generate result[63:0] and
190 /// result[191:128].
191 /// \param __b
192 /// A 256-bit vector of [8 x i32] used to generate result[127:64] and
193 /// result[255:192].
194 /// \returns A 256-bit vector of [16 x i16] containing the result.
195 static __inline__ __m256i __DEFAULT_FN_ATTRS256
196 _mm256_packs_epi32(__m256i __a, __m256i __b)
198 return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
201 /// Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers
202 /// using unsigned saturation, and returns the 256-bit result.
204 /// \code{.operation}
205 /// FOR i := 0 TO 7
206 /// j := i*16
207 /// k := i*8
208 /// result[7+k:k] := SATURATE8U(__a[15+j:j])
209 /// result[71+k:64+k] := SATURATE8U(__b[15+j:j])
210 /// result[135+k:128+k] := SATURATE8U(__a[143+j:128+j])
211 /// result[199+k:192+k] := SATURATE8U(__b[143+j:128+j])
212 /// ENDFOR
213 /// \endcode
215 /// \headerfile <immintrin.h>
217 /// This intrinsic corresponds to the \c VPACKUSWB instruction.
219 /// \param __a
220 /// A 256-bit vector of [16 x i16] used to generate result[63:0] and
221 /// result[191:128].
222 /// \param __b
223 /// A 256-bit vector of [16 x i16] used to generate result[127:64] and
224 /// result[255:192].
225 /// \returns A 256-bit integer vector containing the result.
226 static __inline__ __m256i __DEFAULT_FN_ATTRS256
227 _mm256_packus_epi16(__m256i __a, __m256i __b)
229 return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
232 /// Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers
233 /// using unsigned saturation, and returns the resulting 256-bit vector of
234 /// [16 x i16].
236 /// \code{.operation}
237 /// FOR i := 0 TO 3
238 /// j := i*32
239 /// k := i*16
240 /// result[15+k:k] := SATURATE16U(__V1[31+j:j])
241 /// result[79+k:64+k] := SATURATE16U(__V2[31+j:j])
242 /// result[143+k:128+k] := SATURATE16U(__V1[159+j:128+j])
243 /// result[207+k:192+k] := SATURATE16U(__V2[159+j:128+j])
244 /// ENDFOR
245 /// \endcode
247 /// \headerfile <immintrin.h>
249 /// This intrinsic corresponds to the \c VPACKUSDW instruction.
251 /// \param __V1
252 /// A 256-bit vector of [8 x i32] used to generate result[63:0] and
253 /// result[191:128].
254 /// \param __V2
255 /// A 256-bit vector of [8 x i32] used to generate result[127:64] and
256 /// result[255:192].
257 /// \returns A 256-bit vector of [16 x i16] containing the result.
258 static __inline__ __m256i __DEFAULT_FN_ATTRS256
259 _mm256_packus_epi32(__m256i __V1, __m256i __V2)
261 return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
264 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
265 /// vectors and returns the lower 8 bits of each sum in the corresponding
266 /// byte of the 256-bit integer vector result (overflow is ignored).
268 /// \headerfile <immintrin.h>
270 /// This intrinsic corresponds to the \c VPADDB instruction.
272 /// \param __a
273 /// A 256-bit integer vector containing one of the source operands.
274 /// \param __b
275 /// A 256-bit integer vector containing one of the source operands.
276 /// \returns A 256-bit integer vector containing the sums.
277 static __inline__ __m256i __DEFAULT_FN_ATTRS256
278 _mm256_add_epi8(__m256i __a, __m256i __b)
280 return (__m256i)((__v32qu)__a + (__v32qu)__b);
283 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
284 /// [16 x i16] and returns the lower 16 bits of each sum in the
285 /// corresponding element of the [16 x i16] result (overflow is ignored).
287 /// \headerfile <immintrin.h>
289 /// This intrinsic corresponds to the \c VPADDW instruction.
291 /// \param __a
292 /// A 256-bit vector of [16 x i16] containing one of the source operands.
293 /// \param __b
294 /// A 256-bit vector of [16 x i16] containing one of the source operands.
295 /// \returns A 256-bit vector of [16 x i16] containing the sums.
296 static __inline__ __m256i __DEFAULT_FN_ATTRS256
297 _mm256_add_epi16(__m256i __a, __m256i __b)
299 return (__m256i)((__v16hu)__a + (__v16hu)__b);
302 /// Adds 32-bit integers from corresponding elements of two 256-bit vectors of
303 /// [8 x i32] and returns the lower 32 bits of each sum in the corresponding
304 /// element of the [8 x i32] result (overflow is ignored).
306 /// \headerfile <immintrin.h>
308 /// This intrinsic corresponds to the \c VPADDD instruction.
310 /// \param __a
311 /// A 256-bit vector of [8 x i32] containing one of the source operands.
312 /// \param __b
313 /// A 256-bit vector of [8 x i32] containing one of the source operands.
314 /// \returns A 256-bit vector of [8 x i32] containing the sums.
315 static __inline__ __m256i __DEFAULT_FN_ATTRS256
316 _mm256_add_epi32(__m256i __a, __m256i __b)
318 return (__m256i)((__v8su)__a + (__v8su)__b);
321 /// Adds 64-bit integers from corresponding elements of two 256-bit vectors of
322 /// [4 x i64] and returns the lower 64 bits of each sum in the corresponding
323 /// element of the [4 x i64] result (overflow is ignored).
325 /// \headerfile <immintrin.h>
327 /// This intrinsic corresponds to the \c VPADDQ instruction.
329 /// \param __a
330 /// A 256-bit vector of [4 x i64] containing one of the source operands.
331 /// \param __b
332 /// A 256-bit vector of [4 x i64] containing one of the source operands.
333 /// \returns A 256-bit vector of [4 x i64] containing the sums.
334 static __inline__ __m256i __DEFAULT_FN_ATTRS256
335 _mm256_add_epi64(__m256i __a, __m256i __b)
337 return (__m256i)((__v4du)__a + (__v4du)__b);
340 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
341 /// vectors using signed saturation, and returns each sum in the
342 /// corresponding byte of the 256-bit integer vector result.
344 /// \headerfile <immintrin.h>
346 /// This intrinsic corresponds to the \c VPADDSB instruction.
348 /// \param __a
349 /// A 256-bit integer vector containing one of the source operands.
350 /// \param __b
351 /// A 256-bit integer vector containing one of the source operands.
352 /// \returns A 256-bit integer vector containing the sums.
353 static __inline__ __m256i __DEFAULT_FN_ATTRS256
354 _mm256_adds_epi8(__m256i __a, __m256i __b)
356 return (__m256i)__builtin_elementwise_add_sat((__v32qs)__a, (__v32qs)__b);
359 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
360 /// [16 x i16] using signed saturation, and returns the [16 x i16] result.
362 /// \headerfile <immintrin.h>
364 /// This intrinsic corresponds to the \c VPADDSW instruction.
366 /// \param __a
367 /// A 256-bit vector of [16 x i16] containing one of the source operands.
368 /// \param __b
369 /// A 256-bit vector of [16 x i16] containing one of the source operands.
370 /// \returns A 256-bit vector of [16 x i16] containing the sums.
371 static __inline__ __m256i __DEFAULT_FN_ATTRS256
372 _mm256_adds_epi16(__m256i __a, __m256i __b)
374 return (__m256i)__builtin_elementwise_add_sat((__v16hi)__a, (__v16hi)__b);
377 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
378 /// vectors using unsigned saturation, and returns each sum in the
379 /// corresponding byte of the 256-bit integer vector result.
381 /// \headerfile <immintrin.h>
383 /// This intrinsic corresponds to the \c VPADDUSB instruction.
385 /// \param __a
386 /// A 256-bit integer vector containing one of the source operands.
387 /// \param __b
388 /// A 256-bit integer vector containing one of the source operands.
389 /// \returns A 256-bit integer vector containing the sums.
390 static __inline__ __m256i __DEFAULT_FN_ATTRS256
391 _mm256_adds_epu8(__m256i __a, __m256i __b)
393 return (__m256i)__builtin_elementwise_add_sat((__v32qu)__a, (__v32qu)__b);
396 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
397 /// [16 x i16] using unsigned saturation, and returns the [16 x i16] result.
399 /// \headerfile <immintrin.h>
401 /// This intrinsic corresponds to the \c VPADDUSW instruction.
403 /// \param __a
404 /// A 256-bit vector of [16 x i16] containing one of the source operands.
405 /// \param __b
406 /// A 256-bit vector of [16 x i16] containing one of the source operands.
407 /// \returns A 256-bit vector of [16 x i16] containing the sums.
408 static __inline__ __m256i __DEFAULT_FN_ATTRS256
409 _mm256_adds_epu16(__m256i __a, __m256i __b)
411 return (__m256i)__builtin_elementwise_add_sat((__v16hu)__a, (__v16hu)__b);
414 /// Uses the lower half of the 256-bit vector \a a as the upper half of a
415 /// temporary 256-bit value, and the lower half of the 256-bit vector \a b
416 /// as the lower half of the temporary value. Right-shifts the temporary
417 /// value by \a n bytes, and uses the lower 16 bytes of the shifted value
418 /// as the lower 16 bytes of the result. Uses the upper halves of \a a and
419 /// \a b to make another temporary value, right shifts by \a n, and uses
420 /// the lower 16 bytes of the shifted value as the upper 16 bytes of the
421 /// result.
423 /// \headerfile <immintrin.h>
425 /// \code
426 /// __m256i _mm256_alignr_epi8(__m256i a, __m256i b, const int n);
427 /// \endcode
429 /// This intrinsic corresponds to the \c VPALIGNR instruction.
431 /// \param a
432 /// A 256-bit integer vector containing source values.
433 /// \param b
434 /// A 256-bit integer vector containing source values.
435 /// \param n
436 /// An immediate value specifying the number of bytes to shift.
437 /// \returns A 256-bit integer vector containing the result.
/* Implemented as a macro rather than a function; n is documented above as an
   immediate byte count and is forwarded to the builtin unchanged. */
#define _mm256_alignr_epi8(a, b, n)                                            \
  ((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a),                   \
                                      (__v32qi)(__m256i)(b),                   \
                                      (n)))
442 /// Computes the bitwise AND of the 256-bit integer vectors in \a __a and
443 /// \a __b.
445 /// \headerfile <immintrin.h>
447 /// This intrinsic corresponds to the \c VPAND instruction.
449 /// \param __a
450 /// A 256-bit integer vector.
451 /// \param __b
452 /// A 256-bit integer vector.
453 /// \returns A 256-bit integer vector containing the result.
454 static __inline__ __m256i __DEFAULT_FN_ATTRS256
455 _mm256_and_si256(__m256i __a, __m256i __b)
457 return (__m256i)((__v4du)__a & (__v4du)__b);
460 /// Computes the bitwise AND of the 256-bit integer vector in \a __b with
461 /// the bitwise NOT of the 256-bit integer vector in \a __a.
463 /// \headerfile <immintrin.h>
465 /// This intrinsic corresponds to the \c VPANDN instruction.
467 /// \param __a
468 /// A 256-bit integer vector.
469 /// \param __b
470 /// A 256-bit integer vector.
471 /// \returns A 256-bit integer vector containing the result.
472 static __inline__ __m256i __DEFAULT_FN_ATTRS256
473 _mm256_andnot_si256(__m256i __a, __m256i __b)
475 return (__m256i)(~(__v4du)__a & (__v4du)__b);
478 /// Computes the averages of the corresponding unsigned bytes in the two
479 /// 256-bit integer vectors in \a __a and \a __b and returns each
480 /// average in the corresponding byte of the 256-bit result.
482 /// \code{.operation}
483 /// FOR i := 0 TO 31
484 /// j := i*8
485 /// result[j+7:j] := (__a[j+7:j] + __b[j+7:j] + 1) >> 1
486 /// ENDFOR
487 /// \endcode
489 /// \headerfile <immintrin.h>
491 /// This intrinsic corresponds to the \c VPAVGB instruction.
493 /// \param __a
494 /// A 256-bit integer vector.
495 /// \param __b
496 /// A 256-bit integer vector.
497 /// \returns A 256-bit integer vector containing the result.
498 static __inline__ __m256i __DEFAULT_FN_ATTRS256
499 _mm256_avg_epu8(__m256i __a, __m256i __b)
501 return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b);
504 /// Computes the averages of the corresponding unsigned 16-bit integers in
505 /// the two 256-bit vectors of [16 x i16] in \a __a and \a __b and returns
506 /// each average in the corresponding element of the 256-bit result.
508 /// \code{.operation}
509 /// FOR i := 0 TO 15
510 /// j := i*16
511 /// result[j+15:j] := (__a[j+15:j] + __b[j+15:j] + 1) >> 1
512 /// ENDFOR
513 /// \endcode
515 /// \headerfile <immintrin.h>
517 /// This intrinsic corresponds to the \c VPAVGW instruction.
519 /// \param __a
520 /// A 256-bit vector of [16 x i16].
521 /// \param __b
522 /// A 256-bit vector of [16 x i16].
523 /// \returns A 256-bit vector of [16 x i16] containing the result.
524 static __inline__ __m256i __DEFAULT_FN_ATTRS256
525 _mm256_avg_epu16(__m256i __a, __m256i __b)
527 return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b);
530 /// Merges 8-bit integer values from either of the two 256-bit vectors
531 /// \a __V1 or \a __V2, as specified by the 256-bit mask \a __M and returns
532 /// the resulting 256-bit integer vector.
/// \code{.operation}
/// FOR i := 0 TO 31
///   j := i*8
///   IF __M[7+j] == 0
///     result[7+j:j] := __V1[7+j:j]
///   ELSE
///     result[7+j:j] := __V2[7+j:j]
///   FI
/// ENDFOR
/// \endcode
545 /// \headerfile <immintrin.h>
547 /// This intrinsic corresponds to the \c VPBLENDVB instruction.
549 /// \param __V1
550 /// A 256-bit integer vector containing source values.
551 /// \param __V2
552 /// A 256-bit integer vector containing source values.
553 /// \param __M
554 /// A 256-bit integer vector, with bit [7] of each byte specifying the
555 /// source for each corresponding byte of the result. When the mask bit
556 /// is 0, the byte is copied from \a __V1; otherwise, it is copied from
557 /// \a __V2.
558 /// \returns A 256-bit integer vector containing the result.
559 static __inline__ __m256i __DEFAULT_FN_ATTRS256
560 _mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
562 return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,
563 (__v32qi)__M);
566 /// Merges 16-bit integer values from either of the two 256-bit vectors
567 /// \a V1 or \a V2, as specified by the immediate integer operand \a M,
568 /// and returns the resulting 256-bit vector of [16 x i16].
/// \code{.operation}
/// FOR i := 0 TO 7
///   j := i*16
///   IF M[i] == 0
///     result[15+j:j] := V1[15+j:j]
///     result[143+j:128+j] := V1[143+j:128+j]
///   ELSE
///     result[15+j:j] := V2[15+j:j]
///     result[143+j:128+j] := V2[143+j:128+j]
///   FI
/// ENDFOR
/// \endcode
583 /// \headerfile <immintrin.h>
585 /// \code
586 /// __m256i _mm256_blend_epi16(__m256i V1, __m256i V2, const int M);
587 /// \endcode
589 /// This intrinsic corresponds to the \c VPBLENDW instruction.
591 /// \param V1
592 /// A 256-bit vector of [16 x i16] containing source values.
593 /// \param V2
594 /// A 256-bit vector of [16 x i16] containing source values.
595 /// \param M
596 /// An immediate 8-bit integer operand, with bits [7:0] specifying the
597 /// source for each element of the result. The position of the mask bit
598 /// corresponds to the index of a copied value. When a mask bit is 0, the
599 /// element is copied from \a V1; otherwise, it is copied from \a V2.
600 /// \a M[0] determines the source for elements 0 and 8, \a M[1] for
601 /// elements 1 and 9, and so forth.
602 /// \returns A 256-bit vector of [16 x i16] containing the result.
/* Implemented as a macro rather than a function; M is documented above as an
   immediate 8-bit mask and is forwarded to the builtin as an int. */
#define _mm256_blend_epi16(V1, V2, M)                                          \
  ((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1),                  \
                                      (__v16hi)(__m256i)(V2),                  \
                                      (int)(M)))
607 /// Compares corresponding bytes in the 256-bit integer vectors in \a __a and
608 /// \a __b for equality and returns the outcomes in the corresponding
609 /// bytes of the 256-bit result.
611 /// \code{.operation}
612 /// FOR i := 0 TO 31
613 /// j := i*8
614 /// result[j+7:j] := (__a[j+7:j] == __b[j+7:j]) ? 0xFF : 0
615 /// ENDFOR
616 /// \endcode
618 /// \headerfile <immintrin.h>
620 /// This intrinsic corresponds to the \c VPCMPEQB instruction.
622 /// \param __a
623 /// A 256-bit integer vector containing one of the inputs.
624 /// \param __b
625 /// A 256-bit integer vector containing one of the inputs.
626 /// \returns A 256-bit integer vector containing the result.
627 static __inline__ __m256i __DEFAULT_FN_ATTRS256
628 _mm256_cmpeq_epi8(__m256i __a, __m256i __b)
630 return (__m256i)((__v32qi)__a == (__v32qi)__b);
633 /// Compares corresponding elements in the 256-bit vectors of [16 x i16] in
634 /// \a __a and \a __b for equality and returns the outcomes in the
635 /// corresponding elements of the 256-bit result.
637 /// \code{.operation}
638 /// FOR i := 0 TO 15
639 /// j := i*16
640 /// result[j+15:j] := (__a[j+15:j] == __b[j+15:j]) ? 0xFFFF : 0
641 /// ENDFOR
642 /// \endcode
644 /// \headerfile <immintrin.h>
646 /// This intrinsic corresponds to the \c VPCMPEQW instruction.
648 /// \param __a
649 /// A 256-bit vector of [16 x i16] containing one of the inputs.
650 /// \param __b
651 /// A 256-bit vector of [16 x i16] containing one of the inputs.
652 /// \returns A 256-bit vector of [16 x i16] containing the result.
653 static __inline__ __m256i __DEFAULT_FN_ATTRS256
654 _mm256_cmpeq_epi16(__m256i __a, __m256i __b)
656 return (__m256i)((__v16hi)__a == (__v16hi)__b);
659 /// Compares corresponding elements in the 256-bit vectors of [8 x i32] in
660 /// \a __a and \a __b for equality and returns the outcomes in the
661 /// corresponding elements of the 256-bit result.
663 /// \code{.operation}
664 /// FOR i := 0 TO 7
665 /// j := i*32
666 /// result[j+31:j] := (__a[j+31:j] == __b[j+31:j]) ? 0xFFFFFFFF : 0
667 /// ENDFOR
668 /// \endcode
670 /// \headerfile <immintrin.h>
672 /// This intrinsic corresponds to the \c VPCMPEQD instruction.
674 /// \param __a
675 /// A 256-bit vector of [8 x i32] containing one of the inputs.
676 /// \param __b
677 /// A 256-bit vector of [8 x i32] containing one of the inputs.
678 /// \returns A 256-bit vector of [8 x i32] containing the result.
679 static __inline__ __m256i __DEFAULT_FN_ATTRS256
680 _mm256_cmpeq_epi32(__m256i __a, __m256i __b)
682 return (__m256i)((__v8si)__a == (__v8si)__b);
685 /// Compares corresponding elements in the 256-bit vectors of [4 x i64] in
686 /// \a __a and \a __b for equality and returns the outcomes in the
687 /// corresponding elements of the 256-bit result.
689 /// \code{.operation}
690 /// FOR i := 0 TO 3
691 /// j := i*64
692 /// result[j+63:j] := (__a[j+63:j] == __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
693 /// ENDFOR
694 /// \endcode
696 /// \headerfile <immintrin.h>
698 /// This intrinsic corresponds to the \c VPCMPEQQ instruction.
700 /// \param __a
701 /// A 256-bit vector of [4 x i64] containing one of the inputs.
702 /// \param __b
703 /// A 256-bit vector of [4 x i64] containing one of the inputs.
704 /// \returns A 256-bit vector of [4 x i64] containing the result.
705 static __inline__ __m256i __DEFAULT_FN_ATTRS256
706 _mm256_cmpeq_epi64(__m256i __a, __m256i __b)
708 return (__m256i)((__v4di)__a == (__v4di)__b);
711 /// Compares corresponding signed bytes in the 256-bit integer vectors in
712 /// \a __a and \a __b for greater-than and returns the outcomes in the
713 /// corresponding bytes of the 256-bit result.
715 /// \code{.operation}
716 /// FOR i := 0 TO 31
717 /// j := i*8
718 /// result[j+7:j] := (__a[j+7:j] > __b[j+7:j]) ? 0xFF : 0
719 /// ENDFOR
720 /// \endcode
722 /// \headerfile <immintrin.h>
724 /// This intrinsic corresponds to the \c VPCMPGTB instruction.
726 /// \param __a
727 /// A 256-bit integer vector containing one of the inputs.
728 /// \param __b
729 /// A 256-bit integer vector containing one of the inputs.
730 /// \returns A 256-bit integer vector containing the result.
731 static __inline__ __m256i __DEFAULT_FN_ATTRS256
732 _mm256_cmpgt_epi8(__m256i __a, __m256i __b)
734 /* This function always performs a signed comparison, but __v32qi is a char
735 which may be signed or unsigned, so use __v32qs. */
736 return (__m256i)((__v32qs)__a > (__v32qs)__b);
739 /// Compares corresponding signed elements in the 256-bit vectors of
740 /// [16 x i16] in \a __a and \a __b for greater-than and returns the
741 /// outcomes in the corresponding elements of the 256-bit result.
743 /// \code{.operation}
744 /// FOR i := 0 TO 15
745 /// j := i*16
746 /// result[j+15:j] := (__a[j+15:j] > __b[j+15:j]) ? 0xFFFF : 0
747 /// ENDFOR
748 /// \endcode
750 /// \headerfile <immintrin.h>
752 /// This intrinsic corresponds to the \c VPCMPGTW instruction.
754 /// \param __a
755 /// A 256-bit vector of [16 x i16] containing one of the inputs.
756 /// \param __b
757 /// A 256-bit vector of [16 x i16] containing one of the inputs.
758 /// \returns A 256-bit vector of [16 x i16] containing the result.
759 static __inline__ __m256i __DEFAULT_FN_ATTRS256
760 _mm256_cmpgt_epi16(__m256i __a, __m256i __b)
762 return (__m256i)((__v16hi)__a > (__v16hi)__b);
765 /// Compares corresponding signed elements in the 256-bit vectors of
766 /// [8 x i32] in \a __a and \a __b for greater-than and returns the
767 /// outcomes in the corresponding elements of the 256-bit result.
769 /// \code{.operation}
770 /// FOR i := 0 TO 7
771 /// j := i*32
772 /// result[j+31:j] := (__a[j+31:j] > __b[j+31:j]) ? 0xFFFFFFFF : 0
773 /// ENDFOR
774 /// \endcode
776 /// \headerfile <immintrin.h>
778 /// This intrinsic corresponds to the \c VPCMPGTD instruction.
780 /// \param __a
781 /// A 256-bit vector of [8 x i32] containing one of the inputs.
782 /// \param __b
783 /// A 256-bit vector of [8 x i32] containing one of the inputs.
784 /// \returns A 256-bit vector of [8 x i32] containing the result.
785 static __inline__ __m256i __DEFAULT_FN_ATTRS256
786 _mm256_cmpgt_epi32(__m256i __a, __m256i __b)
788 return (__m256i)((__v8si)__a > (__v8si)__b);
791 /// Compares corresponding signed elements in the 256-bit vectors of
792 /// [4 x i64] in \a __a and \a __b for greater-than and returns the
793 /// outcomes in the corresponding elements of the 256-bit result.
795 /// \code{.operation}
796 /// FOR i := 0 TO 3
797 /// j := i*64
798 /// result[j+63:j] := (__a[j+63:j] > __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
799 /// ENDFOR
800 /// \endcode
802 /// \headerfile <immintrin.h>
804 /// This intrinsic corresponds to the \c VPCMPGTQ instruction.
806 /// \param __a
807 /// A 256-bit vector of [4 x i64] containing one of the inputs.
808 /// \param __b
809 /// A 256-bit vector of [4 x i64] containing one of the inputs.
810 /// \returns A 256-bit vector of [4 x i64] containing the result.
811 static __inline__ __m256i __DEFAULT_FN_ATTRS256
812 _mm256_cmpgt_epi64(__m256i __a, __m256i __b)
814 return (__m256i)((__v4di)__a > (__v4di)__b);
817 /// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
818 /// vectors of [16 x i16] and returns the lower 16 bits of each sum in an
819 /// element of the [16 x i16] result (overflow is ignored). Sums from
820 /// \a __a are returned in the lower 64 bits of each 128-bit half of the
821 /// result; sums from \a __b are returned in the upper 64 bits of each
822 /// 128-bit half of the result.
824 /// \code{.operation}
825 /// FOR i := 0 TO 1
826 /// j := i*128
827 /// result[j+15:j] := __a[j+15:j] + __a[j+31:j+16]
828 /// result[j+31:j+16] := __a[j+47:j+32] + __a[j+63:j+48]
829 /// result[j+47:j+32] := __a[j+79:j+64] + __a[j+95:j+80]
830 /// result[j+63:j+48] := __a[j+111:j+96] + __a[j+127:j+112]
831 /// result[j+79:j+64] := __b[j+15:j] + __b[j+31:j+16]
832 /// result[j+95:j+80] := __b[j+47:j+32] + __b[j+63:j+48]
833 /// result[j+111:j+96] := __b[j+79:j+64] + __b[j+95:j+80]
834 /// result[j+127:j+112] := __b[j+111:j+96] + __b[j+127:j+112]
835 /// ENDFOR
836 /// \endcode
838 /// \headerfile <immintrin.h>
840 /// This intrinsic corresponds to the \c VPHADDW instruction.
842 /// \param __a
843 /// A 256-bit vector of [16 x i16] containing one of the source operands.
844 /// \param __b
845 /// A 256-bit vector of [16 x i16] containing one of the source operands.
846 /// \returns A 256-bit vector of [16 x i16] containing the sums.
847 static __inline__ __m256i __DEFAULT_FN_ATTRS256
848 _mm256_hadd_epi16(__m256i __a, __m256i __b)
850 return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
853 /// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit
854 /// vectors of [8 x i32] and returns the lower 32 bits of each sum in an
855 /// element of the [8 x i32] result (overflow is ignored). Sums from \a __a
856 /// are returned in the lower 64 bits of each 128-bit half of the result;
857 /// sums from \a __b are returned in the upper 64 bits of each 128-bit half
858 /// of the result.
860 /// \code{.operation}
861 /// FOR i := 0 TO 1
862 /// j := i*128
863 /// result[j+31:j] := __a[j+31:j] + __a[j+63:j+32]
864 /// result[j+63:j+32] := __a[j+95:j+64] + __a[j+127:j+96]
865 /// result[j+95:j+64] := __b[j+31:j] + __b[j+63:j+32]
866 /// result[j+127:j+96] := __b[j+95:j+64] + __b[j+127:j+96]
867 /// ENDFOR
868 /// \endcode
870 /// \headerfile <immintrin.h>
872 /// This intrinsic corresponds to the \c VPHADDD instruction.
874 /// \param __a
875 /// A 256-bit vector of [8 x i32] containing one of the source operands.
876 /// \param __b
877 /// A 256-bit vector of [8 x i32] containing one of the source operands.
878 /// \returns A 256-bit vector of [8 x i32] containing the sums.
879 static __inline__ __m256i __DEFAULT_FN_ATTRS256
880 _mm256_hadd_epi32(__m256i __a, __m256i __b)
882 return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
885 /// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
886 /// vectors of [16 x i16] using signed saturation and returns each sum in
887 /// an element of the [16 x i16] result. Sums from \a __a are returned in
888 /// the lower 64 bits of each 128-bit half of the result; sums from \a __b
889 /// are returned in the upper 64 bits of each 128-bit half of the result.
891 /// \code{.operation}
892 /// FOR i := 0 TO 1
893 /// j := i*128
894 /// result[j+15:j] := SATURATE16(__a[j+15:j] + __a[j+31:j+16])
895 /// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] + __a[j+63:j+48])
896 /// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] + __a[j+95:j+80])
897 /// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] + __a[j+127:j+112])
898 /// result[j+79:j+64] := SATURATE16(__b[j+15:j] + __b[j+31:j+16])
899 /// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] + __b[j+63:j+48])
900 /// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] + __b[j+95:j+80])
901 /// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] + __b[j+127:j+112])
902 /// ENDFOR
903 /// \endcode
905 /// \headerfile <immintrin.h>
907 /// This intrinsic corresponds to the \c VPHADDSW instruction.
909 /// \param __a
910 /// A 256-bit vector of [16 x i16] containing one of the source operands.
911 /// \param __b
912 /// A 256-bit vector of [16 x i16] containing one of the source operands.
913 /// \returns A 256-bit vector of [16 x i16] containing the sums.
914 static __inline__ __m256i __DEFAULT_FN_ATTRS256
915 _mm256_hadds_epi16(__m256i __a, __m256i __b)
917 return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
920 /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
921 /// vectors of [16 x i16] and returns the lower 16 bits of each difference
922 /// in an element of the [16 x i16] result (overflow is ignored).
923 /// Differences from \a __a are returned in the lower 64 bits of each
924 /// 128-bit half of the result; differences from \a __b are returned in the
925 /// upper 64 bits of each 128-bit half of the result.
927 /// \code{.operation}
928 /// FOR i := 0 TO 1
929 /// j := i*128
930 /// result[j+15:j] := __a[j+15:j] - __a[j+31:j+16]
931 /// result[j+31:j+16] := __a[j+47:j+32] - __a[j+63:j+48]
932 /// result[j+47:j+32] := __a[j+79:j+64] - __a[j+95:j+80]
933 /// result[j+63:j+48] := __a[j+111:j+96] - __a[j+127:j+112]
934 /// result[j+79:j+64] := __b[j+15:j] - __b[j+31:j+16]
935 /// result[j+95:j+80] := __b[j+47:j+32] - __b[j+63:j+48]
936 /// result[j+111:j+96] := __b[j+79:j+64] - __b[j+95:j+80]
937 /// result[j+127:j+112] := __b[j+111:j+96] - __b[j+127:j+112]
938 /// ENDFOR
939 /// \endcode
941 /// \headerfile <immintrin.h>
943 /// This intrinsic corresponds to the \c VPHSUBW instruction.
945 /// \param __a
946 /// A 256-bit vector of [16 x i16] containing one of the source operands.
947 /// \param __b
948 /// A 256-bit vector of [16 x i16] containing one of the source operands.
949 /// \returns A 256-bit vector of [16 x i16] containing the differences.
950 static __inline__ __m256i __DEFAULT_FN_ATTRS256
951 _mm256_hsub_epi16(__m256i __a, __m256i __b)
953 return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
956 /// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit
957 /// vectors of [8 x i32] and returns the lower 32 bits of each difference in
958 /// an element of the [8 x i32] result (overflow is ignored). Differences
959 /// from \a __a are returned in the lower 64 bits of each 128-bit half of
960 /// the result; differences from \a __b are returned in the upper 64 bits
961 /// of each 128-bit half of the result.
963 /// \code{.operation}
964 /// FOR i := 0 TO 1
965 /// j := i*128
966 /// result[j+31:j] := __a[j+31:j] - __a[j+63:j+32]
967 /// result[j+63:j+32] := __a[j+95:j+64] - __a[j+127:j+96]
968 /// result[j+95:j+64] := __b[j+31:j] - __b[j+63:j+32]
969 /// result[j+127:j+96] := __b[j+95:j+64] - __b[j+127:j+96]
970 /// ENDFOR
971 /// \endcode
973 /// \headerfile <immintrin.h>
975 /// This intrinsic corresponds to the \c VPHSUBD instruction.
977 /// \param __a
978 /// A 256-bit vector of [8 x i32] containing one of the source operands.
979 /// \param __b
980 /// A 256-bit vector of [8 x i32] containing one of the source operands.
981 /// \returns A 256-bit vector of [8 x i32] containing the differences.
982 static __inline__ __m256i __DEFAULT_FN_ATTRS256
983 _mm256_hsub_epi32(__m256i __a, __m256i __b)
985 return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
/// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
/// vectors of [16 x i16] using signed saturation and returns each
/// difference in an element of the [16 x i16] result. Differences from
/// \a __a are returned in the lower 64 bits of each 128-bit half of the
/// result; differences from \a __b are returned in the upper 64 bits of
/// each 128-bit half of the result.
995 /// \code{.operation}
996 /// FOR i := 0 TO 1
997 /// j := i*128
998 /// result[j+15:j] := SATURATE16(__a[j+15:j] - __a[j+31:j+16])
999 /// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] - __a[j+63:j+48])
1000 /// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] - __a[j+95:j+80])
1001 /// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] - __a[j+127:j+112])
1002 /// result[j+79:j+64] := SATURATE16(__b[j+15:j] - __b[j+31:j+16])
1003 /// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] - __b[j+63:j+48])
1004 /// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] - __b[j+95:j+80])
1005 /// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] - __b[j+127:j+112])
1006 /// ENDFOR
1007 /// \endcode
1009 /// \headerfile <immintrin.h>
1011 /// This intrinsic corresponds to the \c VPHSUBSW instruction.
1013 /// \param __a
1014 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1015 /// \param __b
1016 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1017 /// \returns A 256-bit vector of [16 x i16] containing the differences.
1018 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1019 _mm256_hsubs_epi16(__m256i __a, __m256i __b)
1021 return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
1024 /// Multiplies each unsigned byte from the 256-bit integer vector in \a __a
1025 /// with the corresponding signed byte from the 256-bit integer vector in
1026 /// \a __b, forming signed 16-bit intermediate products. Adds adjacent
1027 /// pairs of those products using signed saturation to form 16-bit sums
1028 /// returned as elements of the [16 x i16] result.
1030 /// \code{.operation}
1031 /// FOR i := 0 TO 15
1032 /// j := i*16
1033 /// temp1 := __a[j+7:j] * __b[j+7:j]
1034 /// temp2 := __a[j+15:j+8] * __b[j+15:j+8]
1035 /// result[j+15:j] := SATURATE16(temp1 + temp2)
1036 /// ENDFOR
1037 /// \endcode
1039 /// \headerfile <immintrin.h>
1041 /// This intrinsic corresponds to the \c VPMADDUBSW instruction.
1043 /// \param __a
1044 /// A 256-bit vector containing one of the source operands.
1045 /// \param __b
1046 /// A 256-bit vector containing one of the source operands.
1047 /// \returns A 256-bit vector of [16 x i16] containing the result.
1048 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1049 _mm256_maddubs_epi16(__m256i __a, __m256i __b)
1051 return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
1054 /// Multiplies corresponding 16-bit elements of two 256-bit vectors of
1055 /// [16 x i16], forming 32-bit intermediate products, and adds pairs of
1056 /// those products to form 32-bit sums returned as elements of the
1057 /// [8 x i32] result.
1059 /// There is only one wraparound case: when all four of the 16-bit sources
1060 /// are \c 0x8000, the result will be \c 0x80000000.
1062 /// \code{.operation}
1063 /// FOR i := 0 TO 7
1064 /// j := i*32
1065 /// temp1 := __a[j+15:j] * __b[j+15:j]
1066 /// temp2 := __a[j+31:j+16] * __b[j+31:j+16]
1067 /// result[j+31:j] := temp1 + temp2
1068 /// ENDFOR
1069 /// \endcode
1071 /// \headerfile <immintrin.h>
1073 /// This intrinsic corresponds to the \c VPMADDWD instruction.
1075 /// \param __a
1076 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1077 /// \param __b
1078 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1079 /// \returns A 256-bit vector of [8 x i32] containing the result.
1080 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1081 _mm256_madd_epi16(__m256i __a, __m256i __b)
1083 return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
1086 /// Compares the corresponding signed bytes in the two 256-bit integer vectors
1087 /// in \a __a and \a __b and returns the larger of each pair in the
1088 /// corresponding byte of the 256-bit result.
1090 /// \headerfile <immintrin.h>
1092 /// This intrinsic corresponds to the \c VPMAXSB instruction.
1094 /// \param __a
1095 /// A 256-bit integer vector.
1096 /// \param __b
1097 /// A 256-bit integer vector.
1098 /// \returns A 256-bit integer vector containing the result.
1099 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1100 _mm256_max_epi8(__m256i __a, __m256i __b)
1102 return (__m256i)__builtin_elementwise_max((__v32qs)__a, (__v32qs)__b);
1105 /// Compares the corresponding signed 16-bit integers in the two 256-bit
1106 /// vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1107 /// each pair in the corresponding element of the 256-bit result.
1109 /// \headerfile <immintrin.h>
1111 /// This intrinsic corresponds to the \c VPMAXSW instruction.
1113 /// \param __a
1114 /// A 256-bit vector of [16 x i16].
1115 /// \param __b
1116 /// A 256-bit vector of [16 x i16].
1117 /// \returns A 256-bit vector of [16 x i16] containing the result.
1118 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1119 _mm256_max_epi16(__m256i __a, __m256i __b)
1121 return (__m256i)__builtin_elementwise_max((__v16hi)__a, (__v16hi)__b);
1124 /// Compares the corresponding signed 32-bit integers in the two 256-bit
1125 /// vectors of [8 x i32] in \a __a and \a __b and returns the larger of
1126 /// each pair in the corresponding element of the 256-bit result.
1128 /// \headerfile <immintrin.h>
1130 /// This intrinsic corresponds to the \c VPMAXSD instruction.
1132 /// \param __a
1133 /// A 256-bit vector of [8 x i32].
1134 /// \param __b
1135 /// A 256-bit vector of [8 x i32].
1136 /// \returns A 256-bit vector of [8 x i32] containing the result.
1137 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1138 _mm256_max_epi32(__m256i __a, __m256i __b)
1140 return (__m256i)__builtin_elementwise_max((__v8si)__a, (__v8si)__b);
1143 /// Compares the corresponding unsigned bytes in the two 256-bit integer
1144 /// vectors in \a __a and \a __b and returns the larger of each pair in
1145 /// the corresponding byte of the 256-bit result.
1147 /// \headerfile <immintrin.h>
1149 /// This intrinsic corresponds to the \c VPMAXUB instruction.
1151 /// \param __a
1152 /// A 256-bit integer vector.
1153 /// \param __b
1154 /// A 256-bit integer vector.
1155 /// \returns A 256-bit integer vector containing the result.
1156 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1157 _mm256_max_epu8(__m256i __a, __m256i __b)
1159 return (__m256i)__builtin_elementwise_max((__v32qu)__a, (__v32qu)__b);
1162 /// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1163 /// vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1164 /// each pair in the corresponding element of the 256-bit result.
1166 /// \headerfile <immintrin.h>
1168 /// This intrinsic corresponds to the \c VPMAXUW instruction.
1170 /// \param __a
1171 /// A 256-bit vector of [16 x i16].
1172 /// \param __b
1173 /// A 256-bit vector of [16 x i16].
1174 /// \returns A 256-bit vector of [16 x i16] containing the result.
1175 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1176 _mm256_max_epu16(__m256i __a, __m256i __b)
1178 return (__m256i)__builtin_elementwise_max((__v16hu)__a, (__v16hu)__b);
1181 /// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1182 /// vectors of [8 x i32] in \a __a and \a __b and returns the larger of
1183 /// each pair in the corresponding element of the 256-bit result.
1185 /// \headerfile <immintrin.h>
1187 /// This intrinsic corresponds to the \c VPMAXUD instruction.
1189 /// \param __a
1190 /// A 256-bit vector of [8 x i32].
1191 /// \param __b
1192 /// A 256-bit vector of [8 x i32].
1193 /// \returns A 256-bit vector of [8 x i32] containing the result.
1194 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1195 _mm256_max_epu32(__m256i __a, __m256i __b)
1197 return (__m256i)__builtin_elementwise_max((__v8su)__a, (__v8su)__b);
1200 /// Compares the corresponding signed bytes in the two 256-bit integer vectors
1201 /// in \a __a and \a __b and returns the smaller of each pair in the
1202 /// corresponding byte of the 256-bit result.
1204 /// \headerfile <immintrin.h>
1206 /// This intrinsic corresponds to the \c VPMINSB instruction.
1208 /// \param __a
1209 /// A 256-bit integer vector.
1210 /// \param __b
1211 /// A 256-bit integer vector.
1212 /// \returns A 256-bit integer vector containing the result.
1213 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1214 _mm256_min_epi8(__m256i __a, __m256i __b)
1216 return (__m256i)__builtin_elementwise_min((__v32qs)__a, (__v32qs)__b);
1219 /// Compares the corresponding signed 16-bit integers in the two 256-bit
1220 /// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1221 /// each pair in the corresponding element of the 256-bit result.
1223 /// \headerfile <immintrin.h>
1225 /// This intrinsic corresponds to the \c VPMINSW instruction.
1227 /// \param __a
1228 /// A 256-bit vector of [16 x i16].
1229 /// \param __b
1230 /// A 256-bit vector of [16 x i16].
1231 /// \returns A 256-bit vector of [16 x i16] containing the result.
1232 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1233 _mm256_min_epi16(__m256i __a, __m256i __b)
1235 return (__m256i)__builtin_elementwise_min((__v16hi)__a, (__v16hi)__b);
1238 /// Compares the corresponding signed 32-bit integers in the two 256-bit
1239 /// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
1240 /// each pair in the corresponding element of the 256-bit result.
1242 /// \headerfile <immintrin.h>
1244 /// This intrinsic corresponds to the \c VPMINSD instruction.
1246 /// \param __a
1247 /// A 256-bit vector of [8 x i32].
1248 /// \param __b
1249 /// A 256-bit vector of [8 x i32].
1250 /// \returns A 256-bit vector of [8 x i32] containing the result.
1251 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1252 _mm256_min_epi32(__m256i __a, __m256i __b)
1254 return (__m256i)__builtin_elementwise_min((__v8si)__a, (__v8si)__b);
1257 /// Compares the corresponding unsigned bytes in the two 256-bit integer
1258 /// vectors in \a __a and \a __b and returns the smaller of each pair in
1259 /// the corresponding byte of the 256-bit result.
1261 /// \headerfile <immintrin.h>
1263 /// This intrinsic corresponds to the \c VPMINUB instruction.
1265 /// \param __a
1266 /// A 256-bit integer vector.
1267 /// \param __b
1268 /// A 256-bit integer vector.
1269 /// \returns A 256-bit integer vector containing the result.
1270 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1271 _mm256_min_epu8(__m256i __a, __m256i __b)
1273 return (__m256i)__builtin_elementwise_min((__v32qu)__a, (__v32qu)__b);
1276 /// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1277 /// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1278 /// each pair in the corresponding element of the 256-bit result.
1280 /// \headerfile <immintrin.h>
1282 /// This intrinsic corresponds to the \c VPMINUW instruction.
1284 /// \param __a
1285 /// A 256-bit vector of [16 x i16].
1286 /// \param __b
1287 /// A 256-bit vector of [16 x i16].
1288 /// \returns A 256-bit vector of [16 x i16] containing the result.
1289 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1290 _mm256_min_epu16(__m256i __a, __m256i __b)
1292 return (__m256i)__builtin_elementwise_min((__v16hu)__a, (__v16hu)__b);
1295 /// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1296 /// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
1297 /// each pair in the corresponding element of the 256-bit result.
1299 /// \headerfile <immintrin.h>
1301 /// This intrinsic corresponds to the \c VPMINUD instruction.
1303 /// \param __a
1304 /// A 256-bit vector of [8 x i32].
1305 /// \param __b
1306 /// A 256-bit vector of [8 x i32].
1307 /// \returns A 256-bit vector of [8 x i32] containing the result.
1308 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1309 _mm256_min_epu32(__m256i __a, __m256i __b)
1311 return (__m256i)__builtin_elementwise_min((__v8su)__a, (__v8su)__b);
1314 /// Creates a 32-bit integer mask from the most significant bit of each byte
1315 /// in the 256-bit integer vector in \a __a and returns the result.
1317 /// \code{.operation}
1318 /// FOR i := 0 TO 31
1319 /// j := i*8
1320 /// result[i] := __a[j+7]
1321 /// ENDFOR
1322 /// \endcode
1324 /// \headerfile <immintrin.h>
1326 /// This intrinsic corresponds to the \c VPMOVMSKB instruction.
1328 /// \param __a
1329 /// A 256-bit integer vector containing the source bytes.
1330 /// \returns The 32-bit integer mask.
1331 static __inline__ int __DEFAULT_FN_ATTRS256
1332 _mm256_movemask_epi8(__m256i __a)
1334 return __builtin_ia32_pmovmskb256((__v32qi)__a);
1337 /// Sign-extends bytes from the 128-bit integer vector in \a __V and returns
1338 /// the 16-bit values in the corresponding elements of a 256-bit vector
1339 /// of [16 x i16].
1341 /// \code{.operation}
1342 /// FOR i := 0 TO 15
1343 /// j := i*8
1344 /// k := i*16
1345 /// result[k+15:k] := SignExtend(__V[j+7:j])
1346 /// ENDFOR
1347 /// \endcode
1349 /// \headerfile <immintrin.h>
1351 /// This intrinsic corresponds to the \c VPMOVSXBW instruction.
1353 /// \param __V
1354 /// A 128-bit integer vector containing the source bytes.
1355 /// \returns A 256-bit vector of [16 x i16] containing the sign-extended
1356 /// values.
1357 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1358 _mm256_cvtepi8_epi16(__m128i __V)
1360 /* This function always performs a signed extension, but __v16qi is a char
1361 which may be signed or unsigned, so use __v16qs. */
1362 return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi);
1365 /// Sign-extends bytes from the lower half of the 128-bit integer vector in
1366 /// \a __V and returns the 32-bit values in the corresponding elements of a
1367 /// 256-bit vector of [8 x i32].
1369 /// \code{.operation}
1370 /// FOR i := 0 TO 7
1371 /// j := i*8
1372 /// k := i*32
1373 /// result[k+31:k] := SignExtend(__V[j+7:j])
1374 /// ENDFOR
1375 /// \endcode
1377 /// \headerfile <immintrin.h>
1379 /// This intrinsic corresponds to the \c VPMOVSXBD instruction.
1381 /// \param __V
1382 /// A 128-bit integer vector containing the source bytes.
1383 /// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1384 /// values.
1385 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1386 _mm256_cvtepi8_epi32(__m128i __V)
1388 /* This function always performs a signed extension, but __v16qi is a char
1389 which may be signed or unsigned, so use __v16qs. */
1390 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
1393 /// Sign-extends the first four bytes from the 128-bit integer vector in
1394 /// \a __V and returns the 64-bit values in the corresponding elements of a
1395 /// 256-bit vector of [4 x i64].
1397 /// \code{.operation}
1398 /// result[63:0] := SignExtend(__V[7:0])
1399 /// result[127:64] := SignExtend(__V[15:8])
1400 /// result[191:128] := SignExtend(__V[23:16])
1401 /// result[255:192] := SignExtend(__V[31:24])
1402 /// \endcode
1404 /// \headerfile <immintrin.h>
1406 /// This intrinsic corresponds to the \c VPMOVSXBQ instruction.
1408 /// \param __V
1409 /// A 128-bit integer vector containing the source bytes.
1410 /// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1411 /// values.
1412 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1413 _mm256_cvtepi8_epi64(__m128i __V)
1415 /* This function always performs a signed extension, but __v16qi is a char
1416 which may be signed or unsigned, so use __v16qs. */
1417 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di);
1420 /// Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1421 /// \a __V and returns the 32-bit values in the corresponding elements of a
1422 /// 256-bit vector of [8 x i32].
1424 /// \code{.operation}
1425 /// FOR i := 0 TO 7
1426 /// j := i*16
1427 /// k := i*32
1428 /// result[k+31:k] := SignExtend(__V[j+15:j])
1429 /// ENDFOR
1430 /// \endcode
1432 /// \headerfile <immintrin.h>
1434 /// This intrinsic corresponds to the \c VPMOVSXWD instruction.
1436 /// \param __V
1437 /// A 128-bit vector of [8 x i16] containing the source values.
1438 /// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1439 /// values.
1440 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1441 _mm256_cvtepi16_epi32(__m128i __V)
1443 return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si);
1446 /// Sign-extends 16-bit elements from the lower half of the 128-bit vector of
1447 /// [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1448 /// elements of a 256-bit vector of [4 x i64].
1450 /// \code{.operation}
1451 /// result[63:0] := SignExtend(__V[15:0])
1452 /// result[127:64] := SignExtend(__V[31:16])
1453 /// result[191:128] := SignExtend(__V[47:32])
/// result[255:192] := SignExtend(__V[63:48])
1455 /// \endcode
1457 /// \headerfile <immintrin.h>
1459 /// This intrinsic corresponds to the \c VPMOVSXWQ instruction.
1461 /// \param __V
1462 /// A 128-bit vector of [8 x i16] containing the source values.
1463 /// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1464 /// values.
1465 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1466 _mm256_cvtepi16_epi64(__m128i __V)
1468 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di);
1471 /// Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1472 /// \a __V and returns the 64-bit values in the corresponding elements of a
1473 /// 256-bit vector of [4 x i64].
1475 /// \code{.operation}
1476 /// result[63:0] := SignExtend(__V[31:0])
1477 /// result[127:64] := SignExtend(__V[63:32])
1478 /// result[191:128] := SignExtend(__V[95:64])
1479 /// result[255:192] := SignExtend(__V[127:96])
1480 /// \endcode
1482 /// \headerfile <immintrin.h>
1484 /// This intrinsic corresponds to the \c VPMOVSXDQ instruction.
1486 /// \param __V
1487 /// A 128-bit vector of [4 x i32] containing the source values.
1488 /// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1489 /// values.
1490 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1491 _mm256_cvtepi32_epi64(__m128i __V)
1493 return (__m256i)__builtin_convertvector((__v4si)__V, __v4di);
1496 /// Zero-extends bytes from the 128-bit integer vector in \a __V and returns
1497 /// the 16-bit values in the corresponding elements of a 256-bit vector
1498 /// of [16 x i16].
1500 /// \code{.operation}
1501 /// FOR i := 0 TO 15
1502 /// j := i*8
1503 /// k := i*16
1504 /// result[k+15:k] := ZeroExtend(__V[j+7:j])
1505 /// ENDFOR
1506 /// \endcode
1508 /// \headerfile <immintrin.h>
1510 /// This intrinsic corresponds to the \c VPMOVZXBW instruction.
1512 /// \param __V
1513 /// A 128-bit integer vector containing the source bytes.
1514 /// \returns A 256-bit vector of [16 x i16] containing the zero-extended
1515 /// values.
1516 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1517 _mm256_cvtepu8_epi16(__m128i __V)
1519 return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi);
1522 /// Zero-extends bytes from the lower half of the 128-bit integer vector in
1523 /// \a __V and returns the 32-bit values in the corresponding elements of a
1524 /// 256-bit vector of [8 x i32].
1526 /// \code{.operation}
1527 /// FOR i := 0 TO 7
1528 /// j := i*8
1529 /// k := i*32
1530 /// result[k+31:k] := ZeroExtend(__V[j+7:j])
1531 /// ENDFOR
1532 /// \endcode
1534 /// \headerfile <immintrin.h>
1536 /// This intrinsic corresponds to the \c VPMOVZXBD instruction.
1538 /// \param __V
1539 /// A 128-bit integer vector containing the source bytes.
1540 /// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1541 /// values.
1542 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1543 _mm256_cvtepu8_epi32(__m128i __V)
1545 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
1548 /// Zero-extends the first four bytes from the 128-bit integer vector in
1549 /// \a __V and returns the 64-bit values in the corresponding elements of a
1550 /// 256-bit vector of [4 x i64].
1552 /// \code{.operation}
1553 /// result[63:0] := ZeroExtend(__V[7:0])
1554 /// result[127:64] := ZeroExtend(__V[15:8])
1555 /// result[191:128] := ZeroExtend(__V[23:16])
1556 /// result[255:192] := ZeroExtend(__V[31:24])
1557 /// \endcode
1559 /// \headerfile <immintrin.h>
1561 /// This intrinsic corresponds to the \c VPMOVZXBQ instruction.
1563 /// \param __V
1564 /// A 128-bit integer vector containing the source bytes.
1565 /// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1566 /// values.
1567 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1568 _mm256_cvtepu8_epi64(__m128i __V)
1570 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di);
1573 /// Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1574 /// \a __V and returns the 32-bit values in the corresponding elements of a
1575 /// 256-bit vector of [8 x i32].
1577 /// \code{.operation}
1578 /// FOR i := 0 TO 7
1579 /// j := i*16
1580 /// k := i*32
1581 /// result[k+31:k] := ZeroExtend(__V[j+15:j])
1582 /// ENDFOR
1583 /// \endcode
1585 /// \headerfile <immintrin.h>
1587 /// This intrinsic corresponds to the \c VPMOVZXWD instruction.
1589 /// \param __V
1590 /// A 128-bit vector of [8 x i16] containing the source values.
1591 /// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1592 /// values.
1593 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1594 _mm256_cvtepu16_epi32(__m128i __V)
1596 return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si);
1599 /// Zero-extends 16-bit elements from the lower half of the 128-bit vector of
1600 /// [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1601 /// elements of a 256-bit vector of [4 x i64].
1603 /// \code{.operation}
1604 /// result[63:0] := ZeroExtend(__V[15:0])
1605 /// result[127:64] := ZeroExtend(__V[31:16])
1606 /// result[191:128] := ZeroExtend(__V[47:32])
/// result[255:192] := ZeroExtend(__V[63:48])
1608 /// \endcode
1610 /// \headerfile <immintrin.h>
/// This intrinsic corresponds to the \c VPMOVZXWQ instruction.
1614 /// \param __V
1615 /// A 128-bit vector of [8 x i16] containing the source values.
1616 /// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1617 /// values.
1618 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1619 _mm256_cvtepu16_epi64(__m128i __V)
1621 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di);
1624 /// Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1625 /// \a __V and returns the 64-bit values in the corresponding elements of a
1626 /// 256-bit vector of [4 x i64].
1628 /// \code{.operation}
1629 /// result[63:0] := ZeroExtend(__V[31:0])
1630 /// result[127:64] := ZeroExtend(__V[63:32])
1631 /// result[191:128] := ZeroExtend(__V[95:64])
1632 /// result[255:192] := ZeroExtend(__V[127:96])
1633 /// \endcode
1635 /// \headerfile <immintrin.h>
1637 /// This intrinsic corresponds to the \c VPMOVZXDQ instruction.
1639 /// \param __V
1640 /// A 128-bit vector of [4 x i32] containing the source values.
1641 /// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1642 /// values.
1643 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1644 _mm256_cvtepu32_epi64(__m128i __V)
1646 return (__m256i)__builtin_convertvector((__v4su)__V, __v4di);
1649 /// Multiplies signed 32-bit integers from even-numbered elements of two
1650 /// 256-bit vectors of [8 x i32] and returns the 64-bit products in the
1651 /// [4 x i64] result.
1653 /// \code{.operation}
1654 /// result[63:0] := __a[31:0] * __b[31:0]
1655 /// result[127:64] := __a[95:64] * __b[95:64]
1656 /// result[191:128] := __a[159:128] * __b[159:128]
1657 /// result[255:192] := __a[223:192] * __b[223:192]
1658 /// \endcode
1660 /// \headerfile <immintrin.h>
1662 /// This intrinsic corresponds to the \c VPMULDQ instruction.
1664 /// \param __a
1665 /// A 256-bit vector of [8 x i32] containing one of the source operands.
1666 /// \param __b
1667 /// A 256-bit vector of [8 x i32] containing one of the source operands.
1668 /// \returns A 256-bit vector of [4 x i64] containing the products.
1669 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1670 _mm256_mul_epi32(__m256i __a, __m256i __b)
1672 return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b);
1675 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1676 /// [16 x i16], truncates the 32-bit results to the most significant 18
1677 /// bits, rounds by adding 1, and returns bits [16:1] of each rounded
1678 /// product in the [16 x i16] result.
1680 /// \code{.operation}
1681 /// FOR i := 0 TO 15
1682 /// j := i*16
1683 /// temp := ((__a[j+15:j] * __b[j+15:j]) >> 14) + 1
/// result[j+15:j] := temp[16:1]
/// ENDFOR
/// \endcode
1687 /// \headerfile <immintrin.h>
1689 /// This intrinsic corresponds to the \c VPMULHRSW instruction.
1691 /// \param __a
1692 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1693 /// \param __b
1694 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1695 /// \returns A 256-bit vector of [16 x i16] containing the rounded products.
1696 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1697 _mm256_mulhrs_epi16(__m256i __a, __m256i __b)
1699 return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b);
1702 /// Multiplies unsigned 16-bit integer elements of two 256-bit vectors of
1703 /// [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1704 /// [16 x i16] result.
1706 /// \headerfile <immintrin.h>
1708 /// This intrinsic corresponds to the \c VPMULHUW instruction.
1710 /// \param __a
1711 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1712 /// \param __b
1713 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1714 /// \returns A 256-bit vector of [16 x i16] containing the products.
1715 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1716 _mm256_mulhi_epu16(__m256i __a, __m256i __b)
1718 return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b);
1721 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1722 /// [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1723 /// [16 x i16] result.
1725 /// \headerfile <immintrin.h>
1727 /// This intrinsic corresponds to the \c VPMULHW instruction.
1729 /// \param __a
1730 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1731 /// \param __b
1732 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1733 /// \returns A 256-bit vector of [16 x i16] containing the products.
1734 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1735 _mm256_mulhi_epi16(__m256i __a, __m256i __b)
1737 return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b);
1740 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1741 /// [16 x i16], and returns the lower 16 bits of each 32-bit product in the
1742 /// [16 x i16] result.
1744 /// \headerfile <immintrin.h>
1746 /// This intrinsic corresponds to the \c VPMULLW instruction.
1748 /// \param __a
1749 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1750 /// \param __b
1751 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1752 /// \returns A 256-bit vector of [16 x i16] containing the products.
1753 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1754 _mm256_mullo_epi16(__m256i __a, __m256i __b)
1756 return (__m256i)((__v16hu)__a * (__v16hu)__b);
1759 /// Multiplies signed 32-bit integer elements of two 256-bit vectors of
1760 /// [8 x i32], and returns the lower 32 bits of each 64-bit product in the
1761 /// [8 x i32] result.
1763 /// \headerfile <immintrin.h>
1765 /// This intrinsic corresponds to the \c VPMULLD instruction.
1767 /// \param __a
1768 /// A 256-bit vector of [8 x i32] containing one of the source operands.
1769 /// \param __b
1770 /// A 256-bit vector of [8 x i32] containing one of the source operands.
1771 /// \returns A 256-bit vector of [8 x i32] containing the products.
1772 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1773 _mm256_mullo_epi32 (__m256i __a, __m256i __b)
1775 return (__m256i)((__v8su)__a * (__v8su)__b);
1778 /// Multiplies unsigned 32-bit integers from even-numbered elements of two
1779 /// 256-bit vectors of [8 x i32] and returns the 64-bit products in the
1780 /// [4 x i64] result.
1782 /// \code{.operation}
1783 /// result[63:0] := __a[31:0] * __b[31:0]
1784 /// result[127:64] := __a[95:64] * __b[95:64]
1785 /// result[191:128] := __a[159:128] * __b[159:128]
1786 /// result[255:192] := __a[223:192] * __b[223:192]
1787 /// \endcode
1789 /// \headerfile <immintrin.h>
1791 /// This intrinsic corresponds to the \c VPMULUDQ instruction.
1793 /// \param __a
1794 /// A 256-bit vector of [8 x i32] containing one of the source operands.
1795 /// \param __b
1796 /// A 256-bit vector of [8 x i32] containing one of the source operands.
1797 /// \returns A 256-bit vector of [4 x i64] containing the products.
1798 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1799 _mm256_mul_epu32(__m256i __a, __m256i __b)
1801 return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b);
1804 /// Computes the bitwise OR of the 256-bit integer vectors in \a __a and
1805 /// \a __b.
1807 /// \headerfile <immintrin.h>
1809 /// This intrinsic corresponds to the \c VPOR instruction.
1811 /// \param __a
1812 /// A 256-bit integer vector.
1813 /// \param __b
1814 /// A 256-bit integer vector.
1815 /// \returns A 256-bit integer vector containing the result.
1816 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1817 _mm256_or_si256(__m256i __a, __m256i __b)
1819 return (__m256i)((__v4du)__a | (__v4du)__b);
1822 /// Computes four sum of absolute difference (SAD) operations on sets of eight
1823 /// unsigned 8-bit integers from the 256-bit integer vectors \a __a and
1824 /// \a __b.
1826 /// One SAD result is computed for each set of eight bytes from \a __a and
1827 /// eight bytes from \a __b. The zero-extended SAD value is returned in the
1828 /// corresponding 64-bit element of the result.
1830 /// A single SAD operation takes the differences between the corresponding
1831 /// bytes of \a __a and \a __b, takes the absolute value of each difference,
1832 /// and sums these eight values to form one 16-bit result. This operation
1833 /// is repeated four times with successive sets of eight bytes.
1835 /// \code{.operation}
1836 /// FOR i := 0 TO 3
1837 /// j := i*64
1838 /// temp0 := ABS(__a[j+7:j] - __b[j+7:j])
1839 /// temp1 := ABS(__a[j+15:j+8] - __b[j+15:j+8])
1840 /// temp2 := ABS(__a[j+23:j+16] - __b[j+23:j+16])
1841 /// temp3 := ABS(__a[j+31:j+24] - __b[j+31:j+24])
1842 /// temp4 := ABS(__a[j+39:j+32] - __b[j+39:j+32])
1843 /// temp5 := ABS(__a[j+47:j+40] - __b[j+47:j+40])
1844 /// temp6 := ABS(__a[j+55:j+48] - __b[j+55:j+48])
1845 /// temp7 := ABS(__a[j+63:j+56] - __b[j+63:j+56])
1846 /// result[j+15:j] := temp0 + temp1 + temp2 + temp3 +
1847 /// temp4 + temp5 + temp6 + temp7
1848 /// result[j+63:j+16] := 0
1849 /// ENDFOR
1850 /// \endcode
1852 /// \headerfile <immintrin.h>
1854 /// This intrinsic corresponds to the \c VPSADBW instruction.
1856 /// \param __a
1857 /// A 256-bit integer vector.
1858 /// \param __b
1859 /// A 256-bit integer vector.
1860 /// \returns A 256-bit integer vector containing the result.
1861 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1862 _mm256_sad_epu8(__m256i __a, __m256i __b)
1864 return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b);
1867 /// Shuffles 8-bit integers in the 256-bit integer vector \a __a according
1868 /// to control information in the 256-bit integer vector \a __b, and
1869 /// returns the 256-bit result. In effect there are two separate 128-bit
1870 /// shuffles in the lower and upper halves.
1872 /// \code{.operation}
1873 /// FOR i := 0 TO 31
1874 /// j := i*8
1875 /// IF __b[j+7] == 1
1876 /// result[j+7:j] := 0
1877 /// ELSE
1878 /// k := __b[j+3:j] * 8
1879 /// IF i > 15
1880 /// k := k + 128
1881 /// FI
1882 /// result[j+7:j] := __a[k+7:k]
1883 /// FI
1884 /// ENDFOR
1885 /// \endcode
1887 /// \headerfile <immintrin.h>
1889 /// This intrinsic corresponds to the \c VPSHUFB instruction.
1891 /// \param __a
1892 /// A 256-bit integer vector containing source values.
1893 /// \param __b
1894 /// A 256-bit integer vector containing control information to determine
1895 /// what goes into the corresponding byte of the result. If bit 7 of the
1896 /// control byte is 1, the result byte is 0; otherwise, bits 3:0 of the
1897 /// control byte specify the index (within the same 128-bit half) of \a __a
1898 /// to copy to the result byte.
1899 /// \returns A 256-bit integer vector containing the result.
1900 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1901 _mm256_shuffle_epi8(__m256i __a, __m256i __b)
1903 return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
1906 /// Shuffles 32-bit integers from the 256-bit vector of [8 x i32] in \a a
1907 /// according to control information in the integer literal \a imm, and
1908 /// returns the 256-bit result. In effect there are two parallel 128-bit
1909 /// shuffles in the lower and upper halves.
1911 /// \code{.operation}
1912 /// FOR i := 0 TO 3
1913 /// j := i*32
1914 /// k := (imm >> i*2)[1:0] * 32
1915 /// result[j+31:j] := a[k+31:k]
1916 /// result[128+j+31:128+j] := a[128+k+31:128+k]
1917 /// ENDFOR
1918 /// \endcode
1920 /// \headerfile <immintrin.h>
1922 /// \code
1923 /// __m256i _mm256_shuffle_epi32(__m256i a, const int imm);
1924 /// \endcode
1926 /// This intrinsic corresponds to the \c VPSHUFD instruction.
1928 /// \param a
1929 /// A 256-bit vector of [8 x i32] containing source values.
1930 /// \param imm
1931 /// An immediate 8-bit value specifying which elements to copy from \a a.
1932 /// \a imm[1:0] specifies the index in \a a for elements 0 and 4 of the
1933 /// result, \a imm[3:2] specifies the index for elements 1 and 5, and so
1934 /// forth.
1935 /// \returns A 256-bit vector of [8 x i32] containing the result.
1936 #define _mm256_shuffle_epi32(a, imm) \
1937 ((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm)))
1939 /// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a
1940 /// according to control information in the integer literal \a imm, and
1941 /// returns the 256-bit result. The upper 64 bits of each 128-bit half
1942 /// are shuffled in parallel; the lower 64 bits of each 128-bit half are
1943 /// copied from \a a unchanged.
1945 /// \code{.operation}
1946 /// result[63:0] := a[63:0]
1947 /// result[191:128] := a[191:128]
1948 /// FOR i := 0 TO 3
1949 /// j := i * 16 + 64
1950 /// k := (imm >> i*2)[1:0] * 16 + 64
1951 /// result[j+15:j] := a[k+15:k]
1952 /// result[128+j+15:128+j] := a[128+k+15:128+k]
1953 /// ENDFOR
1954 /// \endcode
1956 /// \headerfile <immintrin.h>
1958 /// \code
1959 /// __m256i _mm256_shufflehi_epi16(__m256i a, const int imm);
1960 /// \endcode
1962 /// This intrinsic corresponds to the \c VPSHUFHW instruction.
1964 /// \param a
1965 /// A 256-bit vector of [16 x i16] containing source values.
1966 /// \param imm
1967 /// An immediate 8-bit value specifying which elements to copy from \a a.
1968 /// \a imm[1:0] specifies the index in \a a for elements 4 and 12 of the
1969 /// result, \a imm[3:2] specifies the index for elements 5 and 13, and so
1970 /// forth. Indexes are offset by 4 (so 0 means index 4, and so forth).
1971 /// \returns A 256-bit vector of [16 x i16] containing the result.
1972 #define _mm256_shufflehi_epi16(a, imm) \
1973 ((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm)))
1975 /// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] \a a
1976 /// according to control information in the integer literal \a imm, and
1977 /// returns the 256-bit [16 x i16] result. The lower 64 bits of each
1978 /// 128-bit half are shuffled; the upper 64 bits of each 128-bit half are
1979 /// copied from \a a unchanged.
1981 /// \code{.operation}
1982 /// result[127:64] := a[127:64]
1983 /// result[255:192] := a[255:192]
1984 /// FOR i := 0 TO 3
1985 /// j := i * 16
1986 /// k := (imm >> i*2)[1:0] * 16
1987 /// result[j+15:j] := a[k+15:k]
1988 /// result[128+j+15:128+j] := a[128+k+15:128+k]
1989 /// ENDFOR
1990 /// \endcode
1992 /// \headerfile <immintrin.h>
1994 /// \code
1995 /// __m256i _mm256_shufflelo_epi16(__m256i a, const int imm);
1996 /// \endcode
1998 /// This intrinsic corresponds to the \c VPSHUFLW instruction.
2000 /// \param a
2001 /// A 256-bit vector of [16 x i16] to use as a source of data for the
2002 /// result.
2003 /// \param imm
2004 /// An immediate 8-bit value specifying which elements to copy from \a a.
2005 /// \a imm[1:0] specifies the index in \a a for elements 0 and 8 of the
2006 /// result, \a imm[3:2] specifies the index for elements 1 and 9, and so
2007 /// forth.
2008 /// \returns A 256-bit vector of [16 x i16] containing the result.
2009 #define _mm256_shufflelo_epi16(a, imm) \
2010 ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm)))
2012 /// Sets each byte of the result to the corresponding byte of the 256-bit
2013 /// integer vector in \a __a, the negative of that byte, or zero, depending
2014 /// on whether the corresponding byte of the 256-bit integer vector in
2015 /// \a __b is greater than zero, less than zero, or equal to zero,
2016 /// respectively.
2018 /// \headerfile <immintrin.h>
2020 /// This intrinsic corresponds to the \c VPSIGNB instruction.
2022 /// \param __a
2023 /// A 256-bit integer vector.
2024 /// \param __b
2025 /// A 256-bit integer vector.
2026 /// \returns A 256-bit integer vector containing the result.
2027 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2028 _mm256_sign_epi8(__m256i __a, __m256i __b)
2030 return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b);
2033 /// Sets each element of the result to the corresponding element of the
2034 /// 256-bit vector of [16 x i16] in \a __a, the negative of that element,
2035 /// or zero, depending on whether the corresponding element of the 256-bit
2036 /// vector of [16 x i16] in \a __b is greater than zero, less than zero, or
2037 /// equal to zero, respectively.
2039 /// \headerfile <immintrin.h>
2041 /// This intrinsic corresponds to the \c VPSIGNW instruction.
2043 /// \param __a
2044 /// A 256-bit vector of [16 x i16].
2045 /// \param __b
2046 /// A 256-bit vector of [16 x i16].
2047 /// \returns A 256-bit vector of [16 x i16] containing the result.
2048 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2049 _mm256_sign_epi16(__m256i __a, __m256i __b)
2051 return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);
2054 /// Sets each element of the result to the corresponding element of the
2055 /// 256-bit vector of [8 x i32] in \a __a, the negative of that element, or
2056 /// zero, depending on whether the corresponding element of the 256-bit
2057 /// vector of [8 x i32] in \a __b is greater than zero, less than zero, or
2058 /// equal to zero, respectively.
2060 /// \headerfile <immintrin.h>
2062 /// This intrinsic corresponds to the \c VPSIGND instruction.
2064 /// \param __a
2065 /// A 256-bit vector of [8 x i32].
2066 /// \param __b
2067 /// A 256-bit vector of [8 x i32].
2068 /// \returns A 256-bit vector of [8 x i32] containing the result.
2069 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2070 _mm256_sign_epi32(__m256i __a, __m256i __b)
2072 return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
2075 /// Shifts each 128-bit half of the 256-bit integer vector \a a left by
2076 /// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
2077 /// is greater than 15, the returned result is all zeroes.
2079 /// \headerfile <immintrin.h>
2081 /// \code
2082 /// __m256i _mm256_slli_si256(__m256i a, const int imm);
2083 /// \endcode
2085 /// This intrinsic corresponds to the \c VPSLLDQ instruction.
2087 /// \param a
2088 /// A 256-bit integer vector to be shifted.
2089 /// \param imm
2090 /// An unsigned immediate value specifying the shift count (in bytes).
2091 /// \returns A 256-bit integer vector containing the result.
2092 #define _mm256_slli_si256(a, imm) \
2093 ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
2095 /// Shifts each 128-bit half of the 256-bit integer vector \a a left by
2096 /// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
2097 /// is greater than 15, the returned result is all zeroes.
2099 /// \headerfile <immintrin.h>
2101 /// \code
2102 /// __m256i _mm256_bslli_epi128(__m256i a, const int imm);
2103 /// \endcode
2105 /// This intrinsic corresponds to the \c VPSLLDQ instruction.
2107 /// \param a
2108 /// A 256-bit integer vector to be shifted.
2109 /// \param imm
2110 /// An unsigned immediate value specifying the shift count (in bytes).
2111 /// \returns A 256-bit integer vector containing the result.
2112 #define _mm256_bslli_epi128(a, imm) \
2113 ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
2115 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2116 /// left by \a __count bits, shifting in zero bits, and returns the result.
2117 /// If \a __count is greater than 15, the returned result is all zeroes.
2119 /// \headerfile <immintrin.h>
2121 /// This intrinsic corresponds to the \c VPSLLW instruction.
2123 /// \param __a
2124 /// A 256-bit vector of [16 x i16] to be shifted.
2125 /// \param __count
2126 /// An unsigned integer value specifying the shift count (in bits).
2127 /// \returns A 256-bit vector of [16 x i16] containing the result.
2128 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2129 _mm256_slli_epi16(__m256i __a, int __count)
2131 return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count);
2134 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2135 /// left by the number of bits specified by the lower 64 bits of \a __count,
2136 /// shifting in zero bits, and returns the result. If \a __count is greater
2137 /// than 15, the returned result is all zeroes.
2139 /// \headerfile <immintrin.h>
2141 /// This intrinsic corresponds to the \c VPSLLW instruction.
2143 /// \param __a
2144 /// A 256-bit vector of [16 x i16] to be shifted.
2145 /// \param __count
2146 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2147 /// shift count (in bits). The upper element is ignored.
2148 /// \returns A 256-bit vector of [16 x i16] containing the result.
2149 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2150 _mm256_sll_epi16(__m256i __a, __m128i __count)
2152 return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count);
2155 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2156 /// left by \a __count bits, shifting in zero bits, and returns the result.
2157 /// If \a __count is greater than 31, the returned result is all zeroes.
2159 /// \headerfile <immintrin.h>
2161 /// This intrinsic corresponds to the \c VPSLLD instruction.
2163 /// \param __a
2164 /// A 256-bit vector of [8 x i32] to be shifted.
2165 /// \param __count
2166 /// An unsigned integer value specifying the shift count (in bits).
2167 /// \returns A 256-bit vector of [8 x i32] containing the result.
2168 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2169 _mm256_slli_epi32(__m256i __a, int __count)
2171 return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count);
2174 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2175 /// left by the number of bits given in the lower 64 bits of \a __count,
2176 /// shifting in zero bits, and returns the result. If \a __count is greater
2177 /// than 31, the returned result is all zeroes.
2179 /// \headerfile <immintrin.h>
2181 /// This intrinsic corresponds to the \c VPSLLD instruction.
2183 /// \param __a
2184 /// A 256-bit vector of [8 x i32] to be shifted.
2185 /// \param __count
2186 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2187 /// shift count (in bits). The upper element is ignored.
2188 /// \returns A 256-bit vector of [8 x i32] containing the result.
2189 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2190 _mm256_sll_epi32(__m256i __a, __m128i __count)
2192 return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count);
2195 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2196 /// left by \a __count bits, shifting in zero bits, and returns the result.
2197 /// If \a __count is greater than 63, the returned result is all zeroes.
2199 /// \headerfile <immintrin.h>
2201 /// This intrinsic corresponds to the \c VPSLLQ instruction.
2203 /// \param __a
2204 /// A 256-bit vector of [4 x i64] to be shifted.
2205 /// \param __count
2206 /// An unsigned integer value specifying the shift count (in bits).
2207 /// \returns A 256-bit vector of [4 x i64] containing the result.
2208 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2209 _mm256_slli_epi64(__m256i __a, int __count)
2211 return __builtin_ia32_psllqi256((__v4di)__a, __count);
2214 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2215 /// left by the number of bits given in the lower 64 bits of \a __count,
2216 /// shifting in zero bits, and returns the result. If \a __count is greater
2217 /// than 63, the returned result is all zeroes.
2219 /// \headerfile <immintrin.h>
2221 /// This intrinsic corresponds to the \c VPSLLQ instruction.
2223 /// \param __a
2224 /// A 256-bit vector of [4 x i64] to be shifted.
2225 /// \param __count
2226 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2227 /// shift count (in bits). The upper element is ignored.
2228 /// \returns A 256-bit vector of [4 x i64] containing the result.
2229 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2230 _mm256_sll_epi64(__m256i __a, __m128i __count)
2232 return __builtin_ia32_psllq256((__v4di)__a, __count);
2235 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2236 /// right by \a __count bits, shifting in sign bits, and returns the result.
2237 /// If \a __count is greater than 15, each element of the result is either
2238 /// 0 or -1 according to the corresponding input sign bit.
2240 /// \headerfile <immintrin.h>
2242 /// This intrinsic corresponds to the \c VPSRAW instruction.
2244 /// \param __a
2245 /// A 256-bit vector of [16 x i16] to be shifted.
2246 /// \param __count
2247 /// An unsigned integer value specifying the shift count (in bits).
2248 /// \returns A 256-bit vector of [16 x i16] containing the result.
2249 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2250 _mm256_srai_epi16(__m256i __a, int __count)
2252 return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count);
2255 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2256 /// right by the number of bits given in the lower 64 bits of \a __count,
2257 /// shifting in sign bits, and returns the result. If \a __count is greater
2258 /// than 15, each element of the result is either 0 or -1 according to the
2259 /// corresponding input sign bit.
2261 /// \headerfile <immintrin.h>
2263 /// This intrinsic corresponds to the \c VPSRAW instruction.
2265 /// \param __a
2266 /// A 256-bit vector of [16 x i16] to be shifted.
2267 /// \param __count
2268 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2269 /// shift count (in bits). The upper element is ignored.
2270 /// \returns A 256-bit vector of [16 x i16] containing the result.
2271 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2272 _mm256_sra_epi16(__m256i __a, __m128i __count)
2274 return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count);
2277 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2278 /// right by \a __count bits, shifting in sign bits, and returns the result.
2279 /// If \a __count is greater than 31, each element of the result is either
2280 /// 0 or -1 according to the corresponding input sign bit.
2282 /// \headerfile <immintrin.h>
2284 /// This intrinsic corresponds to the \c VPSRAD instruction.
2286 /// \param __a
2287 /// A 256-bit vector of [8 x i32] to be shifted.
2288 /// \param __count
2289 /// An unsigned integer value specifying the shift count (in bits).
2290 /// \returns A 256-bit vector of [8 x i32] containing the result.
2291 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2292 _mm256_srai_epi32(__m256i __a, int __count)
2294 return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count);
2297 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2298 /// right by the number of bits given in the lower 64 bits of \a __count,
2299 /// shifting in sign bits, and returns the result. If \a __count is greater
2300 /// than 31, each element of the result is either 0 or -1 according to the
2301 /// corresponding input sign bit.
2303 /// \headerfile <immintrin.h>
2305 /// This intrinsic corresponds to the \c VPSRAD instruction.
2307 /// \param __a
2308 /// A 256-bit vector of [8 x i32] to be shifted.
2309 /// \param __count
2310 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2311 /// shift count (in bits). The upper element is ignored.
2312 /// \returns A 256-bit vector of [8 x i32] containing the result.
2313 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2314 _mm256_sra_epi32(__m256i __a, __m128i __count)
2316 return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
2319 /// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
2320 /// \a imm bytes, shifting in zero bytes, and returns the result. If
2321 /// \a imm is greater than 15, the returned result is all zeroes.
2323 /// \headerfile <immintrin.h>
2325 /// \code
2326 /// __m256i _mm256_srli_si256(__m256i a, const int imm);
2327 /// \endcode
2329 /// This intrinsic corresponds to the \c VPSRLDQ instruction.
2331 /// \param a
2332 /// A 256-bit integer vector to be shifted.
2333 /// \param imm
2334 /// An unsigned immediate value specifying the shift count (in bytes).
2335 /// \returns A 256-bit integer vector containing the result.
2336 #define _mm256_srli_si256(a, imm) \
2337 ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))
2339 /// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
2340 /// \a imm bytes, shifting in zero bytes, and returns the result. If
2341 /// \a imm is greater than 15, the returned result is all zeroes.
2343 /// \headerfile <immintrin.h>
2345 /// \code
2346 /// __m256i _mm256_bsrli_epi128(__m256i a, const int imm);
2347 /// \endcode
2349 /// This intrinsic corresponds to the \c VPSRLDQ instruction.
2351 /// \param a
2352 /// A 256-bit integer vector to be shifted.
2353 /// \param imm
2354 /// An unsigned immediate value specifying the shift count (in bytes).
2355 /// \returns A 256-bit integer vector containing the result.
2356 #define _mm256_bsrli_epi128(a, imm) \
2357 ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))
2359 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2360 /// right by \a __count bits, shifting in zero bits, and returns the result.
2361 /// If \a __count is greater than 15, the returned result is all zeroes.
2363 /// \headerfile <immintrin.h>
2365 /// This intrinsic corresponds to the \c VPSRLW instruction.
2367 /// \param __a
2368 /// A 256-bit vector of [16 x i16] to be shifted.
2369 /// \param __count
2370 /// An unsigned integer value specifying the shift count (in bits).
2371 /// \returns A 256-bit vector of [16 x i16] containing the result.
2372 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2373 _mm256_srli_epi16(__m256i __a, int __count)
2375 return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count);
2378 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2379 /// right by the number of bits given in the lower 64 bits of \a __count,
2380 /// shifting in zero bits, and returns the result. If \a __count is greater
2381 /// than 15, the returned result is all zeroes.
2383 /// \headerfile <immintrin.h>
2385 /// This intrinsic corresponds to the \c VPSRLW instruction.
2387 /// \param __a
2388 /// A 256-bit vector of [16 x i16] to be shifted.
2389 /// \param __count
2390 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2391 /// shift count (in bits). The upper element is ignored.
2392 /// \returns A 256-bit vector of [16 x i16] containing the result.
2393 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2394 _mm256_srl_epi16(__m256i __a, __m128i __count)
2396 return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count);
2399 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2400 /// right by \a __count bits, shifting in zero bits, and returns the result.
2401 /// If \a __count is greater than 31, the returned result is all zeroes.
2403 /// \headerfile <immintrin.h>
2405 /// This intrinsic corresponds to the \c VPSRLD instruction.
2407 /// \param __a
2408 /// A 256-bit vector of [8 x i32] to be shifted.
2409 /// \param __count
2410 /// An unsigned integer value specifying the shift count (in bits).
2411 /// \returns A 256-bit vector of [8 x i32] containing the result.
2412 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2413 _mm256_srli_epi32(__m256i __a, int __count)
2415 return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count);
2418 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2419 /// right by the number of bits given in the lower 64 bits of \a __count,
2420 /// shifting in zero bits, and returns the result. If \a __count is greater
2421 /// than 31, the returned result is all zeroes.
2423 /// \headerfile <immintrin.h>
2425 /// This intrinsic corresponds to the \c VPSRLD instruction.
2427 /// \param __a
2428 /// A 256-bit vector of [8 x i32] to be shifted.
2429 /// \param __count
2430 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2431 /// shift count (in bits). The upper element is ignored.
2432 /// \returns A 256-bit vector of [8 x i32] containing the result.
2433 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2434 _mm256_srl_epi32(__m256i __a, __m128i __count)
2436 return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count);
2439 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2440 /// right by \a __count bits, shifting in zero bits, and returns the result.
2441 /// If \a __count is greater than 63, the returned result is all zeroes.
2443 /// \headerfile <immintrin.h>
2445 /// This intrinsic corresponds to the \c VPSRLQ instruction.
2447 /// \param __a
2448 /// A 256-bit vector of [4 x i64] to be shifted.
2449 /// \param __count
2450 /// An unsigned integer value specifying the shift count (in bits).
2451 /// \returns A 256-bit vector of [4 x i64] containing the result.
2452 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2453 _mm256_srli_epi64(__m256i __a, int __count)
2455 return __builtin_ia32_psrlqi256((__v4di)__a, __count);
2458 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2459 /// right by the number of bits given in the lower 64 bits of \a __count,
2460 /// shifting in zero bits, and returns the result. If \a __count is greater
2461 /// than 63, the returned result is all zeroes.
2463 /// \headerfile <immintrin.h>
2465 /// This intrinsic corresponds to the \c VPSRLQ instruction.
2467 /// \param __a
2468 /// A 256-bit vector of [4 x i64] to be shifted.
2469 /// \param __count
2470 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2471 /// shift count (in bits). The upper element is ignored.
2472 /// \returns A 256-bit vector of [4 x i64] containing the result.
2473 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2474 _mm256_srl_epi64(__m256i __a, __m128i __count)
2476 return __builtin_ia32_psrlq256((__v4di)__a, __count);
2479 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2480 /// vectors. Returns the lower 8 bits of each difference in the
2481 /// corresponding byte of the 256-bit integer vector result (overflow is
2482 /// ignored).
2484 /// \code{.operation}
2485 /// FOR i := 0 TO 31
2486 /// j := i*8
2487 /// result[j+7:j] := __a[j+7:j] - __b[j+7:j]
2488 /// ENDFOR
2489 /// \endcode
2491 /// \headerfile <immintrin.h>
2493 /// This intrinsic corresponds to the \c VPSUBB instruction.
2495 /// \param __a
2496 /// A 256-bit integer vector containing the minuends.
2497 /// \param __b
2498 /// A 256-bit integer vector containing the subtrahends.
2499 /// \returns A 256-bit integer vector containing the differences.
2500 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2501 _mm256_sub_epi8(__m256i __a, __m256i __b)
2503 return (__m256i)((__v32qu)__a - (__v32qu)__b);
2506 /// Subtracts 16-bit integers from corresponding elements of two 256-bit
2507 /// vectors of [16 x i16]. Returns the lower 16 bits of each difference in
2508 /// the corresponding element of the [16 x i16] result (overflow is
2509 /// ignored).
2511 /// \code{.operation}
2512 /// FOR i := 0 TO 15
2513 /// j := i*16
2514 /// result[j+15:j] := __a[j+15:j] - __b[j+15:j]
2515 /// ENDFOR
2516 /// \endcode
2518 /// \headerfile <immintrin.h>
2520 /// This intrinsic corresponds to the \c VPSUBW instruction.
2522 /// \param __a
2523 /// A 256-bit vector of [16 x i16] containing the minuends.
2524 /// \param __b
2525 /// A 256-bit vector of [16 x i16] containing the subtrahends.
2526 /// \returns A 256-bit vector of [16 x i16] containing the differences.
2527 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2528 _mm256_sub_epi16(__m256i __a, __m256i __b)
2530 return (__m256i)((__v16hu)__a - (__v16hu)__b);
2533 /// Subtracts 32-bit integers from corresponding elements of two 256-bit
2534 /// vectors of [8 x i32]. Returns the lower 32 bits of each difference in
2535 /// the corresponding element of the [8 x i32] result (overflow is ignored).
2537 /// \code{.operation}
2538 /// FOR i := 0 TO 7
2539 /// j := i*32
2540 /// result[j+31:j] := __a[j+31:j] - __b[j+31:j]
2541 /// ENDFOR
2542 /// \endcode
2544 /// \headerfile <immintrin.h>
2546 /// This intrinsic corresponds to the \c VPSUBD instruction.
2548 /// \param __a
2549 /// A 256-bit vector of [8 x i32] containing the minuends.
2550 /// \param __b
2551 /// A 256-bit vector of [8 x i32] containing the subtrahends.
2552 /// \returns A 256-bit vector of [8 x i32] containing the differences.
2553 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2554 _mm256_sub_epi32(__m256i __a, __m256i __b)
2556 return (__m256i)((__v8su)__a - (__v8su)__b);
2559 /// Subtracts 64-bit integers from corresponding elements of two 256-bit
2560 /// vectors of [4 x i64]. Returns the lower 64 bits of each difference in
2561 /// the corresponding element of the [4 x i64] result (overflow is ignored).
2563 /// \code{.operation}
2564 /// FOR i := 0 TO 3
2565 /// j := i*64
2566 /// result[j+63:j] := __a[j+63:j] - __b[j+63:j]
2567 /// ENDFOR
2568 /// \endcode
2570 /// \headerfile <immintrin.h>
2572 /// This intrinsic corresponds to the \c VPSUBQ instruction.
2574 /// \param __a
2575 /// A 256-bit vector of [4 x i64] containing the minuends.
2576 /// \param __b
2577 /// A 256-bit vector of [4 x i64] containing the subtrahends.
2578 /// \returns A 256-bit vector of [4 x i64] containing the differences.
2579 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2580 _mm256_sub_epi64(__m256i __a, __m256i __b)
2582 return (__m256i)((__v4du)__a - (__v4du)__b);
2585 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2586 /// vectors using signed saturation, and returns each difference in the
2587 /// corresponding byte of the 256-bit integer vector result.
2589 /// \code{.operation}
2590 /// FOR i := 0 TO 31
2591 /// j := i*8
2592 /// result[j+7:j] := SATURATE8(__a[j+7:j] - __b[j+7:j])
2593 /// ENDFOR
2594 /// \endcode
2596 /// \headerfile <immintrin.h>
2598 /// This intrinsic corresponds to the \c VPSUBSB instruction.
2600 /// \param __a
2601 /// A 256-bit integer vector containing the minuends.
2602 /// \param __b
2603 /// A 256-bit integer vector containing the subtrahends.
2604 /// \returns A 256-bit integer vector containing the differences.
2605 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2606 _mm256_subs_epi8(__m256i __a, __m256i __b)
2608 return (__m256i)__builtin_elementwise_sub_sat((__v32qs)__a, (__v32qs)__b);
2611 /// Subtracts 16-bit integers from corresponding elements of two 256-bit
2612 /// vectors of [16 x i16] using signed saturation, and returns each
2613 /// difference in the corresponding element of the [16 x i16] result.
2615 /// \code{.operation}
2616 /// FOR i := 0 TO 15
2617 /// j := i*16
2618 /// result[j+15:j] := SATURATE16(__a[j+15:j] - __b[j+15:j])
2619 /// ENDFOR
2620 /// \endcode
2622 /// \headerfile <immintrin.h>
2624 /// This intrinsic corresponds to the \c VPSUBSW instruction.
2626 /// \param __a
2627 /// A 256-bit vector of [16 x i16] containing the minuends.
2628 /// \param __b
2629 /// A 256-bit vector of [16 x i16] containing the subtrahends.
2630 /// \returns A 256-bit vector of [16 x i16] containing the differences.
2631 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2632 _mm256_subs_epi16(__m256i __a, __m256i __b)
2634 return (__m256i)__builtin_elementwise_sub_sat((__v16hi)__a, (__v16hi)__b);
2637 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2638 /// vectors using unsigned saturation, and returns each difference in the
2639 /// corresponding byte of the 256-bit integer vector result. For each byte,
2640 /// computes <c> result = __a - __b </c>.
2642 /// \code{.operation}
2643 /// FOR i := 0 TO 31
2644 /// j := i*8
2645 /// result[j+7:j] := SATURATE8U(__a[j+7:j] - __b[j+7:j])
2646 /// ENDFOR
2647 /// \endcode
2649 /// \headerfile <immintrin.h>
2651 /// This intrinsic corresponds to the \c VPSUBUSB instruction.
2653 /// \param __a
2654 /// A 256-bit integer vector containing the minuends.
2655 /// \param __b
2656 /// A 256-bit integer vector containing the subtrahends.
2657 /// \returns A 256-bit integer vector containing the differences.
2658 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2659 _mm256_subs_epu8(__m256i __a, __m256i __b)
2661 return (__m256i)__builtin_elementwise_sub_sat((__v32qu)__a, (__v32qu)__b);
2664 /// Subtracts 16-bit integers from corresponding elements of two 256-bit
2665 /// vectors of [16 x i16] using unsigned saturation, and returns each
2666 /// difference in the corresponding element of the [16 x i16] result.
2668 /// \code{.operation}
2669 /// FOR i := 0 TO 15
2670 /// j := i*16
2671 /// result[j+15:j] := SATURATE16U(__a[j+15:j] - __b[j+15:j])
2672 /// ENDFOR
2673 /// \endcode
2675 /// \headerfile <immintrin.h>
2677 /// This intrinsic corresponds to the \c VPSUBUSW instruction.
2679 /// \param __a
2680 /// A 256-bit vector of [16 x i16] containing the minuends.
2681 /// \param __b
2682 /// A 256-bit vector of [16 x i16] containing the subtrahends.
2683 /// \returns A 256-bit vector of [16 x i16] containing the differences.
2684 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2685 _mm256_subs_epu16(__m256i __a, __m256i __b)
2687 return (__m256i)__builtin_elementwise_sub_sat((__v16hu)__a, (__v16hu)__b);
2690 /// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2691 /// vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2692 /// uses the upper 64 bits of each 128-bit half of \a __a and \a __b as
2693 /// input; other bits in these parameters are ignored.
2695 /// \code{.operation}
2696 /// result[7:0] := __a[71:64]
2697 /// result[15:8] := __b[71:64]
2698 /// result[23:16] := __a[79:72]
2699 /// result[31:24] := __b[79:72]
2700 /// . . .
2701 /// result[127:120] := __b[127:120]
2702 /// result[135:128] := __a[199:192]
2703 /// . . .
2704 /// result[255:248] := __b[255:248]
2705 /// \endcode
2707 /// \headerfile <immintrin.h>
2709 /// This intrinsic corresponds to the \c VPUNPCKHBW instruction.
2711 /// \param __a
2712 /// A 256-bit integer vector used as the source for the even-numbered bytes
2713 /// of the result.
2714 /// \param __b
2715 /// A 256-bit integer vector used as the source for the odd-numbered bytes
2716 /// of the result.
2717 /// \returns A 256-bit integer vector containing the result.
2718 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2719 _mm256_unpackhi_epi8(__m256i __a, __m256i __b)
2721 return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
2724 /// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2725 /// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2726 /// vector of [16 x i16]. Specifically, uses the upper 64 bits of each
2727 /// 128-bit half of \a __a and \a __b as input; other bits in these
2728 /// parameters are ignored.
2730 /// \code{.operation}
2731 /// result[15:0] := __a[79:64]
2732 /// result[31:16] := __b[79:64]
2733 /// result[47:32] := __a[95:80]
2734 /// result[63:48] := __b[95:80]
2735 /// . . .
2736 /// result[127:112] := __b[127:112]
2737 /// result[143:128] := __a[207:192]
2738 /// . . .
2739 /// result[255:240] := __b[255:240]
2740 /// \endcode
2742 /// \headerfile <immintrin.h>
2744 /// This intrinsic corresponds to the \c VPUNPCKHWD instruction.
2746 /// \param __a
2747 /// A 256-bit vector of [16 x i16] used as the source for the even-numbered
2748 /// elements of the result.
2749 /// \param __b
2750 /// A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2751 /// elements of the result.
2752 /// \returns A 256-bit vector of [16 x i16] containing the result.
2753 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2754 _mm256_unpackhi_epi16(__m256i __a, __m256i __b)
2756 return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
2759 /// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2760 /// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2761 /// of [8 x i32]. Specifically, uses the upper 64 bits of each 128-bit half
2762 /// of \a __a and \a __b as input; other bits in these parameters are
2763 /// ignored.
2765 /// \code{.operation}
2766 /// result[31:0] := __a[95:64]
2767 /// result[63:32] := __b[95:64]
2768 /// result[95:64] := __a[127:96]
2769 /// result[127:96] := __b[127:96]
2770 /// result[159:128] := __a[223:192]
2771 /// result[191:160] := __b[223:192]
2772 /// result[223:192] := __a[255:224]
2773 /// result[255:224] := __b[255:224]
2774 /// \endcode
2776 /// \headerfile <immintrin.h>
2778 /// This intrinsic corresponds to the \c VPUNPCKHDQ instruction.
2780 /// \param __a
2781 /// A 256-bit vector of [8 x i32] used as the source for the even-numbered
2782 /// elements of the result.
2783 /// \param __b
2784 /// A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2785 /// elements of the result.
2786 /// \returns A 256-bit vector of [8 x i32] containing the result.
2787 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2788 _mm256_unpackhi_epi32(__m256i __a, __m256i __b)
2790 return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);
2793 /// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2794 /// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2795 /// of [4 x i64]. Specifically, uses the upper 64 bits of each 128-bit half
2796 /// of \a __a and \a __b as input; other bits in these parameters are
2797 /// ignored.
2799 /// \code{.operation}
2800 /// result[63:0] := __a[127:64]
2801 /// result[127:64] := __b[127:64]
2802 /// result[191:128] := __a[255:192]
2803 /// result[255:192] := __b[255:192]
2804 /// \endcode
2806 /// \headerfile <immintrin.h>
2808 /// This intrinsic corresponds to the \c VPUNPCKHQDQ instruction.
2810 /// \param __a
2811 /// A 256-bit vector of [4 x i64] used as the source for the even-numbered
2812 /// elements of the result.
2813 /// \param __b
2814 /// A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2815 /// elements of the result.
2816 /// \returns A 256-bit vector of [4 x i64] containing the result.
2817 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2818 _mm256_unpackhi_epi64(__m256i __a, __m256i __b)
2820 return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3);
2823 /// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2824 /// vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2825 /// uses the lower 64 bits of each 128-bit half of \a __a and \a __b as
2826 /// input; other bits in these parameters are ignored.
2828 /// \code{.operation}
2829 /// result[7:0] := __a[7:0]
2830 /// result[15:8] := __b[7:0]
2831 /// result[23:16] := __a[15:8]
2832 /// result[31:24] := __b[15:8]
2833 /// . . .
2834 /// result[127:120] := __b[63:56]
2835 /// result[135:128] := __a[135:128]
2836 /// . . .
2837 /// result[255:248] := __b[191:184]
2838 /// \endcode
2840 /// \headerfile <immintrin.h>
2842 /// This intrinsic corresponds to the \c VPUNPCKLBW instruction.
2844 /// \param __a
2845 /// A 256-bit integer vector used as the source for the even-numbered bytes
2846 /// of the result.
2847 /// \param __b
2848 /// A 256-bit integer vector used as the source for the odd-numbered bytes
2849 /// of the result.
2850 /// \returns A 256-bit integer vector containing the result.
2851 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2852 _mm256_unpacklo_epi8(__m256i __a, __m256i __b)
2854 return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
2857 /// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2858 /// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2859 /// vector of [16 x i16]. Specifically, uses the lower 64 bits of each
2860 /// 128-bit half of \a __a and \a __b as input; other bits in these
2861 /// parameters are ignored.
2863 /// \code{.operation}
2864 /// result[15:0] := __a[15:0]
2865 /// result[31:16] := __b[15:0]
2866 /// result[47:32] := __a[31:16]
2867 /// result[63:48] := __b[31:16]
2868 /// . . .
2869 /// result[127:112] := __b[63:48]
2870 /// result[143:128] := __a[143:128]
2871 /// . . .
2872 /// result[255:240] := __b[191:176]
2873 /// \endcode
2875 /// \headerfile <immintrin.h>
2877 /// This intrinsic corresponds to the \c VPUNPCKLWD instruction.
2879 /// \param __a
2880 /// A 256-bit vector of [16 x i16] used as the source for the even-numbered
2881 /// elements of the result.
2882 /// \param __b
2883 /// A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2884 /// elements of the result.
2885 /// \returns A 256-bit vector of [16 x i16] containing the result.
2886 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2887 _mm256_unpacklo_epi16(__m256i __a, __m256i __b)
2889 return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
2892 /// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2893 /// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2894 /// of [8 x i32]. Specifically, uses the lower 64 bits of each 128-bit half
2895 /// of \a __a and \a __b as input; other bits in these parameters are
2896 /// ignored.
2898 /// \code{.operation}
2899 /// result[31:0] := __a[31:0]
2900 /// result[63:32] := __b[31:0]
2901 /// result[95:64] := __a[63:32]
2902 /// result[127:96] := __b[63:32]
2903 /// result[159:128] := __a[159:128]
2904 /// result[191:160] := __b[159:128]
2905 /// result[223:192] := __a[191:160]
2906 /// result[255:224] := __b[191:160]
2907 /// \endcode
2909 /// \headerfile <immintrin.h>
2911 /// This intrinsic corresponds to the \c VPUNPCKLDQ instruction.
2913 /// \param __a
2914 /// A 256-bit vector of [8 x i32] used as the source for the even-numbered
2915 /// elements of the result.
2916 /// \param __b
2917 /// A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2918 /// elements of the result.
2919 /// \returns A 256-bit vector of [8 x i32] containing the result.
2920 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2921 _mm256_unpacklo_epi32(__m256i __a, __m256i __b)
2923 return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
2926 /// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2927 /// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2928 /// of [4 x i64]. Specifically, uses the lower 64 bits of each 128-bit half
2929 /// of \a __a and \a __b as input; other bits in these parameters are
2930 /// ignored.
2932 /// \code{.operation}
2933 /// result[63:0] := __a[63:0]
2934 /// result[127:64] := __b[63:0]
2935 /// result[191:128] := __a[191:128]
2936 /// result[255:192] := __b[191:128]
2937 /// \endcode
2939 /// \headerfile <immintrin.h>
2941 /// This intrinsic corresponds to the \c VPUNPCKLQDQ instruction.
2943 /// \param __a
2944 /// A 256-bit vector of [4 x i64] used as the source for the even-numbered
2945 /// elements of the result.
2946 /// \param __b
2947 /// A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2948 /// elements of the result.
2949 /// \returns A 256-bit vector of [4 x i64] containing the result.
2950 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2951 _mm256_unpacklo_epi64(__m256i __a, __m256i __b)
2953 return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2);
2956 /// Computes the bitwise XOR of the 256-bit integer vectors in \a __a and
2957 /// \a __b.
2959 /// \headerfile <immintrin.h>
2961 /// This intrinsic corresponds to the \c VPXOR instruction.
2963 /// \param __a
2964 /// A 256-bit integer vector.
2965 /// \param __b
2966 /// A 256-bit integer vector.
2967 /// \returns A 256-bit integer vector containing the result.
2968 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2969 _mm256_xor_si256(__m256i __a, __m256i __b)
2971 return (__m256i)((__v4du)__a ^ (__v4du)__b);
2974 /// Loads the 256-bit integer vector from memory \a __V using a non-temporal
2975 /// memory hint and returns the vector. \a __V must be aligned on a 32-byte
2976 /// boundary.
2978 /// \headerfile <immintrin.h>
2980 /// This intrinsic corresponds to the \c VMOVNTDQA instruction.
2982 /// \param __V
2983 /// A pointer to the 32-byte aligned memory containing the vector to load.
2984 /// \returns A 256-bit integer vector loaded from memory.
2985 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2986 _mm256_stream_load_si256(const void *__V)
2988 typedef __v4di __v4di_aligned __attribute__((aligned(32)));
2989 return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);
2992 /// Broadcasts the 32-bit floating-point value from the low element of the
2993 /// 128-bit vector of [4 x float] in \a __X to all elements of the result's
2994 /// 128-bit vector of [4 x float].
2996 /// \headerfile <immintrin.h>
2998 /// This intrinsic corresponds to the \c VBROADCASTSS instruction.
3000 /// \param __X
3001 /// A 128-bit vector of [4 x float] whose low element will be broadcast.
3002 /// \returns A 128-bit vector of [4 x float] containing the result.
3003 static __inline__ __m128 __DEFAULT_FN_ATTRS128
3004 _mm_broadcastss_ps(__m128 __X)
3006 return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0);
3009 /// Broadcasts the 64-bit floating-point value from the low element of the
3010 /// 128-bit vector of [2 x double] in \a __a to both elements of the
3011 /// result's 128-bit vector of [2 x double].
3013 /// \headerfile <immintrin.h>
3015 /// This intrinsic corresponds to the \c MOVDDUP instruction.
3017 /// \param __a
3018 /// A 128-bit vector of [2 x double] whose low element will be broadcast.
3019 /// \returns A 128-bit vector of [2 x double] containing the result.
3020 static __inline__ __m128d __DEFAULT_FN_ATTRS128
3021 _mm_broadcastsd_pd(__m128d __a)
3023 return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
3026 /// Broadcasts the 32-bit floating-point value from the low element of the
3027 /// 128-bit vector of [4 x float] in \a __X to all elements of the
3028 /// result's 256-bit vector of [8 x float].
3030 /// \headerfile <immintrin.h>
3032 /// This intrinsic corresponds to the \c VBROADCASTSS instruction.
3034 /// \param __X
3035 /// A 128-bit vector of [4 x float] whose low element will be broadcast.
3036 /// \returns A 256-bit vector of [8 x float] containing the result.
3037 static __inline__ __m256 __DEFAULT_FN_ATTRS256
3038 _mm256_broadcastss_ps(__m128 __X)
3040 return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3043 /// Broadcasts the 64-bit floating-point value from the low element of the
3044 /// 128-bit vector of [2 x double] in \a __X to all elements of the
3045 /// result's 256-bit vector of [4 x double].
3047 /// \headerfile <immintrin.h>
3049 /// This intrinsic corresponds to the \c VBROADCASTSD instruction.
3051 /// \param __X
3052 /// A 128-bit vector of [2 x double] whose low element will be broadcast.
3053 /// \returns A 256-bit vector of [4 x double] containing the result.
3054 static __inline__ __m256d __DEFAULT_FN_ATTRS256
3055 _mm256_broadcastsd_pd(__m128d __X)
3057 return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0);
3060 /// Broadcasts the 128-bit integer data from \a __X to both the lower and
3061 /// upper halves of the 256-bit result.
3063 /// \headerfile <immintrin.h>
3065 /// This intrinsic corresponds to the \c VBROADCASTI128 instruction.
3067 /// \param __X
3068 /// A 128-bit integer vector to be broadcast.
3069 /// \returns A 256-bit integer vector containing the result.
3070 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3071 _mm256_broadcastsi128_si256(__m128i __X)
3073 return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1);
3076 #define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X)
3078 /// Merges 32-bit integer elements from either of the two 128-bit vectors of
3079 /// [4 x i32] in \a V1 or \a V2 to the result's 128-bit vector of [4 x i32],
3080 /// as specified by the immediate integer operand \a M.
3082 /// \code{.operation}
3083 /// FOR i := 0 TO 3
3084 /// j := i*32
3085 /// IF M[i] == 0
3086 /// result[31+j:j] := V1[31+j:j]
3087 /// ELSE
3088 /// result[31+j:j] := V2[31+j:j]
3089 /// FI
3090 /// ENDFOR
3091 /// \endcode
3093 /// \headerfile <immintrin.h>
3095 /// \code
3096 /// __m128i _mm_blend_epi32(__m128i V1, __m128i V2, const int M);
3097 /// \endcode
3099 /// This intrinsic corresponds to the \c VPBLENDD instruction.
3101 /// \param V1
3102 /// A 128-bit vector of [4 x i32] containing source values.
3103 /// \param V2
3104 /// A 128-bit vector of [4 x i32] containing source values.
3105 /// \param M
3106 /// An immediate 8-bit integer operand, with bits [3:0] specifying the
3107 /// source for each element of the result. The position of the mask bit
3108 /// corresponds to the index of a copied value. When a mask bit is 0, the
3109 /// element is copied from \a V1; otherwise, it is copied from \a V2.
3110 /// \returns A 128-bit vector of [4 x i32] containing the result.
3111 #define _mm_blend_epi32(V1, V2, M) \
3112 ((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
3113 (__v4si)(__m128i)(V2), (int)(M)))
3115 /// Merges 32-bit integer elements from either of the two 256-bit vectors of
3116 /// [8 x i32] in \a V1 or \a V2 to return a 256-bit vector of [8 x i32],
3117 /// as specified by the immediate integer operand \a M.
3119 /// \code{.operation}
3120 /// FOR i := 0 TO 7
3121 /// j := i*32
3122 /// IF M[i] == 0
3123 /// result[31+j:j] := V1[31+j:j]
3124 /// ELSE
3125 /// result[31+j:j] := V2[31+j:j]
3126 /// FI
3127 /// ENDFOR
3128 /// \endcode
3130 /// \headerfile <immintrin.h>
3132 /// \code
3133 /// __m256i _mm256_blend_epi32(__m256i V1, __m256i V2, const int M);
3134 /// \endcode
3136 /// This intrinsic corresponds to the \c VPBLENDD instruction.
3138 /// \param V1
3139 /// A 256-bit vector of [8 x i32] containing source values.
3140 /// \param V2
3141 /// A 256-bit vector of [8 x i32] containing source values.
3142 /// \param M
3143 /// An immediate 8-bit integer operand, with bits [7:0] specifying the
3144 /// source for each element of the result. The position of the mask bit
3145 /// corresponds to the index of a copied value. When a mask bit is 0, the
3146 /// element is copied from \a V1; otherwise, it is copied from \a V2.
3147 /// \returns A 256-bit vector of [8 x i32] containing the result.
3148 #define _mm256_blend_epi32(V1, V2, M) \
3149 ((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
3150 (__v8si)(__m256i)(V2), (int)(M)))
3152 /// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
3153 /// bytes of the 256-bit result.
3155 /// \headerfile <immintrin.h>
3157 /// This intrinsic corresponds to the \c VPBROADCASTB instruction.
3159 /// \param __X
3160 /// A 128-bit integer vector whose low byte will be broadcast.
3161 /// \returns A 256-bit integer vector containing the result.
3162 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3163 _mm256_broadcastb_epi8(__m128i __X)
3165 return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3168 /// Broadcasts the low element from the 128-bit vector of [8 x i16] in \a __X
3169 /// to all elements of the result's 256-bit vector of [16 x i16].
3171 /// \headerfile <immintrin.h>
3173 /// This intrinsic corresponds to the \c VPBROADCASTW instruction.
3175 /// \param __X
3176 /// A 128-bit vector of [8 x i16] whose low element will be broadcast.
3177 /// \returns A 256-bit vector of [16 x i16] containing the result.
3178 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3179 _mm256_broadcastw_epi16(__m128i __X)
3181 return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3184 /// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
3185 /// to all elements of the result's 256-bit vector of [8 x i32].
3187 /// \headerfile <immintrin.h>
3189 /// This intrinsic corresponds to the \c VPBROADCASTD instruction.
3191 /// \param __X
3192 /// A 128-bit vector of [4 x i32] whose low element will be broadcast.
3193 /// \returns A 256-bit vector of [8 x i32] containing the result.
3194 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3195 _mm256_broadcastd_epi32(__m128i __X)
3197 return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3200 /// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
3201 /// to all elements of the result's 256-bit vector of [4 x i64].
3203 /// \headerfile <immintrin.h>
3205 /// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
3207 /// \param __X
3208 /// A 128-bit vector of [2 x i64] whose low element will be broadcast.
3209 /// \returns A 256-bit vector of [4 x i64] containing the result.
3210 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3211 _mm256_broadcastq_epi64(__m128i __X)
3213 return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0);
3216 /// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
3217 /// bytes of the 128-bit result.
3219 /// \headerfile <immintrin.h>
3221 /// This intrinsic corresponds to the \c VPBROADCASTB instruction.
3223 /// \param __X
3224 /// A 128-bit integer vector whose low byte will be broadcast.
3225 /// \returns A 128-bit integer vector containing the result.
3226 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3227 _mm_broadcastb_epi8(__m128i __X)
3229 return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3232 /// Broadcasts the low element from the 128-bit vector of [8 x i16] in
3233 /// \a __X to all elements of the result's 128-bit vector of [8 x i16].
3235 /// \headerfile <immintrin.h>
3237 /// This intrinsic corresponds to the \c VPBROADCASTW instruction.
3239 /// \param __X
3240 /// A 128-bit vector of [8 x i16] whose low element will be broadcast.
3241 /// \returns A 128-bit vector of [8 x i16] containing the result.
3242 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3243 _mm_broadcastw_epi16(__m128i __X)
3245 return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3248 /// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
3249 /// to all elements of the result's 128-bit vector of [4 x i32].
3251 /// \headerfile <immintrin.h>
3253 /// This intrinsic corresponds to the \c VPBROADCASTD instruction.
3255 /// \param __X
3256 /// A 128-bit vector of [4 x i32] whose low element will be broadcast.
3257 /// \returns A 128-bit vector of [4 x i32] containing the result.
3258 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3259 _mm_broadcastd_epi32(__m128i __X)
3261 return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0);
3264 /// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
3265 /// to both elements of the result's 128-bit vector of [2 x i64].
3267 /// \headerfile <immintrin.h>
3269 /// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
3271 /// \param __X
3272 /// A 128-bit vector of [2 x i64] whose low element will be broadcast.
3273 /// \returns A 128-bit vector of [2 x i64] containing the result.
3274 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3275 _mm_broadcastq_epi64(__m128i __X)
3277 return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0);
3280 /// Sets the result's 256-bit vector of [8 x i32] to copies of elements of the
3281 /// 256-bit vector of [8 x i32] in \a __a as specified by indexes in the
3282 /// elements of the 256-bit vector of [8 x i32] in \a __b.
3284 /// \code{.operation}
3285 /// FOR i := 0 TO 7
3286 /// j := i*32
3287 /// k := __b[j+2:j] * 32
3288 /// result[j+31:j] := __a[k+31:k]
3289 /// ENDFOR
3290 /// \endcode
3292 /// \headerfile <immintrin.h>
3294 /// This intrinsic corresponds to the \c VPERMD instruction.
3296 /// \param __a
3297 /// A 256-bit vector of [8 x i32] containing the source values.
3298 /// \param __b
3299 /// A 256-bit vector of [8 x i32] containing indexes of values to use from
3300 /// \a __a.
3301 /// \returns A 256-bit vector of [8 x i32] containing the result.
3302 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3303 _mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
3305 return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b);
3308 /// Sets the result's 256-bit vector of [4 x double] to copies of elements of
3309 /// the 256-bit vector of [4 x double] in \a V as specified by the
3310 /// immediate value \a M.
3312 /// \code{.operation}
3313 /// FOR i := 0 TO 3
3314 /// j := i*64
3315 /// k := (M >> i*2)[1:0] * 64
3316 /// result[j+63:j] := V[k+63:k]
3317 /// ENDFOR
3318 /// \endcode
3320 /// \headerfile <immintrin.h>
3322 /// \code
3323 /// __m256d _mm256_permute4x64_pd(__m256d V, const int M);
3324 /// \endcode
3326 /// This intrinsic corresponds to the \c VPERMPD instruction.
3328 /// \param V
3329 /// A 256-bit vector of [4 x double] containing the source values.
3330 /// \param M
3331 /// An immediate 8-bit value specifying which elements to copy from \a V.
3332 /// \a M[1:0] specifies the index in \a V for element 0 of the result,
3333 /// \a M[3:2] specifies the index for element 1, and so forth.
3334 /// \returns A 256-bit vector of [4 x double] containing the result.
3335 #define _mm256_permute4x64_pd(V, M) \
3336 ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M)))
3338 /// Sets the result's 256-bit vector of [8 x float] to copies of elements of
3339 /// the 256-bit vector of [8 x float] in \a __a as specified by indexes in
3340 /// the elements of the 256-bit vector of [8 x i32] in \a __b.
3342 /// \code{.operation}
3343 /// FOR i := 0 TO 7
3344 /// j := i*32
3345 /// k := __b[j+2:j] * 32
3346 /// result[j+31:j] := __a[k+31:k]
3347 /// ENDFOR
3348 /// \endcode
3350 /// \headerfile <immintrin.h>
3352 /// This intrinsic corresponds to the \c VPERMPS instruction.
3354 /// \param __a
3355 /// A 256-bit vector of [8 x float] containing the source values.
3356 /// \param __b
3357 /// A 256-bit vector of [8 x i32] containing indexes of values to use from
3358 /// \a __a.
3359 /// \returns A 256-bit vector of [8 x float] containing the result.
3360 static __inline__ __m256 __DEFAULT_FN_ATTRS256
3361 _mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
3363 return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b);
3366 /// Sets the result's 256-bit vector of [4 x i64] to copies of elements
3367 /// of the 256-bit vector of [4 x i64] in \a V as specified by the
3368 /// immediate value \a M.
3370 /// \code{.operation}
3371 /// FOR i := 0 TO 3
3372 /// j := i*64
3373 /// k := (M >> i*2)[1:0] * 64
3374 /// result[j+63:j] := V[k+63:k]
3375 /// ENDFOR
3376 /// \endcode
3378 /// \headerfile <immintrin.h>
3380 /// \code
3381 /// __m256i _mm256_permute4x64_epi64(__m256i V, const int M);
3382 /// \endcode
3384 /// This intrinsic corresponds to the \c VPERMQ instruction.
3386 /// \param V
3387 /// A 256-bit vector of [4 x i64] containing the source values.
3388 /// \param M
3389 /// An immediate 8-bit value specifying which elements to copy from \a V.
3390 /// \a M[1:0] specifies the index in \a V for element 0 of the result,
3391 /// \a M[3:2] specifies the index for element 1, and so forth.
3392 /// \returns A 256-bit vector of [4 x i64] containing the result.
3393 #define _mm256_permute4x64_epi64(V, M) \
3394 ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M)))
3396 /// Sets each half of the 256-bit result either to zero or to one of the
3397 /// four possible 128-bit halves of the 256-bit vectors \a V1 and \a V2,
3398 /// as specified by the immediate value \a M.
3400 /// \code{.operation}
3401 /// FOR i := 0 TO 1
3402 /// j := i*128
3403 /// k := M >> (i*4)
3404 /// IF k[3] == 0
3405 /// CASE (k[1:0]) OF
3406 /// 0: result[127+j:j] := V1[127:0]
3407 /// 1: result[127+j:j] := V1[255:128]
3408 /// 2: result[127+j:j] := V2[127:0]
3409 /// 3: result[127+j:j] := V2[255:128]
3410 /// ESAC
3411 /// ELSE
3412 /// result[127+j:j] := 0
3413 /// FI
3414 /// ENDFOR
3415 /// \endcode
3417 /// \headerfile <immintrin.h>
3419 /// \code
3420 /// __m256i _mm256_permute2x128_si256(__m256i V1, __m256i V2, const int M);
3421 /// \endcode
3423 /// This intrinsic corresponds to the \c VPERM2I128 instruction.
3425 /// \param V1
3426 /// A 256-bit integer vector containing source values.
3427 /// \param V2
3428 /// A 256-bit integer vector containing source values.
3429 /// \param M
3430 /// An immediate value specifying how to form the result. Bits [3:0]
3431 /// control the lower half of the result; bits [7:4] control the upper half.
3432 /// Within each 4-bit control value, if bit 3 is 1, that half is zeroed;
3433 /// otherwise bits [1:0] determine the source as follows. \n
3434 /// 0: the lower half of \a V1 \n
3435 /// 1: the upper half of \a V1 \n
3436 /// 2: the lower half of \a V2 \n
3437 /// 3: the upper half of \a V2
3438 /// \returns A 256-bit integer vector containing the result.
3439 #define _mm256_permute2x128_si256(V1, V2, M) \
3440 ((__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M)))
3442 /// Extracts half of the 256-bit vector \a V to the 128-bit result. If bit 0
3443 /// of the immediate \a M is zero, extracts the lower half of \a V;
3444 /// otherwise, extracts the upper half.
3446 /// \headerfile <immintrin.h>
3448 /// \code
3449 /// __m128i _mm256_extracti128_si256(__m256i V, const int M);
3450 /// \endcode
3452 /// This intrinsic corresponds to the \c VEXTRACTI128 instruction.
3454 /// \param V
3455 /// A 256-bit integer vector containing the source values.
3456 /// \param M
3457 /// An immediate value specifying which half of \a V to extract.
3458 /// \returns A 128-bit integer vector containing the result.
3459 #define _mm256_extracti128_si256(V, M) \
3460 ((__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M)))
3462 /// Copies the 256-bit vector \a V1 to the result, then overwrites half of the
3463 /// result with the 128-bit vector \a V2. If bit 0 of the immediate \a M
3464 /// is zero, overwrites the lower half of the result; otherwise,
3465 /// overwrites the upper half.
3467 /// \headerfile <immintrin.h>
3469 /// \code
3470 /// __m256i _mm256_inserti128_si256(__m256i V1, __m128i V2, const int M);
3471 /// \endcode
3473 /// This intrinsic corresponds to the \c VINSERTI128 instruction.
3475 /// \param V1
3476 /// A 256-bit integer vector containing a source value.
3477 /// \param V2
3478 /// A 128-bit integer vector containing a source value.
3479 /// \param M
3480 /// An immediate value whose bit 0 selects which half \a V2 overwrites.
3481 /// \returns A 256-bit integer vector containing the result.
3482 #define _mm256_inserti128_si256(V1, V2, M) \
3483 ((__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \
3484 (__v2di)(__m128i)(V2), (int)(M)))
3486 /// Conditionally loads eight 32-bit integer elements from memory \a __X, if
3487 /// the most significant bit of the corresponding element in the mask
3488 /// \a __M is set; otherwise, sets that element of the result to zero.
3489 /// Returns the 256-bit vector of [8 x i32] result.
3491 /// \code{.operation}
3492 /// FOR i := 0 TO 7
3493 /// j := i*32
3494 /// IF __M[j+31] == 1
3495 /// result[j+31:j] := Load32(__X+(i*4))
3496 /// ELSE
3497 /// result[j+31:j] := 0
3498 /// FI
3499 /// ENDFOR
3500 /// \endcode
3502 /// \headerfile <immintrin.h>
3504 /// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3506 /// \param __X
3507 /// A pointer to the memory used for loading values.
3508 /// \param __M
3509 /// A 256-bit vector of [8 x i32] containing the mask bits.
3510 /// \returns A 256-bit vector of [8 x i32] containing the loaded or zeroed
3511 /// elements.
3512 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3513 _mm256_maskload_epi32(int const *__X, __m256i __M)
3515 return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M);
3518 /// Conditionally loads four 64-bit integer elements from memory \a __X, if
3519 /// the most significant bit of the corresponding element in the mask
3520 /// \a __M is set; otherwise, sets that element of the result to zero.
3521 /// Returns the 256-bit vector of [4 x i64] result.
3523 /// \code{.operation}
3524 /// FOR i := 0 TO 3
3525 /// j := i*64
3526 /// IF __M[j+63] == 1
3527 /// result[j+63:j] := Load64(__X+(i*8))
3528 /// ELSE
3529 /// result[j+63:j] := 0
3530 /// FI
3531 /// ENDFOR
3532 /// \endcode
3534 /// \headerfile <immintrin.h>
3536 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3538 /// \param __X
3539 /// A pointer to the memory used for loading values.
3540 /// \param __M
3541 /// A 256-bit vector of [4 x i64] containing the mask bits.
3542 /// \returns A 256-bit vector of [4 x i64] containing the loaded or zeroed
3543 /// elements.
3544 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3545 _mm256_maskload_epi64(long long const *__X, __m256i __M)
3547 return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M);
3550 /// Conditionally loads four 32-bit integer elements from memory \a __X, if
3551 /// the most significant bit of the corresponding element in the mask
3552 /// \a __M is set; otherwise, sets that element of the result to zero.
3553 /// Returns the 128-bit vector of [4 x i32] result.
3555 /// \code{.operation}
3556 /// FOR i := 0 TO 3
3557 /// j := i*32
3558 /// IF __M[j+31] == 1
3559 /// result[j+31:j] := Load32(__X+(i*4))
3560 /// ELSE
3561 /// result[j+31:j] := 0
3562 /// FI
3563 /// ENDFOR
3564 /// \endcode
3566 /// \headerfile <immintrin.h>
3568 /// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3570 /// \param __X
3571 /// A pointer to the memory used for loading values.
3572 /// \param __M
3573 /// A 128-bit vector of [4 x i32] containing the mask bits.
3574 /// \returns A 128-bit vector of [4 x i32] containing the loaded or zeroed
3575 /// elements.
3576 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3577 _mm_maskload_epi32(int const *__X, __m128i __M)
3579 return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M);
3582 /// Conditionally loads two 64-bit integer elements from memory \a __X, if
3583 /// the most significant bit of the corresponding element in the mask
3584 /// \a __M is set; otherwise, sets that element of the result to zero.
3585 /// Returns the 128-bit vector of [2 x i64] result.
3587 /// \code{.operation}
3588 /// FOR i := 0 TO 1
3589 /// j := i*64
3590 /// IF __M[j+63] == 1
3591 /// result[j+63:j] := Load64(__X+(i*8))
3592 /// ELSE
3593 /// result[j+63:j] := 0
3594 /// FI
3595 /// ENDFOR
3596 /// \endcode
3598 /// \headerfile <immintrin.h>
3600 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3602 /// \param __X
3603 /// A pointer to the memory used for loading values.
3604 /// \param __M
3605 /// A 128-bit vector of [2 x i64] containing the mask bits.
3606 /// \returns A 128-bit vector of [2 x i64] containing the loaded or zeroed
3607 /// elements.
3608 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3609 _mm_maskload_epi64(long long const *__X, __m128i __M)
3611 return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M);
3614 /// Conditionally stores eight 32-bit integer elements from the 256-bit vector
3615 /// of [8 x i32] in \a __Y to memory \a __X, if the most significant bit of
3616 /// the corresponding element in the mask \a __M is set; otherwise, the
3617 /// corresponding memory element is left unchanged.
3619 /// \code{.operation}
3620 /// FOR i := 0 TO 7
3621 /// j := i*32
3622 /// IF __M[j+31] == 1
3623 /// Store32(__X+(i*4), __Y[j+31:j])
3624 /// FI
3625 /// ENDFOR
3626 /// \endcode
3628 /// \headerfile <immintrin.h>
3630 /// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3632 /// \param __X
3633 /// A pointer to the memory used for storing values.
3634 /// \param __M
3635 /// A 256-bit vector of [8 x i32] containing the mask bits.
3636 /// \param __Y
3637 /// A 256-bit vector of [8 x i32] containing the values to store.
3638 static __inline__ void __DEFAULT_FN_ATTRS256
3639 _mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
3641 __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
3644 /// Conditionally stores four 64-bit integer elements from the 256-bit vector
3645 /// of [4 x i64] in \a __Y to memory \a __X, if the most significant bit of
3646 /// the corresponding element in the mask \a __M is set; otherwise, the
3647 /// corresponding memory element is left unchanged.
3649 /// \code{.operation}
3650 /// FOR i := 0 TO 3
3651 /// j := i*64
3652 /// IF __M[j+63] == 1
3653 /// Store64(__X+(i*8), __Y[j+63:j])
3654 /// FI
3655 /// ENDFOR
3656 /// \endcode
3658 /// \headerfile <immintrin.h>
3660 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3662 /// \param __X
3663 /// A pointer to the memory used for storing values.
3664 /// \param __M
3665 /// A 256-bit vector of [4 x i64] containing the mask bits.
3666 /// \param __Y
3667 /// A 256-bit vector of [4 x i64] containing the values to store.
3668 static __inline__ void __DEFAULT_FN_ATTRS256
3669 _mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
3671 __builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
3674 /// Conditionally stores four 32-bit integer elements from the 128-bit vector
3675 /// of [4 x i32] in \a __Y to memory \a __X, if the most significant bit of
3676 /// the corresponding element in the mask \a __M is set; otherwise, the
3677 /// corresponding memory element is left unchanged.
3679 /// \code{.operation}
3680 /// FOR i := 0 TO 3
3681 /// j := i*32
3682 /// IF __M[j+31] == 1
3683 /// Store32(__X+(i*4), __Y[j+31:j])
3684 /// FI
3685 /// ENDFOR
3686 /// \endcode
3688 /// \headerfile <immintrin.h>
3690 /// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3692 /// \param __X
3693 /// A pointer to the memory used for storing values.
3694 /// \param __M
3695 /// A 128-bit vector of [4 x i32] containing the mask bits.
3696 /// \param __Y
3697 /// A 128-bit vector of [4 x i32] containing the values to store.
3698 static __inline__ void __DEFAULT_FN_ATTRS128
3699 _mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
3701 __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
3704 /// Conditionally stores two 64-bit integer elements from the 128-bit vector
3705 /// of [2 x i64] in \a __Y to memory \a __X, if the most significant bit of
3706 /// the corresponding element in the mask \a __M is set; otherwise, the
3707 /// corresponding memory element is left unchanged.
3709 /// \code{.operation}
3710 /// FOR i := 0 TO 1
3711 /// j := i*64
3712 /// IF __M[j+63] == 1
3713 /// Store64(__X+(i*8), __Y[j+63:j])
3714 /// FI
3715 /// ENDFOR
3716 /// \endcode
3718 /// \headerfile <immintrin.h>
3720 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3722 /// \param __X
3723 /// A pointer to the memory used for storing values.
3724 /// \param __M
3725 /// A 128-bit vector of [2 x i64] containing the mask bits.
3726 /// \param __Y
3727 /// A 128-bit vector of [2 x i64] containing the values to store.
3728 static __inline__ void __DEFAULT_FN_ATTRS128
3729 _mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
3731 __builtin_ia32_maskstoreq((__v2di *)__X, (__v2di)__M, (__v2di)__Y);
3734 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3735 /// left by the number of bits given in the corresponding element of the
3736 /// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
3737 /// returns the result. If the unsigned shift count for any element is
3738 /// greater than 31, the result for that element is zero.
3740 /// \headerfile <immintrin.h>
3742 /// This intrinsic corresponds to the \c VPSLLVD instruction.
3744 /// \param __X
3745 /// A 256-bit vector of [8 x i32] to be shifted.
3746 /// \param __Y
3747 /// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3748 /// bits).
3749 /// \returns A 256-bit vector of [8 x i32] containing the result.
3750 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3751 _mm256_sllv_epi32(__m256i __X, __m256i __Y)
3753 return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
3756 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3757 /// left by the number of bits given in the corresponding element of the
3758 /// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
3759 /// returns the result. If the unsigned shift count for any element is
3760 /// greater than 31, the result for that element is zero.
3762 /// \headerfile <immintrin.h>
3764 /// This intrinsic corresponds to the \c VPSLLVD instruction.
3766 /// \param __X
3767 /// A 128-bit vector of [4 x i32] to be shifted.
3768 /// \param __Y
3769 /// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3770 /// bits).
3771 /// \returns A 128-bit vector of [4 x i32] containing the result.
3772 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3773 _mm_sllv_epi32(__m128i __X, __m128i __Y)
3775 return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
3778 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
3779 /// left by the number of bits given in the corresponding element of the
3780 /// 256-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
3781 /// returns the result. If the unsigned shift count for any element is
3782 /// greater than 63, the result for that element is zero.
3784 /// \headerfile <immintrin.h>
3786 /// This intrinsic corresponds to the \c VPSLLVQ instruction.
3788 /// \param __X
3789 /// A 256-bit vector of [4 x i64] to be shifted.
3790 /// \param __Y
3791 /// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
3792 /// bits).
3793 /// \returns A 256-bit vector of [4 x i64] containing the result.
3794 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3795 _mm256_sllv_epi64(__m256i __X, __m256i __Y)
3797 return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y);
3800 /// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3801 /// left by the number of bits given in the corresponding element of the
3802 /// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3803 /// returns the result. If the unsigned shift count for any element is
3804 /// greater than 63, the result for that element is zero.
3806 /// \headerfile <immintrin.h>
3808 /// This intrinsic corresponds to the \c VPSLLVQ instruction.
3810 /// \param __X
3811 /// A 128-bit vector of [2 x i64] to be shifted.
3812 /// \param __Y
3813 /// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
3814 /// bits).
3815 /// \returns A 128-bit vector of [2 x i64] containing the result.
3816 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3817 _mm_sllv_epi64(__m128i __X, __m128i __Y)
3819 return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y);
3822 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3823 /// right by the number of bits given in the corresponding element of the
3824 /// 256-bit vector of [8 x i32] in \a __Y, shifting in sign bits, and
3825 /// returns the result. If the unsigned shift count for any element is
3826 /// greater than 31, the result for that element is 0 or -1 according to
3827 /// the sign bit for that element.
3829 /// \headerfile <immintrin.h>
3831 /// This intrinsic corresponds to the \c VPSRAVD instruction.
3833 /// \param __X
3834 /// A 256-bit vector of [8 x i32] to be shifted.
3835 /// \param __Y
3836 /// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3837 /// bits).
3838 /// \returns A 256-bit vector of [8 x i32] containing the result.
3839 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3840 _mm256_srav_epi32(__m256i __X, __m256i __Y)
3842 return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
3845 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3846 /// right by the number of bits given in the corresponding element of the
3847 /// 128-bit vector of [4 x i32] in \a __Y, shifting in sign bits, and
3848 /// returns the result. If the unsigned shift count for any element is
3849 /// greater than 31, the result for that element is 0 or -1 according to
3850 /// the sign bit for that element.
3852 /// \headerfile <immintrin.h>
3854 /// This intrinsic corresponds to the \c VPSRAVD instruction.
3856 /// \param __X
3857 /// A 128-bit vector of [4 x i32] to be shifted.
3858 /// \param __Y
3859 /// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3860 /// bits).
3861 /// \returns A 128-bit vector of [4 x i32] containing the result.
3862 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3863 _mm_srav_epi32(__m128i __X, __m128i __Y)
3865 return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
3868 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3869 /// right by the number of bits given in the corresponding element of the
3870 /// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
3871 /// returns the result. If the unsigned shift count for any element is
3872 /// greater than 31, the result for that element is zero.
3874 /// \headerfile <immintrin.h>
3876 /// This intrinsic corresponds to the \c VPSRLVD instruction.
3878 /// \param __X
3879 /// A 256-bit vector of [8 x i32] to be shifted.
3880 /// \param __Y
3881 /// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3882 /// bits).
3883 /// \returns A 256-bit vector of [8 x i32] containing the result.
3884 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3885 _mm256_srlv_epi32(__m256i __X, __m256i __Y)
3887 return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
3890 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3891 /// right by the number of bits given in the corresponding element of the
3892 /// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
3893 /// returns the result. If the unsigned shift count for any element is
3894 /// greater than 31, the result for that element is zero.
3896 /// \headerfile <immintrin.h>
3898 /// This intrinsic corresponds to the \c VPSRLVD instruction.
3900 /// \param __X
3901 /// A 128-bit vector of [4 x i32] to be shifted.
3902 /// \param __Y
3903 /// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3904 /// bits).
3905 /// \returns A 128-bit vector of [4 x i32] containing the result.
3906 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3907 _mm_srlv_epi32(__m128i __X, __m128i __Y)
3909 return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
3912 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
3913 /// right by the number of bits given in the corresponding element of the
3914 /// 256-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
3915 /// returns the result. If the unsigned shift count for any element is
3916 /// greater than 63, the result for that element is zero.
3918 /// \headerfile <immintrin.h>
3920 /// This intrinsic corresponds to the \c VPSRLVQ instruction.
3922 /// \param __X
3923 /// A 256-bit vector of [4 x i64] to be shifted.
3924 /// \param __Y
3925 /// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
3926 /// bits).
3927 /// \returns A 256-bit vector of [4 x i64] containing the result.
3928 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3929 _mm256_srlv_epi64(__m256i __X, __m256i __Y)
3931 return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y);
3934 /// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3935 /// right by the number of bits given in the corresponding element of the
3936 /// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3937 /// returns the result. If the unsigned shift count for any element is
3938 /// greater than 63, the result for that element is zero.
3940 /// \headerfile <immintrin.h>
3942 /// This intrinsic corresponds to the \c VPSRLVQ instruction.
3944 /// \param __X
3945 /// A 128-bit vector of [2 x i64] to be shifted.
3946 /// \param __Y
3947 /// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
3948 /// bits).
3949 /// \returns A 128-bit vector of [2 x i64] containing the result.
3950 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3951 _mm_srlv_epi64(__m128i __X, __m128i __Y)
3953 return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);
3956 /// Conditionally gathers two 64-bit floating-point values, either from the
3957 /// 128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
3958 /// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
3959 /// of [2 x double] in \a mask determines the source for each element.
3961 /// \code{.operation}
3962 /// FOR element := 0 TO 1
3963 /// j := element*64
3964 /// k := element*32
3965 /// IF mask[j+63] == 0
3966 /// result[j+63:j] := a[j+63:j]
3967 /// ELSE
3968 /// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
3969 /// FI
3970 /// ENDFOR
3971 /// \endcode
3973 /// \headerfile <immintrin.h>
3975 /// \code
3976 /// __m128d _mm_mask_i32gather_pd(__m128d a, const double *m, __m128i i,
3977 /// __m128d mask, const int s);
3978 /// \endcode
3980 /// This intrinsic corresponds to the \c VGATHERDPD instruction.
3982 /// \param a
3983 /// A 128-bit vector of [2 x double] used as the source when a mask bit is
3984 /// zero.
3985 /// \param m
3986 /// A pointer to the memory used for loading values.
3987 /// \param i
3988 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
3989 /// the first two elements are used.
3990 /// \param mask
3991 /// A 128-bit vector of [2 x double] containing the mask. The most
3992 /// significant bit of each element in the mask vector is the mask bit for
3993 /// that element. If a mask bit is zero, the corresponding value from vector
3994 /// \a a is used; otherwise the value is loaded from memory.
3995 /// \param s
3996 /// A literal constant scale factor for the indexes in \a i. Must be
3997 /// 1, 2, 4, or 8.
3998 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
3999 #define _mm_mask_i32gather_pd(a, m, i, mask, s) \
4000 ((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128d)(a), \
4001 (double const *)(m), \
4002 (__v4si)(__m128i)(i), \
4003 (__v2df)(__m128d)(mask), (s)))
4005 /// Conditionally gathers four 64-bit floating-point values, either from the
4006 /// 256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
4007 /// indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
4008 /// of [4 x double] in \a mask determines the source for each element.
4010 /// \code{.operation}
4011 /// FOR element := 0 TO 3
4012 /// j := element*64
4013 /// k := element*32
4014 /// IF mask[j+63] == 0
4015 /// result[j+63:j] := a[j+63:j]
4016 /// ELSE
4017 /// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4018 /// FI
4019 /// ENDFOR
4020 /// \endcode
4022 /// \headerfile <immintrin.h>
4024 /// \code
4025 /// __m256d _mm256_mask_i32gather_pd(__m256d a, const double *m, __m128i i,
4026 /// __m256d mask, const int s);
4027 /// \endcode
4029 /// This intrinsic corresponds to the \c VGATHERDPD instruction.
4031 /// \param a
4032 /// A 256-bit vector of [4 x double] used as the source when a mask bit is
4033 /// zero.
4034 /// \param m
4035 /// A pointer to the memory used for loading values.
4036 /// \param i
4037 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4038 /// \param mask
4039 /// A 256-bit vector of [4 x double] containing the mask. The most
4040 /// significant bit of each element in the mask vector is the mask bit for
4041 /// that element. If a mask bit is zero, the corresponding value from vector
4042 /// \a a is used; otherwise the value is loaded from memory.
4043 /// \param s
4044 /// A literal constant scale factor for the indexes in \a i. Must be
4045 /// 1, 2, 4, or 8.
4046 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
4047 #define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
4048 ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
4049 (double const *)(m), \
4050 (__v4si)(__m128i)(i), \
4051 (__v4df)(__m256d)(mask), (s)))
4053 /// Conditionally gathers two 64-bit floating-point values, either from the
4054 /// 128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
4055 /// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4056 /// of [2 x double] in \a mask determines the source for each element.
4058 /// \code{.operation}
4059 /// FOR element := 0 TO 1
4060 /// j := element*64
4061 /// k := element*64
4062 /// IF mask[j+63] == 0
4063 /// result[j+63:j] := a[j+63:j]
4064 /// ELSE
4065 /// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4066 /// FI
4067 /// ENDFOR
4068 /// \endcode
4070 /// \headerfile <immintrin.h>
4072 /// \code
4073 /// __m128d _mm_mask_i64gather_pd(__m128d a, const double *m, __m128i i,
4074 /// __m128d mask, const int s);
4075 /// \endcode
4077 /// This intrinsic corresponds to the \c VGATHERQPD instruction.
4079 /// \param a
4080 /// A 128-bit vector of [2 x double] used as the source when a mask bit is
4081 /// zero.
4082 /// \param m
4083 /// A pointer to the memory used for loading values.
4084 /// \param i
4085 /// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4086 /// \param mask
4087 /// A 128-bit vector of [2 x double] containing the mask. The most
4088 /// significant bit of each element in the mask vector is the mask bit for
4089 /// that element. If a mask bit is zero, the corresponding value from vector
4090 /// \a a is used; otherwise the value is loaded from memory.
4091 /// \param s
4092 /// A literal constant scale factor for the indexes in \a i. Must be
4093 /// 1, 2, 4, or 8.
4094 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
4095 #define _mm_mask_i64gather_pd(a, m, i, mask, s) \
4096 ((__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
4097 (double const *)(m), \
4098 (__v2di)(__m128i)(i), \
4099 (__v2df)(__m128d)(mask), (s)))
4101 /// Conditionally gathers four 64-bit floating-point values, either from the
4102 /// 256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
4103 /// indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
4104 /// of [4 x double] in \a mask determines the source for each element.
4106 /// \code{.operation}
4107 /// FOR element := 0 TO 3
4108 /// j := element*64
4109 /// k := element*64
4110 /// IF mask[j+63] == 0
4111 /// result[j+63:j] := a[j+63:j]
4112 /// ELSE
4113 /// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4114 /// FI
4115 /// ENDFOR
4116 /// \endcode
4118 /// \headerfile <immintrin.h>
4120 /// \code
4121 /// __m256d _mm256_mask_i64gather_pd(__m256d a, const double *m, __m256i i,
4122 /// __m256d mask, const int s);
4123 /// \endcode
4125 /// This intrinsic corresponds to the \c VGATHERQPD instruction.
4127 /// \param a
4128 /// A 256-bit vector of [4 x double] used as the source when a mask bit is
4129 /// zero.
4130 /// \param m
4131 /// A pointer to the memory used for loading values.
4132 /// \param i
4133 /// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4134 /// \param mask
4135 /// A 256-bit vector of [4 x double] containing the mask. The most
4136 /// significant bit of each element in the mask vector is the mask bit for
4137 /// that element. If a mask bit is zero, the corresponding value from vector
4138 /// \a a is used; otherwise the value is loaded from memory.
4139 /// \param s
4140 /// A literal constant scale factor for the indexes in \a i. Must be
4141 /// 1, 2, 4, or 8.
4142 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
4143 #define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
4144 ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
4145 (double const *)(m), \
4146 (__v4di)(__m256i)(i), \
4147 (__v4df)(__m256d)(mask), (s)))
4149 /// Conditionally gathers four 32-bit floating-point values, either from the
4150 /// 128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4151 /// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4152 /// of [4 x float] in \a mask determines the source for each element.
4154 /// \code{.operation}
4155 /// FOR element := 0 TO 3
4156 /// j := element*32
4157 /// k := element*32
4158 /// IF mask[j+31] == 0
4159 /// result[j+31:j] := a[j+31:j]
4160 /// ELSE
4161 /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4162 /// FI
4163 /// ENDFOR
4164 /// \endcode
4166 /// \headerfile <immintrin.h>
4168 /// \code
4169 /// __m128 _mm_mask_i32gather_ps(__m128 a, const float *m, __m128i i,
4170 /// __m128 mask, const int s);
4171 /// \endcode
4173 /// This intrinsic corresponds to the \c VGATHERDPS instruction.
4175 /// \param a
4176 /// A 128-bit vector of [4 x float] used as the source when a mask bit is
4177 /// zero.
4178 /// \param m
4179 /// A pointer to the memory used for loading values.
4180 /// \param i
4181 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4182 /// \param mask
4183 /// A 128-bit vector of [4 x float] containing the mask. The most
4184 /// significant bit of each element in the mask vector is the mask bit for
4185 /// that element. If a mask bit is zero, the corresponding value from vector
4186 /// \a a is used; otherwise the value is loaded from memory.
4187 /// \param s
4188 /// A literal constant scale factor for the indexes in \a i. Must be
4189 /// 1, 2, 4, or 8.
4190 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
4191 #define _mm_mask_i32gather_ps(a, m, i, mask, s) \
4192 ((__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
4193 (float const *)(m), \
4194 (__v4si)(__m128i)(i), \
4195 (__v4sf)(__m128)(mask), (s)))
4197 /// Conditionally gathers eight 32-bit floating-point values, either from the
4198 /// 256-bit vector of [8 x float] in \a a, or from memory \a m using scaled
4199 /// indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
4200 /// of [8 x float] in \a mask determines the source for each element.
4202 /// \code{.operation}
4203 /// FOR element := 0 TO 7
4204 /// j := element*32
4205 /// k := element*32
4206 /// IF mask[j+31] == 0
4207 /// result[j+31:j] := a[j+31:j]
4208 /// ELSE
4209 /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4210 /// FI
4211 /// ENDFOR
4212 /// \endcode
4214 /// \headerfile <immintrin.h>
4216 /// \code
4217 /// __m256 _mm256_mask_i32gather_ps(__m256 a, const float *m, __m256i i,
4218 /// __m256 mask, const int s);
4219 /// \endcode
4221 /// This intrinsic corresponds to the \c VGATHERDPS instruction.
4223 /// \param a
4224 /// A 256-bit vector of [8 x float] used as the source when a mask bit is
4225 /// zero.
4226 /// \param m
4227 /// A pointer to the memory used for loading values.
4228 /// \param i
4229 /// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4230 /// \param mask
4231 /// A 256-bit vector of [8 x float] containing the mask. The most
4232 /// significant bit of each element in the mask vector is the mask bit for
4233 /// that element. If a mask bit is zero, the corresponding value from vector
4234 /// \a a is used; otherwise the value is loaded from memory.
4235 /// \param s
4236 /// A literal constant scale factor for the indexes in \a i. Must be
4237 /// 1, 2, 4, or 8.
4238 /// \returns A 256-bit vector of [8 x float] containing the gathered values.
4239 #define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
4240 ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
4241 (float const *)(m), \
4242 (__v8si)(__m256i)(i), \
4243 (__v8sf)(__m256)(mask), (s)))
4245 /// Conditionally gathers two 32-bit floating-point values, either from the
4246 /// 128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4247 /// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4248 /// of [4 x float] in \a mask determines the source for the lower two
4249 /// elements. The upper two elements of the result are zeroed.
4251 /// \code{.operation}
4252 /// FOR element := 0 to 1
4253 /// j := element*32
4254 /// k := element*64
4255 /// IF mask[j+31] == 0
4256 /// result[j+31:j] := a[j+31:j]
4257 /// ELSE
4258 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4259 /// FI
4260 /// ENDFOR
4261 /// result[127:64] := 0
4262 /// \endcode
4264 /// \headerfile <immintrin.h>
4266 /// \code
4267 /// __m128 _mm_mask_i64gather_ps(__m128 a, const float *m, __m128i i,
4268 /// __m128 mask, const int s);
4269 /// \endcode
4271 /// This intrinsic corresponds to the \c VGATHERQPS instruction.
4273 /// \param a
4274 /// A 128-bit vector of [4 x float] used as the source when a mask bit is
4275 /// zero. Only the first two elements are used.
4276 /// \param m
4277 /// A pointer to the memory used for loading values.
4278 /// \param i
4279 /// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4280 /// \param mask
4281 /// A 128-bit vector of [4 x float] containing the mask. The most
4282 /// significant bit of each element in the mask vector represents the mask
4283 /// bits. If a mask bit is zero, the corresponding value from vector \a a
4284 /// is gathered; otherwise the value is loaded from memory. Only the first
4285 /// two elements are used.
4286 /// \param s
4287 /// A literal constant scale factor for the indexes in \a i. Must be
4288 /// 1, 2, 4, or 8.
4289 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* Thin wrapper over __builtin_ia32_gatherq_ps: a and mask go in as
   [4 x float], the index vector i as [2 x i64]; s is forwarded untouched. */
#define _mm_mask_i64gather_ps(a, m, i, mask, s)                                \
  ((__m128)__builtin_ia32_gatherq_ps(                                          \
      (__v4sf)(__m128)(a), (const float *)(m), (__v2di)(__m128i)(i),           \
      (__v4sf)(__m128)(mask), (s)))
4296 /// Conditionally gathers four 32-bit floating-point values, either from the
4297 /// 128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4298 /// indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
4299 /// of [4 x float] in \a mask determines the source for each element.
4301 /// \code{.operation}
4302 /// FOR element := 0 to 3
4303 /// j := element*32
4304 /// k := element*64
4305 /// IF mask[j+31] == 0
4306 /// result[j+31:j] := a[j+31:j]
4307 /// ELSE
4308 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4309 /// FI
4310 /// ENDFOR
4311 /// \endcode
4313 /// \headerfile <immintrin.h>
4315 /// \code
4316 /// __m128 _mm256_mask_i64gather_ps(__m128 a, const float *m, __m256i i,
4317 /// __m128 mask, const int s);
4318 /// \endcode
4320 /// This intrinsic corresponds to the \c VGATHERQPS instruction.
4322 /// \param a
4323 /// A 128-bit vector of [4 x float] used as the source when a mask bit is
4324 /// zero.
4325 /// \param m
4326 /// A pointer to the memory used for loading values.
4327 /// \param i
4328 /// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4329 /// \param mask
4330 /// A 128-bit vector of [4 x float] containing the mask. The most
4331 /// significant bit of each element in the mask vector represents the mask
4332 /// bits. If a mask bit is zero, the corresponding value from vector \a a
4333 /// is gathered; otherwise the value is loaded from memory.
4334 /// \param s
4335 /// A literal constant scale factor for the indexes in \a i. Must be
4336 /// 1, 2, 4, or 8.
4337 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* Thin wrapper over __builtin_ia32_gatherq_ps256: a and mask go in as
   128-bit [4 x float], the index vector i as 256-bit [4 x i64]; s is
   forwarded untouched. */
#define _mm256_mask_i64gather_ps(a, m, i, mask, s)                             \
  ((__m128)__builtin_ia32_gatherq_ps256(                                       \
      (__v4sf)(__m128)(a), (const float *)(m), (__v4di)(__m256i)(i),           \
      (__v4sf)(__m128)(mask), (s)))
4344 /// Conditionally gathers four 32-bit integer values, either from the
4345 /// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4346 /// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4347 /// of [4 x i32] in \a mask determines the source for each element.
4349 /// \code{.operation}
4350 /// FOR element := 0 to 3
4351 /// j := element*32
4352 /// k := element*32
4353 /// IF mask[j+31] == 0
4354 /// result[j+31:j] := a[j+31:j]
4355 /// ELSE
4356 /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4357 /// FI
4358 /// ENDFOR
4359 /// \endcode
4361 /// \headerfile <immintrin.h>
4363 /// \code
4364 /// __m128i _mm_mask_i32gather_epi32(__m128i a, const int *m, __m128i i,
4365 /// __m128i mask, const int s);
4366 /// \endcode
4368 /// This intrinsic corresponds to the \c VPGATHERDD instruction.
4370 /// \param a
4371 /// A 128-bit vector of [4 x i32] used as the source when a mask bit is
4372 /// zero.
4373 /// \param m
4374 /// A pointer to the memory used for loading values.
4375 /// \param i
4376 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4377 /// \param mask
4378 /// A 128-bit vector of [4 x i32] containing the mask. The most significant
4379 /// bit of each element in the mask vector represents the mask bits. If a
4380 /// mask bit is zero, the corresponding value from vector \a a is gathered;
4381 /// otherwise the value is loaded from memory.
4382 /// \param s
4383 /// A literal constant scale factor for the indexes in \a i. Must be
4384 /// 1, 2, 4, or 8.
4385 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* Thin wrapper over __builtin_ia32_gatherd_d: a, i and mask are all viewed
   as [4 x i32]; s is forwarded untouched. */
#define _mm_mask_i32gather_epi32(a, m, i, mask, s)                             \
  ((__m128i)__builtin_ia32_gatherd_d(                                          \
      (__v4si)(__m128i)(a), (const int *)(m), (__v4si)(__m128i)(i),            \
      (__v4si)(__m128i)(mask), (s)))
4392 /// Conditionally gathers eight 32-bit integer values, either from the
4393 /// 256-bit vector of [8 x i32] in \a a, or from memory \a m using scaled
4394 /// indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
4395 /// of [8 x i32] in \a mask determines the source for each element.
4397 /// \code{.operation}
4398 /// FOR element := 0 to 7
4399 /// j := element*32
4400 /// k := element*32
4401 /// IF mask[j+31] == 0
4402 /// result[j+31:j] := a[j+31:j]
4403 /// ELSE
4404 /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4405 /// FI
4406 /// ENDFOR
4407 /// \endcode
4409 /// \headerfile <immintrin.h>
4411 /// \code
4412 /// __m256i _mm256_mask_i32gather_epi32(__m256i a, const int *m, __m256i i,
4413 /// __m256i mask, const int s);
4414 /// \endcode
4416 /// This intrinsic corresponds to the \c VPGATHERDD instruction.
4418 /// \param a
4419 /// A 256-bit vector of [8 x i32] used as the source when a mask bit is
4420 /// zero.
4421 /// \param m
4422 /// A pointer to the memory used for loading values.
4423 /// \param i
4424 /// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4425 /// \param mask
4426 /// A 256-bit vector of [8 x i32] containing the mask. The most significant
4427 /// bit of each element in the mask vector represents the mask bits. If a
4428 /// mask bit is zero, the corresponding value from vector \a a is gathered;
4429 /// otherwise the value is loaded from memory.
4430 /// \param s
4431 /// A literal constant scale factor for the indexes in \a i. Must be
4432 /// 1, 2, 4, or 8.
4433 /// \returns A 256-bit vector of [8 x i32] containing the gathered values.
/* Thin wrapper over __builtin_ia32_gatherd_d256: a, i and mask are all
   viewed as [8 x i32]; s is forwarded untouched. */
#define _mm256_mask_i32gather_epi32(a, m, i, mask, s)                          \
  ((__m256i)__builtin_ia32_gatherd_d256(                                       \
      (__v8si)(__m256i)(a), (const int *)(m), (__v8si)(__m256i)(i),            \
      (__v8si)(__m256i)(mask), (s)))
4440 /// Conditionally gathers two 32-bit integer values, either from the
4441 /// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4442 /// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4443 /// of [4 x i32] in \a mask determines the source for the lower two
4444 /// elements. The upper two elements of the result are zeroed.
4446 /// \code{.operation}
4447 /// FOR element := 0 to 1
4448 /// j := element*32
4449 /// k := element*64
4450 /// IF mask[j+31] == 0
4451 /// result[j+31:j] := a[j+31:j]
4452 /// ELSE
4453 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4454 /// FI
4455 /// ENDFOR
4456 /// result[127:64] := 0
4457 /// \endcode
4459 /// \headerfile <immintrin.h>
4461 /// \code
4462 /// __m128i _mm_mask_i64gather_epi32(__m128i a, const int *m, __m128i i,
4463 /// __m128i mask, const int s);
4464 /// \endcode
4466 /// This intrinsic corresponds to the \c VPGATHERQD instruction.
4468 /// \param a
4469 /// A 128-bit vector of [4 x i32] used as the source when a mask bit is
4470 /// zero. Only the first two elements are used.
4471 /// \param m
4472 /// A pointer to the memory used for loading values.
4473 /// \param i
4474 /// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4475 /// \param mask
4476 /// A 128-bit vector of [4 x i32] containing the mask. The most significant
4477 /// bit of each element in the mask vector represents the mask bits. If a
4478 /// mask bit is zero, the corresponding value from vector \a a is gathered;
4479 /// otherwise the value is loaded from memory. Only the first two elements
4480 /// are used.
4481 /// \param s
4482 /// A literal constant scale factor for the indexes in \a i. Must be
4483 /// 1, 2, 4, or 8.
4484 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* Thin wrapper over __builtin_ia32_gatherq_d: a and mask are viewed as
   [4 x i32], the index vector i as [2 x i64]; s is forwarded untouched. */
#define _mm_mask_i64gather_epi32(a, m, i, mask, s)                             \
  ((__m128i)__builtin_ia32_gatherq_d(                                          \
      (__v4si)(__m128i)(a), (const int *)(m), (__v2di)(__m128i)(i),            \
      (__v4si)(__m128i)(mask), (s)))
4491 /// Conditionally gathers four 32-bit integer values, either from the
4492 /// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4493 /// indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
4494 /// of [4 x i32] in \a mask determines the source for each element.
4496 /// \code{.operation}
4497 /// FOR element := 0 to 3
4498 /// j := element*32
4499 /// k := element*64
4500 /// IF mask[j+31] == 0
4501 /// result[j+31:j] := a[j+31:j]
4502 /// ELSE
4503 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4504 /// FI
4505 /// ENDFOR
4506 /// \endcode
4508 /// \headerfile <immintrin.h>
4510 /// \code
4511 /// __m128i _mm256_mask_i64gather_epi32(__m128i a, const int *m, __m256i i,
4512 /// __m128i mask, const int s);
4513 /// \endcode
4515 /// This intrinsic corresponds to the \c VPGATHERQD instruction.
4517 /// \param a
4518 /// A 128-bit vector of [4 x i32] used as the source when a mask bit is
4519 /// zero.
4520 /// \param m
4521 /// A pointer to the memory used for loading values.
4522 /// \param i
4523 /// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4524 /// \param mask
4525 /// A 128-bit vector of [4 x i32] containing the mask. The most significant
4526 /// bit of each element in the mask vector represents the mask bits. If a
4527 /// mask bit is zero, the corresponding value from vector \a a is gathered;
4528 /// otherwise the value is loaded from memory.
4529 /// \param s
4530 /// A literal constant scale factor for the indexes in \a i. Must be
4531 /// 1, 2, 4, or 8.
4532 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* Thin wrapper over __builtin_ia32_gatherq_d256: a and mask are viewed as
   128-bit [4 x i32], the index vector i as 256-bit [4 x i64]; s is
   forwarded untouched. */
#define _mm256_mask_i64gather_epi32(a, m, i, mask, s)                          \
  ((__m128i)__builtin_ia32_gatherq_d256(                                       \
      (__v4si)(__m128i)(a), (const int *)(m), (__v4di)(__m256i)(i),            \
      (__v4si)(__m128i)(mask), (s)))
4539 /// Conditionally gathers two 64-bit integer values, either from the
4540 /// 128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
4541 /// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4542 /// of [2 x i64] in \a mask determines the source for each element.
4544 /// \code{.operation}
4545 /// FOR element := 0 to 1
4546 /// j := element*64
4547 /// k := element*32
4548 /// IF mask[j+63] == 0
4549 /// result[j+63:j] := a[j+63:j]
4550 /// ELSE
4551 /// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4552 /// FI
4553 /// ENDFOR
4554 /// \endcode
4556 /// \headerfile <immintrin.h>
4558 /// \code
4559 /// __m128i _mm_mask_i32gather_epi64(__m128i a, const long long *m, __m128i i,
4560 /// __m128i mask, const int s);
4561 /// \endcode
4563 /// This intrinsic corresponds to the \c VPGATHERDQ instruction.
4565 /// \param a
4566 /// A 128-bit vector of [2 x i64] used as the source when a mask bit is
4567 /// zero.
4568 /// \param m
4569 /// A pointer to the memory used for loading values.
4570 /// \param i
4571 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
4572 /// the first two elements are used.
4573 /// \param mask
4574 /// A 128-bit vector of [2 x i64] containing the mask. The most significant
4575 /// bit of each element in the mask vector represents the mask bits. If a
4576 /// mask bit is zero, the corresponding value from vector \a a is gathered;
4577 /// otherwise the value is loaded from memory.
4578 /// \param s
4579 /// A literal constant scale factor for the indexes in \a i. Must be
4580 /// 1, 2, 4, or 8.
4581 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
/* Thin wrapper over __builtin_ia32_gatherd_q: a and mask are viewed as
   [2 x i64], the index vector i as [4 x i32]; s is forwarded untouched. */
#define _mm_mask_i32gather_epi64(a, m, i, mask, s)                             \
  ((__m128i)__builtin_ia32_gatherd_q(                                          \
      (__v2di)(__m128i)(a), (const long long *)(m), (__v4si)(__m128i)(i),      \
      (__v2di)(__m128i)(mask), (s)))
4588 /// Conditionally gathers four 64-bit integer values, either from the
4589 /// 256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
4590 /// indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
4591 /// of [4 x i64] in \a mask determines the source for each element.
4593 /// \code{.operation}
4594 /// FOR element := 0 to 3
4595 /// j := element*64
4596 /// k := element*32
4597 /// IF mask[j+63] == 0
4598 /// result[j+63:j] := a[j+63:j]
4599 /// ELSE
4600 /// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4601 /// FI
4602 /// ENDFOR
4603 /// \endcode
4605 /// \headerfile <immintrin.h>
4607 /// \code
4608 /// __m256i _mm256_mask_i32gather_epi64(__m256i a, const long long *m,
4609 /// __m128i i, __m256i mask, const int s);
4610 /// \endcode
4612 /// This intrinsic corresponds to the \c VPGATHERDQ instruction.
4614 /// \param a
4615 /// A 256-bit vector of [4 x i64] used as the source when a mask bit is
4616 /// zero.
4617 /// \param m
4618 /// A pointer to the memory used for loading values.
4619 /// \param i
4620 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4621 /// \param mask
4622 /// A 256-bit vector of [4 x i64] containing the mask. The most significant
4623 /// bit of each element in the mask vector represents the mask bits. If a
4624 /// mask bit is zero, the corresponding value from vector \a a is gathered;
4625 /// otherwise the value is loaded from memory.
4626 /// \param s
4627 /// A literal constant scale factor for the indexes in \a i. Must be
4628 /// 1, 2, 4, or 8.
4629 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
/* Thin wrapper over __builtin_ia32_gatherd_q256: a and mask are viewed as
   256-bit [4 x i64], the index vector i as 128-bit [4 x i32]; s is
   forwarded untouched. */
#define _mm256_mask_i32gather_epi64(a, m, i, mask, s)                          \
  ((__m256i)__builtin_ia32_gatherd_q256(                                       \
      (__v4di)(__m256i)(a), (const long long *)(m), (__v4si)(__m128i)(i),      \
      (__v4di)(__m256i)(mask), (s)))
4636 /// Conditionally gathers two 64-bit integer values, either from the
4637 /// 128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
4638 /// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4639 /// of [2 x i64] in \a mask determines the source for each element.
4641 /// \code{.operation}
4642 /// FOR element := 0 to 1
4643 /// j := element*64
4644 /// k := element*64
4645 /// IF mask[j+63] == 0
4646 /// result[j+63:j] := a[j+63:j]
4647 /// ELSE
4648 /// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4649 /// FI
4650 /// ENDFOR
4651 /// \endcode
4653 /// \headerfile <immintrin.h>
4655 /// \code
4656 /// __m128i _mm_mask_i64gather_epi64(__m128i a, const long long *m, __m128i i,
4657 /// __m128i mask, const int s);
4658 /// \endcode
4660 /// This intrinsic corresponds to the \c VPGATHERQQ instruction.
4662 /// \param a
4663 /// A 128-bit vector of [2 x i64] used as the source when a mask bit is
4664 /// zero.
4665 /// \param m
4666 /// A pointer to the memory used for loading values.
4667 /// \param i
4668 /// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4669 /// \param mask
4670 /// A 128-bit vector of [2 x i64] containing the mask. The most significant
4671 /// bit of each element in the mask vector represents the mask bits. If a
4672 /// mask bit is zero, the corresponding value from vector \a a is gathered;
4673 /// otherwise the value is loaded from memory.
4674 /// \param s
4675 /// A literal constant scale factor for the indexes in \a i. Must be
4676 /// 1, 2, 4, or 8.
4677 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
/* Thin wrapper over __builtin_ia32_gatherq_q: a, i and mask are all viewed
   as [2 x i64]; s is forwarded untouched. */
#define _mm_mask_i64gather_epi64(a, m, i, mask, s)                             \
  ((__m128i)__builtin_ia32_gatherq_q(                                          \
      (__v2di)(__m128i)(a), (const long long *)(m), (__v2di)(__m128i)(i),      \
      (__v2di)(__m128i)(mask), (s)))
4684 /// Conditionally gathers four 64-bit integer values, either from the
4685 /// 256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
4686 /// indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
4687 /// of [4 x i64] in \a mask determines the source for each element.
4689 /// \code{.operation}
4690 /// FOR element := 0 to 3
4691 /// j := element*64
4692 /// k := element*64
4693 /// IF mask[j+63] == 0
4694 /// result[j+63:j] := a[j+63:j]
4695 /// ELSE
4696 /// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4697 /// FI
4698 /// ENDFOR
4699 /// \endcode
4701 /// \headerfile <immintrin.h>
4703 /// \code
4704 /// __m256i _mm256_mask_i64gather_epi64(__m256i a, const long long *m,
4705 /// __m256i i, __m256i mask, const int s);
4706 /// \endcode
4708 /// This intrinsic corresponds to the \c VPGATHERQQ instruction.
4710 /// \param a
4711 /// A 256-bit vector of [4 x i64] used as the source when a mask bit is
4712 /// zero.
4713 /// \param m
4714 /// A pointer to the memory used for loading values.
4715 /// \param i
4716 /// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4717 /// \param mask
4718 /// A 256-bit vector of [4 x i64] containing the mask. The most significant
4719 /// bit of each element in the mask vector represents the mask bits. If a
4720 /// mask bit is zero, the corresponding value from vector \a a is gathered;
4721 /// otherwise the value is loaded from memory.
4722 /// \param s
4723 /// A literal constant scale factor for the indexes in \a i. Must be
4724 /// 1, 2, 4, or 8.
4725 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
/* Thin wrapper over __builtin_ia32_gatherq_q256: a, i and mask are all
   viewed as [4 x i64]; s is forwarded untouched. */
#define _mm256_mask_i64gather_epi64(a, m, i, mask, s)                          \
  ((__m256i)__builtin_ia32_gatherq_q256(                                       \
      (__v4di)(__m256i)(a), (const long long *)(m), (__v4di)(__m256i)(i),      \
      (__v4di)(__m256i)(mask), (s)))
4732 /// Gathers two 64-bit floating-point values from memory \a m using scaled
4733 /// indexes from the 128-bit vector of [4 x i32] in \a i.
4735 /// \code{.operation}
4736 /// FOR element := 0 to 1
4737 /// j := element*64
4738 /// k := element*32
4739 /// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4740 /// ENDFOR
4741 /// \endcode
4743 /// \headerfile <immintrin.h>
4745 /// \code
4746 /// __m128d _mm_i32gather_pd(const double *m, __m128i i, const int s);
4747 /// \endcode
4749 /// This intrinsic corresponds to the \c VGATHERDPD instruction.
4751 /// \param m
4752 /// A pointer to the memory used for loading values.
4753 /// \param i
4754 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
4755 /// the first two elements are used.
4756 /// \param s
4757 /// A literal constant scale factor for the indexes in \a i. Must be
4758 /// 1, 2, 4, or 8.
4759 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
/* Unmasked form built on __builtin_ia32_gatherd_pd: the pass-through operand
   is _mm_undefined_pd() and the mask is 0.0 == 0.0 per lane, i.e. all mask
   bits set, so every element comes from memory. */
#define _mm_i32gather_pd(m, i, s)                                              \
  ((__m128d)__builtin_ia32_gatherd_pd(                                         \
      (__v2df)_mm_undefined_pd(), (const double *)(m), (__v4si)(__m128i)(i),   \
      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), _mm_setzero_pd()), (s)))
4768 /// Gathers four 64-bit floating-point values from memory \a m using scaled
4769 /// indexes from the 128-bit vector of [4 x i32] in \a i.
4771 /// \code{.operation}
4772 /// FOR element := 0 to 3
4773 /// j := element*64
4774 /// k := element*32
4775 /// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4776 /// ENDFOR
4777 /// \endcode
4779 /// \headerfile <immintrin.h>
4781 /// \code
4782 /// __m256d _mm256_i32gather_pd(const double *m, __m128i i, const int s);
4783 /// \endcode
4785 /// This intrinsic corresponds to the \c VGATHERDPD instruction.
4787 /// \param m
4788 /// A pointer to the memory used for loading values.
4789 /// \param i
4790 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4791 /// \param s
4792 /// A literal constant scale factor for the indexes in \a i. Must be
4793 /// 1, 2, 4, or 8.
4794 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
/* Unmasked form built on __builtin_ia32_gatherd_pd256: the pass-through
   operand is _mm256_undefined_pd() and the mask is 0.0 == 0.0 per lane
   (_CMP_EQ_OQ), i.e. all mask bits set, so every element comes from memory. */
#define _mm256_i32gather_pd(m, i, s)                                           \
  ((__m256d)__builtin_ia32_gatherd_pd256(                                      \
      (__v4df)_mm256_undefined_pd(), (const double *)(m),                      \
      (__v4si)(__m128i)(i),                                                    \
      (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(),          \
                            _CMP_EQ_OQ),                                       \
      (s)))
4804 /// Gathers two 64-bit floating-point values from memory \a m using scaled
4805 /// indexes from the 128-bit vector of [2 x i64] in \a i.
4807 /// \code{.operation}
4808 /// FOR element := 0 to 1
4809 /// j := element*64
4810 /// k := element*64
4811 /// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4812 /// ENDFOR
4813 /// \endcode
4815 /// \headerfile <immintrin.h>
4817 /// \code
4818 /// __m128d _mm_i64gather_pd(const double *m, __m128i i, const int s);
4819 /// \endcode
4821 /// This intrinsic corresponds to the \c VGATHERQPD instruction.
4823 /// \param m
4824 /// A pointer to the memory used for loading values.
4825 /// \param i
4826 /// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4827 /// \param s
4828 /// A literal constant scale factor for the indexes in \a i. Must be
4829 /// 1, 2, 4, or 8.
4830 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
/* Unmasked form built on __builtin_ia32_gatherq_pd: the pass-through operand
   is _mm_undefined_pd() and the mask is 0.0 == 0.0 per lane, i.e. all mask
   bits set, so every element comes from memory. */
#define _mm_i64gather_pd(m, i, s)                                              \
  ((__m128d)__builtin_ia32_gatherq_pd(                                         \
      (__v2df)_mm_undefined_pd(), (const double *)(m), (__v2di)(__m128i)(i),   \
      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), _mm_setzero_pd()), (s)))
4839 /// Gathers four 64-bit floating-point values from memory \a m using scaled
4840 /// indexes from the 256-bit vector of [4 x i64] in \a i.
4842 /// \code{.operation}
4843 /// FOR element := 0 to 3
4844 /// j := element*64
4845 /// k := element*64
4846 /// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4847 /// ENDFOR
4848 /// \endcode
4850 /// \headerfile <immintrin.h>
4852 /// \code
4853 /// __m256d _mm256_i64gather_pd(const double *m, __m256i i, const int s);
4854 /// \endcode
4856 /// This intrinsic corresponds to the \c VGATHERQPD instruction.
4858 /// \param m
4859 /// A pointer to the memory used for loading values.
4860 /// \param i
4861 /// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4862 /// \param s
4863 /// A literal constant scale factor for the indexes in \a i. Must be
4864 /// 1, 2, 4, or 8.
4865 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
/* Unmasked form built on __builtin_ia32_gatherq_pd256: the pass-through
   operand is _mm256_undefined_pd() and the mask is 0.0 == 0.0 per lane
   (_CMP_EQ_OQ), i.e. all mask bits set, so every element comes from memory. */
#define _mm256_i64gather_pd(m, i, s)                                           \
  ((__m256d)__builtin_ia32_gatherq_pd256(                                      \
      (__v4df)_mm256_undefined_pd(), (const double *)(m),                      \
      (__v4di)(__m256i)(i),                                                    \
      (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(),          \
                            _CMP_EQ_OQ),                                       \
      (s)))
4875 /// Gathers four 32-bit floating-point values from memory \a m using scaled
4876 /// indexes from the 128-bit vector of [4 x i32] in \a i.
4878 /// \code{.operation}
4879 /// FOR element := 0 to 3
4880 /// j := element*32
4881 /// k := element*32
4882 /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4883 /// ENDFOR
4884 /// \endcode
4886 /// \headerfile <immintrin.h>
4888 /// \code
4889 /// __m128 _mm_i32gather_ps(const float *m, __m128i i, const int s);
4890 /// \endcode
4892 /// This intrinsic corresponds to the \c VGATHERDPS instruction.
4894 /// \param m
4895 /// A pointer to the memory used for loading values.
4896 /// \param i
4897 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4898 /// \param s
4899 /// A literal constant scale factor for the indexes in \a i. Must be
4900 /// 1, 2, 4, or 8.
4901 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* Unmasked form built on __builtin_ia32_gatherd_ps: the pass-through operand
   is _mm_undefined_ps() and the mask is 0.0f == 0.0f per lane, i.e. all mask
   bits set, so every element comes from memory. */
#define _mm_i32gather_ps(m, i, s)                                              \
  ((__m128)__builtin_ia32_gatherd_ps(                                          \
      (__v4sf)_mm_undefined_ps(), (const float *)(m), (__v4si)(__m128i)(i),    \
      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()), (s)))
4910 /// Gathers eight 32-bit floating-point values from memory \a m using scaled
4911 /// indexes from the 256-bit vector of [8 x i32] in \a i.
4913 /// \code{.operation}
4914 /// FOR element := 0 to 7
4915 /// j := element*32
4916 /// k := element*32
4917 /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4918 /// ENDFOR
4919 /// \endcode
4921 /// \headerfile <immintrin.h>
4923 /// \code
4924 /// __m256 _mm256_i32gather_ps(const float *m, __m256i i, const int s);
4925 /// \endcode
4927 /// This intrinsic corresponds to the \c VGATHERDPS instruction.
4929 /// \param m
4930 /// A pointer to the memory used for loading values.
4931 /// \param i
4932 /// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4933 /// \param s
4934 /// A literal constant scale factor for the indexes in \a i. Must be
4935 /// 1, 2, 4, or 8.
4936 /// \returns A 256-bit vector of [8 x float] containing the gathered values.
/* Unmasked form built on __builtin_ia32_gatherd_ps256: the pass-through
   operand is _mm256_undefined_ps() and the mask is 0.0f == 0.0f per lane
   (_CMP_EQ_OQ), i.e. all mask bits set, so every element comes from memory. */
#define _mm256_i32gather_ps(m, i, s)                                           \
  ((__m256)__builtin_ia32_gatherd_ps256(                                       \
      (__v8sf)_mm256_undefined_ps(), (const float *)(m),                       \
      (__v8si)(__m256i)(i),                                                    \
      (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(),          \
                            _CMP_EQ_OQ),                                       \
      (s)))
4946 /// Gathers two 32-bit floating-point values from memory \a m using scaled
4947 /// indexes from the 128-bit vector of [2 x i64] in \a i. The upper two
4948 /// elements of the result are zeroed.
4950 /// \code{.operation}
4951 /// FOR element := 0 to 1
4952 /// j := element*32
4953 /// k := element*64
4954 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4955 /// ENDFOR
4956 /// result[127:64] := 0
4957 /// \endcode
4959 /// \headerfile <immintrin.h>
4961 /// \code
4962 /// __m128 _mm_i64gather_ps(const float *m, __m128i i, const int s);
4963 /// \endcode
4965 /// This intrinsic corresponds to the \c VGATHERQPS instruction.
4967 /// \param m
4968 /// A pointer to the memory used for loading values.
4969 /// \param i
4970 /// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4971 /// \param s
4972 /// A literal constant scale factor for the indexes in \a i. Must be
4973 /// 1, 2, 4, or 8.
4974 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* Unmasked form built on __builtin_ia32_gatherq_ps: the pass-through operand
   is _mm_undefined_ps() and the mask is 0.0f == 0.0f per lane, i.e. all mask
   bits set; indexes are taken from i viewed as [2 x i64]. */
#define _mm_i64gather_ps(m, i, s)                                              \
  ((__m128)__builtin_ia32_gatherq_ps(                                          \
      (__v4sf)_mm_undefined_ps(), (const float *)(m), (__v2di)(__m128i)(i),    \
      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()), (s)))
4983 /// Gathers four 32-bit floating-point values from memory \a m using scaled
4984 /// indexes from the 256-bit vector of [4 x i64] in \a i.
4986 /// \code{.operation}
4987 /// FOR element := 0 to 3
4988 /// j := element*32
4989 /// k := element*64
4990 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4991 /// ENDFOR
4992 /// \endcode
4994 /// \headerfile <immintrin.h>
4996 /// \code
4997 /// __m128 _mm256_i64gather_ps(const float *m, __m256i i, const int s);
4998 /// \endcode
5000 /// This intrinsic corresponds to the \c VGATHERQPS instruction.
5002 /// \param m
5003 /// A pointer to the memory used for loading values.
5004 /// \param i
5005 /// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5006 /// \param s
5007 /// A literal constant scale factor for the indexes in \a i. Must be
5008 /// 1, 2, 4, or 8.
5009 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* Unmasked form built on __builtin_ia32_gatherq_ps256: the pass-through
   operand and the mask are 128-bit (_mm_undefined_ps(), 0.0f == 0.0f per
   lane, i.e. all mask bits set) while the index vector i is 256-bit
   [4 x i64]. */
#define _mm256_i64gather_ps(m, i, s)                                           \
  ((__m128)__builtin_ia32_gatherq_ps256(                                       \
      (__v4sf)_mm_undefined_ps(), (const float *)(m), (__v4di)(__m256i)(i),    \
      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()), (s)))
5018 /// Gathers four 32-bit integer values from memory \a m using scaled
5019 /// indexes from the 128-bit vector of [4 x i32] in \a i.
5021 /// \code{.operation}
5022 /// FOR element := 0 to 3
5023 /// j := element*32
5024 /// k := element*32
5025 /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
5026 /// ENDFOR
5027 /// \endcode
5029 /// \headerfile <immintrin.h>
5031 /// \code
5032 /// __m128i _mm_i32gather_epi32(const int *m, __m128i i, const int s);
5033 /// \endcode
5035 /// This intrinsic corresponds to the \c VPGATHERDD instruction.
5037 /// \param m
5038 /// A pointer to the memory used for loading values.
5039 /// \param i
5040 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
5041 /// \param s
5042 /// A literal constant scale factor for the indexes in \a i. Must be
5043 /// 1, 2, 4, or 8.
5044 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* Unmasked form built on __builtin_ia32_gatherd_d: the pass-through operand
   is _mm_undefined_si128() and the mask is _mm_set1_epi32(-1), i.e. all mask
   bits set, so every element comes from memory. */
#define _mm_i32gather_epi32(m, i, s)                                           \
  ((__m128i)__builtin_ia32_gatherd_d(                                          \
      (__v4si)_mm_undefined_si128(), (const int *)(m),                         \
      (__v4si)(__m128i)(i), (__v4si)_mm_set1_epi32(-1), (s)))
5050 /// Gathers eight 32-bit integer values from memory \a m using scaled
5051 /// indexes from the 256-bit vector of [8 x i32] in \a i.
5053 /// \code{.operation}
5054 /// FOR element := 0 to 7
5055 /// j := element*32
5056 /// k := element*32
5057 /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
5058 /// ENDFOR
5059 /// \endcode
5061 /// \headerfile <immintrin.h>
5063 /// \code
5064 /// __m256i _mm256_i32gather_epi32(const int *m, __m256i i, const int s);
5065 /// \endcode
5067 /// This intrinsic corresponds to the \c VPGATHERDD instruction.
5069 /// \param m
5070 /// A pointer to the memory used for loading values.
5071 /// \param i
5072 /// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
5073 /// \param s
5074 /// A literal constant scale factor for the indexes in \a i. Must be
5075 /// 1, 2, 4, or 8.
5076 /// \returns A 256-bit vector of [8 x i32] containing the gathered values.
/* Unmasked form built on __builtin_ia32_gatherd_d256: the pass-through
   operand is _mm256_undefined_si256() and the mask is _mm256_set1_epi32(-1),
   i.e. all mask bits set, so every element comes from memory. */
#define _mm256_i32gather_epi32(m, i, s)                                        \
  ((__m256i)__builtin_ia32_gatherd_d256(                                       \
      (__v8si)_mm256_undefined_si256(), (const int *)(m),                      \
      (__v8si)(__m256i)(i), (__v8si)_mm256_set1_epi32(-1), (s)))
5082 /// Gathers two 32-bit integer values from memory \a m using scaled indexes
5083 /// from the 128-bit vector of [2 x i64] in \a i. The upper two elements
5084 /// of the result are zeroed.
5086 /// \code{.operation}
5087 /// FOR element := 0 to 1
5088 /// j := element*32
5089 /// k := element*64
5090 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
5091 /// ENDFOR
5092 /// result[127:64] := 0
5093 /// \endcode
5095 /// \headerfile <immintrin.h>
5097 /// \code
5098 /// __m128i _mm_i64gather_epi32(const int *m, __m128i i, const int s);
5099 /// \endcode
5101 /// This intrinsic corresponds to the \c VPGATHERQD instruction.
5103 /// \param m
5104 /// A pointer to the memory used for loading values.
5105 /// \param i
5106 /// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
5107 /// \param s
5108 /// A literal constant scale factor for the indexes in \a i. Must be
5109 /// 1, 2, 4, or 8.
5110 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* Unmasked form built on __builtin_ia32_gatherq_d: the pass-through operand
   is _mm_undefined_si128() and the mask is _mm_set1_epi32(-1), i.e. all mask
   bits set; indexes are taken from i viewed as [2 x i64]. */
#define _mm_i64gather_epi32(m, i, s)                                           \
  ((__m128i)__builtin_ia32_gatherq_d(                                          \
      (__v4si)_mm_undefined_si128(), (const int *)(m),                         \
      (__v2di)(__m128i)(i), (__v4si)_mm_set1_epi32(-1), (s)))
/// Loads four 32-bit integers from memory at base address \a m, locating
///    each one with a scaled 64-bit index taken from the 256-bit vector of
///    [4 x i64] \a i.
///
/// \code{.operation}
/// FOR n := 0 to 3
///   dst := n*32
///   idx := n*64
///   result[dst+31:dst] := Load32(m + SignExtend(i[idx+63:idx])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm256_i64gather_epi32(const int *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
///
/// \param m
///    The base pointer of the memory the values are loaded from.
/// \param i
///    A 256-bit vector of [4 x i64] holding the signed indexes into \a m.
/// \param s
///    The scale factor applied to every index in \a i. It must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm256_i64gather_epi32(m, i, s) \
  ((__m128i)__builtin_ia32_gatherq_d256( \
      (__v4si)_mm_undefined_si128(), \
      (const int *)(m), \
      (__v4di)(__m256i)(i), \
      (__v4si)_mm_set1_epi32(-1), \
      (s)))
/// Loads two 64-bit integers from memory at base address \a m, locating
///    each one with a scaled 32-bit index taken from the 128-bit vector of
///    [4 x i32] \a i.
///
/// \code{.operation}
/// FOR n := 0 to 1
///   dst := n*64
///   idx := n*32
///   result[dst+63:dst] := Load64(m + SignExtend(i[idx+31:idx])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_i32gather_epi64(const long long *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
///
/// \param m
///    The base pointer of the memory the values are loaded from.
/// \param i
///    A 128-bit vector of [4 x i32] holding the signed indexes into \a m.
///    Only the first two elements are used.
/// \param s
///    The scale factor applied to every index in \a i. It must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
#define _mm_i32gather_epi64(m, i, s) \
  ((__m128i)__builtin_ia32_gatherd_q( \
      (__v2di)_mm_undefined_si128(), \
      (const long long *)(m), \
      (__v4si)(__m128i)(i), \
      (__v2di)_mm_set1_epi64x(-1), \
      (s)))
/// Loads four 64-bit integers from memory at base address \a m, locating
///    each one with a scaled 32-bit index taken from the 128-bit vector of
///    [4 x i32] \a i.
///
/// \code{.operation}
/// FOR n := 0 to 3
///   dst := n*64
///   idx := n*32
///   result[dst+63:dst] := Load64(m + SignExtend(i[idx+31:idx])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_i32gather_epi64(const long long *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
///
/// \param m
///    The base pointer of the memory the values are loaded from.
/// \param i
///    A 128-bit vector of [4 x i32] holding the signed indexes into \a m.
/// \param s
///    The scale factor applied to every index in \a i. It must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
#define _mm256_i32gather_epi64(m, i, s) \
  ((__m256i)__builtin_ia32_gatherd_q256( \
      (__v4di)_mm256_undefined_si256(), \
      (const long long *)(m), \
      (__v4si)(__m128i)(i), \
      (__v4di)_mm256_set1_epi64x(-1), \
      (s)))
/// Loads two 64-bit integers from memory at base address \a m, locating
///    each one with a scaled 64-bit index taken from the 128-bit vector of
///    [2 x i64] \a i.
///
/// \code{.operation}
/// FOR n := 0 to 1
///   dst := n*64
///   idx := n*64
///   result[dst+63:dst] := Load64(m + SignExtend(i[idx+63:idx])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_i64gather_epi64(const long long *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
///
/// \param m
///    The base pointer of the memory the values are loaded from.
/// \param i
///    A 128-bit vector of [2 x i64] holding the signed indexes into \a m.
/// \param s
///    The scale factor applied to every index in \a i. It must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
#define _mm_i64gather_epi64(m, i, s) \
  ((__m128i)__builtin_ia32_gatherq_q( \
      (__v2di)_mm_undefined_si128(), \
      (const long long *)(m), \
      (__v2di)(__m128i)(i), \
      (__v2di)_mm_set1_epi64x(-1), \
      (s)))
/// Loads four 64-bit integers from memory at base address \a m, locating
///    each one with a scaled 64-bit index taken from the 256-bit vector of
///    [4 x i64] \a i.
///
/// \code{.operation}
/// FOR n := 0 to 3
///   dst := n*64
///   idx := n*64
///   result[dst+63:dst] := Load64(m + SignExtend(i[idx+63:idx])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_i64gather_epi64(const long long *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
///
/// \param m
///    The base pointer of the memory the values are loaded from.
/// \param i
///    A 256-bit vector of [4 x i64] holding the signed indexes into \a m.
/// \param s
///    The scale factor applied to every index in \a i. It must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
#define _mm256_i64gather_epi64(m, i, s) \
  ((__m256i)__builtin_ia32_gatherq_q256( \
      (__v4di)_mm256_undefined_si256(), \
      (const long long *)(m), \
      (__v4di)(__m256i)(i), \
      (__v4di)_mm256_set1_epi64x(-1), \
      (s)))
/* The default-attribute helper macros are defined at the top of this file for
 * the functions declared here; remove both definitions again. */
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
5284 #endif /* __AVX2INTRIN_H */