/*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
11 #error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
14 #ifndef __AVX2INTRIN_H
15 #define __AVX2INTRIN_H
/* Define the default attributes for the functions in this file.
 *
 * Both attribute sets request always-inline, no-debug wrappers compiled for
 * the "avx2,no-evex512" target; they differ only in the minimum vector width
 * they declare (256 bits vs. 128 bits).
 */
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx2,no-evex512"), __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx2,no-evex512"), __min_vector_width__(128)))
/* SSE4 Multiple Packed Sums of Absolute Difference.  */
/// Computes sixteen sum of absolute difference (SAD) operations on sets of
///    four unsigned 8-bit integers from the 256-bit integer vectors \a X and
///    \a Y.
///
///    Eight SAD results are computed using the lower half of the input
///    vectors, and another eight using the upper half. These 16-bit values
///    are returned in the lower and upper halves of the 256-bit result,
///    respectively.
///
///    A single SAD operation selects four bytes from \a X and four bytes from
///    \a Y as input. It computes the differences between each \a X byte and
///    the corresponding \a Y byte, takes the absolute value of each
///    difference, and sums these four values to form one 16-bit result. The
///    intrinsic computes 16 of these results with different sets of input
///    bytes.
///
///    For each set of eight results, the SAD operations use the same four
///    bytes from \a Y; the starting bit position for these four bytes is
///    specified by \a M[1:0] times 32. The eight operations use successive
///    sets of four bytes from \a X; the starting bit position for the first
///    set of four bytes is specified by \a M[2] times 32. These bit positions
///    are all relative to the 128-bit lane for each set of eight operations.
///
/// \code{.operation}
/// r := 0
/// FOR i := 0 TO 1
///   j := i*3
///   Ybase := M[j+1:j]*32 + i*128
///   Xbase := M[j+2]*32 + i*128
///   FOR k := 0 TO 7
///     temp0 := ABS(X[Xbase+7:Xbase] - Y[Ybase+7:Ybase])
///     temp1 := ABS(X[Xbase+15:Xbase+8] - Y[Ybase+15:Ybase+8])
///     temp2 := ABS(X[Xbase+23:Xbase+16] - Y[Ybase+23:Ybase+16])
///     temp3 := ABS(X[Xbase+31:Xbase+24] - Y[Ybase+31:Ybase+24])
///     result[r+15:r] := temp0 + temp1 + temp2 + temp3
///     Xbase := Xbase + 8
///     r := r + 16
///   ENDFOR
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mpsadbw_epu8(__m256i X, __m256i Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VMPSADBW instruction.
///
/// \param X
///    A 256-bit integer vector containing one of the inputs.
/// \param Y
///    A 256-bit integer vector containing one of the inputs.
/// \param M
///     An unsigned immediate value specifying the starting positions of the
///     bytes to operate on.
/// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_mpsadbw_epu8(X, Y, M)                                           \
  ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X),                   \
                                      (__v32qi)(__m256i)(Y), (int)(M)))
87 /// Computes the absolute value of each signed byte in the 256-bit integer
88 /// vector \a __a and returns each value in the corresponding byte of
91 /// \headerfile <immintrin.h>
93 /// This intrinsic corresponds to the \c VPABSB instruction.
96 /// A 256-bit integer vector.
97 /// \returns A 256-bit integer vector containing the result.
98 static __inline__ __m256i __DEFAULT_FN_ATTRS256
99 _mm256_abs_epi8(__m256i __a
)
101 return (__m256i
)__builtin_elementwise_abs((__v32qs
)__a
);
104 /// Computes the absolute value of each signed 16-bit element in the 256-bit
105 /// vector of [16 x i16] in \a __a and returns each value in the
106 /// corresponding element of the result.
108 /// \headerfile <immintrin.h>
110 /// This intrinsic corresponds to the \c VPABSW instruction.
113 /// A 256-bit vector of [16 x i16].
114 /// \returns A 256-bit vector of [16 x i16] containing the result.
115 static __inline__ __m256i __DEFAULT_FN_ATTRS256
116 _mm256_abs_epi16(__m256i __a
)
118 return (__m256i
)__builtin_elementwise_abs((__v16hi
)__a
);
121 /// Computes the absolute value of each signed 32-bit element in the 256-bit
122 /// vector of [8 x i32] in \a __a and returns each value in the
123 /// corresponding element of the result.
125 /// \headerfile <immintrin.h>
127 /// This intrinsic corresponds to the \c VPABSD instruction.
130 /// A 256-bit vector of [8 x i32].
131 /// \returns A 256-bit vector of [8 x i32] containing the result.
132 static __inline__ __m256i __DEFAULT_FN_ATTRS256
133 _mm256_abs_epi32(__m256i __a
)
135 return (__m256i
)__builtin_elementwise_abs((__v8si
)__a
);
138 /// Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit
139 /// integers using signed saturation, and returns the 256-bit result.
141 /// \code{.operation}
145 /// result[7+k:k] := SATURATE8(__a[15+j:j])
146 /// result[71+k:64+k] := SATURATE8(__b[15+j:j])
147 /// result[135+k:128+k] := SATURATE8(__a[143+j:128+j])
148 /// result[199+k:192+k] := SATURATE8(__b[143+j:128+j])
152 /// \headerfile <immintrin.h>
154 /// This intrinsic corresponds to the \c VPACKSSWB instruction.
157 /// A 256-bit vector of [16 x i16] used to generate result[63:0] and
160 /// A 256-bit vector of [16 x i16] used to generate result[127:64] and
162 /// \returns A 256-bit integer vector containing the result.
163 static __inline__ __m256i __DEFAULT_FN_ATTRS256
164 _mm256_packs_epi16(__m256i __a
, __m256i __b
)
166 return (__m256i
)__builtin_ia32_packsswb256((__v16hi
)__a
, (__v16hi
)__b
);
169 /// Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit
170 /// integers using signed saturation, and returns the resulting 256-bit
171 /// vector of [16 x i16].
173 /// \code{.operation}
177 /// result[15+k:k] := SATURATE16(__a[31+j:j])
178 /// result[79+k:64+k] := SATURATE16(__b[31+j:j])
179 /// result[143+k:128+k] := SATURATE16(__a[159+j:128+j])
180 /// result[207+k:192+k] := SATURATE16(__b[159+j:128+j])
184 /// \headerfile <immintrin.h>
186 /// This intrinsic corresponds to the \c VPACKSSDW instruction.
189 /// A 256-bit vector of [8 x i32] used to generate result[63:0] and
192 /// A 256-bit vector of [8 x i32] used to generate result[127:64] and
194 /// \returns A 256-bit vector of [16 x i16] containing the result.
195 static __inline__ __m256i __DEFAULT_FN_ATTRS256
196 _mm256_packs_epi32(__m256i __a
, __m256i __b
)
198 return (__m256i
)__builtin_ia32_packssdw256((__v8si
)__a
, (__v8si
)__b
);
201 /// Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers
202 /// using unsigned saturation, and returns the 256-bit result.
204 /// \code{.operation}
208 /// result[7+k:k] := SATURATE8U(__a[15+j:j])
209 /// result[71+k:64+k] := SATURATE8U(__b[15+j:j])
210 /// result[135+k:128+k] := SATURATE8U(__a[143+j:128+j])
211 /// result[199+k:192+k] := SATURATE8U(__b[143+j:128+j])
215 /// \headerfile <immintrin.h>
217 /// This intrinsic corresponds to the \c VPACKUSWB instruction.
220 /// A 256-bit vector of [16 x i16] used to generate result[63:0] and
223 /// A 256-bit vector of [16 x i16] used to generate result[127:64] and
225 /// \returns A 256-bit integer vector containing the result.
226 static __inline__ __m256i __DEFAULT_FN_ATTRS256
227 _mm256_packus_epi16(__m256i __a
, __m256i __b
)
229 return (__m256i
)__builtin_ia32_packuswb256((__v16hi
)__a
, (__v16hi
)__b
);
232 /// Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers
233 /// using unsigned saturation, and returns the resulting 256-bit vector of
236 /// \code{.operation}
240 /// result[15+k:k] := SATURATE16U(__V1[31+j:j])
241 /// result[79+k:64+k] := SATURATE16U(__V2[31+j:j])
242 /// result[143+k:128+k] := SATURATE16U(__V1[159+j:128+j])
243 /// result[207+k:192+k] := SATURATE16U(__V2[159+j:128+j])
247 /// \headerfile <immintrin.h>
249 /// This intrinsic corresponds to the \c VPACKUSDW instruction.
252 /// A 256-bit vector of [8 x i32] used to generate result[63:0] and
255 /// A 256-bit vector of [8 x i32] used to generate result[127:64] and
257 /// \returns A 256-bit vector of [16 x i16] containing the result.
258 static __inline__ __m256i __DEFAULT_FN_ATTRS256
259 _mm256_packus_epi32(__m256i __V1
, __m256i __V2
)
261 return (__m256i
) __builtin_ia32_packusdw256((__v8si
)__V1
, (__v8si
)__V2
);
264 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
265 /// vectors and returns the lower 8 bits of each sum in the corresponding
266 /// byte of the 256-bit integer vector result (overflow is ignored).
268 /// \headerfile <immintrin.h>
270 /// This intrinsic corresponds to the \c VPADDB instruction.
273 /// A 256-bit integer vector containing one of the source operands.
275 /// A 256-bit integer vector containing one of the source operands.
276 /// \returns A 256-bit integer vector containing the sums.
277 static __inline__ __m256i __DEFAULT_FN_ATTRS256
278 _mm256_add_epi8(__m256i __a
, __m256i __b
)
280 return (__m256i
)((__v32qu
)__a
+ (__v32qu
)__b
);
283 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
284 /// [16 x i16] and returns the lower 16 bits of each sum in the
285 /// corresponding element of the [16 x i16] result (overflow is ignored).
287 /// \headerfile <immintrin.h>
289 /// This intrinsic corresponds to the \c VPADDW instruction.
292 /// A 256-bit vector of [16 x i16] containing one of the source operands.
294 /// A 256-bit vector of [16 x i16] containing one of the source operands.
295 /// \returns A 256-bit vector of [16 x i16] containing the sums.
296 static __inline__ __m256i __DEFAULT_FN_ATTRS256
297 _mm256_add_epi16(__m256i __a
, __m256i __b
)
299 return (__m256i
)((__v16hu
)__a
+ (__v16hu
)__b
);
302 /// Adds 32-bit integers from corresponding elements of two 256-bit vectors of
303 /// [8 x i32] and returns the lower 32 bits of each sum in the corresponding
304 /// element of the [8 x i32] result (overflow is ignored).
306 /// \headerfile <immintrin.h>
308 /// This intrinsic corresponds to the \c VPADDD instruction.
311 /// A 256-bit vector of [8 x i32] containing one of the source operands.
313 /// A 256-bit vector of [8 x i32] containing one of the source operands.
314 /// \returns A 256-bit vector of [8 x i32] containing the sums.
315 static __inline__ __m256i __DEFAULT_FN_ATTRS256
316 _mm256_add_epi32(__m256i __a
, __m256i __b
)
318 return (__m256i
)((__v8su
)__a
+ (__v8su
)__b
);
321 /// Adds 64-bit integers from corresponding elements of two 256-bit vectors of
322 /// [4 x i64] and returns the lower 64 bits of each sum in the corresponding
323 /// element of the [4 x i64] result (overflow is ignored).
325 /// \headerfile <immintrin.h>
327 /// This intrinsic corresponds to the \c VPADDQ instruction.
330 /// A 256-bit vector of [4 x i64] containing one of the source operands.
332 /// A 256-bit vector of [4 x i64] containing one of the source operands.
333 /// \returns A 256-bit vector of [4 x i64] containing the sums.
334 static __inline__ __m256i __DEFAULT_FN_ATTRS256
335 _mm256_add_epi64(__m256i __a
, __m256i __b
)
337 return (__m256i
)((__v4du
)__a
+ (__v4du
)__b
);
340 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
341 /// vectors using signed saturation, and returns each sum in the
342 /// corresponding byte of the 256-bit integer vector result.
344 /// \headerfile <immintrin.h>
346 /// This intrinsic corresponds to the \c VPADDSB instruction.
349 /// A 256-bit integer vector containing one of the source operands.
351 /// A 256-bit integer vector containing one of the source operands.
352 /// \returns A 256-bit integer vector containing the sums.
353 static __inline__ __m256i __DEFAULT_FN_ATTRS256
354 _mm256_adds_epi8(__m256i __a
, __m256i __b
)
356 return (__m256i
)__builtin_elementwise_add_sat((__v32qs
)__a
, (__v32qs
)__b
);
359 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
360 /// [16 x i16] using signed saturation, and returns the [16 x i16] result.
362 /// \headerfile <immintrin.h>
364 /// This intrinsic corresponds to the \c VPADDSW instruction.
367 /// A 256-bit vector of [16 x i16] containing one of the source operands.
369 /// A 256-bit vector of [16 x i16] containing one of the source operands.
370 /// \returns A 256-bit vector of [16 x i16] containing the sums.
371 static __inline__ __m256i __DEFAULT_FN_ATTRS256
372 _mm256_adds_epi16(__m256i __a
, __m256i __b
)
374 return (__m256i
)__builtin_elementwise_add_sat((__v16hi
)__a
, (__v16hi
)__b
);
377 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
378 /// vectors using unsigned saturation, and returns each sum in the
379 /// corresponding byte of the 256-bit integer vector result.
381 /// \headerfile <immintrin.h>
383 /// This intrinsic corresponds to the \c VPADDUSB instruction.
386 /// A 256-bit integer vector containing one of the source operands.
388 /// A 256-bit integer vector containing one of the source operands.
389 /// \returns A 256-bit integer vector containing the sums.
390 static __inline__ __m256i __DEFAULT_FN_ATTRS256
391 _mm256_adds_epu8(__m256i __a
, __m256i __b
)
393 return (__m256i
)__builtin_elementwise_add_sat((__v32qu
)__a
, (__v32qu
)__b
);
396 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
397 /// [16 x i16] using unsigned saturation, and returns the [16 x i16] result.
399 /// \headerfile <immintrin.h>
401 /// This intrinsic corresponds to the \c VPADDUSW instruction.
404 /// A 256-bit vector of [16 x i16] containing one of the source operands.
406 /// A 256-bit vector of [16 x i16] containing one of the source operands.
407 /// \returns A 256-bit vector of [16 x i16] containing the sums.
408 static __inline__ __m256i __DEFAULT_FN_ATTRS256
409 _mm256_adds_epu16(__m256i __a
, __m256i __b
)
411 return (__m256i
)__builtin_elementwise_add_sat((__v16hu
)__a
, (__v16hu
)__b
);
/// Uses the lower half of the 256-bit vector \a a as the upper half of a
///    temporary 256-bit value, and the lower half of the 256-bit vector \a b
///    as the lower half of the temporary value. Right-shifts the temporary
///    value by \a n bytes, and uses the lower 16 bytes of the shifted value
///    as the lower 16 bytes of the result. Uses the upper halves of \a a and
///    \a b to make another temporary value, right shifts by \a n, and uses
///    the lower 16 bytes of the shifted value as the upper 16 bytes of the
///    result.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_alignr_epi8(__m256i a, __m256i b, const int n);
/// \endcode
///
/// This intrinsic corresponds to the \c VPALIGNR instruction.
///
/// \param a
///    A 256-bit integer vector containing source values.
/// \param b
///    A 256-bit integer vector containing source values.
/// \param n
///    An immediate value specifying the number of bytes to shift.
/// \returns A 256-bit integer vector containing the result.
#define _mm256_alignr_epi8(a, b, n)                                            \
  ((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a),                   \
                                      (__v32qi)(__m256i)(b), (n)))
442 /// Computes the bitwise AND of the 256-bit integer vectors in \a __a and
445 /// \headerfile <immintrin.h>
447 /// This intrinsic corresponds to the \c VPAND instruction.
450 /// A 256-bit integer vector.
452 /// A 256-bit integer vector.
453 /// \returns A 256-bit integer vector containing the result.
454 static __inline__ __m256i __DEFAULT_FN_ATTRS256
455 _mm256_and_si256(__m256i __a
, __m256i __b
)
457 return (__m256i
)((__v4du
)__a
& (__v4du
)__b
);
460 /// Computes the bitwise AND of the 256-bit integer vector in \a __b with
461 /// the bitwise NOT of the 256-bit integer vector in \a __a.
463 /// \headerfile <immintrin.h>
465 /// This intrinsic corresponds to the \c VPANDN instruction.
468 /// A 256-bit integer vector.
470 /// A 256-bit integer vector.
471 /// \returns A 256-bit integer vector containing the result.
472 static __inline__ __m256i __DEFAULT_FN_ATTRS256
473 _mm256_andnot_si256(__m256i __a
, __m256i __b
)
475 return (__m256i
)(~(__v4du
)__a
& (__v4du
)__b
);
478 /// Computes the averages of the corresponding unsigned bytes in the two
479 /// 256-bit integer vectors in \a __a and \a __b and returns each
480 /// average in the corresponding byte of the 256-bit result.
482 /// \code{.operation}
485 /// result[j+7:j] := (__a[j+7:j] + __b[j+7:j] + 1) >> 1
489 /// \headerfile <immintrin.h>
491 /// This intrinsic corresponds to the \c VPAVGB instruction.
494 /// A 256-bit integer vector.
496 /// A 256-bit integer vector.
497 /// \returns A 256-bit integer vector containing the result.
498 static __inline__ __m256i __DEFAULT_FN_ATTRS256
499 _mm256_avg_epu8(__m256i __a
, __m256i __b
)
501 return (__m256i
)__builtin_ia32_pavgb256((__v32qi
)__a
, (__v32qi
)__b
);
504 /// Computes the averages of the corresponding unsigned 16-bit integers in
505 /// the two 256-bit vectors of [16 x i16] in \a __a and \a __b and returns
506 /// each average in the corresponding element of the 256-bit result.
508 /// \code{.operation}
511 /// result[j+15:j] := (__a[j+15:j] + __b[j+15:j] + 1) >> 1
515 /// \headerfile <immintrin.h>
517 /// This intrinsic corresponds to the \c VPAVGW instruction.
520 /// A 256-bit vector of [16 x i16].
522 /// A 256-bit vector of [16 x i16].
523 /// \returns A 256-bit vector of [16 x i16] containing the result.
524 static __inline__ __m256i __DEFAULT_FN_ATTRS256
525 _mm256_avg_epu16(__m256i __a
, __m256i __b
)
527 return (__m256i
)__builtin_ia32_pavgw256((__v16hi
)__a
, (__v16hi
)__b
);
530 /// Merges 8-bit integer values from either of the two 256-bit vectors
531 /// \a __V1 or \a __V2, as specified by the 256-bit mask \a __M and returns
532 /// the resulting 256-bit integer vector.
534 /// \code{.operation}
538 /// result[7+j:j] := __V1[7+j:j]
540 /// result[7+j:j] := __V2[7+j:j]
545 /// \headerfile <immintrin.h>
547 /// This intrinsic corresponds to the \c VPBLENDVB instruction.
550 /// A 256-bit integer vector containing source values.
552 /// A 256-bit integer vector containing source values.
554 /// A 256-bit integer vector, with bit [7] of each byte specifying the
555 /// source for each corresponding byte of the result. When the mask bit
556 /// is 0, the byte is copied from \a __V1; otherwise, it is copied from
558 /// \returns A 256-bit integer vector containing the result.
559 static __inline__ __m256i __DEFAULT_FN_ATTRS256
560 _mm256_blendv_epi8(__m256i __V1
, __m256i __V2
, __m256i __M
)
562 return (__m256i
)__builtin_ia32_pblendvb256((__v32qi
)__V1
, (__v32qi
)__V2
,
/// Merges 16-bit integer values from either of the two 256-bit vectors
///    \a V1 or \a V2, as specified by the immediate integer operand \a M,
///    and returns the resulting 256-bit vector of [16 x i16].
///
/// \code{.operation}
/// FOR i := 0 TO 7
///   j := i*16
///   IF M[i] == 0
///     result[15+j:j] := V1[15+j:j]
///     result[143+j:128+j] := V1[143+j:128+j]
///   ELSE
///     result[15+j:j] := V2[15+j:j]
///     result[143+j:128+j] := V2[143+j:128+j]
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_blend_epi16(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPBLENDW instruction.
///
/// \param V1
///    A 256-bit vector of [16 x i16] containing source values.
/// \param V2
///    A 256-bit vector of [16 x i16] containing source values.
/// \param M
///    An immediate 8-bit integer operand, with bits [7:0] specifying the
///    source for each element of the result. The position of the mask bit
///    corresponds to the index of a copied value. When a mask bit is 0, the
///    element is copied from \a V1; otherwise, it is copied from \a V2.
///    \a M[0] determines the source for elements 0 and 8, \a M[1] for
///    elements 1 and 9, and so forth.
/// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_blend_epi16(V1, V2, M)                                          \
  ((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1),                  \
                                      (__v16hi)(__m256i)(V2), (int)(M)))
607 /// Compares corresponding bytes in the 256-bit integer vectors in \a __a and
608 /// \a __b for equality and returns the outcomes in the corresponding
609 /// bytes of the 256-bit result.
611 /// \code{.operation}
614 /// result[j+7:j] := (__a[j+7:j] == __b[j+7:j]) ? 0xFF : 0
618 /// \headerfile <immintrin.h>
620 /// This intrinsic corresponds to the \c VPCMPEQB instruction.
623 /// A 256-bit integer vector containing one of the inputs.
625 /// A 256-bit integer vector containing one of the inputs.
626 /// \returns A 256-bit integer vector containing the result.
627 static __inline__ __m256i __DEFAULT_FN_ATTRS256
628 _mm256_cmpeq_epi8(__m256i __a
, __m256i __b
)
630 return (__m256i
)((__v32qi
)__a
== (__v32qi
)__b
);
633 /// Compares corresponding elements in the 256-bit vectors of [16 x i16] in
634 /// \a __a and \a __b for equality and returns the outcomes in the
635 /// corresponding elements of the 256-bit result.
637 /// \code{.operation}
640 /// result[j+15:j] := (__a[j+15:j] == __b[j+15:j]) ? 0xFFFF : 0
644 /// \headerfile <immintrin.h>
646 /// This intrinsic corresponds to the \c VPCMPEQW instruction.
649 /// A 256-bit vector of [16 x i16] containing one of the inputs.
651 /// A 256-bit vector of [16 x i16] containing one of the inputs.
652 /// \returns A 256-bit vector of [16 x i16] containing the result.
653 static __inline__ __m256i __DEFAULT_FN_ATTRS256
654 _mm256_cmpeq_epi16(__m256i __a
, __m256i __b
)
656 return (__m256i
)((__v16hi
)__a
== (__v16hi
)__b
);
659 /// Compares corresponding elements in the 256-bit vectors of [8 x i32] in
660 /// \a __a and \a __b for equality and returns the outcomes in the
661 /// corresponding elements of the 256-bit result.
663 /// \code{.operation}
666 /// result[j+31:j] := (__a[j+31:j] == __b[j+31:j]) ? 0xFFFFFFFF : 0
670 /// \headerfile <immintrin.h>
672 /// This intrinsic corresponds to the \c VPCMPEQD instruction.
675 /// A 256-bit vector of [8 x i32] containing one of the inputs.
677 /// A 256-bit vector of [8 x i32] containing one of the inputs.
678 /// \returns A 256-bit vector of [8 x i32] containing the result.
679 static __inline__ __m256i __DEFAULT_FN_ATTRS256
680 _mm256_cmpeq_epi32(__m256i __a
, __m256i __b
)
682 return (__m256i
)((__v8si
)__a
== (__v8si
)__b
);
685 /// Compares corresponding elements in the 256-bit vectors of [4 x i64] in
686 /// \a __a and \a __b for equality and returns the outcomes in the
687 /// corresponding elements of the 256-bit result.
689 /// \code{.operation}
692 /// result[j+63:j] := (__a[j+63:j] == __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
696 /// \headerfile <immintrin.h>
698 /// This intrinsic corresponds to the \c VPCMPEQQ instruction.
701 /// A 256-bit vector of [4 x i64] containing one of the inputs.
703 /// A 256-bit vector of [4 x i64] containing one of the inputs.
704 /// \returns A 256-bit vector of [4 x i64] containing the result.
705 static __inline__ __m256i __DEFAULT_FN_ATTRS256
706 _mm256_cmpeq_epi64(__m256i __a
, __m256i __b
)
708 return (__m256i
)((__v4di
)__a
== (__v4di
)__b
);
711 /// Compares corresponding signed bytes in the 256-bit integer vectors in
712 /// \a __a and \a __b for greater-than and returns the outcomes in the
713 /// corresponding bytes of the 256-bit result.
715 /// \code{.operation}
718 /// result[j+7:j] := (__a[j+7:j] > __b[j+7:j]) ? 0xFF : 0
722 /// \headerfile <immintrin.h>
724 /// This intrinsic corresponds to the \c VPCMPGTB instruction.
727 /// A 256-bit integer vector containing one of the inputs.
729 /// A 256-bit integer vector containing one of the inputs.
730 /// \returns A 256-bit integer vector containing the result.
731 static __inline__ __m256i __DEFAULT_FN_ATTRS256
732 _mm256_cmpgt_epi8(__m256i __a
, __m256i __b
)
734 /* This function always performs a signed comparison, but __v32qi is a char
735 which may be signed or unsigned, so use __v32qs. */
736 return (__m256i
)((__v32qs
)__a
> (__v32qs
)__b
);
739 /// Compares corresponding signed elements in the 256-bit vectors of
740 /// [16 x i16] in \a __a and \a __b for greater-than and returns the
741 /// outcomes in the corresponding elements of the 256-bit result.
743 /// \code{.operation}
746 /// result[j+15:j] := (__a[j+15:j] > __b[j+15:j]) ? 0xFFFF : 0
750 /// \headerfile <immintrin.h>
752 /// This intrinsic corresponds to the \c VPCMPGTW instruction.
755 /// A 256-bit vector of [16 x i16] containing one of the inputs.
757 /// A 256-bit vector of [16 x i16] containing one of the inputs.
758 /// \returns A 256-bit vector of [16 x i16] containing the result.
759 static __inline__ __m256i __DEFAULT_FN_ATTRS256
760 _mm256_cmpgt_epi16(__m256i __a
, __m256i __b
)
762 return (__m256i
)((__v16hi
)__a
> (__v16hi
)__b
);
765 /// Compares corresponding signed elements in the 256-bit vectors of
766 /// [8 x i32] in \a __a and \a __b for greater-than and returns the
767 /// outcomes in the corresponding elements of the 256-bit result.
769 /// \code{.operation}
772 /// result[j+31:j] := (__a[j+31:j] > __b[j+31:j]) ? 0xFFFFFFFF : 0
776 /// \headerfile <immintrin.h>
778 /// This intrinsic corresponds to the \c VPCMPGTD instruction.
781 /// A 256-bit vector of [8 x i32] containing one of the inputs.
783 /// A 256-bit vector of [8 x i32] containing one of the inputs.
784 /// \returns A 256-bit vector of [8 x i32] containing the result.
785 static __inline__ __m256i __DEFAULT_FN_ATTRS256
786 _mm256_cmpgt_epi32(__m256i __a
, __m256i __b
)
788 return (__m256i
)((__v8si
)__a
> (__v8si
)__b
);
791 /// Compares corresponding signed elements in the 256-bit vectors of
792 /// [4 x i64] in \a __a and \a __b for greater-than and returns the
793 /// outcomes in the corresponding elements of the 256-bit result.
795 /// \code{.operation}
798 /// result[j+63:j] := (__a[j+63:j] > __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
802 /// \headerfile <immintrin.h>
804 /// This intrinsic corresponds to the \c VPCMPGTQ instruction.
807 /// A 256-bit vector of [4 x i64] containing one of the inputs.
809 /// A 256-bit vector of [4 x i64] containing one of the inputs.
810 /// \returns A 256-bit vector of [4 x i64] containing the result.
811 static __inline__ __m256i __DEFAULT_FN_ATTRS256
812 _mm256_cmpgt_epi64(__m256i __a
, __m256i __b
)
814 return (__m256i
)((__v4di
)__a
> (__v4di
)__b
);
817 /// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
818 /// vectors of [16 x i16] and returns the lower 16 bits of each sum in an
819 /// element of the [16 x i16] result (overflow is ignored). Sums from
820 /// \a __a are returned in the lower 64 bits of each 128-bit half of the
821 /// result; sums from \a __b are returned in the upper 64 bits of each
822 /// 128-bit half of the result.
824 /// \code{.operation}
827 /// result[j+15:j] := __a[j+15:j] + __a[j+31:j+16]
828 /// result[j+31:j+16] := __a[j+47:j+32] + __a[j+63:j+48]
829 /// result[j+47:j+32] := __a[j+79:j+64] + __a[j+95:j+80]
830 /// result[j+63:j+48] := __a[j+111:j+96] + __a[j+127:j+112]
831 /// result[j+79:j+64] := __b[j+15:j] + __b[j+31:j+16]
832 /// result[j+95:j+80] := __b[j+47:j+32] + __b[j+63:j+48]
833 /// result[j+111:j+96] := __b[j+79:j+64] + __b[j+95:j+80]
834 /// result[j+127:j+112] := __b[j+111:j+96] + __b[j+127:j+112]
838 /// \headerfile <immintrin.h>
840 /// This intrinsic corresponds to the \c VPHADDW instruction.
843 /// A 256-bit vector of [16 x i16] containing one of the source operands.
845 /// A 256-bit vector of [16 x i16] containing one of the source operands.
846 /// \returns A 256-bit vector of [16 x i16] containing the sums.
847 static __inline__ __m256i __DEFAULT_FN_ATTRS256
848 _mm256_hadd_epi16(__m256i __a
, __m256i __b
)
850 return (__m256i
)__builtin_ia32_phaddw256((__v16hi
)__a
, (__v16hi
)__b
);
853 /// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit
854 /// vectors of [8 x i32] and returns the lower 32 bits of each sum in an
855 /// element of the [8 x i32] result (overflow is ignored). Sums from \a __a
856 /// are returned in the lower 64 bits of each 128-bit half of the result;
857 /// sums from \a __b are returned in the upper 64 bits of each 128-bit half
860 /// \code{.operation}
863 /// result[j+31:j] := __a[j+31:j] + __a[j+63:j+32]
864 /// result[j+63:j+32] := __a[j+95:j+64] + __a[j+127:j+96]
865 /// result[j+95:j+64] := __b[j+31:j] + __b[j+63:j+32]
866 /// result[j+127:j+96] := __b[j+95:j+64] + __b[j+127:j+96]
870 /// \headerfile <immintrin.h>
872 /// This intrinsic corresponds to the \c VPHADDD instruction.
875 /// A 256-bit vector of [8 x i32] containing one of the source operands.
877 /// A 256-bit vector of [8 x i32] containing one of the source operands.
878 /// \returns A 256-bit vector of [8 x i32] containing the sums.
879 static __inline__ __m256i __DEFAULT_FN_ATTRS256
880 _mm256_hadd_epi32(__m256i __a
, __m256i __b
)
882 return (__m256i
)__builtin_ia32_phaddd256((__v8si
)__a
, (__v8si
)__b
);
885 /// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
886 /// vectors of [16 x i16] using signed saturation and returns each sum in
887 /// an element of the [16 x i16] result. Sums from \a __a are returned in
888 /// the lower 64 bits of each 128-bit half of the result; sums from \a __b
889 /// are returned in the upper 64 bits of each 128-bit half of the result.
891 /// \code{.operation}
894 /// result[j+15:j] := SATURATE16(__a[j+15:j] + __a[j+31:j+16])
895 /// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] + __a[j+63:j+48])
896 /// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] + __a[j+95:j+80])
897 /// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] + __a[j+127:j+112])
898 /// result[j+79:j+64] := SATURATE16(__b[j+15:j] + __b[j+31:j+16])
899 /// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] + __b[j+63:j+48])
900 /// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] + __b[j+95:j+80])
901 /// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] + __b[j+127:j+112])
905 /// \headerfile <immintrin.h>
907 /// This intrinsic corresponds to the \c VPHADDSW instruction.
910 /// A 256-bit vector of [16 x i16] containing one of the source operands.
912 /// A 256-bit vector of [16 x i16] containing one of the source operands.
913 /// \returns A 256-bit vector of [16 x i16] containing the sums.
914 static __inline__ __m256i __DEFAULT_FN_ATTRS256
915 _mm256_hadds_epi16(__m256i __a
, __m256i __b
)
917 return (__m256i
)__builtin_ia32_phaddsw256((__v16hi
)__a
, (__v16hi
)__b
);
920 /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
921 /// vectors of [16 x i16] and returns the lower 16 bits of each difference
922 /// in an element of the [16 x i16] result (overflow is ignored).
923 /// Differences from \a __a are returned in the lower 64 bits of each
924 /// 128-bit half of the result; differences from \a __b are returned in the
925 /// upper 64 bits of each 128-bit half of the result.
927 /// \code{.operation}
930 /// result[j+15:j] := __a[j+15:j] - __a[j+31:j+16]
931 /// result[j+31:j+16] := __a[j+47:j+32] - __a[j+63:j+48]
932 /// result[j+47:j+32] := __a[j+79:j+64] - __a[j+95:j+80]
933 /// result[j+63:j+48] := __a[j+111:j+96] - __a[j+127:j+112]
934 /// result[j+79:j+64] := __b[j+15:j] - __b[j+31:j+16]
935 /// result[j+95:j+80] := __b[j+47:j+32] - __b[j+63:j+48]
936 /// result[j+111:j+96] := __b[j+79:j+64] - __b[j+95:j+80]
937 /// result[j+127:j+112] := __b[j+111:j+96] - __b[j+127:j+112]
941 /// \headerfile <immintrin.h>
943 /// This intrinsic corresponds to the \c VPHSUBW instruction.
946 /// A 256-bit vector of [16 x i16] containing one of the source operands.
948 /// A 256-bit vector of [16 x i16] containing one of the source operands.
949 /// \returns A 256-bit vector of [16 x i16] containing the differences.
950 static __inline__ __m256i __DEFAULT_FN_ATTRS256
951 _mm256_hsub_epi16(__m256i __a
, __m256i __b
)
953 return (__m256i
)__builtin_ia32_phsubw256((__v16hi
)__a
, (__v16hi
)__b
);
956 /// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit
957 /// vectors of [8 x i32] and returns the lower 32 bits of each difference in
958 /// an element of the [8 x i32] result (overflow is ignored). Differences
959 /// from \a __a are returned in the lower 64 bits of each 128-bit half of
960 /// the result; differences from \a __b are returned in the upper 64 bits
961 /// of each 128-bit half of the result.
963 /// \code{.operation}
966 /// result[j+31:j] := __a[j+31:j] - __a[j+63:j+32]
967 /// result[j+63:j+32] := __a[j+95:j+64] - __a[j+127:j+96]
968 /// result[j+95:j+64] := __b[j+31:j] - __b[j+63:j+32]
969 /// result[j+127:j+96] := __b[j+95:j+64] - __b[j+127:j+96]
973 /// \headerfile <immintrin.h>
975 /// This intrinsic corresponds to the \c VPHSUBD instruction.
978 /// A 256-bit vector of [8 x i32] containing one of the source operands.
980 /// A 256-bit vector of [8 x i32] containing one of the source operands.
981 /// \returns A 256-bit vector of [8 x i32] containing the differences.
982 static __inline__ __m256i __DEFAULT_FN_ATTRS256
983 _mm256_hsub_epi32(__m256i __a
, __m256i __b
)
985 return (__m256i
)__builtin_ia32_phsubd256((__v8si
)__a
, (__v8si
)__b
);
988 /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
989 /// vectors of [16 x i16] using signed saturation and returns each sum in
990 /// an element of the [16 x i16] result. Differences from \a __a are
991 /// returned in the lower 64 bits of each 128-bit half of the result;
992 /// differences from \a __b are returned in the upper 64 bits of each
993 /// 128-bit half of the result.
995 /// \code{.operation}
998 /// result[j+15:j] := SATURATE16(__a[j+15:j] - __a[j+31:j+16])
999 /// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] - __a[j+63:j+48])
1000 /// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] - __a[j+95:j+80])
1001 /// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] - __a[j+127:j+112])
1002 /// result[j+79:j+64] := SATURATE16(__b[j+15:j] - __b[j+31:j+16])
1003 /// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] - __b[j+63:j+48])
1004 /// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] - __b[j+95:j+80])
1005 /// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] - __b[j+127:j+112])
1009 /// \headerfile <immintrin.h>
1011 /// This intrinsic corresponds to the \c VPHSUBSW instruction.
1014 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1016 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1017 /// \returns A 256-bit vector of [16 x i16] containing the differences.
1018 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1019 _mm256_hsubs_epi16(__m256i __a
, __m256i __b
)
1021 return (__m256i
)__builtin_ia32_phsubsw256((__v16hi
)__a
, (__v16hi
)__b
);
1024 /// Multiplies each unsigned byte from the 256-bit integer vector in \a __a
1025 /// with the corresponding signed byte from the 256-bit integer vector in
1026 /// \a __b, forming signed 16-bit intermediate products. Adds adjacent
1027 /// pairs of those products using signed saturation to form 16-bit sums
1028 /// returned as elements of the [16 x i16] result.
1030 /// \code{.operation}
1031 /// FOR i := 0 TO 15
1033 /// temp1 := __a[j+7:j] * __b[j+7:j]
1034 /// temp2 := __a[j+15:j+8] * __b[j+15:j+8]
1035 /// result[j+15:j] := SATURATE16(temp1 + temp2)
1039 /// \headerfile <immintrin.h>
1041 /// This intrinsic corresponds to the \c VPMADDUBSW instruction.
1044 /// A 256-bit vector containing one of the source operands.
1046 /// A 256-bit vector containing one of the source operands.
1047 /// \returns A 256-bit vector of [16 x i16] containing the result.
1048 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1049 _mm256_maddubs_epi16(__m256i __a
, __m256i __b
)
1051 return (__m256i
)__builtin_ia32_pmaddubsw256((__v32qi
)__a
, (__v32qi
)__b
);
1054 /// Multiplies corresponding 16-bit elements of two 256-bit vectors of
1055 /// [16 x i16], forming 32-bit intermediate products, and adds pairs of
1056 /// those products to form 32-bit sums returned as elements of the
1057 /// [8 x i32] result.
1059 /// There is only one wraparound case: when all four of the 16-bit sources
1060 /// are \c 0x8000, the result will be \c 0x80000000.
1062 /// \code{.operation}
1065 /// temp1 := __a[j+15:j] * __b[j+15:j]
1066 /// temp2 := __a[j+31:j+16] * __b[j+31:j+16]
1067 /// result[j+31:j] := temp1 + temp2
1071 /// \headerfile <immintrin.h>
1073 /// This intrinsic corresponds to the \c VPMADDWD instruction.
1076 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1078 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1079 /// \returns A 256-bit vector of [8 x i32] containing the result.
1080 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1081 _mm256_madd_epi16(__m256i __a
, __m256i __b
)
1083 return (__m256i
)__builtin_ia32_pmaddwd256((__v16hi
)__a
, (__v16hi
)__b
);
1086 /// Compares the corresponding signed bytes in the two 256-bit integer vectors
1087 /// in \a __a and \a __b and returns the larger of each pair in the
1088 /// corresponding byte of the 256-bit result.
1090 /// \headerfile <immintrin.h>
1092 /// This intrinsic corresponds to the \c VPMAXSB instruction.
1095 /// A 256-bit integer vector.
1097 /// A 256-bit integer vector.
1098 /// \returns A 256-bit integer vector containing the result.
1099 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1100 _mm256_max_epi8(__m256i __a
, __m256i __b
)
1102 return (__m256i
)__builtin_elementwise_max((__v32qs
)__a
, (__v32qs
)__b
);
1105 /// Compares the corresponding signed 16-bit integers in the two 256-bit
1106 /// vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1107 /// each pair in the corresponding element of the 256-bit result.
1109 /// \headerfile <immintrin.h>
1111 /// This intrinsic corresponds to the \c VPMAXSW instruction.
1114 /// A 256-bit vector of [16 x i16].
1116 /// A 256-bit vector of [16 x i16].
1117 /// \returns A 256-bit vector of [16 x i16] containing the result.
1118 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1119 _mm256_max_epi16(__m256i __a
, __m256i __b
)
1121 return (__m256i
)__builtin_elementwise_max((__v16hi
)__a
, (__v16hi
)__b
);
1124 /// Compares the corresponding signed 32-bit integers in the two 256-bit
1125 /// vectors of [8 x i32] in \a __a and \a __b and returns the larger of
1126 /// each pair in the corresponding element of the 256-bit result.
1128 /// \headerfile <immintrin.h>
1130 /// This intrinsic corresponds to the \c VPMAXSD instruction.
1133 /// A 256-bit vector of [8 x i32].
1135 /// A 256-bit vector of [8 x i32].
1136 /// \returns A 256-bit vector of [8 x i32] containing the result.
1137 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1138 _mm256_max_epi32(__m256i __a
, __m256i __b
)
1140 return (__m256i
)__builtin_elementwise_max((__v8si
)__a
, (__v8si
)__b
);
1143 /// Compares the corresponding unsigned bytes in the two 256-bit integer
1144 /// vectors in \a __a and \a __b and returns the larger of each pair in
1145 /// the corresponding byte of the 256-bit result.
1147 /// \headerfile <immintrin.h>
1149 /// This intrinsic corresponds to the \c VPMAXUB instruction.
1152 /// A 256-bit integer vector.
1154 /// A 256-bit integer vector.
1155 /// \returns A 256-bit integer vector containing the result.
1156 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1157 _mm256_max_epu8(__m256i __a
, __m256i __b
)
1159 return (__m256i
)__builtin_elementwise_max((__v32qu
)__a
, (__v32qu
)__b
);
1162 /// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1163 /// vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1164 /// each pair in the corresponding element of the 256-bit result.
1166 /// \headerfile <immintrin.h>
1168 /// This intrinsic corresponds to the \c VPMAXUW instruction.
1171 /// A 256-bit vector of [16 x i16].
1173 /// A 256-bit vector of [16 x i16].
1174 /// \returns A 256-bit vector of [16 x i16] containing the result.
1175 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1176 _mm256_max_epu16(__m256i __a
, __m256i __b
)
1178 return (__m256i
)__builtin_elementwise_max((__v16hu
)__a
, (__v16hu
)__b
);
1181 /// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1182 /// vectors of [8 x i32] in \a __a and \a __b and returns the larger of
1183 /// each pair in the corresponding element of the 256-bit result.
1185 /// \headerfile <immintrin.h>
1187 /// This intrinsic corresponds to the \c VPMAXUD instruction.
1190 /// A 256-bit vector of [8 x i32].
1192 /// A 256-bit vector of [8 x i32].
1193 /// \returns A 256-bit vector of [8 x i32] containing the result.
1194 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1195 _mm256_max_epu32(__m256i __a
, __m256i __b
)
1197 return (__m256i
)__builtin_elementwise_max((__v8su
)__a
, (__v8su
)__b
);
1200 /// Compares the corresponding signed bytes in the two 256-bit integer vectors
1201 /// in \a __a and \a __b and returns the smaller of each pair in the
1202 /// corresponding byte of the 256-bit result.
1204 /// \headerfile <immintrin.h>
1206 /// This intrinsic corresponds to the \c VPMINSB instruction.
1209 /// A 256-bit integer vector.
1211 /// A 256-bit integer vector.
1212 /// \returns A 256-bit integer vector containing the result.
1213 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1214 _mm256_min_epi8(__m256i __a
, __m256i __b
)
1216 return (__m256i
)__builtin_elementwise_min((__v32qs
)__a
, (__v32qs
)__b
);
1219 /// Compares the corresponding signed 16-bit integers in the two 256-bit
1220 /// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1221 /// each pair in the corresponding element of the 256-bit result.
1223 /// \headerfile <immintrin.h>
1225 /// This intrinsic corresponds to the \c VPMINSW instruction.
1228 /// A 256-bit vector of [16 x i16].
1230 /// A 256-bit vector of [16 x i16].
1231 /// \returns A 256-bit vector of [16 x i16] containing the result.
1232 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1233 _mm256_min_epi16(__m256i __a
, __m256i __b
)
1235 return (__m256i
)__builtin_elementwise_min((__v16hi
)__a
, (__v16hi
)__b
);
1238 /// Compares the corresponding signed 32-bit integers in the two 256-bit
1239 /// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
1240 /// each pair in the corresponding element of the 256-bit result.
1242 /// \headerfile <immintrin.h>
1244 /// This intrinsic corresponds to the \c VPMINSD instruction.
1247 /// A 256-bit vector of [8 x i32].
1249 /// A 256-bit vector of [8 x i32].
1250 /// \returns A 256-bit vector of [8 x i32] containing the result.
1251 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1252 _mm256_min_epi32(__m256i __a
, __m256i __b
)
1254 return (__m256i
)__builtin_elementwise_min((__v8si
)__a
, (__v8si
)__b
);
1257 /// Compares the corresponding unsigned bytes in the two 256-bit integer
1258 /// vectors in \a __a and \a __b and returns the smaller of each pair in
1259 /// the corresponding byte of the 256-bit result.
1261 /// \headerfile <immintrin.h>
1263 /// This intrinsic corresponds to the \c VPMINUB instruction.
1266 /// A 256-bit integer vector.
1268 /// A 256-bit integer vector.
1269 /// \returns A 256-bit integer vector containing the result.
1270 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1271 _mm256_min_epu8(__m256i __a
, __m256i __b
)
1273 return (__m256i
)__builtin_elementwise_min((__v32qu
)__a
, (__v32qu
)__b
);
1276 /// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1277 /// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1278 /// each pair in the corresponding element of the 256-bit result.
1280 /// \headerfile <immintrin.h>
1282 /// This intrinsic corresponds to the \c VPMINUW instruction.
1285 /// A 256-bit vector of [16 x i16].
1287 /// A 256-bit vector of [16 x i16].
1288 /// \returns A 256-bit vector of [16 x i16] containing the result.
1289 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1290 _mm256_min_epu16(__m256i __a
, __m256i __b
)
1292 return (__m256i
)__builtin_elementwise_min((__v16hu
)__a
, (__v16hu
)__b
);
1295 /// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1296 /// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
1297 /// each pair in the corresponding element of the 256-bit result.
1299 /// \headerfile <immintrin.h>
1301 /// This intrinsic corresponds to the \c VPMINUD instruction.
1304 /// A 256-bit vector of [8 x i32].
1306 /// A 256-bit vector of [8 x i32].
1307 /// \returns A 256-bit vector of [8 x i32] containing the result.
1308 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1309 _mm256_min_epu32(__m256i __a
, __m256i __b
)
1311 return (__m256i
)__builtin_elementwise_min((__v8su
)__a
, (__v8su
)__b
);
1314 /// Creates a 32-bit integer mask from the most significant bit of each byte
1315 /// in the 256-bit integer vector in \a __a and returns the result.
1317 /// \code{.operation}
1318 /// FOR i := 0 TO 31
1320 /// result[i] := __a[j+7]
1324 /// \headerfile <immintrin.h>
1326 /// This intrinsic corresponds to the \c VPMOVMSKB instruction.
1329 /// A 256-bit integer vector containing the source bytes.
1330 /// \returns The 32-bit integer mask.
1331 static __inline__
int __DEFAULT_FN_ATTRS256
1332 _mm256_movemask_epi8(__m256i __a
)
1334 return __builtin_ia32_pmovmskb256((__v32qi
)__a
);
1337 /// Sign-extends bytes from the 128-bit integer vector in \a __V and returns
1338 /// the 16-bit values in the corresponding elements of a 256-bit vector
1341 /// \code{.operation}
1342 /// FOR i := 0 TO 15
1345 /// result[k+15:k] := SignExtend(__V[j+7:j])
1349 /// \headerfile <immintrin.h>
1351 /// This intrinsic corresponds to the \c VPMOVSXBW instruction.
1354 /// A 128-bit integer vector containing the source bytes.
1355 /// \returns A 256-bit vector of [16 x i16] containing the sign-extended
1357 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1358 _mm256_cvtepi8_epi16(__m128i __V
)
1360 /* This function always performs a signed extension, but __v16qi is a char
1361 which may be signed or unsigned, so use __v16qs. */
1362 return (__m256i
)__builtin_convertvector((__v16qs
)__V
, __v16hi
);
1365 /// Sign-extends bytes from the lower half of the 128-bit integer vector in
1366 /// \a __V and returns the 32-bit values in the corresponding elements of a
1367 /// 256-bit vector of [8 x i32].
1369 /// \code{.operation}
1373 /// result[k+31:k] := SignExtend(__V[j+7:j])
1377 /// \headerfile <immintrin.h>
1379 /// This intrinsic corresponds to the \c VPMOVSXBD instruction.
1382 /// A 128-bit integer vector containing the source bytes.
1383 /// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1385 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1386 _mm256_cvtepi8_epi32(__m128i __V
)
1388 /* This function always performs a signed extension, but __v16qi is a char
1389 which may be signed or unsigned, so use __v16qs. */
1390 return (__m256i
)__builtin_convertvector(__builtin_shufflevector((__v16qs
)__V
, (__v16qs
)__V
, 0, 1, 2, 3, 4, 5, 6, 7), __v8si
);
1393 /// Sign-extends the first four bytes from the 128-bit integer vector in
1394 /// \a __V and returns the 64-bit values in the corresponding elements of a
1395 /// 256-bit vector of [4 x i64].
1397 /// \code{.operation}
1398 /// result[63:0] := SignExtend(__V[7:0])
1399 /// result[127:64] := SignExtend(__V[15:8])
1400 /// result[191:128] := SignExtend(__V[23:16])
1401 /// result[255:192] := SignExtend(__V[31:24])
1404 /// \headerfile <immintrin.h>
1406 /// This intrinsic corresponds to the \c VPMOVSXBQ instruction.
1409 /// A 128-bit integer vector containing the source bytes.
1410 /// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1412 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1413 _mm256_cvtepi8_epi64(__m128i __V
)
1415 /* This function always performs a signed extension, but __v16qi is a char
1416 which may be signed or unsigned, so use __v16qs. */
1417 return (__m256i
)__builtin_convertvector(__builtin_shufflevector((__v16qs
)__V
, (__v16qs
)__V
, 0, 1, 2, 3), __v4di
);
1420 /// Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1421 /// \a __V and returns the 32-bit values in the corresponding elements of a
1422 /// 256-bit vector of [8 x i32].
1424 /// \code{.operation}
1428 /// result[k+31:k] := SignExtend(__V[j+15:j])
1432 /// \headerfile <immintrin.h>
1434 /// This intrinsic corresponds to the \c VPMOVSXWD instruction.
1437 /// A 128-bit vector of [8 x i16] containing the source values.
1438 /// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1440 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1441 _mm256_cvtepi16_epi32(__m128i __V
)
1443 return (__m256i
)__builtin_convertvector((__v8hi
)__V
, __v8si
);
1446 /// Sign-extends 16-bit elements from the lower half of the 128-bit vector of
1447 /// [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1448 /// elements of a 256-bit vector of [4 x i64].
1450 /// \code{.operation}
1451 /// result[63:0] := SignExtend(__V[15:0])
1452 /// result[127:64] := SignExtend(__V[31:16])
1453 /// result[191:128] := SignExtend(__V[47:32])
1454 /// result[255:192] := SignExtend(__V[64:48])
1457 /// \headerfile <immintrin.h>
1459 /// This intrinsic corresponds to the \c VPMOVSXWQ instruction.
1462 /// A 128-bit vector of [8 x i16] containing the source values.
1463 /// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1465 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1466 _mm256_cvtepi16_epi64(__m128i __V
)
1468 return (__m256i
)__builtin_convertvector(__builtin_shufflevector((__v8hi
)__V
, (__v8hi
)__V
, 0, 1, 2, 3), __v4di
);
1471 /// Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1472 /// \a __V and returns the 64-bit values in the corresponding elements of a
1473 /// 256-bit vector of [4 x i64].
1475 /// \code{.operation}
1476 /// result[63:0] := SignExtend(__V[31:0])
1477 /// result[127:64] := SignExtend(__V[63:32])
1478 /// result[191:128] := SignExtend(__V[95:64])
1479 /// result[255:192] := SignExtend(__V[127:96])
1482 /// \headerfile <immintrin.h>
1484 /// This intrinsic corresponds to the \c VPMOVSXDQ instruction.
1487 /// A 128-bit vector of [4 x i32] containing the source values.
1488 /// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1490 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1491 _mm256_cvtepi32_epi64(__m128i __V
)
1493 return (__m256i
)__builtin_convertvector((__v4si
)__V
, __v4di
);
1496 /// Zero-extends bytes from the 128-bit integer vector in \a __V and returns
1497 /// the 16-bit values in the corresponding elements of a 256-bit vector
1500 /// \code{.operation}
1501 /// FOR i := 0 TO 15
1504 /// result[k+15:k] := ZeroExtend(__V[j+7:j])
1508 /// \headerfile <immintrin.h>
1510 /// This intrinsic corresponds to the \c VPMOVZXBW instruction.
1513 /// A 128-bit integer vector containing the source bytes.
1514 /// \returns A 256-bit vector of [16 x i16] containing the zero-extended
1516 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1517 _mm256_cvtepu8_epi16(__m128i __V
)
1519 return (__m256i
)__builtin_convertvector((__v16qu
)__V
, __v16hi
);
1522 /// Zero-extends bytes from the lower half of the 128-bit integer vector in
1523 /// \a __V and returns the 32-bit values in the corresponding elements of a
1524 /// 256-bit vector of [8 x i32].
1526 /// \code{.operation}
1530 /// result[k+31:k] := ZeroExtend(__V[j+7:j])
1534 /// \headerfile <immintrin.h>
1536 /// This intrinsic corresponds to the \c VPMOVZXBD instruction.
1539 /// A 128-bit integer vector containing the source bytes.
1540 /// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1542 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1543 _mm256_cvtepu8_epi32(__m128i __V
)
1545 return (__m256i
)__builtin_convertvector(__builtin_shufflevector((__v16qu
)__V
, (__v16qu
)__V
, 0, 1, 2, 3, 4, 5, 6, 7), __v8si
);
1548 /// Zero-extends the first four bytes from the 128-bit integer vector in
1549 /// \a __V and returns the 64-bit values in the corresponding elements of a
1550 /// 256-bit vector of [4 x i64].
1552 /// \code{.operation}
1553 /// result[63:0] := ZeroExtend(__V[7:0])
1554 /// result[127:64] := ZeroExtend(__V[15:8])
1555 /// result[191:128] := ZeroExtend(__V[23:16])
1556 /// result[255:192] := ZeroExtend(__V[31:24])
1559 /// \headerfile <immintrin.h>
1561 /// This intrinsic corresponds to the \c VPMOVZXBQ instruction.
1564 /// A 128-bit integer vector containing the source bytes.
1565 /// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1567 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1568 _mm256_cvtepu8_epi64(__m128i __V
)
1570 return (__m256i
)__builtin_convertvector(__builtin_shufflevector((__v16qu
)__V
, (__v16qu
)__V
, 0, 1, 2, 3), __v4di
);
1573 /// Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1574 /// \a __V and returns the 32-bit values in the corresponding elements of a
1575 /// 256-bit vector of [8 x i32].
1577 /// \code{.operation}
1581 /// result[k+31:k] := ZeroExtend(__V[j+15:j])
1585 /// \headerfile <immintrin.h>
1587 /// This intrinsic corresponds to the \c VPMOVZXWD instruction.
1590 /// A 128-bit vector of [8 x i16] containing the source values.
1591 /// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1593 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1594 _mm256_cvtepu16_epi32(__m128i __V
)
1596 return (__m256i
)__builtin_convertvector((__v8hu
)__V
, __v8si
);
1599 /// Zero-extends 16-bit elements from the lower half of the 128-bit vector of
1600 /// [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1601 /// elements of a 256-bit vector of [4 x i64].
1603 /// \code{.operation}
1604 /// result[63:0] := ZeroExtend(__V[15:0])
1605 /// result[127:64] := ZeroExtend(__V[31:16])
1606 /// result[191:128] := ZeroExtend(__V[47:32])
1607 /// result[255:192] := ZeroExtend(__V[64:48])
1610 /// \headerfile <immintrin.h>
1612 /// This intrinsic corresponds to the \c VPMOVSXWQ instruction.
1615 /// A 128-bit vector of [8 x i16] containing the source values.
1616 /// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1618 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1619 _mm256_cvtepu16_epi64(__m128i __V
)
1621 return (__m256i
)__builtin_convertvector(__builtin_shufflevector((__v8hu
)__V
, (__v8hu
)__V
, 0, 1, 2, 3), __v4di
);
1624 /// Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1625 /// \a __V and returns the 64-bit values in the corresponding elements of a
1626 /// 256-bit vector of [4 x i64].
1628 /// \code{.operation}
1629 /// result[63:0] := ZeroExtend(__V[31:0])
1630 /// result[127:64] := ZeroExtend(__V[63:32])
1631 /// result[191:128] := ZeroExtend(__V[95:64])
1632 /// result[255:192] := ZeroExtend(__V[127:96])
1635 /// \headerfile <immintrin.h>
1637 /// This intrinsic corresponds to the \c VPMOVZXDQ instruction.
1640 /// A 128-bit vector of [4 x i32] containing the source values.
1641 /// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1643 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1644 _mm256_cvtepu32_epi64(__m128i __V
)
1646 return (__m256i
)__builtin_convertvector((__v4su
)__V
, __v4di
);
1649 /// Multiplies signed 32-bit integers from even-numbered elements of two
1650 /// 256-bit vectors of [8 x i32] and returns the 64-bit products in the
1651 /// [4 x i64] result.
1653 /// \code{.operation}
1654 /// result[63:0] := __a[31:0] * __b[31:0]
1655 /// result[127:64] := __a[95:64] * __b[95:64]
1656 /// result[191:128] := __a[159:128] * __b[159:128]
1657 /// result[255:192] := __a[223:192] * __b[223:192]
1660 /// \headerfile <immintrin.h>
1662 /// This intrinsic corresponds to the \c VPMULDQ instruction.
1665 /// A 256-bit vector of [8 x i32] containing one of the source operands.
1667 /// A 256-bit vector of [8 x i32] containing one of the source operands.
1668 /// \returns A 256-bit vector of [4 x i64] containing the products.
1669 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1670 _mm256_mul_epi32(__m256i __a
, __m256i __b
)
1672 return (__m256i
)__builtin_ia32_pmuldq256((__v8si
)__a
, (__v8si
)__b
);
1675 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1676 /// [16 x i16], truncates the 32-bit results to the most significant 18
1677 /// bits, rounds by adding 1, and returns bits [16:1] of each rounded
1678 /// product in the [16 x i16] result.
1680 /// \code{.operation}
1681 /// FOR i := 0 TO 15
1683 /// temp := ((__a[j+15:j] * __b[j+15:j]) >> 14) + 1
1684 /// result[j+15:j] := temp[16:1]
1687 /// \headerfile <immintrin.h>
1689 /// This intrinsic corresponds to the \c VPMULHRSW instruction.
1692 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1694 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1695 /// \returns A 256-bit vector of [16 x i16] containing the rounded products.
1696 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1697 _mm256_mulhrs_epi16(__m256i __a
, __m256i __b
)
1699 return (__m256i
)__builtin_ia32_pmulhrsw256((__v16hi
)__a
, (__v16hi
)__b
);
1702 /// Multiplies unsigned 16-bit integer elements of two 256-bit vectors of
1703 /// [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1704 /// [16 x i16] result.
1706 /// \headerfile <immintrin.h>
1708 /// This intrinsic corresponds to the \c VPMULHUW instruction.
1711 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1713 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1714 /// \returns A 256-bit vector of [16 x i16] containing the products.
1715 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1716 _mm256_mulhi_epu16(__m256i __a
, __m256i __b
)
1718 return (__m256i
)__builtin_ia32_pmulhuw256((__v16hi
)__a
, (__v16hi
)__b
);
1721 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1722 /// [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1723 /// [16 x i16] result.
1725 /// \headerfile <immintrin.h>
1727 /// This intrinsic corresponds to the \c VPMULHW instruction.
1730 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1732 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1733 /// \returns A 256-bit vector of [16 x i16] containing the products.
1734 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1735 _mm256_mulhi_epi16(__m256i __a
, __m256i __b
)
1737 return (__m256i
)__builtin_ia32_pmulhw256((__v16hi
)__a
, (__v16hi
)__b
);
1740 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1741 /// [16 x i16], and returns the lower 16 bits of each 32-bit product in the
1742 /// [16 x i16] result.
1744 /// \headerfile <immintrin.h>
1746 /// This intrinsic corresponds to the \c VPMULLW instruction.
1749 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1751 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1752 /// \returns A 256-bit vector of [16 x i16] containing the products.
1753 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1754 _mm256_mullo_epi16(__m256i __a
, __m256i __b
)
1756 return (__m256i
)((__v16hu
)__a
* (__v16hu
)__b
);
1759 /// Multiplies signed 32-bit integer elements of two 256-bit vectors of
1760 /// [8 x i32], and returns the lower 32 bits of each 64-bit product in the
1761 /// [8 x i32] result.
1763 /// \headerfile <immintrin.h>
1765 /// This intrinsic corresponds to the \c VPMULLD instruction.
1768 /// A 256-bit vector of [8 x i32] containing one of the source operands.
1770 /// A 256-bit vector of [8 x i32] containing one of the source operands.
1771 /// \returns A 256-bit vector of [8 x i32] containing the products.
1772 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1773 _mm256_mullo_epi32 (__m256i __a
, __m256i __b
)
1775 return (__m256i
)((__v8su
)__a
* (__v8su
)__b
);
1778 /// Multiplies unsigned 32-bit integers from even-numered elements of two
1779 /// 256-bit vectors of [8 x i32] and returns the 64-bit products in the
1780 /// [4 x i64] result.
1782 /// \code{.operation}
1783 /// result[63:0] := __a[31:0] * __b[31:0]
1784 /// result[127:64] := __a[95:64] * __b[95:64]
1785 /// result[191:128] := __a[159:128] * __b[159:128]
1786 /// result[255:192] := __a[223:192] * __b[223:192]
1789 /// \headerfile <immintrin.h>
1791 /// This intrinsic corresponds to the \c VPMULUDQ instruction.
1794 /// A 256-bit vector of [8 x i32] containing one of the source operands.
1796 /// A 256-bit vector of [8 x i32] containing one of the source operands.
1797 /// \returns A 256-bit vector of [4 x i64] containing the products.
1798 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1799 _mm256_mul_epu32(__m256i __a
, __m256i __b
)
1801 return __builtin_ia32_pmuludq256((__v8si
)__a
, (__v8si
)__b
);
1804 /// Computes the bitwise OR of the 256-bit integer vectors in \a __a and
1807 /// \headerfile <immintrin.h>
1809 /// This intrinsic corresponds to the \c VPOR instruction.
1812 /// A 256-bit integer vector.
1814 /// A 256-bit integer vector.
1815 /// \returns A 256-bit integer vector containing the result.
1816 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1817 _mm256_or_si256(__m256i __a
, __m256i __b
)
1819 return (__m256i
)((__v4du
)__a
| (__v4du
)__b
);
1822 /// Computes four sum of absolute difference (SAD) operations on sets of eight
1823 /// unsigned 8-bit integers from the 256-bit integer vectors \a __a and
1826 /// One SAD result is computed for each set of eight bytes from \a __a and
1827 /// eight bytes from \a __b. The zero-extended SAD value is returned in the
1828 /// corresponding 64-bit element of the result.
1830 /// A single SAD operation takes the differences between the corresponding
1831 /// bytes of \a __a and \a __b, takes the absolute value of each difference,
1832 /// and sums these eight values to form one 16-bit result. This operation
1833 /// is repeated four times with successive sets of eight bytes.
1835 /// \code{.operation}
1838 /// temp0 := ABS(__a[j+7:j] - __b[j+7:j])
1839 /// temp1 := ABS(__a[j+15:j+8] - __b[j+15:j+8])
1840 /// temp2 := ABS(__a[j+23:j+16] - __b[j+23:j+16])
1841 /// temp3 := ABS(__a[j+31:j+24] - __b[j+31:j+24])
1842 /// temp4 := ABS(__a[j+39:j+32] - __b[j+39:j+32])
1843 /// temp5 := ABS(__a[j+47:j+40] - __b[j+47:j+40])
1844 /// temp6 := ABS(__a[j+55:j+48] - __b[j+55:j+48])
1845 /// temp7 := ABS(__a[j+63:j+56] - __b[j+63:j+56])
1846 /// result[j+15:j] := temp0 + temp1 + temp2 + temp3 +
1847 /// temp4 + temp5 + temp6 + temp7
1848 /// result[j+63:j+16] := 0
1852 /// \headerfile <immintrin.h>
1854 /// This intrinsic corresponds to the \c VPSADBW instruction.
1857 /// A 256-bit integer vector.
1859 /// A 256-bit integer vector.
1860 /// \returns A 256-bit integer vector containing the result.
1861 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1862 _mm256_sad_epu8(__m256i __a
, __m256i __b
)
1864 return __builtin_ia32_psadbw256((__v32qi
)__a
, (__v32qi
)__b
);
1867 /// Shuffles 8-bit integers in the 256-bit integer vector \a __a according
1868 /// to control information in the 256-bit integer vector \a __b, and
1869 /// returns the 256-bit result. In effect there are two separate 128-bit
1870 /// shuffles in the lower and upper halves.
1872 /// \code{.operation}
1873 /// FOR i := 0 TO 31
1875 /// IF __b[j+7] == 1
1876 /// result[j+7:j] := 0
1878 /// k := __b[j+3:j] * 8
1882 /// result[j+7:j] := __a[k+7:k]
1887 /// \headerfile <immintrin.h>
1889 /// This intrinsic corresponds to the \c VPSHUFB instruction.
1892 /// A 256-bit integer vector containing source values.
1894 /// A 256-bit integer vector containing control information to determine
1895 /// what goes into the corresponding byte of the result. If bit 7 of the
1896 /// control byte is 1, the result byte is 0; otherwise, bits 3:0 of the
1897 /// control byte specify the index (within the same 128-bit half) of \a __a
1898 /// to copy to the result byte.
1899 /// \returns A 256-bit integer vector containing the result.
1900 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1901 _mm256_shuffle_epi8(__m256i __a
, __m256i __b
)
1903 return (__m256i
)__builtin_ia32_pshufb256((__v32qi
)__a
, (__v32qi
)__b
);
1906 /// Shuffles 32-bit integers from the 256-bit vector of [8 x i32] in \a a
1907 /// according to control information in the integer literal \a imm, and
1908 /// returns the 256-bit result. In effect there are two parallel 128-bit
1909 /// shuffles in the lower and upper halves.
1911 /// \code{.operation}
1914 /// k := (imm >> i*2)[1:0] * 32
1915 /// result[j+31:j] := a[k+31:k]
1916 /// result[128+j+31:128+j] := a[128+k+31:128+k]
1920 /// \headerfile <immintrin.h>
1923 /// __m256i _mm256_shuffle_epi32(__m256i a, const int imm);
/// This intrinsic corresponds to the \c VPSHUFD instruction.
1929 /// A 256-bit vector of [8 x i32] containing source values.
1931 /// An immediate 8-bit value specifying which elements to copy from \a a.
1932 /// \a imm[1:0] specifies the index in \a a for elements 0 and 4 of the
1933 /// result, \a imm[3:2] specifies the index for elements 1 and 5, and so
1935 /// \returns A 256-bit vector of [8 x i32] containing the result.
/* Macro form: imm is documented as an integer literal and is forwarded to
 * the builtin with only an (int) cast applied. */
#define _mm256_shuffle_epi32(a, imm) \
  ((__m256i)__builtin_ia32_pshufd256( \
      (__v8si)(__m256i)(a), \
      (int)(imm)))
1939 /// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a
1940 /// according to control information in the integer literal \a imm, and
1941 /// returns the 256-bit result. The upper 64 bits of each 128-bit half
1942 /// are shuffled in parallel; the lower 64 bits of each 128-bit half are
1943 /// copied from \a a unchanged.
1945 /// \code{.operation}
1946 /// result[63:0] := a[63:0]
1947 /// result[191:128] := a[191:128]
1949 /// j := i * 16 + 64
1950 /// k := (imm >> i*2)[1:0] * 16 + 64
1951 /// result[j+15:j] := a[k+15:k]
1952 /// result[128+j+15:128+j] := a[128+k+15:128+k]
1956 /// \headerfile <immintrin.h>
1959 /// __m256i _mm256_shufflehi_epi16(__m256i a, const int imm);
1962 /// This intrinsic corresponds to the \c VPSHUFHW instruction.
1965 /// A 256-bit vector of [16 x i16] containing source values.
1967 /// An immediate 8-bit value specifying which elements to copy from \a a.
/// \a imm[1:0] specifies the index in \a a for elements 4 and 12 of the
/// result, \a imm[3:2] specifies the index for elements 5 and 13, and so
1970 /// forth. Indexes are offset by 4 (so 0 means index 4, and so forth).
1971 /// \returns A 256-bit vector of [16 x i16] containing the result.
/* Macro form: imm is documented as an integer literal and is forwarded to
 * the builtin with only an (int) cast applied. */
#define _mm256_shufflehi_epi16(a, imm) \
  ((__m256i)__builtin_ia32_pshufhw256( \
      (__v16hi)(__m256i)(a), \
      (int)(imm)))
/// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a
1976 /// according to control information in the integer literal \a imm, and
1977 /// returns the 256-bit [16 x i16] result. The lower 64 bits of each
1978 /// 128-bit half are shuffled; the upper 64 bits of each 128-bit half are
1979 /// copied from \a a unchanged.
1981 /// \code{.operation}
1982 /// result[127:64] := a[127:64]
1983 /// result[255:192] := a[255:192]
1986 /// k := (imm >> i*2)[1:0] * 16
1987 /// result[j+15:j] := a[k+15:k]
1988 /// result[128+j+15:128+j] := a[128+k+15:128+k]
1992 /// \headerfile <immintrin.h>
1995 /// __m256i _mm256_shufflelo_epi16(__m256i a, const int imm);
1998 /// This intrinsic corresponds to the \c VPSHUFLW instruction.
2001 /// A 256-bit vector of [16 x i16] to use as a source of data for the
2004 /// An immediate 8-bit value specifying which elements to copy from \a a.
2005 /// \a imm[1:0] specifies the index in \a a for elements 0 and 8 of the
2006 /// result, \a imm[3:2] specifies the index for elements 1 and 9, and so
2008 /// \returns A 256-bit vector of [16 x i16] containing the result.
/* Macro form: imm is documented as an integer literal and is forwarded to
 * the builtin with only an (int) cast applied. */
#define _mm256_shufflelo_epi16(a, imm) \
  ((__m256i)__builtin_ia32_pshuflw256( \
      (__v16hi)(__m256i)(a), \
      (int)(imm)))
2012 /// Sets each byte of the result to the corresponding byte of the 256-bit
2013 /// integer vector in \a __a, the negative of that byte, or zero, depending
2014 /// on whether the corresponding byte of the 256-bit integer vector in
2015 /// \a __b is greater than zero, less than zero, or equal to zero,
2018 /// \headerfile <immintrin.h>
2020 /// This intrinsic corresponds to the \c VPSIGNB instruction.
2023 /// A 256-bit integer vector.
/// A 256-bit integer vector.
2026 /// \returns A 256-bit integer vector containing the result.
2027 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2028 _mm256_sign_epi8(__m256i __a
, __m256i __b
)
2030 return (__m256i
)__builtin_ia32_psignb256((__v32qi
)__a
, (__v32qi
)__b
);
2033 /// Sets each element of the result to the corresponding element of the
2034 /// 256-bit vector of [16 x i16] in \a __a, the negative of that element,
2035 /// or zero, depending on whether the corresponding element of the 256-bit
2036 /// vector of [16 x i16] in \a __b is greater than zero, less than zero, or
2037 /// equal to zero, respectively.
2039 /// \headerfile <immintrin.h>
2041 /// This intrinsic corresponds to the \c VPSIGNW instruction.
2044 /// A 256-bit vector of [16 x i16].
2046 /// A 256-bit vector of [16 x i16].
2047 /// \returns A 256-bit vector of [16 x i16] containing the result.
2048 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2049 _mm256_sign_epi16(__m256i __a
, __m256i __b
)
2051 return (__m256i
)__builtin_ia32_psignw256((__v16hi
)__a
, (__v16hi
)__b
);
2054 /// Sets each element of the result to the corresponding element of the
2055 /// 256-bit vector of [8 x i32] in \a __a, the negative of that element, or
2056 /// zero, depending on whether the corresponding element of the 256-bit
2057 /// vector of [8 x i32] in \a __b is greater than zero, less than zero, or
2058 /// equal to zero, respectively.
2060 /// \headerfile <immintrin.h>
2062 /// This intrinsic corresponds to the \c VPSIGND instruction.
2065 /// A 256-bit vector of [8 x i32].
2067 /// A 256-bit vector of [8 x i32].
2068 /// \returns A 256-bit vector of [8 x i32] containing the result.
2069 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2070 _mm256_sign_epi32(__m256i __a
, __m256i __b
)
2072 return (__m256i
)__builtin_ia32_psignd256((__v8si
)__a
, (__v8si
)__b
);
2075 /// Shifts each 128-bit half of the 256-bit integer vector \a a left by
2076 /// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
2077 /// is greater than 15, the returned result is all zeroes.
2079 /// \headerfile <immintrin.h>
2082 /// __m256i _mm256_slli_si256(__m256i a, const int imm);
2085 /// This intrinsic corresponds to the \c VPSLLDQ instruction.
2088 /// A 256-bit integer vector to be shifted.
2090 /// An unsigned immediate value specifying the shift count (in bytes).
2091 /// \returns A 256-bit integer vector containing the result.
/* Macro form: imm is documented as an immediate byte count and is forwarded
 * to the builtin with only an (int) cast applied. */
#define _mm256_slli_si256(a, imm) \
  ((__m256i)__builtin_ia32_pslldqi256_byteshift( \
      (__v4di)(__m256i)(a), \
      (int)(imm)))
2095 /// Shifts each 128-bit half of the 256-bit integer vector \a a left by
2096 /// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
2097 /// is greater than 15, the returned result is all zeroes.
2099 /// \headerfile <immintrin.h>
2102 /// __m256i _mm256_bslli_epi128(__m256i a, const int imm);
2105 /// This intrinsic corresponds to the \c VPSLLDQ instruction.
2108 /// A 256-bit integer vector to be shifted.
2110 /// An unsigned immediate value specifying the shift count (in bytes).
2111 /// \returns A 256-bit integer vector containing the result.
/* Expands to the same builtin call as _mm256_slli_si256; imm is forwarded
 * with only an (int) cast applied. */
#define _mm256_bslli_epi128(a, imm) \
  ((__m256i)__builtin_ia32_pslldqi256_byteshift( \
      (__v4di)(__m256i)(a), \
      (int)(imm)))
2115 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2116 /// left by \a __count bits, shifting in zero bits, and returns the result.
2117 /// If \a __count is greater than 15, the returned result is all zeroes.
2119 /// \headerfile <immintrin.h>
2121 /// This intrinsic corresponds to the \c VPSLLW instruction.
2124 /// A 256-bit vector of [16 x i16] to be shifted.
2126 /// An unsigned integer value specifying the shift count (in bits).
2127 /// \returns A 256-bit vector of [16 x i16] containing the result.
2128 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2129 _mm256_slli_epi16(__m256i __a
, int __count
)
2131 return (__m256i
)__builtin_ia32_psllwi256((__v16hi
)__a
, __count
);
2134 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2135 /// left by the number of bits specified by the lower 64 bits of \a __count,
2136 /// shifting in zero bits, and returns the result. If \a __count is greater
2137 /// than 15, the returned result is all zeroes.
2139 /// \headerfile <immintrin.h>
2141 /// This intrinsic corresponds to the \c VPSLLW instruction.
2144 /// A 256-bit vector of [16 x i16] to be shifted.
2146 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2147 /// shift count (in bits). The upper element is ignored.
2148 /// \returns A 256-bit vector of [16 x i16] containing the result.
2149 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2150 _mm256_sll_epi16(__m256i __a
, __m128i __count
)
2152 return (__m256i
)__builtin_ia32_psllw256((__v16hi
)__a
, (__v8hi
)__count
);
2155 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2156 /// left by \a __count bits, shifting in zero bits, and returns the result.
2157 /// If \a __count is greater than 31, the returned result is all zeroes.
2159 /// \headerfile <immintrin.h>
2161 /// This intrinsic corresponds to the \c VPSLLD instruction.
2164 /// A 256-bit vector of [8 x i32] to be shifted.
2166 /// An unsigned integer value specifying the shift count (in bits).
2167 /// \returns A 256-bit vector of [8 x i32] containing the result.
2168 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2169 _mm256_slli_epi32(__m256i __a
, int __count
)
2171 return (__m256i
)__builtin_ia32_pslldi256((__v8si
)__a
, __count
);
2174 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2175 /// left by the number of bits given in the lower 64 bits of \a __count,
2176 /// shifting in zero bits, and returns the result. If \a __count is greater
2177 /// than 31, the returned result is all zeroes.
2179 /// \headerfile <immintrin.h>
2181 /// This intrinsic corresponds to the \c VPSLLD instruction.
2184 /// A 256-bit vector of [8 x i32] to be shifted.
2186 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2187 /// shift count (in bits). The upper element is ignored.
2188 /// \returns A 256-bit vector of [8 x i32] containing the result.
2189 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2190 _mm256_sll_epi32(__m256i __a
, __m128i __count
)
2192 return (__m256i
)__builtin_ia32_pslld256((__v8si
)__a
, (__v4si
)__count
);
2195 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2196 /// left by \a __count bits, shifting in zero bits, and returns the result.
2197 /// If \a __count is greater than 63, the returned result is all zeroes.
2199 /// \headerfile <immintrin.h>
2201 /// This intrinsic corresponds to the \c VPSLLQ instruction.
2204 /// A 256-bit vector of [4 x i64] to be shifted.
2206 /// An unsigned integer value specifying the shift count (in bits).
2207 /// \returns A 256-bit vector of [4 x i64] containing the result.
2208 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2209 _mm256_slli_epi64(__m256i __a
, int __count
)
2211 return __builtin_ia32_psllqi256((__v4di
)__a
, __count
);
2214 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2215 /// left by the number of bits given in the lower 64 bits of \a __count,
2216 /// shifting in zero bits, and returns the result. If \a __count is greater
2217 /// than 63, the returned result is all zeroes.
2219 /// \headerfile <immintrin.h>
2221 /// This intrinsic corresponds to the \c VPSLLQ instruction.
2224 /// A 256-bit vector of [4 x i64] to be shifted.
2226 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2227 /// shift count (in bits). The upper element is ignored.
2228 /// \returns A 256-bit vector of [4 x i64] containing the result.
2229 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2230 _mm256_sll_epi64(__m256i __a
, __m128i __count
)
2232 return __builtin_ia32_psllq256((__v4di
)__a
, __count
);
2235 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2236 /// right by \a __count bits, shifting in sign bits, and returns the result.
2237 /// If \a __count is greater than 15, each element of the result is either
2238 /// 0 or -1 according to the corresponding input sign bit.
2240 /// \headerfile <immintrin.h>
2242 /// This intrinsic corresponds to the \c VPSRAW instruction.
2245 /// A 256-bit vector of [16 x i16] to be shifted.
2247 /// An unsigned integer value specifying the shift count (in bits).
2248 /// \returns A 256-bit vector of [16 x i16] containing the result.
2249 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2250 _mm256_srai_epi16(__m256i __a
, int __count
)
2252 return (__m256i
)__builtin_ia32_psrawi256((__v16hi
)__a
, __count
);
2255 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2256 /// right by the number of bits given in the lower 64 bits of \a __count,
2257 /// shifting in sign bits, and returns the result. If \a __count is greater
2258 /// than 15, each element of the result is either 0 or -1 according to the
2259 /// corresponding input sign bit.
2261 /// \headerfile <immintrin.h>
2263 /// This intrinsic corresponds to the \c VPSRAW instruction.
2266 /// A 256-bit vector of [16 x i16] to be shifted.
2268 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2269 /// shift count (in bits). The upper element is ignored.
2270 /// \returns A 256-bit vector of [16 x i16] containing the result.
2271 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2272 _mm256_sra_epi16(__m256i __a
, __m128i __count
)
2274 return (__m256i
)__builtin_ia32_psraw256((__v16hi
)__a
, (__v8hi
)__count
);
2277 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2278 /// right by \a __count bits, shifting in sign bits, and returns the result.
2279 /// If \a __count is greater than 31, each element of the result is either
2280 /// 0 or -1 according to the corresponding input sign bit.
2282 /// \headerfile <immintrin.h>
2284 /// This intrinsic corresponds to the \c VPSRAD instruction.
2287 /// A 256-bit vector of [8 x i32] to be shifted.
2289 /// An unsigned integer value specifying the shift count (in bits).
2290 /// \returns A 256-bit vector of [8 x i32] containing the result.
2291 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2292 _mm256_srai_epi32(__m256i __a
, int __count
)
2294 return (__m256i
)__builtin_ia32_psradi256((__v8si
)__a
, __count
);
2297 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2298 /// right by the number of bits given in the lower 64 bits of \a __count,
2299 /// shifting in sign bits, and returns the result. If \a __count is greater
2300 /// than 31, each element of the result is either 0 or -1 according to the
2301 /// corresponding input sign bit.
2303 /// \headerfile <immintrin.h>
2305 /// This intrinsic corresponds to the \c VPSRAD instruction.
2308 /// A 256-bit vector of [8 x i32] to be shifted.
2310 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2311 /// shift count (in bits). The upper element is ignored.
2312 /// \returns A 256-bit vector of [8 x i32] containing the result.
2313 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2314 _mm256_sra_epi32(__m256i __a
, __m128i __count
)
2316 return (__m256i
)__builtin_ia32_psrad256((__v8si
)__a
, (__v4si
)__count
);
2319 /// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
2320 /// \a imm bytes, shifting in zero bytes, and returns the result. If
2321 /// \a imm is greater than 15, the returned result is all zeroes.
2323 /// \headerfile <immintrin.h>
2326 /// __m256i _mm256_srli_si256(__m256i a, const int imm);
2329 /// This intrinsic corresponds to the \c VPSRLDQ instruction.
2332 /// A 256-bit integer vector to be shifted.
2334 /// An unsigned immediate value specifying the shift count (in bytes).
2335 /// \returns A 256-bit integer vector containing the result.
/* Macro form: imm is documented as an immediate byte count and is forwarded
 * to the builtin with only an (int) cast applied. The operand is cast through
 * __m256i to __v4di, matching the left-shift counterpart _mm256_slli_si256. */
#define _mm256_srli_si256(a, imm) \
  ((__m256i)__builtin_ia32_psrldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
2339 /// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
2340 /// \a imm bytes, shifting in zero bytes, and returns the result. If
2341 /// \a imm is greater than 15, the returned result is all zeroes.
2343 /// \headerfile <immintrin.h>
2346 /// __m256i _mm256_bsrli_epi128(__m256i a, const int imm);
2349 /// This intrinsic corresponds to the \c VPSRLDQ instruction.
2352 /// A 256-bit integer vector to be shifted.
2354 /// An unsigned immediate value specifying the shift count (in bytes).
2355 /// \returns A 256-bit integer vector containing the result.
/* Expands to the same builtin call as _mm256_srli_si256. The operand is cast
 * through __m256i to __v4di, matching the left-shift counterpart
 * _mm256_bslli_epi128; imm is forwarded with only an (int) cast applied. */
#define _mm256_bsrli_epi128(a, imm) \
  ((__m256i)__builtin_ia32_psrldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
2359 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2360 /// right by \a __count bits, shifting in zero bits, and returns the result.
2361 /// If \a __count is greater than 15, the returned result is all zeroes.
2363 /// \headerfile <immintrin.h>
2365 /// This intrinsic corresponds to the \c VPSRLW instruction.
2368 /// A 256-bit vector of [16 x i16] to be shifted.
2370 /// An unsigned integer value specifying the shift count (in bits).
2371 /// \returns A 256-bit vector of [16 x i16] containing the result.
2372 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2373 _mm256_srli_epi16(__m256i __a
, int __count
)
2375 return (__m256i
)__builtin_ia32_psrlwi256((__v16hi
)__a
, __count
);
2378 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2379 /// right by the number of bits given in the lower 64 bits of \a __count,
2380 /// shifting in zero bits, and returns the result. If \a __count is greater
2381 /// than 15, the returned result is all zeroes.
2383 /// \headerfile <immintrin.h>
2385 /// This intrinsic corresponds to the \c VPSRLW instruction.
2388 /// A 256-bit vector of [16 x i16] to be shifted.
2390 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2391 /// shift count (in bits). The upper element is ignored.
2392 /// \returns A 256-bit vector of [16 x i16] containing the result.
2393 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2394 _mm256_srl_epi16(__m256i __a
, __m128i __count
)
2396 return (__m256i
)__builtin_ia32_psrlw256((__v16hi
)__a
, (__v8hi
)__count
);
2399 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2400 /// right by \a __count bits, shifting in zero bits, and returns the result.
2401 /// If \a __count is greater than 31, the returned result is all zeroes.
2403 /// \headerfile <immintrin.h>
2405 /// This intrinsic corresponds to the \c VPSRLD instruction.
2408 /// A 256-bit vector of [8 x i32] to be shifted.
2410 /// An unsigned integer value specifying the shift count (in bits).
2411 /// \returns A 256-bit vector of [8 x i32] containing the result.
2412 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2413 _mm256_srli_epi32(__m256i __a
, int __count
)
2415 return (__m256i
)__builtin_ia32_psrldi256((__v8si
)__a
, __count
);
2418 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2419 /// right by the number of bits given in the lower 64 bits of \a __count,
2420 /// shifting in zero bits, and returns the result. If \a __count is greater
2421 /// than 31, the returned result is all zeroes.
2423 /// \headerfile <immintrin.h>
2425 /// This intrinsic corresponds to the \c VPSRLD instruction.
2428 /// A 256-bit vector of [8 x i32] to be shifted.
2430 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2431 /// shift count (in bits). The upper element is ignored.
2432 /// \returns A 256-bit vector of [8 x i32] containing the result.
2433 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2434 _mm256_srl_epi32(__m256i __a
, __m128i __count
)
2436 return (__m256i
)__builtin_ia32_psrld256((__v8si
)__a
, (__v4si
)__count
);
2439 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2440 /// right by \a __count bits, shifting in zero bits, and returns the result.
2441 /// If \a __count is greater than 63, the returned result is all zeroes.
2443 /// \headerfile <immintrin.h>
2445 /// This intrinsic corresponds to the \c VPSRLQ instruction.
2448 /// A 256-bit vector of [4 x i64] to be shifted.
2450 /// An unsigned integer value specifying the shift count (in bits).
2451 /// \returns A 256-bit vector of [4 x i64] containing the result.
2452 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2453 _mm256_srli_epi64(__m256i __a
, int __count
)
2455 return __builtin_ia32_psrlqi256((__v4di
)__a
, __count
);
2458 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2459 /// right by the number of bits given in the lower 64 bits of \a __count,
2460 /// shifting in zero bits, and returns the result. If \a __count is greater
2461 /// than 63, the returned result is all zeroes.
2463 /// \headerfile <immintrin.h>
2465 /// This intrinsic corresponds to the \c VPSRLQ instruction.
2468 /// A 256-bit vector of [4 x i64] to be shifted.
2470 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2471 /// shift count (in bits). The upper element is ignored.
2472 /// \returns A 256-bit vector of [4 x i64] containing the result.
2473 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2474 _mm256_srl_epi64(__m256i __a
, __m128i __count
)
2476 return __builtin_ia32_psrlq256((__v4di
)__a
, __count
);
2479 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2480 /// vectors. Returns the lower 8 bits of each difference in the
2481 /// corresponding byte of the 256-bit integer vector result (overflow is
2484 /// \code{.operation}
2485 /// FOR i := 0 TO 31
2487 /// result[j+7:j] := __a[j+7:j] - __b[j+7:j]
2491 /// \headerfile <immintrin.h>
2493 /// This intrinsic corresponds to the \c VPSUBB instruction.
2496 /// A 256-bit integer vector containing the minuends.
2498 /// A 256-bit integer vector containing the subtrahends.
2499 /// \returns A 256-bit integer vector containing the differences.
2500 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2501 _mm256_sub_epi8(__m256i __a
, __m256i __b
)
2503 return (__m256i
)((__v32qu
)__a
- (__v32qu
)__b
);
2506 /// Subtracts 16-bit integers from corresponding elements of two 256-bit
2507 /// vectors of [16 x i16]. Returns the lower 16 bits of each difference in
2508 /// the corresponding element of the [16 x i16] result (overflow is
2511 /// \code{.operation}
2512 /// FOR i := 0 TO 15
2514 /// result[j+15:j] := __a[j+15:j] - __b[j+15:j]
2518 /// \headerfile <immintrin.h>
2520 /// This intrinsic corresponds to the \c VPSUBW instruction.
2523 /// A 256-bit vector of [16 x i16] containing the minuends.
2525 /// A 256-bit vector of [16 x i16] containing the subtrahends.
2526 /// \returns A 256-bit vector of [16 x i16] containing the differences.
2527 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2528 _mm256_sub_epi16(__m256i __a
, __m256i __b
)
2530 return (__m256i
)((__v16hu
)__a
- (__v16hu
)__b
);
2533 /// Subtracts 32-bit integers from corresponding elements of two 256-bit
2534 /// vectors of [8 x i32]. Returns the lower 32 bits of each difference in
2535 /// the corresponding element of the [8 x i32] result (overflow is ignored).
2537 /// \code{.operation}
2540 /// result[j+31:j] := __a[j+31:j] - __b[j+31:j]
2544 /// \headerfile <immintrin.h>
2546 /// This intrinsic corresponds to the \c VPSUBD instruction.
2549 /// A 256-bit vector of [8 x i32] containing the minuends.
2551 /// A 256-bit vector of [8 x i32] containing the subtrahends.
2552 /// \returns A 256-bit vector of [8 x i32] containing the differences.
2553 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2554 _mm256_sub_epi32(__m256i __a
, __m256i __b
)
2556 return (__m256i
)((__v8su
)__a
- (__v8su
)__b
);
2559 /// Subtracts 64-bit integers from corresponding elements of two 256-bit
2560 /// vectors of [4 x i64]. Returns the lower 64 bits of each difference in
2561 /// the corresponding element of the [4 x i64] result (overflow is ignored).
2563 /// \code{.operation}
2566 /// result[j+63:j] := __a[j+63:j] - __b[j+63:j]
2570 /// \headerfile <immintrin.h>
2572 /// This intrinsic corresponds to the \c VPSUBQ instruction.
2575 /// A 256-bit vector of [4 x i64] containing the minuends.
2577 /// A 256-bit vector of [4 x i64] containing the subtrahends.
2578 /// \returns A 256-bit vector of [4 x i64] containing the differences.
2579 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2580 _mm256_sub_epi64(__m256i __a
, __m256i __b
)
2582 return (__m256i
)((__v4du
)__a
- (__v4du
)__b
);
2585 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
/// vectors using signed saturation, and returns each difference in the
2587 /// corresponding byte of the 256-bit integer vector result.
2589 /// \code{.operation}
2590 /// FOR i := 0 TO 31
2592 /// result[j+7:j] := SATURATE8(__a[j+7:j] - __b[j+7:j])
2596 /// \headerfile <immintrin.h>
2598 /// This intrinsic corresponds to the \c VPSUBSB instruction.
2601 /// A 256-bit integer vector containing the minuends.
2603 /// A 256-bit integer vector containing the subtrahends.
2604 /// \returns A 256-bit integer vector containing the differences.
2605 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2606 _mm256_subs_epi8(__m256i __a
, __m256i __b
)
2608 return (__m256i
)__builtin_elementwise_sub_sat((__v32qs
)__a
, (__v32qs
)__b
);
2611 /// Subtracts 16-bit integers from corresponding elements of two 256-bit
2612 /// vectors of [16 x i16] using signed saturation, and returns each
2613 /// difference in the corresponding element of the [16 x i16] result.
2615 /// \code{.operation}
2616 /// FOR i := 0 TO 15
/// result[j+15:j] := SATURATE16(__a[j+15:j] - __b[j+15:j])
2622 /// \headerfile <immintrin.h>
2624 /// This intrinsic corresponds to the \c VPSUBSW instruction.
2627 /// A 256-bit vector of [16 x i16] containing the minuends.
2629 /// A 256-bit vector of [16 x i16] containing the subtrahends.
2630 /// \returns A 256-bit vector of [16 x i16] containing the differences.
2631 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2632 _mm256_subs_epi16(__m256i __a
, __m256i __b
)
2634 return (__m256i
)__builtin_elementwise_sub_sat((__v16hi
)__a
, (__v16hi
)__b
);
2637 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2638 /// vectors using unsigned saturation, and returns each difference in the
2639 /// corresponding byte of the 256-bit integer vector result. For each byte,
2640 /// computes <c> result = __a - __b </c>.
2642 /// \code{.operation}
2643 /// FOR i := 0 TO 31
2645 /// result[j+7:j] := SATURATE8U(__a[j+7:j] - __b[j+7:j])
2649 /// \headerfile <immintrin.h>
2651 /// This intrinsic corresponds to the \c VPSUBUSB instruction.
2654 /// A 256-bit integer vector containing the minuends.
2656 /// A 256-bit integer vector containing the subtrahends.
2657 /// \returns A 256-bit integer vector containing the differences.
2658 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2659 _mm256_subs_epu8(__m256i __a
, __m256i __b
)
2661 return (__m256i
)__builtin_elementwise_sub_sat((__v32qu
)__a
, (__v32qu
)__b
);
2664 /// Subtracts 16-bit integers from corresponding elements of two 256-bit
2665 /// vectors of [16 x i16] using unsigned saturation, and returns each
2666 /// difference in the corresponding element of the [16 x i16] result.
2668 /// \code{.operation}
2669 /// FOR i := 0 TO 15
2671 /// result[j+15:j] := SATURATE16U(__a[j+15:j] - __b[j+15:j])
2675 /// \headerfile <immintrin.h>
2677 /// This intrinsic corresponds to the \c VPSUBUSW instruction.
2680 /// A 256-bit vector of [16 x i16] containing the minuends.
2682 /// A 256-bit vector of [16 x i16] containing the subtrahends.
2683 /// \returns A 256-bit vector of [16 x i16] containing the differences.
2684 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2685 _mm256_subs_epu16(__m256i __a
, __m256i __b
)
2687 return (__m256i
)__builtin_elementwise_sub_sat((__v16hu
)__a
, (__v16hu
)__b
);
2690 /// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2691 /// vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2692 /// uses the upper 64 bits of each 128-bit half of \a __a and \a __b as
2693 /// input; other bits in these parameters are ignored.
2695 /// \code{.operation}
2696 /// result[7:0] := __a[71:64]
2697 /// result[15:8] := __b[71:64]
2698 /// result[23:16] := __a[79:72]
2699 /// result[31:24] := __b[79:72]
2701 /// result[127:120] := __b[127:120]
2702 /// result[135:128] := __a[199:192]
2704 /// result[255:248] := __b[255:248]
2707 /// \headerfile <immintrin.h>
2709 /// This intrinsic corresponds to the \c VPUNPCKHBW instruction.
2712 /// A 256-bit integer vector used as the source for the even-numbered bytes
2715 /// A 256-bit integer vector used as the source for the odd-numbered bytes
2717 /// \returns A 256-bit integer vector containing the result.
2718 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2719 _mm256_unpackhi_epi8(__m256i __a
, __m256i __b
)
2721 return (__m256i
)__builtin_shufflevector((__v32qi
)__a
, (__v32qi
)__b
, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
2724 /// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2725 /// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2726 /// vector of [16 x i16]. Specifically, uses the upper 64 bits of each
2727 /// 128-bit half of \a __a and \a __b as input; other bits in these
2728 /// parameters are ignored.
2730 /// \code{.operation}
2731 /// result[15:0] := __a[79:64]
2732 /// result[31:16] := __b[79:64]
2733 /// result[47:32] := __a[95:80]
2734 /// result[63:48] := __b[95:80]
2736 /// result[127:112] := __b[127:112]
/// result[143:128] := __a[207:192]
2739 /// result[255:240] := __b[255:240]
2742 /// \headerfile <immintrin.h>
2744 /// This intrinsic corresponds to the \c VPUNPCKHWD instruction.
2747 /// A 256-bit vector of [16 x i16] used as the source for the even-numbered
2748 /// elements of the result.
2750 /// A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2751 /// elements of the result.
2752 /// \returns A 256-bit vector of [16 x i16] containing the result.
2753 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2754 _mm256_unpackhi_epi16(__m256i __a
, __m256i __b
)
2756 return (__m256i
)__builtin_shufflevector((__v16hi
)__a
, (__v16hi
)__b
, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
2759 /// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2760 /// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2761 /// of [8 x i32]. Specifically, uses the upper 64 bits of each 128-bit half
2762 /// of \a __a and \a __b as input; other bits in these parameters are
2765 /// \code{.operation}
2766 /// result[31:0] := __a[95:64]
2767 /// result[63:32] := __b[95:64]
2768 /// result[95:64] := __a[127:96]
2769 /// result[127:96] := __b[127:96]
2770 /// result[159:128] := __a[223:192]
2771 /// result[191:160] := __b[223:192]
2772 /// result[223:192] := __a[255:224]
2773 /// result[255:224] := __b[255:224]
2776 /// \headerfile <immintrin.h>
2778 /// This intrinsic corresponds to the \c VPUNPCKHDQ instruction.
2781 /// A 256-bit vector of [8 x i32] used as the source for the even-numbered
2782 /// elements of the result.
2784 /// A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2785 /// elements of the result.
2786 /// \returns A 256-bit vector of [8 x i32] containing the result.
2787 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2788 _mm256_unpackhi_epi32(__m256i __a
, __m256i __b
)
2790 return (__m256i
)__builtin_shufflevector((__v8si
)__a
, (__v8si
)__b
, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);
2793 /// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2794 /// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2795 /// of [4 x i64]. Specifically, uses the upper 64 bits of each 128-bit half
2796 /// of \a __a and \a __b as input; other bits in these parameters are
2799 /// \code{.operation}
2800 /// result[63:0] := __a[127:64]
2801 /// result[127:64] := __b[127:64]
2802 /// result[191:128] := __a[255:192]
2803 /// result[255:192] := __b[255:192]
2806 /// \headerfile <immintrin.h>
2808 /// This intrinsic corresponds to the \c VPUNPCKHQDQ instruction.
2811 /// A 256-bit vector of [4 x i64] used as the source for the even-numbered
2812 /// elements of the result.
2814 /// A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2815 /// elements of the result.
2816 /// \returns A 256-bit vector of [4 x i64] containing the result.
2817 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2818 _mm256_unpackhi_epi64(__m256i __a
, __m256i __b
)
2820 return (__m256i
)__builtin_shufflevector((__v4di
)__a
, (__v4di
)__b
, 1, 4+1, 3, 4+3);
2823 /// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2824 /// vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2825 /// uses the lower 64 bits of each 128-bit half of \a __a and \a __b as
2826 /// input; other bits in these parameters are ignored.
2828 /// \code{.operation}
2829 /// result[7:0] := __a[7:0]
2830 /// result[15:8] := __b[7:0]
2831 /// result[23:16] := __a[15:8]
2832 /// result[31:24] := __b[15:8]
2834 /// result[127:120] := __b[63:56]
2835 /// result[135:128] := __a[135:128]
2837 /// result[255:248] := __b[191:184]
2840 /// \headerfile <immintrin.h>
2842 /// This intrinsic corresponds to the \c VPUNPCKLBW instruction.
2845 /// A 256-bit integer vector used as the source for the even-numbered bytes
2848 /// A 256-bit integer vector used as the source for the odd-numbered bytes
2850 /// \returns A 256-bit integer vector containing the result.
2851 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2852 _mm256_unpacklo_epi8(__m256i __a
, __m256i __b
)
2854 return (__m256i
)__builtin_shufflevector((__v32qi
)__a
, (__v32qi
)__b
, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
2857 /// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2858 /// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2859 /// vector of [16 x i16]. Specifically, uses the lower 64 bits of each
2860 /// 128-bit half of \a __a and \a __b as input; other bits in these
2861 /// parameters are ignored.
2863 /// \code{.operation}
2864 /// result[15:0] := __a[15:0]
2865 /// result[31:16] := __b[15:0]
2866 /// result[47:32] := __a[31:16]
2867 /// result[63:48] := __b[31:16]
2869 /// result[127:112] := __b[63:48]
2870 /// result[143:128] := __a[143:128]
/// result[255:240] := __b[191:176]
2875 /// \headerfile <immintrin.h>
2877 /// This intrinsic corresponds to the \c VPUNPCKLWD instruction.
2880 /// A 256-bit vector of [16 x i16] used as the source for the even-numbered
2881 /// elements of the result.
2883 /// A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2884 /// elements of the result.
2885 /// \returns A 256-bit vector of [16 x i16] containing the result.
2886 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2887 _mm256_unpacklo_epi16(__m256i __a
, __m256i __b
)
2889 return (__m256i
)__builtin_shufflevector((__v16hi
)__a
, (__v16hi
)__b
, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
2892 /// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2893 /// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2894 /// of [8 x i32]. Specifically, uses the lower 64 bits of each 128-bit half
2895 /// of \a __a and \a __b as input; other bits in these parameters are
2898 /// \code{.operation}
2899 /// result[31:0] := __a[31:0]
2900 /// result[63:32] := __b[31:0]
2901 /// result[95:64] := __a[63:32]
2902 /// result[127:96] := __b[63:32]
2903 /// result[159:128] := __a[159:128]
2904 /// result[191:160] := __b[159:128]
2905 /// result[223:192] := __a[191:160]
/// result[255:224] := __b[191:160]
2909 /// \headerfile <immintrin.h>
2911 /// This intrinsic corresponds to the \c VPUNPCKLDQ instruction.
2914 /// A 256-bit vector of [8 x i32] used as the source for the even-numbered
2915 /// elements of the result.
2917 /// A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2918 /// elements of the result.
2919 /// \returns A 256-bit vector of [8 x i32] containing the result.
2920 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2921 _mm256_unpacklo_epi32(__m256i __a
, __m256i __b
)
2923 return (__m256i
)__builtin_shufflevector((__v8si
)__a
, (__v8si
)__b
, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
2926 /// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2927 /// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2928 /// of [4 x i64]. Specifically, uses the lower 64 bits of each 128-bit half
2929 /// of \a __a and \a __b as input; other bits in these parameters are
2932 /// \code{.operation}
2933 /// result[63:0] := __a[63:0]
2934 /// result[127:64] := __b[63:0]
2935 /// result[191:128] := __a[191:128]
2936 /// result[255:192] := __b[191:128]
2939 /// \headerfile <immintrin.h>
2941 /// This intrinsic corresponds to the \c VPUNPCKLQDQ instruction.
2944 /// A 256-bit vector of [4 x i64] used as the source for the even-numbered
2945 /// elements of the result.
2947 /// A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2948 /// elements of the result.
2949 /// \returns A 256-bit vector of [4 x i64] containing the result.
2950 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2951 _mm256_unpacklo_epi64(__m256i __a
, __m256i __b
)
2953 return (__m256i
)__builtin_shufflevector((__v4di
)__a
, (__v4di
)__b
, 0, 4+0, 2, 4+2);
2956 /// Computes the bitwise XOR of the 256-bit integer vectors in \a __a and
2959 /// \headerfile <immintrin.h>
2961 /// This intrinsic corresponds to the \c VPXOR instruction.
2964 /// A 256-bit integer vector.
2966 /// A 256-bit integer vector.
2967 /// \returns A 256-bit integer vector containing the result.
2968 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2969 _mm256_xor_si256(__m256i __a
, __m256i __b
)
2971 return (__m256i
)((__v4du
)__a
^ (__v4du
)__b
);
2974 /// Loads the 256-bit integer vector from memory \a __V using a non-temporal
2975 /// memory hint and returns the vector. \a __V must be aligned on a 32-byte
2978 /// \headerfile <immintrin.h>
2980 /// This intrinsic corresponds to the \c VMOVNTDQA instruction.
2983 /// A pointer to the 32-byte aligned memory containing the vector to load.
2984 /// \returns A 256-bit integer vector loaded from memory.
2985 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2986 _mm256_stream_load_si256(const void *__V
)
2988 typedef __v4di __v4di_aligned
__attribute__((aligned(32)));
2989 return (__m256i
)__builtin_nontemporal_load((const __v4di_aligned
*)__V
);
2992 /// Broadcasts the 32-bit floating-point value from the low element of the
2993 /// 128-bit vector of [4 x float] in \a __X to all elements of the result's
2994 /// 128-bit vector of [4 x float].
2996 /// \headerfile <immintrin.h>
2998 /// This intrinsic corresponds to the \c VBROADCASTSS instruction.
3001 /// A 128-bit vector of [4 x float] whose low element will be broadcast.
3002 /// \returns A 128-bit vector of [4 x float] containing the result.
3003 static __inline__ __m128 __DEFAULT_FN_ATTRS128
3004 _mm_broadcastss_ps(__m128 __X
)
3006 return (__m128
)__builtin_shufflevector((__v4sf
)__X
, (__v4sf
)__X
, 0, 0, 0, 0);
3009 /// Broadcasts the 64-bit floating-point value from the low element of the
3010 /// 128-bit vector of [2 x double] in \a __a to both elements of the
3011 /// result's 128-bit vector of [2 x double].
3013 /// \headerfile <immintrin.h>
3015 /// This intrinsic corresponds to the \c MOVDDUP instruction.
3018 /// A 128-bit vector of [2 x double] whose low element will be broadcast.
3019 /// \returns A 128-bit vector of [2 x double] containing the result.
3020 static __inline__ __m128d __DEFAULT_FN_ATTRS128
3021 _mm_broadcastsd_pd(__m128d __a
)
3023 return __builtin_shufflevector((__v2df
)__a
, (__v2df
)__a
, 0, 0);
3026 /// Broadcasts the 32-bit floating-point value from the low element of the
3027 /// 128-bit vector of [4 x float] in \a __X to all elements of the
3028 /// result's 256-bit vector of [8 x float].
3030 /// \headerfile <immintrin.h>
3032 /// This intrinsic corresponds to the \c VBROADCASTSS instruction.
3035 /// A 128-bit vector of [4 x float] whose low element will be broadcast.
3036 /// \returns A 256-bit vector of [8 x float] containing the result.
3037 static __inline__ __m256 __DEFAULT_FN_ATTRS256
3038 _mm256_broadcastss_ps(__m128 __X
)
3040 return (__m256
)__builtin_shufflevector((__v4sf
)__X
, (__v4sf
)__X
, 0, 0, 0, 0, 0, 0, 0, 0);
3043 /// Broadcasts the 64-bit floating-point value from the low element of the
3044 /// 128-bit vector of [2 x double] in \a __X to all elements of the
3045 /// result's 256-bit vector of [4 x double].
3047 /// \headerfile <immintrin.h>
3049 /// This intrinsic corresponds to the \c VBROADCASTSD instruction.
3052 /// A 128-bit vector of [2 x double] whose low element will be broadcast.
3053 /// \returns A 256-bit vector of [4 x double] containing the result.
3054 static __inline__ __m256d __DEFAULT_FN_ATTRS256
3055 _mm256_broadcastsd_pd(__m128d __X
)
3057 return (__m256d
)__builtin_shufflevector((__v2df
)__X
, (__v2df
)__X
, 0, 0, 0, 0);
3060 /// Broadcasts the 128-bit integer data from \a __X to both the lower and
3061 /// upper halves of the 256-bit result.
3063 /// \headerfile <immintrin.h>
3065 /// This intrinsic corresponds to the \c VBROADCASTI128 instruction.
3068 /// A 128-bit integer vector to be broadcast.
3069 /// \returns A 256-bit integer vector containing the result.
3070 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3071 _mm256_broadcastsi128_si256(__m128i __X
)
3073 return (__m256i
)__builtin_shufflevector((__v2di
)__X
, (__v2di
)__X
, 0, 1, 0, 1);
/* Alias that forwards to _mm256_broadcastsi128_si256. */
#define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X)
3078 /// Merges 32-bit integer elements from either of the two 128-bit vectors of
3079 /// [4 x i32] in \a V1 or \a V2 to the result's 128-bit vector of [4 x i32],
3080 /// as specified by the immediate integer operand \a M.
3082 /// \code{.operation}
3086 /// result[31+j:j] := V1[31+j:j]
/// result[31+j:j] := V2[31+j:j]
3093 /// \headerfile <immintrin.h>
3096 /// __m128i _mm_blend_epi32(__m128i V1, __m128i V2, const int M);
/// This intrinsic corresponds to the \c VPBLENDD instruction.
3102 /// A 128-bit vector of [4 x i32] containing source values.
3104 /// A 128-bit vector of [4 x i32] containing source values.
3106 /// An immediate 8-bit integer operand, with bits [3:0] specifying the
3107 /// source for each element of the result. The position of the mask bit
3108 /// corresponds to the index of a copied value. When a mask bit is 0, the
3109 /// element is copied from \a V1; otherwise, it is copied from \a V2.
3110 /// \returns A 128-bit vector of [4 x i32] containing the result.
#define _mm_blend_epi32(V1, V2, M) \
  ((__m128i)__builtin_ia32_pblendd128( \
      (__v4si)(__m128i)(V1), (__v4si)(__m128i)(V2), (int)(M)))
3115 /// Merges 32-bit integer elements from either of the two 256-bit vectors of
3116 /// [8 x i32] in \a V1 or \a V2 to return a 256-bit vector of [8 x i32],
3117 /// as specified by the immediate integer operand \a M.
3119 /// \code{.operation}
3123 /// result[31+j:j] := V1[31+j:j]
/// result[31+j:j] := V2[31+j:j]
3130 /// \headerfile <immintrin.h>
3133 /// __m256i _mm256_blend_epi32(__m256i V1, __m256i V2, const int M);
/// This intrinsic corresponds to the \c VPBLENDD instruction.
3139 /// A 256-bit vector of [8 x i32] containing source values.
3141 /// A 256-bit vector of [8 x i32] containing source values.
3143 /// An immediate 8-bit integer operand, with bits [7:0] specifying the
3144 /// source for each element of the result. The position of the mask bit
3145 /// corresponds to the index of a copied value. When a mask bit is 0, the
/// element is copied from \a V1; otherwise, it is copied from \a V2.
3147 /// \returns A 256-bit vector of [8 x i32] containing the result.
#define _mm256_blend_epi32(V1, V2, M) \
  ((__m256i)__builtin_ia32_pblendd256( \
      (__v8si)(__m256i)(V1), (__v8si)(__m256i)(V2), (int)(M)))
3152 /// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
3153 /// bytes of the 256-bit result.
3155 /// \headerfile <immintrin.h>
3157 /// This intrinsic corresponds to the \c VPBROADCASTB instruction.
3160 /// A 128-bit integer vector whose low byte will be broadcast.
3161 /// \returns A 256-bit integer vector containing the result.
3162 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3163 _mm256_broadcastb_epi8(__m128i __X
)
3165 return (__m256i
)__builtin_shufflevector((__v16qi
)__X
, (__v16qi
)__X
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3168 /// Broadcasts the low element from the 128-bit vector of [8 x i16] in \a __X
3169 /// to all elements of the result's 256-bit vector of [16 x i16].
3171 /// \headerfile <immintrin.h>
3173 /// This intrinsic corresponds to the \c VPBROADCASTW instruction.
3176 /// A 128-bit vector of [8 x i16] whose low element will be broadcast.
3177 /// \returns A 256-bit vector of [16 x i16] containing the result.
3178 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3179 _mm256_broadcastw_epi16(__m128i __X
)
3181 return (__m256i
)__builtin_shufflevector((__v8hi
)__X
, (__v8hi
)__X
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3184 /// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
3185 /// to all elements of the result's 256-bit vector of [8 x i32].
3187 /// \headerfile <immintrin.h>
3189 /// This intrinsic corresponds to the \c VPBROADCASTD instruction.
3192 /// A 128-bit vector of [4 x i32] whose low element will be broadcast.
3193 /// \returns A 256-bit vector of [8 x i32] containing the result.
3194 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3195 _mm256_broadcastd_epi32(__m128i __X
)
3197 return (__m256i
)__builtin_shufflevector((__v4si
)__X
, (__v4si
)__X
, 0, 0, 0, 0, 0, 0, 0, 0);
3200 /// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
3201 /// to all elements of the result's 256-bit vector of [4 x i64].
3203 /// \headerfile <immintrin.h>
3205 /// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
3208 /// A 128-bit vector of [2 x i64] whose low element will be broadcast.
3209 /// \returns A 256-bit vector of [4 x i64] containing the result.
3210 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3211 _mm256_broadcastq_epi64(__m128i __X
)
3213 return (__m256i
)__builtin_shufflevector((__v2di
)__X
, (__v2di
)__X
, 0, 0, 0, 0);
3216 /// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
3217 /// bytes of the 128-bit result.
3219 /// \headerfile <immintrin.h>
3221 /// This intrinsic corresponds to the \c VPBROADCASTB instruction.
3224 /// A 128-bit integer vector whose low byte will be broadcast.
3225 /// \returns A 128-bit integer vector containing the result.
3226 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3227 _mm_broadcastb_epi8(__m128i __X
)
3229 return (__m128i
)__builtin_shufflevector((__v16qi
)__X
, (__v16qi
)__X
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3232 /// Broadcasts the low element from the 128-bit vector of [8 x i16] in
3233 /// \a __X to all elements of the result's 128-bit vector of [8 x i16].
3235 /// \headerfile <immintrin.h>
3237 /// This intrinsic corresponds to the \c VPBROADCASTW instruction.
3240 /// A 128-bit vector of [8 x i16] whose low element will be broadcast.
3241 /// \returns A 128-bit vector of [8 x i16] containing the result.
3242 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3243 _mm_broadcastw_epi16(__m128i __X
)
3245 return (__m128i
)__builtin_shufflevector((__v8hi
)__X
, (__v8hi
)__X
, 0, 0, 0, 0, 0, 0, 0, 0);
3248 /// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
3249 /// to all elements of the result's vector of [4 x i32].
3251 /// \headerfile <immintrin.h>
3253 /// This intrinsic corresponds to the \c VPBROADCASTD instruction.
3256 /// A 128-bit vector of [4 x i32] whose low element will be broadcast.
3257 /// \returns A 128-bit vector of [4 x i32] containing the result.
3258 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3259 _mm_broadcastd_epi32(__m128i __X
)
3261 return (__m128i
)__builtin_shufflevector((__v4si
)__X
, (__v4si
)__X
, 0, 0, 0, 0);
3264 /// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
3265 /// to both elements of the result's 128-bit vector of [2 x i64].
3267 /// \headerfile <immintrin.h>
3269 /// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
3272 /// A 128-bit vector of [2 x i64] whose low element will be broadcast.
3273 /// \returns A 128-bit vector of [2 x i64] containing the result.
3274 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3275 _mm_broadcastq_epi64(__m128i __X
)
3277 return (__m128i
)__builtin_shufflevector((__v2di
)__X
, (__v2di
)__X
, 0, 0);
3280 /// Sets the result's 256-bit vector of [8 x i32] to copies of elements of the
3281 /// 256-bit vector of [8 x i32] in \a __a as specified by indexes in the
3282 /// elements of the 256-bit vector of [8 x i32] in \a __b.
3284 /// \code{.operation}
3287 /// k := __b[j+2:j] * 32
3288 /// result[j+31:j] := __a[k+31:k]
3292 /// \headerfile <immintrin.h>
3294 /// This intrinsic corresponds to the \c VPERMD instruction.
3297 /// A 256-bit vector of [8 x i32] containing the source values.
3299 /// A 256-bit vector of [8 x i32] containing indexes of values to use from
3301 /// \returns A 256-bit vector of [8 x i32] containing the result.
3302 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3303 _mm256_permutevar8x32_epi32(__m256i __a
, __m256i __b
)
3305 return (__m256i
)__builtin_ia32_permvarsi256((__v8si
)__a
, (__v8si
)__b
);
3308 /// Sets the result's 256-bit vector of [4 x double] to copies of elements of
3309 /// the 256-bit vector of [4 x double] in \a V as specified by the
3310 /// immediate value \a M.
3312 /// \code{.operation}
3315 /// k := (M >> i*2)[1:0] * 64
3316 /// result[j+63:j] := V[k+63:k]
3320 /// \headerfile <immintrin.h>
3323 /// __m256d _mm256_permute4x64_pd(__m256d V, const int M);
3326 /// This intrinsic corresponds to the \c VPERMPD instruction.
3329 /// A 256-bit vector of [4 x double] containing the source values.
3331 /// An immediate 8-bit value specifying which elements to copy from \a V.
/// \a M[1:0] specifies the index in \a V for element 0 of the result,
3333 /// \a M[3:2] specifies the index for element 1, and so forth.
3334 /// \returns A 256-bit vector of [4 x double] containing the result.
#define _mm256_permute4x64_pd(V, M) \
  ((__m256d)__builtin_ia32_permdf256( \
      (__v4df)(__m256d)(V), (int)(M)))
3338 /// Sets the result's 256-bit vector of [8 x float] to copies of elements of
3339 /// the 256-bit vector of [8 x float] in \a __a as specified by indexes in
3340 /// the elements of the 256-bit vector of [8 x i32] in \a __b.
3342 /// \code{.operation}
3345 /// k := __b[j+2:j] * 32
3346 /// result[j+31:j] := __a[k+31:k]
3350 /// \headerfile <immintrin.h>
3352 /// This intrinsic corresponds to the \c VPERMPS instruction.
3355 /// A 256-bit vector of [8 x float] containing the source values.
3357 /// A 256-bit vector of [8 x i32] containing indexes of values to use from
3359 /// \returns A 256-bit vector of [8 x float] containing the result.
3360 static __inline__ __m256 __DEFAULT_FN_ATTRS256
3361 _mm256_permutevar8x32_ps(__m256 __a
, __m256i __b
)
3363 return (__m256
)__builtin_ia32_permvarsf256((__v8sf
)__a
, (__v8si
)__b
);
/// Sets the result's 256-bit vector of [4 x i64] to copies of elements
3367 /// of the 256-bit vector of [4 x i64] in \a V as specified by the
3368 /// immediate value \a M.
3370 /// \code{.operation}
3373 /// k := (M >> i*2)[1:0] * 64
3374 /// result[j+63:j] := V[k+63:k]
3378 /// \headerfile <immintrin.h>
3381 /// __m256i _mm256_permute4x64_epi64(__m256i V, const int M);
3384 /// This intrinsic corresponds to the \c VPERMQ instruction.
3387 /// A 256-bit vector of [4 x i64] containing the source values.
3389 /// An immediate 8-bit value specifying which elements to copy from \a V.
/// \a M[1:0] specifies the index in \a V for element 0 of the result,
3391 /// \a M[3:2] specifies the index for element 1, and so forth.
3392 /// \returns A 256-bit vector of [4 x i64] containing the result.
#define _mm256_permute4x64_epi64(V, M) \
  ((__m256i)__builtin_ia32_permdi256( \
      (__v4di)(__m256i)(V), (int)(M)))
3396 /// Sets each half of the 256-bit result either to zero or to one of the
3397 /// four possible 128-bit halves of the 256-bit vectors \a V1 and \a V2,
3398 /// as specified by the immediate value \a M.
3400 /// \code{.operation}
3405 /// CASE (k[1:0]) OF
3406 /// 0: result[127+j:j] := V1[127:0]
3407 /// 1: result[127+j:j] := V1[255:128]
3408 /// 2: result[127+j:j] := V2[127:0]
3409 /// 3: result[127+j:j] := V2[255:128]
3412 /// result[127+j:j] := 0
3417 /// \headerfile <immintrin.h>
3420 /// __m256i _mm256_permute2x128_si256(__m256i V1, __m256i V2, const int M);
3423 /// This intrinsic corresponds to the \c VPERM2I128 instruction.
3426 /// A 256-bit integer vector containing source values.
3428 /// A 256-bit integer vector containing source values.
3430 /// An immediate value specifying how to form the result. Bits [3:0]
3431 /// control the lower half of the result, bits [7:4] control the upper half.
3432 /// Within each 4-bit control value, if bit 3 is 1, the result is zero,
3433 /// otherwise bits [1:0] determine the source as follows. \n
3434 /// 0: the lower half of \a V1 \n
3435 /// 1: the upper half of \a V1 \n
3436 /// 2: the lower half of \a V2 \n
3437 /// 3: the upper half of \a V2
3438 /// \returns A 256-bit integer vector containing the result.
#define _mm256_permute2x128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_permti256( \
      (__m256i)(V1), (__m256i)(V2), (int)(M)))
3442 /// Extracts half of the 256-bit vector \a V to the 128-bit result. If bit 0
/// of the immediate \a M is zero, extracts the lower half of \a V;
3444 /// otherwise, extracts the upper half.
3446 /// \headerfile <immintrin.h>
3449 /// __m128i _mm256_extracti128_si256(__m256i V, const int M);
3452 /// This intrinsic corresponds to the \c VEXTRACTI128 instruction.
3455 /// A 256-bit integer vector containing the source values.
3457 /// An immediate value specifying which half of \a V to extract.
3458 /// \returns A 128-bit integer vector containing the result.
#define _mm256_extracti128_si256(V, M) \
  ((__m128i)__builtin_ia32_extract128i256( \
      (__v4di)(__m256i)(V), (int)(M)))
3462 /// Copies the 256-bit vector \a V1 to the result, then overwrites half of the
3463 /// result with the 128-bit vector \a V2. If bit 0 of the immediate \a M
3464 /// is zero, overwrites the lower half of the result; otherwise,
3465 /// overwrites the upper half.
3467 /// \headerfile <immintrin.h>
3470 /// __m256i _mm256_inserti128_si256(__m256i V1, __m128i V2, const int M);
3473 /// This intrinsic corresponds to the \c VINSERTI128 instruction.
3476 /// A 256-bit integer vector containing a source value.
3478 /// A 128-bit integer vector containing a source value.
3480 /// An immediate value specifying where to put \a V2 in the result.
3481 /// \returns A 256-bit integer vector containing the result.
#define _mm256_inserti128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_insert128i256( \
      (__v4di)(__m256i)(V1), (__v2di)(__m128i)(V2), (int)(M)))
3486 /// Conditionally loads eight 32-bit integer elements from memory \a __X, if
3487 /// the most significant bit of the corresponding element in the mask
3488 /// \a __M is set; otherwise, sets that element of the result to zero.
3489 /// Returns the 256-bit [8 x i32] result.
3491 /// \code{.operation}
3494 /// IF __M[j+31] == 1
3495 /// result[j+31:j] := Load32(__X+(i*4))
3497 /// result[j+31:j] := 0
3502 /// \headerfile <immintrin.h>
3504 /// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3507 /// A pointer to the memory used for loading values.
3509 /// A 256-bit vector of [8 x i32] containing the mask bits.
3510 /// \returns A 256-bit vector of [8 x i32] containing the loaded or zeroed
3512 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3513 _mm256_maskload_epi32(int const *__X
, __m256i __M
)
3515 return (__m256i
)__builtin_ia32_maskloadd256((const __v8si
*)__X
, (__v8si
)__M
);
3518 /// Conditionally loads four 64-bit integer elements from memory \a __X, if
3519 /// the most significant bit of the corresponding element in the mask
3520 /// \a __M is set; otherwise, sets that element of the result to zero.
3521 /// Returns the 256-bit [4 x i64] result.
3523 /// \code{.operation}
3526 /// IF __M[j+63] == 1
3527 /// result[j+63:j] := Load64(__X+(i*8))
3529 /// result[j+63:j] := 0
3534 /// \headerfile <immintrin.h>
3536 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3539 /// A pointer to the memory used for loading values.
3541 /// A 256-bit vector of [4 x i64] containing the mask bits.
3542 /// \returns A 256-bit vector of [4 x i64] containing the loaded or zeroed
3544 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3545 _mm256_maskload_epi64(long long const *__X
, __m256i __M
)
3547 return (__m256i
)__builtin_ia32_maskloadq256((const __v4di
*)__X
, (__v4di
)__M
);
3550 /// Conditionally loads four 32-bit integer elements from memory \a __X, if
3551 /// the most significant bit of the corresponding element in the mask
3552 /// \a __M is set; otherwise, sets that element of the result to zero.
3553 /// Returns the 128-bit [4 x i32] result.
3555 /// \code{.operation}
3558 /// IF __M[j+31] == 1
3559 /// result[j+31:j] := Load32(__X+(i*4))
3561 /// result[j+31:j] := 0
3566 /// \headerfile <immintrin.h>
3568 /// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3571 /// A pointer to the memory used for loading values.
3573 /// A 128-bit vector of [4 x i32] containing the mask bits.
3574 /// \returns A 128-bit vector of [4 x i32] containing the loaded or zeroed
3576 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3577 _mm_maskload_epi32(int const *__X
, __m128i __M
)
3579 return (__m128i
)__builtin_ia32_maskloadd((const __v4si
*)__X
, (__v4si
)__M
);
3582 /// Conditionally loads two 64-bit integer elements from memory \a __X, if
3583 /// the most significant bit of the corresponding element in the mask
3584 /// \a __M is set; otherwise, sets that element of the result to zero.
3585 /// Returns the 128-bit [2 x i64] result.
3587 /// \code{.operation}
3590 /// IF __M[j+63] == 1
3591 /// result[j+63:j] := Load64(__X+(i*8))
3593 /// result[j+63:j] := 0
3598 /// \headerfile <immintrin.h>
3600 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3603 /// A pointer to the memory used for loading values.
3605 /// A 128-bit vector of [2 x i64] containing the mask bits.
3606 /// \returns A 128-bit vector of [2 x i64] containing the loaded or zeroed
3608 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3609 _mm_maskload_epi64(long long const *__X
, __m128i __M
)
3611 return (__m128i
)__builtin_ia32_maskloadq((const __v2di
*)__X
, (__v2di
)__M
);
3614 /// Conditionally stores eight 32-bit integer elements from the 256-bit vector
3615 /// of [8 x i32] in \a __Y to memory \a __X, if the most significant bit of
3616 /// the corresponding element in the mask \a __M is set; otherwise, the
3617 /// memory element is unchanged.
3619 /// \code{.operation}
3622 /// IF __M[j+31] == 1
3623 /// Store32(__X+(i*4), __Y[j+31:j])
3628 /// \headerfile <immintrin.h>
3630 /// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3633 /// A pointer to the memory used for storing values.
3635 /// A 256-bit vector of [8 x i32] containing the mask bits.
3637 /// A 256-bit vector of [8 x i32] containing the values to store.
3638 static __inline__
void __DEFAULT_FN_ATTRS256
3639 _mm256_maskstore_epi32(int *__X
, __m256i __M
, __m256i __Y
)
3641 __builtin_ia32_maskstored256((__v8si
*)__X
, (__v8si
)__M
, (__v8si
)__Y
);
3644 /// Conditionally stores four 64-bit integer elements from the 256-bit vector
3645 /// of [4 x i64] in \a __Y to memory \a __X, if the most significant bit of
3646 /// the corresponding element in the mask \a __M is set; otherwise, the
3647 /// memory element is unchanged.
3649 /// \code{.operation}
3652 /// IF __M[j+63] == 1
3653 /// Store64(__X+(i*8), __Y[j+63:j])
3658 /// \headerfile <immintrin.h>
3660 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3663 /// A pointer to the memory used for storing values.
3665 /// A 256-bit vector of [4 x i64] containing the mask bits.
3667 /// A 256-bit vector of [4 x i64] containing the values to store.
3668 static __inline__
void __DEFAULT_FN_ATTRS256
3669 _mm256_maskstore_epi64(long long *__X
, __m256i __M
, __m256i __Y
)
3671 __builtin_ia32_maskstoreq256((__v4di
*)__X
, (__v4di
)__M
, (__v4di
)__Y
);
3674 /// Conditionally stores four 32-bit integer elements from the 128-bit vector
3675 /// of [4 x i32] in \a __Y to memory \a __X, if the most significant bit of
3676 /// the corresponding element in the mask \a __M is set; otherwise, the
3677 /// memory element is unchanged.
3679 /// \code{.operation}
3682 /// IF __M[j+31] == 1
3683 /// Store32(__X+(i*4), __Y[j+31:j])
3688 /// \headerfile <immintrin.h>
3690 /// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3693 /// A pointer to the memory used for storing values.
3695 /// A 128-bit vector of [4 x i32] containing the mask bits.
3697 /// A 128-bit vector of [4 x i32] containing the values to store.
3698 static __inline__
void __DEFAULT_FN_ATTRS128
3699 _mm_maskstore_epi32(int *__X
, __m128i __M
, __m128i __Y
)
3701 __builtin_ia32_maskstored((__v4si
*)__X
, (__v4si
)__M
, (__v4si
)__Y
);
3704 /// Conditionally stores two 64-bit integer elements from the 128-bit vector
3705 /// of [2 x i64] in \a __Y to memory \a __X, if the most significant bit of
3706 /// the corresponding element in the mask \a __M is set; otherwise, the
3707 /// memory element is unchanged.
3709 /// \code{.operation}
3712 /// IF __M[j+63] == 1
3713 /// Store64(__X+(i*8), __Y[j+63:j])
3718 /// \headerfile <immintrin.h>
3720 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3723 /// A pointer to the memory used for storing values.
3725 /// A 128-bit vector of [2 x i64] containing the mask bits.
3727 /// A 128-bit vector of [2 x i64] containing the values to store.
3728 static __inline__
void __DEFAULT_FN_ATTRS128
3729 _mm_maskstore_epi64(long long *__X
, __m128i __M
, __m128i __Y
)
3731 __builtin_ia32_maskstoreq(( __v2di
*)__X
, (__v2di
)__M
, (__v2di
)__Y
);
3734 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3735 /// left by the number of bits given in the corresponding element of the
3736 /// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
3737 /// returns the result. If the shift count for any element is greater than
3738 /// 31, the result for that element is zero.
3740 /// \headerfile <immintrin.h>
3742 /// This intrinsic corresponds to the \c VPSLLVD instruction.
3745 /// A 256-bit vector of [8 x i32] to be shifted.
3747 /// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in bits).
3749 /// \returns A 256-bit vector of [8 x i32] containing the result.
3750 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3751 _mm256_sllv_epi32(__m256i __X
, __m256i __Y
)
3753 return (__m256i
)__builtin_ia32_psllv8si((__v8si
)__X
, (__v8si
)__Y
);
3756 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3757 /// left by the number of bits given in the corresponding element of the
3758 /// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
3759 /// returns the result. If the shift count for any element is greater than
3760 /// 31, the result for that element is zero.
3762 /// \headerfile <immintrin.h>
3764 /// This intrinsic corresponds to the \c VPSLLVD instruction.
3767 /// A 128-bit vector of [4 x i32] to be shifted.
3769 /// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in bits).
3771 /// \returns A 128-bit vector of [4 x i32] containing the result.
3772 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3773 _mm_sllv_epi32(__m128i __X
, __m128i __Y
)
3775 return (__m128i
)__builtin_ia32_psllv4si((__v4si
)__X
, (__v4si
)__Y
);
3778 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
3779 /// left by the number of bits given in the corresponding element of the
3780 /// 256-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
3781 /// returns the result. If the shift count for any element is greater than
3782 /// 63, the result for that element is zero.
3784 /// \headerfile <immintrin.h>
3786 /// This intrinsic corresponds to the \c VPSLLVQ instruction.
3789 /// A 256-bit vector of [4 x i64] to be shifted.
3791 /// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in bits).
3793 /// \returns A 256-bit vector of [4 x i64] containing the result.
3794 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3795 _mm256_sllv_epi64(__m256i __X
, __m256i __Y
)
3797 return (__m256i
)__builtin_ia32_psllv4di((__v4di
)__X
, (__v4di
)__Y
);
3800 /// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3801 /// left by the number of bits given in the corresponding element of the
3802 /// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3803 /// returns the result. If the shift count for any element is greater than
3804 /// 63, the result for that element is zero.
3806 /// \headerfile <immintrin.h>
3808 /// This intrinsic corresponds to the \c VPSLLVQ instruction.
3811 /// A 128-bit vector of [2 x i64] to be shifted.
3813 /// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in bits).
3815 /// \returns A 128-bit vector of [2 x i64] containing the result.
3816 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3817 _mm_sllv_epi64(__m128i __X
, __m128i __Y
)
3819 return (__m128i
)__builtin_ia32_psllv2di((__v2di
)__X
, (__v2di
)__Y
);
3822 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3823 /// right by the number of bits given in the corresponding element of the
3824 /// 256-bit vector of [8 x i32] in \a __Y, shifting in sign bits, and
3825 /// returns the result. If the shift count for any element is greater than
3826 /// 31, the result for that element is 0 or -1 according to the sign bit
3827 /// for that element.
3829 /// \headerfile <immintrin.h>
3831 /// This intrinsic corresponds to the \c VPSRAVD instruction.
3834 /// A 256-bit vector of [8 x i32] to be shifted.
3836 /// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in bits).
3838 /// \returns A 256-bit vector of [8 x i32] containing the result.
3839 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3840 _mm256_srav_epi32(__m256i __X
, __m256i __Y
)
3842 return (__m256i
)__builtin_ia32_psrav8si((__v8si
)__X
, (__v8si
)__Y
);
3845 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3846 /// right by the number of bits given in the corresponding element of the
3847 /// 128-bit vector of [4 x i32] in \a __Y, shifting in sign bits, and
3848 /// returns the result. If the shift count for any element is greater than
3849 /// 31, the result for that element is 0 or -1 according to the sign bit
3850 /// for that element.
3852 /// \headerfile <immintrin.h>
3854 /// This intrinsic corresponds to the \c VPSRAVD instruction.
3857 /// A 128-bit vector of [4 x i32] to be shifted.
3859 /// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in bits).
3861 /// \returns A 128-bit vector of [4 x i32] containing the result.
3862 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3863 _mm_srav_epi32(__m128i __X
, __m128i __Y
)
3865 return (__m128i
)__builtin_ia32_psrav4si((__v4si
)__X
, (__v4si
)__Y
);
3868 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3869 /// right by the number of bits given in the corresponding element of the
3870 /// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
3871 /// returns the result. If the shift count for any element is greater than
3872 /// 31, the result for that element is zero.
3874 /// \headerfile <immintrin.h>
3876 /// This intrinsic corresponds to the \c VPSRLVD instruction.
3879 /// A 256-bit vector of [8 x i32] to be shifted.
3881 /// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in bits).
3883 /// \returns A 256-bit vector of [8 x i32] containing the result.
3884 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3885 _mm256_srlv_epi32(__m256i __X
, __m256i __Y
)
3887 return (__m256i
)__builtin_ia32_psrlv8si((__v8si
)__X
, (__v8si
)__Y
);
3890 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3891 /// right by the number of bits given in the corresponding element of the
3892 /// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
3893 /// returns the result. If the shift count for any element is greater than
3894 /// 31, the result for that element is zero.
3896 /// \headerfile <immintrin.h>
3898 /// This intrinsic corresponds to the \c VPSRLVD instruction.
3901 /// A 128-bit vector of [4 x i32] to be shifted.
3903 /// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in bits).
3905 /// \returns A 128-bit vector of [4 x i32] containing the result.
3906 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3907 _mm_srlv_epi32(__m128i __X
, __m128i __Y
)
3909 return (__m128i
)__builtin_ia32_psrlv4si((__v4si
)__X
, (__v4si
)__Y
);
3912 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
3913 /// right by the number of bits given in the corresponding element of the
3914 /// 256-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
3915 /// returns the result. If the shift count for any element is greater than
3916 /// 63, the result for that element is zero.
3918 /// \headerfile <immintrin.h>
3920 /// This intrinsic corresponds to the \c VPSRLVQ instruction.
3923 /// A 256-bit vector of [4 x i64] to be shifted.
3925 /// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in bits).
3927 /// \returns A 256-bit vector of [4 x i64] containing the result.
3928 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3929 _mm256_srlv_epi64(__m256i __X
, __m256i __Y
)
3931 return (__m256i
)__builtin_ia32_psrlv4di((__v4di
)__X
, (__v4di
)__Y
);
3934 /// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3935 /// right by the number of bits given in the corresponding element of the
3936 /// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3937 /// returns the result. If the shift count for any element is greater than
3938 /// 63, the result for that element is zero.
3940 /// \headerfile <immintrin.h>
3942 /// This intrinsic corresponds to the \c VPSRLVQ instruction.
3945 /// A 128-bit vector of [2 x i64] to be shifted.
3947 /// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in bits).
3949 /// \returns A 128-bit vector of [2 x i64] containing the result.
3950 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3951 _mm_srlv_epi64(__m128i __X
, __m128i __Y
)
3953 return (__m128i
)__builtin_ia32_psrlv2di((__v2di
)__X
, (__v2di
)__Y
);
3956 /// Conditionally gathers two 64-bit floating-point values, either from the
3957 /// 128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
3958 /// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
3959 /// of [2 x double] in \a mask determines the source for each element.
3961 /// \code{.operation}
3962 /// FOR element := 0 to 1
3965 /// IF mask[j+63] == 0
3966 /// result[j+63:j] := a[j+63:j]
3968 /// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
3973 /// \headerfile <immintrin.h>
3976 /// __m128d _mm_mask_i32gather_pd(__m128d a, const double *m, __m128i i,
3977 /// __m128d mask, const int s);
3980 /// This intrinsic corresponds to the \c VGATHERDPD instruction.
3983 /// A 128-bit vector of [2 x double] used as the source when a mask bit is zero.
3986 /// A pointer to the memory used for loading values.
3988 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
3989 /// the first two elements are used.
3991 /// A 128-bit vector of [2 x double] containing the mask. The most
3992 /// significant bit of each element in the mask vector represents the mask
3993 /// bits. If a mask bit is zero, the corresponding value from vector \a a
3994 /// is gathered; otherwise the value is loaded from memory.
3996 /// A literal constant scale factor for the indexes in \a i. Must be 1, 2, 4, or 8.
3998 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
/* Kept as a macro so the literal constant scale s is passed straight through
   to the builtin. The pass-through source a is a [2 x double] vector, so it
   is cast via __m128d, the same way mask is and the same way the other
   *_pd gather macros treat their source operand. Any 128-bit vector argument
   is still accepted, since the cast is a pure reinterpretation. */
#define _mm_mask_i32gather_pd(a, m, i, mask, s)                              \
  ((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128d)(a),                  \
                                      (double const *)(m),                   \
                                      (__v4si)(__m128i)(i),                  \
                                      (__v2df)(__m128d)(mask), (s)))
4005 /// Conditionally gathers four 64-bit floating-point values, either from the
4006 /// 256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
4007 /// indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
4008 /// of [4 x double] in \a mask determines the source for each element.
4010 /// \code{.operation}
4011 /// FOR element := 0 to 3
4014 /// IF mask[j+63] == 0
4015 /// result[j+63:j] := a[j+63:j]
4017 /// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4022 /// \headerfile <immintrin.h>
4025 /// __m256d _mm256_mask_i32gather_pd(__m256d a, const double *m, __m128i i,
4026 /// __m256d mask, const int s);
4029 /// This intrinsic corresponds to the \c VGATHERDPD instruction.
4032 /// A 256-bit vector of [4 x double] used as the source when a mask bit is zero.
4035 /// A pointer to the memory used for loading values.
4037 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4039 /// A 256-bit vector of [4 x double] containing the mask. The most
4040 /// significant bit of each element in the mask vector represents the mask
4041 /// bits. If a mask bit is zero, the corresponding value from vector \a a
4042 /// is gathered; otherwise the value is loaded from memory.
4044 /// A literal constant scale factor for the indexes in \a i. Must be 1, 2, 4, or 8.
4046 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
/* Macro form keeps the scale s a literal constant at the builtin call.
   a and mask are reinterpreted as [4 x double], i as [4 x i32], and m is
   read through a pointer to const double. */
#define _mm256_mask_i32gather_pd(a, m, i, mask, s)                           \
  ((__m256d)__builtin_ia32_gatherd_pd256(                                    \
      (__v4df)(__m256d)(a), (const double *)(m), (__v4si)(__m128i)(i),       \
      (__v4df)(__m256d)(mask), (s)))
4053 /// Conditionally gathers two 64-bit floating-point values, either from the
4054 /// 128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
4055 /// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4056 /// of [2 x double] in \a mask determines the source for each element.
4058 /// \code{.operation}
4059 /// FOR element := 0 to 1
4062 /// IF mask[j+63] == 0
4063 /// result[j+63:j] := a[j+63:j]
4065 /// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4070 /// \headerfile <immintrin.h>
4073 /// __m128d _mm_mask_i64gather_pd(__m128d a, const double *m, __m128i i,
4074 /// __m128d mask, const int s);
4077 /// This intrinsic corresponds to the \c VGATHERQPD instruction.
4080 /// A 128-bit vector of [2 x double] used as the source when a mask bit is zero.
4083 /// A pointer to the memory used for loading values.
4085 /// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4087 /// A 128-bit vector of [2 x double] containing the mask. The most
4088 /// significant bit of each element in the mask vector represents the mask
4089 /// bits. If a mask bit is zero, the corresponding value from vector \a a
4090 /// is gathered; otherwise the value is loaded from memory.
4092 /// A literal constant scale factor for the indexes in \a i. Must be 1, 2, 4, or 8.
4094 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
/* Macro form keeps the scale s a literal constant at the builtin call.
   a and mask are reinterpreted as [2 x double], i as [2 x i64], and m is
   read through a pointer to const double. */
#define _mm_mask_i64gather_pd(a, m, i, mask, s)                              \
  ((__m128d)__builtin_ia32_gatherq_pd(                                       \
      (__v2df)(__m128d)(a), (const double *)(m), (__v2di)(__m128i)(i),       \
      (__v2df)(__m128d)(mask), (s)))
4101 /// Conditionally gathers four 64-bit floating-point values, either from the
4102 /// 256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
4103 /// indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
4104 /// of [4 x double] in \a mask determines the source for each element.
4106 /// \code{.operation}
4107 /// FOR element := 0 to 3
4110 /// IF mask[j+63] == 0
4111 /// result[j+63:j] := a[j+63:j]
4113 /// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4118 /// \headerfile <immintrin.h>
4121 /// __m256d _mm256_mask_i64gather_pd(__m256d a, const double *m, __m256i i,
4122 /// __m256d mask, const int s);
4125 /// This intrinsic corresponds to the \c VGATHERQPD instruction.
4128 /// A 256-bit vector of [4 x double] used as the source when a mask bit is zero.
4131 /// A pointer to the memory used for loading values.
4133 /// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4135 /// A 256-bit vector of [4 x double] containing the mask. The most
4136 /// significant bit of each element in the mask vector represents the mask
4137 /// bits. If a mask bit is zero, the corresponding value from vector \a a
4138 /// is gathered; otherwise the value is loaded from memory.
4140 /// A literal constant scale factor for the indexes in \a i. Must be 1, 2, 4, or 8.
4142 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
/* Macro form keeps the scale s a literal constant at the builtin call.
   a and mask are reinterpreted as [4 x double], i as [4 x i64], and m is
   read through a pointer to const double. */
#define _mm256_mask_i64gather_pd(a, m, i, mask, s)                           \
  ((__m256d)__builtin_ia32_gatherq_pd256(                                    \
      (__v4df)(__m256d)(a), (const double *)(m), (__v4di)(__m256i)(i),       \
      (__v4df)(__m256d)(mask), (s)))
4149 /// Conditionally gathers four 32-bit floating-point values, either from the
4150 /// 128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4151 /// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4152 /// of [4 x float] in \a mask determines the source for each element.
4154 /// \code{.operation}
4155 /// FOR element := 0 to 3
4158 /// IF mask[j+31] == 0
4159 /// result[j+31:j] := a[j+31:j]
4161 /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4166 /// \headerfile <immintrin.h>
4169 /// __m128 _mm_mask_i32gather_ps(__m128 a, const float *m, __m128i i,
4170 /// __m128 mask, const int s);
4173 /// This intrinsic corresponds to the \c VGATHERDPS instruction.
4176 /// A 128-bit vector of [4 x float] used as the source when a mask bit is zero.
4179 /// A pointer to the memory used for loading values.
4181 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4183 /// A 128-bit vector of [4 x float] containing the mask. The most
4184 /// significant bit of each element in the mask vector represents the mask
4185 /// bits. If a mask bit is zero, the corresponding value from vector \a a
4186 /// is gathered; otherwise the value is loaded from memory.
4188 /// A literal constant scale factor for the indexes in \a i. Must be 1, 2, 4, or 8.
4190 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* Macro form keeps the scale s a literal constant at the builtin call.
   a and mask are reinterpreted as [4 x float], i as [4 x i32], and m is
   read through a pointer to const float. */
#define _mm_mask_i32gather_ps(a, m, i, mask, s)                              \
  ((__m128)__builtin_ia32_gatherd_ps(                                        \
      (__v4sf)(__m128)(a), (const float *)(m), (__v4si)(__m128i)(i),         \
      (__v4sf)(__m128)(mask), (s)))
4197 /// Conditionally gathers eight 32-bit floating-point values, either from the
4198 /// 256-bit vector of [8 x float] in \a a, or from memory \a m using scaled
4199 /// indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
4200 /// of [8 x float] in \a mask determines the source for each element.
4202 /// \code{.operation}
4203 /// FOR element := 0 to 7
4206 /// IF mask[j+31] == 0
4207 /// result[j+31:j] := a[j+31:j]
4209 /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4214 /// \headerfile <immintrin.h>
4217 /// __m256 _mm256_mask_i32gather_ps(__m256 a, const float *m, __m256i i,
4218 /// __m256 mask, const int s);
4221 /// This intrinsic corresponds to the \c VGATHERDPS instruction.
4224 /// A 256-bit vector of [8 x float] used as the source when a mask bit is zero.
4227 /// A pointer to the memory used for loading values.
4229 /// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4231 /// A 256-bit vector of [8 x float] containing the mask. The most
4232 /// significant bit of each element in the mask vector represents the mask
4233 /// bits. If a mask bit is zero, the corresponding value from vector \a a
4234 /// is gathered; otherwise the value is loaded from memory.
4236 /// A literal constant scale factor for the indexes in \a i. Must be 1, 2, 4, or 8.
4238 /// \returns A 256-bit vector of [8 x float] containing the gathered values.
/* Macro form keeps the scale s a literal constant at the builtin call.
   a and mask are reinterpreted as [8 x float], i as [8 x i32], and m is
   read through a pointer to const float. */
#define _mm256_mask_i32gather_ps(a, m, i, mask, s)                           \
  ((__m256)__builtin_ia32_gatherd_ps256(                                     \
      (__v8sf)(__m256)(a), (const float *)(m), (__v8si)(__m256i)(i),         \
      (__v8sf)(__m256)(mask), (s)))
4245 /// Conditionally gathers two 32-bit floating-point values, either from the
4246 /// 128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4247 /// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4248 /// of [4 x float] in \a mask determines the source for the lower two
4249 /// elements. The upper two elements of the result are zeroed.
4251 /// \code{.operation}
4252 /// FOR element := 0 to 1
4255 /// IF mask[j+31] == 0
4256 /// result[j+31:j] := a[j+31:j]
4258 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4261 /// result[127:64] := 0
4264 /// \headerfile <immintrin.h>
4267 /// __m128 _mm_mask_i64gather_ps(__m128 a, const float *m, __m128i i,
4268 /// __m128 mask, const int s);
4271 /// This intrinsic corresponds to the \c VGATHERQPS instruction.
4274 /// A 128-bit vector of [4 x float] used as the source when a mask bit is
4275 /// zero. Only the first two elements are used.
4277 /// A pointer to the memory used for loading values.
4279 /// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4281 /// A 128-bit vector of [4 x float] containing the mask. The most
4282 /// significant bit of each element in the mask vector represents the mask
4283 /// bits. If a mask bit is zero, the corresponding value from vector \a a
4284 /// is gathered; otherwise the value is loaded from memory. Only the first
4285 /// two elements are used.
4287 /// A literal constant scale factor for the indexes in \a i. Must be 1, 2, 4, or 8.
4289 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* Macro form keeps the scale s a literal constant at the builtin call.
   a and mask are reinterpreted as [4 x float], i as [2 x i64], and m is
   read through a pointer to const float. */
#define _mm_mask_i64gather_ps(a, m, i, mask, s)                              \
  ((__m128)__builtin_ia32_gatherq_ps(                                        \
      (__v4sf)(__m128)(a), (const float *)(m), (__v2di)(__m128i)(i),         \
      (__v4sf)(__m128)(mask), (s)))
4296 /// Conditionally gathers four 32-bit floating-point values, either from the
4297 /// 128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4298 /// indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
4299 /// of [4 x float] in \a mask determines the source for each element.
4301 /// \code{.operation}
4302 /// FOR element := 0 to 3
4305 /// IF mask[j+31] == 0
4306 /// result[j+31:j] := a[j+31:j]
4308 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4313 /// \headerfile <immintrin.h>
4316 /// __m128 _mm256_mask_i64gather_ps(__m128 a, const float *m, __m256i i,
4317 /// __m128 mask, const int s);
4320 /// This intrinsic corresponds to the \c VGATHERQPS instruction.
4323 /// A 128-bit vector of [4 x float] used as the source when a mask bit is zero.
4326 /// A pointer to the memory used for loading values.
4328 /// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4330 /// A 128-bit vector of [4 x float] containing the mask. The most
4331 /// significant bit of each element in the mask vector represents the mask
4332 /// bits. If a mask bit is zero, the corresponding value from vector \a a
4333 /// is gathered; otherwise the value is loaded from memory.
4335 /// A literal constant scale factor for the indexes in \a i. Must be 1, 2, 4, or 8.
4337 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* Macro form keeps the scale s a literal constant at the builtin call.
   The result, a and mask are 128-bit [4 x float] vectors, while the index
   operand i is a 256-bit [4 x i64] vector; m is read through a pointer to
   const float. */
#define _mm256_mask_i64gather_ps(a, m, i, mask, s)                           \
  ((__m128)__builtin_ia32_gatherq_ps256(                                     \
      (__v4sf)(__m128)(a), (const float *)(m), (__v4di)(__m256i)(i),         \
      (__v4sf)(__m128)(mask), (s)))
4344 /// Conditionally gathers four 32-bit integer values, either from the
4345 /// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4346 /// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4347 /// of [4 x i32] in \a mask determines the source for each element.
4349 /// \code{.operation}
4350 /// FOR element := 0 to 3
4353 /// IF mask[j+31] == 0
4354 /// result[j+31:j] := a[j+31:j]
4356 /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4361 /// \headerfile <immintrin.h>
4364 /// __m128i _mm_mask_i32gather_epi32(__m128i a, const int *m, __m128i i,
4365 /// __m128i mask, const int s);
4368 /// This intrinsic corresponds to the \c VPGATHERDD instruction.
4371 /// A 128-bit vector of [4 x i32] used as the source when a mask bit is zero.
4374 /// A pointer to the memory used for loading values.
4376 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4378 /// A 128-bit vector of [4 x i32] containing the mask. The most significant
4379 /// bit of each element in the mask vector represents the mask bits. If a
4380 /// mask bit is zero, the corresponding value from vector \a a is gathered;
4381 /// otherwise the value is loaded from memory.
4383 /// A literal constant scale factor for the indexes in \a i. Must be 1, 2, 4, or 8.
4385 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* Kept as a macro so the literal constant scale s is passed straight through
   to the builtin. The base pointer m is forwarded as a pointer to const int,
   matching the documented prototype; without it the macro parameter m would
   be unused and the builtin would be short one argument. a, i and mask are
   all reinterpreted as [4 x i32]. */
#define _mm_mask_i32gather_epi32(a, m, i, mask, s)                           \
  ((__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a),                   \
                                     (int const *)(m),                       \
                                     (__v4si)(__m128i)(i),                   \
                                     (__v4si)(__m128i)(mask), (s)))
4392 /// Conditionally gathers eight 32-bit integer values, either from the
4393 /// 256-bit vector of [8 x i32] in \a a, or from memory \a m using scaled
4394 /// indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
4395 /// of [8 x i32] in \a mask determines the source for each element.
4397 /// \code{.operation}
4398 /// FOR element := 0 to 7
4401 /// IF mask[j+31] == 0
4402 /// result[j+31:j] := a[j+31:j]
4404 /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4409 /// \headerfile <immintrin.h>
4412 /// __m256i _mm256_mask_i32gather_epi32(__m256i a, const int *m, __m256i i,
4413 /// __m256i mask, const int s);
4416 /// This intrinsic corresponds to the \c VPGATHERDD instruction.
4419 /// A 256-bit vector of [8 x i32] used as the source when a mask bit is zero.
4422 /// A pointer to the memory used for loading values.
4424 /// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4426 /// A 256-bit vector of [8 x i32] containing the mask. The most significant
4427 /// bit of each element in the mask vector represents the mask bits. If a
4428 /// mask bit is zero, the corresponding value from vector \a a is gathered;
4429 /// otherwise the value is loaded from memory.
4431 /// A literal constant scale factor for the indexes in \a i. Must be 1, 2, 4, or 8.
4433 /// \returns A 256-bit vector of [8 x i32] containing the gathered values.
/* Kept as a macro so the literal constant scale s is passed straight through
   to the builtin. The base pointer m is forwarded as a pointer to const int,
   matching the documented prototype; without it the macro parameter m would
   be unused and the builtin would be short one argument. a, i and mask are
   all reinterpreted as [8 x i32]. */
#define _mm256_mask_i32gather_epi32(a, m, i, mask, s)                        \
  ((__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a),                \
                                        (int const *)(m),                    \
                                        (__v8si)(__m256i)(i),                \
                                        (__v8si)(__m256i)(mask), (s)))
4440 /// Conditionally gathers two 32-bit integer values, either from the
4441 /// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4442 /// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4443 /// of [4 x i32] in \a mask determines the source for the lower two
4444 /// elements. The upper two elements of the result are zeroed.
4446 /// \code{.operation}
4447 /// FOR element := 0 to 1
4450 /// IF mask[j+31] == 0
4451 /// result[j+31:j] := a[j+31:j]
4453 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4456 /// result[127:64] := 0
4459 /// \headerfile <immintrin.h>
4462 /// __m128i _mm_mask_i64gather_epi32(__m128i a, const int *m, __m128i i,
4463 /// __m128i mask, const int s);
4466 /// This intrinsic corresponds to the \c VPGATHERQD instruction.
4469 /// A 128-bit vector of [4 x i32] used as the source when a mask bit is
4470 /// zero. Only the first two elements are used.
4472 /// A pointer to the memory used for loading values.
4474 /// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4476 /// A 128-bit vector of [4 x i32] containing the mask. The most significant
4477 /// bit of each element in the mask vector represents the mask bits. If a
4478 /// mask bit is zero, the corresponding value from vector \a a is gathered;
4479 /// otherwise the value is loaded from memory. Only the first two elements are used.
4482 /// A literal constant scale factor for the indexes in \a i. Must be 1, 2, 4, or 8.
4484 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* Kept as a macro so the literal constant scale s is passed straight through
   to the builtin. The base pointer m is forwarded as a pointer to const int,
   matching the documented prototype; without it the macro parameter m would
   be unused and the builtin would be short one argument. a and mask are
   reinterpreted as [4 x i32], i as [2 x i64]. */
#define _mm_mask_i64gather_epi32(a, m, i, mask, s)                           \
  ((__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a),                   \
                                     (int const *)(m),                       \
                                     (__v2di)(__m128i)(i),                   \
                                     (__v4si)(__m128i)(mask), (s)))
4491 /// Conditionally gathers four 32-bit integer values, either from the
4492 /// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4493 /// indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
4494 /// of [4 x i32] in \a mask determines the source for each element.
4496 /// \code{.operation}
4497 /// FOR element := 0 to 3
4500 /// IF mask[j+31] == 0
4501 /// result[j+31:j] := a[j+31:j]
4503 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4508 /// \headerfile <immintrin.h>
4511 /// __m128i _mm256_mask_i64gather_epi32(__m128i a, const int *m, __m256i i,
4512 /// __m128i mask, const int s);
4515 /// This intrinsic corresponds to the \c VPGATHERQD instruction.
4518 /// A 128-bit vector of [4 x i32] used as the source when a mask bit is zero.
4521 /// A pointer to the memory used for loading values.
4523 /// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4525 /// A 128-bit vector of [4 x i32] containing the mask. The most significant
4526 /// bit of each element in the mask vector represents the mask bits. If a
4527 /// mask bit is zero, the corresponding value from vector \a a is gathered;
4528 /// otherwise the value is loaded from memory.
4530 /// A literal constant scale factor for the indexes in \a i. Must be 1, 2, 4, or 8.
4532 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* Kept as a macro so the literal constant scale s is passed straight through
   to the builtin. The base pointer m is forwarded as a pointer to const int,
   matching the documented prototype; without it the macro parameter m would
   be unused and the builtin would be short one argument. The result, a and
   mask are 128-bit [4 x i32] vectors; the index operand i is a 256-bit
   [4 x i64] vector. */
#define _mm256_mask_i64gather_epi32(a, m, i, mask, s)                        \
  ((__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a),                \
                                        (int const *)(m),                    \
                                        (__v4di)(__m256i)(i),                \
                                        (__v4si)(__m128i)(mask), (s)))
4539 /// Conditionally gathers two 64-bit integer values, either from the
4540 /// 128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
4541 /// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4542 /// of [2 x i64] in \a mask determines the source for each element.
4544 /// \code{.operation}
4545 /// FOR element := 0 to 1
4548 /// IF mask[j+63] == 0
4549 /// result[j+63:j] := a[j+63:j]
4551 /// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4556 /// \headerfile <immintrin.h>
4559 /// __m128i _mm_mask_i32gather_epi64(__m128i a, const long long *m, __m128i i,
4560 /// __m128i mask, const int s);
4563 /// This intrinsic corresponds to the \c VPGATHERDQ instruction.
4566 /// A 128-bit vector of [2 x i64] used as the source when a mask bit is zero.
4569 /// A pointer to the memory used for loading values.
4571 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
4572 /// the first two elements are used.
4574 /// A 128-bit vector of [2 x i64] containing the mask. The most significant
4575 /// bit of each element in the mask vector represents the mask bits. If a
4576 /// mask bit is zero, the corresponding value from vector \a a is gathered;
4577 /// otherwise the value is loaded from memory.
4579 /// A literal constant scale factor for the indexes in \a i. Must be 1, 2, 4, or 8.
4581 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
/* Macro form keeps the scale s a literal constant at the builtin call.
   a and mask are reinterpreted as [2 x i64], i as [4 x i32], and m is
   read through a pointer to const long long. */
#define _mm_mask_i32gather_epi64(a, m, i, mask, s)                           \
  ((__m128i)__builtin_ia32_gatherd_q(                                        \
      (__v2di)(__m128i)(a), (const long long *)(m), (__v4si)(__m128i)(i),    \
      (__v2di)(__m128i)(mask), (s)))
4588 /// Conditionally gathers four 64-bit integer values, either from the
4589 /// 256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
4590 /// indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
4591 /// of [4 x i64] in \a mask determines the source for each element.
4593 /// \code{.operation}
4594 /// FOR element := 0 to 3
4597 /// IF mask[j+63] == 0
4598 /// result[j+63:j] := a[j+63:j]
4600 /// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4605 /// \headerfile <immintrin.h>
4608 /// __m256i _mm256_mask_i32gather_epi64(__m256i a, const long long *m,
4609 /// __m128i i, __m256i mask, const int s);
4612 /// This intrinsic corresponds to the \c VPGATHERDQ instruction.
4615 /// A 256-bit vector of [4 x i64] used as the source when a mask bit is zero.
4618 /// A pointer to the memory used for loading values.
4620 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4622 /// A 256-bit vector of [4 x i64] containing the mask. The most significant
4623 /// bit of each element in the mask vector represents the mask bits. If a
4624 /// mask bit is zero, the corresponding value from vector \a a is gathered;
4625 /// otherwise the value is loaded from memory.
4627 /// A literal constant scale factor for the indexes in \a i. Must be 1, 2, 4, or 8.
4629 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
/* Macro form keeps the scale s a literal constant at the builtin call.
   a and mask are reinterpreted as 256-bit [4 x i64] vectors, i as a 128-bit
   [4 x i32] vector, and m is read through a pointer to const long long. */
#define _mm256_mask_i32gather_epi64(a, m, i, mask, s)                        \
  ((__m256i)__builtin_ia32_gatherd_q256(                                     \
      (__v4di)(__m256i)(a), (const long long *)(m), (__v4si)(__m128i)(i),    \
      (__v4di)(__m256i)(mask), (s)))
/// Conditionally gathers two 64-bit integer values, either from the
///    128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
///    of [2 x i64] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i64gather_epi64(__m128i a, const long long *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
///
/// \param a
///    A 128-bit vector of [2 x i64] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [2 x i64] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits. If a
///    mask bit is zero, the corresponding value from vector \a a is used;
///    otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
#define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
                                     (long long const *)(m), \
                                     (__v2di)(__m128i)(i), \
                                     (__v2di)(__m128i)(mask), (s)))
/// Conditionally gathers four 64-bit integer values, either from the
///    256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
///    of [4 x i64] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mask_i64gather_epi64(__m256i a, const long long *m,
///                                     __m256i i, __m256i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
///
/// \param a
///    A 256-bit vector of [4 x i64] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x i64] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits. If a
///    mask bit is zero, the corresponding value from vector \a a is used;
///    otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
                                        (long long const *)(m), \
                                        (__v4di)(__m256i)(i), \
                                        (__v4di)(__m256i)(mask), (s)))
/// Gathers two 64-bit floating-point values from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_i32gather_pd(const double *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
///    the first two elements are used.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
#define _mm_i32gather_pd(m, i, s) \
  ((__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
                                      (double const *)(m), \
                                      (__v4si)(__m128i)(i), \
                                      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
                                                           _mm_setzero_pd()), \
                                      (s)))
/// Gathers four 64-bit floating-point values from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_i32gather_pd(const double *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
#define _mm256_i32gather_pd(m, i, s) \
  ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
                                         (double const *)(m), \
                                         (__v4si)(__m128i)(i), \
                                         (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
                                                               _mm256_setzero_pd(), \
                                                               _CMP_EQ_OQ), \
                                         (s)))
/// Gathers two 64-bit floating-point values from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_i64gather_pd(const double *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
#define _mm_i64gather_pd(m, i, s) \
  ((__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
                                      (double const *)(m), \
                                      (__v2di)(__m128i)(i), \
                                      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
                                                           _mm_setzero_pd()), \
                                      (s)))
/// Gathers four 64-bit floating-point values from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*64
///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_i64gather_pd(const double *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
#define _mm256_i64gather_pd(m, i, s) \
  ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
                                         (double const *)(m), \
                                         (__v4di)(__m256i)(i), \
                                         (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
                                                               _mm256_setzero_pd(), \
                                                               _CMP_EQ_OQ), \
                                         (s)))
/// Gathers four 32-bit floating-point values from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*32
///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_i32gather_ps(const float *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm_i32gather_ps(m, i, s) \
  ((__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
                                     (float const *)(m), \
                                     (__v4si)(__m128i)(i), \
                                     (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
                                                          _mm_setzero_ps()), \
                                     (s)))
/// Gathers eight 32-bit floating-point values from memory \a m using scaled
///    indexes from the 256-bit vector of [8 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 7
///   j := element*32
///   k := element*32
///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256 _mm256_i32gather_ps(const float *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [8 x float] containing the gathered values.
#define _mm256_i32gather_ps(m, i, s) \
  ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
                                        (float const *)(m), \
                                        (__v8si)(__m256i)(i), \
                                        (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \
                                                              _mm256_setzero_ps(), \
                                                              _CMP_EQ_OQ), \
                                        (s)))
/// Gathers two 32-bit floating-point values from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i. The upper two
///    elements of the result are zeroed.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*32
///   k := element*64
///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// result[127:64] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_i64gather_ps(const float *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm_i64gather_ps(m, i, s) \
  ((__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
                                     (float const *)(m), \
                                     (__v2di)(__m128i)(i), \
                                     (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
                                                          _mm_setzero_ps()), \
                                     (s)))
/// Gathers four 32-bit floating-point values from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*64
///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm256_i64gather_ps(const float *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm256_i64gather_ps(m, i, s) \
  ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
                                        (float const *)(m), \
                                        (__v4di)(__m256i)(i), \
                                        (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
                                                             _mm_setzero_ps()), \
                                        (s)))
/// Gathers four 32-bit integer values from memory \a m using scaled indexes
///    from the 128-bit vector of [4 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*32
///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_i32gather_epi32(const int *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm_i32gather_epi32(m, i, s) \
  ((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
                                     (int const *)(m), (__v4si)(__m128i)(i), \
                                     (__v4si)_mm_set1_epi32(-1), (s)))
/// Gathers eight 32-bit integer values from memory \a m using scaled indexes
///    from the 256-bit vector of [8 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 7
///   j := element*32
///   k := element*32
///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_i32gather_epi32(const int *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
#define _mm256_i32gather_epi32(m, i, s) \
  ((__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
                                        (int const *)(m), (__v8si)(__m256i)(i), \
                                        (__v8si)_mm256_set1_epi32(-1), (s)))
/// Gathers two 32-bit integer values from memory \a m using scaled indexes
///    from the 128-bit vector of [2 x i64] in \a i. The upper two elements
///    of the result are zeroed.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*32
///   k := element*64
///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// result[127:64] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_i64gather_epi32(const int *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm_i64gather_epi32(m, i, s) \
  ((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
                                     (int const *)(m), (__v2di)(__m128i)(i), \
                                     (__v4si)_mm_set1_epi32(-1), (s)))
/// Gathers four 32-bit integer values from memory \a m using scaled indexes
///    from the 256-bit vector of [4 x i64] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*64
///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm256_i64gather_epi32(const int *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm256_i64gather_epi32(m, i, s) \
  ((__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
                                        (int const *)(m), (__v4di)(__m256i)(i), \
                                        (__v4si)_mm_set1_epi32(-1), (s)))
/// Gathers two 64-bit integer values from memory \a m using scaled indexes
///    from the 128-bit vector of [4 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_i32gather_epi64(const long long *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
///    the first two elements are used.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
#define _mm_i32gather_epi64(m, i, s) \
  ((__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
                                     (long long const *)(m), \
                                     (__v4si)(__m128i)(i), \
                                     (__v2di)_mm_set1_epi64x(-1), (s)))
/// Gathers four 64-bit integer values from memory \a m using scaled indexes
///    from the 128-bit vector of [4 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_i32gather_epi64(const long long *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
#define _mm256_i32gather_epi64(m, i, s) \
  ((__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
                                        (long long const *)(m), \
                                        (__v4si)(__m128i)(i), \
                                        (__v4di)_mm256_set1_epi64x(-1), (s)))
/// Gathers two 64-bit integer values from memory \a m using scaled indexes
///    from the 128-bit vector of [2 x i64] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_i64gather_epi64(const long long *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
#define _mm_i64gather_epi64(m, i, s) \
  ((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
                                     (long long const *)(m), \
                                     (__v2di)(__m128i)(i), \
                                     (__v2di)_mm_set1_epi64x(-1), (s)))
/// Gathers four 64-bit integer values from memory \a m using scaled indexes
///    from the 256-bit vector of [4 x i64] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*64
///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_i64gather_epi64(const long long *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
#define _mm256_i64gather_epi64(m, i, s) \
  ((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
                                        (long long const *)(m), \
                                        (__v4di)(__m256i)(i), \
                                        (__v4di)_mm256_set1_epi64x(-1), (s)))
5281 #undef __DEFAULT_FN_ATTRS256
5282 #undef __DEFAULT_FN_ATTRS128
5284 #endif /* __AVX2INTRIN_H */