1 /*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 *===-----------------------------------------------------------------------===
11 #error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
14 #ifndef __AVX2INTRIN_H
15 #define __AVX2INTRIN_H
/* Attribute sets attached to every intrinsic defined in this header. Both
 * variants force inlining and suppress debug info; they differ only in the
 * value given to __min_vector_width__ (128 or 256). When EVEX512 is enabled
 * and AVX10.1-512 is not, "no-evex512" is appended to the target string. */
#if defined(__EVEX512__) && !defined(__AVX10_1_512__)
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx2,no-evex512"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx2,no-evex512"), __min_vector_width__(256)))
#else
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx2"),           \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx2"),           \
                 __min_vector_width__(256)))
#endif
/* SSE4 Multiple Packed Sums of Absolute Difference.  */
/// Performs sixteen sum-of-absolute-differences (SAD) operations, each on
///    four unsigned 8-bit integers taken from the 256-bit integer vectors
///    \a X and \a Y.
///
///    The lower halves of the inputs yield eight SAD results and the upper
///    halves yield eight more. Each result is 16 bits wide; the two groups
///    are stored in the lower and upper halves of the 256-bit result,
///    respectively.
///
///    One SAD operation takes four bytes of \a X and four bytes of \a Y,
///    computes the difference between each \a X byte and the matching \a Y
///    byte, takes the absolute value of each difference, and adds the four
///    absolute values into a single 16-bit result.
///
///    Within a group of eight results, every operation reads the same four
///    \a Y bytes; their starting bit position is \a M[1:0] times 32. The
///    eight operations read successive sets of four \a X bytes; the first
///    set starts at bit position \a M[2] times 32. Both positions are
///    relative to the 128-bit lane the group belongs to.
///
/// \code{.operation}
/// r := 0
/// FOR i := 0 TO 1
///   j := i*3
///   Ybase := M[j+1:j]*32 + i*128
///   Xbase := M[j+2]*32 + i*128
///   FOR k := 0 TO 7
///     temp0 := ABS(X[Xbase+7:Xbase] - Y[Ybase+7:Ybase])
///     temp1 := ABS(X[Xbase+15:Xbase+8] - Y[Ybase+15:Ybase+8])
///     temp2 := ABS(X[Xbase+23:Xbase+16] - Y[Ybase+23:Ybase+16])
///     temp3 := ABS(X[Xbase+31:Xbase+24] - Y[Ybase+31:Ybase+24])
///     result[r+15:r] := temp0 + temp1 + temp2 + temp3
///     Xbase := Xbase + 8
///     r := r + 16
///   ENDFOR
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mpsadbw_epu8(__m256i X, __m256i Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VMPSADBW instruction.
///
/// \param X
///    A 256-bit integer vector containing one of the inputs.
/// \param Y
///    A 256-bit integer vector containing one of the inputs.
/// \param M
///    An unsigned immediate value specifying the starting positions of the
///    bytes to operate on.
/// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_mpsadbw_epu8(X, Y, M)                                           \
  ((__m256i)__builtin_ia32_mpsadbw256(                                         \
      (__v32qi)(__m256i)(X),                                                   \
      (__v32qi)(__m256i)(Y),                                                   \
      (int)(M)))
96 /// Computes the absolute value of each signed byte in the 256-bit integer
97 /// vector \a __a and returns each value in the corresponding byte of
100 /// \headerfile <immintrin.h>
102 /// This intrinsic corresponds to the \c VPABSB instruction.
105 /// A 256-bit integer vector.
106 /// \returns A 256-bit integer vector containing the result.
107 static __inline__ __m256i __DEFAULT_FN_ATTRS256
108 _mm256_abs_epi8(__m256i __a
)
110 return (__m256i
)__builtin_elementwise_abs((__v32qs
)__a
);
113 /// Computes the absolute value of each signed 16-bit element in the 256-bit
114 /// vector of [16 x i16] in \a __a and returns each value in the
115 /// corresponding element of the result.
117 /// \headerfile <immintrin.h>
119 /// This intrinsic corresponds to the \c VPABSW instruction.
122 /// A 256-bit vector of [16 x i16].
123 /// \returns A 256-bit vector of [16 x i16] containing the result.
124 static __inline__ __m256i __DEFAULT_FN_ATTRS256
125 _mm256_abs_epi16(__m256i __a
)
127 return (__m256i
)__builtin_elementwise_abs((__v16hi
)__a
);
130 /// Computes the absolute value of each signed 32-bit element in the 256-bit
131 /// vector of [8 x i32] in \a __a and returns each value in the
132 /// corresponding element of the result.
134 /// \headerfile <immintrin.h>
136 /// This intrinsic corresponds to the \c VPABSD instruction.
139 /// A 256-bit vector of [8 x i32].
140 /// \returns A 256-bit vector of [8 x i32] containing the result.
141 static __inline__ __m256i __DEFAULT_FN_ATTRS256
142 _mm256_abs_epi32(__m256i __a
)
144 return (__m256i
)__builtin_elementwise_abs((__v8si
)__a
);
147 /// Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit
148 /// integers using signed saturation, and returns the 256-bit result.
150 /// \code{.operation}
154 /// result[7+k:k] := SATURATE8(__a[15+j:j])
155 /// result[71+k:64+k] := SATURATE8(__b[15+j:j])
156 /// result[135+k:128+k] := SATURATE8(__a[143+j:128+j])
157 /// result[199+k:192+k] := SATURATE8(__b[143+j:128+j])
161 /// \headerfile <immintrin.h>
163 /// This intrinsic corresponds to the \c VPACKSSWB instruction.
166 /// A 256-bit vector of [16 x i16] used to generate result[63:0] and
169 /// A 256-bit vector of [16 x i16] used to generate result[127:64] and
171 /// \returns A 256-bit integer vector containing the result.
172 static __inline__ __m256i __DEFAULT_FN_ATTRS256
173 _mm256_packs_epi16(__m256i __a
, __m256i __b
)
175 return (__m256i
)__builtin_ia32_packsswb256((__v16hi
)__a
, (__v16hi
)__b
);
178 /// Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit
179 /// integers using signed saturation, and returns the resulting 256-bit
180 /// vector of [16 x i16].
182 /// \code{.operation}
186 /// result[15+k:k] := SATURATE16(__a[31+j:j])
187 /// result[79+k:64+k] := SATURATE16(__b[31+j:j])
188 /// result[143+k:128+k] := SATURATE16(__a[159+j:128+j])
189 /// result[207+k:192+k] := SATURATE16(__b[159+j:128+j])
193 /// \headerfile <immintrin.h>
195 /// This intrinsic corresponds to the \c VPACKSSDW instruction.
198 /// A 256-bit vector of [8 x i32] used to generate result[63:0] and
201 /// A 256-bit vector of [8 x i32] used to generate result[127:64] and
203 /// \returns A 256-bit vector of [16 x i16] containing the result.
204 static __inline__ __m256i __DEFAULT_FN_ATTRS256
205 _mm256_packs_epi32(__m256i __a
, __m256i __b
)
207 return (__m256i
)__builtin_ia32_packssdw256((__v8si
)__a
, (__v8si
)__b
);
210 /// Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers
211 /// using unsigned saturation, and returns the 256-bit result.
213 /// \code{.operation}
217 /// result[7+k:k] := SATURATE8U(__a[15+j:j])
218 /// result[71+k:64+k] := SATURATE8U(__b[15+j:j])
219 /// result[135+k:128+k] := SATURATE8U(__a[143+j:128+j])
220 /// result[199+k:192+k] := SATURATE8U(__b[143+j:128+j])
224 /// \headerfile <immintrin.h>
226 /// This intrinsic corresponds to the \c VPACKUSWB instruction.
229 /// A 256-bit vector of [16 x i16] used to generate result[63:0] and
232 /// A 256-bit vector of [16 x i16] used to generate result[127:64] and
234 /// \returns A 256-bit integer vector containing the result.
235 static __inline__ __m256i __DEFAULT_FN_ATTRS256
236 _mm256_packus_epi16(__m256i __a
, __m256i __b
)
238 return (__m256i
)__builtin_ia32_packuswb256((__v16hi
)__a
, (__v16hi
)__b
);
241 /// Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers
242 /// using unsigned saturation, and returns the resulting 256-bit vector of
245 /// \code{.operation}
249 /// result[15+k:k] := SATURATE16U(__V1[31+j:j])
250 /// result[79+k:64+k] := SATURATE16U(__V2[31+j:j])
251 /// result[143+k:128+k] := SATURATE16U(__V1[159+j:128+j])
252 /// result[207+k:192+k] := SATURATE16U(__V2[159+j:128+j])
256 /// \headerfile <immintrin.h>
258 /// This intrinsic corresponds to the \c VPACKUSDW instruction.
261 /// A 256-bit vector of [8 x i32] used to generate result[63:0] and
264 /// A 256-bit vector of [8 x i32] used to generate result[127:64] and
266 /// \returns A 256-bit vector of [16 x i16] containing the result.
267 static __inline__ __m256i __DEFAULT_FN_ATTRS256
268 _mm256_packus_epi32(__m256i __V1
, __m256i __V2
)
270 return (__m256i
) __builtin_ia32_packusdw256((__v8si
)__V1
, (__v8si
)__V2
);
273 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
274 /// vectors and returns the lower 8 bits of each sum in the corresponding
275 /// byte of the 256-bit integer vector result (overflow is ignored).
277 /// \headerfile <immintrin.h>
279 /// This intrinsic corresponds to the \c VPADDB instruction.
282 /// A 256-bit integer vector containing one of the source operands.
284 /// A 256-bit integer vector containing one of the source operands.
285 /// \returns A 256-bit integer vector containing the sums.
286 static __inline__ __m256i __DEFAULT_FN_ATTRS256
287 _mm256_add_epi8(__m256i __a
, __m256i __b
)
289 return (__m256i
)((__v32qu
)__a
+ (__v32qu
)__b
);
292 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
293 /// [16 x i16] and returns the lower 16 bits of each sum in the
294 /// corresponding element of the [16 x i16] result (overflow is ignored).
296 /// \headerfile <immintrin.h>
298 /// This intrinsic corresponds to the \c VPADDW instruction.
301 /// A 256-bit vector of [16 x i16] containing one of the source operands.
303 /// A 256-bit vector of [16 x i16] containing one of the source operands.
304 /// \returns A 256-bit vector of [16 x i16] containing the sums.
305 static __inline__ __m256i __DEFAULT_FN_ATTRS256
306 _mm256_add_epi16(__m256i __a
, __m256i __b
)
308 return (__m256i
)((__v16hu
)__a
+ (__v16hu
)__b
);
311 /// Adds 32-bit integers from corresponding elements of two 256-bit vectors of
312 /// [8 x i32] and returns the lower 32 bits of each sum in the corresponding
313 /// element of the [8 x i32] result (overflow is ignored).
315 /// \headerfile <immintrin.h>
317 /// This intrinsic corresponds to the \c VPADDD instruction.
320 /// A 256-bit vector of [8 x i32] containing one of the source operands.
322 /// A 256-bit vector of [8 x i32] containing one of the source operands.
323 /// \returns A 256-bit vector of [8 x i32] containing the sums.
324 static __inline__ __m256i __DEFAULT_FN_ATTRS256
325 _mm256_add_epi32(__m256i __a
, __m256i __b
)
327 return (__m256i
)((__v8su
)__a
+ (__v8su
)__b
);
330 /// Adds 64-bit integers from corresponding elements of two 256-bit vectors of
331 /// [4 x i64] and returns the lower 64 bits of each sum in the corresponding
332 /// element of the [4 x i64] result (overflow is ignored).
334 /// \headerfile <immintrin.h>
336 /// This intrinsic corresponds to the \c VPADDQ instruction.
339 /// A 256-bit vector of [4 x i64] containing one of the source operands.
341 /// A 256-bit vector of [4 x i64] containing one of the source operands.
342 /// \returns A 256-bit vector of [4 x i64] containing the sums.
343 static __inline__ __m256i __DEFAULT_FN_ATTRS256
344 _mm256_add_epi64(__m256i __a
, __m256i __b
)
346 return (__m256i
)((__v4du
)__a
+ (__v4du
)__b
);
349 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
350 /// vectors using signed saturation, and returns each sum in the
351 /// corresponding byte of the 256-bit integer vector result.
353 /// \headerfile <immintrin.h>
355 /// This intrinsic corresponds to the \c VPADDSB instruction.
358 /// A 256-bit integer vector containing one of the source operands.
360 /// A 256-bit integer vector containing one of the source operands.
361 /// \returns A 256-bit integer vector containing the sums.
362 static __inline__ __m256i __DEFAULT_FN_ATTRS256
363 _mm256_adds_epi8(__m256i __a
, __m256i __b
)
365 return (__m256i
)__builtin_elementwise_add_sat((__v32qs
)__a
, (__v32qs
)__b
);
368 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
369 /// [16 x i16] using signed saturation, and returns the [16 x i16] result.
371 /// \headerfile <immintrin.h>
373 /// This intrinsic corresponds to the \c VPADDSW instruction.
376 /// A 256-bit vector of [16 x i16] containing one of the source operands.
378 /// A 256-bit vector of [16 x i16] containing one of the source operands.
379 /// \returns A 256-bit vector of [16 x i16] containing the sums.
380 static __inline__ __m256i __DEFAULT_FN_ATTRS256
381 _mm256_adds_epi16(__m256i __a
, __m256i __b
)
383 return (__m256i
)__builtin_elementwise_add_sat((__v16hi
)__a
, (__v16hi
)__b
);
386 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
387 /// vectors using unsigned saturation, and returns each sum in the
388 /// corresponding byte of the 256-bit integer vector result.
390 /// \headerfile <immintrin.h>
392 /// This intrinsic corresponds to the \c VPADDUSB instruction.
395 /// A 256-bit integer vector containing one of the source operands.
397 /// A 256-bit integer vector containing one of the source operands.
398 /// \returns A 256-bit integer vector containing the sums.
399 static __inline__ __m256i __DEFAULT_FN_ATTRS256
400 _mm256_adds_epu8(__m256i __a
, __m256i __b
)
402 return (__m256i
)__builtin_elementwise_add_sat((__v32qu
)__a
, (__v32qu
)__b
);
405 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
406 /// [16 x i16] using unsigned saturation, and returns the [16 x i16] result.
408 /// \headerfile <immintrin.h>
410 /// This intrinsic corresponds to the \c VPADDUSW instruction.
413 /// A 256-bit vector of [16 x i16] containing one of the source operands.
415 /// A 256-bit vector of [16 x i16] containing one of the source operands.
416 /// \returns A 256-bit vector of [16 x i16] containing the sums.
417 static __inline__ __m256i __DEFAULT_FN_ATTRS256
418 _mm256_adds_epu16(__m256i __a
, __m256i __b
)
420 return (__m256i
)__builtin_elementwise_add_sat((__v16hu
)__a
, (__v16hu
)__b
);
/// Concatenates matching 128-bit halves of \a a and \a b, shifts each
///    concatenation right by \a n bytes, and returns the low 16 bytes of
///    each shifted value.
///
///    For the lower half of the result, a temporary 256-bit value is formed
///    with the lower half of \a a as its upper half and the lower half of
///    \a b as its lower half. That value is shifted right by \a n bytes and
///    its lower 16 bytes become the lower 16 bytes of the result. The upper
///    halves of \a a and \a b are combined and shifted the same way, and the
///    lower 16 bytes of that shifted value become the upper 16 bytes of the
///    result.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_alignr_epi8(__m256i a, __m256i b, const int n);
/// \endcode
///
/// This intrinsic corresponds to the \c VPALIGNR instruction.
///
/// \param a
///    A 256-bit integer vector containing source values.
/// \param b
///    A 256-bit integer vector containing source values.
/// \param n
///    An immediate value specifying the number of bytes to shift.
/// \returns A 256-bit integer vector containing the result.
#define _mm256_alignr_epi8(a, b, n)                                            \
  ((__m256i)__builtin_ia32_palignr256(                                         \
      (__v32qi)(__m256i)(a),                                                   \
      (__v32qi)(__m256i)(b),                                                   \
      (n)))
451 /// Computes the bitwise AND of the 256-bit integer vectors in \a __a and
454 /// \headerfile <immintrin.h>
456 /// This intrinsic corresponds to the \c VPAND instruction.
459 /// A 256-bit integer vector.
461 /// A 256-bit integer vector.
462 /// \returns A 256-bit integer vector containing the result.
463 static __inline__ __m256i __DEFAULT_FN_ATTRS256
464 _mm256_and_si256(__m256i __a
, __m256i __b
)
466 return (__m256i
)((__v4du
)__a
& (__v4du
)__b
);
469 /// Computes the bitwise AND of the 256-bit integer vector in \a __b with
470 /// the bitwise NOT of the 256-bit integer vector in \a __a.
472 /// \headerfile <immintrin.h>
474 /// This intrinsic corresponds to the \c VPANDN instruction.
477 /// A 256-bit integer vector.
479 /// A 256-bit integer vector.
480 /// \returns A 256-bit integer vector containing the result.
481 static __inline__ __m256i __DEFAULT_FN_ATTRS256
482 _mm256_andnot_si256(__m256i __a
, __m256i __b
)
484 return (__m256i
)(~(__v4du
)__a
& (__v4du
)__b
);
487 /// Computes the averages of the corresponding unsigned bytes in the two
488 /// 256-bit integer vectors in \a __a and \a __b and returns each
489 /// average in the corresponding byte of the 256-bit result.
491 /// \code{.operation}
494 /// result[j+7:j] := (__a[j+7:j] + __b[j+7:j] + 1) >> 1
498 /// \headerfile <immintrin.h>
500 /// This intrinsic corresponds to the \c VPAVGB instruction.
503 /// A 256-bit integer vector.
505 /// A 256-bit integer vector.
506 /// \returns A 256-bit integer vector containing the result.
507 static __inline__ __m256i __DEFAULT_FN_ATTRS256
508 _mm256_avg_epu8(__m256i __a
, __m256i __b
)
510 return (__m256i
)__builtin_ia32_pavgb256((__v32qi
)__a
, (__v32qi
)__b
);
513 /// Computes the averages of the corresponding unsigned 16-bit integers in
514 /// the two 256-bit vectors of [16 x i16] in \a __a and \a __b and returns
515 /// each average in the corresponding element of the 256-bit result.
517 /// \code{.operation}
520 /// result[j+15:j] := (__a[j+15:j] + __b[j+15:j] + 1) >> 1
524 /// \headerfile <immintrin.h>
526 /// This intrinsic corresponds to the \c VPAVGW instruction.
529 /// A 256-bit vector of [16 x i16].
531 /// A 256-bit vector of [16 x i16].
532 /// \returns A 256-bit vector of [16 x i16] containing the result.
533 static __inline__ __m256i __DEFAULT_FN_ATTRS256
534 _mm256_avg_epu16(__m256i __a
, __m256i __b
)
536 return (__m256i
)__builtin_ia32_pavgw256((__v16hi
)__a
, (__v16hi
)__b
);
539 /// Merges 8-bit integer values from either of the two 256-bit vectors
540 /// \a __V1 or \a __V2, as specified by the 256-bit mask \a __M and returns
541 /// the resulting 256-bit integer vector.
543 /// \code{.operation}
547 /// result[7+j:j] := __V1[7+j:j]
549 /// result[7+j:j] := __V2[7+j:j]
554 /// \headerfile <immintrin.h>
556 /// This intrinsic corresponds to the \c VPBLENDVB instruction.
559 /// A 256-bit integer vector containing source values.
561 /// A 256-bit integer vector containing source values.
563 /// A 256-bit integer vector, with bit [7] of each byte specifying the
564 /// source for each corresponding byte of the result. When the mask bit
565 /// is 0, the byte is copied from \a __V1; otherwise, it is copied from
567 /// \returns A 256-bit integer vector containing the result.
568 static __inline__ __m256i __DEFAULT_FN_ATTRS256
569 _mm256_blendv_epi8(__m256i __V1
, __m256i __V2
, __m256i __M
)
571 return (__m256i
)__builtin_ia32_pblendvb256((__v32qi
)__V1
, (__v32qi
)__V2
,
/// Selects 16-bit integer values from either of the two 256-bit vectors
///    \a V1 or \a V2, as directed by the immediate integer operand \a M, and
///    returns the resulting 256-bit vector of [16 x i16].
///
/// \code{.operation}
/// FOR i := 0 TO 7
///   j := i*16
///   IF M[i] == 0
///     result[15+j:j] := V1[15+j:j]
///     result[143+j:128+j] := V1[143+j:128+j]
///   ELSE
///     result[15+j:j] := V2[15+j:j]
///     result[143+j:128+j] := V2[143+j:128+j]
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_blend_epi16(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPBLENDW instruction.
///
/// \param V1
///    A 256-bit vector of [16 x i16] containing source values.
/// \param V2
///    A 256-bit vector of [16 x i16] containing source values.
/// \param M
///    An immediate 8-bit integer operand, with bits [7:0] specifying the
///    source for each element of the result. The position of the mask bit
///    corresponds to the index of a copied value. When a mask bit is 0, the
///    element is copied from \a V1; otherwise, it is copied from \a V2.
///    \a M[0] determines the source for elements 0 and 8, \a M[1] for
///    elements 1 and 9, and so forth.
/// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_blend_epi16(V1, V2, M)                                          \
  ((__m256i)__builtin_ia32_pblendw256(                                         \
      (__v16hi)(__m256i)(V1),                                                  \
      (__v16hi)(__m256i)(V2),                                                  \
      (int)(M)))
616 /// Compares corresponding bytes in the 256-bit integer vectors in \a __a and
617 /// \a __b for equality and returns the outcomes in the corresponding
618 /// bytes of the 256-bit result.
620 /// \code{.operation}
623 /// result[j+7:j] := (__a[j+7:j] == __b[j+7:j]) ? 0xFF : 0
627 /// \headerfile <immintrin.h>
629 /// This intrinsic corresponds to the \c VPCMPEQB instruction.
632 /// A 256-bit integer vector containing one of the inputs.
634 /// A 256-bit integer vector containing one of the inputs.
635 /// \returns A 256-bit integer vector containing the result.
636 static __inline__ __m256i __DEFAULT_FN_ATTRS256
637 _mm256_cmpeq_epi8(__m256i __a
, __m256i __b
)
639 return (__m256i
)((__v32qi
)__a
== (__v32qi
)__b
);
642 /// Compares corresponding elements in the 256-bit vectors of [16 x i16] in
643 /// \a __a and \a __b for equality and returns the outcomes in the
644 /// corresponding elements of the 256-bit result.
646 /// \code{.operation}
649 /// result[j+15:j] := (__a[j+15:j] == __b[j+15:j]) ? 0xFFFF : 0
653 /// \headerfile <immintrin.h>
655 /// This intrinsic corresponds to the \c VPCMPEQW instruction.
658 /// A 256-bit vector of [16 x i16] containing one of the inputs.
660 /// A 256-bit vector of [16 x i16] containing one of the inputs.
661 /// \returns A 256-bit vector of [16 x i16] containing the result.
662 static __inline__ __m256i __DEFAULT_FN_ATTRS256
663 _mm256_cmpeq_epi16(__m256i __a
, __m256i __b
)
665 return (__m256i
)((__v16hi
)__a
== (__v16hi
)__b
);
668 /// Compares corresponding elements in the 256-bit vectors of [8 x i32] in
669 /// \a __a and \a __b for equality and returns the outcomes in the
670 /// corresponding elements of the 256-bit result.
672 /// \code{.operation}
675 /// result[j+31:j] := (__a[j+31:j] == __b[j+31:j]) ? 0xFFFFFFFF : 0
679 /// \headerfile <immintrin.h>
681 /// This intrinsic corresponds to the \c VPCMPEQD instruction.
684 /// A 256-bit vector of [8 x i32] containing one of the inputs.
686 /// A 256-bit vector of [8 x i32] containing one of the inputs.
687 /// \returns A 256-bit vector of [8 x i32] containing the result.
688 static __inline__ __m256i __DEFAULT_FN_ATTRS256
689 _mm256_cmpeq_epi32(__m256i __a
, __m256i __b
)
691 return (__m256i
)((__v8si
)__a
== (__v8si
)__b
);
694 /// Compares corresponding elements in the 256-bit vectors of [4 x i64] in
695 /// \a __a and \a __b for equality and returns the outcomes in the
696 /// corresponding elements of the 256-bit result.
698 /// \code{.operation}
701 /// result[j+63:j] := (__a[j+63:j] == __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
705 /// \headerfile <immintrin.h>
707 /// This intrinsic corresponds to the \c VPCMPEQQ instruction.
710 /// A 256-bit vector of [4 x i64] containing one of the inputs.
712 /// A 256-bit vector of [4 x i64] containing one of the inputs.
713 /// \returns A 256-bit vector of [4 x i64] containing the result.
714 static __inline__ __m256i __DEFAULT_FN_ATTRS256
715 _mm256_cmpeq_epi64(__m256i __a
, __m256i __b
)
717 return (__m256i
)((__v4di
)__a
== (__v4di
)__b
);
720 /// Compares corresponding signed bytes in the 256-bit integer vectors in
721 /// \a __a and \a __b for greater-than and returns the outcomes in the
722 /// corresponding bytes of the 256-bit result.
724 /// \code{.operation}
727 /// result[j+7:j] := (__a[j+7:j] > __b[j+7:j]) ? 0xFF : 0
731 /// \headerfile <immintrin.h>
733 /// This intrinsic corresponds to the \c VPCMPGTB instruction.
736 /// A 256-bit integer vector containing one of the inputs.
738 /// A 256-bit integer vector containing one of the inputs.
739 /// \returns A 256-bit integer vector containing the result.
740 static __inline__ __m256i __DEFAULT_FN_ATTRS256
741 _mm256_cmpgt_epi8(__m256i __a
, __m256i __b
)
743 /* This function always performs a signed comparison, but __v32qi is a char
744 which may be signed or unsigned, so use __v32qs. */
745 return (__m256i
)((__v32qs
)__a
> (__v32qs
)__b
);
748 /// Compares corresponding signed elements in the 256-bit vectors of
749 /// [16 x i16] in \a __a and \a __b for greater-than and returns the
750 /// outcomes in the corresponding elements of the 256-bit result.
752 /// \code{.operation}
755 /// result[j+15:j] := (__a[j+15:j] > __b[j+15:j]) ? 0xFFFF : 0
759 /// \headerfile <immintrin.h>
761 /// This intrinsic corresponds to the \c VPCMPGTW instruction.
764 /// A 256-bit vector of [16 x i16] containing one of the inputs.
766 /// A 256-bit vector of [16 x i16] containing one of the inputs.
767 /// \returns A 256-bit vector of [16 x i16] containing the result.
768 static __inline__ __m256i __DEFAULT_FN_ATTRS256
769 _mm256_cmpgt_epi16(__m256i __a
, __m256i __b
)
771 return (__m256i
)((__v16hi
)__a
> (__v16hi
)__b
);
774 /// Compares corresponding signed elements in the 256-bit vectors of
775 /// [8 x i32] in \a __a and \a __b for greater-than and returns the
776 /// outcomes in the corresponding elements of the 256-bit result.
778 /// \code{.operation}
781 /// result[j+31:j] := (__a[j+31:j] > __b[j+31:j]) ? 0xFFFFFFFF : 0
785 /// \headerfile <immintrin.h>
787 /// This intrinsic corresponds to the \c VPCMPGTD instruction.
790 /// A 256-bit vector of [8 x i32] containing one of the inputs.
792 /// A 256-bit vector of [8 x i32] containing one of the inputs.
793 /// \returns A 256-bit vector of [8 x i32] containing the result.
794 static __inline__ __m256i __DEFAULT_FN_ATTRS256
795 _mm256_cmpgt_epi32(__m256i __a
, __m256i __b
)
797 return (__m256i
)((__v8si
)__a
> (__v8si
)__b
);
800 /// Compares corresponding signed elements in the 256-bit vectors of
801 /// [4 x i64] in \a __a and \a __b for greater-than and returns the
802 /// outcomes in the corresponding elements of the 256-bit result.
804 /// \code{.operation}
807 /// result[j+63:j] := (__a[j+63:j] > __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
811 /// \headerfile <immintrin.h>
813 /// This intrinsic corresponds to the \c VPCMPGTQ instruction.
816 /// A 256-bit vector of [4 x i64] containing one of the inputs.
818 /// A 256-bit vector of [4 x i64] containing one of the inputs.
819 /// \returns A 256-bit vector of [4 x i64] containing the result.
820 static __inline__ __m256i __DEFAULT_FN_ATTRS256
821 _mm256_cmpgt_epi64(__m256i __a
, __m256i __b
)
823 return (__m256i
)((__v4di
)__a
> (__v4di
)__b
);
826 /// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
827 /// vectors of [16 x i16] and returns the lower 16 bits of each sum in an
828 /// element of the [16 x i16] result (overflow is ignored). Sums from
829 /// \a __a are returned in the lower 64 bits of each 128-bit half of the
830 /// result; sums from \a __b are returned in the upper 64 bits of each
831 /// 128-bit half of the result.
833 /// \code{.operation}
836 /// result[j+15:j] := __a[j+15:j] + __a[j+31:j+16]
837 /// result[j+31:j+16] := __a[j+47:j+32] + __a[j+63:j+48]
838 /// result[j+47:j+32] := __a[j+79:j+64] + __a[j+95:j+80]
839 /// result[j+63:j+48] := __a[j+111:j+96] + __a[j+127:j+112]
840 /// result[j+79:j+64] := __b[j+15:j] + __b[j+31:j+16]
841 /// result[j+95:j+80] := __b[j+47:j+32] + __b[j+63:j+48]
842 /// result[j+111:j+96] := __b[j+79:j+64] + __b[j+95:j+80]
843 /// result[j+127:j+112] := __b[j+111:j+96] + __b[j+127:j+112]
847 /// \headerfile <immintrin.h>
849 /// This intrinsic corresponds to the \c VPHADDW instruction.
852 /// A 256-bit vector of [16 x i16] containing one of the source operands.
854 /// A 256-bit vector of [16 x i16] containing one of the source operands.
855 /// \returns A 256-bit vector of [16 x i16] containing the sums.
856 static __inline__ __m256i __DEFAULT_FN_ATTRS256
857 _mm256_hadd_epi16(__m256i __a
, __m256i __b
)
859 return (__m256i
)__builtin_ia32_phaddw256((__v16hi
)__a
, (__v16hi
)__b
);
862 /// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit
863 /// vectors of [8 x i32] and returns the lower 32 bits of each sum in an
864 /// element of the [8 x i32] result (overflow is ignored). Sums from \a __a
865 /// are returned in the lower 64 bits of each 128-bit half of the result;
866 /// sums from \a __b are returned in the upper 64 bits of each 128-bit half
869 /// \code{.operation}
872 /// result[j+31:j] := __a[j+31:j] + __a[j+63:j+32]
873 /// result[j+63:j+32] := __a[j+95:j+64] + __a[j+127:j+96]
874 /// result[j+95:j+64] := __b[j+31:j] + __b[j+63:j+32]
875 /// result[j+127:j+96] := __b[j+95:j+64] + __b[j+127:j+96]
879 /// \headerfile <immintrin.h>
881 /// This intrinsic corresponds to the \c VPHADDD instruction.
884 /// A 256-bit vector of [8 x i32] containing one of the source operands.
886 /// A 256-bit vector of [8 x i32] containing one of the source operands.
887 /// \returns A 256-bit vector of [8 x i32] containing the sums.
888 static __inline__ __m256i __DEFAULT_FN_ATTRS256
889 _mm256_hadd_epi32(__m256i __a
, __m256i __b
)
891 return (__m256i
)__builtin_ia32_phaddd256((__v8si
)__a
, (__v8si
)__b
);
894 /// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
895 /// vectors of [16 x i16] using signed saturation and returns each sum in
896 /// an element of the [16 x i16] result. Sums from \a __a are returned in
897 /// the lower 64 bits of each 128-bit half of the result; sums from \a __b
898 /// are returned in the upper 64 bits of each 128-bit half of the result.
900 /// \code{.operation}
903 /// result[j+15:j] := SATURATE16(__a[j+15:j] + __a[j+31:j+16])
904 /// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] + __a[j+63:j+48])
905 /// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] + __a[j+95:j+80])
906 /// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] + __a[j+127:j+112])
907 /// result[j+79:j+64] := SATURATE16(__b[j+15:j] + __b[j+31:j+16])
908 /// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] + __b[j+63:j+48])
909 /// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] + __b[j+95:j+80])
910 /// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] + __b[j+127:j+112])
914 /// \headerfile <immintrin.h>
916 /// This intrinsic corresponds to the \c VPHADDSW instruction.
919 /// A 256-bit vector of [16 x i16] containing one of the source operands.
921 /// A 256-bit vector of [16 x i16] containing one of the source operands.
922 /// \returns A 256-bit vector of [16 x i16] containing the sums.
923 static __inline__ __m256i __DEFAULT_FN_ATTRS256
924 _mm256_hadds_epi16(__m256i __a
, __m256i __b
)
926 return (__m256i
)__builtin_ia32_phaddsw256((__v16hi
)__a
, (__v16hi
)__b
);
929 /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
930 /// vectors of [16 x i16] and returns the lower 16 bits of each difference
931 /// in an element of the [16 x i16] result (overflow is ignored).
932 /// Differences from \a __a are returned in the lower 64 bits of each
933 /// 128-bit half of the result; differences from \a __b are returned in the
934 /// upper 64 bits of each 128-bit half of the result.
936 /// \code{.operation}
939 /// result[j+15:j] := __a[j+15:j] - __a[j+31:j+16]
940 /// result[j+31:j+16] := __a[j+47:j+32] - __a[j+63:j+48]
941 /// result[j+47:j+32] := __a[j+79:j+64] - __a[j+95:j+80]
942 /// result[j+63:j+48] := __a[j+111:j+96] - __a[j+127:j+112]
943 /// result[j+79:j+64] := __b[j+15:j] - __b[j+31:j+16]
944 /// result[j+95:j+80] := __b[j+47:j+32] - __b[j+63:j+48]
945 /// result[j+111:j+96] := __b[j+79:j+64] - __b[j+95:j+80]
946 /// result[j+127:j+112] := __b[j+111:j+96] - __b[j+127:j+112]
950 /// \headerfile <immintrin.h>
952 /// This intrinsic corresponds to the \c VPHSUBW instruction.
955 /// A 256-bit vector of [16 x i16] containing one of the source operands.
957 /// A 256-bit vector of [16 x i16] containing one of the source operands.
958 /// \returns A 256-bit vector of [16 x i16] containing the differences.
959 static __inline__ __m256i __DEFAULT_FN_ATTRS256
960 _mm256_hsub_epi16(__m256i __a
, __m256i __b
)
962 return (__m256i
)__builtin_ia32_phsubw256((__v16hi
)__a
, (__v16hi
)__b
);
965 /// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit
966 /// vectors of [8 x i32] and returns the lower 32 bits of each difference in
967 /// an element of the [8 x i32] result (overflow is ignored). Differences
968 /// from \a __a are returned in the lower 64 bits of each 128-bit half of
969 /// the result; differences from \a __b are returned in the upper 64 bits
970 /// of each 128-bit half of the result.
972 /// \code{.operation}
975 /// result[j+31:j] := __a[j+31:j] - __a[j+63:j+32]
976 /// result[j+63:j+32] := __a[j+95:j+64] - __a[j+127:j+96]
977 /// result[j+95:j+64] := __b[j+31:j] - __b[j+63:j+32]
978 /// result[j+127:j+96] := __b[j+95:j+64] - __b[j+127:j+96]
982 /// \headerfile <immintrin.h>
984 /// This intrinsic corresponds to the \c VPHSUBD instruction.
987 /// A 256-bit vector of [8 x i32] containing one of the source operands.
989 /// A 256-bit vector of [8 x i32] containing one of the source operands.
990 /// \returns A 256-bit vector of [8 x i32] containing the differences.
991 static __inline__ __m256i __DEFAULT_FN_ATTRS256
992 _mm256_hsub_epi32(__m256i __a
, __m256i __b
)
994 return (__m256i
)__builtin_ia32_phsubd256((__v8si
)__a
, (__v8si
)__b
);
997 /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
998 /// vectors of [16 x i16] using signed saturation and returns each difference in
999 /// an element of the [16 x i16] result. Differences from \a __a are
1000 /// returned in the lower 64 bits of each 128-bit half of the result;
1001 /// differences from \a __b are returned in the upper 64 bits of each
1002 /// 128-bit half of the result.
1004 /// \code{.operation}
1007 /// result[j+15:j] := SATURATE16(__a[j+15:j] - __a[j+31:j+16])
1008 /// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] - __a[j+63:j+48])
1009 /// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] - __a[j+95:j+80])
1010 /// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] - __a[j+127:j+112])
1011 /// result[j+79:j+64] := SATURATE16(__b[j+15:j] - __b[j+31:j+16])
1012 /// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] - __b[j+63:j+48])
1013 /// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] - __b[j+95:j+80])
1014 /// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] - __b[j+127:j+112])
1018 /// \headerfile <immintrin.h>
1020 /// This intrinsic corresponds to the \c VPHSUBSW instruction.
1023 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1025 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1026 /// \returns A 256-bit vector of [16 x i16] containing the differences.
1027 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1028 _mm256_hsubs_epi16(__m256i __a
, __m256i __b
)
1030 return (__m256i
)__builtin_ia32_phsubsw256((__v16hi
)__a
, (__v16hi
)__b
);
1033 /// Multiplies each unsigned byte from the 256-bit integer vector in \a __a
1034 /// with the corresponding signed byte from the 256-bit integer vector in
1035 /// \a __b, forming signed 16-bit intermediate products. Adds adjacent
1036 /// pairs of those products using signed saturation to form 16-bit sums
1037 /// returned as elements of the [16 x i16] result.
1039 /// \code{.operation}
1040 /// FOR i := 0 TO 15
1042 /// temp1 := __a[j+7:j] * __b[j+7:j]
1043 /// temp2 := __a[j+15:j+8] * __b[j+15:j+8]
1044 /// result[j+15:j] := SATURATE16(temp1 + temp2)
1048 /// \headerfile <immintrin.h>
1050 /// This intrinsic corresponds to the \c VPMADDUBSW instruction.
1053 /// A 256-bit vector containing one of the source operands.
1055 /// A 256-bit vector containing one of the source operands.
1056 /// \returns A 256-bit vector of [16 x i16] containing the result.
1057 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1058 _mm256_maddubs_epi16(__m256i __a
, __m256i __b
)
1060 return (__m256i
)__builtin_ia32_pmaddubsw256((__v32qi
)__a
, (__v32qi
)__b
);
1063 /// Multiplies corresponding 16-bit elements of two 256-bit vectors of
1064 /// [16 x i16], forming 32-bit intermediate products, and adds pairs of
1065 /// those products to form 32-bit sums returned as elements of the
1066 /// [8 x i32] result.
1068 /// There is only one wraparound case: when all four of the 16-bit sources
1069 /// are \c 0x8000, the result will be \c 0x80000000.
1071 /// \code{.operation}
1074 /// temp1 := __a[j+15:j] * __b[j+15:j]
1075 /// temp2 := __a[j+31:j+16] * __b[j+31:j+16]
1076 /// result[j+31:j] := temp1 + temp2
1080 /// \headerfile <immintrin.h>
1082 /// This intrinsic corresponds to the \c VPMADDWD instruction.
1085 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1087 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1088 /// \returns A 256-bit vector of [8 x i32] containing the result.
1089 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1090 _mm256_madd_epi16(__m256i __a
, __m256i __b
)
1092 return (__m256i
)__builtin_ia32_pmaddwd256((__v16hi
)__a
, (__v16hi
)__b
);
1095 /// Compares the corresponding signed bytes in the two 256-bit integer vectors
1096 /// in \a __a and \a __b and returns the larger of each pair in the
1097 /// corresponding byte of the 256-bit result.
1099 /// \headerfile <immintrin.h>
1101 /// This intrinsic corresponds to the \c VPMAXSB instruction.
1104 /// A 256-bit integer vector.
1106 /// A 256-bit integer vector.
1107 /// \returns A 256-bit integer vector containing the result.
1108 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1109 _mm256_max_epi8(__m256i __a
, __m256i __b
)
1111 return (__m256i
)__builtin_elementwise_max((__v32qs
)__a
, (__v32qs
)__b
);
1114 /// Compares the corresponding signed 16-bit integers in the two 256-bit
1115 /// vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1116 /// each pair in the corresponding element of the 256-bit result.
1118 /// \headerfile <immintrin.h>
1120 /// This intrinsic corresponds to the \c VPMAXSW instruction.
1123 /// A 256-bit vector of [16 x i16].
1125 /// A 256-bit vector of [16 x i16].
1126 /// \returns A 256-bit vector of [16 x i16] containing the result.
1127 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1128 _mm256_max_epi16(__m256i __a
, __m256i __b
)
1130 return (__m256i
)__builtin_elementwise_max((__v16hi
)__a
, (__v16hi
)__b
);
1133 /// Compares the corresponding signed 32-bit integers in the two 256-bit
1134 /// vectors of [8 x i32] in \a __a and \a __b and returns the larger of
1135 /// each pair in the corresponding element of the 256-bit result.
1137 /// \headerfile <immintrin.h>
1139 /// This intrinsic corresponds to the \c VPMAXSD instruction.
1142 /// A 256-bit vector of [8 x i32].
1144 /// A 256-bit vector of [8 x i32].
1145 /// \returns A 256-bit vector of [8 x i32] containing the result.
1146 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1147 _mm256_max_epi32(__m256i __a
, __m256i __b
)
1149 return (__m256i
)__builtin_elementwise_max((__v8si
)__a
, (__v8si
)__b
);
1152 /// Compares the corresponding unsigned bytes in the two 256-bit integer
1153 /// vectors in \a __a and \a __b and returns the larger of each pair in
1154 /// the corresponding byte of the 256-bit result.
1156 /// \headerfile <immintrin.h>
1158 /// This intrinsic corresponds to the \c VPMAXUB instruction.
1161 /// A 256-bit integer vector.
1163 /// A 256-bit integer vector.
1164 /// \returns A 256-bit integer vector containing the result.
1165 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1166 _mm256_max_epu8(__m256i __a
, __m256i __b
)
1168 return (__m256i
)__builtin_elementwise_max((__v32qu
)__a
, (__v32qu
)__b
);
1171 /// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1172 /// vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1173 /// each pair in the corresponding element of the 256-bit result.
1175 /// \headerfile <immintrin.h>
1177 /// This intrinsic corresponds to the \c VPMAXUW instruction.
1180 /// A 256-bit vector of [16 x i16].
1182 /// A 256-bit vector of [16 x i16].
1183 /// \returns A 256-bit vector of [16 x i16] containing the result.
1184 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1185 _mm256_max_epu16(__m256i __a
, __m256i __b
)
1187 return (__m256i
)__builtin_elementwise_max((__v16hu
)__a
, (__v16hu
)__b
);
1190 /// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1191 /// vectors of [8 x i32] in \a __a and \a __b and returns the larger of
1192 /// each pair in the corresponding element of the 256-bit result.
1194 /// \headerfile <immintrin.h>
1196 /// This intrinsic corresponds to the \c VPMAXUD instruction.
1199 /// A 256-bit vector of [8 x i32].
1201 /// A 256-bit vector of [8 x i32].
1202 /// \returns A 256-bit vector of [8 x i32] containing the result.
1203 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1204 _mm256_max_epu32(__m256i __a
, __m256i __b
)
1206 return (__m256i
)__builtin_elementwise_max((__v8su
)__a
, (__v8su
)__b
);
1209 /// Compares the corresponding signed bytes in the two 256-bit integer vectors
1210 /// in \a __a and \a __b and returns the smaller of each pair in the
1211 /// corresponding byte of the 256-bit result.
1213 /// \headerfile <immintrin.h>
1215 /// This intrinsic corresponds to the \c VPMINSB instruction.
1218 /// A 256-bit integer vector.
1220 /// A 256-bit integer vector.
1221 /// \returns A 256-bit integer vector containing the result.
1222 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1223 _mm256_min_epi8(__m256i __a
, __m256i __b
)
1225 return (__m256i
)__builtin_elementwise_min((__v32qs
)__a
, (__v32qs
)__b
);
1228 /// Compares the corresponding signed 16-bit integers in the two 256-bit
1229 /// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1230 /// each pair in the corresponding element of the 256-bit result.
1232 /// \headerfile <immintrin.h>
1234 /// This intrinsic corresponds to the \c VPMINSW instruction.
1237 /// A 256-bit vector of [16 x i16].
1239 /// A 256-bit vector of [16 x i16].
1240 /// \returns A 256-bit vector of [16 x i16] containing the result.
1241 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1242 _mm256_min_epi16(__m256i __a
, __m256i __b
)
1244 return (__m256i
)__builtin_elementwise_min((__v16hi
)__a
, (__v16hi
)__b
);
1247 /// Compares the corresponding signed 32-bit integers in the two 256-bit
1248 /// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
1249 /// each pair in the corresponding element of the 256-bit result.
1251 /// \headerfile <immintrin.h>
1253 /// This intrinsic corresponds to the \c VPMINSD instruction.
1256 /// A 256-bit vector of [8 x i32].
1258 /// A 256-bit vector of [8 x i32].
1259 /// \returns A 256-bit vector of [8 x i32] containing the result.
1260 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1261 _mm256_min_epi32(__m256i __a
, __m256i __b
)
1263 return (__m256i
)__builtin_elementwise_min((__v8si
)__a
, (__v8si
)__b
);
1266 /// Compares the corresponding unsigned bytes in the two 256-bit integer
1267 /// vectors in \a __a and \a __b and returns the smaller of each pair in
1268 /// the corresponding byte of the 256-bit result.
1270 /// \headerfile <immintrin.h>
1272 /// This intrinsic corresponds to the \c VPMINUB instruction.
1275 /// A 256-bit integer vector.
1277 /// A 256-bit integer vector.
1278 /// \returns A 256-bit integer vector containing the result.
1279 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1280 _mm256_min_epu8(__m256i __a
, __m256i __b
)
1282 return (__m256i
)__builtin_elementwise_min((__v32qu
)__a
, (__v32qu
)__b
);
1285 /// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1286 /// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1287 /// each pair in the corresponding element of the 256-bit result.
1289 /// \headerfile <immintrin.h>
1291 /// This intrinsic corresponds to the \c VPMINUW instruction.
1294 /// A 256-bit vector of [16 x i16].
1296 /// A 256-bit vector of [16 x i16].
1297 /// \returns A 256-bit vector of [16 x i16] containing the result.
1298 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1299 _mm256_min_epu16(__m256i __a
, __m256i __b
)
1301 return (__m256i
)__builtin_elementwise_min((__v16hu
)__a
, (__v16hu
)__b
);
1304 /// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1305 /// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
1306 /// each pair in the corresponding element of the 256-bit result.
1308 /// \headerfile <immintrin.h>
1310 /// This intrinsic corresponds to the \c VPMINUD instruction.
1313 /// A 256-bit vector of [8 x i32].
1315 /// A 256-bit vector of [8 x i32].
1316 /// \returns A 256-bit vector of [8 x i32] containing the result.
1317 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1318 _mm256_min_epu32(__m256i __a
, __m256i __b
)
1320 return (__m256i
)__builtin_elementwise_min((__v8su
)__a
, (__v8su
)__b
);
1323 /// Creates a 32-bit integer mask from the most significant bit of each byte
1324 /// in the 256-bit integer vector in \a __a and returns the result.
1326 /// \code{.operation}
1327 /// FOR i := 0 TO 31
1329 /// result[i] := __a[j+7]
1333 /// \headerfile <immintrin.h>
1335 /// This intrinsic corresponds to the \c VPMOVMSKB instruction.
1338 /// A 256-bit integer vector containing the source bytes.
1339 /// \returns The 32-bit integer mask.
1340 static __inline__
int __DEFAULT_FN_ATTRS256
1341 _mm256_movemask_epi8(__m256i __a
)
1343 return __builtin_ia32_pmovmskb256((__v32qi
)__a
);
1346 /// Sign-extends bytes from the 128-bit integer vector in \a __V and returns
1347 /// the 16-bit values in the corresponding elements of a 256-bit vector
1350 /// \code{.operation}
1351 /// FOR i := 0 TO 15
1354 /// result[k+15:k] := SignExtend(__V[j+7:j])
1358 /// \headerfile <immintrin.h>
1360 /// This intrinsic corresponds to the \c VPMOVSXBW instruction.
1363 /// A 128-bit integer vector containing the source bytes.
1364 /// \returns A 256-bit vector of [16 x i16] containing the sign-extended
1366 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1367 _mm256_cvtepi8_epi16(__m128i __V
)
1369 /* This function always performs a signed extension, but __v16qi is a char
1370 which may be signed or unsigned, so use __v16qs. */
1371 return (__m256i
)__builtin_convertvector((__v16qs
)__V
, __v16hi
);
1374 /// Sign-extends bytes from the lower half of the 128-bit integer vector in
1375 /// \a __V and returns the 32-bit values in the corresponding elements of a
1376 /// 256-bit vector of [8 x i32].
1378 /// \code{.operation}
1382 /// result[k+31:k] := SignExtend(__V[j+7:j])
1386 /// \headerfile <immintrin.h>
1388 /// This intrinsic corresponds to the \c VPMOVSXBD instruction.
1391 /// A 128-bit integer vector containing the source bytes.
1392 /// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1394 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1395 _mm256_cvtepi8_epi32(__m128i __V
)
1397 /* This function always performs a signed extension, but __v16qi is a char
1398 which may be signed or unsigned, so use __v16qs. */
1399 return (__m256i
)__builtin_convertvector(__builtin_shufflevector((__v16qs
)__V
, (__v16qs
)__V
, 0, 1, 2, 3, 4, 5, 6, 7), __v8si
);
1402 /// Sign-extends the first four bytes from the 128-bit integer vector in
1403 /// \a __V and returns the 64-bit values in the corresponding elements of a
1404 /// 256-bit vector of [4 x i64].
1406 /// \code{.operation}
1407 /// result[63:0] := SignExtend(__V[7:0])
1408 /// result[127:64] := SignExtend(__V[15:8])
1409 /// result[191:128] := SignExtend(__V[23:16])
1410 /// result[255:192] := SignExtend(__V[31:24])
1413 /// \headerfile <immintrin.h>
1415 /// This intrinsic corresponds to the \c VPMOVSXBQ instruction.
1418 /// A 128-bit integer vector containing the source bytes.
1419 /// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1421 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1422 _mm256_cvtepi8_epi64(__m128i __V
)
1424 /* This function always performs a signed extension, but __v16qi is a char
1425 which may be signed or unsigned, so use __v16qs. */
1426 return (__m256i
)__builtin_convertvector(__builtin_shufflevector((__v16qs
)__V
, (__v16qs
)__V
, 0, 1, 2, 3), __v4di
);
1429 /// Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1430 /// \a __V and returns the 32-bit values in the corresponding elements of a
1431 /// 256-bit vector of [8 x i32].
1433 /// \code{.operation}
1437 /// result[k+31:k] := SignExtend(__V[j+15:j])
1441 /// \headerfile <immintrin.h>
1443 /// This intrinsic corresponds to the \c VPMOVSXWD instruction.
1446 /// A 128-bit vector of [8 x i16] containing the source values.
1447 /// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1449 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1450 _mm256_cvtepi16_epi32(__m128i __V
)
1452 return (__m256i
)__builtin_convertvector((__v8hi
)__V
, __v8si
);
1455 /// Sign-extends 16-bit elements from the lower half of the 128-bit vector of
1456 /// [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1457 /// elements of a 256-bit vector of [4 x i64].
1459 /// \code{.operation}
1460 /// result[63:0] := SignExtend(__V[15:0])
1461 /// result[127:64] := SignExtend(__V[31:16])
1462 /// result[191:128] := SignExtend(__V[47:32])
1463 /// result[255:192] := SignExtend(__V[63:48])
1466 /// \headerfile <immintrin.h>
1468 /// This intrinsic corresponds to the \c VPMOVSXWQ instruction.
1471 /// A 128-bit vector of [8 x i16] containing the source values.
1472 /// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1474 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1475 _mm256_cvtepi16_epi64(__m128i __V
)
1477 return (__m256i
)__builtin_convertvector(__builtin_shufflevector((__v8hi
)__V
, (__v8hi
)__V
, 0, 1, 2, 3), __v4di
);
1480 /// Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1481 /// \a __V and returns the 64-bit values in the corresponding elements of a
1482 /// 256-bit vector of [4 x i64].
1484 /// \code{.operation}
1485 /// result[63:0] := SignExtend(__V[31:0])
1486 /// result[127:64] := SignExtend(__V[63:32])
1487 /// result[191:128] := SignExtend(__V[95:64])
1488 /// result[255:192] := SignExtend(__V[127:96])
1491 /// \headerfile <immintrin.h>
1493 /// This intrinsic corresponds to the \c VPMOVSXDQ instruction.
1496 /// A 128-bit vector of [4 x i32] containing the source values.
1497 /// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1499 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1500 _mm256_cvtepi32_epi64(__m128i __V
)
1502 return (__m256i
)__builtin_convertvector((__v4si
)__V
, __v4di
);
1505 /// Zero-extends bytes from the 128-bit integer vector in \a __V and returns
1506 /// the 16-bit values in the corresponding elements of a 256-bit vector
1509 /// \code{.operation}
1510 /// FOR i := 0 TO 15
1513 /// result[k+15:k] := ZeroExtend(__V[j+7:j])
1517 /// \headerfile <immintrin.h>
1519 /// This intrinsic corresponds to the \c VPMOVZXBW instruction.
1522 /// A 128-bit integer vector containing the source bytes.
1523 /// \returns A 256-bit vector of [16 x i16] containing the zero-extended
1525 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1526 _mm256_cvtepu8_epi16(__m128i __V
)
1528 return (__m256i
)__builtin_convertvector((__v16qu
)__V
, __v16hi
);
1531 /// Zero-extends bytes from the lower half of the 128-bit integer vector in
1532 /// \a __V and returns the 32-bit values in the corresponding elements of a
1533 /// 256-bit vector of [8 x i32].
1535 /// \code{.operation}
1539 /// result[k+31:k] := ZeroExtend(__V[j+7:j])
1543 /// \headerfile <immintrin.h>
1545 /// This intrinsic corresponds to the \c VPMOVZXBD instruction.
1548 /// A 128-bit integer vector containing the source bytes.
1549 /// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1551 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1552 _mm256_cvtepu8_epi32(__m128i __V
)
1554 return (__m256i
)__builtin_convertvector(__builtin_shufflevector((__v16qu
)__V
, (__v16qu
)__V
, 0, 1, 2, 3, 4, 5, 6, 7), __v8si
);
1557 /// Zero-extends the first four bytes from the 128-bit integer vector in
1558 /// \a __V and returns the 64-bit values in the corresponding elements of a
1559 /// 256-bit vector of [4 x i64].
1561 /// \code{.operation}
1562 /// result[63:0] := ZeroExtend(__V[7:0])
1563 /// result[127:64] := ZeroExtend(__V[15:8])
1564 /// result[191:128] := ZeroExtend(__V[23:16])
1565 /// result[255:192] := ZeroExtend(__V[31:24])
1568 /// \headerfile <immintrin.h>
1570 /// This intrinsic corresponds to the \c VPMOVZXBQ instruction.
1573 /// A 128-bit integer vector containing the source bytes.
1574 /// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1576 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1577 _mm256_cvtepu8_epi64(__m128i __V
)
1579 return (__m256i
)__builtin_convertvector(__builtin_shufflevector((__v16qu
)__V
, (__v16qu
)__V
, 0, 1, 2, 3), __v4di
);
1582 /// Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1583 /// \a __V and returns the 32-bit values in the corresponding elements of a
1584 /// 256-bit vector of [8 x i32].
1586 /// \code{.operation}
1590 /// result[k+31:k] := ZeroExtend(__V[j+15:j])
1594 /// \headerfile <immintrin.h>
1596 /// This intrinsic corresponds to the \c VPMOVZXWD instruction.
1599 /// A 128-bit vector of [8 x i16] containing the source values.
1600 /// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1602 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1603 _mm256_cvtepu16_epi32(__m128i __V
)
1605 return (__m256i
)__builtin_convertvector((__v8hu
)__V
, __v8si
);
1608 /// Zero-extends 16-bit elements from the lower half of the 128-bit vector of
1609 /// [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1610 /// elements of a 256-bit vector of [4 x i64].
1612 /// \code{.operation}
1613 /// result[63:0] := ZeroExtend(__V[15:0])
1614 /// result[127:64] := ZeroExtend(__V[31:16])
1615 /// result[191:128] := ZeroExtend(__V[47:32])
1616 /// result[255:192] := ZeroExtend(__V[63:48])
1619 /// \headerfile <immintrin.h>
1621 /// This intrinsic corresponds to the \c VPMOVZXWQ instruction.
1624 /// A 128-bit vector of [8 x i16] containing the source values.
1625 /// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1627 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1628 _mm256_cvtepu16_epi64(__m128i __V
)
1630 return (__m256i
)__builtin_convertvector(__builtin_shufflevector((__v8hu
)__V
, (__v8hu
)__V
, 0, 1, 2, 3), __v4di
);
1633 /// Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1634 /// \a __V and returns the 64-bit values in the corresponding elements of a
1635 /// 256-bit vector of [4 x i64].
1637 /// \code{.operation}
1638 /// result[63:0] := ZeroExtend(__V[31:0])
1639 /// result[127:64] := ZeroExtend(__V[63:32])
1640 /// result[191:128] := ZeroExtend(__V[95:64])
1641 /// result[255:192] := ZeroExtend(__V[127:96])
1644 /// \headerfile <immintrin.h>
1646 /// This intrinsic corresponds to the \c VPMOVZXDQ instruction.
1649 /// A 128-bit vector of [4 x i32] containing the source values.
1650 /// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1652 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1653 _mm256_cvtepu32_epi64(__m128i __V
)
1655 return (__m256i
)__builtin_convertvector((__v4su
)__V
, __v4di
);
1658 /// Multiplies signed 32-bit integers from even-numbered elements of two
1659 /// 256-bit vectors of [8 x i32] and returns the 64-bit products in the
1660 /// [4 x i64] result.
1662 /// \code{.operation}
1663 /// result[63:0] := __a[31:0] * __b[31:0]
1664 /// result[127:64] := __a[95:64] * __b[95:64]
1665 /// result[191:128] := __a[159:128] * __b[159:128]
1666 /// result[255:192] := __a[223:192] * __b[223:192]
1669 /// \headerfile <immintrin.h>
1671 /// This intrinsic corresponds to the \c VPMULDQ instruction.
1674 /// A 256-bit vector of [8 x i32] containing one of the source operands.
1676 /// A 256-bit vector of [8 x i32] containing one of the source operands.
1677 /// \returns A 256-bit vector of [4 x i64] containing the products.
1678 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1679 _mm256_mul_epi32(__m256i __a
, __m256i __b
)
1681 return (__m256i
)__builtin_ia32_pmuldq256((__v8si
)__a
, (__v8si
)__b
);
1684 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1685 /// [16 x i16], truncates the 32-bit results to the most significant 18
1686 /// bits, rounds by adding 1, and returns bits [16:1] of each rounded
1687 /// product in the [16 x i16] result.
1689 /// \code{.operation}
1690 /// FOR i := 0 TO 15
1692 /// temp := ((__a[j+15:j] * __b[j+15:j]) >> 14) + 1
1693 /// result[j+15:j] := temp[16:1]
1696 /// \headerfile <immintrin.h>
1698 /// This intrinsic corresponds to the \c VPMULHRSW instruction.
1701 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1703 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1704 /// \returns A 256-bit vector of [16 x i16] containing the rounded products.
1705 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1706 _mm256_mulhrs_epi16(__m256i __a
, __m256i __b
)
1708 return (__m256i
)__builtin_ia32_pmulhrsw256((__v16hi
)__a
, (__v16hi
)__b
);
1711 /// Multiplies unsigned 16-bit integer elements of two 256-bit vectors of
1712 /// [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1713 /// [16 x i16] result.
1715 /// \headerfile <immintrin.h>
1717 /// This intrinsic corresponds to the \c VPMULHUW instruction.
1720 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1722 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1723 /// \returns A 256-bit vector of [16 x i16] containing the products.
1724 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1725 _mm256_mulhi_epu16(__m256i __a
, __m256i __b
)
1727 return (__m256i
)__builtin_ia32_pmulhuw256((__v16hi
)__a
, (__v16hi
)__b
);
1730 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1731 /// [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1732 /// [16 x i16] result.
1734 /// \headerfile <immintrin.h>
1736 /// This intrinsic corresponds to the \c VPMULHW instruction.
1739 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1741 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1742 /// \returns A 256-bit vector of [16 x i16] containing the products.
1743 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1744 _mm256_mulhi_epi16(__m256i __a
, __m256i __b
)
1746 return (__m256i
)__builtin_ia32_pmulhw256((__v16hi
)__a
, (__v16hi
)__b
);
1749 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1750 /// [16 x i16], and returns the lower 16 bits of each 32-bit product in the
1751 /// [16 x i16] result.
1753 /// \headerfile <immintrin.h>
1755 /// This intrinsic corresponds to the \c VPMULLW instruction.
1758 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1760 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1761 /// \returns A 256-bit vector of [16 x i16] containing the products.
1762 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1763 _mm256_mullo_epi16(__m256i __a
, __m256i __b
)
1765 return (__m256i
)((__v16hu
)__a
* (__v16hu
)__b
);
1768 /// Multiplies signed 32-bit integer elements of two 256-bit vectors of
1769 /// [8 x i32], and returns the lower 32 bits of each 64-bit product in the
1770 /// [8 x i32] result.
1772 /// \headerfile <immintrin.h>
1774 /// This intrinsic corresponds to the \c VPMULLD instruction.
1777 /// A 256-bit vector of [8 x i32] containing one of the source operands.
1779 /// A 256-bit vector of [8 x i32] containing one of the source operands.
1780 /// \returns A 256-bit vector of [8 x i32] containing the products.
1781 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1782 _mm256_mullo_epi32 (__m256i __a
, __m256i __b
)
1784 return (__m256i
)((__v8su
)__a
* (__v8su
)__b
);
1787 /// Multiplies unsigned 32-bit integers from even-numbered elements of two
1788 /// 256-bit vectors of [8 x i32] and returns the 64-bit products in the
1789 /// [4 x i64] result.
1791 /// \code{.operation}
1792 /// result[63:0] := __a[31:0] * __b[31:0]
1793 /// result[127:64] := __a[95:64] * __b[95:64]
1794 /// result[191:128] := __a[159:128] * __b[159:128]
1795 /// result[255:192] := __a[223:192] * __b[223:192]
1798 /// \headerfile <immintrin.h>
1800 /// This intrinsic corresponds to the \c VPMULUDQ instruction.
1803 /// A 256-bit vector of [8 x i32] containing one of the source operands.
1805 /// A 256-bit vector of [8 x i32] containing one of the source operands.
1806 /// \returns A 256-bit vector of [4 x i64] containing the products.
1807 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1808 _mm256_mul_epu32(__m256i __a
, __m256i __b
)
1810 return __builtin_ia32_pmuludq256((__v8si
)__a
, (__v8si
)__b
);
1813 /// Computes the bitwise OR of the 256-bit integer vectors in \a __a and
1816 /// \headerfile <immintrin.h>
1818 /// This intrinsic corresponds to the \c VPOR instruction.
1821 /// A 256-bit integer vector.
1823 /// A 256-bit integer vector.
1824 /// \returns A 256-bit integer vector containing the result.
1825 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1826 _mm256_or_si256(__m256i __a
, __m256i __b
)
1828 return (__m256i
)((__v4du
)__a
| (__v4du
)__b
);
1831 /// Computes four sum of absolute difference (SAD) operations on sets of eight
1832 /// unsigned 8-bit integers from the 256-bit integer vectors \a __a and
1835 /// One SAD result is computed for each set of eight bytes from \a __a and
1836 /// eight bytes from \a __b. The zero-extended SAD value is returned in the
1837 /// corresponding 64-bit element of the result.
1839 /// A single SAD operation takes the differences between the corresponding
1840 /// bytes of \a __a and \a __b, takes the absolute value of each difference,
1841 /// and sums these eight values to form one 16-bit result. This operation
1842 /// is repeated four times with successive sets of eight bytes.
1844 /// \code{.operation}
1847 /// temp0 := ABS(__a[j+7:j] - __b[j+7:j])
1848 /// temp1 := ABS(__a[j+15:j+8] - __b[j+15:j+8])
1849 /// temp2 := ABS(__a[j+23:j+16] - __b[j+23:j+16])
1850 /// temp3 := ABS(__a[j+31:j+24] - __b[j+31:j+24])
1851 /// temp4 := ABS(__a[j+39:j+32] - __b[j+39:j+32])
1852 /// temp5 := ABS(__a[j+47:j+40] - __b[j+47:j+40])
1853 /// temp6 := ABS(__a[j+55:j+48] - __b[j+55:j+48])
1854 /// temp7 := ABS(__a[j+63:j+56] - __b[j+63:j+56])
1855 /// result[j+15:j] := temp0 + temp1 + temp2 + temp3 +
1856 /// temp4 + temp5 + temp6 + temp7
1857 /// result[j+63:j+16] := 0
1861 /// \headerfile <immintrin.h>
1863 /// This intrinsic corresponds to the \c VPSADBW instruction.
1866 /// A 256-bit integer vector.
1868 /// A 256-bit integer vector.
1869 /// \returns A 256-bit integer vector containing the result.
1870 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1871 _mm256_sad_epu8(__m256i __a
, __m256i __b
)
1873 return __builtin_ia32_psadbw256((__v32qi
)__a
, (__v32qi
)__b
);
1876 /// Shuffles 8-bit integers in the 256-bit integer vector \a __a according
1877 /// to control information in the 256-bit integer vector \a __b, and
1878 /// returns the 256-bit result. In effect there are two separate 128-bit
1879 /// shuffles in the lower and upper halves.
1881 /// \code{.operation}
1882 /// FOR i := 0 TO 31
1884 /// IF __b[j+7] == 1
1885 /// result[j+7:j] := 0
1887 /// k := __b[j+3:j] * 8
1891 /// result[j+7:j] := __a[k+7:k]
1896 /// \headerfile <immintrin.h>
1898 /// This intrinsic corresponds to the \c VPSHUFB instruction.
1901 /// A 256-bit integer vector containing source values.
1903 /// A 256-bit integer vector containing control information to determine
1904 /// what goes into the corresponding byte of the result. If bit 7 of the
1905 /// control byte is 1, the result byte is 0; otherwise, bits 3:0 of the
1906 /// control byte specify the index (within the same 128-bit half) of \a __a
1907 /// to copy to the result byte.
1908 /// \returns A 256-bit integer vector containing the result.
1909 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1910 _mm256_shuffle_epi8(__m256i __a
, __m256i __b
)
1912 return (__m256i
)__builtin_ia32_pshufb256((__v32qi
)__a
, (__v32qi
)__b
);
1915 /// Shuffles 32-bit integers from the 256-bit vector of [8 x i32] in \a a
1916 /// according to control information in the integer literal \a imm, and
1917 /// returns the 256-bit result. In effect there are two parallel 128-bit
1918 /// shuffles in the lower and upper halves.
1920 /// \code{.operation}
1923 /// k := (imm >> i*2)[1:0] * 32
1924 /// result[j+31:j] := a[k+31:k]
1925 /// result[128+j+31:128+j] := a[128+k+31:128+k]
1929 /// \headerfile <immintrin.h>
1932 /// __m256i _mm256_shuffle_epi32(__m256i a, const int imm);
1935 /// This intrinsic corresponds to the \c VPSHUFD instruction.
1938 /// A 256-bit vector of [8 x i32] containing source values.
1940 /// An immediate 8-bit value specifying which elements to copy from \a a.
1941 /// \a imm[1:0] specifies the index in \a a for elements 0 and 4 of the
1942 /// result, \a imm[3:2] specifies the index for elements 1 and 5, and so
1944 /// \returns A 256-bit vector of [8 x i32] containing the result.
#define _mm256_shuffle_epi32(a, imm)                                           \
  ((__m256i)__builtin_ia32_pshufd256(                                          \
      (__v8si)(__m256i)(a),                                                    \
      (int)(imm)))
1948 /// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a
1949 /// according to control information in the integer literal \a imm, and
1950 /// returns the 256-bit result. The upper 64 bits of each 128-bit half
1951 /// are shuffled in parallel; the lower 64 bits of each 128-bit half are
1952 /// copied from \a a unchanged.
1954 /// \code{.operation}
1955 /// result[63:0] := a[63:0]
1956 /// result[191:128] := a[191:128]
1958 /// j := i * 16 + 64
1959 /// k := (imm >> i*2)[1:0] * 16 + 64
1960 /// result[j+15:j] := a[k+15:k]
1961 /// result[128+j+15:128+j] := a[128+k+15:128+k]
1965 /// \headerfile <immintrin.h>
1968 /// __m256i _mm256_shufflehi_epi16(__m256i a, const int imm);
1971 /// This intrinsic corresponds to the \c VPSHUFHW instruction.
1974 /// A 256-bit vector of [16 x i16] containing source values.
1976 /// An immediate 8-bit value specifying which elements to copy from \a a.
1977 /// \a imm[1:0] specifies the index in \a a for elements 4 and 12 of the
1978 /// result, \a imm[3:2] specifies the index for elements 5 and 13, and so
1979 /// forth. Indexes are offset by 4 (so 0 means index 4, and so forth).
1980 /// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_shufflehi_epi16(a, imm)                                         \
  ((__m256i)__builtin_ia32_pshufhw256(                                         \
      (__v16hi)(__m256i)(a),                                                   \
      (int)(imm)))
1984 /// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a
1985 /// according to control information in the integer literal \a imm, and
1986 /// returns the 256-bit [16 x i16] result. The lower 64 bits of each
1987 /// 128-bit half are shuffled; the upper 64 bits of each 128-bit half are
1988 /// copied from \a a unchanged.
1990 /// \code{.operation}
1991 /// result[127:64] := a[127:64]
1992 /// result[255:192] := a[255:192]
1995 /// k := (imm >> i*2)[1:0] * 16
1996 /// result[j+15:j] := a[k+15:k]
1997 /// result[128+j+15:128+j] := a[128+k+15:128+k]
2001 /// \headerfile <immintrin.h>
2004 /// __m256i _mm256_shufflelo_epi16(__m256i a, const int imm);
2007 /// This intrinsic corresponds to the \c VPSHUFLW instruction.
2010 /// A 256-bit vector of [16 x i16] to use as a source of data for the
2013 /// An immediate 8-bit value specifying which elements to copy from \a a.
2014 /// \a imm[1:0] specifies the index in \a a for elements 0 and 8 of the
2015 /// result, \a imm[3:2] specifies the index for elements 1 and 9, and so
2017 /// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_shufflelo_epi16(a, imm)                                         \
  ((__m256i)__builtin_ia32_pshuflw256(                                         \
      (__v16hi)(__m256i)(a),                                                   \
      (int)(imm)))
2021 /// Sets each byte of the result to the corresponding byte of the 256-bit
2022 /// integer vector in \a __a, the negative of that byte, or zero, depending
2023 /// on whether the corresponding byte of the 256-bit integer vector in
2024 /// \a __b is greater than zero, less than zero, or equal to zero,
2027 /// \headerfile <immintrin.h>
2029 /// This intrinsic corresponds to the \c VPSIGNB instruction.
2032 /// A 256-bit integer vector.
2034 /// A 256-bit integer vector.
2035 /// \returns A 256-bit integer vector containing the result.
2036 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2037 _mm256_sign_epi8(__m256i __a
, __m256i __b
)
2039 return (__m256i
)__builtin_ia32_psignb256((__v32qi
)__a
, (__v32qi
)__b
);
2042 /// Sets each element of the result to the corresponding element of the
2043 /// 256-bit vector of [16 x i16] in \a __a, the negative of that element,
2044 /// or zero, depending on whether the corresponding element of the 256-bit
2045 /// vector of [16 x i16] in \a __b is greater than zero, less than zero, or
2046 /// equal to zero, respectively.
2048 /// \headerfile <immintrin.h>
2050 /// This intrinsic corresponds to the \c VPSIGNW instruction.
2053 /// A 256-bit vector of [16 x i16].
2055 /// A 256-bit vector of [16 x i16].
2056 /// \returns A 256-bit vector of [16 x i16] containing the result.
2057 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2058 _mm256_sign_epi16(__m256i __a
, __m256i __b
)
2060 return (__m256i
)__builtin_ia32_psignw256((__v16hi
)__a
, (__v16hi
)__b
);
2063 /// Sets each element of the result to the corresponding element of the
2064 /// 256-bit vector of [8 x i32] in \a __a, the negative of that element, or
2065 /// zero, depending on whether the corresponding element of the 256-bit
2066 /// vector of [8 x i32] in \a __b is greater than zero, less than zero, or
2067 /// equal to zero, respectively.
2069 /// \headerfile <immintrin.h>
2071 /// This intrinsic corresponds to the \c VPSIGND instruction.
2074 /// A 256-bit vector of [8 x i32].
2076 /// A 256-bit vector of [8 x i32].
2077 /// \returns A 256-bit vector of [8 x i32] containing the result.
2078 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2079 _mm256_sign_epi32(__m256i __a
, __m256i __b
)
2081 return (__m256i
)__builtin_ia32_psignd256((__v8si
)__a
, (__v8si
)__b
);
2084 /// Shifts each 128-bit half of the 256-bit integer vector \a a left by
2085 /// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
2086 /// is greater than 15, the returned result is all zeroes.
2088 /// \headerfile <immintrin.h>
2091 /// __m256i _mm256_slli_si256(__m256i a, const int imm);
2094 /// This intrinsic corresponds to the \c VPSLLDQ instruction.
2097 /// A 256-bit integer vector to be shifted.
2099 /// An unsigned immediate value specifying the shift count (in bytes).
2100 /// \returns A 256-bit integer vector containing the result.
#define _mm256_slli_si256(a, imm)                                              \
  ((__m256i)__builtin_ia32_pslldqi256_byteshift(                               \
      (__v4di)(__m256i)(a),                                                    \
      (int)(imm)))
2104 /// Shifts each 128-bit half of the 256-bit integer vector \a a left by
2105 /// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
2106 /// is greater than 15, the returned result is all zeroes.
2108 /// \headerfile <immintrin.h>
2111 /// __m256i _mm256_bslli_epi128(__m256i a, const int imm);
2114 /// This intrinsic corresponds to the \c VPSLLDQ instruction.
2117 /// A 256-bit integer vector to be shifted.
2119 /// An unsigned immediate value specifying the shift count (in bytes).
2120 /// \returns A 256-bit integer vector containing the result.
#define _mm256_bslli_epi128(a, imm)                                            \
  ((__m256i)__builtin_ia32_pslldqi256_byteshift(                               \
      (__v4di)(__m256i)(a),                                                    \
      (int)(imm)))
2124 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2125 /// left by \a __count bits, shifting in zero bits, and returns the result.
2126 /// If \a __count is greater than 15, the returned result is all zeroes.
2128 /// \headerfile <immintrin.h>
2130 /// This intrinsic corresponds to the \c VPSLLW instruction.
2133 /// A 256-bit vector of [16 x i16] to be shifted.
2135 /// An unsigned integer value specifying the shift count (in bits).
2136 /// \returns A 256-bit vector of [16 x i16] containing the result.
2137 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2138 _mm256_slli_epi16(__m256i __a
, int __count
)
2140 return (__m256i
)__builtin_ia32_psllwi256((__v16hi
)__a
, __count
);
2143 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2144 /// left by the number of bits specified by the lower 64 bits of \a __count,
2145 /// shifting in zero bits, and returns the result. If \a __count is greater
2146 /// than 15, the returned result is all zeroes.
2148 /// \headerfile <immintrin.h>
2150 /// This intrinsic corresponds to the \c VPSLLW instruction.
2153 /// A 256-bit vector of [16 x i16] to be shifted.
2155 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2156 /// shift count (in bits). The upper element is ignored.
2157 /// \returns A 256-bit vector of [16 x i16] containing the result.
2158 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2159 _mm256_sll_epi16(__m256i __a
, __m128i __count
)
2161 return (__m256i
)__builtin_ia32_psllw256((__v16hi
)__a
, (__v8hi
)__count
);
2164 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2165 /// left by \a __count bits, shifting in zero bits, and returns the result.
2166 /// If \a __count is greater than 31, the returned result is all zeroes.
2168 /// \headerfile <immintrin.h>
2170 /// This intrinsic corresponds to the \c VPSLLD instruction.
2173 /// A 256-bit vector of [8 x i32] to be shifted.
2175 /// An unsigned integer value specifying the shift count (in bits).
2176 /// \returns A 256-bit vector of [8 x i32] containing the result.
2177 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2178 _mm256_slli_epi32(__m256i __a
, int __count
)
2180 return (__m256i
)__builtin_ia32_pslldi256((__v8si
)__a
, __count
);
2183 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2184 /// left by the number of bits given in the lower 64 bits of \a __count,
2185 /// shifting in zero bits, and returns the result. If \a __count is greater
2186 /// than 31, the returned result is all zeroes.
2188 /// \headerfile <immintrin.h>
2190 /// This intrinsic corresponds to the \c VPSLLD instruction.
2193 /// A 256-bit vector of [8 x i32] to be shifted.
2195 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2196 /// shift count (in bits). The upper element is ignored.
2197 /// \returns A 256-bit vector of [8 x i32] containing the result.
2198 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2199 _mm256_sll_epi32(__m256i __a
, __m128i __count
)
2201 return (__m256i
)__builtin_ia32_pslld256((__v8si
)__a
, (__v4si
)__count
);
2204 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2205 /// left by \a __count bits, shifting in zero bits, and returns the result.
2206 /// If \a __count is greater than 63, the returned result is all zeroes.
2208 /// \headerfile <immintrin.h>
2210 /// This intrinsic corresponds to the \c VPSLLQ instruction.
2213 /// A 256-bit vector of [4 x i64] to be shifted.
2215 /// An unsigned integer value specifying the shift count (in bits).
2216 /// \returns A 256-bit vector of [4 x i64] containing the result.
2217 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2218 _mm256_slli_epi64(__m256i __a
, int __count
)
2220 return __builtin_ia32_psllqi256((__v4di
)__a
, __count
);
2223 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2224 /// left by the number of bits given in the lower 64 bits of \a __count,
2225 /// shifting in zero bits, and returns the result. If \a __count is greater
2226 /// than 63, the returned result is all zeroes.
2228 /// \headerfile <immintrin.h>
2230 /// This intrinsic corresponds to the \c VPSLLQ instruction.
2233 /// A 256-bit vector of [4 x i64] to be shifted.
2235 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2236 /// shift count (in bits). The upper element is ignored.
2237 /// \returns A 256-bit vector of [4 x i64] containing the result.
2238 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2239 _mm256_sll_epi64(__m256i __a
, __m128i __count
)
2241 return __builtin_ia32_psllq256((__v4di
)__a
, __count
);
2244 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2245 /// right by \a __count bits, shifting in sign bits, and returns the result.
2246 /// If \a __count is greater than 15, each element of the result is either
2247 /// 0 or -1 according to the corresponding input sign bit.
2249 /// \headerfile <immintrin.h>
2251 /// This intrinsic corresponds to the \c VPSRAW instruction.
2254 /// A 256-bit vector of [16 x i16] to be shifted.
2256 /// An unsigned integer value specifying the shift count (in bits).
2257 /// \returns A 256-bit vector of [16 x i16] containing the result.
2258 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2259 _mm256_srai_epi16(__m256i __a
, int __count
)
2261 return (__m256i
)__builtin_ia32_psrawi256((__v16hi
)__a
, __count
);
2264 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2265 /// right by the number of bits given in the lower 64 bits of \a __count,
2266 /// shifting in sign bits, and returns the result. If \a __count is greater
2267 /// than 15, each element of the result is either 0 or -1 according to the
2268 /// corresponding input sign bit.
2270 /// \headerfile <immintrin.h>
2272 /// This intrinsic corresponds to the \c VPSRAW instruction.
2275 /// A 256-bit vector of [16 x i16] to be shifted.
2277 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2278 /// shift count (in bits). The upper element is ignored.
2279 /// \returns A 256-bit vector of [16 x i16] containing the result.
2280 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2281 _mm256_sra_epi16(__m256i __a
, __m128i __count
)
2283 return (__m256i
)__builtin_ia32_psraw256((__v16hi
)__a
, (__v8hi
)__count
);
2286 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2287 /// right by \a __count bits, shifting in sign bits, and returns the result.
2288 /// If \a __count is greater than 31, each element of the result is either
2289 /// 0 or -1 according to the corresponding input sign bit.
2291 /// \headerfile <immintrin.h>
2293 /// This intrinsic corresponds to the \c VPSRAD instruction.
2296 /// A 256-bit vector of [8 x i32] to be shifted.
2298 /// An unsigned integer value specifying the shift count (in bits).
2299 /// \returns A 256-bit vector of [8 x i32] containing the result.
2300 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2301 _mm256_srai_epi32(__m256i __a
, int __count
)
2303 return (__m256i
)__builtin_ia32_psradi256((__v8si
)__a
, __count
);
2306 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2307 /// right by the number of bits given in the lower 64 bits of \a __count,
2308 /// shifting in sign bits, and returns the result. If \a __count is greater
2309 /// than 31, each element of the result is either 0 or -1 according to the
2310 /// corresponding input sign bit.
2312 /// \headerfile <immintrin.h>
2314 /// This intrinsic corresponds to the \c VPSRAD instruction.
2317 /// A 256-bit vector of [8 x i32] to be shifted.
2319 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2320 /// shift count (in bits). The upper element is ignored.
2321 /// \returns A 256-bit vector of [8 x i32] containing the result.
2322 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2323 _mm256_sra_epi32(__m256i __a
, __m128i __count
)
2325 return (__m256i
)__builtin_ia32_psrad256((__v8si
)__a
, (__v4si
)__count
);
2328 /// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
2329 /// \a imm bytes, shifting in zero bytes, and returns the result. If
2330 /// \a imm is greater than 15, the returned result is all zeroes.
2332 /// \headerfile <immintrin.h>
2335 /// __m256i _mm256_srli_si256(__m256i a, const int imm);
2338 /// This intrinsic corresponds to the \c VPSRLDQ instruction.
2341 /// A 256-bit integer vector to be shifted.
2343 /// An unsigned immediate value specifying the shift count (in bytes).
2344 /// \returns A 256-bit integer vector containing the result.
#define _mm256_srli_si256(a, imm)                                              \
  ((__m256i)__builtin_ia32_psrldqi256_byteshift(                               \
      (__m256i)(a),                                                            \
      (int)(imm)))
2348 /// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
2349 /// \a imm bytes, shifting in zero bytes, and returns the result. If
2350 /// \a imm is greater than 15, the returned result is all zeroes.
2352 /// \headerfile <immintrin.h>
2355 /// __m256i _mm256_bsrli_epi128(__m256i a, const int imm);
2358 /// This intrinsic corresponds to the \c VPSRLDQ instruction.
2361 /// A 256-bit integer vector to be shifted.
2363 /// An unsigned immediate value specifying the shift count (in bytes).
2364 /// \returns A 256-bit integer vector containing the result.
#define _mm256_bsrli_epi128(a, imm)                                            \
  ((__m256i)__builtin_ia32_psrldqi256_byteshift(                               \
      (__m256i)(a),                                                            \
      (int)(imm)))
2368 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2369 /// right by \a __count bits, shifting in zero bits, and returns the result.
2370 /// If \a __count is greater than 15, the returned result is all zeroes.
2372 /// \headerfile <immintrin.h>
2374 /// This intrinsic corresponds to the \c VPSRLW instruction.
2377 /// A 256-bit vector of [16 x i16] to be shifted.
2379 /// An unsigned integer value specifying the shift count (in bits).
2380 /// \returns A 256-bit vector of [16 x i16] containing the result.
2381 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2382 _mm256_srli_epi16(__m256i __a
, int __count
)
2384 return (__m256i
)__builtin_ia32_psrlwi256((__v16hi
)__a
, __count
);
2387 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2388 /// right by the number of bits given in the lower 64 bits of \a __count,
2389 /// shifting in zero bits, and returns the result. If \a __count is greater
2390 /// than 15, the returned result is all zeroes.
2392 /// \headerfile <immintrin.h>
2394 /// This intrinsic corresponds to the \c VPSRLW instruction.
2397 /// A 256-bit vector of [16 x i16] to be shifted.
2399 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2400 /// shift count (in bits). The upper element is ignored.
2401 /// \returns A 256-bit vector of [16 x i16] containing the result.
2402 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2403 _mm256_srl_epi16(__m256i __a
, __m128i __count
)
2405 return (__m256i
)__builtin_ia32_psrlw256((__v16hi
)__a
, (__v8hi
)__count
);
2408 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2409 /// right by \a __count bits, shifting in zero bits, and returns the result.
2410 /// If \a __count is greater than 31, the returned result is all zeroes.
2412 /// \headerfile <immintrin.h>
2414 /// This intrinsic corresponds to the \c VPSRLD instruction.
2417 /// A 256-bit vector of [8 x i32] to be shifted.
2419 /// An unsigned integer value specifying the shift count (in bits).
2420 /// \returns A 256-bit vector of [8 x i32] containing the result.
2421 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2422 _mm256_srli_epi32(__m256i __a
, int __count
)
2424 return (__m256i
)__builtin_ia32_psrldi256((__v8si
)__a
, __count
);
2427 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2428 /// right by the number of bits given in the lower 64 bits of \a __count,
2429 /// shifting in zero bits, and returns the result. If \a __count is greater
2430 /// than 31, the returned result is all zeroes.
2432 /// \headerfile <immintrin.h>
2434 /// This intrinsic corresponds to the \c VPSRLD instruction.
2437 /// A 256-bit vector of [8 x i32] to be shifted.
2439 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2440 /// shift count (in bits). The upper element is ignored.
2441 /// \returns A 256-bit vector of [8 x i32] containing the result.
2442 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2443 _mm256_srl_epi32(__m256i __a
, __m128i __count
)
2445 return (__m256i
)__builtin_ia32_psrld256((__v8si
)__a
, (__v4si
)__count
);
2448 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2449 /// right by \a __count bits, shifting in zero bits, and returns the result.
2450 /// If \a __count is greater than 63, the returned result is all zeroes.
2452 /// \headerfile <immintrin.h>
2454 /// This intrinsic corresponds to the \c VPSRLQ instruction.
2457 /// A 256-bit vector of [4 x i64] to be shifted.
2459 /// An unsigned integer value specifying the shift count (in bits).
2460 /// \returns A 256-bit vector of [4 x i64] containing the result.
2461 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2462 _mm256_srli_epi64(__m256i __a
, int __count
)
2464 return __builtin_ia32_psrlqi256((__v4di
)__a
, __count
);
2467 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2468 /// right by the number of bits given in the lower 64 bits of \a __count,
2469 /// shifting in zero bits, and returns the result. If \a __count is greater
2470 /// than 63, the returned result is all zeroes.
2472 /// \headerfile <immintrin.h>
2474 /// This intrinsic corresponds to the \c VPSRLQ instruction.
2477 /// A 256-bit vector of [4 x i64] to be shifted.
2479 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2480 /// shift count (in bits). The upper element is ignored.
2481 /// \returns A 256-bit vector of [4 x i64] containing the result.
2482 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2483 _mm256_srl_epi64(__m256i __a
, __m128i __count
)
2485 return __builtin_ia32_psrlq256((__v4di
)__a
, __count
);
2488 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2489 /// vectors. Returns the lower 8 bits of each difference in the
2490 /// corresponding byte of the 256-bit integer vector result (overflow is
2493 /// \code{.operation}
2494 /// FOR i := 0 TO 31
2496 /// result[j+7:j] := __a[j+7:j] - __b[j+7:j]
2500 /// \headerfile <immintrin.h>
2502 /// This intrinsic corresponds to the \c VPSUBB instruction.
2505 /// A 256-bit integer vector containing the minuends.
2507 /// A 256-bit integer vector containing the subtrahends.
2508 /// \returns A 256-bit integer vector containing the differences.
2509 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2510 _mm256_sub_epi8(__m256i __a
, __m256i __b
)
2512 return (__m256i
)((__v32qu
)__a
- (__v32qu
)__b
);
2515 /// Subtracts 16-bit integers from corresponding elements of two 256-bit
2516 /// vectors of [16 x i16]. Returns the lower 16 bits of each difference in
2517 /// the corresponding element of the [16 x i16] result (overflow is
2520 /// \code{.operation}
2521 /// FOR i := 0 TO 15
2523 /// result[j+15:j] := __a[j+15:j] - __b[j+15:j]
2527 /// \headerfile <immintrin.h>
2529 /// This intrinsic corresponds to the \c VPSUBW instruction.
2532 /// A 256-bit vector of [16 x i16] containing the minuends.
2534 /// A 256-bit vector of [16 x i16] containing the subtrahends.
2535 /// \returns A 256-bit vector of [16 x i16] containing the differences.
2536 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2537 _mm256_sub_epi16(__m256i __a
, __m256i __b
)
2539 return (__m256i
)((__v16hu
)__a
- (__v16hu
)__b
);
2542 /// Subtracts 32-bit integers from corresponding elements of two 256-bit
2543 /// vectors of [8 x i32]. Returns the lower 32 bits of each difference in
2544 /// the corresponding element of the [8 x i32] result (overflow is ignored).
2546 /// \code{.operation}
2549 /// result[j+31:j] := __a[j+31:j] - __b[j+31:j]
2553 /// \headerfile <immintrin.h>
2555 /// This intrinsic corresponds to the \c VPSUBD instruction.
2558 /// A 256-bit vector of [8 x i32] containing the minuends.
2560 /// A 256-bit vector of [8 x i32] containing the subtrahends.
2561 /// \returns A 256-bit vector of [8 x i32] containing the differences.
2562 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2563 _mm256_sub_epi32(__m256i __a
, __m256i __b
)
2565 return (__m256i
)((__v8su
)__a
- (__v8su
)__b
);
2568 /// Subtracts 64-bit integers from corresponding elements of two 256-bit
2569 /// vectors of [4 x i64]. Returns the lower 64 bits of each difference in
2570 /// the corresponding element of the [4 x i64] result (overflow is ignored).
2572 /// \code{.operation}
2575 /// result[j+63:j] := __a[j+63:j] - __b[j+63:j]
2579 /// \headerfile <immintrin.h>
2581 /// This intrinsic corresponds to the \c VPSUBQ instruction.
2584 /// A 256-bit vector of [4 x i64] containing the minuends.
2586 /// A 256-bit vector of [4 x i64] containing the subtrahends.
2587 /// \returns A 256-bit vector of [4 x i64] containing the differences.
2588 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2589 _mm256_sub_epi64(__m256i __a
, __m256i __b
)
2591 return (__m256i
)((__v4du
)__a
- (__v4du
)__b
);
2594 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2595 /// vectors using signed saturation, and returns each difference in the
2596 /// corresponding byte of the 256-bit integer vector result.
2598 /// \code{.operation}
2599 /// FOR i := 0 TO 31
2601 /// result[j+7:j] := SATURATE8(__a[j+7:j] - __b[j+7:j])
2605 /// \headerfile <immintrin.h>
2607 /// This intrinsic corresponds to the \c VPSUBSB instruction.
2610 /// A 256-bit integer vector containing the minuends.
2612 /// A 256-bit integer vector containing the subtrahends.
2613 /// \returns A 256-bit integer vector containing the differences.
2614 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2615 _mm256_subs_epi8(__m256i __a
, __m256i __b
)
2617 return (__m256i
)__builtin_elementwise_sub_sat((__v32qs
)__a
, (__v32qs
)__b
);
2620 /// Subtracts 16-bit integers from corresponding elements of two 256-bit
2621 /// vectors of [16 x i16] using signed saturation, and returns each
2622 /// difference in the corresponding element of the [16 x i16] result.
2624 /// \code{.operation}
2625 /// FOR i := 0 TO 15
2627 /// result[j+15:j] := SATURATE16(__a[j+15:j] - __b[j+15:j])
2631 /// \headerfile <immintrin.h>
2633 /// This intrinsic corresponds to the \c VPSUBSW instruction.
2636 /// A 256-bit vector of [16 x i16] containing the minuends.
2638 /// A 256-bit vector of [16 x i16] containing the subtrahends.
2639 /// \returns A 256-bit vector of [16 x i16] containing the differences.
2640 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2641 _mm256_subs_epi16(__m256i __a
, __m256i __b
)
2643 return (__m256i
)__builtin_elementwise_sub_sat((__v16hi
)__a
, (__v16hi
)__b
);
2646 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2647 /// vectors using unsigned saturation, and returns each difference in the
2648 /// corresponding byte of the 256-bit integer vector result. For each byte,
2649 /// computes <c> result = __a - __b </c>.
2651 /// \code{.operation}
2652 /// FOR i := 0 TO 31
2654 /// result[j+7:j] := SATURATE8U(__a[j+7:j] - __b[j+7:j])
2658 /// \headerfile <immintrin.h>
2660 /// This intrinsic corresponds to the \c VPSUBUSB instruction.
2663 /// A 256-bit integer vector containing the minuends.
2665 /// A 256-bit integer vector containing the subtrahends.
2666 /// \returns A 256-bit integer vector containing the differences.
2667 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2668 _mm256_subs_epu8(__m256i __a
, __m256i __b
)
2670 return (__m256i
)__builtin_elementwise_sub_sat((__v32qu
)__a
, (__v32qu
)__b
);
2673 /// Subtracts 16-bit integers from corresponding elements of two 256-bit
2674 /// vectors of [16 x i16] using unsigned saturation, and returns each
2675 /// difference in the corresponding element of the [16 x i16] result.
2677 /// \code{.operation}
2678 /// FOR i := 0 TO 15
2680 /// result[j+15:j] := SATURATE16U(__a[j+15:j] - __b[j+15:j])
2684 /// \headerfile <immintrin.h>
2686 /// This intrinsic corresponds to the \c VPSUBUSW instruction.
2689 /// A 256-bit vector of [16 x i16] containing the minuends.
2691 /// A 256-bit vector of [16 x i16] containing the subtrahends.
2692 /// \returns A 256-bit vector of [16 x i16] containing the differences.
2693 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2694 _mm256_subs_epu16(__m256i __a
, __m256i __b
)
2696 return (__m256i
)__builtin_elementwise_sub_sat((__v16hu
)__a
, (__v16hu
)__b
);
2699 /// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2700 /// vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2701 /// uses the upper 64 bits of each 128-bit half of \a __a and \a __b as
2702 /// input; other bits in these parameters are ignored.
2704 /// \code{.operation}
2705 /// result[7:0] := __a[71:64]
2706 /// result[15:8] := __b[71:64]
2707 /// result[23:16] := __a[79:72]
2708 /// result[31:24] := __b[79:72]
2710 /// result[127:120] := __b[127:120]
2711 /// result[135:128] := __a[199:192]
2713 /// result[255:248] := __b[255:248]
2716 /// \headerfile <immintrin.h>
2718 /// This intrinsic corresponds to the \c VPUNPCKHBW instruction.
2721 /// A 256-bit integer vector used as the source for the even-numbered bytes
2724 /// A 256-bit integer vector used as the source for the odd-numbered bytes
2726 /// \returns A 256-bit integer vector containing the result.
2727 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2728 _mm256_unpackhi_epi8(__m256i __a
, __m256i __b
)
2730 return (__m256i
)__builtin_shufflevector((__v32qi
)__a
, (__v32qi
)__b
, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
2733 /// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2734 /// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2735 /// vector of [16 x i16]. Specifically, uses the upper 64 bits of each
2736 /// 128-bit half of \a __a and \a __b as input; other bits in these
2737 /// parameters are ignored.
2739 /// \code{.operation}
2740 /// result[15:0] := __a[79:64]
2741 /// result[31:16] := __b[79:64]
2742 /// result[47:32] := __a[95:80]
2743 /// result[63:48] := __b[95:80]
2745 /// result[127:112] := __b[127:112]
2746 /// result[143:128] := __a[207:192]
2748 /// result[255:240] := __b[255:240]
2751 /// \headerfile <immintrin.h>
2753 /// This intrinsic corresponds to the \c VPUNPCKHWD instruction.
2756 /// A 256-bit vector of [16 x i16] used as the source for the even-numbered
2757 /// elements of the result.
2759 /// A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2760 /// elements of the result.
2761 /// \returns A 256-bit vector of [16 x i16] containing the result.
2762 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2763 _mm256_unpackhi_epi16(__m256i __a
, __m256i __b
)
2765 return (__m256i
)__builtin_shufflevector((__v16hi
)__a
, (__v16hi
)__b
, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
2768 /// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2769 /// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2770 /// of [8 x i32]. Specifically, uses the upper 64 bits of each 128-bit half
2771 /// of \a __a and \a __b as input; other bits in these parameters are
2774 /// \code{.operation}
2775 /// result[31:0] := __a[95:64]
2776 /// result[63:32] := __b[95:64]
2777 /// result[95:64] := __a[127:96]
2778 /// result[127:96] := __b[127:96]
2779 /// result[159:128] := __a[223:192]
2780 /// result[191:160] := __b[223:192]
2781 /// result[223:192] := __a[255:224]
2782 /// result[255:224] := __b[255:224]
2785 /// \headerfile <immintrin.h>
2787 /// This intrinsic corresponds to the \c VPUNPCKHDQ instruction.
2790 /// A 256-bit vector of [8 x i32] used as the source for the even-numbered
2791 /// elements of the result.
2793 /// A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2794 /// elements of the result.
2795 /// \returns A 256-bit vector of [8 x i32] containing the result.
2796 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2797 _mm256_unpackhi_epi32(__m256i __a
, __m256i __b
)
2799 return (__m256i
)__builtin_shufflevector((__v8si
)__a
, (__v8si
)__b
, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);
2802 /// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2803 /// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2804 /// of [4 x i64]. Specifically, uses the upper 64 bits of each 128-bit half
2805 /// of \a __a and \a __b as input; other bits in these parameters are
2808 /// \code{.operation}
2809 /// result[63:0] := __a[127:64]
2810 /// result[127:64] := __b[127:64]
2811 /// result[191:128] := __a[255:192]
2812 /// result[255:192] := __b[255:192]
2815 /// \headerfile <immintrin.h>
2817 /// This intrinsic corresponds to the \c VPUNPCKHQDQ instruction.
2820 /// A 256-bit vector of [4 x i64] used as the source for the even-numbered
2821 /// elements of the result.
2823 /// A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2824 /// elements of the result.
2825 /// \returns A 256-bit vector of [4 x i64] containing the result.
2826 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2827 _mm256_unpackhi_epi64(__m256i __a
, __m256i __b
)
2829 return (__m256i
)__builtin_shufflevector((__v4di
)__a
, (__v4di
)__b
, 1, 4+1, 3, 4+3);
2832 /// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2833 /// vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2834 /// uses the lower 64 bits of each 128-bit half of \a __a and \a __b as
2835 /// input; other bits in these parameters are ignored.
2837 /// \code{.operation}
2838 /// result[7:0] := __a[7:0]
2839 /// result[15:8] := __b[7:0]
2840 /// result[23:16] := __a[15:8]
2841 /// result[31:24] := __b[15:8]
2843 /// result[127:120] := __b[63:56]
2844 /// result[135:128] := __a[135:128]
2846 /// result[255:248] := __b[191:184]
2849 /// \headerfile <immintrin.h>
2851 /// This intrinsic corresponds to the \c VPUNPCKLBW instruction.
2854 /// A 256-bit integer vector used as the source for the even-numbered bytes
2857 /// A 256-bit integer vector used as the source for the odd-numbered bytes
2859 /// \returns A 256-bit integer vector containing the result.
2860 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2861 _mm256_unpacklo_epi8(__m256i __a
, __m256i __b
)
2863 return (__m256i
)__builtin_shufflevector((__v32qi
)__a
, (__v32qi
)__b
, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
2866 /// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2867 /// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2868 /// vector of [16 x i16]. Specifically, uses the lower 64 bits of each
2869 /// 128-bit half of \a __a and \a __b as input; other bits in these
2870 /// parameters are ignored.
2872 /// \code{.operation}
2873 /// result[15:0] := __a[15:0]
2874 /// result[31:16] := __b[15:0]
2875 /// result[47:32] := __a[31:16]
2876 /// result[63:48] := __b[31:16]
2878 /// result[127:112] := __b[63:48]
2879 /// result[143:128] := __a[143:128]
2881 /// result[255:240] := __b[191:176]
2884 /// \headerfile <immintrin.h>
2886 /// This intrinsic corresponds to the \c VPUNPCKLWD instruction.
2889 /// A 256-bit vector of [16 x i16] used as the source for the even-numbered
2890 /// elements of the result.
2892 /// A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2893 /// elements of the result.
2894 /// \returns A 256-bit vector of [16 x i16] containing the result.
2895 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2896 _mm256_unpacklo_epi16(__m256i __a
, __m256i __b
)
2898 return (__m256i
)__builtin_shufflevector((__v16hi
)__a
, (__v16hi
)__b
, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
2901 /// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2902 /// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2903 /// of [8 x i32]. Specifically, uses the lower 64 bits of each 128-bit half
2904 /// of \a __a and \a __b as input; other bits in these parameters are
2907 /// \code{.operation}
2908 /// result[31:0] := __a[31:0]
2909 /// result[63:32] := __b[31:0]
2910 /// result[95:64] := __a[63:32]
2911 /// result[127:96] := __b[63:32]
2912 /// result[159:128] := __a[159:128]
2913 /// result[191:160] := __b[159:128]
2914 /// result[223:192] := __a[191:160]
2915 /// result[255:224] := __b[191:160]
2918 /// \headerfile <immintrin.h>
2920 /// This intrinsic corresponds to the \c VPUNPCKLDQ instruction.
2923 /// A 256-bit vector of [8 x i32] used as the source for the even-numbered
2924 /// elements of the result.
2926 /// A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2927 /// elements of the result.
2928 /// \returns A 256-bit vector of [8 x i32] containing the result.
2929 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2930 _mm256_unpacklo_epi32(__m256i __a
, __m256i __b
)
2932 return (__m256i
)__builtin_shufflevector((__v8si
)__a
, (__v8si
)__b
, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
2935 /// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2936 /// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2937 /// of [4 x i64]. Specifically, uses the lower 64 bits of each 128-bit half
2938 /// of \a __a and \a __b as input; other bits in these parameters are
2941 /// \code{.operation}
2942 /// result[63:0] := __a[63:0]
2943 /// result[127:64] := __b[63:0]
2944 /// result[191:128] := __a[191:128]
2945 /// result[255:192] := __b[191:128]
2948 /// \headerfile <immintrin.h>
2950 /// This intrinsic corresponds to the \c VPUNPCKLQDQ instruction.
2953 /// A 256-bit vector of [4 x i64] used as the source for the even-numbered
2954 /// elements of the result.
2956 /// A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2957 /// elements of the result.
2958 /// \returns A 256-bit vector of [4 x i64] containing the result.
2959 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2960 _mm256_unpacklo_epi64(__m256i __a
, __m256i __b
)
2962 return (__m256i
)__builtin_shufflevector((__v4di
)__a
, (__v4di
)__b
, 0, 4+0, 2, 4+2);
2965 /// Computes the bitwise XOR of the 256-bit integer vectors in \a __a and
2968 /// \headerfile <immintrin.h>
2970 /// This intrinsic corresponds to the \c VPXOR instruction.
2973 /// A 256-bit integer vector.
2975 /// A 256-bit integer vector.
2976 /// \returns A 256-bit integer vector containing the result.
2977 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2978 _mm256_xor_si256(__m256i __a
, __m256i __b
)
2980 return (__m256i
)((__v4du
)__a
^ (__v4du
)__b
);
2983 /// Loads the 256-bit integer vector from memory \a __V using a non-temporal
2984 /// memory hint and returns the vector. \a __V must be aligned on a 32-byte
2987 /// \headerfile <immintrin.h>
2989 /// This intrinsic corresponds to the \c VMOVNTDQA instruction.
2992 /// A pointer to the 32-byte aligned memory containing the vector to load.
2993 /// \returns A 256-bit integer vector loaded from memory.
2994 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2995 _mm256_stream_load_si256(const void *__V
)
2997 typedef __v4di __v4di_aligned
__attribute__((aligned(32)));
2998 return (__m256i
)__builtin_nontemporal_load((const __v4di_aligned
*)__V
);
3001 /// Broadcasts the 32-bit floating-point value from the low element of the
3002 /// 128-bit vector of [4 x float] in \a __X to all elements of the result's
3003 /// 128-bit vector of [4 x float].
3005 /// \headerfile <immintrin.h>
3007 /// This intrinsic corresponds to the \c VBROADCASTSS instruction.
3010 /// A 128-bit vector of [4 x float] whose low element will be broadcast.
3011 /// \returns A 128-bit vector of [4 x float] containing the result.
3012 static __inline__ __m128 __DEFAULT_FN_ATTRS128
3013 _mm_broadcastss_ps(__m128 __X
)
3015 return (__m128
)__builtin_shufflevector((__v4sf
)__X
, (__v4sf
)__X
, 0, 0, 0, 0);
3018 /// Broadcasts the 64-bit floating-point value from the low element of the
3019 /// 128-bit vector of [2 x double] in \a __a to both elements of the
3020 /// result's 128-bit vector of [2 x double].
3022 /// \headerfile <immintrin.h>
3024 /// This intrinsic corresponds to the \c MOVDDUP instruction.
3027 /// A 128-bit vector of [2 x double] whose low element will be broadcast.
3028 /// \returns A 128-bit vector of [2 x double] containing the result.
3029 static __inline__ __m128d __DEFAULT_FN_ATTRS128
3030 _mm_broadcastsd_pd(__m128d __a
)
3032 return __builtin_shufflevector((__v2df
)__a
, (__v2df
)__a
, 0, 0);
3035 /// Broadcasts the 32-bit floating-point value from the low element of the
3036 /// 128-bit vector of [4 x float] in \a __X to all elements of the
3037 /// result's 256-bit vector of [8 x float].
3039 /// \headerfile <immintrin.h>
3041 /// This intrinsic corresponds to the \c VBROADCASTSS instruction.
3044 /// A 128-bit vector of [4 x float] whose low element will be broadcast.
3045 /// \returns A 256-bit vector of [8 x float] containing the result.
3046 static __inline__ __m256 __DEFAULT_FN_ATTRS256
3047 _mm256_broadcastss_ps(__m128 __X
)
3049 return (__m256
)__builtin_shufflevector((__v4sf
)__X
, (__v4sf
)__X
, 0, 0, 0, 0, 0, 0, 0, 0);
3052 /// Broadcasts the 64-bit floating-point value from the low element of the
3053 /// 128-bit vector of [2 x double] in \a __X to all elements of the
3054 /// result's 256-bit vector of [4 x double].
3056 /// \headerfile <immintrin.h>
3058 /// This intrinsic corresponds to the \c VBROADCASTSD instruction.
3061 /// A 128-bit vector of [2 x double] whose low element will be broadcast.
3062 /// \returns A 256-bit vector of [4 x double] containing the result.
3063 static __inline__ __m256d __DEFAULT_FN_ATTRS256
3064 _mm256_broadcastsd_pd(__m128d __X
)
3066 return (__m256d
)__builtin_shufflevector((__v2df
)__X
, (__v2df
)__X
, 0, 0, 0, 0);
3069 /// Broadcasts the 128-bit integer data from \a __X to both the lower and
3070 /// upper halves of the 256-bit result.
3072 /// \headerfile <immintrin.h>
3074 /// This intrinsic corresponds to the \c VBROADCASTI128 instruction.
3077 /// A 128-bit integer vector to be broadcast.
3078 /// \returns A 256-bit integer vector containing the result.
3079 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3080 _mm256_broadcastsi128_si256(__m128i __X
)
3082 return (__m256i
)__builtin_shufflevector((__v2di
)__X
, (__v2di
)__X
, 0, 1, 0, 1);
/* Alternate spelling that forwards to _mm256_broadcastsi128_si256. */
#define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X)
3087 /// Merges 32-bit integer elements from either of the two 128-bit vectors of
3088 /// [4 x i32] in \a V1 or \a V2 to the result's 128-bit vector of [4 x i32],
3089 /// as specified by the immediate integer operand \a M.
3091 /// \code{.operation}
3095 /// result[31+j:j] := V1[31+j:j]
3097 /// result[31+j:j] := V2[31+j:j]
3102 /// \headerfile <immintrin.h>
3105 /// __m128i _mm_blend_epi32(__m128i V1, __m128i V2, const int M);
3108 /// This intrinsic corresponds to the \c VPBLENDD instruction.
3111 /// A 128-bit vector of [4 x i32] containing source values.
3113 /// A 128-bit vector of [4 x i32] containing source values.
3115 /// An immediate 8-bit integer operand, with bits [3:0] specifying the
3116 /// source for each element of the result. The position of the mask bit
3117 /// corresponds to the index of a copied value. When a mask bit is 0, the
3118 /// element is copied from \a V1; otherwise, it is copied from \a V2.
3119 /// \returns A 128-bit vector of [4 x i32] containing the result.
/* Macro form: V1 and V2 are converted to __m128i and then viewed as
   [4 x i32]; M is converted to int before being handed to the builtin. */
#define _mm_blend_epi32(V1, V2, M)                                             \
  ((__m128i)__builtin_ia32_pblendd128(                                         \
      (__v4si)(__m128i)(V1), (__v4si)(__m128i)(V2), (int)(M)))
3124 /// Merges 32-bit integer elements from either of the two 256-bit vectors of
3125 /// [8 x i32] in \a V1 or \a V2 to return a 256-bit vector of [8 x i32],
3126 /// as specified by the immediate integer operand \a M.
3128 /// \code{.operation}
3132 /// result[31+j:j] := V1[31+j:j]
3134 /// result[31+j:j] := V2[31+j:j]
3139 /// \headerfile <immintrin.h>
3142 /// __m256i _mm256_blend_epi32(__m256i V1, __m256i V2, const int M);
3145 /// This intrinsic corresponds to the \c VPBLENDD instruction.
3148 /// A 256-bit vector of [8 x i32] containing source values.
3150 /// A 256-bit vector of [8 x i32] containing source values.
3152 /// An immediate 8-bit integer operand, with bits [7:0] specifying the
3153 /// source for each element of the result. The position of the mask bit
3154 /// corresponds to the index of a copied value. When a mask bit is 0, the
3155 /// element is copied from \a V1; otherwise, it is copied from \a V2.
3156 /// \returns A 256-bit vector of [8 x i32] containing the result.
/* Macro form: V1 and V2 are converted to __m256i and then viewed as
   [8 x i32]; M is converted to int before being handed to the builtin. */
#define _mm256_blend_epi32(V1, V2, M)                                          \
  ((__m256i)__builtin_ia32_pblendd256(                                         \
      (__v8si)(__m256i)(V1), (__v8si)(__m256i)(V2), (int)(M)))
3161 /// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
3162 /// bytes of the 256-bit result.
3164 /// \headerfile <immintrin.h>
3166 /// This intrinsic corresponds to the \c VPBROADCASTB instruction.
3169 /// A 128-bit integer vector whose low byte will be broadcast.
3170 /// \returns A 256-bit integer vector containing the result.
3171 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3172 _mm256_broadcastb_epi8(__m128i __X
)
3174 return (__m256i
)__builtin_shufflevector((__v16qi
)__X
, (__v16qi
)__X
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3177 /// Broadcasts the low element from the 128-bit vector of [8 x i16] in \a __X
3178 /// to all elements of the result's 256-bit vector of [16 x i16].
3180 /// \headerfile <immintrin.h>
3182 /// This intrinsic corresponds to the \c VPBROADCASTW instruction.
3185 /// A 128-bit vector of [8 x i16] whose low element will be broadcast.
3186 /// \returns A 256-bit vector of [16 x i16] containing the result.
3187 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3188 _mm256_broadcastw_epi16(__m128i __X
)
3190 return (__m256i
)__builtin_shufflevector((__v8hi
)__X
, (__v8hi
)__X
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3193 /// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
3194 /// to all elements of the result's 256-bit vector of [8 x i32].
3196 /// \headerfile <immintrin.h>
3198 /// This intrinsic corresponds to the \c VPBROADCASTD instruction.
3201 /// A 128-bit vector of [4 x i32] whose low element will be broadcast.
3202 /// \returns A 256-bit vector of [8 x i32] containing the result.
3203 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3204 _mm256_broadcastd_epi32(__m128i __X
)
3206 return (__m256i
)__builtin_shufflevector((__v4si
)__X
, (__v4si
)__X
, 0, 0, 0, 0, 0, 0, 0, 0);
3209 /// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
3210 /// to all elements of the result's 256-bit vector of [4 x i64].
3212 /// \headerfile <immintrin.h>
3214 /// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
3217 /// A 128-bit vector of [2 x i64] whose low element will be broadcast.
3218 /// \returns A 256-bit vector of [4 x i64] containing the result.
3219 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3220 _mm256_broadcastq_epi64(__m128i __X
)
3222 return (__m256i
)__builtin_shufflevector((__v2di
)__X
, (__v2di
)__X
, 0, 0, 0, 0);
3225 /// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
3226 /// bytes of the 128-bit result.
3228 /// \headerfile <immintrin.h>
3230 /// This intrinsic corresponds to the \c VPBROADCASTB instruction.
3233 /// A 128-bit integer vector whose low byte will be broadcast.
3234 /// \returns A 128-bit integer vector containing the result.
3235 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3236 _mm_broadcastb_epi8(__m128i __X
)
3238 return (__m128i
)__builtin_shufflevector((__v16qi
)__X
, (__v16qi
)__X
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3241 /// Broadcasts the low element from the 128-bit vector of [8 x i16] in
3242 /// \a __X to all elements of the result's 128-bit vector of [8 x i16].
3244 /// \headerfile <immintrin.h>
3246 /// This intrinsic corresponds to the \c VPBROADCASTW instruction.
3249 /// A 128-bit vector of [8 x i16] whose low element will be broadcast.
3250 /// \returns A 128-bit vector of [8 x i16] containing the result.
3251 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3252 _mm_broadcastw_epi16(__m128i __X
)
3254 return (__m128i
)__builtin_shufflevector((__v8hi
)__X
, (__v8hi
)__X
, 0, 0, 0, 0, 0, 0, 0, 0);
3257 /// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
3258 /// to all elements of the result's 128-bit vector of [4 x i32].
3260 /// \headerfile <immintrin.h>
3262 /// This intrinsic corresponds to the \c VPBROADCASTD instruction.
3265 /// A 128-bit vector of [4 x i32] whose low element will be broadcast.
3266 /// \returns A 128-bit vector of [4 x i32] containing the result.
3267 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3268 _mm_broadcastd_epi32(__m128i __X
)
3270 return (__m128i
)__builtin_shufflevector((__v4si
)__X
, (__v4si
)__X
, 0, 0, 0, 0);
3273 /// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
3274 /// to both elements of the result's 128-bit vector of [2 x i64].
3276 /// \headerfile <immintrin.h>
3278 /// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
3281 /// A 128-bit vector of [2 x i64] whose low element will be broadcast.
3282 /// \returns A 128-bit vector of [2 x i64] containing the result.
3283 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3284 _mm_broadcastq_epi64(__m128i __X
)
3286 return (__m128i
)__builtin_shufflevector((__v2di
)__X
, (__v2di
)__X
, 0, 0);
3289 /// Sets the result's 256-bit vector of [8 x i32] to copies of elements of the
3290 /// 256-bit vector of [8 x i32] in \a __a as specified by indexes in the
3291 /// elements of the 256-bit vector of [8 x i32] in \a __b.
3293 /// \code{.operation}
3296 /// k := __b[j+2:j] * 32
3297 /// result[j+31:j] := __a[k+31:k]
3301 /// \headerfile <immintrin.h>
3303 /// This intrinsic corresponds to the \c VPERMD instruction.
3306 /// A 256-bit vector of [8 x i32] containing the source values.
3308 /// A 256-bit vector of [8 x i32] containing indexes of values to use from
3310 /// \returns A 256-bit vector of [8 x i32] containing the result.
3311 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3312 _mm256_permutevar8x32_epi32(__m256i __a
, __m256i __b
)
3314 return (__m256i
)__builtin_ia32_permvarsi256((__v8si
)__a
, (__v8si
)__b
);
3317 /// Sets the result's 256-bit vector of [4 x double] to copies of elements of
3318 /// the 256-bit vector of [4 x double] in \a V as specified by the
3319 /// immediate value \a M.
3321 /// \code{.operation}
3324 /// k := (M >> i*2)[1:0] * 64
3325 /// result[j+63:j] := V[k+63:k]
3329 /// \headerfile <immintrin.h>
3332 /// __m256d _mm256_permute4x64_pd(__m256d V, const int M);
3335 /// This intrinsic corresponds to the \c VPERMPD instruction.
3338 /// A 256-bit vector of [4 x double] containing the source values.
3340 /// An immediate 8-bit value specifying which elements to copy from \a V.
3341 /// \a M[1:0] specifies the index in \a V for element 0 of the result,
3342 /// \a M[3:2] specifies the index for element 1, and so forth.
3343 /// \returns A 256-bit vector of [4 x double] containing the result.
/* Macro form: V is converted to __m256d and viewed as [4 x double]; M is
   converted to int before being handed to the builtin. */
#define _mm256_permute4x64_pd(V, M)                                            \
  ((__m256d)__builtin_ia32_permdf256(                                          \
      (__v4df)(__m256d)(V), (int)(M)))
3347 /// Sets the result's 256-bit vector of [8 x float] to copies of elements of
3348 /// the 256-bit vector of [8 x float] in \a __a as specified by indexes in
3349 /// the elements of the 256-bit vector of [8 x i32] in \a __b.
3351 /// \code{.operation}
3354 /// k := __b[j+2:j] * 32
3355 /// result[j+31:j] := __a[k+31:k]
3359 /// \headerfile <immintrin.h>
3361 /// This intrinsic corresponds to the \c VPERMPS instruction.
3364 /// A 256-bit vector of [8 x float] containing the source values.
3366 /// A 256-bit vector of [8 x i32] containing indexes of values to use from
3368 /// \returns A 256-bit vector of [8 x float] containing the result.
3369 static __inline__ __m256 __DEFAULT_FN_ATTRS256
3370 _mm256_permutevar8x32_ps(__m256 __a
, __m256i __b
)
3372 return (__m256
)__builtin_ia32_permvarsf256((__v8sf
)__a
, (__v8si
)__b
);
3375 /// Sets the result's 256-bit vector of [4 x i64] to copies of elements
3376 /// of the 256-bit vector of [4 x i64] in \a V as specified by the
3377 /// immediate value \a M.
3379 /// \code{.operation}
3382 /// k := (M >> i*2)[1:0] * 64
3383 /// result[j+63:j] := V[k+63:k]
3387 /// \headerfile <immintrin.h>
3390 /// __m256i _mm256_permute4x64_epi64(__m256i V, const int M);
3393 /// This intrinsic corresponds to the \c VPERMQ instruction.
3396 /// A 256-bit vector of [4 x i64] containing the source values.
3398 /// An immediate 8-bit value specifying which elements to copy from \a V.
3399 /// \a M[1:0] specifies the index in \a V for element 0 of the result,
3400 /// \a M[3:2] specifies the index for element 1, and so forth.
3401 /// \returns A 256-bit vector of [4 x i64] containing the result.
/* Macro form: V is converted to __m256i and viewed as [4 x i64]; M is
   converted to int before being handed to the builtin. */
#define _mm256_permute4x64_epi64(V, M)                                         \
  ((__m256i)__builtin_ia32_permdi256(                                          \
      (__v4di)(__m256i)(V), (int)(M)))
3405 /// Sets each half of the 256-bit result either to zero or to one of the
3406 /// four possible 128-bit halves of the 256-bit vectors \a V1 and \a V2,
3407 /// as specified by the immediate value \a M.
3409 /// \code{.operation}
3414 /// CASE (k[1:0]) OF
3415 /// 0: result[127+j:j] := V1[127:0]
3416 /// 1: result[127+j:j] := V1[255:128]
3417 /// 2: result[127+j:j] := V2[127:0]
3418 /// 3: result[127+j:j] := V2[255:128]
3421 /// result[127+j:j] := 0
3426 /// \headerfile <immintrin.h>
3429 /// __m256i _mm256_permute2x128_si256(__m256i V1, __m256i V2, const int M);
3432 /// This intrinsic corresponds to the \c VPERM2I128 instruction.
3435 /// A 256-bit integer vector containing source values.
3437 /// A 256-bit integer vector containing source values.
3439 /// An immediate value specifying how to form the result. Bits [3:0]
3440 /// control the lower half of the result, bits [7:4] control the upper half.
3441 /// Within each 4-bit control value, if bit 3 is 1, the result is zero,
3442 /// otherwise bits [1:0] determine the source as follows. \n
3443 /// 0: the lower half of \a V1 \n
3444 /// 1: the upper half of \a V1 \n
3445 /// 2: the lower half of \a V2 \n
3446 /// 3: the upper half of \a V2
3447 /// \returns A 256-bit integer vector containing the result.
/* Macro form: V1 and V2 are converted to __m256i and M to int before being
   handed to the builtin. */
#define _mm256_permute2x128_si256(V1, V2, M)                                   \
  ((__m256i)__builtin_ia32_permti256(                                          \
      (__m256i)(V1), (__m256i)(V2), (int)(M)))
3451 /// Extracts half of the 256-bit vector \a V to the 128-bit result. If bit 0
3452 /// of the immediate \a M is zero, extracts the lower half of \a V;
3453 /// otherwise, extracts the upper half.
3455 /// \headerfile <immintrin.h>
3458 /// __m128i _mm256_extracti128_si256(__m256i V, const int M);
3461 /// This intrinsic corresponds to the \c VEXTRACTI128 instruction.
3464 /// A 256-bit integer vector containing the source values.
3466 /// An immediate value specifying which half of \a V to extract.
3467 /// \returns A 128-bit integer vector containing the result.
/* Macro form: V is converted to __m256i and viewed as [4 x i64]; M is
   converted to int before being handed to the builtin. */
#define _mm256_extracti128_si256(V, M)                                         \
  ((__m128i)__builtin_ia32_extract128i256(                                     \
      (__v4di)(__m256i)(V), (int)(M)))
3471 /// Copies the 256-bit vector \a V1 to the result, then overwrites half of the
3472 /// result with the 128-bit vector \a V2. If bit 0 of the immediate \a M
3473 /// is zero, overwrites the lower half of the result; otherwise,
3474 /// overwrites the upper half.
3476 /// \headerfile <immintrin.h>
3479 /// __m256i _mm256_inserti128_si256(__m256i V1, __m128i V2, const int M);
3482 /// This intrinsic corresponds to the \c VINSERTI128 instruction.
3485 /// A 256-bit integer vector containing a source value.
3487 /// A 128-bit integer vector containing a source value.
3489 /// An immediate value specifying where to put \a V2 in the result.
3490 /// \returns A 256-bit integer vector containing the result.
/* Macro form: V1 is viewed as [4 x i64] and V2 as [2 x i64]; M is converted
   to int before being handed to the builtin. */
#define _mm256_inserti128_si256(V1, V2, M)                                     \
  ((__m256i)__builtin_ia32_insert128i256(                                      \
      (__v4di)(__m256i)(V1), (__v2di)(__m128i)(V2), (int)(M)))
3495 /// Conditionally loads eight 32-bit integer elements from memory \a __X, if
3496 /// the most significant bit of the corresponding element in the mask
3497 /// \a __M is set; otherwise, sets that element of the result to zero.
3498 /// Returns the 256-bit [8 x i32] result.
3500 /// \code{.operation}
3503 /// IF __M[j+31] == 1
3504 /// result[j+31:j] := Load32(__X+(i*4))
3506 /// result[j+31:j] := 0
3511 /// \headerfile <immintrin.h>
3513 /// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3516 /// A pointer to the memory used for loading values.
3518 /// A 256-bit vector of [8 x i32] containing the mask bits.
3519 /// \returns A 256-bit vector of [8 x i32] containing the loaded or zeroed
3521 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3522 _mm256_maskload_epi32(int const *__X
, __m256i __M
)
3524 return (__m256i
)__builtin_ia32_maskloadd256((const __v8si
*)__X
, (__v8si
)__M
);
3527 /// Conditionally loads four 64-bit integer elements from memory \a __X, if
3528 /// the most significant bit of the corresponding element in the mask
3529 /// \a __M is set; otherwise, sets that element of the result to zero.
3530 /// Returns the 256-bit [4 x i64] result.
3532 /// \code{.operation}
3535 /// IF __M[j+63] == 1
3536 /// result[j+63:j] := Load64(__X+(i*8))
3538 /// result[j+63:j] := 0
3543 /// \headerfile <immintrin.h>
3545 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3548 /// A pointer to the memory used for loading values.
3550 /// A 256-bit vector of [4 x i64] containing the mask bits.
3551 /// \returns A 256-bit vector of [4 x i64] containing the loaded or zeroed
3553 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3554 _mm256_maskload_epi64(long long const *__X
, __m256i __M
)
3556 return (__m256i
)__builtin_ia32_maskloadq256((const __v4di
*)__X
, (__v4di
)__M
);
3559 /// Conditionally loads four 32-bit integer elements from memory \a __X, if
3560 /// the most significant bit of the corresponding element in the mask
3561 /// \a __M is set; otherwise, sets that element of the result to zero.
3562 /// Returns the 128-bit [4 x i32] result.
3564 /// \code{.operation}
3567 /// IF __M[j+31] == 1
3568 /// result[j+31:j] := Load32(__X+(i*4))
3570 /// result[j+31:j] := 0
3575 /// \headerfile <immintrin.h>
3577 /// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3580 /// A pointer to the memory used for loading values.
3582 /// A 128-bit vector of [4 x i32] containing the mask bits.
3583 /// \returns A 128-bit vector of [4 x i32] containing the loaded or zeroed
3585 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3586 _mm_maskload_epi32(int const *__X
, __m128i __M
)
3588 return (__m128i
)__builtin_ia32_maskloadd((const __v4si
*)__X
, (__v4si
)__M
);
3591 /// Conditionally loads two 64-bit integer elements from memory \a __X, if
3592 /// the most significant bit of the corresponding element in the mask
3593 /// \a __M is set; otherwise, sets that element of the result to zero.
3594 /// Returns the 128-bit [2 x i64] result.
3596 /// \code{.operation}
3599 /// IF __M[j+63] == 1
3600 /// result[j+63:j] := Load64(__X+(i*8))
3602 /// result[j+63:j] := 0
3607 /// \headerfile <immintrin.h>
3609 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3612 /// A pointer to the memory used for loading values.
3614 /// A 128-bit vector of [2 x i64] containing the mask bits.
3615 /// \returns A 128-bit vector of [2 x i64] containing the loaded or zeroed
3617 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3618 _mm_maskload_epi64(long long const *__X
, __m128i __M
)
3620 return (__m128i
)__builtin_ia32_maskloadq((const __v2di
*)__X
, (__v2di
)__M
);
3623 /// Conditionally stores eight 32-bit integer elements from the 256-bit vector
3624 /// of [8 x i32] in \a __Y to memory \a __X, if the most significant bit of
3625 /// the corresponding element in the mask \a __M is set; otherwise, the
3626 /// memory element is unchanged.
3628 /// \code{.operation}
3631 /// IF __M[j+31] == 1
3632 /// Store32(__X+(i*4), __Y[j+31:j])
3637 /// \headerfile <immintrin.h>
3639 /// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3642 /// A pointer to the memory used for storing values.
3644 /// A 256-bit vector of [8 x i32] containing the mask bits.
3646 /// A 256-bit vector of [8 x i32] containing the values to store.
3647 static __inline__
void __DEFAULT_FN_ATTRS256
3648 _mm256_maskstore_epi32(int *__X
, __m256i __M
, __m256i __Y
)
3650 __builtin_ia32_maskstored256((__v8si
*)__X
, (__v8si
)__M
, (__v8si
)__Y
);
3653 /// Conditionally stores four 64-bit integer elements from the 256-bit vector
3654 /// of [4 x i64] in \a __Y to memory \a __X, if the most significant bit of
3655 /// the corresponding element in the mask \a __M is set; otherwise, the
3656 /// memory element is unchanged.
3658 /// \code{.operation}
3661 /// IF __M[j+63] == 1
3662 /// Store64(__X+(i*8), __Y[j+63:j])
3667 /// \headerfile <immintrin.h>
3669 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3672 /// A pointer to the memory used for storing values.
3674 /// A 256-bit vector of [4 x i64] containing the mask bits.
3676 /// A 256-bit vector of [4 x i64] containing the values to store.
3677 static __inline__
void __DEFAULT_FN_ATTRS256
3678 _mm256_maskstore_epi64(long long *__X
, __m256i __M
, __m256i __Y
)
3680 __builtin_ia32_maskstoreq256((__v4di
*)__X
, (__v4di
)__M
, (__v4di
)__Y
);
3683 /// Conditionally stores four 32-bit integer elements from the 128-bit vector
3684 /// of [4 x i32] in \a __Y to memory \a __X, if the most significant bit of
3685 /// the corresponding element in the mask \a __M is set; otherwise, the
3686 /// memory element is unchanged.
3688 /// \code{.operation}
3691 /// IF __M[j+31] == 1
3692 /// Store32(__X+(i*4), __Y[j+31:j])
3697 /// \headerfile <immintrin.h>
3699 /// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3702 /// A pointer to the memory used for storing values.
3704 /// A 128-bit vector of [4 x i32] containing the mask bits.
3706 /// A 128-bit vector of [4 x i32] containing the values to store.
3707 static __inline__
void __DEFAULT_FN_ATTRS128
3708 _mm_maskstore_epi32(int *__X
, __m128i __M
, __m128i __Y
)
3710 __builtin_ia32_maskstored((__v4si
*)__X
, (__v4si
)__M
, (__v4si
)__Y
);
3713 /// Conditionally stores two 64-bit integer elements from the 128-bit vector
3714 /// of [2 x i64] in \a __Y to memory \a __X, if the most significant bit of
3715 /// the corresponding element in the mask \a __M is set; otherwise, the
3716 /// memory element is unchanged.
3718 /// \code{.operation}
3721 /// IF __M[j+63] == 1
3722 /// Store64(__X+(i*8), __Y[j+63:j])
3727 /// \headerfile <immintrin.h>
3729 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3732 /// A pointer to the memory used for storing values.
3734 /// A 128-bit vector of [2 x i64] containing the mask bits.
3736 /// A 128-bit vector of [2 x i64] containing the values to store.
3737 static __inline__
void __DEFAULT_FN_ATTRS128
3738 _mm_maskstore_epi64(long long *__X
, __m128i __M
, __m128i __Y
)
3740 __builtin_ia32_maskstoreq(( __v2di
*)__X
, (__v2di
)__M
, (__v2di
)__Y
);
3743 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3744 /// left by the number of bits given in the corresponding element of the
3745 /// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
3746 /// returns the result. If the shift count for any element is greater than
3747 /// 31, the result for that element is zero.
3749 /// \headerfile <immintrin.h>
3751 /// This intrinsic corresponds to the \c VPSLLVD instruction.
3754 /// A 256-bit vector of [8 x i32] to be shifted.
3756 /// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3758 /// \returns A 256-bit vector of [8 x i32] containing the result.
3759 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3760 _mm256_sllv_epi32(__m256i __X
, __m256i __Y
)
3762 return (__m256i
)__builtin_ia32_psllv8si((__v8si
)__X
, (__v8si
)__Y
);
3765 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3766 /// left by the number of bits given in the corresponding element of the
3767 /// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
3768 /// returns the result. If the shift count for any element is greater than
3769 /// 31, the result for that element is zero.
3771 /// \headerfile <immintrin.h>
3773 /// This intrinsic corresponds to the \c VPSLLVD instruction.
3776 /// A 128-bit vector of [4 x i32] to be shifted.
3778 /// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3780 /// \returns A 128-bit vector of [4 x i32] containing the result.
3781 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3782 _mm_sllv_epi32(__m128i __X
, __m128i __Y
)
3784 return (__m128i
)__builtin_ia32_psllv4si((__v4si
)__X
, (__v4si
)__Y
);
3787 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
3788 /// left by the number of bits given in the corresponding element of the
3789 /// 128-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
3790 /// returns the result. If the shift count for any element is greater than
3791 /// 63, the result for that element is zero.
3793 /// \headerfile <immintrin.h>
3795 /// This intrinsic corresponds to the \c VPSLLVQ instruction.
3798 /// A 256-bit vector of [4 x i64] to be shifted.
3800 /// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
3802 /// \returns A 256-bit vector of [4 x i64] containing the result.
3803 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3804 _mm256_sllv_epi64(__m256i __X
, __m256i __Y
)
3806 return (__m256i
)__builtin_ia32_psllv4di((__v4di
)__X
, (__v4di
)__Y
);
3809 /// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3810 /// left by the number of bits given in the corresponding element of the
3811 /// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3812 /// returns the result. If the shift count for any element is greater than
3813 /// 63, the result for that element is zero.
3815 /// \headerfile <immintrin.h>
3817 /// This intrinsic corresponds to the \c VPSLLVQ instruction.
3820 /// A 128-bit vector of [2 x i64] to be shifted.
3822 /// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
3824 /// \returns A 128-bit vector of [2 x i64] containing the result.
3825 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3826 _mm_sllv_epi64(__m128i __X
, __m128i __Y
)
3828 return (__m128i
)__builtin_ia32_psllv2di((__v2di
)__X
, (__v2di
)__Y
);
3831 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3832 /// right by the number of bits given in the corresponding element of the
3833 /// 256-bit vector of [8 x i32] in \a __Y, shifting in sign bits, and
3834 /// returns the result. If the shift count for any element is greater than
3835 /// 31, the result for that element is 0 or -1 according to the sign bit
3836 /// for that element.
3838 /// \headerfile <immintrin.h>
3840 /// This intrinsic corresponds to the \c VPSRAVD instruction.
3843 /// A 256-bit vector of [8 x i32] to be shifted.
3845 /// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3847 /// \returns A 256-bit vector of [8 x i32] containing the result.
3848 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3849 _mm256_srav_epi32(__m256i __X
, __m256i __Y
)
3851 return (__m256i
)__builtin_ia32_psrav8si((__v8si
)__X
, (__v8si
)__Y
);
3854 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3855 /// right by the number of bits given in the corresponding element of the
3856 /// 128-bit vector of [4 x i32] in \a __Y, shifting in sign bits, and
3857 /// returns the result. If the shift count for any element is greater than
3858 /// 31, the result for that element is 0 or -1 according to the sign bit
3859 /// for that element.
3861 /// \headerfile <immintrin.h>
3863 /// This intrinsic corresponds to the \c VPSRAVD instruction.
3866 /// A 128-bit vector of [4 x i32] to be shifted.
3868 /// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3870 /// \returns A 128-bit vector of [4 x i32] containing the result.
3871 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3872 _mm_srav_epi32(__m128i __X
, __m128i __Y
)
3874 return (__m128i
)__builtin_ia32_psrav4si((__v4si
)__X
, (__v4si
)__Y
);
3877 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3878 /// right by the number of bits given in the corresponding element of the
3879 /// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
3880 /// returns the result. If the shift count for any element is greater than
3881 /// 31, the result for that element is zero.
3883 /// \headerfile <immintrin.h>
3885 /// This intrinsic corresponds to the \c VPSRLVD instruction.
3888 /// A 256-bit vector of [8 x i32] to be shifted.
3890 /// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3892 /// \returns A 256-bit vector of [8 x i32] containing the result.
3893 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3894 _mm256_srlv_epi32(__m256i __X
, __m256i __Y
)
3896 return (__m256i
)__builtin_ia32_psrlv8si((__v8si
)__X
, (__v8si
)__Y
);
3899 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3900 /// right by the number of bits given in the corresponding element of the
3901 /// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
3902 /// returns the result. If the shift count for any element is greater than
3903 /// 31, the result for that element is zero.
3905 /// \headerfile <immintrin.h>
3907 /// This intrinsic corresponds to the \c VPSRLVD instruction.
3910 /// A 128-bit vector of [4 x i32] to be shifted.
3912 /// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3914 /// \returns A 128-bit vector of [4 x i32] containing the result.
3915 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3916 _mm_srlv_epi32(__m128i __X
, __m128i __Y
)
3918 return (__m128i
)__builtin_ia32_psrlv4si((__v4si
)__X
, (__v4si
)__Y
);
3921 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
3922 /// right by the number of bits given in the corresponding element of the
3923 /// 128-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
3924 /// returns the result. If the shift count for any element is greater than
3925 /// 63, the result for that element is zero.
3927 /// \headerfile <immintrin.h>
3929 /// This intrinsic corresponds to the \c VPSRLVQ instruction.
3932 /// A 256-bit vector of [4 x i64] to be shifted.
3934 /// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
3936 /// \returns A 256-bit vector of [4 x i64] containing the result.
3937 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3938 _mm256_srlv_epi64(__m256i __X
, __m256i __Y
)
3940 return (__m256i
)__builtin_ia32_psrlv4di((__v4di
)__X
, (__v4di
)__Y
);
3943 /// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3944 /// right by the number of bits given in the corresponding element of the
3945 /// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3946 /// returns the result. If the shift count for any element is greater than
3947 /// 63, the result for that element is zero.
3949 /// \headerfile <immintrin.h>
3951 /// This intrinsic corresponds to the \c VPSRLVQ instruction.
3954 /// A 128-bit vector of [2 x i64] to be shifted.
3956 /// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
3958 /// \returns A 128-bit vector of [2 x i64] containing the result.
3959 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3960 _mm_srlv_epi64(__m128i __X
, __m128i __Y
)
3962 return (__m128i
)__builtin_ia32_psrlv2di((__v2di
)__X
, (__v2di
)__Y
);
/// Conditionally gathers two 64-bit floating-point values, either from the
///    128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
///    of [2 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_mask_i32gather_pd(__m128d a, const double *m, __m128i i,
///                               __m128d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param a
///    A 128-bit vector of [2 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
///    the first two elements are used.
/// \param mask
///    A 128-bit vector of [2 x double] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
/* The pass-through operand a is a [2 x double] vector, so it is converted
   via __m128d like the mask operand and the sibling _pd gather macros. */
#define _mm_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128d)(a), \
                                      (double const *)(m), \
                                      (__v4si)(__m128i)(i), \
                                      (__v2df)(__m128d)(mask), (s)))
/// Conditionally gathers four 64-bit floating-point values, either from the
///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
///    of [4 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_mask_i32gather_pd(__m256d a, const double *m, __m128i i,
///                                  __m256d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param a
///    A 256-bit vector of [4 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x double] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
#define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
                                         (double const *)(m), \
                                         (__v4si)(__m128i)(i), \
                                         (__v4df)(__m256d)(mask), (s)))
/// Conditionally gathers two 64-bit floating-point values, either from the
///    128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
///    of [2 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_mask_i64gather_pd(__m128d a, const double *m, __m128i i,
///                               __m128d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param a
///    A 128-bit vector of [2 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [2 x double] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
#define _mm_mask_i64gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
                                      (double const *)(m), \
                                      (__v2di)(__m128i)(i), \
                                      (__v2df)(__m128d)(mask), (s)))
/// Conditionally gathers four 64-bit floating-point values, either from the
///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
///    of [4 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_mask_i64gather_pd(__m256d a, const double *m, __m256i i,
///                                  __m256d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param a
///    A 256-bit vector of [4 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x double] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
#define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
  ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
                                         (double const *)(m), \
                                         (__v4di)(__m256i)(i), \
                                         (__v4df)(__m256d)(mask), (s)))
/// Conditionally gathers four 32-bit floating-point values, either from the
///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
///    of [4 x float] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_mask_i32gather_ps(__m128 a, const float *m, __m128i i,
///                              __m128 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x float] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm_mask_i32gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
                                     (float const *)(m), \
                                     (__v4si)(__m128i)(i), \
                                     (__v4sf)(__m128)(mask), (s)))
/// Conditionally gathers eight 32-bit floating-point values, either from the
///    256-bit vector of [8 x float] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
///    of [8 x float] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 7
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256 _mm256_mask_i32gather_ps(__m256 a, const float *m, __m256i i,
///                                 __m256 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param a
///    A 256-bit vector of [8 x float] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [8 x float] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [8 x float] containing the gathered values.
#define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
  ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
                                        (float const *)(m), \
                                        (__v8si)(__m256i)(i), \
                                        (__v8sf)(__m256)(mask), (s)))
/// Conditionally gathers two 32-bit floating-point values, either from the
///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
///    of [4 x float] in \a mask determines the source for the lower two
///    elements. The upper two elements of the result are zeroed.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// result[127:64] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_mask_i64gather_ps(__m128 a, const float *m, __m128i i,
///                              __m128 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float] used as the source when a mask bit is
///    zero. Only the first two elements are used.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x float] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory. Only the first
///    two elements are used.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm_mask_i64gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
                                     (float const *)(m), \
                                     (__v2di)(__m128i)(i), \
                                     (__v4sf)(__m128)(mask), (s)))
/// Conditionally gathers four 32-bit floating-point values, either from the
///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
///    of [4 x float] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm256_mask_i64gather_ps(__m128 a, const float *m, __m256i i,
///                                 __m128 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x float] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
                                        (float const *)(m), \
                                        (__v4di)(__m256i)(i), \
                                        (__v4sf)(__m128)(mask), (s)))
/// Conditionally gathers four 32-bit integer values, either from the
///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
///    of [4 x i32] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i32gather_epi32(__m128i a, const int *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
///
/// \param a
///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x i32] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits. If a
///    mask bit is zero, the corresponding value from vector \a a is gathered;
///    otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* The base pointer m is the second builtin operand, as in the other gather
   macros; without it the builtin receives too few arguments. */
#define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
                                     (int const *)(m), \
                                     (__v4si)(__m128i)(i), \
                                     (__v4si)(__m128i)(mask), (s)))
/// Conditionally gathers eight 32-bit integer values, either from the
///    256-bit vector of [8 x i32] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
///    of [8 x i32] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 7
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mask_i32gather_epi32(__m256i a, const int *m, __m256i i,
///                                     __m256i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
///
/// \param a
///    A 256-bit vector of [8 x i32] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [8 x i32] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits. If a
///    mask bit is zero, the corresponding value from vector \a a is gathered;
///    otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
/* The base pointer m is the second builtin operand, as in the other gather
   macros; without it the builtin receives too few arguments. */
#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
                                        (int const *)(m), \
                                        (__v8si)(__m256i)(i), \
                                        (__v8si)(__m256i)(mask), (s)))
/// Conditionally gathers two 32-bit integer values, either from the
///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
///    of [4 x i32] in \a mask determines the source for the lower two
///    elements. The upper two elements of the result are zeroed.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// result[127:64] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i64gather_epi32(__m128i a, const int *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
///
/// \param a
///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
///    zero. Only the first two elements are used.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x i32] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits. If a
///    mask bit is zero, the corresponding value from vector \a a is gathered;
///    otherwise the value is loaded from memory. Only the first two elements
///    are used.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* The base pointer m is the second builtin operand, as in the other gather
   macros; without it the builtin receives too few arguments. */
#define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
                                     (int const *)(m), \
                                     (__v2di)(__m128i)(i), \
                                     (__v4si)(__m128i)(mask), (s)))
/// Conditionally gathers four 32-bit integer values, either from the
///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
///    of [4 x i32] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm256_mask_i64gather_epi32(__m128i a, const int *m, __m256i i,
///                                     __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
///
/// \param a
///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x i32] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits. If a
///    mask bit is zero, the corresponding value from vector \a a is gathered;
///    otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* The base pointer m is the second builtin operand, as in the other gather
   macros; without it the builtin receives too few arguments. */
#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
                                        (int const *)(m), \
                                        (__v4di)(__m256i)(i), \
                                        (__v4si)(__m128i)(mask), (s)))
/// Conditionally gathers two 64-bit integer values, either from the
///    128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
///    of [2 x i64] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i32gather_epi64(__m128i a, const long long *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
///
/// \param a
///    A 128-bit vector of [2 x i64] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
///    the first two elements are used.
/// \param mask
///    A 128-bit vector of [2 x i64] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits. If a
///    mask bit is zero, the corresponding value from vector \a a is gathered;
///    otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
#define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
                                     (long long const *)(m), \
                                     (__v4si)(__m128i)(i), \
                                     (__v2di)(__m128i)(mask), (s)))
/// Conditionally gathers four 64-bit integer values, either from the
///    256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
///    of [4 x i64] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mask_i32gather_epi64(__m256i a, const long long *m,
///                                     __m128i i, __m256i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
///
/// \param a
///    A 256-bit vector of [4 x i64] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x i64] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits. If a
///    mask bit is zero, the corresponding value from vector \a a is gathered;
///    otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
                                        (long long const *)(m), \
                                        (__v4si)(__m128i)(i), \
                                        (__v4di)(__m256i)(mask), (s)))
/// Conditionally gathers two 64-bit integer values, either from the
///    128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
///    of [2 x i64] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i64gather_epi64(__m128i a, const long long *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
///
/// \param a
///    A 128-bit vector of [2 x i64] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [2 x i64] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits. If a
///    mask bit is zero, the corresponding value from vector \a a is gathered;
///    otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
#define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
                                     (long long const *)(m), \
                                     (__v2di)(__m128i)(i), \
                                     (__v2di)(__m128i)(mask), (s)))
4693 /// Conditionally gathers four 64-bit integer values, either from the
4694 /// 256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
4695 /// indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
4696 /// of [4 x i64] in \a mask determines the source for each element.
4698 /// \code{.operation}
4699 /// FOR element := 0 to 3
4702 /// IF mask[j+63] == 0
4703 /// result[j+63:j] := a[j+63:j]
4705 /// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4710 /// \headerfile <immintrin.h>
4713 /// __m256i _mm256_mask_i64gather_epi64(__m256i a, const long long *m,
4714 /// __m256i i, __m256i mask, const int s);
4717 /// This intrinsic corresponds to the \c VPGATHERQQ instruction.
4720 /// A 256-bit vector of [4 x i64] used as the source when a mask bit is zero.
4723 /// A pointer to the memory used for loading values.
4725 /// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4727 /// A 256-bit vector of [4 x i64] containing the mask. The most significant
4728 /// bit of each element in the mask vector is that element's mask bit. If a
4729 /// mask bit is zero, the corresponding value from vector \a a is copied;
4730 /// otherwise the value is loaded from memory.
4732 /// A literal constant scale factor (1, 2, 4, or 8) for the indexes in \a i.
4734 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
4735 #define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
4736 ((__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
4737 (long long const *)(m), \
4738 (__v4di)(__m256i)(i), \
4739 (__v4di)(__m256i)(mask), (s)))
4741 /// Gathers two 64-bit floating-point values from memory \a m using scaled
4742 /// indexes from the 128-bit vector of [4 x i32] in \a i.
4744 /// \code{.operation}
4745 /// FOR element := 0 to 1
4748 /// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4752 /// \headerfile <immintrin.h>
4755 /// __m128d _mm_i32gather_pd(const double *m, __m128i i, const int s);
4758 /// This intrinsic corresponds to the \c VGATHERDPD instruction.
4761 /// A pointer to the memory used for loading values.
4763 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
4764 /// the first two elements are used.
4766 /// A literal constant scale factor (1, 2, 4, or 8) for the indexes in \a i.
4768 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
4769 #define _mm_i32gather_pd(m, i, s) \
4770 ((__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
4771 (double const *)(m), \
4772 (__v4si)(__m128i)(i), \
4773 (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
4774 _mm_setzero_pd()), \
4777 /// Gathers four 64-bit floating-point values from memory \a m using scaled
4778 /// indexes from the 128-bit vector of [4 x i32] in \a i.
4780 /// \code{.operation}
4781 /// FOR element := 0 to 3
4784 /// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4788 /// \headerfile <immintrin.h>
4791 /// __m256d _mm256_i32gather_pd(const double *m, __m128i i, const int s);
4794 /// This intrinsic corresponds to the \c VGATHERDPD instruction.
4797 /// A pointer to the memory used for loading values.
4799 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4801 /// A literal constant scale factor (1, 2, 4, or 8) for the indexes in \a i.
4803 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
4804 #define _mm256_i32gather_pd(m, i, s) \
4805 ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
4806 (double const *)(m), \
4807 (__v4si)(__m128i)(i), \
4808 (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
4809 _mm256_setzero_pd(), \
4813 /// Gathers two 64-bit floating-point values from memory \a m using scaled
4814 /// indexes from the 128-bit vector of [2 x i64] in \a i.
4816 /// \code{.operation}
4817 /// FOR element := 0 to 1
4820 /// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4824 /// \headerfile <immintrin.h>
4827 /// __m128d _mm_i64gather_pd(const double *m, __m128i i, const int s);
4830 /// This intrinsic corresponds to the \c VGATHERQPD instruction.
4833 /// A pointer to the memory used for loading values.
4835 /// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4837 /// A literal constant scale factor (1, 2, 4, or 8) for the indexes in \a i.
4839 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
4840 #define _mm_i64gather_pd(m, i, s) \
4841 ((__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
4842 (double const *)(m), \
4843 (__v2di)(__m128i)(i), \
4844 (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
4845 _mm_setzero_pd()), \
4848 /// Gathers four 64-bit floating-point values from memory \a m using scaled
4849 /// indexes from the 256-bit vector of [4 x i64] in \a i.
4851 /// \code{.operation}
4852 /// FOR element := 0 to 3
4855 /// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4859 /// \headerfile <immintrin.h>
4862 /// __m256d _mm256_i64gather_pd(const double *m, __m256i i, const int s);
4865 /// This intrinsic corresponds to the \c VGATHERQPD instruction.
4868 /// A pointer to the memory used for loading values.
4870 /// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4872 /// A literal constant scale factor (1, 2, 4, or 8) for the indexes in \a i.
4874 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
4875 #define _mm256_i64gather_pd(m, i, s) \
4876 ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
4877 (double const *)(m), \
4878 (__v4di)(__m256i)(i), \
4879 (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
4880 _mm256_setzero_pd(), \
4884 /// Gathers four 32-bit floating-point values from memory \a m using scaled
4885 /// indexes from the 128-bit vector of [4 x i32] in \a i.
4887 /// \code{.operation}
4888 /// FOR element := 0 to 3
4891 /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4895 /// \headerfile <immintrin.h>
4898 /// __m128 _mm_i32gather_ps(const float *m, __m128i i, const int s);
4901 /// This intrinsic corresponds to the \c VGATHERDPS instruction.
4904 /// A pointer to the memory used for loading values.
4906 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4908 /// A literal constant scale factor (1, 2, 4, or 8) for the indexes in \a i.
4910 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
4911 #define _mm_i32gather_ps(m, i, s) \
4912 ((__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
4913 (float const *)(m), \
4914 (__v4si)(__m128i)(i), \
4915 (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
4916 _mm_setzero_ps()), \
4919 /// Gathers eight 32-bit floating-point values from memory \a m using scaled
4920 /// indexes from the 256-bit vector of [8 x i32] in \a i.
4922 /// \code{.operation}
4923 /// FOR element := 0 to 7
4926 /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4930 /// \headerfile <immintrin.h>
4933 /// __m256 _mm256_i32gather_ps(const float *m, __m256i i, const int s);
4936 /// This intrinsic corresponds to the \c VGATHERDPS instruction.
4939 /// A pointer to the memory used for loading values.
4941 /// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4943 /// A literal constant scale factor (1, 2, 4, or 8) for the indexes in \a i.
4945 /// \returns A 256-bit vector of [8 x float] containing the gathered values.
4946 #define _mm256_i32gather_ps(m, i, s) \
4947 ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
4948 (float const *)(m), \
4949 (__v8si)(__m256i)(i), \
4950 (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \
4951 _mm256_setzero_ps(), \
4955 /// Gathers two 32-bit floating-point values from memory \a m using scaled
4956 /// indexes from the 128-bit vector of [2 x i64] in \a i. The upper two
4957 /// elements of the result are zeroed.
4959 /// \code{.operation}
4960 /// FOR element := 0 to 1
4963 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4965 /// result[127:64] := 0
4968 /// \headerfile <immintrin.h>
4971 /// __m128 _mm_i64gather_ps(const float *m, __m128i i, const int s);
4974 /// This intrinsic corresponds to the \c VGATHERQPS instruction.
4977 /// A pointer to the memory used for loading values.
4979 /// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4981 /// A literal constant scale factor (1, 2, 4, or 8) for the indexes in \a i.
4983 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
4984 #define _mm_i64gather_ps(m, i, s) \
4985 ((__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
4986 (float const *)(m), \
4987 (__v2di)(__m128i)(i), \
4988 (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
4989 _mm_setzero_ps()), \
4992 /// Gathers four 32-bit floating-point values from memory \a m using scaled
4993 /// indexes from the 256-bit vector of [4 x i64] in \a i.
4995 /// \code{.operation}
4996 /// FOR element := 0 to 3
4999 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
5003 /// \headerfile <immintrin.h>
5006 /// __m128 _mm256_i64gather_ps(const float *m, __m256i i, const int s);
5009 /// This intrinsic corresponds to the \c VGATHERQPS instruction.
5012 /// A pointer to the memory used for loading values.
5014 /// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5016 /// A literal constant scale factor (1, 2, 4, or 8) for the indexes in \a i.
5018 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
5019 #define _mm256_i64gather_ps(m, i, s) \
5020 ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
5021 (float const *)(m), \
5022 (__v4di)(__m256i)(i), \
5023 (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
5024 _mm_setzero_ps()), \
5027 /// Gathers four 32-bit integer values from memory \a m using scaled
5028 /// indexes from the 128-bit vector of [4 x i32] in \a i.
5030 /// \code{.operation}
5031 /// FOR element := 0 to 3
5034 /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
5038 /// \headerfile <immintrin.h>
5041 /// __m128i _mm_i32gather_epi32(const int *m, __m128i i, const int s);
5044 /// This intrinsic corresponds to the \c VPGATHERDD instruction.
5047 /// A pointer to the memory used for loading values.
5049 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
5051 /// A literal constant scale factor (1, 2, 4, or 8) for the indexes in \a i.
5053 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
5054 #define _mm_i32gather_epi32(m, i, s) \
5055 ((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
5056 (int const *)(m), (__v4si)(__m128i)(i), \
5057 (__v4si)_mm_set1_epi32(-1), (s)))
5059 /// Gathers eight 32-bit integer values from memory \a m using scaled
5060 /// indexes from the 256-bit vector of [8 x i32] in \a i.
5062 /// \code{.operation}
5063 /// FOR element := 0 to 7
5066 /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
5070 /// \headerfile <immintrin.h>
5073 /// __m256i _mm256_i32gather_epi32(const int *m, __m256i i, const int s);
5076 /// This intrinsic corresponds to the \c VPGATHERDD instruction.
5079 /// A pointer to the memory used for loading values.
5081 /// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
5083 /// A literal constant scale factor (1, 2, 4, or 8) for the indexes in \a i.
5085 /// \returns A 256-bit vector of [8 x i32] containing the gathered values.
5086 #define _mm256_i32gather_epi32(m, i, s) \
5087 ((__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
5088 (int const *)(m), (__v8si)(__m256i)(i), \
5089 (__v8si)_mm256_set1_epi32(-1), (s)))
5091 /// Gathers two 32-bit integer values from memory \a m using scaled indexes
5092 /// from the 128-bit vector of [2 x i64] in \a i. The upper two elements
5093 /// of the result are zeroed.
5095 /// \code{.operation}
5096 /// FOR element := 0 to 1
5099 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
5101 /// result[127:64] := 0
5104 /// \headerfile <immintrin.h>
5107 /// __m128i _mm_i64gather_epi32(const int *m, __m128i i, const int s);
5110 /// This intrinsic corresponds to the \c VPGATHERQD instruction.
5113 /// A pointer to the memory used for loading values.
5115 /// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
5117 /// A literal constant scale factor (1, 2, 4, or 8) for the indexes in \a i.
5119 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
5120 #define _mm_i64gather_epi32(m, i, s) \
5121 ((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
5122 (int const *)(m), (__v2di)(__m128i)(i), \
5123 (__v4si)_mm_set1_epi32(-1), (s)))
5125 /// Gathers four 32-bit integer values from memory \a m using scaled indexes
5126 /// from the 256-bit vector of [4 x i64] in \a i.
5128 /// \code{.operation}
5129 /// FOR element := 0 to 3
5132 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
5136 /// \headerfile <immintrin.h>
5139 /// __m128i _mm256_i64gather_epi32(const int *m, __m256i i, const int s);
5142 /// This intrinsic corresponds to the \c VPGATHERQD instruction.
5145 /// A pointer to the memory used for loading values.
5147 /// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5149 /// A literal constant scale factor (1, 2, 4, or 8) for the indexes in \a i.
5151 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
5152 #define _mm256_i64gather_epi32(m, i, s) \
5153 ((__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
5154 (int const *)(m), (__v4di)(__m256i)(i), \
5155 (__v4si)_mm_set1_epi32(-1), (s)))
5157 /// Gathers two 64-bit integer values from memory \a m using scaled indexes
5158 /// from the 128-bit vector of [4 x i32] in \a i.
5160 /// \code{.operation}
5161 /// FOR element := 0 to 1
5164 /// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
5168 /// \headerfile <immintrin.h>
5171 /// __m128i _mm_i32gather_epi64(const long long *m, __m128i i, const int s);
5174 /// This intrinsic corresponds to the \c VPGATHERDQ instruction.
5177 /// A pointer to the memory used for loading values.
5179 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
5180 /// the first two elements are used.
5182 /// A literal constant scale factor (1, 2, 4, or 8) for the indexes in \a i.
5184 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
5185 #define _mm_i32gather_epi64(m, i, s) \
5186 ((__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
5187 (long long const *)(m), \
5188 (__v4si)(__m128i)(i), \
5189 (__v2di)_mm_set1_epi64x(-1), (s)))
5191 /// Gathers four 64-bit integer values from memory \a m using scaled indexes
5192 /// from the 128-bit vector of [4 x i32] in \a i.
5194 /// \code{.operation}
5195 /// FOR element := 0 to 3
5198 /// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
5202 /// \headerfile <immintrin.h>
5205 /// __m256i _mm256_i32gather_epi64(const long long *m, __m128i i, const int s);
5208 /// This intrinsic corresponds to the \c VPGATHERDQ instruction.
5211 /// A pointer to the memory used for loading values.
5213 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
5215 /// A literal constant scale factor (1, 2, 4, or 8) for the indexes in \a i.
5217 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
5218 #define _mm256_i32gather_epi64(m, i, s) \
5219 ((__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
5220 (long long const *)(m), \
5221 (__v4si)(__m128i)(i), \
5222 (__v4di)_mm256_set1_epi64x(-1), (s)))
5224 /// Gathers two 64-bit integer values from memory \a m using scaled indexes
5225 /// from the 128-bit vector of [2 x i64] in \a i.
5227 /// \code{.operation}
5228 /// FOR element := 0 to 1
5231 /// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
5235 /// \headerfile <immintrin.h>
5238 /// __m128i _mm_i64gather_epi64(const long long *m, __m128i i, const int s);
5241 /// This intrinsic corresponds to the \c VPGATHERQQ instruction.
5244 /// A pointer to the memory used for loading values.
5246 /// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
5248 /// A literal constant scale factor (1, 2, 4, or 8) for the indexes in \a i.
5250 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
5251 #define _mm_i64gather_epi64(m, i, s) \
5252 ((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
5253 (long long const *)(m), \
5254 (__v2di)(__m128i)(i), \
5255 (__v2di)_mm_set1_epi64x(-1), (s)))
5257 /// Gathers four 64-bit integer values from memory \a m using scaled indexes
5258 /// from the 256-bit vector of [4 x i64] in \a i.
5260 /// \code{.operation}
5261 /// FOR element := 0 to 3
5264 /// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
5268 /// \headerfile <immintrin.h>
5271 /// __m256i _mm256_i64gather_epi64(const long long *m, __m256i i, const int s);
5274 /// This intrinsic corresponds to the \c VPGATHERQQ instruction.
5277 /// A pointer to the memory used for loading values.
5279 /// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5281 /// A literal constant scale factor (1, 2, 4, or 8) for the indexes in \a i.
5283 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
5284 #define _mm256_i64gather_epi64(m, i, s) \
5285 ((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
5286 (long long const *)(m), \
5287 (__v4di)(__m256i)(i), \
5288 (__v4di)_mm256_set1_epi64x(-1), (s)))
5290 #undef __DEFAULT_FN_ATTRS256
5291 #undef __DEFAULT_FN_ATTRS128
5293 #endif /* __AVX2INTRIN_H */