[clangd] Re-land "support outgoing calls in call hierarchy" (#117673)
[llvm-project.git] / clang / lib / Headers / avx2intrin.h
blobdc9fc073143236593b921831e2f2c07717fa42cc
1 /*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 *===-----------------------------------------------------------------------===
8 */
10 #ifndef __IMMINTRIN_H
11 #error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
12 #endif
14 #ifndef __AVX2INTRIN_H
15 #define __AVX2INTRIN_H
17 /* Define the default attributes for the functions in this file. */
18 #if defined(__EVEX512__) && !defined(__AVX10_1_512__)
19 #define __DEFAULT_FN_ATTRS256 \
20 __attribute__((__always_inline__, __nodebug__, \
21 __target__("avx2,no-evex512"), __min_vector_width__(256)))
22 #define __DEFAULT_FN_ATTRS128 \
23 __attribute__((__always_inline__, __nodebug__, \
24 __target__("avx2,no-evex512"), __min_vector_width__(128)))
25 #else
26 #define __DEFAULT_FN_ATTRS256 \
27 __attribute__((__always_inline__, __nodebug__, __target__("avx2"), \
28 __min_vector_width__(256)))
29 #define __DEFAULT_FN_ATTRS128 \
30 __attribute__((__always_inline__, __nodebug__, __target__("avx2"), \
31 __min_vector_width__(128)))
32 #endif
34 /* SSE4 Multiple Packed Sums of Absolute Difference. */
35 /// Computes sixteen sum of absolute difference (SAD) operations on sets of
36 /// four unsigned 8-bit integers from the 256-bit integer vectors \a X and
37 /// \a Y.
38 ///
39 /// Eight SAD results are computed using the lower half of the input
40 /// vectors, and another eight using the upper half. These 16-bit values
41 /// are returned in the lower and upper halves of the 256-bit result,
42 /// respectively.
43 ///
44 /// A single SAD operation selects four bytes from \a X and four bytes from
45 /// \a Y as input. It computes the differences between each \a X byte and
46 /// the corresponding \a Y byte, takes the absolute value of each
47 /// difference, and sums these four values to form one 16-bit result. The
48 /// intrinsic computes 16 of these results with different sets of input
49 /// bytes.
50 ///
51 /// For each set of eight results, the SAD operations use the same four
52 /// bytes from \a Y; the starting bit position for these four bytes is
53 /// \a M[1:0] times 32 for the lower half and \a M[4:3] times 32 for the
54 /// upper half. The eight operations use successive sets of four bytes from
55 /// \a X; the first set starts at bit \a M[2] times 32 for the lower half
56 /// and \a M[5] times 32 for the upper half, relative to the 128-bit lane.
57 ///
58 /// \code{.operation}
59 /// r := 0
60 /// FOR i := 0 TO 1
61 /// j := i*3
62 /// Ybase := M[j+1:j]*32 + i*128
63 /// Xbase := M[j+2]*32 + i*128
64 /// FOR k := 0 TO 7
65 /// temp0 := ABS(X[Xbase+7:Xbase] - Y[Ybase+7:Ybase])
66 /// temp1 := ABS(X[Xbase+15:Xbase+8] - Y[Ybase+15:Ybase+8])
67 /// temp2 := ABS(X[Xbase+23:Xbase+16] - Y[Ybase+23:Ybase+16])
68 /// temp3 := ABS(X[Xbase+31:Xbase+24] - Y[Ybase+31:Ybase+24])
69 /// result[r+15:r] := temp0 + temp1 + temp2 + temp3
70 /// Xbase := Xbase + 8
71 /// r := r + 16
72 /// ENDFOR
73 /// ENDFOR
74 /// \endcode
75 ///
76 /// \headerfile <immintrin.h>
77 ///
78 /// \code
79 /// __m256i _mm256_mpsadbw_epu8(__m256i X, __m256i Y, const int M);
80 /// \endcode
81 ///
82 /// This intrinsic corresponds to the \c VMPSADBW instruction.
83 ///
84 /// \param X
85 /// A 256-bit integer vector containing one of the inputs.
86 /// \param Y
87 /// A 256-bit integer vector containing one of the inputs.
88 /// \param M
89 /// An unsigned immediate value specifying the starting positions of the
90 /// bytes to operate on.
91 /// \returns A 256-bit vector of [16 x i16] containing the result.
92 #define _mm256_mpsadbw_epu8(X, Y, M) \
93 ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \
94 (__v32qi)(__m256i)(Y), (int)(M)))
96 /// Computes the absolute value of each signed byte in the 256-bit integer
97 /// vector \a __a and returns each value in the corresponding byte of
98 /// the result.
99 ///
100 /// \headerfile <immintrin.h>
102 /// This intrinsic corresponds to the \c VPABSB instruction.
104 /// \param __a
105 /// A 256-bit integer vector.
106 /// \returns A 256-bit integer vector containing the result.
107 static __inline__ __m256i __DEFAULT_FN_ATTRS256
108 _mm256_abs_epi8(__m256i __a)
110 return (__m256i)__builtin_elementwise_abs((__v32qs)__a);
113 /// Computes the absolute value of each signed 16-bit element in the 256-bit
114 /// vector of [16 x i16] in \a __a and returns each value in the
115 /// corresponding element of the result.
117 /// \headerfile <immintrin.h>
119 /// This intrinsic corresponds to the \c VPABSW instruction.
121 /// \param __a
122 /// A 256-bit vector of [16 x i16].
123 /// \returns A 256-bit vector of [16 x i16] containing the result.
124 static __inline__ __m256i __DEFAULT_FN_ATTRS256
125 _mm256_abs_epi16(__m256i __a)
127 return (__m256i)__builtin_elementwise_abs((__v16hi)__a);
130 /// Computes the absolute value of each signed 32-bit element in the 256-bit
131 /// vector of [8 x i32] in \a __a and returns each value in the
132 /// corresponding element of the result.
134 /// \headerfile <immintrin.h>
136 /// This intrinsic corresponds to the \c VPABSD instruction.
138 /// \param __a
139 /// A 256-bit vector of [8 x i32].
140 /// \returns A 256-bit vector of [8 x i32] containing the result.
141 static __inline__ __m256i __DEFAULT_FN_ATTRS256
142 _mm256_abs_epi32(__m256i __a)
144 return (__m256i)__builtin_elementwise_abs((__v8si)__a);
147 /// Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit
148 /// integers using signed saturation, and returns the 256-bit result.
150 /// \code{.operation}
151 /// FOR i := 0 TO 7
152 /// j := i*16
153 /// k := i*8
154 /// result[7+k:k] := SATURATE8(__a[15+j:j])
155 /// result[71+k:64+k] := SATURATE8(__b[15+j:j])
156 /// result[135+k:128+k] := SATURATE8(__a[143+j:128+j])
157 /// result[199+k:192+k] := SATURATE8(__b[143+j:128+j])
158 /// ENDFOR
159 /// \endcode
161 /// \headerfile <immintrin.h>
163 /// This intrinsic corresponds to the \c VPACKSSWB instruction.
165 /// \param __a
166 /// A 256-bit vector of [16 x i16] used to generate result[63:0] and
167 /// result[191:128].
168 /// \param __b
169 /// A 256-bit vector of [16 x i16] used to generate result[127:64] and
170 /// result[255:192].
171 /// \returns A 256-bit integer vector containing the result.
172 static __inline__ __m256i __DEFAULT_FN_ATTRS256
173 _mm256_packs_epi16(__m256i __a, __m256i __b)
175 return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
178 /// Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit
179 /// integers using signed saturation, and returns the resulting 256-bit
180 /// vector of [16 x i16].
182 /// \code{.operation}
183 /// FOR i := 0 TO 3
184 /// j := i*32
185 /// k := i*16
186 /// result[15+k:k] := SATURATE16(__a[31+j:j])
187 /// result[79+k:64+k] := SATURATE16(__b[31+j:j])
188 /// result[143+k:128+k] := SATURATE16(__a[159+j:128+j])
189 /// result[207+k:192+k] := SATURATE16(__b[159+j:128+j])
190 /// ENDFOR
191 /// \endcode
193 /// \headerfile <immintrin.h>
195 /// This intrinsic corresponds to the \c VPACKSSDW instruction.
197 /// \param __a
198 /// A 256-bit vector of [8 x i32] used to generate result[63:0] and
199 /// result[191:128].
200 /// \param __b
201 /// A 256-bit vector of [8 x i32] used to generate result[127:64] and
202 /// result[255:192].
203 /// \returns A 256-bit vector of [16 x i16] containing the result.
204 static __inline__ __m256i __DEFAULT_FN_ATTRS256
205 _mm256_packs_epi32(__m256i __a, __m256i __b)
207 return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
210 /// Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers
211 /// using unsigned saturation, and returns the 256-bit result.
213 /// \code{.operation}
214 /// FOR i := 0 TO 7
215 /// j := i*16
216 /// k := i*8
217 /// result[7+k:k] := SATURATE8U(__a[15+j:j])
218 /// result[71+k:64+k] := SATURATE8U(__b[15+j:j])
219 /// result[135+k:128+k] := SATURATE8U(__a[143+j:128+j])
220 /// result[199+k:192+k] := SATURATE8U(__b[143+j:128+j])
221 /// ENDFOR
222 /// \endcode
224 /// \headerfile <immintrin.h>
226 /// This intrinsic corresponds to the \c VPACKUSWB instruction.
228 /// \param __a
229 /// A 256-bit vector of [16 x i16] used to generate result[63:0] and
230 /// result[191:128].
231 /// \param __b
232 /// A 256-bit vector of [16 x i16] used to generate result[127:64] and
233 /// result[255:192].
234 /// \returns A 256-bit integer vector containing the result.
235 static __inline__ __m256i __DEFAULT_FN_ATTRS256
236 _mm256_packus_epi16(__m256i __a, __m256i __b)
238 return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
241 /// Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers
242 /// using unsigned saturation, and returns the resulting 256-bit vector of
243 /// [16 x i16].
245 /// \code{.operation}
246 /// FOR i := 0 TO 3
247 /// j := i*32
248 /// k := i*16
249 /// result[15+k:k] := SATURATE16U(__V1[31+j:j])
250 /// result[79+k:64+k] := SATURATE16U(__V2[31+j:j])
251 /// result[143+k:128+k] := SATURATE16U(__V1[159+j:128+j])
252 /// result[207+k:192+k] := SATURATE16U(__V2[159+j:128+j])
253 /// ENDFOR
254 /// \endcode
256 /// \headerfile <immintrin.h>
258 /// This intrinsic corresponds to the \c VPACKUSDW instruction.
260 /// \param __V1
261 /// A 256-bit vector of [8 x i32] used to generate result[63:0] and
262 /// result[191:128].
263 /// \param __V2
264 /// A 256-bit vector of [8 x i32] used to generate result[127:64] and
265 /// result[255:192].
266 /// \returns A 256-bit vector of [16 x i16] containing the result.
267 static __inline__ __m256i __DEFAULT_FN_ATTRS256
268 _mm256_packus_epi32(__m256i __V1, __m256i __V2)
270 return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
273 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
274 /// vectors and returns the lower 8 bits of each sum in the corresponding
275 /// byte of the 256-bit integer vector result (overflow is ignored).
277 /// \headerfile <immintrin.h>
279 /// This intrinsic corresponds to the \c VPADDB instruction.
281 /// \param __a
282 /// A 256-bit integer vector containing one of the source operands.
283 /// \param __b
284 /// A 256-bit integer vector containing one of the source operands.
285 /// \returns A 256-bit integer vector containing the sums.
286 static __inline__ __m256i __DEFAULT_FN_ATTRS256
287 _mm256_add_epi8(__m256i __a, __m256i __b)
289 return (__m256i)((__v32qu)__a + (__v32qu)__b);
292 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
293 /// [16 x i16] and returns the lower 16 bits of each sum in the
294 /// corresponding element of the [16 x i16] result (overflow is ignored).
296 /// \headerfile <immintrin.h>
298 /// This intrinsic corresponds to the \c VPADDW instruction.
300 /// \param __a
301 /// A 256-bit vector of [16 x i16] containing one of the source operands.
302 /// \param __b
303 /// A 256-bit vector of [16 x i16] containing one of the source operands.
304 /// \returns A 256-bit vector of [16 x i16] containing the sums.
305 static __inline__ __m256i __DEFAULT_FN_ATTRS256
306 _mm256_add_epi16(__m256i __a, __m256i __b)
308 return (__m256i)((__v16hu)__a + (__v16hu)__b);
311 /// Adds 32-bit integers from corresponding elements of two 256-bit vectors of
312 /// [8 x i32] and returns the lower 32 bits of each sum in the corresponding
313 /// element of the [8 x i32] result (overflow is ignored).
315 /// \headerfile <immintrin.h>
317 /// This intrinsic corresponds to the \c VPADDD instruction.
319 /// \param __a
320 /// A 256-bit vector of [8 x i32] containing one of the source operands.
321 /// \param __b
322 /// A 256-bit vector of [8 x i32] containing one of the source operands.
323 /// \returns A 256-bit vector of [8 x i32] containing the sums.
324 static __inline__ __m256i __DEFAULT_FN_ATTRS256
325 _mm256_add_epi32(__m256i __a, __m256i __b)
327 return (__m256i)((__v8su)__a + (__v8su)__b);
330 /// Adds 64-bit integers from corresponding elements of two 256-bit vectors of
331 /// [4 x i64] and returns the lower 64 bits of each sum in the corresponding
332 /// element of the [4 x i64] result (overflow is ignored).
334 /// \headerfile <immintrin.h>
336 /// This intrinsic corresponds to the \c VPADDQ instruction.
338 /// \param __a
339 /// A 256-bit vector of [4 x i64] containing one of the source operands.
340 /// \param __b
341 /// A 256-bit vector of [4 x i64] containing one of the source operands.
342 /// \returns A 256-bit vector of [4 x i64] containing the sums.
343 static __inline__ __m256i __DEFAULT_FN_ATTRS256
344 _mm256_add_epi64(__m256i __a, __m256i __b)
346 return (__m256i)((__v4du)__a + (__v4du)__b);
349 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
350 /// vectors using signed saturation, and returns each sum in the
351 /// corresponding byte of the 256-bit integer vector result.
353 /// \headerfile <immintrin.h>
355 /// This intrinsic corresponds to the \c VPADDSB instruction.
357 /// \param __a
358 /// A 256-bit integer vector containing one of the source operands.
359 /// \param __b
360 /// A 256-bit integer vector containing one of the source operands.
361 /// \returns A 256-bit integer vector containing the sums.
362 static __inline__ __m256i __DEFAULT_FN_ATTRS256
363 _mm256_adds_epi8(__m256i __a, __m256i __b)
365 return (__m256i)__builtin_elementwise_add_sat((__v32qs)__a, (__v32qs)__b);
368 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
369 /// [16 x i16] using signed saturation, and returns the [16 x i16] result.
371 /// \headerfile <immintrin.h>
373 /// This intrinsic corresponds to the \c VPADDSW instruction.
375 /// \param __a
376 /// A 256-bit vector of [16 x i16] containing one of the source operands.
377 /// \param __b
378 /// A 256-bit vector of [16 x i16] containing one of the source operands.
379 /// \returns A 256-bit vector of [16 x i16] containing the sums.
380 static __inline__ __m256i __DEFAULT_FN_ATTRS256
381 _mm256_adds_epi16(__m256i __a, __m256i __b)
383 return (__m256i)__builtin_elementwise_add_sat((__v16hi)__a, (__v16hi)__b);
386 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
387 /// vectors using unsigned saturation, and returns each sum in the
388 /// corresponding byte of the 256-bit integer vector result.
390 /// \headerfile <immintrin.h>
392 /// This intrinsic corresponds to the \c VPADDUSB instruction.
394 /// \param __a
395 /// A 256-bit integer vector containing one of the source operands.
396 /// \param __b
397 /// A 256-bit integer vector containing one of the source operands.
398 /// \returns A 256-bit integer vector containing the sums.
399 static __inline__ __m256i __DEFAULT_FN_ATTRS256
400 _mm256_adds_epu8(__m256i __a, __m256i __b)
402 return (__m256i)__builtin_elementwise_add_sat((__v32qu)__a, (__v32qu)__b);
405 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
406 /// [16 x i16] using unsigned saturation, and returns the [16 x i16] result.
408 /// \headerfile <immintrin.h>
410 /// This intrinsic corresponds to the \c VPADDUSW instruction.
412 /// \param __a
413 /// A 256-bit vector of [16 x i16] containing one of the source operands.
414 /// \param __b
415 /// A 256-bit vector of [16 x i16] containing one of the source operands.
416 /// \returns A 256-bit vector of [16 x i16] containing the sums.
417 static __inline__ __m256i __DEFAULT_FN_ATTRS256
418 _mm256_adds_epu16(__m256i __a, __m256i __b)
420 return (__m256i)__builtin_elementwise_add_sat((__v16hu)__a, (__v16hu)__b);
423 /// Uses the lower half of the 256-bit vector \a a as the upper half of a
424 /// temporary 256-bit value, and the lower half of the 256-bit vector \a b
425 /// as the lower half of the temporary value. Right-shifts the temporary
426 /// value by \a n bytes, and uses the lower 16 bytes of the shifted value
427 /// as the lower 16 bytes of the result. Uses the upper halves of \a a and
428 /// \a b to make another temporary value, right-shifts it by \a n bytes, and
429 /// uses the lower 16 bytes of the shifted value as the upper 16 bytes of the
430 /// result.
432 /// \headerfile <immintrin.h>
434 /// \code
435 /// __m256i _mm256_alignr_epi8(__m256i a, __m256i b, const int n);
436 /// \endcode
438 /// This intrinsic corresponds to the \c VPALIGNR instruction.
440 /// \param a
441 /// A 256-bit integer vector containing source values.
442 /// \param b
443 /// A 256-bit integer vector containing source values.
444 /// \param n
445 /// An immediate value specifying the number of bytes to shift.
446 /// \returns A 256-bit integer vector containing the result.
447 #define _mm256_alignr_epi8(a, b, n) \
448 ((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
449 (__v32qi)(__m256i)(b), (n)))
451 /// Computes the bitwise AND of the 256-bit integer vectors in \a __a and
452 /// \a __b.
454 /// \headerfile <immintrin.h>
456 /// This intrinsic corresponds to the \c VPAND instruction.
458 /// \param __a
459 /// A 256-bit integer vector.
460 /// \param __b
461 /// A 256-bit integer vector.
462 /// \returns A 256-bit integer vector containing the result.
463 static __inline__ __m256i __DEFAULT_FN_ATTRS256
464 _mm256_and_si256(__m256i __a, __m256i __b)
466 return (__m256i)((__v4du)__a & (__v4du)__b);
469 /// Computes the bitwise AND of the 256-bit integer vector in \a __b with
470 /// the bitwise NOT of the 256-bit integer vector in \a __a.
472 /// \headerfile <immintrin.h>
474 /// This intrinsic corresponds to the \c VPANDN instruction.
476 /// \param __a
477 /// A 256-bit integer vector.
478 /// \param __b
479 /// A 256-bit integer vector.
480 /// \returns A 256-bit integer vector containing the result.
481 static __inline__ __m256i __DEFAULT_FN_ATTRS256
482 _mm256_andnot_si256(__m256i __a, __m256i __b)
484 return (__m256i)(~(__v4du)__a & (__v4du)__b);
487 /// Computes the averages of the corresponding unsigned bytes in the two
488 /// 256-bit integer vectors in \a __a and \a __b and returns each
489 /// average in the corresponding byte of the 256-bit result.
491 /// \code{.operation}
492 /// FOR i := 0 TO 31
493 /// j := i*8
494 /// result[j+7:j] := (__a[j+7:j] + __b[j+7:j] + 1) >> 1
495 /// ENDFOR
496 /// \endcode
498 /// \headerfile <immintrin.h>
500 /// This intrinsic corresponds to the \c VPAVGB instruction.
502 /// \param __a
503 /// A 256-bit integer vector.
504 /// \param __b
505 /// A 256-bit integer vector.
506 /// \returns A 256-bit integer vector containing the result.
507 static __inline__ __m256i __DEFAULT_FN_ATTRS256
508 _mm256_avg_epu8(__m256i __a, __m256i __b)
510 return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b);
513 /// Computes the averages of the corresponding unsigned 16-bit integers in
514 /// the two 256-bit vectors of [16 x i16] in \a __a and \a __b and returns
515 /// each average in the corresponding element of the 256-bit result.
517 /// \code{.operation}
518 /// FOR i := 0 TO 15
519 /// j := i*16
520 /// result[j+15:j] := (__a[j+15:j] + __b[j+15:j] + 1) >> 1
521 /// ENDFOR
522 /// \endcode
524 /// \headerfile <immintrin.h>
526 /// This intrinsic corresponds to the \c VPAVGW instruction.
528 /// \param __a
529 /// A 256-bit vector of [16 x i16].
530 /// \param __b
531 /// A 256-bit vector of [16 x i16].
532 /// \returns A 256-bit vector of [16 x i16] containing the result.
533 static __inline__ __m256i __DEFAULT_FN_ATTRS256
534 _mm256_avg_epu16(__m256i __a, __m256i __b)
536 return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b);
539 /// Merges 8-bit integer values from either of the two 256-bit vectors
540 /// \a __V1 or \a __V2, as specified by the 256-bit mask \a __M and returns
541 /// the resulting 256-bit integer vector.
543 /// \code{.operation}
544 /// FOR i := 0 TO 31
545 /// j := i*8
546 /// IF __M[7+j] == 0
547 /// result[7+j:j] := __V1[7+j:j]
548 /// ELSE
549 /// result[7+j:j] := __V2[7+j:j]
550 /// FI
551 /// ENDFOR
552 /// \endcode
554 /// \headerfile <immintrin.h>
556 /// This intrinsic corresponds to the \c VPBLENDVB instruction.
558 /// \param __V1
559 /// A 256-bit integer vector containing source values.
560 /// \param __V2
561 /// A 256-bit integer vector containing source values.
562 /// \param __M
563 /// A 256-bit integer vector, with bit [7] of each byte specifying the
564 /// source for each corresponding byte of the result. When the mask bit
565 /// is 0, the byte is copied from \a __V1; otherwise, it is copied from
566 /// \a __V2.
567 /// \returns A 256-bit integer vector containing the result.
568 static __inline__ __m256i __DEFAULT_FN_ATTRS256
569 _mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
571 return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,
572 (__v32qi)__M);
575 /// Merges 16-bit integer values from either of the two 256-bit vectors
576 /// \a V1 or \a V2, as specified by the immediate integer operand \a M,
577 /// and returns the resulting 256-bit vector of [16 x i16].
579 /// \code{.operation}
580 /// FOR i := 0 TO 7
581 /// j := i*16
582 /// IF M[i] == 0
583 /// result[15+j:j] := V1[15+j:j]
584 /// result[143+j:128+j] := V1[143+j:128+j]
585 /// ELSE
586 /// result[15+j:j] := V2[15+j:j]
587 /// result[143+j:128+j] := V2[143+j:128+j]
588 /// FI
589 /// ENDFOR
590 /// \endcode
592 /// \headerfile <immintrin.h>
594 /// \code
595 /// __m256i _mm256_blend_epi16(__m256i V1, __m256i V2, const int M);
596 /// \endcode
598 /// This intrinsic corresponds to the \c VPBLENDW instruction.
600 /// \param V1
601 /// A 256-bit vector of [16 x i16] containing source values.
602 /// \param V2
603 /// A 256-bit vector of [16 x i16] containing source values.
604 /// \param M
605 /// An immediate 8-bit integer operand, with bits [7:0] specifying the
606 /// source for each element of the result. The position of the mask bit
607 /// corresponds to the index of a copied value. When a mask bit is 0, the
608 /// element is copied from \a V1; otherwise, it is copied from \a V2.
609 /// \a M[0] determines the source for elements 0 and 8, \a M[1] for
610 /// elements 1 and 9, and so forth.
611 /// \returns A 256-bit vector of [16 x i16] containing the result.
612 #define _mm256_blend_epi16(V1, V2, M) \
613 ((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \
614 (__v16hi)(__m256i)(V2), (int)(M)))
616 /// Compares corresponding bytes in the 256-bit integer vectors in \a __a and
617 /// \a __b for equality and returns the outcomes in the corresponding
618 /// bytes of the 256-bit result.
620 /// \code{.operation}
621 /// FOR i := 0 TO 31
622 /// j := i*8
623 /// result[j+7:j] := (__a[j+7:j] == __b[j+7:j]) ? 0xFF : 0
624 /// ENDFOR
625 /// \endcode
627 /// \headerfile <immintrin.h>
629 /// This intrinsic corresponds to the \c VPCMPEQB instruction.
631 /// \param __a
632 /// A 256-bit integer vector containing one of the inputs.
633 /// \param __b
634 /// A 256-bit integer vector containing one of the inputs.
635 /// \returns A 256-bit integer vector containing the result.
636 static __inline__ __m256i __DEFAULT_FN_ATTRS256
637 _mm256_cmpeq_epi8(__m256i __a, __m256i __b)
639 return (__m256i)((__v32qi)__a == (__v32qi)__b);
642 /// Compares corresponding elements in the 256-bit vectors of [16 x i16] in
643 /// \a __a and \a __b for equality and returns the outcomes in the
644 /// corresponding elements of the 256-bit result.
646 /// \code{.operation}
647 /// FOR i := 0 TO 15
648 /// j := i*16
649 /// result[j+15:j] := (__a[j+15:j] == __b[j+15:j]) ? 0xFFFF : 0
650 /// ENDFOR
651 /// \endcode
653 /// \headerfile <immintrin.h>
655 /// This intrinsic corresponds to the \c VPCMPEQW instruction.
657 /// \param __a
658 /// A 256-bit vector of [16 x i16] containing one of the inputs.
659 /// \param __b
660 /// A 256-bit vector of [16 x i16] containing one of the inputs.
661 /// \returns A 256-bit vector of [16 x i16] containing the result.
662 static __inline__ __m256i __DEFAULT_FN_ATTRS256
663 _mm256_cmpeq_epi16(__m256i __a, __m256i __b)
665 return (__m256i)((__v16hi)__a == (__v16hi)__b);
668 /// Compares corresponding elements in the 256-bit vectors of [8 x i32] in
669 /// \a __a and \a __b for equality and returns the outcomes in the
670 /// corresponding elements of the 256-bit result.
672 /// \code{.operation}
673 /// FOR i := 0 TO 7
674 /// j := i*32
675 /// result[j+31:j] := (__a[j+31:j] == __b[j+31:j]) ? 0xFFFFFFFF : 0
676 /// ENDFOR
677 /// \endcode
679 /// \headerfile <immintrin.h>
681 /// This intrinsic corresponds to the \c VPCMPEQD instruction.
683 /// \param __a
684 /// A 256-bit vector of [8 x i32] containing one of the inputs.
685 /// \param __b
686 /// A 256-bit vector of [8 x i32] containing one of the inputs.
687 /// \returns A 256-bit vector of [8 x i32] containing the result.
688 static __inline__ __m256i __DEFAULT_FN_ATTRS256
689 _mm256_cmpeq_epi32(__m256i __a, __m256i __b)
691 return (__m256i)((__v8si)__a == (__v8si)__b);
694 /// Compares corresponding elements in the 256-bit vectors of [4 x i64] in
695 /// \a __a and \a __b for equality and returns the outcomes in the
696 /// corresponding elements of the 256-bit result.
698 /// \code{.operation}
699 /// FOR i := 0 TO 3
700 /// j := i*64
701 /// result[j+63:j] := (__a[j+63:j] == __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
702 /// ENDFOR
703 /// \endcode
705 /// \headerfile <immintrin.h>
707 /// This intrinsic corresponds to the \c VPCMPEQQ instruction.
709 /// \param __a
710 /// A 256-bit vector of [4 x i64] containing one of the inputs.
711 /// \param __b
712 /// A 256-bit vector of [4 x i64] containing one of the inputs.
713 /// \returns A 256-bit vector of [4 x i64] containing the result.
714 static __inline__ __m256i __DEFAULT_FN_ATTRS256
715 _mm256_cmpeq_epi64(__m256i __a, __m256i __b)
717 return (__m256i)((__v4di)__a == (__v4di)__b);
720 /// Compares corresponding signed bytes in the 256-bit integer vectors in
721 /// \a __a and \a __b for greater-than and returns the outcomes in the
722 /// corresponding bytes of the 256-bit result.
724 /// \code{.operation}
725 /// FOR i := 0 TO 31
726 /// j := i*8
727 /// result[j+7:j] := (__a[j+7:j] > __b[j+7:j]) ? 0xFF : 0
728 /// ENDFOR
729 /// \endcode
731 /// \headerfile <immintrin.h>
733 /// This intrinsic corresponds to the \c VPCMPGTB instruction.
735 /// \param __a
736 /// A 256-bit integer vector containing one of the inputs.
737 /// \param __b
738 /// A 256-bit integer vector containing one of the inputs.
739 /// \returns A 256-bit integer vector containing the result.
740 static __inline__ __m256i __DEFAULT_FN_ATTRS256
741 _mm256_cmpgt_epi8(__m256i __a, __m256i __b)
743 /* This function always performs a signed comparison, but __v32qi is a char
744 which may be signed or unsigned, so use __v32qs. */
745 return (__m256i)((__v32qs)__a > (__v32qs)__b);
748 /// Compares corresponding signed elements in the 256-bit vectors of
749 /// [16 x i16] in \a __a and \a __b for greater-than and returns the
750 /// outcomes in the corresponding elements of the 256-bit result.
752 /// \code{.operation}
753 /// FOR i := 0 TO 15
754 /// j := i*16
755 /// result[j+15:j] := (__a[j+15:j] > __b[j+15:j]) ? 0xFFFF : 0
756 /// ENDFOR
757 /// \endcode
759 /// \headerfile <immintrin.h>
761 /// This intrinsic corresponds to the \c VPCMPGTW instruction.
763 /// \param __a
764 /// A 256-bit vector of [16 x i16] containing one of the inputs.
765 /// \param __b
766 /// A 256-bit vector of [16 x i16] containing one of the inputs.
767 /// \returns A 256-bit vector of [16 x i16] containing the result.
768 static __inline__ __m256i __DEFAULT_FN_ATTRS256
769 _mm256_cmpgt_epi16(__m256i __a, __m256i __b)
771 return (__m256i)((__v16hi)__a > (__v16hi)__b);
774 /// Compares corresponding signed elements in the 256-bit vectors of
775 /// [8 x i32] in \a __a and \a __b for greater-than and returns the
776 /// outcomes in the corresponding elements of the 256-bit result.
778 /// \code{.operation}
779 /// FOR i := 0 TO 7
780 /// j := i*32
781 /// result[j+31:j] := (__a[j+31:j] > __b[j+31:j]) ? 0xFFFFFFFF : 0
782 /// ENDFOR
783 /// \endcode
785 /// \headerfile <immintrin.h>
787 /// This intrinsic corresponds to the \c VPCMPGTD instruction.
789 /// \param __a
790 /// A 256-bit vector of [8 x i32] containing one of the inputs.
791 /// \param __b
792 /// A 256-bit vector of [8 x i32] containing one of the inputs.
793 /// \returns A 256-bit vector of [8 x i32] containing the result.
794 static __inline__ __m256i __DEFAULT_FN_ATTRS256
795 _mm256_cmpgt_epi32(__m256i __a, __m256i __b)
797 return (__m256i)((__v8si)__a > (__v8si)__b);
800 /// Compares corresponding signed elements in the 256-bit vectors of
801 /// [4 x i64] in \a __a and \a __b for greater-than and returns the
802 /// outcomes in the corresponding elements of the 256-bit result.
804 /// \code{.operation}
805 /// FOR i := 0 TO 3
806 /// j := i*64
807 /// result[j+63:j] := (__a[j+63:j] > __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
808 /// ENDFOR
809 /// \endcode
811 /// \headerfile <immintrin.h>
813 /// This intrinsic corresponds to the \c VPCMPGTQ instruction.
815 /// \param __a
816 /// A 256-bit vector of [4 x i64] containing one of the inputs.
817 /// \param __b
818 /// A 256-bit vector of [4 x i64] containing one of the inputs.
819 /// \returns A 256-bit vector of [4 x i64] containing the result.
820 static __inline__ __m256i __DEFAULT_FN_ATTRS256
821 _mm256_cmpgt_epi64(__m256i __a, __m256i __b)
823 return (__m256i)((__v4di)__a > (__v4di)__b);
826 /// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
827 /// vectors of [16 x i16] and returns the lower 16 bits of each sum in an
828 /// element of the [16 x i16] result (overflow is ignored). Sums from
829 /// \a __a are returned in the lower 64 bits of each 128-bit half of the
830 /// result; sums from \a __b are returned in the upper 64 bits of each
831 /// 128-bit half of the result.
833 /// \code{.operation}
834 /// FOR i := 0 TO 1
835 /// j := i*128
836 /// result[j+15:j] := __a[j+15:j] + __a[j+31:j+16]
837 /// result[j+31:j+16] := __a[j+47:j+32] + __a[j+63:j+48]
838 /// result[j+47:j+32] := __a[j+79:j+64] + __a[j+95:j+80]
839 /// result[j+63:j+48] := __a[j+111:j+96] + __a[j+127:j+112]
840 /// result[j+79:j+64] := __b[j+15:j] + __b[j+31:j+16]
841 /// result[j+95:j+80] := __b[j+47:j+32] + __b[j+63:j+48]
842 /// result[j+111:j+96] := __b[j+79:j+64] + __b[j+95:j+80]
843 /// result[j+127:j+112] := __b[j+111:j+96] + __b[j+127:j+112]
844 /// ENDFOR
845 /// \endcode
847 /// \headerfile <immintrin.h>
849 /// This intrinsic corresponds to the \c VPHADDW instruction.
851 /// \param __a
852 /// A 256-bit vector of [16 x i16] containing one of the source operands.
853 /// \param __b
854 /// A 256-bit vector of [16 x i16] containing one of the source operands.
855 /// \returns A 256-bit vector of [16 x i16] containing the sums.
856 static __inline__ __m256i __DEFAULT_FN_ATTRS256
857 _mm256_hadd_epi16(__m256i __a, __m256i __b)
859 return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
862 /// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit
863 /// vectors of [8 x i32] and returns the lower 32 bits of each sum in an
864 /// element of the [8 x i32] result (overflow is ignored). Sums from \a __a
865 /// are returned in the lower 64 bits of each 128-bit half of the result;
866 /// sums from \a __b are returned in the upper 64 bits of each 128-bit half
867 /// of the result.
869 /// \code{.operation}
870 /// FOR i := 0 TO 1
871 /// j := i*128
872 /// result[j+31:j] := __a[j+31:j] + __a[j+63:j+32]
873 /// result[j+63:j+32] := __a[j+95:j+64] + __a[j+127:j+96]
874 /// result[j+95:j+64] := __b[j+31:j] + __b[j+63:j+32]
875 /// result[j+127:j+96] := __b[j+95:j+64] + __b[j+127:j+96]
876 /// ENDFOR
877 /// \endcode
879 /// \headerfile <immintrin.h>
881 /// This intrinsic corresponds to the \c VPHADDD instruction.
883 /// \param __a
884 /// A 256-bit vector of [8 x i32] containing one of the source operands.
885 /// \param __b
886 /// A 256-bit vector of [8 x i32] containing one of the source operands.
887 /// \returns A 256-bit vector of [8 x i32] containing the sums.
888 static __inline__ __m256i __DEFAULT_FN_ATTRS256
889 _mm256_hadd_epi32(__m256i __a, __m256i __b)
891 return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
894 /// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
895 /// vectors of [16 x i16] using signed saturation and returns each sum in
896 /// an element of the [16 x i16] result. Sums from \a __a are returned in
897 /// the lower 64 bits of each 128-bit half of the result; sums from \a __b
898 /// are returned in the upper 64 bits of each 128-bit half of the result.
900 /// \code{.operation}
901 /// FOR i := 0 TO 1
902 /// j := i*128
903 /// result[j+15:j] := SATURATE16(__a[j+15:j] + __a[j+31:j+16])
904 /// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] + __a[j+63:j+48])
905 /// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] + __a[j+95:j+80])
906 /// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] + __a[j+127:j+112])
907 /// result[j+79:j+64] := SATURATE16(__b[j+15:j] + __b[j+31:j+16])
908 /// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] + __b[j+63:j+48])
909 /// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] + __b[j+95:j+80])
910 /// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] + __b[j+127:j+112])
911 /// ENDFOR
912 /// \endcode
914 /// \headerfile <immintrin.h>
916 /// This intrinsic corresponds to the \c VPHADDSW instruction.
918 /// \param __a
919 /// A 256-bit vector of [16 x i16] containing one of the source operands.
920 /// \param __b
921 /// A 256-bit vector of [16 x i16] containing one of the source operands.
922 /// \returns A 256-bit vector of [16 x i16] containing the sums.
923 static __inline__ __m256i __DEFAULT_FN_ATTRS256
924 _mm256_hadds_epi16(__m256i __a, __m256i __b)
926 return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
929 /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
930 /// vectors of [16 x i16] and returns the lower 16 bits of each difference
931 /// in an element of the [16 x i16] result (overflow is ignored).
932 /// Differences from \a __a are returned in the lower 64 bits of each
933 /// 128-bit half of the result; differences from \a __b are returned in the
934 /// upper 64 bits of each 128-bit half of the result.
936 /// \code{.operation}
937 /// FOR i := 0 TO 1
938 /// j := i*128
939 /// result[j+15:j] := __a[j+15:j] - __a[j+31:j+16]
940 /// result[j+31:j+16] := __a[j+47:j+32] - __a[j+63:j+48]
941 /// result[j+47:j+32] := __a[j+79:j+64] - __a[j+95:j+80]
942 /// result[j+63:j+48] := __a[j+111:j+96] - __a[j+127:j+112]
943 /// result[j+79:j+64] := __b[j+15:j] - __b[j+31:j+16]
944 /// result[j+95:j+80] := __b[j+47:j+32] - __b[j+63:j+48]
945 /// result[j+111:j+96] := __b[j+79:j+64] - __b[j+95:j+80]
946 /// result[j+127:j+112] := __b[j+111:j+96] - __b[j+127:j+112]
947 /// ENDFOR
948 /// \endcode
950 /// \headerfile <immintrin.h>
952 /// This intrinsic corresponds to the \c VPHSUBW instruction.
954 /// \param __a
955 /// A 256-bit vector of [16 x i16] containing one of the source operands.
956 /// \param __b
957 /// A 256-bit vector of [16 x i16] containing one of the source operands.
958 /// \returns A 256-bit vector of [16 x i16] containing the differences.
959 static __inline__ __m256i __DEFAULT_FN_ATTRS256
960 _mm256_hsub_epi16(__m256i __a, __m256i __b)
962 return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
965 /// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit
966 /// vectors of [8 x i32] and returns the lower 32 bits of each difference in
967 /// an element of the [8 x i32] result (overflow is ignored). Differences
968 /// from \a __a are returned in the lower 64 bits of each 128-bit half of
969 /// the result; differences from \a __b are returned in the upper 64 bits
970 /// of each 128-bit half of the result.
972 /// \code{.operation}
973 /// FOR i := 0 TO 1
974 /// j := i*128
975 /// result[j+31:j] := __a[j+31:j] - __a[j+63:j+32]
976 /// result[j+63:j+32] := __a[j+95:j+64] - __a[j+127:j+96]
977 /// result[j+95:j+64] := __b[j+31:j] - __b[j+63:j+32]
978 /// result[j+127:j+96] := __b[j+95:j+64] - __b[j+127:j+96]
979 /// ENDFOR
980 /// \endcode
982 /// \headerfile <immintrin.h>
984 /// This intrinsic corresponds to the \c VPHSUBD instruction.
986 /// \param __a
987 /// A 256-bit vector of [8 x i32] containing one of the source operands.
988 /// \param __b
989 /// A 256-bit vector of [8 x i32] containing one of the source operands.
990 /// \returns A 256-bit vector of [8 x i32] containing the differences.
991 static __inline__ __m256i __DEFAULT_FN_ATTRS256
992 _mm256_hsub_epi32(__m256i __a, __m256i __b)
994 return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
997 /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
998 /// vectors of [16 x i16] using signed saturation and returns each difference in
999 /// an element of the [16 x i16] result. Differences from \a __a are
1000 /// returned in the lower 64 bits of each 128-bit half of the result;
1001 /// differences from \a __b are returned in the upper 64 bits of each
1002 /// 128-bit half of the result.
1004 /// \code{.operation}
1005 /// FOR i := 0 TO 1
1006 /// j := i*128
1007 /// result[j+15:j] := SATURATE16(__a[j+15:j] - __a[j+31:j+16])
1008 /// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] - __a[j+63:j+48])
1009 /// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] - __a[j+95:j+80])
1010 /// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] - __a[j+127:j+112])
1011 /// result[j+79:j+64] := SATURATE16(__b[j+15:j] - __b[j+31:j+16])
1012 /// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] - __b[j+63:j+48])
1013 /// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] - __b[j+95:j+80])
1014 /// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] - __b[j+127:j+112])
1015 /// ENDFOR
1016 /// \endcode
1018 /// \headerfile <immintrin.h>
1020 /// This intrinsic corresponds to the \c VPHSUBSW instruction.
1022 /// \param __a
1023 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1024 /// \param __b
1025 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1026 /// \returns A 256-bit vector of [16 x i16] containing the differences.
1027 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1028 _mm256_hsubs_epi16(__m256i __a, __m256i __b)
1030 return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
1033 /// Multiplies each unsigned byte from the 256-bit integer vector in \a __a
1034 /// with the corresponding signed byte from the 256-bit integer vector in
1035 /// \a __b, forming signed 16-bit intermediate products. Adds adjacent
1036 /// pairs of those products using signed saturation to form 16-bit sums
1037 /// returned as elements of the [16 x i16] result.
1039 /// \code{.operation}
1040 /// FOR i := 0 TO 15
1041 /// j := i*16
1042 /// temp1 := __a[j+7:j] * __b[j+7:j]
1043 /// temp2 := __a[j+15:j+8] * __b[j+15:j+8]
1044 /// result[j+15:j] := SATURATE16(temp1 + temp2)
1045 /// ENDFOR
1046 /// \endcode
1048 /// \headerfile <immintrin.h>
1050 /// This intrinsic corresponds to the \c VPMADDUBSW instruction.
1052 /// \param __a
1053 /// A 256-bit vector containing one of the source operands.
1054 /// \param __b
1055 /// A 256-bit vector containing one of the source operands.
1056 /// \returns A 256-bit vector of [16 x i16] containing the result.
1057 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1058 _mm256_maddubs_epi16(__m256i __a, __m256i __b)
1060 return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
1063 /// Multiplies corresponding 16-bit elements of two 256-bit vectors of
1064 /// [16 x i16], forming 32-bit intermediate products, and adds pairs of
1065 /// those products to form 32-bit sums returned as elements of the
1066 /// [8 x i32] result.
1068 /// There is only one wraparound case: when all four of the 16-bit sources
1069 /// are \c 0x8000, the result will be \c 0x80000000.
1071 /// \code{.operation}
1072 /// FOR i := 0 TO 7
1073 /// j := i*32
1074 /// temp1 := __a[j+15:j] * __b[j+15:j]
1075 /// temp2 := __a[j+31:j+16] * __b[j+31:j+16]
1076 /// result[j+31:j] := temp1 + temp2
1077 /// ENDFOR
1078 /// \endcode
1080 /// \headerfile <immintrin.h>
1082 /// This intrinsic corresponds to the \c VPMADDWD instruction.
1084 /// \param __a
1085 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1086 /// \param __b
1087 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1088 /// \returns A 256-bit vector of [8 x i32] containing the result.
1089 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1090 _mm256_madd_epi16(__m256i __a, __m256i __b)
1092 return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
1095 /// Compares the corresponding signed bytes in the two 256-bit integer vectors
1096 /// in \a __a and \a __b and returns the larger of each pair in the
1097 /// corresponding byte of the 256-bit result.
1099 /// \headerfile <immintrin.h>
1101 /// This intrinsic corresponds to the \c VPMAXSB instruction.
1103 /// \param __a
1104 /// A 256-bit integer vector.
1105 /// \param __b
1106 /// A 256-bit integer vector.
1107 /// \returns A 256-bit integer vector containing the result.
1108 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1109 _mm256_max_epi8(__m256i __a, __m256i __b)
1111 return (__m256i)__builtin_elementwise_max((__v32qs)__a, (__v32qs)__b);
1114 /// Compares the corresponding signed 16-bit integers in the two 256-bit
1115 /// vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1116 /// each pair in the corresponding element of the 256-bit result.
1118 /// \headerfile <immintrin.h>
1120 /// This intrinsic corresponds to the \c VPMAXSW instruction.
1122 /// \param __a
1123 /// A 256-bit vector of [16 x i16].
1124 /// \param __b
1125 /// A 256-bit vector of [16 x i16].
1126 /// \returns A 256-bit vector of [16 x i16] containing the result.
1127 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1128 _mm256_max_epi16(__m256i __a, __m256i __b)
1130 return (__m256i)__builtin_elementwise_max((__v16hi)__a, (__v16hi)__b);
1133 /// Compares the corresponding signed 32-bit integers in the two 256-bit
1134 /// vectors of [8 x i32] in \a __a and \a __b and returns the larger of
1135 /// each pair in the corresponding element of the 256-bit result.
1137 /// \headerfile <immintrin.h>
1139 /// This intrinsic corresponds to the \c VPMAXSD instruction.
1141 /// \param __a
1142 /// A 256-bit vector of [8 x i32].
1143 /// \param __b
1144 /// A 256-bit vector of [8 x i32].
1145 /// \returns A 256-bit vector of [8 x i32] containing the result.
1146 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1147 _mm256_max_epi32(__m256i __a, __m256i __b)
1149 return (__m256i)__builtin_elementwise_max((__v8si)__a, (__v8si)__b);
1152 /// Compares the corresponding unsigned bytes in the two 256-bit integer
1153 /// vectors in \a __a and \a __b and returns the larger of each pair in
1154 /// the corresponding byte of the 256-bit result.
1156 /// \headerfile <immintrin.h>
1158 /// This intrinsic corresponds to the \c VPMAXUB instruction.
1160 /// \param __a
1161 /// A 256-bit integer vector.
1162 /// \param __b
1163 /// A 256-bit integer vector.
1164 /// \returns A 256-bit integer vector containing the result.
1165 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1166 _mm256_max_epu8(__m256i __a, __m256i __b)
1168 return (__m256i)__builtin_elementwise_max((__v32qu)__a, (__v32qu)__b);
1171 /// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1172 /// vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1173 /// each pair in the corresponding element of the 256-bit result.
1175 /// \headerfile <immintrin.h>
1177 /// This intrinsic corresponds to the \c VPMAXUW instruction.
1179 /// \param __a
1180 /// A 256-bit vector of [16 x i16].
1181 /// \param __b
1182 /// A 256-bit vector of [16 x i16].
1183 /// \returns A 256-bit vector of [16 x i16] containing the result.
1184 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1185 _mm256_max_epu16(__m256i __a, __m256i __b)
1187 return (__m256i)__builtin_elementwise_max((__v16hu)__a, (__v16hu)__b);
1190 /// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1191 /// vectors of [8 x i32] in \a __a and \a __b and returns the larger of
1192 /// each pair in the corresponding element of the 256-bit result.
1194 /// \headerfile <immintrin.h>
1196 /// This intrinsic corresponds to the \c VPMAXUD instruction.
1198 /// \param __a
1199 /// A 256-bit vector of [8 x i32].
1200 /// \param __b
1201 /// A 256-bit vector of [8 x i32].
1202 /// \returns A 256-bit vector of [8 x i32] containing the result.
1203 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1204 _mm256_max_epu32(__m256i __a, __m256i __b)
1206 return (__m256i)__builtin_elementwise_max((__v8su)__a, (__v8su)__b);
1209 /// Compares the corresponding signed bytes in the two 256-bit integer vectors
1210 /// in \a __a and \a __b and returns the smaller of each pair in the
1211 /// corresponding byte of the 256-bit result.
1213 /// \headerfile <immintrin.h>
1215 /// This intrinsic corresponds to the \c VPMINSB instruction.
1217 /// \param __a
1218 /// A 256-bit integer vector.
1219 /// \param __b
1220 /// A 256-bit integer vector.
1221 /// \returns A 256-bit integer vector containing the result.
1222 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1223 _mm256_min_epi8(__m256i __a, __m256i __b)
1225 return (__m256i)__builtin_elementwise_min((__v32qs)__a, (__v32qs)__b);
1228 /// Compares the corresponding signed 16-bit integers in the two 256-bit
1229 /// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1230 /// each pair in the corresponding element of the 256-bit result.
1232 /// \headerfile <immintrin.h>
1234 /// This intrinsic corresponds to the \c VPMINSW instruction.
1236 /// \param __a
1237 /// A 256-bit vector of [16 x i16].
1238 /// \param __b
1239 /// A 256-bit vector of [16 x i16].
1240 /// \returns A 256-bit vector of [16 x i16] containing the result.
1241 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1242 _mm256_min_epi16(__m256i __a, __m256i __b)
1244 return (__m256i)__builtin_elementwise_min((__v16hi)__a, (__v16hi)__b);
1247 /// Compares the corresponding signed 32-bit integers in the two 256-bit
1248 /// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
1249 /// each pair in the corresponding element of the 256-bit result.
1251 /// \headerfile <immintrin.h>
1253 /// This intrinsic corresponds to the \c VPMINSD instruction.
1255 /// \param __a
1256 /// A 256-bit vector of [8 x i32].
1257 /// \param __b
1258 /// A 256-bit vector of [8 x i32].
1259 /// \returns A 256-bit vector of [8 x i32] containing the result.
1260 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1261 _mm256_min_epi32(__m256i __a, __m256i __b)
1263 return (__m256i)__builtin_elementwise_min((__v8si)__a, (__v8si)__b);
1266 /// Compares the corresponding unsigned bytes in the two 256-bit integer
1267 /// vectors in \a __a and \a __b and returns the smaller of each pair in
1268 /// the corresponding byte of the 256-bit result.
1270 /// \headerfile <immintrin.h>
1272 /// This intrinsic corresponds to the \c VPMINUB instruction.
1274 /// \param __a
1275 /// A 256-bit integer vector.
1276 /// \param __b
1277 /// A 256-bit integer vector.
1278 /// \returns A 256-bit integer vector containing the result.
1279 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1280 _mm256_min_epu8(__m256i __a, __m256i __b)
1282 return (__m256i)__builtin_elementwise_min((__v32qu)__a, (__v32qu)__b);
1285 /// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1286 /// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1287 /// each pair in the corresponding element of the 256-bit result.
1289 /// \headerfile <immintrin.h>
1291 /// This intrinsic corresponds to the \c VPMINUW instruction.
1293 /// \param __a
1294 /// A 256-bit vector of [16 x i16].
1295 /// \param __b
1296 /// A 256-bit vector of [16 x i16].
1297 /// \returns A 256-bit vector of [16 x i16] containing the result.
1298 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1299 _mm256_min_epu16(__m256i __a, __m256i __b)
1301 return (__m256i)__builtin_elementwise_min((__v16hu)__a, (__v16hu)__b);
1304 /// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1305 /// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
1306 /// each pair in the corresponding element of the 256-bit result.
1308 /// \headerfile <immintrin.h>
1310 /// This intrinsic corresponds to the \c VPMINUD instruction.
1312 /// \param __a
1313 /// A 256-bit vector of [8 x i32].
1314 /// \param __b
1315 /// A 256-bit vector of [8 x i32].
1316 /// \returns A 256-bit vector of [8 x i32] containing the result.
1317 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1318 _mm256_min_epu32(__m256i __a, __m256i __b)
1320 return (__m256i)__builtin_elementwise_min((__v8su)__a, (__v8su)__b);
1323 /// Creates a 32-bit integer mask from the most significant bit of each byte
1324 /// in the 256-bit integer vector in \a __a and returns the result.
1326 /// \code{.operation}
1327 /// FOR i := 0 TO 31
1328 /// j := i*8
1329 /// result[i] := __a[j+7]
1330 /// ENDFOR
1331 /// \endcode
1333 /// \headerfile <immintrin.h>
1335 /// This intrinsic corresponds to the \c VPMOVMSKB instruction.
1337 /// \param __a
1338 /// A 256-bit integer vector containing the source bytes.
1339 /// \returns The 32-bit integer mask.
1340 static __inline__ int __DEFAULT_FN_ATTRS256
1341 _mm256_movemask_epi8(__m256i __a)
1343 return __builtin_ia32_pmovmskb256((__v32qi)__a);
1346 /// Sign-extends bytes from the 128-bit integer vector in \a __V and returns
1347 /// the 16-bit values in the corresponding elements of a 256-bit vector
1348 /// of [16 x i16].
1350 /// \code{.operation}
1351 /// FOR i := 0 TO 15
1352 /// j := i*8
1353 /// k := i*16
1354 /// result[k+15:k] := SignExtend(__V[j+7:j])
1355 /// ENDFOR
1356 /// \endcode
1358 /// \headerfile <immintrin.h>
1360 /// This intrinsic corresponds to the \c VPMOVSXBW instruction.
1362 /// \param __V
1363 /// A 128-bit integer vector containing the source bytes.
1364 /// \returns A 256-bit vector of [16 x i16] containing the sign-extended
1365 /// values.
1366 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1367 _mm256_cvtepi8_epi16(__m128i __V)
1369 /* This function always performs a signed extension, but __v16qi is a char
1370 which may be signed or unsigned, so use __v16qs. */
1371 return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi);
1374 /// Sign-extends bytes from the lower half of the 128-bit integer vector in
1375 /// \a __V and returns the 32-bit values in the corresponding elements of a
1376 /// 256-bit vector of [8 x i32].
1378 /// \code{.operation}
1379 /// FOR i := 0 TO 7
1380 /// j := i*8
1381 /// k := i*32
1382 /// result[k+31:k] := SignExtend(__V[j+7:j])
1383 /// ENDFOR
1384 /// \endcode
1386 /// \headerfile <immintrin.h>
1388 /// This intrinsic corresponds to the \c VPMOVSXBD instruction.
1390 /// \param __V
1391 /// A 128-bit integer vector containing the source bytes.
1392 /// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1393 /// values.
1394 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1395 _mm256_cvtepi8_epi32(__m128i __V)
1397 /* This function always performs a signed extension, but __v16qi is a char
1398 which may be signed or unsigned, so use __v16qs. */
1399 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
1402 /// Sign-extends the first four bytes from the 128-bit integer vector in
1403 /// \a __V and returns the 64-bit values in the corresponding elements of a
1404 /// 256-bit vector of [4 x i64].
1406 /// \code{.operation}
1407 /// result[63:0] := SignExtend(__V[7:0])
1408 /// result[127:64] := SignExtend(__V[15:8])
1409 /// result[191:128] := SignExtend(__V[23:16])
1410 /// result[255:192] := SignExtend(__V[31:24])
1411 /// \endcode
1413 /// \headerfile <immintrin.h>
1415 /// This intrinsic corresponds to the \c VPMOVSXBQ instruction.
1417 /// \param __V
1418 /// A 128-bit integer vector containing the source bytes.
1419 /// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1420 /// values.
1421 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1422 _mm256_cvtepi8_epi64(__m128i __V)
1424 /* This function always performs a signed extension, but __v16qi is a char
1425 which may be signed or unsigned, so use __v16qs. */
1426 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di);
1429 /// Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1430 /// \a __V and returns the 32-bit values in the corresponding elements of a
1431 /// 256-bit vector of [8 x i32].
1433 /// \code{.operation}
1434 /// FOR i := 0 TO 7
1435 /// j := i*16
1436 /// k := i*32
1437 /// result[k+31:k] := SignExtend(__V[j+15:j])
1438 /// ENDFOR
1439 /// \endcode
1441 /// \headerfile <immintrin.h>
1443 /// This intrinsic corresponds to the \c VPMOVSXWD instruction.
1445 /// \param __V
1446 /// A 128-bit vector of [8 x i16] containing the source values.
1447 /// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1448 /// values.
1449 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1450 _mm256_cvtepi16_epi32(__m128i __V)
1452 return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si);
1455 /// Sign-extends 16-bit elements from the lower half of the 128-bit vector of
1456 /// [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1457 /// elements of a 256-bit vector of [4 x i64].
1459 /// \code{.operation}
1460 /// result[63:0] := SignExtend(__V[15:0])
1461 /// result[127:64] := SignExtend(__V[31:16])
1462 /// result[191:128] := SignExtend(__V[47:32])
1463 /// result[255:192] := SignExtend(__V[63:48])
1464 /// \endcode
1466 /// \headerfile <immintrin.h>
1468 /// This intrinsic corresponds to the \c VPMOVSXWQ instruction.
1470 /// \param __V
1471 /// A 128-bit vector of [8 x i16] containing the source values.
1472 /// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1473 /// values.
1474 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1475 _mm256_cvtepi16_epi64(__m128i __V)
1477 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di);
1480 /// Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1481 /// \a __V and returns the 64-bit values in the corresponding elements of a
1482 /// 256-bit vector of [4 x i64].
1484 /// \code{.operation}
1485 /// result[63:0] := SignExtend(__V[31:0])
1486 /// result[127:64] := SignExtend(__V[63:32])
1487 /// result[191:128] := SignExtend(__V[95:64])
1488 /// result[255:192] := SignExtend(__V[127:96])
1489 /// \endcode
1491 /// \headerfile <immintrin.h>
1493 /// This intrinsic corresponds to the \c VPMOVSXDQ instruction.
1495 /// \param __V
1496 /// A 128-bit vector of [4 x i32] containing the source values.
1497 /// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1498 /// values.
1499 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1500 _mm256_cvtepi32_epi64(__m128i __V)
1502 return (__m256i)__builtin_convertvector((__v4si)__V, __v4di);
1505 /// Zero-extends bytes from the 128-bit integer vector in \a __V and returns
1506 /// the 16-bit values in the corresponding elements of a 256-bit vector
1507 /// of [16 x i16].
1509 /// \code{.operation}
1510 /// FOR i := 0 TO 15
1511 /// j := i*8
1512 /// k := i*16
1513 /// result[k+15:k] := ZeroExtend(__V[j+7:j])
1514 /// ENDFOR
1515 /// \endcode
1517 /// \headerfile <immintrin.h>
1519 /// This intrinsic corresponds to the \c VPMOVZXBW instruction.
1521 /// \param __V
1522 /// A 128-bit integer vector containing the source bytes.
1523 /// \returns A 256-bit vector of [16 x i16] containing the zero-extended
1524 /// values.
1525 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1526 _mm256_cvtepu8_epi16(__m128i __V)
1528 return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi);
1531 /// Zero-extends bytes from the lower half of the 128-bit integer vector in
1532 /// \a __V and returns the 32-bit values in the corresponding elements of a
1533 /// 256-bit vector of [8 x i32].
1535 /// \code{.operation}
1536 /// FOR i := 0 TO 7
1537 /// j := i*8
1538 /// k := i*32
1539 /// result[k+31:k] := ZeroExtend(__V[j+7:j])
1540 /// ENDFOR
1541 /// \endcode
1543 /// \headerfile <immintrin.h>
1545 /// This intrinsic corresponds to the \c VPMOVZXBD instruction.
1547 /// \param __V
1548 /// A 128-bit integer vector containing the source bytes.
1549 /// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1550 /// values.
1551 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1552 _mm256_cvtepu8_epi32(__m128i __V)
1554 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
1557 /// Zero-extends the first four bytes from the 128-bit integer vector in
1558 /// \a __V and returns the 64-bit values in the corresponding elements of a
1559 /// 256-bit vector of [4 x i64].
1561 /// \code{.operation}
1562 /// result[63:0] := ZeroExtend(__V[7:0])
1563 /// result[127:64] := ZeroExtend(__V[15:8])
1564 /// result[191:128] := ZeroExtend(__V[23:16])
1565 /// result[255:192] := ZeroExtend(__V[31:24])
1566 /// \endcode
1568 /// \headerfile <immintrin.h>
1570 /// This intrinsic corresponds to the \c VPMOVZXBQ instruction.
1572 /// \param __V
1573 /// A 128-bit integer vector containing the source bytes.
1574 /// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1575 /// values.
1576 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1577 _mm256_cvtepu8_epi64(__m128i __V)
1579 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di);
1582 /// Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1583 /// \a __V and returns the 32-bit values in the corresponding elements of a
1584 /// 256-bit vector of [8 x i32].
1586 /// \code{.operation}
1587 /// FOR i := 0 TO 7
1588 /// j := i*16
1589 /// k := i*32
1590 /// result[k+31:k] := ZeroExtend(__V[j+15:j])
1591 /// ENDFOR
1592 /// \endcode
1594 /// \headerfile <immintrin.h>
1596 /// This intrinsic corresponds to the \c VPMOVZXWD instruction.
1598 /// \param __V
1599 /// A 128-bit vector of [8 x i16] containing the source values.
1600 /// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1601 /// values.
1602 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1603 _mm256_cvtepu16_epi32(__m128i __V)
1605 return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si);
1608 /// Zero-extends 16-bit elements from the lower half of the 128-bit vector of
1609 /// [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1610 /// elements of a 256-bit vector of [4 x i64].
1612 /// \code{.operation}
1613 /// result[63:0] := ZeroExtend(__V[15:0])
1614 /// result[127:64] := ZeroExtend(__V[31:16])
1615 /// result[191:128] := ZeroExtend(__V[47:32])
1616 /// result[255:192] := ZeroExtend(__V[63:48])
1617 /// \endcode
1619 /// \headerfile <immintrin.h>
1621 /// This intrinsic corresponds to the \c VPMOVZXWQ instruction.
1623 /// \param __V
1624 /// A 128-bit vector of [8 x i16] containing the source values.
1625 /// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1626 /// values.
1627 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1628 _mm256_cvtepu16_epi64(__m128i __V)
1630 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di);
1633 /// Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1634 /// \a __V and returns the 64-bit values in the corresponding elements of a
1635 /// 256-bit vector of [4 x i64].
1637 /// \code{.operation}
1638 /// result[63:0] := ZeroExtend(__V[31:0])
1639 /// result[127:64] := ZeroExtend(__V[63:32])
1640 /// result[191:128] := ZeroExtend(__V[95:64])
1641 /// result[255:192] := ZeroExtend(__V[127:96])
1642 /// \endcode
1644 /// \headerfile <immintrin.h>
1646 /// This intrinsic corresponds to the \c VPMOVZXDQ instruction.
1648 /// \param __V
1649 /// A 128-bit vector of [4 x i32] containing the source values.
1650 /// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1651 /// values.
1652 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1653 _mm256_cvtepu32_epi64(__m128i __V)
1655 return (__m256i)__builtin_convertvector((__v4su)__V, __v4di);
1658 /// Multiplies signed 32-bit integers from even-numbered elements of two
1659 /// 256-bit vectors of [8 x i32] and returns the 64-bit products in the
1660 /// [4 x i64] result.
1662 /// \code{.operation}
1663 /// result[63:0] := __a[31:0] * __b[31:0]
1664 /// result[127:64] := __a[95:64] * __b[95:64]
1665 /// result[191:128] := __a[159:128] * __b[159:128]
1666 /// result[255:192] := __a[223:192] * __b[223:192]
1667 /// \endcode
1669 /// \headerfile <immintrin.h>
1671 /// This intrinsic corresponds to the \c VPMULDQ instruction.
1673 /// \param __a
1674 /// A 256-bit vector of [8 x i32] containing one of the source operands.
1675 /// \param __b
1676 /// A 256-bit vector of [8 x i32] containing one of the source operands.
1677 /// \returns A 256-bit vector of [4 x i64] containing the products.
1678 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1679 _mm256_mul_epi32(__m256i __a, __m256i __b)
1681 return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b);
1684 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1685 /// [16 x i16], truncates the 32-bit results to the most significant 18
1686 /// bits, rounds by adding 1, and returns bits [16:1] of each rounded
1687 /// product in the [16 x i16] result.
1689 /// \code{.operation}
1690 /// FOR i := 0 TO 15
1691 /// j := i*16
1692 /// temp := ((__a[j+15:j] * __b[j+15:j]) >> 14) + 1
1693 /// result[j+15:j] := temp[16:1]
/// ENDFOR
1694 /// \endcode
1696 /// \headerfile <immintrin.h>
1698 /// This intrinsic corresponds to the \c VPMULHRSW instruction.
1700 /// \param __a
1701 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1702 /// \param __b
1703 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1704 /// \returns A 256-bit vector of [16 x i16] containing the rounded products.
1705 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1706 _mm256_mulhrs_epi16(__m256i __a, __m256i __b)
1708 return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b);
1711 /// Multiplies unsigned 16-bit integer elements of two 256-bit vectors of
1712 /// [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1713 /// [16 x i16] result.
1715 /// \headerfile <immintrin.h>
1717 /// This intrinsic corresponds to the \c VPMULHUW instruction.
1719 /// \param __a
1720 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1721 /// \param __b
1722 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1723 /// \returns A 256-bit vector of [16 x i16] containing the products.
1724 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1725 _mm256_mulhi_epu16(__m256i __a, __m256i __b)
1727 return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b);
1730 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1731 /// [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1732 /// [16 x i16] result.
1734 /// \headerfile <immintrin.h>
1736 /// This intrinsic corresponds to the \c VPMULHW instruction.
1738 /// \param __a
1739 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1740 /// \param __b
1741 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1742 /// \returns A 256-bit vector of [16 x i16] containing the products.
1743 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1744 _mm256_mulhi_epi16(__m256i __a, __m256i __b)
  /* Same operand casts as _mm256_mulhi_epu16, but routed to the pmulhw256
     builtin for the signed high-half multiply. */
1746 return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b);
1749 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1750 /// [16 x i16], and returns the lower 16 bits of each 32-bit product in the
1751 /// [16 x i16] result.
1753 /// \headerfile <immintrin.h>
1755 /// This intrinsic corresponds to the \c VPMULLW instruction.
1757 /// \param __a
1758 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1759 /// \param __b
1760 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1761 /// \returns A 256-bit vector of [16 x i16] containing the products.
1762 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1763 _mm256_mullo_epi16(__m256i __a, __m256i __b)
  /* Written as a plain vector multiply rather than a builtin call.
     NOTE(review): presumably the __v16hu (unsigned) lanes are used so that
     wraparound of the product is well defined -- confirm against the typedef. */
1765 return (__m256i)((__v16hu)__a * (__v16hu)__b);
1768 /// Multiplies signed 32-bit integer elements of two 256-bit vectors of
1769 /// [8 x i32], and returns the lower 32 bits of each 64-bit product in the
1770 /// [8 x i32] result.
1772 /// \headerfile <immintrin.h>
1774 /// This intrinsic corresponds to the \c VPMULLD instruction.
1776 /// \param __a
1777 /// A 256-bit vector of [8 x i32] containing one of the source operands.
1778 /// \param __b
1779 /// A 256-bit vector of [8 x i32] containing one of the source operands.
1780 /// \returns A 256-bit vector of [8 x i32] containing the products.
1781 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1782 _mm256_mullo_epi32 (__m256i __a, __m256i __b)
  /* Plain vector multiply on __v8su lanes; each lane keeps only the low
     32 bits of its product. NOTE(review): presumably unsigned lanes are used
     so that wraparound is well defined -- confirm against the typedef. */
1784 return (__m256i)((__v8su)__a * (__v8su)__b);
1787 /// Multiplies unsigned 32-bit integers from even-numbered elements of two
1788 /// 256-bit vectors of [8 x i32] and returns the 64-bit products in the
1789 /// [4 x i64] result.
1791 /// \code{.operation}
1792 /// result[63:0] := __a[31:0] * __b[31:0]
1793 /// result[127:64] := __a[95:64] * __b[95:64]
1794 /// result[191:128] := __a[159:128] * __b[159:128]
1795 /// result[255:192] := __a[223:192] * __b[223:192]
1796 /// \endcode
1798 /// \headerfile <immintrin.h>
1800 /// This intrinsic corresponds to the \c VPMULUDQ instruction.
1802 /// \param __a
1803 /// A 256-bit vector of [8 x i32] containing one of the source operands.
1804 /// \param __b
1805 /// A 256-bit vector of [8 x i32] containing one of the source operands.
1806 /// \returns A 256-bit vector of [4 x i64] containing the products.
1807 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1808 _mm256_mul_epu32(__m256i __a, __m256i __b)
  /* Operands are viewed as __v8si for the builtin. Unlike most neighbouring
     wrappers, the builtin's result is returned without an explicit
     (__m256i) cast. */
1810 return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b);
1813 /// Computes the bitwise OR of the 256-bit integer vectors in \a __a and
1814 /// \a __b.
1816 /// \headerfile <immintrin.h>
1818 /// This intrinsic corresponds to the \c VPOR instruction.
1820 /// \param __a
1821 /// A 256-bit integer vector.
1822 /// \param __b
1823 /// A 256-bit integer vector.
1824 /// \returns A 256-bit integer vector containing the result.
1825 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1826 _mm256_or_si256(__m256i __a, __m256i __b)
  /* Plain vector | on __v4du lanes; for a bitwise OR of the whole vector the
     chosen lane width does not affect the result. */
1828 return (__m256i)((__v4du)__a | (__v4du)__b);
1831 /// Computes four sum of absolute difference (SAD) operations on sets of eight
1832 /// unsigned 8-bit integers from the 256-bit integer vectors \a __a and
1833 /// \a __b.
1835 /// One SAD result is computed for each set of eight bytes from \a __a and
1836 /// eight bytes from \a __b. The zero-extended SAD value is returned in the
1837 /// corresponding 64-bit element of the result.
1839 /// A single SAD operation takes the differences between the corresponding
1840 /// bytes of \a __a and \a __b, takes the absolute value of each difference,
1841 /// and sums these eight values to form one 16-bit result. This operation
1842 /// is repeated four times with successive sets of eight bytes.
1844 /// \code{.operation}
1845 /// FOR i := 0 TO 3
1846 /// j := i*64
1847 /// temp0 := ABS(__a[j+7:j] - __b[j+7:j])
1848 /// temp1 := ABS(__a[j+15:j+8] - __b[j+15:j+8])
1849 /// temp2 := ABS(__a[j+23:j+16] - __b[j+23:j+16])
1850 /// temp3 := ABS(__a[j+31:j+24] - __b[j+31:j+24])
1851 /// temp4 := ABS(__a[j+39:j+32] - __b[j+39:j+32])
1852 /// temp5 := ABS(__a[j+47:j+40] - __b[j+47:j+40])
1853 /// temp6 := ABS(__a[j+55:j+48] - __b[j+55:j+48])
1854 /// temp7 := ABS(__a[j+63:j+56] - __b[j+63:j+56])
1855 /// result[j+15:j] := temp0 + temp1 + temp2 + temp3 +
1856 /// temp4 + temp5 + temp6 + temp7
1857 /// result[j+63:j+16] := 0
1858 /// ENDFOR
1859 /// \endcode
1861 /// \headerfile <immintrin.h>
1863 /// This intrinsic corresponds to the \c VPSADBW instruction.
1865 /// \param __a
1866 /// A 256-bit integer vector.
1867 /// \param __b
1868 /// A 256-bit integer vector.
1869 /// \returns A 256-bit integer vector containing the result.
1870 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1871 _mm256_sad_epu8(__m256i __a, __m256i __b)
  /* Operands are viewed as 32 byte lanes (__v32qi); the builtin's result is
     returned without an explicit (__m256i) cast. */
1873 return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b);
1876 /// Shuffles 8-bit integers in the 256-bit integer vector \a __a according
1877 /// to control information in the 256-bit integer vector \a __b, and
1878 /// returns the 256-bit result. In effect there are two separate 128-bit
1879 /// shuffles in the lower and upper halves.
1881 /// \code{.operation}
1882 /// FOR i := 0 TO 31
1883 /// j := i*8
1884 /// IF __b[j+7] == 1
1885 /// result[j+7:j] := 0
1886 /// ELSE
1887 /// k := __b[j+3:j] * 8
1888 /// IF i > 15
1889 /// k := k + 128
1890 /// FI
1891 /// result[j+7:j] := __a[k+7:k]
1892 /// FI
1893 /// ENDFOR
1894 /// \endcode
1896 /// \headerfile <immintrin.h>
1898 /// This intrinsic corresponds to the \c VPSHUFB instruction.
1900 /// \param __a
1901 /// A 256-bit integer vector containing source values.
1902 /// \param __b
1903 /// A 256-bit integer vector containing control information to determine
1904 /// what goes into the corresponding byte of the result. If bit 7 of the
1905 /// control byte is 1, the result byte is 0; otherwise, bits 3:0 of the
1906 /// control byte specify the index (within the same 128-bit half) of \a __a
1907 /// to copy to the result byte.
1908 /// \returns A 256-bit integer vector containing the result.
1909 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1910 _mm256_shuffle_epi8(__m256i __a, __m256i __b)
  /* __a supplies the source bytes and __b the per-byte control values; both
     are viewed as __v32qi for the pshufb256 builtin. */
1912 return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
1915 /// Shuffles 32-bit integers from the 256-bit vector of [8 x i32] in \a a
1916 /// according to control information in the integer literal \a imm, and
1917 /// returns the 256-bit result. In effect there are two parallel 128-bit
1918 /// shuffles in the lower and upper halves.
1920 /// \code{.operation}
1921 /// FOR i := 0 TO 3
1922 /// j := i*32
1923 /// k := (imm >> i*2)[1:0] * 32
1924 /// result[j+31:j] := a[k+31:k]
1925 /// result[128+j+31:128+j] := a[128+k+31:128+k]
1926 /// ENDFOR
1927 /// \endcode
1929 /// \headerfile <immintrin.h>
1931 /// \code
1932 /// __m256i _mm256_shuffle_epi32(__m256i a, const int imm);
1933 /// \endcode
1935 /// This intrinsic corresponds to the \c VPSHUFD instruction.
1937 /// \param a
1938 /// A 256-bit vector of [8 x i32] containing source values.
1939 /// \param imm
1940 /// An immediate 8-bit value specifying which elements to copy from \a a.
1941 /// \a imm[1:0] specifies the index in \a a for elements 0 and 4 of the
1942 /// result, \a imm[3:2] specifies the index for elements 1 and 5, and so
1943 /// forth.
1944 /// \returns A 256-bit vector of [8 x i32] containing the result.
/* Macro rather than function: imm is forwarded (cast to int) directly to the
   pshufd256 builtin, and a is coerced to __m256i before being viewed as
   __v8si. NOTE(review): presumably a macro because the builtin needs imm as
   a compile-time constant -- confirm. */
1945 #define _mm256_shuffle_epi32(a, imm) \
1946 ((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm)))
1948 /// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a
1949 /// according to control information in the integer literal \a imm, and
1950 /// returns the 256-bit result. The upper 64 bits of each 128-bit half
1951 /// are shuffled in parallel; the lower 64 bits of each 128-bit half are
1952 /// copied from \a a unchanged.
1954 /// \code{.operation}
1955 /// result[63:0] := a[63:0]
1956 /// result[191:128] := a[191:128]
1957 /// FOR i := 0 TO 3
1958 /// j := i * 16 + 64
1959 /// k := (imm >> i*2)[1:0] * 16 + 64
1960 /// result[j+15:j] := a[k+15:k]
1961 /// result[128+j+15:128+j] := a[128+k+15:128+k]
1962 /// ENDFOR
1963 /// \endcode
1965 /// \headerfile <immintrin.h>
1967 /// \code
1968 /// __m256i _mm256_shufflehi_epi16(__m256i a, const int imm);
1969 /// \endcode
1971 /// This intrinsic corresponds to the \c VPSHUFHW instruction.
1973 /// \param a
1974 /// A 256-bit vector of [16 x i16] containing source values.
1975 /// \param imm
1976 /// An immediate 8-bit value specifying which elements to copy from \a a.
1977 /// \a imm[1:0] specifies the index in \a a for elements 4 and 12 of the
1978 /// result, \a imm[3:2] specifies the index for elements 5 and 13, and so
1979 /// forth. Indexes are offset by 4 (so 0 means index 4, and so forth).
1980 /// \returns A 256-bit vector of [16 x i16] containing the result.
/* Macro rather than function: imm is forwarded (cast to int) directly to the
   pshufhw256 builtin, and a is coerced to __m256i before being viewed as
   __v16hi. NOTE(review): presumably a macro because the builtin needs imm
   as a compile-time constant -- confirm. */
1981 #define _mm256_shufflehi_epi16(a, imm) \
1982 ((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm)))
1984 /// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] \a a
1985 /// according to control information in the integer literal \a imm, and
1986 /// returns the 256-bit [16 x i16] result. The lower 64 bits of each
1987 /// 128-bit half are shuffled; the upper 64 bits of each 128-bit half are
1988 /// copied from \a a unchanged.
1990 /// \code{.operation}
1991 /// result[127:64] := a[127:64]
1992 /// result[255:192] := a[255:192]
1993 /// FOR i := 0 TO 3
1994 /// j := i * 16
1995 /// k := (imm >> i*2)[1:0] * 16
1996 /// result[j+15:j] := a[k+15:k]
1997 /// result[128+j+15:128+j] := a[128+k+15:128+k]
1998 /// ENDFOR
1999 /// \endcode
2001 /// \headerfile <immintrin.h>
2003 /// \code
2004 /// __m256i _mm256_shufflelo_epi16(__m256i a, const int imm);
2005 /// \endcode
2007 /// This intrinsic corresponds to the \c VPSHUFLW instruction.
2009 /// \param a
2010 /// A 256-bit vector of [16 x i16] to use as a source of data for the
2011 /// result.
2012 /// \param imm
2013 /// An immediate 8-bit value specifying which elements to copy from \a a.
2014 /// \a imm[1:0] specifies the index in \a a for elements 0 and 8 of the
2015 /// result, \a imm[3:2] specifies the index for elements 1 and 9, and so
2016 /// forth.
2017 /// \returns A 256-bit vector of [16 x i16] containing the result.
/* Macro rather than function: imm is forwarded (cast to int) directly to the
   pshuflw256 builtin, and a is coerced to __m256i before being viewed as
   __v16hi. NOTE(review): presumably a macro because the builtin needs imm
   as a compile-time constant -- confirm. */
2018 #define _mm256_shufflelo_epi16(a, imm) \
2019 ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm)))
2021 /// Sets each byte of the result to the corresponding byte of the 256-bit
2022 /// integer vector in \a __a, the negative of that byte, or zero, depending
2023 /// on whether the corresponding byte of the 256-bit integer vector in
2024 /// \a __b is greater than zero, less than zero, or equal to zero,
2025 /// respectively.
2027 /// \headerfile <immintrin.h>
2029 /// This intrinsic corresponds to the \c VPSIGNB instruction.
2031 /// \param __a
2032 /// A 256-bit integer vector.
2033 /// \param __b
2034 /// A 256-bit integer vector.
2035 /// \returns A 256-bit integer vector containing the result.
2036 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2037 _mm256_sign_epi8(__m256i __a, __m256i __b)
  /* __a carries the values and __b only steers keep/negate/zero per byte;
     both are viewed as __v32qi for the psignb256 builtin. */
2039 return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b);
2042 /// Sets each element of the result to the corresponding element of the
2043 /// 256-bit vector of [16 x i16] in \a __a, the negative of that element,
2044 /// or zero, depending on whether the corresponding element of the 256-bit
2045 /// vector of [16 x i16] in \a __b is greater than zero, less than zero, or
2046 /// equal to zero, respectively.
2048 /// \headerfile <immintrin.h>
2050 /// This intrinsic corresponds to the \c VPSIGNW instruction.
2052 /// \param __a
2053 /// A 256-bit vector of [16 x i16].
2054 /// \param __b
2055 /// A 256-bit vector of [16 x i16].
2056 /// \returns A 256-bit vector of [16 x i16] containing the result.
2057 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2058 _mm256_sign_epi16(__m256i __a, __m256i __b)
  /* 16-bit variant of the sign operation: both operands are viewed as
     __v16hi for the psignw256 builtin. */
2060 return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);
2063 /// Sets each element of the result to the corresponding element of the
2064 /// 256-bit vector of [8 x i32] in \a __a, the negative of that element, or
2065 /// zero, depending on whether the corresponding element of the 256-bit
2066 /// vector of [8 x i32] in \a __b is greater than zero, less than zero, or
2067 /// equal to zero, respectively.
2069 /// \headerfile <immintrin.h>
2071 /// This intrinsic corresponds to the \c VPSIGND instruction.
2073 /// \param __a
2074 /// A 256-bit vector of [8 x i32].
2075 /// \param __b
2076 /// A 256-bit vector of [8 x i32].
2077 /// \returns A 256-bit vector of [8 x i32] containing the result.
2078 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2079 _mm256_sign_epi32(__m256i __a, __m256i __b)
  /* 32-bit variant of the sign operation: both operands are viewed as
     __v8si for the psignd256 builtin. */
2081 return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
2084 /// Shifts each 128-bit half of the 256-bit integer vector \a a left by
2085 /// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
2086 /// is greater than 15, the returned result is all zeroes.
2088 /// \headerfile <immintrin.h>
2090 /// \code
2091 /// __m256i _mm256_slli_si256(__m256i a, const int imm);
2092 /// \endcode
2094 /// This intrinsic corresponds to the \c VPSLLDQ instruction.
2096 /// \param a
2097 /// A 256-bit integer vector to be shifted.
2098 /// \param imm
2099 /// An unsigned immediate value specifying the shift count (in bytes).
2100 /// \returns A 256-bit integer vector containing the result.
/* Macro form: imm is forwarded as an int to the pslldqi256_byteshift builtin
   and a is coerced to __m256i, then viewed as __v4di. _mm256_bslli_epi128
   expands to the same call. */
2101 #define _mm256_slli_si256(a, imm) \
2102 ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
2104 /// Shifts each 128-bit half of the 256-bit integer vector \a a left by
2105 /// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
2106 /// is greater than 15, the returned result is all zeroes.
2108 /// \headerfile <immintrin.h>
2110 /// \code
2111 /// __m256i _mm256_bslli_epi128(__m256i a, const int imm);
2112 /// \endcode
2114 /// This intrinsic corresponds to the \c VPSLLDQ instruction.
2116 /// \param a
2117 /// A 256-bit integer vector to be shifted.
2118 /// \param imm
2119 /// An unsigned immediate value specifying the shift count (in bytes).
2120 /// \returns A 256-bit integer vector containing the result.
/* Expands to exactly the same builtin call as _mm256_slli_si256; only the
   macro name differs. */
2121 #define _mm256_bslli_epi128(a, imm) \
2122 ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
2124 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2125 /// left by \a __count bits, shifting in zero bits, and returns the result.
2126 /// If \a __count is greater than 15, the returned result is all zeroes.
2128 /// \headerfile <immintrin.h>
2130 /// This intrinsic corresponds to the \c VPSLLW instruction.
2132 /// \param __a
2133 /// A 256-bit vector of [16 x i16] to be shifted.
2134 /// \param __count
2135 /// An unsigned integer value specifying the shift count (in bits).
2136 /// \returns A 256-bit vector of [16 x i16] containing the result.
2137 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2138 _mm256_slli_epi16(__m256i __a, int __count)
  /* __count is a plain int and is handed to psllwi256 unchanged; no range
     check is done here, so the out-of-range behaviour documented above is
     left to the builtin. */
2140 return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count);
2143 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2144 /// left by the number of bits specified by the lower 64 bits of \a __count,
2145 /// shifting in zero bits, and returns the result. If \a __count is greater
2146 /// than 15, the returned result is all zeroes.
2148 /// \headerfile <immintrin.h>
2150 /// This intrinsic corresponds to the \c VPSLLW instruction.
2152 /// \param __a
2153 /// A 256-bit vector of [16 x i16] to be shifted.
2154 /// \param __count
2155 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2156 /// shift count (in bits). The upper element is ignored.
2157 /// \returns A 256-bit vector of [16 x i16] containing the result.
2158 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2159 _mm256_sll_epi16(__m256i __a, __m128i __count)
  /* The 128-bit count is viewed as __v8hi only to match the builtin's
     parameter type; the count is still read as documented above. */
2161 return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count);
2164 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2165 /// left by \a __count bits, shifting in zero bits, and returns the result.
2166 /// If \a __count is greater than 31, the returned result is all zeroes.
2168 /// \headerfile <immintrin.h>
2170 /// This intrinsic corresponds to the \c VPSLLD instruction.
2172 /// \param __a
2173 /// A 256-bit vector of [8 x i32] to be shifted.
2174 /// \param __count
2175 /// An unsigned integer value specifying the shift count (in bits).
2176 /// \returns A 256-bit vector of [8 x i32] containing the result.
2177 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2178 _mm256_slli_epi32(__m256i __a, int __count)
  /* __count is passed to pslldi256 unchanged; no range check is done in
     this wrapper. */
2180 return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count);
2183 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2184 /// left by the number of bits given in the lower 64 bits of \a __count,
2185 /// shifting in zero bits, and returns the result. If \a __count is greater
2186 /// than 31, the returned result is all zeroes.
2188 /// \headerfile <immintrin.h>
2190 /// This intrinsic corresponds to the \c VPSLLD instruction.
2192 /// \param __a
2193 /// A 256-bit vector of [8 x i32] to be shifted.
2194 /// \param __count
2195 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2196 /// shift count (in bits). The upper element is ignored.
2197 /// \returns A 256-bit vector of [8 x i32] containing the result.
2198 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2199 _mm256_sll_epi32(__m256i __a, __m128i __count)
  /* The 128-bit count is viewed as __v4si only to match the builtin's
     parameter type; the count is still read as documented above. */
2201 return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count);
2204 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2205 /// left by \a __count bits, shifting in zero bits, and returns the result.
2206 /// If \a __count is greater than 63, the returned result is all zeroes.
2208 /// \headerfile <immintrin.h>
2210 /// This intrinsic corresponds to the \c VPSLLQ instruction.
2212 /// \param __a
2213 /// A 256-bit vector of [4 x i64] to be shifted.
2214 /// \param __count
2215 /// An unsigned integer value specifying the shift count (in bits).
2216 /// \returns A 256-bit vector of [4 x i64] containing the result.
2217 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2218 _mm256_slli_epi64(__m256i __a, int __count)
  /* __count is passed to psllqi256 unchanged. Unlike the 16- and 32-bit
     variants, the builtin's result is returned without an explicit
     (__m256i) cast. */
2220 return __builtin_ia32_psllqi256((__v4di)__a, __count);
2223 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2224 /// left by the number of bits given in the lower 64 bits of \a __count,
2225 /// shifting in zero bits, and returns the result. If \a __count is greater
2226 /// than 63, the returned result is all zeroes.
2228 /// \headerfile <immintrin.h>
2230 /// This intrinsic corresponds to the \c VPSLLQ instruction.
2232 /// \param __a
2233 /// A 256-bit vector of [4 x i64] to be shifted.
2234 /// \param __count
2235 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2236 /// shift count (in bits). The upper element is ignored.
2237 /// \returns A 256-bit vector of [4 x i64] containing the result.
2238 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2239 _mm256_sll_epi64(__m256i __a, __m128i __count)
  /* Unlike the 16- and 32-bit variants, __count is passed without a cast and
     the result is returned without an explicit (__m256i) cast.
     NOTE(review): presumably __m128i already matches the builtin's count
     parameter type -- confirm. */
2241 return __builtin_ia32_psllq256((__v4di)__a, __count);
2244 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2245 /// right by \a __count bits, shifting in sign bits, and returns the result.
2246 /// If \a __count is greater than 15, each element of the result is either
2247 /// 0 or -1 according to the corresponding input sign bit.
2249 /// \headerfile <immintrin.h>
2251 /// This intrinsic corresponds to the \c VPSRAW instruction.
2253 /// \param __a
2254 /// A 256-bit vector of [16 x i16] to be shifted.
2255 /// \param __count
2256 /// An unsigned integer value specifying the shift count (in bits).
2257 /// \returns A 256-bit vector of [16 x i16] containing the result.
2258 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2259 _mm256_srai_epi16(__m256i __a, int __count)
  /* Arithmetic (sign-filling) shift: __count is passed to psrawi256
     unchanged, with no range check in this wrapper. */
2261 return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count);
2264 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2265 /// right by the number of bits given in the lower 64 bits of \a __count,
2266 /// shifting in sign bits, and returns the result. If \a __count is greater
2267 /// than 15, each element of the result is either 0 or -1 according to the
2268 /// corresponding input sign bit.
2270 /// \headerfile <immintrin.h>
2272 /// This intrinsic corresponds to the \c VPSRAW instruction.
2274 /// \param __a
2275 /// A 256-bit vector of [16 x i16] to be shifted.
2276 /// \param __count
2277 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2278 /// shift count (in bits). The upper element is ignored.
2279 /// \returns A 256-bit vector of [16 x i16] containing the result.
2280 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2281 _mm256_sra_epi16(__m256i __a, __m128i __count)
  /* The 128-bit count is viewed as __v8hi only to match the builtin's
     parameter type; the count is still read as documented above. */
2283 return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count);
2286 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2287 /// right by \a __count bits, shifting in sign bits, and returns the result.
2288 /// If \a __count is greater than 31, each element of the result is either
2289 /// 0 or -1 according to the corresponding input sign bit.
2291 /// \headerfile <immintrin.h>
2293 /// This intrinsic corresponds to the \c VPSRAD instruction.
2295 /// \param __a
2296 /// A 256-bit vector of [8 x i32] to be shifted.
2297 /// \param __count
2298 /// An unsigned integer value specifying the shift count (in bits).
2299 /// \returns A 256-bit vector of [8 x i32] containing the result.
2300 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2301 _mm256_srai_epi32(__m256i __a, int __count)
  /* Arithmetic (sign-filling) shift: __count is passed to psradi256
     unchanged, with no range check in this wrapper. */
2303 return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count);
2306 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2307 /// right by the number of bits given in the lower 64 bits of \a __count,
2308 /// shifting in sign bits, and returns the result. If \a __count is greater
2309 /// than 31, each element of the result is either 0 or -1 according to the
2310 /// corresponding input sign bit.
2312 /// \headerfile <immintrin.h>
2314 /// This intrinsic corresponds to the \c VPSRAD instruction.
2316 /// \param __a
2317 /// A 256-bit vector of [8 x i32] to be shifted.
2318 /// \param __count
2319 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2320 /// shift count (in bits). The upper element is ignored.
2321 /// \returns A 256-bit vector of [8 x i32] containing the result.
2322 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2323 _mm256_sra_epi32(__m256i __a, __m128i __count)
  /* The 128-bit count is viewed as __v4si only to match the builtin's
     parameter type; the count is still read as documented above. */
2325 return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
2328 /// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
2329 /// \a imm bytes, shifting in zero bytes, and returns the result. If
2330 /// \a imm is greater than 15, the returned result is all zeroes.
2332 /// \headerfile <immintrin.h>
2334 /// \code
2335 /// __m256i _mm256_srli_si256(__m256i a, const int imm);
2336 /// \endcode
2338 /// This intrinsic corresponds to the \c VPSRLDQ instruction.
2340 /// \param a
2341 /// A 256-bit integer vector to be shifted.
2342 /// \param imm
2343 /// An unsigned immediate value specifying the shift count (in bytes).
2344 /// \returns A 256-bit integer vector containing the result.
/* Macro form: imm is forwarded as an int to the psrldqi256_byteshift builtin.
   Unlike _mm256_slli_si256, a is passed as __m256i with no __v4di view.
   NOTE(review): confirm the two spellings are interchangeable for this
   builtin and consider aligning them. */
2345 #define _mm256_srli_si256(a, imm) \
2346 ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))
2348 /// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
2349 /// \a imm bytes, shifting in zero bytes, and returns the result. If
2350 /// \a imm is greater than 15, the returned result is all zeroes.
2352 /// \headerfile <immintrin.h>
2354 /// \code
2355 /// __m256i _mm256_bsrli_epi128(__m256i a, const int imm);
2356 /// \endcode
2358 /// This intrinsic corresponds to the \c VPSRLDQ instruction.
2360 /// \param a
2361 /// A 256-bit integer vector to be shifted.
2362 /// \param imm
2363 /// An unsigned immediate value specifying the shift count (in bytes).
2364 /// \returns A 256-bit integer vector containing the result.
/* Expands to exactly the same builtin call as _mm256_srli_si256; only the
   macro name differs. */
2365 #define _mm256_bsrli_epi128(a, imm) \
2366 ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))
2368 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2369 /// right by \a __count bits, shifting in zero bits, and returns the result.
2370 /// If \a __count is greater than 15, the returned result is all zeroes.
2372 /// \headerfile <immintrin.h>
2374 /// This intrinsic corresponds to the \c VPSRLW instruction.
2376 /// \param __a
2377 /// A 256-bit vector of [16 x i16] to be shifted.
2378 /// \param __count
2379 /// An unsigned integer value specifying the shift count (in bits).
2380 /// \returns A 256-bit vector of [16 x i16] containing the result.
2381 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2382 _mm256_srli_epi16(__m256i __a, int __count)
  /* Logical (zero-filling) shift: __count is passed to psrlwi256 unchanged,
     with no range check in this wrapper. */
2384 return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count);
2387 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2388 /// right by the number of bits given in the lower 64 bits of \a __count,
2389 /// shifting in zero bits, and returns the result. If \a __count is greater
2390 /// than 15, the returned result is all zeroes.
2392 /// \headerfile <immintrin.h>
2394 /// This intrinsic corresponds to the \c VPSRLW instruction.
2396 /// \param __a
2397 /// A 256-bit vector of [16 x i16] to be shifted.
2398 /// \param __count
2399 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2400 /// shift count (in bits). The upper element is ignored.
2401 /// \returns A 256-bit vector of [16 x i16] containing the result.
2402 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2403 _mm256_srl_epi16(__m256i __a, __m128i __count)
  /* The 128-bit count is viewed as __v8hi only to match the builtin's
     parameter type; the count is still read as documented above. */
2405 return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count);
2408 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2409 /// right by \a __count bits, shifting in zero bits, and returns the result.
2410 /// If \a __count is greater than 31, the returned result is all zeroes.
2412 /// \headerfile <immintrin.h>
2414 /// This intrinsic corresponds to the \c VPSRLD instruction.
2416 /// \param __a
2417 /// A 256-bit vector of [8 x i32] to be shifted.
2418 /// \param __count
2419 /// An unsigned integer value specifying the shift count (in bits).
2420 /// \returns A 256-bit vector of [8 x i32] containing the result.
2421 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2422 _mm256_srli_epi32(__m256i __a, int __count)
  /* Logical (zero-filling) shift: __count is passed to psrldi256 unchanged,
     with no range check in this wrapper. */
2424 return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count);
2427 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2428 /// right by the number of bits given in the lower 64 bits of \a __count,
2429 /// shifting in zero bits, and returns the result. If \a __count is greater
2430 /// than 31, the returned result is all zeroes.
2432 /// \headerfile <immintrin.h>
2434 /// This intrinsic corresponds to the \c VPSRLD instruction.
2436 /// \param __a
2437 /// A 256-bit vector of [8 x i32] to be shifted.
2438 /// \param __count
2439 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2440 /// shift count (in bits). The upper element is ignored.
2441 /// \returns A 256-bit vector of [8 x i32] containing the result.
2442 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2443 _mm256_srl_epi32(__m256i __a, __m128i __count)
  /* The 128-bit count is viewed as __v4si only to match the builtin's
     parameter type; the count is still read as documented above. */
2445 return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count);
2448 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2449 /// right by \a __count bits, shifting in zero bits, and returns the result.
2450 /// If \a __count is greater than 63, the returned result is all zeroes.
2452 /// \headerfile <immintrin.h>
2454 /// This intrinsic corresponds to the \c VPSRLQ instruction.
2456 /// \param __a
2457 /// A 256-bit vector of [4 x i64] to be shifted.
2458 /// \param __count
2459 /// An unsigned integer value specifying the shift count (in bits).
2460 /// \returns A 256-bit vector of [4 x i64] containing the result.
2461 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2462 _mm256_srli_epi64(__m256i __a, int __count)
  /* __count is passed to psrlqi256 unchanged. Unlike the 16- and 32-bit
     variants, the builtin's result is returned without an explicit
     (__m256i) cast. */
2464 return __builtin_ia32_psrlqi256((__v4di)__a, __count);
2467 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2468 /// right by the number of bits given in the lower 64 bits of \a __count,
2469 /// shifting in zero bits, and returns the result. If \a __count is greater
2470 /// than 63, the returned result is all zeroes.
2472 /// \headerfile <immintrin.h>
2474 /// This intrinsic corresponds to the \c VPSRLQ instruction.
2476 /// \param __a
2477 /// A 256-bit vector of [4 x i64] to be shifted.
2478 /// \param __count
2479 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2480 /// shift count (in bits). The upper element is ignored.
2481 /// \returns A 256-bit vector of [4 x i64] containing the result.
2482 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2483 _mm256_srl_epi64(__m256i __a, __m128i __count)
  /* Unlike the 16- and 32-bit variants, __count is passed without a cast and
     the result is returned without an explicit (__m256i) cast.
     NOTE(review): presumably __m128i already matches the builtin's count
     parameter type -- confirm. */
2485 return __builtin_ia32_psrlq256((__v4di)__a, __count);
2488 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2489 /// vectors. Returns the lower 8 bits of each difference in the
2490 /// corresponding byte of the 256-bit integer vector result (overflow is
2491 /// ignored).
2493 /// \code{.operation}
2494 /// FOR i := 0 TO 31
2495 /// j := i*8
2496 /// result[j+7:j] := __a[j+7:j] - __b[j+7:j]
2497 /// ENDFOR
2498 /// \endcode
2500 /// \headerfile <immintrin.h>
2502 /// This intrinsic corresponds to the \c VPSUBB instruction.
2504 /// \param __a
2505 /// A 256-bit integer vector containing the minuends.
2506 /// \param __b
2507 /// A 256-bit integer vector containing the subtrahends.
2508 /// \returns A 256-bit integer vector containing the differences.
2509 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2510 _mm256_sub_epi8(__m256i __a, __m256i __b)
  /* Plain vector subtraction on __v32qu lanes, matching the wrapping
     contract documented above. NOTE(review): presumably unsigned lanes are
     used so that wraparound is well defined -- confirm against the typedef. */
2512 return (__m256i)((__v32qu)__a - (__v32qu)__b);
2515 /// Subtracts 16-bit integers from corresponding elements of two 256-bit
2516 /// vectors of [16 x i16]. Returns the lower 16 bits of each difference in
2517 /// the corresponding element of the [16 x i16] result (overflow is
2518 /// ignored).
2520 /// \code{.operation}
2521 /// FOR i := 0 TO 15
2522 /// j := i*16
2523 /// result[j+15:j] := __a[j+15:j] - __b[j+15:j]
2524 /// ENDFOR
2525 /// \endcode
2527 /// \headerfile <immintrin.h>
2529 /// This intrinsic corresponds to the \c VPSUBW instruction.
2531 /// \param __a
2532 /// A 256-bit vector of [16 x i16] containing the minuends.
2533 /// \param __b
2534 /// A 256-bit vector of [16 x i16] containing the subtrahends.
2535 /// \returns A 256-bit vector of [16 x i16] containing the differences.
2536 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2537 _mm256_sub_epi16(__m256i __a, __m256i __b)
  /* Plain vector subtraction on __v16hu lanes, matching the wrapping
     contract documented above. NOTE(review): presumably unsigned lanes are
     used so that wraparound is well defined -- confirm against the typedef. */
2539 return (__m256i)((__v16hu)__a - (__v16hu)__b);
2542 /// Subtracts 32-bit integers from corresponding elements of two 256-bit
2543 /// vectors of [8 x i32]. Returns the lower 32 bits of each difference in
2544 /// the corresponding element of the [8 x i32] result (overflow is ignored).
2546 /// \code{.operation}
2547 /// FOR i := 0 TO 7
2548 /// j := i*32
2549 /// result[j+31:j] := __a[j+31:j] - __b[j+31:j]
2550 /// ENDFOR
2551 /// \endcode
2553 /// \headerfile <immintrin.h>
2555 /// This intrinsic corresponds to the \c VPSUBD instruction.
2557 /// \param __a
2558 /// A 256-bit vector of [8 x i32] containing the minuends.
2559 /// \param __b
2560 /// A 256-bit vector of [8 x i32] containing the subtrahends.
2561 /// \returns A 256-bit vector of [8 x i32] containing the differences.
2562 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2563 _mm256_sub_epi32(__m256i __a, __m256i __b)
2565 return (__m256i)((__v8su)__a - (__v8su)__b);
2568 /// Subtracts 64-bit integers from corresponding elements of two 256-bit
2569 /// vectors of [4 x i64]. Returns the lower 64 bits of each difference in
2570 /// the corresponding element of the [4 x i64] result (overflow is ignored).
2572 /// \code{.operation}
2573 /// FOR i := 0 TO 3
2574 /// j := i*64
2575 /// result[j+63:j] := __a[j+63:j] - __b[j+63:j]
2576 /// ENDFOR
2577 /// \endcode
2579 /// \headerfile <immintrin.h>
2581 /// This intrinsic corresponds to the \c VPSUBQ instruction.
2583 /// \param __a
2584 /// A 256-bit vector of [4 x i64] containing the minuends.
2585 /// \param __b
2586 /// A 256-bit vector of [4 x i64] containing the subtrahends.
2587 /// \returns A 256-bit vector of [4 x i64] containing the differences.
2588 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2589 _mm256_sub_epi64(__m256i __a, __m256i __b)
2591 return (__m256i)((__v4du)__a - (__v4du)__b);
2594 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2595 /// vectors using signed saturation, and returns each difference in the
2596 /// corresponding byte of the 256-bit integer vector result.
2598 /// \code{.operation}
2599 /// FOR i := 0 TO 31
2600 /// j := i*8
2601 /// result[j+7:j] := SATURATE8(__a[j+7:j] - __b[j+7:j])
2602 /// ENDFOR
2603 /// \endcode
2605 /// \headerfile <immintrin.h>
2607 /// This intrinsic corresponds to the \c VPSUBSB instruction.
2609 /// \param __a
2610 /// A 256-bit integer vector containing the minuends.
2611 /// \param __b
2612 /// A 256-bit integer vector containing the subtrahends.
2613 /// \returns A 256-bit integer vector containing the differences.
2614 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2615 _mm256_subs_epi8(__m256i __a, __m256i __b)
2617 return (__m256i)__builtin_elementwise_sub_sat((__v32qs)__a, (__v32qs)__b);
2620 /// Subtracts 16-bit integers from corresponding elements of two 256-bit
2621 /// vectors of [16 x i16] using signed saturation, and returns each
2622 /// difference in the corresponding element of the [16 x i16] result.
2624 /// \code{.operation}
2625 /// FOR i := 0 TO 15
2626 /// j := i*16
2627 /// result[j+15:j] := SATURATE16(__a[j+15:j] - __b[j+15:j])
2628 /// ENDFOR
2629 /// \endcode
2631 /// \headerfile <immintrin.h>
2633 /// This intrinsic corresponds to the \c VPSUBSW instruction.
2635 /// \param __a
2636 /// A 256-bit vector of [16 x i16] containing the minuends.
2637 /// \param __b
2638 /// A 256-bit vector of [16 x i16] containing the subtrahends.
2639 /// \returns A 256-bit vector of [16 x i16] containing the differences.
2640 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2641 _mm256_subs_epi16(__m256i __a, __m256i __b)
2643 return (__m256i)__builtin_elementwise_sub_sat((__v16hi)__a, (__v16hi)__b);
2646 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2647 /// vectors using unsigned saturation, and returns each difference in the
2648 /// corresponding byte of the 256-bit integer vector result. For each byte,
2649 /// computes <c> result = __a - __b </c>.
2651 /// \code{.operation}
2652 /// FOR i := 0 TO 31
2653 /// j := i*8
2654 /// result[j+7:j] := SATURATE8U(__a[j+7:j] - __b[j+7:j])
2655 /// ENDFOR
2656 /// \endcode
2658 /// \headerfile <immintrin.h>
2660 /// This intrinsic corresponds to the \c VPSUBUSB instruction.
2662 /// \param __a
2663 /// A 256-bit integer vector containing the minuends.
2664 /// \param __b
2665 /// A 256-bit integer vector containing the subtrahends.
2666 /// \returns A 256-bit integer vector containing the differences.
2667 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2668 _mm256_subs_epu8(__m256i __a, __m256i __b)
2670 return (__m256i)__builtin_elementwise_sub_sat((__v32qu)__a, (__v32qu)__b);
2673 /// Subtracts 16-bit integers from corresponding elements of two 256-bit
2674 /// vectors of [16 x i16] using unsigned saturation, and returns each
2675 /// difference in the corresponding element of the [16 x i16] result.
2677 /// \code{.operation}
2678 /// FOR i := 0 TO 15
2679 /// j := i*16
2680 /// result[j+15:j] := SATURATE16U(__a[j+15:j] - __b[j+15:j])
2681 /// ENDFOR
2682 /// \endcode
2684 /// \headerfile <immintrin.h>
2686 /// This intrinsic corresponds to the \c VPSUBUSW instruction.
2688 /// \param __a
2689 /// A 256-bit vector of [16 x i16] containing the minuends.
2690 /// \param __b
2691 /// A 256-bit vector of [16 x i16] containing the subtrahends.
2692 /// \returns A 256-bit vector of [16 x i16] containing the differences.
2693 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2694 _mm256_subs_epu16(__m256i __a, __m256i __b)
2696 return (__m256i)__builtin_elementwise_sub_sat((__v16hu)__a, (__v16hu)__b);
2699 /// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2700 /// vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2701 /// uses the upper 64 bits of each 128-bit half of \a __a and \a __b as
2702 /// input; other bits in these parameters are ignored.
2704 /// \code{.operation}
2705 /// result[7:0] := __a[71:64]
2706 /// result[15:8] := __b[71:64]
2707 /// result[23:16] := __a[79:72]
2708 /// result[31:24] := __b[79:72]
2709 /// . . .
2710 /// result[127:120] := __b[127:120]
2711 /// result[135:128] := __a[199:192]
2712 /// . . .
2713 /// result[255:248] := __b[255:248]
2714 /// \endcode
2716 /// \headerfile <immintrin.h>
2718 /// This intrinsic corresponds to the \c VPUNPCKHBW instruction.
2720 /// \param __a
2721 /// A 256-bit integer vector used as the source for the even-numbered bytes
2722 /// of the result.
2723 /// \param __b
2724 /// A 256-bit integer vector used as the source for the odd-numbered bytes
2725 /// of the result.
2726 /// \returns A 256-bit integer vector containing the result.
2727 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2728 _mm256_unpackhi_epi8(__m256i __a, __m256i __b)
2730 return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
2733 /// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2734 /// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2735 /// vector of [16 x i16]. Specifically, uses the upper 64 bits of each
2736 /// 128-bit half of \a __a and \a __b as input; other bits in these
2737 /// parameters are ignored.
2739 /// \code{.operation}
2740 /// result[15:0] := __a[79:64]
2741 /// result[31:16] := __b[79:64]
2742 /// result[47:32] := __a[95:80]
2743 /// result[63:48] := __b[95:80]
2744 /// . . .
2745 /// result[127:112] := __b[127:112]
2746 /// result[143:128] := __a[207:192]
2747 /// . . .
2748 /// result[255:240] := __b[255:240]
2749 /// \endcode
2751 /// \headerfile <immintrin.h>
2753 /// This intrinsic corresponds to the \c VPUNPCKHWD instruction.
2755 /// \param __a
2756 /// A 256-bit vector of [16 x i16] used as the source for the even-numbered
2757 /// elements of the result.
2758 /// \param __b
2759 /// A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2760 /// elements of the result.
2761 /// \returns A 256-bit vector of [16 x i16] containing the result.
2762 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2763 _mm256_unpackhi_epi16(__m256i __a, __m256i __b)
2765 return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
2768 /// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2769 /// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2770 /// of [8 x i32]. Specifically, uses the upper 64 bits of each 128-bit half
2771 /// of \a __a and \a __b as input; other bits in these parameters are
2772 /// ignored.
2774 /// \code{.operation}
2775 /// result[31:0] := __a[95:64]
2776 /// result[63:32] := __b[95:64]
2777 /// result[95:64] := __a[127:96]
2778 /// result[127:96] := __b[127:96]
2779 /// result[159:128] := __a[223:192]
2780 /// result[191:160] := __b[223:192]
2781 /// result[223:192] := __a[255:224]
2782 /// result[255:224] := __b[255:224]
2783 /// \endcode
2785 /// \headerfile <immintrin.h>
2787 /// This intrinsic corresponds to the \c VPUNPCKHDQ instruction.
2789 /// \param __a
2790 /// A 256-bit vector of [8 x i32] used as the source for the even-numbered
2791 /// elements of the result.
2792 /// \param __b
2793 /// A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2794 /// elements of the result.
2795 /// \returns A 256-bit vector of [8 x i32] containing the result.
2796 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2797 _mm256_unpackhi_epi32(__m256i __a, __m256i __b)
2799 return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);
2802 /// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2803 /// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2804 /// of [4 x i64]. Specifically, uses the upper 64 bits of each 128-bit half
2805 /// of \a __a and \a __b as input; other bits in these parameters are
2806 /// ignored.
2808 /// \code{.operation}
2809 /// result[63:0] := __a[127:64]
2810 /// result[127:64] := __b[127:64]
2811 /// result[191:128] := __a[255:192]
2812 /// result[255:192] := __b[255:192]
2813 /// \endcode
2815 /// \headerfile <immintrin.h>
2817 /// This intrinsic corresponds to the \c VPUNPCKHQDQ instruction.
2819 /// \param __a
2820 /// A 256-bit vector of [4 x i64] used as the source for the even-numbered
2821 /// elements of the result.
2822 /// \param __b
2823 /// A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2824 /// elements of the result.
2825 /// \returns A 256-bit vector of [4 x i64] containing the result.
2826 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2827 _mm256_unpackhi_epi64(__m256i __a, __m256i __b)
2829 return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3);
2832 /// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2833 /// vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2834 /// uses the lower 64 bits of each 128-bit half of \a __a and \a __b as
2835 /// input; other bits in these parameters are ignored.
2837 /// \code{.operation}
2838 /// result[7:0] := __a[7:0]
2839 /// result[15:8] := __b[7:0]
2840 /// result[23:16] := __a[15:8]
2841 /// result[31:24] := __b[15:8]
2842 /// . . .
2843 /// result[127:120] := __b[63:56]
2844 /// result[135:128] := __a[135:128]
2845 /// . . .
2846 /// result[255:248] := __b[191:184]
2847 /// \endcode
2849 /// \headerfile <immintrin.h>
2851 /// This intrinsic corresponds to the \c VPUNPCKLBW instruction.
2853 /// \param __a
2854 /// A 256-bit integer vector used as the source for the even-numbered bytes
2855 /// of the result.
2856 /// \param __b
2857 /// A 256-bit integer vector used as the source for the odd-numbered bytes
2858 /// of the result.
2859 /// \returns A 256-bit integer vector containing the result.
2860 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2861 _mm256_unpacklo_epi8(__m256i __a, __m256i __b)
2863 return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
2866 /// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2867 /// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2868 /// vector of [16 x i16]. Specifically, uses the lower 64 bits of each
2869 /// 128-bit half of \a __a and \a __b as input; other bits in these
2870 /// parameters are ignored.
2872 /// \code{.operation}
2873 /// result[15:0] := __a[15:0]
2874 /// result[31:16] := __b[15:0]
2875 /// result[47:32] := __a[31:16]
2876 /// result[63:48] := __b[31:16]
2877 /// . . .
2878 /// result[127:112] := __b[63:48]
2879 /// result[143:128] := __a[143:128]
2880 /// . . .
2881 /// result[255:240] := __b[191:176]
2882 /// \endcode
2884 /// \headerfile <immintrin.h>
2886 /// This intrinsic corresponds to the \c VPUNPCKLWD instruction.
2888 /// \param __a
2889 /// A 256-bit vector of [16 x i16] used as the source for the even-numbered
2890 /// elements of the result.
2891 /// \param __b
2892 /// A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2893 /// elements of the result.
2894 /// \returns A 256-bit vector of [16 x i16] containing the result.
2895 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2896 _mm256_unpacklo_epi16(__m256i __a, __m256i __b)
2898 return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
2901 /// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2902 /// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2903 /// of [8 x i32]. Specifically, uses the lower 64 bits of each 128-bit half
2904 /// of \a __a and \a __b as input; other bits in these parameters are
2905 /// ignored.
2907 /// \code{.operation}
2908 /// result[31:0] := __a[31:0]
2909 /// result[63:32] := __b[31:0]
2910 /// result[95:64] := __a[63:32]
2911 /// result[127:96] := __b[63:32]
2912 /// result[159:128] := __a[159:128]
2913 /// result[191:160] := __b[159:128]
2914 /// result[223:192] := __a[191:160]
2915 /// result[255:224] := __b[191:160]
2916 /// \endcode
2918 /// \headerfile <immintrin.h>
2920 /// This intrinsic corresponds to the \c VPUNPCKLDQ instruction.
2922 /// \param __a
2923 /// A 256-bit vector of [8 x i32] used as the source for the even-numbered
2924 /// elements of the result.
2925 /// \param __b
2926 /// A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2927 /// elements of the result.
2928 /// \returns A 256-bit vector of [8 x i32] containing the result.
2929 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2930 _mm256_unpacklo_epi32(__m256i __a, __m256i __b)
2932 return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
2935 /// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2936 /// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2937 /// of [4 x i64]. Specifically, uses the lower 64 bits of each 128-bit half
2938 /// of \a __a and \a __b as input; other bits in these parameters are
2939 /// ignored.
2941 /// \code{.operation}
2942 /// result[63:0] := __a[63:0]
2943 /// result[127:64] := __b[63:0]
2944 /// result[191:128] := __a[191:128]
2945 /// result[255:192] := __b[191:128]
2946 /// \endcode
2948 /// \headerfile <immintrin.h>
2950 /// This intrinsic corresponds to the \c VPUNPCKLQDQ instruction.
2952 /// \param __a
2953 /// A 256-bit vector of [4 x i64] used as the source for the even-numbered
2954 /// elements of the result.
2955 /// \param __b
2956 /// A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2957 /// elements of the result.
2958 /// \returns A 256-bit vector of [4 x i64] containing the result.
2959 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2960 _mm256_unpacklo_epi64(__m256i __a, __m256i __b)
2962 return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2);
2965 /// Computes the bitwise XOR of the 256-bit integer vectors in \a __a and
2966 /// \a __b.
2968 /// \headerfile <immintrin.h>
2970 /// This intrinsic corresponds to the \c VPXOR instruction.
2972 /// \param __a
2973 /// A 256-bit integer vector.
2974 /// \param __b
2975 /// A 256-bit integer vector.
2976 /// \returns A 256-bit integer vector containing the result.
2977 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2978 _mm256_xor_si256(__m256i __a, __m256i __b)
2980 return (__m256i)((__v4du)__a ^ (__v4du)__b);
2983 /// Loads the 256-bit integer vector from memory \a __V using a non-temporal
2984 /// memory hint and returns the vector. \a __V must be aligned on a 32-byte
2985 /// boundary.
2987 /// \headerfile <immintrin.h>
2989 /// This intrinsic corresponds to the \c VMOVNTDQA instruction.
2991 /// \param __V
2992 /// A pointer to the 32-byte aligned memory containing the vector to load.
2993 /// \returns A 256-bit integer vector loaded from memory.
2994 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2995 _mm256_stream_load_si256(const void *__V)
2997 typedef __v4di __v4di_aligned __attribute__((aligned(32)));
2998 return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);
3001 /// Broadcasts the 32-bit floating-point value from the low element of the
3002 /// 128-bit vector of [4 x float] in \a __X to all elements of the result's
3003 /// 128-bit vector of [4 x float].
3005 /// \headerfile <immintrin.h>
3007 /// This intrinsic corresponds to the \c VBROADCASTSS instruction.
3009 /// \param __X
3010 /// A 128-bit vector of [4 x float] whose low element will be broadcast.
3011 /// \returns A 128-bit vector of [4 x float] containing the result.
3012 static __inline__ __m128 __DEFAULT_FN_ATTRS128
3013 _mm_broadcastss_ps(__m128 __X)
3015 return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0);
3018 /// Broadcasts the 64-bit floating-point value from the low element of the
3019 /// 128-bit vector of [2 x double] in \a __a to both elements of the
3020 /// result's 128-bit vector of [2 x double].
3022 /// \headerfile <immintrin.h>
3024 /// This intrinsic corresponds to the \c MOVDDUP instruction.
3026 /// \param __a
3027 /// A 128-bit vector of [2 x double] whose low element will be broadcast.
3028 /// \returns A 128-bit vector of [2 x double] containing the result.
3029 static __inline__ __m128d __DEFAULT_FN_ATTRS128
3030 _mm_broadcastsd_pd(__m128d __a)
3032 return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
3035 /// Broadcasts the 32-bit floating-point value from the low element of the
3036 /// 128-bit vector of [4 x float] in \a __X to all elements of the
3037 /// result's 256-bit vector of [8 x float].
3039 /// \headerfile <immintrin.h>
3041 /// This intrinsic corresponds to the \c VBROADCASTSS instruction.
3043 /// \param __X
3044 /// A 128-bit vector of [4 x float] whose low element will be broadcast.
3045 /// \returns A 256-bit vector of [8 x float] containing the result.
3046 static __inline__ __m256 __DEFAULT_FN_ATTRS256
3047 _mm256_broadcastss_ps(__m128 __X)
3049 return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3052 /// Broadcasts the 64-bit floating-point value from the low element of the
3053 /// 128-bit vector of [2 x double] in \a __X to all elements of the
3054 /// result's 256-bit vector of [4 x double].
3056 /// \headerfile <immintrin.h>
3058 /// This intrinsic corresponds to the \c VBROADCASTSD instruction.
3060 /// \param __X
3061 /// A 128-bit vector of [2 x double] whose low element will be broadcast.
3062 /// \returns A 256-bit vector of [4 x double] containing the result.
3063 static __inline__ __m256d __DEFAULT_FN_ATTRS256
3064 _mm256_broadcastsd_pd(__m128d __X)
3066 return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0);
3069 /// Broadcasts the 128-bit integer data from \a __X to both the lower and
3070 /// upper halves of the 256-bit result.
3072 /// \headerfile <immintrin.h>
3074 /// This intrinsic corresponds to the \c VBROADCASTI128 instruction.
3076 /// \param __X
3077 /// A 128-bit integer vector to be broadcast.
3078 /// \returns A 256-bit integer vector containing the result.
3079 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3080 _mm256_broadcastsi128_si256(__m128i __X)
3082 return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1);
/// Alias that expands to _mm256_broadcastsi128_si256 with the same argument.
3085 #define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X)
3087 /// Merges 32-bit integer elements from either of the two 128-bit vectors of
3088 /// [4 x i32] in \a V1 or \a V2 to the result's 128-bit vector of [4 x i32],
3089 /// as specified by the immediate integer operand \a M.
3091 /// \code{.operation}
3092 /// FOR i := 0 TO 3
3093 /// j := i*32
3094 /// IF M[i] == 0
3095 /// result[31+j:j] := V1[31+j:j]
3096 /// ELSE
3097 /// result[31+j:j] := V2[31+j:j]
3098 /// FI
3099 /// ENDFOR
3100 /// \endcode
3102 /// \headerfile <immintrin.h>
3104 /// \code
3105 /// __m128i _mm_blend_epi32(__m128i V1, __m128i V2, const int M);
3106 /// \endcode
3108 /// This intrinsic corresponds to the \c VPBLENDD instruction.
3110 /// \param V1
3111 /// A 128-bit vector of [4 x i32] containing source values.
3112 /// \param V2
3113 /// A 128-bit vector of [4 x i32] containing source values.
3114 /// \param M
3115 /// An immediate 8-bit integer operand, with bits [3:0] specifying the
3116 /// source for each element of the result. The position of the mask bit
3117 /// corresponds to the index of a copied value. When a mask bit is 0, the
3118 /// element is copied from \a V1; otherwise, it is copied from \a V2.
3119 /// \returns A 128-bit vector of [4 x i32] containing the result.
3120 #define _mm_blend_epi32(V1, V2, M) \
3121 ((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
3122 (__v4si)(__m128i)(V2), (int)(M)))
3124 /// Merges 32-bit integer elements from either of the two 256-bit vectors of
3125 /// [8 x i32] in \a V1 or \a V2 to return a 256-bit vector of [8 x i32],
3126 /// as specified by the immediate integer operand \a M.
3128 /// \code{.operation}
3129 /// FOR i := 0 TO 7
3130 /// j := i*32
3131 /// IF M[i] == 0
3132 /// result[31+j:j] := V1[31+j:j]
3133 /// ELSE
3134 /// result[31+j:j] := V2[31+j:j]
3135 /// FI
3136 /// ENDFOR
3137 /// \endcode
3139 /// \headerfile <immintrin.h>
3141 /// \code
3142 /// __m256i _mm256_blend_epi32(__m256i V1, __m256i V2, const int M);
3143 /// \endcode
3145 /// This intrinsic corresponds to the \c VPBLENDD instruction.
3147 /// \param V1
3148 /// A 256-bit vector of [8 x i32] containing source values.
3149 /// \param V2
3150 /// A 256-bit vector of [8 x i32] containing source values.
3151 /// \param M
3152 /// An immediate 8-bit integer operand, with bits [7:0] specifying the
3153 /// source for each element of the result. The position of the mask bit
3154 /// corresponds to the index of a copied value. When a mask bit is 0, the
3155 /// element is copied from \a V1; otherwise, it is copied from \a V2.
3156 /// \returns A 256-bit vector of [8 x i32] containing the result.
3157 #define _mm256_blend_epi32(V1, V2, M) \
3158 ((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
3159 (__v8si)(__m256i)(V2), (int)(M)))
3161 /// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
3162 /// bytes of the 256-bit result.
3164 /// \headerfile <immintrin.h>
3166 /// This intrinsic corresponds to the \c VPBROADCASTB instruction.
3168 /// \param __X
3169 /// A 128-bit integer vector whose low byte will be broadcast.
3170 /// \returns A 256-bit integer vector containing the result.
3171 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3172 _mm256_broadcastb_epi8(__m128i __X)
3174 return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3177 /// Broadcasts the low element from the 128-bit vector of [8 x i16] in \a __X
3178 /// to all elements of the result's 256-bit vector of [16 x i16].
3180 /// \headerfile <immintrin.h>
3182 /// This intrinsic corresponds to the \c VPBROADCASTW instruction.
3184 /// \param __X
3185 /// A 128-bit vector of [8 x i16] whose low element will be broadcast.
3186 /// \returns A 256-bit vector of [16 x i16] containing the result.
3187 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3188 _mm256_broadcastw_epi16(__m128i __X)
3190 return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3193 /// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
3194 /// to all elements of the result's 256-bit vector of [8 x i32].
3196 /// \headerfile <immintrin.h>
3198 /// This intrinsic corresponds to the \c VPBROADCASTD instruction.
3200 /// \param __X
3201 /// A 128-bit vector of [4 x i32] whose low element will be broadcast.
3202 /// \returns A 256-bit vector of [8 x i32] containing the result.
3203 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3204 _mm256_broadcastd_epi32(__m128i __X)
3206 return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3209 /// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
3210 /// to all elements of the result's 256-bit vector of [4 x i64].
3212 /// \headerfile <immintrin.h>
3214 /// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
3216 /// \param __X
3217 /// A 128-bit vector of [2 x i64] whose low element will be broadcast.
3218 /// \returns A 256-bit vector of [4 x i64] containing the result.
3219 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3220 _mm256_broadcastq_epi64(__m128i __X)
3222 return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0);
3225 /// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
3226 /// bytes of the 128-bit result.
3228 /// \headerfile <immintrin.h>
3230 /// This intrinsic corresponds to the \c VPBROADCASTB instruction.
3232 /// \param __X
3233 /// A 128-bit integer vector whose low byte will be broadcast.
3234 /// \returns A 128-bit integer vector containing the result.
3235 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3236 _mm_broadcastb_epi8(__m128i __X)
3238 return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3241 /// Broadcasts the low element from the 128-bit vector of [8 x i16] in
3242 /// \a __X to all elements of the result's 128-bit vector of [8 x i16].
3244 /// \headerfile <immintrin.h>
3246 /// This intrinsic corresponds to the \c VPBROADCASTW instruction.
3248 /// \param __X
3249 /// A 128-bit vector of [8 x i16] whose low element will be broadcast.
3250 /// \returns A 128-bit vector of [8 x i16] containing the result.
3251 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3252 _mm_broadcastw_epi16(__m128i __X)
3254 return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3257 /// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
3258 /// to all elements of the result's 128-bit vector of [4 x i32].
3260 /// \headerfile <immintrin.h>
3262 /// This intrinsic corresponds to the \c VPBROADCASTD instruction.
3264 /// \param __X
3265 /// A 128-bit vector of [4 x i32] whose low element will be broadcast.
3266 /// \returns A 128-bit vector of [4 x i32] containing the result.
3267 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3268 _mm_broadcastd_epi32(__m128i __X)
3270 return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0);
3273 /// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
3274 /// to both elements of the result's 128-bit vector of [2 x i64].
3276 /// \headerfile <immintrin.h>
3278 /// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
3280 /// \param __X
3281 /// A 128-bit vector of [2 x i64] whose low element will be broadcast.
3282 /// \returns A 128-bit vector of [2 x i64] containing the result.
3283 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3284 _mm_broadcastq_epi64(__m128i __X)
3286 return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0);
3289 /// Sets the result's 256-bit vector of [8 x i32] to copies of elements of the
3290 /// 256-bit vector of [8 x i32] in \a __a as specified by indexes in the
3291 /// elements of the 256-bit vector of [8 x i32] in \a __b.
3293 /// \code{.operation}
3294 /// FOR i := 0 TO 7
3295 /// j := i*32
3296 /// k := __b[j+2:j] * 32
3297 /// result[j+31:j] := __a[k+31:k]
3298 /// ENDFOR
3299 /// \endcode
3301 /// \headerfile <immintrin.h>
3303 /// This intrinsic corresponds to the \c VPERMD instruction.
3305 /// \param __a
3306 /// A 256-bit vector of [8 x i32] containing the source values.
3307 /// \param __b
3308 /// A 256-bit vector of [8 x i32] containing indexes of values to use from
3309 /// \a __a.
3310 /// \returns A 256-bit vector of [8 x i32] containing the result.
3311 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3312 _mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
3314 return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b);
3317 /// Sets the result's 256-bit vector of [4 x double] to copies of elements of
3318 /// the 256-bit vector of [4 x double] in \a V as specified by the
3319 /// immediate value \a M.
3321 /// \code{.operation}
3322 /// FOR i := 0 TO 3
3323 /// j := i*64
3324 /// k := (M >> i*2)[1:0] * 64
3325 /// result[j+63:j] := V[k+63:k]
3326 /// ENDFOR
3327 /// \endcode
3329 /// \headerfile <immintrin.h>
3331 /// \code
3332 /// __m256d _mm256_permute4x64_pd(__m256d V, const int M);
3333 /// \endcode
3335 /// This intrinsic corresponds to the \c VPERMPD instruction.
3337 /// \param V
3338 /// A 256-bit vector of [4 x double] containing the source values.
3339 /// \param M
3340 /// An immediate 8-bit value specifying which elements to copy from \a V.
3341 /// \a M[1:0] specifies the index in \a V for element 0 of the result,
3342 /// \a M[3:2] specifies the index for element 1, and so forth.
3343 /// \returns A 256-bit vector of [4 x double] containing the result.
3344 #define _mm256_permute4x64_pd(V, M) \
3345 ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M)))
3347 /// Sets the result's 256-bit vector of [8 x float] to copies of elements of
3348 /// the 256-bit vector of [8 x float] in \a __a as specified by indexes in
3349 /// the elements of the 256-bit vector of [8 x i32] in \a __b.
3351 /// \code{.operation}
3352 /// FOR i := 0 TO 7
3353 /// j := i*32
3354 /// k := __b[j+2:j] * 32
3355 /// result[j+31:j] := __a[k+31:k]
3356 /// ENDFOR
3357 /// \endcode
3359 /// \headerfile <immintrin.h>
3361 /// This intrinsic corresponds to the \c VPERMPS instruction.
3363 /// \param __a
3364 /// A 256-bit vector of [8 x float] containing the source values.
3365 /// \param __b
3366 /// A 256-bit vector of [8 x i32] containing indexes of values to use from
3367 /// \a __a.
3368 /// \returns A 256-bit vector of [8 x float] containing the result.
3369 static __inline__ __m256 __DEFAULT_FN_ATTRS256
3370 _mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
3372 return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b);
3375 /// Sets the result's 256-bit vector of [4 x i64] to copies of elements
3376 /// of the 256-bit vector of [4 x i64] in \a V as specified by the
3377 /// immediate value \a M.
3379 /// \code{.operation}
3380 /// FOR i := 0 TO 3
3381 /// j := i*64
3382 /// k := (M >> i*2)[1:0] * 64
3383 /// result[j+63:j] := V[k+63:k]
3384 /// ENDFOR
3385 /// \endcode
3387 /// \headerfile <immintrin.h>
3389 /// \code
3390 /// __m256i _mm256_permute4x64_epi64(__m256i V, const int M);
3391 /// \endcode
3393 /// This intrinsic corresponds to the \c VPERMQ instruction.
3395 /// \param V
3396 /// A 256-bit vector of [4 x i64] containing the source values.
3397 /// \param M
3398 /// An immediate 8-bit value specifying which elements to copy from \a V.
3399 /// \a M[1:0] specifies the index in \a V for element 0 of the result,
3400 /// \a M[3:2] specifies the index for element 1, and so forth.
3401 /// \returns A 256-bit vector of [4 x i64] containing the result.
3402 #define _mm256_permute4x64_epi64(V, M) \
3403 ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M)))
3405 /// Sets each half of the 256-bit result either to zero or to one of the
3406 /// four possible 128-bit halves of the 256-bit vectors \a V1 and \a V2,
3407 /// as specified by the immediate value \a M.
3409 /// \code{.operation}
3410 /// FOR i := 0 TO 1
3411 /// j := i*128
3412 /// k := M >> (i*4)
3413 /// IF k[3] == 0
3414 /// CASE (k[1:0]) OF
3415 /// 0: result[127+j:j] := V1[127:0]
3416 /// 1: result[127+j:j] := V1[255:128]
3417 /// 2: result[127+j:j] := V2[127:0]
3418 /// 3: result[127+j:j] := V2[255:128]
3419 /// ESAC
3420 /// ELSE
3421 /// result[127+j:j] := 0
3422 /// FI
3423 /// ENDFOR
3424 /// \endcode
3426 /// \headerfile <immintrin.h>
3428 /// \code
3429 /// __m256i _mm256_permute2x128_si256(__m256i V1, __m256i V2, const int M);
3430 /// \endcode
3432 /// This intrinsic corresponds to the \c VPERM2I128 instruction.
3434 /// \param V1
3435 /// A 256-bit integer vector containing source values.
3436 /// \param V2
3437 /// A 256-bit integer vector containing source values.
3438 /// \param M
3439 /// An immediate value specifying how to form the result. Bits [3:0]
3440 /// control the lower half of the result, bits [7:4] control the upper half.
3441 /// Within each 4-bit control value, if bit 3 is 1, the result is zero,
3442 /// otherwise bits [1:0] determine the source as follows (bit 2 is unused). \n
3443 /// 0: the lower half of \a V1 \n
3444 /// 1: the upper half of \a V1 \n
3445 /// 2: the lower half of \a V2 \n
3446 /// 3: the upper half of \a V2
3447 /// \returns A 256-bit integer vector containing the result.
3448 #define _mm256_permute2x128_si256(V1, V2, M) \
3449 ((__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M)))
/// Extracts half of the 256-bit vector \a V to the 128-bit result. If bit 0
///    of the immediate \a M is zero, extracts the lower half of \a V;
///    otherwise, extracts the upper half.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm256_extracti128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VEXTRACTI128 instruction.
///
/// \param V
///    A 256-bit integer vector containing the source values.
/// \param M
///    An immediate value specifying which half of \a V to extract.
/// \returns A 128-bit integer vector containing the result.
#define _mm256_extracti128_si256(V, M) \
  ((__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M)))
/// Copies the 256-bit vector \a V1 to the result, then overwrites half of the
///    result with the 128-bit vector \a V2. If bit 0 of the immediate \a M
///    is zero, overwrites the lower half of the result; otherwise,
///    overwrites the upper half.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_inserti128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VINSERTI128 instruction.
///
/// \param V1
///    A 256-bit integer vector containing a source value.
/// \param V2
///    A 128-bit integer vector containing a source value.
/// \param M
///    An immediate value specifying where to put \a V2 in the result.
/// \returns A 256-bit integer vector containing the result.
#define _mm256_inserti128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \
                                         (__v2di)(__m128i)(V2), (int)(M)))
3495 /// Conditionally loads eight 32-bit integer elements from memory \a __X, if
3496 /// the most significant bit of the corresponding element in the mask
3497 /// \a __M is set; otherwise, sets that element of the result to zero.
3498 /// Returns the 256-bit [8 x i32] result.
3500 /// \code{.operation}
3501 /// FOR i := 0 TO 7
3502 /// j := i*32
3503 /// IF __M[j+31] == 1
3504 /// result[j+31:j] := Load32(__X+(i*4))
3505 /// ELSE
3506 /// result[j+31:j] := 0
3507 /// FI
3508 /// ENDFOR
3509 /// \endcode
3511 /// \headerfile <immintrin.h>
3513 /// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3515 /// \param __X
3516 /// A pointer to the memory used for loading values.
3517 /// \param __M
3518 /// A 256-bit vector of [8 x i32] containing the mask bits.
3519 /// \returns A 256-bit vector of [8 x i32] containing the loaded or zeroed
3520 /// elements.
3521 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3522 _mm256_maskload_epi32(int const *__X, __m256i __M)
3524 return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M);
3527 /// Conditionally loads four 64-bit integer elements from memory \a __X, if
3528 /// the most significant bit of the corresponding element in the mask
3529 /// \a __M is set; otherwise, sets that element of the result to zero.
3530 /// Returns the 256-bit [4 x i64] result.
3532 /// \code{.operation}
3533 /// FOR i := 0 TO 3
3534 /// j := i*64
3535 /// IF __M[j+63] == 1
3536 /// result[j+63:j] := Load64(__X+(i*8))
3537 /// ELSE
3538 /// result[j+63:j] := 0
3539 /// FI
3540 /// ENDFOR
3541 /// \endcode
3543 /// \headerfile <immintrin.h>
3545 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3547 /// \param __X
3548 /// A pointer to the memory used for loading values.
3549 /// \param __M
3550 /// A 256-bit vector of [4 x i64] containing the mask bits.
3551 /// \returns A 256-bit vector of [4 x i64] containing the loaded or zeroed
3552 /// elements.
3553 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3554 _mm256_maskload_epi64(long long const *__X, __m256i __M)
3556 return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M);
3559 /// Conditionally loads four 32-bit integer elements from memory \a __X, if
3560 /// the most significant bit of the corresponding element in the mask
3561 /// \a __M is set; otherwise, sets that element of the result to zero.
3562 /// Returns the 128-bit [4 x i32] result.
3564 /// \code{.operation}
3565 /// FOR i := 0 TO 3
3566 /// j := i*32
3567 /// IF __M[j+31] == 1
3568 /// result[j+31:j] := Load32(__X+(i*4))
3569 /// ELSE
3570 /// result[j+31:j] := 0
3571 /// FI
3572 /// ENDFOR
3573 /// \endcode
3575 /// \headerfile <immintrin.h>
3577 /// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3579 /// \param __X
3580 /// A pointer to the memory used for loading values.
3581 /// \param __M
3582 /// A 128-bit vector of [4 x i32] containing the mask bits.
3583 /// \returns A 128-bit vector of [4 x i32] containing the loaded or zeroed
3584 /// elements.
3585 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3586 _mm_maskload_epi32(int const *__X, __m128i __M)
3588 return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M);
3591 /// Conditionally loads two 64-bit integer elements from memory \a __X, if
3592 /// the most significant bit of the corresponding element in the mask
3593 /// \a __M is set; otherwise, sets that element of the result to zero.
3594 /// Returns the 128-bit [2 x i64] result.
3596 /// \code{.operation}
3597 /// FOR i := 0 TO 1
3598 /// j := i*64
3599 /// IF __M[j+63] == 1
3600 /// result[j+63:j] := Load64(__X+(i*8))
3601 /// ELSE
3602 /// result[j+63:j] := 0
3603 /// FI
3604 /// ENDFOR
3605 /// \endcode
3607 /// \headerfile <immintrin.h>
3609 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3611 /// \param __X
3612 /// A pointer to the memory used for loading values.
3613 /// \param __M
3614 /// A 128-bit vector of [2 x i64] containing the mask bits.
3615 /// \returns A 128-bit vector of [2 x i64] containing the loaded or zeroed
3616 /// elements.
3617 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3618 _mm_maskload_epi64(long long const *__X, __m128i __M)
3620 return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M);
3623 /// Conditionally stores eight 32-bit integer elements from the 256-bit vector
3624 /// of [8 x i32] in \a __Y to memory \a __X, if the most significant bit of
3625 /// the corresponding element in the mask \a __M is set; otherwise, the
3626 /// memory element is unchanged.
3628 /// \code{.operation}
3629 /// FOR i := 0 TO 7
3630 /// j := i*32
3631 /// IF __M[j+31] == 1
3632 /// Store32(__X+(i*4), __Y[j+31:j])
3633 /// FI
3634 /// ENDFOR
3635 /// \endcode
3637 /// \headerfile <immintrin.h>
3639 /// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3641 /// \param __X
3642 /// A pointer to the memory used for storing values.
3643 /// \param __M
3644 /// A 256-bit vector of [8 x i32] containing the mask bits.
3645 /// \param __Y
3646 /// A 256-bit vector of [8 x i32] containing the values to store.
3647 static __inline__ void __DEFAULT_FN_ATTRS256
3648 _mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
3650 __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
3653 /// Conditionally stores four 64-bit integer elements from the 256-bit vector
3654 /// of [4 x i64] in \a __Y to memory \a __X, if the most significant bit of
3655 /// the corresponding element in the mask \a __M is set; otherwise, the
3656 /// memory element is unchanged.
3658 /// \code{.operation}
3659 /// FOR i := 0 TO 3
3660 /// j := i*64
3661 /// IF __M[j+63] == 1
3662 /// Store64(__X+(i*8), __Y[j+63:j])
3663 /// FI
3664 /// ENDFOR
3665 /// \endcode
3667 /// \headerfile <immintrin.h>
3669 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3671 /// \param __X
3672 /// A pointer to the memory used for storing values.
3673 /// \param __M
3674 /// A 256-bit vector of [4 x i64] containing the mask bits.
3675 /// \param __Y
3676 /// A 256-bit vector of [4 x i64] containing the values to store.
3677 static __inline__ void __DEFAULT_FN_ATTRS256
3678 _mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
3680 __builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
3683 /// Conditionally stores four 32-bit integer elements from the 128-bit vector
3684 /// of [4 x i32] in \a __Y to memory \a __X, if the most significant bit of
3685 /// the corresponding element in the mask \a __M is set; otherwise, the
3686 /// memory element is unchanged.
3688 /// \code{.operation}
3689 /// FOR i := 0 TO 3
3690 /// j := i*32
3691 /// IF __M[j+31] == 1
3692 /// Store32(__X+(i*4), __Y[j+31:j])
3693 /// FI
3694 /// ENDFOR
3695 /// \endcode
3697 /// \headerfile <immintrin.h>
3699 /// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3701 /// \param __X
3702 /// A pointer to the memory used for storing values.
3703 /// \param __M
3704 /// A 128-bit vector of [4 x i32] containing the mask bits.
3705 /// \param __Y
3706 /// A 128-bit vector of [4 x i32] containing the values to store.
3707 static __inline__ void __DEFAULT_FN_ATTRS128
3708 _mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
3710 __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
3713 /// Conditionally stores two 64-bit integer elements from the 128-bit vector
3714 /// of [2 x i64] in \a __Y to memory \a __X, if the most significant bit of
3715 /// the corresponding element in the mask \a __M is set; otherwise, the
3716 /// memory element is unchanged.
3718 /// \code{.operation}
3719 /// FOR i := 0 TO 1
3720 /// j := i*64
3721 /// IF __M[j+63] == 1
3722 /// Store64(__X+(i*8), __Y[j+63:j])
3723 /// FI
3724 /// ENDFOR
3725 /// \endcode
3727 /// \headerfile <immintrin.h>
3729 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3731 /// \param __X
3732 /// A pointer to the memory used for storing values.
3733 /// \param __M
3734 /// A 128-bit vector of [2 x i64] containing the mask bits.
3735 /// \param __Y
3736 /// A 128-bit vector of [2 x i64] containing the values to store.
3737 static __inline__ void __DEFAULT_FN_ATTRS128
3738 _mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
3740 __builtin_ia32_maskstoreq(( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
3743 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3744 /// left by the number of bits given in the corresponding element of the
3745 /// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
3746 /// returns the result. If the shift count for any element is greater than
3747 /// 31, the result for that element is zero.
3749 /// \headerfile <immintrin.h>
3751 /// This intrinsic corresponds to the \c VPSLLVD instruction.
3753 /// \param __X
3754 /// A 256-bit vector of [8 x i32] to be shifted.
3755 /// \param __Y
3756 /// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3757 /// bits).
3758 /// \returns A 256-bit vector of [8 x i32] containing the result.
3759 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3760 _mm256_sllv_epi32(__m256i __X, __m256i __Y)
3762 return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
3765 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3766 /// left by the number of bits given in the corresponding element of the
3767 /// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
3768 /// returns the result. If the shift count for any element is greater than
3769 /// 31, the result for that element is zero.
3771 /// \headerfile <immintrin.h>
3773 /// This intrinsic corresponds to the \c VPSLLVD instruction.
3775 /// \param __X
3776 /// A 128-bit vector of [4 x i32] to be shifted.
3777 /// \param __Y
3778 /// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3779 /// bits).
3780 /// \returns A 128-bit vector of [4 x i32] containing the result.
3781 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3782 _mm_sllv_epi32(__m128i __X, __m128i __Y)
3784 return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
3787 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
3788 /// left by the number of bits given in the corresponding element of the
3789 /// 128-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
3790 /// returns the result. If the shift count for any element is greater than
3791 /// 63, the result for that element is zero.
3793 /// \headerfile <immintrin.h>
3795 /// This intrinsic corresponds to the \c VPSLLVQ instruction.
3797 /// \param __X
3798 /// A 256-bit vector of [4 x i64] to be shifted.
3799 /// \param __Y
3800 /// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
3801 /// bits).
3802 /// \returns A 256-bit vector of [4 x i64] containing the result.
3803 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3804 _mm256_sllv_epi64(__m256i __X, __m256i __Y)
3806 return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y);
3809 /// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3810 /// left by the number of bits given in the corresponding element of the
3811 /// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3812 /// returns the result. If the shift count for any element is greater than
3813 /// 63, the result for that element is zero.
3815 /// \headerfile <immintrin.h>
3817 /// This intrinsic corresponds to the \c VPSLLVQ instruction.
3819 /// \param __X
3820 /// A 128-bit vector of [2 x i64] to be shifted.
3821 /// \param __Y
3822 /// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
3823 /// bits).
3824 /// \returns A 128-bit vector of [2 x i64] containing the result.
3825 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3826 _mm_sllv_epi64(__m128i __X, __m128i __Y)
3828 return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y);
3831 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3832 /// right by the number of bits given in the corresponding element of the
3833 /// 256-bit vector of [8 x i32] in \a __Y, shifting in sign bits, and
3834 /// returns the result. If the shift count for any element is greater than
3835 /// 31, the result for that element is 0 or -1 according to the sign bit
3836 /// for that element.
3838 /// \headerfile <immintrin.h>
3840 /// This intrinsic corresponds to the \c VPSRAVD instruction.
3842 /// \param __X
3843 /// A 256-bit vector of [8 x i32] to be shifted.
3844 /// \param __Y
3845 /// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3846 /// bits).
3847 /// \returns A 256-bit vector of [8 x i32] containing the result.
3848 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3849 _mm256_srav_epi32(__m256i __X, __m256i __Y)
3851 return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
3854 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3855 /// right by the number of bits given in the corresponding element of the
3856 /// 128-bit vector of [4 x i32] in \a __Y, shifting in sign bits, and
3857 /// returns the result. If the shift count for any element is greater than
3858 /// 31, the result for that element is 0 or -1 according to the sign bit
3859 /// for that element.
3861 /// \headerfile <immintrin.h>
3863 /// This intrinsic corresponds to the \c VPSRAVD instruction.
3865 /// \param __X
3866 /// A 128-bit vector of [4 x i32] to be shifted.
3867 /// \param __Y
3868 /// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3869 /// bits).
3870 /// \returns A 128-bit vector of [4 x i32] containing the result.
3871 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3872 _mm_srav_epi32(__m128i __X, __m128i __Y)
3874 return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
3877 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3878 /// right by the number of bits given in the corresponding element of the
3879 /// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
3880 /// returns the result. If the shift count for any element is greater than
3881 /// 31, the result for that element is zero.
3883 /// \headerfile <immintrin.h>
3885 /// This intrinsic corresponds to the \c VPSRLVD instruction.
3887 /// \param __X
3888 /// A 256-bit vector of [8 x i32] to be shifted.
3889 /// \param __Y
3890 /// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3891 /// bits).
3892 /// \returns A 256-bit vector of [8 x i32] containing the result.
3893 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3894 _mm256_srlv_epi32(__m256i __X, __m256i __Y)
3896 return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
3899 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3900 /// right by the number of bits given in the corresponding element of the
3901 /// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
3902 /// returns the result. If the shift count for any element is greater than
3903 /// 31, the result for that element is zero.
3905 /// \headerfile <immintrin.h>
3907 /// This intrinsic corresponds to the \c VPSRLVD instruction.
3909 /// \param __X
3910 /// A 128-bit vector of [4 x i32] to be shifted.
3911 /// \param __Y
3912 /// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3913 /// bits).
3914 /// \returns A 128-bit vector of [4 x i32] containing the result.
3915 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3916 _mm_srlv_epi32(__m128i __X, __m128i __Y)
3918 return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
3921 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
3922 /// right by the number of bits given in the corresponding element of the
3923 /// 128-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
3924 /// returns the result. If the shift count for any element is greater than
3925 /// 63, the result for that element is zero.
3927 /// \headerfile <immintrin.h>
3929 /// This intrinsic corresponds to the \c VPSRLVQ instruction.
3931 /// \param __X
3932 /// A 256-bit vector of [4 x i64] to be shifted.
3933 /// \param __Y
3934 /// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
3935 /// bits).
3936 /// \returns A 256-bit vector of [4 x i64] containing the result.
3937 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3938 _mm256_srlv_epi64(__m256i __X, __m256i __Y)
3940 return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y);
3943 /// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3944 /// right by the number of bits given in the corresponding element of the
3945 /// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3946 /// returns the result. If the shift count for any element is greater than
3947 /// 63, the result for that element is zero.
3949 /// \headerfile <immintrin.h>
3951 /// This intrinsic corresponds to the \c VPSRLVQ instruction.
3953 /// \param __X
3954 /// A 128-bit vector of [2 x i64] to be shifted.
3955 /// \param __Y
3956 /// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
3957 /// bits).
3958 /// \returns A 128-bit vector of [2 x i64] containing the result.
3959 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3960 _mm_srlv_epi64(__m128i __X, __m128i __Y)
3962 return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);
/// Conditionally gathers two 64-bit floating-point values, either from the
///    128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
///    of [2 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_mask_i32gather_pd(__m128d a, const double *m, __m128i i,
///                               __m128d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param a
///    A 128-bit vector of [2 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
///    the first two elements are used.
/// \param mask
///    A 128-bit vector of [2 x double] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
#define _mm_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128d)(a), \
                                      (double const *)(m), \
                                      (__v4si)(__m128i)(i), \
                                      (__v2df)(__m128d)(mask), (s)))
/// Conditionally gathers four 64-bit floating-point values, either from the
///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
///    of [4 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_mask_i32gather_pd(__m256d a, const double *m, __m128i i,
///                                  __m256d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param a
///    A 256-bit vector of [4 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x double] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
#define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
                                         (double const *)(m), \
                                         (__v4si)(__m128i)(i), \
                                         (__v4df)(__m256d)(mask), (s)))
/// Conditionally gathers two 64-bit floating-point values, either from the
///    128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
///    of [2 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_mask_i64gather_pd(__m128d a, const double *m, __m128i i,
///                               __m128d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param a
///    A 128-bit vector of [2 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [2 x double] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
#define _mm_mask_i64gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
                                      (double const *)(m), \
                                      (__v2di)(__m128i)(i), \
                                      (__v2df)(__m128d)(mask), (s)))
/// Conditionally gathers four 64-bit floating-point values, either from the
///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
///    of [4 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_mask_i64gather_pd(__m256d a, const double *m, __m256i i,
///                                  __m256d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param a
///    A 256-bit vector of [4 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x double] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
#define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
  ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
                                         (double const *)(m), \
                                         (__v4di)(__m256i)(i), \
                                         (__v4df)(__m256d)(mask), (s)))
/// Conditionally gathers four 32-bit floating-point values, either from the
///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
///    of [4 x float] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_mask_i32gather_ps(__m128 a, const float *m, __m128i i,
///                              __m128 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x float] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm_mask_i32gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
                                     (float const *)(m), \
                                     (__v4si)(__m128i)(i), \
                                     (__v4sf)(__m128)(mask), (s)))
/// Conditionally gathers eight 32-bit floating-point values, either from the
///    256-bit vector of [8 x float] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
///    of [8 x float] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 7
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256 _mm256_mask_i32gather_ps(__m256 a, const float *m, __m256i i,
///                                 __m256 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param a
///    A 256-bit vector of [8 x float] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [8 x float] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [8 x float] containing the gathered values.
#define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
  ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
                                        (float const *)(m), \
                                        (__v8si)(__m256i)(i), \
                                        (__v8sf)(__m256)(mask), (s)))
4254 /// Conditionally gathers two 32-bit floating-point values, either from the
4255 /// 128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4256 /// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4257 /// of [4 x float] in \a mask determines the source for the lower two
4258 /// elements. The upper two elements of the result are zeroed.
4260 /// \code{.operation}
4261 /// FOR element := 0 to 1
4262 /// j := element*32
4263 /// k := element*64
4264 /// IF mask[j+31] == 0
4265 /// result[j+31:j] := a[j+31:j]
4266 /// ELSE
4267 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4268 /// FI
4269 /// ENDFOR
4270 /// result[127:64] := 0
4271 /// \endcode
4273 /// \headerfile <immintrin.h>
4275 /// \code
4276 /// __m128 _mm_mask_i64gather_ps(__m128 a, const float *m, __m128i i,
4277 /// __m128 mask, const int s);
4278 /// \endcode
4280 /// This intrinsic corresponds to the \c VGATHERQPS instruction.
4282 /// \param a
4283 /// A 128-bit vector of [4 x float] used as the source when a mask bit is
4284 /// zero. Only the first two elements are used.
4285 /// \param m
4286 /// A pointer to the memory used for loading values.
4287 /// \param i
4288 /// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4289 /// \param mask
4290 /// A 128-bit vector of [4 x float] containing the mask. The most
4291 /// significant bit of each element in the mask vector represents the mask
4292 /// bits. If a mask bit is zero, the corresponding value from vector \a a
4293 /// is gathered; otherwise the value is loaded from memory. Only the first
4294 /// two elements are used.
4295 /// \param s
4296 /// A literal constant scale factor for the indexes in \a i. Must be
4297 /// 1, 2, 4, or 8.
4298 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* Implementation: a single call to __builtin_ia32_gatherq_ps. Source and mask
 * are passed as full 128-bit __v4sf vectors even though the index vector
 * (__v2di) only holds two indexes; m is cast to float const * and s is
 * forwarded unchanged. The builtin's result is cast back to __m128. */
4299 #define _mm_mask_i64gather_ps(a, m, i, mask, s) \
4300 ((__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
4301 (float const *)(m), \
4302 (__v2di)(__m128i)(i), \
4303 (__v4sf)(__m128)(mask), (s)))
4305 /// Conditionally gathers four 32-bit floating-point values, either from the
4306 /// 128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4307 /// indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
4308 /// of [4 x float] in \a mask determines the source for each element.
4310 /// \code{.operation}
4311 /// FOR element := 0 to 3
4312 /// j := element*32
4313 /// k := element*64
4314 /// IF mask[j+31] == 0
4315 /// result[j+31:j] := a[j+31:j]
4316 /// ELSE
4317 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4318 /// FI
4319 /// ENDFOR
4320 /// \endcode
4322 /// \headerfile <immintrin.h>
4324 /// \code
4325 /// __m128 _mm256_mask_i64gather_ps(__m128 a, const float *m, __m256i i,
4326 /// __m128 mask, const int s);
4327 /// \endcode
4329 /// This intrinsic corresponds to the \c VGATHERQPS instruction.
4331 /// \param a
4332 /// A 128-bit vector of [4 x float] used as the source when a mask bit is
4333 /// zero.
4334 /// \param m
4335 /// A pointer to the memory used for loading values.
4336 /// \param i
4337 /// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4338 /// \param mask
4339 /// A 128-bit vector of [4 x float] containing the mask. The most
4340 /// significant bit of each element in the mask vector represents the mask
4341 /// bits. If a mask bit is zero, the corresponding value from vector \a a
4342 /// is gathered; otherwise the value is loaded from memory.
4343 /// \param s
4344 /// A literal constant scale factor for the indexes in \a i. Must be
4345 /// 1, 2, 4, or 8.
4346 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* Implementation: a single call to __builtin_ia32_gatherq_ps256. Only the
 * index vector is 256 bits wide (__m256i -> __v4di); source, mask and result
 * are 128-bit (__m128 / __v4sf). m is cast to float const * and s is
 * forwarded unchanged. */
4347 #define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
4348 ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
4349 (float const *)(m), \
4350 (__v4di)(__m256i)(i), \
4351 (__v4sf)(__m128)(mask), (s)))
4353 /// Conditionally gathers four 32-bit integer values, either from the
4354 /// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4355 /// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4356 /// of [4 x i32] in \a mask determines the source for each element.
4358 /// \code{.operation}
4359 /// FOR element := 0 to 3
4360 /// j := element*32
4361 /// k := element*32
4362 /// IF mask[j+31] == 0
4363 /// result[j+31:j] := a[j+31:j]
4364 /// ELSE
4365 /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4366 /// FI
4367 /// ENDFOR
4368 /// \endcode
4370 /// \headerfile <immintrin.h>
4372 /// \code
4373 /// __m128i _mm_mask_i32gather_epi32(__m128i a, const int *m, __m128i i,
4374 /// __m128i mask, const int s);
4375 /// \endcode
4377 /// This intrinsic corresponds to the \c VPGATHERDD instruction.
4379 /// \param a
4380 /// A 128-bit vector of [4 x i32] used as the source when a mask bit is
4381 /// zero.
4382 /// \param m
4383 /// A pointer to the memory used for loading values.
4384 /// \param i
4385 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4386 /// \param mask
4387 /// A 128-bit vector of [4 x i32] containing the mask. The most significant
4388 /// bit of each element in the mask vector represents the mask bits. If a
4389 /// mask bit is zero, the corresponding value from vector \a a is gathered;
4390 /// otherwise the value is loaded from memory.
4391 /// \param s
4392 /// A literal constant scale factor for the indexes in \a i. Must be
4393 /// 1, 2, 4, or 8.
4394 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* Implementation: a single call to __builtin_ia32_gatherd_d. The vector
 * operands a, i and mask are each cast to __m128i and then to __v4si; m is
 * cast to int const * and s is forwarded unchanged. The builtin's result is
 * cast back to __m128i. */
4395 #define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
4396 ((__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
4397 (int const *)(m), \
4398 (__v4si)(__m128i)(i), \
4399 (__v4si)(__m128i)(mask), (s)))
4401 /// Conditionally gathers eight 32-bit integer values, either from the
4402 /// 256-bit vector of [8 x i32] in \a a, or from memory \a m using scaled
4403 /// indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
4404 /// of [8 x i32] in \a mask determines the source for each element.
4406 /// \code{.operation}
4407 /// FOR element := 0 to 7
4408 /// j := element*32
4409 /// k := element*32
4410 /// IF mask[j+31] == 0
4411 /// result[j+31:j] := a[j+31:j]
4412 /// ELSE
4413 /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4414 /// FI
4415 /// ENDFOR
4416 /// \endcode
4418 /// \headerfile <immintrin.h>
4420 /// \code
4421 /// __m256i _mm256_mask_i32gather_epi32(__m256i a, const int *m, __m256i i,
4422 /// __m256i mask, const int s);
4423 /// \endcode
4425 /// This intrinsic corresponds to the \c VPGATHERDD instruction.
4427 /// \param a
4428 /// A 256-bit vector of [8 x i32] used as the source when a mask bit is
4429 /// zero.
4430 /// \param m
4431 /// A pointer to the memory used for loading values.
4432 /// \param i
4433 /// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4434 /// \param mask
4435 /// A 256-bit vector of [8 x i32] containing the mask. The most significant
4436 /// bit of each element in the mask vector represents the mask bits. If a
4437 /// mask bit is zero, the corresponding value from vector \a a is gathered;
4438 /// otherwise the value is loaded from memory.
4439 /// \param s
4440 /// A literal constant scale factor for the indexes in \a i. Must be
4441 /// 1, 2, 4, or 8.
4442 /// \returns A 256-bit vector of [8 x i32] containing the gathered values.
/* Implementation: a single call to __builtin_ia32_gatherd_d256. The vector
 * operands a, i and mask are each cast to __m256i and then to __v8si; m is
 * cast to int const * and s is forwarded unchanged. The builtin's result is
 * cast back to __m256i. */
4443 #define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
4444 ((__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
4445 (int const *)(m), \
4446 (__v8si)(__m256i)(i), \
4447 (__v8si)(__m256i)(mask), (s)))
4449 /// Conditionally gathers two 32-bit integer values, either from the
4450 /// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4451 /// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4452 /// of [4 x i32] in \a mask determines the source for the lower two
4453 /// elements. The upper two elements of the result are zeroed.
4455 /// \code{.operation}
4456 /// FOR element := 0 to 1
4457 /// j := element*32
4458 /// k := element*64
4459 /// IF mask[j+31] == 0
4460 /// result[j+31:j] := a[j+31:j]
4461 /// ELSE
4462 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4463 /// FI
4464 /// ENDFOR
4465 /// result[127:64] := 0
4466 /// \endcode
4468 /// \headerfile <immintrin.h>
4470 /// \code
4471 /// __m128i _mm_mask_i64gather_epi32(__m128i a, const int *m, __m128i i,
4472 /// __m128i mask, const int s);
4473 /// \endcode
4475 /// This intrinsic corresponds to the \c VPGATHERQD instruction.
4477 /// \param a
4478 /// A 128-bit vector of [4 x i32] used as the source when a mask bit is
4479 /// zero. Only the first two elements are used.
4480 /// \param m
4481 /// A pointer to the memory used for loading values.
4482 /// \param i
4483 /// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4484 /// \param mask
4485 /// A 128-bit vector of [4 x i32] containing the mask. The most significant
4486 /// bit of each element in the mask vector represents the mask bits. If a
4487 /// mask bit is zero, the corresponding value from vector \a a is gathered;
4488 /// otherwise the value is loaded from memory. Only the first two elements
4489 /// are used.
4490 /// \param s
4491 /// A literal constant scale factor for the indexes in \a i. Must be
4492 /// 1, 2, 4, or 8.
4493 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* Implementation: a single call to __builtin_ia32_gatherq_d. Source and mask
 * are passed as full 128-bit __v4si vectors even though the index vector
 * (__v2di) only holds two indexes; m is cast to int const * and s is
 * forwarded unchanged. The builtin's result is cast back to __m128i. */
4494 #define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
4495 ((__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
4496 (int const *)(m), \
4497 (__v2di)(__m128i)(i), \
4498 (__v4si)(__m128i)(mask), (s)))
4500 /// Conditionally gathers four 32-bit integer values, either from the
4501 /// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4502 /// indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
4503 /// of [4 x i32] in \a mask determines the source for each element.
4505 /// \code{.operation}
4506 /// FOR element := 0 to 3
4507 /// j := element*32
4508 /// k := element*64
4509 /// IF mask[j+31] == 0
4510 /// result[j+31:j] := a[j+31:j]
4511 /// ELSE
4512 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4513 /// FI
4514 /// ENDFOR
4515 /// \endcode
4517 /// \headerfile <immintrin.h>
4519 /// \code
4520 /// __m128i _mm256_mask_i64gather_epi32(__m128i a, const int *m, __m256i i,
4521 /// __m128i mask, const int s);
4522 /// \endcode
4524 /// This intrinsic corresponds to the \c VPGATHERQD instruction.
4526 /// \param a
4527 /// A 128-bit vector of [4 x i32] used as the source when a mask bit is
4528 /// zero.
4529 /// \param m
4530 /// A pointer to the memory used for loading values.
4531 /// \param i
4532 /// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4533 /// \param mask
4534 /// A 128-bit vector of [4 x i32] containing the mask. The most significant
4535 /// bit of each element in the mask vector represents the mask bits. If a
4536 /// mask bit is zero, the corresponding value from vector \a a is gathered;
4537 /// otherwise the value is loaded from memory.
4538 /// \param s
4539 /// A literal constant scale factor for the indexes in \a i. Must be
4540 /// 1, 2, 4, or 8.
4541 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* Implementation: a single call to __builtin_ia32_gatherq_d256. Only the
 * index vector is 256 bits wide (__m256i -> __v4di); source, mask and result
 * are 128-bit (__m128i / __v4si). m is cast to int const * and s is
 * forwarded unchanged. */
4542 #define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
4543 ((__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
4544 (int const *)(m), \
4545 (__v4di)(__m256i)(i), \
4546 (__v4si)(__m128i)(mask), (s)))
4548 /// Conditionally gathers two 64-bit integer values, either from the
4549 /// 128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
4550 /// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4551 /// of [2 x i64] in \a mask determines the source for each element.
4553 /// \code{.operation}
4554 /// FOR element := 0 to 1
4555 /// j := element*64
4556 /// k := element*32
4557 /// IF mask[j+63] == 0
4558 /// result[j+63:j] := a[j+63:j]
4559 /// ELSE
4560 /// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4561 /// FI
4562 /// ENDFOR
4563 /// \endcode
4565 /// \headerfile <immintrin.h>
4567 /// \code
4568 /// __m128i _mm_mask_i32gather_epi64(__m128i a, const long long *m, __m128i i,
4569 /// __m128i mask, const int s);
4570 /// \endcode
4572 /// This intrinsic corresponds to the \c VPGATHERDQ instruction.
4574 /// \param a
4575 /// A 128-bit vector of [2 x i64] used as the source when a mask bit is
4576 /// zero.
4577 /// \param m
4578 /// A pointer to the memory used for loading values.
4579 /// \param i
4580 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
4581 /// the first two elements are used.
4582 /// \param mask
4583 /// A 128-bit vector of [2 x i64] containing the mask. The most significant
4584 /// bit of each element in the mask vector represents the mask bits. If a
4585 /// mask bit is zero, the corresponding value from vector \a a is gathered;
4586 /// otherwise the value is loaded from memory.
4587 /// \param s
4588 /// A literal constant scale factor for the indexes in \a i. Must be
4589 /// 1, 2, 4, or 8.
4590 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
/* Implementation: a single call to __builtin_ia32_gatherd_q. Source and mask
 * are cast to __m128i and then __v2di, while the index vector is passed as a
 * full __v4si; m is cast to long long const * and s is forwarded unchanged.
 * The builtin's result is cast back to __m128i. */
4591 #define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
4592 ((__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
4593 (long long const *)(m), \
4594 (__v4si)(__m128i)(i), \
4595 (__v2di)(__m128i)(mask), (s)))
4597 /// Conditionally gathers four 64-bit integer values, either from the
4598 /// 256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
4599 /// indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
4600 /// of [4 x i64] in \a mask determines the source for each element.
4602 /// \code{.operation}
4603 /// FOR element := 0 to 3
4604 /// j := element*64
4605 /// k := element*32
4606 /// IF mask[j+63] == 0
4607 /// result[j+63:j] := a[j+63:j]
4608 /// ELSE
4609 /// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4610 /// FI
4611 /// ENDFOR
4612 /// \endcode
4614 /// \headerfile <immintrin.h>
4616 /// \code
4617 /// __m256i _mm256_mask_i32gather_epi64(__m256i a, const long long *m,
4618 /// __m128i i, __m256i mask, const int s);
4619 /// \endcode
4621 /// This intrinsic corresponds to the \c VPGATHERDQ instruction.
4623 /// \param a
4624 /// A 256-bit vector of [4 x i64] used as the source when a mask bit is
4625 /// zero.
4626 /// \param m
4627 /// A pointer to the memory used for loading values.
4628 /// \param i
4629 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4630 /// \param mask
4631 /// A 256-bit vector of [4 x i64] containing the mask. The most significant
4632 /// bit of each element in the mask vector represents the mask bits. If a
4633 /// mask bit is zero, the corresponding value from vector \a a is gathered;
4634 /// otherwise the value is loaded from memory.
4635 /// \param s
4636 /// A literal constant scale factor for the indexes in \a i. Must be
4637 /// 1, 2, 4, or 8.
4638 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
/* Implementation: a single call to __builtin_ia32_gatherd_q256. Source, mask
 * and result are 256-bit (__m256i / __v4di) while the index vector is only
 * 128-bit (__m128i -> __v4si); m is cast to long long const * and s is
 * forwarded unchanged. */
4639 #define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
4640 ((__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
4641 (long long const *)(m), \
4642 (__v4si)(__m128i)(i), \
4643 (__v4di)(__m256i)(mask), (s)))
4645 /// Conditionally gathers two 64-bit integer values, either from the
4646 /// 128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
4647 /// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4648 /// of [2 x i64] in \a mask determines the source for each element.
4650 /// \code{.operation}
4651 /// FOR element := 0 to 1
4652 /// j := element*64
4653 /// k := element*64
4654 /// IF mask[j+63] == 0
4655 /// result[j+63:j] := a[j+63:j]
4656 /// ELSE
4657 /// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4658 /// FI
4659 /// ENDFOR
4660 /// \endcode
4662 /// \headerfile <immintrin.h>
4664 /// \code
4665 /// __m128i _mm_mask_i64gather_epi64(__m128i a, const long long *m, __m128i i,
4666 /// __m128i mask, const int s);
4667 /// \endcode
4669 /// This intrinsic corresponds to the \c VPGATHERQQ instruction.
4671 /// \param a
4672 /// A 128-bit vector of [2 x i64] used as the source when a mask bit is
4673 /// zero.
4674 /// \param m
4675 /// A pointer to the memory used for loading values.
4676 /// \param i
4677 /// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4678 /// \param mask
4679 /// A 128-bit vector of [2 x i64] containing the mask. The most significant
4680 /// bit of each element in the mask vector represents the mask bits. If a
4681 /// mask bit is zero, the corresponding value from vector \a a is gathered;
4682 /// otherwise the value is loaded from memory.
4683 /// \param s
4684 /// A literal constant scale factor for the indexes in \a i. Must be
4685 /// 1, 2, 4, or 8.
4686 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
/* Implementation: a single call to __builtin_ia32_gatherq_q. The vector
 * operands a, i and mask are each cast to __m128i and then to __v2di; m is
 * cast to long long const * and s is forwarded unchanged. The builtin's
 * result is cast back to __m128i. */
4687 #define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
4688 ((__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
4689 (long long const *)(m), \
4690 (__v2di)(__m128i)(i), \
4691 (__v2di)(__m128i)(mask), (s)))
4693 /// Conditionally gathers four 64-bit integer values, either from the
4694 /// 256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
4695 /// indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
4696 /// of [4 x i64] in \a mask determines the source for each element.
4698 /// \code{.operation}
4699 /// FOR element := 0 to 3
4700 /// j := element*64
4701 /// k := element*64
4702 /// IF mask[j+63] == 0
4703 /// result[j+63:j] := a[j+63:j]
4704 /// ELSE
4705 /// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4706 /// FI
4707 /// ENDFOR
4708 /// \endcode
4710 /// \headerfile <immintrin.h>
4712 /// \code
4713 /// __m256i _mm256_mask_i64gather_epi64(__m256i a, const long long *m,
4714 /// __m256i i, __m256i mask, const int s);
4715 /// \endcode
4717 /// This intrinsic corresponds to the \c VPGATHERQQ instruction.
4719 /// \param a
4720 /// A 256-bit vector of [4 x i64] used as the source when a mask bit is
4721 /// zero.
4722 /// \param m
4723 /// A pointer to the memory used for loading values.
4724 /// \param i
4725 /// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4726 /// \param mask
4727 /// A 256-bit vector of [4 x i64] containing the mask. The most significant
4728 /// bit of each element in the mask vector represents the mask bits. If a
4729 /// mask bit is zero, the corresponding value from vector \a a is gathered;
4730 /// otherwise the value is loaded from memory.
4731 /// \param s
4732 /// A literal constant scale factor for the indexes in \a i. Must be
4733 /// 1, 2, 4, or 8.
4734 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
/* Implementation: a single call to __builtin_ia32_gatherq_q256. The vector
 * operands a, i and mask are each cast to __m256i and then to __v4di; m is
 * cast to long long const * and s is forwarded unchanged. The builtin's
 * result is cast back to __m256i. */
4735 #define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
4736 ((__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
4737 (long long const *)(m), \
4738 (__v4di)(__m256i)(i), \
4739 (__v4di)(__m256i)(mask), (s)))
4741 /// Gathers two 64-bit floating-point values from memory \a m using scaled
4742 /// indexes from the 128-bit vector of [4 x i32] in \a i.
4744 /// \code{.operation}
4745 /// FOR element := 0 to 1
4746 /// j := element*64
4747 /// k := element*32
4748 /// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4749 /// ENDFOR
4750 /// \endcode
4752 /// \headerfile <immintrin.h>
4754 /// \code
4755 /// __m128d _mm_i32gather_pd(const double *m, __m128i i, const int s);
4756 /// \endcode
4758 /// This intrinsic corresponds to the \c VGATHERDPD instruction.
4760 /// \param m
4761 /// A pointer to the memory used for loading values.
4762 /// \param i
4763 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
4764 /// the first two elements are used.
4765 /// \param s
4766 /// A literal constant scale factor for the indexes in \a i. Must be
4767 /// 1, 2, 4, or 8.
4768 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
/* Implementation: calls __builtin_ia32_gatherd_pd with no caller-supplied
 * source or mask. The source operand is _mm_undefined_pd() and the mask is
 * _mm_cmpeq_pd() of a zero vector with itself. The pseudocode above loads
 * every element from memory, so the mask is expected to select all elements
 * and the undefined source should never be observed. */
4769 #define _mm_i32gather_pd(m, i, s) \
4770 ((__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
4771 (double const *)(m), \
4772 (__v4si)(__m128i)(i), \
4773 (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
4774 _mm_setzero_pd()), \
4775 (s)))
4777 /// Gathers four 64-bit floating-point values from memory \a m using scaled
4778 /// indexes from the 128-bit vector of [4 x i32] in \a i.
4780 /// \code{.operation}
4781 /// FOR element := 0 to 3
4782 /// j := element*64
4783 /// k := element*32
4784 /// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4785 /// ENDFOR
4786 /// \endcode
4788 /// \headerfile <immintrin.h>
4790 /// \code
4791 /// __m256d _mm256_i32gather_pd(const double *m, __m128i i, const int s);
4792 /// \endcode
4794 /// This intrinsic corresponds to the \c VGATHERDPD instruction.
4796 /// \param m
4797 /// A pointer to the memory used for loading values.
4798 /// \param i
4799 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4800 /// \param s
4801 /// A literal constant scale factor for the indexes in \a i. Must be
4802 /// 1, 2, 4, or 8.
4803 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
/* Implementation: calls __builtin_ia32_gatherd_pd256 with no caller-supplied
 * source or mask. The source operand is _mm256_undefined_pd() and the mask is
 * _mm256_cmp_pd() of a zero vector with itself using _CMP_EQ_OQ. The
 * pseudocode above loads every element from memory, so the mask is expected
 * to select all elements and the undefined source should never be observed. */
4804 #define _mm256_i32gather_pd(m, i, s) \
4805 ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
4806 (double const *)(m), \
4807 (__v4si)(__m128i)(i), \
4808 (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
4809 _mm256_setzero_pd(), \
4810 _CMP_EQ_OQ), \
4811 (s)))
4813 /// Gathers two 64-bit floating-point values from memory \a m using scaled
4814 /// indexes from the 128-bit vector of [2 x i64] in \a i.
4816 /// \code{.operation}
4817 /// FOR element := 0 to 1
4818 /// j := element*64
4819 /// k := element*64
4820 /// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4821 /// ENDFOR
4822 /// \endcode
4824 /// \headerfile <immintrin.h>
4826 /// \code
4827 /// __m128d _mm_i64gather_pd(const double *m, __m128i i, const int s);
4828 /// \endcode
4830 /// This intrinsic corresponds to the \c VGATHERQPD instruction.
4832 /// \param m
4833 /// A pointer to the memory used for loading values.
4834 /// \param i
4835 /// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4836 /// \param s
4837 /// A literal constant scale factor for the indexes in \a i. Must be
4838 /// 1, 2, 4, or 8.
4839 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
/* Implementation: calls __builtin_ia32_gatherq_pd with no caller-supplied
 * source or mask. The source operand is _mm_undefined_pd() and the mask is
 * _mm_cmpeq_pd() of a zero vector with itself. The pseudocode above loads
 * every element from memory, so the mask is expected to select all elements
 * and the undefined source should never be observed. */
4840 #define _mm_i64gather_pd(m, i, s) \
4841 ((__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
4842 (double const *)(m), \
4843 (__v2di)(__m128i)(i), \
4844 (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
4845 _mm_setzero_pd()), \
4846 (s)))
4848 /// Gathers four 64-bit floating-point values from memory \a m using scaled
4849 /// indexes from the 256-bit vector of [4 x i64] in \a i.
4851 /// \code{.operation}
4852 /// FOR element := 0 to 3
4853 /// j := element*64
4854 /// k := element*64
4855 /// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4856 /// ENDFOR
4857 /// \endcode
4859 /// \headerfile <immintrin.h>
4861 /// \code
4862 /// __m256d _mm256_i64gather_pd(const double *m, __m256i i, const int s);
4863 /// \endcode
4865 /// This intrinsic corresponds to the \c VGATHERQPD instruction.
4867 /// \param m
4868 /// A pointer to the memory used for loading values.
4869 /// \param i
4870 /// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4871 /// \param s
4872 /// A literal constant scale factor for the indexes in \a i. Must be
4873 /// 1, 2, 4, or 8.
4874 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
/* Implementation: calls __builtin_ia32_gatherq_pd256 with no caller-supplied
 * source or mask. The source operand is _mm256_undefined_pd() and the mask is
 * _mm256_cmp_pd() of a zero vector with itself using _CMP_EQ_OQ. The
 * pseudocode above loads every element from memory, so the mask is expected
 * to select all elements and the undefined source should never be observed. */
4875 #define _mm256_i64gather_pd(m, i, s) \
4876 ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
4877 (double const *)(m), \
4878 (__v4di)(__m256i)(i), \
4879 (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
4880 _mm256_setzero_pd(), \
4881 _CMP_EQ_OQ), \
4882 (s)))
4884 /// Gathers four 32-bit floating-point values from memory \a m using scaled
4885 /// indexes from the 128-bit vector of [4 x i32] in \a i.
4887 /// \code{.operation}
4888 /// FOR element := 0 to 3
4889 /// j := element*32
4890 /// k := element*32
4891 /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4892 /// ENDFOR
4893 /// \endcode
4895 /// \headerfile <immintrin.h>
4897 /// \code
4898 /// __m128 _mm_i32gather_ps(const float *m, __m128i i, const int s);
4899 /// \endcode
4901 /// This intrinsic corresponds to the \c VGATHERDPS instruction.
4903 /// \param m
4904 /// A pointer to the memory used for loading values.
4905 /// \param i
4906 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4907 /// \param s
4908 /// A literal constant scale factor for the indexes in \a i. Must be
4909 /// 1, 2, 4, or 8.
4910 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* Implementation: calls __builtin_ia32_gatherd_ps with no caller-supplied
 * source or mask. The source operand is _mm_undefined_ps() and the mask is
 * _mm_cmpeq_ps() of a zero vector with itself. The pseudocode above loads
 * every element from memory, so the mask is expected to select all elements
 * and the undefined source should never be observed. */
4911 #define _mm_i32gather_ps(m, i, s) \
4912 ((__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
4913 (float const *)(m), \
4914 (__v4si)(__m128i)(i), \
4915 (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
4916 _mm_setzero_ps()), \
4917 (s)))
4919 /// Gathers eight 32-bit floating-point values from memory \a m using scaled
4920 /// indexes from the 256-bit vector of [8 x i32] in \a i.
4922 /// \code{.operation}
4923 /// FOR element := 0 to 7
4924 /// j := element*32
4925 /// k := element*32
4926 /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4927 /// ENDFOR
4928 /// \endcode
4930 /// \headerfile <immintrin.h>
4932 /// \code
4933 /// __m256 _mm256_i32gather_ps(const float *m, __m256i i, const int s);
4934 /// \endcode
4936 /// This intrinsic corresponds to the \c VGATHERDPS instruction.
4938 /// \param m
4939 /// A pointer to the memory used for loading values.
4940 /// \param i
4941 /// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4942 /// \param s
4943 /// A literal constant scale factor for the indexes in \a i. Must be
4944 /// 1, 2, 4, or 8.
4945 /// \returns A 256-bit vector of [8 x float] containing the gathered values.
/* Implementation: calls __builtin_ia32_gatherd_ps256 with no caller-supplied
 * source or mask. The source operand is _mm256_undefined_ps() and the mask is
 * _mm256_cmp_ps() of a zero vector with itself using _CMP_EQ_OQ. The
 * pseudocode above loads every element from memory, so the mask is expected
 * to select all elements and the undefined source should never be observed. */
4946 #define _mm256_i32gather_ps(m, i, s) \
4947 ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
4948 (float const *)(m), \
4949 (__v8si)(__m256i)(i), \
4950 (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \
4951 _mm256_setzero_ps(), \
4952 _CMP_EQ_OQ), \
4953 (s)))
4955 /// Gathers two 32-bit floating-point values from memory \a m using scaled
4956 /// indexes from the 128-bit vector of [2 x i64] in \a i. The upper two
4957 /// elements of the result are zeroed.
4959 /// \code{.operation}
4960 /// FOR element := 0 to 1
4961 /// j := element*32
4962 /// k := element*64
4963 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4964 /// ENDFOR
4965 /// result[127:64] := 0
4966 /// \endcode
4968 /// \headerfile <immintrin.h>
4970 /// \code
4971 /// __m128 _mm_i64gather_ps(const float *m, __m128i i, const int s);
4972 /// \endcode
4974 /// This intrinsic corresponds to the \c VGATHERQPS instruction.
4976 /// \param m
4977 /// A pointer to the memory used for loading values.
4978 /// \param i
4979 /// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4980 /// \param s
4981 /// A literal constant scale factor for the indexes in \a i. Must be
4982 /// 1, 2, 4, or 8.
4983 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* Implementation: calls __builtin_ia32_gatherq_ps with no caller-supplied
 * source or mask. The source operand is _mm_undefined_ps() and the mask is
 * _mm_cmpeq_ps() of a zero vector with itself; both are full 128-bit __v4sf
 * vectors although the __v2di index vector only holds two indexes. The
 * pseudocode above loads both gathered elements from memory, so the
 * undefined source should never be observed. */
4984 #define _mm_i64gather_ps(m, i, s) \
4985 ((__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
4986 (float const *)(m), \
4987 (__v2di)(__m128i)(i), \
4988 (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
4989 _mm_setzero_ps()), \
4990 (s)))
4992 /// Gathers four 32-bit floating-point values from memory \a m using scaled
4993 /// indexes from the 256-bit vector of [4 x i64] in \a i.
4995 /// \code{.operation}
4996 /// FOR element := 0 to 3
4997 /// j := element*32
4998 /// k := element*64
4999 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
5000 /// ENDFOR
5001 /// \endcode
5003 /// \headerfile <immintrin.h>
5005 /// \code
5006 /// __m128 _mm256_i64gather_ps(const float *m, __m256i i, const int s);
5007 /// \endcode
5009 /// This intrinsic corresponds to the \c VGATHERQPS instruction.
5011 /// \param m
5012 /// A pointer to the memory used for loading values.
5013 /// \param i
5014 /// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5015 /// \param s
5016 /// A literal constant scale factor for the indexes in \a i. Must be
5017 /// 1, 2, 4, or 8.
5018 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* Implementation: calls __builtin_ia32_gatherq_ps256 with no caller-supplied
 * source or mask. Only the index vector is 256-bit (__m256i -> __v4di); the
 * source (_mm_undefined_ps()), the mask (_mm_cmpeq_ps() of a zero vector
 * with itself) and the result are 128-bit. The pseudocode above loads every
 * element from memory, so the mask is expected to select all elements and
 * the undefined source should never be observed. */
5019 #define _mm256_i64gather_ps(m, i, s) \
5020 ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
5021 (float const *)(m), \
5022 (__v4di)(__m256i)(i), \
5023 (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
5024 _mm_setzero_ps()), \
5025 (s)))
5027 /// Gathers four 32-bit integer values from memory \a m using scaled
5028 /// indexes from the 128-bit vector of [4 x i32] in \a i.
5030 /// \code{.operation}
5031 /// FOR element := 0 to 3
5032 /// j := element*32
5033 /// k := element*32
5034 /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
5035 /// ENDFOR
5036 /// \endcode
5038 /// \headerfile <immintrin.h>
5040 /// \code
5041 /// __m128i _mm_i32gather_epi32(const int *m, __m128i i, const int s);
5042 /// \endcode
5044 /// This intrinsic corresponds to the \c VPGATHERDD instruction.
5046 /// \param m
5047 /// A pointer to the memory used for loading values.
5048 /// \param i
5049 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
5050 /// \param s
5051 /// A literal constant scale factor for the indexes in \a i. Must be
5052 /// 1, 2, 4, or 8.
5053 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* Implementation: calls __builtin_ia32_gatherd_d with no caller-supplied
 * source or mask. The source operand is _mm_undefined_si128() and the mask
 * is _mm_set1_epi32(-1). The pseudocode above loads every element from
 * memory, so the mask is expected to select all elements and the undefined
 * source should never be observed. */
5054 #define _mm_i32gather_epi32(m, i, s) \
5055 ((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
5056 (int const *)(m), (__v4si)(__m128i)(i), \
5057 (__v4si)_mm_set1_epi32(-1), (s)))
5059 /// Gathers eight 32-bit integer values from memory \a m using scaled
5060 /// indexes from the 256-bit vector of [8 x i32] in \a i.
5062 /// \code{.operation}
5063 /// FOR element := 0 to 7
5064 /// j := element*32
5065 /// k := element*32
5066 /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
5067 /// ENDFOR
5068 /// \endcode
5070 /// \headerfile <immintrin.h>
5072 /// \code
5073 /// __m256i _mm256_i32gather_epi32(const int *m, __m256i i, const int s);
5074 /// \endcode
5076 /// This intrinsic corresponds to the \c VPGATHERDD instruction.
5078 /// \param m
5079 /// A pointer to the memory used for loading values.
5080 /// \param i
5081 /// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
5082 /// \param s
5083 /// A literal constant scale factor for the indexes in \a i. Must be
5084 /// 1, 2, 4, or 8.
5085 /// \returns A 256-bit vector of [8 x i32] containing the gathered values.
/* Implementation: calls __builtin_ia32_gatherd_d256 with no caller-supplied
 * source or mask. The source operand is _mm256_undefined_si256() and the
 * mask is _mm256_set1_epi32(-1). The pseudocode above loads every element
 * from memory, so the mask is expected to select all elements and the
 * undefined source should never be observed. */
5086 #define _mm256_i32gather_epi32(m, i, s) \
5087 ((__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
5088 (int const *)(m), (__v8si)(__m256i)(i), \
5089 (__v8si)_mm256_set1_epi32(-1), (s)))
5091 /// Gathers two 32-bit integer values from memory \a m using scaled indexes
5092 /// from the 128-bit vector of [2 x i64] in \a i. The upper two elements
5093 /// of the result are zeroed.
5095 /// \code{.operation}
5096 /// FOR element := 0 to 1
5097 /// j := element*32
5098 /// k := element*64
5099 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
5100 /// ENDFOR
5101 /// result[127:64] := 0
5102 /// \endcode
5104 /// \headerfile <immintrin.h>
5106 /// \code
5107 /// __m128i _mm_i64gather_epi32(const int *m, __m128i i, const int s);
5108 /// \endcode
5110 /// This intrinsic corresponds to the \c VPGATHERQD instruction.
5112 /// \param m
5113 /// A pointer to the memory used for loading values.
5114 /// \param i
5115 /// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
5116 /// \param s
5117 /// A literal constant scale factor for the indexes in \a i. Must be
5118 /// 1, 2, 4, or 8.
5119 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* Implementation: calls __builtin_ia32_gatherq_d with no caller-supplied
 * source or mask. The source operand is _mm_undefined_si128() and the mask
 * is _mm_set1_epi32(-1); both are full 128-bit __v4si vectors although the
 * __v2di index vector only holds two indexes. The pseudocode above loads
 * both gathered elements from memory, so the undefined source should never
 * be observed. */
5120 #define _mm_i64gather_epi32(m, i, s) \
5121 ((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
5122 (int const *)(m), (__v2di)(__m128i)(i), \
5123 (__v4si)_mm_set1_epi32(-1), (s)))
5125 /// Gathers four 32-bit integer values from memory \a m using scaled indexes
5126 /// from the 256-bit vector of [4 x i64] in \a i.
5128 /// \code{.operation}
5129 /// FOR element := 0 to 3
5130 /// j := element*32
5131 /// k := element*64
5132 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
5133 /// ENDFOR
5134 /// \endcode
5136 /// \headerfile <immintrin.h>
5138 /// \code
5139 /// __m128i _mm256_i64gather_epi32(const int *m, __m256i i, const int s);
5140 /// \endcode
5142 /// This intrinsic corresponds to the \c VPGATHERQD instruction.
5144 /// \param m
5145 /// A pointer to the memory used for loading values.
5146 /// \param i
5147 /// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5148 /// \param s
5149 /// A literal constant scale factor for the indexes in \a i. Must be
5150 /// 1, 2, 4, or 8.
5151 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* Expands to a single builtin call. Only the index operand is 256 bits wide
 * ([4 x i64]); the result and both macro-supplied vectors are 128-bit
 * [4 x i32] values, one 32-bit lane per 64-bit index. m, i and s are each
 * parenthesized and expanded exactly once.
 * NOTE(review): the undefined first operand and the all-ones fourth operand
 * are presumably the pass-through source and the element mask (all
 * elements enabled) -- confirm against the builtin's definition.
 * NOTE(review): presumably a macro so that s stays a constant expression
 * when it reaches the builtin -- confirm. */
5152 #define _mm256_i64gather_epi32(m, i, s) \
5153 ((__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
5154 (int const *)(m), (__v4di)(__m256i)(i), \
5155 (__v4si)_mm_set1_epi32(-1), (s)))
5157 /// Gathers two 64-bit integer values from memory \a m using scaled indexes
5158 /// from the 128-bit vector of [4 x i32] in \a i.
5160 /// \code{.operation}
5161 /// FOR element := 0 to 1
5162 /// j := element*64
5163 /// k := element*32
5164 /// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
5165 /// ENDFOR
5166 /// \endcode
5168 /// \headerfile <immintrin.h>
5170 /// \code
5171 /// __m128i _mm_i32gather_epi64(const long long *m, __m128i i, const int s);
5172 /// \endcode
5174 /// This intrinsic corresponds to the \c VPGATHERDQ instruction.
5176 /// \param m
5177 /// A pointer to the memory used for loading values.
5178 /// \param i
5179 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
5180 /// the first two elements are used.
5181 /// \param s
5182 /// A literal constant scale factor for the indexes in \a i. Must be
5183 /// 1, 2, 4, or 8.
5184 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
/* Expands to a single builtin call. The index operand is passed as a full
 * [4 x i32] vector, while the result and both macro-supplied vectors are
 * [2 x i64]; m is converted to a pointer to const long long. m, i and s
 * are each parenthesized and expanded exactly once.
 * NOTE(review): the undefined first operand and the all-ones fourth operand
 * are presumably the pass-through source and the element mask (all
 * elements enabled) -- confirm against the builtin's definition.
 * NOTE(review): presumably a macro so that s stays a constant expression
 * when it reaches the builtin -- confirm. */
5185 #define _mm_i32gather_epi64(m, i, s) \
5186 ((__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
5187 (long long const *)(m), \
5188 (__v4si)(__m128i)(i), \
5189 (__v2di)_mm_set1_epi64x(-1), (s)))
5191 /// Gathers four 64-bit integer values from memory \a m using scaled indexes
5192 /// from the 128-bit vector of [4 x i32] in \a i.
5194 /// \code{.operation}
5195 /// FOR element := 0 to 3
5196 /// j := element*64
5197 /// k := element*32
5198 /// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
5199 /// ENDFOR
5200 /// \endcode
5202 /// \headerfile <immintrin.h>
5204 /// \code
5205 /// __m256i _mm256_i32gather_epi64(const long long *m, __m128i i, const int s);
5206 /// \endcode
5208 /// This intrinsic corresponds to the \c VPGATHERDQ instruction.
5210 /// \param m
5211 /// A pointer to the memory used for loading values.
5212 /// \param i
5213 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
5214 /// \param s
5215 /// A literal constant scale factor for the indexes in \a i. Must be
5216 /// 1, 2, 4, or 8.
5217 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
/* Expands to a single builtin call. The index operand is a 128-bit
 * [4 x i32] vector, while the result and both macro-supplied vectors are
 * 256-bit [4 x i64] values, one 64-bit lane per 32-bit index. m, i and s
 * are each parenthesized and expanded exactly once.
 * NOTE(review): the undefined first operand and the all-ones fourth operand
 * are presumably the pass-through source and the element mask (all
 * elements enabled) -- confirm against the builtin's definition.
 * NOTE(review): presumably a macro so that s stays a constant expression
 * when it reaches the builtin -- confirm. */
5218 #define _mm256_i32gather_epi64(m, i, s) \
5219 ((__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
5220 (long long const *)(m), \
5221 (__v4si)(__m128i)(i), \
5222 (__v4di)_mm256_set1_epi64x(-1), (s)))
5224 /// Gathers two 64-bit integer values from memory \a m using scaled indexes
5225 /// from the 128-bit vector of [2 x i64] in \a i.
5227 /// \code{.operation}
5228 /// FOR element := 0 to 1
5229 /// j := element*64
5230 /// k := element*64
5231 /// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
5232 /// ENDFOR
5233 /// \endcode
5235 /// \headerfile <immintrin.h>
5237 /// \code
5238 /// __m128i _mm_i64gather_epi64(const long long *m, __m128i i, const int s);
5239 /// \endcode
5241 /// This intrinsic corresponds to the \c VPGATHERQQ instruction.
5243 /// \param m
5244 /// A pointer to the memory used for loading values.
5245 /// \param i
5246 /// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
5247 /// \param s
5248 /// A literal constant scale factor for the indexes in \a i. Must be
5249 /// 1, 2, 4, or 8.
5250 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
/* Expands to a single builtin call in which the index operand, the result
 * and both macro-supplied vectors are all 128-bit [2 x i64] values. m, i
 * and s are each parenthesized and expanded exactly once.
 * NOTE(review): the undefined first operand and the all-ones fourth operand
 * are presumably the pass-through source and the element mask (all
 * elements enabled) -- confirm against the builtin's definition.
 * NOTE(review): presumably a macro so that s stays a constant expression
 * when it reaches the builtin -- confirm. */
5251 #define _mm_i64gather_epi64(m, i, s) \
5252 ((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
5253 (long long const *)(m), \
5254 (__v2di)(__m128i)(i), \
5255 (__v2di)_mm_set1_epi64x(-1), (s)))
5257 /// Gathers four 64-bit integer values from memory \a m using scaled indexes
5258 /// from the 256-bit vector of [4 x i64] in \a i.
5260 /// \code{.operation}
5261 /// FOR element := 0 to 3
5262 /// j := element*64
5263 /// k := element*64
5264 /// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
5265 /// ENDFOR
5266 /// \endcode
5268 /// \headerfile <immintrin.h>
5270 /// \code
5271 /// __m256i _mm256_i64gather_epi64(const long long *m, __m256i i, const int s);
5272 /// \endcode
5274 /// This intrinsic corresponds to the \c VPGATHERQQ instruction.
5276 /// \param m
5277 /// A pointer to the memory used for loading values.
5278 /// \param i
5279 /// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5280 /// \param s
5281 /// A literal constant scale factor for the indexes in \a i. Must be
5282 /// 1, 2, 4, or 8.
5283 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
/* Expands to a single builtin call in which the index operand, the result
 * and both macro-supplied vectors are all 256-bit [4 x i64] values. m, i
 * and s are each parenthesized and expanded exactly once.
 * NOTE(review): the undefined first operand and the all-ones fourth operand
 * are presumably the pass-through source and the element mask (all
 * elements enabled) -- confirm against the builtin's definition.
 * NOTE(review): presumably a macro so that s stays a constant expression
 * when it reaches the builtin -- confirm. */
5284 #define _mm256_i64gather_epi64(m, i, s) \
5285 ((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
5286 (long long const *)(m), \
5287 (__v4di)(__m256i)(i), \
5288 (__v4di)_mm256_set1_epi64x(-1), (s)))
/* Drop the attribute helper macros defined at the top of this header so
 * they do not remain defined for code that comes after it. */
5290 #undef __DEFAULT_FN_ATTRS256
5291 #undef __DEFAULT_FN_ATTRS128
/* Closes the __AVX2INTRIN_H include guard opened at the top of the file. */
5293 #endif /* __AVX2INTRIN_H */