/*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
11 #error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
14 #ifndef __AVX2INTRIN_H
15 #define __AVX2INTRIN_H
/* Define the default attributes for the functions in this file.
   Both macros request always-inline, no debug info, and the "avx2" target
   feature; they differ only in the minimum vector width they declare
   (256 bits vs. 128 bits). */
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx2"), __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx2"), __min_vector_width__(128)))
/* SSE4 Multiple Packed Sums of Absolute Difference.
   Expands to __builtin_ia32_mpsadbw256 with X and Y reinterpreted as
   __v32qi and M converted to int; the result is cast back to __m256i.
   NOTE(review): implemented as a macro, presumably because M has to be a
   compile-time immediate -- confirm against the builtin's requirements. */
#define _mm256_mpsadbw_epu8(X, Y, M) \
  ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \
                                      (__v32qi)(__m256i)(Y), (int)(M)))
26 static __inline__ __m256i __DEFAULT_FN_ATTRS256
27 _mm256_abs_epi8(__m256i __a
)
29 return (__m256i
)__builtin_elementwise_abs((__v32qs
)__a
);
32 static __inline__ __m256i __DEFAULT_FN_ATTRS256
33 _mm256_abs_epi16(__m256i __a
)
35 return (__m256i
)__builtin_elementwise_abs((__v16hi
)__a
);
38 static __inline__ __m256i __DEFAULT_FN_ATTRS256
39 _mm256_abs_epi32(__m256i __a
)
41 return (__m256i
)__builtin_elementwise_abs((__v8si
)__a
);
44 /// Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit
45 /// integers using signed saturation, and returns the 256-bit result.
51 /// result[7+k:k] := SATURATE8(__a[15+j:j])
52 /// result[71+k:64+k] := SATURATE8(__b[15+j:j])
53 /// result[135+k:128+k] := SATURATE8(__a[143+j:128+j])
54 /// result[199+k:192+k] := SATURATE8(__b[143+j:128+j])
58 /// \headerfile <immintrin.h>
60 /// This intrinsic corresponds to the \c VPACKSSWB instruction.
63 /// A 256-bit vector of [16 x i16] used to generate result[63:0] and
66 /// A 256-bit vector of [16 x i16] used to generate result[127:64] and
68 /// \returns A 256-bit integer vector containing the result.
69 static __inline__ __m256i __DEFAULT_FN_ATTRS256
70 _mm256_packs_epi16(__m256i __a
, __m256i __b
)
72 return (__m256i
)__builtin_ia32_packsswb256((__v16hi
)__a
, (__v16hi
)__b
);
75 /// Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit
76 /// integers using signed saturation, and returns the resulting 256-bit
77 /// vector of [16 x i16].
83 /// result[15+k:k] := SATURATE16(__a[31+j:j])
84 /// result[79+k:64+k] := SATURATE16(__b[31+j:j])
85 /// result[143+k:128+k] := SATURATE16(__a[159+j:128+j])
86 /// result[207+k:192+k] := SATURATE16(__b[159+j:128+j])
90 /// \headerfile <immintrin.h>
92 /// This intrinsic corresponds to the \c VPACKSSDW instruction.
95 /// A 256-bit vector of [8 x i32] used to generate result[63:0] and
98 /// A 256-bit vector of [8 x i32] used to generate result[127:64] and
100 /// \returns A 256-bit vector of [16 x i16] containing the result.
101 static __inline__ __m256i __DEFAULT_FN_ATTRS256
102 _mm256_packs_epi32(__m256i __a
, __m256i __b
)
104 return (__m256i
)__builtin_ia32_packssdw256((__v8si
)__a
, (__v8si
)__b
);
107 /// Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers
108 /// using unsigned saturation, and returns the 256-bit result.
110 /// \code{.operation}
114 /// result[7+k:k] := SATURATE8U(__a[15+j:j])
115 /// result[71+k:64+k] := SATURATE8U(__b[15+j:j])
116 /// result[135+k:128+k] := SATURATE8U(__a[143+j:128+j])
117 /// result[199+k:192+k] := SATURATE8U(__b[143+j:128+j])
121 /// \headerfile <immintrin.h>
123 /// This intrinsic corresponds to the \c VPACKUSWB instruction.
126 /// A 256-bit vector of [16 x i16] used to generate result[63:0] and
129 /// A 256-bit vector of [16 x i16] used to generate result[127:64] and
131 /// \returns A 256-bit integer vector containing the result.
132 static __inline__ __m256i __DEFAULT_FN_ATTRS256
133 _mm256_packus_epi16(__m256i __a
, __m256i __b
)
135 return (__m256i
)__builtin_ia32_packuswb256((__v16hi
)__a
, (__v16hi
)__b
);
138 /// Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers
139 /// using unsigned saturation, and returns the resulting 256-bit vector of
142 /// \code{.operation}
146 /// result[15+k:k] := SATURATE16U(__V1[31+j:j])
147 /// result[79+k:64+k] := SATURATE16U(__V2[31+j:j])
148 /// result[143+k:128+k] := SATURATE16U(__V1[159+j:128+j])
149 /// result[207+k:192+k] := SATURATE16U(__V2[159+j:128+j])
153 /// \headerfile <immintrin.h>
155 /// This intrinsic corresponds to the \c VPACKUSDW instruction.
158 /// A 256-bit vector of [8 x i32] used to generate result[63:0] and
161 /// A 256-bit vector of [8 x i32] used to generate result[127:64] and
163 /// \returns A 256-bit vector of [16 x i16] containing the result.
164 static __inline__ __m256i __DEFAULT_FN_ATTRS256
165 _mm256_packus_epi32(__m256i __V1
, __m256i __V2
)
167 return (__m256i
) __builtin_ia32_packusdw256((__v8si
)__V1
, (__v8si
)__V2
);
170 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
171 /// vectors and returns the lower 8 bits of each sum in the corresponding
172 /// byte of the 256-bit integer vector result (overflow is ignored).
174 /// \headerfile <immintrin.h>
176 /// This intrinsic corresponds to the \c VPADDB instruction.
179 /// A 256-bit integer vector containing one of the source operands.
181 /// A 256-bit integer vector containing one of the source operands.
182 /// \returns A 256-bit integer vector containing the sums.
183 static __inline__ __m256i __DEFAULT_FN_ATTRS256
184 _mm256_add_epi8(__m256i __a
, __m256i __b
)
186 return (__m256i
)((__v32qu
)__a
+ (__v32qu
)__b
);
189 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
190 /// [16 x i16] and returns the lower 16 bits of each sum in the
191 /// corresponding element of the [16 x i16] result (overflow is ignored).
193 /// \headerfile <immintrin.h>
195 /// This intrinsic corresponds to the \c VPADDW instruction.
198 /// A 256-bit vector of [16 x i16] containing one of the source operands.
200 /// A 256-bit vector of [16 x i16] containing one of the source operands.
201 /// \returns A 256-bit vector of [16 x i16] containing the sums.
202 static __inline__ __m256i __DEFAULT_FN_ATTRS256
203 _mm256_add_epi16(__m256i __a
, __m256i __b
)
205 return (__m256i
)((__v16hu
)__a
+ (__v16hu
)__b
);
208 /// Adds 32-bit integers from corresponding elements of two 256-bit vectors of
209 /// [8 x i32] and returns the lower 32 bits of each sum in the corresponding
210 /// element of the [8 x i32] result (overflow is ignored).
212 /// \headerfile <immintrin.h>
214 /// This intrinsic corresponds to the \c VPADDD instruction.
217 /// A 256-bit vector of [8 x i32] containing one of the source operands.
219 /// A 256-bit vector of [8 x i32] containing one of the source operands.
220 /// \returns A 256-bit vector of [8 x i32] containing the sums.
221 static __inline__ __m256i __DEFAULT_FN_ATTRS256
222 _mm256_add_epi32(__m256i __a
, __m256i __b
)
224 return (__m256i
)((__v8su
)__a
+ (__v8su
)__b
);
227 /// Adds 64-bit integers from corresponding elements of two 256-bit vectors of
228 /// [4 x i64] and returns the lower 64 bits of each sum in the corresponding
229 /// element of the [4 x i64] result (overflow is ignored).
231 /// \headerfile <immintrin.h>
233 /// This intrinsic corresponds to the \c VPADDQ instruction.
236 /// A 256-bit vector of [4 x i64] containing one of the source operands.
238 /// A 256-bit vector of [4 x i64] containing one of the source operands.
239 /// \returns A 256-bit vector of [4 x i64] containing the sums.
240 static __inline__ __m256i __DEFAULT_FN_ATTRS256
241 _mm256_add_epi64(__m256i __a
, __m256i __b
)
243 return (__m256i
)((__v4du
)__a
+ (__v4du
)__b
);
246 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
247 /// vectors using signed saturation, and returns each sum in the
248 /// corresponding byte of the 256-bit integer vector result.
250 /// \headerfile <immintrin.h>
252 /// This intrinsic corresponds to the \c VPADDSB instruction.
255 /// A 256-bit integer vector containing one of the source operands.
257 /// A 256-bit integer vector containing one of the source operands.
258 /// \returns A 256-bit integer vector containing the sums.
259 static __inline__ __m256i __DEFAULT_FN_ATTRS256
260 _mm256_adds_epi8(__m256i __a
, __m256i __b
)
262 return (__m256i
)__builtin_elementwise_add_sat((__v32qs
)__a
, (__v32qs
)__b
);
265 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
266 /// [16 x i16] using signed saturation, and returns the [16 x i16] result.
268 /// \headerfile <immintrin.h>
270 /// This intrinsic corresponds to the \c VPADDSW instruction.
273 /// A 256-bit vector of [16 x i16] containing one of the source operands.
275 /// A 256-bit vector of [16 x i16] containing one of the source operands.
276 /// \returns A 256-bit vector of [16 x i16] containing the sums.
277 static __inline__ __m256i __DEFAULT_FN_ATTRS256
278 _mm256_adds_epi16(__m256i __a
, __m256i __b
)
280 return (__m256i
)__builtin_elementwise_add_sat((__v16hi
)__a
, (__v16hi
)__b
);
283 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
284 /// vectors using unsigned saturation, and returns each sum in the
285 /// corresponding byte of the 256-bit integer vector result.
287 /// \headerfile <immintrin.h>
289 /// This intrinsic corresponds to the \c VPADDUSB instruction.
292 /// A 256-bit integer vector containing one of the source operands.
294 /// A 256-bit integer vector containing one of the source operands.
295 /// \returns A 256-bit integer vector containing the sums.
296 static __inline__ __m256i __DEFAULT_FN_ATTRS256
297 _mm256_adds_epu8(__m256i __a
, __m256i __b
)
299 return (__m256i
)__builtin_elementwise_add_sat((__v32qu
)__a
, (__v32qu
)__b
);
302 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
303 /// [16 x i16] using unsigned saturation, and returns the [16 x i16] result.
305 /// \headerfile <immintrin.h>
307 /// This intrinsic corresponds to the \c VPADDUSW instruction.
310 /// A 256-bit vector of [16 x i16] containing one of the source operands.
312 /// A 256-bit vector of [16 x i16] containing one of the source operands.
313 /// \returns A 256-bit vector of [16 x i16] containing the sums.
314 static __inline__ __m256i __DEFAULT_FN_ATTRS256
315 _mm256_adds_epu16(__m256i __a
, __m256i __b
)
317 return (__m256i
)__builtin_elementwise_add_sat((__v16hu
)__a
, (__v16hu
)__b
);
/// Uses the lower half of the 256-bit vector \a a as the upper half of a
///    temporary 256-bit value, and the lower half of the 256-bit vector \a b
///    as the lower half of the temporary value. Right-shifts the temporary
///    value by \a n bytes, and uses the lower 16 bytes of the shifted value
///    as the lower 16 bytes of the result. Uses the upper halves of \a a and
///    \a b to make another temporary value, right shifts by \a n, and uses
///    the lower 16 bytes of the shifted value as the upper 16 bytes of the
///    result.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_alignr_epi8(__m256i a, __m256i b, const int n);
/// \endcode
///
/// This intrinsic corresponds to the \c VPALIGNR instruction.
///
/// \param a
///    A 256-bit integer vector containing source values.
/// \param b
///    A 256-bit integer vector containing source values.
/// \param n
///    An immediate value specifying the number of bytes to shift.
/// \returns A 256-bit integer vector containing the result.
#define _mm256_alignr_epi8(a, b, n) \
  ((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
                                      (__v32qi)(__m256i)(b), (n)))
348 static __inline__ __m256i __DEFAULT_FN_ATTRS256
349 _mm256_and_si256(__m256i __a
, __m256i __b
)
351 return (__m256i
)((__v4du
)__a
& (__v4du
)__b
);
354 static __inline__ __m256i __DEFAULT_FN_ATTRS256
355 _mm256_andnot_si256(__m256i __a
, __m256i __b
)
357 return (__m256i
)(~(__v4du
)__a
& (__v4du
)__b
);
360 static __inline__ __m256i __DEFAULT_FN_ATTRS256
361 _mm256_avg_epu8(__m256i __a
, __m256i __b
)
363 return (__m256i
)__builtin_ia32_pavgb256((__v32qi
)__a
, (__v32qi
)__b
);
366 static __inline__ __m256i __DEFAULT_FN_ATTRS256
367 _mm256_avg_epu16(__m256i __a
, __m256i __b
)
369 return (__m256i
)__builtin_ia32_pavgw256((__v16hi
)__a
, (__v16hi
)__b
);
372 /// Merges 8-bit integer values from either of the two 256-bit vectors
373 /// \a __V1 or \a __V2, as specified by the 256-bit mask \a __M and returns
374 /// the resulting 256-bit integer vector.
376 /// \code{.operation}
380 /// result[7+j:j] := __V1[7+j:j]
382 /// result[7+j:j] := __V2[7+j:j]
387 /// \headerfile <immintrin.h>
389 /// This intrinsic corresponds to the \c VPBLENDVB instruction.
392 /// A 256-bit integer vector containing source values.
394 /// A 256-bit integer vector containing source values.
396 /// A 256-bit integer vector, with bit [7] of each byte specifying the
397 /// source for each corresponding byte of the result. When the mask bit
398 /// is 0, the byte is copied from \a __V1; otherwise, it is copied from
400 /// \returns A 256-bit integer vector containing the result.
401 static __inline__ __m256i __DEFAULT_FN_ATTRS256
402 _mm256_blendv_epi8(__m256i __V1
, __m256i __V2
, __m256i __M
)
404 return (__m256i
)__builtin_ia32_pblendvb256((__v32qi
)__V1
, (__v32qi
)__V2
,
/// Merges 16-bit integer values from either of the two 256-bit vectors
///    \a V1 or \a V2, as specified by the immediate integer operand \a M,
///    and returns the resulting 256-bit vector of [16 x i16].
///
/// \code{.operation}
/// FOR i := 0 TO 7
///   j := i*16
///   IF M[i] == 0
///     result[15+j:j] := V1[15+j:j]
///     result[143+j:128+j] := V1[143+j:128+j]
///   ELSE
///     result[15+j:j] := V2[15+j:j]
///     result[143+j:128+j] := V2[143+j:128+j]
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_blend_epi16(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPBLENDW instruction.
///
/// \param V1
///    A 256-bit vector of [16 x i16] containing source values.
/// \param V2
///    A 256-bit vector of [16 x i16] containing source values.
/// \param M
///    An immediate 8-bit integer operand, with bits [7:0] specifying the
///    source for each element of the result. The position of the mask bit
///    corresponds to the index of a copied value. When a mask bit is 0, the
///    element is copied from \a V1; otherwise, it is copied from \a V2.
///    \a M[0] determines the source for elements 0 and 8, \a M[1] for
///    elements 1 and 9, and so forth.
/// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_blend_epi16(V1, V2, M) \
  ((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \
                                      (__v16hi)(__m256i)(V2), (int)(M)))
449 static __inline__ __m256i __DEFAULT_FN_ATTRS256
450 _mm256_cmpeq_epi8(__m256i __a
, __m256i __b
)
452 return (__m256i
)((__v32qi
)__a
== (__v32qi
)__b
);
455 static __inline__ __m256i __DEFAULT_FN_ATTRS256
456 _mm256_cmpeq_epi16(__m256i __a
, __m256i __b
)
458 return (__m256i
)((__v16hi
)__a
== (__v16hi
)__b
);
461 static __inline__ __m256i __DEFAULT_FN_ATTRS256
462 _mm256_cmpeq_epi32(__m256i __a
, __m256i __b
)
464 return (__m256i
)((__v8si
)__a
== (__v8si
)__b
);
467 static __inline__ __m256i __DEFAULT_FN_ATTRS256
468 _mm256_cmpeq_epi64(__m256i __a
, __m256i __b
)
470 return (__m256i
)((__v4di
)__a
== (__v4di
)__b
);
473 static __inline__ __m256i __DEFAULT_FN_ATTRS256
474 _mm256_cmpgt_epi8(__m256i __a
, __m256i __b
)
476 /* This function always performs a signed comparison, but __v32qi is a char
477 which may be signed or unsigned, so use __v32qs. */
478 return (__m256i
)((__v32qs
)__a
> (__v32qs
)__b
);
481 static __inline__ __m256i __DEFAULT_FN_ATTRS256
482 _mm256_cmpgt_epi16(__m256i __a
, __m256i __b
)
484 return (__m256i
)((__v16hi
)__a
> (__v16hi
)__b
);
487 static __inline__ __m256i __DEFAULT_FN_ATTRS256
488 _mm256_cmpgt_epi32(__m256i __a
, __m256i __b
)
490 return (__m256i
)((__v8si
)__a
> (__v8si
)__b
);
493 static __inline__ __m256i __DEFAULT_FN_ATTRS256
494 _mm256_cmpgt_epi64(__m256i __a
, __m256i __b
)
496 return (__m256i
)((__v4di
)__a
> (__v4di
)__b
);
499 /// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
500 /// vectors of [16 x i16] and returns the lower 16 bits of each sum in an
501 /// element of the [16 x i16] result (overflow is ignored). Sums from
502 /// \a __a are returned in the lower 64 bits of each 128-bit half of the
503 /// result; sums from \a __b are returned in the upper 64 bits of each
504 /// 128-bit half of the result.
506 /// \code{.operation}
509 /// result[j+15:j] := __a[j+15:j] + __a[j+31:j+16]
510 /// result[j+31:j+16] := __a[j+47:j+32] + __a[j+63:j+48]
511 /// result[j+47:j+32] := __a[j+79:j+64] + __a[j+95:j+80]
512 /// result[j+63:j+48] := __a[j+111:j+96] + __a[j+127:j+112]
513 /// result[j+79:j+64] := __b[j+15:j] + __b[j+31:j+16]
514 /// result[j+95:j+80] := __b[j+47:j+32] + __b[j+63:j+48]
515 /// result[j+111:j+96] := __b[j+79:j+64] + __b[j+95:j+80]
516 /// result[j+127:j+112] := __b[j+111:j+96] + __b[j+127:j+112]
520 /// \headerfile <immintrin.h>
522 /// This intrinsic corresponds to the \c VPHADDW instruction.
525 /// A 256-bit vector of [16 x i16] containing one of the source operands.
527 /// A 256-bit vector of [16 x i16] containing one of the source operands.
528 /// \returns A 256-bit vector of [16 x i16] containing the sums.
529 static __inline__ __m256i __DEFAULT_FN_ATTRS256
530 _mm256_hadd_epi16(__m256i __a
, __m256i __b
)
532 return (__m256i
)__builtin_ia32_phaddw256((__v16hi
)__a
, (__v16hi
)__b
);
535 /// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit
536 /// vectors of [8 x i32] and returns the lower 32 bits of each sum in an
537 /// element of the [8 x i32] result (overflow is ignored). Sums from \a __a
538 /// are returned in the lower 64 bits of each 128-bit half of the result;
539 /// sums from \a __b are returned in the upper 64 bits of each 128-bit half
542 /// \code{.operation}
545 /// result[j+31:j] := __a[j+31:j] + __a[j+63:j+32]
546 /// result[j+63:j+32] := __a[j+95:j+64] + __a[j+127:j+96]
547 /// result[j+95:j+64] := __b[j+31:j] + __b[j+63:j+32]
548 /// result[j+127:j+96] := __b[j+95:j+64] + __b[j+127:j+96]
552 /// \headerfile <immintrin.h>
554 /// This intrinsic corresponds to the \c VPHADDD instruction.
557 /// A 256-bit vector of [8 x i32] containing one of the source operands.
559 /// A 256-bit vector of [8 x i32] containing one of the source operands.
560 /// \returns A 256-bit vector of [8 x i32] containing the sums.
561 static __inline__ __m256i __DEFAULT_FN_ATTRS256
562 _mm256_hadd_epi32(__m256i __a
, __m256i __b
)
564 return (__m256i
)__builtin_ia32_phaddd256((__v8si
)__a
, (__v8si
)__b
);
567 /// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
568 /// vectors of [16 x i16] using signed saturation and returns each sum in
569 /// an element of the [16 x i16] result. Sums from \a __a are returned in
570 /// the lower 64 bits of each 128-bit half of the result; sums from \a __b
571 /// are returned in the upper 64 bits of each 128-bit half of the result.
573 /// \code{.operation}
576 /// result[j+15:j] := SATURATE16(__a[j+15:j] + __a[j+31:j+16])
577 /// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] + __a[j+63:j+48])
578 /// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] + __a[j+95:j+80])
579 /// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] + __a[j+127:j+112])
580 /// result[j+79:j+64] := SATURATE16(__b[j+15:j] + __b[j+31:j+16])
581 /// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] + __b[j+63:j+48])
582 /// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] + __b[j+95:j+80])
583 /// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] + __b[j+127:j+112])
587 /// \headerfile <immintrin.h>
589 /// This intrinsic corresponds to the \c VPHADDSW instruction.
592 /// A 256-bit vector of [16 x i16] containing one of the source operands.
594 /// A 256-bit vector of [16 x i16] containing one of the source operands.
595 /// \returns A 256-bit vector of [16 x i16] containing the sums.
596 static __inline__ __m256i __DEFAULT_FN_ATTRS256
597 _mm256_hadds_epi16(__m256i __a
, __m256i __b
)
599 return (__m256i
)__builtin_ia32_phaddsw256((__v16hi
)__a
, (__v16hi
)__b
);
602 /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
603 /// vectors of [16 x i16] and returns the lower 16 bits of each difference
604 /// in an element of the [16 x i16] result (overflow is ignored).
605 /// Differences from \a __a are returned in the lower 64 bits of each
606 /// 128-bit half of the result; differences from \a __b are returned in the
607 /// upper 64 bits of each 128-bit half of the result.
609 /// \code{.operation}
612 /// result[j+15:j] := __a[j+15:j] - __a[j+31:j+16]
613 /// result[j+31:j+16] := __a[j+47:j+32] - __a[j+63:j+48]
614 /// result[j+47:j+32] := __a[j+79:j+64] - __a[j+95:j+80]
615 /// result[j+63:j+48] := __a[j+111:j+96] - __a[j+127:j+112]
616 /// result[j+79:j+64] := __b[j+15:j] - __b[j+31:j+16]
617 /// result[j+95:j+80] := __b[j+47:j+32] - __b[j+63:j+48]
618 /// result[j+111:j+96] := __b[j+79:j+64] - __b[j+95:j+80]
619 /// result[j+127:j+112] := __b[j+111:j+96] - __b[j+127:j+112]
623 /// \headerfile <immintrin.h>
625 /// This intrinsic corresponds to the \c VPHSUBW instruction.
628 /// A 256-bit vector of [16 x i16] containing one of the source operands.
630 /// A 256-bit vector of [16 x i16] containing one of the source operands.
631 /// \returns A 256-bit vector of [16 x i16] containing the differences.
632 static __inline__ __m256i __DEFAULT_FN_ATTRS256
633 _mm256_hsub_epi16(__m256i __a
, __m256i __b
)
635 return (__m256i
)__builtin_ia32_phsubw256((__v16hi
)__a
, (__v16hi
)__b
);
638 /// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit
639 /// vectors of [8 x i32] and returns the lower 32 bits of each difference in
640 /// an element of the [8 x i32] result (overflow is ignored). Differences
641 /// from \a __a are returned in the lower 64 bits of each 128-bit half of
642 /// the result; differences from \a __b are returned in the upper 64 bits
643 /// of each 128-bit half of the result.
645 /// \code{.operation}
648 /// result[j+31:j] := __a[j+31:j] - __a[j+63:j+32]
649 /// result[j+63:j+32] := __a[j+95:j+64] - __a[j+127:j+96]
650 /// result[j+95:j+64] := __b[j+31:j] - __b[j+63:j+32]
651 /// result[j+127:j+96] := __b[j+95:j+64] - __b[j+127:j+96]
655 /// \headerfile <immintrin.h>
657 /// This intrinsic corresponds to the \c VPHSUBD instruction.
660 /// A 256-bit vector of [8 x i32] containing one of the source operands.
662 /// A 256-bit vector of [8 x i32] containing one of the source operands.
663 /// \returns A 256-bit vector of [8 x i32] containing the differences.
664 static __inline__ __m256i __DEFAULT_FN_ATTRS256
665 _mm256_hsub_epi32(__m256i __a
, __m256i __b
)
667 return (__m256i
)__builtin_ia32_phsubd256((__v8si
)__a
, (__v8si
)__b
);
670 /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
671 /// vectors of [16 x i16] using signed saturation and returns each sum in
672 /// an element of the [16 x i16] result. Differences from \a __a are
673 /// returned in the lower 64 bits of each 128-bit half of the result;
674 /// differences from \a __b are returned in the upper 64 bits of each
675 /// 128-bit half of the result.
677 /// \code{.operation}
680 /// result[j+15:j] := SATURATE16(__a[j+15:j] - __a[j+31:j+16])
681 /// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] - __a[j+63:j+48])
682 /// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] - __a[j+95:j+80])
683 /// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] - __a[j+127:j+112])
684 /// result[j+79:j+64] := SATURATE16(__b[j+15:j] - __b[j+31:j+16])
685 /// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] - __b[j+63:j+48])
686 /// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] - __b[j+95:j+80])
687 /// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] - __b[j+127:j+112])
691 /// \headerfile <immintrin.h>
693 /// This intrinsic corresponds to the \c VPHSUBSW instruction.
696 /// A 256-bit vector of [16 x i16] containing one of the source operands.
698 /// A 256-bit vector of [16 x i16] containing one of the source operands.
699 /// \returns A 256-bit vector of [16 x i16] containing the differences.
700 static __inline__ __m256i __DEFAULT_FN_ATTRS256
701 _mm256_hsubs_epi16(__m256i __a
, __m256i __b
)
703 return (__m256i
)__builtin_ia32_phsubsw256((__v16hi
)__a
, (__v16hi
)__b
);
706 /// Multiplies each unsigned byte from the 256-bit integer vector in \a __a
707 /// with the corresponding signed byte from the 256-bit integer vector in
708 /// \a __b, forming signed 16-bit intermediate products. Adds adjacent
709 /// pairs of those products using signed saturation to form 16-bit sums
710 /// returned as elements of the [16 x i16] result.
712 /// \code{.operation}
715 /// temp1 := __a[j+7:j] * __b[j+7:j]
716 /// temp2 := __a[j+15:j+8] * __b[j+15:j+8]
717 /// result[j+15:j] := SATURATE16(temp1 + temp2)
721 /// \headerfile <immintrin.h>
723 /// This intrinsic corresponds to the \c VPMADDUBSW instruction.
726 /// A 256-bit vector containing one of the source operands.
728 /// A 256-bit vector containing one of the source operands.
729 /// \returns A 256-bit vector of [16 x i16] containing the result.
730 static __inline__ __m256i __DEFAULT_FN_ATTRS256
731 _mm256_maddubs_epi16(__m256i __a
, __m256i __b
)
733 return (__m256i
)__builtin_ia32_pmaddubsw256((__v32qi
)__a
, (__v32qi
)__b
);
736 /// Multiplies corresponding 16-bit elements of two 256-bit vectors of
737 /// [16 x i16], forming 32-bit intermediate products, and adds pairs of
738 /// those products to form 32-bit sums returned as elements of the
739 /// [8 x i32] result.
741 /// There is only one wraparound case: when all four of the 16-bit sources
742 /// are \c 0x8000, the result will be \c 0x80000000.
744 /// \code{.operation}
747 /// temp1 := __a[j+15:j] * __b[j+15:j]
748 /// temp2 := __a[j+31:j+16] * __b[j+31:j+16]
749 /// result[j+31:j] := temp1 + temp2
753 /// \headerfile <immintrin.h>
755 /// This intrinsic corresponds to the \c VPMADDWD instruction.
758 /// A 256-bit vector of [16 x i16] containing one of the source operands.
760 /// A 256-bit vector of [16 x i16] containing one of the source operands.
761 /// \returns A 256-bit vector of [8 x i32] containing the result.
762 static __inline__ __m256i __DEFAULT_FN_ATTRS256
763 _mm256_madd_epi16(__m256i __a
, __m256i __b
)
765 return (__m256i
)__builtin_ia32_pmaddwd256((__v16hi
)__a
, (__v16hi
)__b
);
768 static __inline__ __m256i __DEFAULT_FN_ATTRS256
769 _mm256_max_epi8(__m256i __a
, __m256i __b
)
771 return (__m256i
)__builtin_elementwise_max((__v32qs
)__a
, (__v32qs
)__b
);
774 static __inline__ __m256i __DEFAULT_FN_ATTRS256
775 _mm256_max_epi16(__m256i __a
, __m256i __b
)
777 return (__m256i
)__builtin_elementwise_max((__v16hi
)__a
, (__v16hi
)__b
);
780 static __inline__ __m256i __DEFAULT_FN_ATTRS256
781 _mm256_max_epi32(__m256i __a
, __m256i __b
)
783 return (__m256i
)__builtin_elementwise_max((__v8si
)__a
, (__v8si
)__b
);
786 static __inline__ __m256i __DEFAULT_FN_ATTRS256
787 _mm256_max_epu8(__m256i __a
, __m256i __b
)
789 return (__m256i
)__builtin_elementwise_max((__v32qu
)__a
, (__v32qu
)__b
);
792 static __inline__ __m256i __DEFAULT_FN_ATTRS256
793 _mm256_max_epu16(__m256i __a
, __m256i __b
)
795 return (__m256i
)__builtin_elementwise_max((__v16hu
)__a
, (__v16hu
)__b
);
798 static __inline__ __m256i __DEFAULT_FN_ATTRS256
799 _mm256_max_epu32(__m256i __a
, __m256i __b
)
801 return (__m256i
)__builtin_elementwise_max((__v8su
)__a
, (__v8su
)__b
);
804 static __inline__ __m256i __DEFAULT_FN_ATTRS256
805 _mm256_min_epi8(__m256i __a
, __m256i __b
)
807 return (__m256i
)__builtin_elementwise_min((__v32qs
)__a
, (__v32qs
)__b
);
810 static __inline__ __m256i __DEFAULT_FN_ATTRS256
811 _mm256_min_epi16(__m256i __a
, __m256i __b
)
813 return (__m256i
)__builtin_elementwise_min((__v16hi
)__a
, (__v16hi
)__b
);
816 static __inline__ __m256i __DEFAULT_FN_ATTRS256
817 _mm256_min_epi32(__m256i __a
, __m256i __b
)
819 return (__m256i
)__builtin_elementwise_min((__v8si
)__a
, (__v8si
)__b
);
822 static __inline__ __m256i __DEFAULT_FN_ATTRS256
823 _mm256_min_epu8(__m256i __a
, __m256i __b
)
825 return (__m256i
)__builtin_elementwise_min((__v32qu
)__a
, (__v32qu
)__b
);
828 static __inline__ __m256i __DEFAULT_FN_ATTRS256
829 _mm256_min_epu16(__m256i __a
, __m256i __b
)
831 return (__m256i
)__builtin_elementwise_min((__v16hu
)__a
, (__v16hu
)__b
);
834 static __inline__ __m256i __DEFAULT_FN_ATTRS256
835 _mm256_min_epu32(__m256i __a
, __m256i __b
)
837 return (__m256i
)__builtin_elementwise_min((__v8su
)__a
, (__v8su
)__b
);
840 static __inline__
int __DEFAULT_FN_ATTRS256
841 _mm256_movemask_epi8(__m256i __a
)
843 return __builtin_ia32_pmovmskb256((__v32qi
)__a
);
846 static __inline__ __m256i __DEFAULT_FN_ATTRS256
847 _mm256_cvtepi8_epi16(__m128i __V
)
849 /* This function always performs a signed extension, but __v16qi is a char
850 which may be signed or unsigned, so use __v16qs. */
851 return (__m256i
)__builtin_convertvector((__v16qs
)__V
, __v16hi
);
854 static __inline__ __m256i __DEFAULT_FN_ATTRS256
855 _mm256_cvtepi8_epi32(__m128i __V
)
857 /* This function always performs a signed extension, but __v16qi is a char
858 which may be signed or unsigned, so use __v16qs. */
859 return (__m256i
)__builtin_convertvector(__builtin_shufflevector((__v16qs
)__V
, (__v16qs
)__V
, 0, 1, 2, 3, 4, 5, 6, 7), __v8si
);
862 static __inline__ __m256i __DEFAULT_FN_ATTRS256
863 _mm256_cvtepi8_epi64(__m128i __V
)
865 /* This function always performs a signed extension, but __v16qi is a char
866 which may be signed or unsigned, so use __v16qs. */
867 return (__m256i
)__builtin_convertvector(__builtin_shufflevector((__v16qs
)__V
, (__v16qs
)__V
, 0, 1, 2, 3), __v4di
);
870 static __inline__ __m256i __DEFAULT_FN_ATTRS256
871 _mm256_cvtepi16_epi32(__m128i __V
)
873 return (__m256i
)__builtin_convertvector((__v8hi
)__V
, __v8si
);
876 static __inline__ __m256i __DEFAULT_FN_ATTRS256
877 _mm256_cvtepi16_epi64(__m128i __V
)
879 return (__m256i
)__builtin_convertvector(__builtin_shufflevector((__v8hi
)__V
, (__v8hi
)__V
, 0, 1, 2, 3), __v4di
);
882 static __inline__ __m256i __DEFAULT_FN_ATTRS256
883 _mm256_cvtepi32_epi64(__m128i __V
)
885 return (__m256i
)__builtin_convertvector((__v4si
)__V
, __v4di
);
888 static __inline__ __m256i __DEFAULT_FN_ATTRS256
889 _mm256_cvtepu8_epi16(__m128i __V
)
891 return (__m256i
)__builtin_convertvector((__v16qu
)__V
, __v16hi
);
894 static __inline__ __m256i __DEFAULT_FN_ATTRS256
895 _mm256_cvtepu8_epi32(__m128i __V
)
897 return (__m256i
)__builtin_convertvector(__builtin_shufflevector((__v16qu
)__V
, (__v16qu
)__V
, 0, 1, 2, 3, 4, 5, 6, 7), __v8si
);
900 static __inline__ __m256i __DEFAULT_FN_ATTRS256
901 _mm256_cvtepu8_epi64(__m128i __V
)
903 return (__m256i
)__builtin_convertvector(__builtin_shufflevector((__v16qu
)__V
, (__v16qu
)__V
, 0, 1, 2, 3), __v4di
);
906 static __inline__ __m256i __DEFAULT_FN_ATTRS256
907 _mm256_cvtepu16_epi32(__m128i __V
)
909 return (__m256i
)__builtin_convertvector((__v8hu
)__V
, __v8si
);
912 static __inline__ __m256i __DEFAULT_FN_ATTRS256
913 _mm256_cvtepu16_epi64(__m128i __V
)
915 return (__m256i
)__builtin_convertvector(__builtin_shufflevector((__v8hu
)__V
, (__v8hu
)__V
, 0, 1, 2, 3), __v4di
);
918 static __inline__ __m256i __DEFAULT_FN_ATTRS256
919 _mm256_cvtepu32_epi64(__m128i __V
)
921 return (__m256i
)__builtin_convertvector((__v4su
)__V
, __v4di
);
924 /// Multiplies signed 32-bit integers from even-numbered elements of two
925 /// 256-bit vectors of [8 x i32] and returns the 64-bit products in the
926 /// [4 x i64] result.
928 /// \code{.operation}
929 /// result[63:0] := __a[31:0] * __b[31:0]
930 /// result[127:64] := __a[95:64] * __b[95:64]
931 /// result[191:128] := __a[159:128] * __b[159:128]
932 /// result[255:192] := __a[223:192] * __b[223:192]
935 /// \headerfile <immintrin.h>
937 /// This intrinsic corresponds to the \c VPMULDQ instruction.
940 /// A 256-bit vector of [8 x i32] containing one of the source operands.
942 /// A 256-bit vector of [8 x i32] containing one of the source operands.
943 /// \returns A 256-bit vector of [4 x i64] containing the products.
944 static __inline__ __m256i __DEFAULT_FN_ATTRS256
945 _mm256_mul_epi32(__m256i __a
, __m256i __b
)
947 return (__m256i
)__builtin_ia32_pmuldq256((__v8si
)__a
, (__v8si
)__b
);
950 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
951 /// [16 x i16], truncates the 32-bit results to the most significant 18
952 /// bits, rounds by adding 1, and returns bits [16:1] of each rounded
953 /// product in the [16 x i16] result.
955 /// \code{.operation}
958 /// temp := ((__a[j+15:j] * __b[j+15:j]) >> 14) + 1
959 /// result[j+15:j] := temp[16:1]
962 /// \headerfile <immintrin.h>
964 /// This intrinsic corresponds to the \c VPMULHRSW instruction.
967 /// A 256-bit vector of [16 x i16] containing one of the source operands.
969 /// A 256-bit vector of [16 x i16] containing one of the source operands.
970 /// \returns A 256-bit vector of [16 x i16] containing the rounded products.
971 static __inline__ __m256i __DEFAULT_FN_ATTRS256
972 _mm256_mulhrs_epi16(__m256i __a
, __m256i __b
)
974 return (__m256i
)__builtin_ia32_pmulhrsw256((__v16hi
)__a
, (__v16hi
)__b
);
977 /// Multiplies unsigned 16-bit integer elements of two 256-bit vectors of
978 /// [16 x i16], and returns the upper 16 bits of each 32-bit product in the
979 /// [16 x i16] result.
981 /// \headerfile <immintrin.h>
983 /// This intrinsic corresponds to the \c VPMULHUW instruction.
986 /// A 256-bit vector of [16 x i16] containing one of the source operands.
988 /// A 256-bit vector of [16 x i16] containing one of the source operands.
989 /// \returns A 256-bit vector of [16 x i16] containing the products.
990 static __inline__ __m256i __DEFAULT_FN_ATTRS256
991 _mm256_mulhi_epu16(__m256i __a
, __m256i __b
)
993 return (__m256i
)__builtin_ia32_pmulhuw256((__v16hi
)__a
, (__v16hi
)__b
);
996 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
997 /// [16 x i16], and returns the upper 16 bits of each 32-bit product in the
998 /// [16 x i16] result.
1000 /// \headerfile <immintrin.h>
1002 /// This intrinsic corresponds to the \c VPMULHW instruction.
1005 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1007 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1008 /// \returns A 256-bit vector of [16 x i16] containing the products.
1009 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1010 _mm256_mulhi_epi16(__m256i __a
, __m256i __b
)
1012 return (__m256i
)__builtin_ia32_pmulhw256((__v16hi
)__a
, (__v16hi
)__b
);
1015 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1016 /// [16 x i16], and returns the lower 16 bits of each 32-bit product in the
1017 /// [16 x i16] result.
1019 /// \headerfile <immintrin.h>
1021 /// This intrinsic corresponds to the \c VPMULLW instruction.
1024 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1026 /// A 256-bit vector of [16 x i16] containing one of the source operands.
1027 /// \returns A 256-bit vector of [16 x i16] containing the products.
1028 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1029 _mm256_mullo_epi16(__m256i __a
, __m256i __b
)
1031 return (__m256i
)((__v16hu
)__a
* (__v16hu
)__b
);
1034 /// Multiplies signed 32-bit integer elements of two 256-bit vectors of
1035 /// [8 x i32], and returns the lower 32 bits of each 64-bit product in the
1036 /// [8 x i32] result.
1038 /// \headerfile <immintrin.h>
1040 /// This intrinsic corresponds to the \c VPMULLD instruction.
1043 /// A 256-bit vector of [8 x i32] containing one of the source operands.
1045 /// A 256-bit vector of [8 x i32] containing one of the source operands.
1046 /// \returns A 256-bit vector of [8 x i32] containing the products.
1047 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1048 _mm256_mullo_epi32 (__m256i __a
, __m256i __b
)
1050 return (__m256i
)((__v8su
)__a
* (__v8su
)__b
);
1053 /// Multiplies unsigned 32-bit integers from even-numbered elements of two
1054 /// 256-bit vectors of [8 x i32] and returns the 64-bit products in the
1055 /// [4 x i64] result.
1057 /// \code{.operation}
1058 /// result[63:0] := __a[31:0] * __b[31:0]
1059 /// result[127:64] := __a[95:64] * __b[95:64]
1060 /// result[191:128] := __a[159:128] * __b[159:128]
1061 /// result[255:192] := __a[223:192] * __b[223:192]
1064 /// \headerfile <immintrin.h>
1066 /// This intrinsic corresponds to the \c VPMULUDQ instruction.
1069 /// A 256-bit vector of [8 x i32] containing one of the source operands.
1071 /// A 256-bit vector of [8 x i32] containing one of the source operands.
1072 /// \returns A 256-bit vector of [4 x i64] containing the products.
1073 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1074 _mm256_mul_epu32(__m256i __a
, __m256i __b
)
1076 return __builtin_ia32_pmuludq256((__v8si
)__a
, (__v8si
)__b
);
1079 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1080 _mm256_or_si256(__m256i __a
, __m256i __b
)
1082 return (__m256i
)((__v4du
)__a
| (__v4du
)__b
);
1085 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1086 _mm256_sad_epu8(__m256i __a
, __m256i __b
)
1088 return __builtin_ia32_psadbw256((__v32qi
)__a
, (__v32qi
)__b
);
1091 /// Shuffles 8-bit integers in the 256-bit integer vector \a __a according
1092 /// to control information in the 256-bit integer vector \a __b, and
1093 /// returns the 256-bit result. In effect there are two separate 128-bit
1094 /// shuffles in the lower and upper halves.
1096 /// \code{.operation}
1097 /// FOR i := 0 TO 31
1099 /// IF __b[j+7] == 1
1100 /// result[j+7:j] := 0
1102 /// k := __b[j+3:j] * 8
1106 /// result[j+7:j] := __a[k+7:k]
1111 /// \headerfile <immintrin.h>
1113 /// This intrinsic corresponds to the \c VPSHUFB instruction.
1116 /// A 256-bit integer vector containing source values.
1118 /// A 256-bit integer vector containing control information to determine
1119 /// what goes into the corresponding byte of the result. If bit 7 of the
1120 /// control byte is 1, the result byte is 0; otherwise, bits 3:0 of the
1121 /// control byte specify the index (within the same 128-bit half) of \a __a
1122 /// to copy to the result byte.
1123 /// \returns A 256-bit integer vector containing the result.
1124 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1125 _mm256_shuffle_epi8(__m256i __a
, __m256i __b
)
1127 return (__m256i
)__builtin_ia32_pshufb256((__v32qi
)__a
, (__v32qi
)__b
);
1130 /// Shuffles 32-bit integers from the 256-bit vector of [8 x i32] in \a a
1131 /// according to control information in the integer literal \a imm, and
1132 /// returns the 256-bit result. In effect there are two parallel 128-bit
1133 /// shuffles in the lower and upper halves.
1135 /// \code{.operation}
1138 /// k := (imm >> i*2)[1:0] * 32
1139 /// result[j+31:j] := a[k+31:k]
1140 /// result[128+j+31:128+j] := a[128+k+31:128+k]
1144 /// \headerfile <immintrin.h>
1147 /// __m256i _mm256_shuffle_epi32(__m256i a, const int imm);
1150 /// This intrinsic corresponds to the \c VPSHUFD instruction.
1153 /// A 256-bit vector of [8 x i32] containing source values.
1155 /// An immediate 8-bit value specifying which elements to copy from \a a.
1156 /// \a imm[1:0] specifies the index in \a a for elements 0 and 4 of the
1157 /// result, \a imm[3:2] specifies the index for elements 1 and 5, and so
1159 /// \returns A 256-bit vector of [8 x i32] containing the result.
/* Macro form: \a imm is forwarded to the builtin as an (int) expression. */
#define _mm256_shuffle_epi32(a, imm)                                           \
  ((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm)))
1163 /// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a
1164 /// according to control information in the integer literal \a imm, and
1165 /// returns the 256-bit result. The upper 64 bits of each 128-bit half
1166 /// are shuffled in parallel; the lower 64 bits of each 128-bit half are
1167 /// copied from \a a unchanged.
1169 /// \code{.operation}
1170 /// result[63:0] := a[63:0]
1171 /// result[191:128] := a[191:128]
1173 /// j := i * 16 + 64
1174 /// k := (imm >> i*2)[1:0] * 16 + 64
1175 /// result[j+15:j] := a[k+15:k]
1176 /// result[128+j+15:128+j] := a[128+k+15:128+k]
1180 /// \headerfile <immintrin.h>
1183 /// __m256i _mm256_shufflehi_epi16(__m256i a, const int imm);
1186 /// This intrinsic corresponds to the \c VPSHUFHW instruction.
1189 /// A 256-bit vector of [16 x i16] containing source values.
1191 /// An immediate 8-bit value specifying which elements to copy from \a a.
1192 /// \a imm[1:0] specifies the index in \a a for elements 4 and 12 of the
1193 /// result, \a imm[3:2] specifies the index for elements 5 and 13, and so
1194 /// forth. Indexes are offset by 4 (so 0 means index 4, and so forth).
1195 /// \returns A 256-bit vector of [16 x i16] containing the result.
/* Macro form: \a imm is forwarded to the builtin as an (int) expression. */
#define _mm256_shufflehi_epi16(a, imm)                                         \
  ((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm)))
1199 /// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] \a a
1200 /// according to control information in the integer literal \a imm, and
1201 /// returns the 256-bit [16 x i16] result. The lower 64 bits of each
1202 /// 128-bit half are shuffled; the upper 64 bits of each 128-bit half are
1203 /// copied from \a a unchanged.
1205 /// \code{.operation}
1206 /// result[127:64] := a[127:64]
1207 /// result[255:192] := a[255:192]
1210 /// k := (imm >> i*2)[1:0] * 16
1211 /// result[j+15:j] := a[k+15:k]
1212 /// result[128+j+15:128+j] := a[128+k+15:128+k]
1216 /// \headerfile <immintrin.h>
1219 /// __m256i _mm256_shufflelo_epi16(__m256i a, const int imm);
1222 /// This intrinsic corresponds to the \c VPSHUFLW instruction.
1225 /// A 256-bit vector of [16 x i16] to use as a source of data for the
1228 /// An immediate 8-bit value specifying which elements to copy from \a a.
1229 /// \a imm[1:0] specifies the index in \a a for elements 0 and 8 of the
1230 /// result, \a imm[3:2] specifies the index for elements 1 and 9, and so
1232 /// \returns A 256-bit vector of [16 x i16] containing the result.
/* Macro form: \a imm is forwarded to the builtin as an (int) expression. */
#define _mm256_shufflelo_epi16(a, imm)                                         \
  ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm)))
1236 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1237 _mm256_sign_epi8(__m256i __a
, __m256i __b
)
1239 return (__m256i
)__builtin_ia32_psignb256((__v32qi
)__a
, (__v32qi
)__b
);
1242 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1243 _mm256_sign_epi16(__m256i __a
, __m256i __b
)
1245 return (__m256i
)__builtin_ia32_psignw256((__v16hi
)__a
, (__v16hi
)__b
);
1248 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1249 _mm256_sign_epi32(__m256i __a
, __m256i __b
)
1251 return (__m256i
)__builtin_ia32_psignd256((__v8si
)__a
, (__v8si
)__b
);
1254 /// Shifts each 128-bit half of the 256-bit integer vector \a a left by
1255 /// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
1256 /// is greater than 15, the returned result is all zeroes.
1258 /// \headerfile <immintrin.h>
1261 /// __m256i _mm256_slli_si256(__m256i a, const int imm);
1264 /// This intrinsic corresponds to the \c VPSLLDQ instruction.
1267 /// A 256-bit integer vector to be shifted.
1269 /// An unsigned immediate value specifying the shift count (in bytes).
1270 /// \returns A 256-bit integer vector containing the result.
/* Macro form: \a imm is forwarded to the builtin as an (int) expression. */
#define _mm256_slli_si256(a, imm)                                              \
  ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a),          \
                                                (int)(imm)))
1274 /// Shifts each 128-bit half of the 256-bit integer vector \a a left by
1275 /// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
1276 /// is greater than 15, the returned result is all zeroes.
1278 /// \headerfile <immintrin.h>
1281 /// __m256i _mm256_bslli_epi128(__m256i a, const int imm);
1284 /// This intrinsic corresponds to the \c VPSLLDQ instruction.
1287 /// A 256-bit integer vector to be shifted.
1289 /// An unsigned immediate value specifying the shift count (in bytes).
1290 /// \returns A 256-bit integer vector containing the result.
/* Macro form: expands to the same builtin call as _mm256_slli_si256. */
#define _mm256_bslli_epi128(a, imm)                                            \
  ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a),          \
                                                (int)(imm)))
1294 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
1295 /// left by \a __count bits, shifting in zero bits, and returns the result.
1296 /// If \a __count is greater than 15, the returned result is all zeroes.
1298 /// \headerfile <immintrin.h>
1300 /// This intrinsic corresponds to the \c VPSLLW instruction.
1303 /// A 256-bit vector of [16 x i16] to be shifted.
1305 /// An unsigned integer value specifying the shift count (in bits).
1306 /// \returns A 256-bit vector of [16 x i16] containing the result.
1307 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1308 _mm256_slli_epi16(__m256i __a
, int __count
)
1310 return (__m256i
)__builtin_ia32_psllwi256((__v16hi
)__a
, __count
);
1313 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
1314 /// left by the number of bits specified by the lower 64 bits of \a __count,
1315 /// shifting in zero bits, and returns the result. If \a __count is greater
1316 /// than 15, the returned result is all zeroes.
1318 /// \headerfile <immintrin.h>
1320 /// This intrinsic corresponds to the \c VPSLLW instruction.
1323 /// A 256-bit vector of [16 x i16] to be shifted.
1325 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
1326 /// shift count (in bits). The upper element is ignored.
1327 /// \returns A 256-bit vector of [16 x i16] containing the result.
1328 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1329 _mm256_sll_epi16(__m256i __a
, __m128i __count
)
1331 return (__m256i
)__builtin_ia32_psllw256((__v16hi
)__a
, (__v8hi
)__count
);
1334 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
1335 /// left by \a __count bits, shifting in zero bits, and returns the result.
1336 /// If \a __count is greater than 31, the returned result is all zeroes.
1338 /// \headerfile <immintrin.h>
1340 /// This intrinsic corresponds to the \c VPSLLD instruction.
1343 /// A 256-bit vector of [8 x i32] to be shifted.
1345 /// An unsigned integer value specifying the shift count (in bits).
1346 /// \returns A 256-bit vector of [8 x i32] containing the result.
1347 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1348 _mm256_slli_epi32(__m256i __a
, int __count
)
1350 return (__m256i
)__builtin_ia32_pslldi256((__v8si
)__a
, __count
);
1353 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
1354 /// left by the number of bits given in the lower 64 bits of \a __count,
1355 /// shifting in zero bits, and returns the result. If \a __count is greater
1356 /// than 31, the returned result is all zeroes.
1358 /// \headerfile <immintrin.h>
1360 /// This intrinsic corresponds to the \c VPSLLD instruction.
1363 /// A 256-bit vector of [8 x i32] to be shifted.
1365 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
1366 /// shift count (in bits). The upper element is ignored.
1367 /// \returns A 256-bit vector of [8 x i32] containing the result.
1368 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1369 _mm256_sll_epi32(__m256i __a
, __m128i __count
)
1371 return (__m256i
)__builtin_ia32_pslld256((__v8si
)__a
, (__v4si
)__count
);
1374 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
1375 /// left by \a __count bits, shifting in zero bits, and returns the result.
1376 /// If \a __count is greater than 63, the returned result is all zeroes.
1378 /// \headerfile <immintrin.h>
1380 /// This intrinsic corresponds to the \c VPSLLQ instruction.
1383 /// A 256-bit vector of [4 x i64] to be shifted.
1385 /// An unsigned integer value specifying the shift count (in bits).
1386 /// \returns A 256-bit vector of [4 x i64] containing the result.
1387 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1388 _mm256_slli_epi64(__m256i __a
, int __count
)
1390 return __builtin_ia32_psllqi256((__v4di
)__a
, __count
);
1393 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
1394 /// left by the number of bits given in the lower 64 bits of \a __count,
1395 /// shifting in zero bits, and returns the result. If \a __count is greater
1396 /// than 63, the returned result is all zeroes.
1398 /// \headerfile <immintrin.h>
1400 /// This intrinsic corresponds to the \c VPSLLQ instruction.
1403 /// A 256-bit vector of [4 x i64] to be shifted.
1405 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
1406 /// shift count (in bits). The upper element is ignored.
1407 /// \returns A 256-bit vector of [4 x i64] containing the result.
1408 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1409 _mm256_sll_epi64(__m256i __a
, __m128i __count
)
1411 return __builtin_ia32_psllq256((__v4di
)__a
, __count
);
1414 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
1415 /// right by \a __count bits, shifting in sign bits, and returns the result.
1416 /// If \a __count is greater than 15, each element of the result is either
1417 /// 0 or -1 according to the corresponding input sign bit.
1419 /// \headerfile <immintrin.h>
1421 /// This intrinsic corresponds to the \c VPSRAW instruction.
1424 /// A 256-bit vector of [16 x i16] to be shifted.
1426 /// An unsigned integer value specifying the shift count (in bits).
1427 /// \returns A 256-bit vector of [16 x i16] containing the result.
1428 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1429 _mm256_srai_epi16(__m256i __a
, int __count
)
1431 return (__m256i
)__builtin_ia32_psrawi256((__v16hi
)__a
, __count
);
1434 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
1435 /// right by the number of bits given in the lower 64 bits of \a __count,
1436 /// shifting in sign bits, and returns the result. If \a __count is greater
1437 /// than 15, each element of the result is either 0 or -1 according to the
1438 /// corresponding input sign bit.
1440 /// \headerfile <immintrin.h>
1442 /// This intrinsic corresponds to the \c VPSRAW instruction.
1445 /// A 256-bit vector of [16 x i16] to be shifted.
1447 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
1448 /// shift count (in bits). The upper element is ignored.
1449 /// \returns A 256-bit vector of [16 x i16] containing the result.
1450 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1451 _mm256_sra_epi16(__m256i __a
, __m128i __count
)
1453 return (__m256i
)__builtin_ia32_psraw256((__v16hi
)__a
, (__v8hi
)__count
);
1456 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
1457 /// right by \a __count bits, shifting in sign bits, and returns the result.
1458 /// If \a __count is greater than 31, each element of the result is either
1459 /// 0 or -1 according to the corresponding input sign bit.
1461 /// \headerfile <immintrin.h>
1463 /// This intrinsic corresponds to the \c VPSRAD instruction.
1466 /// A 256-bit vector of [8 x i32] to be shifted.
1468 /// An unsigned integer value specifying the shift count (in bits).
1469 /// \returns A 256-bit vector of [8 x i32] containing the result.
1470 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1471 _mm256_srai_epi32(__m256i __a
, int __count
)
1473 return (__m256i
)__builtin_ia32_psradi256((__v8si
)__a
, __count
);
1476 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
1477 /// right by the number of bits given in the lower 64 bits of \a __count,
1478 /// shifting in sign bits, and returns the result. If \a __count is greater
1479 /// than 31, each element of the result is either 0 or -1 according to the
1480 /// corresponding input sign bit.
1482 /// \headerfile <immintrin.h>
1484 /// This intrinsic corresponds to the \c VPSRAD instruction.
1487 /// A 256-bit vector of [8 x i32] to be shifted.
1489 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
1490 /// shift count (in bits). The upper element is ignored.
1491 /// \returns A 256-bit vector of [8 x i32] containing the result.
1492 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1493 _mm256_sra_epi32(__m256i __a
, __m128i __count
)
1495 return (__m256i
)__builtin_ia32_psrad256((__v8si
)__a
, (__v4si
)__count
);
1498 /// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
1499 /// \a imm bytes, shifting in zero bytes, and returns the result. If
1500 /// \a imm is greater than 15, the returned result is all zeroes.
1502 /// \headerfile <immintrin.h>
1505 /// __m256i _mm256_srli_si256(__m256i a, const int imm);
1508 /// This intrinsic corresponds to the \c VPSRLDQ instruction.
1511 /// A 256-bit integer vector to be shifted.
1513 /// An unsigned immediate value specifying the shift count (in bytes).
1514 /// \returns A 256-bit integer vector containing the result.
/* Macro form: \a imm is forwarded to the builtin as an (int) expression.
   The operand is cast to __v4di, matching the _mm256_slli_si256 macro. */
#define _mm256_srli_si256(a, imm)                                              \
  ((__m256i)__builtin_ia32_psrldqi256_byteshift((__v4di)(__m256i)(a),          \
                                                (int)(imm)))
1518 /// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
1519 /// \a imm bytes, shifting in zero bytes, and returns the result. If
1520 /// \a imm is greater than 15, the returned result is all zeroes.
1522 /// \headerfile <immintrin.h>
1525 /// __m256i _mm256_bsrli_epi128(__m256i a, const int imm);
1528 /// This intrinsic corresponds to the \c VPSRLDQ instruction.
1531 /// A 256-bit integer vector to be shifted.
1533 /// An unsigned immediate value specifying the shift count (in bytes).
1534 /// \returns A 256-bit integer vector containing the result.
/* Macro form: \a imm is forwarded to the builtin as an (int) expression.
   The operand is cast to __v4di, matching the _mm256_bslli_epi128 macro. */
#define _mm256_bsrli_epi128(a, imm)                                            \
  ((__m256i)__builtin_ia32_psrldqi256_byteshift((__v4di)(__m256i)(a),          \
                                                (int)(imm)))
1538 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
1539 /// right by \a __count bits, shifting in zero bits, and returns the result.
1540 /// If \a __count is greater than 15, the returned result is all zeroes.
1542 /// \headerfile <immintrin.h>
1544 /// This intrinsic corresponds to the \c VPSRLW instruction.
1547 /// A 256-bit vector of [16 x i16] to be shifted.
1549 /// An unsigned integer value specifying the shift count (in bits).
1550 /// \returns A 256-bit vector of [16 x i16] containing the result.
1551 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1552 _mm256_srli_epi16(__m256i __a
, int __count
)
1554 return (__m256i
)__builtin_ia32_psrlwi256((__v16hi
)__a
, __count
);
1557 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
1558 /// right by the number of bits given in the lower 64 bits of \a __count,
1559 /// shifting in zero bits, and returns the result. If \a __count is greater
1560 /// than 15, the returned result is all zeroes.
1562 /// \headerfile <immintrin.h>
1564 /// This intrinsic corresponds to the \c VPSRLW instruction.
1567 /// A 256-bit vector of [16 x i16] to be shifted.
1569 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
1570 /// shift count (in bits). The upper element is ignored.
1571 /// \returns A 256-bit vector of [16 x i16] containing the result.
1572 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1573 _mm256_srl_epi16(__m256i __a
, __m128i __count
)
1575 return (__m256i
)__builtin_ia32_psrlw256((__v16hi
)__a
, (__v8hi
)__count
);
1578 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
1579 /// right by \a __count bits, shifting in zero bits, and returns the result.
1580 /// If \a __count is greater than 31, the returned result is all zeroes.
1582 /// \headerfile <immintrin.h>
1584 /// This intrinsic corresponds to the \c VPSRLD instruction.
1587 /// A 256-bit vector of [8 x i32] to be shifted.
1589 /// An unsigned integer value specifying the shift count (in bits).
1590 /// \returns A 256-bit vector of [8 x i32] containing the result.
1591 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1592 _mm256_srli_epi32(__m256i __a
, int __count
)
1594 return (__m256i
)__builtin_ia32_psrldi256((__v8si
)__a
, __count
);
1597 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
1598 /// right by the number of bits given in the lower 64 bits of \a __count,
1599 /// shifting in zero bits, and returns the result. If \a __count is greater
1600 /// than 31, the returned result is all zeroes.
1602 /// \headerfile <immintrin.h>
1604 /// This intrinsic corresponds to the \c VPSRLD instruction.
1607 /// A 256-bit vector of [8 x i32] to be shifted.
1609 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
1610 /// shift count (in bits). The upper element is ignored.
1611 /// \returns A 256-bit vector of [8 x i32] containing the result.
1612 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1613 _mm256_srl_epi32(__m256i __a
, __m128i __count
)
1615 return (__m256i
)__builtin_ia32_psrld256((__v8si
)__a
, (__v4si
)__count
);
1618 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
1619 /// right by \a __count bits, shifting in zero bits, and returns the result.
1620 /// If \a __count is greater than 63, the returned result is all zeroes.
1622 /// \headerfile <immintrin.h>
1624 /// This intrinsic corresponds to the \c VPSRLQ instruction.
1627 /// A 256-bit vector of [4 x i64] to be shifted.
1629 /// An unsigned integer value specifying the shift count (in bits).
1630 /// \returns A 256-bit vector of [4 x i64] containing the result.
1631 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1632 _mm256_srli_epi64(__m256i __a
, int __count
)
1634 return __builtin_ia32_psrlqi256((__v4di
)__a
, __count
);
1637 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
1638 /// right by the number of bits given in the lower 64 bits of \a __count,
1639 /// shifting in zero bits, and returns the result. If \a __count is greater
1640 /// than 63, the returned result is all zeroes.
1642 /// \headerfile <immintrin.h>
1644 /// This intrinsic corresponds to the \c VPSRLQ instruction.
1647 /// A 256-bit vector of [4 x i64] to be shifted.
1649 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
1650 /// shift count (in bits). The upper element is ignored.
1651 /// \returns A 256-bit vector of [4 x i64] containing the result.
1652 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1653 _mm256_srl_epi64(__m256i __a
, __m128i __count
)
1655 return __builtin_ia32_psrlq256((__v4di
)__a
, __count
);
1658 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
1659 /// vectors. Returns the lower 8 bits of each difference in the
1660 /// corresponding byte of the 256-bit integer vector result (overflow is
1663 /// \code{.operation}
1664 /// FOR i := 0 TO 31
1666 /// result[j+7:j] := __a[j+7:j] - __b[j+7:j]
1670 /// \headerfile <immintrin.h>
1672 /// This intrinsic corresponds to the \c VPSUBB instruction.
1675 /// A 256-bit integer vector containing the minuends.
1677 /// A 256-bit integer vector containing the subtrahends.
1678 /// \returns A 256-bit integer vector containing the differences.
1679 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1680 _mm256_sub_epi8(__m256i __a
, __m256i __b
)
1682 return (__m256i
)((__v32qu
)__a
- (__v32qu
)__b
);
1685 /// Subtracts 16-bit integers from corresponding elements of two 256-bit
1686 /// vectors of [16 x i16]. Returns the lower 16 bits of each difference in
1687 /// the corresponding element of the [16 x i16] result (overflow is
1690 /// \code{.operation}
1691 /// FOR i := 0 TO 15
1693 /// result[j+15:j] := __a[j+15:j] - __b[j+15:j]
1697 /// \headerfile <immintrin.h>
1699 /// This intrinsic corresponds to the \c VPSUBW instruction.
1702 /// A 256-bit vector of [16 x i16] containing the minuends.
1704 /// A 256-bit vector of [16 x i16] containing the subtrahends.
1705 /// \returns A 256-bit vector of [16 x i16] containing the differences.
1706 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1707 _mm256_sub_epi16(__m256i __a
, __m256i __b
)
1709 return (__m256i
)((__v16hu
)__a
- (__v16hu
)__b
);
1712 /// Subtracts 32-bit integers from corresponding elements of two 256-bit
1713 /// vectors of [8 x i32]. Returns the lower 32 bits of each difference in
1714 /// the corresponding element of the [8 x i32] result (overflow is ignored).
1716 /// \code{.operation}
1719 /// result[j+31:j] := __a[j+31:j] - __b[j+31:j]
1723 /// \headerfile <immintrin.h>
1725 /// This intrinsic corresponds to the \c VPSUBD instruction.
1728 /// A 256-bit vector of [8 x i32] containing the minuends.
1730 /// A 256-bit vector of [8 x i32] containing the subtrahends.
1731 /// \returns A 256-bit vector of [8 x i32] containing the differences.
1732 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1733 _mm256_sub_epi32(__m256i __a
, __m256i __b
)
/* Subtract on unsigned 32-bit lanes (__v8su): unsigned element arithmetic
   wraps, which gives the "lower 32 bits of each difference" result. */
1735 return (__m256i
)((__v8su
)__a
- (__v8su
)__b
);
1738 /// Subtracts 64-bit integers from corresponding elements of two 256-bit
1739 /// vectors of [4 x i64]. Returns the lower 64 bits of each difference in
1740 /// the corresponding element of the [4 x i64] result (overflow is ignored).
1742 /// \code{.operation}
1745 /// result[j+63:j] := __a[j+63:j] - __b[j+63:j]
1749 /// \headerfile <immintrin.h>
1751 /// This intrinsic corresponds to the \c VPSUBQ instruction.
1754 /// A 256-bit vector of [4 x i64] containing the minuends.
1756 /// A 256-bit vector of [4 x i64] containing the subtrahends.
1757 /// \returns A 256-bit vector of [4 x i64] containing the differences.
1758 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1759 _mm256_sub_epi64(__m256i __a
, __m256i __b
)
/* Subtract on unsigned 64-bit lanes (__v4du): unsigned element arithmetic
   wraps, which gives the "lower 64 bits of each difference" result. */
1761 return (__m256i
)((__v4du
)__a
- (__v4du
)__b
);
1764 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
1765 /// vectors using signed saturation, and returns each difference in the
1766 /// corresponding byte of the 256-bit integer vector result.
1768 /// \code{.operation}
1769 /// FOR i := 0 TO 31
1771 /// result[j+7:j] := SATURATE8(__a[j+7:j] - __b[j+7:j])
1775 /// \headerfile <immintrin.h>
1777 /// This intrinsic corresponds to the \c VPSUBSB instruction.
1780 /// A 256-bit integer vector containing the minuends.
1782 /// A 256-bit integer vector containing the subtrahends.
1783 /// \returns A 256-bit integer vector containing the differences.
1784 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1785 _mm256_subs_epi8(__m256i __a
, __m256i __b
)
/* The signed byte lane type (__v32qs) is what makes the type-generic
   saturating-subtract builtin clamp to the signed 8-bit range. */
1787 return (__m256i
)__builtin_elementwise_sub_sat((__v32qs
)__a
, (__v32qs
)__b
);
1790 /// Subtracts 16-bit integers from corresponding elements of two 256-bit
1791 /// vectors of [16 x i16] using signed saturation, and returns each
1792 /// difference in the corresponding element of the [16 x i16] result.
1794 /// \code{.operation}
1795 /// FOR i := 0 TO 15
1797 /// result[j+15:j] := SATURATE16(__a[j+15:j] - __b[j+15:j])
1801 /// \headerfile <immintrin.h>
1803 /// This intrinsic corresponds to the \c VPSUBSW instruction.
1806 /// A 256-bit vector of [16 x i16] containing the minuends.
1808 /// A 256-bit vector of [16 x i16] containing the subtrahends.
1809 /// \returns A 256-bit vector of [16 x i16] containing the differences.
1810 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1811 _mm256_subs_epi16(__m256i __a
, __m256i __b
)
/* The signed 16-bit lane type (__v16hi) is what makes the type-generic
   saturating-subtract builtin clamp to the signed 16-bit range. */
1813 return (__m256i
)__builtin_elementwise_sub_sat((__v16hi
)__a
, (__v16hi
)__b
);
1816 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
1817 /// vectors using unsigned saturation, and returns each difference in the
1818 /// corresponding byte of the 256-bit integer vector result. For each byte,
1819 /// computes <c> result = __a - __b </c>.
1821 /// \code{.operation}
1822 /// FOR i := 0 TO 31
1824 /// result[j+7:j] := SATURATE8U(__a[j+7:j] - __b[j+7:j])
1828 /// \headerfile <immintrin.h>
1830 /// This intrinsic corresponds to the \c VPSUBUSB instruction.
1833 /// A 256-bit integer vector containing the minuends.
1835 /// A 256-bit integer vector containing the subtrahends.
1836 /// \returns A 256-bit integer vector containing the differences.
1837 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1838 _mm256_subs_epu8(__m256i __a
, __m256i __b
)
/* The unsigned byte lane type (__v32qu) is what makes the type-generic
   saturating-subtract builtin clamp to the unsigned 8-bit range. */
1840 return (__m256i
)__builtin_elementwise_sub_sat((__v32qu
)__a
, (__v32qu
)__b
);
1843 /// Subtracts 16-bit integers from corresponding elements of two 256-bit
1844 /// vectors of [16 x i16] using unsigned saturation, and returns each
1845 /// difference in the corresponding element of the [16 x i16] result.
1847 /// \code{.operation}
1848 /// FOR i := 0 TO 15
1850 /// result[j+15:j] := SATURATE16U(__a[j+15:j] - __b[j+15:j])
1854 /// \headerfile <immintrin.h>
1856 /// This intrinsic corresponds to the \c VPSUBUSW instruction.
1859 /// A 256-bit vector of [16 x i16] containing the minuends.
1861 /// A 256-bit vector of [16 x i16] containing the subtrahends.
1862 /// \returns A 256-bit vector of [16 x i16] containing the differences.
1863 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1864 _mm256_subs_epu16(__m256i __a
, __m256i __b
)
/* The unsigned 16-bit lane type (__v16hu) is what makes the type-generic
   saturating-subtract builtin clamp to the unsigned 16-bit range. */
1866 return (__m256i
)__builtin_elementwise_sub_sat((__v16hu
)__a
, (__v16hu
)__b
);
1869 /// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
1870 /// vectors in \a __a and \a __b to form the 256-bit result. Specifically,
1871 /// uses the upper 64 bits of each 128-bit half of \a __a and \a __b as
1872 /// input; other bits in these parameters are ignored.
1874 /// \code{.operation}
1875 /// result[7:0] := __a[71:64]
1876 /// result[15:8] := __b[71:64]
1877 /// result[23:16] := __a[79:72]
1878 /// result[31:24] := __b[79:72]
1880 /// result[127:120] := __b[127:120]
1881 /// result[135:128] := __a[199:192]
1883 /// result[255:248] := __b[255:248]
1886 /// \headerfile <immintrin.h>
1888 /// This intrinsic corresponds to the \c VPUNPCKHBW instruction.
1891 /// A 256-bit integer vector used as the source for the even-numbered bytes
1894 /// A 256-bit integer vector used as the source for the odd-numbered bytes
1896 /// \returns A 256-bit integer vector containing the result.
1897 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1898 _mm256_unpackhi_epi8(__m256i __a
, __m256i __b
)
/* Shuffle indices address the concatenation of __a and __b: n selects byte n
   of __a and 32+n selects byte n of __b. Bytes 8..15 and 24..31 are the upper
   64 bits of the low and high 128-bit halves respectively. */
1900 return (__m256i
)__builtin_shufflevector((__v32qi
)__a
, (__v32qi
)__b
, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
1903 /// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
1904 /// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
1905 /// vector of [16 x i16]. Specifically, uses the upper 64 bits of each
1906 /// 128-bit half of \a __a and \a __b as input; other bits in these
1907 /// parameters are ignored.
1909 /// \code{.operation}
1910 /// result[15:0] := __a[79:64]
1911 /// result[31:16] := __b[79:64]
1912 /// result[47:32] := __a[95:80]
1913 /// result[63:48] := __b[95:80]
1915 /// result[127:112] := __b[127:112]
1916 /// result[143:128] := __a[207:192]
1918 /// result[255:240] := __b[255:240]
1921 /// \headerfile <immintrin.h>
1923 /// This intrinsic corresponds to the \c VPUNPCKHWD instruction.
1926 /// A 256-bit vector of [16 x i16] used as the source for the even-numbered
1927 /// elements of the result.
1929 /// A 256-bit vector of [16 x i16] used as the source for the odd-numbered
1930 /// elements of the result.
1931 /// \returns A 256-bit vector of [16 x i16] containing the result.
1932 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1933 _mm256_unpackhi_epi16(__m256i __a
, __m256i __b
)
/* Shuffle indices address the concatenation of __a and __b: n selects element
   n of __a and 16+n selects element n of __b. Elements 4..7 and 12..15 are
   the upper 64 bits of the low and high 128-bit halves respectively. */
1935 return (__m256i
)__builtin_shufflevector((__v16hi
)__a
, (__v16hi
)__b
, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
1938 /// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
1939 /// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
1940 /// of [8 x i32]. Specifically, uses the upper 64 bits of each 128-bit half
1941 /// of \a __a and \a __b as input; other bits in these parameters are
1944 /// \code{.operation}
1945 /// result[31:0] := __a[95:64]
1946 /// result[63:32] := __b[95:64]
1947 /// result[95:64] := __a[127:96]
1948 /// result[127:96] := __b[127:96]
1949 /// result[159:128] := __a[223:192]
1950 /// result[191:160] := __b[223:192]
1951 /// result[223:192] := __a[255:224]
1952 /// result[255:224] := __b[255:224]
1955 /// \headerfile <immintrin.h>
1957 /// This intrinsic corresponds to the \c VPUNPCKHDQ instruction.
1960 /// A 256-bit vector of [8 x i32] used as the source for the even-numbered
1961 /// elements of the result.
1963 /// A 256-bit vector of [8 x i32] used as the source for the odd-numbered
1964 /// elements of the result.
1965 /// \returns A 256-bit vector of [8 x i32] containing the result.
1966 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1967 _mm256_unpackhi_epi32(__m256i __a
, __m256i __b
)
/* Shuffle indices address the concatenation of __a and __b: n selects element
   n of __a and 8+n selects element n of __b. Elements 2..3 and 6..7 are the
   upper 64 bits of the low and high 128-bit halves respectively. */
1969 return (__m256i
)__builtin_shufflevector((__v8si
)__a
, (__v8si
)__b
, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);
1972 /// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
1973 /// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
1974 /// of [4 x i64]. Specifically, uses the upper 64 bits of each 128-bit half
1975 /// of \a __a and \a __b as input; other bits in these parameters are
1978 /// \code{.operation}
1979 /// result[63:0] := __a[127:64]
1980 /// result[127:64] := __b[127:64]
1981 /// result[191:128] := __a[255:192]
1982 /// result[255:192] := __b[255:192]
1985 /// \headerfile <immintrin.h>
1987 /// This intrinsic corresponds to the \c VPUNPCKHQDQ instruction.
1990 /// A 256-bit vector of [4 x i64] used as the source for the even-numbered
1991 /// elements of the result.
1993 /// A 256-bit vector of [4 x i64] used as the source for the odd-numbered
1994 /// elements of the result.
1995 /// \returns A 256-bit vector of [4 x i64] containing the result.
1996 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1997 _mm256_unpackhi_epi64(__m256i __a
, __m256i __b
)
/* Shuffle indices address the concatenation of __a and __b: n selects element
   n of __a and 4+n selects element n of __b. Elements 1 and 3 are the upper
   64 bits of the low and high 128-bit halves respectively. */
1999 return (__m256i
)__builtin_shufflevector((__v4di
)__a
, (__v4di
)__b
, 1, 4+1, 3, 4+3);
2002 /// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2003 /// vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2004 /// uses the lower 64 bits of each 128-bit half of \a __a and \a __b as
2005 /// input; other bits in these parameters are ignored.
2007 /// \code{.operation}
2008 /// result[7:0] := __a[7:0]
2009 /// result[15:8] := __b[7:0]
2010 /// result[23:16] := __a[15:8]
2011 /// result[31:24] := __b[15:8]
2013 /// result[127:120] := __b[63:56]
2014 /// result[135:128] := __a[135:128]
2016 /// result[255:248] := __b[191:184]
2019 /// \headerfile <immintrin.h>
2021 /// This intrinsic corresponds to the \c VPUNPCKLBW instruction.
2024 /// A 256-bit integer vector used as the source for the even-numbered bytes
2027 /// A 256-bit integer vector used as the source for the odd-numbered bytes
2029 /// \returns A 256-bit integer vector containing the result.
2030 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2031 _mm256_unpacklo_epi8(__m256i __a
, __m256i __b
)
/* Shuffle indices address the concatenation of __a and __b: n selects byte n
   of __a and 32+n selects byte n of __b. Bytes 0..7 and 16..23 are the lower
   64 bits of the low and high 128-bit halves respectively. */
2033 return (__m256i
)__builtin_shufflevector((__v32qi
)__a
, (__v32qi
)__b
, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
2036 /// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2037 /// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2038 /// vector of [16 x i16]. Specifically, uses the lower 64 bits of each
2039 /// 128-bit half of \a __a and \a __b as input; other bits in these
2040 /// parameters are ignored.
2042 /// \code{.operation}
2043 /// result[15:0] := __a[15:0]
2044 /// result[31:16] := __b[15:0]
2045 /// result[47:32] := __a[31:16]
2046 /// result[63:48] := __b[31:16]
2048 /// result[127:112] := __b[63:48]
2049 /// result[143:128] := __a[143:128]
2051 /// result[255:240] := __b[191:176]
2054 /// \headerfile <immintrin.h>
2056 /// This intrinsic corresponds to the \c VPUNPCKLWD instruction.
2059 /// A 256-bit vector of [16 x i16] used as the source for the even-numbered
2060 /// elements of the result.
2062 /// A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2063 /// elements of the result.
2064 /// \returns A 256-bit vector of [16 x i16] containing the result.
2065 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2066 _mm256_unpacklo_epi16(__m256i __a
, __m256i __b
)
/* Shuffle indices address the concatenation of __a and __b: n selects element
   n of __a and 16+n selects element n of __b. Elements 0..3 and 8..11 are the
   lower 64 bits of the low and high 128-bit halves respectively. */
2068 return (__m256i
)__builtin_shufflevector((__v16hi
)__a
, (__v16hi
)__b
, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
2071 /// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2072 /// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2073 /// of [8 x i32]. Specifically, uses the lower 64 bits of each 128-bit half
2074 /// of \a __a and \a __b as input; other bits in these parameters are
2077 /// \code{.operation}
2078 /// result[31:0] := __a[31:0]
2079 /// result[63:32] := __b[31:0]
2080 /// result[95:64] := __a[63:32]
2081 /// result[127:96] := __b[63:32]
2082 /// result[159:128] := __a[159:128]
2083 /// result[191:160] := __b[159:128]
2084 /// result[223:192] := __a[191:160]
2085 /// result[255:224] := __b[191:160]
2088 /// \headerfile <immintrin.h>
2090 /// This intrinsic corresponds to the \c VPUNPCKLDQ instruction.
2093 /// A 256-bit vector of [8 x i32] used as the source for the even-numbered
2094 /// elements of the result.
2096 /// A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2097 /// elements of the result.
2098 /// \returns A 256-bit vector of [8 x i32] containing the result.
2099 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2100 _mm256_unpacklo_epi32(__m256i __a
, __m256i __b
)
/* Shuffle indices address the concatenation of __a and __b: n selects element
   n of __a and 8+n selects element n of __b. Elements 0..1 and 4..5 are the
   lower 64 bits of the low and high 128-bit halves respectively. */
2102 return (__m256i
)__builtin_shufflevector((__v8si
)__a
, (__v8si
)__b
, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
2105 /// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2106 /// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2107 /// of [4 x i64]. Specifically, uses the lower 64 bits of each 128-bit half
2108 /// of \a __a and \a __b as input; other bits in these parameters are
2111 /// \code{.operation}
2112 /// result[63:0] := __a[63:0]
2113 /// result[127:64] := __b[63:0]
2114 /// result[191:128] := __a[191:128]
2115 /// result[255:192] := __b[191:128]
2118 /// \headerfile <immintrin.h>
2120 /// This intrinsic corresponds to the \c VPUNPCKLQDQ instruction.
2123 /// A 256-bit vector of [4 x i64] used as the source for the even-numbered
2124 /// elements of the result.
2126 /// A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2127 /// elements of the result.
2128 /// \returns A 256-bit vector of [4 x i64] containing the result.
2129 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2130 _mm256_unpacklo_epi64(__m256i __a
, __m256i __b
)
/* Shuffle indices address the concatenation of __a and __b: n selects element
   n of __a and 4+n selects element n of __b. Elements 0 and 2 are the lower
   64 bits of the low and high 128-bit halves respectively. */
2132 return (__m256i
)__builtin_shufflevector((__v4di
)__a
, (__v4di
)__b
, 0, 4+0, 2, 4+2);
/// Computes the bitwise XOR of the 256-bit integer vectors in \a __a and
///    \a __b.
///
/// \headerfile <immintrin.h>
///
/// \param __a
///    A 256-bit integer vector.
/// \param __b
///    A 256-bit integer vector.
/// \returns A 256-bit integer vector containing the bitwise XOR of the
///    operands.
2135 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2136 _mm256_xor_si256(__m256i __a
, __m256i __b
)
/* XOR is bitwise, so the [4 x u64] view used here is only a convenient
   element type; the lane width does not affect the result. */
2138 return (__m256i
)((__v4du
)__a
^ (__v4du
)__b
);
/// Loads a 256-bit integer vector from the memory pointed to by \a __V
///    using a non-temporal load (__builtin_nontemporal_load).
///
/// NOTE(review): the load goes through a 32-byte-aligned vector type, so
///    \a __V is presumably required to be 32-byte aligned -- confirm against
///    the instruction reference.
///
/// \headerfile <immintrin.h>
///
/// \param __V
///    A pointer to the 256-bit integer vector to load.
/// \returns A 256-bit integer vector containing the loaded data.
2141 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2142 _mm256_stream_load_si256(__m256i
const *__V
)
/* Local typedef attaches aligned(32) to the pointee type seen by the
   non-temporal load builtin. */
2144 typedef __v4di __v4di_aligned
__attribute__((aligned(32)));
2145 return (__m256i
)__builtin_nontemporal_load((const __v4di_aligned
*)__V
);
2148 /// Broadcasts the 32-bit floating-point value from the low element of the
2149 /// 128-bit vector of [4 x float] in \a __X to all elements of the result's
2150 /// 128-bit vector of [4 x float].
2152 /// \headerfile <immintrin.h>
2154 /// This intrinsic corresponds to the \c VBROADCASTSS instruction.
2157 /// A 128-bit vector of [4 x float] whose low element will be broadcast.
2158 /// \returns A 128-bit vector of [4 x float] containing the result.
2159 static __inline__ __m128 __DEFAULT_FN_ATTRS128
2160 _mm_broadcastss_ps(__m128 __X
)
/* Every result lane selects index 0, i.e. the low float of __X. Both shuffle
   inputs are __X, so the second operand is never actually referenced. */
2162 return (__m128
)__builtin_shufflevector((__v4sf
)__X
, (__v4sf
)__X
, 0, 0, 0, 0);
2165 /// Broadcasts the 64-bit floating-point value from the low element of the
2166 /// 128-bit vector of [2 x double] in \a __a to both elements of the
2167 /// result's 128-bit vector of [2 x double].
2169 /// \headerfile <immintrin.h>
2171 /// This intrinsic corresponds to the \c MOVDDUP instruction.
2174 /// A 128-bit vector of [2 x double] whose low element will be broadcast.
2175 /// \returns A 128-bit vector of [2 x double] containing the result.
2176 static __inline__ __m128d __DEFAULT_FN_ATTRS128
2177 _mm_broadcastsd_pd(__m128d __a
)
/* Both result lanes select index 0, i.e. the low double of __a. Unlike the
   sibling broadcasts, the shuffle result is returned without an explicit
   cast to the return type. */
2179 return __builtin_shufflevector((__v2df
)__a
, (__v2df
)__a
, 0, 0);
2182 /// Broadcasts the 32-bit floating-point value from the low element of the
2183 /// 128-bit vector of [4 x float] in \a __X to all elements of the
2184 /// result's 256-bit vector of [8 x float].
2186 /// \headerfile <immintrin.h>
2188 /// This intrinsic corresponds to the \c VBROADCASTSS instruction.
2191 /// A 128-bit vector of [4 x float] whose low element will be broadcast.
2192 /// \returns A 256-bit vector of [8 x float] containing the result.
2193 static __inline__ __m256 __DEFAULT_FN_ATTRS256
2194 _mm256_broadcastss_ps(__m128 __X
)
/* Eight copies of index 0 widen the 4-element input into an 8-element
   result in which every lane is the low float of __X. */
2196 return (__m256
)__builtin_shufflevector((__v4sf
)__X
, (__v4sf
)__X
, 0, 0, 0, 0, 0, 0, 0, 0);
2199 /// Broadcasts the 64-bit floating-point value from the low element of the
2200 /// 128-bit vector of [2 x double] in \a __X to all elements of the
2201 /// result's 256-bit vector of [4 x double].
2203 /// \headerfile <immintrin.h>
2205 /// This intrinsic corresponds to the \c VBROADCASTSD instruction.
2208 /// A 128-bit vector of [2 x double] whose low element will be broadcast.
2209 /// \returns A 256-bit vector of [4 x double] containing the result.
2210 static __inline__ __m256d __DEFAULT_FN_ATTRS256
2211 _mm256_broadcastsd_pd(__m128d __X
)
/* Four copies of index 0 widen the 2-element input into a 4-element result
   in which every lane is the low double of __X. */
2213 return (__m256d
)__builtin_shufflevector((__v2df
)__X
, (__v2df
)__X
, 0, 0, 0, 0);
2216 /// Broadcasts the 128-bit integer data from \a __X to both the lower and
2217 /// upper halves of the 256-bit result.
2219 /// \headerfile <immintrin.h>
2221 /// This intrinsic corresponds to the \c VBROADCASTI128 instruction.
2224 /// A 128-bit integer vector to be broadcast.
2225 /// \returns A 256-bit integer vector containing the result.
2226 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2227 _mm256_broadcastsi128_si256(__m128i __X
)
/* Indices 0, 1, 0, 1 repeat both 64-bit elements of __X, so the whole
   128-bit value appears in the low and in the high half of the result. */
2229 return (__m256i
)__builtin_shufflevector((__v2di
)__X
, (__v2di
)__X
, 0, 1, 0, 1);
/* Alternate spelling: expands directly to _mm256_broadcastsi128_si256(X). */
2232 #define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X)
2234 /// Merges 32-bit integer elements from either of the two 128-bit vectors of
2235 /// [4 x i32] in \a V1 or \a V2 to the result's 128-bit vector of [4 x i32],
2236 /// as specified by the immediate integer operand \a M.
2238 /// \code{.operation}
2242 /// result[31+j:j] := V1[31+j:j]
2244 /// result[31+j:j] := V2[31+j:j]
2249 /// \headerfile <immintrin.h>
2252 /// __m128i _mm_blend_epi32(__m128i V1, __m128i V2, const int M);
2255 /// This intrinsic corresponds to the \c VPBLENDD instruction.
2258 /// A 128-bit vector of [4 x i32] containing source values.
2260 /// A 128-bit vector of [4 x i32] containing source values.
2262 /// An immediate 8-bit integer operand, with bits [3:0] specifying the
2263 /// source for each element of the result. The position of the mask bit
2264 /// corresponds to the index of a copied value. When a mask bit is 0, the
2265 /// element is copied from \a V1; otherwise, it is copied from \a V2.
2266 /// \returns A 128-bit vector of [4 x i32] containing the result.
/* Macro form: V1 and V2 are viewed as [4 x i32] and the immediate M is
   forwarded as an int to __builtin_ia32_pblendd128. */
2267 #define _mm_blend_epi32(V1, V2, M) \
2268 ((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
2269 (__v4si)(__m128i)(V2), (int)(M)))
2271 /// Merges 32-bit integer elements from either of the two 256-bit vectors of
2272 /// [8 x i32] in \a V1 or \a V2 to return a 256-bit vector of [8 x i32],
2273 /// as specified by the immediate integer operand \a M.
2275 /// \code{.operation}
2279 /// result[31+j:j] := V1[31+j:j]
2281 /// result[31+j:j] := V2[31+j:j]
2286 /// \headerfile <immintrin.h>
2289 /// __m256i _mm256_blend_epi32(__m256i V1, __m256i V2, const int M);
2292 /// This intrinsic corresponds to the \c VPBLENDD instruction.
2295 /// A 256-bit vector of [8 x i32] containing source values.
2297 /// A 256-bit vector of [8 x i32] containing source values.
2299 /// An immediate 8-bit integer operand, with bits [7:0] specifying the
2300 /// source for each element of the result. The position of the mask bit
2301 /// corresponds to the index of a copied value. When a mask bit is 0, the
2302 /// element is copied from \a V1; otherwise, it is copied from \a V2.
2303 /// \returns A 256-bit vector of [8 x i32] containing the result.
/* Macro form: V1 and V2 are viewed as [8 x i32] and the immediate M is
   forwarded as an int to __builtin_ia32_pblendd256. */
2304 #define _mm256_blend_epi32(V1, V2, M) \
2305 ((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
2306 (__v8si)(__m256i)(V2), (int)(M)))
2308 /// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
2309 /// bytes of the 256-bit result.
2311 /// \headerfile <immintrin.h>
2313 /// This intrinsic corresponds to the \c VPBROADCASTB instruction.
2316 /// A 128-bit integer vector whose low byte will be broadcast.
2317 /// \returns A 256-bit integer vector containing the result.
2318 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2319 _mm256_broadcastb_epi8(__m128i __X
)
/* Thirty-two copies of index 0 widen the 16-byte input into a 32-byte result
   in which every byte is the low byte of __X. */
2321 return (__m256i
)__builtin_shufflevector((__v16qi
)__X
, (__v16qi
)__X
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
2324 /// Broadcasts the low element from the 128-bit vector of [8 x i16] in \a __X
2325 /// to all elements of the result's 256-bit vector of [16 x i16].
2327 /// \headerfile <immintrin.h>
2329 /// This intrinsic corresponds to the \c VPBROADCASTW instruction.
2332 /// A 128-bit vector of [8 x i16] whose low element will be broadcast.
2333 /// \returns A 256-bit vector of [16 x i16] containing the result.
2334 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2335 _mm256_broadcastw_epi16(__m128i __X
)
/* Sixteen copies of index 0 widen the 8-element input into a 16-element
   result in which every element is the low 16-bit element of __X. */
2337 return (__m256i
)__builtin_shufflevector((__v8hi
)__X
, (__v8hi
)__X
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
2340 /// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
2341 /// to all elements of the result's 256-bit vector of [8 x i32].
2343 /// \headerfile <immintrin.h>
2345 /// This intrinsic corresponds to the \c VPBROADCASTD instruction.
2348 /// A 128-bit vector of [4 x i32] whose low element will be broadcast.
2349 /// \returns A 256-bit vector of [8 x i32] containing the result.
2350 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2351 _mm256_broadcastd_epi32(__m128i __X
)
/* Eight copies of index 0 widen the 4-element input into an 8-element
   result in which every element is the low 32-bit element of __X. */
2353 return (__m256i
)__builtin_shufflevector((__v4si
)__X
, (__v4si
)__X
, 0, 0, 0, 0, 0, 0, 0, 0);
2356 /// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
2357 /// to all elements of the result's 256-bit vector of [4 x i64].
2359 /// \headerfile <immintrin.h>
2361 /// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
2364 /// A 128-bit vector of [2 x i64] whose low element will be broadcast.
2365 /// \returns A 256-bit vector of [4 x i64] containing the result.
2366 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2367 _mm256_broadcastq_epi64(__m128i __X
)
/* Four copies of index 0 widen the 2-element input into a 4-element result
   in which every element is the low 64-bit element of __X. */
2369 return (__m256i
)__builtin_shufflevector((__v2di
)__X
, (__v2di
)__X
, 0, 0, 0, 0);
2372 /// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
2373 /// bytes of the 128-bit result.
2375 /// \headerfile <immintrin.h>
2377 /// This intrinsic corresponds to the \c VPBROADCASTB instruction.
2380 /// A 128-bit integer vector whose low byte will be broadcast.
2381 /// \returns A 128-bit integer vector containing the result.
2382 static __inline__ __m128i __DEFAULT_FN_ATTRS128
2383 _mm_broadcastb_epi8(__m128i __X
)
/* All sixteen result bytes select index 0, the low byte of __X. */
2385 return (__m128i
)__builtin_shufflevector((__v16qi
)__X
, (__v16qi
)__X
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
2388 /// Broadcasts the low element from the 128-bit vector of [8 x i16] in
2389 /// \a __X to all elements of the result's 128-bit vector of [8 x i16].
2391 /// \headerfile <immintrin.h>
2393 /// This intrinsic corresponds to the \c VPBROADCASTW instruction.
2396 /// A 128-bit vector of [8 x i16] whose low element will be broadcast.
2397 /// \returns A 128-bit vector of [8 x i16] containing the result.
2398 static __inline__ __m128i __DEFAULT_FN_ATTRS128
2399 _mm_broadcastw_epi16(__m128i __X
)
/* All eight result elements select index 0, the low 16-bit element of __X. */
2401 return (__m128i
)__builtin_shufflevector((__v8hi
)__X
, (__v8hi
)__X
, 0, 0, 0, 0, 0, 0, 0, 0);
2404 /// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
2405 /// to all elements of the result's 128-bit vector of [4 x i32].
2407 /// \headerfile <immintrin.h>
2409 /// This intrinsic corresponds to the \c VPBROADCASTD instruction.
2412 /// A 128-bit vector of [4 x i32] whose low element will be broadcast.
2413 /// \returns A 128-bit vector of [4 x i32] containing the result.
2414 static __inline__ __m128i __DEFAULT_FN_ATTRS128
2415 _mm_broadcastd_epi32(__m128i __X
)
/* All four result elements select index 0, the low 32-bit element of __X. */
2417 return (__m128i
)__builtin_shufflevector((__v4si
)__X
, (__v4si
)__X
, 0, 0, 0, 0);
2420 /// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
2421 /// to both elements of the result's 128-bit vector of [2 x i64].
2423 /// \headerfile <immintrin.h>
2425 /// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
2428 /// A 128-bit vector of [2 x i64] whose low element will be broadcast.
2429 /// \returns A 128-bit vector of [2 x i64] containing the result.
2430 static __inline__ __m128i __DEFAULT_FN_ATTRS128
2431 _mm_broadcastq_epi64(__m128i __X
)
/* Both result elements select index 0, the low 64-bit element of __X. */
2433 return (__m128i
)__builtin_shufflevector((__v2di
)__X
, (__v2di
)__X
, 0, 0);
2436 /// Sets the result's 256-bit vector of [8 x i32] to copies of elements of the
2437 /// 256-bit vector of [8 x i32] in \a __a as specified by indexes in the
2438 /// elements of the 256-bit vector of [8 x i32] in \a __b.
2440 /// \code{.operation}
2443 /// k := __b[j+2:j] * 32
2444 /// result[j+31:j] := __a[k+31:k]
2448 /// \headerfile <immintrin.h>
2450 /// This intrinsic corresponds to the \c VPERMD instruction.
2453 /// A 256-bit vector of [8 x i32] containing the source values.
2455 /// A 256-bit vector of [8 x i32] containing indexes of values to use from
2457 /// \returns A 256-bit vector of [8 x i32] containing the result.
2458 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2459 _mm256_permutevar8x32_epi32(__m256i __a
, __m256i __b
)
/* Thin wrapper: both operands are viewed as [8 x i32] and handed to
   __builtin_ia32_permvarsi256, source values first and indexes second. */
2461 return (__m256i
)__builtin_ia32_permvarsi256((__v8si
)__a
, (__v8si
)__b
);
2464 /// Sets the result's 256-bit vector of [4 x double] to copies of elements of
2465 /// the 256-bit vector of [4 x double] in \a V as specified by the
2466 /// immediate value \a M.
2468 /// \code{.operation}
2471 /// k := (M >> i*2)[1:0] * 64
2472 /// result[j+63:j] := V[k+63:k]
2476 /// \headerfile <immintrin.h>
2479 /// __m256d _mm256_permute4x64_pd(__m256d V, const int M);
2482 /// This intrinsic corresponds to the \c VPERMPD instruction.
2485 /// A 256-bit vector of [4 x double] containing the source values.
2487 /// An immediate 8-bit value specifying which elements to copy from \a V.
2488 /// \a M[1:0] specifies the index in \a V for element 0 of the result,
2489 /// \a M[3:2] specifies the index for element 1, and so forth.
2490 /// \returns A 256-bit vector of [4 x double] containing the result.
/* Macro form: V is viewed as [4 x double] and the immediate M is forwarded
   as an int to __builtin_ia32_permdf256. */
2491 #define _mm256_permute4x64_pd(V, M) \
2492 ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M)))
2494 /// Sets the result's 256-bit vector of [8 x float] to copies of elements of
2495 /// the 256-bit vector of [8 x float] in \a __a as specified by indexes in
2496 /// the elements of the 256-bit vector of [8 x i32] in \a __b.
2498 /// \code{.operation}
2501 /// k := __b[j+2:j] * 32
2502 /// result[j+31:j] := __a[k+31:k]
2506 /// \headerfile <immintrin.h>
2508 /// This intrinsic corresponds to the \c VPERMPS instruction.
2511 /// A 256-bit vector of [8 x float] containing the source values.
2513 /// A 256-bit vector of [8 x i32] containing indexes of values to use from
2515 /// \returns A 256-bit vector of [8 x float] containing the result.
2516 static __inline__ __m256 __DEFAULT_FN_ATTRS256
2517 _mm256_permutevar8x32_ps(__m256 __a
, __m256i __b
)
/* Thin wrapper: __a is viewed as [8 x float] and __b as [8 x i32] indexes,
   then both are handed to __builtin_ia32_permvarsf256 in that order. */
2519 return (__m256
)__builtin_ia32_permvarsf256((__v8sf
)__a
, (__v8si
)__b
);
2522 /// Sets the result's 256-bit vector of [4 x i64] to copies of elements
2523 /// of the 256-bit vector of [4 x i64] in \a V as specified by the
2524 /// immediate value \a M.
2526 /// \code{.operation}
2529 /// k := (M >> i*2)[1:0] * 64
2530 /// result[j+63:j] := V[k+63:k]
2534 /// \headerfile <immintrin.h>
2537 /// __m256i _mm256_permute4x64_epi64(__m256i V, const int M);
2540 /// This intrinsic corresponds to the \c VPERMQ instruction.
2543 /// A 256-bit vector of [4 x i64] containing the source values.
2545 /// An immediate 8-bit value specifying which elements to copy from \a V.
2546 /// \a M[1:0] specifies the index in \a V for element 0 of the result,
2547 /// \a M[3:2] specifies the index for element 1, and so forth.
2548 /// \returns A 256-bit vector of [4 x i64] containing the result.
/* Macro form: V is viewed as [4 x i64] and the immediate M is forwarded as
   an int to __builtin_ia32_permdi256. */
2549 #define _mm256_permute4x64_epi64(V, M) \
2550 ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M)))
2552 /// Sets each half of the 256-bit result either to zero or to one of the
2553 /// four possible 128-bit halves of the 256-bit vectors \a V1 and \a V2,
2554 /// as specified by the immediate value \a M.
2556 /// \code{.operation}
2561 /// CASE (k[1:0]) OF
2562 /// 0: result[127+j:j] := V1[127:0]
2563 /// 1: result[127+j:j] := V1[255:128]
2564 /// 2: result[127+j:j] := V2[127:0]
2565 /// 3: result[127+j:j] := V2[255:128]
2568 /// result[127+j:j] := 0
2573 /// \headerfile <immintrin.h>
2576 /// __m256i _mm256_permute2x128_si256(__m256i V1, __m256i V2, const int M);
2579 /// This intrinsic corresponds to the \c VPERM2I128 instruction.
2582 /// A 256-bit integer vector containing source values.
2584 /// A 256-bit integer vector containing source values.
2586 /// An immediate value specifying how to form the result. Bits [3:0]
2587 /// control the lower half of the result, bits [7:4] control the upper half.
2588 /// Within each 4-bit control value, if bit 3 is 1, the result is zero,
2589 /// otherwise bits [1:0] determine the source as follows. \n
2590 /// 0: the lower half of \a V1 \n
2591 /// 1: the upper half of \a V1 \n
2592 /// 2: the lower half of \a V2 \n
2593 /// 3: the upper half of \a V2
2594 /// \returns A 256-bit integer vector containing the result.
/* Macro form: unlike the neighbouring macros, V1 and V2 are cast only to
   __m256i (no element-typed vector cast) before being passed, together with
   the immediate M as an int, to __builtin_ia32_permti256. */
2595 #define _mm256_permute2x128_si256(V1, V2, M) \
2596 ((__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M)))
2598 /// Extracts half of the 256-bit vector \a V to the 128-bit result. If bit 0
2599 /// of the immediate \a M is zero, extracts the lower half of \a V;
2600 /// otherwise, extracts the upper half.
2602 /// \headerfile <immintrin.h>
2605 /// __m128i _mm256_extracti128_si256(__m256i V, const int M);
2608 /// This intrinsic corresponds to the \c VEXTRACTI128 instruction.
2611 /// A 256-bit integer vector containing the source values.
2613 /// An immediate value specifying which half of \a V to extract.
2614 /// \returns A 128-bit integer vector containing the result.
/* Macro form: V is viewed as [4 x i64] and the immediate M is forwarded as
   an int to __builtin_ia32_extract128i256. */
2615 #define _mm256_extracti128_si256(V, M) \
2616 ((__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M)))
2618 /// Copies the 256-bit vector \a V1 to the result, then overwrites half of the
2619 /// result with the 128-bit vector \a V2. If bit 0 of the immediate \a M
2620 /// is zero, overwrites the lower half of the result; otherwise,
2621 /// overwrites the upper half.
2623 /// \headerfile <immintrin.h>
2626 /// __m256i _mm256_inserti128_si256(__m256i V1, __m128i V2, const int M);
2629 /// This intrinsic corresponds to the \c VINSERTI128 instruction.
2632 /// A 256-bit integer vector containing a source value.
2634 /// A 128-bit integer vector containing a source value.
2636 /// An immediate value specifying where to put \a V2 in the result.
2637 /// \returns A 256-bit integer vector containing the result.
/* Macro form: V1 is viewed as [4 x i64], V2 as [2 x i64], and the immediate
   M is forwarded as an int to __builtin_ia32_insert128i256. */
2638 #define _mm256_inserti128_si256(V1, V2, M) \
2639 ((__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \
2640 (__v2di)(__m128i)(V2), (int)(M)))
/// Performs a masked load of a 256-bit vector of [8 x i32] from the memory
///    pointed to by \a __X, forwarding to the maskloadd256 builtin with
///    \a __M as the mask.
///
/// NOTE(review): presumably the most significant bit of each mask element
///    decides whether that element is loaded (unselected elements read as
///    zero) -- confirm against the Intel intrinsics reference.
///
/// \headerfile <immintrin.h>
///
/// \param __X
///    A pointer to the memory to load from.
/// \param __M
///    A 256-bit vector of [8 x i32] used as the mask.
/// \returns A 256-bit vector of [8 x i32] containing the loaded elements.
2642 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2643 _mm256_maskload_epi32(int const *__X
, __m256i __M
)
2645 return (__m256i
)__builtin_ia32_maskloadd256((const __v8si
*)__X
, (__v8si
)__M
);
/// Performs a masked load of a 256-bit vector of [4 x i64] from the memory
///    pointed to by \a __X, forwarding to the maskloadq256 builtin with
///    \a __M as the mask.
///
/// NOTE(review): presumably the most significant bit of each mask element
///    decides whether that element is loaded (unselected elements read as
///    zero) -- confirm against the Intel intrinsics reference.
///
/// \headerfile <immintrin.h>
///
/// \param __X
///    A pointer to the memory to load from.
/// \param __M
///    A 256-bit vector of [4 x i64] used as the mask.
/// \returns A 256-bit vector of [4 x i64] containing the loaded elements.
2648 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2649 _mm256_maskload_epi64(long long const *__X
, __m256i __M
)
2651 return (__m256i
)__builtin_ia32_maskloadq256((const __v4di
*)__X
, (__v4di
)__M
);
/// Performs a masked load of a 128-bit vector of [4 x i32] from the memory
///    pointed to by \a __X, forwarding to the maskloadd builtin with \a __M
///    as the mask.
///
/// NOTE(review): presumably the most significant bit of each mask element
///    decides whether that element is loaded (unselected elements read as
///    zero) -- confirm against the Intel intrinsics reference.
///
/// \headerfile <immintrin.h>
///
/// \param __X
///    A pointer to the memory to load from.
/// \param __M
///    A 128-bit vector of [4 x i32] used as the mask.
/// \returns A 128-bit vector of [4 x i32] containing the loaded elements.
2654 static __inline__ __m128i __DEFAULT_FN_ATTRS128
2655 _mm_maskload_epi32(int const *__X
, __m128i __M
)
2657 return (__m128i
)__builtin_ia32_maskloadd((const __v4si
*)__X
, (__v4si
)__M
);
/// Performs a masked load of a 128-bit vector of [2 x i64] from the memory
///    pointed to by \a __X, forwarding to the maskloadq builtin with \a __M
///    as the mask.
///
/// NOTE(review): presumably the most significant bit of each mask element
///    decides whether that element is loaded (unselected elements read as
///    zero) -- confirm against the Intel intrinsics reference.
///
/// \headerfile <immintrin.h>
///
/// \param __X
///    A pointer to the memory to load from.
/// \param __M
///    A 128-bit vector of [2 x i64] used as the mask.
/// \returns A 128-bit vector of [2 x i64] containing the loaded elements.
2660 static __inline__ __m128i __DEFAULT_FN_ATTRS128
2661 _mm_maskload_epi64(long long const *__X
, __m128i __M
)
2663 return (__m128i
)__builtin_ia32_maskloadq((const __v2di
*)__X
, (__v2di
)__M
);
/// Performs a masked store of the 256-bit vector of [8 x i32] in \a __Y to
///    the memory pointed to by \a __X, forwarding to the maskstored256
///    builtin with \a __M as the mask.
///
/// NOTE(review): presumably only elements whose mask element has its most
///    significant bit set are written -- confirm against the Intel
///    intrinsics reference.
///
/// \headerfile <immintrin.h>
///
/// \param __X
///    A pointer to the memory to store to.
/// \param __M
///    A 256-bit vector of [8 x i32] used as the mask.
/// \param __Y
///    A 256-bit vector of [8 x i32] containing the values to store.
2666 static __inline__
void __DEFAULT_FN_ATTRS256
2667 _mm256_maskstore_epi32(int *__X
, __m256i __M
, __m256i __Y
)
2669 __builtin_ia32_maskstored256((__v8si
*)__X
, (__v8si
)__M
, (__v8si
)__Y
);
2672 static __inline__
void __DEFAULT_FN_ATTRS256
2673 _mm256_maskstore_epi64(long long *__X
, __m256i __M
, __m256i __Y
)
2675 __builtin_ia32_maskstoreq256((__v4di
*)__X
, (__v4di
)__M
, (__v4di
)__Y
);
2678 static __inline__
void __DEFAULT_FN_ATTRS128
2679 _mm_maskstore_epi32(int *__X
, __m128i __M
, __m128i __Y
)
2681 __builtin_ia32_maskstored((__v4si
*)__X
, (__v4si
)__M
, (__v4si
)__Y
);
2684 static __inline__
void __DEFAULT_FN_ATTRS128
2685 _mm_maskstore_epi64(long long *__X
, __m128i __M
, __m128i __Y
)
2687 __builtin_ia32_maskstoreq(( __v2di
*)__X
, (__v2di
)__M
, (__v2di
)__Y
);
2690 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
2691 /// left by the number of bits given in the corresponding element of the
2692 /// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
2693 /// returns the result. If the shift count for any element is greater than
2694 /// 31, the result for that element is zero.
2696 /// \headerfile <immintrin.h>
2698 /// This intrinsic corresponds to the \c VPSLLVD instruction.
2701 /// A 256-bit vector of [8 x i32] to be shifted.
2703 /// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
2705 /// \returns A 256-bit vector of [8 x i32] containing the result.
2706 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2707 _mm256_sllv_epi32(__m256i __X
, __m256i __Y
)
2709 return (__m256i
)__builtin_ia32_psllv8si((__v8si
)__X
, (__v8si
)__Y
);
2712 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
2713 /// left by the number of bits given in the corresponding element of the
2714 /// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
2715 /// returns the result. If the shift count for any element is greater than
2716 /// 31, the result for that element is zero.
2718 /// \headerfile <immintrin.h>
2720 /// This intrinsic corresponds to the \c VPSLLVD instruction.
2723 /// A 128-bit vector of [4 x i32] to be shifted.
2725 /// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
2727 /// \returns A 128-bit vector of [4 x i32] containing the result.
2728 static __inline__ __m128i __DEFAULT_FN_ATTRS128
2729 _mm_sllv_epi32(__m128i __X
, __m128i __Y
)
2731 return (__m128i
)__builtin_ia32_psllv4si((__v4si
)__X
, (__v4si
)__Y
);
2734 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
2735 /// left by the number of bits given in the corresponding element of the
2736 /// 256-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
2737 /// returns the result. If the shift count for any element is greater than
2738 /// 63, the result for that element is zero.
2740 /// \headerfile <immintrin.h>
2742 /// This intrinsic corresponds to the \c VPSLLVQ instruction.
2745 /// A 256-bit vector of [4 x i64] to be shifted.
2747 /// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
2749 /// \returns A 256-bit vector of [4 x i64] containing the result.
2750 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2751 _mm256_sllv_epi64(__m256i __X
, __m256i __Y
)
2753 return (__m256i
)__builtin_ia32_psllv4di((__v4di
)__X
, (__v4di
)__Y
);
2756 /// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
2757 /// left by the number of bits given in the corresponding element of the
2758 /// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
2759 /// returns the result. If the shift count for any element is greater than
2760 /// 63, the result for that element is zero.
2762 /// \headerfile <immintrin.h>
2764 /// This intrinsic corresponds to the \c VPSLLVQ instruction.
2767 /// A 128-bit vector of [2 x i64] to be shifted.
2769 /// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
2771 /// \returns A 128-bit vector of [2 x i64] containing the result.
2772 static __inline__ __m128i __DEFAULT_FN_ATTRS128
2773 _mm_sllv_epi64(__m128i __X
, __m128i __Y
)
2775 return (__m128i
)__builtin_ia32_psllv2di((__v2di
)__X
, (__v2di
)__Y
);
2778 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
2779 /// right by the number of bits given in the corresponding element of the
2780 /// 256-bit vector of [8 x i32] in \a __Y, shifting in sign bits, and
2781 /// returns the result. If the shift count for any element is greater than
2782 /// 31, the result for that element is 0 or -1 according to the sign bit
2783 /// for that element.
2785 /// \headerfile <immintrin.h>
2787 /// This intrinsic corresponds to the \c VPSRAVD instruction.
2790 /// A 256-bit vector of [8 x i32] to be shifted.
2792 /// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
2794 /// \returns A 256-bit vector of [8 x i32] containing the result.
2795 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2796 _mm256_srav_epi32(__m256i __X
, __m256i __Y
)
2798 return (__m256i
)__builtin_ia32_psrav8si((__v8si
)__X
, (__v8si
)__Y
);
2801 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
2802 /// right by the number of bits given in the corresponding element of the
2803 /// 128-bit vector of [4 x i32] in \a __Y, shifting in sign bits, and
2804 /// returns the result. If the shift count for any element is greater than
2805 /// 31, the result for that element is 0 or -1 according to the sign bit
2806 /// for that element.
2808 /// \headerfile <immintrin.h>
2810 /// This intrinsic corresponds to the \c VPSRAVD instruction.
2813 /// A 128-bit vector of [4 x i32] to be shifted.
2815 /// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
2817 /// \returns A 128-bit vector of [4 x i32] containing the result.
2818 static __inline__ __m128i __DEFAULT_FN_ATTRS128
2819 _mm_srav_epi32(__m128i __X
, __m128i __Y
)
2821 return (__m128i
)__builtin_ia32_psrav4si((__v4si
)__X
, (__v4si
)__Y
);
2824 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
2825 /// right by the number of bits given in the corresponding element of the
2826 /// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
2827 /// returns the result. If the shift count for any element is greater than
2828 /// 31, the result for that element is zero.
2830 /// \headerfile <immintrin.h>
2832 /// This intrinsic corresponds to the \c VPSRLVD instruction.
2835 /// A 256-bit vector of [8 x i32] to be shifted.
2837 /// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
2839 /// \returns A 256-bit vector of [8 x i32] containing the result.
2840 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2841 _mm256_srlv_epi32(__m256i __X
, __m256i __Y
)
2843 return (__m256i
)__builtin_ia32_psrlv8si((__v8si
)__X
, (__v8si
)__Y
);
2846 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
2847 /// right by the number of bits given in the corresponding element of the
2848 /// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
2849 /// returns the result. If the shift count for any element is greater than
2850 /// 31, the result for that element is zero.
2852 /// \headerfile <immintrin.h>
2854 /// This intrinsic corresponds to the \c VPSRLVD instruction.
2857 /// A 128-bit vector of [4 x i32] to be shifted.
2859 /// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
2861 /// \returns A 128-bit vector of [4 x i32] containing the result.
2862 static __inline__ __m128i __DEFAULT_FN_ATTRS128
2863 _mm_srlv_epi32(__m128i __X
, __m128i __Y
)
2865 return (__m128i
)__builtin_ia32_psrlv4si((__v4si
)__X
, (__v4si
)__Y
);
2868 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
2869 /// right by the number of bits given in the corresponding element of the
2870 /// 256-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
2871 /// returns the result. If the shift count for any element is greater than
2872 /// 63, the result for that element is zero.
2874 /// \headerfile <immintrin.h>
2876 /// This intrinsic corresponds to the \c VPSRLVQ instruction.
2879 /// A 256-bit vector of [4 x i64] to be shifted.
2881 /// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
2883 /// \returns A 256-bit vector of [4 x i64] containing the result.
2884 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2885 _mm256_srlv_epi64(__m256i __X
, __m256i __Y
)
2887 return (__m256i
)__builtin_ia32_psrlv4di((__v4di
)__X
, (__v4di
)__Y
);
2890 /// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
2891 /// right by the number of bits given in the corresponding element of the
2892 /// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
2893 /// returns the result. If the shift count for any element is greater than
2894 /// 63, the result for that element is zero.
2896 /// \headerfile <immintrin.h>
2898 /// This intrinsic corresponds to the \c VPSRLVQ instruction.
2901 /// A 128-bit vector of [2 x i64] to be shifted.
2903 /// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
2905 /// \returns A 128-bit vector of [2 x i64] containing the result.
2906 static __inline__ __m128i __DEFAULT_FN_ATTRS128
2907 _mm_srlv_epi64(__m128i __X
, __m128i __Y
)
2909 return (__m128i
)__builtin_ia32_psrlv2di((__v2di
)__X
, (__v2di
)__Y
);
2912 /// Conditionally gathers two 64-bit floating-point values, either from the
2913 /// 128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
2914 /// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
2915 /// of [2 x double] in \a mask determines the source for each element.
2917 /// \code{.operation}
2918 /// FOR element := 0 to 1
2921 /// IF mask[j+63] == 0
2922 /// result[j+63:j] := a[j+63:j]
2924 /// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
2929 /// \headerfile <immintrin.h>
2932 /// __m128d _mm_mask_i32gather_pd(__m128d a, const double *m, __m128i i,
2933 /// __m128d mask, const int s);
2936 /// This intrinsic corresponds to the \c VGATHERDPD instruction.
2939 /// A 128-bit vector of [2 x double] used as the source when a mask bit is
2942 /// A pointer to the memory used for loading values.
2944 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
2945 /// the first two elements are used.
2947 /// A 128-bit vector of [2 x double] containing the mask. The most
2948 /// significant bit of each element in the mask vector represents the mask
2949 /// bits. If a mask bit is zero, the corresponding value from vector \a a
2950 /// is gathered; otherwise the value is loaded from memory.
2952 /// A literal constant scale factor for the indexes in \a i. Must be
2954 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
/* Both [2 x double] operands (the pass-through a and the mask) are converted
   through __m128d before being viewed as __v2df; the index vector i is the
   only integer vector operand. */
#define _mm_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128d)(a), \
                                      (double const *)(m), \
                                      (__v4si)(__m128i)(i), \
                                      (__v2df)(__m128d)(mask), (s)))
2961 /// Conditionally gathers four 64-bit floating-point values, either from the
2962 /// 256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
2963 /// indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
2964 /// of [4 x double] in \a mask determines the source for each element.
2966 /// \code{.operation}
2967 /// FOR element := 0 to 3
2970 /// IF mask[j+63] == 0
2971 /// result[j+63:j] := a[j+63:j]
2973 /// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
2978 /// \headerfile <immintrin.h>
2981 /// __m256d _mm256_mask_i32gather_pd(__m256d a, const double *m, __m128i i,
2982 /// __m256d mask, const int s);
2985 /// This intrinsic corresponds to the \c VGATHERDPD instruction.
2988 /// A 256-bit vector of [4 x double] used as the source when a mask bit is
2991 /// A pointer to the memory used for loading values.
2993 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
2995 /// A 256-bit vector of [4 x double] containing the mask. The most
2996 /// significant bit of each element in the mask vector represents the mask
2997 /// bits. If a mask bit is zero, the corresponding value from vector \a a
2998 /// is gathered; otherwise the value is loaded from memory.
3000 /// A literal constant scale factor for the indexes in \a i. Must be
3002 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
/* Builtin operands, in order: pass-through a, base pointer m, indexes i,
   mask, scale s. */
#define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m256d)__builtin_ia32_gatherd_pd256( \
      (__v4df)(__m256d)(a), \
      (double const *)(m), \
      (__v4si)(__m128i)(i), \
      (__v4df)(__m256d)(mask), \
      (s)))
3009 /// Conditionally gathers two 64-bit floating-point values, either from the
3010 /// 128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
3011 /// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
3012 /// of [2 x double] in \a mask determines the source for each element.
3014 /// \code{.operation}
3015 /// FOR element := 0 to 1
3018 /// IF mask[j+63] == 0
3019 /// result[j+63:j] := a[j+63:j]
3021 /// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
3026 /// \headerfile <immintrin.h>
3029 /// __m128d _mm_mask_i64gather_pd(__m128d a, const double *m, __m128i i,
3030 /// __m128d mask, const int s);
3033 /// This intrinsic corresponds to the \c VGATHERQPD instruction.
3036 /// A 128-bit vector of [2 x double] used as the source when a mask bit is
3039 /// A pointer to the memory used for loading values.
3041 /// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
3043 /// A 128-bit vector of [2 x double] containing the mask. The most
3044 /// significant bit of each element in the mask vector represents the mask
3045 /// bits. If a mask bit is zero, the corresponding value from vector \a a
3046 /// is gathered; otherwise the value is loaded from memory.
3048 /// A literal constant scale factor for the indexes in \a i. Must be
3050 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
/* Builtin operands, in order: pass-through a, base pointer m, indexes i,
   mask, scale s. */
#define _mm_mask_i64gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherq_pd( \
      (__v2df)(__m128d)(a), \
      (double const *)(m), \
      (__v2di)(__m128i)(i), \
      (__v2df)(__m128d)(mask), \
      (s)))
3057 /// Conditionally gathers four 64-bit floating-point values, either from the
3058 /// 256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
3059 /// indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
3060 /// of [4 x double] in \a mask determines the source for each element.
3062 /// \code{.operation}
3063 /// FOR element := 0 to 3
3066 /// IF mask[j+63] == 0
3067 /// result[j+63:j] := a[j+63:j]
3069 /// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
3074 /// \headerfile <immintrin.h>
3077 /// __m256d _mm256_mask_i64gather_pd(__m256d a, const double *m, __m256i i,
3078 /// __m256d mask, const int s);
3081 /// This intrinsic corresponds to the \c VGATHERQPD instruction.
3084 /// A 256-bit vector of [4 x double] used as the source when a mask bit is
3087 /// A pointer to the memory used for loading values.
3089 /// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
3091 /// A 256-bit vector of [4 x double] containing the mask. The most
3092 /// significant bit of each element in the mask vector represents the mask
3093 /// bits. If a mask bit is zero, the corresponding value from vector \a a
3094 /// is gathered; otherwise the value is loaded from memory.
3096 /// A literal constant scale factor for the indexes in \a i. Must be
3098 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
/* Builtin operands, in order: pass-through a, base pointer m, indexes i,
   mask, scale s. */
#define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
  ((__m256d)__builtin_ia32_gatherq_pd256( \
      (__v4df)(__m256d)(a), \
      (double const *)(m), \
      (__v4di)(__m256i)(i), \
      (__v4df)(__m256d)(mask), \
      (s)))
3105 /// Conditionally gathers four 32-bit floating-point values, either from the
3106 /// 128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
3107 /// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
3108 /// of [4 x float] in \a mask determines the source for each element.
3110 /// \code{.operation}
3111 /// FOR element := 0 to 3
3114 /// IF mask[j+31] == 0
3115 /// result[j+31:j] := a[j+31:j]
3117 /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
3122 /// \headerfile <immintrin.h>
3125 /// __m128 _mm_mask_i32gather_ps(__m128 a, const float *m, __m128i i,
3126 /// __m128 mask, const int s);
3129 /// This intrinsic corresponds to the \c VGATHERDPS instruction.
3132 /// A 128-bit vector of [4 x float] used as the source when a mask bit is
3135 /// A pointer to the memory used for loading values.
3137 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
3139 /// A 128-bit vector of [4 x float] containing the mask. The most
3140 /// significant bit of each element in the mask vector represents the mask
3141 /// bits. If a mask bit is zero, the corresponding value from vector \a a
3142 /// is gathered; otherwise the value is loaded from memory.
3144 /// A literal constant scale factor for the indexes in \a i. Must be
3146 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* Builtin operands, in order: pass-through a, base pointer m, indexes i,
   mask, scale s. */
#define _mm_mask_i32gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherd_ps( \
      (__v4sf)(__m128)(a), \
      (float const *)(m), \
      (__v4si)(__m128i)(i), \
      (__v4sf)(__m128)(mask), \
      (s)))
3153 /// Conditionally gathers eight 32-bit floating-point values, either from the
3154 /// 256-bit vector of [8 x float] in \a a, or from memory \a m using scaled
3155 /// indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
3156 /// of [8 x float] in \a mask determines the source for each element.
3158 /// \code{.operation}
3159 /// FOR element := 0 to 7
3162 /// IF mask[j+31] == 0
3163 /// result[j+31:j] := a[j+31:j]
3165 /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
3170 /// \headerfile <immintrin.h>
3173 /// __m256 _mm256_mask_i32gather_ps(__m256 a, const float *m, __m256i i,
3174 /// __m256 mask, const int s);
3177 /// This intrinsic corresponds to the \c VGATHERDPS instruction.
3180 /// A 256-bit vector of [8 x float] used as the source when a mask bit is
3183 /// A pointer to the memory used for loading values.
3185 /// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
3187 /// A 256-bit vector of [8 x float] containing the mask. The most
3188 /// significant bit of each element in the mask vector represents the mask
3189 /// bits. If a mask bit is zero, the corresponding value from vector \a a
3190 /// is gathered; otherwise the value is loaded from memory.
3192 /// A literal constant scale factor for the indexes in \a i. Must be
3194 /// \returns A 256-bit vector of [8 x float] containing the gathered values.
/* Builtin operands, in order: pass-through a, base pointer m, indexes i,
   mask, scale s. */
#define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
  ((__m256)__builtin_ia32_gatherd_ps256( \
      (__v8sf)(__m256)(a), \
      (float const *)(m), \
      (__v8si)(__m256i)(i), \
      (__v8sf)(__m256)(mask), \
      (s)))
3201 /// Conditionally gathers two 32-bit floating-point values, either from the
3202 /// 128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
3203 /// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
3204 /// of [4 x float] in \a mask determines the source for the lower two
3205 /// elements. The upper two elements of the result are zeroed.
3207 /// \code{.operation}
3208 /// FOR element := 0 to 1
3211 /// IF mask[j+31] == 0
3212 /// result[j+31:j] := a[j+31:j]
3214 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
3217 /// result[127:64] := 0
3220 /// \headerfile <immintrin.h>
3223 /// __m128 _mm_mask_i64gather_ps(__m128 a, const float *m, __m128i i,
3224 /// __m128 mask, const int s);
3227 /// This intrinsic corresponds to the \c VGATHERQPS instruction.
3230 /// A 128-bit vector of [4 x float] used as the source when a mask bit is
3231 /// zero. Only the first two elements are used.
3233 /// A pointer to the memory used for loading values.
3235 /// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
3237 /// A 128-bit vector of [4 x float] containing the mask. The most
3238 /// significant bit of each element in the mask vector represents the mask
3239 /// bits. If a mask bit is zero, the corresponding value from vector \a a
3240 /// is gathered; otherwise the value is loaded from memory. Only the first
3241 /// two elements are used.
3243 /// A literal constant scale factor for the indexes in \a i. Must be
3245 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* Builtin operands, in order: pass-through a, base pointer m, indexes i,
   mask, scale s. */
#define _mm_mask_i64gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherq_ps( \
      (__v4sf)(__m128)(a), \
      (float const *)(m), \
      (__v2di)(__m128i)(i), \
      (__v4sf)(__m128)(mask), \
      (s)))
3252 /// Conditionally gathers four 32-bit floating-point values, either from the
3253 /// 128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
3254 /// indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
3255 /// of [4 x float] in \a mask determines the source for each element.
3257 /// \code{.operation}
3258 /// FOR element := 0 to 3
3261 /// IF mask[j+31] == 0
3262 /// result[j+31:j] := a[j+31:j]
3264 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
3269 /// \headerfile <immintrin.h>
3272 /// __m128 _mm256_mask_i64gather_ps(__m128 a, const float *m, __m256i i,
3273 /// __m128 mask, const int s);
3276 /// This intrinsic corresponds to the \c VGATHERQPS instruction.
3279 /// A 128-bit vector of [4 x float] used as the source when a mask bit is
3282 /// A pointer to the memory used for loading values.
3284 /// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
3286 /// A 128-bit vector of [4 x float] containing the mask. The most
3287 /// significant bit of each element in the mask vector represents the mask
3288 /// bits. If a mask bit is zero, the corresponding value from vector \a a
3289 /// is gathered; otherwise the value is loaded from memory.
3291 /// A literal constant scale factor for the indexes in \a i. Must be
3293 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* Builtin operands, in order: pass-through a, base pointer m, indexes i,
   mask, scale s. The data and mask vectors are 128-bit; only the index
   vector is 256-bit. */
#define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherq_ps256( \
      (__v4sf)(__m128)(a), \
      (float const *)(m), \
      (__v4di)(__m256i)(i), \
      (__v4sf)(__m128)(mask), \
      (s)))
3300 /// Conditionally gathers four 32-bit integer values, either from the
3301 /// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
3302 /// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
3303 /// of [4 x i32] in \a mask determines the source for each element.
3305 /// \code{.operation}
3306 /// FOR element := 0 to 3
3309 /// IF mask[j+31] == 0
3310 /// result[j+31:j] := a[j+31:j]
3312 /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
3317 /// \headerfile <immintrin.h>
3320 /// __m128i _mm_mask_i32gather_epi32(__m128i a, const int *m, __m128i i,
3321 /// __m128i mask, const int s);
3324 /// This intrinsic corresponds to the \c VPGATHERDD instruction.
3327 /// A 128-bit vector of [4 x i32] used as the source when a mask bit is
3330 /// A pointer to the memory used for loading values.
3332 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
3334 /// A 128-bit vector of [4 x i32] containing the mask. The most significant
3335 /// bit of each element in the mask vector represents the mask bits. If a
3336 /// mask bit is zero, the corresponding value from vector \a a is gathered;
3337 /// otherwise the value is loaded from memory.
3339 /// A literal constant scale factor for the indexes in \a i. Must be
3341 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* Builtin operands, in order: pass-through a, base pointer m (as
   int const *), indexes i, mask, and the scale s. */
#define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
                                     (int const *)(m), \
                                     (__v4si)(__m128i)(i), \
                                     (__v4si)(__m128i)(mask), (s)))
3348 /// Conditionally gathers eight 32-bit integer values, either from the
3349 /// 256-bit vector of [8 x i32] in \a a, or from memory \a m using scaled
3350 /// indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
3351 /// of [8 x i32] in \a mask determines the source for each element.
3353 /// \code{.operation}
3354 /// FOR element := 0 to 7
3357 /// IF mask[j+31] == 0
3358 /// result[j+31:j] := a[j+31:j]
3360 /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
3365 /// \headerfile <immintrin.h>
3368 /// __m256i _mm256_mask_i32gather_epi32(__m256i a, const int *m, __m256i i,
3369 /// __m256i mask, const int s);
3372 /// This intrinsic corresponds to the \c VPGATHERDD instruction.
3375 /// A 256-bit vector of [8 x i32] used as the source when a mask bit is
3378 /// A pointer to the memory used for loading values.
3380 /// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
3382 /// A 256-bit vector of [8 x i32] containing the mask. The most significant
3383 /// bit of each element in the mask vector represents the mask bits. If a
3384 /// mask bit is zero, the corresponding value from vector \a a is gathered;
3385 /// otherwise the value is loaded from memory.
3387 /// A literal constant scale factor for the indexes in \a i. Must be
3389 /// \returns A 256-bit vector of [8 x i32] containing the gathered values.
/* Builtin operands, in order: pass-through a, base pointer m (as
   int const *), indexes i, mask, and the scale s. */
#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
                                        (int const *)(m), \
                                        (__v8si)(__m256i)(i), \
                                        (__v8si)(__m256i)(mask), (s)))
3396 /// Conditionally gathers two 32-bit integer values, either from the
3397 /// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
3398 /// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
3399 /// of [4 x i32] in \a mask determines the source for the lower two
3400 /// elements. The upper two elements of the result are zeroed.
3402 /// \code{.operation}
3403 /// FOR element := 0 to 1
3406 /// IF mask[j+31] == 0
3407 /// result[j+31:j] := a[j+31:j]
3409 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
3412 /// result[127:64] := 0
3415 /// \headerfile <immintrin.h>
3418 /// __m128i _mm_mask_i64gather_epi32(__m128i a, const int *m, __m128i i,
3419 /// __m128i mask, const int s);
3422 /// This intrinsic corresponds to the \c VPGATHERQD instruction.
3425 /// A 128-bit vector of [4 x i32] used as the source when a mask bit is
3426 /// zero. Only the first two elements are used.
3428 /// A pointer to the memory used for loading values.
3430 /// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
3432 /// A 128-bit vector of [4 x i32] containing the mask. The most significant
3433 /// bit of each element in the mask vector represents the mask bits. If a
3434 /// mask bit is zero, the corresponding value from vector \a a is gathered;
3435 /// otherwise the value is loaded from memory. Only the first two elements
3438 /// A literal constant scale factor for the indexes in \a i. Must be
3440 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* Builtin operands, in order: pass-through a, base pointer m (as
   int const *), 64-bit indexes i, mask, and the scale s. */
#define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
                                     (int const *)(m), \
                                     (__v2di)(__m128i)(i), \
                                     (__v4si)(__m128i)(mask), (s)))
3447 /// Conditionally gathers four 32-bit integer values, either from the
3448 /// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
3449 /// indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
3450 /// of [4 x i32] in \a mask determines the source for each element.
3452 /// \code{.operation}
3453 /// FOR element := 0 to 3
3456 /// IF mask[j+31] == 0
3457 /// result[j+31:j] := a[j+31:j]
3459 /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
3464 /// \headerfile <immintrin.h>
3467 /// __m128i _mm256_mask_i64gather_epi32(__m128i a, const int *m, __m256i i,
3468 /// __m128i mask, const int s);
3471 /// This intrinsic corresponds to the \c VPGATHERQD instruction.
3474 /// A 128-bit vector of [4 x i32] used as the source when a mask bit is
3477 /// A pointer to the memory used for loading values.
3479 /// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
3481 /// A 128-bit vector of [4 x i32] containing the mask. The most significant
3482 /// bit of each element in the mask vector represents the mask bits. If a
3483 /// mask bit is zero, the corresponding value from vector \a a is gathered;
3484 /// otherwise the value is loaded from memory.
3486 /// A literal constant scale factor for the indexes in \a i. Must be
3488 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* Builtin operands, in order: pass-through a, base pointer m (as
   int const *), 64-bit indexes i, mask, and the scale s. The data and mask
   vectors are 128-bit; only the index vector is 256-bit. */
#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
                                        (int const *)(m), \
                                        (__v4di)(__m256i)(i), \
                                        (__v4si)(__m128i)(mask), (s)))
3495 /// Conditionally gathers two 64-bit integer values, either from the
3496 /// 128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
3497 /// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
3498 /// of [2 x i64] in \a mask determines the source for each element.
3500 /// \code{.operation}
3501 /// FOR element := 0 to 1
3504 /// IF mask[j+63] == 0
3505 /// result[j+63:j] := a[j+63:j]
3507 /// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
3512 /// \headerfile <immintrin.h>
3515 /// __m128i _mm_mask_i32gather_epi64(__m128i a, const long long *m, __m128i i,
3516 /// __m128i mask, const int s);
3519 /// This intrinsic corresponds to the \c VPGATHERDQ instruction.
3522 /// A 128-bit vector of [2 x i64] used as the source when a mask bit is
3525 /// A pointer to the memory used for loading values.
3527 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
3528 /// the first two elements are used.
3530 /// A 128-bit vector of [2 x i64] containing the mask. The most significant
3531 /// bit of each element in the mask vector represents the mask bits. If a
3532 /// mask bit is zero, the corresponding value from vector \a a is gathered;
3533 /// otherwise the value is loaded from memory.
3535 /// A literal constant scale factor for the indexes in \a i. Must be
3537 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
/* Builtin operands, in order: pass-through a, base pointer m, indexes i,
   mask, scale s. */
#define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherd_q( \
      (__v2di)(__m128i)(a), \
      (long long const *)(m), \
      (__v4si)(__m128i)(i), \
      (__v2di)(__m128i)(mask), \
      (s)))
3544 /// Conditionally gathers four 64-bit integer values, either from the
3545 /// 256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
3546 /// indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
3547 /// of [4 x i64] in \a mask determines the source for each element.
3549 /// \code{.operation}
3550 /// FOR element := 0 to 3
3553 /// IF mask[j+63] == 0
3554 /// result[j+63:j] := a[j+63:j]
3556 /// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
3561 /// \headerfile <immintrin.h>
3564 /// __m256i _mm256_mask_i32gather_epi64(__m256i a, const long long *m,
3565 /// __m128i i, __m256i mask, const int s);
3568 /// This intrinsic corresponds to the \c VPGATHERDQ instruction.
3571 /// A 256-bit vector of [4 x i64] used as the source when a mask bit is
3574 /// A pointer to the memory used for loading values.
3576 /// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
3578 /// A 256-bit vector of [4 x i64] containing the mask. The most significant
3579 /// bit of each element in the mask vector represents the mask bits. If a
3580 /// mask bit is zero, the corresponding value from vector \a a is gathered;
3581 /// otherwise the value is loaded from memory.
3583 /// A literal constant scale factor for the indexes in \a i. Must be
3585 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
/* Builtin operands, in order: pass-through a, base pointer m, indexes i,
   mask, scale s. */
#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherd_q256( \
      (__v4di)(__m256i)(a), \
      (long long const *)(m), \
      (__v4si)(__m128i)(i), \
      (__v4di)(__m256i)(mask), \
      (s)))
/// Performs a masked gather of two 64-bit integer values. Each result
///    element comes either from the 128-bit vector of [2 x i64] in \a a, or
///    from memory \a m addressed with a scaled index taken from the 128-bit
///    vector of [2 x i64] in \a i. The 128-bit vector of [2 x i64] in
///    \a mask selects the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i64gather_epi64(__m128i a, const long long *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
///
/// \param a
///    A 128-bit vector of [2 x i64] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [2 x i64] containing the mask. The most significant
///    bit of each element is that element's mask bit: when it is zero the
///    value is taken from \a a, otherwise it is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
#define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_q( \
      (__v2di)(__m128i)(a), (long long const *)(m), (__v2di)(__m128i)(i), \
      (__v2di)(__m128i)(mask), (s)))
/// Performs a masked gather of four 64-bit integer values. Each result
///    element comes either from the 256-bit vector of [4 x i64] in \a a, or
///    from memory \a m addressed with a scaled index taken from the 256-bit
///    vector of [4 x i64] in \a i. The 256-bit vector of [4 x i64] in
///    \a mask selects the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mask_i64gather_epi64(__m256i a, const long long *m,
///                                     __m256i i, __m256i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
///
/// \param a
///    A 256-bit vector of [4 x i64] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x i64] containing the mask. The most significant
///    bit of each element is that element's mask bit: when it is zero the
///    value is taken from \a a, otherwise it is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherq_q256( \
      (__v4di)(__m256i)(a), (long long const *)(m), (__v4di)(__m256i)(i), \
      (__v4di)(__m256i)(mask), (s)))
/// Gathers two 64-bit floating-point values from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_i32gather_pd(const double *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
///    the first two elements are used.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
// The first builtin operand is an undefined vector and the mask operand is
// the result of comparing zero with zero for equality (all bits set in every
// lane), so every element is loaded from memory.
#define _mm_i32gather_pd(m, i, s) \
  ((__m128d)__builtin_ia32_gatherd_pd( \
      (__v2df)_mm_undefined_pd(), (double const *)(m), \
      (__v4si)(__m128i)(i), \
      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), _mm_setzero_pd()), \
      (s)))
/// Gathers four 64-bit floating-point values from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_i32gather_pd(const double *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
// The first builtin operand is an undefined vector and the mask operand is
// the result of an ordered-equal compare of zero with zero (all bits set in
// every lane), so every element is loaded from memory.
#define _mm256_i32gather_pd(m, i, s) \
  ((__m256d)__builtin_ia32_gatherd_pd256( \
      (__v4df)_mm256_undefined_pd(), (double const *)(m), \
      (__v4si)(__m128i)(i), \
      (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), \
                            _CMP_EQ_OQ), \
      (s)))
/// Gathers two 64-bit floating-point values from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_i64gather_pd(const double *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
// The first builtin operand is an undefined vector and the mask operand is
// the result of comparing zero with zero for equality (all bits set in every
// lane), so every element is loaded from memory.
#define _mm_i64gather_pd(m, i, s) \
  ((__m128d)__builtin_ia32_gatherq_pd( \
      (__v2df)_mm_undefined_pd(), (double const *)(m), \
      (__v2di)(__m128i)(i), \
      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), _mm_setzero_pd()), \
      (s)))
/// Gathers four 64-bit floating-point values from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*64
///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_i64gather_pd(const double *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
// The first builtin operand is an undefined vector and the mask operand is
// the result of an ordered-equal compare of zero with zero (all bits set in
// every lane), so every element is loaded from memory.
#define _mm256_i64gather_pd(m, i, s) \
  ((__m256d)__builtin_ia32_gatherq_pd256( \
      (__v4df)_mm256_undefined_pd(), (double const *)(m), \
      (__v4di)(__m256i)(i), \
      (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), \
                            _CMP_EQ_OQ), \
      (s)))
/// Gathers four 32-bit floating-point values from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*32
///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_i32gather_ps(const float *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
// The first builtin operand is an undefined vector and the mask operand is
// the result of comparing zero with zero for equality (all bits set in every
// lane), so every element is loaded from memory.
#define _mm_i32gather_ps(m, i, s) \
  ((__m128)__builtin_ia32_gatherd_ps( \
      (__v4sf)_mm_undefined_ps(), (float const *)(m), \
      (__v4si)(__m128i)(i), \
      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()), \
      (s)))
/// Gathers eight 32-bit floating-point values from memory \a m using scaled
///    indexes from the 256-bit vector of [8 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 7
///   j := element*32
///   k := element*32
///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256 _mm256_i32gather_ps(const float *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [8 x float] containing the gathered values.
// The first builtin operand is an undefined vector and the mask operand is
// the result of an ordered-equal compare of zero with zero (all bits set in
// every lane), so every element is loaded from memory.
#define _mm256_i32gather_ps(m, i, s) \
  ((__m256)__builtin_ia32_gatherd_ps256( \
      (__v8sf)_mm256_undefined_ps(), (float const *)(m), \
      (__v8si)(__m256i)(i), \
      (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), \
                            _CMP_EQ_OQ), \
      (s)))
/// Gathers two 32-bit floating-point values from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i. The upper two
///    elements of the result are zeroed.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*32
///   k := element*64
///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// result[127:64] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_i64gather_ps(const float *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
// The first builtin operand is an undefined vector and the mask operand is
// the result of comparing zero with zero for equality (all bits set in every
// lane), so no element is taken from the undefined operand.
#define _mm_i64gather_ps(m, i, s) \
  ((__m128)__builtin_ia32_gatherq_ps( \
      (__v4sf)_mm_undefined_ps(), (float const *)(m), \
      (__v2di)(__m128i)(i), \
      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()), \
      (s)))
/// Gathers four 32-bit floating-point values from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*64
///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm256_i64gather_ps(const float *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
// Four 64-bit indexes yield four 32-bit results, so the pass-through and mask
// operands are 128-bit vectors. The mask is the result of comparing zero with
// zero for equality (all bits set in every lane), so every element is loaded
// from memory.
#define _mm256_i64gather_ps(m, i, s) \
  ((__m128)__builtin_ia32_gatherq_ps256( \
      (__v4sf)_mm_undefined_ps(), (float const *)(m), \
      (__v4di)(__m256i)(i), \
      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()), \
      (s)))
/// Gathers four 32-bit integer values from memory \a m using scaled indexes
///    from the 128-bit vector of [4 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*32
///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_i32gather_epi32(const int *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
// The first builtin operand is an undefined vector and the mask operand has
// every element set to -1, so every element is loaded from memory.
#define _mm_i32gather_epi32(m, i, s) \
  ((__m128i)__builtin_ia32_gatherd_d( \
      (__v4si)_mm_undefined_si128(), (int const *)(m), \
      (__v4si)(__m128i)(i), \
      (__v4si)_mm_set1_epi32(-1), \
      (s)))
/// Gathers eight 32-bit integer values from memory \a m using scaled indexes
///    from the 256-bit vector of [8 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 7
///   j := element*32
///   k := element*32
///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_i32gather_epi32(const int *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
// The first builtin operand is an undefined vector and the mask operand has
// every element set to -1, so every element is loaded from memory.
#define _mm256_i32gather_epi32(m, i, s) \
  ((__m256i)__builtin_ia32_gatherd_d256( \
      (__v8si)_mm256_undefined_si256(), (int const *)(m), \
      (__v8si)(__m256i)(i), \
      (__v8si)_mm256_set1_epi32(-1), \
      (s)))
/// Loads two 32-bit integer values from memory \a m at scaled offsets given
///    by the 128-bit vector of [2 x i64] in \a i. The upper two elements of
///    the result are zeroed.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*32
///   k := element*64
///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// result[127:64] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_i64gather_epi32(const int *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
// The first builtin operand is an undefined vector and the mask operand has
// every element set to -1, so no element is taken from the undefined operand.
#define _mm_i64gather_epi32(m, i, s) \
  ((__m128i)__builtin_ia32_gatherq_d( \
      (__v4si)_mm_undefined_si128(), \
      (int const *)(m), \
      (__v2di)(__m128i)(i), \
      (__v4si)_mm_set1_epi32(-1), \
      (s)))
/// Loads four 32-bit integer values from memory \a m at scaled offsets given
///    by the 256-bit vector of [4 x i64] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*64
///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm256_i64gather_epi32(const int *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
// Four 64-bit indexes yield four 32-bit results, so the pass-through and mask
// operands are 128-bit vectors. The mask has every element set to -1, so
// every element is loaded from memory.
#define _mm256_i64gather_epi32(m, i, s) \
  ((__m128i)__builtin_ia32_gatherq_d256( \
      (__v4si)_mm_undefined_si128(), \
      (int const *)(m), \
      (__v4di)(__m256i)(i), \
      (__v4si)_mm_set1_epi32(-1), \
      (s)))
/// Loads two 64-bit integer values from memory \a m at scaled offsets given
///    by the 128-bit vector of [4 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_i32gather_epi64(const long long *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
///    the first two elements are used.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
// The first builtin operand is an undefined vector and the mask operand has
// every element set to -1, so every element is loaded from memory.
#define _mm_i32gather_epi64(m, i, s) \
  ((__m128i)__builtin_ia32_gatherd_q( \
      (__v2di)_mm_undefined_si128(), (long long const *)(m), \
      (__v4si)(__m128i)(i), (__v2di)_mm_set1_epi64x(-1), (s)))
/// Loads four 64-bit integer values from memory \a m at scaled offsets given
///    by the 128-bit vector of [4 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_i32gather_epi64(const long long *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
// The first builtin operand is an undefined vector and the mask operand has
// every element set to -1, so every element is loaded from memory.
#define _mm256_i32gather_epi64(m, i, s) \
  ((__m256i)__builtin_ia32_gatherd_q256( \
      (__v4di)_mm256_undefined_si256(), (long long const *)(m), \
      (__v4si)(__m128i)(i), (__v4di)_mm256_set1_epi64x(-1), (s)))
/// Loads two 64-bit integer values from memory \a m at scaled offsets given
///    by the 128-bit vector of [2 x i64] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_i64gather_epi64(const long long *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
// The first builtin operand is an undefined vector and the mask operand has
// every element set to -1, so every element is loaded from memory.
#define _mm_i64gather_epi64(m, i, s) \
  ((__m128i)__builtin_ia32_gatherq_q( \
      (__v2di)_mm_undefined_si128(), (long long const *)(m), \
      (__v2di)(__m128i)(i), (__v2di)_mm_set1_epi64x(-1), (s)))
/// Loads four 64-bit integer values from memory \a m at scaled offsets given
///    by the 256-bit vector of [4 x i64] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*64
///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_i64gather_epi64(const long long *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
// The first builtin operand is an undefined vector and the mask operand has
// every element set to -1, so every element is loaded from memory.
#define _mm256_i64gather_epi64(m, i, s) \
  ((__m256i)__builtin_ia32_gatherq_q256( \
      (__v4di)_mm256_undefined_si256(), (long long const *)(m), \
      (__v4di)(__m256i)(i), (__v4di)_mm256_set1_epi64x(-1), (s)))
/* Remove the default function-attribute macros that were defined at the top
 * of this file, then close the __AVX2INTRIN_H include guard. */
4237 #undef __DEFAULT_FN_ATTRS256
4238 #undef __DEFAULT_FN_ATTRS128
4240 #endif /* __AVX2INTRIN_H */