clang/lib/Headers/avx2intrin.h

   1 /*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
   2  *
   3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4  * See https://llvm.org/LICENSE.txt for license information.
   5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6  *
   7  *===-----------------------------------------------------------------------===
   8  */
   9
  10 #ifndef __IMMINTRIN_H
  11 #error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
  12 #endif
  13
  14 #ifndef __AVX2INTRIN_H
  15 #define __AVX2INTRIN_H
  16
  17 /* Define the default attributes for the functions in this file: always
      * inlined, no debug info, compiled for the "avx2" target, and annotated
      * with the minimum vector width (256 or 128 bits) named by the suffix. */
  18 #define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx2"), __min_vector_width__(256)))
  19 #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx2"), __min_vector_width__(128)))
  20
  21 /* SSE4 Multiple Packed Sums of Absolute Difference.
      *
      * Forwards X and Y (each reinterpreted as __v32qi) and the control value M
      * (converted to int) to __builtin_ia32_mpsadbw256 and returns the result as
      * an __m256i. The name suggests this is the 256-bit form of MPSADBW.
      * NOTE(review): presumably a macro rather than a function because M has to
      * be an immediate -- confirm against the builtin's definition. */
  22 #define _mm256_mpsadbw_epu8(X, Y, M) \
  23   ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \
  24                                       (__v32qi)(__m256i)(Y), (int)(M)))
  25
  /// Computes the absolute value of each signed byte in the 256-bit integer
  ///    vector \a __a and returns each value in the corresponding byte of the
  ///    256-bit result.
  ///
  /// \headerfile <immintrin.h>
  ///
  /// This intrinsic corresponds to the \c VPABSB instruction.
  ///
  /// \param __a
  ///    A 256-bit integer vector, interpreted as 32 signed 8-bit elements.
  /// \returns A 256-bit integer vector containing the absolute values.
  26 static __inline__ __m256i __DEFAULT_FN_ATTRS256
  27 _mm256_abs_epi8(__m256i __a)
  28 {
  29     return (__m256i)__builtin_elementwise_abs((__v32qs)__a);
  30 }
  31
  /// Computes the absolute value of each signed 16-bit element in the 256-bit
  ///    vector of [16 x i16] in \a __a and returns each value in the
  ///    corresponding element of the [16 x i16] result.
  ///
  /// \headerfile <immintrin.h>
  ///
  /// This intrinsic corresponds to the \c VPABSW instruction.
  ///
  /// \param __a
  ///    A 256-bit vector of [16 x i16].
  /// \returns A 256-bit vector of [16 x i16] containing the absolute values.
  32 static __inline__ __m256i __DEFAULT_FN_ATTRS256
  33 _mm256_abs_epi16(__m256i __a)
  34 {
  35     return (__m256i)__builtin_elementwise_abs((__v16hi)__a);
  36 }
  37
  /// Computes the absolute value of each signed 32-bit element in the 256-bit
  ///    vector of [8 x i32] in \a __a and returns each value in the
  ///    corresponding element of the [8 x i32] result.
  ///
  /// \headerfile <immintrin.h>
  ///
  /// This intrinsic corresponds to the \c VPABSD instruction.
  ///
  /// \param __a
  ///    A 256-bit vector of [8 x i32].
  /// \returns A 256-bit vector of [8 x i32] containing the absolute values.
  38 static __inline__ __m256i __DEFAULT_FN_ATTRS256
  39 _mm256_abs_epi32(__m256i __a)
  40 {
  41     return (__m256i)__builtin_elementwise_abs((__v8si)__a);
  42 }
  43
  44 /// Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit
  45 ///    integers using signed saturation, and returns the 256-bit result.
  46 ///
  47 /// \code{.operation}
  48 /// FOR i := 0 TO 7
  49 ///   j := i*16
  50 ///   k := i*8
  51 ///   result[7+k:k] := SATURATE8(__a[15+j:j])
  52 ///   result[71+k:64+k] := SATURATE8(__b[15+j:j])
  53 ///   result[135+k:128+k] := SATURATE8(__a[143+j:128+j])
  54 ///   result[199+k:192+k] := SATURATE8(__b[143+j:128+j])
  55 /// ENDFOR
  56 /// \endcode
  57 ///
  58 /// \headerfile <immintrin.h>
  59 ///
  60 /// This intrinsic corresponds to the \c VPACKSSWB instruction.
  61 ///
  62 /// \param __a
  63 ///    A 256-bit vector of [16 x i16] used to generate result[63:0] and
  64 ///    result[191:128].
  65 /// \param __b
  66 ///    A 256-bit vector of [16 x i16] used to generate result[127:64] and
  67 ///    result[255:192].
  68 /// \returns A 256-bit vector of [32 x i8] containing the result.
  69 static __inline__ __m256i __DEFAULT_FN_ATTRS256
  70 _mm256_packs_epi16(__m256i __a, __m256i __b)
  71 {
  72   return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
  73 }
  74
  75 /// Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit
  76 ///    integers using signed saturation, and returns the resulting 256-bit
  77 ///    vector of [16 x i16]. The two 128-bit halves are packed independently.
  78 ///
  79 /// \code{.operation}
  80 /// FOR i := 0 TO 3
  81 ///   j := i*32
  82 ///   k := i*16
  83 ///   result[15+k:k] := SATURATE16(__a[31+j:j])
  84 ///   result[79+k:64+k] := SATURATE16(__b[31+j:j])
  85 ///   result[143+k:128+k] := SATURATE16(__a[159+j:128+j])
  86 ///   result[207+k:192+k] := SATURATE16(__b[159+j:128+j])
  87 /// ENDFOR
  88 /// \endcode
  89 ///
  90 /// \headerfile <immintrin.h>
  91 ///
  92 /// This intrinsic corresponds to the \c VPACKSSDW instruction.
  93 ///
  94 /// \param __a
  95 ///    A 256-bit vector of [8 x i32] used to generate result[63:0] and
  96 ///    result[191:128].
  97 /// \param __b
  98 ///    A 256-bit vector of [8 x i32] used to generate result[127:64] and
  99 ///    result[255:192].
 100 /// \returns A 256-bit vector of [16 x i16] containing the result.
 101 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 102 _mm256_packs_epi32(__m256i __a, __m256i __b)
 103 {
 104   return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
 105 }
 106
 107 /// Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers
 108 ///    using unsigned saturation, and returns the 256-bit result.
 109 ///
 110 /// \code{.operation}
 111 /// FOR i := 0 TO 7
 112 ///   j := i*16
 113 ///   k := i*8
 114 ///   result[7+k:k] := SATURATE8U(__a[15+j:j])
 115 ///   result[71+k:64+k] := SATURATE8U(__b[15+j:j])
 116 ///   result[135+k:128+k] := SATURATE8U(__a[143+j:128+j])
 117 ///   result[199+k:192+k] := SATURATE8U(__b[143+j:128+j])
 118 /// ENDFOR
 119 /// \endcode
 120 ///
 121 /// \headerfile <immintrin.h>
 122 ///
 123 /// This intrinsic corresponds to the \c VPACKUSWB instruction.
 124 ///
 125 /// \param __a
 126 ///    A 256-bit vector of [16 x i16] used to generate result[63:0] and
 127 ///    result[191:128].
 128 /// \param __b
 129 ///    A 256-bit vector of [16 x i16] used to generate result[127:64] and
 130 ///    result[255:192].
 131 /// \returns A 256-bit vector of [32 x i8] containing the result.
 132 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 133 _mm256_packus_epi16(__m256i __a, __m256i __b)
 134 {
 135   return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
 136 }
 137
 138 /// Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers
 139 ///    using unsigned saturation, and returns the resulting 256-bit vector of
 140 ///    [16 x i16]. The two 128-bit halves are packed independently.
 141 ///
 142 /// \code{.operation}
 143 /// FOR i := 0 TO 3
 144 ///   j := i*32
 145 ///   k := i*16
 146 ///   result[15+k:k] := SATURATE16U(__V1[31+j:j])
 147 ///   result[79+k:64+k] := SATURATE16U(__V2[31+j:j])
 148 ///   result[143+k:128+k] := SATURATE16U(__V1[159+j:128+j])
 149 ///   result[207+k:192+k] := SATURATE16U(__V2[159+j:128+j])
 150 /// ENDFOR
 151 /// \endcode
 152 ///
 153 /// \headerfile <immintrin.h>
 154 ///
 155 /// This intrinsic corresponds to the \c VPACKUSDW instruction.
 156 ///
 157 /// \param __V1
 158 ///    A 256-bit vector of [8 x i32] used to generate result[63:0] and
 159 ///    result[191:128].
 160 /// \param __V2
 161 ///    A 256-bit vector of [8 x i32] used to generate result[127:64] and
 162 ///    result[255:192].
 163 /// \returns A 256-bit vector of [16 x i16] containing the result.
 164 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 165 _mm256_packus_epi32(__m256i __V1, __m256i __V2)
 166 {
 167   return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
 168 }
 169
 170 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
 171 ///    vectors and returns the lower 8 bits of each sum in the corresponding
 172 ///    byte of the 256-bit integer vector result (overflow is ignored).
 173 ///
 174 /// \headerfile <immintrin.h>
 175 ///
 176 /// This intrinsic corresponds to the \c VPADDB instruction.
 177 ///
 178 /// \param __a
 179 ///    A 256-bit integer vector containing one of the source operands.
 180 /// \param __b
 181 ///    A 256-bit integer vector containing one of the source operands.
 182 /// \returns A 256-bit integer vector containing the sums.
 183 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 184 _mm256_add_epi8(__m256i __a, __m256i __b)
 185 {
       /* Unsigned element arithmetic wraps on overflow (signed overflow would be
          undefined), matching the documented "overflow is ignored" behavior. */
 186   return (__m256i)((__v32qu)__a + (__v32qu)__b);
 187 }
 188
 189 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
 190 ///    [16 x i16] and returns the lower 16 bits of each sum in the
 191 ///    corresponding element of the [16 x i16] result (overflow is ignored).
 192 ///
 193 /// \headerfile <immintrin.h>
 194 ///
 195 /// This intrinsic corresponds to the \c VPADDW instruction.
 196 ///
 197 /// \param __a
 198 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 199 /// \param __b
 200 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 201 /// \returns A 256-bit vector of [16 x i16] containing the sums.
 202 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 203 _mm256_add_epi16(__m256i __a, __m256i __b)
 204 {
       /* Unsigned element arithmetic wraps on overflow (signed overflow would be
          undefined), matching the documented "overflow is ignored" behavior. */
 205   return (__m256i)((__v16hu)__a + (__v16hu)__b);
 206 }
 207
 208 /// Adds 32-bit integers from corresponding elements of two 256-bit vectors of
 209 ///    [8 x i32] and returns the lower 32 bits of each sum in the corresponding
 210 ///    element of the [8 x i32] result (overflow is ignored).
 211 ///
 212 /// \headerfile <immintrin.h>
 213 ///
 214 /// This intrinsic corresponds to the \c VPADDD instruction.
 215 ///
 216 /// \param __a
 217 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
 218 /// \param __b
 219 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
 220 /// \returns A 256-bit vector of [8 x i32] containing the sums.
 221 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 222 _mm256_add_epi32(__m256i __a, __m256i __b)
 223 {
       /* Unsigned element arithmetic wraps on overflow (signed overflow would be
          undefined), matching the documented "overflow is ignored" behavior. */
 224   return (__m256i)((__v8su)__a + (__v8su)__b);
 225 }
 226
 227 /// Adds 64-bit integers from corresponding elements of two 256-bit vectors of
 228 ///    [4 x i64] and returns the lower 64 bits of each sum in the corresponding
 229 ///    element of the [4 x i64] result (overflow is ignored).
 230 ///
 231 /// \headerfile <immintrin.h>
 232 ///
 233 /// This intrinsic corresponds to the \c VPADDQ instruction.
 234 ///
 235 /// \param __a
 236 ///    A 256-bit vector of [4 x i64] containing one of the source operands.
 237 /// \param __b
 238 ///    A 256-bit vector of [4 x i64] containing one of the source operands.
 239 /// \returns A 256-bit vector of [4 x i64] containing the sums.
 240 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 241 _mm256_add_epi64(__m256i __a, __m256i __b)
 242 {
       /* Unsigned element arithmetic wraps on overflow (signed overflow would be
          undefined), matching the documented "overflow is ignored" behavior. */
 243   return (__m256i)((__v4du)__a + (__v4du)__b);
 244 }
 245
 246 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
 247 ///    vectors using signed saturation, and returns each sum in the
 248 ///    corresponding byte of the 256-bit integer vector result.
 249 ///
 250 /// \headerfile <immintrin.h>
 251 ///
 252 /// This intrinsic corresponds to the \c VPADDSB instruction.
 253 ///
 254 /// \param __a
 255 ///    A 256-bit integer vector containing one of the source operands.
 256 /// \param __b
 257 ///    A 256-bit integer vector containing one of the source operands.
 258 /// \returns A 256-bit integer vector containing the saturated sums.
 259 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 260 _mm256_adds_epi8(__m256i __a, __m256i __b)
 261 {
 262   return (__m256i)__builtin_elementwise_add_sat((__v32qs)__a, (__v32qs)__b);
 263 }
 264
 265 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
 266 ///    [16 x i16] using signed saturation, and returns the [16 x i16] result.
 267 ///
 268 /// \headerfile <immintrin.h>
 269 ///
 270 /// This intrinsic corresponds to the \c VPADDSW instruction.
 271 ///
 272 /// \param __a
 273 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 274 /// \param __b
 275 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 276 /// \returns A 256-bit vector of [16 x i16] containing the saturated sums.
 277 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 278 _mm256_adds_epi16(__m256i __a, __m256i __b)
 279 {
 280   return (__m256i)__builtin_elementwise_add_sat((__v16hi)__a, (__v16hi)__b);
 281 }
 282
 283 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
 284 ///    vectors using unsigned saturation, and returns each sum in the
 285 ///    corresponding byte of the 256-bit integer vector result.
 286 ///
 287 /// \headerfile <immintrin.h>
 288 ///
 289 /// This intrinsic corresponds to the \c VPADDUSB instruction.
 290 ///
 291 /// \param __a
 292 ///    A 256-bit integer vector containing one of the source operands.
 293 /// \param __b
 294 ///    A 256-bit integer vector containing one of the source operands.
 295 /// \returns A 256-bit integer vector containing the saturated sums.
 296 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 297 _mm256_adds_epu8(__m256i __a, __m256i __b)
 298 {
 299   return (__m256i)__builtin_elementwise_add_sat((__v32qu)__a, (__v32qu)__b);
 300 }
 301
 302 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
 303 ///    [16 x i16] using unsigned saturation, and returns the [16 x i16] result.
 304 ///
 305 /// \headerfile <immintrin.h>
 306 ///
 307 /// This intrinsic corresponds to the \c VPADDUSW instruction.
 308 ///
 309 /// \param __a
 310 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 311 /// \param __b
 312 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 313 /// \returns A 256-bit vector of [16 x i16] containing the saturated sums.
 314 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 315 _mm256_adds_epu16(__m256i __a, __m256i __b)
 316 {
 317   return (__m256i)__builtin_elementwise_add_sat((__v16hu)__a, (__v16hu)__b);
 318 }
 319
 320 /// Uses the lower half of the 256-bit vector \a a as the upper half of a
 321 ///    temporary 256-bit value, and the lower half of the 256-bit vector \a b
 322 ///    as the lower half of the temporary value. Right-shifts the temporary
 323 ///    value by \a n bytes, and uses the lower 16 bytes of the shifted value
 324 ///    as the lower 16 bytes of the result. Uses the upper halves of \a a and
 325 ///    \a b to make another temporary value, right shifts by \a n, and uses
 326 ///    the lower 16 bytes of the shifted value as the upper 16 bytes of the
 327 ///    result.
 328 ///
 329 /// \headerfile <immintrin.h>
 330 ///
 331 /// \code
 332 /// __m256i _mm256_alignr_epi8(__m256i a, __m256i b, const int n);
 333 /// \endcode
 334 ///
 335 /// This intrinsic corresponds to the \c VPALIGNR instruction.
 336 ///
 337 /// \param a
 338 ///    A 256-bit integer vector containing source values.
 339 /// \param b
 340 ///    A 256-bit integer vector containing source values.
 341 /// \param n
 342 ///    An immediate value specifying the number of bytes to shift. It is
     ///    forwarded to the builtin as written; no (int) cast is applied.
 343 /// \returns A 256-bit integer vector containing the result.
 344 #define _mm256_alignr_epi8(a, b, n) \
 345   ((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
 346                                       (__v32qi)(__m256i)(b), (n)))
 347
 /// Computes the bitwise AND of the 256-bit integer vectors in \a __a and
 ///    \a __b.
 ///
 /// \headerfile <immintrin.h>
 ///
 /// This intrinsic corresponds to the \c VPAND instruction.
 ///
 /// \param __a
 ///    A 256-bit integer vector.
 /// \param __b
 ///    A 256-bit integer vector.
 /// \returns A 256-bit integer vector containing the result.
 348 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 349 _mm256_and_si256(__m256i __a, __m256i __b)
 350 {
 351   return (__m256i)((__v4du)__a & (__v4du)__b);
 352 }
 353
 /// Computes the bitwise AND of the 256-bit integer vector in \a __b with
 ///    the bitwise NOT of the 256-bit integer vector in \a __a. Note that it
 ///    is the first operand, \a __a, that is inverted.
 ///
 /// \headerfile <immintrin.h>
 ///
 /// This intrinsic corresponds to the \c VPANDN instruction.
 ///
 /// \param __a
 ///    A 256-bit integer vector; its bitwise complement is used.
 /// \param __b
 ///    A 256-bit integer vector.
 /// \returns A 256-bit integer vector containing the result.
 354 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 355 _mm256_andnot_si256(__m256i __a, __m256i __b)
 356 {
 357   return (__m256i)(~(__v4du)__a & (__v4du)__b);
 358 }
 359
 /// Computes the averages of the corresponding unsigned bytes in the two
 ///    256-bit integer vectors in \a __a and \a __b and returns each average
 ///    in the corresponding byte of the 256-bit result.
 ///
 ///    NOTE(review): rounding is decided by the builtin and is not visible
 ///    here; \c VPAVGB is documented as (a + b + 1) >> 1 -- confirm.
 ///
 /// \headerfile <immintrin.h>
 ///
 /// This intrinsic corresponds to the \c VPAVGB instruction.
 ///
 /// \param __a
 ///    A 256-bit integer vector.
 /// \param __b
 ///    A 256-bit integer vector.
 /// \returns A 256-bit integer vector containing the result.
 360 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 361 _mm256_avg_epu8(__m256i __a, __m256i __b)
 362 {
 363   return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b);
 364 }
 365
 /// Computes the averages of the corresponding unsigned 16-bit integers in
 ///    the two 256-bit vectors of [16 x i16] in \a __a and \a __b and returns
 ///    each average in the corresponding element of the 256-bit result.
 ///
 ///    NOTE(review): rounding is decided by the builtin and is not visible
 ///    here; \c VPAVGW is documented as (a + b + 1) >> 1 -- confirm.
 ///
 /// \headerfile <immintrin.h>
 ///
 /// This intrinsic corresponds to the \c VPAVGW instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [16 x i16].
 /// \param __b
 ///    A 256-bit vector of [16 x i16].
 /// \returns A 256-bit vector of [16 x i16] containing the result.
 366 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 367 _mm256_avg_epu16(__m256i __a, __m256i __b)
 368 {
 369   return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b);
 370 }
 371
 372 /// Merges 8-bit integer values from either of the two 256-bit vectors
 373 ///    \a __V1 or \a __V2, as specified by the 256-bit mask \a __M and returns
 374 ///    the resulting 256-bit integer vector.
 375 ///
 376 /// \code{.operation}
 377 /// FOR i := 0 TO 31
 378 ///   j := i*8
 379 ///   IF __M[7+j] == 0
 380 ///     result[7+j:j] := __V1[7+j:j]
 381 ///   ELSE
 382 ///     result[7+j:j] := __V2[7+j:j]
 383 ///   FI
 384 /// ENDFOR
 385 /// \endcode
 386 ///
 387 /// \headerfile <immintrin.h>
 388 ///
 389 /// This intrinsic corresponds to the \c VPBLENDVB instruction.
 390 ///
 391 /// \param __V1
 392 ///    A 256-bit integer vector containing source values.
 393 /// \param __V2
 394 ///    A 256-bit integer vector containing source values.
 395 /// \param __M
 396 ///    A 256-bit integer vector, with bit [7] of each byte specifying the
 397 ///    source for each corresponding byte of the result. When the mask bit
 398 ///    is 0, the byte is copied from \a __V1; otherwise, it is copied from
 399 ///    \a __V2.
 400 /// \returns A 256-bit integer vector containing the result.
 401 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 402 _mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
 403 {
 404   return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,
 405                                               (__v32qi)__M);
 406 }
 407
 408 /// Merges 16-bit integer values from either of the two 256-bit vectors
 409 ///    \a V1 or \a V2, as specified by the immediate integer operand \a M,
 410 ///    and returns the resulting 256-bit vector of [16 x i16].
 411 ///
 412 /// \code{.operation}
 413 /// FOR i := 0 TO 7
 414 ///   j := i*16
 415 ///   IF M[i] == 0
 416 ///     result[15+j:j] := V1[15+j:j]
 417 ///     result[143+j:128+j] := V1[143+j:128+j]
 418 ///   ELSE
 419 ///     result[15+j:j] := V2[15+j:j]
 420 ///     result[143+j:128+j] := V2[143+j:128+j]
 421 ///   FI
 422 /// ENDFOR
 423 /// \endcode
 424 ///
 425 /// \headerfile <immintrin.h>
 426 ///
 427 /// \code
 428 /// __m256i _mm256_blend_epi16(__m256i V1, __m256i V2, const int M);
 429 /// \endcode
 430 ///
 431 /// This intrinsic corresponds to the \c VPBLENDW instruction.
 432 ///
 433 /// \param V1
 434 ///    A 256-bit vector of [16 x i16] containing source values.
 435 /// \param V2
 436 ///    A 256-bit vector of [16 x i16] containing source values.
 437 /// \param M
 438 ///    An immediate 8-bit integer operand, with bits [7:0] specifying the
 439 ///    source for each element of the result. The position of the mask bit
 440 ///    corresponds to the index of a copied value. When a mask bit is 0, the
 441 ///    element is copied from \a V1; otherwise, it is copied from \a V2.
 442 ///    \a M[0] determines the source for elements 0 and 8, \a M[1] for
 443 ///    elements 1 and 9, and so forth.
 444 /// \returns A 256-bit vector of [16 x i16] containing the result.
 445 #define _mm256_blend_epi16(V1, V2, M) \
 446   ((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \
 447                                       (__v16hi)(__m256i)(V2), (int)(M)))
 448
 /// Compares corresponding bytes in the 256-bit integer vectors in \a __a and
 ///    \a __b for equality and returns the outcomes in the corresponding
 ///    bytes of the 256-bit result.
 ///
 /// \code{.operation}
 /// FOR i := 0 TO 31
 ///   j := i*8
 ///   result[j+7:j] := (__a[j+7:j] == __b[j+7:j]) ? 0xFF : 0
 /// ENDFOR
 /// \endcode
 ///
 /// \headerfile <immintrin.h>
 ///
 /// This intrinsic corresponds to the \c VPCMPEQB instruction.
 ///
 /// \param __a
 ///    A 256-bit integer vector containing one of the inputs.
 /// \param __b
 ///    A 256-bit integer vector containing one of the inputs.
 /// \returns A 256-bit integer vector containing the result.
 449 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 450 _mm256_cmpeq_epi8(__m256i __a, __m256i __b)
 451 {
 452   return (__m256i)((__v32qi)__a == (__v32qi)__b);
 453 }
 454
 /// Compares corresponding elements in the 256-bit vectors of [16 x i16] in
 ///    \a __a and \a __b for equality and returns the outcomes in the
 ///    corresponding elements of the 256-bit result.
 ///
 /// \code{.operation}
 /// FOR i := 0 TO 15
 ///   j := i*16
 ///   result[j+15:j] := (__a[j+15:j] == __b[j+15:j]) ? 0xFFFF : 0
 /// ENDFOR
 /// \endcode
 ///
 /// \headerfile <immintrin.h>
 ///
 /// This intrinsic corresponds to the \c VPCMPEQW instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [16 x i16] containing one of the inputs.
 /// \param __b
 ///    A 256-bit vector of [16 x i16] containing one of the inputs.
 /// \returns A 256-bit vector of [16 x i16] containing the result.
 455 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 456 _mm256_cmpeq_epi16(__m256i __a, __m256i __b)
 457 {
 458   return (__m256i)((__v16hi)__a == (__v16hi)__b);
 459 }
 460
 /// Compares corresponding elements in the 256-bit vectors of [8 x i32] in
 ///    \a __a and \a __b for equality and returns the outcomes in the
 ///    corresponding elements of the 256-bit result.
 ///
 /// \code{.operation}
 /// FOR i := 0 TO 7
 ///   j := i*32
 ///   result[j+31:j] := (__a[j+31:j] == __b[j+31:j]) ? 0xFFFFFFFF : 0
 /// ENDFOR
 /// \endcode
 ///
 /// \headerfile <immintrin.h>
 ///
 /// This intrinsic corresponds to the \c VPCMPEQD instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [8 x i32] containing one of the inputs.
 /// \param __b
 ///    A 256-bit vector of [8 x i32] containing one of the inputs.
 /// \returns A 256-bit vector of [8 x i32] containing the result.
 461 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 462 _mm256_cmpeq_epi32(__m256i __a, __m256i __b)
 463 {
 464   return (__m256i)((__v8si)__a == (__v8si)__b);
 465 }
 466
 /// Compares corresponding elements in the 256-bit vectors of [4 x i64] in
 ///    \a __a and \a __b for equality and returns the outcomes in the
 ///    corresponding elements of the 256-bit result.
 ///
 /// \code{.operation}
 /// FOR i := 0 TO 3
 ///   j := i*64
 ///   result[j+63:j] := (__a[j+63:j] == __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
 /// ENDFOR
 /// \endcode
 ///
 /// \headerfile <immintrin.h>
 ///
 /// This intrinsic corresponds to the \c VPCMPEQQ instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [4 x i64] containing one of the inputs.
 /// \param __b
 ///    A 256-bit vector of [4 x i64] containing one of the inputs.
 /// \returns A 256-bit vector of [4 x i64] containing the result.
 467 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 468 _mm256_cmpeq_epi64(__m256i __a, __m256i __b)
 469 {
 470   return (__m256i)((__v4di)__a == (__v4di)__b);
 471 }
 472
 /// Compares corresponding signed bytes in the 256-bit integer vectors in
 ///    \a __a and \a __b for greater-than and returns the outcomes in the
 ///    corresponding bytes of the 256-bit result.
 ///
 /// \code{.operation}
 /// FOR i := 0 TO 31
 ///   j := i*8
 ///   result[j+7:j] := (__a[j+7:j] > __b[j+7:j]) ? 0xFF : 0
 /// ENDFOR
 /// \endcode
 ///
 /// \headerfile <immintrin.h>
 ///
 /// This intrinsic corresponds to the \c VPCMPGTB instruction.
 ///
 /// \param __a
 ///    A 256-bit integer vector containing one of the inputs.
 /// \param __b
 ///    A 256-bit integer vector containing one of the inputs.
 /// \returns A 256-bit integer vector containing the result.
 473 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 474 _mm256_cmpgt_epi8(__m256i __a, __m256i __b)
 475 {
 476   /* This function always performs a signed comparison, but __v32qi is a char
 477      which may be signed or unsigned, so use __v32qs. */
 478   return (__m256i)((__v32qs)__a > (__v32qs)__b);
 479 }
 480
 /// Compares corresponding signed elements in the 256-bit vectors of
 ///    [16 x i16] in \a __a and \a __b for greater-than and returns the
 ///    outcomes in the corresponding elements of the 256-bit result.
 ///
 /// \code{.operation}
 /// FOR i := 0 TO 15
 ///   j := i*16
 ///   result[j+15:j] := (__a[j+15:j] > __b[j+15:j]) ? 0xFFFF : 0
 /// ENDFOR
 /// \endcode
 ///
 /// \headerfile <immintrin.h>
 ///
 /// This intrinsic corresponds to the \c VPCMPGTW instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [16 x i16] containing one of the inputs.
 /// \param __b
 ///    A 256-bit vector of [16 x i16] containing one of the inputs.
 /// \returns A 256-bit vector of [16 x i16] containing the result.
 481 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 482 _mm256_cmpgt_epi16(__m256i __a, __m256i __b)
 483 {
 484   return (__m256i)((__v16hi)__a > (__v16hi)__b);
 485 }
 486
 /// Compares corresponding signed elements in the 256-bit vectors of
 ///    [8 x i32] in \a __a and \a __b for greater-than and returns the
 ///    outcomes in the corresponding elements of the 256-bit result.
 ///
 /// \code{.operation}
 /// FOR i := 0 TO 7
 ///   j := i*32
 ///   result[j+31:j] := (__a[j+31:j] > __b[j+31:j]) ? 0xFFFFFFFF : 0
 /// ENDFOR
 /// \endcode
 ///
 /// \headerfile <immintrin.h>
 ///
 /// This intrinsic corresponds to the \c VPCMPGTD instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [8 x i32] containing one of the inputs.
 /// \param __b
 ///    A 256-bit vector of [8 x i32] containing one of the inputs.
 /// \returns A 256-bit vector of [8 x i32] containing the result.
 487 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 488 _mm256_cmpgt_epi32(__m256i __a, __m256i __b)
 489 {
 490   return (__m256i)((__v8si)__a > (__v8si)__b);
 491 }
 492
 /// Compares corresponding signed elements in the 256-bit vectors of
 ///    [4 x i64] in \a __a and \a __b for greater-than and returns the
 ///    outcomes in the corresponding elements of the 256-bit result.
 ///
 /// \code{.operation}
 /// FOR i := 0 TO 3
 ///   j := i*64
 ///   result[j+63:j] := (__a[j+63:j] > __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
 /// ENDFOR
 /// \endcode
 ///
 /// \headerfile <immintrin.h>
 ///
 /// This intrinsic corresponds to the \c VPCMPGTQ instruction.
 ///
 /// \param __a
 ///    A 256-bit vector of [4 x i64] containing one of the inputs.
 /// \param __b
 ///    A 256-bit vector of [4 x i64] containing one of the inputs.
 /// \returns A 256-bit vector of [4 x i64] containing the result.
 493 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 494 _mm256_cmpgt_epi64(__m256i __a, __m256i __b)
 495 {
 496   return (__m256i)((__v4di)__a > (__v4di)__b);
 497 }
 498
 499 /// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
 500 ///    vectors of [16 x i16] and returns the lower 16 bits of each sum in an
 501 ///    element of the [16 x i16] result (overflow is ignored). Sums from
 502 ///    \a __a are returned in the lower 64 bits of each 128-bit half of the
 503 ///    result; sums from \a __b are returned in the upper 64 bits of each
 504 ///    128-bit half of the result.
 505 ///
 506 /// \code{.operation}
 507 /// FOR i := 0 TO 1
 508 ///   j := i*128
 509 ///   result[j+15:j] := __a[j+15:j] + __a[j+31:j+16]
 510 ///   result[j+31:j+16] := __a[j+47:j+32] + __a[j+63:j+48]
 511 ///   result[j+47:j+32] := __a[j+79:j+64] + __a[j+95:j+80]
 512 ///   result[j+63:j+48] := __a[j+111:j+96] + __a[j+127:j+112]
 513 ///   result[j+79:j+64] := __b[j+15:j] + __b[j+31:j+16]
 514 ///   result[j+95:j+80] := __b[j+47:j+32] + __b[j+63:j+48]
 515 ///   result[j+111:j+96] := __b[j+79:j+64] + __b[j+95:j+80]
 516 ///   result[j+127:j+112] := __b[j+111:j+96] + __b[j+127:j+112]
 517 /// ENDFOR
 518 /// \endcode
 519 ///
 520 /// \headerfile <immintrin.h>
 521 ///
 522 /// This intrinsic corresponds to the \c VPHADDW instruction.
 523 ///
 524 /// \param __a
 525 ///    A 256-bit vector of [16 x i16] used to generate result[63:0] and
     ///    result[191:128].
 526 /// \param __b
 527 ///    A 256-bit vector of [16 x i16] used to generate result[127:64] and
     ///    result[255:192].
 528 /// \returns A 256-bit vector of [16 x i16] containing the sums.
 529 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 530 _mm256_hadd_epi16(__m256i __a, __m256i __b)
 531 {
 532     return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
 533 }
 534
 535 /// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit
 536 ///    vectors of [8 x i32] and returns the lower 32 bits of each sum in an
 537 ///    element of the [8 x i32] result (overflow is ignored). Sums from \a __a
 538 ///    are returned in the lower 64 bits of each 128-bit half of the result;
 539 ///    sums from \a __b are returned in the upper 64 bits of each 128-bit half
 540 ///    of the result.
 541 ///
 542 /// \code{.operation}
 543 /// FOR i := 0 TO 1
 544 ///   j := i*128
 545 ///   result[j+31:j] := __a[j+31:j] + __a[j+63:j+32]
 546 ///   result[j+63:j+32] := __a[j+95:j+64] + __a[j+127:j+96]
 547 ///   result[j+95:j+64] := __b[j+31:j] + __b[j+63:j+32]
 548 ///   result[j+127:j+96] := __b[j+95:j+64] + __b[j+127:j+96]
 549 /// ENDFOR
 550 /// \endcode
 551 ///
 552 /// \headerfile <immintrin.h>
 553 ///
 554 /// This intrinsic corresponds to the \c VPHADDD instruction.
 555 ///
 556 /// \param __a
 557 ///    A 256-bit vector of [8 x i32] used to generate result[63:0] and
     ///    result[191:128].
 558 /// \param __b
 559 ///    A 256-bit vector of [8 x i32] used to generate result[127:64] and
     ///    result[255:192].
 560 /// \returns A 256-bit vector of [8 x i32] containing the sums.
 561 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 562 _mm256_hadd_epi32(__m256i __a, __m256i __b)
 563 {
 564     return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
 565 }
 566
 567 /// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
 568 ///    vectors of [16 x i16] using signed saturation and returns each sum in
 569 ///    an element of the [16 x i16] result. Sums from \a __a are returned in
 570 ///    the lower 64 bits of each 128-bit half of the result; sums from \a __b
 571 ///    are returned in the upper 64 bits of each 128-bit half of the result.
 572 ///
 573 /// \code{.operation}
 574 /// FOR i := 0 TO 1
 575 ///   j := i*128
 576 ///   result[j+15:j] := SATURATE16(__a[j+15:j] + __a[j+31:j+16])
 577 ///   result[j+31:j+16] := SATURATE16(__a[j+47:j+32] + __a[j+63:j+48])
 578 ///   result[j+47:j+32] := SATURATE16(__a[j+79:j+64] + __a[j+95:j+80])
 579 ///   result[j+63:j+48] := SATURATE16(__a[j+111:j+96] + __a[j+127:j+112])
 580 ///   result[j+79:j+64] := SATURATE16(__b[j+15:j] + __b[j+31:j+16])
 581 ///   result[j+95:j+80] := SATURATE16(__b[j+47:j+32] + __b[j+63:j+48])
 582 ///   result[j+111:j+96] := SATURATE16(__b[j+79:j+64] + __b[j+95:j+80])
 583 ///   result[j+127:j+112] := SATURATE16(__b[j+111:j+96] + __b[j+127:j+112])
 584 /// ENDFOR
 585 /// \endcode
 586 ///
 587 /// \headerfile <immintrin.h>
 588 ///
 589 /// This intrinsic corresponds to the \c VPHADDSW instruction.
 590 ///
 591 /// \param __a
 592 ///    A 256-bit vector of [16 x i16] used to generate result[63:0] and
     ///    result[191:128].
 593 /// \param __b
 594 ///    A 256-bit vector of [16 x i16] used to generate result[127:64] and
     ///    result[255:192].
 595 /// \returns A 256-bit vector of [16 x i16] containing the sums.
 596 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 597 _mm256_hadds_epi16(__m256i __a, __m256i __b)
 598 {
 599     return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
 600 }
 601
 602 /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
 603 ///    vectors of [16 x i16] and returns the lower 16 bits of each difference
 604 ///    in an element of the [16 x i16] result (overflow is ignored).
 605 ///    Differences from \a __a are returned in the lower 64 bits of each
 606 ///    128-bit half of the result; differences from \a __b are returned in the
 607 ///    upper 64 bits of each 128-bit half of the result.
 608 ///
 609 /// \code{.operation}
 610 /// FOR i := 0 TO 1
 611 ///   j := i*128
 612 ///   result[j+15:j] := __a[j+15:j] - __a[j+31:j+16]
 613 ///   result[j+31:j+16] := __a[j+47:j+32] - __a[j+63:j+48]
 614 ///   result[j+47:j+32] := __a[j+79:j+64] - __a[j+95:j+80]
 615 ///   result[j+63:j+48] := __a[j+111:j+96] - __a[j+127:j+112]
 616 ///   result[j+79:j+64] := __b[j+15:j] - __b[j+31:j+16]
 617 ///   result[j+95:j+80] := __b[j+47:j+32] - __b[j+63:j+48]
 618 ///   result[j+111:j+96] := __b[j+79:j+64] - __b[j+95:j+80]
 619 ///   result[j+127:j+112] := __b[j+111:j+96] - __b[j+127:j+112]
 620 /// ENDFOR
 621 /// \endcode
 622 ///
 623 /// \headerfile <immintrin.h>
 624 ///
 625 /// This intrinsic corresponds to the \c VPHSUBW instruction.
 626 ///
 627 /// \param __a
 628 ///    A 256-bit vector of [16 x i16] used to generate result[63:0] and
     ///    result[191:128].
 629 /// \param __b
 630 ///    A 256-bit vector of [16 x i16] used to generate result[127:64] and
     ///    result[255:192].
 631 /// \returns A 256-bit vector of [16 x i16] containing the differences.
 632 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 633 _mm256_hsub_epi16(__m256i __a, __m256i __b)
 634 {
 635     return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
 636 }
 637
 638 /// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit
 639 ///    vectors of [8 x i32] and returns the lower 32 bits of each difference in
 640 ///    an element of the [8 x i32] result (overflow is ignored). Differences
 641 ///    from \a __a are returned in the lower 64 bits of each 128-bit half of
 642 ///    the result; differences from \a __b are returned in the upper 64 bits
 643 ///    of each 128-bit half of the result.
 644 ///
 645 /// \code{.operation}
 646 /// FOR i := 0 TO 1
 647 ///   j := i*128
 648 ///   result[j+31:j] := __a[j+31:j] - __a[j+63:j+32]
 649 ///   result[j+63:j+32] := __a[j+95:j+64] - __a[j+127:j+96]
 650 ///   result[j+95:j+64] := __b[j+31:j] - __b[j+63:j+32]
 651 ///   result[j+127:j+96] := __b[j+95:j+64] - __b[j+127:j+96]
 652 /// ENDFOR
 653 /// \endcode
 654 ///
 655 /// \headerfile <immintrin.h>
 656 ///
 657 /// This intrinsic corresponds to the \c VPHSUBD instruction.
 658 ///
 659 /// \param __a
 660 ///    A 256-bit vector of [8 x i32] used to generate result[63:0] and
     ///    result[191:128].
 661 /// \param __b
 662 ///    A 256-bit vector of [8 x i32] used to generate result[127:64] and
     ///    result[255:192].
 663 /// \returns A 256-bit vector of [8 x i32] containing the differences.
 664 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 665 _mm256_hsub_epi32(__m256i __a, __m256i __b)
 666 {
 667     return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
 668 }
 669
 670 /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
 671 ///    vectors of [16 x i16] using signed saturation and returns each
 672 ///    difference in an element of the [16 x i16] result. Differences from
 673 ///    \a __a are returned in the lower 64 bits of each 128-bit half of the
 674 ///    result; differences from \a __b are returned in the upper 64 bits of
 675 ///    each 128-bit half of the result.
 676 ///
 677 /// \code{.operation}
 678 /// FOR i := 0 TO 1
 679 ///   j := i*128
 680 ///   result[j+15:j] := SATURATE16(__a[j+15:j] - __a[j+31:j+16])
 681 ///   result[j+31:j+16] := SATURATE16(__a[j+47:j+32] - __a[j+63:j+48])
 682 ///   result[j+47:j+32] := SATURATE16(__a[j+79:j+64] - __a[j+95:j+80])
 683 ///   result[j+63:j+48] := SATURATE16(__a[j+111:j+96] - __a[j+127:j+112])
 684 ///   result[j+79:j+64] := SATURATE16(__b[j+15:j] - __b[j+31:j+16])
 685 ///   result[j+95:j+80] := SATURATE16(__b[j+47:j+32] - __b[j+63:j+48])
 686 ///   result[j+111:j+96] := SATURATE16(__b[j+79:j+64] - __b[j+95:j+80])
 687 ///   result[j+127:j+112] := SATURATE16(__b[j+111:j+96] - __b[j+127:j+112])
 688 /// ENDFOR
 689 /// \endcode
 690 ///
 691 /// \headerfile <immintrin.h>
 692 ///
 693 /// This intrinsic corresponds to the \c VPHSUBSW instruction.
 694 ///
 695 /// \param __a
 696 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 697 /// \param __b
 698 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 699 /// \returns A 256-bit vector of [16 x i16] containing the differences.
 700 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 701 _mm256_hsubs_epi16(__m256i __a, __m256i __b)
 702 {
 703     return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
 704 }
 705
 706 /// Multiplies each unsigned byte from the 256-bit integer vector in \a __a
 707 ///    with the corresponding signed byte from the 256-bit integer vector in
 708 ///    \a __b, forming signed 16-bit intermediate products. Adds adjacent
 709 ///    pairs of those products using signed saturation to form 16-bit sums
 710 ///    returned as elements of the [16 x i16] result.
 711 ///
 712 /// \code{.operation}
 713 /// FOR i := 0 TO 15
 714 ///   j := i*16
 715 ///   temp1 := __a[j+7:j] * __b[j+7:j]
 716 ///   temp2 := __a[j+15:j+8] * __b[j+15:j+8]
 717 ///   result[j+15:j] := SATURATE16(temp1 + temp2)
 718 /// ENDFOR
 719 /// \endcode
 720 ///
 721 /// \headerfile <immintrin.h>
 722 ///
 723 /// This intrinsic corresponds to the \c VPMADDUBSW instruction.
 724 ///
 725 /// \param __a
 726 ///    A 256-bit vector whose bytes are treated as unsigned integers.
 727 /// \param __b
 728 ///    A 256-bit vector whose bytes are treated as signed integers.
 729 /// \returns A 256-bit vector of [16 x i16] containing the result.
 730 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 731 _mm256_maddubs_epi16(__m256i __a, __m256i __b)
 732 {
 733     return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
 734 }
 735
 736 /// Multiplies corresponding 16-bit elements of two 256-bit vectors of
 737 ///    [16 x i16], forming 32-bit intermediate products, and adds adjacent
 738 ///    pairs of those products to form 32-bit sums returned as elements of
 739 ///    the [8 x i32] result.
 740 ///
 741 ///    There is only one wraparound case: when all four of the 16-bit sources
 742 ///    are \c 0x8000, the result will be \c 0x80000000.
 743 ///
 744 /// \code{.operation}
 745 /// FOR i := 0 TO 7
 746 ///   j := i*32
 747 ///   temp1 := __a[j+15:j] * __b[j+15:j]
 748 ///   temp2 := __a[j+31:j+16] * __b[j+31:j+16]
 749 ///   result[j+31:j] := temp1 + temp2
 750 /// ENDFOR
 751 /// \endcode
 752 ///
 753 /// \headerfile <immintrin.h>
 754 ///
 755 /// This intrinsic corresponds to the \c VPMADDWD instruction.
 756 ///
 757 /// \param __a
 758 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 759 /// \param __b
 760 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 761 /// \returns A 256-bit vector of [8 x i32] containing the result.
 762 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 763 _mm256_madd_epi16(__m256i __a, __m256i __b)
 764 {
 765   return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
 766 }
 767
 /// Compares the corresponding signed bytes in the two 256-bit integer
 ///    vectors in \a __a and \a __b and returns the larger of each pair in
 ///    the corresponding byte of the 256-bit result.
 ///
 /// \headerfile <immintrin.h>
 ///
 /// This intrinsic corresponds to the \c VPMAXSB instruction.
 ///
 /// \param __a
 ///    A 256-bit integer vector.
 /// \param __b
 ///    A 256-bit integer vector.
 /// \returns A 256-bit integer vector containing the result.
 768 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 769 _mm256_max_epi8(__m256i __a, __m256i __b)
 770 {
 771   return (__m256i)__builtin_elementwise_max((__v32qs)__a, (__v32qs)__b);
 772 }
 773
 774 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 775 _mm256_max_epi16(__m256i __a, __m256i __b)
 776 {
 777   return (__m256i)__builtin_elementwise_max((__v16hi)__a, (__v16hi)__b);
 778 }
 779
 780 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 781 _mm256_max_epi32(__m256i __a, __m256i __b)
 782 {
 783   return (__m256i)__builtin_elementwise_max((__v8si)__a, (__v8si)__b);
 784 }
 785
 786 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 787 _mm256_max_epu8(__m256i __a, __m256i __b)
 788 {
 789   return (__m256i)__builtin_elementwise_max((__v32qu)__a, (__v32qu)__b);
 790 }
 791
 792 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 793 _mm256_max_epu16(__m256i __a, __m256i __b)
 794 {
 795   return (__m256i)__builtin_elementwise_max((__v16hu)__a, (__v16hu)__b);
 796 }
 797
 798 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 799 _mm256_max_epu32(__m256i __a, __m256i __b)
 800 {
 801   return (__m256i)__builtin_elementwise_max((__v8su)__a, (__v8su)__b);
 802 }
 803
 804 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 805 _mm256_min_epi8(__m256i __a, __m256i __b)
 806 {
 807   return (__m256i)__builtin_elementwise_min((__v32qs)__a, (__v32qs)__b);
 808 }
 809
 810 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 811 _mm256_min_epi16(__m256i __a, __m256i __b)
 812 {
 813   return (__m256i)__builtin_elementwise_min((__v16hi)__a, (__v16hi)__b);
 814 }
 815
 816 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 817 _mm256_min_epi32(__m256i __a, __m256i __b)
 818 {
 819   return (__m256i)__builtin_elementwise_min((__v8si)__a, (__v8si)__b);
 820 }
 821
 822 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 823 _mm256_min_epu8(__m256i __a, __m256i __b)
 824 {
 825   return (__m256i)__builtin_elementwise_min((__v32qu)__a, (__v32qu)__b);
 826 }
 827
 828 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 829 _mm256_min_epu16(__m256i __a, __m256i __b)
 830 {
 831   return (__m256i)__builtin_elementwise_min((__v16hu)__a, (__v16hu)__b);
 832 }
 833
 834 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 835 _mm256_min_epu32(__m256i __a, __m256i __b)
 836 {
 837   return (__m256i)__builtin_elementwise_min((__v8su)__a, (__v8su)__b);
 838 }
 839
 840 static __inline__ int __DEFAULT_FN_ATTRS256
 841 _mm256_movemask_epi8(__m256i __a)
 842 {
 843   return __builtin_ia32_pmovmskb256((__v32qi)__a);
 844 }
 845
 846 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 847 _mm256_cvtepi8_epi16(__m128i __V)
 848 {
 849   /* This function always performs a signed extension, but __v16qi is a char
 850      which may be signed or unsigned, so use __v16qs. */
 851   return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi);
 852 }
 853
 854 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 855 _mm256_cvtepi8_epi32(__m128i __V)
 856 {
 857   /* This function always performs a signed extension, but __v16qi is a char
 858      which may be signed or unsigned, so use __v16qs. */
 859   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
 860 }
 861
 862 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 863 _mm256_cvtepi8_epi64(__m128i __V)
 864 {
 865   /* This function always performs a signed extension, but __v16qi is a char
 866      which may be signed or unsigned, so use __v16qs. */
 867   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di);
 868 }
 869
 870 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 871 _mm256_cvtepi16_epi32(__m128i __V)
 872 {
 873   return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si);
 874 }
 875
 876 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 877 _mm256_cvtepi16_epi64(__m128i __V)
 878 {
 879   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di);
 880 }
 881
 882 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 883 _mm256_cvtepi32_epi64(__m128i __V)
 884 {
 885   return (__m256i)__builtin_convertvector((__v4si)__V, __v4di);
 886 }
 887
 888 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 889 _mm256_cvtepu8_epi16(__m128i __V)
 890 {
 891   return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi);
 892 }
 893
 894 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 895 _mm256_cvtepu8_epi32(__m128i __V)
 896 {
 897   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
 898 }
 899
 900 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 901 _mm256_cvtepu8_epi64(__m128i __V)
 902 {
 903   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di);
 904 }
 905
 906 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 907 _mm256_cvtepu16_epi32(__m128i __V)
 908 {
 909   return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si);
 910 }
 911
 912 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 913 _mm256_cvtepu16_epi64(__m128i __V)
 914 {
 915   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di);
 916 }
 917
 918 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 919 _mm256_cvtepu32_epi64(__m128i __V)
 920 {
 921   return (__m256i)__builtin_convertvector((__v4su)__V, __v4di);
 922 }
 923
 924 /// Multiplies signed 32-bit integers from even-numbered elements of two
 925 ///    256-bit vectors of [8 x i32] and returns the 64-bit products in the
 926 ///    [4 x i64] result.
 927 ///
 928 /// \code{.operation}
 929 /// result[63:0] := __a[31:0] * __b[31:0]
 930 /// result[127:64] := __a[95:64] * __b[95:64]
 931 /// result[191:128] := __a[159:128] * __b[159:128]
 932 /// result[255:192] := __a[223:192] * __b[223:192]
 933 /// \endcode
 934 ///
 935 /// \headerfile <immintrin.h>
 936 ///
 937 /// This intrinsic corresponds to the \c VPMULDQ instruction.
 938 ///
 939 /// \param __a
 940 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
 941 /// \param __b
 942 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
 943 /// \returns A 256-bit vector of [4 x i64] containing the products.
 944 static __inline__  __m256i __DEFAULT_FN_ATTRS256
 945 _mm256_mul_epi32(__m256i __a, __m256i __b)
 946 {
 947   return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b);
 948 }
 949
 950 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
 951 ///    [16 x i16], truncates the 32-bit results to the most significant 18
 952 ///    bits, rounds by adding 1, and returns bits [16:1] of each rounded
 953 ///    product in the [16 x i16] result.
 954 ///
 955 /// \code{.operation}
 956 /// FOR i := 0 TO 15
 957 ///   j := i*16
 958 ///   temp := ((__a[j+15:j] * __b[j+15:j]) >> 14) + 1
///   result[j+15:j] := temp[16:1]
/// ENDFOR
/// \endcode
 961 ///
 962 /// \headerfile <immintrin.h>
 963 ///
 964 /// This intrinsic corresponds to the \c VPMULHRSW instruction.
 965 ///
 966 /// \param __a
 967 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 968 /// \param __b
 969 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 970 /// \returns A 256-bit vector of [16 x i16] containing the rounded products.
 971 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 972 _mm256_mulhrs_epi16(__m256i __a, __m256i __b)
 973 {
 974   return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b);
 975 }
 976
 977 /// Multiplies unsigned 16-bit integer elements of two 256-bit vectors of
 978 ///    [16 x i16], and returns the upper 16 bits of each 32-bit product in the
 979 ///    [16 x i16] result.
 980 ///
 981 /// \headerfile <immintrin.h>
 982 ///
 983 /// This intrinsic corresponds to the \c VPMULHUW instruction.
 984 ///
 985 /// \param __a
 986 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 987 /// \param __b
 988 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 989 /// \returns A 256-bit vector of [16 x i16] containing the products.
 990 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 991 _mm256_mulhi_epu16(__m256i __a, __m256i __b)
 992 {
 993   return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b);
 994 }
 995
 996 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
 997 ///    [16 x i16], and returns the upper 16 bits of each 32-bit product in the
 998 ///    [16 x i16] result.
 999 ///
1000 /// \headerfile <immintrin.h>
1001 ///
1002 /// This intrinsic corresponds to the \c VPMULHW instruction.
1003 ///
1004 /// \param __a
1005 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1006 /// \param __b
1007 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1008 /// \returns A 256-bit vector of [16 x i16] containing the products.
1009 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1010 _mm256_mulhi_epi16(__m256i __a, __m256i __b)
1011 {
1012   return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b);
1013 }
1014
1015 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1016 ///    [16 x i16], and returns the lower 16 bits of each 32-bit product in the
1017 ///    [16 x i16] result.
1018 ///
1019 /// \headerfile <immintrin.h>
1020 ///
1021 /// This intrinsic corresponds to the \c VPMULLW instruction.
1022 ///
1023 /// \param __a
1024 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1025 /// \param __b
1026 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1027 /// \returns A 256-bit vector of [16 x i16] containing the products.
1028 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1029 _mm256_mullo_epi16(__m256i __a, __m256i __b)
1030 {
1031   return (__m256i)((__v16hu)__a * (__v16hu)__b);
1032 }
1033
1034 /// Multiplies signed 32-bit integer elements of two 256-bit vectors of
1035 ///    [8 x i32], and returns the lower 32 bits of each 64-bit product in the
1036 ///    [8 x i32] result.
1037 ///
1038 /// \headerfile <immintrin.h>
1039 ///
1040 /// This intrinsic corresponds to the \c VPMULLD instruction.
1041 ///
1042 /// \param __a
1043 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
1044 /// \param __b
1045 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
1046 /// \returns A 256-bit vector of [8 x i32] containing the products.
1047 static __inline__  __m256i __DEFAULT_FN_ATTRS256
1048 _mm256_mullo_epi32 (__m256i __a, __m256i __b)
1049 {
1050   return (__m256i)((__v8su)__a * (__v8su)__b);
1051 }
1052
/// Multiplies unsigned 32-bit integers from even-numbered elements of two
1054 ///    256-bit vectors of [8 x i32] and returns the 64-bit products in the
1055 ///    [4 x i64] result.
1056 ///
1057 /// \code{.operation}
1058 /// result[63:0] := __a[31:0] * __b[31:0]
1059 /// result[127:64] := __a[95:64] * __b[95:64]
1060 /// result[191:128] := __a[159:128] * __b[159:128]
1061 /// result[255:192] := __a[223:192] * __b[223:192]
1062 /// \endcode
1063 ///
1064 /// \headerfile <immintrin.h>
1065 ///
1066 /// This intrinsic corresponds to the \c VPMULUDQ instruction.
1067 ///
1068 /// \param __a
1069 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
1070 /// \param __b
1071 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
1072 /// \returns A 256-bit vector of [4 x i64] containing the products.
1073 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1074 _mm256_mul_epu32(__m256i __a, __m256i __b)
1075 {
1076   return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b);
1077 }
1078
1079 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1080 _mm256_or_si256(__m256i __a, __m256i __b)
1081 {
1082   return (__m256i)((__v4du)__a | (__v4du)__b);
1083 }
1084
1085 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1086 _mm256_sad_epu8(__m256i __a, __m256i __b)
1087 {
1088   return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b);
1089 }
1090
1091 /// Shuffles 8-bit integers in the 256-bit integer vector \a __a according
1092 ///    to control information in the 256-bit integer vector \a __b, and
1093 ///    returns the 256-bit result. In effect there are two separate 128-bit
1094 ///    shuffles in the lower and upper halves.
1095 ///
1096 /// \code{.operation}
1097 /// FOR i := 0 TO 31
1098 ///   j := i*8
1099 ///   IF __b[j+7] == 1
1100 ///     result[j+7:j] := 0
1101 ///   ELSE
1102 ///     k := __b[j+3:j] * 8
1103 ///     IF i > 15
1104 ///       k := k + 128
1105 ///     FI
1106 ///     result[j+7:j] := __a[k+7:k]
1107 ///   FI
1108 /// ENDFOR
1109 /// \endcode
1110 ///
1111 /// \headerfile <immintrin.h>
1112 ///
1113 /// This intrinsic corresponds to the \c VPSHUFB instruction.
1114 ///
1115 /// \param __a
1116 ///    A 256-bit integer vector containing source values.
1117 /// \param __b
1118 ///    A 256-bit integer vector containing control information to determine
1119 ///    what goes into the corresponding byte of the result. If bit 7 of the
1120 ///    control byte is 1, the result byte is 0; otherwise, bits 3:0 of the
1121 ///    control byte specify the index (within the same 128-bit half) of \a __a
1122 ///    to copy to the result byte.
1123 /// \returns A 256-bit integer vector containing the result.
1124 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1125 _mm256_shuffle_epi8(__m256i __a, __m256i __b)
1126 {
1127   return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
1128 }
1129
1130 /// Shuffles 32-bit integers from the 256-bit vector of [8 x i32] in \a a
1131 ///    according to control information in the integer literal \a imm, and
1132 ///    returns the 256-bit result. In effect there are two parallel 128-bit
1133 ///    shuffles in the lower and upper halves.
1134 ///
1135 /// \code{.operation}
/// FOR i := 0 TO 3
1137 ///   j := i*32
1138 ///   k := (imm >> i*2)[1:0] * 32
1139 ///   result[j+31:j] := a[k+31:k]
1140 ///   result[128+j+31:128+j] := a[128+k+31:128+k]
1141 /// ENDFOR
1142 /// \endcode
1143 ///
1144 /// \headerfile <immintrin.h>
1145 ///
1146 /// \code
1147 /// __m256i _mm256_shuffle_epi32(__m256i a, const int imm);
1148 /// \endcode
1149 ///
/// This intrinsic corresponds to the \c VPSHUFD instruction.
1151 ///
1152 /// \param a
1153 ///    A 256-bit vector of [8 x i32] containing source values.
1154 /// \param imm
1155 ///    An immediate 8-bit value specifying which elements to copy from \a a.
1156 ///    \a imm[1:0] specifies the index in \a a for elements 0 and 4 of the
1157 ///    result, \a imm[3:2] specifies the index for elements 1 and 5, and so
1158 ///    forth.
1159 /// \returns A 256-bit vector of [8 x i32] containing the result.
#define _mm256_shuffle_epi32(a, imm)                                           \
  ((__m256i)__builtin_ia32_pshufd256(                                          \
      (__v8si)(__m256i)(a),                                                    \
      (int)(imm)))
1162
1163 /// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a
1164 ///    according to control information in the integer literal \a imm, and
1165 ///    returns the 256-bit result. The upper 64 bits of each 128-bit half
1166 ///    are shuffled in parallel; the lower 64 bits of each 128-bit half are
1167 ///    copied from \a a unchanged.
1168 ///
1169 /// \code{.operation}
1170 /// result[63:0] := a[63:0]
1171 /// result[191:128] := a[191:128]
1172 /// FOR i := 0 TO 3
1173 ///   j := i * 16 + 64
1174 ///   k := (imm >> i*2)[1:0] * 16 + 64
1175 ///   result[j+15:j] := a[k+15:k]
1176 ///   result[128+j+15:128+j] := a[128+k+15:128+k]
1177 /// ENDFOR
1178 /// \endcode
1179 ///
1180 /// \headerfile <immintrin.h>
1181 ///
1182 /// \code
1183 /// __m256i _mm256_shufflehi_epi16(__m256i a, const int imm);
1184 /// \endcode
1185 ///
1186 /// This intrinsic corresponds to the \c VPSHUFHW instruction.
1187 ///
1188 /// \param a
1189 ///    A 256-bit vector of [16 x i16] containing source values.
1190 /// \param imm
1191 ///    An immediate 8-bit value specifying which elements to copy from \a a.
///    \a imm[1:0] specifies the index in \a a for elements 4 and 12 of the
///    result, \a imm[3:2] specifies the index for elements 5 and 13, and so
///    forth. Indexes are offset by 4 (so 0 means index 4, and so forth).
1195 /// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_shufflehi_epi16(a, imm)                                         \
  ((__m256i)__builtin_ia32_pshufhw256(                                         \
      (__v16hi)(__m256i)(a),                                                   \
      (int)(imm)))
1198
1199 /// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] \a a
1200 ///    according to control information in the integer literal \a imm, and
1201 ///    returns the 256-bit [16 x i16] result. The lower 64 bits of each
1202 ///    128-bit half are shuffled; the upper 64 bits of each 128-bit half are
1203 ///    copied from \a a unchanged.
1204 ///
1205 /// \code{.operation}
1206 /// result[127:64] := a[127:64]
1207 /// result[255:192] := a[255:192]
1208 /// FOR i := 0 TO 3
1209 ///   j := i * 16
1210 ///   k := (imm >> i*2)[1:0] * 16
1211 ///   result[j+15:j] := a[k+15:k]
1212 ///   result[128+j+15:128+j] := a[128+k+15:128+k]
1213 /// ENDFOR
1214 /// \endcode
1215 ///
1216 /// \headerfile <immintrin.h>
1217 ///
1218 /// \code
1219 /// __m256i _mm256_shufflelo_epi16(__m256i a, const int imm);
1220 /// \endcode
1221 ///
1222 /// This intrinsic corresponds to the \c VPSHUFLW instruction.
1223 ///
1224 /// \param a
1225 ///    A 256-bit vector of [16 x i16] to use as a source of data for the
1226 ///    result.
1227 /// \param imm
1228 ///    An immediate 8-bit value specifying which elements to copy from \a a.
1229 ///    \a imm[1:0] specifies the index in \a a for elements 0 and 8 of the
1230 ///    result, \a imm[3:2] specifies the index for elements 1 and 9, and so
1231 ///    forth.
1232 /// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_shufflelo_epi16(a, imm)                                         \
  ((__m256i)__builtin_ia32_pshuflw256(                                         \
      (__v16hi)(__m256i)(a),                                                   \
      (int)(imm)))
1235
1236 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1237 _mm256_sign_epi8(__m256i __a, __m256i __b)
1238 {
1239     return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b);
1240 }
1241
1242 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1243 _mm256_sign_epi16(__m256i __a, __m256i __b)
1244 {
1245     return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);
1246 }
1247
1248 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1249 _mm256_sign_epi32(__m256i __a, __m256i __b)
1250 {
1251     return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
1252 }
1253
1254 /// Shifts each 128-bit half of the 256-bit integer vector \a a left by
1255 ///    \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
1256 ///    is greater than 15, the returned result is all zeroes.
1257 ///
1258 /// \headerfile <immintrin.h>
1259 ///
1260 /// \code
1261 /// __m256i _mm256_slli_si256(__m256i a, const int imm);
1262 /// \endcode
1263 ///
1264 /// This intrinsic corresponds to the \c VPSLLDQ instruction.
1265 ///
1266 /// \param a
1267 ///    A 256-bit integer vector to be shifted.
1268 /// \param imm
///    An unsigned immediate value specifying the shift count (in bytes).
1270 /// \returns A 256-bit integer vector containing the result.
#define _mm256_slli_si256(a, imm)                                              \
  ((__m256i)__builtin_ia32_pslldqi256_byteshift(                               \
      (__v4di)(__m256i)(a),                                                    \
      (int)(imm)))
1273
1274 /// Shifts each 128-bit half of the 256-bit integer vector \a a left by
1275 ///    \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
1276 ///    is greater than 15, the returned result is all zeroes.
1277 ///
1278 /// \headerfile <immintrin.h>
1279 ///
1280 /// \code
1281 /// __m256i _mm256_bslli_epi128(__m256i a, const int imm);
1282 /// \endcode
1283 ///
1284 /// This intrinsic corresponds to the \c VPSLLDQ instruction.
1285 ///
1286 /// \param a
1287 ///    A 256-bit integer vector to be shifted.
1288 /// \param imm
1289 ///    An unsigned immediate value specifying the shift count (in bytes).
1290 /// \returns A 256-bit integer vector containing the result.
#define _mm256_bslli_epi128(a, imm)                                            \
  ((__m256i)__builtin_ia32_pslldqi256_byteshift(                               \
      (__v4di)(__m256i)(a),                                                    \
      (int)(imm)))
1293
1294 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
1295 ///    left by \a __count bits, shifting in zero bits, and returns the result.
1296 ///    If \a __count is greater than 15, the returned result is all zeroes.
1297 ///
1298 /// \headerfile <immintrin.h>
1299 ///
1300 /// This intrinsic corresponds to the \c VPSLLW instruction.
1301 ///
1302 /// \param __a
1303 ///    A 256-bit vector of [16 x i16] to be shifted.
1304 /// \param __count
1305 ///    An unsigned integer value specifying the shift count (in bits).
1306 /// \returns A 256-bit vector of [16 x i16] containing the result.
1307 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1308 _mm256_slli_epi16(__m256i __a, int __count)
1309 {
1310   return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count);
1311 }
1312
1313 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
1314 ///    left by the number of bits specified by the lower 64 bits of \a __count,
1315 ///    shifting in zero bits, and returns the result. If \a __count is greater
1316 ///    than 15, the returned result is all zeroes.
1317 ///
1318 /// \headerfile <immintrin.h>
1319 ///
1320 /// This intrinsic corresponds to the \c VPSLLW instruction.
1321 ///
1322 /// \param __a
1323 ///    A 256-bit vector of [16 x i16] to be shifted.
1324 /// \param __count
1325 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
1326 ///    shift count (in bits). The upper element is ignored.
1327 /// \returns A 256-bit vector of [16 x i16] containing the result.
1328 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1329 _mm256_sll_epi16(__m256i __a, __m128i __count)
1330 {
1331   return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count);
1332 }
1333
1334 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
1335 ///    left by \a __count bits, shifting in zero bits, and returns the result.
1336 ///    If \a __count is greater than 31, the returned result is all zeroes.
1337 ///
1338 /// \headerfile <immintrin.h>
1339 ///
1340 /// This intrinsic corresponds to the \c VPSLLD instruction.
1341 ///
1342 /// \param __a
1343 ///    A 256-bit vector of [8 x i32] to be shifted.
1344 /// \param __count
1345 ///    An unsigned integer value specifying the shift count (in bits).
1346 /// \returns A 256-bit vector of [8 x i32] containing the result.
1347 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1348 _mm256_slli_epi32(__m256i __a, int __count)
1349 {
1350   return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count);
1351 }
1352
1353 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
1354 ///    left by the number of bits given in the lower 64 bits of \a __count,
1355 ///    shifting in zero bits, and returns the result. If \a __count is greater
1356 ///    than 31, the returned result is all zeroes.
1357 ///
1358 /// \headerfile <immintrin.h>
1359 ///
1360 /// This intrinsic corresponds to the \c VPSLLD instruction.
1361 ///
1362 /// \param __a
1363 ///    A 256-bit vector of [8 x i32] to be shifted.
1364 /// \param __count
1365 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
1366 ///    shift count (in bits). The upper element is ignored.
1367 /// \returns A 256-bit vector of [8 x i32] containing the result.
1368 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1369 _mm256_sll_epi32(__m256i __a, __m128i __count)
1370 {
1371   return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count);
1372 }
1373
1374 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
1375 ///    left by \a __count bits, shifting in zero bits, and returns the result.
1376 ///    If \a __count is greater than 63, the returned result is all zeroes.
1377 ///
1378 /// \headerfile <immintrin.h>
1379 ///
1380 /// This intrinsic corresponds to the \c VPSLLQ instruction.
1381 ///
1382 /// \param __a
1383 ///    A 256-bit vector of [4 x i64] to be shifted.
1384 /// \param __count
1385 ///    An unsigned integer value specifying the shift count (in bits).
1386 /// \returns A 256-bit vector of [4 x i64] containing the result.
1387 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1388 _mm256_slli_epi64(__m256i __a, int __count)
1389 {
1390   return __builtin_ia32_psllqi256((__v4di)__a, __count);
1391 }
1392
1393 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
1394 ///    left by the number of bits given in the lower 64 bits of \a __count,
1395 ///    shifting in zero bits, and returns the result. If \a __count is greater
1396 ///    than 63, the returned result is all zeroes.
1397 ///
1398 /// \headerfile <immintrin.h>
1399 ///
1400 /// This intrinsic corresponds to the \c VPSLLQ instruction.
1401 ///
1402 /// \param __a
1403 ///    A 256-bit vector of [4 x i64] to be shifted.
1404 /// \param __count
1405 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
1406 ///    shift count (in bits). The upper element is ignored.
1407 /// \returns A 256-bit vector of [4 x i64] containing the result.
1408 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1409 _mm256_sll_epi64(__m256i __a, __m128i __count)
1410 {
1411   return __builtin_ia32_psllq256((__v4di)__a, __count);
1412 }
1413
1414 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
1415 ///    right by \a __count bits, shifting in sign bits, and returns the result.
1416 ///    If \a __count is greater than 15, each element of the result is either
1417 ///    0 or -1 according to the corresponding input sign bit.
1418 ///
1419 /// \headerfile <immintrin.h>
1420 ///
1421 /// This intrinsic corresponds to the \c VPSRAW instruction.
1422 ///
1423 /// \param __a
1424 ///    A 256-bit vector of [16 x i16] to be shifted.
1425 /// \param __count
1426 ///    An unsigned integer value specifying the shift count (in bits).
1427 /// \returns A 256-bit vector of [16 x i16] containing the result.
1428 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1429 _mm256_srai_epi16(__m256i __a, int __count)
1430 {
1431   return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count);
1432 }
1433
1434 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
1435 ///    right by the number of bits given in the lower 64 bits of \a __count,
1436 ///    shifting in sign bits, and returns the result. If \a __count is greater
1437 ///    than 15, each element of the result is either 0 or -1 according to the
1438 ///    corresponding input sign bit.
1439 ///
1440 /// \headerfile <immintrin.h>
1441 ///
1442 /// This intrinsic corresponds to the \c VPSRAW instruction.
1443 ///
1444 /// \param __a
1445 ///    A 256-bit vector of [16 x i16] to be shifted.
1446 /// \param __count
1447 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
1448 ///    shift count (in bits). The upper element is ignored.
1449 /// \returns A 256-bit vector of [16 x i16] containing the result.
1450 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1451 _mm256_sra_epi16(__m256i __a, __m128i __count)
1452 {
1453   return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count);
1454 }
1455
1456 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
1457 ///    right by \a __count bits, shifting in sign bits, and returns the result.
1458 ///    If \a __count is greater than 31, each element of the result is either
1459 ///    0 or -1 according to the corresponding input sign bit.
1460 ///
1461 /// \headerfile <immintrin.h>
1462 ///
1463 /// This intrinsic corresponds to the \c VPSRAD instruction.
1464 ///
1465 /// \param __a
1466 ///    A 256-bit vector of [8 x i32] to be shifted.
1467 /// \param __count
1468 ///    An unsigned integer value specifying the shift count (in bits).
1469 /// \returns A 256-bit vector of [8 x i32] containing the result.
1470 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1471 _mm256_srai_epi32(__m256i __a, int __count)
1472 {
1473   return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count);
1474 }
1475
1476 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
1477 ///    right by the number of bits given in the lower 64 bits of \a __count,
1478 ///    shifting in sign bits, and returns the result. If \a __count is greater
1479 ///    than 31, each element of the result is either 0 or -1 according to the
1480 ///    corresponding input sign bit.
1481 ///
1482 /// \headerfile <immintrin.h>
1483 ///
1484 /// This intrinsic corresponds to the \c VPSRAD instruction.
1485 ///
1486 /// \param __a
1487 ///    A 256-bit vector of [8 x i32] to be shifted.
1488 /// \param __count
1489 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
1490 ///    shift count (in bits). The upper element is ignored.
1491 /// \returns A 256-bit vector of [8 x i32] containing the result.
1492 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1493 _mm256_sra_epi32(__m256i __a, __m128i __count)
1494 {
1495   return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
1496 }
1497
1498 /// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
1499 ///    \a imm bytes, shifting in zero bytes, and returns the result. If
1500 ///    \a imm is greater than 15, the returned result is all zeroes.
1501 ///
1502 /// \headerfile <immintrin.h>
1503 ///
1504 /// \code
1505 /// __m256i _mm256_srli_si256(__m256i a, const int imm);
1506 /// \endcode
1507 ///
1508 /// This intrinsic corresponds to the \c VPSRLDQ instruction.
1509 ///
1510 /// \param a
1511 ///    A 256-bit integer vector to be shifted.
1512 /// \param imm
1513 ///    An unsigned immediate value specifying the shift count (in bytes).
1514 /// \returns A 256-bit integer vector containing the result.
#define _mm256_srli_si256(a, imm)                                              \
  ((__m256i)__builtin_ia32_psrldqi256_byteshift(                               \
      (__m256i)(a),                                                            \
      (int)(imm)))
1517
1518 /// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
1519 ///    \a imm bytes, shifting in zero bytes, and returns the result. If
1520 ///    \a imm is greater than 15, the returned result is all zeroes.
1521 ///
1522 /// \headerfile <immintrin.h>
1523 ///
1524 /// \code
1525 /// __m256i _mm256_bsrli_epi128(__m256i a, const int imm);
1526 /// \endcode
1527 ///
1528 /// This intrinsic corresponds to the \c VPSRLDQ instruction.
1529 ///
1530 /// \param a
1531 ///    A 256-bit integer vector to be shifted.
1532 /// \param imm
///    An unsigned immediate value specifying the shift count (in bytes).
1534 /// \returns A 256-bit integer vector containing the result.
#define _mm256_bsrli_epi128(a, imm)                                            \
  ((__m256i)__builtin_ia32_psrldqi256_byteshift(                               \
      (__m256i)(a),                                                            \
      (int)(imm)))
1537
1538 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
1539 ///    right by \a __count bits, shifting in zero bits, and returns the result.
1540 ///    If \a __count is greater than 15, the returned result is all zeroes.
1541 ///
1542 /// \headerfile <immintrin.h>
1543 ///
1544 /// This intrinsic corresponds to the \c VPSRLW instruction.
1545 ///
1546 /// \param __a
1547 ///    A 256-bit vector of [16 x i16] to be shifted.
1548 /// \param __count
1549 ///    An unsigned integer value specifying the shift count (in bits).
1550 /// \returns A 256-bit vector of [16 x i16] containing the result.
1551 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1552 _mm256_srli_epi16(__m256i __a, int __count)
1553 {
1554   return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count);
1555 }
1556
1557 /// Logically shifts every 16-bit element of the 256-bit vector of [16 x i16]
1558 ///    in \a __a right by the bit count held in the lower 64 bits of
1559 ///    \a __count, filling vacated bits with zeros. A count above 15 produces
1560 ///    an all-zero result.
1561 ///
1562 /// \headerfile <immintrin.h>
1563 ///
1564 /// This intrinsic corresponds to the \c VPSRLW instruction.
1565 ///
1566 /// \param __a
1567 ///    The 256-bit vector of [16 x i16] to shift.
1568 /// \param __count
1569 ///    A 128-bit vector of [2 x i64]. Its lower element is the unsigned shift
1570 ///    count in bits; its upper element is ignored.
1571 /// \returns The shifted 256-bit vector of [16 x i16].
1572 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1573 _mm256_srl_epi16(__m256i __a, __m128i __count) {
1574   __v16hi __shifted = __builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count);
1575   return (__m256i)__shifted;
1576 }
1577
1578 /// Logically shifts every 32-bit element of the 256-bit vector of [8 x i32]
1579 ///    in \a __a right by \a __count bits, filling vacated bits with zeros.
1580 ///    A \a __count above 31 produces an all-zero result.
1581 ///
1582 /// \headerfile <immintrin.h>
1583 ///
1584 /// This intrinsic corresponds to the \c VPSRLD instruction.
1585 ///
1586 /// \param __a
1587 ///    The 256-bit vector of [8 x i32] to shift.
1588 /// \param __count
1589 ///    The shift count in bits, interpreted as an unsigned integer.
1590 /// \returns The shifted 256-bit vector of [8 x i32].
1591 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1592 _mm256_srli_epi32(__m256i __a, int __count) {
1593   __v8si __shifted = __builtin_ia32_psrldi256((__v8si)__a, __count);
1594   return (__m256i)__shifted;
1595 }
1596
1597 /// Logically shifts every 32-bit element of the 256-bit vector of [8 x i32]
1598 ///    in \a __a right by the bit count held in the lower 64 bits of
1599 ///    \a __count, filling vacated bits with zeros. A count above 31 produces
1600 ///    an all-zero result.
1601 ///
1602 /// \headerfile <immintrin.h>
1603 ///
1604 /// This intrinsic corresponds to the \c VPSRLD instruction.
1605 ///
1606 /// \param __a
1607 ///    The 256-bit vector of [8 x i32] to shift.
1608 /// \param __count
1609 ///    A 128-bit vector of [2 x i64]. Its lower element is the unsigned shift
1610 ///    count in bits; its upper element is ignored.
1611 /// \returns The shifted 256-bit vector of [8 x i32].
1612 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1613 _mm256_srl_epi32(__m256i __a, __m128i __count) {
1614   __v8si __shifted = __builtin_ia32_psrld256((__v8si)__a, (__v4si)__count);
1615   return (__m256i)__shifted;
1616 }
1617
1618 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
1619 ///    right by \a __count bits, shifting in zero bits, and returns the result.
1620 ///    If \a __count is greater than 63, the returned result is all zeroes.
1621 ///
1622 /// \headerfile <immintrin.h>
1623 ///
1624 /// This intrinsic corresponds to the \c VPSRLQ instruction.
1625 ///
1626 /// \param __a
1627 ///    A 256-bit vector of [4 x i64] to be shifted.
1628 /// \param __count
1629 ///    An unsigned integer value specifying the shift count (in bits).
1630 /// \returns A 256-bit vector of [4 x i64] containing the result.
1631 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1632 _mm256_srli_epi64(__m256i __a, int __count)
1633 {
1634   return (__m256i)__builtin_ia32_psrlqi256((__v4di)__a, __count);
1635 }
1636
1637 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
1638 ///    right by the number of bits given in the lower 64 bits of \a __count,
1639 ///    shifting in zero bits, and returns the result. If \a __count is greater
1640 ///    than 63, the returned result is all zeroes.
1641 ///
1642 /// \headerfile <immintrin.h>
1643 ///
1644 /// This intrinsic corresponds to the \c VPSRLQ instruction.
1645 ///
1646 /// \param __a
1647 ///    A 256-bit vector of [4 x i64] to be shifted.
1648 /// \param __count
1649 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
1650 ///    shift count (in bits). The upper element is ignored.
1651 /// \returns A 256-bit vector of [4 x i64] containing the result.
1652 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1653 _mm256_srl_epi64(__m256i __a, __m128i __count)
1654 {
1655   return (__m256i)__builtin_ia32_psrlq256((__v4di)__a, (__v2di)__count);
1656 }
1657
1658 /// Computes the element-wise difference of the 8-bit integers in two
1659 ///    256-bit integer vectors. Each byte of the 256-bit result receives the
1660 ///    low 8 bits of the corresponding difference, so overflow wraps around
1661 ///    rather than saturating.
1662 ///
1663 /// \code{.operation}
1664 /// FOR i := 0 TO 31
1665 ///   j := i*8
1666 ///   result[j+7:j] := __a[j+7:j] - __b[j+7:j]
1667 /// ENDFOR
1668 /// \endcode
1669 ///
1670 /// \headerfile <immintrin.h>
1671 ///
1672 /// This intrinsic corresponds to the \c VPSUBB instruction.
1673 ///
1674 /// \param __a
1675 ///    The 256-bit integer vector holding the minuends.
1676 /// \param __b
1677 ///    The 256-bit integer vector holding the subtrahends.
1678 /// \returns The 256-bit integer vector of byte differences.
1679 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1680 _mm256_sub_epi8(__m256i __a, __m256i __b) {
1681   __v32qu __diff = (__v32qu)__a - (__v32qu)__b;
1682   return (__m256i)__diff;
1683 }
1684
1685 /// Computes the element-wise difference of the 16-bit integers in two
1686 ///    256-bit vectors of [16 x i16]. Each element of the [16 x i16] result
1687 ///    receives the low 16 bits of the corresponding difference, so overflow
1688 ///    wraps around rather than saturating.
1689 ///
1690 /// \code{.operation}
1691 /// FOR i := 0 TO 15
1692 ///   j := i*16
1693 ///   result[j+15:j] := __a[j+15:j] - __b[j+15:j]
1694 /// ENDFOR
1695 /// \endcode
1696 ///
1697 /// \headerfile <immintrin.h>
1698 ///
1699 /// This intrinsic corresponds to the \c VPSUBW instruction.
1700 ///
1701 /// \param __a
1702 ///    The 256-bit vector of [16 x i16] holding the minuends.
1703 /// \param __b
1704 ///    The 256-bit vector of [16 x i16] holding the subtrahends.
1705 /// \returns The 256-bit vector of [16 x i16] differences.
1706 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1707 _mm256_sub_epi16(__m256i __a, __m256i __b) {
1708   __v16hu __diff = (__v16hu)__a - (__v16hu)__b;
1709   return (__m256i)__diff;
1710 }
1711
1712 /// Computes the element-wise difference of the 32-bit integers in two
1713 ///    256-bit vectors of [8 x i32]. Each element of the [8 x i32] result
1714 ///    receives the low 32 bits of the difference (overflow wraps around).
1715 ///
1716 /// \code{.operation}
1717 /// FOR i := 0 TO 7
1718 ///   j := i*32
1719 ///   result[j+31:j] := __a[j+31:j] - __b[j+31:j]
1720 /// ENDFOR
1721 /// \endcode
1722 ///
1723 /// \headerfile <immintrin.h>
1724 ///
1725 /// This intrinsic corresponds to the \c VPSUBD instruction.
1726 ///
1727 /// \param __a
1728 ///    The 256-bit vector of [8 x i32] holding the minuends.
1729 /// \param __b
1730 ///    The 256-bit vector of [8 x i32] holding the subtrahends.
1731 /// \returns The 256-bit vector of [8 x i32] differences.
1732 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1733 _mm256_sub_epi32(__m256i __a, __m256i __b) {
1734   __v8su __diff = (__v8su)__a - (__v8su)__b;
1735   return (__m256i)__diff;
1736 }
1737
1738 /// Computes the element-wise difference of the 64-bit integers in two
1739 ///    256-bit vectors of [4 x i64]. Each element of the [4 x i64] result
1740 ///    receives the low 64 bits of the difference (overflow wraps around).
1741 ///
1742 /// \code{.operation}
1743 /// FOR i := 0 TO 3
1744 ///   j := i*64
1745 ///   result[j+63:j] := __a[j+63:j] - __b[j+63:j]
1746 /// ENDFOR
1747 /// \endcode
1748 ///
1749 /// \headerfile <immintrin.h>
1750 ///
1751 /// This intrinsic corresponds to the \c VPSUBQ instruction.
1752 ///
1753 /// \param __a
1754 ///    The 256-bit vector of [4 x i64] holding the minuends.
1755 /// \param __b
1756 ///    The 256-bit vector of [4 x i64] holding the subtrahends.
1757 /// \returns The 256-bit vector of [4 x i64] differences.
1758 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1759 _mm256_sub_epi64(__m256i __a, __m256i __b) {
1760   __v4du __diff = (__v4du)__a - (__v4du)__b;
1761   return (__m256i)__diff;
1762 }
1763
1764 /// Computes the element-wise difference of the 8-bit integers in two
1765 ///    256-bit integer vectors using signed saturation. Each difference is
1766 ///    stored in the corresponding byte of the 256-bit integer vector result.
1767 ///
1768 /// \code{.operation}
1769 /// FOR i := 0 TO 31
1770 ///   j := i*8
1771 ///   result[j+7:j] := SATURATE8(__a[j+7:j] - __b[j+7:j])
1772 /// ENDFOR
1773 /// \endcode
1774 ///
1775 /// \headerfile <immintrin.h>
1776 ///
1777 /// This intrinsic corresponds to the \c VPSUBSB instruction.
1778 ///
1779 /// \param __a
1780 ///    The 256-bit integer vector holding the minuends.
1781 /// \param __b
1782 ///    The 256-bit integer vector holding the subtrahends.
1783 /// \returns The 256-bit integer vector of saturated differences.
1784 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1785 _mm256_subs_epi8(__m256i __a, __m256i __b) {
1786   __v32qs __diff = __builtin_elementwise_sub_sat((__v32qs)__a, (__v32qs)__b);
1787   return (__m256i)__diff;
1788 }
1789
1790 /// Subtracts 16-bit integers from corresponding elements of two 256-bit
1791 ///    vectors of [16 x i16] using signed saturation, and returns each
1792 ///    difference in the corresponding element of the [16 x i16] result.
1793 ///
1794 /// \code{.operation}
1795 /// FOR i := 0 TO 15
1796 ///   j := i*16
1797 ///   result[j+15:j] := SATURATE16(__a[j+15:j] - __b[j+15:j])
1798 /// ENDFOR
1799 /// \endcode
1800 ///
1801 /// \headerfile <immintrin.h>
1802 ///
1803 /// This intrinsic corresponds to the \c VPSUBSW instruction.
1804 ///
1805 /// \param __a
1806 ///    A 256-bit vector of [16 x i16] containing the minuends.
1807 /// \param __b
1808 ///    A 256-bit vector of [16 x i16] containing the subtrahends.
1809 /// \returns A 256-bit vector of [16 x i16] containing the differences.
1810 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1811 _mm256_subs_epi16(__m256i __a, __m256i __b)
1812 {
1813   return (__m256i)__builtin_elementwise_sub_sat((__v16hi)__a, (__v16hi)__b);
1814 }
1815
1816 /// Computes the element-wise difference of the 8-bit integers in two
1817 ///    256-bit integer vectors using unsigned saturation. Each difference is
1818 ///    stored in the corresponding byte of the 256-bit integer vector result.
1819 ///    Per byte, the operation is <c> result = __a - __b </c>.
1820 ///
1821 /// \code{.operation}
1822 /// FOR i := 0 TO 31
1823 ///   j := i*8
1824 ///   result[j+7:j] := SATURATE8U(__a[j+7:j] - __b[j+7:j])
1825 /// ENDFOR
1826 /// \endcode
1827 ///
1828 /// \headerfile <immintrin.h>
1829 ///
1830 /// This intrinsic corresponds to the \c VPSUBUSB instruction.
1831 ///
1832 /// \param __a
1833 ///    The 256-bit integer vector holding the minuends.
1834 /// \param __b
1835 ///    The 256-bit integer vector holding the subtrahends.
1836 /// \returns The 256-bit integer vector of saturated differences.
1837 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1838 _mm256_subs_epu8(__m256i __a, __m256i __b) {
1839   __v32qu __diff = __builtin_elementwise_sub_sat((__v32qu)__a, (__v32qu)__b);
1840   return (__m256i)__diff;
1841 }
1842
1843 /// Computes the element-wise difference of the 16-bit integers in two
1844 ///    256-bit vectors of [16 x i16] using unsigned saturation. Each
1845 ///    difference is stored in the matching element of the [16 x i16] result.
1846 ///
1847 /// \code{.operation}
1848 /// FOR i := 0 TO 15
1849 ///   j := i*16
1850 ///   result[j+15:j] := SATURATE16U(__a[j+15:j] - __b[j+15:j])
1851 /// ENDFOR
1852 /// \endcode
1853 ///
1854 /// \headerfile <immintrin.h>
1855 ///
1856 /// This intrinsic corresponds to the \c VPSUBUSW instruction.
1857 ///
1858 /// \param __a
1859 ///    The 256-bit vector of [16 x i16] holding the minuends.
1860 /// \param __b
1861 ///    The 256-bit vector of [16 x i16] holding the subtrahends.
1862 /// \returns The 256-bit vector of [16 x i16] saturated differences.
1863 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1864 _mm256_subs_epu16(__m256i __a, __m256i __b) {
1865   __v16hu __diff = __builtin_elementwise_sub_sat((__v16hu)__a, (__v16hu)__b);
1866   return (__m256i)__diff;
1867 }
1868
1869 /// Interleaves 8-bit integers taken from the 256-bit integer vectors
1870 ///    \a __a and \a __b to build the 256-bit result. Only the upper 64 bits
1871 ///    of each 128-bit half of \a __a and \a __b are read; the remaining bits
1872 ///    of both parameters are ignored.
1873 ///
1874 /// \code{.operation}
1875 /// result[7:0] := __a[71:64]
1876 /// result[15:8] := __b[71:64]
1877 /// result[23:16] := __a[79:72]
1878 /// result[31:24] := __b[79:72]
1879 /// . . .
1880 /// result[127:120] := __b[127:120]
1881 /// result[135:128] := __a[199:192]
1882 /// . . .
1883 /// result[255:248] := __b[255:248]
1884 /// \endcode
1885 ///
1886 /// \headerfile <immintrin.h>
1887 ///
1888 /// This intrinsic corresponds to the \c VPUNPCKHBW instruction.
1889 ///
1890 /// \param __a
1891 ///    A 256-bit integer vector that supplies the even-numbered bytes of the
1892 ///    result.
1893 /// \param __b
1894 ///    A 256-bit integer vector that supplies the odd-numbered bytes of the
1895 ///    result.
1896 /// \returns A 256-bit integer vector holding the interleaved bytes.
1897 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1898 _mm256_unpackhi_epi8(__m256i __a, __m256i __b) {
1899   return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15,
1900       24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
1901 }
1902
1903 /// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
1904 ///    of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
1905 ///    vector of [16 x i16]. Specifically, uses the upper 64 bits of each
1906 ///    128-bit half of \a __a and \a __b as input; other bits in these
1907 ///    parameters are ignored.
1908 ///
1909 /// \code{.operation}
1910 /// result[15:0] := __a[79:64]
1911 /// result[31:16] := __b[79:64]
1912 /// result[47:32] := __a[95:80]
1913 /// result[63:48] := __b[95:80]
1914 /// . . .
1915 /// result[127:112] := __b[127:112]
1916 /// result[143:128] := __a[207:192]
1917 /// . . .
1918 /// result[255:240] := __b[255:240]
1919 /// \endcode
1920 ///
1921 /// \headerfile <immintrin.h>
1922 ///
1923 /// This intrinsic corresponds to the \c VPUNPCKHWD instruction.
1924 ///
1925 /// \param __a
1926 ///    A 256-bit vector of [16 x i16] used as the source for the even-numbered
1927 ///    elements of the result.
1928 /// \param __b
1929 ///    A 256-bit vector of [16 x i16] used as the source for the odd-numbered
1930 ///    elements of the result.
1931 /// \returns A 256-bit vector of [16 x i16] containing the result.
1932 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1933 _mm256_unpackhi_epi16(__m256i __a, __m256i __b)
1934 {
1935   return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
1936 }
1937
1938 /// Interleaves 32-bit integers taken from the 256-bit vectors of
1939 ///    [8 x i32] in \a __a and \a __b and returns them as a 256-bit vector of
1940 ///    [8 x i32]. Only the upper 64 bits of each 128-bit half of \a __a and
1941 ///    \a __b are read; the remaining bits of both parameters are
1942 ///    ignored.
1943 ///
1944 /// \code{.operation}
1945 /// result[31:0] := __a[95:64]
1946 /// result[63:32] := __b[95:64]
1947 /// result[95:64] := __a[127:96]
1948 /// result[127:96] := __b[127:96]
1949 /// result[159:128] := __a[223:192]
1950 /// result[191:160] := __b[223:192]
1951 /// result[223:192] := __a[255:224]
1952 /// result[255:224] := __b[255:224]
1953 /// \endcode
1954 ///
1955 /// \headerfile <immintrin.h>
1956 ///
1957 /// This intrinsic corresponds to the \c VPUNPCKHDQ instruction.
1958 ///
1959 /// \param __a
1960 ///    A 256-bit vector of [8 x i32] that supplies the even-numbered
1961 ///    elements of the result.
1962 /// \param __b
1963 ///    A 256-bit vector of [8 x i32] that supplies the odd-numbered
1964 ///    elements of the result.
1965 /// \returns A 256-bit vector of [8 x i32] holding the interleaved elements.
1966 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1967 _mm256_unpackhi_epi32(__m256i __a, __m256i __b) {
1968   return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3,
1969       6, 8+6, 7, 8+7);
1970 }
1971
1972 /// Interleaves 64-bit integers taken from the 256-bit vectors of
1973 ///    [4 x i64] in \a __a and \a __b and returns them as a 256-bit vector of
1974 ///    [4 x i64]. Only the upper 64 bits of each 128-bit half of \a __a and
1975 ///    \a __b are read; the remaining bits of both parameters are
1976 ///    ignored.
1977 ///
1978 /// \code{.operation}
1979 /// result[63:0] := __a[127:64]
1980 /// result[127:64] := __b[127:64]
1981 /// result[191:128] := __a[255:192]
1982 /// result[255:192] := __b[255:192]
1983 /// \endcode
1984 ///
1985 /// \headerfile <immintrin.h>
1986 ///
1987 /// This intrinsic corresponds to the \c VPUNPCKHQDQ instruction.
1988 ///
1989 /// \param __a
1990 ///    A 256-bit vector of [4 x i64] that supplies the even-numbered
1991 ///    elements of the result.
1992 /// \param __b
1993 ///    A 256-bit vector of [4 x i64] that supplies the odd-numbered
1994 ///    elements of the result.
1995 /// \returns A 256-bit vector of [4 x i64] holding the interleaved elements.
1996 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1997 _mm256_unpackhi_epi64(__m256i __a, __m256i __b) {
1998   return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1,
1999       3, 4+3);
2000 }
2001
2002 /// Interleaves 8-bit integers taken from the 256-bit integer vectors
2003 ///    \a __a and \a __b to build the 256-bit result. Only the lower 64 bits
2004 ///    of each 128-bit half of \a __a and \a __b are read; the remaining bits
2005 ///    of both parameters are ignored.
2006 ///
2007 /// \code{.operation}
2008 /// result[7:0] := __a[7:0]
2009 /// result[15:8] := __b[7:0]
2010 /// result[23:16] := __a[15:8]
2011 /// result[31:24] := __b[15:8]
2012 /// . . .
2013 /// result[127:120] := __b[63:56]
2014 /// result[135:128] := __a[135:128]
2015 /// . . .
2016 /// result[255:248] := __b[191:184]
2017 /// \endcode
2018 ///
2019 /// \headerfile <immintrin.h>
2020 ///
2021 /// This intrinsic corresponds to the \c VPUNPCKLBW instruction.
2022 ///
2023 /// \param __a
2024 ///    A 256-bit integer vector that supplies the even-numbered bytes of the
2025 ///    result.
2026 /// \param __b
2027 ///    A 256-bit integer vector that supplies the odd-numbered bytes of the
2028 ///    result.
2029 /// \returns A 256-bit integer vector holding the interleaved bytes.
2030 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2031 _mm256_unpacklo_epi8(__m256i __a, __m256i __b) {
2032   return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7,
2033       16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
2034 }
2035
2036 /// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2037 ///    of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2038 ///    vector of [16 x i16]. Specifically, uses the lower 64 bits of each
2039 ///    128-bit half of \a __a and \a __b as input; other bits in these
2040 ///    parameters are ignored.
2041 ///
2042 /// \code{.operation}
2043 /// result[15:0] := __a[15:0]
2044 /// result[31:16] := __b[15:0]
2045 /// result[47:32] := __a[31:16]
2046 /// result[63:48] := __b[31:16]
2047 /// . . .
2048 /// result[127:112] := __b[63:48]
2049 /// result[143:128] := __a[143:128]
2050 /// . . .
2051 /// result[255:240] := __b[191:176]
2052 /// \endcode
2053 ///
2054 /// \headerfile <immintrin.h>
2055 ///
2056 /// This intrinsic corresponds to the \c VPUNPCKLWD instruction.
2057 ///
2058 /// \param __a
2059 ///    A 256-bit vector of [16 x i16] used as the source for the even-numbered
2060 ///    elements of the result.
2061 /// \param __b
2062 ///    A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2063 ///    elements of the result.
2064 /// \returns A 256-bit vector of [16 x i16] containing the result.
2065 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2066 _mm256_unpacklo_epi16(__m256i __a, __m256i __b)
2067 {
2068   return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
2069 }
2070
2071 /// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2072 ///    of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2073 ///    of [8 x i32]. Specifically, uses the lower 64 bits of each 128-bit half
2074 ///    of \a __a and \a __b as input; other bits in these parameters are
2075 ///    ignored.
2076 ///
2077 /// \code{.operation}
2078 /// result[31:0] := __a[31:0]
2079 /// result[63:32] := __b[31:0]
2080 /// result[95:64] := __a[63:32]
2081 /// result[127:96] := __b[63:32]
2082 /// result[159:128] := __a[159:128]
2083 /// result[191:160] := __b[159:128]
2084 /// result[223:192] := __a[191:160]
2085 /// result[255:224] := __b[191:160]
2086 /// \endcode
2087 ///
2088 /// \headerfile <immintrin.h>
2089 ///
2090 /// This intrinsic corresponds to the \c VPUNPCKLDQ instruction.
2091 ///
2092 /// \param __a
2093 ///    A 256-bit vector of [8 x i32] used as the source for the even-numbered
2094 ///    elements of the result.
2095 /// \param __b
2096 ///    A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2097 ///    elements of the result.
2098 /// \returns A 256-bit vector of [8 x i32] containing the result.
2099 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2100 _mm256_unpacklo_epi32(__m256i __a, __m256i __b)
2101 {
2102   return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
2103 }
2104
2105 /// Interleaves 64-bit integers taken from the 256-bit vectors of
2106 ///    [4 x i64] in \a __a and \a __b and returns them as a 256-bit vector of
2107 ///    [4 x i64]. Only the lower 64 bits of each 128-bit half of \a __a and
2108 ///    \a __b are read; the remaining bits of both parameters are
2109 ///    ignored.
2110 ///
2111 /// \code{.operation}
2112 /// result[63:0] := __a[63:0]
2113 /// result[127:64] := __b[63:0]
2114 /// result[191:128] := __a[191:128]
2115 /// result[255:192] := __b[191:128]
2116 /// \endcode
2117 ///
2118 /// \headerfile <immintrin.h>
2119 ///
2120 /// This intrinsic corresponds to the \c VPUNPCKLQDQ instruction.
2121 ///
2122 /// \param __a
2123 ///    A 256-bit vector of [4 x i64] that supplies the even-numbered
2124 ///    elements of the result.
2125 /// \param __b
2126 ///    A 256-bit vector of [4 x i64] that supplies the odd-numbered
2127 ///    elements of the result.
2128 /// \returns A 256-bit vector of [4 x i64] holding the interleaved elements.
2129 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2130 _mm256_unpacklo_epi64(__m256i __a, __m256i __b) {
2131   return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0,
2132       2, 4+2);
2133 }
2134
2135 /// Returns the bitwise XOR of the 256-bit integer vectors \a __a and \a __b.
2136 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2137 _mm256_xor_si256(__m256i __a, __m256i __b) {
2138   return (__m256i)((__v4du)__a ^ (__v4du)__b);
2139 }
2140
2141 /// Non-temporal 256-bit load from \a __V, which is accessed as 32-byte aligned.
2142 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2143 _mm256_stream_load_si256(__m256i const *__V) {
2144   typedef __v4di __v4di_aligned __attribute__((aligned(32)));
2145   return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);
2146 }
2147
2148 /// Replicates the low 32-bit floating-point element of the 128-bit vector
2149 ///    of [4 x float] in \a __X into every element of the returned 128-bit
2150 ///    vector of [4 x float].
2151 ///
2152 /// \headerfile <immintrin.h>
2153 ///
2154 /// This intrinsic corresponds to the \c VBROADCASTSS instruction.
2155 ///
2156 /// \param __X
2157 ///    A 128-bit vector of [4 x float]; its low element is the one replicated.
2158 /// \returns A 128-bit vector of [4 x float] filled with that element.
2159 static __inline__ __m128 __DEFAULT_FN_ATTRS128
2160 _mm_broadcastss_ps(__m128 __X) {
2161   __v4sf __src = (__v4sf)__X;
2162   return (__m128)__builtin_shufflevector(__src, __src, 0, 0, 0, 0);
2163 }
2164
2165 /// Broadcasts the 64-bit floating-point value from the low element of the
2166 ///    128-bit vector of [2 x double] in \a __a to both elements of the
2167 ///    result's 128-bit vector of [2 x double].
2168 ///
2169 /// \headerfile <immintrin.h>
2170 ///
2171 /// This intrinsic corresponds to the \c MOVDDUP instruction.
2172 ///
2173 /// \param __a
2174 ///    A 128-bit vector of [2 x double] whose low element will be broadcast.
2175 /// \returns A 128-bit vector of [2 x double] containing the result.
2176 static __inline__ __m128d __DEFAULT_FN_ATTRS128
2177 _mm_broadcastsd_pd(__m128d __a)
2178 {
2179   return (__m128d)__builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
2180 }
2181
2182 /// Replicates the low 32-bit floating-point element of the 128-bit vector
2183 ///    of [4 x float] in \a __X into every element of the returned 256-bit
2184 ///    vector of [8 x float].
2185 ///
2186 /// \headerfile <immintrin.h>
2187 ///
2188 /// This intrinsic corresponds to the \c VBROADCASTSS instruction.
2189 ///
2190 /// \param __X
2191 ///    A 128-bit vector of [4 x float]; its low element is the one replicated.
2192 /// \returns A 256-bit vector of [8 x float] filled with that element.
2193 static __inline__ __m256 __DEFAULT_FN_ATTRS256
2194 _mm256_broadcastss_ps(__m128 __X) {
2195   __v4sf __src = (__v4sf)__X;
2196   return (__m256)__builtin_shufflevector(__src, __src, 0, 0, 0, 0, 0, 0, 0, 0);
2197 }
2198
2199 /// Replicates the low 64-bit floating-point element of the 128-bit vector
2200 ///    of [2 x double] in \a __X into every element of the returned 256-bit
2201 ///    vector of [4 x double].
2202 ///
2203 /// \headerfile <immintrin.h>
2204 ///
2205 /// This intrinsic corresponds to the \c VBROADCASTSD instruction.
2206 ///
2207 /// \param __X
2208 ///    A 128-bit vector of [2 x double]; its low element is the one replicated.
2209 /// \returns A 256-bit vector of [4 x double] filled with that element.
2210 static __inline__ __m256d __DEFAULT_FN_ATTRS256
2211 _mm256_broadcastsd_pd(__m128d __X) {
2212   __v2df __src = (__v2df)__X;
2213   return (__m256d)__builtin_shufflevector(__src, __src, 0, 0, 0, 0);
2214 }
2215
2216 /// Copies the 128-bit integer data in \a __X into both the lower and the
2217 ///    upper 128-bit halves of the 256-bit result.
2218 ///
2219 /// \headerfile <immintrin.h>
2220 ///
2221 /// This intrinsic corresponds to the \c VBROADCASTI128 instruction.
2222 ///
2223 /// \param __X
2224 ///    The 128-bit integer vector to replicate.
2225 /// \returns A 256-bit integer vector whose two halves both equal \a __X.
2226 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2227 _mm256_broadcastsi128_si256(__m128i __X) {
2228   __v2di __src = (__v2di)__X;
2229   return (__m256i)__builtin_shufflevector(__src, __src, 0, 1, 0, 1);
2230 }
2231
2232 #define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X) /* alias: forwards to the _mm256_-prefixed intrinsic */
2233
2234 /// Merges 32-bit integer elements from either of the two 128-bit vectors of
2235 ///    [4 x i32] in \a V1 or \a V2 to the result's 128-bit vector of [4 x i32],
2236 ///    as specified by the immediate integer operand \a M.
2237 ///
2238 /// \code{.operation}
2239 /// FOR i := 0 TO 3
2240 ///   j := i*32
2241 ///   IF M[i] == 0
2242 ///     result[31+j:j] := V1[31+j:j]
2243 ///   ELSE
2244 ///     result[31+j:j] := V2[31+j:j]
2245 ///   FI
2246 /// ENDFOR
2247 /// \endcode
2248 ///
2249 /// \headerfile <immintrin.h>
2250 ///
2251 /// \code
2252 /// __m128i _mm_blend_epi32(__m128i V1, __m128i V2, const int M);
2253 /// \endcode
2254 ///
2255 /// This intrinsic corresponds to the \c VPBLENDD instruction.
2256 ///
2257 /// \param V1
2258 ///    A 128-bit vector of [4 x i32] containing source values.
2259 /// \param V2
2260 ///    A 128-bit vector of [4 x i32] containing source values.
2261 /// \param M
2262 ///    An immediate 8-bit integer operand, with bits [3:0] specifying the
2263 ///    source for each element of the result. The position of the mask bit
2264 ///    corresponds to the index of a copied value. When a mask bit is 0, the
2265 ///    element is copied from \a V1; otherwise, it is copied from \a V2.
2266 /// \returns A 128-bit vector of [4 x i32] containing the result.
2267 #define _mm_blend_epi32(V1, V2, M) \
2268   ((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
2269                                       (__v4si)(__m128i)(V2), (int)(M)))
2270
2271 /// Merges 32-bit integer elements from either of the two 256-bit vectors of
2272 ///    [8 x i32] in \a V1 or \a V2 to return a 256-bit vector of [8 x i32],
2273 ///    as specified by the immediate integer operand \a M.
2274 ///
2275 /// \code{.operation}
2276 /// FOR i := 0 TO 7
2277 ///   j := i*32
2278 ///   IF M[i] == 0
2279 ///     result[31+j:j] := V1[31+j:j]
2280 ///   ELSE
2281 ///     result[31+j:j] := V2[31+j:j]
2282 ///   FI
2283 /// ENDFOR
2284 /// \endcode
2285 ///
2286 /// \headerfile <immintrin.h>
2287 ///
2288 /// \code
2289 /// __m256i _mm256_blend_epi32(__m256i V1, __m256i V2, const int M);
2290 /// \endcode
2291 ///
2292 /// This intrinsic corresponds to the \c VPBLENDD instruction.
2293 ///
2294 /// \param V1
2295 ///    A 256-bit vector of [8 x i32] containing source values.
2296 /// \param V2
2297 ///    A 256-bit vector of [8 x i32] containing source values.
2298 /// \param M
2299 ///    An immediate 8-bit integer operand, with bits [7:0] specifying the
2300 ///    source for each element of the result. The position of the mask bit
2301 ///    corresponds to the index of a copied value. When a mask bit is 0, the
2302 ///    element is copied from \a V1; otherwise, it is copied from \a V2.
2303 /// \returns A 256-bit vector of [8 x i32] containing the result.
2304 #define _mm256_blend_epi32(V1, V2, M) \
2305   ((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
2306                                       (__v8si)(__m256i)(V2), (int)(M)))
2307
2308 /// Replicates the low byte of the 128-bit integer vector in \a __X into
2309 ///    every byte of the 256-bit result.
2310 ///
2311 /// \headerfile <immintrin.h>
2312 ///
2313 /// This intrinsic corresponds to the \c VPBROADCASTB instruction.
2314 ///
2315 /// \param __X
2316 ///    A 128-bit integer vector; its low byte is the one replicated.
2317 /// \returns A 256-bit integer vector filled with that byte.
2318 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2319 _mm256_broadcastb_epi8(__m128i __X) {
2320   __v16qi __src = (__v16qi)__X;
2321   return (__m256i)__builtin_shufflevector(__src, __src, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
2322 }
2323
2324 /// Replicates the low element of the 128-bit vector of [8 x i16] in \a __X
2325 ///    into every element of the returned 256-bit vector of [16 x i16].
2326 ///
2327 /// \headerfile <immintrin.h>
2328 ///
2329 /// This intrinsic corresponds to the \c VPBROADCASTW instruction.
2330 ///
2331 /// \param __X
2332 ///    A 128-bit vector of [8 x i16]; its low element is the one replicated.
2333 /// \returns A 256-bit vector of [16 x i16] filled with that element.
2334 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2335 _mm256_broadcastw_epi16(__m128i __X) {
2336   __v8hi __src = (__v8hi)__X;
2337   return (__m256i)__builtin_shufflevector(__src, __src, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
2338 }
2339
2340 /// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
2341 ///    to all elements of the result's 256-bit vector of [8 x i32].
2342 ///
2343 /// \headerfile <immintrin.h>
2344 ///
2345 /// This intrinsic corresponds to the \c VPBROADCASTD instruction.
2346 ///
2347 /// \param __X
2348 ///    A 128-bit vector of [4 x i32] whose low element will be broadcast.
2349 /// \returns A 256-bit vector of [8 x i32] containing the result.
2350 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2351 _mm256_broadcastd_epi32(__m128i __X)
2352 {
       /* Every shuffle index is 0, so each 32-bit element of the widened result
          is a copy of element 0 of __X viewed as __v4si. */
2353   return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0);
2354 }
2355
2356 /// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
2357 ///    to all elements of the result's 256-bit vector of [4 x i64].
2358 ///
2359 /// \headerfile <immintrin.h>
2360 ///
2361 /// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
2362 ///
2363 /// \param __X
2364 ///    A 128-bit vector of [2 x i64] whose low element will be broadcast.
2365 /// \returns A 256-bit vector of [4 x i64] containing the result.
2366 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2367 _mm256_broadcastq_epi64(__m128i __X)
2368 {
       /* Every shuffle index is 0, so each 64-bit element of the widened result
          is a copy of element 0 of __X viewed as __v2di. */
2369   return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0);
2370 }
2371
2372 /// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
2373 ///    bytes of the 128-bit result.
2374 ///
2375 /// \headerfile <immintrin.h>
2376 ///
2377 /// This intrinsic corresponds to the \c VPBROADCASTB instruction.
2378 ///
2379 /// \param __X
2380 ///    A 128-bit integer vector whose low byte will be broadcast.
2381 /// \returns A 128-bit integer vector containing the result.
2382 static __inline__ __m128i __DEFAULT_FN_ATTRS128
2383 _mm_broadcastb_epi8(__m128i __X)
2384 {
       /* Every shuffle index is 0, so each byte of the result is a copy of
          element 0 of __X viewed as __v16qi. */
2385   return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
2386 }
2387
2388 /// Broadcasts the low element from the 128-bit vector of [8 x i16] in
2389 ///    \a __X to all elements of the result's 128-bit vector of [8 x i16].
2390 ///
2391 /// \headerfile <immintrin.h>
2392 ///
2393 /// This intrinsic corresponds to the \c VPBROADCASTW instruction.
2394 ///
2395 /// \param __X
2396 ///    A 128-bit vector of [8 x i16] whose low element will be broadcast.
2397 /// \returns A 128-bit vector of [8 x i16] containing the result.
2398 static __inline__ __m128i __DEFAULT_FN_ATTRS128
2399 _mm_broadcastw_epi16(__m128i __X)
2400 {
       /* Every shuffle index is 0, so each 16-bit element of the result is a
          copy of element 0 of __X viewed as __v8hi. */
2401   return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0);
2402 }
2403
2404 /// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
2405 ///    to all elements of the result's vector of [4 x i32].
2406 ///
2407 /// \headerfile <immintrin.h>
2408 ///
2409 /// This intrinsic corresponds to the \c VPBROADCASTD instruction.
2410 ///
2411 /// \param __X
2412 ///    A 128-bit vector of [4 x i32] whose low element will be broadcast.
2413 /// \returns A 128-bit vector of [4 x i32] containing the result.
2414 static __inline__ __m128i __DEFAULT_FN_ATTRS128
2415 _mm_broadcastd_epi32(__m128i __X)
2416 {
       /* Every shuffle index is 0, so each 32-bit element of the result is a
          copy of element 0 of __X viewed as __v4si. */
2417   return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0);
2418 }
2419
2420 /// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
2421 ///    to both elements of the result's 128-bit vector of [2 x i64].
2422 ///
2423 /// \headerfile <immintrin.h>
2424 ///
2425 /// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
2426 ///
2427 /// \param __X
2428 ///    A 128-bit vector of [2 x i64] whose low element will be broadcast.
2429 /// \returns A 128-bit vector of [2 x i64] containing the result.
2430 static __inline__ __m128i __DEFAULT_FN_ATTRS128
2431 _mm_broadcastq_epi64(__m128i __X)
2432 {
       /* Both shuffle indexes are 0, so each 64-bit element of the result is a
          copy of element 0 of __X viewed as __v2di. */
2433   return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0);
2434 }
2435
2436 /// Sets the result's 256-bit vector of [8 x i32] to copies of elements of the
2437 ///    256-bit vector of [8 x i32] in \a __a as specified by indexes in the
2438 ///    elements of the 256-bit vector of [8 x i32] in \a __b.
2439 ///
2440 /// \code{.operation}
2441 /// FOR i := 0 TO 7
2442 ///   j := i*32
2443 ///   k := __b[j+2:j] * 32
2444 ///   result[j+31:j] := __a[k+31:k]
2445 /// ENDFOR
2446 /// \endcode
2447 ///
2448 /// \headerfile <immintrin.h>
2449 ///
2450 /// This intrinsic corresponds to the \c VPERMD instruction.
2451 ///
2452 /// \param __a
2453 ///    A 256-bit vector of [8 x i32] containing the source values.
2454 /// \param __b
2455 ///    A 256-bit vector of [8 x i32] containing indexes of values to use from
2456 ///    \a __a.
2457 /// \returns A 256-bit vector of [8 x i32] containing the result.
2458 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2459 _mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
2460 {
       /* Operand order matters: the source values __a come first, the indexes
          __b second. Both are only reinterpreted as __v8si for the builtin. */
2461   return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b);
2462 }
2463
2464 /// Sets the result's 256-bit vector of [4 x double] to copies of elements of
2465 ///    the 256-bit vector of [4 x double] in \a V as specified by the
2466 ///    immediate value \a M.
2467 ///
2468 /// \code{.operation}
2469 /// FOR i := 0 TO 3
2470 ///   j := i*64
2471 ///   k := (M >> i*2)[1:0] * 64
2472 ///   result[j+63:j] := V[k+63:k]
2473 /// ENDFOR
2474 /// \endcode
2475 ///
2476 /// \headerfile <immintrin.h>
2477 ///
2478 /// \code
2479 /// __m256d _mm256_permute4x64_pd(__m256d V, const int M);
2480 /// \endcode
2481 ///
2482 /// This intrinsic corresponds to the \c VPERMPD instruction.
2483 ///
2484 /// \param V
2485 ///    A 256-bit vector of [4 x double] containing the source values.
2486 /// \param M
2487 ///    An immediate 8-bit value specifying which elements to copy from \a V.
2488 ///    \a M[1:0] specifies the index in \a V for element 0 of the result,
2489 ///    \a M[3:2] specifies the index for element 1, and so forth.
2490 /// \returns A 256-bit vector of [4 x double] containing the result.
/* Macro form: V is cast through __m256d to __v4df and M is cast to int; both
   are then forwarded to the permdf256 builtin. */
2491 #define _mm256_permute4x64_pd(V, M) \
2492   ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M)))
2493
2494 /// Sets the result's 256-bit vector of [8 x float] to copies of elements of
2495 ///    the 256-bit vector of [8 x float] in \a __a as specified by indexes in
2496 ///    the elements of the 256-bit vector of [8 x i32] in \a __b.
2497 ///
2498 /// \code{.operation}
2499 /// FOR i := 0 TO 7
2500 ///   j := i*32
2501 ///   k := __b[j+2:j] * 32
2502 ///   result[j+31:j] := __a[k+31:k]
2503 /// ENDFOR
2504 /// \endcode
2505 ///
2506 /// \headerfile <immintrin.h>
2507 ///
2508 /// This intrinsic corresponds to the \c VPERMPS instruction.
2509 ///
2510 /// \param __a
2511 ///    A 256-bit vector of [8 x float] containing the source values.
2512 /// \param __b
2513 ///    A 256-bit vector of [8 x i32] containing indexes of values to use from
2514 ///    \a __a.
2515 /// \returns A 256-bit vector of [8 x float] containing the result.
2516 static __inline__ __m256 __DEFAULT_FN_ATTRS256
2517 _mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
2518 {
       /* The source values __a are reinterpreted as __v8sf and the indexes __b
          as __v8si; the source operand is passed first. */
2519   return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b);
2520 }
2521
2522 /// Sets the result's 256-bit vector of [4 x i64] to copies of elements
2523 ///    of the 256-bit vector of [4 x i64] in \a V as specified by the
2524 ///    immediate value \a M.
2525 ///
2526 /// \code{.operation}
2527 /// FOR i := 0 TO 3
2528 ///   j := i*64
2529 ///   k := (M >> i*2)[1:0] * 64
2530 ///   result[j+63:j] := V[k+63:k]
2531 /// ENDFOR
2532 /// \endcode
2533 ///
2534 /// \headerfile <immintrin.h>
2535 ///
2536 /// \code
2537 /// __m256i _mm256_permute4x64_epi64(__m256i V, const int M);
2538 /// \endcode
2539 ///
2540 /// This intrinsic corresponds to the \c VPERMQ instruction.
2541 ///
2542 /// \param V
2543 ///    A 256-bit vector of [4 x i64] containing the source values.
2544 /// \param M
2545 ///    An immediate 8-bit value specifying which elements to copy from \a V.
2546 ///    \a M[1:0] specifies the index in \a V for element 0 of the result,
2547 ///    \a M[3:2] specifies the index for element 1, and so forth.
2548 /// \returns A 256-bit vector of [4 x i64] containing the result.
/* Macro form: V is cast through __m256i to __v4di and M is cast to int; both
   are then forwarded to the permdi256 builtin. */
2549 #define _mm256_permute4x64_epi64(V, M) \
2550   ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M)))
2551
2552 /// Sets each half of the 256-bit result either to zero or to one of the
2553 ///    four possible 128-bit halves of the 256-bit vectors \a V1 and \a V2,
2554 ///    as specified by the immediate value \a M.
2555 ///
2556 /// \code{.operation}
2557 /// FOR i := 0 TO 1
2558 ///   j := i*128
2559 ///   k := M >> (i*4)
2560 ///   IF k[3] == 0
2561 ///     CASE (k[1:0]) OF
2562 ///     0: result[127+j:j] := V1[127:0]
2563 ///     1: result[127+j:j] := V1[255:128]
2564 ///     2: result[127+j:j] := V2[127:0]
2565 ///     3: result[127+j:j] := V2[255:128]
2566 ///     ESAC
2567 ///   ELSE
2568 ///     result[127+j:j] := 0
2569 ///   FI
2570 /// ENDFOR
2571 /// \endcode
2572 ///
2573 /// \headerfile <immintrin.h>
2574 ///
2575 /// \code
2576 /// __m256i _mm256_permute2x128_si256(__m256i V1, __m256i V2, const int M);
2577 /// \endcode
2578 ///
2579 /// This intrinsic corresponds to the \c VPERM2I128 instruction.
2580 ///
2581 /// \param V1
2582 ///    A 256-bit integer vector containing source values.
2583 /// \param V2
2584 ///    A 256-bit integer vector containing source values.
2585 /// \param M
2586 ///    An immediate value specifying how to form the result. Bits [3:0]
2587 ///    control the lower half of the result, bits [7:4] control the upper half.
2588 ///    Within each 4-bit control value, if bit 3 is 1, the result is zero,
2589 ///    otherwise bits [1:0] determine the source as follows. \n
2590 ///    0: the lower half of \a V1 \n
2591 ///    1: the upper half of \a V1 \n
2592 ///    2: the lower half of \a V2 \n
2593 ///    3: the upper half of \a V2
2594 /// \returns A 256-bit integer vector containing the result.
/* Macro form: V1 and V2 are cast to __m256i only (no element-typed vector
   cast) and M is cast to int before they reach the permti256 builtin. */
2595 #define _mm256_permute2x128_si256(V1, V2, M) \
2596   ((__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M)))
2597
2598 /// Extracts half of the 256-bit vector \a V to the 128-bit result. If bit 0
2599 ///     of the immediate \a M is zero, extracts the lower half of \a V;
2600 ///     otherwise, extracts the upper half.
2601 ///
2602 /// \headerfile <immintrin.h>
2603 ///
2604 /// \code
2605 /// __m128i _mm256_extracti128_si256(__m256i V, const int M);
2606 /// \endcode
2607 ///
2608 /// This intrinsic corresponds to the \c VEXTRACTI128 instruction.
2609 ///
2610 /// \param V
2611 ///    A 256-bit integer vector containing the source values.
2612 /// \param M
2613 ///    An immediate value specifying which half of \a V to extract.
2614 /// \returns A 128-bit integer vector containing the result.
/* Macro form: V is cast through __m256i to __v4di and M is cast to int; the
   builtin's result is cast to __m128i. */
2615 #define _mm256_extracti128_si256(V, M) \
2616   ((__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M)))
2617
2618 /// Copies the 256-bit vector \a V1 to the result, then overwrites half of the
2619 ///     result with the 128-bit vector \a V2. If bit 0 of the immediate \a M
2620 ///     is zero, overwrites the lower half of the result; otherwise,
2621 ///     overwrites the upper half.
2622 ///
2623 /// \headerfile <immintrin.h>
2624 ///
2625 /// \code
2626 /// __m256i _mm256_inserti128_si256(__m256i V1, __m128i V2, const int M);
2627 /// \endcode
2628 ///
2629 /// This intrinsic corresponds to the \c VINSERTI128 instruction.
2630 ///
2631 /// \param V1
2632 ///    A 256-bit integer vector containing a source value.
2633 /// \param V2
2634 ///    A 128-bit integer vector containing a source value.
2635 /// \param M
2636 ///    An immediate value specifying where to put \a V2 in the result.
2637 /// \returns A 256-bit integer vector containing the result.
/* Macro form: V1 is cast through __m256i to __v4di, V2 through __m128i to
   __v2di, and M to int before they reach the insert128i256 builtin. */
2638 #define _mm256_inserti128_si256(V1, V2, M) \
2639   ((__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \
2640                                          (__v2di)(__m128i)(V2), (int)(M)))
2641
/// Loads 32-bit integer elements from memory at \a __X under control of the
///    mask vector \a __M by forwarding both to the maskloadd256 builtin.
///
/// NOTE(review): per the Intel reference for \c VPMASKMOVD, an element is
///    loaded only when the most significant bit of the corresponding mask
///    element is set and is zero otherwise. This header does not show that;
///    confirm before relying on it.
///
/// \param __X
///    A pointer to the memory used for loading values; it is reinterpreted
///    as a pointer to const __v8si.
/// \param __M
///    A 256-bit integer vector, reinterpreted as __v8si, holding the mask.
/// \returns A 256-bit integer vector containing the builtin's result.
2642 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2643 _mm256_maskload_epi32(int const *__X, __m256i __M)
2644 {
2645   return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M);
2646 }
2647
/// Loads 64-bit integer elements from memory at \a __X under control of the
///    mask vector \a __M by forwarding both to the maskloadq256 builtin.
///
/// NOTE(review): per the Intel reference for \c VPMASKMOVQ, an element is
///    loaded only when the most significant bit of the corresponding mask
///    element is set and is zero otherwise. This header does not show that;
///    confirm before relying on it.
///
/// \param __X
///    A pointer to the memory used for loading values; it is reinterpreted
///    as a pointer to const __v4di.
/// \param __M
///    A 256-bit integer vector, reinterpreted as __v4di, holding the mask.
/// \returns A 256-bit integer vector containing the builtin's result.
2648 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2649 _mm256_maskload_epi64(long long const *__X, __m256i __M)
2650 {
2651   return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M);
2652 }
2653
/// Loads 32-bit integer elements from memory at \a __X under control of the
///    mask vector \a __M by forwarding both to the maskloadd builtin.
///
/// NOTE(review): per the Intel reference for \c VPMASKMOVD, an element is
///    loaded only when the most significant bit of the corresponding mask
///    element is set and is zero otherwise. This header does not show that;
///    confirm before relying on it.
///
/// \param __X
///    A pointer to the memory used for loading values; it is reinterpreted
///    as a pointer to const __v4si.
/// \param __M
///    A 128-bit integer vector, reinterpreted as __v4si, holding the mask.
/// \returns A 128-bit integer vector containing the builtin's result.
2654 static __inline__ __m128i __DEFAULT_FN_ATTRS128
2655 _mm_maskload_epi32(int const *__X, __m128i __M)
2656 {
2657   return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M);
2658 }
2659
/// Loads 64-bit integer elements from memory at \a __X under control of the
///    mask vector \a __M by forwarding both to the maskloadq builtin.
///
/// NOTE(review): per the Intel reference for \c VPMASKMOVQ, an element is
///    loaded only when the most significant bit of the corresponding mask
///    element is set and is zero otherwise. This header does not show that;
///    confirm before relying on it.
///
/// \param __X
///    A pointer to the memory used for loading values; it is reinterpreted
///    as a pointer to const __v2di.
/// \param __M
///    A 128-bit integer vector, reinterpreted as __v2di, holding the mask.
/// \returns A 128-bit integer vector containing the builtin's result.
2660 static __inline__ __m128i __DEFAULT_FN_ATTRS128
2661 _mm_maskload_epi64(long long const *__X, __m128i __M)
2662 {
2663   return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M);
2664 }
2665
/// Stores 32-bit integer elements of \a __Y to memory at \a __X under control
///    of the mask vector \a __M by forwarding all three to the maskstored256
///    builtin.
///
/// NOTE(review): per the Intel reference for \c VPMASKMOVD, an element is
///    written only when the most significant bit of the corresponding mask
///    element is set, and other memory is left untouched. This header does
///    not show that; confirm before relying on it.
///
/// \param __X
///    A pointer to the destination memory; it is reinterpreted as a pointer
///    to __v8si.
/// \param __M
///    A 256-bit integer vector, reinterpreted as __v8si, holding the mask.
/// \param __Y
///    A 256-bit integer vector, reinterpreted as __v8si, holding the values
///    to store.
2666 static __inline__ void __DEFAULT_FN_ATTRS256
2667 _mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
2668 {
2669   __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
2670 }
2671
/// Stores 64-bit integer elements of \a __Y to memory at \a __X under control
///    of the mask vector \a __M by forwarding all three to the maskstoreq256
///    builtin.
///
/// NOTE(review): per the Intel reference for \c VPMASKMOVQ, an element is
///    written only when the most significant bit of the corresponding mask
///    element is set, and other memory is left untouched. This header does
///    not show that; confirm before relying on it.
///
/// \param __X
///    A pointer to the destination memory; it is reinterpreted as a pointer
///    to __v4di.
/// \param __M
///    A 256-bit integer vector, reinterpreted as __v4di, holding the mask.
/// \param __Y
///    A 256-bit integer vector, reinterpreted as __v4di, holding the values
///    to store.
2672 static __inline__ void __DEFAULT_FN_ATTRS256
2673 _mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
2674 {
2675   __builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
2676 }
2677
/// Stores 32-bit integer elements of \a __Y to memory at \a __X under control
///    of the mask vector \a __M by forwarding all three to the maskstored
///    builtin.
///
/// NOTE(review): per the Intel reference for \c VPMASKMOVD, an element is
///    written only when the most significant bit of the corresponding mask
///    element is set, and other memory is left untouched. This header does
///    not show that; confirm before relying on it.
///
/// \param __X
///    A pointer to the destination memory; it is reinterpreted as a pointer
///    to __v4si.
/// \param __M
///    A 128-bit integer vector, reinterpreted as __v4si, holding the mask.
/// \param __Y
///    A 128-bit integer vector, reinterpreted as __v4si, holding the values
///    to store.
2678 static __inline__ void __DEFAULT_FN_ATTRS128
2679 _mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
2680 {
2681   __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
2682 }
2683
/// Stores 64-bit integer elements of \a __Y to memory at \a __X under control
///    of the mask vector \a __M by forwarding all three to the maskstoreq
///    builtin.
///
/// NOTE(review): per the Intel reference for \c VPMASKMOVQ, an element is
///    written only when the most significant bit of the corresponding mask
///    element is set, and other memory is left untouched. This header does
///    not show that; confirm before relying on it.
///
/// \param __X
///    A pointer to the destination memory; it is reinterpreted as a pointer
///    to __v2di.
/// \param __M
///    A 128-bit integer vector, reinterpreted as __v2di, holding the mask.
/// \param __Y
///    A 128-bit integer vector, reinterpreted as __v2di, holding the values
///    to store.
2684 static __inline__ void __DEFAULT_FN_ATTRS128
2685 _mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
2686 {
2687   __builtin_ia32_maskstoreq(( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
2688 }
2689
2690 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
2691 ///    left by the number of bits given in the corresponding element of the
2692 ///    256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
2693 ///    returns the result. If the shift count for any element is greater than
2694 ///    31, the result for that element is zero.
2695 ///
2696 /// \headerfile <immintrin.h>
2697 ///
2698 /// This intrinsic corresponds to the \c VPSLLVD instruction.
2699 ///
2700 /// \param __X
2701 ///    A 256-bit vector of [8 x i32] to be shifted.
2702 /// \param __Y
2703 ///    A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
2704 ///    bits).
2705 /// \returns A 256-bit vector of [8 x i32] containing the result.
2706 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2707 _mm256_sllv_epi32(__m256i __X, __m256i __Y)
2708 {
       /* The casts only reinterpret the values __X and the counts __Y as
          __v8si; the per-element left shift is done by the builtin. */
2709   return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
2710 }
2711
2712 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
2713 ///    left by the number of bits given in the corresponding element of the
2714 ///    128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
2715 ///    returns the result. If the shift count for any element is greater than
2716 ///    31, the result for that element is zero.
2717 ///
2718 /// \headerfile <immintrin.h>
2719 ///
2720 /// This intrinsic corresponds to the \c VPSLLVD instruction.
2721 ///
2722 /// \param __X
2723 ///    A 128-bit vector of [4 x i32] to be shifted.
2724 /// \param __Y
2725 ///    A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
2726 ///    bits).
2727 /// \returns A 128-bit vector of [4 x i32] containing the result.
2728 static __inline__ __m128i __DEFAULT_FN_ATTRS128
2729 _mm_sllv_epi32(__m128i __X, __m128i __Y)
2730 {
       /* The casts only reinterpret the values __X and the counts __Y as
          __v4si; the per-element left shift is done by the builtin. */
2731   return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
2732 }
2733
2734 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
2735 ///    left by the number of bits given in the corresponding element of the
2736 ///    256-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
2737 ///    returns the result. If the shift count for any element is greater than
2738 ///    63, the result for that element is zero.
2739 ///
2740 /// \headerfile <immintrin.h>
2741 ///
2742 /// This intrinsic corresponds to the \c VPSLLVQ instruction.
2743 ///
2744 /// \param __X
2745 ///    A 256-bit vector of [4 x i64] to be shifted.
2746 /// \param __Y
2747 ///    A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
2748 ///    bits).
2749 /// \returns A 256-bit vector of [4 x i64] containing the result.
2750 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2751 _mm256_sllv_epi64(__m256i __X, __m256i __Y)
2752 {
       /* The casts only reinterpret the values __X and the counts __Y as
          __v4di; the per-element left shift is done by the builtin. */
2753   return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y);
2754 }
2755
2756 /// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
2757 ///    left by the number of bits given in the corresponding element of the
2758 ///    128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
2759 ///    returns the result. If the shift count for any element is greater than
2760 ///    63, the result for that element is zero.
2761 ///
2762 /// \headerfile <immintrin.h>
2763 ///
2764 /// This intrinsic corresponds to the \c VPSLLVQ instruction.
2765 ///
2766 /// \param __X
2767 ///    A 128-bit vector of [2 x i64] to be shifted.
2768 /// \param __Y
2769 ///    A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
2770 ///    bits).
2771 /// \returns A 128-bit vector of [2 x i64] containing the result.
2772 static __inline__ __m128i __DEFAULT_FN_ATTRS128
2773 _mm_sllv_epi64(__m128i __X, __m128i __Y)
2774 {
       /* The casts only reinterpret the values __X and the counts __Y as
          __v2di; the per-element left shift is done by the builtin. */
2775   return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y);
2776 }
2777
2778 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
2779 ///    right by the number of bits given in the corresponding element of the
2780 ///    256-bit vector of [8 x i32] in \a __Y, shifting in sign bits, and
2781 ///    returns the result. If the shift count for any element is greater than
2782 ///    31, the result for that element is 0 or -1 according to the sign bit
2783 ///    for that element.
2784 ///
2785 /// \headerfile <immintrin.h>
2786 ///
2787 /// This intrinsic corresponds to the \c VPSRAVD instruction.
2788 ///
2789 /// \param __X
2790 ///    A 256-bit vector of [8 x i32] to be shifted.
2791 /// \param __Y
2792 ///    A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
2793 ///    bits).
2794 /// \returns A 256-bit vector of [8 x i32] containing the result.
2795 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2796 _mm256_srav_epi32(__m256i __X, __m256i __Y)
2797 {
       /* The casts only reinterpret the values __X and the counts __Y as
          __v8si; the per-element arithmetic right shift is done by the
          builtin. */
2798   return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
2799 }
2800
2801 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
2802 ///    right by the number of bits given in the corresponding element of the
2803 ///    128-bit vector of [4 x i32] in \a __Y, shifting in sign bits, and
2804 ///    returns the result. If the shift count for any element is greater than
2805 ///    31, the result for that element is 0 or -1 according to the sign bit
2806 ///    for that element.
2807 ///
2808 /// \headerfile <immintrin.h>
2809 ///
2810 /// This intrinsic corresponds to the \c VPSRAVD instruction.
2811 ///
2812 /// \param __X
2813 ///    A 128-bit vector of [4 x i32] to be shifted.
2814 /// \param __Y
2815 ///    A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
2816 ///    bits).
2817 /// \returns A 128-bit vector of [4 x i32] containing the result.
2818 static __inline__ __m128i __DEFAULT_FN_ATTRS128
2819 _mm_srav_epi32(__m128i __X, __m128i __Y)
2820 {
       /* The casts only reinterpret the values __X and the counts __Y as
          __v4si; the per-element arithmetic right shift is done by the
          builtin. */
2821   return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
2822 }
2823
2824 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
2825 ///    right by the number of bits given in the corresponding element of the
2826 ///    256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
2827 ///    returns the result. If the shift count for any element is greater than
2828 ///    31, the result for that element is zero.
2829 ///
2830 /// \headerfile <immintrin.h>
2831 ///
2832 /// This intrinsic corresponds to the \c VPSRLVD instruction.
2833 ///
2834 /// \param __X
2835 ///    A 256-bit vector of [8 x i32] to be shifted.
2836 /// \param __Y
2837 ///    A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
2838 ///    bits).
2839 /// \returns A 256-bit vector of [8 x i32] containing the result.
2840 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2841 _mm256_srlv_epi32(__m256i __X, __m256i __Y)
2842 {
       /* The casts only reinterpret the values __X and the counts __Y as
          __v8si; the per-element logical right shift is done by the builtin. */
2843   return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
2844 }
2845
2846 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
2847 ///    right by the number of bits given in the corresponding element of the
2848 ///    128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
2849 ///    returns the result. If the shift count for any element is greater than
2850 ///    31, the result for that element is zero.
2851 ///
2852 /// \headerfile <immintrin.h>
2853 ///
2854 /// This intrinsic corresponds to the \c VPSRLVD instruction.
2855 ///
2856 /// \param __X
2857 ///    A 128-bit vector of [4 x i32] to be shifted.
2858 /// \param __Y
2859 ///    A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
2860 ///    bits).
2861 /// \returns A 128-bit vector of [4 x i32] containing the result.
2862 static __inline__ __m128i __DEFAULT_FN_ATTRS128
2863 _mm_srlv_epi32(__m128i __X, __m128i __Y)
2864 {
       /* The casts only reinterpret the values __X and the counts __Y as
          __v4si; the per-element logical right shift is done by the builtin. */
2865   return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
2866 }
2867
2868 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
2869 ///    right by the number of bits given in the corresponding element of the
2870 ///    256-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
2871 ///    returns the result. If the shift count for any element is greater than
2872 ///    63, the result for that element is zero.
2873 ///
2874 /// \headerfile <immintrin.h>
2875 ///
2876 /// This intrinsic corresponds to the \c VPSRLVQ instruction.
2877 ///
2878 /// \param __X
2879 ///    A 256-bit vector of [4 x i64] to be shifted.
2880 /// \param __Y
2881 ///    A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
2882 ///    bits).
2883 /// \returns A 256-bit vector of [4 x i64] containing the result.
2884 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2885 _mm256_srlv_epi64(__m256i __X, __m256i __Y)
2886 {
       /* The casts only reinterpret the values __X and the counts __Y as
          __v4di; the per-element logical right shift is done by the builtin. */
2887   return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y);
2888 }
2889
2890 /// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
2891 ///    right by the number of bits given in the corresponding element of the
2892 ///    128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
2893 ///    returns the result. If the shift count for any element is greater than
2894 ///    63, the result for that element is zero.
2895 ///
2896 /// \headerfile <immintrin.h>
2897 ///
2898 /// This intrinsic corresponds to the \c VPSRLVQ instruction.
2899 ///
2900 /// \param __X
2901 ///    A 128-bit vector of [2 x i64] to be shifted.
2902 /// \param __Y
2903 ///    A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
2904 ///    bits).
2905 /// \returns A 128-bit vector of [2 x i64] containing the result.
2906 static __inline__ __m128i __DEFAULT_FN_ATTRS128
2907 _mm_srlv_epi64(__m128i __X, __m128i __Y)
2908 {
       /* The casts only reinterpret the values __X and the counts __Y as
          __v2di; the per-element logical right shift is done by the builtin. */
2909   return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);
2910 }
2911
2912 /// Conditionally gathers two 64-bit floating-point values, either from the
2913 ///    128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
2914 ///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
2915 ///    of [2 x double] in \a mask determines the source for each element.
2916 ///
2917 /// \code{.operation}
2918 /// FOR element := 0 to 1
2919 ///   j := element*64
2920 ///   k := element*32
2921 ///   IF mask[j+63] == 0
2922 ///     result[j+63:j] := a[j+63:j]
2923 ///   ELSE
2924 ///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
2925 ///   FI
2926 /// ENDFOR
2927 /// \endcode
2928 ///
2929 /// \headerfile <immintrin.h>
2930 ///
2931 /// \code
2932 /// __m128d _mm_mask_i32gather_pd(__m128d a, const double *m, __m128i i,
2933 ///                               __m128d mask, const int s);
2934 /// \endcode
2935 ///
2936 /// This intrinsic corresponds to the \c VGATHERDPD instruction.
2937 ///
2938 /// \param a
2939 ///    A 128-bit vector of [2 x double] used as the source when a mask bit is
2940 ///    zero.
2941 /// \param m
2942 ///    A pointer to the memory used for loading values.
2943 /// \param i
2944 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
2945 ///    the first two elements are used.
2946 /// \param mask
2947 ///    A 128-bit vector of [2 x double] containing the mask. The most
2948 ///    significant bit of each element in the mask vector represents the mask
2949 ///    bits. If a mask bit is zero, the corresponding value from vector \a a
2950 ///    is gathered; otherwise the value is loaded from memory.
2951 /// \param s
2952 ///    A literal constant scale factor for the indexes in \a i. Must be
2953 ///    1, 2, 4, or 8.
2954 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
/* The pass-through operand a is a [2 x double] vector, so it is cast through
   __m128d (not __m128i), matching mask and the other gather_pd macros. m is
   cast to double const *, i goes through __m128i to __v4si, and s is
   forwarded to the builtin as written. */
2955 #define _mm_mask_i32gather_pd(a, m, i, mask, s) \
2956   ((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128d)(a), \
2957                                       (double const *)(m), \
2958                                       (__v4si)(__m128i)(i), \
2959                                       (__v2df)(__m128d)(mask), (s)))
2960
2961 /// Conditionally gathers four 64-bit floating-point values, either from the
2962 ///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
2963 ///    indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
2964 ///    of [4 x double] in \a mask determines the source for each element.
2965 ///
2966 /// \code{.operation}
2967 /// FOR element := 0 to 3
2968 ///   j := element*64
2969 ///   k := element*32
2970 ///   IF mask[j+63] == 0
2971 ///     result[j+63:j] := a[j+63:j]
2972 ///   ELSE
2973 ///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
2974 ///   FI
2975 /// ENDFOR
2976 /// \endcode
2977 ///
2978 /// \headerfile <immintrin.h>
2979 ///
2980 /// \code
2981 /// __m256d _mm256_mask_i32gather_pd(__m256d a, const double *m, __m128i i,
2982 ///                                  __m256d mask, const int s);
2983 /// \endcode
2984 ///
2985 /// This intrinsic corresponds to the \c VGATHERDPD instruction.
2986 ///
2987 /// \param a
2988 ///    A 256-bit vector of [4 x double] used as the source when a mask bit is
2989 ///    zero.
2990 /// \param m
2991 ///    A pointer to the memory used for loading values.
2992 /// \param i
2993 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
2994 /// \param mask
2995 ///    A 256-bit vector of [4 x double] containing the mask. The most
2996 ///    significant bit of each element in the mask vector represents the mask
2997 ///    bits. If a mask bit is zero, the corresponding value from vector \a a
2998 ///    is gathered; otherwise the value is loaded from memory.
2999 /// \param s
3000 ///    A literal constant scale factor for the indexes in \a i. Must be
3001 ///    1, 2, 4, or 8.
3002 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
/* Macro form: a and mask are cast through __m256d to __v4df, i through
   __m128i to __v4si, and m to double const *; s is forwarded to the builtin
   as written. */
3003 #define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
3004   ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
3005                                          (double const *)(m), \
3006                                          (__v4si)(__m128i)(i), \
3007                                          (__v4df)(__m256d)(mask), (s)))
3008
3009 /// Conditionally gathers two 64-bit floating-point values, either from the
3010 ///    128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
3011 ///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
3012 ///    of [2 x double] in \a mask determines the source for each element.
3013 ///
3014 /// \code{.operation}
3015 /// FOR element := 0 to 1
3016 ///   j := element*64
3017 ///   k := element*64
3018 ///   IF mask[j+63] == 0
3019 ///     result[j+63:j] := a[j+63:j]
3020 ///   ELSE
3021 ///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
3022 ///   FI
3023 /// ENDFOR
3024 /// \endcode
3025 ///
3026 /// \headerfile <immintrin.h>
3027 ///
3028 /// \code
3029 /// __m128d _mm_mask_i64gather_pd(__m128d a, const double *m, __m128i i,
3030 ///                               __m128d mask, const int s);
3031 /// \endcode
3032 ///
3033 /// This intrinsic corresponds to the \c VGATHERQPD instruction.
3034 ///
3035 /// \param a
3036 ///    A 128-bit vector of [2 x double] used as the source when a mask bit is
3037 ///    zero.
3038 /// \param m
3039 ///    A pointer to the memory used for loading values.
3040 /// \param i
3041 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
3042 /// \param mask
3043 ///    A 128-bit vector of [2 x double] containing the mask. The most
3044 ///    significant bit of each element in the mask vector represents the mask
3045 ///    bits. If a mask bit is zero, the corresponding value from vector \a a
3046 ///    is gathered; otherwise the value is loaded from memory.
3047 /// \param s
3048 ///    A literal constant scale factor for the indexes in \a i. Must be
3049 ///    1, 2, 4, or 8.
3050 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
/* Macro form: a and mask are cast through __m128d to __v2df, i through
   __m128i to __v2di, and m to double const *; s is forwarded to the builtin
   as written. */
3051 #define _mm_mask_i64gather_pd(a, m, i, mask, s) \
3052   ((__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
3053                                       (double const *)(m), \
3054                                       (__v2di)(__m128i)(i), \
3055                                       (__v2df)(__m128d)(mask), (s)))
3056
3057 /// Conditionally gathers four 64-bit floating-point values, either from the
3058 ///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
3059 ///    indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
3060 ///    of [4 x double] in \a mask determines the source for each element.
3061 ///
3062 /// \code{.operation}
3063 /// FOR element := 0 to 3
3064 ///   j := element*64
3065 ///   k := element*64
3066 ///   IF mask[j+63] == 0
3067 ///     result[j+63:j] := a[j+63:j]
3068 ///   ELSE
3069 ///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
3070 ///   FI
3071 /// ENDFOR
3072 /// \endcode
3073 ///
3074 /// \headerfile <immintrin.h>
3075 ///
3076 /// \code
3077 /// __m256d _mm256_mask_i64gather_pd(__m256d a, const double *m, __m256i i,
3078 ///                                  __m256d mask, const int s);
3079 /// \endcode
3080 ///
3081 /// This intrinsic corresponds to the \c VGATHERQPD instruction.
3082 ///
3083 /// \param a
3084 ///    A 256-bit vector of [4 x double] used as the source when a mask bit is
3085 ///    zero.
3086 /// \param m
3087 ///    A pointer to the memory used for loading values.
3088 /// \param i
3089 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
3090 /// \param mask
3091 ///    A 256-bit vector of [4 x double] containing the mask. The most
3092 ///    significant bit of each element in the mask vector represents the mask
3093 ///    bits. If a mask bit is zero, the corresponding value from vector \a a
3094 ///    is gathered; otherwise the value is loaded from memory.
3095 /// \param s
3096 ///    A literal constant scale factor for the indexes in \a i. Must be
3097 ///    1, 2, 4, or 8.
3098 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
/* Macro form: a and mask are cast through __m256d to __v4df, i through
   __m256i to __v4di, and m to double const *; s is forwarded to the builtin
   as written. */
3099 #define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
3100   ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
3101                                          (double const *)(m), \
3102                                          (__v4di)(__m256i)(i), \
3103                                          (__v4df)(__m256d)(mask), (s)))
3104
3105 /// Conditionally gathers four 32-bit floating-point values, either from the
3106 ///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
3107 ///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
3108 ///    of [4 x float] in \a mask determines the source for each element.
3109 ///
3110 /// \code{.operation}
3111 /// FOR element := 0 to 3
3112 ///   j := element*32
3113 ///   k := element*32
3114 ///   IF mask[j+31] == 0
3115 ///     result[j+31:j] := a[j+31:j]
3116 ///   ELSE
3117 ///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
3118 ///   FI
3119 /// ENDFOR
3120 /// \endcode
3121 ///
3122 /// \headerfile <immintrin.h>
3123 ///
3124 /// \code
3125 /// __m128 _mm_mask_i32gather_ps(__m128 a, const float *m, __m128i i,
3126 ///                              __m128 mask, const int s);
3127 /// \endcode
3128 ///
3129 /// This intrinsic corresponds to the \c VGATHERDPS instruction.
3130 ///
3131 /// \param a
3132 ///    A 128-bit vector of [4 x float] used as the source when a mask bit is
3133 ///    zero.
3134 /// \param m
3135 ///    A pointer to the memory used for loading values.
3136 /// \param i
3137 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
3138 /// \param mask
3139 ///    A 128-bit vector of [4 x float] containing the mask. The most
3140 ///    significant bit of each element in the mask vector is the mask bit for
3141 ///    that element. If a mask bit is zero, the corresponding element of \a a
3142 ///    is copied to the result; otherwise the value is loaded from memory.
3143 /// \param s
3144 ///    A literal constant scale factor for the indexes in \a i. Must be
3145 ///    1, 2, 4, or 8.
3146 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
3147 #define _mm_mask_i32gather_ps(a, m, i, mask, s) \
3148   ((__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
3149                                      (float const *)(m), \
3150                                      (__v4si)(__m128i)(i), \
3151                                      (__v4sf)(__m128)(mask), (s)))
3152
3153 /// Conditionally gathers eight 32-bit floating-point values, either from the
3154 ///    256-bit vector of [8 x float] in \a a, or from memory \a m using scaled
3155 ///    indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
3156 ///    of [8 x float] in \a mask determines the source for each element.
3157 ///
3158 /// \code{.operation}
3159 /// FOR element := 0 to 7
3160 ///   j := element*32
3161 ///   k := element*32
3162 ///   IF mask[j+31] == 0
3163 ///     result[j+31:j] := a[j+31:j]
3164 ///   ELSE
3165 ///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
3166 ///   FI
3167 /// ENDFOR
3168 /// \endcode
3169 ///
3170 /// \headerfile <immintrin.h>
3171 ///
3172 /// \code
3173 /// __m256 _mm256_mask_i32gather_ps(__m256 a, const float *m, __m256i i,
3174 ///                                 __m256 mask, const int s);
3175 /// \endcode
3176 ///
3177 /// This intrinsic corresponds to the \c VGATHERDPS instruction.
3178 ///
3179 /// \param a
3180 ///    A 256-bit vector of [8 x float] used as the source when a mask bit is
3181 ///    zero.
3182 /// \param m
3183 ///    A pointer to the memory used for loading values.
3184 /// \param i
3185 ///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
3186 /// \param mask
3187 ///    A 256-bit vector of [8 x float] containing the mask. The most
3188 ///    significant bit of each element in the mask vector is the mask bit for
3189 ///    that element. If a mask bit is zero, the corresponding element of \a a
3190 ///    is copied to the result; otherwise the value is loaded from memory.
3191 /// \param s
3192 ///    A literal constant scale factor for the indexes in \a i. Must be
3193 ///    1, 2, 4, or 8.
3194 /// \returns A 256-bit vector of [8 x float] containing the gathered values.
3195 #define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
3196   ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
3197                                         (float const *)(m), \
3198                                         (__v8si)(__m256i)(i), \
3199                                         (__v8sf)(__m256)(mask), (s)))
3200
3201 /// Conditionally gathers two 32-bit floating-point values, either from the
3202 ///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
3203 ///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
3204 ///    of [4 x float] in \a mask determines the source for the lower two
3205 ///    elements. The upper two elements of the result are zeroed.
3206 ///
3207 /// \code{.operation}
3208 /// FOR element := 0 to 1
3209 ///   j := element*32
3210 ///   k := element*64
3211 ///   IF mask[j+31] == 0
3212 ///     result[j+31:j] := a[j+31:j]
3213 ///   ELSE
3214 ///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
3215 ///   FI
3216 /// ENDFOR
3217 /// result[127:64] := 0
3218 /// \endcode
3219 ///
3220 /// \headerfile <immintrin.h>
3221 ///
3222 /// \code
3223 /// __m128 _mm_mask_i64gather_ps(__m128 a, const float *m, __m128i i,
3224 ///                              __m128 mask, const int s);
3225 /// \endcode
3226 ///
3227 /// This intrinsic corresponds to the \c VGATHERQPS instruction.
3228 ///
3229 /// \param a
3230 ///    A 128-bit vector of [4 x float] used as the source when a mask bit is
3231 ///    zero. Only the first two elements are used.
3232 /// \param m
3233 ///    A pointer to the memory used for loading values.
3234 /// \param i
3235 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
3236 /// \param mask
3237 ///    A 128-bit vector of [4 x float] containing the mask. The most
3238 ///    significant bit of each element in the mask vector is the mask bit for
3239 ///    that element. If a mask bit is zero, the corresponding element of \a a
3240 ///    is copied to the result; otherwise the value is loaded from memory. Only
3241 ///    the first two elements are used.
3242 /// \param s
3243 ///    A literal constant scale factor for the indexes in \a i. Must be
3244 ///    1, 2, 4, or 8.
3245 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
3246 #define _mm_mask_i64gather_ps(a, m, i, mask, s) \
3247   ((__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
3248                                      (float const *)(m), \
3249                                      (__v2di)(__m128i)(i), \
3250                                      (__v4sf)(__m128)(mask), (s)))
3251
3252 /// Conditionally gathers four 32-bit floating-point values, either from the
3253 ///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
3254 ///    indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
3255 ///    of [4 x float] in \a mask determines the source for each element.
3256 ///
3257 /// \code{.operation}
3258 /// FOR element := 0 to 3
3259 ///   j := element*32
3260 ///   k := element*64
3261 ///   IF mask[j+31] == 0
3262 ///     result[j+31:j] := a[j+31:j]
3263 ///   ELSE
3264 ///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
3265 ///   FI
3266 /// ENDFOR
3267 /// \endcode
3268 ///
3269 /// \headerfile <immintrin.h>
3270 ///
3271 /// \code
3272 /// __m128 _mm256_mask_i64gather_ps(__m128 a, const float *m, __m256i i,
3273 ///                                 __m128 mask, const int s);
3274 /// \endcode
3275 ///
3276 /// This intrinsic corresponds to the \c VGATHERQPS instruction.
3277 ///
3278 /// \param a
3279 ///    A 128-bit vector of [4 x float] used as the source when a mask bit is
3280 ///    zero.
3281 /// \param m
3282 ///    A pointer to the memory used for loading values.
3283 /// \param i
3284 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
3285 /// \param mask
3286 ///    A 128-bit vector of [4 x float] containing the mask. The most
3287 ///    significant bit of each element in the mask vector is the mask bit for
3288 ///    that element. If a mask bit is zero, the corresponding element of \a a
3289 ///    is copied to the result; otherwise the value is loaded from memory.
3290 /// \param s
3291 ///    A literal constant scale factor for the indexes in \a i. Must be
3292 ///    1, 2, 4, or 8.
3293 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
3294 #define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
3295   ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
3296                                         (float const *)(m), \
3297                                         (__v4di)(__m256i)(i), \
3298                                         (__v4sf)(__m128)(mask), (s)))
3299
3300 /// Conditionally gathers four 32-bit integer values, either from the
3301 ///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
3302 ///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
3303 ///    of [4 x i32] in \a mask determines the source for each element.
3304 ///
3305 /// \code{.operation}
3306 /// FOR element := 0 to 3
3307 ///   j := element*32
3308 ///   k := element*32
3309 ///   IF mask[j+31] == 0
3310 ///     result[j+31:j] := a[j+31:j]
3311 ///   ELSE
3312 ///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
3313 ///   FI
3314 /// ENDFOR
3315 /// \endcode
3316 ///
3317 /// \headerfile <immintrin.h>
3318 ///
3319 /// \code
3320 /// __m128i _mm_mask_i32gather_epi32(__m128i a, const int *m, __m128i i,
3321 ///                                  __m128i mask, const int s);
3322 /// \endcode
3323 ///
3324 /// This intrinsic corresponds to the \c VPGATHERDD instruction.
3325 ///
3326 /// \param a
3327 ///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
3328 ///    zero.
3329 /// \param m
3330 ///    A pointer to the memory used for loading values.
3331 /// \param i
3332 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
3333 /// \param mask
3334 ///    A 128-bit vector of [4 x i32] containing the mask. The most significant
3335 ///    bit of each element in the mask vector is the mask bit for that element.
3336 ///    If a mask bit is zero, the corresponding element of \a a is copied to
3337 ///    the result; otherwise the value is loaded from memory.
3338 /// \param s
3339 ///    A literal constant scale factor for the indexes in \a i. Must be
3340 ///    1, 2, 4, or 8.
3341 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
3342 #define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
3343   ((__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
3344                                      (int const *)(m), \
3345                                      (__v4si)(__m128i)(i), \
3346                                      (__v4si)(__m128i)(mask), (s)))
3347
3348 /// Conditionally gathers eight 32-bit integer values, either from the
3349 ///    256-bit vector of [8 x i32] in \a a, or from memory \a m using scaled
3350 ///    indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
3351 ///    of [8 x i32] in \a mask determines the source for each element.
3352 ///
3353 /// \code{.operation}
3354 /// FOR element := 0 to 7
3355 ///   j := element*32
3356 ///   k := element*32
3357 ///   IF mask[j+31] == 0
3358 ///     result[j+31:j] := a[j+31:j]
3359 ///   ELSE
3360 ///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
3361 ///   FI
3362 /// ENDFOR
3363 /// \endcode
3364 ///
3365 /// \headerfile <immintrin.h>
3366 ///
3367 /// \code
3368 /// __m256i _mm256_mask_i32gather_epi32(__m256i a, const int *m, __m256i i,
3369 ///                                     __m256i mask, const int s);
3370 /// \endcode
3371 ///
3372 /// This intrinsic corresponds to the \c VPGATHERDD instruction.
3373 ///
3374 /// \param a
3375 ///    A 256-bit vector of [8 x i32] used as the source when a mask bit is
3376 ///    zero.
3377 /// \param m
3378 ///    A pointer to the memory used for loading values.
3379 /// \param i
3380 ///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
3381 /// \param mask
3382 ///    A 256-bit vector of [8 x i32] containing the mask. The most significant
3383 ///    bit of each element in the mask vector is the mask bit for that element.
3384 ///    If a mask bit is zero, the corresponding element of \a a is copied to
3385 ///    the result; otherwise the value is loaded from memory.
3386 /// \param s
3387 ///    A literal constant scale factor for the indexes in \a i. Must be
3388 ///    1, 2, 4, or 8.
3389 /// \returns A 256-bit vector of [8 x i32] containing the gathered values.
3390 #define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
3391   ((__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
3392                                         (int const *)(m), \
3393                                         (__v8si)(__m256i)(i), \
3394                                         (__v8si)(__m256i)(mask), (s)))
3395
3396 /// Conditionally gathers two 32-bit integer values, either from the
3397 ///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
3398 ///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
3399 ///    of [4 x i32] in \a mask determines the source for the lower two
3400 ///    elements. The upper two elements of the result are zeroed.
3401 ///
3402 /// \code{.operation}
3403 /// FOR element := 0 to 1
3404 ///   j := element*32
3405 ///   k := element*64
3406 ///   IF mask[j+31] == 0
3407 ///     result[j+31:j] := a[j+31:j]
3408 ///   ELSE
3409 ///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
3410 ///   FI
3411 /// ENDFOR
3412 /// result[127:64] := 0
3413 /// \endcode
3414 ///
3415 /// \headerfile <immintrin.h>
3416 ///
3417 /// \code
3418 /// __m128i _mm_mask_i64gather_epi32(__m128i a, const int *m, __m128i i,
3419 ///                                  __m128i mask, const int s);
3420 /// \endcode
3421 ///
3422 /// This intrinsic corresponds to the \c VPGATHERQD instruction.
3423 ///
3424 /// \param a
3425 ///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
3426 ///    zero. Only the first two elements are used.
3427 /// \param m
3428 ///    A pointer to the memory used for loading values.
3429 /// \param i
3430 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
3431 /// \param mask
3432 ///    A 128-bit vector of [4 x i32] containing the mask. The most significant
3433 ///    bit of each element in the mask vector is the mask bit for that element.
3434 ///    If a mask bit is zero, the corresponding element of \a a is copied to
3435 ///    the result; otherwise the value is loaded from memory. Only the first
3436 ///    two elements are used.
3437 /// \param s
3438 ///    A literal constant scale factor for the indexes in \a i. Must be
3439 ///    1, 2, 4, or 8.
3440 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
3441 #define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
3442   ((__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
3443                                      (int const *)(m), \
3444                                      (__v2di)(__m128i)(i), \
3445                                      (__v4si)(__m128i)(mask), (s)))
3446
3447 /// Conditionally gathers four 32-bit integer values, either from the
3448 ///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
3449 ///    indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
3450 ///    of [4 x i32] in \a mask determines the source for each element.
3451 ///
3452 /// \code{.operation}
3453 /// FOR element := 0 to 3
3454 ///   j := element*32
3455 ///   k := element*64
3456 ///   IF mask[j+31] == 0
3457 ///     result[j+31:j] := a[j+31:j]
3458 ///   ELSE
3459 ///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
3460 ///   FI
3461 /// ENDFOR
3462 /// \endcode
3463 ///
3464 /// \headerfile <immintrin.h>
3465 ///
3466 /// \code
3467 /// __m128i _mm256_mask_i64gather_epi32(__m128i a, const int *m, __m256i i,
3468 ///                                     __m128i mask, const int s);
3469 /// \endcode
3470 ///
3471 /// This intrinsic corresponds to the \c VPGATHERQD instruction.
3472 ///
3473 /// \param a
3474 ///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
3475 ///    zero.
3476 /// \param m
3477 ///    A pointer to the memory used for loading values.
3478 /// \param i
3479 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
3480 /// \param mask
3481 ///    A 128-bit vector of [4 x i32] containing the mask. The most significant
3482 ///    bit of each element in the mask vector is the mask bit for that element.
3483 ///    If a mask bit is zero, the corresponding element of \a a is copied to
3484 ///    the result; otherwise the value is loaded from memory.
3485 /// \param s
3486 ///    A literal constant scale factor for the indexes in \a i. Must be
3487 ///    1, 2, 4, or 8.
3488 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
3489 #define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
3490   ((__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
3491                                         (int const *)(m), \
3492                                         (__v4di)(__m256i)(i), \
3493                                         (__v4si)(__m128i)(mask), (s)))
3494
3495 /// Conditionally gathers two 64-bit integer values, either from the
3496 ///    128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
3497 ///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
3498 ///    of [2 x i64] in \a mask determines the source for each element.
3499 ///
3500 /// \code{.operation}
3501 /// FOR element := 0 to 1
3502 ///   j := element*64
3503 ///   k := element*32
3504 ///   IF mask[j+63] == 0
3505 ///     result[j+63:j] := a[j+63:j]
3506 ///   ELSE
3507 ///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
3508 ///   FI
3509 /// ENDFOR
3510 /// \endcode
3511 ///
3512 /// \headerfile <immintrin.h>
3513 ///
3514 /// \code
3515 /// __m128i _mm_mask_i32gather_epi64(__m128i a, const long long *m, __m128i i,
3516 ///                                  __m128i mask, const int s);
3517 /// \endcode
3518 ///
3519 /// This intrinsic corresponds to the \c VPGATHERDQ instruction.
3520 ///
3521 /// \param a
3522 ///    A 128-bit vector of [2 x i64] used as the source when a mask bit is
3523 ///    zero.
3524 /// \param m
3525 ///    A pointer to the memory used for loading values.
3526 /// \param i
3527 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
3528 ///    the first two elements are used.
3529 /// \param mask
3530 ///    A 128-bit vector of [2 x i64] containing the mask. The most significant
3531 ///    bit of each element in the mask vector is the mask bit for that element.
3532 ///    If a mask bit is zero, the corresponding element of \a a is copied to
3533 ///    the result; otherwise the value is loaded from memory.
3534 /// \param s
3535 ///    A literal constant scale factor for the indexes in \a i. Must be
3536 ///    1, 2, 4, or 8.
3537 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
3538 #define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
3539   ((__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
3540                                      (long long const *)(m), \
3541                                      (__v4si)(__m128i)(i), \
3542                                      (__v2di)(__m128i)(mask), (s)))
3543
3544 /// Conditionally gathers four 64-bit integer values, either from the
3545 ///    256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
3546 ///    indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
3547 ///    of [4 x i64] in \a mask determines the source for each element.
3548 ///
3549 /// \code{.operation}
3550 /// FOR element := 0 to 3
3551 ///   j := element*64
3552 ///   k := element*32
3553 ///   IF mask[j+63] == 0
3554 ///     result[j+63:j] := a[j+63:j]
3555 ///   ELSE
3556 ///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
3557 ///   FI
3558 /// ENDFOR
3559 /// \endcode
3560 ///
3561 /// \headerfile <immintrin.h>
3562 ///
3563 /// \code
3564 /// __m256i _mm256_mask_i32gather_epi64(__m256i a, const long long *m,
3565 ///                                     __m128i i, __m256i mask, const int s);
3566 /// \endcode
3567 ///
3568 /// This intrinsic corresponds to the \c VPGATHERDQ instruction.
3569 ///
3570 /// \param a
3571 ///    A 256-bit vector of [4 x i64] used as the source when a mask bit is
3572 ///    zero.
3573 /// \param m
3574 ///    A pointer to the memory used for loading values.
3575 /// \param i
3576 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
3577 /// \param mask
3578 ///    A 256-bit vector of [4 x i64] containing the mask. The most significant
3579 ///    bit of each element in the mask vector is the mask bit for that element.
3580 ///    If a mask bit is zero, the corresponding element of \a a is copied to
3581 ///    the result; otherwise the value is loaded from memory.
3582 /// \param s
3583 ///    A literal constant scale factor for the indexes in \a i. Must be
3584 ///    1, 2, 4, or 8.
3585 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
3586 #define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
3587   ((__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
3588                                         (long long const *)(m), \
3589                                         (__v4si)(__m128i)(i), \
3590                                         (__v4di)(__m256i)(mask), (s)))
3591
3592 /// Conditionally gathers two 64-bit integer values, either from the
3593 ///    128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
3594 ///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
3595 ///    of [2 x i64] in \a mask determines the source for each element.
3596 ///
3597 /// \code{.operation}
3598 /// FOR element := 0 to 1
3599 ///   j := element*64
3600 ///   k := element*64
3601 ///   IF mask[j+63] == 0
3602 ///     result[j+63:j] := a[j+63:j]
3603 ///   ELSE
3604 ///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
3605 ///   FI
3606 /// ENDFOR
3607 /// \endcode
3608 ///
3609 /// \headerfile <immintrin.h>
3610 ///
3611 /// \code
3612 /// __m128i _mm_mask_i64gather_epi64(__m128i a, const long long *m, __m128i i,
3613 ///                                  __m128i mask, const int s);
3614 /// \endcode
3615 ///
3616 /// This intrinsic corresponds to the \c VPGATHERQQ instruction.
3617 ///
3618 /// \param a
3619 ///    A 128-bit vector of [2 x i64] used as the source when a mask bit is
3620 ///    zero.
3621 /// \param m
3622 ///    A pointer to the memory used for loading values.
3623 /// \param i
3624 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
3625 /// \param mask
3626 ///    A 128-bit vector of [2 x i64] containing the mask. The most significant
3627 ///    bit of each element in the mask vector is the mask bit for that element.
3628 ///    If a mask bit is zero, the corresponding element of \a a is copied to
3629 ///    the result; otherwise the value is loaded from memory.
3630 /// \param s
3631 ///    A literal constant scale factor for the indexes in \a i. Must be
3632 ///    1, 2, 4, or 8.
3633 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
3634 #define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
3635   ((__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
3636                                      (long long const *)(m), \
3637                                      (__v2di)(__m128i)(i), \
3638                                      (__v2di)(__m128i)(mask), (s)))
3639
3640 /// Conditionally gathers four 64-bit integer values, either from the
3641 ///    256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
3642 ///    indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
3643 ///    of [4 x i64] in \a mask determines the source for each element.
3644 ///
3645 /// \code{.operation}
3646 /// FOR element := 0 to 3
3647 ///   j := element*64
3648 ///   k := element*64
3649 ///   IF mask[j+63] == 0
3650 ///     result[j+63:j] := a[j+63:j]
3651 ///   ELSE
3652 ///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
3653 ///   FI
3654 /// ENDFOR
3655 /// \endcode
3656 ///
3657 /// \headerfile <immintrin.h>
3658 ///
3659 /// \code
3660 /// __m256i _mm256_mask_i64gather_epi64(__m256i a, const long long *m,
3661 ///                                     __m256i i, __m256i mask, const int s);
3662 /// \endcode
3663 ///
3664 /// This intrinsic corresponds to the \c VPGATHERQQ instruction.
3665 ///
3666 /// \param a
3667 ///    A 256-bit vector of [4 x i64] used as the source when a mask bit is
3668 ///    zero.
3669 /// \param m
3670 ///    A pointer to the memory used for loading values.
3671 /// \param i
3672 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
3673 /// \param mask
3674 ///    A 256-bit vector of [4 x i64] containing the mask. The most significant
3675 ///    bit of each element in the mask vector is the mask bit for that element.
3676 ///    If a mask bit is zero, the corresponding element of \a a is copied to
3677 ///    the result; otherwise the value is loaded from memory.
3678 /// \param s
3679 ///    A literal constant scale factor for the indexes in \a i. Must be
3680 ///    1, 2, 4, or 8.
3681 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
3682 #define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
3683   ((__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
3684                                         (long long const *)(m), \
3685                                         (__v4di)(__m256i)(i), \
3686                                         (__v4di)(__m256i)(mask), (s)))
3687
3688 /// Unconditionally gathers two 64-bit floating-point values from memory \a m
3689 ///    using scaled indexes from the 128-bit vector of [4 x i32] in \a i.
3690 ///
3691 /// \code{.operation}
3692 /// FOR element := 0 to 1
3693 ///   j := element*64
3694 ///   k := element*32
3695 ///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
3696 /// ENDFOR
3697 /// \endcode
3698 ///
3699 /// \headerfile <immintrin.h>
3700 ///
3701 /// \code
3702 /// __m128d _mm_i32gather_pd(const double *m, __m128i i, const int s);
3703 /// \endcode
3704 ///
3705 /// This intrinsic corresponds to the \c VGATHERDPD instruction.
3706 ///
3707 /// \param m
3708 ///    A pointer to the memory used for loading values.
3709 /// \param i
3710 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
3711 ///    the first two elements are used.
3712 /// \param s
3713 ///    A literal constant scale factor for the indexes in \a i. Must be
3714 ///    1, 2, 4, or 8.
3715 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
3716 #define _mm_i32gather_pd(m, i, s) \
3717   ((__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
3718                                       (double const *)(m), \
3719                                       (__v4si)(__m128i)(i), \
3720                                       (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
3721                                                            _mm_setzero_pd()), \
3722                                       (s)))
3723
3724 /// Unconditionally gathers four 64-bit floating-point values from memory \a m
3725 ///    using scaled indexes from the 128-bit vector of [4 x i32] in \a i.
3726 ///
3727 /// \code{.operation}
3728 /// FOR element := 0 to 3
3729 ///   j := element*64
3730 ///   k := element*32
3731 ///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
3732 /// ENDFOR
3733 /// \endcode
3734 ///
3735 /// \headerfile <immintrin.h>
3736 ///
3737 /// \code
3738 /// __m256d _mm256_i32gather_pd(const double *m, __m128i i, const int s);
3739 /// \endcode
3740 ///
3741 /// This intrinsic corresponds to the \c VGATHERDPD instruction.
3742 ///
3743 /// \param m
3744 ///    A pointer to the memory used for loading values.
3745 /// \param i
3746 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
3747 /// \param s
3748 ///    A literal constant scale factor for the indexes in \a i. Must be
3749 ///    1, 2, 4, or 8.
3750 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
3751 #define _mm256_i32gather_pd(m, i, s) \
3752   ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
3753                                          (double const *)(m), \
3754                                          (__v4si)(__m128i)(i), \
3755                                          (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
3756                                                                _mm256_setzero_pd(), \
3757                                                                _CMP_EQ_OQ), \
3758                                          (s)))
3759
3760 /// Unconditionally gathers two 64-bit floating-point values from memory \a m
3761 ///    using scaled indexes from the 128-bit vector of [2 x i64] in \a i.
3762 ///
3763 /// \code{.operation}
3764 /// FOR element := 0 to 1
3765 ///   j := element*64
3766 ///   k := element*64
3767 ///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
3768 /// ENDFOR
3769 /// \endcode
3770 ///
3771 /// \headerfile <immintrin.h>
3772 ///
3773 /// \code
3774 /// __m128d _mm_i64gather_pd(const double *m, __m128i i, const int s);
3775 /// \endcode
3776 ///
3777 /// This intrinsic corresponds to the \c VGATHERQPD instruction.
3778 ///
3779 /// \param m
3780 ///    A pointer to the memory used for loading values.
3781 /// \param i
3782 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
3783 /// \param s
3784 ///    A literal constant scale factor for the indexes in \a i. Must be
3785 ///    1, 2, 4, or 8.
3786 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
3787 #define _mm_i64gather_pd(m, i, s) \
3788   ((__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
3789                                       (double const *)(m), \
3790                                       (__v2di)(__m128i)(i), \
3791                                       (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
3792                                                            _mm_setzero_pd()), \
3793                                       (s)))
3794
3795 /// Unconditionally gathers four 64-bit floating-point values from memory \a m
3796 ///    using scaled indexes from the 256-bit vector of [4 x i64] in \a i.
3797 ///
3798 /// \code{.operation}
3799 /// FOR element := 0 to 3
3800 ///   j := element*64
3801 ///   k := element*64
3802 ///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
3803 /// ENDFOR
3804 /// \endcode
3805 ///
3806 /// \headerfile <immintrin.h>
3807 ///
3808 /// \code
3809 /// __m256d _mm256_i64gather_pd(const double *m, __m256i i, const int s);
3810 /// \endcode
3811 ///
3812 /// This intrinsic corresponds to the \c VGATHERQPD instruction.
3813 ///
3814 /// \param m
3815 ///    A pointer to the memory used for loading values.
3816 /// \param i
3817 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
3818 /// \param s
3819 ///    A literal constant scale factor for the indexes in \a i. Must be
3820 ///    1, 2, 4, or 8.
3821 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
3822 #define _mm256_i64gather_pd(m, i, s) \
3823   ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
3824                                          (double const *)(m), \
3825                                          (__v4di)(__m256i)(i), \
3826                                          (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
3827                                                                _mm256_setzero_pd(), \
3828                                                                _CMP_EQ_OQ), \
3829                                          (s)))
3830
3831 /// Unconditionally gathers four 32-bit floating-point values from memory \a m
3832 ///    using scaled indexes from the 128-bit vector of [4 x i32] in \a i.
3833 ///
3834 /// \code{.operation}
3835 /// FOR element := 0 to 3
3836 ///   j := element*32
3837 ///   k := element*32
3838 ///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
3839 /// ENDFOR
3840 /// \endcode
3841 ///
3842 /// \headerfile <immintrin.h>
3843 ///
3844 /// \code
3845 /// __m128 _mm_i32gather_ps(const float *m, __m128i i, const int s);
3846 /// \endcode
3847 ///
3848 /// This intrinsic corresponds to the \c VGATHERDPS instruction.
3849 ///
3850 /// \param m
3851 ///    A pointer to the memory used for loading values.
3852 /// \param i
3853 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
3854 /// \param s
3855 ///    A literal constant scale factor for the indexes in \a i. Must be
3856 ///    1, 2, 4, or 8.
3857 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
3858 #define _mm_i32gather_ps(m, i, s) \
3859   ((__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
3860                                      (float const *)(m), \
3861                                      (__v4si)(__m128i)(i), \
3862                                      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
3863                                                           _mm_setzero_ps()), \
3864                                      (s)))
3865
3866 /// Gathers eight 32-bit floating-point values from memory \a m using scaled
3867 ///    indexes from the 256-bit vector of [8 x i32] in \a i.
3868 ///
3869 /// \code{.operation}
3870 /// FOR element := 0 to 7
3871 ///   j := element*32
3872 ///   k := element*32
3873 ///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
3874 /// ENDFOR
3875 /// \endcode
3876 ///
3877 /// \headerfile <immintrin.h>
3878 ///
3879 /// \code
3880 /// __m256 _mm256_i32gather_ps(const float *m, __m256i i, const int s);
3881 /// \endcode
3882 ///
3883 /// This intrinsic corresponds to the \c VGATHERDPS instruction.
3884 ///
3885 /// \param m
3886 ///    A pointer to the memory used for loading values.
3887 /// \param i
3888 ///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
3889 /// \param s
3890 ///    A literal constant scale factor for the indexes in \a i. Must be
3891 ///    1, 2, 4, or 8.
3892 /// \returns A 256-bit vector of [8 x float] containing the gathered values.
3893 #define _mm256_i32gather_ps(m, i, s) \
3894   ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
3895                                         (float const *)(m), \
3896                                         (__v8si)(__m256i)(i), \
3897                                         (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \
3898                                                               _mm256_setzero_ps(), \
3899                                                               _CMP_EQ_OQ), /* 0 == 0 in every lane: all-ones mask */ \
3900                                         (s)))
3901
3902 /// Gathers two 32-bit floating-point values from memory \a m using scaled
3903 ///    indexes from the 128-bit vector of [2 x i64] in \a i. The upper two
3904 ///    elements of the result are zeroed.
3905 ///
3906 /// \code{.operation}
3907 /// FOR element := 0 to 1
3908 ///   j := element*32
3909 ///   k := element*64
3910 ///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
3911 /// ENDFOR
3912 /// result[127:64] := 0
3913 /// \endcode
3914 ///
3915 /// \headerfile <immintrin.h>
3916 ///
3917 /// \code
3918 /// __m128 _mm_i64gather_ps(const float *m, __m128i i, const int s);
3919 /// \endcode
3920 ///
3921 /// This intrinsic corresponds to the \c VGATHERQPS instruction.
3922 ///
3923 /// \param m
3924 ///    A pointer to the memory used for loading values.
3925 /// \param i
3926 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
3927 /// \param s
3928 ///    A literal constant scale factor for the indexes in \a i. Must be
3929 ///    1, 2, 4, or 8.
3930 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
3931 #define _mm_i64gather_ps(m, i, s) \
3932   ((__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
3933                                      (float const *)(m), \
3934                                      (__v2di)(__m128i)(i), \
3935                                      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
3936                                                           _mm_setzero_ps()), /* 0 == 0 in every lane: all-ones mask */ \
3937                                      (s)))
3938
3939 /// Gathers four 32-bit floating-point values from memory \a m using scaled
3940 ///    indexes from the 256-bit vector of [4 x i64] in \a i.
3941 ///
3942 /// \code{.operation}
3943 /// FOR element := 0 to 3
3944 ///   j := element*32
3945 ///   k := element*64
3946 ///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
3947 /// ENDFOR
3948 /// \endcode
3949 ///
3950 /// \headerfile <immintrin.h>
3951 ///
3952 /// \code
3953 /// __m128 _mm256_i64gather_ps(const float *m, __m256i i, const int s);
3954 /// \endcode
3955 ///
3956 /// This intrinsic corresponds to the \c VGATHERQPS instruction.
3957 ///
3958 /// \param m
3959 ///    A pointer to the memory used for loading values.
3960 /// \param i
3961 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
3962 /// \param s
3963 ///    A literal constant scale factor for the indexes in \a i. Must be
3964 ///    1, 2, 4, or 8.
3965 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
3966 #define _mm256_i64gather_ps(m, i, s) \
3967   ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
3968                                         (float const *)(m), \
3969                                         (__v4di)(__m256i)(i), \
3970                                         (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
3971                                                              _mm_setzero_ps()), /* 0 == 0 in every lane: all-ones mask */ \
3972                                         (s)))
3973
3974 /// Gathers four 32-bit integer values from memory \a m using scaled indexes
3975 ///    from the 128-bit vector of [4 x i32] in \a i.
3976 ///
3977 /// \code{.operation}
3978 /// FOR element := 0 to 3
3979 ///   j := element*32
3980 ///   k := element*32
3981 ///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
3982 /// ENDFOR
3983 /// \endcode
3984 ///
3985 /// \headerfile <immintrin.h>
3986 ///
3987 /// \code
3988 /// __m128i _mm_i32gather_epi32(const int *m, __m128i i, const int s);
3989 /// \endcode
3990 ///
3991 /// This intrinsic corresponds to the \c VPGATHERDD instruction.
3992 ///
3993 /// \param m
3994 ///    A pointer to the memory used for loading values.
3995 /// \param i
3996 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
3997 /// \param s
3998 ///    A literal constant scale factor for the indexes in \a i. Must be
3999 ///    1, 2, 4, or 8.
4000 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
4001 #define _mm_i32gather_epi32(m, i, s) \
4002   ((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
4003                                      (int const *)(m), (__v4si)(__m128i)(i), \
4004                                      (__v4si)_mm_set1_epi32(-1), /* all-ones mask: nothing masked off */ (s)))
4005
4006 /// Gathers eight 32-bit integer values from memory \a m using scaled indexes
4007 ///    from the 256-bit vector of [8 x i32] in \a i.
4008 ///
4009 /// \code{.operation}
4010 /// FOR element := 0 to 7
4011 ///   j := element*32
4012 ///   k := element*32
4013 ///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4014 /// ENDFOR
4015 /// \endcode
4016 ///
4017 /// \headerfile <immintrin.h>
4018 ///
4019 /// \code
4020 /// __m256i _mm256_i32gather_epi32(const int *m, __m256i i, const int s);
4021 /// \endcode
4022 ///
4023 /// This intrinsic corresponds to the \c VPGATHERDD instruction.
4024 ///
4025 /// \param m
4026 ///    A pointer to the memory used for loading values.
4027 /// \param i
4028 ///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4029 /// \param s
4030 ///    A literal constant scale factor for the indexes in \a i. Must be
4031 ///    1, 2, 4, or 8.
4032 /// \returns A 256-bit vector of [8 x i32] containing the gathered values.
4033 #define _mm256_i32gather_epi32(m, i, s) \
4034   ((__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
4035                                         (int const *)(m), (__v8si)(__m256i)(i), \
4036                                         (__v8si)_mm256_set1_epi32(-1), /* all-ones mask: nothing masked off */ (s)))
4037
4038 /// Gathers two 32-bit integer values from memory \a m using scaled indexes
4039 ///    from the 128-bit vector of [2 x i64] in \a i. The upper two elements
4040 ///    of the result are zeroed.
4041 ///
4042 /// \code{.operation}
4043 /// FOR element := 0 to 1
4044 ///   j := element*32
4045 ///   k := element*64
4046 ///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4047 /// ENDFOR
4048 /// result[127:64] := 0
4049 /// \endcode
4050 ///
4051 /// \headerfile <immintrin.h>
4052 ///
4053 /// \code
4054 /// __m128i _mm_i64gather_epi32(const int *m, __m128i i, const int s);
4055 /// \endcode
4056 ///
4057 /// This intrinsic corresponds to the \c VPGATHERQD instruction.
4058 ///
4059 /// \param m
4060 ///    A pointer to the memory used for loading values.
4061 /// \param i
4062 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4063 /// \param s
4064 ///    A literal constant scale factor for the indexes in \a i. Must be
4065 ///    1, 2, 4, or 8.
4066 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
4067 #define _mm_i64gather_epi32(m, i, s) \
4068   ((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
4069                                      (int const *)(m), (__v2di)(__m128i)(i), \
4070                                      (__v4si)_mm_set1_epi32(-1), /* all-ones mask: nothing masked off */ (s)))
4071
4072 /// Gathers four 32-bit integer values from memory \a m using scaled indexes
4073 ///    from the 256-bit vector of [4 x i64] in \a i.
4074 ///
4075 /// \code{.operation}
4076 /// FOR element := 0 to 3
4077 ///   j := element*32
4078 ///   k := element*64
4079 ///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4080 /// ENDFOR
4081 /// \endcode
4082 ///
4083 /// \headerfile <immintrin.h>
4084 ///
4085 /// \code
4086 /// __m128i _mm256_i64gather_epi32(const int *m, __m256i i, const int s);
4087 /// \endcode
4088 ///
4089 /// This intrinsic corresponds to the \c VPGATHERQD instruction.
4090 ///
4091 /// \param m
4092 ///    A pointer to the memory used for loading values.
4093 /// \param i
4094 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4095 /// \param s
4096 ///    A literal constant scale factor for the indexes in \a i. Must be
4097 ///    1, 2, 4, or 8.
4098 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
4099 #define _mm256_i64gather_epi32(m, i, s) \
4100   ((__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
4101                                         (int const *)(m), (__v4di)(__m256i)(i), \
4102                                         (__v4si)_mm_set1_epi32(-1), /* all-ones mask: nothing masked off */ (s)))
4103
4104 /// Gathers two 64-bit integer values from memory \a m using scaled indexes
4105 ///    from the 128-bit vector of [4 x i32] in \a i.
4106 ///
4107 /// \code{.operation}
4108 /// FOR element := 0 to 1
4109 ///   j := element*64
4110 ///   k := element*32
4111 ///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4112 /// ENDFOR
4113 /// \endcode
4114 ///
4115 /// \headerfile <immintrin.h>
4116 ///
4117 /// \code
4118 /// __m128i _mm_i32gather_epi64(const long long *m, __m128i i, const int s);
4119 /// \endcode
4120 ///
4121 /// This intrinsic corresponds to the \c VPGATHERDQ instruction.
4122 ///
4123 /// \param m
4124 ///    A pointer to the memory used for loading values.
4125 /// \param i
4126 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
4127 ///    the first two elements are used.
4128 /// \param s
4129 ///    A literal constant scale factor for the indexes in \a i. Must be
4130 ///    1, 2, 4, or 8.
4131 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
4132 #define _mm_i32gather_epi64(m, i, s) \
4133   ((__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
4134                                      (long long const *)(m), \
4135                                      (__v4si)(__m128i)(i), \
4136                                      (__v2di)_mm_set1_epi64x(-1), /* all-ones mask: nothing masked off */ (s)))
4137
4138 /// Gathers four 64-bit integer values from memory \a m using scaled indexes
4139 ///    from the 128-bit vector of [4 x i32] in \a i.
4140 ///
4141 /// \code{.operation}
4142 /// FOR element := 0 to 3
4143 ///   j := element*64
4144 ///   k := element*32
4145 ///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4146 /// ENDFOR
4147 /// \endcode
4148 ///
4149 /// \headerfile <immintrin.h>
4150 ///
4151 /// \code
4152 /// __m256i _mm256_i32gather_epi64(const long long *m, __m128i i, const int s);
4153 /// \endcode
4154 ///
4155 /// This intrinsic corresponds to the \c VPGATHERDQ instruction.
4156 ///
4157 /// \param m
4158 ///    A pointer to the memory used for loading values.
4159 /// \param i
4160 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4161 /// \param s
4162 ///    A literal constant scale factor for the indexes in \a i. Must be
4163 ///    1, 2, 4, or 8.
4164 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
4165 #define _mm256_i32gather_epi64(m, i, s) \
4166   ((__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
4167                                         (long long const *)(m), \
4168                                         (__v4si)(__m128i)(i), \
4169                                         (__v4di)_mm256_set1_epi64x(-1), /* all-ones mask: nothing masked off */ (s)))
4170
4171 /// Gathers two 64-bit integer values from memory \a m using scaled indexes
4172 ///    from the 128-bit vector of [2 x i64] in \a i.
4173 ///
4174 /// \code{.operation}
4175 /// FOR element := 0 to 1
4176 ///   j := element*64
4177 ///   k := element*64
4178 ///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4179 /// ENDFOR
4180 /// \endcode
4181 ///
4182 /// \headerfile <immintrin.h>
4183 ///
4184 /// \code
4185 /// __m128i _mm_i64gather_epi64(const long long *m, __m128i i, const int s);
4186 /// \endcode
4187 ///
4188 /// This intrinsic corresponds to the \c VPGATHERQQ instruction.
4189 ///
4190 /// \param m
4191 ///    A pointer to the memory used for loading values.
4192 /// \param i
4193 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4194 /// \param s
4195 ///    A literal constant scale factor for the indexes in \a i. Must be
4196 ///    1, 2, 4, or 8.
4197 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
4198 #define _mm_i64gather_epi64(m, i, s) \
4199   ((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
4200                                      (long long const *)(m), \
4201                                      (__v2di)(__m128i)(i), \
4202                                      (__v2di)_mm_set1_epi64x(-1), /* all-ones mask: nothing masked off */ (s)))
4203
4204 /// Gathers four 64-bit integer values from memory \a m using scaled indexes
4205 ///    from the 256-bit vector of [4 x i64] in \a i.
4206 ///
4207 /// \code{.operation}
4208 /// FOR element := 0 to 3
4209 ///   j := element*64
4210 ///   k := element*64
4211 ///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4212 /// ENDFOR
4213 /// \endcode
4214 ///
4215 /// \headerfile <immintrin.h>
4216 ///
4217 /// \code
4218 /// __m256i _mm256_i64gather_epi64(const long long *m, __m256i i, const int s);
4219 /// \endcode
4220 ///
4221 /// This intrinsic corresponds to the \c VPGATHERQQ instruction.
4222 ///
4223 /// \param m
4224 ///    A pointer to the memory used for loading values.
4225 /// \param i
4226 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4227 /// \param s
4228 ///    A literal constant scale factor for the indexes in \a i. Must be
4229 ///    1, 2, 4, or 8.
4230 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
4231 #define _mm256_i64gather_epi64(m, i, s) \
4232   ((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
4233                                         (long long const *)(m), \
4234                                         (__v4di)(__m256i)(i), \
4235                                         (__v4di)_mm256_set1_epi64x(-1), /* all-ones mask: nothing masked off */ (s)))
4236
4237 #undef __DEFAULT_FN_ATTRS256 /* attribute helper is local to this header */
4238 #undef __DEFAULT_FN_ATTRS128 /* attribute helper is local to this header */
4239
4240 #endif /* __AVX2INTRIN_H */