clang/lib/Headers/avx2intrin.h

   1 /*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
   2  *
   3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4  * See https://llvm.org/LICENSE.txt for license information.
   5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6  *
   7  *===-----------------------------------------------------------------------===
   8  */
   9
  10 #ifndef __IMMINTRIN_H
  11 #error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
  12 #endif
  13
  14 #ifndef __AVX2INTRIN_H
  15 #define __AVX2INTRIN_H
  16
  17 /* Default attributes (always inline, AVX2); suffix is __min_vector_width__. */
  18 #define __DEFAULT_FN_ATTRS256                                                  \
  19   __attribute__((__always_inline__, __nodebug__,                               \
  20                  __target__("avx2,no-evex512"), __min_vector_width__(256)))
  21 #define __DEFAULT_FN_ATTRS128                                                  \
  22   __attribute__((__always_inline__, __nodebug__,                               \
  23                  __target__("avx2,no-evex512"), __min_vector_width__(128)))
  24
  25 /* AVX2 (256-bit) Multiple Packed Sums of Absolute Difference.  */
  26 /// Computes sixteen sum of absolute difference (SAD) operations on sets of
  27 ///    four unsigned 8-bit integers from the 256-bit integer vectors \a X and
  28 ///    \a Y.
  29 ///
  30 ///    Eight SAD results are computed using the lower half of the input
  31 ///    vectors, and another eight using the upper half. These 16-bit values
  32 ///    are returned in the lower and upper halves of the 256-bit result,
  33 ///    respectively.
  34 ///
  35 ///    A single SAD operation selects four bytes from \a X and four bytes from
  36 ///    \a Y as input. It computes the differences between each \a X byte and
  37 ///    the corresponding \a Y byte, takes the absolute value of each
  38 ///    difference, and sums these four values to form one 16-bit result. The
  39 ///    intrinsic computes 16 of these results with different sets of input
  40 ///    bytes.
  41 ///
  42 ///    For each set of eight results, the SAD operations use the same four
  43 ///    bytes from \a Y; the starting bit position for these four bytes is
  44 ///    specified by \a M[1:0] times 32. The eight operations use successive
  45 ///    sets of four bytes from \a X; the starting bit position for the first
  46 ///    set of four bytes is specified by \a M[2] times 32. These bit positions
  47 ///    are all relative to the 128-bit lane for each set of eight operations.
  48 ///
  49 /// \code{.operation}
  50 /// r := 0
  51 /// FOR i := 0 TO 1
  52 ///   j := i*3
  53 ///   Ybase := M[j+1:j]*32 + i*128
  54 ///   Xbase := M[j+2]*32 + i*128
  55 ///   FOR k := 0 TO 7
  56 ///     temp0 := ABS(X[Xbase+7:Xbase] - Y[Ybase+7:Ybase])
  57 ///     temp1 := ABS(X[Xbase+15:Xbase+8] - Y[Ybase+15:Ybase+8])
  58 ///     temp2 := ABS(X[Xbase+23:Xbase+16] - Y[Ybase+23:Ybase+16])
  59 ///     temp3 := ABS(X[Xbase+31:Xbase+24] - Y[Ybase+31:Ybase+24])
  60 ///     result[r+15:r] := temp0 + temp1 + temp2 + temp3
  61 ///     Xbase := Xbase + 8
  62 ///     r := r + 16
  63 ///   ENDFOR
  64 /// ENDFOR
  65 /// \endcode
  66 ///
  67 /// \headerfile <immintrin.h>
  68 ///
  69 /// \code
  70 /// __m256i _mm256_mpsadbw_epu8(__m256i X, __m256i Y, const int M);
  71 /// \endcode
  72 ///
  73 /// This intrinsic corresponds to the \c VMPSADBW instruction.
  74 ///
  75 /// \param X
  76 ///    A 256-bit integer vector containing one of the inputs.
  77 /// \param Y
  78 ///    A 256-bit integer vector containing one of the inputs.
  79 /// \param M
  80 ///    An unsigned immediate value specifying the starting positions of the
  81 ///    bytes to operate on.
  82 /// \returns A 256-bit vector of [16 x i16] containing the result.
  83 #define _mm256_mpsadbw_epu8(X, Y, M) \
  84   ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \
  85                                       (__v32qi)(__m256i)(Y), (int)(M)))
  86
  87 /// Computes the absolute value of each signed byte in the 256-bit integer
  88 ///    vector \a __a and returns each value in the corresponding byte of
  89 ///    the result.
  90 ///
  91 /// \headerfile <immintrin.h>
  92 ///
  93 /// This intrinsic corresponds to the \c VPABSB instruction.
  94 ///
  95 /// \param __a
  96 ///    A 256-bit integer vector.
  97 /// \returns A 256-bit integer vector containing the result.
  98 static __inline__ __m256i __DEFAULT_FN_ATTRS256
  99 _mm256_abs_epi8(__m256i __a) {
 100   __v32qs __bytes = (__v32qs)__a; /* reinterpret as signed 8-bit lanes */
 101   return (__m256i)__builtin_elementwise_abs(__bytes);
 102 }
 103
 104 /// Computes the absolute value of each signed 16-bit element in the 256-bit
 105 ///    vector of [16 x i16] in \a __a and returns each value in the
 106 ///    corresponding element of the result.
 107 ///
 108 /// \headerfile <immintrin.h>
 109 ///
 110 /// This intrinsic corresponds to the \c VPABSW instruction.
 111 ///
 112 /// \param __a
 113 ///    A 256-bit vector of [16 x i16].
 114 /// \returns A 256-bit vector of [16 x i16] containing the result.
 115 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 116 _mm256_abs_epi16(__m256i __a) {
 117   __v16hi __words = (__v16hi)__a; /* reinterpret as [16 x i16] */
 118   return (__m256i)__builtin_elementwise_abs(__words);
 119 }
 120
 121 /// Computes the absolute value of each signed 32-bit element in the 256-bit
 122 ///    vector of [8 x i32] in \a __a and returns each value in the
 123 ///    corresponding element of the result.
 124 ///
 125 /// \headerfile <immintrin.h>
 126 ///
 127 /// This intrinsic corresponds to the \c VPABSD instruction.
 128 ///
 129 /// \param __a
 130 ///    A 256-bit vector of [8 x i32].
 131 /// \returns A 256-bit vector of [8 x i32] containing the result.
 132 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 133 _mm256_abs_epi32(__m256i __a) {
 134   __v8si __dwords = (__v8si)__a; /* reinterpret as [8 x i32] */
 135   return (__m256i)__builtin_elementwise_abs(__dwords);
 136 }
 137
 138 /// Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit
 139 ///    integers using signed saturation, and returns the 256-bit result.
 140 ///
 141 /// \code{.operation}
 142 /// FOR i := 0 TO 7
 143 ///   j := i*16
 144 ///   k := i*8
 145 ///   result[7+k:k] := SATURATE8(__a[15+j:j])
 146 ///   result[71+k:64+k] := SATURATE8(__b[15+j:j])
 147 ///   result[135+k:128+k] := SATURATE8(__a[143+j:128+j])
 148 ///   result[199+k:192+k] := SATURATE8(__b[143+j:128+j])
 149 /// ENDFOR
 150 /// \endcode
 151 ///
 152 /// \headerfile <immintrin.h>
 153 ///
 154 /// This intrinsic corresponds to the \c VPACKSSWB instruction.
 155 ///
 156 /// \param __a
 157 ///    A 256-bit vector of [16 x i16] used to generate result[63:0] and
 158 ///    result[191:128].
 159 /// \param __b
 160 ///    A 256-bit vector of [16 x i16] used to generate result[127:64] and
 161 ///    result[255:192].
 162 /// \returns A 256-bit integer vector containing the result.
 163 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 164 _mm256_packs_epi16(__m256i __a, __m256i __b) {
 165   __v16hi __wa = (__v16hi)__a, __wb = (__v16hi)__b; /* [16 x i16] views */
 166   return (__m256i)__builtin_ia32_packsswb256(__wa, __wb);
 167 }
 168
 169 /// Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit
 170 ///    integers using signed saturation, and returns the resulting 256-bit
 171 ///    vector of [16 x i16].
 172 ///
 173 /// \code{.operation}
 174 /// FOR i := 0 TO 3
 175 ///   j := i*32
 176 ///   k := i*16
 177 ///   result[15+k:k] := SATURATE16(__a[31+j:j])
 178 ///   result[79+k:64+k] := SATURATE16(__b[31+j:j])
 179 ///   result[143+k:128+k] := SATURATE16(__a[159+j:128+j])
 180 ///   result[207+k:192+k] := SATURATE16(__b[159+j:128+j])
 181 /// ENDFOR
 182 /// \endcode
 183 ///
 184 /// \headerfile <immintrin.h>
 185 ///
 186 /// This intrinsic corresponds to the \c VPACKSSDW instruction.
 187 ///
 188 /// \param __a
 189 ///    A 256-bit vector of [8 x i32] used to generate result[63:0] and
 190 ///    result[191:128].
 191 /// \param __b
 192 ///    A 256-bit vector of [8 x i32] used to generate result[127:64] and
 193 ///    result[255:192].
 194 /// \returns A 256-bit vector of [16 x i16] containing the result.
 195 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 196 _mm256_packs_epi32(__m256i __a, __m256i __b) {
 197   __v8si __da = (__v8si)__a, __db = (__v8si)__b; /* [8 x i32] views */
 198   return (__m256i)__builtin_ia32_packssdw256(__da, __db);
 199 }
 200
 201 /// Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers
 202 ///    using unsigned saturation, and returns the 256-bit result.
 203 ///
 204 /// \code{.operation}
 205 /// FOR i := 0 TO 7
 206 ///   j := i*16
 207 ///   k := i*8
 208 ///   result[7+k:k] := SATURATE8U(__a[15+j:j])
 209 ///   result[71+k:64+k] := SATURATE8U(__b[15+j:j])
 210 ///   result[135+k:128+k] := SATURATE8U(__a[143+j:128+j])
 211 ///   result[199+k:192+k] := SATURATE8U(__b[143+j:128+j])
 212 /// ENDFOR
 213 /// \endcode
 214 ///
 215 /// \headerfile <immintrin.h>
 216 ///
 217 /// This intrinsic corresponds to the \c VPACKUSWB instruction.
 218 ///
 219 /// \param __a
 220 ///    A 256-bit vector of [16 x i16] used to generate result[63:0] and
 221 ///    result[191:128].
 222 /// \param __b
 223 ///    A 256-bit vector of [16 x i16] used to generate result[127:64] and
 224 ///    result[255:192].
 225 /// \returns A 256-bit integer vector containing the result.
 226 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 227 _mm256_packus_epi16(__m256i __a, __m256i __b) {
 228   __v16hi __wa = (__v16hi)__a, __wb = (__v16hi)__b; /* [16 x i16] views */
 229   return (__m256i)__builtin_ia32_packuswb256(__wa, __wb);
 230 }
 231
 232 /// Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers
 233 ///    using unsigned saturation, and returns the resulting 256-bit vector of
 234 ///    [16 x i16].
 235 ///
 236 /// \code{.operation}
 237 /// FOR i := 0 TO 3
 238 ///   j := i*32
 239 ///   k := i*16
 240 ///   result[15+k:k] := SATURATE16U(__V1[31+j:j])
 241 ///   result[79+k:64+k] := SATURATE16U(__V2[31+j:j])
 242 ///   result[143+k:128+k] := SATURATE16U(__V1[159+j:128+j])
 243 ///   result[207+k:192+k] := SATURATE16U(__V2[159+j:128+j])
 244 /// ENDFOR
 245 /// \endcode
 246 ///
 247 /// \headerfile <immintrin.h>
 248 ///
 249 /// This intrinsic corresponds to the \c VPACKUSDW instruction.
 250 ///
 251 /// \param __V1
 252 ///    A 256-bit vector of [8 x i32] used to generate result[63:0] and
 253 ///    result[191:128].
 254 /// \param __V2
 255 ///    A 256-bit vector of [8 x i32] used to generate result[127:64] and
 256 ///    result[255:192].
 257 /// \returns A 256-bit vector of [16 x i16] containing the result.
 258 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 259 _mm256_packus_epi32(__m256i __V1, __m256i __V2) {
 260   __v8si __d1 = (__v8si)__V1, __d2 = (__v8si)__V2; /* [8 x i32] views */
 261   return (__m256i)__builtin_ia32_packusdw256(__d1, __d2);
 262 }
 263
 264 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
 265 ///    vectors and returns the lower 8 bits of each sum in the corresponding
 266 ///    byte of the 256-bit integer vector result (overflow is ignored).
 267 ///
 268 /// \headerfile <immintrin.h>
 269 ///
 270 /// This intrinsic corresponds to the \c VPADDB instruction.
 271 ///
 272 /// \param __a
 273 ///    A 256-bit integer vector containing one of the source operands.
 274 /// \param __b
 275 ///    A 256-bit integer vector containing one of the source operands.
 276 /// \returns A 256-bit integer vector containing the sums.
 277 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 278 _mm256_add_epi8(__m256i __a, __m256i __b) {
 279   __v32qu __sum = (__v32qu)__a + (__v32qu)__b; /* per-byte sum */
 280   return (__m256i)__sum;
 281 }
 282
 283 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
 284 ///    [16 x i16] and returns the lower 16 bits of each sum in the
 285 ///    corresponding element of the [16 x i16] result (overflow is ignored).
 286 ///
 287 /// \headerfile <immintrin.h>
 288 ///
 289 /// This intrinsic corresponds to the \c VPADDW instruction.
 290 ///
 291 /// \param __a
 292 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 293 /// \param __b
 294 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 295 /// \returns A 256-bit vector of [16 x i16] containing the sums.
 296 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 297 _mm256_add_epi16(__m256i __a, __m256i __b) {
 298   __v16hu __sum = (__v16hu)__a + (__v16hu)__b; /* per-16-bit-element sum */
 299   return (__m256i)__sum;
 300 }
 301
 302 /// Adds 32-bit integers from corresponding elements of two 256-bit vectors of
 303 ///    [8 x i32] and returns the lower 32 bits of each sum in the corresponding
 304 ///    element of the [8 x i32] result (overflow is ignored).
 305 ///
 306 /// \headerfile <immintrin.h>
 307 ///
 308 /// This intrinsic corresponds to the \c VPADDD instruction.
 309 ///
 310 /// \param __a
 311 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
 312 /// \param __b
 313 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
 314 /// \returns A 256-bit vector of [8 x i32] containing the sums.
 315 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 316 _mm256_add_epi32(__m256i __a, __m256i __b) {
 317   __v8su __sum = (__v8su)__a + (__v8su)__b; /* per-32-bit-element sum */
 318   return (__m256i)__sum;
 319 }
 320
 321 /// Adds 64-bit integers from corresponding elements of two 256-bit vectors of
 322 ///    [4 x i64] and returns the lower 64 bits of each sum in the corresponding
 323 ///    element of the [4 x i64] result (overflow is ignored).
 324 ///
 325 /// \headerfile <immintrin.h>
 326 ///
 327 /// This intrinsic corresponds to the \c VPADDQ instruction.
 328 ///
 329 /// \param __a
 330 ///    A 256-bit vector of [4 x i64] containing one of the source operands.
 331 /// \param __b
 332 ///    A 256-bit vector of [4 x i64] containing one of the source operands.
 333 /// \returns A 256-bit vector of [4 x i64] containing the sums.
 334 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 335 _mm256_add_epi64(__m256i __a, __m256i __b) {
 336   __v4du __sum = (__v4du)__a + (__v4du)__b; /* per-64-bit-element sum */
 337   return (__m256i)__sum;
 338 }
 339
 340 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
 341 ///    vectors using signed saturation, and returns each sum in the
 342 ///    corresponding byte of the 256-bit integer vector result.
 343 ///
 344 /// \headerfile <immintrin.h>
 345 ///
 346 /// This intrinsic corresponds to the \c VPADDSB instruction.
 347 ///
 348 /// \param __a
 349 ///    A 256-bit integer vector containing one of the source operands.
 350 /// \param __b
 351 ///    A 256-bit integer vector containing one of the source operands.
 352 /// \returns A 256-bit integer vector containing the sums.
 353 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 354 _mm256_adds_epi8(__m256i __a, __m256i __b) {
 355   __v32qs __x = (__v32qs)__a, __y = (__v32qs)__b; /* signed 8-bit lanes */
 356   return (__m256i)__builtin_elementwise_add_sat(__x, __y);
 357 }
 358
 359 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
 360 ///    [16 x i16] using signed saturation, and returns the [16 x i16] result.
 361 ///
 362 /// \headerfile <immintrin.h>
 363 ///
 364 /// This intrinsic corresponds to the \c VPADDSW instruction.
 365 ///
 366 /// \param __a
 367 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 368 /// \param __b
 369 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 370 /// \returns A 256-bit vector of [16 x i16] containing the sums.
 371 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 372 _mm256_adds_epi16(__m256i __a, __m256i __b) {
 373   __v16hi __x = (__v16hi)__a, __y = (__v16hi)__b; /* [16 x i16] views */
 374   return (__m256i)__builtin_elementwise_add_sat(__x, __y);
 375 }
 376
 377 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
 378 ///    vectors using unsigned saturation, and returns each sum in the
 379 ///    corresponding byte of the 256-bit integer vector result.
 380 ///
 381 /// \headerfile <immintrin.h>
 382 ///
 383 /// This intrinsic corresponds to the \c VPADDUSB instruction.
 384 ///
 385 /// \param __a
 386 ///    A 256-bit integer vector containing one of the source operands.
 387 /// \param __b
 388 ///    A 256-bit integer vector containing one of the source operands.
 389 /// \returns A 256-bit integer vector containing the sums.
 390 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 391 _mm256_adds_epu8(__m256i __a, __m256i __b) {
 392   __v32qu __x = (__v32qu)__a, __y = (__v32qu)__b; /* unsigned 8-bit lanes */
 393   return (__m256i)__builtin_elementwise_add_sat(__x, __y);
 394 }
 395
 396 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
 397 ///    [16 x i16] using unsigned saturation, and returns the [16 x i16] result.
 398 ///
 399 /// \headerfile <immintrin.h>
 400 ///
 401 /// This intrinsic corresponds to the \c VPADDUSW instruction.
 402 ///
 403 /// \param __a
 404 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 405 /// \param __b
 406 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 407 /// \returns A 256-bit vector of [16 x i16] containing the sums.
 408 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 409 _mm256_adds_epu16(__m256i __a, __m256i __b) {
 410   __v16hu __x = (__v16hu)__a, __y = (__v16hu)__b; /* unsigned 16-bit lanes */
 411   return (__m256i)__builtin_elementwise_add_sat(__x, __y);
 412 }
 413
 414 /// Uses the lower half of the 256-bit vector \a a as the upper half of a
 415 ///    temporary 256-bit value, and the lower half of the 256-bit vector \a b
 416 ///    as the lower half of the temporary value. Right-shifts the temporary
 417 ///    value by \a n bytes, and uses the lower 16 bytes of the shifted value
 418 ///    as the lower 16 bytes of the result. Uses the upper halves of \a a and
 419 ///    \a b to make another temporary value, right-shifts it by \a n bytes,
 420 ///    and uses the lower 16 bytes of the shifted value as the upper 16 bytes
 421 ///    of the result.
 422 ///
 423 /// \headerfile <immintrin.h>
 424 ///
 425 /// \code
 426 /// __m256i _mm256_alignr_epi8(__m256i a, __m256i b, const int n);
 427 /// \endcode
 428 ///
 429 /// This intrinsic corresponds to the \c VPALIGNR instruction.
 430 ///
 431 /// \param a
 432 ///    A 256-bit integer vector containing source values.
 433 /// \param b
 434 ///    A 256-bit integer vector containing source values.
 435 /// \param n
 436 ///    An immediate value specifying the number of bytes to right-shift.
 437 /// \returns A 256-bit integer vector containing the result.
 438 #define _mm256_alignr_epi8(a, b, n) \
 439   ((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
 440                                       (__v32qi)(__m256i)(b), (n)))
 441
 442 /// Computes the bitwise AND of the 256-bit integer vectors in \a __a and
 443 ///    \a __b.
 444 ///
 445 /// \headerfile <immintrin.h>
 446 ///
 447 /// This intrinsic corresponds to the \c VPAND instruction.
 448 ///
 449 /// \param __a
 450 ///    A 256-bit integer vector.
 451 /// \param __b
 452 ///    A 256-bit integer vector.
 453 /// \returns A 256-bit integer vector containing the result.
 454 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 455 _mm256_and_si256(__m256i __a, __m256i __b) {
 456   __v4du __both = (__v4du)__a & (__v4du)__b; /* bitwise AND of all 256 bits */
 457   return (__m256i)__both;
 458 }
 459
 460 /// Computes the bitwise AND of the 256-bit integer vector in \a __b with
 461 ///    the bitwise NOT of the 256-bit integer vector in \a __a.
 462 ///
 463 /// \headerfile <immintrin.h>
 464 ///
 465 /// This intrinsic corresponds to the \c VPANDN instruction.
 466 ///
 467 /// \param __a
 468 ///    A 256-bit integer vector.
 469 /// \param __b
 470 ///    A 256-bit integer vector.
 471 /// \returns A 256-bit integer vector containing the result.
 472 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 473 _mm256_andnot_si256(__m256i __a, __m256i __b) {
 474   __v4du __not_a = ~(__v4du)__a; /* only the first operand is complemented */
 475   return (__m256i)(__not_a & (__v4du)__b);
 476 }
 477
 478 /// Computes the averages of the corresponding unsigned bytes in the two
 479 ///    256-bit integer vectors in \a __a and \a __b and returns each
 480 ///    average in the corresponding byte of the 256-bit result.
 481 ///
 482 /// \code{.operation}
 483 /// FOR i := 0 TO 31
 484 ///   j := i*8
 485 ///   result[j+7:j] := (__a[j+7:j] + __b[j+7:j] + 1) >> 1
 486 /// ENDFOR
 487 /// \endcode
 488 ///
 489 /// \headerfile <immintrin.h>
 490 ///
 491 /// This intrinsic corresponds to the \c VPAVGB instruction.
 492 ///
 493 /// \param __a
 494 ///    A 256-bit integer vector.
 495 /// \param __b
 496 ///    A 256-bit integer vector.
 497 /// \returns A 256-bit integer vector containing the result.
 498 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 499 _mm256_avg_epu8(__m256i __a, __m256i __b) {
 500   __v32qi __x = (__v32qi)__a, __y = (__v32qi)__b; /* 8-bit lane views */
 501   return (__m256i)__builtin_ia32_pavgb256(__x, __y);
 502 }
 503
 504 /// Computes the averages of the corresponding unsigned 16-bit integers in
 505 ///    the two 256-bit vectors of [16 x i16] in \a __a and \a __b and returns
 506 ///    each average in the corresponding element of the 256-bit result.
 507 ///
 508 /// \code{.operation}
 509 /// FOR i := 0 TO 15
 510 ///   j := i*16
 511 ///   result[j+15:j] := (__a[j+15:j] + __b[j+15:j] + 1) >> 1
 512 /// ENDFOR
 513 /// \endcode
 514 ///
 515 /// \headerfile <immintrin.h>
 516 ///
 517 /// This intrinsic corresponds to the \c VPAVGW instruction.
 518 ///
 519 /// \param __a
 520 ///    A 256-bit vector of [16 x i16].
 521 /// \param __b
 522 ///    A 256-bit vector of [16 x i16].
 523 /// \returns A 256-bit vector of [16 x i16] containing the result.
 524 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 525 _mm256_avg_epu16(__m256i __a, __m256i __b) {
 526   __v16hi __x = (__v16hi)__a, __y = (__v16hi)__b; /* 16-bit lane views */
 527   return (__m256i)__builtin_ia32_pavgw256(__x, __y);
 528 }
 529
 530 /// Merges 8-bit integer values from either of the two 256-bit vectors
 531 ///    \a __V1 or \a __V2, as specified by the 256-bit mask \a __M and returns
 532 ///    the resulting 256-bit integer vector.
 533 ///
 534 /// \code{.operation}
 535 /// FOR i := 0 TO 31
 536 ///   j := i*8
 537 ///   IF __M[7+j] == 0
 538 ///     result[7+j:j] := __V1[7+j:j]
 539 ///   ELSE
 540 ///     result[7+j:j] := __V2[7+j:j]
 541 ///   FI
 542 /// ENDFOR
 543 /// \endcode
 544 ///
 545 /// \headerfile <immintrin.h>
 546 ///
 547 /// This intrinsic corresponds to the \c VPBLENDVB instruction.
 548 ///
 549 /// \param __V1
 550 ///    A 256-bit integer vector containing source values.
 551 /// \param __V2
 552 ///    A 256-bit integer vector containing source values.
 553 /// \param __M
 554 ///    A 256-bit integer vector, with bit [7] of each byte specifying the
 555 ///    source for each corresponding byte of the result. When the mask bit
 556 ///    is 0, the byte is copied from \a __V1; otherwise, it is copied from
 557 ///    \a __V2.
 558 /// \returns A 256-bit integer vector containing the result.
 559 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 560 _mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M) {
 561   __v32qi __src1 = (__v32qi)__V1, __src2 = (__v32qi)__V2;
 562   __v32qi __mask = (__v32qi)__M; /* per-byte selector */
 563   return (__m256i)__builtin_ia32_pblendvb256(__src1, __src2, __mask);
 564 }
 565
 566 /// Merges 16-bit integer values from either of the two 256-bit vectors
 567 ///    \a V1 or \a V2, as specified by the immediate integer operand \a M,
 568 ///    and returns the resulting 256-bit vector of [16 x i16].
 569 ///
 570 /// \code{.operation}
 571 /// FOR i := 0 TO 7
 572 ///   j := i*16
 573 ///   IF M[i] == 0
 574 ///     result[15+j:j] := V1[15+j:j]
 575 ///     result[143+j:128+j] := V1[143+j:128+j]
 576 ///   ELSE
 577 ///     result[15+j:j] := V2[15+j:j]
 578 ///     result[143+j:128+j] := V2[143+j:128+j]
 579 ///   FI
 580 /// ENDFOR
 581 /// \endcode
 582 ///
 583 /// \headerfile <immintrin.h>
 584 ///
 585 /// \code
 586 /// __m256i _mm256_blend_epi16(__m256i V1, __m256i V2, const int M);
 587 /// \endcode
 588 ///
 589 /// This intrinsic corresponds to the \c VPBLENDW instruction.
 590 ///
 591 /// \param V1
 592 ///    A 256-bit vector of [16 x i16] containing source values.
 593 /// \param V2
 594 ///    A 256-bit vector of [16 x i16] containing source values.
 595 /// \param M
 596 ///    An immediate 8-bit integer operand, with bits [7:0] specifying the
 597 ///    source for each element of the result. The position of the mask bit
 598 ///    corresponds to the index of a copied value. When a mask bit is 0, the
 599 ///    element is copied from \a V1; otherwise, it is copied from \a V2.
 600 ///    \a M[0] determines the source for elements 0 and 8, \a M[1] for
 601 ///    elements 1 and 9, and so forth.
 602 /// \returns A 256-bit vector of [16 x i16] containing the result.
 603 #define _mm256_blend_epi16(V1, V2, M) \
 604   ((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \
 605                                       (__v16hi)(__m256i)(V2), (int)(M)))
 606
 607 /// Compares corresponding bytes in the 256-bit integer vectors in \a __a and
 608 ///    \a __b for equality and returns the outcomes in the corresponding
 609 ///    bytes of the 256-bit result.
 610 ///
 611 /// \code{.operation}
 612 /// FOR i := 0 TO 31
 613 ///   j := i*8
 614 ///   result[j+7:j] := (__a[j+7:j] == __b[j+7:j]) ? 0xFF : 0
 615 /// ENDFOR
 616 /// \endcode
 617 ///
 618 /// \headerfile <immintrin.h>
 619 ///
 620 /// This intrinsic corresponds to the \c VPCMPEQB instruction.
 621 ///
 622 /// \param __a
 623 ///    A 256-bit integer vector containing one of the inputs.
 624 /// \param __b
 625 ///    A 256-bit integer vector containing one of the inputs.
 626 /// \returns A 256-bit integer vector containing the result.
 627 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 628 _mm256_cmpeq_epi8(__m256i __a, __m256i __b) {
 629   __v32qi __x = (__v32qi)__a, __y = (__v32qi)__b; /* 8-bit lane views */
 630   return (__m256i)(__x == __y);
 631 }
 632
 633 /// Compares corresponding elements in the 256-bit vectors of [16 x i16] in
 634 ///    \a __a and \a __b for equality and returns the outcomes in the
 635 ///    corresponding elements of the 256-bit result.
 636 ///
 637 /// \code{.operation}
 638 /// FOR i := 0 TO 15
 639 ///   j := i*16
 640 ///   result[j+15:j] := (__a[j+15:j] == __b[j+15:j]) ? 0xFFFF : 0
 641 /// ENDFOR
 642 /// \endcode
 643 ///
 644 /// \headerfile <immintrin.h>
 645 ///
 646 /// This intrinsic corresponds to the \c VPCMPEQW instruction.
 647 ///
 648 /// \param __a
 649 ///    A 256-bit vector of [16 x i16] containing one of the inputs.
 650 /// \param __b
 651 ///    A 256-bit vector of [16 x i16] containing one of the inputs.
 652 /// \returns A 256-bit vector of [16 x i16] containing the result.
 653 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 654 _mm256_cmpeq_epi16(__m256i __a, __m256i __b) {
 655   __v16hi __x = (__v16hi)__a, __y = (__v16hi)__b; /* [16 x i16] views */
 656   return (__m256i)(__x == __y);
 657 }
 658
 659 /// Compares corresponding elements in the 256-bit vectors of [8 x i32] in
 660 ///    \a __a and \a __b for equality and returns the outcomes in the
 661 ///    corresponding elements of the 256-bit result.
 662 ///
 663 /// \code{.operation}
 664 /// FOR i := 0 TO 7
 665 ///   j := i*32
 666 ///   result[j+31:j] := (__a[j+31:j] == __b[j+31:j]) ? 0xFFFFFFFF : 0
 667 /// ENDFOR
 668 /// \endcode
 669 ///
 670 /// \headerfile <immintrin.h>
 671 ///
 672 /// This intrinsic corresponds to the \c VPCMPEQD instruction.
 673 ///
 674 /// \param __a
 675 ///    A 256-bit vector of [8 x i32] containing one of the inputs.
 676 /// \param __b
 677 ///    A 256-bit vector of [8 x i32] containing one of the inputs.
 678 /// \returns A 256-bit vector of [8 x i32] containing the result.
 679 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 680 _mm256_cmpeq_epi32(__m256i __a, __m256i __b) {
 681   __v8si __x = (__v8si)__a, __y = (__v8si)__b; /* [8 x i32] views */
 682   return (__m256i)(__x == __y);
 683 }
 684
 685 /// Compares corresponding elements in the 256-bit vectors of [4 x i64] in
 686 ///    \a __a and \a __b for equality and returns the outcomes in the
 687 ///    corresponding elements of the 256-bit result.
 688 ///
 689 /// \code{.operation}
 690 /// FOR i := 0 TO 3
 691 ///   j := i*64
 692 ///   result[j+63:j] := (__a[j+63:j] == __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
 693 /// ENDFOR
 694 /// \endcode
 695 ///
 696 /// \headerfile <immintrin.h>
 697 ///
 698 /// This intrinsic corresponds to the \c VPCMPEQQ instruction.
 699 ///
 700 /// \param __a
 701 ///    A 256-bit vector of [4 x i64] containing one of the inputs.
 702 /// \param __b
 703 ///    A 256-bit vector of [4 x i64] containing one of the inputs.
 704 /// \returns A 256-bit vector of [4 x i64] containing the result.
 705 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 706 _mm256_cmpeq_epi64(__m256i __a, __m256i __b) {
 707   __v4di __x = (__v4di)__a, __y = (__v4di)__b; /* [4 x i64] views */
 708   return (__m256i)(__x == __y);
 709 }
 710
 711 /// Compares corresponding signed bytes in the 256-bit integer vectors in
 712 ///    \a __a and \a __b for greater-than and returns the outcomes in the
 713 ///    corresponding bytes of the 256-bit result.
 714 ///
 715 /// \code{.operation}
 716 /// FOR i := 0 TO 31
 717 ///   j := i*8
 718 ///   result[j+7:j] := (__a[j+7:j] > __b[j+7:j]) ? 0xFF : 0
 719 /// ENDFOR
 720 /// \endcode
 721 ///
 722 /// \headerfile <immintrin.h>
 723 ///
 724 /// This intrinsic corresponds to the \c VPCMPGTB instruction.
 725 ///
 726 /// \param __a
 727 ///    A 256-bit integer vector containing one of the inputs.
 728 /// \param __b
 729 ///    A 256-bit integer vector containing one of the inputs.
 730 /// \returns A 256-bit integer vector containing the result.
 731 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 732 _mm256_cmpgt_epi8(__m256i __a, __m256i __b) {
 733   /* The comparison must be signed. __v32qi is built on plain char, whose
 734      signedness may be either, so the operands are viewed as __v32qs. */
 735   __v32qs __x = (__v32qs)__a, __y = (__v32qs)__b;
 736   return (__m256i)(__x > __y);
 737 }
 738
 739 /// Compares corresponding signed elements in the 256-bit vectors of
 740 ///    [16 x i16] in \a __a and \a __b for greater-than and returns the
 741 ///    outcomes in the corresponding elements of the 256-bit result.
 742 ///
 743 /// \code{.operation}
 744 /// FOR i := 0 TO 15
 745 ///   j := i*16
 746 ///   result[j+15:j] := (__a[j+15:j] > __b[j+15:j]) ? 0xFFFF : 0
 747 /// ENDFOR
 748 /// \endcode
 749 ///
 750 /// \headerfile <immintrin.h>
 751 ///
 752 /// This intrinsic corresponds to the \c VPCMPGTW instruction.
 753 ///
 754 /// \param __a
 755 ///    A 256-bit vector of [16 x i16] containing one of the inputs.
 756 /// \param __b
 757 ///    A 256-bit vector of [16 x i16] containing one of the inputs.
 758 /// \returns A 256-bit vector of [16 x i16] containing the result.
 759 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 760 _mm256_cmpgt_epi16(__m256i __a, __m256i __b) {
 761   __v16hi __x = (__v16hi)__a, __y = (__v16hi)__b; /* [16 x i16] views */
 762   return (__m256i)(__x > __y);
 763 }
 764
 765 /// Compares corresponding signed elements in the 256-bit vectors of
 766 ///    [8 x i32] in \a __a and \a __b for greater-than and returns the
 767 ///    outcomes in the corresponding elements of the 256-bit result.
 768 ///
 769 /// \code{.operation}
 770 /// FOR i := 0 TO 7
 771 ///   j := i*32
 772 ///   result[j+31:j] := (__a[j+31:j] > __b[j+31:j]) ? 0xFFFFFFFF : 0
 773 /// ENDFOR
 774 /// \endcode
 775 ///
 776 /// \headerfile <immintrin.h>
 777 ///
 778 /// This intrinsic corresponds to the \c VPCMPGTD instruction.
 779 ///
 780 /// \param __a
 781 ///    A 256-bit vector of [8 x i32] containing one of the inputs.
 782 /// \param __b
 783 ///    A 256-bit vector of [8 x i32] containing one of the inputs.
 784 /// \returns A 256-bit vector of [8 x i32] containing the result.
 785 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 786 _mm256_cmpgt_epi32(__m256i __a, __m256i __b) {
 787   __v8si __x = (__v8si)__a, __y = (__v8si)__b; /* [8 x i32] views */
 788   return (__m256i)(__x > __y);
 789 }
 790
 791 /// Compares corresponding signed elements in the 256-bit vectors of
 792 ///    [4 x i64] in \a __a and \a __b for greater-than and returns the
 793 ///    outcomes in the corresponding elements of the 256-bit result.
 794 ///
 795 /// \code{.operation}
 796 /// FOR i := 0 TO 3
 797 ///   j := i*64
 798 ///   result[j+63:j] := (__a[j+63:j] > __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
 799 /// ENDFOR
 800 /// \endcode
 801 ///
 802 /// \headerfile <immintrin.h>
 803 ///
 804 /// This intrinsic corresponds to the \c VPCMPGTQ instruction.
 805 ///
 806 /// \param __a
 807 ///    A 256-bit vector of [4 x i64] containing one of the inputs.
 808 /// \param __b
 809 ///    A 256-bit vector of [4 x i64] containing one of the inputs.
 810 /// \returns A 256-bit vector of [4 x i64] containing the result.
 811 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 812 _mm256_cmpgt_epi64(__m256i __a, __m256i __b)
 813 {
 814   return (__m256i)((__v4di)__a > (__v4di)__b);
 815 }
 816
 817 /// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
 818 ///    vectors of [16 x i16] and returns the lower 16 bits of each sum in an
 819 ///    element of the [16 x i16] result (overflow is ignored). Sums from
 820 ///    \a __a are returned in the lower 64 bits of each 128-bit half of the
 821 ///    result; sums from \a __b are returned in the upper 64 bits of each
 822 ///    128-bit half of the result.
 823 ///
 824 /// \code{.operation}
 825 /// FOR i := 0 TO 1
 826 ///   j := i*128
 827 ///   result[j+15:j] := __a[j+15:j] + __a[j+31:j+16]
 828 ///   result[j+31:j+16] := __a[j+47:j+32] + __a[j+63:j+48]
 829 ///   result[j+47:j+32] := __a[j+79:j+64] + __a[j+95:j+80]
 830 ///   result[j+63:j+48] := __a[j+111:j+96] + __a[j+127:j+112]
 831 ///   result[j+79:j+64] := __b[j+15:j] + __b[j+31:j+16]
 832 ///   result[j+95:j+80] := __b[j+47:j+32] + __b[j+63:j+48]
 833 ///   result[j+111:j+96] := __b[j+79:j+64] + __b[j+95:j+80]
 834 ///   result[j+127:j+112] := __b[j+111:j+96] + __b[j+127:j+112]
 835 /// ENDFOR
 836 /// \endcode
 837 ///
 838 /// \headerfile <immintrin.h>
 839 ///
 840 /// This intrinsic corresponds to the \c VPHADDW instruction.
 841 ///
 842 /// \param __a
 843 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 844 /// \param __b
 845 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 846 /// \returns A 256-bit vector of [16 x i16] containing the sums.
 847 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 848 _mm256_hadd_epi16(__m256i __a, __m256i __b)
 849 {
 850     return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
 851 }
 852
 853 /// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit
 854 ///    vectors of [8 x i32] and returns the lower 32 bits of each sum in an
 855 ///    element of the [8 x i32] result (overflow is ignored). Sums from \a __a
 856 ///    are returned in the lower 64 bits of each 128-bit half of the result;
 857 ///    sums from \a __b are returned in the upper 64 bits of each 128-bit half
 858 ///    of the result.
 859 ///
 860 /// \code{.operation}
 861 /// FOR i := 0 TO 1
 862 ///   j := i*128
 863 ///   result[j+31:j] := __a[j+31:j] + __a[j+63:j+32]
 864 ///   result[j+63:j+32] := __a[j+95:j+64] + __a[j+127:j+96]
 865 ///   result[j+95:j+64] := __b[j+31:j] + __b[j+63:j+32]
 866 ///   result[j+127:j+96] := __b[j+95:j+64] + __b[j+127:j+96]
 867 /// ENDFOR
 868 /// \endcode
 869 ///
 870 /// \headerfile <immintrin.h>
 871 ///
 872 /// This intrinsic corresponds to the \c VPHADDD instruction.
 873 ///
 874 /// \param __a
 875 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
 876 /// \param __b
 877 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
 878 /// \returns A 256-bit vector of [8 x i32] containing the sums.
 879 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 880 _mm256_hadd_epi32(__m256i __a, __m256i __b)
 881 {
 882     return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
 883 }
 884
 885 /// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
 886 ///    vectors of [16 x i16] using signed saturation and returns each sum in
 887 ///    an element of the [16 x i16] result. Sums from \a __a are returned in
 888 ///    the lower 64 bits of each 128-bit half of the result; sums from \a __b
 889 ///    are returned in the upper 64 bits of each 128-bit half of the result.
 890 ///
 891 /// \code{.operation}
 892 /// FOR i := 0 TO 1
 893 ///   j := i*128
 894 ///   result[j+15:j] := SATURATE16(__a[j+15:j] + __a[j+31:j+16])
 895 ///   result[j+31:j+16] := SATURATE16(__a[j+47:j+32] + __a[j+63:j+48])
 896 ///   result[j+47:j+32] := SATURATE16(__a[j+79:j+64] + __a[j+95:j+80])
 897 ///   result[j+63:j+48] := SATURATE16(__a[j+111:j+96] + __a[j+127:j+112])
 898 ///   result[j+79:j+64] := SATURATE16(__b[j+15:j] + __b[j+31:j+16])
 899 ///   result[j+95:j+80] := SATURATE16(__b[j+47:j+32] + __b[j+63:j+48])
 900 ///   result[j+111:j+96] := SATURATE16(__b[j+79:j+64] + __b[j+95:j+80])
 901 ///   result[j+127:j+112] := SATURATE16(__b[j+111:j+96] + __b[j+127:j+112])
 902 /// ENDFOR
 903 /// \endcode
 904 ///
 905 /// \headerfile <immintrin.h>
 906 ///
 907 /// This intrinsic corresponds to the \c VPHADDSW instruction.
 908 ///
 909 /// \param __a
 910 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 911 /// \param __b
 912 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 913 /// \returns A 256-bit vector of [16 x i16] containing the sums.
 914 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 915 _mm256_hadds_epi16(__m256i __a, __m256i __b)
 916 {
 917     return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
 918 }
 919
 920 /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
 921 ///    vectors of [16 x i16] and returns the lower 16 bits of each difference
 922 ///    in an element of the [16 x i16] result (overflow is ignored).
 923 ///    Differences from \a __a are returned in the lower 64 bits of each
 924 ///    128-bit half of the result; differences from \a __b are returned in the
 925 ///    upper 64 bits of each 128-bit half of the result.
 926 ///
 927 /// \code{.operation}
 928 /// FOR i := 0 TO 1
 929 ///   j := i*128
 930 ///   result[j+15:j] := __a[j+15:j] - __a[j+31:j+16]
 931 ///   result[j+31:j+16] := __a[j+47:j+32] - __a[j+63:j+48]
 932 ///   result[j+47:j+32] := __a[j+79:j+64] - __a[j+95:j+80]
 933 ///   result[j+63:j+48] := __a[j+111:j+96] - __a[j+127:j+112]
 934 ///   result[j+79:j+64] := __b[j+15:j] - __b[j+31:j+16]
 935 ///   result[j+95:j+80] := __b[j+47:j+32] - __b[j+63:j+48]
 936 ///   result[j+111:j+96] := __b[j+79:j+64] - __b[j+95:j+80]
 937 ///   result[j+127:j+112] := __b[j+111:j+96] - __b[j+127:j+112]
 938 /// ENDFOR
 939 /// \endcode
 940 ///
 941 /// \headerfile <immintrin.h>
 942 ///
 943 /// This intrinsic corresponds to the \c VPHSUBW instruction.
 944 ///
 945 /// \param __a
 946 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 947 /// \param __b
 948 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 949 /// \returns A 256-bit vector of [16 x i16] containing the differences.
 950 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 951 _mm256_hsub_epi16(__m256i __a, __m256i __b)
 952 {
 953     return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
 954 }
 955
 956 /// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit
 957 ///    vectors of [8 x i32] and returns the lower 32 bits of each difference in
 958 ///    an element of the [8 x i32] result (overflow is ignored). Differences
 959 ///    from \a __a are returned in the lower 64 bits of each 128-bit half of
 960 ///    the result; differences from \a __b are returned in the upper 64 bits
 961 ///    of each 128-bit half of the result.
 962 ///
 963 /// \code{.operation}
 964 /// FOR i := 0 TO 1
 965 ///   j := i*128
 966 ///   result[j+31:j] := __a[j+31:j] - __a[j+63:j+32]
 967 ///   result[j+63:j+32] := __a[j+95:j+64] - __a[j+127:j+96]
 968 ///   result[j+95:j+64] := __b[j+31:j] - __b[j+63:j+32]
 969 ///   result[j+127:j+96] := __b[j+95:j+64] - __b[j+127:j+96]
 970 /// ENDFOR
 971 /// \endcode
 972 ///
 973 /// \headerfile <immintrin.h>
 974 ///
 975 /// This intrinsic corresponds to the \c VPHSUBD instruction.
 976 ///
 977 /// \param __a
 978 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
 979 /// \param __b
 980 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
 981 /// \returns A 256-bit vector of [8 x i32] containing the differences.
 982 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 983 _mm256_hsub_epi32(__m256i __a, __m256i __b)
 984 {
 985     return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
 986 }
 987
 988 /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
 989 ///    vectors of [16 x i16] using signed saturation and returns each sum in
 990 ///    an element of the [16 x i16] result. Differences from \a __a are
 991 ///    returned in the lower 64 bits of each 128-bit half of the result;
 992 ///    differences from \a __b are returned in the upper 64 bits of each
 993 ///    128-bit half of the result.
 994 ///
 995 /// \code{.operation}
 996 /// FOR i := 0 TO 1
 997 ///   j := i*128
 998 ///   result[j+15:j] := SATURATE16(__a[j+15:j] - __a[j+31:j+16])
 999 ///   result[j+31:j+16] := SATURATE16(__a[j+47:j+32] - __a[j+63:j+48])
1000 ///   result[j+47:j+32] := SATURATE16(__a[j+79:j+64] - __a[j+95:j+80])
1001 ///   result[j+63:j+48] := SATURATE16(__a[j+111:j+96] - __a[j+127:j+112])
1002 ///   result[j+79:j+64] := SATURATE16(__b[j+15:j] - __b[j+31:j+16])
1003 ///   result[j+95:j+80] := SATURATE16(__b[j+47:j+32] - __b[j+63:j+48])
1004 ///   result[j+111:j+96] := SATURATE16(__b[j+79:j+64] - __b[j+95:j+80])
1005 ///   result[j+127:j+112] := SATURATE16(__b[j+111:j+96] - __b[j+127:j+112])
1006 /// ENDFOR
1007 /// \endcode
1008 ///
1009 /// \headerfile <immintrin.h>
1010 ///
1011 /// This intrinsic corresponds to the \c VPHSUBSW instruction.
1012 ///
1013 /// \param __a
1014 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1015 /// \param __b
1016 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1017 /// \returns A 256-bit vector of [16 x i16] containing the differences.
1018 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1019 _mm256_hsubs_epi16(__m256i __a, __m256i __b)
1020 {
1021     return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
1022 }
1023
1024 /// Multiplies each unsigned byte from the 256-bit integer vector in \a __a
1025 ///    with the corresponding signed byte from the 256-bit integer vector in
1026 ///    \a __b, forming signed 16-bit intermediate products. Adds adjacent
1027 ///    pairs of those products using signed saturation to form 16-bit sums
1028 ///    returned as elements of the [16 x i16] result.
1029 ///
1030 /// \code{.operation}
1031 /// FOR i := 0 TO 15
1032 ///   j := i*16
1033 ///   temp1 := __a[j+7:j] * __b[j+7:j]
1034 ///   temp2 := __a[j+15:j+8] * __b[j+15:j+8]
1035 ///   result[j+15:j] := SATURATE16(temp1 + temp2)
1036 /// ENDFOR
1037 /// \endcode
1038 ///
1039 /// \headerfile <immintrin.h>
1040 ///
1041 /// This intrinsic corresponds to the \c VPMADDUBSW instruction.
1042 ///
1043 /// \param __a
1044 ///    A 256-bit vector containing one of the source operands.
1045 /// \param __b
1046 ///    A 256-bit vector containing one of the source operands.
1047 /// \returns A 256-bit vector of [16 x i16] containing the result.
1048 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1049 _mm256_maddubs_epi16(__m256i __a, __m256i __b)
1050 {
1051     return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
1052 }
1053
1054 /// Multiplies corresponding 16-bit elements of two 256-bit vectors of
1055 ///    [16 x i16], forming 32-bit intermediate products, and adds pairs of
1056 ///    those products to form 32-bit sums returned as elements of the
1057 ///    [8 x i32] result.
1058 ///
1059 ///    There is only one wraparound case: when all four of the 16-bit sources
1060 ///    are \c 0x8000, the result will be \c 0x80000000.
1061 ///
1062 /// \code{.operation}
1063 /// FOR i := 0 TO 7
1064 ///   j := i*32
1065 ///   temp1 := __a[j+15:j] * __b[j+15:j]
1066 ///   temp2 := __a[j+31:j+16] * __b[j+31:j+16]
1067 ///   result[j+31:j] := temp1 + temp2
1068 /// ENDFOR
1069 /// \endcode
1070 ///
1071 /// \headerfile <immintrin.h>
1072 ///
1073 /// This intrinsic corresponds to the \c VPMADDWD instruction.
1074 ///
1075 /// \param __a
1076 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1077 /// \param __b
1078 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1079 /// \returns A 256-bit vector of [8 x i32] containing the result.
1080 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1081 _mm256_madd_epi16(__m256i __a, __m256i __b)
1082 {
1083   return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
1084 }
1085
1086 /// Compares the corresponding signed bytes in the two 256-bit integer vectors
1087 ///     in \a __a and \a __b and returns the larger of each pair in the
1088 ///     corresponding byte of the 256-bit result.
1089 ///
1090 /// \headerfile <immintrin.h>
1091 ///
1092 /// This intrinsic corresponds to the \c VPMAXSB instruction.
1093 ///
1094 /// \param __a
1095 ///    A 256-bit integer vector.
1096 /// \param __b
1097 ///    A 256-bit integer vector.
1098 /// \returns A 256-bit integer vector containing the result.
1099 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1100 _mm256_max_epi8(__m256i __a, __m256i __b)
1101 {
1102   return (__m256i)__builtin_elementwise_max((__v32qs)__a, (__v32qs)__b);
1103 }
1104
1105 /// Compares the corresponding signed 16-bit integers in the two 256-bit
1106 ///    vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1107 ///    each pair in the corresponding element of the 256-bit result.
1108 ///
1109 /// \headerfile <immintrin.h>
1110 ///
1111 /// This intrinsic corresponds to the \c VPMAXSW instruction.
1112 ///
1113 /// \param __a
1114 ///    A 256-bit vector of [16 x i16].
1115 /// \param __b
1116 ///    A 256-bit vector of [16 x i16].
1117 /// \returns A 256-bit vector of [16 x i16] containing the result.
1118 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1119 _mm256_max_epi16(__m256i __a, __m256i __b)
1120 {
1121   return (__m256i)__builtin_elementwise_max((__v16hi)__a, (__v16hi)__b);
1122 }
1123
1124 /// Compares the corresponding signed 32-bit integers in the two 256-bit
1125 ///    vectors of [8 x i32] in \a __a and \a __b and returns the larger of
1126 ///    each pair in the corresponding element of the 256-bit result.
1127 ///
1128 /// \headerfile <immintrin.h>
1129 ///
1130 /// This intrinsic corresponds to the \c VPMAXSD instruction.
1131 ///
1132 /// \param __a
1133 ///    A 256-bit vector of [8 x i32].
1134 /// \param __b
1135 ///    A 256-bit vector of [8 x i32].
1136 /// \returns A 256-bit vector of [8 x i32] containing the result.
1137 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1138 _mm256_max_epi32(__m256i __a, __m256i __b)
1139 {
1140   return (__m256i)__builtin_elementwise_max((__v8si)__a, (__v8si)__b);
1141 }
1142
1143 /// Compares the corresponding unsigned bytes in the two 256-bit integer
1144 ///     vectors in \a __a and \a __b and returns the larger of each pair in
1145 ///     the corresponding byte of the 256-bit result.
1146 ///
1147 /// \headerfile <immintrin.h>
1148 ///
1149 /// This intrinsic corresponds to the \c VPMAXUB instruction.
1150 ///
1151 /// \param __a
1152 ///    A 256-bit integer vector.
1153 /// \param __b
1154 ///    A 256-bit integer vector.
1155 /// \returns A 256-bit integer vector containing the result.
1156 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1157 _mm256_max_epu8(__m256i __a, __m256i __b)
1158 {
1159   return (__m256i)__builtin_elementwise_max((__v32qu)__a, (__v32qu)__b);
1160 }
1161
1162 /// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1163 ///    vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1164 ///    each pair in the corresponding element of the 256-bit result.
1165 ///
1166 /// \headerfile <immintrin.h>
1167 ///
1168 /// This intrinsic corresponds to the \c VPMAXUW instruction.
1169 ///
1170 /// \param __a
1171 ///    A 256-bit vector of [16 x i16].
1172 /// \param __b
1173 ///    A 256-bit vector of [16 x i16].
1174 /// \returns A 256-bit vector of [16 x i16] containing the result.
1175 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1176 _mm256_max_epu16(__m256i __a, __m256i __b)
1177 {
1178   return (__m256i)__builtin_elementwise_max((__v16hu)__a, (__v16hu)__b);
1179 }
1180
1181 /// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1182 ///    vectors of [8 x i32] in \a __a and \a __b and returns the larger of
1183 ///    each pair in the corresponding element of the 256-bit result.
1184 ///
1185 /// \headerfile <immintrin.h>
1186 ///
1187 /// This intrinsic corresponds to the \c VPMAXUD instruction.
1188 ///
1189 /// \param __a
1190 ///    A 256-bit vector of [8 x i32].
1191 /// \param __b
1192 ///    A 256-bit vector of [8 x i32].
1193 /// \returns A 256-bit vector of [8 x i32] containing the result.
1194 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1195 _mm256_max_epu32(__m256i __a, __m256i __b)
1196 {
1197   return (__m256i)__builtin_elementwise_max((__v8su)__a, (__v8su)__b);
1198 }
1199
1200 /// Compares the corresponding signed bytes in the two 256-bit integer vectors
1201 ///     in \a __a and \a __b and returns the smaller of each pair in the
1202 ///     corresponding byte of the 256-bit result.
1203 ///
1204 /// \headerfile <immintrin.h>
1205 ///
1206 /// This intrinsic corresponds to the \c VPMINSB instruction.
1207 ///
1208 /// \param __a
1209 ///    A 256-bit integer vector.
1210 /// \param __b
1211 ///    A 256-bit integer vector.
1212 /// \returns A 256-bit integer vector containing the result.
1213 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1214 _mm256_min_epi8(__m256i __a, __m256i __b)
1215 {
1216   return (__m256i)__builtin_elementwise_min((__v32qs)__a, (__v32qs)__b);
1217 }
1218
1219 /// Compares the corresponding signed 16-bit integers in the two 256-bit
1220 ///    vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1221 ///    each pair in the corresponding element of the 256-bit result.
1222 ///
1223 /// \headerfile <immintrin.h>
1224 ///
1225 /// This intrinsic corresponds to the \c VPMINSW instruction.
1226 ///
1227 /// \param __a
1228 ///    A 256-bit vector of [16 x i16].
1229 /// \param __b
1230 ///    A 256-bit vector of [16 x i16].
1231 /// \returns A 256-bit vector of [16 x i16] containing the result.
1232 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1233 _mm256_min_epi16(__m256i __a, __m256i __b)
1234 {
1235   return (__m256i)__builtin_elementwise_min((__v16hi)__a, (__v16hi)__b);
1236 }
1237
1238 /// Compares the corresponding signed 32-bit integers in the two 256-bit
1239 ///    vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
1240 ///    each pair in the corresponding element of the 256-bit result.
1241 ///
1242 /// \headerfile <immintrin.h>
1243 ///
1244 /// This intrinsic corresponds to the \c VPMINSD instruction.
1245 ///
1246 /// \param __a
1247 ///    A 256-bit vector of [8 x i32].
1248 /// \param __b
1249 ///    A 256-bit vector of [8 x i32].
1250 /// \returns A 256-bit vector of [8 x i32] containing the result.
1251 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1252 _mm256_min_epi32(__m256i __a, __m256i __b)
1253 {
1254   return (__m256i)__builtin_elementwise_min((__v8si)__a, (__v8si)__b);
1255 }
1256
1257 /// Compares the corresponding unsigned bytes in the two 256-bit integer
1258 ///     vectors in \a __a and \a __b and returns the smaller of each pair in
1259 ///     the corresponding byte of the 256-bit result.
1260 ///
1261 /// \headerfile <immintrin.h>
1262 ///
1263 /// This intrinsic corresponds to the \c VPMINUB instruction.
1264 ///
1265 /// \param __a
1266 ///    A 256-bit integer vector.
1267 /// \param __b
1268 ///    A 256-bit integer vector.
1269 /// \returns A 256-bit integer vector containing the result.
1270 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1271 _mm256_min_epu8(__m256i __a, __m256i __b)
1272 {
1273   return (__m256i)__builtin_elementwise_min((__v32qu)__a, (__v32qu)__b);
1274 }
1275
1276 /// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1277 ///    vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1278 ///    each pair in the corresponding element of the 256-bit result.
1279 ///
1280 /// \headerfile <immintrin.h>
1281 ///
1282 /// This intrinsic corresponds to the \c VPMINUW instruction.
1283 ///
1284 /// \param __a
1285 ///    A 256-bit vector of [16 x i16].
1286 /// \param __b
1287 ///    A 256-bit vector of [16 x i16].
1288 /// \returns A 256-bit vector of [16 x i16] containing the result.
1289 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1290 _mm256_min_epu16(__m256i __a, __m256i __b)
1291 {
1292   return (__m256i)__builtin_elementwise_min((__v16hu)__a, (__v16hu)__b);
1293 }
1294
1295 /// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1296 ///    vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
1297 ///    each pair in the corresponding element of the 256-bit result.
1298 ///
1299 /// \headerfile <immintrin.h>
1300 ///
1301 /// This intrinsic corresponds to the \c VPMINUD instruction.
1302 ///
1303 /// \param __a
1304 ///    A 256-bit vector of [8 x i32].
1305 /// \param __b
1306 ///    A 256-bit vector of [8 x i32].
1307 /// \returns A 256-bit vector of [8 x i32] containing the result.
1308 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1309 _mm256_min_epu32(__m256i __a, __m256i __b)
1310 {
1311   return (__m256i)__builtin_elementwise_min((__v8su)__a, (__v8su)__b);
1312 }
1313
1314 /// Creates a 32-bit integer mask from the most significant bit of each byte
1315 ///    in the 256-bit integer vector in \a __a and returns the result.
1316 ///
1317 /// \code{.operation}
1318 /// FOR i := 0 TO 31
1319 ///   j := i*8
1320 ///   result[i] := __a[j+7]
1321 /// ENDFOR
1322 /// \endcode
1323 ///
1324 /// \headerfile <immintrin.h>
1325 ///
1326 /// This intrinsic corresponds to the \c VPMOVMSKB instruction.
1327 ///
1328 /// \param __a
1329 ///    A 256-bit integer vector containing the source bytes.
1330 /// \returns The 32-bit integer mask.
1331 static __inline__ int __DEFAULT_FN_ATTRS256
1332 _mm256_movemask_epi8(__m256i __a)
1333 {
1334   return __builtin_ia32_pmovmskb256((__v32qi)__a);
1335 }
1336
1337 /// Sign-extends bytes from the 128-bit integer vector in \a __V and returns
1338 ///    the 16-bit values in the corresponding elements of a 256-bit vector
1339 ///    of [16 x i16].
1340 ///
1341 /// \code{.operation}
1342 /// FOR i := 0 TO 15
1343 ///   j := i*8
1344 ///   k := i*16
1345 ///   result[k+15:k] := SignExtend(__V[j+7:j])
1346 /// ENDFOR
1347 /// \endcode
1348 ///
1349 /// \headerfile <immintrin.h>
1350 ///
1351 /// This intrinsic corresponds to the \c VPMOVSXBW instruction.
1352 ///
1353 /// \param __V
1354 ///    A 128-bit integer vector containing the source bytes.
1355 /// \returns A 256-bit vector of [16 x i16] containing the sign-extended
1356 ///    values.
1357 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1358 _mm256_cvtepi8_epi16(__m128i __V)
1359 {
1360   /* This function always performs a signed extension, but __v16qi is a char
1361      which may be signed or unsigned, so use __v16qs. */
1362   return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi);
1363 }
1364
1365 /// Sign-extends bytes from the lower half of the 128-bit integer vector in
1366 ///    \a __V and returns the 32-bit values in the corresponding elements of a
1367 ///    256-bit vector of [8 x i32].
1368 ///
1369 /// \code{.operation}
1370 /// FOR i := 0 TO 7
1371 ///   j := i*8
1372 ///   k := i*32
1373 ///   result[k+31:k] := SignExtend(__V[j+7:j])
1374 /// ENDFOR
1375 /// \endcode
1376 ///
1377 /// \headerfile <immintrin.h>
1378 ///
1379 /// This intrinsic corresponds to the \c VPMOVSXBD instruction.
1380 ///
1381 /// \param __V
1382 ///    A 128-bit integer vector containing the source bytes.
1383 /// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1384 ///    values.
1385 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1386 _mm256_cvtepi8_epi32(__m128i __V)
1387 {
1388   /* This function always performs a signed extension, but __v16qi is a char
1389      which may be signed or unsigned, so use __v16qs. */
1390   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
1391 }
1392
1393 /// Sign-extends the first four bytes from the 128-bit integer vector in
1394 ///    \a __V and returns the 64-bit values in the corresponding elements of a
1395 ///    256-bit vector of [4 x i64].
1396 ///
1397 /// \code{.operation}
1398 /// result[63:0] := SignExtend(__V[7:0])
1399 /// result[127:64] := SignExtend(__V[15:8])
1400 /// result[191:128] := SignExtend(__V[23:16])
1401 /// result[255:192] := SignExtend(__V[31:24])
1402 /// \endcode
1403 ///
1404 /// \headerfile <immintrin.h>
1405 ///
1406 /// This intrinsic corresponds to the \c VPMOVSXBQ instruction.
1407 ///
1408 /// \param __V
1409 ///    A 128-bit integer vector containing the source bytes.
1410 /// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1411 ///    values.
1412 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1413 _mm256_cvtepi8_epi64(__m128i __V)
1414 {
1415   /* This function always performs a signed extension, but __v16qi is a char
1416      which may be signed or unsigned, so use __v16qs. */
1417   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di);
1418 }
1419
1420 /// Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1421 ///    \a __V and returns the 32-bit values in the corresponding elements of a
1422 ///    256-bit vector of [8 x i32].
1423 ///
1424 /// \code{.operation}
1425 /// FOR i := 0 TO 7
1426 ///   j := i*16
1427 ///   k := i*32
1428 ///   result[k+31:k] := SignExtend(__V[j+15:j])
1429 /// ENDFOR
1430 /// \endcode
1431 ///
1432 /// \headerfile <immintrin.h>
1433 ///
1434 /// This intrinsic corresponds to the \c VPMOVSXWD instruction.
1435 ///
1436 /// \param __V
1437 ///    A 128-bit vector of [8 x i16] containing the source values.
1438 /// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1439 ///    values.
1440 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1441 _mm256_cvtepi16_epi32(__m128i __V)
1442 {
1443   return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si);
1444 }
1445
1446 /// Sign-extends 16-bit elements from the lower half of the 128-bit vector of
1447 ///    [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1448 ///    elements of a 256-bit vector of [4 x i64].
1449 ///
1450 /// \code{.operation}
1451 /// result[63:0] := SignExtend(__V[15:0])
1452 /// result[127:64] := SignExtend(__V[31:16])
1453 /// result[191:128] := SignExtend(__V[47:32])
1454 /// result[255:192] := SignExtend(__V[64:48])
1455 /// \endcode
1456 ///
1457 /// \headerfile <immintrin.h>
1458 ///
1459 /// This intrinsic corresponds to the \c VPMOVSXWQ instruction.
1460 ///
1461 /// \param __V
1462 ///    A 128-bit vector of [8 x i16] containing the source values.
1463 /// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1464 ///    values.
1465 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1466 _mm256_cvtepi16_epi64(__m128i __V)
1467 {
1468   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di);
1469 }
1470
1471 /// Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1472 ///    \a __V and returns the 64-bit values in the corresponding elements of a
1473 ///    256-bit vector of [4 x i64].
1474 ///
1475 /// \code{.operation}
1476 /// result[63:0] := SignExtend(__V[31:0])
1477 /// result[127:64] := SignExtend(__V[63:32])
1478 /// result[191:128] := SignExtend(__V[95:64])
1479 /// result[255:192] := SignExtend(__V[127:96])
1480 /// \endcode
1481 ///
1482 /// \headerfile <immintrin.h>
1483 ///
1484 /// This intrinsic corresponds to the \c VPMOVSXDQ instruction.
1485 ///
1486 /// \param __V
1487 ///    A 128-bit vector of [4 x i32] containing the source values.
1488 /// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1489 ///    values.
1490 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1491 _mm256_cvtepi32_epi64(__m128i __V)
1492 {
1493   return (__m256i)__builtin_convertvector((__v4si)__V, __v4di);
1494 }
1495
1496 /// Zero-extends bytes from the 128-bit integer vector in \a __V and returns
1497 ///    the 16-bit values in the corresponding elements of a 256-bit vector
1498 ///    of [16 x i16].
1499 ///
1500 /// \code{.operation}
1501 /// FOR i := 0 TO 15
1502 ///   j := i*8
1503 ///   k := i*16
1504 ///   result[k+15:k] := ZeroExtend(__V[j+7:j])
1505 /// ENDFOR
1506 /// \endcode
1507 ///
1508 /// \headerfile <immintrin.h>
1509 ///
1510 /// This intrinsic corresponds to the \c VPMOVZXBW instruction.
1511 ///
1512 /// \param __V
1513 ///    A 128-bit integer vector containing the source bytes.
1514 /// \returns A 256-bit vector of [16 x i16] containing the zero-extended
1515 ///    values.
1516 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1517 _mm256_cvtepu8_epi16(__m128i __V)
1518 {
1519   return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi);
1520 }
1521
1522 /// Zero-extends bytes from the lower half of the 128-bit integer vector in
1523 ///    \a __V and returns the 32-bit values in the corresponding elements of a
1524 ///    256-bit vector of [8 x i32].
1525 ///
1526 /// \code{.operation}
1527 /// FOR i := 0 TO 7
1528 ///   j := i*8
1529 ///   k := i*32
1530 ///   result[k+31:k] := ZeroExtend(__V[j+7:j])
1531 /// ENDFOR
1532 /// \endcode
1533 ///
1534 /// \headerfile <immintrin.h>
1535 ///
1536 /// This intrinsic corresponds to the \c VPMOVZXBD instruction.
1537 ///
1538 /// \param __V
1539 ///    A 128-bit integer vector containing the source bytes.
1540 /// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1541 ///    values.
1542 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1543 _mm256_cvtepu8_epi32(__m128i __V)
1544 {
1545   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
1546 }
1547
1548 /// Zero-extends the first four bytes from the 128-bit integer vector in
1549 ///    \a __V and returns the 64-bit values in the corresponding elements of a
1550 ///    256-bit vector of [4 x i64].
1551 ///
1552 /// \code{.operation}
1553 /// result[63:0] := ZeroExtend(__V[7:0])
1554 /// result[127:64] := ZeroExtend(__V[15:8])
1555 /// result[191:128] := ZeroExtend(__V[23:16])
1556 /// result[255:192] := ZeroExtend(__V[31:24])
1557 /// \endcode
1558 ///
1559 /// \headerfile <immintrin.h>
1560 ///
1561 /// This intrinsic corresponds to the \c VPMOVZXBQ instruction.
1562 ///
1563 /// \param __V
1564 ///    A 128-bit integer vector containing the source bytes.
1565 /// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1566 ///    values.
1567 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1568 _mm256_cvtepu8_epi64(__m128i __V)
1569 {
1570   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di);
1571 }
1572
1573 /// Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1574 ///    \a __V and returns the 32-bit values in the corresponding elements of a
1575 ///    256-bit vector of [8 x i32].
1576 ///
1577 /// \code{.operation}
1578 /// FOR i := 0 TO 7
1579 ///   j := i*16
1580 ///   k := i*32
1581 ///   result[k+31:k] := ZeroExtend(__V[j+15:j])
1582 /// ENDFOR
1583 /// \endcode
1584 ///
1585 /// \headerfile <immintrin.h>
1586 ///
1587 /// This intrinsic corresponds to the \c VPMOVZXWD instruction.
1588 ///
1589 /// \param __V
1590 ///    A 128-bit vector of [8 x i16] containing the source values.
1591 /// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1592 ///    values.
1593 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1594 _mm256_cvtepu16_epi32(__m128i __V) {
1595   __v8hu __words = (__v8hu)__V; /* every lane of the source is converted */
1596   return (__m256i)__builtin_convertvector(__words, __v8si);
1597 }
1598
1599 /// Zero-extends 16-bit elements from the lower half of the 128-bit vector of
1600 ///    [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1601 ///    elements of a 256-bit vector of [4 x i64].
1602 ///
1603 /// \code{.operation}
1604 /// result[63:0] := ZeroExtend(__V[15:0])
1605 /// result[127:64] := ZeroExtend(__V[31:16])
1606 /// result[191:128] := ZeroExtend(__V[47:32])
1607 /// result[255:192] := ZeroExtend(__V[63:48])
1608 /// \endcode
1609 ///
1610 /// \headerfile <immintrin.h>
1611 ///
1612 /// This intrinsic corresponds to the \c VPMOVZXWQ instruction.
1613 ///
1614 /// \param __V
1615 ///    A 128-bit vector of [8 x i16] containing the source values.
1616 /// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1617 ///    values.
1618 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1619 _mm256_cvtepu16_epi64(__m128i __V) {
1620   __v8hu __words = (__v8hu)__V; /* only lanes 0..3 are consumed */
1621   return (__m256i)__builtin_convertvector(__builtin_shufflevector(__words, __words, 0, 1, 2, 3), __v4di);
1622 }
1623
1624 /// Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1625 ///    \a __V and returns the 64-bit values in the corresponding elements of a
1626 ///    256-bit vector of [4 x i64].
1627 ///
1628 /// \code{.operation}
1629 /// result[63:0] := ZeroExtend(__V[31:0])
1630 /// result[127:64] := ZeroExtend(__V[63:32])
1631 /// result[191:128] := ZeroExtend(__V[95:64])
1632 /// result[255:192] := ZeroExtend(__V[127:96])
1633 /// \endcode
1634 ///
1635 /// \headerfile <immintrin.h>
1636 ///
1637 /// This intrinsic corresponds to the \c VPMOVZXDQ instruction.
1638 ///
1639 /// \param __V
1640 ///    A 128-bit vector of [4 x i32] containing the source values.
1641 /// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1642 ///    values.
1643 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1644 _mm256_cvtepu32_epi64(__m128i __V) {
1645   __v4su __dwords = (__v4su)__V; /* every lane of the source is converted */
1646   return (__m256i)__builtin_convertvector(__dwords, __v4di);
1647 }
1648
1649 /// Multiplies signed 32-bit integers from even-numbered elements of two
1650 ///    256-bit vectors of [8 x i32] and returns the 64-bit products in the
1651 ///    [4 x i64] result.
1652 ///
1653 /// \code{.operation}
1654 /// result[63:0] := __a[31:0] * __b[31:0]
1655 /// result[127:64] := __a[95:64] * __b[95:64]
1656 /// result[191:128] := __a[159:128] * __b[159:128]
1657 /// result[255:192] := __a[223:192] * __b[223:192]
1658 /// \endcode
1659 ///
1660 /// \headerfile <immintrin.h>
1661 ///
1662 /// This intrinsic corresponds to the \c VPMULDQ instruction.
1663 ///
1664 /// \param __a
1665 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
1666 /// \param __b
1667 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
1668 /// \returns A 256-bit vector of [4 x i64] containing the products.
1669 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1670 _mm256_mul_epi32(__m256i __a, __m256i __b) {
1671   __v8si __lhs = (__v8si)__a, __rhs = (__v8si)__b;
1672   return (__m256i)__builtin_ia32_pmuldq256(__lhs, __rhs);
1673 }
1674
1675 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1676 ///    [16 x i16], truncates the 32-bit results to the most significant 18
1677 ///    bits, rounds by adding 1, and returns bits [16:1] of each rounded
1678 ///    product in the [16 x i16] result.
1679 ///
1680 /// \code{.operation}
1681 /// FOR i := 0 TO 15
1682 ///   j := i*16
1683 ///   result[j+15:j] := (((__a[j+15:j] * __b[j+15:j]) >> 14) + 1)[16:1]
1684 /// ENDFOR
1685 /// \endcode
1686 ///
1687 /// \headerfile <immintrin.h>
1688 ///
1689 /// This intrinsic corresponds to the \c VPMULHRSW instruction.
1690 ///
1691 /// \param __a
1692 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1693 /// \param __b
1694 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1695 /// \returns A 256-bit vector of [16 x i16] containing the rounded products.
1696 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1697 _mm256_mulhrs_epi16(__m256i __a, __m256i __b) {
1698   __v16hi __lhs = (__v16hi)__a, __rhs = (__v16hi)__b;
1699   return (__m256i)__builtin_ia32_pmulhrsw256(__lhs, __rhs);
1700 }
1701
1702 /// Multiplies unsigned 16-bit integer elements of two 256-bit vectors of
1703 ///    [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1704 ///    [16 x i16] result.
1705 ///
1706 /// \headerfile <immintrin.h>
1707 ///
1708 /// This intrinsic corresponds to the \c VPMULHUW instruction.
1709 ///
1710 /// \param __a
1711 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1712 /// \param __b
1713 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1714 /// \returns A 256-bit vector of [16 x i16] containing the products.
1715 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1716 _mm256_mulhi_epu16(__m256i __a, __m256i __b) {
1717   __v16hi __lhs = (__v16hi)__a, __rhs = (__v16hi)__b;
1718   return (__m256i)__builtin_ia32_pmulhuw256(__lhs, __rhs);
1719 }
1720
1721 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1722 ///    [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1723 ///    [16 x i16] result.
1724 ///
1725 /// \headerfile <immintrin.h>
1726 ///
1727 /// This intrinsic corresponds to the \c VPMULHW instruction.
1728 ///
1729 /// \param __a
1730 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1731 /// \param __b
1732 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1733 /// \returns A 256-bit vector of [16 x i16] containing the products.
1734 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1735 _mm256_mulhi_epi16(__m256i __a, __m256i __b) {
1736   __v16hi __lhs = (__v16hi)__a, __rhs = (__v16hi)__b;
1737   return (__m256i)__builtin_ia32_pmulhw256(__lhs, __rhs);
1738 }
1739
1740 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1741 ///    [16 x i16], and returns the lower 16 bits of each 32-bit product in the
1742 ///    [16 x i16] result.
1743 ///
1744 /// \headerfile <immintrin.h>
1745 ///
1746 /// This intrinsic corresponds to the \c VPMULLW instruction.
1747 ///
1748 /// \param __a
1749 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1750 /// \param __b
1751 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1752 /// \returns A 256-bit vector of [16 x i16] containing the products.
1753 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1754 _mm256_mullo_epi16(__m256i __a, __m256i __b) {
1755   __v16hu __lhs = (__v16hu)__a, __rhs = (__v16hu)__b;
1756   return (__m256i)(__lhs * __rhs);
1757 }
1758
1759 /// Multiplies signed 32-bit integer elements of two 256-bit vectors of
1760 ///    [8 x i32], and returns the lower 32 bits of each 64-bit product in the
1761 ///    [8 x i32] result.
1762 ///
1763 /// \headerfile <immintrin.h>
1764 ///
1765 /// This intrinsic corresponds to the \c VPMULLD instruction.
1766 ///
1767 /// \param __a
1768 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
1769 /// \param __b
1770 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
1771 /// \returns A 256-bit vector of [8 x i32] containing the products.
1772 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1773 _mm256_mullo_epi32(__m256i __a, __m256i __b) {
1774   __v8su __lhs = (__v8su)__a, __rhs = (__v8su)__b;
1775   return (__m256i)(__lhs * __rhs);
1776 }
1777
1778 /// Multiplies unsigned 32-bit integers from even-numbered elements of two
1779 ///    256-bit vectors of [8 x i32] and returns the 64-bit products in the
1780 ///    [4 x i64] result.
1781 ///
1782 /// \code{.operation}
1783 /// result[63:0] := __a[31:0] * __b[31:0]
1784 /// result[127:64] := __a[95:64] * __b[95:64]
1785 /// result[191:128] := __a[159:128] * __b[159:128]
1786 /// result[255:192] := __a[223:192] * __b[223:192]
1787 /// \endcode
1788 ///
1789 /// \headerfile <immintrin.h>
1790 ///
1791 /// This intrinsic corresponds to the \c VPMULUDQ instruction.
1792 ///
1793 /// \param __a
1794 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
1795 /// \param __b
1796 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
1797 /// \returns A 256-bit vector of [4 x i64] containing the products.
1798 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1799 _mm256_mul_epu32(__m256i __a, __m256i __b) {
1800   /* Convert the builtin result to __m256i explicitly, as _mm256_mul_epi32 does. */
1801   return (__m256i)__builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b);
1802 }
1803
1804 /// Computes the bitwise OR of the 256-bit integer vectors in \a __a and
1805 ///    \a __b.
1806 ///
1807 /// \headerfile <immintrin.h>
1808 ///
1809 /// This intrinsic corresponds to the \c VPOR instruction.
1810 ///
1811 /// \param __a
1812 ///    A 256-bit integer vector.
1813 /// \param __b
1814 ///    A 256-bit integer vector.
1815 /// \returns A 256-bit integer vector containing the result.
1816 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1817 _mm256_or_si256(__m256i __a, __m256i __b) {
1818   __v4du __lhs = (__v4du)__a, __rhs = (__v4du)__b;
1819   return (__m256i)(__lhs | __rhs);
1820 }
1821
1822 /// Computes four sum of absolute difference (SAD) operations on sets of eight
1823 ///    unsigned 8-bit integers from the 256-bit integer vectors \a __a and
1824 ///    \a __b.
1825 ///
1826 ///    One SAD result is computed for each set of eight bytes from \a __a and
1827 ///    eight bytes from \a __b. The zero-extended SAD value is returned in the
1828 ///    corresponding 64-bit element of the result.
1829 ///
1830 ///    A single SAD operation takes the differences between the corresponding
1831 ///    bytes of \a __a and \a __b, takes the absolute value of each difference,
1832 ///    and sums these eight values to form one 16-bit result. This operation
1833 ///    is repeated four times with successive sets of eight bytes.
1834 ///
1835 /// \code{.operation}
1836 /// FOR i := 0 TO 3
1837 ///   j := i*64
1838 ///   temp0 := ABS(__a[j+7:j] - __b[j+7:j])
1839 ///   temp1 := ABS(__a[j+15:j+8] - __b[j+15:j+8])
1840 ///   temp2 := ABS(__a[j+23:j+16] - __b[j+23:j+16])
1841 ///   temp3 := ABS(__a[j+31:j+24] - __b[j+31:j+24])
1842 ///   temp4 := ABS(__a[j+39:j+32] - __b[j+39:j+32])
1843 ///   temp5 := ABS(__a[j+47:j+40] - __b[j+47:j+40])
1844 ///   temp6 := ABS(__a[j+55:j+48] - __b[j+55:j+48])
1845 ///   temp7 := ABS(__a[j+63:j+56] - __b[j+63:j+56])
1846 ///   result[j+15:j] := temp0 + temp1 + temp2 + temp3 +
1847 ///                     temp4 + temp5 + temp6 + temp7
1848 ///   result[j+63:j+16] := 0
1849 /// ENDFOR
1850 /// \endcode
1851 ///
1852 /// \headerfile <immintrin.h>
1853 ///
1854 /// This intrinsic corresponds to the \c VPSADBW instruction.
1855 ///
1856 /// \param __a
1857 ///    A 256-bit integer vector.
1858 /// \param __b
1859 ///    A 256-bit integer vector.
1860 /// \returns A 256-bit integer vector containing the result.
1861 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1862 _mm256_sad_epu8(__m256i __a, __m256i __b) {
1863   /* Convert the builtin result to __m256i explicitly, as sibling intrinsics do. */
1864   return (__m256i)__builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b);
1865 }
1866
1867 /// Shuffles 8-bit integers in the 256-bit integer vector \a __a according
1868 ///    to control information in the 256-bit integer vector \a __b, and
1869 ///    returns the 256-bit result. In effect there are two separate 128-bit
1870 ///    shuffles in the lower and upper halves.
1871 ///
1872 /// \code{.operation}
1873 /// FOR i := 0 TO 31
1874 ///   j := i*8
1875 ///   IF __b[j+7] == 1
1876 ///     result[j+7:j] := 0
1877 ///   ELSE
1878 ///     k := __b[j+3:j] * 8
1879 ///     IF i > 15
1880 ///       k := k + 128
1881 ///     FI
1882 ///     result[j+7:j] := __a[k+7:k]
1883 ///   FI
1884 /// ENDFOR
1885 /// \endcode
1886 ///
1887 /// \headerfile <immintrin.h>
1888 ///
1889 /// This intrinsic corresponds to the \c VPSHUFB instruction.
1890 ///
1891 /// \param __a
1892 ///    A 256-bit integer vector containing source values.
1893 /// \param __b
1894 ///    A 256-bit integer vector containing control information to determine
1895 ///    what goes into the corresponding byte of the result. If bit 7 of the
1896 ///    control byte is 1, the result byte is 0; otherwise, bits 3:0 of the
1897 ///    control byte specify the index (within the same 128-bit half) of \a __a
1898 ///    to copy to the result byte.
1899 /// \returns A 256-bit integer vector containing the result.
1900 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1901 _mm256_shuffle_epi8(__m256i __a, __m256i __b) {
1902   __v32qi __src = (__v32qi)__a, __ctrl = (__v32qi)__b;
1903   return (__m256i)__builtin_ia32_pshufb256(__src, __ctrl);
1904 }
1905
1906 /// Shuffles 32-bit integers from the 256-bit vector of [8 x i32] in \a a
1907 ///    according to control information in the integer literal \a imm, and
1908 ///    returns the 256-bit result. In effect there are two parallel 128-bit
1909 ///    shuffles in the lower and upper halves.
1910 ///
1911 /// \code{.operation}
1912 /// FOR i := 0 to 3
1913 ///   j := i*32
1914 ///   k := (imm >> i*2)[1:0] * 32
1915 ///   result[j+31:j] := a[k+31:k]
1916 ///   result[128+j+31:128+j] := a[128+k+31:128+k]
1917 /// ENDFOR
1918 /// \endcode
1919 ///
1920 /// \headerfile <immintrin.h>
1921 ///
1922 /// \code
1923 /// __m256i _mm256_shuffle_epi32(__m256i a, const int imm);
1924 /// \endcode
1925 ///
1926 /// This intrinsic corresponds to the \c VPSHUFD instruction.
1927 ///
1928 /// \param a
1929 ///    A 256-bit vector of [8 x i32] containing source values.
1930 /// \param imm
1931 ///    An immediate 8-bit value specifying which elements to copy from \a a.
1932 ///    \a imm[1:0] specifies the index in \a a for elements 0 and 4 of the
1933 ///    result, \a imm[3:2] specifies the index for elements 1 and 5, and so
1934 ///    forth.
1935 /// \returns A 256-bit vector of [8 x i32] containing the result.
1936 #define _mm256_shuffle_epi32(a, imm)                                           \
1937   ((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm)))
1938
1939 /// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a
1940 ///    according to control information in the integer literal \a imm, and
1941 ///    returns the 256-bit result. The upper 64 bits of each 128-bit half
1942 ///    are shuffled in parallel; the lower 64 bits of each 128-bit half are
1943 ///    copied from \a a unchanged.
1944 ///
1945 /// \code{.operation}
1946 /// result[63:0] := a[63:0]
1947 /// result[191:128] := a[191:128]
1948 /// FOR i := 0 TO 3
1949 ///   j := i * 16 + 64
1950 ///   k := (imm >> i*2)[1:0] * 16 + 64
1951 ///   result[j+15:j] := a[k+15:k]
1952 ///   result[128+j+15:128+j] := a[128+k+15:128+k]
1953 /// ENDFOR
1954 /// \endcode
1955 ///
1956 /// \headerfile <immintrin.h>
1957 ///
1958 /// \code
1959 /// __m256i _mm256_shufflehi_epi16(__m256i a, const int imm);
1960 /// \endcode
1961 ///
1962 /// This intrinsic corresponds to the \c VPSHUFHW instruction.
1963 ///
1964 /// \param a
1965 ///    A 256-bit vector of [16 x i16] containing source values.
1966 /// \param imm
1967 ///    An immediate 8-bit value specifying which elements to copy from \a a.
1968 ///    \a imm[1:0] specifies the index in \a a for elements 4 and 12 of the
1969 ///    result, \a imm[3:2] specifies the index for elements 5 and 13, and so
1970 ///    forth. Indexes are offset by 4 (so 0 means index 4, and so forth).
1971 /// \returns A 256-bit vector of [16 x i16] containing the result.
1972 #define _mm256_shufflehi_epi16(a, imm)                                         \
1973   ((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm)))
1974
1975 /// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] \a a
1976 ///    according to control information in the integer literal \a imm, and
1977 ///    returns the 256-bit [16 x i16] result. The lower 64 bits of each
1978 ///    128-bit half are shuffled; the upper 64 bits of each 128-bit half are
1979 ///    copied from \a a unchanged.
1980 ///
1981 /// \code{.operation}
1982 /// result[127:64] := a[127:64]
1983 /// result[255:192] := a[255:192]
1984 /// FOR i := 0 TO 3
1985 ///   j := i * 16
1986 ///   k := (imm >> i*2)[1:0] * 16
1987 ///   result[j+15:j] := a[k+15:k]
1988 ///   result[128+j+15:128+j] := a[128+k+15:128+k]
1989 /// ENDFOR
1990 /// \endcode
1991 ///
1992 /// \headerfile <immintrin.h>
1993 ///
1994 /// \code
1995 /// __m256i _mm256_shufflelo_epi16(__m256i a, const int imm);
1996 /// \endcode
1997 ///
1998 /// This intrinsic corresponds to the \c VPSHUFLW instruction.
1999 ///
2000 /// \param a
2001 ///    A 256-bit vector of [16 x i16] to use as a source of data for the
2002 ///    result.
2003 /// \param imm
2004 ///    An immediate 8-bit value specifying which elements to copy from \a a.
2005 ///    \a imm[1:0] specifies the index in \a a for elements 0 and 8 of the
2006 ///    result, \a imm[3:2] specifies the index for elements 1 and 9, and so
2007 ///    forth.
2008 /// \returns A 256-bit vector of [16 x i16] containing the result.
2009 #define _mm256_shufflelo_epi16(a, imm)                                         \
2010   ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm)))
2011
2012 /// Sets each byte of the result to the corresponding byte of the 256-bit
2013 ///    integer vector in \a __a, the negative of that byte, or zero, depending
2014 ///    on whether the corresponding byte of the 256-bit integer vector in
2015 ///    \a __b is greater than zero, less than zero, or equal to zero,
2016 ///    respectively.
2017 ///
2018 /// \headerfile <immintrin.h>
2019 ///
2020 /// This intrinsic corresponds to the \c VPSIGNB instruction.
2021 ///
2022 /// \param __a
2023 ///    A 256-bit integer vector.
2024 /// \param __b
2025 ///    A 256-bit integer vector.
2026 /// \returns A 256-bit integer vector containing the result.
2027 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2028 _mm256_sign_epi8(__m256i __a, __m256i __b) {
2029   __v32qi __vals = (__v32qi)__a, __signs = (__v32qi)__b;
2030   return (__m256i)__builtin_ia32_psignb256(__vals, __signs);
2031 }
2032
2033 /// Sets each element of the result to the corresponding element of the
2034 ///    256-bit vector of [16 x i16] in \a __a, the negative of that element,
2035 ///    or zero, depending on whether the corresponding element of the 256-bit
2036 ///    vector of [16 x i16] in \a __b is greater than zero, less than zero, or
2037 ///    equal to zero, respectively.
2038 ///
2039 /// \headerfile <immintrin.h>
2040 ///
2041 /// This intrinsic corresponds to the \c VPSIGNW instruction.
2042 ///
2043 /// \param __a
2044 ///    A 256-bit vector of [16 x i16].
2045 /// \param __b
2046 ///    A 256-bit vector of [16 x i16].
2047 /// \returns A 256-bit vector of [16 x i16] containing the result.
2048 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2049 _mm256_sign_epi16(__m256i __a, __m256i __b) {
2050   __v16hi __vals = (__v16hi)__a, __signs = (__v16hi)__b;
2051   return (__m256i)__builtin_ia32_psignw256(__vals, __signs);
2052 }
2053
2054 /// Sets each element of the result to the corresponding element of the
2055 ///    256-bit vector of [8 x i32] in \a __a, the negative of that element, or
2056 ///    zero, depending on whether the corresponding element of the 256-bit
2057 ///    vector of [8 x i32] in \a __b is greater than zero, less than zero, or
2058 ///    equal to zero, respectively.
2059 ///
2060 /// \headerfile <immintrin.h>
2061 ///
2062 /// This intrinsic corresponds to the \c VPSIGND instruction.
2063 ///
2064 /// \param __a
2065 ///    A 256-bit vector of [8 x i32].
2066 /// \param __b
2067 ///    A 256-bit vector of [8 x i32].
2068 /// \returns A 256-bit vector of [8 x i32] containing the result.
2069 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2070 _mm256_sign_epi32(__m256i __a, __m256i __b) {
2071   __v8si __vals = (__v8si)__a, __signs = (__v8si)__b;
2072   return (__m256i)__builtin_ia32_psignd256(__vals, __signs);
2073 }
2074
2075 /// Shifts each 128-bit half of the 256-bit integer vector \a a left by
2076 ///    \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
2077 ///    is greater than 15, the returned result is all zeroes.
2078 ///
2079 /// \headerfile <immintrin.h>
2080 ///
2081 /// \code
2082 /// __m256i _mm256_slli_si256(__m256i a, const int imm);
2083 /// \endcode
2084 ///
2085 /// This intrinsic corresponds to the \c VPSLLDQ instruction.
2086 ///
2087 /// \param a
2088 ///    A 256-bit integer vector to be shifted.
2089 /// \param imm
2090 ///    An unsigned immediate value specifying the shift count (in bytes).
2091 /// \returns A 256-bit integer vector containing the result.
2092 #define _mm256_slli_si256(a, imm)                                              \
2093   ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
2094
2095 /// Shifts each 128-bit half of the 256-bit integer vector \a a left by
2096 ///    \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
2097 ///    is greater than 15, the returned result is all zeroes.
2098 ///
2099 /// \headerfile <immintrin.h>
2100 ///
2101 /// \code
2102 /// __m256i _mm256_bslli_epi128(__m256i a, const int imm);
2103 /// \endcode
2104 ///
2105 /// This intrinsic corresponds to the \c VPSLLDQ instruction.
2106 ///
2107 /// \param a
2108 ///    A 256-bit integer vector to be shifted.
2109 /// \param imm
2110 ///    An unsigned immediate value specifying the shift count (in bytes).
2111 /// \returns A 256-bit integer vector containing the result.
2112 #define _mm256_bslli_epi128(a, imm)                                            \
2113   ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
2114
2115 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2116 ///    left by \a __count bits, shifting in zero bits, and returns the result.
2117 ///    If \a __count is greater than 15, the returned result is all zeroes.
2118 ///
2119 /// \headerfile <immintrin.h>
2120 ///
2121 /// This intrinsic corresponds to the \c VPSLLW instruction.
2122 ///
2123 /// \param __a
2124 ///    A 256-bit vector of [16 x i16] to be shifted.
2125 /// \param __count
2126 ///    An unsigned integer value specifying the shift count (in bits).
2127 /// \returns A 256-bit vector of [16 x i16] containing the result.
2128 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2129 _mm256_slli_epi16(__m256i __a, int __count) {
2130   __v16hi __elts = (__v16hi)__a;
2131   return (__m256i)__builtin_ia32_psllwi256(__elts, __count);
2132 }
2133
2134 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2135 ///    left by the number of bits specified by the lower 64 bits of \a __count,
2136 ///    shifting in zero bits, and returns the result. If \a __count is greater
2137 ///    than 15, the returned result is all zeroes.
2138 ///
2139 /// \headerfile <immintrin.h>
2140 ///
2141 /// This intrinsic corresponds to the \c VPSLLW instruction.
2142 ///
2143 /// \param __a
2144 ///    A 256-bit vector of [16 x i16] to be shifted.
2145 /// \param __count
2146 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2147 ///    shift count (in bits). The upper element is ignored.
2148 /// \returns A 256-bit vector of [16 x i16] containing the result.
2149 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2150 _mm256_sll_epi16(__m256i __a, __m128i __count) {
2151   __m256i __res = (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count);
2152   return __res;
2153 }
2154
2155 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2156 ///    left by \a __count bits, shifting in zero bits, and returns the result.
2157 ///    If \a __count is greater than 31, the returned result is all zeroes.
2158 ///
2159 /// \headerfile <immintrin.h>
2160 ///
2161 /// This intrinsic corresponds to the \c VPSLLD instruction.
2162 ///
2163 /// \param __a
2164 ///    A 256-bit vector of [8 x i32] to be shifted.
2165 /// \param __count
2166 ///    An unsigned integer value specifying the shift count (in bits).
2167 /// \returns A 256-bit vector of [8 x i32] containing the result.
2168 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2169 _mm256_slli_epi32(__m256i __a, int __count) {
2170   __v8si __elts = (__v8si)__a;
2171   return (__m256i)__builtin_ia32_pslldi256(__elts, __count);
2172 }
2173
2174 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2175 ///    left by the number of bits given in the lower 64 bits of \a __count,
2176 ///    shifting in zero bits, and returns the result. If \a __count is greater
2177 ///    than 31, the returned result is all zeroes.
2178 ///
2179 /// \headerfile <immintrin.h>
2180 ///
2181 /// This intrinsic corresponds to the \c VPSLLD instruction.
2182 ///
2183 /// \param __a
2184 ///    A 256-bit vector of [8 x i32] to be shifted.
2185 /// \param __count
2186 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2187 ///    shift count (in bits). The upper element is ignored.
2188 /// \returns A 256-bit vector of [8 x i32] containing the result.
2189 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2190 _mm256_sll_epi32(__m256i __a, __m128i __count) {
2191   __m256i __res = (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count);
2192   return __res;
2193 }
2194
2195 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2196 ///    left by \a __count bits, shifting in zero bits, and returns the result.
2197 ///    If \a __count is greater than 63, the returned result is all zeroes.
2198 ///
2199 /// \headerfile <immintrin.h>
2200 ///
2201 /// This intrinsic corresponds to the \c VPSLLQ instruction.
2202 ///
2203 /// \param __a
2204 ///    A 256-bit vector of [4 x i64] to be shifted.
2205 /// \param __count
2206 ///    An unsigned integer value specifying the shift count (in bits).
2207 /// \returns A 256-bit vector of [4 x i64] containing the result.
2208 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2209 _mm256_slli_epi64(__m256i __a, int __count) {
2210   /* Explicit __m256i cast on the result, matching _mm256_slli_epi16/_epi32. */
2211   return (__m256i)__builtin_ia32_psllqi256((__v4di)__a, __count);
2212 }
2213
2214 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2215 ///    left by the number of bits given in the lower 64 bits of \a __count,
2216 ///    shifting in zero bits, and returns the result. If \a __count is greater
2217 ///    than 63, the returned result is all zeroes.
2218 ///
2219 /// \headerfile <immintrin.h>
2220 ///
2221 /// This intrinsic corresponds to the \c VPSLLQ instruction.
2222 ///
2223 /// \param __a
2224 ///    A 256-bit vector of [4 x i64] to be shifted.
2225 /// \param __count
2226 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2227 ///    shift count (in bits). The upper element is ignored.
2228 /// \returns A 256-bit vector of [4 x i64] containing the result.
2229 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2230 _mm256_sll_epi64(__m256i __a, __m128i __count) {
2231   /* Explicit __m256i cast on the result, matching _mm256_sll_epi16/_epi32. */
2232   return (__m256i)__builtin_ia32_psllq256((__v4di)__a, __count);
2233 }
2234
2235 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2236 ///    right by \a __count bits, shifting in sign bits, and returns the result.
2237 ///    If \a __count is greater than 15, each element of the result is either
2238 ///    0 or -1 according to the corresponding input sign bit.
2239 ///
2240 /// \headerfile <immintrin.h>
2241 ///
2242 /// This intrinsic corresponds to the \c VPSRAW instruction.
2243 ///
2244 /// \param __a
2245 ///    A 256-bit vector of [16 x i16] to be shifted.
2246 /// \param __count
2247 ///    An unsigned integer value specifying the shift count (in bits).
2248 /// \returns A 256-bit vector of [16 x i16] containing the result.
2249 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2250 _mm256_srai_epi16(__m256i __a, int __count) {
2251   __v16hi __elts = (__v16hi)__a;
2252   return (__m256i)__builtin_ia32_psrawi256(__elts, __count);
2253 }
2254
2255 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2256 ///    right by the number of bits given in the lower 64 bits of \a __count,
2257 ///    shifting in sign bits, and returns the result. If \a __count is greater
2258 ///    than 15, each element of the result is either 0 or -1 according to the
2259 ///    corresponding input sign bit.
2260 ///
2261 /// \headerfile <immintrin.h>
2262 ///
2263 /// This intrinsic corresponds to the \c VPSRAW instruction.
2264 ///
2265 /// \param __a
2266 ///    A 256-bit vector of [16 x i16] to be shifted.
2267 /// \param __count
2268 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2269 ///    shift count (in bits). The upper element is ignored.
2270 /// \returns A 256-bit vector of [16 x i16] containing the result.
2271 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2272 _mm256_sra_epi16(__m256i __a, __m128i __count) {
2273   __m256i __res = (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count);
2274   return __res;
2275 }
2276
2277 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2278 ///    right by \a __count bits, shifting in sign bits, and returns the result.
2279 ///    If \a __count is greater than 31, each element of the result is either
2280 ///    0 or -1 according to the corresponding input sign bit.
2281 ///
2282 /// \headerfile <immintrin.h>
2283 ///
2284 /// This intrinsic corresponds to the \c VPSRAD instruction.
2285 ///
2286 /// \param __a
2287 ///    A 256-bit vector of [8 x i32] to be shifted.
2288 /// \param __count
2289 ///    An unsigned integer value specifying the shift count (in bits).
2290 /// \returns A 256-bit vector of [8 x i32] containing the result.
2291 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2292 _mm256_srai_epi32(__m256i __a, int __count) {
2293   __v8si __elts = (__v8si)__a;
2294   return (__m256i)__builtin_ia32_psradi256(__elts, __count);
2295 }
2296
2297 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2298 ///    right by the number of bits given in the lower 64 bits of \a __count,
2299 ///    shifting in sign bits, and returns the result. If \a __count is greater
2300 ///    than 31, each element of the result is either 0 or -1 according to the
2301 ///    corresponding input sign bit.
2302 ///
2303 /// \headerfile <immintrin.h>
2304 ///
2305 /// This intrinsic corresponds to the \c VPSRAD instruction.
2306 ///
2307 /// \param __a
2308 ///    A 256-bit vector of [8 x i32] to be shifted.
2309 /// \param __count
2310 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2311 ///    shift count (in bits). The upper element is ignored.
2312 /// \returns A 256-bit vector of [8 x i32] containing the result.
2313 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2314 _mm256_sra_epi32(__m256i __a, __m128i __count) {
2315   __m256i __res = (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
2316   return __res;
2317 }
2318
2319 /// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
2320 ///    \a imm bytes, shifting in zero bytes, and returns the result. If
2321 ///    \a imm is greater than 15, the returned result is all zeroes.
2322 ///
2323 /// \headerfile <immintrin.h>
2324 ///
2325 /// \code
2326 /// __m256i _mm256_srli_si256(__m256i a, const int imm);
2327 /// \endcode
2328 ///
2329 /// This intrinsic corresponds to the \c VPSRLDQ instruction.
2330 ///
2331 /// \param a
2332 ///    A 256-bit integer vector to be shifted.
2333 /// \param imm
2334 ///    An unsigned immediate value specifying the shift count (in bytes).
2335 /// \returns A 256-bit integer vector containing the result.
2336 #define _mm256_srli_si256(a, imm)                                              \
2337   ((__m256i)__builtin_ia32_psrldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
2338
2339 /// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
2340 ///    \a imm bytes, shifting in zero bytes, and returns the result. If
2341 ///    \a imm is greater than 15, the returned result is all zeroes.
2342 ///
2343 /// \headerfile <immintrin.h>
2344 ///
2345 /// \code
2346 /// __m256i _mm256_bsrli_epi128(__m256i a, const int imm);
2347 /// \endcode
2348 ///
2349 /// This intrinsic corresponds to the \c VPSRLDQ instruction.
2350 ///
2351 /// \param a
2352 ///    A 256-bit integer vector to be shifted.
2353 /// \param imm
2354 ///    An unsigned immediate value specifying the shift count (in bytes).
2355 /// \returns A 256-bit integer vector containing the result.
2356 #define _mm256_bsrli_epi128(a, imm) \
2357   ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))
2358
2359 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2360 ///    right by \a __count bits, shifting in zero bits, and returns the result.
2361 ///    If \a __count is greater than 15, the returned result is all zeroes.
2362 ///
2363 /// \headerfile <immintrin.h>
2364 ///
2365 /// This intrinsic corresponds to the \c VPSRLW instruction.
2366 ///
2367 /// \param __a
2368 ///    A 256-bit vector of [16 x i16] to be shifted.
2369 /// \param __count
2370 ///    An unsigned integer value specifying the shift count (in bits).
2371 /// \returns A 256-bit vector of [16 x i16] containing the result.
2372 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2373 _mm256_srli_epi16(__m256i __a, int __count)
2374 {
2375   return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count);
2376 }
2377
2378 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2379 ///    right by the number of bits given in the lower 64 bits of \a __count,
2380 ///    shifting in zero bits, and returns the result. If \a __count is greater
2381 ///    than 15, the returned result is all zeroes.
2382 ///
2383 /// \headerfile <immintrin.h>
2384 ///
2385 /// This intrinsic corresponds to the \c VPSRLW instruction.
2386 ///
2387 /// \param __a
2388 ///    A 256-bit vector of [16 x i16] to be shifted.
2389 /// \param __count
2390 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2391 ///    shift count (in bits). The upper element is ignored.
2392 /// \returns A 256-bit vector of [16 x i16] containing the result.
2393 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2394 _mm256_srl_epi16(__m256i __a, __m128i __count)
2395 {
2396   return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count);
2397 }
2398
2399 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2400 ///    right by \a __count bits, shifting in zero bits, and returns the result.
2401 ///    If \a __count is greater than 31, the returned result is all zeroes.
2402 ///
2403 /// \headerfile <immintrin.h>
2404 ///
2405 /// This intrinsic corresponds to the \c VPSRLD instruction.
2406 ///
2407 /// \param __a
2408 ///    A 256-bit vector of [8 x i32] to be shifted.
2409 /// \param __count
2410 ///    An unsigned integer value specifying the shift count (in bits).
2411 /// \returns A 256-bit vector of [8 x i32] containing the result.
2412 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2413 _mm256_srli_epi32(__m256i __a, int __count)
2414 {
2415   return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count);
2416 }
2417
2418 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2419 ///    right by the number of bits given in the lower 64 bits of \a __count,
2420 ///    shifting in zero bits, and returns the result. If \a __count is greater
2421 ///    than 31, the returned result is all zeroes.
2422 ///
2423 /// \headerfile <immintrin.h>
2424 ///
2425 /// This intrinsic corresponds to the \c VPSRLD instruction.
2426 ///
2427 /// \param __a
2428 ///    A 256-bit vector of [8 x i32] to be shifted.
2429 /// \param __count
2430 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2431 ///    shift count (in bits). The upper element is ignored.
2432 /// \returns A 256-bit vector of [8 x i32] containing the result.
2433 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2434 _mm256_srl_epi32(__m256i __a, __m128i __count)
2435 {
2436   return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count);
2437 }
2438
2439 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2440 ///    right by \a __count bits, shifting in zero bits, and returns the result.
2441 ///    If \a __count is greater than 63, the returned result is all zeroes.
2442 ///
2443 /// \headerfile <immintrin.h>
2444 ///
2445 /// This intrinsic corresponds to the \c VPSRLQ instruction.
2446 ///
2447 /// \param __a
2448 ///    A 256-bit vector of [4 x i64] to be shifted.
2449 /// \param __count
2450 ///    An unsigned integer value specifying the shift count (in bits).
2451 /// \returns A 256-bit vector of [4 x i64] containing the result.
2452 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2453 _mm256_srli_epi64(__m256i __a, int __count)
2454 {
2455   return __builtin_ia32_psrlqi256((__v4di)__a, __count);
2456 }
2457
2458 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2459 ///    right by the number of bits given in the lower 64 bits of \a __count,
2460 ///    shifting in zero bits, and returns the result. If \a __count is greater
2461 ///    than 63, the returned result is all zeroes.
2462 ///
2463 /// \headerfile <immintrin.h>
2464 ///
2465 /// This intrinsic corresponds to the \c VPSRLQ instruction.
2466 ///
2467 /// \param __a
2468 ///    A 256-bit vector of [4 x i64] to be shifted.
2469 /// \param __count
2470 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2471 ///    shift count (in bits). The upper element is ignored.
2472 /// \returns A 256-bit vector of [4 x i64] containing the result.
2473 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2474 _mm256_srl_epi64(__m256i __a, __m128i __count)
2475 {
2476   return __builtin_ia32_psrlq256((__v4di)__a, __count);
2477 }
2478
2479 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2480 ///    vectors. Returns the lower 8 bits of each difference in the
2481 ///    corresponding byte of the 256-bit integer vector result (overflow is
2482 ///    ignored).
2483 ///
2484 /// \code{.operation}
2485 /// FOR i := 0 TO 31
2486 ///   j := i*8
2487 ///   result[j+7:j] := __a[j+7:j] - __b[j+7:j]
2488 /// ENDFOR
2489 /// \endcode
2490 ///
2491 /// \headerfile <immintrin.h>
2492 ///
2493 /// This intrinsic corresponds to the \c VPSUBB instruction.
2494 ///
2495 /// \param __a
2496 ///    A 256-bit integer vector containing the minuends.
2497 /// \param __b
2498 ///    A 256-bit integer vector containing the subtrahends.
2499 /// \returns A 256-bit integer vector containing the differences.
2500 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2501 _mm256_sub_epi8(__m256i __a, __m256i __b)
2502 {
2503   return (__m256i)((__v32qu)__a - (__v32qu)__b);
2504 }
2505
2506 /// Subtracts 16-bit integers from corresponding elements of two 256-bit
2507 ///    vectors of [16 x i16]. Returns the lower 16 bits of each difference in
2508 ///    the corresponding element of the [16 x i16] result (overflow is
2509 ///    ignored).
2510 ///
2511 /// \code{.operation}
2512 /// FOR i := 0 TO 15
2513 ///   j := i*16
2514 ///   result[j+15:j] := __a[j+15:j] - __b[j+15:j]
2515 /// ENDFOR
2516 /// \endcode
2517 ///
2518 /// \headerfile <immintrin.h>
2519 ///
2520 /// This intrinsic corresponds to the \c VPSUBW instruction.
2521 ///
2522 /// \param __a
2523 ///    A 256-bit vector of [16 x i16] containing the minuends.
2524 /// \param __b
2525 ///    A 256-bit vector of [16 x i16] containing the subtrahends.
2526 /// \returns A 256-bit vector of [16 x i16] containing the differences.
2527 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2528 _mm256_sub_epi16(__m256i __a, __m256i __b)
2529 {
2530   return (__m256i)((__v16hu)__a - (__v16hu)__b);
2531 }
2532
2533 /// Subtracts 32-bit integers from corresponding elements of two 256-bit
2534 ///    vectors of [8 x i32]. Returns the lower 32 bits of each difference in
2535 ///    the corresponding element of the [8 x i32] result (overflow is ignored).
2536 ///
2537 /// \code{.operation}
2538 /// FOR i := 0 TO 7
2539 ///   j := i*32
2540 ///   result[j+31:j] := __a[j+31:j] - __b[j+31:j]
2541 /// ENDFOR
2542 /// \endcode
2543 ///
2544 /// \headerfile <immintrin.h>
2545 ///
2546 /// This intrinsic corresponds to the \c VPSUBD instruction.
2547 ///
2548 /// \param __a
2549 ///    A 256-bit vector of [8 x i32] containing the minuends.
2550 /// \param __b
2551 ///    A 256-bit vector of [8 x i32] containing the subtrahends.
2552 /// \returns A 256-bit vector of [8 x i32] containing the differences.
2553 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2554 _mm256_sub_epi32(__m256i __a, __m256i __b)
2555 {
2556   return (__m256i)((__v8su)__a - (__v8su)__b);
2557 }
2558
2559 /// Subtracts 64-bit integers from corresponding elements of two 256-bit
2560 ///    vectors of [4 x i64]. Returns the lower 64 bits of each difference in
2561 ///    the corresponding element of the [4 x i64] result (overflow is ignored).
2562 ///
2563 /// \code{.operation}
2564 /// FOR i := 0 TO 3
2565 ///   j := i*64
2566 ///   result[j+63:j] := __a[j+63:j] - __b[j+63:j]
2567 /// ENDFOR
2568 /// \endcode
2569 ///
2570 /// \headerfile <immintrin.h>
2571 ///
2572 /// This intrinsic corresponds to the \c VPSUBQ instruction.
2573 ///
2574 /// \param __a
2575 ///    A 256-bit vector of [4 x i64] containing the minuends.
2576 /// \param __b
2577 ///    A 256-bit vector of [4 x i64] containing the subtrahends.
2578 /// \returns A 256-bit vector of [4 x i64] containing the differences.
2579 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2580 _mm256_sub_epi64(__m256i __a, __m256i __b)
2581 {
2582   return (__m256i)((__v4du)__a - (__v4du)__b);
2583 }
2584
2585 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2586 ///    vectors using signed saturation, and returns each differences in the
2587 ///    corresponding byte of the 256-bit integer vector result.
2588 ///
2589 /// \code{.operation}
2590 /// FOR i := 0 TO 31
2591 ///   j := i*8
2592 ///   result[j+7:j] := SATURATE8(__a[j+7:j] - __b[j+7:j])
2593 /// ENDFOR
2594 /// \endcode
2595 ///
2596 /// \headerfile <immintrin.h>
2597 ///
2598 /// This intrinsic corresponds to the \c VPSUBSB instruction.
2599 ///
2600 /// \param __a
2601 ///    A 256-bit integer vector containing the minuends.
2602 /// \param __b
2603 ///    A 256-bit integer vector containing the subtrahends.
2604 /// \returns A 256-bit integer vector containing the differences.
2605 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2606 _mm256_subs_epi8(__m256i __a, __m256i __b)
2607 {
2608   return (__m256i)__builtin_elementwise_sub_sat((__v32qs)__a, (__v32qs)__b);
2609 }
2610
2611 /// Subtracts 16-bit integers from corresponding elements of two 256-bit
2612 ///    vectors of [16 x i16] using signed saturation, and returns each
2613 ///    difference in the corresponding element of the [16 x i16] result.
2614 ///
2615 /// \code{.operation}
2616 /// FOR i := 0 TO 15
2617 ///   j := i*16
2618 ///   result[j+7:j] := SATURATE16(__a[j+7:j] - __b[j+7:j])
2619 /// ENDFOR
2620 /// \endcode
2621 ///
2622 /// \headerfile <immintrin.h>
2623 ///
2624 /// This intrinsic corresponds to the \c VPSUBSW instruction.
2625 ///
2626 /// \param __a
2627 ///    A 256-bit vector of [16 x i16] containing the minuends.
2628 /// \param __b
2629 ///    A 256-bit vector of [16 x i16] containing the subtrahends.
2630 /// \returns A 256-bit vector of [16 x i16] containing the differences.
2631 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2632 _mm256_subs_epi16(__m256i __a, __m256i __b)
2633 {
2634   return (__m256i)__builtin_elementwise_sub_sat((__v16hi)__a, (__v16hi)__b);
2635 }
2636
2637 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2638 ///    vectors using unsigned saturation, and returns each difference in the
2639 ///    corresponding byte of the 256-bit integer vector result. For each byte,
2640 ///    computes <c> result = __a - __b </c>.
2641 ///
2642 /// \code{.operation}
2643 /// FOR i := 0 TO 31
2644 ///   j := i*8
2645 ///   result[j+7:j] := SATURATE8U(__a[j+7:j] - __b[j+7:j])
2646 /// ENDFOR
2647 /// \endcode
2648 ///
2649 /// \headerfile <immintrin.h>
2650 ///
2651 /// This intrinsic corresponds to the \c VPSUBUSB instruction.
2652 ///
2653 /// \param __a
2654 ///    A 256-bit integer vector containing the minuends.
2655 /// \param __b
2656 ///    A 256-bit integer vector containing the subtrahends.
2657 /// \returns A 256-bit integer vector containing the differences.
2658 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2659 _mm256_subs_epu8(__m256i __a, __m256i __b)
2660 {
2661   return (__m256i)__builtin_elementwise_sub_sat((__v32qu)__a, (__v32qu)__b);
2662 }
2663
2664 /// Subtracts 16-bit integers from corresponding elements of two 256-bit
2665 ///    vectors of [16 x i16] using unsigned saturation, and returns each
2666 ///    difference in the corresponding element of the [16 x i16] result.
2667 ///
2668 /// \code{.operation}
2669 /// FOR i := 0 TO 15
2670 ///   j := i*16
2671 ///   result[j+15:j] := SATURATE16U(__a[j+15:j] - __b[j+15:j])
2672 /// ENDFOR
2673 /// \endcode
2674 ///
2675 /// \headerfile <immintrin.h>
2676 ///
2677 /// This intrinsic corresponds to the \c VPSUBUSW instruction.
2678 ///
2679 /// \param __a
2680 ///    A 256-bit vector of [16 x i16] containing the minuends.
2681 /// \param __b
2682 ///    A 256-bit vector of [16 x i16] containing the subtrahends.
2683 /// \returns A 256-bit vector of [16 x i16] containing the differences.
2684 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2685 _mm256_subs_epu16(__m256i __a, __m256i __b)
2686 {
2687   return (__m256i)__builtin_elementwise_sub_sat((__v16hu)__a, (__v16hu)__b);
2688 }
2689
2690 /// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2691 ///    vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2692 ///    uses the upper 64 bits of each 128-bit half of \a __a and \a __b as
2693 ///    input; other bits in these parameters are ignored.
2694 ///
2695 /// \code{.operation}
2696 /// result[7:0] := __a[71:64]
2697 /// result[15:8] := __b[71:64]
2698 /// result[23:16] := __a[79:72]
2699 /// result[31:24] := __b[79:72]
2700 /// . . .
2701 /// result[127:120] := __b[127:120]
2702 /// result[135:128] := __a[199:192]
2703 /// . . .
2704 /// result[255:248] := __b[255:248]
2705 /// \endcode
2706 ///
2707 /// \headerfile <immintrin.h>
2708 ///
2709 /// This intrinsic corresponds to the \c VPUNPCKHBW instruction.
2710 ///
2711 /// \param __a
2712 ///    A 256-bit integer vector used as the source for the even-numbered bytes
2713 ///    of the result.
2714 /// \param __b
2715 ///    A 256-bit integer vector used as the source for the odd-numbered bytes
2716 ///    of the result.
2717 /// \returns A 256-bit integer vector containing the result.
2718 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2719 _mm256_unpackhi_epi8(__m256i __a, __m256i __b)
2720 {
2721   return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
2722 }
2723
2724 /// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2725 ///    of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2726 ///    vector of [16 x i16]. Specifically, uses the upper 64 bits of each
2727 ///    128-bit half of \a __a and \a __b as input; other bits in these
2728 ///    parameters are ignored.
2729 ///
2730 /// \code{.operation}
2731 /// result[15:0] := __a[79:64]
2732 /// result[31:16] := __b[79:64]
2733 /// result[47:32] := __a[95:80]
2734 /// result[63:48] := __b[95:80]
2735 /// . . .
2736 /// result[127:112] := __b[127:112]
2737 /// result[143:128] := __a[211:196]
2738 /// . . .
2739 /// result[255:240] := __b[255:240]
2740 /// \endcode
2741 ///
2742 /// \headerfile <immintrin.h>
2743 ///
2744 /// This intrinsic corresponds to the \c VPUNPCKHWD instruction.
2745 ///
2746 /// \param __a
2747 ///    A 256-bit vector of [16 x i16] used as the source for the even-numbered
2748 ///    elements of the result.
2749 /// \param __b
2750 ///    A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2751 ///    elements of the result.
2752 /// \returns A 256-bit vector of [16 x i16] containing the result.
2753 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2754 _mm256_unpackhi_epi16(__m256i __a, __m256i __b)
2755 {
2756   return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
2757 }
2758
2759 /// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2760 ///    of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2761 ///    of [8 x i32]. Specifically, uses the upper 64 bits of each 128-bit half
2762 ///    of \a __a and \a __b as input; other bits in these parameters are
2763 ///    ignored.
2764 ///
2765 /// \code{.operation}
2766 /// result[31:0] := __a[95:64]
2767 /// result[63:32] := __b[95:64]
2768 /// result[95:64] := __a[127:96]
2769 /// result[127:96] := __b[127:96]
2770 /// result[159:128] := __a[223:192]
2771 /// result[191:160] := __b[223:192]
2772 /// result[223:192] := __a[255:224]
2773 /// result[255:224] := __b[255:224]
2774 /// \endcode
2775 ///
2776 /// \headerfile <immintrin.h>
2777 ///
2778 /// This intrinsic corresponds to the \c VPUNPCKHDQ instruction.
2779 ///
2780 /// \param __a
2781 ///    A 256-bit vector of [8 x i32] used as the source for the even-numbered
2782 ///    elements of the result.
2783 /// \param __b
2784 ///    A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2785 ///    elements of the result.
2786 /// \returns A 256-bit vector of [8 x i32] containing the result.
2787 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2788 _mm256_unpackhi_epi32(__m256i __a, __m256i __b)
2789 {
2790   return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);
2791 }
2792
2793 /// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2794 ///    of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2795 ///    of [4 x i64]. Specifically, uses the upper 64 bits of each 128-bit half
2796 ///    of \a __a and \a __b as input; other bits in these parameters are
2797 ///    ignored.
2798 ///
2799 /// \code{.operation}
2800 /// result[63:0] := __a[127:64]
2801 /// result[127:64] := __b[127:64]
2802 /// result[191:128] := __a[255:192]
2803 /// result[255:192] := __b[255:192]
2804 /// \endcode
2805 ///
2806 /// \headerfile <immintrin.h>
2807 ///
2808 /// This intrinsic corresponds to the \c VPUNPCKHQDQ instruction.
2809 ///
2810 /// \param __a
2811 ///    A 256-bit vector of [4 x i64] used as the source for the even-numbered
2812 ///    elements of the result.
2813 /// \param __b
2814 ///    A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2815 ///    elements of the result.
2816 /// \returns A 256-bit vector of [4 x i64] containing the result.
2817 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2818 _mm256_unpackhi_epi64(__m256i __a, __m256i __b)
2819 {
2820   return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3);
2821 }
2822
2823 /// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2824 ///    vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2825 ///    uses the lower 64 bits of each 128-bit half of \a __a and \a __b as
2826 ///    input; other bits in these parameters are ignored.
2827 ///
2828 /// \code{.operation}
2829 /// result[7:0] := __a[7:0]
2830 /// result[15:8] := __b[7:0]
2831 /// result[23:16] := __a[15:8]
2832 /// result[31:24] := __b[15:8]
2833 /// . . .
2834 /// result[127:120] := __b[63:56]
2835 /// result[135:128] := __a[135:128]
2836 /// . . .
2837 /// result[255:248] := __b[191:184]
2838 /// \endcode
2839 ///
2840 /// \headerfile <immintrin.h>
2841 ///
2842 /// This intrinsic corresponds to the \c VPUNPCKLBW instruction.
2843 ///
2844 /// \param __a
2845 ///    A 256-bit integer vector used as the source for the even-numbered bytes
2846 ///    of the result.
2847 /// \param __b
2848 ///    A 256-bit integer vector used as the source for the odd-numbered bytes
2849 ///    of the result.
2850 /// \returns A 256-bit integer vector containing the result.
2851 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2852 _mm256_unpacklo_epi8(__m256i __a, __m256i __b)
2853 {
2854   return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
2855 }
2856
2857 /// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2858 ///    of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2859 ///    vector of [16 x i16]. Specifically, uses the lower 64 bits of each
2860 ///    128-bit half of \a __a and \a __b as input; other bits in these
2861 ///    parameters are ignored.
2862 ///
2863 /// \code{.operation}
2864 /// result[15:0] := __a[15:0]
2865 /// result[31:16] := __b[15:0]
2866 /// result[47:32] := __a[31:16]
2867 /// result[63:48] := __b[31:16]
2868 /// . . .
2869 /// result[127:112] := __b[63:48]
2870 /// result[143:128] := __a[143:128]
2871 /// . . .
2872 /// result[255:239] := __b[191:176]
2873 /// \endcode
2874 ///
2875 /// \headerfile <immintrin.h>
2876 ///
2877 /// This intrinsic corresponds to the \c VPUNPCKLWD instruction.
2878 ///
2879 /// \param __a
2880 ///    A 256-bit vector of [16 x i16] used as the source for the even-numbered
2881 ///    elements of the result.
2882 /// \param __b
2883 ///    A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2884 ///    elements of the result.
2885 /// \returns A 256-bit vector of [16 x i16] containing the result.
2886 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2887 _mm256_unpacklo_epi16(__m256i __a, __m256i __b)
2888 {
2889   return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
2890 }
2891
2892 /// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2893 ///    of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2894 ///    of [8 x i32]. Specifically, uses the lower 64 bits of each 128-bit half
2895 ///    of \a __a and \a __b as input; other bits in these parameters are
2896 ///    ignored.
2897 ///
2898 /// \code{.operation}
2899 /// result[31:0] := __a[31:0]
2900 /// result[63:32] := __b[31:0]
2901 /// result[95:64] := __a[63:32]
2902 /// result[127:96] := __b[63:32]
2903 /// result[159:128] := __a[159:128]
2904 /// result[191:160] := __b[159:128]
2905 /// result[223:192] := __a[191:160]
2906 /// result[255:224] := __b[191:190]
2907 /// \endcode
2908 ///
2909 /// \headerfile <immintrin.h>
2910 ///
2911 /// This intrinsic corresponds to the \c VPUNPCKLDQ instruction.
2912 ///
2913 /// \param __a
2914 ///    A 256-bit vector of [8 x i32] used as the source for the even-numbered
2915 ///    elements of the result.
2916 /// \param __b
2917 ///    A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2918 ///    elements of the result.
2919 /// \returns A 256-bit vector of [8 x i32] containing the result.
2920 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2921 _mm256_unpacklo_epi32(__m256i __a, __m256i __b)
2922 {
2923   return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
2924 }
2925
2926 /// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2927 ///    of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2928 ///    of [4 x i64]. Specifically, uses the lower 64 bits of each 128-bit half
2929 ///    of \a __a and \a __b as input; other bits in these parameters are
2930 ///    ignored.
2931 ///
2932 /// \code{.operation}
2933 /// result[63:0] := __a[63:0]
2934 /// result[127:64] := __b[63:0]
2935 /// result[191:128] := __a[191:128]
2936 /// result[255:192] := __b[191:128]
2937 /// \endcode
2938 ///
2939 /// \headerfile <immintrin.h>
2940 ///
2941 /// This intrinsic corresponds to the \c VPUNPCKLQDQ instruction.
2942 ///
2943 /// \param __a
2944 ///    A 256-bit vector of [4 x i64] used as the source for the even-numbered
2945 ///    elements of the result.
2946 /// \param __b
2947 ///    A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2948 ///    elements of the result.
2949 /// \returns A 256-bit vector of [4 x i64] containing the result.
2950 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2951 _mm256_unpacklo_epi64(__m256i __a, __m256i __b)
2952 {
2953   return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2);
2954 }
2955
2956 /// Computes the bitwise XOR of the 256-bit integer vectors in \a __a and
2957 ///    \a __b.
2958 ///
2959 /// \headerfile <immintrin.h>
2960 ///
2961 /// This intrinsic corresponds to the \c VPXOR instruction.
2962 ///
2963 /// \param __a
2964 ///    A 256-bit integer vector.
2965 /// \param __b
2966 ///    A 256-bit integer vector.
2967 /// \returns A 256-bit integer vector containing the result.
2968 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2969 _mm256_xor_si256(__m256i __a, __m256i __b)
2970 {
2971   return (__m256i)((__v4du)__a ^ (__v4du)__b);
2972 }
2973
2974 /// Loads the 256-bit integer vector from memory \a __V using a non-temporal
2975 ///   memory hint and returns the vector. \a __V must be aligned on a 32-byte
2976 ///   boundary.
2977 ///
2978 /// \headerfile <immintrin.h>
2979 ///
2980 /// This intrinsic corresponds to the \c VMOVNTDQA instruction.
2981 ///
2982 /// \param __V
2983 ///    A pointer to the 32-byte aligned memory containing the vector to load.
2984 /// \returns A 256-bit integer vector loaded from memory.
2985 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2986 _mm256_stream_load_si256(const void *__V)
2987 {
2988   typedef __v4di __v4di_aligned __attribute__((aligned(32)));
2989   return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);
2990 }
2991
2992 /// Broadcasts the 32-bit floating-point value from the low element of the
2993 ///    128-bit vector of [4 x float] in \a __X to all elements of the result's
2994 ///    128-bit vector of [4 x float].
2995 ///
2996 /// \headerfile <immintrin.h>
2997 ///
2998 /// This intrinsic corresponds to the \c VBROADCASTSS instruction.
2999 ///
3000 /// \param __X
3001 ///    A 128-bit vector of [4 x float] whose low element will be broadcast.
3002 /// \returns A 128-bit vector of [4 x float] containing the result.
3003 static __inline__ __m128 __DEFAULT_FN_ATTRS128
3004 _mm_broadcastss_ps(__m128 __X)
3005 {
3006   return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0);
3007 }
3008
3009 /// Broadcasts the 64-bit floating-point value from the low element of the
3010 ///    128-bit vector of [2 x double] in \a __a to both elements of the
3011 ///    result's 128-bit vector of [2 x double].
3012 ///
3013 /// \headerfile <immintrin.h>
3014 ///
3015 /// This intrinsic corresponds to the \c MOVDDUP instruction.
3016 ///
3017 /// \param __a
3018 ///    A 128-bit vector of [2 x double] whose low element will be broadcast.
3019 /// \returns A 128-bit vector of [2 x double] containing the result.
3020 static __inline__ __m128d __DEFAULT_FN_ATTRS128
3021 _mm_broadcastsd_pd(__m128d __a)
3022 {
3023   return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
3024 }
3025
3026 /// Broadcasts the 32-bit floating-point value from the low element of the
3027 ///    128-bit vector of [4 x float] in \a __X to all elements of the
3028 ///    result's 256-bit vector of [8 x float].
3029 ///
3030 /// \headerfile <immintrin.h>
3031 ///
3032 /// This intrinsic corresponds to the \c VBROADCASTSS instruction.
3033 ///
3034 /// \param __X
3035 ///    A 128-bit vector of [4 x float] whose low element will be broadcast.
3036 /// \returns A 256-bit vector of [8 x float] containing the result.
3037 static __inline__ __m256 __DEFAULT_FN_ATTRS256
3038 _mm256_broadcastss_ps(__m128 __X)
3039 {
3040   return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3041 }
3042
3043 /// Broadcasts the 64-bit floating-point value from the low element of the
3044 ///    128-bit vector of [2 x double] in \a __X to all elements of the
3045 ///    result's 256-bit vector of [4 x double].
3046 ///
3047 /// \headerfile <immintrin.h>
3048 ///
3049 /// This intrinsic corresponds to the \c VBROADCASTSD instruction.
3050 ///
3051 /// \param __X
3052 ///    A 128-bit vector of [2 x double] whose low element will be broadcast.
3053 /// \returns A 256-bit vector of [4 x double] containing the result.
3054 static __inline__ __m256d __DEFAULT_FN_ATTRS256
3055 _mm256_broadcastsd_pd(__m128d __X)
3056 {
3057   return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0);
3058 }
3059
3060 /// Broadcasts the 128-bit integer data from \a __X to both the lower and
3061 ///    upper halves of the 256-bit result.
3062 ///
3063 /// \headerfile <immintrin.h>
3064 ///
3065 /// This intrinsic corresponds to the \c VBROADCASTI128 instruction.
3066 ///
3067 /// \param __X
3068 ///    A 128-bit integer vector to be broadcast.
3069 /// \returns A 256-bit integer vector containing the result.
3070 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3071 _mm256_broadcastsi128_si256(__m128i __X)
3072 {
3073   return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1);
3074 }
3075
/// Alternate spelling of _mm256_broadcastsi128_si256(); expands to a call of
///    that function with the same argument \a X.
#define _mm_broadcastsi128_si256(X)                                            \
  _mm256_broadcastsi128_si256(X)
3077
/// Merges 32-bit integer elements from either of the two 128-bit vectors of
///    [4 x i32] in \a V1 or \a V2 to the result's 128-bit vector of [4 x i32],
///    as specified by the immediate integer operand \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*32
///   IF M[i] == 0
///     result[31+j:j] := V1[31+j:j]
///   ELSE
///     result[31+j:j] := V2[31+j:j]
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_blend_epi32(__m128i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPBLENDD instruction.
///
/// \param V1
///    A 128-bit vector of [4 x i32] containing source values.
/// \param V2
///    A 128-bit vector of [4 x i32] containing source values.
/// \param M
///    An immediate 8-bit integer operand, with bits [3:0] specifying the
///    source for each element of the result. The position of the mask bit
///    corresponds to the index of a copied value. When a mask bit is 0, the
///    element is copied from \a V1; otherwise, it is copied from \a V2.
/// \returns A 128-bit vector of [4 x i32] containing the result.
#define _mm_blend_epi32(V1, V2, M) \
  ((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
                                      (__v4si)(__m128i)(V2), (int)(M)))
3114
3115 /// Merges 32-bit integer elements from either of the two 256-bit vectors of
3116 ///    [8 x i32] in \a V1 or \a V2 to return a 256-bit vector of [8 x i32],
3117 ///    as specified by the immediate integer operand \a M.
3118 ///
3119 /// \code{.operation}
3120 /// FOR i := 0 TO 7
3121 ///   j := i*32
3122 ///   IF M[i] == 0
3123 ///     result[31+j:j] := V1[31+j:j]
3124 ///   ELSE
3125 ///     result[31+j:j] := V2[32+j:j]
3126 ///   FI
3127 /// ENDFOR
3128 /// \endcode
3129 ///
3130 /// \headerfile <immintrin.h>
3131 ///
3132 /// \code
3133 /// __m256i _mm256_blend_epi32(__m256i V1, __m256i V2, const int M);
3134 /// \endcode
3135 ///
3136 /// This intrinsic corresponds to the \c VPBLENDDD instruction.
3137 ///
3138 /// \param V1
3139 ///    A 256-bit vector of [8 x i32] containing source values.
3140 /// \param V2
3141 ///    A 256-bit vector of [8 x i32] containing source values.
3142 /// \param M
3143 ///    An immediate 8-bit integer operand, with bits [7:0] specifying the
3144 ///    source for each element of the result. The position of the mask bit
3145 ///    corresponds to the index of a copied value. When a mask bit is 0, the
///    element is copied from \a V1; otherwise, it is copied from \a V2.
3147 /// \returns A 256-bit vector of [8 x i32] containing the result.
#define _mm256_blend_epi32(V1, V2, M) \
  ((__m256i)__builtin_ia32_pblendd256( \
      (__v8si)(__m256i)(V1), (__v8si)(__m256i)(V2), (int)(M)))
3151
3152 /// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
3153 ///    bytes of the 256-bit result.
3154 ///
3155 /// \headerfile <immintrin.h>
3156 ///
3157 /// This intrinsic corresponds to the \c VPBROADCASTB instruction.
3158 ///
3159 /// \param __X
3160 ///    A 128-bit integer vector whose low byte will be broadcast.
3161 /// \returns A 256-bit integer vector containing the result.
3162 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3163 _mm256_broadcastb_epi8(__m128i __X)
3164 {
3165   return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3166 }
3167
3168 /// Broadcasts the low element from the 128-bit vector of [8 x i16] in \a __X
3169 ///    to all elements of the result's 256-bit vector of [16 x i16].
3170 ///
3171 /// \headerfile <immintrin.h>
3172 ///
3173 /// This intrinsic corresponds to the \c VPBROADCASTW instruction.
3174 ///
3175 /// \param __X
3176 ///    A 128-bit vector of [8 x i16] whose low element will be broadcast.
3177 /// \returns A 256-bit vector of [16 x i16] containing the result.
3178 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3179 _mm256_broadcastw_epi16(__m128i __X)
3180 {
3181   return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3182 }
3183
3184 /// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
3185 ///    to all elements of the result's 256-bit vector of [8 x i32].
3186 ///
3187 /// \headerfile <immintrin.h>
3188 ///
3189 /// This intrinsic corresponds to the \c VPBROADCASTD instruction.
3190 ///
3191 /// \param __X
3192 ///    A 128-bit vector of [4 x i32] whose low element will be broadcast.
3193 /// \returns A 256-bit vector of [8 x i32] containing the result.
3194 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3195 _mm256_broadcastd_epi32(__m128i __X)
3196 {
3197   return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3198 }
3199
3200 /// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
3201 ///    to all elements of the result's 256-bit vector of [4 x i64].
3202 ///
3203 /// \headerfile <immintrin.h>
3204 ///
3205 /// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
3206 ///
3207 /// \param __X
3208 ///    A 128-bit vector of [2 x i64] whose low element will be broadcast.
3209 /// \returns A 256-bit vector of [4 x i64] containing the result.
3210 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3211 _mm256_broadcastq_epi64(__m128i __X)
3212 {
3213   return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0);
3214 }
3215
3216 /// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
3217 ///    bytes of the 128-bit result.
3218 ///
3219 /// \headerfile <immintrin.h>
3220 ///
3221 /// This intrinsic corresponds to the \c VPBROADCASTB instruction.
3222 ///
3223 /// \param __X
3224 ///    A 128-bit integer vector whose low byte will be broadcast.
3225 /// \returns A 128-bit integer vector containing the result.
3226 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3227 _mm_broadcastb_epi8(__m128i __X)
3228 {
3229   return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3230 }
3231
3232 /// Broadcasts the low element from the 128-bit vector of [8 x i16] in
3233 ///    \a __X to all elements of the result's 128-bit vector of [8 x i16].
3234 ///
3235 /// \headerfile <immintrin.h>
3236 ///
3237 /// This intrinsic corresponds to the \c VPBROADCASTW instruction.
3238 ///
3239 /// \param __X
3240 ///    A 128-bit vector of [8 x i16] whose low element will be broadcast.
3241 /// \returns A 128-bit vector of [8 x i16] containing the result.
3242 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3243 _mm_broadcastw_epi16(__m128i __X)
3244 {
3245   return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3246 }
3247
3248 /// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
///    to all elements of the result's 128-bit vector of [4 x i32].
3250 ///
3251 /// \headerfile <immintrin.h>
3252 ///
3253 /// This intrinsic corresponds to the \c VPBROADCASTD instruction.
3254 ///
3255 /// \param __X
3256 ///    A 128-bit vector of [4 x i32] whose low element will be broadcast.
3257 /// \returns A 128-bit vector of [4 x i32] containing the result.
3258 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3259 _mm_broadcastd_epi32(__m128i __X)
3260 {
3261   return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0);
3262 }
3263
3264 /// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
3265 ///    to both elements of the result's 128-bit vector of [2 x i64].
3266 ///
3267 /// \headerfile <immintrin.h>
3268 ///
3269 /// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
3270 ///
3271 /// \param __X
3272 ///    A 128-bit vector of [2 x i64] whose low element will be broadcast.
3273 /// \returns A 128-bit vector of [2 x i64] containing the result.
3274 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3275 _mm_broadcastq_epi64(__m128i __X)
3276 {
3277   return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0);
3278 }
3279
3280 /// Sets the result's 256-bit vector of [8 x i32] to copies of elements of the
3281 ///    256-bit vector of [8 x i32] in \a __a as specified by indexes in the
3282 ///    elements of the 256-bit vector of [8 x i32] in \a __b.
3283 ///
3284 /// \code{.operation}
3285 /// FOR i := 0 TO 7
3286 ///   j := i*32
3287 ///   k := __b[j+2:j] * 32
3288 ///   result[j+31:j] := __a[k+31:k]
3289 /// ENDFOR
3290 /// \endcode
3291 ///
3292 /// \headerfile <immintrin.h>
3293 ///
3294 /// This intrinsic corresponds to the \c VPERMD instruction.
3295 ///
3296 /// \param __a
3297 ///    A 256-bit vector of [8 x i32] containing the source values.
3298 /// \param __b
3299 ///    A 256-bit vector of [8 x i32] containing indexes of values to use from
3300 ///    \a __a.
3301 /// \returns A 256-bit vector of [8 x i32] containing the result.
3302 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3303 _mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
3304 {
3305   return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b);
3306 }
3307
3308 /// Sets the result's 256-bit vector of [4 x double] to copies of elements of
3309 ///    the 256-bit vector of [4 x double] in \a V as specified by the
3310 ///    immediate value \a M.
3311 ///
3312 /// \code{.operation}
3313 /// FOR i := 0 TO 3
3314 ///   j := i*64
3315 ///   k := (M >> i*2)[1:0] * 64
3316 ///   result[j+63:j] := V[k+63:k]
3317 /// ENDFOR
3318 /// \endcode
3319 ///
3320 /// \headerfile <immintrin.h>
3321 ///
3322 /// \code
3323 /// __m256d _mm256_permute4x64_pd(__m256d V, const int M);
3324 /// \endcode
3325 ///
3326 /// This intrinsic corresponds to the \c VPERMPD instruction.
3327 ///
3328 /// \param V
3329 ///    A 256-bit vector of [4 x double] containing the source values.
3330 /// \param M
3331 ///    An immediate 8-bit value specifying which elements to copy from \a V.
///    \a M[1:0] specifies the index in \a V for element 0 of the result,
3333 ///    \a M[3:2] specifies the index for element 1, and so forth.
3334 /// \returns A 256-bit vector of [4 x double] containing the result.
#define _mm256_permute4x64_pd(V, M) \
  ((__m256d)__builtin_ia32_permdf256( \
      (__v4df)(__m256d)(V), (int)(M)))
3337
3338 /// Sets the result's 256-bit vector of [8 x float] to copies of elements of
3339 ///    the 256-bit vector of [8 x float] in \a __a as specified by indexes in
3340 ///    the elements of the 256-bit vector of [8 x i32] in \a __b.
3341 ///
3342 /// \code{.operation}
3343 /// FOR i := 0 TO 7
3344 ///   j := i*32
3345 ///   k := __b[j+2:j] * 32
3346 ///   result[j+31:j] := __a[k+31:k]
3347 /// ENDFOR
3348 /// \endcode
3349 ///
3350 /// \headerfile <immintrin.h>
3351 ///
3352 /// This intrinsic corresponds to the \c VPERMPS instruction.
3353 ///
3354 /// \param __a
3355 ///    A 256-bit vector of [8 x float] containing the source values.
3356 /// \param __b
3357 ///    A 256-bit vector of [8 x i32] containing indexes of values to use from
3358 ///    \a __a.
3359 /// \returns A 256-bit vector of [8 x float] containing the result.
3360 static __inline__ __m256 __DEFAULT_FN_ATTRS256
3361 _mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
3362 {
3363   return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b);
3364 }
3365
/// Sets the result's 256-bit vector of [4 x i64] to copies of elements
3367 ///    of the 256-bit vector of [4 x i64] in \a V as specified by the
3368 ///    immediate value \a M.
3369 ///
3370 /// \code{.operation}
3371 /// FOR i := 0 TO 3
3372 ///   j := i*64
3373 ///   k := (M >> i*2)[1:0] * 64
3374 ///   result[j+63:j] := V[k+63:k]
3375 /// ENDFOR
3376 /// \endcode
3377 ///
3378 /// \headerfile <immintrin.h>
3379 ///
3380 /// \code
3381 /// __m256i _mm256_permute4x64_epi64(__m256i V, const int M);
3382 /// \endcode
3383 ///
3384 /// This intrinsic corresponds to the \c VPERMQ instruction.
3385 ///
3386 /// \param V
3387 ///    A 256-bit vector of [4 x i64] containing the source values.
3388 /// \param M
3389 ///    An immediate 8-bit value specifying which elements to copy from \a V.
///    \a M[1:0] specifies the index in \a V for element 0 of the result,
3391 ///    \a M[3:2] specifies the index for element 1, and so forth.
3392 /// \returns A 256-bit vector of [4 x i64] containing the result.
#define _mm256_permute4x64_epi64(V, M) \
  ((__m256i)__builtin_ia32_permdi256( \
      (__v4di)(__m256i)(V), (int)(M)))
3395
3396 /// Sets each half of the 256-bit result either to zero or to one of the
3397 ///    four possible 128-bit halves of the 256-bit vectors \a V1 and \a V2,
3398 ///    as specified by the immediate value \a M.
3399 ///
3400 /// \code{.operation}
3401 /// FOR i := 0 TO 1
3402 ///   j := i*128
3403 ///   k := M >> (i*4)
3404 ///   IF k[3] == 0
3405 ///     CASE (k[1:0]) OF
3406 ///     0: result[127+j:j] := V1[127:0]
3407 ///     1: result[127+j:j] := V1[255:128]
3408 ///     2: result[127+j:j] := V2[127:0]
3409 ///     3: result[127+j:j] := V2[255:128]
3410 ///     ESAC
3411 ///   ELSE
3412 ///     result[127+j:j] := 0
3413 ///   FI
3414 /// ENDFOR
3415 /// \endcode
3416 ///
3417 /// \headerfile <immintrin.h>
3418 ///
3419 /// \code
3420 /// __m256i _mm256_permute2x128_si256(__m256i V1, __m256i V2, const int M);
3421 /// \endcode
3422 ///
3423 /// This intrinsic corresponds to the \c VPERM2I128 instruction.
3424 ///
3425 /// \param V1
3426 ///    A 256-bit integer vector containing source values.
3427 /// \param V2
3428 ///    A 256-bit integer vector containing source values.
3429 /// \param M
3430 ///    An immediate value specifying how to form the result. Bits [3:0]
3431 ///    control the lower half of the result, bits [7:4] control the upper half.
3432 ///    Within each 4-bit control value, if bit 3 is 1, the result is zero,
3433 ///    otherwise bits [1:0] determine the source as follows. \n
3434 ///    0: the lower half of \a V1 \n
3435 ///    1: the upper half of \a V1 \n
3436 ///    2: the lower half of \a V2 \n
3437 ///    3: the upper half of \a V2
3438 /// \returns A 256-bit integer vector containing the result.
#define _mm256_permute2x128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_permti256( \
      (__m256i)(V1), (__m256i)(V2), (int)(M)))
3441
3442 /// Extracts half of the 256-bit vector \a V to the 128-bit result. If bit 0
///     of the immediate \a M is zero, extracts the lower half of \a V;
3444 ///     otherwise, extracts the upper half.
3445 ///
3446 /// \headerfile <immintrin.h>
3447 ///
3448 /// \code
3449 /// __m128i _mm256_extracti128_si256(__m256i V, const int M);
3450 /// \endcode
3451 ///
3452 /// This intrinsic corresponds to the \c VEXTRACTI128 instruction.
3453 ///
3454 /// \param V
3455 ///    A 256-bit integer vector containing the source values.
3456 /// \param M
3457 ///    An immediate value specifying which half of \a V to extract.
3458 /// \returns A 128-bit integer vector containing the result.
#define _mm256_extracti128_si256(V, M) \
  ((__m128i)__builtin_ia32_extract128i256( \
      (__v4di)(__m256i)(V), (int)(M)))
3461
3462 /// Copies the 256-bit vector \a V1 to the result, then overwrites half of the
3463 ///     result with the 128-bit vector \a V2. If bit 0 of the immediate \a M
3464 ///     is zero, overwrites the lower half of the result; otherwise,
3465 ///     overwrites the upper half.
3466 ///
3467 /// \headerfile <immintrin.h>
3468 ///
3469 /// \code
3470 /// __m256i _mm256_inserti128_si256(__m256i V1, __m128i V2, const int M);
3471 /// \endcode
3472 ///
3473 /// This intrinsic corresponds to the \c VINSERTI128 instruction.
3474 ///
3475 /// \param V1
3476 ///    A 256-bit integer vector containing a source value.
3477 /// \param V2
3478 ///    A 128-bit integer vector containing a source value.
3479 /// \param M
3480 ///    An immediate value specifying where to put \a V2 in the result.
3481 /// \returns A 256-bit integer vector containing the result.
#define _mm256_inserti128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_insert128i256( \
      (__v4di)(__m256i)(V1), (__v2di)(__m128i)(V2), (int)(M)))
3485
3486 /// Conditionally loads eight 32-bit integer elements from memory \a __X, if
3487 ///    the most significant bit of the corresponding element in the mask
3488 ///    \a __M is set; otherwise, sets that element of the result to zero.
3489 ///    Returns the 256-bit [8 x i32] result.
3490 ///
3491 /// \code{.operation}
3492 /// FOR i := 0 TO 7
3493 ///   j := i*32
3494 ///   IF __M[j+31] == 1
3495 ///     result[j+31:j] := Load32(__X+(i*4))
3496 ///   ELSE
3497 ///     result[j+31:j] := 0
3498 ///   FI
3499 /// ENDFOR
3500 /// \endcode
3501 ///
3502 /// \headerfile <immintrin.h>
3503 ///
3504 /// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3505 ///
3506 /// \param __X
3507 ///    A pointer to the memory used for loading values.
3508 /// \param __M
3509 ///    A 256-bit vector of [8 x i32] containing the mask bits.
3510 /// \returns A 256-bit vector of [8 x i32] containing the loaded or zeroed
3511 ///    elements.
3512 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3513 _mm256_maskload_epi32(int const *__X, __m256i __M)
3514 {
3515   return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M);
3516 }
3517
3518 /// Conditionally loads four 64-bit integer elements from memory \a __X, if
3519 ///    the most significant bit of the corresponding element in the mask
3520 ///    \a __M is set; otherwise, sets that element of the result to zero.
3521 ///    Returns the 256-bit [4 x i64] result.
3522 ///
3523 /// \code{.operation}
3524 /// FOR i := 0 TO 3
3525 ///   j := i*64
3526 ///   IF __M[j+63] == 1
3527 ///     result[j+63:j] := Load64(__X+(i*8))
3528 ///   ELSE
3529 ///     result[j+63:j] := 0
3530 ///   FI
3531 /// ENDFOR
3532 /// \endcode
3533 ///
3534 /// \headerfile <immintrin.h>
3535 ///
3536 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3537 ///
3538 /// \param __X
3539 ///    A pointer to the memory used for loading values.
3540 /// \param __M
3541 ///    A 256-bit vector of [4 x i64] containing the mask bits.
3542 /// \returns A 256-bit vector of [4 x i64] containing the loaded or zeroed
3543 ///    elements.
3544 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3545 _mm256_maskload_epi64(long long const *__X, __m256i __M)
3546 {
3547   return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M);
3548 }
3549
3550 /// Conditionally loads four 32-bit integer elements from memory \a __X, if
3551 ///    the most significant bit of the corresponding element in the mask
3552 ///    \a __M is set; otherwise, sets that element of the result to zero.
3553 ///    Returns the 128-bit [4 x i32] result.
3554 ///
3555 /// \code{.operation}
3556 /// FOR i := 0 TO 3
3557 ///   j := i*32
3558 ///   IF __M[j+31] == 1
3559 ///     result[j+31:j] := Load32(__X+(i*4))
3560 ///   ELSE
3561 ///     result[j+31:j] := 0
3562 ///   FI
3563 /// ENDFOR
3564 /// \endcode
3565 ///
3566 /// \headerfile <immintrin.h>
3567 ///
3568 /// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3569 ///
3570 /// \param __X
3571 ///    A pointer to the memory used for loading values.
3572 /// \param __M
3573 ///    A 128-bit vector of [4 x i32] containing the mask bits.
3574 /// \returns A 128-bit vector of [4 x i32] containing the loaded or zeroed
3575 ///    elements.
3576 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3577 _mm_maskload_epi32(int const *__X, __m128i __M)
3578 {
3579   return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M);
3580 }
3581
3582 /// Conditionally loads two 64-bit integer elements from memory \a __X, if
3583 ///    the most significant bit of the corresponding element in the mask
3584 ///    \a __M is set; otherwise, sets that element of the result to zero.
3585 ///    Returns the 128-bit [2 x i64] result.
3586 ///
3587 /// \code{.operation}
3588 /// FOR i := 0 TO 1
3589 ///   j := i*64
3590 ///   IF __M[j+63] == 1
3591 ///     result[j+63:j] := Load64(__X+(i*8))
3592 ///   ELSE
3593 ///     result[j+63:j] := 0
3594 ///   FI
3595 /// ENDFOR
3596 /// \endcode
3597 ///
3598 /// \headerfile <immintrin.h>
3599 ///
3600 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3601 ///
3602 /// \param __X
3603 ///    A pointer to the memory used for loading values.
3604 /// \param __M
3605 ///    A 128-bit vector of [2 x i64] containing the mask bits.
3606 /// \returns A 128-bit vector of [2 x i64] containing the loaded or zeroed
3607 ///    elements.
3608 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3609 _mm_maskload_epi64(long long const *__X, __m128i __M)
3610 {
3611   return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M);
3612 }
3613
3614 /// Conditionally stores eight 32-bit integer elements from the 256-bit vector
3615 ///    of [8 x i32] in \a __Y to memory \a __X, if the most significant bit of
3616 ///    the corresponding element in the mask \a __M is set; otherwise, the
3617 ///    memory element is unchanged.
3618 ///
3619 /// \code{.operation}
3620 /// FOR i := 0 TO 7
3621 ///   j := i*32
3622 ///   IF __M[j+31] == 1
3623 ///     Store32(__X+(i*4), __Y[j+31:j])
3624 ///   FI
3625 /// ENDFOR
3626 /// \endcode
3627 ///
3628 /// \headerfile <immintrin.h>
3629 ///
3630 /// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3631 ///
3632 /// \param __X
3633 ///    A pointer to the memory used for storing values.
3634 /// \param __M
3635 ///    A 256-bit vector of [8 x i32] containing the mask bits.
3636 /// \param __Y
3637 ///    A 256-bit vector of [8 x i32] containing the values to store.
3638 static __inline__ void __DEFAULT_FN_ATTRS256
3639 _mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
3640 {
3641   __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
3642 }
3643
3644 /// Conditionally stores four 64-bit integer elements from the 256-bit vector
3645 ///    of [4 x i64] in \a __Y to memory \a __X, if the most significant bit of
3646 ///    the corresponding element in the mask \a __M is set; otherwise, the
3647 ///    memory element is unchanged.
3648 ///
3649 /// \code{.operation}
3650 /// FOR i := 0 TO 3
3651 ///   j := i*64
3652 ///   IF __M[j+63] == 1
3653 ///     Store64(__X+(i*8), __Y[j+63:j])
3654 ///   FI
3655 /// ENDFOR
3656 /// \endcode
3657 ///
3658 /// \headerfile <immintrin.h>
3659 ///
3660 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3661 ///
3662 /// \param __X
3663 ///    A pointer to the memory used for storing values.
3664 /// \param __M
3665 ///    A 256-bit vector of [4 x i64] containing the mask bits.
3666 /// \param __Y
3667 ///    A 256-bit vector of [4 x i64] containing the values to store.
3668 static __inline__ void __DEFAULT_FN_ATTRS256
3669 _mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
3670 {
3671   __builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
3672 }
3673
3674 /// Conditionally stores four 32-bit integer elements from the 128-bit vector
3675 ///    of [4 x i32] in \a __Y to memory \a __X, if the most significant bit of
3676 ///    the corresponding element in the mask \a __M is set; otherwise, the
3677 ///    memory element is unchanged.
3678 ///
3679 /// \code{.operation}
3680 /// FOR i := 0 TO 3
3681 ///   j := i*32
3682 ///   IF __M[j+31] == 1
3683 ///     Store32(__X+(i*4), __Y[j+31:j])
3684 ///   FI
3685 /// ENDFOR
3686 /// \endcode
3687 ///
3688 /// \headerfile <immintrin.h>
3689 ///
3690 /// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3691 ///
3692 /// \param __X
3693 ///    A pointer to the memory used for storing values.
3694 /// \param __M
3695 ///    A 128-bit vector of [4 x i32] containing the mask bits.
3696 /// \param __Y
3697 ///    A 128-bit vector of [4 x i32] containing the values to store.
3698 static __inline__ void __DEFAULT_FN_ATTRS128
3699 _mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
3700 {
3701   __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
3702 }
3703
3704 /// Conditionally stores two 64-bit integer elements from the 128-bit vector
3705 ///    of [2 x i64] in \a __Y to memory \a __X, if the most significant bit of
3706 ///    the corresponding element in the mask \a __M is set; otherwise, the
3707 ///    memory element is unchanged.
3708 ///
3709 /// \code{.operation}
3710 /// FOR i := 0 TO 1
3711 ///   j := i*64
3712 ///   IF __M[j+63] == 1
3713 ///     Store64(__X+(i*8), __Y[j+63:j])
3714 ///   FI
3715 /// ENDFOR
3716 /// \endcode
3717 ///
3718 /// \headerfile <immintrin.h>
3719 ///
3720 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3721 ///
3722 /// \param __X
3723 ///    A pointer to the memory used for storing values.
3724 /// \param __M
3725 ///    A 128-bit vector of [2 x i64] containing the mask bits.
3726 /// \param __Y
3727 ///    A 128-bit vector of [2 x i64] containing the values to store.
3728 static __inline__ void __DEFAULT_FN_ATTRS128
3729 _mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
3730 {
3731   __builtin_ia32_maskstoreq(( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
3732 }
3733
3734 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3735 ///    left by the number of bits given in the corresponding element of the
3736 ///    256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
3737 ///    returns the result. If the shift count for any element is greater than
3738 ///    31, the result for that element is zero.
3739 ///
3740 /// \headerfile <immintrin.h>
3741 ///
3742 /// This intrinsic corresponds to the \c VPSLLVD instruction.
3743 ///
3744 /// \param __X
3745 ///    A 256-bit vector of [8 x i32] to be shifted.
3746 /// \param __Y
3747 ///    A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3748 ///    bits).
3749 /// \returns A 256-bit vector of [8 x i32] containing the result.
3750 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3751 _mm256_sllv_epi32(__m256i __X, __m256i __Y)
3752 {
3753   return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
3754 }
3755
3756 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3757 ///    left by the number of bits given in the corresponding element of the
3758 ///    128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
3759 ///    returns the result. If the shift count for any element is greater than
3760 ///    31, the result for that element is zero.
3761 ///
3762 /// \headerfile <immintrin.h>
3763 ///
3764 /// This intrinsic corresponds to the \c VPSLLVD instruction.
3765 ///
3766 /// \param __X
3767 ///    A 128-bit vector of [4 x i32] to be shifted.
3768 /// \param __Y
3769 ///    A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3770 ///    bits).
3771 /// \returns A 128-bit vector of [4 x i32] containing the result.
3772 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3773 _mm_sllv_epi32(__m128i __X, __m128i __Y)
3774 {
3775   return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
3776 }
3777
3778 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
3779 ///    left by the number of bits given in the corresponding element of the
///    256-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
3781 ///    returns the result. If the shift count for any element is greater than
3782 ///    63, the result for that element is zero.
3783 ///
3784 /// \headerfile <immintrin.h>
3785 ///
3786 /// This intrinsic corresponds to the \c VPSLLVQ instruction.
3787 ///
3788 /// \param __X
3789 ///    A 256-bit vector of [4 x i64] to be shifted.
3790 /// \param __Y
3791 ///    A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
3792 ///    bits).
3793 /// \returns A 256-bit vector of [4 x i64] containing the result.
3794 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3795 _mm256_sllv_epi64(__m256i __X, __m256i __Y)
3796 {
3797   return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y);
3798 }
3799
3800 /// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3801 ///    left by the number of bits given in the corresponding element of the
3802 ///    128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3803 ///    returns the result. If the shift count for any element is greater than
3804 ///    63, the result for that element is zero.
3805 ///
3806 /// \headerfile <immintrin.h>
3807 ///
3808 /// This intrinsic corresponds to the \c VPSLLVQ instruction.
3809 ///
3810 /// \param __X
3811 ///    A 128-bit vector of [2 x i64] to be shifted.
3812 /// \param __Y
3813 ///    A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
3814 ///    bits).
3815 /// \returns A 128-bit vector of [2 x i64] containing the result.
3816 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3817 _mm_sllv_epi64(__m128i __X, __m128i __Y)
3818 {
3819   return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y);
3820 }
3821
3822 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3823 ///    right by the number of bits given in the corresponding element of the
3824 ///    256-bit vector of [8 x i32] in \a __Y, shifting in sign bits, and
3825 ///    returns the result. If the shift count for any element is greater than
3826 ///    31, the result for that element is 0 or -1 according to the sign bit
3827 ///    for that element.
3828 ///
3829 /// \headerfile <immintrin.h>
3830 ///
3831 /// This intrinsic corresponds to the \c VPSRAVD instruction.
3832 ///
3833 /// \param __X
3834 ///    A 256-bit vector of [8 x i32] to be shifted.
3835 /// \param __Y
3836 ///    A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3837 ///    bits).
3838 /// \returns A 256-bit vector of [8 x i32] containing the result.
3839 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3840 _mm256_srav_epi32(__m256i __X, __m256i __Y)
3841 {
3842   return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
3843 }
3844
3845 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3846 ///    right by the number of bits given in the corresponding element of the
3847 ///    128-bit vector of [4 x i32] in \a __Y, shifting in sign bits, and
3848 ///    returns the result. If the shift count for any element is greater than
3849 ///    31, the result for that element is 0 or -1 according to the sign bit
3850 ///    for that element.
3851 ///
3852 /// \headerfile <immintrin.h>
3853 ///
3854 /// This intrinsic corresponds to the \c VPSRAVD instruction.
3855 ///
3856 /// \param __X
3857 ///    A 128-bit vector of [4 x i32] to be shifted.
3858 /// \param __Y
3859 ///    A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3860 ///    bits).
3861 /// \returns A 128-bit vector of [4 x i32] containing the result.
3862 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3863 _mm_srav_epi32(__m128i __X, __m128i __Y)
3864 {
3865   return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
3866 }
3867
3868 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3869 ///    right by the number of bits given in the corresponding element of the
3870 ///    256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
3871 ///    returns the result. If the shift count for any element is greater than
3872 ///    31, the result for that element is zero.
3873 ///
3874 /// \headerfile <immintrin.h>
3875 ///
3876 /// This intrinsic corresponds to the \c VPSRLVD instruction.
3877 ///
3878 /// \param __X
3879 ///    A 256-bit vector of [8 x i32] to be shifted.
3880 /// \param __Y
3881 ///    A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3882 ///    bits).
3883 /// \returns A 256-bit vector of [8 x i32] containing the result.
3884 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3885 _mm256_srlv_epi32(__m256i __X, __m256i __Y)
3886 {
3887   return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
3888 }
3889
3890 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3891 ///    right by the number of bits given in the corresponding element of the
3892 ///    128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
3893 ///    returns the result. If the shift count for any element is greater than
3894 ///    31, the result for that element is zero.
3895 ///
3896 /// \headerfile <immintrin.h>
3897 ///
3898 /// This intrinsic corresponds to the \c VPSRLVD instruction.
3899 ///
3900 /// \param __X
3901 ///    A 128-bit vector of [4 x i32] to be shifted.
3902 /// \param __Y
3903 ///    A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3904 ///    bits).
3905 /// \returns A 128-bit vector of [4 x i32] containing the result.
3906 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3907 _mm_srlv_epi32(__m128i __X, __m128i __Y)
3908 {
3909   return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
3910 }
3911
3912 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
3913 ///    right by the number of bits given in the corresponding element of the
///    256-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
3915 ///    returns the result. If the shift count for any element is greater than
3916 ///    63, the result for that element is zero.
3917 ///
3918 /// \headerfile <immintrin.h>
3919 ///
3920 /// This intrinsic corresponds to the \c VPSRLVQ instruction.
3921 ///
3922 /// \param __X
3923 ///    A 256-bit vector of [4 x i64] to be shifted.
3924 /// \param __Y
3925 ///    A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
3926 ///    bits).
3927 /// \returns A 256-bit vector of [4 x i64] containing the result.
3928 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3929 _mm256_srlv_epi64(__m256i __X, __m256i __Y)
3930 {
3931   return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y);
3932 }
3933
3934 /// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3935 ///    right by the number of bits given in the corresponding element of the
3936 ///    128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3937 ///    returns the result. If the shift count for any element is greater than
3938 ///    63, the result for that element is zero.
3939 ///
3940 /// \headerfile <immintrin.h>
3941 ///
3942 /// This intrinsic corresponds to the \c VPSRLVQ instruction.
3943 ///
3944 /// \param __X
3945 ///    A 128-bit vector of [2 x i64] to be shifted.
3946 /// \param __Y
3947 ///    A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
3948 ///    bits).
3949 /// \returns A 128-bit vector of [2 x i64] containing the shifted elements.
3950 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3951 _mm_srlv_epi64(__m128i __X, __m128i __Y)
3952 {
3953   return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);
3954 }
3955
3956 /// Conditionally gathers two 64-bit floating-point values, either from the
3957 ///    128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
3958 ///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
3959 ///    of [2 x double] in \a mask determines the source for each element.
3960 ///
3961 /// \code{.operation}
3962 /// FOR element := 0 to 1
3963 ///   j := element*64
3964 ///   k := element*32
3965 ///   IF mask[j+63] == 0
3966 ///     result[j+63:j] := a[j+63:j]
3967 ///   ELSE
3968 ///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
3969 ///   FI
3970 /// ENDFOR
3971 /// \endcode
3972 ///
3973 /// \headerfile <immintrin.h>
3974 ///
3975 /// \code
3976 /// __m128d _mm_mask_i32gather_pd(__m128d a, const double *m, __m128i i,
3977 ///                               __m128d mask, const int s);
3978 /// \endcode
3979 ///
3980 /// This intrinsic corresponds to the \c VGATHERDPD instruction.
3981 ///
3982 /// \param a
3983 ///    A 128-bit vector of [2 x double] used as the source when a mask bit is
3984 ///    zero.
3985 /// \param m
3986 ///    A pointer to the base address that the scaled indexes are added to.
3987 /// \param i
3988 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
3989 ///    the first two elements are used.
3990 /// \param mask
3991 ///    A 128-bit vector of [2 x double] containing the mask. The most
3992 ///    significant bit of each element is the mask bit for that element. If
3993 ///    a mask bit is zero, the corresponding element of \a a is copied to
3994 ///    the result; otherwise the element is loaded from memory.
3995 /// \param s
3996 ///    A literal constant scale factor for the indexes in \a i. Must be
3997 ///    1, 2, 4, or 8.
3998 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
3999 #define _mm_mask_i32gather_pd(a, m, i, mask, s) \
4000   ((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128d)(a), \
4001                                       (double const *)(m), \
4002                                       (__v4si)(__m128i)(i), \
4003                                       (__v2df)(__m128d)(mask), (s)))
4004
4005 /// Conditionally gathers four 64-bit floating-point values, either from the
4006 ///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
4007 ///    indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
4008 ///    of [4 x double] in \a mask determines the source for each element.
4009 ///
4010 /// \code{.operation}
4011 /// FOR element := 0 to 3
4012 ///   j := element*64
4013 ///   k := element*32
4014 ///   IF mask[j+63] == 0
4015 ///     result[j+63:j] := a[j+63:j]
4016 ///   ELSE
4017 ///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4018 ///   FI
4019 /// ENDFOR
4020 /// \endcode
4021 ///
4022 /// \headerfile <immintrin.h>
4023 ///
4024 /// \code
4025 /// __m256d _mm256_mask_i32gather_pd(__m256d a, const double *m, __m128i i,
4026 ///                                  __m256d mask, const int s);
4027 /// \endcode
4028 ///
4029 /// This intrinsic corresponds to the \c VGATHERDPD instruction.
4030 ///
4031 /// \param a
4032 ///    A 256-bit vector of [4 x double] used as the source when a mask bit is
4033 ///    zero.
4034 /// \param m
4035 ///    A pointer to the base address that the scaled indexes are added to.
4036 /// \param i
4037 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4038 /// \param mask
4039 ///    A 256-bit vector of [4 x double] containing the mask. The most
4040 ///    significant bit of each element is the mask bit for that element. If
4041 ///    a mask bit is zero, the corresponding element of \a a is copied to
4042 ///    the result; otherwise the element is loaded from memory.
4043 /// \param s
4044 ///    A literal constant scale factor for the indexes in \a i. Must be
4045 ///    1, 2, 4, or 8.
4046 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
4047 #define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
4048   ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
4049                                          (double const *)(m), \
4050                                          (__v4si)(__m128i)(i), \
4051                                          (__v4df)(__m256d)(mask), (s)))
4052
4053 /// Conditionally gathers two 64-bit floating-point values, either from the
4054 ///    128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
4055 ///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4056 ///    of [2 x double] in \a mask determines the source for each element.
4057 ///
4058 /// \code{.operation}
4059 /// FOR element := 0 to 1
4060 ///   j := element*64
4061 ///   k := element*64
4062 ///   IF mask[j+63] == 0
4063 ///     result[j+63:j] := a[j+63:j]
4064 ///   ELSE
4065 ///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4066 ///   FI
4067 /// ENDFOR
4068 /// \endcode
4069 ///
4070 /// \headerfile <immintrin.h>
4071 ///
4072 /// \code
4073 /// __m128d _mm_mask_i64gather_pd(__m128d a, const double *m, __m128i i,
4074 ///                               __m128d mask, const int s);
4075 /// \endcode
4076 ///
4077 /// This intrinsic corresponds to the \c VGATHERQPD instruction.
4078 ///
4079 /// \param a
4080 ///    A 128-bit vector of [2 x double] used as the source when a mask bit is
4081 ///    zero.
4082 /// \param m
4083 ///    A pointer to the base address that the scaled indexes are added to.
4084 /// \param i
4085 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4086 /// \param mask
4087 ///    A 128-bit vector of [2 x double] containing the mask. The most
4088 ///    significant bit of each element is the mask bit for that element. If
4089 ///    a mask bit is zero, the corresponding element of \a a is copied to
4090 ///    the result; otherwise the element is loaded from memory.
4091 /// \param s
4092 ///    A literal constant scale factor for the indexes in \a i. Must be
4093 ///    1, 2, 4, or 8.
4094 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
4095 #define _mm_mask_i64gather_pd(a, m, i, mask, s) \
4096   ((__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
4097                                       (double const *)(m), \
4098                                       (__v2di)(__m128i)(i), \
4099                                       (__v2df)(__m128d)(mask), (s)))
4100
4101 /// Conditionally gathers four 64-bit floating-point values, either from the
4102 ///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
4103 ///    indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
4104 ///    of [4 x double] in \a mask determines the source for each element.
4105 ///
4106 /// \code{.operation}
4107 /// FOR element := 0 to 3
4108 ///   j := element*64
4109 ///   k := element*64
4110 ///   IF mask[j+63] == 0
4111 ///     result[j+63:j] := a[j+63:j]
4112 ///   ELSE
4113 ///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4114 ///   FI
4115 /// ENDFOR
4116 /// \endcode
4117 ///
4118 /// \headerfile <immintrin.h>
4119 ///
4120 /// \code
4121 /// __m256d _mm256_mask_i64gather_pd(__m256d a, const double *m, __m256i i,
4122 ///                                  __m256d mask, const int s);
4123 /// \endcode
4124 ///
4125 /// This intrinsic corresponds to the \c VGATHERQPD instruction.
4126 ///
4127 /// \param a
4128 ///    A 256-bit vector of [4 x double] used as the source when a mask bit is
4129 ///    zero.
4130 /// \param m
4131 ///    A pointer to the base address that the scaled indexes are added to.
4132 /// \param i
4133 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4134 /// \param mask
4135 ///    A 256-bit vector of [4 x double] containing the mask. The most
4136 ///    significant bit of each element is the mask bit for that element. If
4137 ///    a mask bit is zero, the corresponding element of \a a is copied to
4138 ///    the result; otherwise the element is loaded from memory.
4139 /// \param s
4140 ///    A literal constant scale factor for the indexes in \a i. Must be
4141 ///    1, 2, 4, or 8.
4142 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
4143 #define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
4144   ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
4145                                          (double const *)(m), \
4146                                          (__v4di)(__m256i)(i), \
4147                                          (__v4df)(__m256d)(mask), (s)))
4148
4149 /// Conditionally gathers four 32-bit floating-point values, either from the
4150 ///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4151 ///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4152 ///    of [4 x float] in \a mask determines the source for each element.
4153 ///
4154 /// \code{.operation}
4155 /// FOR element := 0 to 3
4156 ///   j := element*32
4157 ///   k := element*32
4158 ///   IF mask[j+31] == 0
4159 ///     result[j+31:j] := a[j+31:j]
4160 ///   ELSE
4161 ///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4162 ///   FI
4163 /// ENDFOR
4164 /// \endcode
4165 ///
4166 /// \headerfile <immintrin.h>
4167 ///
4168 /// \code
4169 /// __m128 _mm_mask_i32gather_ps(__m128 a, const float *m, __m128i i,
4170 ///                              __m128 mask, const int s);
4171 /// \endcode
4172 ///
4173 /// This intrinsic corresponds to the \c VGATHERDPS instruction.
4174 ///
4175 /// \param a
4176 ///    A 128-bit vector of [4 x float] used as the source when a mask bit is
4177 ///    zero.
4178 /// \param m
4179 ///    A pointer to the base address that the scaled indexes are added to.
4180 /// \param i
4181 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4182 /// \param mask
4183 ///    A 128-bit vector of [4 x float] containing the mask. The most
4184 ///    significant bit of each element is the mask bit for that element. If
4185 ///    a mask bit is zero, the corresponding element of \a a is copied to
4186 ///    the result; otherwise the element is loaded from memory.
4187 /// \param s
4188 ///    A literal constant scale factor for the indexes in \a i. Must be
4189 ///    1, 2, 4, or 8.
4190 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
4191 #define _mm_mask_i32gather_ps(a, m, i, mask, s) \
4192   ((__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
4193                                      (float const *)(m), \
4194                                      (__v4si)(__m128i)(i), \
4195                                      (__v4sf)(__m128)(mask), (s)))
4196
4197 /// Conditionally gathers eight 32-bit floating-point values, either from the
4198 ///    256-bit vector of [8 x float] in \a a, or from memory \a m using scaled
4199 ///    indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
4200 ///    of [8 x float] in \a mask determines the source for each element.
4201 ///
4202 /// \code{.operation}
4203 /// FOR element := 0 to 7
4204 ///   j := element*32
4205 ///   k := element*32
4206 ///   IF mask[j+31] == 0
4207 ///     result[j+31:j] := a[j+31:j]
4208 ///   ELSE
4209 ///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4210 ///   FI
4211 /// ENDFOR
4212 /// \endcode
4213 ///
4214 /// \headerfile <immintrin.h>
4215 ///
4216 /// \code
4217 /// __m256 _mm256_mask_i32gather_ps(__m256 a, const float *m, __m256i i,
4218 ///                                 __m256 mask, const int s);
4219 /// \endcode
4220 ///
4221 /// This intrinsic corresponds to the \c VGATHERDPS instruction.
4222 ///
4223 /// \param a
4224 ///    A 256-bit vector of [8 x float] used as the source when a mask bit is
4225 ///    zero.
4226 /// \param m
4227 ///    A pointer to the base address that the scaled indexes are added to.
4228 /// \param i
4229 ///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4230 /// \param mask
4231 ///    A 256-bit vector of [8 x float] containing the mask. The most
4232 ///    significant bit of each element is the mask bit for that element. If
4233 ///    a mask bit is zero, the corresponding element of \a a is copied to
4234 ///    the result; otherwise the element is loaded from memory.
4235 /// \param s
4236 ///    A literal constant scale factor for the indexes in \a i. Must be
4237 ///    1, 2, 4, or 8.
4238 /// \returns A 256-bit vector of [8 x float] containing the gathered values.
4239 #define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
4240   ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
4241                                         (float const *)(m), \
4242                                         (__v8si)(__m256i)(i), \
4243                                         (__v8sf)(__m256)(mask), (s)))
4244
4245 /// Conditionally gathers two 32-bit floating-point values, either from the
4246 ///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4247 ///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4248 ///    of [4 x float] in \a mask determines the source for the lower two
4249 ///    elements. The upper two elements of the result are zeroed.
4250 ///
4251 /// \code{.operation}
4252 /// FOR element := 0 to 1
4253 ///   j := element*32
4254 ///   k := element*64
4255 ///   IF mask[j+31] == 0
4256 ///     result[j+31:j] := a[j+31:j]
4257 ///   ELSE
4258 ///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4259 ///   FI
4260 /// ENDFOR
4261 /// result[127:64] := 0
4262 /// \endcode
4263 ///
4264 /// \headerfile <immintrin.h>
4265 ///
4266 /// \code
4267 /// __m128 _mm_mask_i64gather_ps(__m128 a, const float *m, __m128i i,
4268 ///                              __m128 mask, const int s);
4269 /// \endcode
4270 ///
4271 /// This intrinsic corresponds to the \c VGATHERQPS instruction.
4272 ///
4273 /// \param a
4274 ///    A 128-bit vector of [4 x float] used as the source when a mask bit is
4275 ///    zero. Only the first two elements are used.
4276 /// \param m
4277 ///    A pointer to the base address that the scaled indexes are added to.
4278 /// \param i
4279 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4280 /// \param mask
4281 ///    A 128-bit vector of [4 x float] containing the mask. The most
4282 ///    significant bit of each element is the mask bit for that element. If
4283 ///    a mask bit is zero, the corresponding element of \a a is copied to
4284 ///    the result; otherwise the element is loaded from memory. Only the
4285 ///    first two elements are used.
4286 /// \param s
4287 ///    A literal constant scale factor for the indexes in \a i. Must be
4288 ///    1, 2, 4, or 8.
4289 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
4290 #define _mm_mask_i64gather_ps(a, m, i, mask, s) \
4291   ((__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
4292                                      (float const *)(m), \
4293                                      (__v2di)(__m128i)(i), \
4294                                      (__v4sf)(__m128)(mask), (s)))
4295
4296 /// Conditionally gathers four 32-bit floating-point values, either from the
4297 ///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4298 ///    indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
4299 ///    of [4 x float] in \a mask determines the source for each element.
4300 ///
4301 /// \code{.operation}
4302 /// FOR element := 0 to 3
4303 ///   j := element*32
4304 ///   k := element*64
4305 ///   IF mask[j+31] == 0
4306 ///     result[j+31:j] := a[j+31:j]
4307 ///   ELSE
4308 ///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4309 ///   FI
4310 /// ENDFOR
4311 /// \endcode
4312 ///
4313 /// \headerfile <immintrin.h>
4314 ///
4315 /// \code
4316 /// __m128 _mm256_mask_i64gather_ps(__m128 a, const float *m, __m256i i,
4317 ///                                 __m128 mask, const int s);
4318 /// \endcode
4319 ///
4320 /// This intrinsic corresponds to the \c VGATHERQPS instruction.
4321 ///
4322 /// \param a
4323 ///    A 128-bit vector of [4 x float] used as the source when a mask bit is
4324 ///    zero.
4325 /// \param m
4326 ///    A pointer to the base address that the scaled indexes are added to.
4327 /// \param i
4328 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4329 /// \param mask
4330 ///    A 128-bit vector of [4 x float] containing the mask. The most
4331 ///    significant bit of each element is the mask bit for that element. If
4332 ///    a mask bit is zero, the corresponding element of \a a is copied to
4333 ///    the result; otherwise the element is loaded from memory.
4334 /// \param s
4335 ///    A literal constant scale factor for the indexes in \a i. Must be
4336 ///    1, 2, 4, or 8.
4337 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
4338 #define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
4339   ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
4340                                         (float const *)(m), \
4341                                         (__v4di)(__m256i)(i), \
4342                                         (__v4sf)(__m128)(mask), (s)))
4343
4344 /// Conditionally gathers four 32-bit integer values, either from the
4345 ///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4346 ///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4347 ///    of [4 x i32] in \a mask determines the source for each element.
4348 ///
4349 /// \code{.operation}
4350 /// FOR element := 0 to 3
4351 ///   j := element*32
4352 ///   k := element*32
4353 ///   IF mask[j+31] == 0
4354 ///     result[j+31:j] := a[j+31:j]
4355 ///   ELSE
4356 ///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4357 ///   FI
4358 /// ENDFOR
4359 /// \endcode
4360 ///
4361 /// \headerfile <immintrin.h>
4362 ///
4363 /// \code
4364 /// __m128i _mm_mask_i32gather_epi32(__m128i a, const int *m, __m128i i,
4365 ///                                  __m128i mask, const int s);
4366 /// \endcode
4367 ///
4368 /// This intrinsic corresponds to the \c VPGATHERDD instruction.
4369 ///
4370 /// \param a
4371 ///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
4372 ///    zero.
4373 /// \param m
4374 ///    A pointer to the base address that the scaled indexes are added to.
4375 /// \param i
4376 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4377 /// \param mask
4378 ///    A 128-bit vector of [4 x i32] containing the mask. The most significant
4379 ///    bit of each element is the mask bit for that element. If a mask bit is
4380 ///    zero, the corresponding element of \a a is copied to the result;
4381 ///    otherwise the element is loaded from memory.
4382 /// \param s
4383 ///    A literal constant scale factor for the indexes in \a i. Must be
4384 ///    1, 2, 4, or 8.
4385 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
4386 #define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
4387   ((__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
4388                                      (int const *)(m), \
4389                                      (__v4si)(__m128i)(i), \
4390                                      (__v4si)(__m128i)(mask), (s)))
4391
4392 /// Conditionally gathers eight 32-bit integer values, either from the
4393 ///    256-bit vector of [8 x i32] in \a a, or from memory \a m using scaled
4394 ///    indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
4395 ///    of [8 x i32] in \a mask determines the source for each element.
4396 ///
4397 /// \code{.operation}
4398 /// FOR element := 0 to 7
4399 ///   j := element*32
4400 ///   k := element*32
4401 ///   IF mask[j+31] == 0
4402 ///     result[j+31:j] := a[j+31:j]
4403 ///   ELSE
4404 ///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4405 ///   FI
4406 /// ENDFOR
4407 /// \endcode
4408 ///
4409 /// \headerfile <immintrin.h>
4410 ///
4411 /// \code
4412 /// __m256i _mm256_mask_i32gather_epi32(__m256i a, const int *m, __m256i i,
4413 ///                                     __m256i mask, const int s);
4414 /// \endcode
4415 ///
4416 /// This intrinsic corresponds to the \c VPGATHERDD instruction.
4417 ///
4418 /// \param a
4419 ///    A 256-bit vector of [8 x i32] used as the source when a mask bit is
4420 ///    zero.
4421 /// \param m
4422 ///    A pointer to the base address that the scaled indexes are added to.
4423 /// \param i
4424 ///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4425 /// \param mask
4426 ///    A 256-bit vector of [8 x i32] containing the mask. The most significant
4427 ///    bit of each element is the mask bit for that element. If a mask bit is
4428 ///    zero, the corresponding element of \a a is copied to the result;
4429 ///    otherwise the element is loaded from memory.
4430 /// \param s
4431 ///    A literal constant scale factor for the indexes in \a i. Must be
4432 ///    1, 2, 4, or 8.
4433 /// \returns A 256-bit vector of [8 x i32] containing the gathered values.
4434 #define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
4435   ((__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
4436                                         (int const *)(m), \
4437                                         (__v8si)(__m256i)(i), \
4438                                         (__v8si)(__m256i)(mask), (s)))
4439
4440 /// Conditionally gathers two 32-bit integer values, either from the
4441 ///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4442 ///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4443 ///    of [4 x i32] in \a mask determines the source for the lower two
4444 ///    elements. The upper two elements of the result are zeroed.
4445 ///
4446 /// \code{.operation}
4447 /// FOR element := 0 to 1
4448 ///   j := element*32
4449 ///   k := element*64
4450 ///   IF mask[j+31] == 0
4451 ///     result[j+31:j] := a[j+31:j]
4452 ///   ELSE
4453 ///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4454 ///   FI
4455 /// ENDFOR
4456 /// result[127:64] := 0
4457 /// \endcode
4458 ///
4459 /// \headerfile <immintrin.h>
4460 ///
4461 /// \code
4462 /// __m128i _mm_mask_i64gather_epi32(__m128i a, const int *m, __m128i i,
4463 ///                                  __m128i mask, const int s);
4464 /// \endcode
4465 ///
4466 /// This intrinsic corresponds to the \c VPGATHERQD instruction.
4467 ///
4468 /// \param a
4469 ///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
4470 ///    zero. Only the first two elements are used.
4471 /// \param m
4472 ///    A pointer to the base address that the scaled indexes are added to.
4473 /// \param i
4474 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4475 /// \param mask
4476 ///    A 128-bit vector of [4 x i32] containing the mask. The most significant
4477 ///    bit of each element is the mask bit for that element. If a mask bit is
4478 ///    zero, the corresponding element of \a a is copied to the result;
4479 ///    otherwise the element is loaded from memory. Only the first two
4480 ///    elements are used.
4481 /// \param s
4482 ///    A literal constant scale factor for the indexes in \a i. Must be
4483 ///    1, 2, 4, or 8.
4484 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
4485 #define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
4486   ((__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
4487                                      (int const *)(m), \
4488                                      (__v2di)(__m128i)(i), \
4489                                      (__v4si)(__m128i)(mask), (s)))
4490
4491 /// Conditionally gathers four 32-bit integer values, either from the
4492 ///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4493 ///    indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
4494 ///    of [4 x i32] in \a mask determines the source for each element.
4495 ///
4496 /// \code{.operation}
4497 /// FOR element := 0 to 3
4498 ///   j := element*32
4499 ///   k := element*64
4500 ///   IF mask[j+31] == 0
4501 ///     result[j+31:j] := a[j+31:j]
4502 ///   ELSE
4503 ///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4504 ///   FI
4505 /// ENDFOR
4506 /// \endcode
4507 ///
4508 /// \headerfile <immintrin.h>
4509 ///
4510 /// \code
4511 /// __m128i _mm256_mask_i64gather_epi32(__m128i a, const int *m, __m256i i,
4512 ///                                     __m128i mask, const int s);
4513 /// \endcode
4514 ///
4515 /// This intrinsic corresponds to the \c VPGATHERQD instruction.
4516 ///
4517 /// \param a
4518 ///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
4519 ///    zero.
4520 /// \param m
4521 ///    A pointer to the base address that the scaled indexes are added to.
4522 /// \param i
4523 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4524 /// \param mask
4525 ///    A 128-bit vector of [4 x i32] containing the mask. The most significant
4526 ///    bit of each element is the mask bit for that element. If a mask bit is
4527 ///    zero, the corresponding element of \a a is copied to the result;
4528 ///    otherwise the element is loaded from memory.
4529 /// \param s
4530 ///    A literal constant scale factor for the indexes in \a i. Must be
4531 ///    1, 2, 4, or 8.
4532 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
4533 #define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
4534   ((__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
4535                                         (int const *)(m), \
4536                                         (__v4di)(__m256i)(i), \
4537                                         (__v4si)(__m128i)(mask), (s)))
4538
4539 /// Conditionally gathers two 64-bit integer values, either from the
4540 ///    128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
4541 ///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4542 ///    of [2 x i64] in \a mask determines the source for each element.
4543 ///
4544 /// \code{.operation}
4545 /// FOR element := 0 to 1
4546 ///   j := element*64
4547 ///   k := element*32
4548 ///   IF mask[j+63] == 0
4549 ///     result[j+63:j] := a[j+63:j]
4550 ///   ELSE
4551 ///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4552 ///   FI
4553 /// ENDFOR
4554 /// \endcode
4555 ///
4556 /// \headerfile <immintrin.h>
4557 ///
4558 /// \code
4559 /// __m128i _mm_mask_i32gather_epi64(__m128i a, const long long *m, __m128i i,
4560 ///                                  __m128i mask, const int s);
4561 /// \endcode
4562 ///
4563 /// This intrinsic corresponds to the \c VPGATHERDQ instruction.
4564 ///
4565 /// \param a
4566 ///    A 128-bit vector of [2 x i64] used as the source when a mask bit is
4567 ///    zero.
4568 /// \param m
4569 ///    A pointer to the base address that the scaled indexes are added to.
4570 /// \param i
4571 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
4572 ///    the first two elements are used.
4573 /// \param mask
4574 ///    A 128-bit vector of [2 x i64] containing the mask. The most significant
4575 ///    bit of each element is the mask bit for that element. If a mask bit is
4576 ///    zero, the corresponding element of \a a is copied to the result;
4577 ///    otherwise the element is loaded from memory.
4578 /// \param s
4579 ///    A literal constant scale factor for the indexes in \a i. Must be
4580 ///    1, 2, 4, or 8.
4581 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
4582 #define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
4583   ((__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
4584                                      (long long const *)(m), \
4585                                      (__v4si)(__m128i)(i), \
4586                                      (__v2di)(__m128i)(mask), (s)))
4587
4588 /// Conditionally gathers four 64-bit integer values, either from the
4589 ///    256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
4590 ///    indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
4591 ///    of [4 x i64] in \a mask determines the source for each element.
4592 ///
4593 /// \code{.operation}
4594 /// FOR element := 0 to 3
4595 ///   j := element*64
4596 ///   k := element*32
4597 ///   IF mask[j+63] == 0
4598 ///     result[j+63:j] := a[j+63:j]
4599 ///   ELSE
4600 ///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4601 ///   FI
4602 /// ENDFOR
4603 /// \endcode
4604 ///
4605 /// \headerfile <immintrin.h>
4606 ///
4607 /// \code
4608 /// __m256i _mm256_mask_i32gather_epi64(__m256i a, const long long *m,
4609 ///                                     __m128i i, __m256i mask, const int s);
4610 /// \endcode
4611 ///
4612 /// This intrinsic corresponds to the \c VPGATHERDQ instruction.
4613 ///
4614 /// \param a
4615 ///    A 256-bit vector of [4 x i64] used as the source when a mask bit is
4616 ///    zero.
4617 /// \param m
4618 ///    A pointer to the base address that the scaled indexes are added to.
4619 /// \param i
4620 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4621 /// \param mask
4622 ///    A 256-bit vector of [4 x i64] containing the mask. The most significant
4623 ///    bit of each element is the mask bit for that element. If a mask bit is
4624 ///    zero, the corresponding element of \a a is copied to the result;
4625 ///    otherwise the element is loaded from memory.
4626 /// \param s
4627 ///    A literal constant scale factor for the indexes in \a i. Must be
4628 ///    1, 2, 4, or 8.
4629 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
4630 #define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
4631   ((__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
4632                                         (long long const *)(m), \
4633                                         (__v4si)(__m128i)(i), \
4634                                         (__v4di)(__m256i)(mask), (s)))
4635
4636 /// Conditionally gathers two 64-bit integer values, either from the
4637 ///    128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
4638 ///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4639 ///    of [2 x i64] in \a mask determines the source for each element.
4640 ///
4641 /// \code{.operation}
4642 /// FOR element := 0 to 1
4643 ///   j := element*64
4644 ///   k := element*64
4645 ///   IF mask[j+63] == 0
4646 ///     result[j+63:j] := a[j+63:j]
4647 ///   ELSE
4648 ///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4649 ///   FI
4650 /// ENDFOR
4651 /// \endcode
4652 ///
4653 /// \headerfile <immintrin.h>
4654 ///
4655 /// \code
4656 /// __m128i _mm_mask_i64gather_epi64(__m128i a, const long long *m, __m128i i,
4657 ///                                  __m128i mask, const int s);
4658 /// \endcode
4659 ///
4660 /// This intrinsic corresponds to the \c VPGATHERQQ instruction.
4661 ///
4662 /// \param a
4663 ///    A 128-bit vector of [2 x i64] used as the source when a mask bit is
4664 ///    zero.
4665 /// \param m
4666 ///    A pointer to the memory used for loading values.
4667 /// \param i
4668 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4669 /// \param mask
4670 ///    A 128-bit vector of [2 x i64] containing the mask. The most significant
4671 ///    bit of each element in the mask vector is that element's mask bit. If a
4672 ///    mask bit is zero, the corresponding value is copied from vector \a a;
4673 ///    otherwise the value is loaded from memory.
4674 /// \param s
4675 ///    A literal constant scale factor for the indexes in \a i. Must be
4676 ///    1, 2, 4, or 8.
4677 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
4678 #define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
4679   ((__m128i)__builtin_ia32_gatherq_q( \
4680       (__v2di)(__m128i)(a), /* pass-through for masked-off elements */ \
4681       (const long long *)(m), (__v2di)(__m128i)(i), \
4682       (__v2di)(__m128i)(mask), (s)))
4683
4684 /// Conditionally gathers four 64-bit integer values, either from the
4685 ///    256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
4686 ///    indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
4687 ///    of [4 x i64] in \a mask determines the source for each element.
4688 ///
4689 /// \code{.operation}
4690 /// FOR element := 0 to 3
4691 ///   j := element*64
4692 ///   k := element*64
4693 ///   IF mask[j+63] == 0
4694 ///     result[j+63:j] := a[j+63:j]
4695 ///   ELSE
4696 ///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4697 ///   FI
4698 /// ENDFOR
4699 /// \endcode
4700 ///
4701 /// \headerfile <immintrin.h>
4702 ///
4703 /// \code
4704 /// __m256i _mm256_mask_i64gather_epi64(__m256i a, const long long *m,
4705 ///                                     __m256i i, __m256i mask, const int s);
4706 /// \endcode
4707 ///
4708 /// This intrinsic corresponds to the \c VPGATHERQQ instruction.
4709 ///
4710 /// \param a
4711 ///    A 256-bit vector of [4 x i64] used as the source when a mask bit is
4712 ///    zero.
4713 /// \param m
4714 ///    A pointer to the memory used for loading values.
4715 /// \param i
4716 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4717 /// \param mask
4718 ///    A 256-bit vector of [4 x i64] containing the mask. The most significant
4719 ///    bit of each element in the mask vector is that element's mask bit. If a
4720 ///    mask bit is zero, the corresponding value is copied from vector \a a;
4721 ///    otherwise the value is loaded from memory.
4722 /// \param s
4723 ///    A literal constant scale factor for the indexes in \a i. Must be
4724 ///    1, 2, 4, or 8.
4725 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
4726 #define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
4727   ((__m256i)__builtin_ia32_gatherq_q256( \
4728       (__v4di)(__m256i)(a), /* pass-through for masked-off elements */ \
4729       (const long long *)(m), (__v4di)(__m256i)(i), \
4730       (__v4di)(__m256i)(mask), (s)))
4731
4732 /// Gathers two 64-bit floating-point values from memory \a m using scaled
4733 ///    indexes from the 128-bit vector of [4 x i32] in \a i.
4734 ///
4735 /// \code{.operation}
4736 /// FOR element := 0 to 1
4737 ///   j := element*64
4738 ///   k := element*32
4739 ///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4740 /// ENDFOR
4741 /// \endcode
4742 ///
4743 /// \headerfile <immintrin.h>
4744 ///
4745 /// \code
4746 /// __m128d _mm_i32gather_pd(const double *m, __m128i i, const int s);
4747 /// \endcode
4748 ///
4749 /// This intrinsic corresponds to the \c VGATHERDPD instruction.
4750 ///
4751 /// \param m
4752 ///    A pointer to the memory used for loading values.
4753 /// \param i
4754 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
4755 ///    the first two elements are used.
4756 /// \param s
4757 ///    A literal constant scale factor for the indexes in \a i. Must be
4758 ///    1, 2, 4, or 8.
4759 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
4760 #define _mm_i32gather_pd(m, i, s) \
4761   ((__m128d)__builtin_ia32_gatherd_pd( \
4762       (__v2df)_mm_undefined_pd(), /* pass-through; never selected */ \
4763       (const double *)(m), (__v4si)(__m128i)(i), \
4764       /* 0.0 == 0.0 in every lane: all-ones mask, nothing is masked off */ \
4765       (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), _mm_setzero_pd()), \
4766       (s)))
4767
4768 /// Gathers four 64-bit floating-point values from memory \a m using scaled
4769 ///    indexes from the 128-bit vector of [4 x i32] in \a i.
4770 ///
4771 /// \code{.operation}
4772 /// FOR element := 0 to 3
4773 ///   j := element*64
4774 ///   k := element*32
4775 ///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4776 /// ENDFOR
4777 /// \endcode
4778 ///
4779 /// \headerfile <immintrin.h>
4780 ///
4781 /// \code
4782 /// __m256d _mm256_i32gather_pd(const double *m, __m128i i, const int s);
4783 /// \endcode
4784 ///
4785 /// This intrinsic corresponds to the \c VGATHERDPD instruction.
4786 ///
4787 /// \param m
4788 ///    A pointer to the memory used for loading values.
4789 /// \param i
4790 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4791 /// \param s
4792 ///    A literal constant scale factor for the indexes in \a i. Must be
4793 ///    1, 2, 4, or 8.
4794 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
4795 #define _mm256_i32gather_pd(m, i, s) \
4796   ((__m256d)__builtin_ia32_gatherd_pd256( \
4797       (__v4df)_mm256_undefined_pd(), /* pass-through; never selected */ \
4798       (const double *)(m), (__v4si)(__m128i)(i), \
4799       /* 0.0 == 0.0 in every lane: all-ones mask, nothing is masked off */ \
4800       (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), \
4801                             _CMP_EQ_OQ), \
4802       (s)))
4803
4804 /// Gathers two 64-bit floating-point values from memory \a m using scaled
4805 ///    indexes from the 128-bit vector of [2 x i64] in \a i.
4806 ///
4807 /// \code{.operation}
4808 /// FOR element := 0 to 1
4809 ///   j := element*64
4810 ///   k := element*64
4811 ///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4812 /// ENDFOR
4813 /// \endcode
4814 ///
4815 /// \headerfile <immintrin.h>
4816 ///
4817 /// \code
4818 /// __m128d _mm_i64gather_pd(const double *m, __m128i i, const int s);
4819 /// \endcode
4820 ///
4821 /// This intrinsic corresponds to the \c VGATHERQPD instruction.
4822 ///
4823 /// \param m
4824 ///    A pointer to the memory used for loading values.
4825 /// \param i
4826 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4827 /// \param s
4828 ///    A literal constant scale factor for the indexes in \a i. Must be
4829 ///    1, 2, 4, or 8.
4830 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
4831 #define _mm_i64gather_pd(m, i, s) \
4832   ((__m128d)__builtin_ia32_gatherq_pd( \
4833       (__v2df)_mm_undefined_pd(), /* pass-through; never selected */ \
4834       (const double *)(m), (__v2di)(__m128i)(i), \
4835       /* 0.0 == 0.0 in every lane: all-ones mask, nothing is masked off */ \
4836       (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), _mm_setzero_pd()), \
4837       (s)))
4838
4839 /// Gathers four 64-bit floating-point values from memory \a m using scaled
4840 ///    indexes from the 256-bit vector of [4 x i64] in \a i.
4841 ///
4842 /// \code{.operation}
4843 /// FOR element := 0 to 3
4844 ///   j := element*64
4845 ///   k := element*64
4846 ///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4847 /// ENDFOR
4848 /// \endcode
4849 ///
4850 /// \headerfile <immintrin.h>
4851 ///
4852 /// \code
4853 /// __m256d _mm256_i64gather_pd(const double *m, __m256i i, const int s);
4854 /// \endcode
4855 ///
4856 /// This intrinsic corresponds to the \c VGATHERQPD instruction.
4857 ///
4858 /// \param m
4859 ///    A pointer to the memory used for loading values.
4860 /// \param i
4861 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4862 /// \param s
4863 ///    A literal constant scale factor for the indexes in \a i. Must be
4864 ///    1, 2, 4, or 8.
4865 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
4866 #define _mm256_i64gather_pd(m, i, s) \
4867   ((__m256d)__builtin_ia32_gatherq_pd256( \
4868       (__v4df)_mm256_undefined_pd(), /* pass-through; never selected */ \
4869       (const double *)(m), (__v4di)(__m256i)(i), \
4870       /* 0.0 == 0.0 in every lane: all-ones mask, nothing is masked off */ \
4871       (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), \
4872                             _CMP_EQ_OQ), \
4873       (s)))
4874
4875 /// Gathers four 32-bit floating-point values from memory \a m using scaled
4876 ///    indexes from the 128-bit vector of [4 x i32] in \a i.
4877 ///
4878 /// \code{.operation}
4879 /// FOR element := 0 to 3
4880 ///   j := element*32
4881 ///   k := element*32
4882 ///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4883 /// ENDFOR
4884 /// \endcode
4885 ///
4886 /// \headerfile <immintrin.h>
4887 ///
4888 /// \code
4889 /// __m128 _mm_i32gather_ps(const float *m, __m128i i, const int s);
4890 /// \endcode
4891 ///
4892 /// This intrinsic corresponds to the \c VGATHERDPS instruction.
4893 ///
4894 /// \param m
4895 ///    A pointer to the memory used for loading values.
4896 /// \param i
4897 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4898 /// \param s
4899 ///    A literal constant scale factor for the indexes in \a i. Must be
4900 ///    1, 2, 4, or 8.
4901 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
4902 #define _mm_i32gather_ps(m, i, s) \
4903   ((__m128)__builtin_ia32_gatherd_ps( \
4904       (__v4sf)_mm_undefined_ps(), /* pass-through; never selected */ \
4905       (const float *)(m), (__v4si)(__m128i)(i), \
4906       /* 0.0 == 0.0 in every lane: all-ones mask, nothing is masked off */ \
4907       (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()), \
4908       (s)))
4909
4910 /// Gathers eight 32-bit floating-point values from memory \a m using scaled
4911 ///    indexes from the 256-bit vector of [8 x i32] in \a i.
4912 ///
4913 /// \code{.operation}
4914 /// FOR element := 0 to 7
4915 ///   j := element*32
4916 ///   k := element*32
4917 ///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4918 /// ENDFOR
4919 /// \endcode
4920 ///
4921 /// \headerfile <immintrin.h>
4922 ///
4923 /// \code
4924 /// __m256 _mm256_i32gather_ps(const float *m, __m256i i, const int s);
4925 /// \endcode
4926 ///
4927 /// This intrinsic corresponds to the \c VGATHERDPS instruction.
4928 ///
4929 /// \param m
4930 ///    A pointer to the memory used for loading values.
4931 /// \param i
4932 ///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4933 /// \param s
4934 ///    A literal constant scale factor for the indexes in \a i. Must be
4935 ///    1, 2, 4, or 8.
4936 /// \returns A 256-bit vector of [8 x float] containing the gathered values.
4937 #define _mm256_i32gather_ps(m, i, s) \
4938   ((__m256)__builtin_ia32_gatherd_ps256( \
4939       (__v8sf)_mm256_undefined_ps(), /* pass-through; never selected */ \
4940       (const float *)(m), (__v8si)(__m256i)(i), \
4941       /* 0.0 == 0.0 in every lane: all-ones mask, nothing is masked off */ \
4942       (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), \
4943                             _CMP_EQ_OQ), \
4944       (s)))
4945
4946 /// Gathers two 32-bit floating-point values from memory \a m using scaled
4947 ///    indexes from the 128-bit vector of [2 x i64] in \a i. The upper two
4948 ///    elements of the result are zeroed.
4949 ///
4950 /// \code{.operation}
4951 /// FOR element := 0 to 1
4952 ///   j := element*32
4953 ///   k := element*64
4954 ///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4955 /// ENDFOR
4956 /// result[127:64] := 0
4957 /// \endcode
4958 ///
4959 /// \headerfile <immintrin.h>
4960 ///
4961 /// \code
4962 /// __m128 _mm_i64gather_ps(const float *m, __m128i i, const int s);
4963 /// \endcode
4964 ///
4965 /// This intrinsic corresponds to the \c VGATHERQPS instruction.
4966 ///
4967 /// \param m
4968 ///    A pointer to the memory used for loading values.
4969 /// \param i
4970 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4971 /// \param s
4972 ///    A literal constant scale factor for the indexes in \a i. Must be
4973 ///    1, 2, 4, or 8.
4974 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
4975 #define _mm_i64gather_ps(m, i, s) \
4976   ((__m128)__builtin_ia32_gatherq_ps( \
4977       (__v4sf)_mm_undefined_ps(), /* pass-through; never selected */ \
4978       (const float *)(m), (__v2di)(__m128i)(i), \
4979       /* 0.0 == 0.0 in every lane: all-ones mask, nothing is masked off */ \
4980       (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()), \
4981       (s)))
4982
4983 /// Gathers four 32-bit floating-point values from memory \a m using scaled
4984 ///    indexes from the 256-bit vector of [4 x i64] in \a i.
4985 ///
4986 /// \code{.operation}
4987 /// FOR element := 0 to 3
4988 ///   j := element*32
4989 ///   k := element*64
4990 ///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4991 /// ENDFOR
4992 /// \endcode
4993 ///
4994 /// \headerfile <immintrin.h>
4995 ///
4996 /// \code
4997 /// __m128 _mm256_i64gather_ps(const float *m, __m256i i, const int s);
4998 /// \endcode
4999 ///
5000 /// This intrinsic corresponds to the \c VGATHERQPS instruction.
5001 ///
5002 /// \param m
5003 ///    A pointer to the memory used for loading values.
5004 /// \param i
5005 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5006 /// \param s
5007 ///    A literal constant scale factor for the indexes in \a i. Must be
5008 ///    1, 2, 4, or 8.
5009 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
5010 #define _mm256_i64gather_ps(m, i, s) \
5011   ((__m128)__builtin_ia32_gatherq_ps256( \
5012       (__v4sf)_mm_undefined_ps(), /* pass-through; never selected */ \
5013       (const float *)(m), (__v4di)(__m256i)(i), \
5014       /* 0.0 == 0.0 in every lane: all-ones mask, nothing is masked off */ \
5015       (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()), \
5016       (s)))
5017
5018 /// Gathers four 32-bit integer values from memory \a m using scaled
5019 ///    indexes from the 128-bit vector of [4 x i32] in \a i.
5020 ///
5021 /// \code{.operation}
5022 /// FOR element := 0 to 3
5023 ///   j := element*32
5024 ///   k := element*32
5025 ///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
5026 /// ENDFOR
5027 /// \endcode
5028 ///
5029 /// \headerfile <immintrin.h>
5030 ///
5031 /// \code
5032 /// __m128i _mm_i32gather_epi32(const int *m, __m128i i, const int s);
5033 /// \endcode
5034 ///
5035 /// This intrinsic corresponds to the \c VPGATHERDD instruction.
5036 ///
5037 /// \param m
5038 ///    A pointer to the memory used for loading values.
5039 /// \param i
5040 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
5041 /// \param s
5042 ///    A literal constant scale factor for the indexes in \a i. Must be
5043 ///    1, 2, 4, or 8.
5044 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
5045 #define _mm_i32gather_epi32(m, i, s) \
5046   ((__m128i)__builtin_ia32_gatherd_d( /* unmasked: mask is all ones */ \
5047       (__v4si)_mm_undefined_si128(), (const int *)(m), \
5048       (__v4si)(__m128i)(i), (__v4si)_mm_set1_epi32(-1), (s)))
5049
5050 /// Gathers eight 32-bit integer values from memory \a m using scaled
5051 ///    indexes from the 256-bit vector of [8 x i32] in \a i.
5052 ///
5053 /// \code{.operation}
5054 /// FOR element := 0 to 7
5055 ///   j := element*32
5056 ///   k := element*32
5057 ///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
5058 /// ENDFOR
5059 /// \endcode
5060 ///
5061 /// \headerfile <immintrin.h>
5062 ///
5063 /// \code
5064 /// __m256i _mm256_i32gather_epi32(const int *m, __m256i i, const int s);
5065 /// \endcode
5066 ///
5067 /// This intrinsic corresponds to the \c VPGATHERDD instruction.
5068 ///
5069 /// \param m
5070 ///    A pointer to the memory used for loading values.
5071 /// \param i
5072 ///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
5073 /// \param s
5074 ///    A literal constant scale factor for the indexes in \a i. Must be
5075 ///    1, 2, 4, or 8.
5076 /// \returns A 256-bit vector of [8 x i32] containing the gathered values.
5077 #define _mm256_i32gather_epi32(m, i, s) \
5078   ((__m256i)__builtin_ia32_gatherd_d256( /* unmasked: mask is all ones */ \
5079       (__v8si)_mm256_undefined_si256(), (const int *)(m), \
5080       (__v8si)(__m256i)(i), (__v8si)_mm256_set1_epi32(-1), (s)))
5081
5082 /// Gathers two 32-bit integer values from memory \a m using scaled indexes
5083 ///    from the 128-bit vector of [2 x i64] in \a i. The upper two elements
5084 ///    of the result are zeroed.
5085 ///
5086 /// \code{.operation}
5087 /// FOR element := 0 to 1
5088 ///   j := element*32
5089 ///   k := element*64
5090 ///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
5091 /// ENDFOR
5092 /// result[127:64] := 0
5093 /// \endcode
5094 ///
5095 /// \headerfile <immintrin.h>
5096 ///
5097 /// \code
5098 /// __m128i _mm_i64gather_epi32(const int *m, __m128i i, const int s);
5099 /// \endcode
5100 ///
5101 /// This intrinsic corresponds to the \c VPGATHERQD instruction.
5102 ///
5103 /// \param m
5104 ///    A pointer to the memory used for loading values.
5105 /// \param i
5106 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
5107 /// \param s
5108 ///    A literal constant scale factor for the indexes in \a i. Must be
5109 ///    1, 2, 4, or 8.
5110 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
5111 #define _mm_i64gather_epi32(m, i, s) \
5112   ((__m128i)__builtin_ia32_gatherq_d( /* unmasked: mask is all ones */ \
5113       (__v4si)_mm_undefined_si128(), (const int *)(m), \
5114       (__v2di)(__m128i)(i), (__v4si)_mm_set1_epi32(-1), (s)))
5115
5116 /// Gathers four 32-bit integer values from memory \a m using scaled indexes
5117 ///    from the 256-bit vector of [4 x i64] in \a i.
5118 ///
5119 /// \code{.operation}
5120 /// FOR element := 0 to 3
5121 ///   j := element*32
5122 ///   k := element*64
5123 ///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
5124 /// ENDFOR
5125 /// \endcode
5126 ///
5127 /// \headerfile <immintrin.h>
5128 ///
5129 /// \code
5130 /// __m128i _mm256_i64gather_epi32(const int *m, __m256i i, const int s);
5131 /// \endcode
5132 ///
5133 /// This intrinsic corresponds to the \c VPGATHERQD instruction.
5134 ///
5135 /// \param m
5136 ///    A pointer to the memory used for loading values.
5137 /// \param i
5138 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5139 /// \param s
5140 ///    A literal constant scale factor for the indexes in \a i. Must be
5141 ///    1, 2, 4, or 8.
5142 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
5143 #define _mm256_i64gather_epi32(m, i, s) \
5144   ((__m128i)__builtin_ia32_gatherq_d256( /* unmasked: mask is all ones */ \
5145       (__v4si)_mm_undefined_si128(), (const int *)(m), \
5146       (__v4di)(__m256i)(i), (__v4si)_mm_set1_epi32(-1), (s)))
5147
5148 /// Gathers two 64-bit integer values from memory \a m using scaled indexes
5149 ///    from the 128-bit vector of [4 x i32] in \a i.
5150 ///
5151 /// \code{.operation}
5152 /// FOR element := 0 to 1
5153 ///   j := element*64
5154 ///   k := element*32
5155 ///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
5156 /// ENDFOR
5157 /// \endcode
5158 ///
5159 /// \headerfile <immintrin.h>
5160 ///
5161 /// \code
5162 /// __m128i _mm_i32gather_epi64(const long long *m, __m128i i, const int s);
5163 /// \endcode
5164 ///
5165 /// This intrinsic corresponds to the \c VPGATHERDQ instruction.
5166 ///
5167 /// \param m
5168 ///    A pointer to the memory used for loading values.
5169 /// \param i
5170 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
5171 ///    the first two elements are used.
5172 /// \param s
5173 ///    A literal constant scale factor for the indexes in \a i. Must be
5174 ///    1, 2, 4, or 8.
5175 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
5176 #define _mm_i32gather_epi64(m, i, s) \
5177   ((__m128i)__builtin_ia32_gatherd_q( \
5178       (__v2di)_mm_undefined_si128(), /* pass-through; never selected */ \
5179       (const long long *)(m), (__v4si)(__m128i)(i), \
5180       /* all-ones mask */ (__v2di)_mm_set1_epi64x(-1), (s)))
5181
5182 /// Gathers four 64-bit integer values from memory \a m using scaled indexes
5183 ///    from the 128-bit vector of [4 x i32] in \a i.
5184 ///
5185 /// \code{.operation}
5186 /// FOR element := 0 to 3
5187 ///   j := element*64
5188 ///   k := element*32
5189 ///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
5190 /// ENDFOR
5191 /// \endcode
5192 ///
5193 /// \headerfile <immintrin.h>
5194 ///
5195 /// \code
5196 /// __m256i _mm256_i32gather_epi64(const long long *m, __m128i i, const int s);
5197 /// \endcode
5198 ///
5199 /// This intrinsic corresponds to the \c VPGATHERDQ instruction.
5200 ///
5201 /// \param m
5202 ///    A pointer to the memory used for loading values.
5203 /// \param i
5204 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
5205 /// \param s
5206 ///    A literal constant scale factor for the indexes in \a i. Must be
5207 ///    1, 2, 4, or 8.
5208 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
5209 #define _mm256_i32gather_epi64(m, i, s) \
5210   ((__m256i)__builtin_ia32_gatherd_q256( \
5211       (__v4di)_mm256_undefined_si256(), /* pass-through; never selected */ \
5212       (const long long *)(m), (__v4si)(__m128i)(i), \
5213       /* all-ones mask */ (__v4di)_mm256_set1_epi64x(-1), (s)))
5214
5215 /// Gathers two 64-bit integer values from memory \a m using scaled indexes
5216 ///    from the 128-bit vector of [2 x i64] in \a i.
5217 ///
5218 /// \code{.operation}
5219 /// FOR element := 0 to 1
5220 ///   j := element*64
5221 ///   k := element*64
5222 ///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
5223 /// ENDFOR
5224 /// \endcode
5225 ///
5226 /// \headerfile <immintrin.h>
5227 ///
5228 /// \code
5229 /// __m128i _mm_i64gather_epi64(const long long *m, __m128i i, const int s);
5230 /// \endcode
5231 ///
5232 /// This intrinsic corresponds to the \c VPGATHERQQ instruction.
5233 ///
5234 /// \param m
5235 ///    A pointer to the memory used for loading values.
5236 /// \param i
5237 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
5238 /// \param s
5239 ///    A literal constant scale factor for the indexes in \a i. Must be
5240 ///    1, 2, 4, or 8.
5241 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
5242 #define _mm_i64gather_epi64(m, i, s) \
5243   ((__m128i)__builtin_ia32_gatherq_q( \
5244       (__v2di)_mm_undefined_si128(), /* pass-through; never selected */ \
5245       (const long long *)(m), (__v2di)(__m128i)(i), \
5246       /* all-ones mask */ (__v2di)_mm_set1_epi64x(-1), (s)))
5247
5248 /// Gathers four 64-bit integer values from memory \a m using scaled indexes
5249 ///    from the 256-bit vector of [4 x i64] in \a i.
5250 ///
5251 /// \code{.operation}
5252 /// FOR element := 0 to 3
5253 ///   j := element*64
5254 ///   k := element*64
5255 ///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
5256 /// ENDFOR
5257 /// \endcode
5258 ///
5259 /// \headerfile <immintrin.h>
5260 ///
5261 /// \code
5262 /// __m256i _mm256_i64gather_epi64(const long long *m, __m256i i, const int s);
5263 /// \endcode
5264 ///
5265 /// This intrinsic corresponds to the \c VPGATHERQQ instruction.
5266 ///
5267 /// \param m
5268 ///    A pointer to the memory used for loading values.
5269 /// \param i
5270 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5271 /// \param s
5272 ///    A literal constant scale factor for the indexes in \a i. Must be
5273 ///    1, 2, 4, or 8.
5274 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
5275 #define _mm256_i64gather_epi64(m, i, s) \
5276   ((__m256i)__builtin_ia32_gatherq_q256( \
5277       (__v4di)_mm256_undefined_si256(), /* pass-through; never selected */ \
5278       (const long long *)(m), (__v4di)(__m256i)(i), \
5279       /* all-ones mask */ (__v4di)_mm256_set1_epi64x(-1), (s)))
5280
5281 #undef __DEFAULT_FN_ATTRS256
5282 #undef __DEFAULT_FN_ATTRS128
5283
5284 #endif /* __AVX2INTRIN_H */