clang/lib/Headers/avx2intrin.h

   1 /*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
   2  *
   3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4  * See https://llvm.org/LICENSE.txt for license information.
   5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6  *
   7  *===-----------------------------------------------------------------------===
   8  */
   9
  10 #ifndef __IMMINTRIN_H
  11 #error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
  12 #endif
  13
  14 #ifndef __AVX2INTRIN_H
  15 #define __AVX2INTRIN_H
  16
  17 /* Define the default attributes for the functions in this file. */
  18 #if defined(__EVEX512__) && !defined(__AVX10_1_512__)
  19 #define __DEFAULT_FN_ATTRS256                                                  \
  20   __attribute__((__always_inline__, __nodebug__,                               \
  21                  __target__("avx2,no-evex512"), __min_vector_width__(256)))
  22 #define __DEFAULT_FN_ATTRS128                                                  \
  23   __attribute__((__always_inline__, __nodebug__,                               \
  24                  __target__("avx2,no-evex512"), __min_vector_width__(128)))
  25 #else
  26 #define __DEFAULT_FN_ATTRS256                                                  \
  27   __attribute__((__always_inline__, __nodebug__, __target__("avx2"),           \
  28                  __min_vector_width__(256)))
  29 #define __DEFAULT_FN_ATTRS128                                                  \
  30   __attribute__((__always_inline__, __nodebug__, __target__("avx2"),           \
  31                  __min_vector_width__(128)))
  32 #endif
  33
  34 /* SSE4 Multiple Packed Sums of Absolute Difference.  */
  35 /// Computes sixteen sum of absolute difference (SAD) operations on sets of
  36 ///    four unsigned 8-bit integers from the 256-bit integer vectors \a X and
  37 ///    \a Y.
  38 ///
  39 ///    Eight SAD results are computed using the lower half of the input
  40 ///    vectors, and another eight using the upper half. These 16-bit values
  41 ///    are returned in the lower and upper halves of the 256-bit result,
  42 ///    respectively.
  43 ///
  44 ///    A single SAD operation selects four bytes from \a X and four bytes from
  45 ///    \a Y as input. It computes the differences between each \a X byte and
  46 ///    the corresponding \a Y byte, takes the absolute value of each
  47 ///    difference, and sums these four values to form one 16-bit result. The
  48 ///    intrinsic computes 16 of these results with different sets of input
  49 ///    bytes.
  50 ///
  51 ///    For each set of eight results, the SAD operations use the same four
  52 ///    bytes from \a Y; the starting bit position for these four bytes is
  53 ///    specified by \a M[1:0] (lower half) or \a M[4:3] (upper half) times 32.
  54 ///    The eight operations use successive sets of four bytes from \a X; the
  55 ///    first set starts at \a M[2] (lower half) or \a M[5] (upper half) times
  56 ///    32. These bit positions are relative to the start of each 128-bit lane.
  57 ///
  58 /// \code{.operation}
  59 /// r := 0
  60 /// FOR i := 0 TO 1
  61 ///   j := i*3
  62 ///   Ybase := M[j+1:j]*32 + i*128
  63 ///   Xbase := M[j+2]*32 + i*128
  64 ///   FOR k := 0 TO 7
  65 ///     temp0 := ABS(X[Xbase+7:Xbase] - Y[Ybase+7:Ybase])
  66 ///     temp1 := ABS(X[Xbase+15:Xbase+8] - Y[Ybase+15:Ybase+8])
  67 ///     temp2 := ABS(X[Xbase+23:Xbase+16] - Y[Ybase+23:Ybase+16])
  68 ///     temp3 := ABS(X[Xbase+31:Xbase+24] - Y[Ybase+31:Ybase+24])
  69 ///     result[r+15:r] := temp0 + temp1 + temp2 + temp3
  70 ///     Xbase := Xbase + 8
  71 ///     r := r + 16
  72 ///   ENDFOR
  73 /// ENDFOR
  74 /// \endcode
  75 ///
  76 /// \headerfile <immintrin.h>
  77 ///
  78 /// \code
  79 /// __m256i _mm256_mpsadbw_epu8(__m256i X, __m256i Y, const int M);
  80 /// \endcode
  81 ///
  82 /// This intrinsic corresponds to the \c VMPSADBW instruction.
  83 ///
  84 /// \param X
  85 ///    A 256-bit integer vector containing one of the inputs.
  86 /// \param Y
  87 ///    A 256-bit integer vector containing one of the inputs.
  88 /// \param M
  89 ///     An unsigned immediate value specifying the starting positions of the
  90 ///     bytes to operate on.
  91 /// \returns A 256-bit vector of [16 x i16] containing the result.
  92 #define _mm256_mpsadbw_epu8(X, Y, M) \
  93   ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \
  94                                       (__v32qi)(__m256i)(Y), (int)(M)))
  95
  96 /// Computes the absolute value of each signed byte in the 256-bit integer
  97 ///    vector \a __a and returns each value in the corresponding byte of
  98 ///    the result.
  99 ///
 100 /// \headerfile <immintrin.h>
 101 ///
 102 /// This intrinsic corresponds to the \c VPABSB instruction.
 103 ///
 104 /// \param __a
 105 ///    A 256-bit integer vector.
 106 /// \returns A 256-bit integer vector containing the result.
 107 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 108 _mm256_abs_epi8(__m256i __a)
 109 {
 110     return (__m256i)__builtin_elementwise_abs((__v32qs)__a);
 111 }
 112
 113 /// Computes the absolute value of each signed 16-bit element in the 256-bit
 114 ///    vector of [16 x i16] in \a __a and returns each value in the
 115 ///    corresponding element of the result.
 116 ///
 117 /// \headerfile <immintrin.h>
 118 ///
 119 /// This intrinsic corresponds to the \c VPABSW instruction.
 120 ///
 121 /// \param __a
 122 ///    A 256-bit vector of [16 x i16].
 123 /// \returns A 256-bit vector of [16 x i16] containing the result.
 124 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 125 _mm256_abs_epi16(__m256i __a)
 126 {
 127     return (__m256i)__builtin_elementwise_abs((__v16hi)__a);
 128 }
 129
 130 /// Computes the absolute value of each signed 32-bit element in the 256-bit
 131 ///    vector of [8 x i32] in \a __a and returns each value in the
 132 ///    corresponding element of the result.
 133 ///
 134 /// \headerfile <immintrin.h>
 135 ///
 136 /// This intrinsic corresponds to the \c VPABSD instruction.
 137 ///
 138 /// \param __a
 139 ///    A 256-bit vector of [8 x i32].
 140 /// \returns A 256-bit vector of [8 x i32] containing the result.
 141 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 142 _mm256_abs_epi32(__m256i __a)
 143 {
 144     return (__m256i)__builtin_elementwise_abs((__v8si)__a);
 145 }
 146
 147 /// Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit
 148 ///    integers using signed saturation, and returns the 256-bit result.
 149 ///
 150 /// \code{.operation}
 151 /// FOR i := 0 TO 7
 152 ///   j := i*16
 153 ///   k := i*8
 154 ///   result[7+k:k] := SATURATE8(__a[15+j:j])
 155 ///   result[71+k:64+k] := SATURATE8(__b[15+j:j])
 156 ///   result[135+k:128+k] := SATURATE8(__a[143+j:128+j])
 157 ///   result[199+k:192+k] := SATURATE8(__b[143+j:128+j])
 158 /// ENDFOR
 159 /// \endcode
 160 ///
 161 /// \headerfile <immintrin.h>
 162 ///
 163 /// This intrinsic corresponds to the \c VPACKSSWB instruction.
 164 ///
 165 /// \param __a
 166 ///    A 256-bit vector of [16 x i16] used to generate result[63:0] and
 167 ///    result[191:128].
 168 /// \param __b
 169 ///    A 256-bit vector of [16 x i16] used to generate result[127:64] and
 170 ///    result[255:192].
 171 /// \returns A 256-bit integer vector containing the result.
 172 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 173 _mm256_packs_epi16(__m256i __a, __m256i __b)
 174 {
 175   return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
 176 }
 177
 178 /// Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit
 179 ///    integers using signed saturation, and returns the resulting 256-bit
 180 ///    vector of [16 x i16].
 181 ///
 182 /// \code{.operation}
 183 /// FOR i := 0 TO 3
 184 ///   j := i*32
 185 ///   k := i*16
 186 ///   result[15+k:k] := SATURATE16(__a[31+j:j])
 187 ///   result[79+k:64+k] := SATURATE16(__b[31+j:j])
 188 ///   result[143+k:128+k] := SATURATE16(__a[159+j:128+j])
 189 ///   result[207+k:192+k] := SATURATE16(__b[159+j:128+j])
 190 /// ENDFOR
 191 /// \endcode
 192 ///
 193 /// \headerfile <immintrin.h>
 194 ///
 195 /// This intrinsic corresponds to the \c VPACKSSDW instruction.
 196 ///
 197 /// \param __a
 198 ///    A 256-bit vector of [8 x i32] used to generate result[63:0] and
 199 ///    result[191:128].
 200 /// \param __b
 201 ///    A 256-bit vector of [8 x i32] used to generate result[127:64] and
 202 ///    result[255:192].
 203 /// \returns A 256-bit vector of [16 x i16] containing the result.
 204 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 205 _mm256_packs_epi32(__m256i __a, __m256i __b)
 206 {
 207   return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
 208 }
 209
 210 /// Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers
 211 ///    using unsigned saturation, and returns the 256-bit result.
 212 ///
 213 /// \code{.operation}
 214 /// FOR i := 0 TO 7
 215 ///   j := i*16
 216 ///   k := i*8
 217 ///   result[7+k:k] := SATURATE8U(__a[15+j:j])
 218 ///   result[71+k:64+k] := SATURATE8U(__b[15+j:j])
 219 ///   result[135+k:128+k] := SATURATE8U(__a[143+j:128+j])
 220 ///   result[199+k:192+k] := SATURATE8U(__b[143+j:128+j])
 221 /// ENDFOR
 222 /// \endcode
 223 ///
 224 /// \headerfile <immintrin.h>
 225 ///
 226 /// This intrinsic corresponds to the \c VPACKUSWB instruction.
 227 ///
 228 /// \param __a
 229 ///    A 256-bit vector of [16 x i16] used to generate result[63:0] and
 230 ///    result[191:128].
 231 /// \param __b
 232 ///    A 256-bit vector of [16 x i16] used to generate result[127:64] and
 233 ///    result[255:192].
 234 /// \returns A 256-bit integer vector containing the result.
 235 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 236 _mm256_packus_epi16(__m256i __a, __m256i __b)
 237 {
 238   return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
 239 }
 240
 241 /// Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers
 242 ///    using unsigned saturation, and returns the resulting 256-bit vector of
 243 ///    [16 x i16].
 244 ///
 245 /// \code{.operation}
 246 /// FOR i := 0 TO 3
 247 ///   j := i*32
 248 ///   k := i*16
 249 ///   result[15+k:k] := SATURATE16U(__V1[31+j:j])
 250 ///   result[79+k:64+k] := SATURATE16U(__V2[31+j:j])
 251 ///   result[143+k:128+k] := SATURATE16U(__V1[159+j:128+j])
 252 ///   result[207+k:192+k] := SATURATE16U(__V2[159+j:128+j])
 253 /// ENDFOR
 254 /// \endcode
 255 ///
 256 /// \headerfile <immintrin.h>
 257 ///
 258 /// This intrinsic corresponds to the \c VPACKUSDW instruction.
 259 ///
 260 /// \param __V1
 261 ///    A 256-bit vector of [8 x i32] used to generate result[63:0] and
 262 ///    result[191:128].
 263 /// \param __V2
 264 ///    A 256-bit vector of [8 x i32] used to generate result[127:64] and
 265 ///    result[255:192].
 266 /// \returns A 256-bit vector of [16 x i16] containing the result.
 267 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 268 _mm256_packus_epi32(__m256i __V1, __m256i __V2)
 269 {
 270   return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
 271 }
 272
 273 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
 274 ///    vectors and returns the lower 8 bits of each sum in the corresponding
 275 ///    byte of the 256-bit integer vector result (overflow is ignored).
 276 ///
 277 /// \headerfile <immintrin.h>
 278 ///
 279 /// This intrinsic corresponds to the \c VPADDB instruction.
 280 ///
 281 /// \param __a
 282 ///    A 256-bit integer vector containing one of the source operands.
 283 /// \param __b
 284 ///    A 256-bit integer vector containing one of the source operands.
 285 /// \returns A 256-bit integer vector containing the sums.
 286 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 287 _mm256_add_epi8(__m256i __a, __m256i __b)
 288 {
 289   return (__m256i)((__v32qu)__a + (__v32qu)__b);
 290 }
 291
 292 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
 293 ///    [16 x i16] and returns the lower 16 bits of each sum in the
 294 ///    corresponding element of the [16 x i16] result (overflow is ignored).
 295 ///
 296 /// \headerfile <immintrin.h>
 297 ///
 298 /// This intrinsic corresponds to the \c VPADDW instruction.
 299 ///
 300 /// \param __a
 301 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 302 /// \param __b
 303 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 304 /// \returns A 256-bit vector of [16 x i16] containing the sums.
 305 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 306 _mm256_add_epi16(__m256i __a, __m256i __b)
 307 {
 308   return (__m256i)((__v16hu)__a + (__v16hu)__b);
 309 }
 310
 311 /// Adds 32-bit integers from corresponding elements of two 256-bit vectors of
 312 ///    [8 x i32] and returns the lower 32 bits of each sum in the corresponding
 313 ///    element of the [8 x i32] result (overflow is ignored).
 314 ///
 315 /// \headerfile <immintrin.h>
 316 ///
 317 /// This intrinsic corresponds to the \c VPADDD instruction.
 318 ///
 319 /// \param __a
 320 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
 321 /// \param __b
 322 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
 323 /// \returns A 256-bit vector of [8 x i32] containing the sums.
 324 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 325 _mm256_add_epi32(__m256i __a, __m256i __b)
 326 {
 327   return (__m256i)((__v8su)__a + (__v8su)__b);
 328 }
 329
 330 /// Adds 64-bit integers from corresponding elements of two 256-bit vectors of
 331 ///    [4 x i64] and returns the lower 64 bits of each sum in the corresponding
 332 ///    element of the [4 x i64] result (overflow is ignored).
 333 ///
 334 /// \headerfile <immintrin.h>
 335 ///
 336 /// This intrinsic corresponds to the \c VPADDQ instruction.
 337 ///
 338 /// \param __a
 339 ///    A 256-bit vector of [4 x i64] containing one of the source operands.
 340 /// \param __b
 341 ///    A 256-bit vector of [4 x i64] containing one of the source operands.
 342 /// \returns A 256-bit vector of [4 x i64] containing the sums.
 343 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 344 _mm256_add_epi64(__m256i __a, __m256i __b)
 345 {
 346   return (__m256i)((__v4du)__a + (__v4du)__b);
 347 }
 348
 349 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
 350 ///    vectors using signed saturation, and returns each sum in the
 351 ///    corresponding byte of the 256-bit integer vector result.
 352 ///
 353 /// \headerfile <immintrin.h>
 354 ///
 355 /// This intrinsic corresponds to the \c VPADDSB instruction.
 356 ///
 357 /// \param __a
 358 ///    A 256-bit integer vector containing one of the source operands.
 359 /// \param __b
 360 ///    A 256-bit integer vector containing one of the source operands.
 361 /// \returns A 256-bit integer vector containing the sums.
 362 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 363 _mm256_adds_epi8(__m256i __a, __m256i __b)
 364 {
 365   return (__m256i)__builtin_elementwise_add_sat((__v32qs)__a, (__v32qs)__b);
 366 }
 367
 368 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
 369 ///    [16 x i16] using signed saturation, and returns the [16 x i16] result.
 370 ///
 371 /// \headerfile <immintrin.h>
 372 ///
 373 /// This intrinsic corresponds to the \c VPADDSW instruction.
 374 ///
 375 /// \param __a
 376 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 377 /// \param __b
 378 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 379 /// \returns A 256-bit vector of [16 x i16] containing the sums.
 380 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 381 _mm256_adds_epi16(__m256i __a, __m256i __b)
 382 {
 383   return (__m256i)__builtin_elementwise_add_sat((__v16hi)__a, (__v16hi)__b);
 384 }
 385
 386 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
 387 ///    vectors using unsigned saturation, and returns each sum in the
 388 ///    corresponding byte of the 256-bit integer vector result.
 389 ///
 390 /// \headerfile <immintrin.h>
 391 ///
 392 /// This intrinsic corresponds to the \c VPADDUSB instruction.
 393 ///
 394 /// \param __a
 395 ///    A 256-bit integer vector containing one of the source operands.
 396 /// \param __b
 397 ///    A 256-bit integer vector containing one of the source operands.
 398 /// \returns A 256-bit integer vector containing the sums.
 399 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 400 _mm256_adds_epu8(__m256i __a, __m256i __b)
 401 {
 402   return (__m256i)__builtin_elementwise_add_sat((__v32qu)__a, (__v32qu)__b);
 403 }
 404
 405 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
 406 ///    [16 x i16] using unsigned saturation, and returns the [16 x i16] result.
 407 ///
 408 /// \headerfile <immintrin.h>
 409 ///
 410 /// This intrinsic corresponds to the \c VPADDUSW instruction.
 411 ///
 412 /// \param __a
 413 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 414 /// \param __b
 415 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 416 /// \returns A 256-bit vector of [16 x i16] containing the sums.
 417 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 418 _mm256_adds_epu16(__m256i __a, __m256i __b)
 419 {
 420   return (__m256i)__builtin_elementwise_add_sat((__v16hu)__a, (__v16hu)__b);
 421 }
 422
 423 /// Uses the lower half of the 256-bit vector \a a as the upper half of a
 424 ///    temporary 256-bit value, and the lower half of the 256-bit vector \a b
 425 ///    as the lower half of the temporary value. Right-shifts the temporary
 426 ///    value by \a n bytes, and uses the lower 16 bytes of the shifted value
 427 ///    as the lower 16 bytes of the result. Uses the upper halves of \a a and
 428 ///    \a b to make another temporary value, right shifts by \a n, and uses
 429 ///    the lower 16 bytes of the shifted value as the upper 16 bytes of the
 430 ///    result.
 431 ///
 432 /// \headerfile <immintrin.h>
 433 ///
 434 /// \code
 435 /// __m256i _mm256_alignr_epi8(__m256i a, __m256i b, const int n);
 436 /// \endcode
 437 ///
 438 /// This intrinsic corresponds to the \c VPALIGNR instruction.
 439 ///
 440 /// \param a
 441 ///    A 256-bit integer vector containing source values.
 442 /// \param b
 443 ///    A 256-bit integer vector containing source values.
 444 /// \param n
 445 ///    An immediate value specifying the number of bytes to shift.
 446 /// \returns A 256-bit integer vector containing the result.
 447 #define _mm256_alignr_epi8(a, b, n) \
 448   ((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
 449                                       (__v32qi)(__m256i)(b), (n)))
 450
 451 /// Computes the bitwise AND of the 256-bit integer vectors in \a __a and
 452 ///    \a __b.
 453 ///
 454 /// \headerfile <immintrin.h>
 455 ///
 456 /// This intrinsic corresponds to the \c VPAND instruction.
 457 ///
 458 /// \param __a
 459 ///    A 256-bit integer vector.
 460 /// \param __b
 461 ///    A 256-bit integer vector.
 462 /// \returns A 256-bit integer vector containing the result.
 463 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 464 _mm256_and_si256(__m256i __a, __m256i __b)
 465 {
 466   return (__m256i)((__v4du)__a & (__v4du)__b);
 467 }
 468
 469 /// Computes the bitwise AND of the 256-bit integer vector in \a __b with
 470 ///    the bitwise NOT of the 256-bit integer vector in \a __a.
 471 ///
 472 /// \headerfile <immintrin.h>
 473 ///
 474 /// This intrinsic corresponds to the \c VPANDN instruction.
 475 ///
 476 /// \param __a
 477 ///    A 256-bit integer vector.
 478 /// \param __b
 479 ///    A 256-bit integer vector.
 480 /// \returns A 256-bit integer vector containing the result.
 481 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 482 _mm256_andnot_si256(__m256i __a, __m256i __b)
 483 {
 484   return (__m256i)(~(__v4du)__a & (__v4du)__b);
 485 }
 486
 487 /// Computes the averages of the corresponding unsigned bytes in the two
 488 ///    256-bit integer vectors in \a __a and \a __b and returns each
 489 ///    average in the corresponding byte of the 256-bit result.
 490 ///
 491 /// \code{.operation}
 492 /// FOR i := 0 TO 31
 493 ///   j := i*8
 494 ///   result[j+7:j] := (__a[j+7:j] + __b[j+7:j] + 1) >> 1
 495 /// ENDFOR
 496 /// \endcode
 497 ///
 498 /// \headerfile <immintrin.h>
 499 ///
 500 /// This intrinsic corresponds to the \c VPAVGB instruction.
 501 ///
 502 /// \param __a
 503 ///    A 256-bit integer vector.
 504 /// \param __b
 505 ///    A 256-bit integer vector.
 506 /// \returns A 256-bit integer vector containing the result.
 507 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 508 _mm256_avg_epu8(__m256i __a, __m256i __b)
 509 {
 510   return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b);
 511 }
 512
 513 /// Computes the averages of the corresponding unsigned 16-bit integers in
 514 ///    the two 256-bit vectors of [16 x i16] in \a __a and \a __b and returns
 515 ///    each average in the corresponding element of the 256-bit result.
 516 ///
 517 /// \code{.operation}
 518 /// FOR i := 0 TO 15
 519 ///   j := i*16
 520 ///   result[j+15:j] := (__a[j+15:j] + __b[j+15:j] + 1) >> 1
 521 /// ENDFOR
 522 /// \endcode
 523 ///
 524 /// \headerfile <immintrin.h>
 525 ///
 526 /// This intrinsic corresponds to the \c VPAVGW instruction.
 527 ///
 528 /// \param __a
 529 ///    A 256-bit vector of [16 x i16].
 530 /// \param __b
 531 ///    A 256-bit vector of [16 x i16].
 532 /// \returns A 256-bit vector of [16 x i16] containing the result.
 533 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 534 _mm256_avg_epu16(__m256i __a, __m256i __b)
 535 {
 536   return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b);
 537 }
 538
 539 /// Merges 8-bit integer values from either of the two 256-bit vectors
 540 ///    \a __V1 or \a __V2, as specified by the 256-bit mask \a __M and returns
 541 ///    the resulting 256-bit integer vector.
 542 ///
 543 /// \code{.operation}
 544 /// FOR i := 0 TO 31
 545 ///   j := i*8
 546 ///   IF __M[7+j] == 0
 547 ///     result[7+j:j] := __V1[7+j:j]
 548 ///   ELSE
 549 ///     result[7+j:j] := __V2[7+j:j]
 550 ///   FI
 551 /// ENDFOR
 552 /// \endcode
 553 ///
 554 /// \headerfile <immintrin.h>
 555 ///
 556 /// This intrinsic corresponds to the \c VPBLENDVB instruction.
 557 ///
 558 /// \param __V1
 559 ///    A 256-bit integer vector containing source values.
 560 /// \param __V2
 561 ///    A 256-bit integer vector containing source values.
 562 /// \param __M
 563 ///    A 256-bit integer vector, with bit [7] of each byte specifying the
 564 ///    source for each corresponding byte of the result. When the mask bit
 565 ///    is 0, the byte is copied from \a __V1; otherwise, it is copied from
 566 ///    \a __V2.
 567 /// \returns A 256-bit integer vector containing the result.
 568 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 569 _mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
 570 {
 571   return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,
 572                                               (__v32qi)__M);
 573 }
 574
 575 /// Merges 16-bit integer values from either of the two 256-bit vectors
 576 ///    \a V1 or \a V2, as specified by the immediate integer operand \a M,
 577 ///    and returns the resulting 256-bit vector of [16 x i16].
 578 ///
 579 /// \code{.operation}
 580 /// FOR i := 0 TO 7
 581 ///   j := i*16
 582 ///   IF M[i] == 0
 583 ///     result[15+j:j] := V1[15+j:j]
 584 ///     result[143+j:128+j] := V1[143+j:128+j]
 585 ///   ELSE
 586 ///     result[15+j:j] := V2[15+j:j]
 587 ///     result[143+j:128+j] := V2[143+j:128+j]
 588 ///   FI
 589 /// ENDFOR
 590 /// \endcode
 591 ///
 592 /// \headerfile <immintrin.h>
 593 ///
 594 /// \code
 595 /// __m256i _mm256_blend_epi16(__m256i V1, __m256i V2, const int M);
 596 /// \endcode
 597 ///
 598 /// This intrinsic corresponds to the \c VPBLENDW instruction.
 599 ///
 600 /// \param V1
 601 ///    A 256-bit vector of [16 x i16] containing source values.
 602 /// \param V2
 603 ///    A 256-bit vector of [16 x i16] containing source values.
 604 /// \param M
 605 ///    An immediate 8-bit integer operand, with bits [7:0] specifying the
 606 ///    source for each element of the result. The position of the mask bit
 607 ///    corresponds to the index of a copied value. When a mask bit is 0, the
 608 ///    element is copied from \a V1; otherwise, it is copied from \a V2.
 609 ///    \a M[0] determines the source for elements 0 and 8, \a M[1] for
 610 ///    elements 1 and 9, and so forth.
 611 /// \returns A 256-bit vector of [16 x i16] containing the result.
 612 #define _mm256_blend_epi16(V1, V2, M) \
 613   ((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \
 614                                       (__v16hi)(__m256i)(V2), (int)(M)))
 615
 616 /// Compares corresponding bytes in the 256-bit integer vectors in \a __a and
 617 ///    \a __b for equality and returns the outcomes in the corresponding
 618 ///    bytes of the 256-bit result.
 619 ///
 620 /// \code{.operation}
 621 /// FOR i := 0 TO 31
 622 ///   j := i*8
 623 ///   result[j+7:j] := (__a[j+7:j] == __b[j+7:j]) ? 0xFF : 0
 624 /// ENDFOR
 625 /// \endcode
 626 ///
 627 /// \headerfile <immintrin.h>
 628 ///
 629 /// This intrinsic corresponds to the \c VPCMPEQB instruction.
 630 ///
 631 /// \param __a
 632 ///    A 256-bit integer vector containing one of the inputs.
 633 /// \param __b
 634 ///    A 256-bit integer vector containing one of the inputs.
 635 /// \returns A 256-bit integer vector containing the result.
 636 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 637 _mm256_cmpeq_epi8(__m256i __a, __m256i __b)
 638 {
 639   return (__m256i)((__v32qi)__a == (__v32qi)__b);
 640 }
 641
 642 /// Compares corresponding elements in the 256-bit vectors of [16 x i16] in
 643 ///    \a __a and \a __b for equality and returns the outcomes in the
 644 ///    corresponding elements of the 256-bit result.
 645 ///
 646 /// \code{.operation}
 647 /// FOR i := 0 TO 15
 648 ///   j := i*16
 649 ///   result[j+15:j] := (__a[j+15:j] == __b[j+15:j]) ? 0xFFFF : 0
 650 /// ENDFOR
 651 /// \endcode
 652 ///
 653 /// \headerfile <immintrin.h>
 654 ///
 655 /// This intrinsic corresponds to the \c VPCMPEQW instruction.
 656 ///
 657 /// \param __a
 658 ///    A 256-bit vector of [16 x i16] containing one of the inputs.
 659 /// \param __b
 660 ///    A 256-bit vector of [16 x i16] containing one of the inputs.
 661 /// \returns A 256-bit vector of [16 x i16] containing the result.
 662 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 663 _mm256_cmpeq_epi16(__m256i __a, __m256i __b)
 664 {
 665   return (__m256i)((__v16hi)__a == (__v16hi)__b);
 666 }
 667
 668 /// Compares corresponding elements in the 256-bit vectors of [8 x i32] in
 669 ///    \a __a and \a __b for equality and returns the outcomes in the
 670 ///    corresponding elements of the 256-bit result.
 671 ///
 672 /// \code{.operation}
 673 /// FOR i := 0 TO 7
 674 ///   j := i*32
 675 ///   result[j+31:j] := (__a[j+31:j] == __b[j+31:j]) ? 0xFFFFFFFF : 0
 676 /// ENDFOR
 677 /// \endcode
 678 ///
 679 /// \headerfile <immintrin.h>
 680 ///
 681 /// This intrinsic corresponds to the \c VPCMPEQD instruction.
 682 ///
 683 /// \param __a
 684 ///    A 256-bit vector of [8 x i32] containing one of the inputs.
 685 /// \param __b
 686 ///    A 256-bit vector of [8 x i32] containing one of the inputs.
 687 /// \returns A 256-bit vector of [8 x i32] containing the result.
 688 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 689 _mm256_cmpeq_epi32(__m256i __a, __m256i __b)
 690 {
 691   return (__m256i)((__v8si)__a == (__v8si)__b);
 692 }
 693
 694 /// Compares corresponding elements in the 256-bit vectors of [4 x i64] in
 695 ///    \a __a and \a __b for equality and returns the outcomes in the
 696 ///    corresponding elements of the 256-bit result.
 697 ///
 698 /// \code{.operation}
 699 /// FOR i := 0 TO 3
 700 ///   j := i*64
 701 ///   result[j+63:j] := (__a[j+63:j] == __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
 702 /// ENDFOR
 703 /// \endcode
 704 ///
 705 /// \headerfile <immintrin.h>
 706 ///
 707 /// This intrinsic corresponds to the \c VPCMPEQQ instruction.
 708 ///
 709 /// \param __a
 710 ///    A 256-bit vector of [4 x i64] containing one of the inputs.
 711 /// \param __b
 712 ///    A 256-bit vector of [4 x i64] containing one of the inputs.
 713 /// \returns A 256-bit vector of [4 x i64] containing the result.
 714 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 715 _mm256_cmpeq_epi64(__m256i __a, __m256i __b)
 716 {
 717   return (__m256i)((__v4di)__a == (__v4di)__b);
 718 }
 719
 720 /// Compares corresponding signed bytes in the 256-bit integer vectors in
 721 ///    \a __a and \a __b for greater-than and returns the outcomes in the
 722 ///    corresponding bytes of the 256-bit result.
 723 ///
 724 /// \code{.operation}
 725 /// FOR i := 0 TO 31
 726 ///   j := i*8
 727 ///   result[j+7:j] := (__a[j+7:j] > __b[j+7:j]) ? 0xFF : 0
 728 /// ENDFOR
 729 /// \endcode
 730 ///
 731 /// \headerfile <immintrin.h>
 732 ///
 733 /// This intrinsic corresponds to the \c VPCMPGTB instruction.
 734 ///
 735 /// \param __a
 736 ///    A 256-bit integer vector containing one of the inputs.
 737 /// \param __b
 738 ///    A 256-bit integer vector containing one of the inputs.
 739 /// \returns A 256-bit integer vector containing the result.
 740 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 741 _mm256_cmpgt_epi8(__m256i __a, __m256i __b)
 742 {
 743   /* This function always performs a signed comparison, but __v32qi is a char
 744      which may be signed or unsigned, so use __v32qs. */
 745   return (__m256i)((__v32qs)__a > (__v32qs)__b);
 746 }
 747
 748 /// Compares corresponding signed elements in the 256-bit vectors of
 749 ///    [16 x i16] in \a __a and \a __b for greater-than and returns the
 750 ///    outcomes in the corresponding elements of the 256-bit result.
 751 ///
 752 /// \code{.operation}
 753 /// FOR i := 0 TO 15
 754 ///   j := i*16
 755 ///   result[j+15:j] := (__a[j+15:j] > __b[j+15:j]) ? 0xFFFF : 0
 756 /// ENDFOR
 757 /// \endcode
 758 ///
 759 /// \headerfile <immintrin.h>
 760 ///
 761 /// This intrinsic corresponds to the \c VPCMPGTW instruction.
 762 ///
 763 /// \param __a
 764 ///    A 256-bit vector of [16 x i16] containing one of the inputs.
 765 /// \param __b
 766 ///    A 256-bit vector of [16 x i16] containing one of the inputs.
 767 /// \returns A 256-bit vector of [16 x i16] containing the result.
 768 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 769 _mm256_cmpgt_epi16(__m256i __a, __m256i __b)
 770 {
 771   return (__m256i)((__v16hi)__a > (__v16hi)__b);
 772 }
 773
 774 /// Compares corresponding signed elements in the 256-bit vectors of
 775 ///    [8 x i32] in \a __a and \a __b for greater-than and returns the
 776 ///    outcomes in the corresponding elements of the 256-bit result.
 777 ///
 778 /// \code{.operation}
 779 /// FOR i := 0 TO 7
 780 ///   j := i*32
 781 ///   result[j+31:j] := (__a[j+31:j] > __b[j+31:j]) ? 0xFFFFFFFF : 0
 782 /// ENDFOR
 783 /// \endcode
 784 ///
 785 /// \headerfile <immintrin.h>
 786 ///
 787 /// This intrinsic corresponds to the \c VPCMPGTD instruction.
 788 ///
 789 /// \param __a
 790 ///    A 256-bit vector of [8 x i32] containing one of the inputs.
 791 /// \param __b
 792 ///    A 256-bit vector of [8 x i32] containing one of the inputs.
 793 /// \returns A 256-bit vector of [8 x i32] containing the result.
 794 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 795 _mm256_cmpgt_epi32(__m256i __a, __m256i __b)
 796 {
 797   return (__m256i)((__v8si)__a > (__v8si)__b);
 798 }
 799
 800 /// Compares corresponding signed elements in the 256-bit vectors of
 801 ///    [4 x i64] in \a __a and \a __b for greater-than and returns the
 802 ///    outcomes in the corresponding elements of the 256-bit result.
 803 ///
 804 /// \code{.operation}
 805 /// FOR i := 0 TO 3
 806 ///   j := i*64
 807 ///   result[j+63:j] := (__a[j+63:j] > __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
 808 /// ENDFOR
 809 /// \endcode
 810 ///
 811 /// \headerfile <immintrin.h>
 812 ///
 813 /// This intrinsic corresponds to the \c VPCMPGTQ instruction.
 814 ///
 815 /// \param __a
 816 ///    A 256-bit vector of [4 x i64] containing one of the inputs.
 817 /// \param __b
 818 ///    A 256-bit vector of [4 x i64] containing one of the inputs.
 819 /// \returns A 256-bit vector of [4 x i64] containing the result.
 820 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 821 _mm256_cmpgt_epi64(__m256i __a, __m256i __b)
 822 {
 823   return (__m256i)((__v4di)__a > (__v4di)__b);
 824 }
 825
 826 /// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
 827 ///    vectors of [16 x i16] and returns the lower 16 bits of each sum in an
 828 ///    element of the [16 x i16] result (overflow is ignored). Sums from
 829 ///    \a __a are returned in the lower 64 bits of each 128-bit half of the
 830 ///    result; sums from \a __b are returned in the upper 64 bits of each
 831 ///    128-bit half of the result.
 832 ///
 833 /// \code{.operation}
 834 /// FOR i := 0 TO 1
 835 ///   j := i*128
 836 ///   result[j+15:j] := __a[j+15:j] + __a[j+31:j+16]
 837 ///   result[j+31:j+16] := __a[j+47:j+32] + __a[j+63:j+48]
 838 ///   result[j+47:j+32] := __a[j+79:j+64] + __a[j+95:j+80]
 839 ///   result[j+63:j+48] := __a[j+111:j+96] + __a[j+127:j+112]
 840 ///   result[j+79:j+64] := __b[j+15:j] + __b[j+31:j+16]
 841 ///   result[j+95:j+80] := __b[j+47:j+32] + __b[j+63:j+48]
 842 ///   result[j+111:j+96] := __b[j+79:j+64] + __b[j+95:j+80]
 843 ///   result[j+127:j+112] := __b[j+111:j+96] + __b[j+127:j+112]
 844 /// ENDFOR
 845 /// \endcode
 846 ///
 847 /// \headerfile <immintrin.h>
 848 ///
 849 /// This intrinsic corresponds to the \c VPHADDW instruction.
 850 ///
 851 /// \param __a
 852 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 853 /// \param __b
 854 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 855 /// \returns A 256-bit vector of [16 x i16] containing the sums.
 856 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 857 _mm256_hadd_epi16(__m256i __a, __m256i __b)
 858 {
 859     return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
 860 }
 861
 862 /// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit
 863 ///    vectors of [8 x i32] and returns the lower 32 bits of each sum in an
 864 ///    element of the [8 x i32] result (overflow is ignored). Sums from \a __a
 865 ///    are returned in the lower 64 bits of each 128-bit half of the result;
 866 ///    sums from \a __b are returned in the upper 64 bits of each 128-bit half
 867 ///    of the result.
 868 ///
 869 /// \code{.operation}
 870 /// FOR i := 0 TO 1
 871 ///   j := i*128
 872 ///   result[j+31:j] := __a[j+31:j] + __a[j+63:j+32]
 873 ///   result[j+63:j+32] := __a[j+95:j+64] + __a[j+127:j+96]
 874 ///   result[j+95:j+64] := __b[j+31:j] + __b[j+63:j+32]
 875 ///   result[j+127:j+96] := __b[j+95:j+64] + __b[j+127:j+96]
 876 /// ENDFOR
 877 /// \endcode
 878 ///
 879 /// \headerfile <immintrin.h>
 880 ///
 881 /// This intrinsic corresponds to the \c VPHADDD instruction.
 882 ///
 883 /// \param __a
 884 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
 885 /// \param __b
 886 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
 887 /// \returns A 256-bit vector of [8 x i32] containing the sums.
 888 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 889 _mm256_hadd_epi32(__m256i __a, __m256i __b)
 890 {
 891     return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
 892 }
 893
 894 /// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
 895 ///    vectors of [16 x i16] using signed saturation and returns each sum in
 896 ///    an element of the [16 x i16] result. Sums from \a __a are returned in
 897 ///    the lower 64 bits of each 128-bit half of the result; sums from \a __b
 898 ///    are returned in the upper 64 bits of each 128-bit half of the result.
 899 ///
 900 /// \code{.operation}
 901 /// FOR i := 0 TO 1
 902 ///   j := i*128
 903 ///   result[j+15:j] := SATURATE16(__a[j+15:j] + __a[j+31:j+16])
 904 ///   result[j+31:j+16] := SATURATE16(__a[j+47:j+32] + __a[j+63:j+48])
 905 ///   result[j+47:j+32] := SATURATE16(__a[j+79:j+64] + __a[j+95:j+80])
 906 ///   result[j+63:j+48] := SATURATE16(__a[j+111:j+96] + __a[j+127:j+112])
 907 ///   result[j+79:j+64] := SATURATE16(__b[j+15:j] + __b[j+31:j+16])
 908 ///   result[j+95:j+80] := SATURATE16(__b[j+47:j+32] + __b[j+63:j+48])
 909 ///   result[j+111:j+96] := SATURATE16(__b[j+79:j+64] + __b[j+95:j+80])
 910 ///   result[j+127:j+112] := SATURATE16(__b[j+111:j+96] + __b[j+127:j+112])
 911 /// ENDFOR
 912 /// \endcode
 913 ///
 914 /// \headerfile <immintrin.h>
 915 ///
 916 /// This intrinsic corresponds to the \c VPHADDSW instruction.
 917 ///
 918 /// \param __a
 919 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 920 /// \param __b
 921 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 922 /// \returns A 256-bit vector of [16 x i16] containing the sums.
 923 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 924 _mm256_hadds_epi16(__m256i __a, __m256i __b)
 925 {
 926     return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
 927 }
 928
 929 /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
 930 ///    vectors of [16 x i16] and returns the lower 16 bits of each difference
 931 ///    in an element of the [16 x i16] result (overflow is ignored).
 932 ///    Differences from \a __a are returned in the lower 64 bits of each
 933 ///    128-bit half of the result; differences from \a __b are returned in the
 934 ///    upper 64 bits of each 128-bit half of the result.
 935 ///
 936 /// \code{.operation}
 937 /// FOR i := 0 TO 1
 938 ///   j := i*128
 939 ///   result[j+15:j] := __a[j+15:j] - __a[j+31:j+16]
 940 ///   result[j+31:j+16] := __a[j+47:j+32] - __a[j+63:j+48]
 941 ///   result[j+47:j+32] := __a[j+79:j+64] - __a[j+95:j+80]
 942 ///   result[j+63:j+48] := __a[j+111:j+96] - __a[j+127:j+112]
 943 ///   result[j+79:j+64] := __b[j+15:j] - __b[j+31:j+16]
 944 ///   result[j+95:j+80] := __b[j+47:j+32] - __b[j+63:j+48]
 945 ///   result[j+111:j+96] := __b[j+79:j+64] - __b[j+95:j+80]
 946 ///   result[j+127:j+112] := __b[j+111:j+96] - __b[j+127:j+112]
 947 /// ENDFOR
 948 /// \endcode
 949 ///
 950 /// \headerfile <immintrin.h>
 951 ///
 952 /// This intrinsic corresponds to the \c VPHSUBW instruction.
 953 ///
 954 /// \param __a
 955 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 956 /// \param __b
 957 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 958 /// \returns A 256-bit vector of [16 x i16] containing the differences.
 959 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 960 _mm256_hsub_epi16(__m256i __a, __m256i __b)
 961 {
 962     return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
 963 }
 964
 965 /// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit
 966 ///    vectors of [8 x i32] and returns the lower 32 bits of each difference in
 967 ///    an element of the [8 x i32] result (overflow is ignored). Differences
 968 ///    from \a __a are returned in the lower 64 bits of each 128-bit half of
 969 ///    the result; differences from \a __b are returned in the upper 64 bits
 970 ///    of each 128-bit half of the result.
 971 ///
 972 /// \code{.operation}
 973 /// FOR i := 0 TO 1
 974 ///   j := i*128
 975 ///   result[j+31:j] := __a[j+31:j] - __a[j+63:j+32]
 976 ///   result[j+63:j+32] := __a[j+95:j+64] - __a[j+127:j+96]
 977 ///   result[j+95:j+64] := __b[j+31:j] - __b[j+63:j+32]
 978 ///   result[j+127:j+96] := __b[j+95:j+64] - __b[j+127:j+96]
 979 /// ENDFOR
 980 /// \endcode
 981 ///
 982 /// \headerfile <immintrin.h>
 983 ///
 984 /// This intrinsic corresponds to the \c VPHSUBD instruction.
 985 ///
 986 /// \param __a
 987 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
 988 /// \param __b
 989 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
 990 /// \returns A 256-bit vector of [8 x i32] containing the differences.
 991 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 992 _mm256_hsub_epi32(__m256i __a, __m256i __b)
 993 {
 994     return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
 995 }
 996
 997 /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
///    vectors of [16 x i16] using signed saturation and returns each
///    difference in an element of the [16 x i16] result. Differences from
///    \a __a are returned in the lower 64 bits of each 128-bit half of the
///    result; differences from \a __b are returned in the upper 64 bits of
///    each 128-bit half of the result.
1003 ///
1004 /// \code{.operation}
1005 /// FOR i := 0 TO 1
1006 ///   j := i*128
1007 ///   result[j+15:j] := SATURATE16(__a[j+15:j] - __a[j+31:j+16])
1008 ///   result[j+31:j+16] := SATURATE16(__a[j+47:j+32] - __a[j+63:j+48])
1009 ///   result[j+47:j+32] := SATURATE16(__a[j+79:j+64] - __a[j+95:j+80])
1010 ///   result[j+63:j+48] := SATURATE16(__a[j+111:j+96] - __a[j+127:j+112])
1011 ///   result[j+79:j+64] := SATURATE16(__b[j+15:j] - __b[j+31:j+16])
1012 ///   result[j+95:j+80] := SATURATE16(__b[j+47:j+32] - __b[j+63:j+48])
1013 ///   result[j+111:j+96] := SATURATE16(__b[j+79:j+64] - __b[j+95:j+80])
1014 ///   result[j+127:j+112] := SATURATE16(__b[j+111:j+96] - __b[j+127:j+112])
1015 /// ENDFOR
1016 /// \endcode
1017 ///
1018 /// \headerfile <immintrin.h>
1019 ///
1020 /// This intrinsic corresponds to the \c VPHSUBSW instruction.
1021 ///
1022 /// \param __a
1023 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1024 /// \param __b
1025 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1026 /// \returns A 256-bit vector of [16 x i16] containing the differences.
1027 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1028 _mm256_hsubs_epi16(__m256i __a, __m256i __b)
1029 {
1030     return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
1031 }
1032
1033 /// Multiplies each unsigned byte from the 256-bit integer vector in \a __a
1034 ///    with the corresponding signed byte from the 256-bit integer vector in
1035 ///    \a __b, forming signed 16-bit intermediate products. Adds adjacent
1036 ///    pairs of those products using signed saturation to form 16-bit sums
1037 ///    returned as elements of the [16 x i16] result.
1038 ///
1039 /// \code{.operation}
1040 /// FOR i := 0 TO 15
1041 ///   j := i*16
1042 ///   temp1 := __a[j+7:j] * __b[j+7:j]
1043 ///   temp2 := __a[j+15:j+8] * __b[j+15:j+8]
1044 ///   result[j+15:j] := SATURATE16(temp1 + temp2)
1045 /// ENDFOR
1046 /// \endcode
1047 ///
1048 /// \headerfile <immintrin.h>
1049 ///
1050 /// This intrinsic corresponds to the \c VPMADDUBSW instruction.
1051 ///
1052 /// \param __a
1053 ///    A 256-bit vector containing one of the source operands.
1054 /// \param __b
1055 ///    A 256-bit vector containing one of the source operands.
1056 /// \returns A 256-bit vector of [16 x i16] containing the result.
1057 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1058 _mm256_maddubs_epi16(__m256i __a, __m256i __b)
1059 {
1060     return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
1061 }
1062
1063 /// Multiplies corresponding 16-bit elements of two 256-bit vectors of
1064 ///    [16 x i16], forming 32-bit intermediate products, and adds pairs of
1065 ///    those products to form 32-bit sums returned as elements of the
1066 ///    [8 x i32] result.
1067 ///
1068 ///    There is only one wraparound case: when all four of the 16-bit sources
1069 ///    are \c 0x8000, the result will be \c 0x80000000.
1070 ///
1071 /// \code{.operation}
1072 /// FOR i := 0 TO 7
1073 ///   j := i*32
1074 ///   temp1 := __a[j+15:j] * __b[j+15:j]
1075 ///   temp2 := __a[j+31:j+16] * __b[j+31:j+16]
1076 ///   result[j+31:j] := temp1 + temp2
1077 /// ENDFOR
1078 /// \endcode
1079 ///
1080 /// \headerfile <immintrin.h>
1081 ///
1082 /// This intrinsic corresponds to the \c VPMADDWD instruction.
1083 ///
1084 /// \param __a
1085 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1086 /// \param __b
1087 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1088 /// \returns A 256-bit vector of [8 x i32] containing the result.
1089 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1090 _mm256_madd_epi16(__m256i __a, __m256i __b)
1091 {
1092   return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
1093 }
1094
1095 /// Compares the corresponding signed bytes in the two 256-bit integer vectors
1096 ///     in \a __a and \a __b and returns the larger of each pair in the
1097 ///     corresponding byte of the 256-bit result.
1098 ///
1099 /// \headerfile <immintrin.h>
1100 ///
1101 /// This intrinsic corresponds to the \c VPMAXSB instruction.
1102 ///
1103 /// \param __a
1104 ///    A 256-bit integer vector.
1105 /// \param __b
1106 ///    A 256-bit integer vector.
1107 /// \returns A 256-bit integer vector containing the result.
1108 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1109 _mm256_max_epi8(__m256i __a, __m256i __b)
1110 {
1111   return (__m256i)__builtin_elementwise_max((__v32qs)__a, (__v32qs)__b);
1112 }
1113
1114 /// Compares the corresponding signed 16-bit integers in the two 256-bit
1115 ///    vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1116 ///    each pair in the corresponding element of the 256-bit result.
1117 ///
1118 /// \headerfile <immintrin.h>
1119 ///
1120 /// This intrinsic corresponds to the \c VPMAXSW instruction.
1121 ///
1122 /// \param __a
1123 ///    A 256-bit vector of [16 x i16].
1124 /// \param __b
1125 ///    A 256-bit vector of [16 x i16].
1126 /// \returns A 256-bit vector of [16 x i16] containing the result.
1127 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1128 _mm256_max_epi16(__m256i __a, __m256i __b)
1129 {
1130   return (__m256i)__builtin_elementwise_max((__v16hi)__a, (__v16hi)__b);
1131 }
1132
1133 /// Compares the corresponding signed 32-bit integers in the two 256-bit
1134 ///    vectors of [8 x i32] in \a __a and \a __b and returns the larger of
1135 ///    each pair in the corresponding element of the 256-bit result.
1136 ///
1137 /// \headerfile <immintrin.h>
1138 ///
1139 /// This intrinsic corresponds to the \c VPMAXSD instruction.
1140 ///
1141 /// \param __a
1142 ///    A 256-bit vector of [8 x i32].
1143 /// \param __b
1144 ///    A 256-bit vector of [8 x i32].
1145 /// \returns A 256-bit vector of [8 x i32] containing the result.
1146 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1147 _mm256_max_epi32(__m256i __a, __m256i __b)
1148 {
1149   return (__m256i)__builtin_elementwise_max((__v8si)__a, (__v8si)__b);
1150 }
1151
1152 /// Compares the corresponding unsigned bytes in the two 256-bit integer
1153 ///     vectors in \a __a and \a __b and returns the larger of each pair in
1154 ///     the corresponding byte of the 256-bit result.
1155 ///
1156 /// \headerfile <immintrin.h>
1157 ///
1158 /// This intrinsic corresponds to the \c VPMAXUB instruction.
1159 ///
1160 /// \param __a
1161 ///    A 256-bit integer vector.
1162 /// \param __b
1163 ///    A 256-bit integer vector.
1164 /// \returns A 256-bit integer vector containing the result.
1165 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1166 _mm256_max_epu8(__m256i __a, __m256i __b)
1167 {
1168   return (__m256i)__builtin_elementwise_max((__v32qu)__a, (__v32qu)__b);
1169 }
1170
1171 /// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1172 ///    vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1173 ///    each pair in the corresponding element of the 256-bit result.
1174 ///
1175 /// \headerfile <immintrin.h>
1176 ///
1177 /// This intrinsic corresponds to the \c VPMAXUW instruction.
1178 ///
1179 /// \param __a
1180 ///    A 256-bit vector of [16 x i16].
1181 /// \param __b
1182 ///    A 256-bit vector of [16 x i16].
1183 /// \returns A 256-bit vector of [16 x i16] containing the result.
1184 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1185 _mm256_max_epu16(__m256i __a, __m256i __b)
1186 {
1187   return (__m256i)__builtin_elementwise_max((__v16hu)__a, (__v16hu)__b);
1188 }
1189
1190 /// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1191 ///    vectors of [8 x i32] in \a __a and \a __b and returns the larger of
1192 ///    each pair in the corresponding element of the 256-bit result.
1193 ///
1194 /// \headerfile <immintrin.h>
1195 ///
1196 /// This intrinsic corresponds to the \c VPMAXUD instruction.
1197 ///
1198 /// \param __a
1199 ///    A 256-bit vector of [8 x i32].
1200 /// \param __b
1201 ///    A 256-bit vector of [8 x i32].
1202 /// \returns A 256-bit vector of [8 x i32] containing the result.
1203 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1204 _mm256_max_epu32(__m256i __a, __m256i __b)
1205 {
1206   return (__m256i)__builtin_elementwise_max((__v8su)__a, (__v8su)__b);
1207 }
1208
1209 /// Compares the corresponding signed bytes in the two 256-bit integer vectors
1210 ///     in \a __a and \a __b and returns the smaller of each pair in the
1211 ///     corresponding byte of the 256-bit result.
1212 ///
1213 /// \headerfile <immintrin.h>
1214 ///
1215 /// This intrinsic corresponds to the \c VPMINSB instruction.
1216 ///
1217 /// \param __a
1218 ///    A 256-bit integer vector.
1219 /// \param __b
1220 ///    A 256-bit integer vector.
1221 /// \returns A 256-bit integer vector containing the result.
1222 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1223 _mm256_min_epi8(__m256i __a, __m256i __b)
1224 {
1225   return (__m256i)__builtin_elementwise_min((__v32qs)__a, (__v32qs)__b);
1226 }
1227
1228 /// Compares the corresponding signed 16-bit integers in the two 256-bit
1229 ///    vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1230 ///    each pair in the corresponding element of the 256-bit result.
1231 ///
1232 /// \headerfile <immintrin.h>
1233 ///
1234 /// This intrinsic corresponds to the \c VPMINSW instruction.
1235 ///
1236 /// \param __a
1237 ///    A 256-bit vector of [16 x i16].
1238 /// \param __b
1239 ///    A 256-bit vector of [16 x i16].
1240 /// \returns A 256-bit vector of [16 x i16] containing the result.
1241 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1242 _mm256_min_epi16(__m256i __a, __m256i __b)
1243 {
1244   return (__m256i)__builtin_elementwise_min((__v16hi)__a, (__v16hi)__b);
1245 }
1246
1247 /// Compares the corresponding signed 32-bit integers in the two 256-bit
1248 ///    vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
1249 ///    each pair in the corresponding element of the 256-bit result.
1250 ///
1251 /// \headerfile <immintrin.h>
1252 ///
1253 /// This intrinsic corresponds to the \c VPMINSD instruction.
1254 ///
1255 /// \param __a
1256 ///    A 256-bit vector of [8 x i32].
1257 /// \param __b
1258 ///    A 256-bit vector of [8 x i32].
1259 /// \returns A 256-bit vector of [8 x i32] containing the result.
1260 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1261 _mm256_min_epi32(__m256i __a, __m256i __b)
1262 {
1263   return (__m256i)__builtin_elementwise_min((__v8si)__a, (__v8si)__b);
1264 }
1265
1266 /// Compares the corresponding unsigned bytes in the two 256-bit integer
1267 ///     vectors in \a __a and \a __b and returns the smaller of each pair in
1268 ///     the corresponding byte of the 256-bit result.
1269 ///
1270 /// \headerfile <immintrin.h>
1271 ///
1272 /// This intrinsic corresponds to the \c VPMINUB instruction.
1273 ///
1274 /// \param __a
1275 ///    A 256-bit integer vector.
1276 /// \param __b
1277 ///    A 256-bit integer vector.
1278 /// \returns A 256-bit integer vector containing the result.
1279 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1280 _mm256_min_epu8(__m256i __a, __m256i __b)
1281 {
1282   return (__m256i)__builtin_elementwise_min((__v32qu)__a, (__v32qu)__b);
1283 }
1284
1285 /// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1286 ///    vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1287 ///    each pair in the corresponding element of the 256-bit result.
1288 ///
1289 /// \headerfile <immintrin.h>
1290 ///
1291 /// This intrinsic corresponds to the \c VPMINUW instruction.
1292 ///
1293 /// \param __a
1294 ///    A 256-bit vector of [16 x i16].
1295 /// \param __b
1296 ///    A 256-bit vector of [16 x i16].
1297 /// \returns A 256-bit vector of [16 x i16] containing the result.
1298 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1299 _mm256_min_epu16(__m256i __a, __m256i __b)
1300 {
1301   return (__m256i)__builtin_elementwise_min((__v16hu)__a, (__v16hu)__b);
1302 }
1303
1304 /// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1305 ///    vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
1306 ///    each pair in the corresponding element of the 256-bit result.
1307 ///
1308 /// \headerfile <immintrin.h>
1309 ///
1310 /// This intrinsic corresponds to the \c VPMINUD instruction.
1311 ///
1312 /// \param __a
1313 ///    A 256-bit vector of [8 x i32].
1314 /// \param __b
1315 ///    A 256-bit vector of [8 x i32].
1316 /// \returns A 256-bit vector of [8 x i32] containing the result.
1317 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1318 _mm256_min_epu32(__m256i __a, __m256i __b)
1319 {
1320   return (__m256i)__builtin_elementwise_min((__v8su)__a, (__v8su)__b);
1321 }
1322
1323 /// Creates a 32-bit integer mask from the most significant bit of each byte
1324 ///    in the 256-bit integer vector in \a __a and returns the result.
1325 ///
1326 /// \code{.operation}
1327 /// FOR i := 0 TO 31
1328 ///   j := i*8
1329 ///   result[i] := __a[j+7]
1330 /// ENDFOR
1331 /// \endcode
1332 ///
1333 /// \headerfile <immintrin.h>
1334 ///
1335 /// This intrinsic corresponds to the \c VPMOVMSKB instruction.
1336 ///
1337 /// \param __a
1338 ///    A 256-bit integer vector containing the source bytes.
1339 /// \returns The 32-bit integer mask.
1340 static __inline__ int __DEFAULT_FN_ATTRS256
1341 _mm256_movemask_epi8(__m256i __a)
1342 {
1343   return __builtin_ia32_pmovmskb256((__v32qi)__a);
1344 }
1345
1346 /// Sign-extends bytes from the 128-bit integer vector in \a __V and returns
1347 ///    the 16-bit values in the corresponding elements of a 256-bit vector
1348 ///    of [16 x i16].
1349 ///
1350 /// \code{.operation}
1351 /// FOR i := 0 TO 15
1352 ///   j := i*8
1353 ///   k := i*16
1354 ///   result[k+15:k] := SignExtend(__V[j+7:j])
1355 /// ENDFOR
1356 /// \endcode
1357 ///
1358 /// \headerfile <immintrin.h>
1359 ///
1360 /// This intrinsic corresponds to the \c VPMOVSXBW instruction.
1361 ///
1362 /// \param __V
1363 ///    A 128-bit integer vector containing the source bytes.
1364 /// \returns A 256-bit vector of [16 x i16] containing the sign-extended
1365 ///    values.
1366 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1367 _mm256_cvtepi8_epi16(__m128i __V)
1368 {
1369   /* This function always performs a signed extension, but __v16qi is a char
1370      which may be signed or unsigned, so use __v16qs. */
1371   return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi);
1372 }
1373
1374 /// Sign-extends bytes from the lower half of the 128-bit integer vector in
1375 ///    \a __V and returns the 32-bit values in the corresponding elements of a
1376 ///    256-bit vector of [8 x i32].
1377 ///
1378 /// \code{.operation}
1379 /// FOR i := 0 TO 7
1380 ///   j := i*8
1381 ///   k := i*32
1382 ///   result[k+31:k] := SignExtend(__V[j+7:j])
1383 /// ENDFOR
1384 /// \endcode
1385 ///
1386 /// \headerfile <immintrin.h>
1387 ///
1388 /// This intrinsic corresponds to the \c VPMOVSXBD instruction.
1389 ///
1390 /// \param __V
1391 ///    A 128-bit integer vector containing the source bytes.
1392 /// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1393 ///    values.
1394 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1395 _mm256_cvtepi8_epi32(__m128i __V)
1396 {
1397   /* This function always performs a signed extension, but __v16qi is a char
1398      which may be signed or unsigned, so use __v16qs. */
1399   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
1400 }
1401
1402 /// Sign-extends the first four bytes from the 128-bit integer vector in
1403 ///    \a __V and returns the 64-bit values in the corresponding elements of a
1404 ///    256-bit vector of [4 x i64].
1405 ///
1406 /// \code{.operation}
1407 /// result[63:0] := SignExtend(__V[7:0])
1408 /// result[127:64] := SignExtend(__V[15:8])
1409 /// result[191:128] := SignExtend(__V[23:16])
1410 /// result[255:192] := SignExtend(__V[31:24])
1411 /// \endcode
1412 ///
1413 /// \headerfile <immintrin.h>
1414 ///
1415 /// This intrinsic corresponds to the \c VPMOVSXBQ instruction.
1416 ///
1417 /// \param __V
1418 ///    A 128-bit integer vector containing the source bytes.
1419 /// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1420 ///    values.
1421 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1422 _mm256_cvtepi8_epi64(__m128i __V)
1423 {
1424   /* This function always performs a signed extension, but __v16qi is a char
1425      which may be signed or unsigned, so use __v16qs. */
1426   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di);
1427 }
1428
1429 /// Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1430 ///    \a __V and returns the 32-bit values in the corresponding elements of a
1431 ///    256-bit vector of [8 x i32].
1432 ///
1433 /// \code{.operation}
1434 /// FOR i := 0 TO 7
1435 ///   j := i*16
1436 ///   k := i*32
1437 ///   result[k+31:k] := SignExtend(__V[j+15:j])
1438 /// ENDFOR
1439 /// \endcode
1440 ///
1441 /// \headerfile <immintrin.h>
1442 ///
1443 /// This intrinsic corresponds to the \c VPMOVSXWD instruction.
1444 ///
1445 /// \param __V
1446 ///    A 128-bit vector of [8 x i16] containing the source values.
1447 /// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1448 ///    values.
1449 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1450 _mm256_cvtepi16_epi32(__m128i __V)
1451 {
1452   return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si);
1453 }
1454
1455 /// Sign-extends 16-bit elements from the lower half of the 128-bit vector of
1456 ///    [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1457 ///    elements of a 256-bit vector of [4 x i64].
1458 ///
1459 /// \code{.operation}
1460 /// result[63:0] := SignExtend(__V[15:0])
1461 /// result[127:64] := SignExtend(__V[31:16])
1462 /// result[191:128] := SignExtend(__V[47:32])
/// result[255:192] := SignExtend(__V[63:48])
1464 /// \endcode
1465 ///
1466 /// \headerfile <immintrin.h>
1467 ///
1468 /// This intrinsic corresponds to the \c VPMOVSXWQ instruction.
1469 ///
1470 /// \param __V
1471 ///    A 128-bit vector of [8 x i16] containing the source values.
1472 /// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1473 ///    values.
1474 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1475 _mm256_cvtepi16_epi64(__m128i __V)
1476 {
1477   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di);
1478 }
1479
1480 /// Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1481 ///    \a __V and returns the 64-bit values in the corresponding elements of a
1482 ///    256-bit vector of [4 x i64].
1483 ///
1484 /// \code{.operation}
1485 /// result[63:0] := SignExtend(__V[31:0])
1486 /// result[127:64] := SignExtend(__V[63:32])
1487 /// result[191:128] := SignExtend(__V[95:64])
1488 /// result[255:192] := SignExtend(__V[127:96])
1489 /// \endcode
1490 ///
1491 /// \headerfile <immintrin.h>
1492 ///
1493 /// This intrinsic corresponds to the \c VPMOVSXDQ instruction.
1494 ///
1495 /// \param __V
1496 ///    A 128-bit vector of [4 x i32] containing the source values.
1497 /// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1498 ///    values.
1499 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1500 _mm256_cvtepi32_epi64(__m128i __V)
1501 {
1502   return (__m256i)__builtin_convertvector((__v4si)__V, __v4di);
1503 }
1504
1505 /// Zero-extends bytes from the 128-bit integer vector in \a __V and returns
1506 ///    the 16-bit values in the corresponding elements of a 256-bit vector
1507 ///    of [16 x i16].
1508 ///
1509 /// \code{.operation}
1510 /// FOR i := 0 TO 15
1511 ///   j := i*8
1512 ///   k := i*16
1513 ///   result[k+15:k] := ZeroExtend(__V[j+7:j])
1514 /// ENDFOR
1515 /// \endcode
1516 ///
1517 /// \headerfile <immintrin.h>
1518 ///
1519 /// This intrinsic corresponds to the \c VPMOVZXBW instruction.
1520 ///
1521 /// \param __V
1522 ///    A 128-bit integer vector containing the source bytes.
1523 /// \returns A 256-bit vector of [16 x i16] containing the zero-extended
1524 ///    values.
1525 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1526 _mm256_cvtepu8_epi16(__m128i __V)
1527 {
1528   return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi);
1529 }
1530
1531 /// Zero-extends bytes from the lower half of the 128-bit integer vector in
1532 ///    \a __V and returns the 32-bit values in the corresponding elements of a
1533 ///    256-bit vector of [8 x i32].
1534 ///
1535 /// \code{.operation}
1536 /// FOR i := 0 TO 7
1537 ///   j := i*8
1538 ///   k := i*32
1539 ///   result[k+31:k] := ZeroExtend(__V[j+7:j])
1540 /// ENDFOR
1541 /// \endcode
1542 ///
1543 /// \headerfile <immintrin.h>
1544 ///
1545 /// This intrinsic corresponds to the \c VPMOVZXBD instruction.
1546 ///
1547 /// \param __V
1548 ///    A 128-bit integer vector containing the source bytes.
1549 /// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1550 ///    values.
1551 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1552 _mm256_cvtepu8_epi32(__m128i __V)
1553 {
1554   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
1555 }
1556
1557 /// Zero-extends the first four bytes from the 128-bit integer vector in
1558 ///    \a __V and returns the 64-bit values in the corresponding elements of a
1559 ///    256-bit vector of [4 x i64].
1560 ///
1561 /// \code{.operation}
1562 /// result[63:0] := ZeroExtend(__V[7:0])
1563 /// result[127:64] := ZeroExtend(__V[15:8])
1564 /// result[191:128] := ZeroExtend(__V[23:16])
1565 /// result[255:192] := ZeroExtend(__V[31:24])
1566 /// \endcode
1567 ///
1568 /// \headerfile <immintrin.h>
1569 ///
1570 /// This intrinsic corresponds to the \c VPMOVZXBQ instruction.
1571 ///
1572 /// \param __V
1573 ///    A 128-bit integer vector containing the source bytes.
1574 /// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1575 ///    values.
1576 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1577 _mm256_cvtepu8_epi64(__m128i __V)
1578 {
1579   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di);
1580 }
1581
1582 /// Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1583 ///    \a __V and returns the 32-bit values in the corresponding elements of a
1584 ///    256-bit vector of [8 x i32].
1585 ///
1586 /// \code{.operation}
1587 /// FOR i := 0 TO 7
1588 ///   j := i*16
1589 ///   k := i*32
1590 ///   result[k+31:k] := ZeroExtend(__V[j+15:j])
1591 /// ENDFOR
1592 /// \endcode
1593 ///
1594 /// \headerfile <immintrin.h>
1595 ///
1596 /// This intrinsic corresponds to the \c VPMOVZXWD instruction.
1597 ///
1598 /// \param __V
1599 ///    A 128-bit vector of [8 x i16] containing the source values.
1600 /// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1601 ///    values.
1602 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1603 _mm256_cvtepu16_epi32(__m128i __V)
1604 {
1605   return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si);
1606 }
1607
1608 /// Zero-extends 16-bit elements from the lower half of the 128-bit vector of
1609 ///    [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1610 ///    elements of a 256-bit vector of [4 x i64].
1611 ///
1612 /// \code{.operation}
1613 /// result[63:0] := ZeroExtend(__V[15:0])
1614 /// result[127:64] := ZeroExtend(__V[31:16])
1615 /// result[191:128] := ZeroExtend(__V[47:32])
1616 /// result[255:192] := ZeroExtend(__V[64:48])
1617 /// \endcode
1618 ///
1619 /// \headerfile <immintrin.h>
1620 ///
1621 /// This intrinsic corresponds to the \c VPMOVSXWQ instruction.
1622 ///
1623 /// \param __V
1624 ///    A 128-bit vector of [8 x i16] containing the source values.
1625 /// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1626 ///    values.
1627 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1628 _mm256_cvtepu16_epi64(__m128i __V)
1629 {
1630   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di);
1631 }
1632
1633 /// Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1634 ///    \a __V and returns the 64-bit values in the corresponding elements of a
1635 ///    256-bit vector of [4 x i64].
1636 ///
1637 /// \code{.operation}
1638 /// result[63:0] := ZeroExtend(__V[31:0])
1639 /// result[127:64] := ZeroExtend(__V[63:32])
1640 /// result[191:128] := ZeroExtend(__V[95:64])
1641 /// result[255:192] := ZeroExtend(__V[127:96])
1642 /// \endcode
1643 ///
1644 /// \headerfile <immintrin.h>
1645 ///
1646 /// This intrinsic corresponds to the \c VPMOVZXDQ instruction.
1647 ///
1648 /// \param __V
1649 ///    A 128-bit vector of [4 x i32] containing the source values.
1650 /// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1651 ///    values.
1652 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1653 _mm256_cvtepu32_epi64(__m128i __V)
1654 {
1655   return (__m256i)__builtin_convertvector((__v4su)__V, __v4di);
1656 }
1657
1658 /// Multiplies signed 32-bit integers from even-numbered elements of two
1659 ///    256-bit vectors of [8 x i32] and returns the 64-bit products in the
1660 ///    [4 x i64] result.
1661 ///
1662 /// \code{.operation}
1663 /// result[63:0] := __a[31:0] * __b[31:0]
1664 /// result[127:64] := __a[95:64] * __b[95:64]
1665 /// result[191:128] := __a[159:128] * __b[159:128]
1666 /// result[255:192] := __a[223:192] * __b[223:192]
1667 /// \endcode
1668 ///
1669 /// \headerfile <immintrin.h>
1670 ///
1671 /// This intrinsic corresponds to the \c VPMULDQ instruction.
1672 ///
1673 /// \param __a
1674 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
1675 /// \param __b
1676 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
1677 /// \returns A 256-bit vector of [4 x i64] containing the products.
1678 static __inline__  __m256i __DEFAULT_FN_ATTRS256
1679 _mm256_mul_epi32(__m256i __a, __m256i __b)
1680 {
1681   return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b);
1682 }
1683
1684 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1685 ///    [16 x i16], truncates the 32-bit results to the most significant 18
1686 ///    bits, rounds by adding 1, and returns bits [16:1] of each rounded
1687 ///    product in the [16 x i16] result.
1688 ///
1689 /// \code{.operation}
1690 /// FOR i := 0 TO 15
1691 ///   j := i*16
1692 ///   temp := ((__a[j+15:j] * __b[j+15:j]) >> 14) + 1
1693 ///   result[j+15:j] := temp[16:1]
1694 /// \endcode
1695 ///
1696 /// \headerfile <immintrin.h>
1697 ///
1698 /// This intrinsic corresponds to the \c VPMULHRSW instruction.
1699 ///
1700 /// \param __a
1701 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1702 /// \param __b
1703 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1704 /// \returns A 256-bit vector of [16 x i16] containing the rounded products.
1705 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1706 _mm256_mulhrs_epi16(__m256i __a, __m256i __b)
1707 {
1708   return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b);
1709 }
1710
1711 /// Multiplies unsigned 16-bit integer elements of two 256-bit vectors of
1712 ///    [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1713 ///    [16 x i16] result.
1714 ///
1715 /// \headerfile <immintrin.h>
1716 ///
1717 /// This intrinsic corresponds to the \c VPMULHUW instruction.
1718 ///
1719 /// \param __a
1720 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1721 /// \param __b
1722 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1723 /// \returns A 256-bit vector of [16 x i16] containing the products.
1724 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1725 _mm256_mulhi_epu16(__m256i __a, __m256i __b)
1726 {
1727   return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b);
1728 }
1729
1730 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1731 ///    [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1732 ///    [16 x i16] result.
1733 ///
1734 /// \headerfile <immintrin.h>
1735 ///
1736 /// This intrinsic corresponds to the \c VPMULHW instruction.
1737 ///
1738 /// \param __a
1739 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1740 /// \param __b
1741 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1742 /// \returns A 256-bit vector of [16 x i16] containing the products.
1743 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1744 _mm256_mulhi_epi16(__m256i __a, __m256i __b)
1745 {
1746   return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b);
1747 }
1748
1749 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1750 ///    [16 x i16], and returns the lower 16 bits of each 32-bit product in the
1751 ///    [16 x i16] result.
1752 ///
1753 /// \headerfile <immintrin.h>
1754 ///
1755 /// This intrinsic corresponds to the \c VPMULLW instruction.
1756 ///
1757 /// \param __a
1758 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1759 /// \param __b
1760 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1761 /// \returns A 256-bit vector of [16 x i16] containing the products.
1762 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1763 _mm256_mullo_epi16(__m256i __a, __m256i __b)
1764 {
1765   return (__m256i)((__v16hu)__a * (__v16hu)__b);
1766 }
1767
1768 /// Multiplies signed 32-bit integer elements of two 256-bit vectors of
1769 ///    [8 x i32], and returns the lower 32 bits of each 64-bit product in the
1770 ///    [8 x i32] result.
1771 ///
1772 /// \headerfile <immintrin.h>
1773 ///
1774 /// This intrinsic corresponds to the \c VPMULLD instruction.
1775 ///
1776 /// \param __a
1777 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
1778 /// \param __b
1779 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
1780 /// \returns A 256-bit vector of [8 x i32] containing the products.
1781 static __inline__  __m256i __DEFAULT_FN_ATTRS256
1782 _mm256_mullo_epi32 (__m256i __a, __m256i __b)
1783 {
1784   return (__m256i)((__v8su)__a * (__v8su)__b);
1785 }
1786
1787 /// Multiplies unsigned 32-bit integers from even-numered elements of two
1788 ///    256-bit vectors of [8 x i32] and returns the 64-bit products in the
1789 ///    [4 x i64] result.
1790 ///
1791 /// \code{.operation}
1792 /// result[63:0] := __a[31:0] * __b[31:0]
1793 /// result[127:64] := __a[95:64] * __b[95:64]
1794 /// result[191:128] := __a[159:128] * __b[159:128]
1795 /// result[255:192] := __a[223:192] * __b[223:192]
1796 /// \endcode
1797 ///
1798 /// \headerfile <immintrin.h>
1799 ///
1800 /// This intrinsic corresponds to the \c VPMULUDQ instruction.
1801 ///
1802 /// \param __a
1803 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
1804 /// \param __b
1805 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
1806 /// \returns A 256-bit vector of [4 x i64] containing the products.
1807 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1808 _mm256_mul_epu32(__m256i __a, __m256i __b)
1809 {
1810   return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b);
1811 }
1812
1813 /// Computes the bitwise OR of the 256-bit integer vectors in \a __a and
1814 ///    \a __b.
1815 ///
1816 /// \headerfile <immintrin.h>
1817 ///
1818 /// This intrinsic corresponds to the \c VPOR instruction.
1819 ///
1820 /// \param __a
1821 ///    A 256-bit integer vector.
1822 /// \param __b
1823 ///    A 256-bit integer vector.
1824 /// \returns A 256-bit integer vector containing the result.
1825 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1826 _mm256_or_si256(__m256i __a, __m256i __b)
1827 {
1828   return (__m256i)((__v4du)__a | (__v4du)__b);
1829 }
1830
1831 /// Computes four sum of absolute difference (SAD) operations on sets of eight
1832 ///    unsigned 8-bit integers from the 256-bit integer vectors \a __a and
1833 ///    \a __b.
1834 ///
1835 ///    One SAD result is computed for each set of eight bytes from \a __a and
1836 ///    eight bytes from \a __b. The zero-extended SAD value is returned in the
1837 ///    corresponding 64-bit element of the result.
1838 ///
1839 ///    A single SAD operation takes the differences between the corresponding
1840 ///    bytes of \a __a and \a __b, takes the absolute value of each difference,
1841 ///    and sums these eight values to form one 16-bit result. This operation
1842 ///    is repeated four times with successive sets of eight bytes.
1843 ///
1844 /// \code{.operation}
1845 /// FOR i := 0 TO 3
1846 ///   j := i*64
1847 ///   temp0 := ABS(__a[j+7:j] - __b[j+7:j])
1848 ///   temp1 := ABS(__a[j+15:j+8] - __b[j+15:j+8])
1849 ///   temp2 := ABS(__a[j+23:j+16] - __b[j+23:j+16])
1850 ///   temp3 := ABS(__a[j+31:j+24] - __b[j+31:j+24])
1851 ///   temp4 := ABS(__a[j+39:j+32] - __b[j+39:j+32])
1852 ///   temp5 := ABS(__a[j+47:j+40] - __b[j+47:j+40])
1853 ///   temp6 := ABS(__a[j+55:j+48] - __b[j+55:j+48])
1854 ///   temp7 := ABS(__a[j+63:j+56] - __b[j+63:j+56])
1855 ///   result[j+15:j] := temp0 + temp1 + temp2 + temp3 +
1856 ///                     temp4 + temp5 + temp6 + temp7
1857 ///   result[j+63:j+16] := 0
1858 /// ENDFOR
1859 /// \endcode
1860 ///
1861 /// \headerfile <immintrin.h>
1862 ///
1863 /// This intrinsic corresponds to the \c VPSADBW instruction.
1864 ///
1865 /// \param __a
1866 ///    A 256-bit integer vector.
1867 /// \param __b
1868 ///    A 256-bit integer vector.
1869 /// \returns A 256-bit integer vector containing the result.
1870 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1871 _mm256_sad_epu8(__m256i __a, __m256i __b)
1872 {
1873   return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b);
1874 }
1875
1876 /// Shuffles 8-bit integers in the 256-bit integer vector \a __a according
1877 ///    to control information in the 256-bit integer vector \a __b, and
1878 ///    returns the 256-bit result. In effect there are two separate 128-bit
1879 ///    shuffles in the lower and upper halves.
1880 ///
1881 /// \code{.operation}
1882 /// FOR i := 0 TO 31
1883 ///   j := i*8
1884 ///   IF __b[j+7] == 1
1885 ///     result[j+7:j] := 0
1886 ///   ELSE
1887 ///     k := __b[j+3:j] * 8
1888 ///     IF i > 15
1889 ///       k := k + 128
1890 ///     FI
1891 ///     result[j+7:j] := __a[k+7:k]
1892 ///   FI
1893 /// ENDFOR
1894 /// \endcode
1895 ///
1896 /// \headerfile <immintrin.h>
1897 ///
1898 /// This intrinsic corresponds to the \c VPSHUFB instruction.
1899 ///
1900 /// \param __a
1901 ///    A 256-bit integer vector containing source values.
1902 /// \param __b
1903 ///    A 256-bit integer vector containing control information to determine
1904 ///    what goes into the corresponding byte of the result. If bit 7 of the
1905 ///    control byte is 1, the result byte is 0; otherwise, bits 3:0 of the
1906 ///    control byte specify the index (within the same 128-bit half) of \a __a
1907 ///    to copy to the result byte.
1908 /// \returns A 256-bit integer vector containing the result.
1909 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1910 _mm256_shuffle_epi8(__m256i __a, __m256i __b)
1911 {
1912   return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
1913 }
1914
/// Shuffles 32-bit integers from the 256-bit vector of [8 x i32] in \a a
///    according to control information in the integer literal \a imm, and
///    returns the 256-bit result. In effect there are two parallel 128-bit
///    shuffles in the lower and upper halves.
///
/// \code{.operation}
/// FOR i := 0 to 3
///   j := i*32
///   k := (imm >> i*2)[1:0] * 32
///   result[j+31:j] := a[k+31:k]
///   result[128+j+31:128+j] := a[128+k+31:128+k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_shuffle_epi32(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSHUFD instruction.
///
/// \param a
///    A 256-bit vector of [8 x i32] containing source values.
/// \param imm
///    An immediate 8-bit value specifying which elements to copy from \a a.
///    \a imm[1:0] specifies the index in \a a for elements 0 and 4 of the
///    result, \a imm[3:2] specifies the index for elements 1 and 5, and so
///    forth.
/// \returns A 256-bit vector of [8 x i32] containing the result.
#define _mm256_shuffle_epi32(a, imm) \
  ((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm)))
1947
/// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a
///    according to control information in the integer literal \a imm, and
///    returns the 256-bit result. The upper 64 bits of each 128-bit half
///    are shuffled in parallel; the lower 64 bits of each 128-bit half are
///    copied from \a a unchanged.
///
/// \code{.operation}
/// result[63:0] := a[63:0]
/// result[191:128] := a[191:128]
/// FOR i := 0 TO 3
///   j := i * 16 + 64
///   k := (imm >> i*2)[1:0] * 16 + 64
///   result[j+15:j] := a[k+15:k]
///   result[128+j+15:128+j] := a[128+k+15:128+k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_shufflehi_epi16(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSHUFHW instruction.
///
/// \param a
///    A 256-bit vector of [16 x i16] containing source values.
/// \param imm
///    An immediate 8-bit value specifying which elements to copy from \a a.
///    \a imm[1:0] specifies the index in \a a for elements 4 and 12 of the
///    result, \a imm[3:2] specifies the index for elements 5 and 13, and so
///    forth. Indexes are offset by 4 (so 0 means index 4, and so forth).
/// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_shufflehi_epi16(a, imm) \
  ((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm)))
1983
1984 /// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] \a a
1985 ///    according to control information in the integer literal \a imm, and
1986 ///    returns the 256-bit [16 x i16] result. The lower 64 bits of each
1987 ///    128-bit half are shuffled; the upper 64 bits of each 128-bit half are
1988 ///    copied from \a a unchanged.
1989 ///
1990 /// \code{.operation}
1991 /// result[127:64] := a[127:64]
1992 /// result[255:192] := a[255:192]
1993 /// FOR i := 0 TO 3
1994 ///   j := i * 16
1995 ///   k := (imm >> i*2)[1:0] * 16
1996 ///   result[j+15:j] := a[k+15:k]
1997 ///   result[128+j+15:128+j] := a[128+k+15:128+k]
1998 /// ENDFOR
1999 /// \endcode
2000 ///
2001 /// \headerfile <immintrin.h>
2002 ///
2003 /// \code
2004 /// __m256i _mm256_shufflelo_epi16(__m256i a, const int imm);
2005 /// \endcode
2006 ///
2007 /// This intrinsic corresponds to the \c VPSHUFLW instruction.
2008 ///
2009 /// \param a
2010 ///    A 256-bit vector of [16 x i16] to use as a source of data for the
2011 ///    result.
2012 /// \param imm
2013 ///    An immediate 8-bit value specifying which elements to copy from \a a.
2014 ///    \a imm[1:0] specifies the index in \a a for elements 0 and 8 of the
2015 ///    result, \a imm[3:2] specifies the index for elements 1 and 9, and so
2016 ///    forth.
2017 /// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_shufflelo_epi16(a, imm)                                         \
  ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm)))
2020
2021 /// Sets each byte of the result to the corresponding byte of the 256-bit
2022 ///    integer vector in \a __a, the negative of that byte, or zero, depending
2023 ///    on whether the corresponding byte of the 256-bit integer vector in
2024 ///    \a __b is greater than zero, less than zero, or equal to zero,
2025 ///    respectively.
2026 ///
2027 /// \headerfile <immintrin.h>
2028 ///
2029 /// This intrinsic corresponds to the \c VPSIGNB instruction.
2030 ///
2031 /// \param __a
2032 ///    A 256-bit integer vector.
2033 /// \param __b
///    A 256-bit integer vector.
2035 /// \returns A 256-bit integer vector containing the result.
2036 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2037 _mm256_sign_epi8(__m256i __a, __m256i __b)
2038 {
2039     return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b);
2040 }
2041
2042 /// Sets each element of the result to the corresponding element of the
2043 ///    256-bit vector of [16 x i16] in \a __a, the negative of that element,
2044 ///    or zero, depending on whether the corresponding element of the 256-bit
2045 ///    vector of [16 x i16] in \a __b is greater than zero, less than zero, or
2046 ///    equal to zero, respectively.
2047 ///
2048 /// \headerfile <immintrin.h>
2049 ///
2050 /// This intrinsic corresponds to the \c VPSIGNW instruction.
2051 ///
2052 /// \param __a
2053 ///    A 256-bit vector of [16 x i16].
2054 /// \param __b
2055 ///    A 256-bit vector of [16 x i16].
2056 /// \returns A 256-bit vector of [16 x i16] containing the result.
2057 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2058 _mm256_sign_epi16(__m256i __a, __m256i __b)
2059 {
2060     return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);
2061 }
2062
2063 /// Sets each element of the result to the corresponding element of the
2064 ///    256-bit vector of [8 x i32] in \a __a, the negative of that element, or
2065 ///    zero, depending on whether the corresponding element of the 256-bit
2066 ///    vector of [8 x i32] in \a __b is greater than zero, less than zero, or
2067 ///    equal to zero, respectively.
2068 ///
2069 /// \headerfile <immintrin.h>
2070 ///
2071 /// This intrinsic corresponds to the \c VPSIGND instruction.
2072 ///
2073 /// \param __a
2074 ///    A 256-bit vector of [8 x i32].
2075 /// \param __b
2076 ///    A 256-bit vector of [8 x i32].
2077 /// \returns A 256-bit vector of [8 x i32] containing the result.
2078 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2079 _mm256_sign_epi32(__m256i __a, __m256i __b)
2080 {
2081     return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
2082 }
2083
2084 /// Shifts each 128-bit half of the 256-bit integer vector \a a left by
2085 ///    \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
2086 ///    is greater than 15, the returned result is all zeroes.
2087 ///
2088 /// \headerfile <immintrin.h>
2089 ///
2090 /// \code
2091 /// __m256i _mm256_slli_si256(__m256i a, const int imm);
2092 /// \endcode
2093 ///
2094 /// This intrinsic corresponds to the \c VPSLLDQ instruction.
2095 ///
2096 /// \param a
2097 ///    A 256-bit integer vector to be shifted.
2098 /// \param imm
///    An unsigned immediate value specifying the shift count (in bytes).
2100 /// \returns A 256-bit integer vector containing the result.
#define _mm256_slli_si256(a, imm)                                              \
  ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a),          \
                                                (int)(imm)))
2103
2104 /// Shifts each 128-bit half of the 256-bit integer vector \a a left by
2105 ///    \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
2106 ///    is greater than 15, the returned result is all zeroes.
2107 ///
2108 /// \headerfile <immintrin.h>
2109 ///
2110 /// \code
2111 /// __m256i _mm256_bslli_epi128(__m256i a, const int imm);
2112 /// \endcode
2113 ///
2114 /// This intrinsic corresponds to the \c VPSLLDQ instruction.
2115 ///
2116 /// \param a
2117 ///    A 256-bit integer vector to be shifted.
2118 /// \param imm
2119 ///    An unsigned immediate value specifying the shift count (in bytes).
2120 /// \returns A 256-bit integer vector containing the result.
#define _mm256_bslli_epi128(a, imm)                                            \
  ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a),          \
                                                (int)(imm)))
2123
2124 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2125 ///    left by \a __count bits, shifting in zero bits, and returns the result.
2126 ///    If \a __count is greater than 15, the returned result is all zeroes.
2127 ///
2128 /// \headerfile <immintrin.h>
2129 ///
2130 /// This intrinsic corresponds to the \c VPSLLW instruction.
2131 ///
2132 /// \param __a
2133 ///    A 256-bit vector of [16 x i16] to be shifted.
2134 /// \param __count
2135 ///    An unsigned integer value specifying the shift count (in bits).
2136 /// \returns A 256-bit vector of [16 x i16] containing the result.
2137 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2138 _mm256_slli_epi16(__m256i __a, int __count)
2139 {
2140   return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count);
2141 }
2142
2143 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2144 ///    left by the number of bits specified by the lower 64 bits of \a __count,
2145 ///    shifting in zero bits, and returns the result. If \a __count is greater
2146 ///    than 15, the returned result is all zeroes.
2147 ///
2148 /// \headerfile <immintrin.h>
2149 ///
2150 /// This intrinsic corresponds to the \c VPSLLW instruction.
2151 ///
2152 /// \param __a
2153 ///    A 256-bit vector of [16 x i16] to be shifted.
2154 /// \param __count
2155 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2156 ///    shift count (in bits). The upper element is ignored.
2157 /// \returns A 256-bit vector of [16 x i16] containing the result.
2158 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2159 _mm256_sll_epi16(__m256i __a, __m128i __count)
2160 {
2161   return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count);
2162 }
2163
2164 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2165 ///    left by \a __count bits, shifting in zero bits, and returns the result.
2166 ///    If \a __count is greater than 31, the returned result is all zeroes.
2167 ///
2168 /// \headerfile <immintrin.h>
2169 ///
2170 /// This intrinsic corresponds to the \c VPSLLD instruction.
2171 ///
2172 /// \param __a
2173 ///    A 256-bit vector of [8 x i32] to be shifted.
2174 /// \param __count
2175 ///    An unsigned integer value specifying the shift count (in bits).
2176 /// \returns A 256-bit vector of [8 x i32] containing the result.
2177 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2178 _mm256_slli_epi32(__m256i __a, int __count)
2179 {
2180   return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count);
2181 }
2182
2183 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2184 ///    left by the number of bits given in the lower 64 bits of \a __count,
2185 ///    shifting in zero bits, and returns the result. If \a __count is greater
2186 ///    than 31, the returned result is all zeroes.
2187 ///
2188 /// \headerfile <immintrin.h>
2189 ///
2190 /// This intrinsic corresponds to the \c VPSLLD instruction.
2191 ///
2192 /// \param __a
2193 ///    A 256-bit vector of [8 x i32] to be shifted.
2194 /// \param __count
2195 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2196 ///    shift count (in bits). The upper element is ignored.
2197 /// \returns A 256-bit vector of [8 x i32] containing the result.
2198 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2199 _mm256_sll_epi32(__m256i __a, __m128i __count)
2200 {
2201   return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count);
2202 }
2203
2204 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2205 ///    left by \a __count bits, shifting in zero bits, and returns the result.
2206 ///    If \a __count is greater than 63, the returned result is all zeroes.
2207 ///
2208 /// \headerfile <immintrin.h>
2209 ///
2210 /// This intrinsic corresponds to the \c VPSLLQ instruction.
2211 ///
2212 /// \param __a
2213 ///    A 256-bit vector of [4 x i64] to be shifted.
2214 /// \param __count
2215 ///    An unsigned integer value specifying the shift count (in bits).
2216 /// \returns A 256-bit vector of [4 x i64] containing the result.
2217 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2218 _mm256_slli_epi64(__m256i __a, int __count)
2219 {
2220   return __builtin_ia32_psllqi256((__v4di)__a, __count);
2221 }
2222
2223 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2224 ///    left by the number of bits given in the lower 64 bits of \a __count,
2225 ///    shifting in zero bits, and returns the result. If \a __count is greater
2226 ///    than 63, the returned result is all zeroes.
2227 ///
2228 /// \headerfile <immintrin.h>
2229 ///
2230 /// This intrinsic corresponds to the \c VPSLLQ instruction.
2231 ///
2232 /// \param __a
2233 ///    A 256-bit vector of [4 x i64] to be shifted.
2234 /// \param __count
2235 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2236 ///    shift count (in bits). The upper element is ignored.
2237 /// \returns A 256-bit vector of [4 x i64] containing the result.
2238 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2239 _mm256_sll_epi64(__m256i __a, __m128i __count)
2240 {
2241   return __builtin_ia32_psllq256((__v4di)__a, __count);
2242 }
2243
2244 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2245 ///    right by \a __count bits, shifting in sign bits, and returns the result.
2246 ///    If \a __count is greater than 15, each element of the result is either
2247 ///    0 or -1 according to the corresponding input sign bit.
2248 ///
2249 /// \headerfile <immintrin.h>
2250 ///
2251 /// This intrinsic corresponds to the \c VPSRAW instruction.
2252 ///
2253 /// \param __a
2254 ///    A 256-bit vector of [16 x i16] to be shifted.
2255 /// \param __count
2256 ///    An unsigned integer value specifying the shift count (in bits).
2257 /// \returns A 256-bit vector of [16 x i16] containing the result.
2258 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2259 _mm256_srai_epi16(__m256i __a, int __count)
2260 {
2261   return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count);
2262 }
2263
2264 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2265 ///    right by the number of bits given in the lower 64 bits of \a __count,
2266 ///    shifting in sign bits, and returns the result. If \a __count is greater
2267 ///    than 15, each element of the result is either 0 or -1 according to the
2268 ///    corresponding input sign bit.
2269 ///
2270 /// \headerfile <immintrin.h>
2271 ///
2272 /// This intrinsic corresponds to the \c VPSRAW instruction.
2273 ///
2274 /// \param __a
2275 ///    A 256-bit vector of [16 x i16] to be shifted.
2276 /// \param __count
2277 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2278 ///    shift count (in bits). The upper element is ignored.
2279 /// \returns A 256-bit vector of [16 x i16] containing the result.
2280 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2281 _mm256_sra_epi16(__m256i __a, __m128i __count)
2282 {
2283   return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count);
2284 }
2285
2286 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2287 ///    right by \a __count bits, shifting in sign bits, and returns the result.
2288 ///    If \a __count is greater than 31, each element of the result is either
2289 ///    0 or -1 according to the corresponding input sign bit.
2290 ///
2291 /// \headerfile <immintrin.h>
2292 ///
2293 /// This intrinsic corresponds to the \c VPSRAD instruction.
2294 ///
2295 /// \param __a
2296 ///    A 256-bit vector of [8 x i32] to be shifted.
2297 /// \param __count
2298 ///    An unsigned integer value specifying the shift count (in bits).
2299 /// \returns A 256-bit vector of [8 x i32] containing the result.
2300 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2301 _mm256_srai_epi32(__m256i __a, int __count)
2302 {
2303   return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count);
2304 }
2305
2306 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2307 ///    right by the number of bits given in the lower 64 bits of \a __count,
2308 ///    shifting in sign bits, and returns the result. If \a __count is greater
2309 ///    than 31, each element of the result is either 0 or -1 according to the
2310 ///    corresponding input sign bit.
2311 ///
2312 /// \headerfile <immintrin.h>
2313 ///
2314 /// This intrinsic corresponds to the \c VPSRAD instruction.
2315 ///
2316 /// \param __a
2317 ///    A 256-bit vector of [8 x i32] to be shifted.
2318 /// \param __count
2319 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2320 ///    shift count (in bits). The upper element is ignored.
2321 /// \returns A 256-bit vector of [8 x i32] containing the result.
2322 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2323 _mm256_sra_epi32(__m256i __a, __m128i __count)
2324 {
2325   return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
2326 }
2327
/// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
///    \a imm bytes, shifting in zero bytes, and returns the result. If
///    \a imm is greater than 15, the returned result is all zeroes.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_srli_si256(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSRLDQ instruction.
///
/// \param a
///    A 256-bit integer vector to be shifted.
/// \param imm
///    An unsigned immediate value specifying the shift count (in bytes).
/// \returns A 256-bit integer vector containing the result.
#define _mm256_srli_si256(a, imm) \
  ((__m256i)__builtin_ia32_psrldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
2347
/// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
///    \a imm bytes, shifting in zero bytes, and returns the result. If
///    \a imm is greater than 15, the returned result is all zeroes.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_bsrli_epi128(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSRLDQ instruction.
///
/// \param a
///    A 256-bit integer vector to be shifted.
/// \param imm
///    An unsigned immediate value specifying the shift count (in bytes).
/// \returns A 256-bit integer vector containing the result.
#define _mm256_bsrli_epi128(a, imm) \
  ((__m256i)__builtin_ia32_psrldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
2367
2368 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2369 ///    right by \a __count bits, shifting in zero bits, and returns the result.
2370 ///    If \a __count is greater than 15, the returned result is all zeroes.
2371 ///
2372 /// \headerfile <immintrin.h>
2373 ///
2374 /// This intrinsic corresponds to the \c VPSRLW instruction.
2375 ///
2376 /// \param __a
2377 ///    A 256-bit vector of [16 x i16] to be shifted.
2378 /// \param __count
2379 ///    An integer value specifying the unsigned shift count (in bits).
2380 /// \returns A 256-bit vector of [16 x i16] containing the shifted elements.
2381 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2382 _mm256_srli_epi16(__m256i __a, int __count)
2383 {
2384   return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count);
2385 }
2386
2387 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2388 ///    right by the number of bits given in the lower 64 bits of \a __count,
2389 ///    shifting in zero bits, and returns the result. If that count is greater
2390 ///    than 15, the returned result is all zeroes.
2391 ///
2392 /// \headerfile <immintrin.h>
2393 ///
2394 /// This intrinsic corresponds to the \c VPSRLW instruction.
2395 ///
2396 /// \param __a
2397 ///    A 256-bit vector of [16 x i16] to be shifted.
2398 /// \param __count
2399 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2400 ///    shift count (in bits). The upper element is ignored.
2401 /// \returns A 256-bit vector of [16 x i16] containing the shifted elements.
2402 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2403 _mm256_srl_epi16(__m256i __a, __m128i __count)
2404 {
2405   return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count);
2406 }
2407
2408 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2409 ///    right by \a __count bits, shifting in zero bits, and returns the result.
2410 ///    If \a __count is greater than 31, the returned result is all zeroes.
2411 ///
2412 /// \headerfile <immintrin.h>
2413 ///
2414 /// This intrinsic corresponds to the \c VPSRLD instruction.
2415 ///
2416 /// \param __a
2417 ///    A 256-bit vector of [8 x i32] to be shifted.
2418 /// \param __count
2419 ///    An integer value specifying the unsigned shift count (in bits).
2420 /// \returns A 256-bit vector of [8 x i32] containing the shifted elements.
2421 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2422 _mm256_srli_epi32(__m256i __a, int __count)
2423 {
2424   return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count);
2425 }
2426
2427 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2428 ///    right by the number of bits given in the lower 64 bits of \a __count,
2429 ///    shifting in zero bits, and returns the result. If that count is greater
2430 ///    than 31, the returned result is all zeroes.
2431 ///
2432 /// \headerfile <immintrin.h>
2433 ///
2434 /// This intrinsic corresponds to the \c VPSRLD instruction.
2435 ///
2436 /// \param __a
2437 ///    A 256-bit vector of [8 x i32] to be shifted.
2438 /// \param __count
2439 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2440 ///    shift count (in bits). The upper element is ignored.
2441 /// \returns A 256-bit vector of [8 x i32] containing the shifted elements.
2442 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2443 _mm256_srl_epi32(__m256i __a, __m128i __count)
2444 {
2445   return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count);
2446 }
2447
2448 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2449 ///    right by \a __count bits, shifting in zero bits, and returns the result.
2450 ///    If \a __count is greater than 63, the returned result is all zeroes.
2451 ///
2452 /// \headerfile <immintrin.h>
2453 ///
2454 /// This intrinsic corresponds to the \c VPSRLQ instruction.
2455 ///
2456 /// \param __a
2457 ///    A 256-bit vector of [4 x i64] to be shifted.
2458 /// \param __count
2459 ///    An integer value specifying the unsigned shift count (in bits).
2460 /// \returns A 256-bit vector of [4 x i64] containing the shifted elements.
2461 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2462 _mm256_srli_epi64(__m256i __a, int __count)
2463 {
2464   return (__m256i)__builtin_ia32_psrlqi256((__v4di)__a, __count);
2465 }
2466
2467 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2468 ///    right by the number of bits given in the lower 64 bits of \a __count,
2469 ///    shifting in zero bits, and returns the result. If that count is greater
2470 ///    than 63, the returned result is all zeroes.
2471 ///
2472 /// \headerfile <immintrin.h>
2473 ///
2474 /// This intrinsic corresponds to the \c VPSRLQ instruction.
2475 ///
2476 /// \param __a
2477 ///    A 256-bit vector of [4 x i64] to be shifted.
2478 /// \param __count
2479 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2480 ///    shift count (in bits). The upper element is ignored.
2481 /// \returns A 256-bit vector of [4 x i64] containing the shifted elements.
2482 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2483 _mm256_srl_epi64(__m256i __a, __m128i __count)
2484 {
2485   return (__m256i)__builtin_ia32_psrlq256((__v4di)__a, (__v2di)__count);
2486 }
2487
2488 /// Subtracts each 8-bit integer in the 256-bit integer vector \a __b from the
2489 ///    corresponding byte of \a __a. Returns the lower 8 bits of each
2490 ///    difference in the corresponding byte of the 256-bit integer vector
2491 ///    result (overflow is ignored).
2492 ///
2493 /// \code{.operation}
2494 /// FOR i := 0 TO 31
2495 ///   j := i*8
2496 ///   result[j+7:j] := __a[j+7:j] - __b[j+7:j]
2497 /// ENDFOR
2498 /// \endcode
2499 ///
2500 /// \headerfile <immintrin.h>
2501 ///
2502 /// This intrinsic corresponds to the \c VPSUBB instruction.
2503 ///
2504 /// \param __a
2505 ///    A 256-bit integer vector containing the minuends.
2506 /// \param __b
2507 ///    A 256-bit integer vector containing the subtrahends.
2508 /// \returns A 256-bit integer vector containing the differences.
2509 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2510 _mm256_sub_epi8(__m256i __a, __m256i __b)
2511 {
2512   return (__m256i)((__v32qu)__a - (__v32qu)__b);
2513 }
2514
2515 /// Subtracts each 16-bit element of the 256-bit vector of [16 x i16] in
2516 ///    \a __b from the corresponding element of \a __a. Returns the lower 16
2517 ///    bits of each difference in the corresponding element of the [16 x i16]
2518 ///    result (overflow is ignored).
2519 ///
2520 /// \code{.operation}
2521 /// FOR i := 0 TO 15
2522 ///   j := i*16
2523 ///   result[j+15:j] := __a[j+15:j] - __b[j+15:j]
2524 /// ENDFOR
2525 /// \endcode
2526 ///
2527 /// \headerfile <immintrin.h>
2528 ///
2529 /// This intrinsic corresponds to the \c VPSUBW instruction.
2530 ///
2531 /// \param __a
2532 ///    A 256-bit vector of [16 x i16] containing the minuends.
2533 /// \param __b
2534 ///    A 256-bit vector of [16 x i16] containing the subtrahends.
2535 /// \returns A 256-bit vector of [16 x i16] containing the differences.
2536 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2537 _mm256_sub_epi16(__m256i __a, __m256i __b)
2538 {
2539   return (__m256i)((__v16hu)__a - (__v16hu)__b);
2540 }
2541
2542 /// Subtracts each 32-bit element of the 256-bit [8 x i32] vector \a __b from
2543 ///    the corresponding element of \a __a, returning the lower 32 bits of each
2544 ///    difference in that element of the [8 x i32] result (overflow is ignored).
2545 ///
2546 /// \code{.operation}
2547 /// FOR i := 0 TO 7
2548 ///   j := i*32
2549 ///   result[j+31:j] := __a[j+31:j] - __b[j+31:j]
2550 /// ENDFOR
2551 /// \endcode
2552 ///
2553 /// \headerfile <immintrin.h>
2554 ///
2555 /// This intrinsic corresponds to the \c VPSUBD instruction.
2556 ///
2557 /// \param __a
2558 ///    A 256-bit vector of [8 x i32] containing the minuends.
2559 /// \param __b
2560 ///    A 256-bit vector of [8 x i32] containing the subtrahends.
2561 /// \returns A 256-bit vector of [8 x i32] containing the differences.
2562 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2563 _mm256_sub_epi32(__m256i __a, __m256i __b)
2564 {
2565   return (__m256i)((__v8su)__a - (__v8su)__b);
2566 }
2567
2568 /// Subtracts each 64-bit element of the 256-bit [4 x i64] vector \a __b from
2569 ///    the corresponding element of \a __a, returning the lower 64 bits of each
2570 ///    difference in that element of the [4 x i64] result (overflow is ignored).
2571 ///
2572 /// \code{.operation}
2573 /// FOR i := 0 TO 3
2574 ///   j := i*64
2575 ///   result[j+63:j] := __a[j+63:j] - __b[j+63:j]
2576 /// ENDFOR
2577 /// \endcode
2578 ///
2579 /// \headerfile <immintrin.h>
2580 ///
2581 /// This intrinsic corresponds to the \c VPSUBQ instruction.
2582 ///
2583 /// \param __a
2584 ///    A 256-bit vector of [4 x i64] containing the minuends.
2585 /// \param __b
2586 ///    A 256-bit vector of [4 x i64] containing the subtrahends.
2587 /// \returns A 256-bit vector of [4 x i64] containing the differences.
2588 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2589 _mm256_sub_epi64(__m256i __a, __m256i __b)
2590 {
2591   return (__m256i)((__v4du)__a - (__v4du)__b);
2592 }
2593
2594 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2595 ///    vectors using signed saturation, and returns each difference in the
2596 ///    corresponding byte of the 256-bit integer vector result.
2597 ///
2598 /// \code{.operation}
2599 /// FOR i := 0 TO 31
2600 ///   j := i*8
2601 ///   result[j+7:j] := SATURATE8(__a[j+7:j] - __b[j+7:j])
2602 /// ENDFOR
2603 /// \endcode
2604 ///
2605 /// \headerfile <immintrin.h>
2606 ///
2607 /// This intrinsic corresponds to the \c VPSUBSB instruction.
2608 ///
2609 /// \param __a
2610 ///    A 256-bit integer vector containing the minuends.
2611 /// \param __b
2612 ///    A 256-bit integer vector containing the subtrahends.
2613 /// \returns A 256-bit integer vector containing the differences.
2614 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2615 _mm256_subs_epi8(__m256i __a, __m256i __b)
2616 {
2617   return (__m256i)__builtin_elementwise_sub_sat((__v32qs)__a, (__v32qs)__b);
2618 }
2619
2620 /// Subtracts 16-bit integers from corresponding elements of two 256-bit
2621 ///    vectors of [16 x i16] using signed saturation, and returns each
2622 ///    difference in the corresponding element of the [16 x i16] result.
2623 ///
2624 /// \code{.operation}
2625 /// FOR i := 0 TO 15
2626 ///   j := i*16
2627 ///   result[j+15:j] := SATURATE16(__a[j+15:j] - __b[j+15:j])
2628 /// ENDFOR
2629 /// \endcode
2630 ///
2631 /// \headerfile <immintrin.h>
2632 ///
2633 /// This intrinsic corresponds to the \c VPSUBSW instruction.
2634 ///
2635 /// \param __a
2636 ///    A 256-bit vector of [16 x i16] containing the minuends.
2637 /// \param __b
2638 ///    A 256-bit vector of [16 x i16] containing the subtrahends.
2639 /// \returns A 256-bit vector of [16 x i16] containing the differences.
2640 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2641 _mm256_subs_epi16(__m256i __a, __m256i __b)
2642 {
2643   return (__m256i)__builtin_elementwise_sub_sat((__v16hi)__a, (__v16hi)__b);
2644 }
2645
2646 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2647 ///    vectors using unsigned saturation, and returns each difference in the
2648 ///    corresponding byte of the 256-bit integer vector result. For each byte,
2649 ///    computes <c> result = __a - __b </c>, saturating at zero.
2650 ///
2651 /// \code{.operation}
2652 /// FOR i := 0 TO 31
2653 ///   j := i*8
2654 ///   result[j+7:j] := SATURATE8U(__a[j+7:j] - __b[j+7:j])
2655 /// ENDFOR
2656 /// \endcode
2657 ///
2658 /// \headerfile <immintrin.h>
2659 ///
2660 /// This intrinsic corresponds to the \c VPSUBUSB instruction.
2661 ///
2662 /// \param __a
2663 ///    A 256-bit integer vector containing the minuends.
2664 /// \param __b
2665 ///    A 256-bit integer vector containing the subtrahends.
2666 /// \returns A 256-bit integer vector containing the differences.
2667 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2668 _mm256_subs_epu8(__m256i __a, __m256i __b)
2669 {
2670   return (__m256i)__builtin_elementwise_sub_sat((__v32qu)__a, (__v32qu)__b);
2671 }
2672
2673 /// Subtracts each 16-bit element of the 256-bit vector of [16 x i16] in
2674 ///    \a __b from the corresponding element of \a __a using unsigned
2675 ///    saturation, and returns the differences in a vector of [16 x i16].
2676 ///
2677 /// \code{.operation}
2678 /// FOR i := 0 TO 15
2679 ///   j := i*16
2680 ///   result[j+15:j] := SATURATE16U(__a[j+15:j] - __b[j+15:j])
2681 /// ENDFOR
2682 /// \endcode
2683 ///
2684 /// \headerfile <immintrin.h>
2685 ///
2686 /// This intrinsic corresponds to the \c VPSUBUSW instruction.
2687 ///
2688 /// \param __a
2689 ///    A 256-bit vector of [16 x i16] containing the minuends.
2690 /// \param __b
2691 ///    A 256-bit vector of [16 x i16] containing the subtrahends.
2692 /// \returns A 256-bit vector of [16 x i16] containing the differences.
2693 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2694 _mm256_subs_epu16(__m256i __a, __m256i __b)
2695 {
2696   return (__m256i)__builtin_elementwise_sub_sat((__v16hu)__a, (__v16hu)__b);
2697 }
2698
2699 /// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2700 ///    vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2701 ///    uses the upper 64 bits of each 128-bit half of \a __a and \a __b as
2702 ///    input; other bits in these parameters are ignored.
2703 ///
2704 /// \code{.operation}
2705 /// result[7:0] := __a[71:64]
2706 /// result[15:8] := __b[71:64]
2707 /// result[23:16] := __a[79:72]
2708 /// result[31:24] := __b[79:72]
2709 /// . . .
2710 /// result[127:120] := __b[127:120]
2711 /// result[135:128] := __a[199:192]
2712 /// . . .
2713 /// result[255:248] := __b[255:248]
2714 /// \endcode
2715 ///
2716 /// \headerfile <immintrin.h>
2717 ///
2718 /// This intrinsic corresponds to the \c VPUNPCKHBW instruction.
2719 ///
2720 /// \param __a
2721 ///    A 256-bit integer vector used as the source for the even-numbered bytes
2722 ///    of the result.
2723 /// \param __b
2724 ///    A 256-bit integer vector used as the source for the odd-numbered bytes
2725 ///    of the result.
2726 /// \returns A 256-bit integer vector containing the result.
2727 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2728 _mm256_unpackhi_epi8(__m256i __a, __m256i __b)
2729 { // Shuffle indices 0-31 pick bytes of __a; indices 32-63 pick bytes of __b.
2730   return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
2731 }
2732
2733 /// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2734 ///    of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2735 ///    vector of [16 x i16]. Specifically, uses the upper 64 bits of each
2736 ///    128-bit half of \a __a and \a __b as input; other bits in these
2737 ///    parameters are ignored.
2738 ///
2739 /// \code{.operation}
2740 /// result[15:0] := __a[79:64]
2741 /// result[31:16] := __b[79:64]
2742 /// result[47:32] := __a[95:80]
2743 /// result[63:48] := __b[95:80]
2744 /// . . .
2745 /// result[127:112] := __b[127:112]
2746 /// result[143:128] := __a[207:192]
2747 /// . . .
2748 /// result[255:240] := __b[255:240]
2749 /// \endcode
2750 ///
2751 /// \headerfile <immintrin.h>
2752 ///
2753 /// This intrinsic corresponds to the \c VPUNPCKHWD instruction.
2754 ///
2755 /// \param __a
2756 ///    A 256-bit vector of [16 x i16] used as the source for the even-numbered
2757 ///    elements of the result.
2758 /// \param __b
2759 ///    A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2760 ///    elements of the result.
2761 /// \returns A 256-bit vector of [16 x i16] containing the result.
2762 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2763 _mm256_unpackhi_epi16(__m256i __a, __m256i __b)
2764 { // Shuffle indices 0-15 pick elements of __a; 16-31 pick elements of __b.
2765   return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
2766 }
2767
2768 /// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2769 ///    of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2770 ///    of [8 x i32]. Specifically, uses the upper 64 bits of each 128-bit half
2771 ///    of \a __a and \a __b as input; other bits in these parameters are
2772 ///    ignored.
2773 ///
2774 /// \code{.operation}
2775 /// result[31:0] := __a[95:64]
2776 /// result[63:32] := __b[95:64]
2777 /// result[95:64] := __a[127:96]
2778 /// result[127:96] := __b[127:96]
2779 /// result[159:128] := __a[223:192]
2780 /// result[191:160] := __b[223:192]
2781 /// result[223:192] := __a[255:224]
2782 /// result[255:224] := __b[255:224]
2783 /// \endcode
2784 ///
2785 /// \headerfile <immintrin.h>
2786 ///
2787 /// This intrinsic corresponds to the \c VPUNPCKHDQ instruction.
2788 ///
2789 /// \param __a
2790 ///    A 256-bit vector of [8 x i32] used as the source for the even-numbered
2791 ///    elements of the result.
2792 /// \param __b
2793 ///    A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2794 ///    elements of the result.
2795 /// \returns A 256-bit vector of [8 x i32] containing the result.
2796 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2797 _mm256_unpackhi_epi32(__m256i __a, __m256i __b)
2798 { // Shuffle indices 0-7 pick elements of __a; 8-15 pick elements of __b.
2799   return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);
2800 }
2801
2802 /// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2803 ///    of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2804 ///    of [4 x i64]. Specifically, uses the upper 64 bits of each 128-bit half
2805 ///    of \a __a and \a __b as input; other bits in these parameters are
2806 ///    ignored.
2807 ///
2808 /// \code{.operation}
2809 /// result[63:0] := __a[127:64]
2810 /// result[127:64] := __b[127:64]
2811 /// result[191:128] := __a[255:192]
2812 /// result[255:192] := __b[255:192]
2813 /// \endcode
2814 ///
2815 /// \headerfile <immintrin.h>
2816 ///
2817 /// This intrinsic corresponds to the \c VPUNPCKHQDQ instruction.
2818 ///
2819 /// \param __a
2820 ///    A 256-bit vector of [4 x i64] used as the source for the even-numbered
2821 ///    elements of the result.
2822 /// \param __b
2823 ///    A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2824 ///    elements of the result.
2825 /// \returns A 256-bit vector of [4 x i64] containing the result.
2826 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2827 _mm256_unpackhi_epi64(__m256i __a, __m256i __b)
2828 { // Shuffle indices 0-3 pick elements of __a; 4-7 pick elements of __b.
2829   return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3);
2830 }
2831
2832 /// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2833 ///    vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2834 ///    uses the lower 64 bits of each 128-bit half of \a __a and \a __b as
2835 ///    input; other bits in these parameters are ignored.
2836 ///
2837 /// \code{.operation}
2838 /// result[7:0] := __a[7:0]
2839 /// result[15:8] := __b[7:0]
2840 /// result[23:16] := __a[15:8]
2841 /// result[31:24] := __b[15:8]
2842 /// . . .
2843 /// result[127:120] := __b[63:56]
2844 /// result[135:128] := __a[135:128]
2845 /// . . .
2846 /// result[255:248] := __b[191:184]
2847 /// \endcode
2848 ///
2849 /// \headerfile <immintrin.h>
2850 ///
2851 /// This intrinsic corresponds to the \c VPUNPCKLBW instruction.
2852 ///
2853 /// \param __a
2854 ///    A 256-bit integer vector used as the source for the even-numbered bytes
2855 ///    of the result.
2856 /// \param __b
2857 ///    A 256-bit integer vector used as the source for the odd-numbered bytes
2858 ///    of the result.
2859 /// \returns A 256-bit integer vector containing the result.
2860 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2861 _mm256_unpacklo_epi8(__m256i __a, __m256i __b)
2862 { // Shuffle indices 0-31 pick bytes of __a; indices 32-63 pick bytes of __b.
2863   return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
2864 }
2865
2866 /// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2867 ///    of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2868 ///    vector of [16 x i16]. Specifically, uses the lower 64 bits of each
2869 ///    128-bit half of \a __a and \a __b as input; other bits in these
2870 ///    parameters are ignored.
2871 ///
2872 /// \code{.operation}
2873 /// result[15:0] := __a[15:0]
2874 /// result[31:16] := __b[15:0]
2875 /// result[47:32] := __a[31:16]
2876 /// result[63:48] := __b[31:16]
2877 /// . . .
2878 /// result[127:112] := __b[63:48]
2879 /// result[143:128] := __a[143:128]
2880 /// . . .
2881 /// result[255:240] := __b[191:176]
2882 /// \endcode
2883 ///
2884 /// \headerfile <immintrin.h>
2885 ///
2886 /// This intrinsic corresponds to the \c VPUNPCKLWD instruction.
2887 ///
2888 /// \param __a
2889 ///    A 256-bit vector of [16 x i16] used as the source for the even-numbered
2890 ///    elements of the result.
2891 /// \param __b
2892 ///    A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2893 ///    elements of the result.
2894 /// \returns A 256-bit vector of [16 x i16] containing the result.
2895 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2896 _mm256_unpacklo_epi16(__m256i __a, __m256i __b)
2897 { // Shuffle indices 0-15 pick elements of __a; 16-31 pick elements of __b.
2898   return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
2899 }
2900
2901 /// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2902 ///    of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2903 ///    of [8 x i32]. Specifically, uses the lower 64 bits of each 128-bit half
2904 ///    of \a __a and \a __b as input; other bits in these parameters are
2905 ///    ignored.
2906 ///
2907 /// \code{.operation}
2908 /// result[31:0] := __a[31:0]
2909 /// result[63:32] := __b[31:0]
2910 /// result[95:64] := __a[63:32]
2911 /// result[127:96] := __b[63:32]
2912 /// result[159:128] := __a[159:128]
2913 /// result[191:160] := __b[159:128]
2914 /// result[223:192] := __a[191:160]
2915 /// result[255:224] := __b[191:160]
2916 /// \endcode
2917 ///
2918 /// \headerfile <immintrin.h>
2919 ///
2920 /// This intrinsic corresponds to the \c VPUNPCKLDQ instruction.
2921 ///
2922 /// \param __a
2923 ///    A 256-bit vector of [8 x i32] used as the source for the even-numbered
2924 ///    elements of the result.
2925 /// \param __b
2926 ///    A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2927 ///    elements of the result.
2928 /// \returns A 256-bit vector of [8 x i32] containing the result.
2929 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2930 _mm256_unpacklo_epi32(__m256i __a, __m256i __b)
2931 { // Shuffle indices 0-7 pick elements of __a; 8-15 pick elements of __b.
2932   return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
2933 }
2934
2935 /// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2936 ///    of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2937 ///    of [4 x i64]. Specifically, uses the lower 64 bits of each 128-bit half
2938 ///    of \a __a and \a __b as input; other bits in these parameters are
2939 ///    ignored.
2940 ///
2941 /// \code{.operation}
2942 /// result[63:0] := __a[63:0]
2943 /// result[127:64] := __b[63:0]
2944 /// result[191:128] := __a[191:128]
2945 /// result[255:192] := __b[191:128]
2946 /// \endcode
2947 ///
2948 /// \headerfile <immintrin.h>
2949 ///
2950 /// This intrinsic corresponds to the \c VPUNPCKLQDQ instruction.
2951 ///
2952 /// \param __a
2953 ///    A 256-bit vector of [4 x i64] used as the source for the even-numbered
2954 ///    elements of the result.
2955 /// \param __b
2956 ///    A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2957 ///    elements of the result.
2958 /// \returns A 256-bit vector of [4 x i64] containing the result.
2959 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2960 _mm256_unpacklo_epi64(__m256i __a, __m256i __b)
2961 { // Shuffle indices 0-3 pick elements of __a; 4-7 pick elements of __b.
2962   return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2);
2963 }
2964
2965 /// Computes the bitwise XOR of the 256-bit integer vectors in \a __a and
2966 ///    \a __b.
2967 ///
2968 /// \headerfile <immintrin.h>
2969 ///
2970 /// This intrinsic corresponds to the \c VPXOR instruction.
2971 ///
2972 /// \param __a
2973 ///    A 256-bit integer vector.
2974 /// \param __b
2975 ///    A 256-bit integer vector.
2976 /// \returns A 256-bit integer vector containing the bitwise XOR of the inputs.
2977 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2978 _mm256_xor_si256(__m256i __a, __m256i __b)
2979 {
2980   return (__m256i)((__v4du)__a ^ (__v4du)__b);
2981 }
2982
2983 /// Loads the 256-bit integer vector from memory \a __V using a non-temporal
2984 ///   memory hint and returns the vector. \a __V must be aligned on a 32-byte
2985 ///   boundary.
2986 ///
2987 /// \headerfile <immintrin.h>
2988 ///
2989 /// This intrinsic corresponds to the \c VMOVNTDQA instruction.
2990 ///
2991 /// \param __V
2992 ///    A pointer to the 32-byte aligned memory containing the vector to load.
2993 /// \returns A 256-bit integer vector loaded from memory.
2994 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2995 _mm256_stream_load_si256(const void *__V)
2996 { // Load via a 32-byte-aligned vector type, matching the stated requirement.
2997   typedef __v4di __v4di_aligned __attribute__((aligned(32)));
2998   return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);
2999 }
3000
3001 /// Broadcasts the 32-bit floating-point value from the low element of the
3002 ///    128-bit vector of [4 x float] in \a __X to all elements of the result's
3003 ///    128-bit vector of [4 x float].
3004 ///
3005 /// \headerfile <immintrin.h>
3006 ///
3007 /// This intrinsic corresponds to the \c VBROADCASTSS instruction.
3008 ///
3009 /// \param __X
3010 ///    A 128-bit vector of [4 x float] whose low element will be broadcast.
3011 /// \returns A 128-bit vector of [4 x float] containing the result.
3012 static __inline__ __m128 __DEFAULT_FN_ATTRS128
3013 _mm_broadcastss_ps(__m128 __X)
3014 { // Every shuffle index is 0, so the low element of __X fills each lane.
3015   return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0);
3016 }
3017
3018 /// Broadcasts the 64-bit floating-point value from the low element of the
3019 ///    128-bit vector of [2 x double] in \a __a to both elements of the
3020 ///    result's 128-bit vector of [2 x double].
3021 ///
3022 /// \headerfile <immintrin.h>
3023 ///
3024 /// This intrinsic corresponds to the \c MOVDDUP instruction.
3025 ///
3026 /// \param __a
3027 ///    A 128-bit vector of [2 x double] whose low element will be broadcast.
3028 /// \returns A 128-bit vector of [2 x double] holding the low element twice.
3029 static __inline__ __m128d __DEFAULT_FN_ATTRS128
3030 _mm_broadcastsd_pd(__m128d __a)
3031 { // Both shuffle indices are 0, so the low element of __a fills each lane.
3032   return (__m128d)__builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
3033 }
3034
3035 /// Broadcasts the 32-bit floating-point value from the low element of the
3036 ///    128-bit vector of [4 x float] in \a __X to all elements of the
3037 ///    result's 256-bit vector of [8 x float].
3038 ///
3039 /// \headerfile <immintrin.h>
3040 ///
3041 /// This intrinsic corresponds to the \c VBROADCASTSS instruction.
3042 ///
3043 /// \param __X
3044 ///    A 128-bit vector of [4 x float] whose low element will be broadcast.
3045 /// \returns A 256-bit vector of [8 x float] containing the result.
3046 static __inline__ __m256 __DEFAULT_FN_ATTRS256
3047 _mm256_broadcastss_ps(__m128 __X)
3048 { // Eight shuffle indices, all 0: the low element of __X fills each lane.
3049   return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3050 }
3051
3052 /// Broadcasts the 64-bit floating-point value from the low element of the
3053 ///    128-bit vector of [2 x double] in \a __X to all elements of the
3054 ///    result's 256-bit vector of [4 x double].
3055 ///
3056 /// \headerfile <immintrin.h>
3057 ///
3058 /// This intrinsic corresponds to the \c VBROADCASTSD instruction.
3059 ///
3060 /// \param __X
3061 ///    A 128-bit vector of [2 x double] whose low element will be broadcast.
3062 /// \returns A 256-bit vector of [4 x double] containing the result.
3063 static __inline__ __m256d __DEFAULT_FN_ATTRS256
3064 _mm256_broadcastsd_pd(__m128d __X)
3065 { // Four shuffle indices, all 0: the low element of __X fills each lane.
3066   return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0);
3067 }
3068
3069 /// Broadcasts the 128-bit integer data from \a __X to both the lower and
3070 ///    upper halves of the 256-bit result.
3071 ///
3072 /// \headerfile <immintrin.h>
3073 ///
3074 /// This intrinsic corresponds to the \c VBROADCASTI128 instruction.
3075 ///
3076 /// \param __X
3077 ///    A 128-bit integer vector to be broadcast.
3078 /// \returns A 256-bit integer vector containing \a __X in each 128-bit half.
3079 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3080 _mm256_broadcastsi128_si256(__m128i __X)
3081 { // Indices 0, 1 repeat: both 64-bit elements of __X fill each 128-bit half.
3082   return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1);
3083 }
3084 /* Alias: expands to _mm256_broadcastsi128_si256(X). */
3085 #define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X)
3086
3087 /// Merges 32-bit integer elements from either of the two 128-bit vectors of
3088 ///    [4 x i32] in \a V1 or \a V2 to the result's 128-bit vector of [4 x i32],
3089 ///    as specified by the immediate integer operand \a M.
3090 ///
3091 /// \code{.operation}
3092 /// FOR i := 0 TO 3
3093 ///   j := i*32
3094 ///   IF M[i] == 0
3095 ///     result[31+j:j] := V1[31+j:j]
3096 ///   ELSE
3097 ///     result[31+j:j] := V2[31+j:j]
3098 ///   FI
3099 /// ENDFOR
3100 /// \endcode
3101 ///
3102 /// \headerfile <immintrin.h>
3103 ///
3104 /// \code
3105 /// __m128i _mm_blend_epi32(__m128i V1, __m128i V2, const int M);
3106 /// \endcode
3107 ///
3108 /// This intrinsic corresponds to the \c VPBLENDD instruction.
3109 ///
3110 /// \param V1
3111 ///    A 128-bit vector of [4 x i32] containing source values.
3112 /// \param V2
3113 ///    A 128-bit vector of [4 x i32] containing source values.
3114 /// \param M
3115 ///    An immediate 8-bit integer operand, with bits [3:0] specifying the
3116 ///    source for each element of the result. The position of the mask bit
3117 ///    corresponds to the index of a copied value. When a mask bit is 0, the
3118 ///    element is copied from \a V1; otherwise, it is copied from \a V2.
3119 /// \returns A 128-bit vector of [4 x i32] containing the result.
3120 #define _mm_blend_epi32(V1, V2, M) \
3121   ((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
3122                                       (__v4si)(__m128i)(V2), (int)(M)))
3123
3124 /// Merges 32-bit integer elements from either of the two 256-bit vectors of
3125 ///    [8 x i32] in \a V1 or \a V2 to return a 256-bit vector of [8 x i32],
3126 ///    as specified by the immediate integer operand \a M.
3127 ///
3128 /// \code{.operation}
3129 /// FOR i := 0 TO 7
3130 ///   j := i*32
3131 ///   IF M[i] == 0
3132 ///     result[31+j:j] := V1[31+j:j]
3133 ///   ELSE
3134 ///     result[31+j:j] := V2[31+j:j]
3135 ///   FI
3136 /// ENDFOR
3137 /// \endcode
3138 ///
3139 /// \headerfile <immintrin.h>
3140 ///
3141 /// \code
3142 /// __m256i _mm256_blend_epi32(__m256i V1, __m256i V2, const int M);
3143 /// \endcode
3144 ///
3145 /// This intrinsic corresponds to the \c VPBLENDD instruction.
3146 ///
3147 /// \param V1
3148 ///    A 256-bit vector of [8 x i32] containing source values.
3149 /// \param V2
3150 ///    A 256-bit vector of [8 x i32] containing source values.
3151 /// \param M
3152 ///    An immediate 8-bit integer operand, with bits [7:0] specifying the
3153 ///    source for each element of the result. The position of the mask bit
3154 ///    corresponds to the index of a copied value. When a mask bit is 0, the
3155 ///    element is copied from \a V1; otherwise, it is copied from \a V2.
3156 /// \returns A 256-bit vector of [8 x i32] containing the result.
3157 #define _mm256_blend_epi32(V1, V2, M) \
3158   ((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
3159                                       (__v8si)(__m256i)(V2), (int)(M)))
3160
3161 /// Replicates byte 0 of the 128-bit integer vector \a __X into every one of
3162 ///    the 32 byte positions of the 256-bit result.
3163 ///
3164 /// \headerfile <immintrin.h>
3165 ///
3166 /// This intrinsic corresponds to the \c VPBROADCASTB instruction.
3167 ///
3168 /// \param __X
3169 ///    A 128-bit integer vector; only its lowest byte is used.
3170 /// \returns A 256-bit integer vector with that byte in all 32 positions.
3171 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3172 _mm256_broadcastb_epi8(__m128i __X) {
3173   __v16qi __src = (__v16qi)__X;
3174   return (__m256i)__builtin_shufflevector(__src, __src, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3175 }
3176
3177 /// Replicates element 0 of the 128-bit vector of [8 x i16] in \a __X into
3178 ///    every element of the 256-bit vector of [16 x i16] that is returned.
3179 ///
3180 /// \headerfile <immintrin.h>
3181 ///
3182 /// This intrinsic corresponds to the \c VPBROADCASTW instruction.
3183 ///
3184 /// \param __X
3185 ///    A 128-bit vector of [8 x i16]; only its lowest element is used.
3186 /// \returns A 256-bit vector of [16 x i16] holding 16 copies of that element.
3187 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3188 _mm256_broadcastw_epi16(__m128i __X) {
3189   __v8hi __src = (__v8hi)__X;
3190   return (__m256i)__builtin_shufflevector(__src, __src, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3191 }
3192
3193 /// Replicates element 0 of the 128-bit vector of [4 x i32] in \a __X into
3194 ///    every element of the 256-bit vector of [8 x i32] that is returned.
3195 ///
3196 /// \headerfile <immintrin.h>
3197 ///
3198 /// This intrinsic corresponds to the \c VPBROADCASTD instruction.
3199 ///
3200 /// \param __X
3201 ///    A 128-bit vector of [4 x i32]; only its lowest element is used.
3202 /// \returns A 256-bit vector of [8 x i32] holding 8 copies of that element.
3203 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3204 _mm256_broadcastd_epi32(__m128i __X) {
3205   __v4si __src = (__v4si)__X;
3206   return (__m256i)__builtin_shufflevector(__src, __src, 0, 0, 0, 0, 0, 0, 0, 0);
3207 }
3208
3209 /// Replicates element 0 of the 128-bit vector of [2 x i64] in \a __X into
3210 ///    every element of the 256-bit vector of [4 x i64] that is returned.
3211 ///
3212 /// \headerfile <immintrin.h>
3213 ///
3214 /// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
3215 ///
3216 /// \param __X
3217 ///    A 128-bit vector of [2 x i64]; only its lowest element is used.
3218 /// \returns A 256-bit vector of [4 x i64] holding 4 copies of that element.
3219 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3220 _mm256_broadcastq_epi64(__m128i __X) {
3221   __v2di __src = (__v2di)__X;
3222   return (__m256i)__builtin_shufflevector(__src, __src, 0, 0, 0, 0);
3223 }
3224
3225 /// Replicates byte 0 of the 128-bit integer vector \a __X into every one of
3226 ///    the 16 byte positions of the 128-bit result.
3227 ///
3228 /// \headerfile <immintrin.h>
3229 ///
3230 /// This intrinsic corresponds to the \c VPBROADCASTB instruction.
3231 ///
3232 /// \param __X
3233 ///    A 128-bit integer vector; only its lowest byte is used.
3234 /// \returns A 128-bit integer vector with that byte in all 16 positions.
3235 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3236 _mm_broadcastb_epi8(__m128i __X) {
3237   __v16qi __src = (__v16qi)__X;
3238   return (__m128i)__builtin_shufflevector(__src, __src, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3239 }
3240
3241 /// Replicates element 0 of the 128-bit vector of [8 x i16] in \a __X into
3242 ///    every element of the 128-bit vector of [8 x i16] that is returned.
3243 ///
3244 /// \headerfile <immintrin.h>
3245 ///
3246 /// This intrinsic corresponds to the \c VPBROADCASTW instruction.
3247 ///
3248 /// \param __X
3249 ///    A 128-bit vector of [8 x i16]; only its lowest element is used.
3250 /// \returns A 128-bit vector of [8 x i16] holding 8 copies of that element.
3251 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3252 _mm_broadcastw_epi16(__m128i __X) {
3253   __v8hi __src = (__v8hi)__X;
3254   return (__m128i)__builtin_shufflevector(__src, __src, 0, 0, 0, 0, 0, 0, 0, 0);
3255 }
3256
3257 /// Replicates element 0 of the 128-bit vector of [4 x i32] in \a __X into
3258 ///    every element of the 128-bit vector of [4 x i32] that is returned.
3259 ///
3260 /// \headerfile <immintrin.h>
3261 ///
3262 /// This intrinsic corresponds to the \c VPBROADCASTD instruction.
3263 ///
3264 /// \param __X
3265 ///    A 128-bit vector of [4 x i32]; only its lowest element is used.
3266 /// \returns A 128-bit vector of [4 x i32] holding 4 copies of that element.
3267 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3268 _mm_broadcastd_epi32(__m128i __X) {
3269   __v4si __src = (__v4si)__X;
3270   return (__m128i)__builtin_shufflevector(__src, __src, 0, 0, 0, 0);
3271 }
3272
3273 /// Replicates element 0 of the 128-bit vector of [2 x i64] in \a __X into
3274 ///    both elements of the 128-bit vector of [2 x i64] that is returned.
3275 ///
3276 /// \headerfile <immintrin.h>
3277 ///
3278 /// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
3279 ///
3280 /// \param __X
3281 ///    A 128-bit vector of [2 x i64]; only its lowest element is used.
3282 /// \returns A 128-bit vector of [2 x i64] holding 2 copies of that element.
3283 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3284 _mm_broadcastq_epi64(__m128i __X) {
3285   __v2di __src = (__v2di)__X;
3286   return (__m128i)__builtin_shufflevector(__src, __src, 0, 0);
3287 }
3288
3289 /// Builds a 256-bit vector of [8 x i32] by picking, for each result element,
3290 ///    the element of the 256-bit vector of [8 x i32] in \a __a whose index is
3291 ///    held in the matching element of the 256-bit vector in \a __b.
3292 ///
3293 /// \code{.operation}
3294 /// FOR i := 0 TO 7
3295 ///   j := i*32
3296 ///   k := __b[j+2:j] * 32
3297 ///   result[j+31:j] := __a[k+31:k]
3298 /// ENDFOR
3299 /// \endcode
3300 ///
3301 /// \headerfile <immintrin.h>
3302 ///
3303 /// This intrinsic corresponds to the \c VPERMD instruction.
3304 ///
3305 /// \param __a
3306 ///    A 256-bit vector of [8 x i32] supplying the values to be selected.
3307 /// \param __b
3308 ///    A 256-bit vector of [8 x i32]; the low three bits of each element
3309 ///    select an element of \a __a.
3310 /// \returns A 256-bit vector of [8 x i32] containing the selected elements.
3311 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3312 _mm256_permutevar8x32_epi32(__m256i __a, __m256i __b) {
3313   __v8si __src = (__v8si)__a, __idx = (__v8si)__b;
3314   return (__m256i)__builtin_ia32_permvarsi256(__src, __idx);
3315 }
3316
3317 /// Sets the result's 256-bit vector of [4 x double] to copies of elements of
3318 ///    the 256-bit vector of [4 x double] in \a V as specified by the
3319 ///    immediate value \a M.
3320 ///
3321 /// \code{.operation}
3322 /// FOR i := 0 TO 3
3323 ///   j := i*64
3324 ///   k := (M >> i*2)[1:0] * 64
3325 ///   result[j+63:j] := V[k+63:k]
3326 /// ENDFOR
3327 /// \endcode
3328 ///
3329 /// \headerfile <immintrin.h>
3330 ///
3331 /// \code
3332 /// __m256d _mm256_permute4x64_pd(__m256d V, const int M);
3333 /// \endcode
3334 ///
3335 /// This intrinsic corresponds to the \c VPERMPD instruction.
3336 ///
3337 /// \param V
3338 ///    A 256-bit vector of [4 x double] containing the source values.
3339 /// \param M
3340 ///    An immediate 8-bit value specifying which elements to copy from \a V.
3341 ///    \a M[1:0] specifies the index in \a V for element 0 of the result,
3342 ///    \a M[3:2] specifies the index for element 1, and so forth.
3343 /// \returns A 256-bit vector of [4 x double] containing the result.
3344 #define _mm256_permute4x64_pd(V, M) \
3345   ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M)))
3346
3347 /// Builds a 256-bit vector of [8 x float] by picking, for each result
3348 ///    element, the element of the 256-bit vector of [8 x float] in \a __a
3349 ///    whose index is held in the matching element of the vector in \a __b.
3350 ///
3351 /// \code{.operation}
3352 /// FOR i := 0 TO 7
3353 ///   j := i*32
3354 ///   k := __b[j+2:j] * 32
3355 ///   result[j+31:j] := __a[k+31:k]
3356 /// ENDFOR
3357 /// \endcode
3358 ///
3359 /// \headerfile <immintrin.h>
3360 ///
3361 /// This intrinsic corresponds to the \c VPERMPS instruction.
3362 ///
3363 /// \param __a
3364 ///    A 256-bit vector of [8 x float] supplying the values to be selected.
3365 /// \param __b
3366 ///    A 256-bit vector of [8 x i32]; the low three bits of each element
3367 ///    select an element of \a __a.
3368 /// \returns A 256-bit vector of [8 x float] containing the selected elements.
3369 static __inline__ __m256 __DEFAULT_FN_ATTRS256
3370 _mm256_permutevar8x32_ps(__m256 __a, __m256i __b) {
3371   __v8sf __src = (__v8sf)__a;
3372   return (__m256)__builtin_ia32_permvarsf256(__src, (__v8si)__b);
3373 }
3374
3375 /// Sets the result's 256-bit vector of [4 x i64] to copies of elements
3376 ///    of the 256-bit vector of [4 x i64] in \a V as specified by the
3377 ///    immediate value \a M.
3378 ///
3379 /// \code{.operation}
3380 /// FOR i := 0 TO 3
3381 ///   j := i*64
3382 ///   k := (M >> i*2)[1:0] * 64
3383 ///   result[j+63:j] := V[k+63:k]
3384 /// ENDFOR
3385 /// \endcode
3386 ///
3387 /// \headerfile <immintrin.h>
3388 ///
3389 /// \code
3390 /// __m256i _mm256_permute4x64_epi64(__m256i V, const int M);
3391 /// \endcode
3392 ///
3393 /// This intrinsic corresponds to the \c VPERMQ instruction.
3394 ///
3395 /// \param V
3396 ///    A 256-bit vector of [4 x i64] containing the source values.
3397 /// \param M
3398 ///    An immediate 8-bit value specifying which elements to copy from \a V.
3399 ///    \a M[1:0] specifies the index in \a V for element 0 of the result,
3400 ///    \a M[3:2] specifies the index for element 1, and so forth.
3401 /// \returns A 256-bit vector of [4 x i64] containing the result.
3402 #define _mm256_permute4x64_epi64(V, M) \
3403   ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M)))
3404
3405 /// Sets each half of the 256-bit result either to zero or to one of the
3406 ///    four possible 128-bit halves of the 256-bit vectors \a V1 and \a V2,
3407 ///    as specified by the immediate value \a M.
3408 ///
3409 /// \code{.operation}
3410 /// FOR i := 0 TO 1
3411 ///   j := i*128
3412 ///   k := M >> (i*4)
3413 ///   IF k[3] == 0
3414 ///     CASE (k[1:0]) OF
3415 ///     0: result[127+j:j] := V1[127:0]
3416 ///     1: result[127+j:j] := V1[255:128]
3417 ///     2: result[127+j:j] := V2[127:0]
3418 ///     3: result[127+j:j] := V2[255:128]
3419 ///     ESAC
3420 ///   ELSE
3421 ///     result[127+j:j] := 0
3422 ///   FI
3423 /// ENDFOR
3424 /// \endcode
3425 ///
3426 /// \headerfile <immintrin.h>
3427 ///
3428 /// \code
3429 /// __m256i _mm256_permute2x128_si256(__m256i V1, __m256i V2, const int M);
3430 /// \endcode
3431 ///
3432 /// This intrinsic corresponds to the \c VPERM2I128 instruction.
3433 ///
3434 /// \param V1
3435 ///    A 256-bit integer vector containing source values.
3436 /// \param V2
3437 ///    A 256-bit integer vector containing source values.
3438 /// \param M
3439 ///    An immediate value specifying how to form the result. Bits [3:0]
3440 ///    control the lower half of the result, bits [7:4] control the upper half.
3441 ///    Within each 4-bit control value, if bit 3 is 1, that half is zeroed;
3442 ///    otherwise bits [1:0] determine the source as follows. \n
3443 ///    0: the lower half of \a V1 \n
3444 ///    1: the upper half of \a V1 \n
3445 ///    2: the lower half of \a V2 \n
3446 ///    3: the upper half of \a V2
3447 /// \returns A 256-bit integer vector containing the result.
3448 #define _mm256_permute2x128_si256(V1, V2, M) \
3449   ((__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M)))
3450
3451 /// Extracts half of the 256-bit vector \a V to the 128-bit result. If bit 0
3452 ///     of the immediate \a M is zero, extracts the lower half of \a V;
3453 ///     otherwise, extracts the upper half.
3454 ///
3455 /// \headerfile <immintrin.h>
3456 ///
3457 /// \code
3458 /// __m128i _mm256_extracti128_si256(__m256i V, const int M);
3459 /// \endcode
3460 ///
3461 /// This intrinsic corresponds to the \c VEXTRACTI128 instruction.
3462 ///
3463 /// \param V
3464 ///    A 256-bit integer vector containing the source values.
3465 /// \param M
3466 ///    An immediate value whose bit 0 specifies which half of \a V to extract.
3467 /// \returns A 128-bit integer vector containing the result.
3468 #define _mm256_extracti128_si256(V, M) \
3469   ((__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M)))
3470
3471 /// Copies the 256-bit vector \a V1 to the result, then overwrites half of the
3472 ///     result with the 128-bit vector \a V2. If bit 0 of the immediate \a M
3473 ///     is zero, overwrites the lower half of the result; otherwise,
3474 ///     overwrites the upper half.
3475 ///
3476 /// \headerfile <immintrin.h>
3477 ///
3478 /// \code
3479 /// __m256i _mm256_inserti128_si256(__m256i V1, __m128i V2, const int M);
3480 /// \endcode
3481 ///
3482 /// This intrinsic corresponds to the \c VINSERTI128 instruction.
3483 ///
3484 /// \param V1
3485 ///    A 256-bit integer vector containing a source value.
3486 /// \param V2
3487 ///    A 128-bit integer vector containing a source value.
3488 /// \param M
3489 ///    An immediate value whose bit 0 selects the half that receives \a V2.
3490 /// \returns A 256-bit integer vector containing the result.
3491 #define _mm256_inserti128_si256(V1, V2, M) \
3492   ((__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \
3493                                          (__v2di)(__m128i)(V2), (int)(M)))
3494
3495 /// Performs a masked load of eight 32-bit integers from memory at \a __X.
3496 ///    An element is loaded when the most significant bit of the matching
3497 ///    element of the mask \a __M is set; every other element of the 256-bit
3498 ///    [8 x i32] result is set to zero.
3499 ///
3500 /// \code{.operation}
3501 /// FOR i := 0 TO 7
3502 ///   j := i*32
3503 ///   IF __M[j+31] == 1
3504 ///     result[j+31:j] := Load32(__X+(i*4))
3505 ///   ELSE
3506 ///     result[j+31:j] := 0
3507 ///   FI
3508 /// ENDFOR
3509 /// \endcode
3510 ///
3511 /// \headerfile <immintrin.h>
3512 ///
3513 /// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3514 ///
3515 /// \param __X
3516 ///    A pointer to the memory from which values are loaded.
3517 /// \param __M
3518 ///    A 256-bit vector of [8 x i32] supplying the mask bits.
3519 /// \returns A 256-bit vector of [8 x i32] holding the loaded values, with
3520 ///    zeros in the elements that were not loaded.
3521 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3522 _mm256_maskload_epi32(int const *__X, __m256i __M) {
3523   const __v8si *__src = (const __v8si *)__X;
3524   return (__m256i)__builtin_ia32_maskloadd256(__src, (__v8si)__M);
3525 }
3526
3527 /// Performs a masked load of four 64-bit integers from memory at \a __X.
3528 ///    An element is loaded when the most significant bit of the matching
3529 ///    element of the mask \a __M is set; every other element of the 256-bit
3530 ///    [4 x i64] result is set to zero.
3531 ///
3532 /// \code{.operation}
3533 /// FOR i := 0 TO 3
3534 ///   j := i*64
3535 ///   IF __M[j+63] == 1
3536 ///     result[j+63:j] := Load64(__X+(i*8))
3537 ///   ELSE
3538 ///     result[j+63:j] := 0
3539 ///   FI
3540 /// ENDFOR
3541 /// \endcode
3542 ///
3543 /// \headerfile <immintrin.h>
3544 ///
3545 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3546 ///
3547 /// \param __X
3548 ///    A pointer to the memory from which values are loaded.
3549 /// \param __M
3550 ///    A 256-bit vector of [4 x i64] supplying the mask bits.
3551 /// \returns A 256-bit vector of [4 x i64] holding the loaded values, with
3552 ///    zeros in the elements that were not loaded.
3553 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3554 _mm256_maskload_epi64(long long const *__X, __m256i __M) {
3555   const __v4di *__src = (const __v4di *)__X;
3556   return (__m256i)__builtin_ia32_maskloadq256(__src, (__v4di)__M);
3557 }
3558
3559 /// Performs a masked load of four 32-bit integers from memory at \a __X.
3560 ///    An element is loaded when the most significant bit of the matching
3561 ///    element of the mask \a __M is set; every other element of the 128-bit
3562 ///    [4 x i32] result is set to zero.
3563 ///
3564 /// \code{.operation}
3565 /// FOR i := 0 TO 3
3566 ///   j := i*32
3567 ///   IF __M[j+31] == 1
3568 ///     result[j+31:j] := Load32(__X+(i*4))
3569 ///   ELSE
3570 ///     result[j+31:j] := 0
3571 ///   FI
3572 /// ENDFOR
3573 /// \endcode
3574 ///
3575 /// \headerfile <immintrin.h>
3576 ///
3577 /// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3578 ///
3579 /// \param __X
3580 ///    A pointer to the memory from which values are loaded.
3581 /// \param __M
3582 ///    A 128-bit vector of [4 x i32] supplying the mask bits.
3583 /// \returns A 128-bit vector of [4 x i32] holding the loaded values, with
3584 ///    zeros in the elements that were not loaded.
3585 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3586 _mm_maskload_epi32(int const *__X, __m128i __M) {
3587   const __v4si *__src = (const __v4si *)__X;
3588   return (__m128i)__builtin_ia32_maskloadd(__src, (__v4si)__M);
3589 }
3590
3591 /// Performs a masked load of two 64-bit integers from memory at \a __X.
3592 ///    An element is loaded when the most significant bit of the matching
3593 ///    element of the mask \a __M is set; every other element of the 128-bit
3594 ///    [2 x i64] result is set to zero.
3595 ///
3596 /// \code{.operation}
3597 /// FOR i := 0 TO 1
3598 ///   j := i*64
3599 ///   IF __M[j+63] == 1
3600 ///     result[j+63:j] := Load64(__X+(i*8))
3601 ///   ELSE
3602 ///     result[j+63:j] := 0
3603 ///   FI
3604 /// ENDFOR
3605 /// \endcode
3606 ///
3607 /// \headerfile <immintrin.h>
3608 ///
3609 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3610 ///
3611 /// \param __X
3612 ///    A pointer to the memory from which values are loaded.
3613 /// \param __M
3614 ///    A 128-bit vector of [2 x i64] supplying the mask bits.
3615 /// \returns A 128-bit vector of [2 x i64] holding the loaded values, with
3616 ///    zeros in the elements that were not loaded.
3617 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3618 _mm_maskload_epi64(long long const *__X, __m128i __M) {
3619   const __v2di *__src = (const __v2di *)__X;
3620   return (__m128i)__builtin_ia32_maskloadq(__src, (__v2di)__M);
3621 }
3622
3623 /// Performs a masked store of eight 32-bit integers from the 256-bit vector
3624 ///    of [8 x i32] in \a __Y to memory at \a __X. An element is written when
3625 ///    the most significant bit of the matching element of the mask \a __M is
3626 ///    set; all other memory elements are left unchanged.
3627 ///
3628 /// \code{.operation}
3629 /// FOR i := 0 TO 7
3630 ///   j := i*32
3631 ///   IF __M[j+31] == 1
3632 ///     Store32(__X+(i*4), __Y[j+31:j])
3633 ///   FI
3634 /// ENDFOR
3635 /// \endcode
3636 ///
3637 /// \headerfile <immintrin.h>
3638 ///
3639 /// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3640 ///
3641 /// \param __X
3642 ///    A pointer to the memory into which values are stored.
3643 /// \param __M
3644 ///    A 256-bit vector of [8 x i32] supplying the mask bits.
3645 /// \param __Y
3646 ///    A 256-bit vector of [8 x i32] supplying the values to be stored.
3647 static __inline__ void __DEFAULT_FN_ATTRS256
3648 _mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y) {
3649   __v8si *__dst = (__v8si *)__X;
3650   __builtin_ia32_maskstored256(__dst, (__v8si)__M, (__v8si)__Y);
3651 }
3652
3653 /// Performs a masked store of four 64-bit integers from the 256-bit vector
3654 ///    of [4 x i64] in \a __Y to memory at \a __X. An element is written when
3655 ///    the most significant bit of the matching element of the mask \a __M is
3656 ///    set; all other memory elements are left unchanged.
3657 ///
3658 /// \code{.operation}
3659 /// FOR i := 0 TO 3
3660 ///   j := i*64
3661 ///   IF __M[j+63] == 1
3662 ///     Store64(__X+(i*8), __Y[j+63:j])
3663 ///   FI
3664 /// ENDFOR
3665 /// \endcode
3666 ///
3667 /// \headerfile <immintrin.h>
3668 ///
3669 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3670 ///
3671 /// \param __X
3672 ///    A pointer to the memory into which values are stored.
3673 /// \param __M
3674 ///    A 256-bit vector of [4 x i64] supplying the mask bits.
3675 /// \param __Y
3676 ///    A 256-bit vector of [4 x i64] supplying the values to be stored.
3677 static __inline__ void __DEFAULT_FN_ATTRS256
3678 _mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y) {
3679   __v4di *__dst = (__v4di *)__X;
3680   __builtin_ia32_maskstoreq256(__dst, (__v4di)__M, (__v4di)__Y);
3681 }
3682
3683 /// Performs a masked store of four 32-bit integers from the 128-bit vector
3684 ///    of [4 x i32] in \a __Y to memory at \a __X. An element is written when
3685 ///    the most significant bit of the matching element of the mask \a __M is
3686 ///    set; all other memory elements are left unchanged.
3687 ///
3688 /// \code{.operation}
3689 /// FOR i := 0 TO 3
3690 ///   j := i*32
3691 ///   IF __M[j+31] == 1
3692 ///     Store32(__X+(i*4), __Y[j+31:j])
3693 ///   FI
3694 /// ENDFOR
3695 /// \endcode
3696 ///
3697 /// \headerfile <immintrin.h>
3698 ///
3699 /// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3700 ///
3701 /// \param __X
3702 ///    A pointer to the memory into which values are stored.
3703 /// \param __M
3704 ///    A 128-bit vector of [4 x i32] supplying the mask bits.
3705 /// \param __Y
3706 ///    A 128-bit vector of [4 x i32] supplying the values to be stored.
3707 static __inline__ void __DEFAULT_FN_ATTRS128
3708 _mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y) {
3709   __v4si *__dst = (__v4si *)__X;
3710   __builtin_ia32_maskstored(__dst, (__v4si)__M, (__v4si)__Y);
3711 }
3712
3713 /// Performs a masked store of two 64-bit integers from the 128-bit vector
3714 ///    of [2 x i64] in \a __Y to memory at \a __X. An element is written when
3715 ///    the most significant bit of the matching element of the mask \a __M is
3716 ///    set; all other memory elements are left unchanged.
3717 ///
3718 /// \code{.operation}
3719 /// FOR i := 0 TO 1
3720 ///   j := i*64
3721 ///   IF __M[j+63] == 1
3722 ///     Store64(__X+(i*8), __Y[j+63:j])
3723 ///   FI
3724 /// ENDFOR
3725 /// \endcode
3726 ///
3727 /// \headerfile <immintrin.h>
3728 ///
3729 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3730 ///
3731 /// \param __X
3732 ///    A pointer to the memory into which values are stored.
3733 /// \param __M
3734 ///    A 128-bit vector of [2 x i64] supplying the mask bits.
3735 /// \param __Y
3736 ///    A 128-bit vector of [2 x i64] supplying the values to be stored.
3737 static __inline__ void __DEFAULT_FN_ATTRS128
3738 _mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y) {
3739   __v2di *__dst = (__v2di *)__X;
3740   __builtin_ia32_maskstoreq(__dst, (__v2di)__M, (__v2di)__Y);
3741 }
3742
3743 /// Left-shifts every 32-bit element of the 256-bit vector of [8 x i32] in
3744 ///    \a __X by the bit count held in the matching element of the 256-bit
3745 ///    vector of [8 x i32] in \a __Y, filling vacated bits with zeros. Any
3746 ///    element whose shift count is greater than 31 produces zero in the
3747 ///    result.
3748 ///
3749 /// \headerfile <immintrin.h>
3750 ///
3751 /// This intrinsic corresponds to the \c VPSLLVD instruction.
3752 ///
3753 /// \param __X
3754 ///    A 256-bit vector of [8 x i32] holding the values to shift.
3755 /// \param __Y
3756 ///    A 256-bit vector of [8 x i32] holding the per-element unsigned shift
3757 ///    counts (in bits).
3758 /// \returns A 256-bit vector of [8 x i32] containing the shifted values.
3759 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3760 _mm256_sllv_epi32(__m256i __X, __m256i __Y) {
3761   __v8si __val = (__v8si)__X, __cnt = (__v8si)__Y;
3762   return (__m256i)__builtin_ia32_psllv8si(__val, __cnt);
3763 }
3764
3765 /// Left-shifts every 32-bit element of the 128-bit vector of [4 x i32] in
3766 ///    \a __X by the bit count held in the matching element of the 128-bit
3767 ///    vector of [4 x i32] in \a __Y, filling vacated bits with zeros. Any
3768 ///    element whose shift count is greater than 31 produces zero in the
3769 ///    result.
3770 ///
3771 /// \headerfile <immintrin.h>
3772 ///
3773 /// This intrinsic corresponds to the \c VPSLLVD instruction.
3774 ///
3775 /// \param __X
3776 ///    A 128-bit vector of [4 x i32] holding the values to shift.
3777 /// \param __Y
3778 ///    A 128-bit vector of [4 x i32] holding the per-element unsigned shift
3779 ///    counts (in bits).
3780 /// \returns A 128-bit vector of [4 x i32] containing the shifted values.
3781 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3782 _mm_sllv_epi32(__m128i __X, __m128i __Y) {
3783   __v4si __val = (__v4si)__X, __cnt = (__v4si)__Y;
3784   return (__m128i)__builtin_ia32_psllv4si(__val, __cnt);
3785 }
3786
3787 /// Left-shifts every 64-bit element of the 256-bit vector of [4 x i64] in
3788 ///    \a __X by the bit count held in the matching element of the 256-bit
3789 ///    vector of [4 x i64] in \a __Y, filling vacated bits with zeros. Any
3790 ///    element whose shift count is greater than 63 produces zero in the
3791 ///    result.
3792 ///
3793 /// \headerfile <immintrin.h>
3794 ///
3795 /// This intrinsic corresponds to the \c VPSLLVQ instruction.
3796 ///
3797 /// \param __X
3798 ///    A 256-bit vector of [4 x i64] holding the values to shift.
3799 /// \param __Y
3800 ///    A 256-bit vector of [4 x i64] holding the per-element unsigned shift
3801 ///    counts (in bits).
3802 /// \returns A 256-bit vector of [4 x i64] containing the shifted values.
3803 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3804 _mm256_sllv_epi64(__m256i __X, __m256i __Y) {
3805   __v4di __val = (__v4di)__X, __cnt = (__v4di)__Y;
3806   return (__m256i)__builtin_ia32_psllv4di(__val, __cnt);
3807 }
3808
3809 /// Left-shifts every 64-bit element of the 128-bit vector of [2 x i64] in
3810 ///    \a __X by the bit count held in the matching element of the 128-bit
3811 ///    vector of [2 x i64] in \a __Y, filling vacated bits with zeros. Any
3812 ///    element whose shift count is greater than 63 produces zero in the
3813 ///    result.
3814 ///
3815 /// \headerfile <immintrin.h>
3816 ///
3817 /// This intrinsic corresponds to the \c VPSLLVQ instruction.
3818 ///
3819 /// \param __X
3820 ///    A 128-bit vector of [2 x i64] holding the values to shift.
3821 /// \param __Y
3822 ///    A 128-bit vector of [2 x i64] holding the per-element unsigned shift
3823 ///    counts (in bits).
3824 /// \returns A 128-bit vector of [2 x i64] containing the shifted values.
3825 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3826 _mm_sllv_epi64(__m128i __X, __m128i __Y) {
3827   __v2di __val = (__v2di)__X, __cnt = (__v2di)__Y;
3828   return (__m128i)__builtin_ia32_psllv2di(__val, __cnt);
3829 }
3830
3831 /// Arithmetically right-shifts every 32-bit element of the 256-bit vector
3832 ///    of [8 x i32] in \a __X by the bit count held in the matching element
3833 ///    of the 256-bit vector of [8 x i32] in \a __Y, filling vacated bits
3834 ///    with copies of the sign bit. Any element whose shift count is greater
3835 ///    than 31 produces 0 or -1 in the result, according to the sign bit of
3836 ///    that element.
3837 ///
3838 /// \headerfile <immintrin.h>
3839 ///
3840 /// This intrinsic corresponds to the \c VPSRAVD instruction.
3841 ///
3842 /// \param __X
3843 ///    A 256-bit vector of [8 x i32] holding the values to shift.
3844 /// \param __Y
3845 ///    A 256-bit vector of [8 x i32] holding the per-element unsigned shift
3846 ///    counts (in bits).
3847 /// \returns A 256-bit vector of [8 x i32] containing the shifted values.
3848 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3849 _mm256_srav_epi32(__m256i __X, __m256i __Y) {
3850   __v8si __val = (__v8si)__X, __cnt = (__v8si)__Y;
3851   return (__m256i)__builtin_ia32_psrav8si(__val, __cnt);
3852 }
3853
3854 /// Arithmetically right-shifts every 32-bit element of the 128-bit vector
3855 ///    of [4 x i32] in \a __X by the bit count held in the matching element
3856 ///    of the 128-bit vector of [4 x i32] in \a __Y, filling vacated bits
3857 ///    with copies of the sign bit. Any element whose shift count is greater
3858 ///    than 31 produces 0 or -1 in the result, according to the sign bit of
3859 ///    that element.
3860 ///
3861 /// \headerfile <immintrin.h>
3862 ///
3863 /// This intrinsic corresponds to the \c VPSRAVD instruction.
3864 ///
3865 /// \param __X
3866 ///    A 128-bit vector of [4 x i32] holding the values to shift.
3867 /// \param __Y
3868 ///    A 128-bit vector of [4 x i32] holding the per-element unsigned shift
3869 ///    counts (in bits).
3870 /// \returns A 128-bit vector of [4 x i32] containing the shifted values.
3871 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3872 _mm_srav_epi32(__m128i __X, __m128i __Y) {
3873   __v4si __val = (__v4si)__X, __cnt = (__v4si)__Y;
3874   return (__m128i)__builtin_ia32_psrav4si(__val, __cnt);
3875 }
3876
3877 /// Logically right-shifts every 32-bit element of the 256-bit vector of
3878 ///    [8 x i32] in \a __X by the bit count held in the matching element of
3879 ///    the 256-bit vector of [8 x i32] in \a __Y, filling vacated bits with
3880 ///    zeros. Any element whose shift count is greater than 31 produces zero
3881 ///    in the result.
3882 ///
3883 /// \headerfile <immintrin.h>
3884 ///
3885 /// This intrinsic corresponds to the \c VPSRLVD instruction.
3886 ///
3887 /// \param __X
3888 ///    A 256-bit vector of [8 x i32] holding the values to shift.
3889 /// \param __Y
3890 ///    A 256-bit vector of [8 x i32] holding the per-element unsigned shift
3891 ///    counts (in bits).
3892 /// \returns A 256-bit vector of [8 x i32] containing the shifted values.
3893 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3894 _mm256_srlv_epi32(__m256i __X, __m256i __Y) {
3895   __v8si __val = (__v8si)__X, __cnt = (__v8si)__Y;
3896   return (__m256i)__builtin_ia32_psrlv8si(__val, __cnt);
3897 }
3898
3899 /// Logically right-shifts every 32-bit element of the 128-bit vector of
3900 ///    [4 x i32] in \a __X by the bit count held in the matching element of
3901 ///    the 128-bit vector of [4 x i32] in \a __Y, filling vacated bits with
3902 ///    zeros. Any element whose shift count is greater than 31 produces zero
3903 ///    in the result.
3904 ///
3905 /// \headerfile <immintrin.h>
3906 ///
3907 /// This intrinsic corresponds to the \c VPSRLVD instruction.
3908 ///
3909 /// \param __X
3910 ///    A 128-bit vector of [4 x i32] holding the values to shift.
3911 /// \param __Y
3912 ///    A 128-bit vector of [4 x i32] holding the per-element unsigned shift
3913 ///    counts (in bits).
3914 /// \returns A 128-bit vector of [4 x i32] containing the shifted values.
3915 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3916 _mm_srlv_epi32(__m128i __X, __m128i __Y) {
3917   __v4si __val = (__v4si)__X, __cnt = (__v4si)__Y;
3918   return (__m128i)__builtin_ia32_psrlv4si(__val, __cnt);
3919 }
3920
3921 /// Logically right-shifts every 64-bit element of the 256-bit vector of
3922 ///    [4 x i64] in \a __X by the bit count held in the matching element of
3923 ///    the 256-bit vector of [4 x i64] in \a __Y, filling vacated bits with
3924 ///    zeros. Any element whose shift count is greater than 63 produces zero
3925 ///    in the result.
3926 ///
3927 /// \headerfile <immintrin.h>
3928 ///
3929 /// This intrinsic corresponds to the \c VPSRLVQ instruction.
3930 ///
3931 /// \param __X
3932 ///    A 256-bit vector of [4 x i64] holding the values to shift.
3933 /// \param __Y
3934 ///    A 256-bit vector of [4 x i64] holding the per-element unsigned shift
3935 ///    counts (in bits).
3936 /// \returns A 256-bit vector of [4 x i64] containing the shifted values.
3937 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3938 _mm256_srlv_epi64(__m256i __X, __m256i __Y) {
3939   __v4di __val = (__v4di)__X, __cnt = (__v4di)__Y;
3940   return (__m256i)__builtin_ia32_psrlv4di(__val, __cnt);
3941 }
3942
3943 /// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3944 ///    right by the number of bits given in the corresponding element of the
3945 ///    128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3946 ///    returns the result. If the shift count for any element is greater than
3947 ///    63, the result for that element is zero.
3948 ///
3949 /// \headerfile <immintrin.h>
3950 ///
3951 /// This intrinsic corresponds to the \c VPSRLVQ instruction.
3952 ///
3953 /// \param __X
3954 ///    A 128-bit vector of [2 x i64] to be shifted.
3955 /// \param __Y
3956 ///    A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
3957 ///    bits).
3958 /// \returns A 128-bit vector of [2 x i64] containing the result.
3959 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3960 _mm_srlv_epi64(__m128i __X, __m128i __Y)
3961 {
3962   return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);
3963 }
3964
/// Conditionally gathers two 64-bit floating-point values, either from the
///    128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
///    of [2 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_mask_i32gather_pd(__m128d a, const double *m, __m128i i,
///                               __m128d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param a
///    A 128-bit vector of [2 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
///    the first two elements are used.
/// \param mask
///    A 128-bit vector of [2 x double] containing the mask. The most
///    significant bit of each element is that element's mask bit: when it is
///    zero the element is copied from \a a, otherwise it is loaded from
///    memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
#define _mm_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128d)(a), \
                                      (double const *)(m), \
                                      (__v4si)(__m128i)(i), \
                                      (__v2df)(__m128d)(mask), (s)))
4013
/// Builds a 256-bit vector of [4 x double] in which each 64-bit
///    floating-point element is either copied from \a a or loaded from
///    memory. Loaded elements are addressed from \a m using the scaled 32-bit
///    indexes in the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
///    of [4 x double] in \a mask chooses the source of every element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_mask_i32gather_pd(__m256d a, const double *m, __m128i i,
///                                  __m256d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param a
///    A 256-bit vector of [4 x double] supplying the result elements whose
///    mask bit is zero.
/// \param m
///    A pointer to the memory that values are loaded from.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x double] containing the mask. The most
///    significant bit of each element is that element's mask bit: when it is
///    zero the element is copied from \a a, otherwise it is loaded from
///    memory.
/// \param s
///    A literal constant scale factor applied to the indexes in \a i. Must
///    be 1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
#define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m256d)__builtin_ia32_gatherd_pd256( \
      (__v4df)(__m256d)(a), (double const *)(m), (__v4si)(__m128i)(i), \
      (__v4df)(__m256d)(mask), (s)))
4061
/// Builds a 128-bit vector of [2 x double] in which each 64-bit
///    floating-point element is either copied from \a a or loaded from
///    memory. Loaded elements are addressed from \a m using the scaled 64-bit
///    indexes in the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
///    of [2 x double] in \a mask chooses the source of every element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_mask_i64gather_pd(__m128d a, const double *m, __m128i i,
///                               __m128d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param a
///    A 128-bit vector of [2 x double] supplying the result elements whose
///    mask bit is zero.
/// \param m
///    A pointer to the memory that values are loaded from.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [2 x double] containing the mask. The most
///    significant bit of each element is that element's mask bit: when it is
///    zero the element is copied from \a a, otherwise it is loaded from
///    memory.
/// \param s
///    A literal constant scale factor applied to the indexes in \a i. Must
///    be 1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
#define _mm_mask_i64gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherq_pd( \
      (__v2df)(__m128d)(a), (double const *)(m), (__v2di)(__m128i)(i), \
      (__v2df)(__m128d)(mask), (s)))
4109
/// Builds a 256-bit vector of [4 x double] in which each 64-bit
///    floating-point element is either copied from \a a or loaded from
///    memory. Loaded elements are addressed from \a m using the scaled 64-bit
///    indexes in the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
///    of [4 x double] in \a mask chooses the source of every element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_mask_i64gather_pd(__m256d a, const double *m, __m256i i,
///                                  __m256d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param a
///    A 256-bit vector of [4 x double] supplying the result elements whose
///    mask bit is zero.
/// \param m
///    A pointer to the memory that values are loaded from.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x double] containing the mask. The most
///    significant bit of each element is that element's mask bit: when it is
///    zero the element is copied from \a a, otherwise it is loaded from
///    memory.
/// \param s
///    A literal constant scale factor applied to the indexes in \a i. Must
///    be 1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
#define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
  ((__m256d)__builtin_ia32_gatherq_pd256( \
      (__v4df)(__m256d)(a), (double const *)(m), (__v4di)(__m256i)(i), \
      (__v4df)(__m256d)(mask), (s)))
4157
/// Builds a 128-bit vector of [4 x float] in which each 32-bit
///    floating-point element is either copied from \a a or loaded from
///    memory. Loaded elements are addressed from \a m using the scaled 32-bit
///    indexes in the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
///    of [4 x float] in \a mask chooses the source of every element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_mask_i32gather_ps(__m128 a, const float *m, __m128i i,
///                              __m128 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float] supplying the result elements whose
///    mask bit is zero.
/// \param m
///    A pointer to the memory that values are loaded from.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x float] containing the mask. The most
///    significant bit of each element is that element's mask bit: when it is
///    zero the element is copied from \a a, otherwise it is loaded from
///    memory.
/// \param s
///    A literal constant scale factor applied to the indexes in \a i. Must
///    be 1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm_mask_i32gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherd_ps( \
      (__v4sf)(__m128)(a), (float const *)(m), (__v4si)(__m128i)(i), \
      (__v4sf)(__m128)(mask), (s)))
4205
/// Builds a 256-bit vector of [8 x float] in which each 32-bit
///    floating-point element is either copied from \a a or loaded from
///    memory. Loaded elements are addressed from \a m using the scaled 32-bit
///    indexes in the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
///    of [8 x float] in \a mask chooses the source of every element.
///
/// \code{.operation}
/// FOR element := 0 to 7
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256 _mm256_mask_i32gather_ps(__m256 a, const float *m, __m256i i,
///                                 __m256 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param a
///    A 256-bit vector of [8 x float] supplying the result elements whose
///    mask bit is zero.
/// \param m
///    A pointer to the memory that values are loaded from.
/// \param i
///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [8 x float] containing the mask. The most
///    significant bit of each element is that element's mask bit: when it is
///    zero the element is copied from \a a, otherwise it is loaded from
///    memory.
/// \param s
///    A literal constant scale factor applied to the indexes in \a i. Must
///    be 1, 2, 4, or 8.
/// \returns A 256-bit vector of [8 x float] containing the gathered values.
#define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
  ((__m256)__builtin_ia32_gatherd_ps256( \
      (__v8sf)(__m256)(a), (float const *)(m), (__v8si)(__m256i)(i), \
      (__v8sf)(__m256)(mask), (s)))
4253
/// Builds a 128-bit vector of [4 x float] whose lower two 32-bit
///    floating-point elements are each either copied from \a a or loaded
///    from memory. Loaded elements are addressed from \a m using the scaled
///    64-bit indexes in the 128-bit vector of [2 x i64] in \a i. The 128-bit
///    vector of [4 x float] in \a mask chooses the source of the lower two
///    elements; the upper two elements of the result are set to zero.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// result[127:64] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_mask_i64gather_ps(__m128 a, const float *m, __m128i i,
///                              __m128 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float] supplying the result elements whose
///    mask bit is zero. Only the first two elements are used.
/// \param m
///    A pointer to the memory that values are loaded from.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x float] containing the mask. The most
///    significant bit of each element is that element's mask bit: when it is
///    zero the element is copied from \a a, otherwise it is loaded from
///    memory. Only the first two elements are used.
/// \param s
///    A literal constant scale factor applied to the indexes in \a i. Must
///    be 1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm_mask_i64gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherq_ps( \
      (__v4sf)(__m128)(a), (float const *)(m), (__v2di)(__m128i)(i), \
      (__v4sf)(__m128)(mask), (s)))
4304
/// Builds a 128-bit vector of [4 x float] in which each 32-bit
///    floating-point element is either copied from \a a or loaded from
///    memory. Loaded elements are addressed from \a m using the scaled 64-bit
///    indexes in the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
///    of [4 x float] in \a mask chooses the source of every element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm256_mask_i64gather_ps(__m128 a, const float *m, __m256i i,
///                                 __m128 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float] supplying the result elements whose
///    mask bit is zero.
/// \param m
///    A pointer to the memory that values are loaded from.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x float] containing the mask. The most
///    significant bit of each element is that element's mask bit: when it is
///    zero the element is copied from \a a, otherwise it is loaded from
///    memory.
/// \param s
///    A literal constant scale factor applied to the indexes in \a i. Must
///    be 1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherq_ps256( \
      (__v4sf)(__m128)(a), (float const *)(m), (__v4di)(__m256i)(i), \
      (__v4sf)(__m128)(mask), (s)))
4352
/// Builds a 128-bit vector of [4 x i32] in which each 32-bit integer
///    element is either copied from \a a or loaded from memory. Loaded
///    elements are addressed from \a m using the scaled 32-bit indexes in the
///    128-bit vector of [4 x i32] in \a i. The 128-bit vector of [4 x i32] in
///    \a mask chooses the source of every element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i32gather_epi32(__m128i a, const int *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
///
/// \param a
///    A 128-bit vector of [4 x i32] supplying the result elements whose mask
///    bit is zero.
/// \param m
///    A pointer to the memory that values are loaded from.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x i32] containing the mask. The most significant
///    bit of each element is that element's mask bit: when it is zero the
///    element is copied from \a a, otherwise it is loaded from memory.
/// \param s
///    A literal constant scale factor applied to the indexes in \a i. Must
///    be 1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherd_d( \
      (__v4si)(__m128i)(a), (int const *)(m), (__v4si)(__m128i)(i), \
      (__v4si)(__m128i)(mask), (s)))
4400
/// Builds a 256-bit vector of [8 x i32] in which each 32-bit integer
///    element is either copied from \a a or loaded from memory. Loaded
///    elements are addressed from \a m using the scaled 32-bit indexes in the
///    256-bit vector of [8 x i32] in \a i. The 256-bit vector of [8 x i32] in
///    \a mask chooses the source of every element.
///
/// \code{.operation}
/// FOR element := 0 to 7
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mask_i32gather_epi32(__m256i a, const int *m, __m256i i,
///                                     __m256i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
///
/// \param a
///    A 256-bit vector of [8 x i32] supplying the result elements whose mask
///    bit is zero.
/// \param m
///    A pointer to the memory that values are loaded from.
/// \param i
///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [8 x i32] containing the mask. The most significant
///    bit of each element is that element's mask bit: when it is zero the
///    element is copied from \a a, otherwise it is loaded from memory.
/// \param s
///    A literal constant scale factor applied to the indexes in \a i. Must
///    be 1, 2, 4, or 8.
/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherd_d256( \
      (__v8si)(__m256i)(a), (int const *)(m), (__v8si)(__m256i)(i), \
      (__v8si)(__m256i)(mask), (s)))
4448
/// Builds a 128-bit vector of [4 x i32] whose lower two 32-bit integer
///    elements are each either copied from \a a or loaded from memory.
///    Loaded elements are addressed from \a m using the scaled 64-bit indexes
///    in the 128-bit vector of [2 x i64] in \a i. The 128-bit vector of
///    [4 x i32] in \a mask chooses the source of the lower two elements; the
///    upper two elements of the result are set to zero.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// result[127:64] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i64gather_epi32(__m128i a, const int *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
///
/// \param a
///    A 128-bit vector of [4 x i32] supplying the result elements whose mask
///    bit is zero. Only the first two elements are used.
/// \param m
///    A pointer to the memory that values are loaded from.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x i32] containing the mask. The most significant
///    bit of each element is that element's mask bit: when it is zero the
///    element is copied from \a a, otherwise it is loaded from memory. Only
///    the first two elements are used.
/// \param s
///    A literal constant scale factor applied to the indexes in \a i. Must
///    be 1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_d( \
      (__v4si)(__m128i)(a), (int const *)(m), (__v2di)(__m128i)(i), \
      (__v4si)(__m128i)(mask), (s)))
4499
/// Builds a 128-bit vector of [4 x i32] in which each 32-bit integer
///    element is either copied from \a a or loaded from memory. Loaded
///    elements are addressed from \a m using the scaled 64-bit indexes in the
///    256-bit vector of [4 x i64] in \a i. The 128-bit vector of [4 x i32] in
///    \a mask chooses the source of every element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm256_mask_i64gather_epi32(__m128i a, const int *m, __m256i i,
///                                     __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
///
/// \param a
///    A 128-bit vector of [4 x i32] supplying the result elements whose mask
///    bit is zero.
/// \param m
///    A pointer to the memory that values are loaded from.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x i32] containing the mask. The most significant
///    bit of each element is that element's mask bit: when it is zero the
///    element is copied from \a a, otherwise it is loaded from memory.
/// \param s
///    A literal constant scale factor applied to the indexes in \a i. Must
///    be 1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_d256( \
      (__v4si)(__m128i)(a), (int const *)(m), (__v4di)(__m256i)(i), \
      (__v4si)(__m128i)(mask), (s)))
4547
/// Builds a 128-bit vector of [2 x i64] in which each 64-bit integer
///    element is either copied from \a a or loaded from memory. Loaded
///    elements are addressed from \a m using the scaled 32-bit indexes in the
///    128-bit vector of [4 x i32] in \a i. The 128-bit vector of [2 x i64] in
///    \a mask chooses the source of every element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i32gather_epi64(__m128i a, const long long *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
///
/// \param a
///    A 128-bit vector of [2 x i64] supplying the result elements whose mask
///    bit is zero.
/// \param m
///    A pointer to the memory that values are loaded from.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
///    the first two elements are used.
/// \param mask
///    A 128-bit vector of [2 x i64] containing the mask. The most significant
///    bit of each element is that element's mask bit: when it is zero the
///    element is copied from \a a, otherwise it is loaded from memory.
/// \param s
///    A literal constant scale factor applied to the indexes in \a i. Must
///    be 1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
#define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherd_q( \
      (__v2di)(__m128i)(a), (long long const *)(m), (__v4si)(__m128i)(i), \
      (__v2di)(__m128i)(mask), (s)))
4596
/// Builds a 256-bit vector of [4 x i64] in which each 64-bit integer
///    element is either copied from \a a or loaded from memory. Loaded
///    elements are addressed from \a m using the scaled 32-bit indexes in the
///    128-bit vector of [4 x i32] in \a i. The 256-bit vector of [4 x i64] in
///    \a mask chooses the source of every element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mask_i32gather_epi64(__m256i a, const long long *m,
///                                     __m128i i, __m256i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
///
/// \param a
///    A 256-bit vector of [4 x i64] supplying the result elements whose mask
///    bit is zero.
/// \param m
///    A pointer to the memory that values are loaded from.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x i64] containing the mask. The most significant
///    bit of each element is that element's mask bit: when it is zero the
///    element is copied from \a a, otherwise it is loaded from memory.
/// \param s
///    A literal constant scale factor applied to the indexes in \a i. Must
///    be 1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherd_q256( \
      (__v4di)(__m256i)(a), (long long const *)(m), (__v4si)(__m128i)(i), \
      (__v4di)(__m256i)(mask), (s)))
4644
4645 /// Conditionally gathers two 64-bit integer values, either from the
4646 ///    128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
4647 ///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4648 ///    of [2 x i64] in \a mask determines the source for each element.
4649 ///
4650 /// \code{.operation}
4651 /// FOR element := 0 to 1
4652 ///   j := element*64
4653 ///   k := element*64
4654 ///   IF mask[j+63] == 0
4655 ///     result[j+63:j] := a[j+63:j]
4656 ///   ELSE
4657 ///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4658 ///   FI
4659 /// ENDFOR
4660 /// \endcode
4661 ///
4662 /// \headerfile <immintrin.h>
4663 ///
4664 /// \code
4665 /// __m128i _mm_mask_i64gather_epi64(__m128i a, const long long *m, __m128i i,
4666 ///                                  __m128i mask, const int s);
4667 /// \endcode
4668 ///
4669 /// This intrinsic corresponds to the \c VPGATHERQQ instruction.
4670 ///
4671 /// \param a
4672 ///    A 128-bit vector of [2 x i64] used as the source when a mask bit is
4673 ///    zero.
4674 /// \param m
4675 ///    A pointer to the memory used for loading values.
4676 /// \param i
4677 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4678 /// \param mask
4679 ///    A 128-bit vector of [2 x i64] containing the mask. The most significant
4680 ///    bit of each element in the mask vector is that element's mask bit. If a
4681 ///    mask bit is zero, the corresponding element is copied from vector \a a;
4682 ///    otherwise the element is loaded from memory.
4683 /// \param s
4684 ///    A literal constant scale factor for the indexes in \a i. Must be
4685 ///    1, 2, 4, or 8.
4686 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
4687 #define _mm_mask_i64gather_epi64(a, m, i, mask, s)                             \
4688   ((__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a),                     \
4689       (long long const *)(m),                                                  \
4690       (__v2di)(__m128i)(i),                                                    \
4691       (__v2di)(__m128i)(mask), (s)))
4692
4693 /// Conditionally gathers four 64-bit integer values, either from the
4694 ///    256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
4695 ///    indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
4696 ///    of [4 x i64] in \a mask determines the source for each element.
4697 ///
4698 /// \code{.operation}
4699 /// FOR element := 0 to 3
4700 ///   j := element*64
4701 ///   k := element*64
4702 ///   IF mask[j+63] == 0
4703 ///     result[j+63:j] := a[j+63:j]
4704 ///   ELSE
4705 ///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4706 ///   FI
4707 /// ENDFOR
4708 /// \endcode
4709 ///
4710 /// \headerfile <immintrin.h>
4711 ///
4712 /// \code
4713 /// __m256i _mm256_mask_i64gather_epi64(__m256i a, const long long *m,
4714 ///                                     __m256i i, __m256i mask, const int s);
4715 /// \endcode
4716 ///
4717 /// This intrinsic corresponds to the \c VPGATHERQQ instruction.
4718 ///
4719 /// \param a
4720 ///    A 256-bit vector of [4 x i64] used as the source when a mask bit is
4721 ///    zero.
4722 /// \param m
4723 ///    A pointer to the memory used for loading values.
4724 /// \param i
4725 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4726 /// \param mask
4727 ///    A 256-bit vector of [4 x i64] containing the mask. The most significant
4728 ///    bit of each element in the mask vector is that element's mask bit. If a
4729 ///    mask bit is zero, the corresponding element is copied from vector \a a;
4730 ///    otherwise the element is loaded from memory.
4731 /// \param s
4732 ///    A literal constant scale factor for the indexes in \a i. Must be
4733 ///    1, 2, 4, or 8.
4734 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
4735 #define _mm256_mask_i64gather_epi64(a, m, i, mask, s)                          \
4736   ((__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a),                  \
4737       (long long const *)(m),                                                  \
4738       (__v4di)(__m256i)(i),                                                    \
4739       (__v4di)(__m256i)(mask), (s)))
4740
4741 /// Gathers two 64-bit floating-point values from memory \a m using scaled
4742 ///    indexes from the 128-bit vector of [4 x i32] in \a i.
4743 ///
4744 /// \code{.operation}
4745 /// FOR element := 0 to 1
4746 ///   j := element*64
4747 ///   k := element*32
4748 ///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4749 /// ENDFOR
4750 /// \endcode
4751 ///
4752 /// \headerfile <immintrin.h>
4753 ///
4754 /// \code
4755 /// __m128d _mm_i32gather_pd(const double *m, __m128i i, const int s);
4756 /// \endcode
4757 ///
4758 /// This intrinsic corresponds to the \c VGATHERDPD instruction.
4759 ///
4760 /// \param m
4761 ///    A pointer to the memory used for loading values.
4762 /// \param i
4763 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
4764 ///    the first two elements are used.
4765 /// \param s
4766 ///    A literal constant scale factor for the indexes in \a i. Must be
4767 ///    1, 2, 4, or 8.
4768 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
4769 #define _mm_i32gather_pd(m, i, s)                                              \
4770   ((__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(),              \
4771       (double const *)(m),                                                     \
4772       (__v4si)(__m128i)(i),                                                    \
4773       (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(),                                   \
4774                            _mm_setzero_pd()),                                  \
4775       (s)))
4776
4777 /// Gathers four 64-bit floating-point values from memory \a m using scaled
4778 ///    indexes from the 128-bit vector of [4 x i32] in \a i.
4779 ///
4780 /// \code{.operation}
4781 /// FOR element := 0 to 3
4782 ///   j := element*64
4783 ///   k := element*32
4784 ///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4785 /// ENDFOR
4786 /// \endcode
4787 ///
4788 /// \headerfile <immintrin.h>
4789 ///
4790 /// \code
4791 /// __m256d _mm256_i32gather_pd(const double *m, __m128i i, const int s);
4792 /// \endcode
4793 ///
4794 /// This intrinsic corresponds to the \c VGATHERDPD instruction.
4795 ///
4796 /// \param m
4797 ///    A pointer to the memory used for loading values.
4798 /// \param i
4799 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4800 /// \param s
4801 ///    A literal constant scale factor for the indexes in \a i. Must be
4802 ///    1, 2, 4, or 8.
4803 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
4804 #define _mm256_i32gather_pd(m, i, s)                                           \
4805   ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(),        \
4806       (double const *)(m),                                                     \
4807       (__v4si)(__m128i)(i),                                                    \
4808       (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(),                               \
4809                             _mm256_setzero_pd(),                               \
4810                             _CMP_EQ_OQ),                                       \
4811       (s)))
4812
4813 /// Gathers two 64-bit floating-point values from memory \a m using scaled
4814 ///    indexes from the 128-bit vector of [2 x i64] in \a i.
4815 ///
4816 /// \code{.operation}
4817 /// FOR element := 0 to 1
4818 ///   j := element*64
4819 ///   k := element*64
4820 ///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4821 /// ENDFOR
4822 /// \endcode
4823 ///
4824 /// \headerfile <immintrin.h>
4825 ///
4826 /// \code
4827 /// __m128d _mm_i64gather_pd(const double *m, __m128i i, const int s);
4828 /// \endcode
4829 ///
4830 /// This intrinsic corresponds to the \c VGATHERQPD instruction.
4831 ///
4832 /// \param m
4833 ///    A pointer to the memory used for loading values.
4834 /// \param i
4835 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4836 /// \param s
4837 ///    A literal constant scale factor for the indexes in \a i. Must be
4838 ///    1, 2, 4, or 8.
4839 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
4840 #define _mm_i64gather_pd(m, i, s)                                              \
4841   ((__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(),              \
4842       (double const *)(m),                                                     \
4843       (__v2di)(__m128i)(i),                                                    \
4844       (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(),                                   \
4845                            _mm_setzero_pd()),                                  \
4846       (s)))
4847
4848 /// Gathers four 64-bit floating-point values from memory \a m using scaled
4849 ///    indexes from the 256-bit vector of [4 x i64] in \a i.
4850 ///
4851 /// \code{.operation}
4852 /// FOR element := 0 to 3
4853 ///   j := element*64
4854 ///   k := element*64
4855 ///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4856 /// ENDFOR
4857 /// \endcode
4858 ///
4859 /// \headerfile <immintrin.h>
4860 ///
4861 /// \code
4862 /// __m256d _mm256_i64gather_pd(const double *m, __m256i i, const int s);
4863 /// \endcode
4864 ///
4865 /// This intrinsic corresponds to the \c VGATHERQPD instruction.
4866 ///
4867 /// \param m
4868 ///    A pointer to the memory used for loading values.
4869 /// \param i
4870 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4871 /// \param s
4872 ///    A literal constant scale factor for the indexes in \a i. Must be
4873 ///    1, 2, 4, or 8.
4874 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
4875 #define _mm256_i64gather_pd(m, i, s)                                           \
4876   ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(),        \
4877       (double const *)(m),                                                     \
4878       (__v4di)(__m256i)(i),                                                    \
4879       (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(),                               \
4880                             _mm256_setzero_pd(),                               \
4881                             _CMP_EQ_OQ),                                       \
4882       (s)))
4883
4884 /// Gathers four 32-bit floating-point values from memory \a m using scaled
4885 ///    indexes from the 128-bit vector of [4 x i32] in \a i.
4886 ///
4887 /// \code{.operation}
4888 /// FOR element := 0 to 3
4889 ///   j := element*32
4890 ///   k := element*32
4891 ///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4892 /// ENDFOR
4893 /// \endcode
4894 ///
4895 /// \headerfile <immintrin.h>
4896 ///
4897 /// \code
4898 /// __m128 _mm_i32gather_ps(const float *m, __m128i i, const int s);
4899 /// \endcode
4900 ///
4901 /// This intrinsic corresponds to the \c VGATHERDPS instruction.
4902 ///
4903 /// \param m
4904 ///    A pointer to the memory used for loading values.
4905 /// \param i
4906 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4907 /// \param s
4908 ///    A literal constant scale factor for the indexes in \a i. Must be
4909 ///    1, 2, 4, or 8.
4910 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
4911 #define _mm_i32gather_ps(m, i, s)                                              \
4912   ((__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(),               \
4913       (float const *)(m),                                                      \
4914       (__v4si)(__m128i)(i),                                                    \
4915       (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(),                                   \
4916                            _mm_setzero_ps()),                                  \
4917       (s)))
4918
4919 /// Gathers eight 32-bit floating-point values from memory \a m using scaled
4920 ///    indexes from the 256-bit vector of [8 x i32] in \a i.
4921 ///
4922 /// \code{.operation}
4923 /// FOR element := 0 to 7
4924 ///   j := element*32
4925 ///   k := element*32
4926 ///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4927 /// ENDFOR
4928 /// \endcode
4929 ///
4930 /// \headerfile <immintrin.h>
4931 ///
4932 /// \code
4933 /// __m256 _mm256_i32gather_ps(const float *m, __m256i i, const int s);
4934 /// \endcode
4935 ///
4936 /// This intrinsic corresponds to the \c VGATHERDPS instruction.
4937 ///
4938 /// \param m
4939 ///    A pointer to the memory used for loading values.
4940 /// \param i
4941 ///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4942 /// \param s
4943 ///    A literal constant scale factor for the indexes in \a i. Must be
4944 ///    1, 2, 4, or 8.
4945 /// \returns A 256-bit vector of [8 x float] containing the gathered values.
4946 #define _mm256_i32gather_ps(m, i, s)                                           \
4947   ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(),         \
4948       (float const *)(m),                                                      \
4949       (__v8si)(__m256i)(i),                                                    \
4950       (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(),                               \
4951                             _mm256_setzero_ps(),                               \
4952                             _CMP_EQ_OQ),                                       \
4953       (s)))
4954
4955 /// Gathers two 32-bit floating-point values from memory \a m using scaled
4956 ///    indexes from the 128-bit vector of [2 x i64] in \a i. The upper two
4957 ///    elements of the result are zeroed.
4958 ///
4959 /// \code{.operation}
4960 /// FOR element := 0 to 1
4961 ///   j := element*32
4962 ///   k := element*64
4963 ///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4964 /// ENDFOR
4965 /// result[127:64] := 0
4966 /// \endcode
4967 ///
4968 /// \headerfile <immintrin.h>
4969 ///
4970 /// \code
4971 /// __m128 _mm_i64gather_ps(const float *m, __m128i i, const int s);
4972 /// \endcode
4973 ///
4974 /// This intrinsic corresponds to the \c VGATHERQPS instruction.
4975 ///
4976 /// \param m
4977 ///    A pointer to the memory used for loading values.
4978 /// \param i
4979 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4980 /// \param s
4981 ///    A literal constant scale factor for the indexes in \a i. Must be
4982 ///    1, 2, 4, or 8.
4983 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
4984 #define _mm_i64gather_ps(m, i, s)                                              \
4985   ((__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(),               \
4986       (float const *)(m),                                                      \
4987       (__v2di)(__m128i)(i),                                                    \
4988       (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(),                                   \
4989                            _mm_setzero_ps()),                                  \
4990       (s)))
4991
4992 /// Gathers four 32-bit floating-point values from memory \a m using scaled
4993 ///    indexes from the 256-bit vector of [4 x i64] in \a i.
4994 ///
4995 /// \code{.operation}
4996 /// FOR element := 0 to 3
4997 ///   j := element*32
4998 ///   k := element*64
4999 ///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
5000 /// ENDFOR
5001 /// \endcode
5002 ///
5003 /// \headerfile <immintrin.h>
5004 ///
5005 /// \code
5006 /// __m128 _mm256_i64gather_ps(const float *m, __m256i i, const int s);
5007 /// \endcode
5008 ///
5009 /// This intrinsic corresponds to the \c VGATHERQPS instruction.
5010 ///
5011 /// \param m
5012 ///    A pointer to the memory used for loading values.
5013 /// \param i
5014 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5015 /// \param s
5016 ///    A literal constant scale factor for the indexes in \a i. Must be
5017 ///    1, 2, 4, or 8.
5018 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
5019 #define _mm256_i64gather_ps(m, i, s)                                           \
5020   ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(),            \
5021       (float const *)(m),                                                      \
5022       (__v4di)(__m256i)(i),                                                    \
5023       (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(),                                   \
5024                            _mm_setzero_ps()),                                  \
5025       (s)))
5026
5027 /// Gathers four 32-bit integer values from memory \a m using scaled
5028 ///    indexes from the 128-bit vector of [4 x i32] in \a i.
5029 ///
5030 /// \code{.operation}
5031 /// FOR element := 0 to 3
5032 ///   j := element*32
5033 ///   k := element*32
5034 ///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
5035 /// ENDFOR
5036 /// \endcode
5037 ///
5038 /// \headerfile <immintrin.h>
5039 ///
5040 /// \code
5041 /// __m128i _mm_i32gather_epi32(const int *m, __m128i i, const int s);
5042 /// \endcode
5043 ///
5044 /// This intrinsic corresponds to the \c VPGATHERDD instruction.
5045 ///
5046 /// \param m
5047 ///    A pointer to the memory used for loading values.
5048 /// \param i
5049 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
5050 /// \param s
5051 ///    A literal constant scale factor for the indexes in \a i. Must be
5052 ///    1, 2, 4, or 8.
5053 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
5054 #define _mm_i32gather_epi32(m, i, s)                                           \
5055   ((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(),            \
5056       (int const *)(m), (__v4si)(__m128i)(i),                                  \
5057       (__v4si)_mm_set1_epi32(-1), (s)))
5058
5059 /// Gathers eight 32-bit integer values from memory \a m using scaled
5060 ///    indexes from the 256-bit vector of [8 x i32] in \a i.
5061 ///
5062 /// \code{.operation}
5063 /// FOR element := 0 to 7
5064 ///   j := element*32
5065 ///   k := element*32
5066 ///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
5067 /// ENDFOR
5068 /// \endcode
5069 ///
5070 /// \headerfile <immintrin.h>
5071 ///
5072 /// \code
5073 /// __m256i _mm256_i32gather_epi32(const int *m, __m256i i, const int s);
5074 /// \endcode
5075 ///
5076 /// This intrinsic corresponds to the \c VPGATHERDD instruction.
5077 ///
5078 /// \param m
5079 ///    A pointer to the memory used for loading values.
5080 /// \param i
5081 ///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
5082 /// \param s
5083 ///    A literal constant scale factor for the indexes in \a i. Must be
5084 ///    1, 2, 4, or 8.
5085 /// \returns A 256-bit vector of [8 x i32] containing the gathered values.
5086 #define _mm256_i32gather_epi32(m, i, s)                                        \
5087   ((__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(),      \
5088       (int const *)(m), (__v8si)(__m256i)(i),                                  \
5089       (__v8si)_mm256_set1_epi32(-1), (s)))
5090
5091 /// Gathers two 32-bit integer values from memory \a m using scaled indexes
5092 ///    from the 128-bit vector of [2 x i64] in \a i. The upper two elements
5093 ///    of the result are zeroed.
5094 ///
5095 /// \code{.operation}
5096 /// FOR element := 0 to 1
5097 ///   j := element*32
5098 ///   k := element*64
5099 ///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
5100 /// ENDFOR
5101 /// result[127:64] := 0
5102 /// \endcode
5103 ///
5104 /// \headerfile <immintrin.h>
5105 ///
5106 /// \code
5107 /// __m128i _mm_i64gather_epi32(const int *m, __m128i i, const int s);
5108 /// \endcode
5109 ///
5110 /// This intrinsic corresponds to the \c VPGATHERQD instruction.
5111 ///
5112 /// \param m
5113 ///    A pointer to the memory used for loading values.
5114 /// \param i
5115 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
5116 /// \param s
5117 ///    A literal constant scale factor for the indexes in \a i. Must be
5118 ///    1, 2, 4, or 8.
5119 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
5120 #define _mm_i64gather_epi32(m, i, s)                                           \
5121   ((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(),            \
5122       (int const *)(m), (__v2di)(__m128i)(i),                                  \
5123       (__v4si)_mm_set1_epi32(-1), (s)))
5124
5125 /// Gathers four 32-bit integer values from memory \a m using scaled indexes
5126 ///    from the 256-bit vector of [4 x i64] in \a i.
5127 ///
5128 /// \code{.operation}
5129 /// FOR element := 0 to 3
5130 ///   j := element*32
5131 ///   k := element*64
5132 ///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
5133 /// ENDFOR
5134 /// \endcode
5135 ///
5136 /// \headerfile <immintrin.h>
5137 ///
5138 /// \code
5139 /// __m128i _mm256_i64gather_epi32(const int *m, __m256i i, const int s);
5140 /// \endcode
5141 ///
5142 /// This intrinsic corresponds to the \c VPGATHERQD instruction.
5143 ///
5144 /// \param m
5145 ///    A pointer to the memory used for loading values.
5146 /// \param i
5147 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5148 /// \param s
5149 ///    A literal constant scale factor for the indexes in \a i. Must be
5150 ///    1, 2, 4, or 8.
5151 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
5152 #define _mm256_i64gather_epi32(m, i, s)                                        \
5153   ((__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(),         \
5154       (int const *)(m), (__v4di)(__m256i)(i),                                  \
5155       (__v4si)_mm_set1_epi32(-1), (s)))
5156
5157 /// Gathers two 64-bit integer values from memory \a m using scaled indexes
5158 ///    from the 128-bit vector of [4 x i32] in \a i.
5159 ///
5160 /// \code{.operation}
5161 /// FOR element := 0 to 1
5162 ///   j := element*64
5163 ///   k := element*32
5164 ///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
5165 /// ENDFOR
5166 /// \endcode
5167 ///
5168 /// \headerfile <immintrin.h>
5169 ///
5170 /// \code
5171 /// __m128i _mm_i32gather_epi64(const long long *m, __m128i i, const int s);
5172 /// \endcode
5173 ///
5174 /// This intrinsic corresponds to the \c VPGATHERDQ instruction.
5175 ///
5176 /// \param m
5177 ///    A pointer to the memory used for loading values.
5178 /// \param i
5179 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
5180 ///    the first two elements are used.
5181 /// \param s
5182 ///    A literal constant scale factor for the indexes in \a i. Must be
5183 ///    1, 2, 4, or 8.
5184 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
5185 #define _mm_i32gather_epi64(m, i, s)                                           \
5186   ((__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(),            \
5187       (long long const *)(m),                                                  \
5188       (__v4si)(__m128i)(i),                                                    \
5189       (__v2di)_mm_set1_epi64x(-1), (s)))
5190
5191 /// Gathers four 64-bit integer values from memory \a m using scaled indexes
5192 ///    from the 128-bit vector of [4 x i32] in \a i.
5193 ///
5194 /// \code{.operation}
5195 /// FOR element := 0 to 3
5196 ///   j := element*64
5197 ///   k := element*32
5198 ///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
5199 /// ENDFOR
5200 /// \endcode
5201 ///
5202 /// \headerfile <immintrin.h>
5203 ///
5204 /// \code
5205 /// __m256i _mm256_i32gather_epi64(const long long *m, __m128i i, const int s);
5206 /// \endcode
5207 ///
5208 /// This intrinsic corresponds to the \c VPGATHERDQ instruction.
5209 ///
5210 /// \param m
5211 ///    A pointer to the memory used for loading values.
5212 /// \param i
5213 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
5214 /// \param s
5215 ///    A literal constant scale factor for the indexes in \a i. Must be
5216 ///    1, 2, 4, or 8.
5217 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
5218 #define _mm256_i32gather_epi64(m, i, s)                                        \
5219   ((__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(),      \
5220       (long long const *)(m),                                                  \
5221       (__v4si)(__m128i)(i),                                                    \
5222       (__v4di)_mm256_set1_epi64x(-1), (s)))
5223
5224 /// Gathers two 64-bit integer values from memory \a m using scaled indexes
5225 ///    from the 128-bit vector of [2 x i64] in \a i.
5226 ///
5227 /// \code{.operation}
5228 /// FOR element := 0 to 1
5229 ///   j := element*64
5230 ///   k := element*64
5231 ///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
5232 /// ENDFOR
5233 /// \endcode
5234 ///
5235 /// \headerfile <immintrin.h>
5236 ///
5237 /// \code
5238 /// __m128i _mm_i64gather_epi64(const long long *m, __m128i i, const int s);
5239 /// \endcode
5240 ///
5241 /// This intrinsic corresponds to the \c VPGATHERQQ instruction.
5242 ///
5243 /// \param m
5244 ///    A pointer to the memory used for loading values.
5245 /// \param i
5246 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
5247 /// \param s
5248 ///    A literal constant scale factor for the indexes in \a i. Must be
5249 ///    1, 2, 4, or 8.
5250 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
5251 #define _mm_i64gather_epi64(m, i, s)                                           \
5252   ((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(),            \
5253       (long long const *)(m),                                                  \
5254       (__v2di)(__m128i)(i),                                                    \
5255       (__v2di)_mm_set1_epi64x(-1), (s)))
5256
5257 /// Gathers four 64-bit integer values from memory \a m using scaled indexes
5258 ///    from the 256-bit vector of [4 x i64] in \a i.
5259 ///
5260 /// \code{.operation}
5261 /// FOR element := 0 to 3
5262 ///   j := element*64
5263 ///   k := element*64
5264 ///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
5265 /// ENDFOR
5266 /// \endcode
5267 ///
5268 /// \headerfile <immintrin.h>
5269 ///
5270 /// \code
5271 /// __m256i _mm256_i64gather_epi64(const long long *m, __m256i i, const int s);
5272 /// \endcode
5273 ///
5274 /// This intrinsic corresponds to the \c VPGATHERQQ instruction.
5275 ///
5276 /// \param m
5277 ///    A pointer to the memory used for loading values.
5278 /// \param i
5279 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5280 /// \param s
5281 ///    A literal constant scale factor for the indexes in \a i. Must be
5282 ///    1, 2, 4, or 8.
5283 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
5284 #define _mm256_i64gather_epi64(m, i, s)                                        \
5285   ((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(),      \
5286       (long long const *)(m),                                                  \
5287       (__v4di)(__m256i)(i),                                                    \
5288       (__v4di)_mm256_set1_epi64x(-1), (s)))
5289
5290 #undef __DEFAULT_FN_ATTRS256
5291 #undef __DEFAULT_FN_ATTRS128
5292
5293 #endif /* __AVX2INTRIN_H */