1 /*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 *===-----------------------------------------------------------------------===
8 */
10 #ifndef __XMMINTRIN_H
11 #define __XMMINTRIN_H
13 #if !defined(__i386__) && !defined(__x86_64__)
14 #error "This header is only meant to be used on x86 and x64 architecture"
15 #endif
17 #include <mmintrin.h>
19 typedef int __v4si __attribute__((__vector_size__(16)));
20 typedef float __v4sf __attribute__((__vector_size__(16)));
21 typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
23 typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));
25 /* Unsigned types */
26 typedef unsigned int __v4su __attribute__((__vector_size__(16)));
28 /* This header should only be included in a hosted environment as it depends on
29 * a standard library to provide allocation routines. */
30 #if __STDC_HOSTED__
31 #include <mm_malloc.h>
32 #endif
34 /* Define the default attributes for the functions in this file. */
35 #if defined(__EVEX512__) && !defined(__AVX10_1_512__)
36 #define __DEFAULT_FN_ATTRS \
37 __attribute__((__always_inline__, __nodebug__, __target__("sse,no-evex512"), \
38 __min_vector_width__(128)))
39 #define __DEFAULT_FN_ATTRS_SSE2 \
40 __attribute__((__always_inline__, __nodebug__, \
41 __target__("sse2,no-evex512"), __min_vector_width__(128)))
42 #else
43 #define __DEFAULT_FN_ATTRS \
44 __attribute__((__always_inline__, __nodebug__, __target__("sse"), \
45 __min_vector_width__(128)))
46 #define __DEFAULT_FN_ATTRS_SSE2 \
47 __attribute__((__always_inline__, __nodebug__, __target__("sse2"), \
48 __min_vector_width__(128)))
49 #endif
51 #if defined(__cplusplus) && (__cplusplus >= 201103L)
52 #define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
53 #define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2 constexpr
54 #else
55 #define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
56 #define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2
57 #endif
59 #define __trunc64(x) \
60 (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
61 #define __zext128(x) \
62 (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \
63 1, 2, 3)
64 #define __anyext128(x) \
65 (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \
66 1, -1, -1)
67 #define __zeroupper64(x) \
68 (__m128i) __builtin_shufflevector((__v4si)(x), __extension__(__v4si){}, 0, \
69 1, 4, 5)
71 /// Adds the 32-bit float values in the low-order bits of the operands.
72 ///
73 /// \headerfile <x86intrin.h>
74 ///
75 /// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
76 ///
77 /// \param __a
78 /// A 128-bit vector of [4 x float] containing one of the source operands.
79 /// The lower 32 bits of this operand are used in the calculation.
80 /// \param __b
81 /// A 128-bit vector of [4 x float] containing one of the source operands.
82 /// The lower 32 bits of this operand are used in the calculation.
83 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
84 /// of the lower 32 bits of both operands. The upper 96 bits are copied from
85 /// the upper 96 bits of the first source operand.
86 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
87 _mm_add_ss(__m128 __a, __m128 __b) {
88 __a[0] += __b[0];
89 return __a;
92 /// Adds two 128-bit vectors of [4 x float], and returns the results of
93 /// the addition.
94 ///
95 /// \headerfile <x86intrin.h>
96 ///
97 /// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
98 ///
99 /// \param __a
100 /// A 128-bit vector of [4 x float] containing one of the source operands.
101 /// \param __b
102 /// A 128-bit vector of [4 x float] containing one of the source operands.
103 /// \returns A 128-bit vector of [4 x float] containing the sums of both
104 /// operands.
105 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
106 _mm_add_ps(__m128 __a, __m128 __b) {
107 return (__m128)((__v4sf)__a + (__v4sf)__b);
110 /// Subtracts the 32-bit float value in the low-order bits of the second
111 /// operand from the corresponding value in the first operand.
113 /// \headerfile <x86intrin.h>
115 /// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
117 /// \param __a
118 /// A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
119 /// of this operand are used in the calculation.
120 /// \param __b
121 /// A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
122 /// bits of this operand are used in the calculation.
123 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
124 /// difference of the lower 32 bits of both operands. The upper 96 bits are
125 /// copied from the upper 96 bits of the first source operand.
126 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
127 _mm_sub_ss(__m128 __a, __m128 __b) {
128 __a[0] -= __b[0];
129 return __a;
132 /// Subtracts each of the values of the second operand from the first
133 /// operand, both of which are 128-bit vectors of [4 x float] and returns
134 /// the results of the subtraction.
136 /// \headerfile <x86intrin.h>
138 /// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
140 /// \param __a
141 /// A 128-bit vector of [4 x float] containing the minuend.
142 /// \param __b
143 /// A 128-bit vector of [4 x float] containing the subtrahend.
144 /// \returns A 128-bit vector of [4 x float] containing the differences between
145 /// both operands.
146 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
147 _mm_sub_ps(__m128 __a, __m128 __b) {
148 return (__m128)((__v4sf)__a - (__v4sf)__b);
151 /// Multiplies two 32-bit float values in the low-order bits of the
152 /// operands.
154 /// \headerfile <x86intrin.h>
156 /// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
158 /// \param __a
159 /// A 128-bit vector of [4 x float] containing one of the source operands.
160 /// The lower 32 bits of this operand are used in the calculation.
161 /// \param __b
162 /// A 128-bit vector of [4 x float] containing one of the source operands.
163 /// The lower 32 bits of this operand are used in the calculation.
164 /// \returns A 128-bit vector of [4 x float] containing the product of the lower
165 /// 32 bits of both operands. The upper 96 bits are copied from the upper 96
166 /// bits of the first source operand.
167 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
168 _mm_mul_ss(__m128 __a, __m128 __b) {
169 __a[0] *= __b[0];
170 return __a;
173 /// Multiplies two 128-bit vectors of [4 x float] and returns the
174 /// results of the multiplication.
176 /// \headerfile <x86intrin.h>
178 /// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
180 /// \param __a
181 /// A 128-bit vector of [4 x float] containing one of the source operands.
182 /// \param __b
183 /// A 128-bit vector of [4 x float] containing one of the source operands.
184 /// \returns A 128-bit vector of [4 x float] containing the products of both
185 /// operands.
186 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
187 _mm_mul_ps(__m128 __a, __m128 __b) {
188 return (__m128)((__v4sf)__a * (__v4sf)__b);
191 /// Divides the value in the low-order 32 bits of the first operand by
192 /// the corresponding value in the second operand.
194 /// \headerfile <x86intrin.h>
196 /// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
198 /// \param __a
199 /// A 128-bit vector of [4 x float] containing the dividend. The lower 32
200 /// bits of this operand are used in the calculation.
201 /// \param __b
202 /// A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
203 /// of this operand are used in the calculation.
204 /// \returns A 128-bit vector of [4 x float] containing the quotients of the
205 /// lower 32 bits of both operands. The upper 96 bits are copied from the
206 /// upper 96 bits of the first source operand.
207 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
208 _mm_div_ss(__m128 __a, __m128 __b) {
209 __a[0] /= __b[0];
210 return __a;
213 /// Divides two 128-bit vectors of [4 x float].
215 /// \headerfile <x86intrin.h>
217 /// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
219 /// \param __a
220 /// A 128-bit vector of [4 x float] containing the dividend.
221 /// \param __b
222 /// A 128-bit vector of [4 x float] containing the divisor.
223 /// \returns A 128-bit vector of [4 x float] containing the quotients of both
224 /// operands.
225 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
226 _mm_div_ps(__m128 __a, __m128 __b) {
227 return (__m128)((__v4sf)__a / (__v4sf)__b);
230 /// Calculates the square root of the value stored in the low-order bits
231 /// of a 128-bit vector of [4 x float].
233 /// \headerfile <x86intrin.h>
235 /// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
237 /// \param __a
238 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
239 /// used in the calculation.
240 /// \returns A 128-bit vector of [4 x float] containing the square root of the
241 /// value in the low-order bits of the operand.
242 static __inline__ __m128 __DEFAULT_FN_ATTRS
243 _mm_sqrt_ss(__m128 __a)
245 return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
248 /// Calculates the square roots of the values stored in a 128-bit vector
249 /// of [4 x float].
251 /// \headerfile <x86intrin.h>
253 /// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
255 /// \param __a
256 /// A 128-bit vector of [4 x float].
257 /// \returns A 128-bit vector of [4 x float] containing the square roots of the
258 /// values in the operand.
259 static __inline__ __m128 __DEFAULT_FN_ATTRS
260 _mm_sqrt_ps(__m128 __a)
262 return __builtin_ia32_sqrtps((__v4sf)__a);
265 /// Calculates the approximate reciprocal of the value stored in the
266 /// low-order bits of a 128-bit vector of [4 x float].
268 /// \headerfile <x86intrin.h>
270 /// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
272 /// \param __a
273 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
274 /// used in the calculation.
275 /// \returns A 128-bit vector of [4 x float] containing the approximate
276 /// reciprocal of the value in the low-order bits of the operand.
277 static __inline__ __m128 __DEFAULT_FN_ATTRS
278 _mm_rcp_ss(__m128 __a)
280 return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
283 /// Calculates the approximate reciprocals of the values stored in a
284 /// 128-bit vector of [4 x float].
286 /// \headerfile <x86intrin.h>
288 /// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
290 /// \param __a
291 /// A 128-bit vector of [4 x float].
292 /// \returns A 128-bit vector of [4 x float] containing the approximate
293 /// reciprocals of the values in the operand.
294 static __inline__ __m128 __DEFAULT_FN_ATTRS
295 _mm_rcp_ps(__m128 __a)
297 return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
300 /// Calculates the approximate reciprocal of the square root of the value
301 /// stored in the low-order bits of a 128-bit vector of [4 x float].
303 /// \headerfile <x86intrin.h>
305 /// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
307 /// \param __a
308 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
309 /// used in the calculation.
310 /// \returns A 128-bit vector of [4 x float] containing the approximate
311 /// reciprocal of the square root of the value in the low-order bits of the
312 /// operand.
313 static __inline__ __m128 __DEFAULT_FN_ATTRS
314 _mm_rsqrt_ss(__m128 __a)
316 return __builtin_ia32_rsqrtss((__v4sf)__a);
319 /// Calculates the approximate reciprocals of the square roots of the
320 /// values stored in a 128-bit vector of [4 x float].
322 /// \headerfile <x86intrin.h>
324 /// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
326 /// \param __a
327 /// A 128-bit vector of [4 x float].
328 /// \returns A 128-bit vector of [4 x float] containing the approximate
329 /// reciprocals of the square roots of the values in the operand.
330 static __inline__ __m128 __DEFAULT_FN_ATTRS
331 _mm_rsqrt_ps(__m128 __a)
333 return __builtin_ia32_rsqrtps((__v4sf)__a);
336 /// Compares two 32-bit float values in the low-order bits of both
337 /// operands and returns the lesser value in the low-order bits of the
338 /// vector of [4 x float].
340 /// If either value in a comparison is NaN, returns the value from \a __b.
342 /// \headerfile <x86intrin.h>
344 /// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
346 /// \param __a
347 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
348 /// 32 bits of this operand are used in the comparison.
349 /// \param __b
350 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
351 /// 32 bits of this operand are used in the comparison.
352 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
353 /// minimum value between both operands. The upper 96 bits are copied from
354 /// the upper 96 bits of the first source operand.
355 static __inline__ __m128 __DEFAULT_FN_ATTRS
356 _mm_min_ss(__m128 __a, __m128 __b)
358 return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
361 /// Compares two 128-bit vectors of [4 x float] and returns the lesser
362 /// of each pair of values.
364 /// If either value in a comparison is NaN, returns the value from \a __b.
366 /// \headerfile <x86intrin.h>
368 /// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
370 /// \param __a
371 /// A 128-bit vector of [4 x float] containing one of the operands.
372 /// \param __b
373 /// A 128-bit vector of [4 x float] containing one of the operands.
374 /// \returns A 128-bit vector of [4 x float] containing the minimum values
375 /// between both operands.
376 static __inline__ __m128 __DEFAULT_FN_ATTRS
377 _mm_min_ps(__m128 __a, __m128 __b)
379 return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
382 /// Compares two 32-bit float values in the low-order bits of both
383 /// operands and returns the greater value in the low-order bits of a 128-bit
384 /// vector of [4 x float].
386 /// If either value in a comparison is NaN, returns the value from \a __b.
388 /// \headerfile <x86intrin.h>
390 /// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
392 /// \param __a
393 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
394 /// 32 bits of this operand are used in the comparison.
395 /// \param __b
396 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
397 /// 32 bits of this operand are used in the comparison.
398 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
399 /// maximum value between both operands. The upper 96 bits are copied from
400 /// the upper 96 bits of the first source operand.
401 static __inline__ __m128 __DEFAULT_FN_ATTRS
402 _mm_max_ss(__m128 __a, __m128 __b)
404 return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
407 /// Compares two 128-bit vectors of [4 x float] and returns the greater
408 /// of each pair of values.
410 /// If either value in a comparison is NaN, returns the value from \a __b.
412 /// \headerfile <x86intrin.h>
414 /// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
416 /// \param __a
417 /// A 128-bit vector of [4 x float] containing one of the operands.
418 /// \param __b
419 /// A 128-bit vector of [4 x float] containing one of the operands.
420 /// \returns A 128-bit vector of [4 x float] containing the maximum values
421 /// between both operands.
422 static __inline__ __m128 __DEFAULT_FN_ATTRS
423 _mm_max_ps(__m128 __a, __m128 __b)
425 return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
428 /// Performs a bitwise AND of two 128-bit vectors of [4 x float].
430 /// \headerfile <x86intrin.h>
432 /// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
434 /// \param __a
435 /// A 128-bit vector containing one of the source operands.
436 /// \param __b
437 /// A 128-bit vector containing one of the source operands.
438 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
439 /// values between both operands.
440 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
441 _mm_and_ps(__m128 __a, __m128 __b) {
442 return (__m128)((__v4su)__a & (__v4su)__b);
445 /// Performs a bitwise AND of two 128-bit vectors of [4 x float], using
446 /// the one's complement of the values contained in the first source
447 /// operand.
449 /// \headerfile <x86intrin.h>
451 /// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
453 /// \param __a
454 /// A 128-bit vector of [4 x float] containing the first source operand. The
455 /// one's complement of this value is used in the bitwise AND.
456 /// \param __b
457 /// A 128-bit vector of [4 x float] containing the second source operand.
458 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
459 /// one's complement of the first operand and the values in the second
460 /// operand.
461 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
462 _mm_andnot_ps(__m128 __a, __m128 __b) {
463 return (__m128)(~(__v4su)__a & (__v4su)__b);
466 /// Performs a bitwise OR of two 128-bit vectors of [4 x float].
468 /// \headerfile <x86intrin.h>
470 /// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
472 /// \param __a
473 /// A 128-bit vector of [4 x float] containing one of the source operands.
474 /// \param __b
475 /// A 128-bit vector of [4 x float] containing one of the source operands.
476 /// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
477 /// values between both operands.
478 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
479 _mm_or_ps(__m128 __a, __m128 __b) {
480 return (__m128)((__v4su)__a | (__v4su)__b);
483 /// Performs a bitwise exclusive OR of two 128-bit vectors of
484 /// [4 x float].
486 /// \headerfile <x86intrin.h>
488 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
490 /// \param __a
491 /// A 128-bit vector of [4 x float] containing one of the source operands.
492 /// \param __b
493 /// A 128-bit vector of [4 x float] containing one of the source operands.
494 /// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
495 /// of the values between both operands.
496 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
497 _mm_xor_ps(__m128 __a, __m128 __b) {
498 return (__m128)((__v4su)__a ^ (__v4su)__b);
501 /// Compares two 32-bit float values in the low-order bits of both
502 /// operands for equality.
504 /// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
505 /// low-order bits of a vector [4 x float].
506 /// If either value in a comparison is NaN, returns false.
508 /// \headerfile <x86intrin.h>
510 /// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
512 /// \param __a
513 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
514 /// 32 bits of this operand are used in the comparison.
515 /// \param __b
516 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
517 /// 32 bits of this operand are used in the comparison.
518 /// \returns A 128-bit vector of [4 x float] containing the comparison results
519 /// in the low-order bits.
520 static __inline__ __m128 __DEFAULT_FN_ATTRS
521 _mm_cmpeq_ss(__m128 __a, __m128 __b)
523 return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
526 /// Compares each of the corresponding 32-bit float values of the
527 /// 128-bit vectors of [4 x float] for equality.
529 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
530 /// If either value in a comparison is NaN, returns false.
532 /// \headerfile <x86intrin.h>
534 /// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
536 /// \param __a
537 /// A 128-bit vector of [4 x float].
538 /// \param __b
539 /// A 128-bit vector of [4 x float].
540 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
541 static __inline__ __m128 __DEFAULT_FN_ATTRS
542 _mm_cmpeq_ps(__m128 __a, __m128 __b)
544 return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
547 /// Compares two 32-bit float values in the low-order bits of both
548 /// operands to determine if the value in the first operand is less than the
549 /// corresponding value in the second operand.
551 /// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
552 /// low-order bits of a vector of [4 x float].
553 /// If either value in a comparison is NaN, returns false.
555 /// \headerfile <x86intrin.h>
557 /// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
559 /// \param __a
560 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
561 /// 32 bits of this operand are used in the comparison.
562 /// \param __b
563 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
564 /// 32 bits of this operand are used in the comparison.
565 /// \returns A 128-bit vector of [4 x float] containing the comparison results
566 /// in the low-order bits.
567 static __inline__ __m128 __DEFAULT_FN_ATTRS
568 _mm_cmplt_ss(__m128 __a, __m128 __b)
570 return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
573 /// Compares each of the corresponding 32-bit float values of the
574 /// 128-bit vectors of [4 x float] to determine if the values in the first
575 /// operand are less than those in the second operand.
577 /// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
578 /// If either value in a comparison is NaN, returns false.
580 /// \headerfile <x86intrin.h>
582 /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
584 /// \param __a
585 /// A 128-bit vector of [4 x float].
586 /// \param __b
587 /// A 128-bit vector of [4 x float].
588 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
589 static __inline__ __m128 __DEFAULT_FN_ATTRS
590 _mm_cmplt_ps(__m128 __a, __m128 __b)
592 return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
595 /// Compares two 32-bit float values in the low-order bits of both
596 /// operands to determine if the value in the first operand is less than or
597 /// equal to the corresponding value in the second operand.
599 /// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true, in
600 /// the low-order bits of a vector of [4 x float].
601 /// If either value in a comparison is NaN, returns false.
603 /// \headerfile <x86intrin.h>
605 /// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
607 /// \param __a
608 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
609 /// 32 bits of this operand are used in the comparison.
610 /// \param __b
611 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
612 /// 32 bits of this operand are used in the comparison.
613 /// \returns A 128-bit vector of [4 x float] containing the comparison results
614 /// in the low-order bits.
615 static __inline__ __m128 __DEFAULT_FN_ATTRS
616 _mm_cmple_ss(__m128 __a, __m128 __b)
618 return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
621 /// Compares each of the corresponding 32-bit float values of the
622 /// 128-bit vectors of [4 x float] to determine if the values in the first
623 /// operand are less than or equal to those in the second operand.
625 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
626 /// If either value in a comparison is NaN, returns false.
628 /// \headerfile <x86intrin.h>
630 /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
632 /// \param __a
633 /// A 128-bit vector of [4 x float].
634 /// \param __b
635 /// A 128-bit vector of [4 x float].
636 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
637 static __inline__ __m128 __DEFAULT_FN_ATTRS
638 _mm_cmple_ps(__m128 __a, __m128 __b)
640 return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
643 /// Compares two 32-bit float values in the low-order bits of both
644 /// operands to determine if the value in the first operand is greater than
645 /// the corresponding value in the second operand.
647 /// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
648 /// low-order bits of a vector of [4 x float].
649 /// If either value in a comparison is NaN, returns false.
651 /// \headerfile <x86intrin.h>
653 /// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
655 /// \param __a
656 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
657 /// 32 bits of this operand are used in the comparison.
658 /// \param __b
659 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
660 /// 32 bits of this operand are used in the comparison.
661 /// \returns A 128-bit vector of [4 x float] containing the comparison results
662 /// in the low-order bits.
663 static __inline__ __m128 __DEFAULT_FN_ATTRS
664 _mm_cmpgt_ss(__m128 __a, __m128 __b)
666 return (__m128)__builtin_shufflevector((__v4sf)__a,
667 (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
668 4, 1, 2, 3);
671 /// Compares each of the corresponding 32-bit float values of the
672 /// 128-bit vectors of [4 x float] to determine if the values in the first
673 /// operand are greater than those in the second operand.
675 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
676 /// If either value in a comparison is NaN, returns false.
678 /// \headerfile <x86intrin.h>
680 /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
682 /// \param __a
683 /// A 128-bit vector of [4 x float].
684 /// \param __b
685 /// A 128-bit vector of [4 x float].
686 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
687 static __inline__ __m128 __DEFAULT_FN_ATTRS
688 _mm_cmpgt_ps(__m128 __a, __m128 __b)
690 return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
693 /// Compares two 32-bit float values in the low-order bits of both
694 /// operands to determine if the value in the first operand is greater than
695 /// or equal to the corresponding value in the second operand.
697 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
698 /// low-order bits of a vector of [4 x float].
699 /// If either value in a comparison is NaN, returns false.
701 /// \headerfile <x86intrin.h>
703 /// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
705 /// \param __a
706 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
707 /// 32 bits of this operand are used in the comparison.
708 /// \param __b
709 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
710 /// 32 bits of this operand are used in the comparison.
711 /// \returns A 128-bit vector of [4 x float] containing the comparison results
712 /// in the low-order bits.
713 static __inline__ __m128 __DEFAULT_FN_ATTRS
714 _mm_cmpge_ss(__m128 __a, __m128 __b)
716 return (__m128)__builtin_shufflevector((__v4sf)__a,
717 (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
718 4, 1, 2, 3);
721 /// Compares each of the corresponding 32-bit float values of the
722 /// 128-bit vectors of [4 x float] to determine if the values in the first
723 /// operand are greater than or equal to those in the second operand.
725 /// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
726 /// If either value in a comparison is NaN, returns false.
728 /// \headerfile <x86intrin.h>
730 /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
732 /// \param __a
733 /// A 128-bit vector of [4 x float].
734 /// \param __b
735 /// A 128-bit vector of [4 x float].
736 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
737 static __inline__ __m128 __DEFAULT_FN_ATTRS
738 _mm_cmpge_ps(__m128 __a, __m128 __b)
740 return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
743 /// Compares two 32-bit float values in the low-order bits of both operands
744 /// for inequality.
746 /// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
747 /// low-order bits of a vector of [4 x float].
748 /// If either value in a comparison is NaN, returns true.
750 /// \headerfile <x86intrin.h>
752 /// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
753 /// instructions.
755 /// \param __a
756 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
757 /// 32 bits of this operand are used in the comparison.
758 /// \param __b
759 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
760 /// 32 bits of this operand are used in the comparison.
761 /// \returns A 128-bit vector of [4 x float] containing the comparison results
762 /// in the low-order bits.
763 static __inline__ __m128 __DEFAULT_FN_ATTRS
764 _mm_cmpneq_ss(__m128 __a, __m128 __b)
766 return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
769 /// Compares each of the corresponding 32-bit float values of the
770 /// 128-bit vectors of [4 x float] for inequality.
772 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
773 /// If either value in a comparison is NaN, returns true.
775 /// \headerfile <x86intrin.h>
777 /// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
778 /// instructions.
780 /// \param __a
781 /// A 128-bit vector of [4 x float].
782 /// \param __b
783 /// A 128-bit vector of [4 x float].
784 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
785 static __inline__ __m128 __DEFAULT_FN_ATTRS
786 _mm_cmpneq_ps(__m128 __a, __m128 __b)
788 return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
791 /// Compares two 32-bit float values in the low-order bits of both
792 /// operands to determine if the value in the first operand is not less than
793 /// the corresponding value in the second operand.
795 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
796 /// low-order bits of a vector of [4 x float].
797 /// If either value in a comparison is NaN, returns true.
799 /// \headerfile <x86intrin.h>
801 /// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
802 /// instructions.
804 /// \param __a
805 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
806 /// 32 bits of this operand are used in the comparison.
807 /// \param __b
808 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
809 /// 32 bits of this operand are used in the comparison.
810 /// \returns A 128-bit vector of [4 x float] containing the comparison results
811 /// in the low-order bits.
812 static __inline__ __m128 __DEFAULT_FN_ATTRS
813 _mm_cmpnlt_ss(__m128 __a, __m128 __b)
815 return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
818 /// Compares each of the corresponding 32-bit float values of the
819 /// 128-bit vectors of [4 x float] to determine if the values in the first
820 /// operand are not less than those in the second operand.
822 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
823 /// If either value in a comparison is NaN, returns true.
825 /// \headerfile <x86intrin.h>
827 /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
828 /// instructions.
830 /// \param __a
831 /// A 128-bit vector of [4 x float].
832 /// \param __b
833 /// A 128-bit vector of [4 x float].
834 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
835 static __inline__ __m128 __DEFAULT_FN_ATTRS
836 _mm_cmpnlt_ps(__m128 __a, __m128 __b)
838 return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
841 /// Compares two 32-bit float values in the low-order bits of both
842 /// operands to determine if the value in the first operand is not less than
843 /// or equal to the corresponding value in the second operand.
845 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
846 /// low-order bits of a vector of [4 x float].
847 /// If either value in a comparison is NaN, returns true.
849 /// \headerfile <x86intrin.h>
851 /// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
852 /// instructions.
854 /// \param __a
855 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
856 /// 32 bits of this operand are used in the comparison.
857 /// \param __b
858 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
859 /// 32 bits of this operand are used in the comparison.
860 /// \returns A 128-bit vector of [4 x float] containing the comparison results
861 /// in the low-order bits.
862 static __inline__ __m128 __DEFAULT_FN_ATTRS
863 _mm_cmpnle_ss(__m128 __a, __m128 __b)
865 return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
868 /// Compares each of the corresponding 32-bit float values of the
869 /// 128-bit vectors of [4 x float] to determine if the values in the first
870 /// operand are not less than or equal to those in the second operand.
872 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
873 /// If either value in a comparison is NaN, returns true.
875 /// \headerfile <x86intrin.h>
877 /// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
878 /// instructions.
880 /// \param __a
881 /// A 128-bit vector of [4 x float].
882 /// \param __b
883 /// A 128-bit vector of [4 x float].
884 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
885 static __inline__ __m128 __DEFAULT_FN_ATTRS
886 _mm_cmpnle_ps(__m128 __a, __m128 __b)
888 return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
891 /// Compares two 32-bit float values in the low-order bits of both
892 /// operands to determine if the value in the first operand is not greater
893 /// than the corresponding value in the second operand.
895 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
896 /// low-order bits of a vector of [4 x float].
897 /// If either value in a comparison is NaN, returns true.
899 /// \headerfile <x86intrin.h>
901 /// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
902 /// instructions.
904 /// \param __a
905 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
906 /// 32 bits of this operand are used in the comparison.
907 /// \param __b
908 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
909 /// 32 bits of this operand are used in the comparison.
910 /// \returns A 128-bit vector of [4 x float] containing the comparison results
911 /// in the low-order bits.
912 static __inline__ __m128 __DEFAULT_FN_ATTRS
913 _mm_cmpngt_ss(__m128 __a, __m128 __b)
915 return (__m128)__builtin_shufflevector((__v4sf)__a,
916 (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
917 4, 1, 2, 3);
920 /// Compares each of the corresponding 32-bit float values of the
921 /// 128-bit vectors of [4 x float] to determine if the values in the first
922 /// operand are not greater than those in the second operand.
924 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
925 /// If either value in a comparison is NaN, returns true.
927 /// \headerfile <x86intrin.h>
929 /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
930 /// instructions.
932 /// \param __a
933 /// A 128-bit vector of [4 x float].
934 /// \param __b
935 /// A 128-bit vector of [4 x float].
936 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
937 static __inline__ __m128 __DEFAULT_FN_ATTRS
938 _mm_cmpngt_ps(__m128 __a, __m128 __b)
940 return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
943 /// Compares two 32-bit float values in the low-order bits of both
944 /// operands to determine if the value in the first operand is not greater
945 /// than or equal to the corresponding value in the second operand.
947 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
948 /// low-order bits of a vector of [4 x float].
949 /// If either value in a comparison is NaN, returns true.
951 /// \headerfile <x86intrin.h>
953 /// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
954 /// instructions.
956 /// \param __a
957 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
958 /// 32 bits of this operand are used in the comparison.
959 /// \param __b
960 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
961 /// 32 bits of this operand are used in the comparison.
962 /// \returns A 128-bit vector of [4 x float] containing the comparison results
963 /// in the low-order bits.
964 static __inline__ __m128 __DEFAULT_FN_ATTRS
965 _mm_cmpnge_ss(__m128 __a, __m128 __b)
967 return (__m128)__builtin_shufflevector((__v4sf)__a,
968 (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
969 4, 1, 2, 3);
972 /// Compares each of the corresponding 32-bit float values of the
973 /// 128-bit vectors of [4 x float] to determine if the values in the first
974 /// operand are not greater than or equal to those in the second operand.
976 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
977 /// If either value in a comparison is NaN, returns true.
979 /// \headerfile <x86intrin.h>
981 /// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
982 /// instructions.
984 /// \param __a
985 /// A 128-bit vector of [4 x float].
986 /// \param __b
987 /// A 128-bit vector of [4 x float].
988 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
989 static __inline__ __m128 __DEFAULT_FN_ATTRS
990 _mm_cmpnge_ps(__m128 __a, __m128 __b)
992 return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
995 /// Compares two 32-bit float values in the low-order bits of both
996 /// operands to determine if the value in the first operand is ordered with
997 /// respect to the corresponding value in the second operand.
999 /// A pair of floating-point values are ordered with respect to each
1000 /// other if neither value is a NaN. Each comparison returns 0x0 for false,
1001 /// 0xFFFFFFFF for true.
1003 /// \headerfile <x86intrin.h>
1005 /// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
1006 /// instructions.
1008 /// \param __a
1009 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
1010 /// 32 bits of this operand are used in the comparison.
1011 /// \param __b
1012 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
1013 /// 32 bits of this operand are used in the comparison.
1014 /// \returns A 128-bit vector of [4 x float] containing the comparison results
1015 /// in the low-order bits.
1016 static __inline__ __m128 __DEFAULT_FN_ATTRS
1017 _mm_cmpord_ss(__m128 __a, __m128 __b)
1019 return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
1022 /// Compares each of the corresponding 32-bit float values of the
1023 /// 128-bit vectors of [4 x float] to determine if the values in the first
1024 /// operand are ordered with respect to those in the second operand.
1026 /// A pair of floating-point values are ordered with respect to each
1027 /// other if neither value is a NaN. Each comparison returns 0x0 for false,
1028 /// 0xFFFFFFFF for true.
1030 /// \headerfile <x86intrin.h>
1032 /// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
1033 /// instructions.
1035 /// \param __a
1036 /// A 128-bit vector of [4 x float].
1037 /// \param __b
1038 /// A 128-bit vector of [4 x float].
1039 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
1040 static __inline__ __m128 __DEFAULT_FN_ATTRS
1041 _mm_cmpord_ps(__m128 __a, __m128 __b)
1043 return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
1046 /// Compares two 32-bit float values in the low-order bits of both
1047 /// operands to determine if the value in the first operand is unordered
1048 /// with respect to the corresponding value in the second operand.
1050 /// A pair of double-precision values are unordered with respect to each
1051 /// other if one or both values are NaN. Each comparison returns 0x0 for
1052 /// false, 0xFFFFFFFF for true.
1054 /// \headerfile <x86intrin.h>
1056 /// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
1057 /// instructions.
1059 /// \param __a
1060 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
1061 /// 32 bits of this operand are used in the comparison.
1062 /// \param __b
1063 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
1064 /// 32 bits of this operand are used in the comparison.
1065 /// \returns A 128-bit vector of [4 x float] containing the comparison results
1066 /// in the low-order bits.
1067 static __inline__ __m128 __DEFAULT_FN_ATTRS
1068 _mm_cmpunord_ss(__m128 __a, __m128 __b)
1070 return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
1073 /// Compares each of the corresponding 32-bit float values of the
1074 /// 128-bit vectors of [4 x float] to determine if the values in the first
1075 /// operand are unordered with respect to those in the second operand.
1077 /// A pair of double-precision values are unordered with respect to each
1078 /// other if one or both values are NaN. Each comparison returns 0x0 for
1079 /// false, 0xFFFFFFFFFFFFFFFF for true.
1081 /// \headerfile <x86intrin.h>
1083 /// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
1084 /// instructions.
1086 /// \param __a
1087 /// A 128-bit vector of [4 x float].
1088 /// \param __b
1089 /// A 128-bit vector of [4 x float].
1090 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
1091 static __inline__ __m128 __DEFAULT_FN_ATTRS
1092 _mm_cmpunord_ps(__m128 __a, __m128 __b)
1094 return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
1097 /// Compares two 32-bit float values in the low-order bits of both
1098 /// operands for equality.
1100 /// The comparison returns 0 for false, 1 for true. If either value in a
1101 /// comparison is NaN, returns 0.
1103 /// \headerfile <x86intrin.h>
1105 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1106 /// instructions.
1108 /// \param __a
1109 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1110 /// used in the comparison.
1111 /// \param __b
1112 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1113 /// used in the comparison.
1114 /// \returns An integer containing the comparison results.
1115 static __inline__ int __DEFAULT_FN_ATTRS
1116 _mm_comieq_ss(__m128 __a, __m128 __b)
1118 return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
1121 /// Compares two 32-bit float values in the low-order bits of both
1122 /// operands to determine if the first operand is less than the second
1123 /// operand.
1125 /// The comparison returns 0 for false, 1 for true. If either value in a
1126 /// comparison is NaN, returns 0.
1128 /// \headerfile <x86intrin.h>
1130 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1131 /// instructions.
1133 /// \param __a
1134 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1135 /// used in the comparison.
1136 /// \param __b
1137 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1138 /// used in the comparison.
1139 /// \returns An integer containing the comparison results.
1140 static __inline__ int __DEFAULT_FN_ATTRS
1141 _mm_comilt_ss(__m128 __a, __m128 __b)
1143 return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
1146 /// Compares two 32-bit float values in the low-order bits of both
1147 /// operands to determine if the first operand is less than or equal to the
1148 /// second operand.
1150 /// The comparison returns 0 for false, 1 for true. If either value in a
1151 /// comparison is NaN, returns 0.
1153 /// \headerfile <x86intrin.h>
1155 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1157 /// \param __a
1158 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1159 /// used in the comparison.
1160 /// \param __b
1161 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1162 /// used in the comparison.
1163 /// \returns An integer containing the comparison results.
1164 static __inline__ int __DEFAULT_FN_ATTRS
1165 _mm_comile_ss(__m128 __a, __m128 __b)
1167 return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
1170 /// Compares two 32-bit float values in the low-order bits of both
1171 /// operands to determine if the first operand is greater than the second
1172 /// operand.
1174 /// The comparison returns 0 for false, 1 for true. If either value in a
1175 /// comparison is NaN, returns 0.
1177 /// \headerfile <x86intrin.h>
1179 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1181 /// \param __a
1182 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1183 /// used in the comparison.
1184 /// \param __b
1185 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1186 /// used in the comparison.
1187 /// \returns An integer containing the comparison results.
1188 static __inline__ int __DEFAULT_FN_ATTRS
1189 _mm_comigt_ss(__m128 __a, __m128 __b)
1191 return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
1194 /// Compares two 32-bit float values in the low-order bits of both
1195 /// operands to determine if the first operand is greater than or equal to
1196 /// the second operand.
1198 /// The comparison returns 0 for false, 1 for true. If either value in a
1199 /// comparison is NaN, returns 0.
1201 /// \headerfile <x86intrin.h>
1203 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1205 /// \param __a
1206 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1207 /// used in the comparison.
1208 /// \param __b
1209 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1210 /// used in the comparison.
1211 /// \returns An integer containing the comparison results.
1212 static __inline__ int __DEFAULT_FN_ATTRS
1213 _mm_comige_ss(__m128 __a, __m128 __b)
1215 return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
1218 /// Compares two 32-bit float values in the low-order bits of both
1219 /// operands to determine if the first operand is not equal to the second
1220 /// operand.
1222 /// The comparison returns 0 for false, 1 for true. If either value in a
1223 /// comparison is NaN, returns 1.
1225 /// \headerfile <x86intrin.h>
1227 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1229 /// \param __a
1230 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1231 /// used in the comparison.
1232 /// \param __b
1233 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1234 /// used in the comparison.
1235 /// \returns An integer containing the comparison results.
1236 static __inline__ int __DEFAULT_FN_ATTRS
1237 _mm_comineq_ss(__m128 __a, __m128 __b)
1239 return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
1242 /// Performs an unordered comparison of two 32-bit float values using
1243 /// the low-order bits of both operands to determine equality.
1245 /// The comparison returns 0 for false, 1 for true. If either value in a
1246 /// comparison is NaN, returns 0.
1248 /// \headerfile <x86intrin.h>
1250 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1252 /// \param __a
1253 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1254 /// used in the comparison.
1255 /// \param __b
1256 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1257 /// used in the comparison.
1258 /// \returns An integer containing the comparison results.
1259 static __inline__ int __DEFAULT_FN_ATTRS
1260 _mm_ucomieq_ss(__m128 __a, __m128 __b)
1262 return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
1265 /// Performs an unordered comparison of two 32-bit float values using
1266 /// the low-order bits of both operands to determine if the first operand is
1267 /// less than the second operand.
1269 /// The comparison returns 0 for false, 1 for true. If either value in a
1270 /// comparison is NaN, returns 0.
1272 /// \headerfile <x86intrin.h>
1274 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1276 /// \param __a
1277 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1278 /// used in the comparison.
1279 /// \param __b
1280 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1281 /// used in the comparison.
1282 /// \returns An integer containing the comparison results.
1283 static __inline__ int __DEFAULT_FN_ATTRS
1284 _mm_ucomilt_ss(__m128 __a, __m128 __b)
1286 return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
1289 /// Performs an unordered comparison of two 32-bit float values using
1290 /// the low-order bits of both operands to determine if the first operand is
1291 /// less than or equal to the second operand.
1293 /// The comparison returns 0 for false, 1 for true. If either value in a
1294 /// comparison is NaN, returns 0.
1296 /// \headerfile <x86intrin.h>
1298 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1300 /// \param __a
1301 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1302 /// used in the comparison.
1303 /// \param __b
1304 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1305 /// used in the comparison.
1306 /// \returns An integer containing the comparison results.
1307 static __inline__ int __DEFAULT_FN_ATTRS
1308 _mm_ucomile_ss(__m128 __a, __m128 __b)
1310 return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
1313 /// Performs an unordered comparison of two 32-bit float values using
1314 /// the low-order bits of both operands to determine if the first operand is
1315 /// greater than the second operand.
1317 /// The comparison returns 0 for false, 1 for true. If either value in a
1318 /// comparison is NaN, returns 0.
1320 /// \headerfile <x86intrin.h>
1322 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1324 /// \param __a
1325 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1326 /// used in the comparison.
1327 /// \param __b
1328 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1329 /// used in the comparison.
1330 /// \returns An integer containing the comparison results.
1331 static __inline__ int __DEFAULT_FN_ATTRS
1332 _mm_ucomigt_ss(__m128 __a, __m128 __b)
1334 return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
1337 /// Performs an unordered comparison of two 32-bit float values using
1338 /// the low-order bits of both operands to determine if the first operand is
1339 /// greater than or equal to the second operand.
1341 /// The comparison returns 0 for false, 1 for true. If either value in a
1342 /// comparison is NaN, returns 0.
1344 /// \headerfile <x86intrin.h>
1346 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1348 /// \param __a
1349 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1350 /// used in the comparison.
1351 /// \param __b
1352 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1353 /// used in the comparison.
1354 /// \returns An integer containing the comparison results.
1355 static __inline__ int __DEFAULT_FN_ATTRS
1356 _mm_ucomige_ss(__m128 __a, __m128 __b)
1358 return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
1361 /// Performs an unordered comparison of two 32-bit float values using
1362 /// the low-order bits of both operands to determine inequality.
1364 /// The comparison returns 0 for false, 1 for true. If either value in a
1365 /// comparison is NaN, returns 0.
1367 /// \headerfile <x86intrin.h>
1369 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1371 /// \param __a
1372 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1373 /// used in the comparison.
1374 /// \param __b
1375 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1376 /// used in the comparison.
1377 /// \returns An integer containing the comparison results.
1378 static __inline__ int __DEFAULT_FN_ATTRS
1379 _mm_ucomineq_ss(__m128 __a, __m128 __b)
1381 return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
1384 /// Converts a float value contained in the lower 32 bits of a vector of
1385 /// [4 x float] into a 32-bit integer.
1387 /// If the converted value does not fit in a 32-bit integer, raises a
1388 /// floating-point invalid exception. If the exception is masked, returns
1389 /// the most negative integer.
1391 /// \headerfile <x86intrin.h>
1393 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1394 /// instructions.
1396 /// \param __a
1397 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1398 /// used in the conversion.
1399 /// \returns A 32-bit integer containing the converted value.
1400 static __inline__ int __DEFAULT_FN_ATTRS
1401 _mm_cvtss_si32(__m128 __a)
1403 return __builtin_ia32_cvtss2si((__v4sf)__a);
1406 /// Converts a float value contained in the lower 32 bits of a vector of
1407 /// [4 x float] into a 32-bit integer.
1409 /// If the converted value does not fit in a 32-bit integer, raises a
1410 /// floating-point invalid exception. If the exception is masked, returns
1411 /// the most negative integer.
1413 /// \headerfile <x86intrin.h>
1415 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1416 /// instructions.
1418 /// \param __a
1419 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1420 /// used in the conversion.
1421 /// \returns A 32-bit integer containing the converted value.
1422 static __inline__ int __DEFAULT_FN_ATTRS
1423 _mm_cvt_ss2si(__m128 __a)
1425 return _mm_cvtss_si32(__a);
1428 #ifdef __x86_64__
1430 /// Converts a float value contained in the lower 32 bits of a vector of
1431 /// [4 x float] into a 64-bit integer.
1433 /// If the converted value does not fit in a 32-bit integer, raises a
1434 /// floating-point invalid exception. If the exception is masked, returns
1435 /// the most negative integer.
1437 /// \headerfile <x86intrin.h>
1439 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1440 /// instructions.
1442 /// \param __a
1443 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1444 /// used in the conversion.
1445 /// \returns A 64-bit integer containing the converted value.
1446 static __inline__ long long __DEFAULT_FN_ATTRS
1447 _mm_cvtss_si64(__m128 __a)
1449 return __builtin_ia32_cvtss2si64((__v4sf)__a);
1452 #endif
1454 /// Converts two low-order float values in a 128-bit vector of
1455 /// [4 x float] into a 64-bit vector of [2 x i32].
1457 /// If a converted value does not fit in a 32-bit integer, raises a
1458 /// floating-point invalid exception. If the exception is masked, returns
1459 /// the most negative integer.
1461 /// \headerfile <x86intrin.h>
1463 /// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1465 /// \param __a
1466 /// A 128-bit vector of [4 x float].
1467 /// \returns A 64-bit integer vector containing the converted values.
1468 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
1469 _mm_cvtps_pi32(__m128 __a)
1471 return __trunc64(__builtin_ia32_cvtps2dq((__v4sf)__zeroupper64(__a)));
1474 /// Converts two low-order float values in a 128-bit vector of
1475 /// [4 x float] into a 64-bit vector of [2 x i32].
1477 /// If a converted value does not fit in a 32-bit integer, raises a
1478 /// floating-point invalid exception. If the exception is masked, returns
1479 /// the most negative integer.
1481 /// \headerfile <x86intrin.h>
1483 /// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1485 /// \param __a
1486 /// A 128-bit vector of [4 x float].
1487 /// \returns A 64-bit integer vector containing the converted values.
1488 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
1489 _mm_cvt_ps2pi(__m128 __a)
1491 return _mm_cvtps_pi32(__a);
1494 /// Converts the lower (first) element of a vector of [4 x float] into a signed
1495 /// truncated (rounded toward zero) 32-bit integer.
1497 /// If the converted value does not fit in a 32-bit integer, raises a
1498 /// floating-point invalid exception. If the exception is masked, returns
1499 /// the most negative integer.
1501 /// \headerfile <x86intrin.h>
1503 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1504 /// instructions.
1506 /// \param __a
1507 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1508 /// used in the conversion.
1509 /// \returns A 32-bit integer containing the converted value.
1510 static __inline__ int __DEFAULT_FN_ATTRS
1511 _mm_cvttss_si32(__m128 __a)
1513 return __builtin_ia32_cvttss2si((__v4sf)__a);
1516 /// Converts the lower (first) element of a vector of [4 x float] into a signed
1517 /// truncated (rounded toward zero) 32-bit integer.
1519 /// If the converted value does not fit in a 32-bit integer, raises a
1520 /// floating-point invalid exception. If the exception is masked, returns
1521 /// the most negative integer.
1523 /// \headerfile <x86intrin.h>
1525 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1526 /// instructions.
1528 /// \param __a
1529 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1530 /// used in the conversion.
1531 /// \returns A 32-bit integer containing the converted value.
1532 static __inline__ int __DEFAULT_FN_ATTRS
1533 _mm_cvtt_ss2si(__m128 __a)
1535 return _mm_cvttss_si32(__a);
1538 #ifdef __x86_64__
1539 /// Converts the lower (first) element of a vector of [4 x float] into a signed
1540 /// truncated (rounded toward zero) 64-bit integer.
1542 /// If the converted value does not fit in a 64-bit integer, raises a
1543 /// floating-point invalid exception. If the exception is masked, returns
1544 /// the most negative integer.
1546 /// \headerfile <x86intrin.h>
1548 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1549 /// instructions.
1551 /// \param __a
1552 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1553 /// used in the conversion.
1554 /// \returns A 64-bit integer containing the converted value.
1555 static __inline__ long long __DEFAULT_FN_ATTRS
1556 _mm_cvttss_si64(__m128 __a)
1558 return __builtin_ia32_cvttss2si64((__v4sf)__a);
1560 #endif
1562 /// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
1563 /// into two signed truncated (rounded toward zero) 32-bit integers,
1564 /// returned in a 64-bit vector of [2 x i32].
1566 /// If a converted value does not fit in a 32-bit integer, raises a
1567 /// floating-point invalid exception. If the exception is masked, returns
1568 /// the most negative integer.
1570 /// \headerfile <x86intrin.h>
1572 /// This intrinsic corresponds to the <c> CVTTPS2PI / VTTPS2PI </c>
1573 /// instructions.
1575 /// \param __a
1576 /// A 128-bit vector of [4 x float].
1577 /// \returns A 64-bit integer vector containing the converted values.
1578 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
1579 _mm_cvttps_pi32(__m128 __a)
1581 return __trunc64(__builtin_ia32_cvttps2dq((__v4sf)__zeroupper64(__a)));
1584 /// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
1585 /// into two signed truncated (rounded toward zero) 64-bit integers,
1586 /// returned in a 64-bit vector of [2 x i32].
1588 /// If a converted value does not fit in a 32-bit integer, raises a
1589 /// floating-point invalid exception. If the exception is masked, returns
1590 /// the most negative integer.
1592 /// \headerfile <x86intrin.h>
1594 /// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
1596 /// \param __a
1597 /// A 128-bit vector of [4 x float].
1598 /// \returns A 64-bit integer vector containing the converted values.
1599 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
1600 _mm_cvtt_ps2pi(__m128 __a)
1602 return _mm_cvttps_pi32(__a);
1605 /// Converts a 32-bit signed integer value into a floating point value
1606 /// and writes it to the lower 32 bits of the destination. The remaining
1607 /// higher order elements of the destination vector are copied from the
1608 /// corresponding elements in the first operand.
1610 /// \headerfile <x86intrin.h>
1612 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1614 /// \param __a
1615 /// A 128-bit vector of [4 x float].
1616 /// \param __b
1617 /// A 32-bit signed integer operand containing the value to be converted.
1618 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1619 /// converted value of the second operand. The upper 96 bits are copied from
1620 /// the upper 96 bits of the first operand.
1621 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtsi32_ss(__m128 __a,
1622 int __b) {
1623 __a[0] = __b;
1624 return __a;
1627 /// Converts a 32-bit signed integer value into a floating point value
1628 /// and writes it to the lower 32 bits of the destination. The remaining
1629 /// higher order elements of the destination are copied from the
1630 /// corresponding elements in the first operand.
1632 /// \headerfile <x86intrin.h>
1634 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1636 /// \param __a
1637 /// A 128-bit vector of [4 x float].
1638 /// \param __b
1639 /// A 32-bit signed integer operand containing the value to be converted.
1640 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1641 /// converted value of the second operand. The upper 96 bits are copied from
1642 /// the upper 96 bits of the first operand.
1643 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvt_si2ss(__m128 __a,
1644 int __b) {
1645 return _mm_cvtsi32_ss(__a, __b);
1648 #ifdef __x86_64__
1650 /// Converts a 64-bit signed integer value into a floating point value
1651 /// and writes it to the lower 32 bits of the destination. The remaining
1652 /// higher order elements of the destination are copied from the
1653 /// corresponding elements in the first operand.
1655 /// \headerfile <x86intrin.h>
1657 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1659 /// \param __a
1660 /// A 128-bit vector of [4 x float].
1661 /// \param __b
1662 /// A 64-bit signed integer operand containing the value to be converted.
1663 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1664 /// converted value of the second operand. The upper 96 bits are copied from
1665 /// the upper 96 bits of the first operand.
1666 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
1667 _mm_cvtsi64_ss(__m128 __a, long long __b) {
1668 __a[0] = __b;
1669 return __a;
1672 #endif
/// Converts two elements of a 64-bit vector of [2 x i32] into two
///    floating point values and writes them to the lower 64-bits of the
///    destination. The remaining higher order elements of the destination are
///    copied from the corresponding elements in the first operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \param __b
///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
///    and written to the corresponding low-order elements in the destination.
/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
///    converted value of the second operand. The upper 64 bits are copied from
///    the upper 64 bits of the first operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtpi32_ps(__m128 __a, __m64 __b)
{
  // Widen __b to 128 bits, convert its i32 lanes to float, then shuffle:
  // indices 4,5 select the two converted lanes (second operand), 2,3 keep
  // the high two lanes of __a.
  return (__m128)__builtin_shufflevector(
      (__v4sf)__a,
      __builtin_convertvector((__v4si)__zext128(__b), __v4sf),
      4, 5, 2, 3);
}
/// Converts two elements of a 64-bit vector of [2 x i32] into two
///    floating point values and writes them to the lower 64-bits of the
///    destination. The remaining higher order elements of the destination are
///    copied from the corresponding elements in the first operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \param __b
///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
///    and written to the corresponding low-order elements in the destination.
/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
///    converted value from the second operand. The upper 64 bits are copied
///    from the upper 64 bits of the first operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvt_pi2ps(__m128 __a, __m64 __b)
{
  // Legacy alias for _mm_cvtpi32_ps.
  return _mm_cvtpi32_ps(__a, __b);
}
/// Extracts a float value contained in the lower 32 bits of a vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the extraction.
/// \returns A 32-bit float containing the extracted value.
static __inline__ float __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_cvtss_f32(__m128 __a) {
  // Vector subscripting reads element 0 (the low 32 bits) directly.
  return __a[0];
}
/// Loads two packed float values from the address \a __p into the
///    high-order bits of a 128-bit vector of [4 x float]. The low-order bits
///    are copied from the low-order bits of the first operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
///    of the destination.
/// \param __p
///    A pointer to two packed float values. Bits [63:0] are written to bits
///    [127:64] of the destination.
/// \returns A 128-bit vector of [4 x float] containing the moved values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_loadh_pi(__m128 __a, const __m64 *__p)
{
  // Load through a packed, may_alias struct so the 64-bit read is safe for
  // any alignment and any actual pointee type behind __p.
  typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
  struct __mm_loadh_pi_struct {
    __mm_loadh_pi_v2f32 __u;
  } __attribute__((__packed__, __may_alias__));
  __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u;
  // Widen the pair to 128 bits, then splice it into the high half of __a.
  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
  return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
}
/// Loads two packed float values from the address \a __p into the
///    low-order bits of a 128-bit vector of [4 x float]. The high-order bits
///    are copied from the high-order bits of the first operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
///    [127:64] of the destination.
/// \param __p
///    A pointer to two packed float values. Bits [63:0] are written to bits
///    [63:0] of the destination.
/// \returns A 128-bit vector of [4 x float] containing the moved values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_loadl_pi(__m128 __a, const __m64 *__p)
{
  // Load through a packed, may_alias struct so the 64-bit read is safe for
  // any alignment and any actual pointee type behind __p.
  typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
  struct __mm_loadl_pi_struct {
    __mm_loadl_pi_v2f32 __u;
  } __attribute__((__packed__, __may_alias__));
  __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u;
  // Widen the pair to 128 bits, then splice it into the low half of __a.
  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
  return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
}
/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
///    32 bits of the vector are initialized with the single-precision
///    floating-point value loaded from a specified memory location. The upper
///    96 bits are set to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
///
/// \param __p
///    A pointer to a 32-bit memory location containing a single-precision
///    floating-point value.
/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
///    lower 32 bits contain the value loaded from the memory location. The
///    upper 96 bits are set to zero.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_load_ss(const float *__p)
{
  // Read through a packed, may_alias struct: alignment- and alias-safe.
  struct __mm_load_ss_struct {
    float __u;
  } __attribute__((__packed__, __may_alias__));
  float __u = ((const struct __mm_load_ss_struct*)__p)->__u;
  return __extension__ (__m128){ __u, 0, 0, 0 };
}
/// Loads a 32-bit float value and duplicates it to all four vector
///    elements of a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c>
///    instruction.
///
/// \param __p
///    A pointer to a float value to be loaded and duplicated.
/// \returns A 128-bit vector of [4 x float] containing the loaded and
///    duplicated values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_load1_ps(const float *__p)
{
  // Read through a packed, may_alias struct: alignment- and alias-safe.
  struct __mm_load1_ps_struct {
    float __u;
  } __attribute__((__packed__, __may_alias__));
  float __u = ((const struct __mm_load1_ps_struct*)__p)->__u;
  return __extension__ (__m128){ __u, __u, __u, __u };
}

/* Alternate (legacy) spelling of _mm_load1_ps. */
#define _mm_load_ps1(p) _mm_load1_ps(p)
/// Loads a 128-bit floating-point vector of [4 x float] from an aligned
///    memory location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
///
/// \param __p
///    A pointer to a 128-bit memory location. The address of the memory
///    location has to be 128-bit aligned.
/// \returns A 128-bit vector of [4 x float] containing the loaded values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_load_ps(const float *__p)
{
  // Direct dereference: __m128 carries 16-byte alignment, so this is the
  // aligned-load form.
  return *(const __m128*)__p;
}
/// Loads a 128-bit floating-point vector of [4 x float] from an
///    unaligned memory location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
///
/// \param __p
///    A pointer to a 128-bit memory location. The address of the memory
///    location does not have to be aligned.
/// \returns A 128-bit vector of [4 x float] containing the loaded values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_loadu_ps(const float *__p)
{
  // __m128_u is the 1-byte-aligned variant; the packed, may_alias wrapper
  // makes the unaligned load well-defined.
  struct __loadu_ps {
    __m128_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ps*)__p)->__v;
}
/// Loads four packed float values, in reverse order, from an aligned
///    memory location to 32-bit elements in a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
///    instruction.
///
/// \param __p
///    A pointer to a 128-bit memory location. The address of the memory
///    location has to be 128-bit aligned.
/// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
///    in reverse order.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_loadr_ps(const float *__p)
{
  // Aligned load followed by a full lane reversal.
  __m128 __a = _mm_load_ps(__p);
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
}
/// Create a 128-bit vector of [4 x float] with undefined values.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \returns A 128-bit vector of [4 x float] containing undefined values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_undefined_ps(void)
{
  // The undef builtin materializes an uninitialized value without reading
  // any actual memory.
  return (__m128)__builtin_ia32_undef128();
}
/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
///    32 bits of the vector are initialized with the specified single-precision
///    floating-point value. The upper 96 bits are set to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
///
/// \param __w
///    A single-precision floating-point value used to initialize the lower 32
///    bits of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
///    lower 32 bits contain the value provided in the source operand. The
///    upper 96 bits are set to zero.
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set_ss(float __w) {
  return __extension__ (__m128){ __w, 0.0f, 0.0f, 0.0f };
}
/// Constructs a 128-bit floating-point vector of [4 x float], with each
///    of the four single-precision floating-point vector elements set to the
///    specified single-precision floating-point value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
///
/// \param __w
///    A single-precision floating-point value used to initialize each vector
///    element of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set1_ps(float __w) {
  return __extension__ (__m128){ __w, __w, __w, __w };
}
/* Microsoft specific. */
/// Constructs a 128-bit floating-point vector of [4 x float], with each
///    of the four single-precision floating-point vector elements set to the
///    specified single-precision floating-point value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
///
/// \param __w
///    A single-precision floating-point value used to initialize each vector
///    element of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set_ps1(float __w) {
  // Alias for _mm_set1_ps kept for Microsoft compatibility.
  return _mm_set1_ps(__w);
}
/// Constructs a 128-bit floating-point vector of [4 x float]
///    initialized with the specified single-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic is a utility function and does not correspond to a specific
///    instruction.
///
/// \param __z
///    A single-precision floating-point value used to initialize bits [127:96]
///    of the result.
/// \param __y
///    A single-precision floating-point value used to initialize bits [95:64]
///    of the result.
/// \param __x
///    A single-precision floating-point value used to initialize bits [63:32]
///    of the result.
/// \param __w
///    A single-precision floating-point value used to initialize bits [31:0]
///    of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set_ps(float __z, float __y, float __x, float __w) {
  // Note the argument order: __w lands in element 0 (bits [31:0]).
  return __extension__ (__m128){ __w, __x, __y, __z };
}
/// Constructs a 128-bit floating-point vector of [4 x float],
///    initialized in reverse order with the specified 32-bit single-precision
///    float-point values.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic is a utility function and does not correspond to a specific
///    instruction.
///
/// \param __z
///    A single-precision floating-point value used to initialize bits [31:0]
///    of the result.
/// \param __y
///    A single-precision floating-point value used to initialize bits [63:32]
///    of the result.
/// \param __x
///    A single-precision floating-point value used to initialize bits [95:64]
///    of the result.
/// \param __w
///    A single-precision floating-point value used to initialize bits [127:96]
///    of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_setr_ps(float __z, float __y, float __x, float __w) {
  // "r" (reversed) order: first argument lands in element 0.
  return __extension__ (__m128){ __z, __y, __x, __w };
}
/// Constructs a 128-bit floating-point vector of [4 x float] initialized
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
///
/// \returns An initialized 128-bit floating-point vector of [4 x float] with
///    all elements set to zero.
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_setzero_ps(void) {
  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
}
/// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
///    memory location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
///
/// \param __p
///    A pointer to a 64-bit memory location.
/// \param __a
///    A 128-bit vector of [4 x float] containing the values to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeh_pi(__m64 *__p, __m128 __a)
{
  // Write through a packed, may_alias struct so the 64-bit store is safe for
  // any alignment and any actual pointee type behind __p.
  typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
  struct __mm_storeh_pi_struct {
    __mm_storeh_pi_v2f32 __u;
  } __attribute__((__packed__, __may_alias__));
  // Elements 2 and 3 are the high 64 bits of __a.
  ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3);
}
2054 /// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
2055 /// memory location.
2057 /// \headerfile <x86intrin.h>
2059 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
2061 /// \param __p
2062 /// A pointer to a memory location that will receive the float values.
2063 /// \param __a
2064 /// A 128-bit vector of [4 x float] containing the values to be stored.
2065 static __inline__ void __DEFAULT_FN_ATTRS
2066 _mm_storel_pi(__m64 *__p, __m128 __a)
2068 typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
2069 struct __mm_storeh_pi_struct {
2070 __mm_storeh_pi_v2f32 __u;
2071 } __attribute__((__packed__, __may_alias__));
2072 ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1);
/// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
///    memory location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
///
/// \param __p
///    A pointer to a 32-bit memory location.
/// \param __a
///    A 128-bit vector of [4 x float] containing the value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_ss(float *__p, __m128 __a)
{
  // Write through a packed, may_alias struct: alignment- and alias-safe.
  struct __mm_store_ss_struct {
    float __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
}
/// Stores a 128-bit vector of [4 x float] to an unaligned memory
///    location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
///
/// \param __p
///    A pointer to a 128-bit memory location. The address of the memory
///    location does not have to be aligned.
/// \param __a
///    A 128-bit vector of [4 x float] containing the values to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeu_ps(float *__p, __m128 __a)
{
  // __m128_u is the 1-byte-aligned variant; the packed, may_alias wrapper
  // makes the unaligned store well-defined.
  struct __storeu_ps {
    __m128_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ps*)__p)->__v = __a;
}
/// Stores a 128-bit vector of [4 x float] into an aligned memory
///    location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
///
/// \param __p
///    A pointer to a 128-bit memory location. The address of the memory
///    location has to be 16-byte aligned.
/// \param __a
///    A 128-bit vector of [4 x float] containing the values to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_ps(float *__p, __m128 __a)
{
  // Direct store: __m128 carries 16-byte alignment, so this is the
  // aligned-store form.
  *(__m128*)__p = __a;
}
/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
///    four contiguous elements in an aligned memory location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
///    instruction.
///
/// \param __p
///    A pointer to a 128-bit memory location.
/// \param __a
///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
///    of the four contiguous elements pointed by \a __p.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store1_ps(float *__p, __m128 __a)
{
  // Broadcast element 0 to all four lanes, then do one aligned store.
  __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
  _mm_store_ps(__p, __a);
}
/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
///    four contiguous elements in an aligned memory location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
///    instruction.
///
/// \param __p
///    A pointer to a 128-bit memory location.
/// \param __a
///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
///    of the four contiguous elements pointed by \a __p.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_ps1(float *__p, __m128 __a)
{
  // Legacy alias for _mm_store1_ps.
  _mm_store1_ps(__p, __a);
}
/// Stores float values from a 128-bit vector of [4 x float] to an
///    aligned memory location in reverse order.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
///    instruction.
///
/// \param __p
///    A pointer to a 128-bit memory location. The address of the memory
///    location has to be 128-bit aligned.
/// \param __a
///    A 128-bit vector of [4 x float] containing the values to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storer_ps(float *__p, __m128 __a)
{
  // Reverse the lanes, then do one aligned store.
  __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
  _mm_store_ps(__p, __a);
}
/* Prefetch hint constants for _mm_prefetch. In the encoding consumed by the
   _mm_prefetch macro below, bit 2 selects the "ET" (prefetch-with-intent-to-
   write) variants and bits [1:0] give the temporal-locality level. */
#define _MM_HINT_ET0 7
#define _MM_HINT_ET1 6
#define _MM_HINT_T0  3
#define _MM_HINT_T1  2
#define _MM_HINT_T2  1
#define _MM_HINT_NTA 0
#ifndef _MSC_VER
/* FIXME: We have to #define this because "sel" must be a constant integer, and
   Sema doesn't do any form of constant propagation yet. */

/// Loads one cache line of data from the specified address to a location
///    closer to the processor.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _mm_prefetch(const void *a, const int sel);
/// \endcode
///
/// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
///
/// \param a
///    A pointer to a memory location containing a cache line of data.
/// \param sel
///    A predefined integer constant specifying the type of prefetch
///    operation: \n
///    _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
///    PREFETCHNTA instruction will be generated. \n
///    _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
///    be generated. \n
///    _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
///    be generated. \n
///    _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
///    be generated.
/* (sel >> 2) & 1 is the read/write hint; sel & 0x3 is the locality level. */
#define _mm_prefetch(a, sel) (__builtin_prefetch((const void *)(a), \
                                                 ((sel) >> 2) & 1, (sel) & 0x3))
#endif
/// Stores a 64-bit integer in the specified aligned memory location. To
///    minimize caching, the data is flagged as non-temporal (unlikely to be
///    used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
///
/// \param __p
///    A pointer to an aligned memory location used to store the register value.
/// \param __a
///    A 64-bit integer containing the value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_pi(void *__p, __m64 __a)
{
  __builtin_nontemporal_store(__a, (__m64 *)__p);
}
/// Moves packed float values from a 128-bit vector of [4 x float] to a
///    128-bit aligned memory location. To minimize caching, the data is flagged
///    as non-temporal (unlikely to be used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
///
/// \param __p
///    A pointer to a 128-bit aligned memory location that will receive the
///    single-precision floating-point values.
/// \param __a
///    A 128-bit vector of [4 x float] containing the values to be moved.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_ps(void *__p, __m128 __a)
{
  __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
}
#if defined(__cplusplus)
extern "C" {
#endif

/// Forces strong memory ordering (serialization) between store
///    instructions preceding this instruction and store instructions following
///    this instruction, ensuring the system completes all previous stores
///    before executing subsequent stores.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> SFENCE </c> instruction.
///
/* Declared (not defined) here; recognized and lowered by the compiler. */
void _mm_sfence(void);

#if defined(__cplusplus)
} // extern "C"
#endif
/// Extracts 16-bit element from a 64-bit vector of [4 x i16] and
///    returns it, as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_pi16(__m64 a, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param n
///    An immediate integer operand that determines which bits are extracted: \n
///    0: Bits [15:0] are copied to the destination. \n
///    1: Bits [31:16] are copied to the destination. \n
///    2: Bits [47:32] are copied to the destination. \n
///    3: Bits [63:48] are copied to the destination.
/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
/* The (unsigned short) cast zero-extends the extracted element into an int. */
#define _mm_extract_pi16(a, n) \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n))
/// Copies data from the 64-bit vector of [4 x i16] to the destination,
///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
///    specified by the immediate operand \a n.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PINSRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param d
///    An integer. The lower 16-bit value from this operand is written to the
///    destination at the offset specified by operand \a n.
/// \param n
///    An immediate integer operand that determines which bits are to be used
///    in the destination. \n
///    0: Bits [15:0] are copied to the destination. \n
///    1: Bits [31:16] are copied to the destination. \n
///    2: Bits [47:32] are copied to the destination. \n
///    3: Bits [63:48] are copied to the destination. \n
///    The remaining bits in the destination are copied from the corresponding
///    bits in operand \a a.
/// \returns A 64-bit integer vector containing the copied packed data from the
///    operands.
#define _mm_insert_pi16(a, d, n) \
  ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)a, (int)d, (int)n))
/// Compares each of the corresponding packed 16-bit integer values of
///    the 64-bit integer vectors, and writes the greater value to the
///    corresponding bits in the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
///
/// \param __a
///    A 64-bit integer vector containing one of the source operands.
/// \param __b
///    A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the comparison results.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_max_pi16(__m64 __a, __m64 __b)
{
  // __v4hi is signed, so this is the signed 16-bit max (PMAXSW semantics).
  return (__m64)__builtin_elementwise_max((__v4hi)__a, (__v4hi)__b);
}
/// Compares each of the corresponding packed 8-bit unsigned integer
///    values of the 64-bit integer vectors, and writes the greater value to the
///    corresponding bits in the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
///
/// \param __a
///    A 64-bit integer vector containing one of the source operands.
/// \param __b
///    A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the comparison results.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_max_pu8(__m64 __a, __m64 __b)
{
  // __v8qu is unsigned, so this is the unsigned 8-bit max (PMAXUB semantics).
  return (__m64)__builtin_elementwise_max((__v8qu)__a, (__v8qu)__b);
}
/// Compares each of the corresponding packed 16-bit integer values of
///    the 64-bit integer vectors, and writes the lesser value to the
///    corresponding bits in the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PMINSW </c> instruction.
///
/// \param __a
///    A 64-bit integer vector containing one of the source operands.
/// \param __b
///    A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the comparison results.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_min_pi16(__m64 __a, __m64 __b)
{
  // __v4hi is signed, so this is the signed 16-bit min (PMINSW semantics).
  return (__m64)__builtin_elementwise_min((__v4hi)__a, (__v4hi)__b);
}
/// Compares each of the corresponding packed 8-bit unsigned integer
///    values of the 64-bit integer vectors, and writes the lesser value to the
///    corresponding bits in the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PMINUB </c> instruction.
///
/// \param __a
///    A 64-bit integer vector containing one of the source operands.
/// \param __b
///    A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the comparison results.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_min_pu8(__m64 __a, __m64 __b)
{
  // __v8qu is unsigned, so this is the unsigned 8-bit min (PMINUB semantics).
  return (__m64)__builtin_elementwise_min((__v8qu)__a, (__v8qu)__b);
}
/// Takes the most significant bit from each 8-bit element in a 64-bit
///    integer vector to create an 8-bit mask value. Zero-extends the value to
///    32-bit integer and writes it to the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
///
/// \param __a
///    A 64-bit integer vector containing the values with bits to be extracted.
/// \returns The most significant bit from each 8-bit element in \a __a,
///    written to bits [7:0].
static __inline__ int __DEFAULT_FN_ATTRS_SSE2
_mm_movemask_pi8(__m64 __a)
{
  // Zero-extending to 128 bits guarantees the high 8 lanes contribute zero
  // mask bits, so the result fits in bits [7:0].
  return __builtin_ia32_pmovmskb128((__v16qi)__zext128(__a));
}
/// Multiplies packed 16-bit unsigned integer values and writes the
///    high-order 16 bits of each 32-bit product to the corresponding bits in
///    the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
///
/// \param __a
///    A 64-bit integer vector containing one of the source operands.
/// \param __b
///    A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the products of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_mulhi_pu16(__m64 __a, __m64 __b)
{
  // Widen both operands to 128 bits (upper lanes are don't-care), multiply,
  // then truncate the result back to the low 64 bits.
  return __trunc64(__builtin_ia32_pmulhuw128((__v8hi)__anyext128(__a),
                                             (__v8hi)__anyext128(__b)));
}
/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
///    destination, as specified by the immediate value operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
///
/// \param a
///    A 64-bit integer vector containing the values to be shuffled.
/// \param n
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a. The destinations within the 64-bit destination are
///    assigned values as follows: \n
///    Bits [1:0] are used to assign values to bits [15:0] in the
///    destination. \n
///    Bits [3:2] are used to assign values to bits [31:16] in the
///    destination. \n
///    Bits [5:4] are used to assign values to bits [47:32] in the
///    destination. \n
///    Bits [7:6] are used to assign values to bits [63:48] in the
///    destination. \n
///    Bit value assignments: \n
///    00: assigned from bits [15:0] of \a a. \n
///    01: assigned from bits [31:16] of \a a. \n
///    10: assigned from bits [47:32] of \a a. \n
///    11: assigned from bits [63:48] of \a a. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 64-bit integer vector containing the shuffled values.
/* The empty __v4hi{} second operand only satisfies the builtin's arity; all
   indices select from (a). */
#define _mm_shuffle_pi16(a, n) \
  ((__m64)__builtin_shufflevector((__v4hi)(__m64)(a), __extension__(__v4hi){}, \
                                  (n) & 0x3, ((n) >> 2) & 0x3, \
                                  ((n) >> 4) & 0x3, ((n) >> 6) & 0x3))
/// Conditionally copies the values from each 8-bit element in the first
///    64-bit integer vector operand to the specified memory location, as
///    specified by the most significant bit in the corresponding element in the
///    second 64-bit integer vector operand.
///
/// To minimize caching, the data is flagged as non-temporal
/// (unlikely to be used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
///
/// \param __d
///    A 64-bit integer vector containing the values with elements to be copied.
/// \param __n
///    A 64-bit integer vector operand. The most significant bit from each 8-bit
///    element determines whether the corresponding element in operand \a __d
///    is copied. If the most significant bit of a given element is 1, the
///    corresponding element in operand \a __d is copied.
/// \param __p
///    A pointer to a 64-bit memory location that will receive the conditionally
///    copied integer values. The address of the memory location does not have
///    to be aligned.
static __inline__ void __DEFAULT_FN_ATTRS_SSE2
_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
{
  // This is complex, because we need to support the case where __p is pointing
  // within the last 15 to 8 bytes of a page. In that case, using a 128-bit
  // write might cause a trap where a 64-bit maskmovq would not. (Memory
  // locations not selected by the mask bits might still cause traps.)
  // __n is zero-extended so the upper 8 mask lanes never select a write.
  __m128i __d128 = __anyext128(__d);
  __m128i __n128 = __zext128(__n);
  if (((__SIZE_TYPE__)__p & 0xfff) >= 4096-15 &&
      ((__SIZE_TYPE__)__p & 0xfff) <= 4096-8) {
    // If there's a risk of spurious trap due to a 128-bit write, back up the
    // pointer by 8 bytes and shift values in registers to match.
    __p -= 8;
    __d128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__d128, 8);
    __n128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__n128, 8);
  }

  __builtin_ia32_maskmovdqu((__v16qi)__d128, (__v16qi)__n128, __p);
}
2539 /// Computes the rounded averages of the packed unsigned 8-bit integer
2540 /// values and writes the averages to the corresponding bits in the
2541 /// destination.
2543 /// \headerfile <x86intrin.h>
2545 /// This intrinsic corresponds to the <c> PAVGB </c> instruction.
2547 /// \param __a
2548 /// A 64-bit integer vector containing one of the source operands.
2549 /// \param __b
2550 /// A 64-bit integer vector containing one of the source operands.
2551 /// \returns A 64-bit integer vector containing the averages of both operands.
2552 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
2553 _mm_avg_pu8(__m64 __a, __m64 __b)
2555 return __trunc64(__builtin_ia32_pavgb128((__v16qi)__anyext128(__a),
2556 (__v16qi)__anyext128(__b)));
2559 /// Computes the rounded averages of the packed unsigned 16-bit integer
2560 /// values and writes the averages to the corresponding bits in the
2561 /// destination.
2563 /// \headerfile <x86intrin.h>
2565 /// This intrinsic corresponds to the <c> PAVGW </c> instruction.
2567 /// \param __a
2568 /// A 64-bit integer vector containing one of the source operands.
2569 /// \param __b
2570 /// A 64-bit integer vector containing one of the source operands.
2571 /// \returns A 64-bit integer vector containing the averages of both operands.
2572 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
2573 _mm_avg_pu16(__m64 __a, __m64 __b)
2575 return __trunc64(__builtin_ia32_pavgw128((__v8hi)__anyext128(__a),
2576 (__v8hi)__anyext128(__b)));
2579 /// Subtracts the corresponding 8-bit unsigned integer values of the two
2580 /// 64-bit vector operands and computes the absolute value for each of the
2581 /// difference. Then sum of the 8 absolute differences is written to the
2582 /// bits [15:0] of the destination; the remaining bits [63:16] are cleared.
2584 /// \headerfile <x86intrin.h>
2586 /// This intrinsic corresponds to the <c> PSADBW </c> instruction.
2588 /// \param __a
2589 /// A 64-bit integer vector containing one of the source operands.
2590 /// \param __b
2591 /// A 64-bit integer vector containing one of the source operands.
2592 /// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
2593 /// sets of absolute differences between both operands. The upper bits are
2594 /// cleared.
2595 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
2596 _mm_sad_pu8(__m64 __a, __m64 __b)
2598 return __trunc64(__builtin_ia32_psadbw128((__v16qi)__zext128(__a),
2599 (__v16qi)__zext128(__b)));
2602 #if defined(__cplusplus)
2603 extern "C" {
2604 #endif
2606 /// Returns the contents of the MXCSR register as a 32-bit unsigned
2607 /// integer value.
2609 /// There are several groups of macros associated with this
2610 /// intrinsic, including:
2611 /// <ul>
2612 /// <li>
2613 /// For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
2614 /// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
2615 /// _MM_EXCEPT_INEXACT. There is a convenience wrapper
2616 /// _MM_GET_EXCEPTION_STATE().
2617 /// </li>
2618 /// <li>
2619 /// For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
2620 /// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
2621 /// There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
2622 /// </li>
2623 /// <li>
2624 /// For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
2625 /// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
2626 /// _MM_GET_ROUNDING_MODE().
2627 /// </li>
2628 /// <li>
2629 /// For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
2630 /// There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
2631 /// </li>
2632 /// <li>
2633 /// For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
2634 /// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
2635 /// _MM_GET_DENORMALS_ZERO_MODE().
2636 /// </li>
2637 /// </ul>
2639 /// For example, the following expression checks if an overflow exception has
2640 /// occurred:
2641 /// \code
2642 /// ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
2643 /// \endcode
2645 /// The following expression gets the current rounding mode:
2646 /// \code
2647 /// _MM_GET_ROUNDING_MODE()
2648 /// \endcode
2650 /// \headerfile <x86intrin.h>
2652 /// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
2654 /// \returns A 32-bit unsigned integer containing the contents of the MXCSR
2655 /// register.
2656 unsigned int _mm_getcsr(void);
2658 /// Sets the MXCSR register with the 32-bit unsigned integer value.
2660 /// There are several groups of macros associated with this intrinsic,
2661 /// including:
2662 /// <ul>
2663 /// <li>
2664 /// For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
2665 /// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
2666 /// _MM_EXCEPT_INEXACT. There is a convenience wrapper
2667 /// _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
2668 /// </li>
2669 /// <li>
2670 /// For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
2671 /// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
2672 /// There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
2673 /// of these macros.
2674 /// </li>
2675 /// <li>
2676 /// For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
2677 /// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
2678 /// _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
2679 /// </li>
2680 /// <li>
2681 /// For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
2682 /// There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
2683 /// one of these macros.
2684 /// </li>
2685 /// <li>
2686 /// For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
2687 /// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
2688 /// _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
2689 /// </li>
2690 /// </ul>
/// For example, the following expression sets the rounding mode to round up
/// (the wrapper first clears the existing rounding-mode bits, so it is
/// correct regardless of the current mode):
/// \code
///   _MM_SET_ROUNDING_MODE(_MM_ROUND_UP)
/// \endcode
2696 /// The following example sets the DAZ and FTZ flags:
2697 /// \code
2698 /// void setFlags() {
2699 /// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
2700 /// _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
2701 /// }
2702 /// \endcode
2704 /// \headerfile <x86intrin.h>
2706 /// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
2708 /// \param __i
2709 /// A 32-bit unsigned integer value to be written to the MXCSR register.
2710 void _mm_setcsr(unsigned int __i);
2712 #if defined(__cplusplus)
2713 } // extern "C"
2714 #endif
2716 /// Selects 4 float values from the 128-bit operands of [4 x float], as
2717 /// specified by the immediate value operand.
2719 /// \headerfile <x86intrin.h>
2721 /// \code
2722 /// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
2723 /// \endcode
2725 /// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
2727 /// \param a
2728 /// A 128-bit vector of [4 x float].
2729 /// \param b
2730 /// A 128-bit vector of [4 x float].
2731 /// \param mask
2732 /// An immediate value containing an 8-bit value specifying which elements to
2733 /// copy from \a a and \a b. \n
2734 /// Bits [3:0] specify the values copied from operand \a a. \n
2735 /// Bits [7:4] specify the values copied from operand \a b. \n
2736 /// The destinations within the 128-bit destination are assigned values as
2737 /// follows: \n
2738 /// Bits [1:0] are used to assign values to bits [31:0] in the
2739 /// destination. \n
2740 /// Bits [3:2] are used to assign values to bits [63:32] in the
2741 /// destination. \n
2742 /// Bits [5:4] are used to assign values to bits [95:64] in the
2743 /// destination. \n
2744 /// Bits [7:6] are used to assign values to bits [127:96] in the
2745 /// destination. \n
2746 /// Bit value assignments: \n
2747 /// 00: Bits [31:0] copied from the specified operand. \n
2748 /// 01: Bits [63:32] copied from the specified operand. \n
2749 /// 10: Bits [95:64] copied from the specified operand. \n
2750 /// 11: Bits [127:96] copied from the specified operand. \n
2751 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
2752 /// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
2753 /// <c>[b6, b4, b2, b0]</c>.
2754 /// \returns A 128-bit vector of [4 x float] containing the shuffled values.
// Note: mask must be an integer constant expression (it becomes the immediate
// operand of SHUFPS); each 2-bit field selects one source element.
#define _mm_shuffle_ps(a, b, mask) \
  ((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
                                 (int)(mask)))
2759 /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
2760 /// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2762 /// \headerfile <x86intrin.h>
2764 /// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
2766 /// \param __a
2767 /// A 128-bit vector of [4 x float]. \n
2768 /// Bits [95:64] are written to bits [31:0] of the destination. \n
2769 /// Bits [127:96] are written to bits [95:64] of the destination.
2770 /// \param __b
2771 /// A 128-bit vector of [4 x float].
2772 /// Bits [95:64] are written to bits [63:32] of the destination. \n
2773 /// Bits [127:96] are written to bits [127:96] of the destination.
2774 /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2775 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
2776 _mm_unpackhi_ps(__m128 __a, __m128 __b) {
2777 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
2780 /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
2781 /// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2783 /// \headerfile <x86intrin.h>
2785 /// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
2787 /// \param __a
2788 /// A 128-bit vector of [4 x float]. \n
2789 /// Bits [31:0] are written to bits [31:0] of the destination. \n
2790 /// Bits [63:32] are written to bits [95:64] of the destination.
2791 /// \param __b
2792 /// A 128-bit vector of [4 x float]. \n
2793 /// Bits [31:0] are written to bits [63:32] of the destination. \n
2794 /// Bits [63:32] are written to bits [127:96] of the destination.
2795 /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2796 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
2797 _mm_unpacklo_ps(__m128 __a, __m128 __b) {
2798 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
2801 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2802 /// 32 bits are set to the lower 32 bits of the second parameter. The upper
2803 /// 96 bits are set to the upper 96 bits of the first parameter.
2805 /// \headerfile <x86intrin.h>
2807 /// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c>
2808 /// instruction.
2810 /// \param __a
2811 /// A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
2812 /// written to the upper 96 bits of the result.
2813 /// \param __b
2814 /// A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
2815 /// written to the lower 32 bits of the result.
2816 /// \returns A 128-bit floating-point vector of [4 x float].
2817 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
2818 _mm_move_ss(__m128 __a, __m128 __b) {
2819 __a[0] = __b[0];
2820 return __a;
2823 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2824 /// 64 bits are set to the upper 64 bits of the second parameter. The upper
2825 /// 64 bits are set to the upper 64 bits of the first parameter.
2827 /// \headerfile <x86intrin.h>
2829 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
2831 /// \param __a
2832 /// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2833 /// written to the upper 64 bits of the result.
2834 /// \param __b
2835 /// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2836 /// written to the lower 64 bits of the result.
2837 /// \returns A 128-bit floating-point vector of [4 x float].
2838 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
2839 _mm_movehl_ps(__m128 __a, __m128 __b) {
2840 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
2843 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2844 /// 64 bits are set to the lower 64 bits of the first parameter. The upper
2845 /// 64 bits are set to the lower 64 bits of the second parameter.
2847 /// \headerfile <x86intrin.h>
2849 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
2851 /// \param __a
2852 /// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2853 /// written to the lower 64 bits of the result.
2854 /// \param __b
2855 /// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2856 /// written to the upper 64 bits of the result.
2857 /// \returns A 128-bit floating-point vector of [4 x float].
2858 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
2859 _mm_movelh_ps(__m128 __a, __m128 __b) {
2860 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
2863 /// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
2864 /// float].
2866 /// \headerfile <x86intrin.h>
2868 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2870 /// \param __a
2871 /// A 64-bit vector of [4 x i16]. The elements of the destination are copied
2872 /// from the corresponding elements in this operand.
2873 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2874 /// values from the operand.
2875 static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
2876 _mm_cvtpi16_ps(__m64 __a)
2878 return __builtin_convertvector((__v4hi)__a, __v4sf);
2881 /// Converts a 64-bit vector of 16-bit unsigned integer values into a
2882 /// 128-bit vector of [4 x float].
2884 /// \headerfile <x86intrin.h>
2886 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2888 /// \param __a
2889 /// A 64-bit vector of 16-bit unsigned integer values. The elements of the
2890 /// destination are copied from the corresponding elements in this operand.
2891 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2892 /// values from the operand.
2893 static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
2894 _mm_cvtpu16_ps(__m64 __a)
2896 return __builtin_convertvector((__v4hu)__a, __v4sf);
2899 /// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
2900 /// into a 128-bit vector of [4 x float].
2902 /// \headerfile <x86intrin.h>
2904 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2906 /// \param __a
2907 /// A 64-bit vector of [8 x i8]. The elements of the destination are copied
2908 /// from the corresponding lower 4 elements in this operand.
2909 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2910 /// values from the operand.
2911 static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
2912 _mm_cvtpi8_ps(__m64 __a)
2914 return __builtin_convertvector(
2915 __builtin_shufflevector((__v8qs)__a, __extension__ (__v8qs){},
2916 0, 1, 2, 3), __v4sf);
2919 /// Converts the lower four unsigned 8-bit integer values from a 64-bit
2920 /// vector of [8 x u8] into a 128-bit vector of [4 x float].
2922 /// \headerfile <x86intrin.h>
2924 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2926 /// \param __a
2927 /// A 64-bit vector of unsigned 8-bit integer values. The elements of the
2928 /// destination are copied from the corresponding lower 4 elements in this
2929 /// operand.
2930 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2931 /// values from the source operand.
2932 static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
2933 _mm_cvtpu8_ps(__m64 __a)
2935 return __builtin_convertvector(
2936 __builtin_shufflevector((__v8qu)__a, __extension__ (__v8qu){},
2937 0, 1, 2, 3), __v4sf);
2940 /// Converts the two 32-bit signed integer values from each 64-bit vector
2941 /// operand of [2 x i32] into a 128-bit vector of [4 x float].
2943 /// \headerfile <x86intrin.h>
2945 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2947 /// \param __a
2948 /// A 64-bit vector of [2 x i32]. The lower elements of the destination are
2949 /// copied from the elements in this operand.
2950 /// \param __b
2951 /// A 64-bit vector of [2 x i32]. The upper elements of the destination are
2952 /// copied from the elements in this operand.
2953 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
2954 /// copied and converted values from the first operand. The upper 64 bits
2955 /// contain the copied and converted values from the second operand.
2956 static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
2957 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
2959 return __builtin_convertvector(
2960 __builtin_shufflevector((__v2si)__a, (__v2si)__b,
2961 0, 1, 2, 3), __v4sf);
2964 /// Converts each single-precision floating-point element of a 128-bit
2965 /// floating-point vector of [4 x float] into a 16-bit signed integer, and
2966 /// packs the results into a 64-bit integer vector of [4 x i16].
2968 /// If the floating-point element is NaN or infinity, or if the
2969 /// floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
2970 /// it is converted to 0x8000. Otherwise if the floating-point element is
2971 /// greater than 0x7FFF, it is converted to 0x7FFF.
2973 /// \headerfile <x86intrin.h>
2975 /// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2977 /// \param __a
2978 /// A 128-bit floating-point vector of [4 x float].
2979 /// \returns A 64-bit integer vector of [4 x i16] containing the converted
2980 /// values.
2981 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
2982 _mm_cvtps_pi16(__m128 __a)
2984 return __trunc64(__builtin_ia32_packssdw128(
2985 (__v4si)__builtin_ia32_cvtps2dq((__v4sf)__a), (__v4si)_mm_setzero_ps()));
2988 /// Converts each single-precision floating-point element of a 128-bit
2989 /// floating-point vector of [4 x float] into an 8-bit signed integer, and
2990 /// packs the results into the lower 32 bits of a 64-bit integer vector of
2991 /// [8 x i8]. The upper 32 bits of the vector are set to 0.
2993 /// If the floating-point element is NaN or infinity, or if the
2994 /// floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
2995 /// is converted to 0x80. Otherwise if the floating-point element is greater
2996 /// than 0x7F, it is converted to 0x7F.
2998 /// \headerfile <x86intrin.h>
3000 /// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
3002 /// \param __a
3003 /// 128-bit floating-point vector of [4 x float].
3004 /// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
///    converted values and the upper 32 bits are set to zero.
3006 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
3007 _mm_cvtps_pi8(__m128 __a)
3009 __m64 __b, __c;
3011 __b = _mm_cvtps_pi16(__a);
3012 __c = _mm_setzero_si64();
3014 return _mm_packs_pi16(__b, __c);
3017 /// Extracts the sign bits from each single-precision floating-point
3018 /// element of a 128-bit floating-point vector of [4 x float] and returns the
3019 /// sign bits in bits [0:3] of the result. Bits [31:4] of the result are set
3020 /// to zero.
3022 /// \headerfile <x86intrin.h>
3024 /// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
3026 /// \param __a
3027 /// A 128-bit floating-point vector of [4 x float].
3028 /// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
3029 /// single-precision floating-point element of the parameter. Bits [31:4] are
3030 /// set to zero.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_movemask_ps(__m128 __a)
{
  // MOVMSKPS: bit i of the result is the sign bit of float element i of __a.
  return __builtin_ia32_movmskps((__v4sf)__a);
}
3037 /* Compare */
3038 #define _CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */
3039 #define _CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */
3040 #define _CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */
3041 #define _CMP_UNORD_Q 0x03 /* Unordered (non-signaling) */
3042 #define _CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */
3043 #define _CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */
3044 #define _CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */
3045 #define _CMP_ORD_Q 0x07 /* Ordered (non-signaling) */
3047 /// Compares each of the corresponding values of two 128-bit vectors of
3048 /// [4 x float], using the operation specified by the immediate integer
3049 /// operand.
3051 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
3052 /// If either value in a comparison is NaN, comparisons that are ordered
3053 /// return false, and comparisons that are unordered return true.
3055 /// \headerfile <x86intrin.h>
3057 /// \code
3058 /// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
3059 /// \endcode
3061 /// This intrinsic corresponds to the <c> (V)CMPPS </c> instruction.
3063 /// \param a
3064 /// A 128-bit vector of [4 x float].
3065 /// \param b
3066 /// A 128-bit vector of [4 x float].
3067 /// \param c
3068 /// An immediate integer operand, with bits [4:0] specifying which comparison
3069 /// operation to use: \n
3070 /// 0x00: Equal (ordered, non-signaling) \n
3071 /// 0x01: Less-than (ordered, signaling) \n
3072 /// 0x02: Less-than-or-equal (ordered, signaling) \n
3073 /// 0x03: Unordered (non-signaling) \n
3074 /// 0x04: Not-equal (unordered, non-signaling) \n
3075 /// 0x05: Not-less-than (unordered, signaling) \n
3076 /// 0x06: Not-less-than-or-equal (unordered, signaling) \n
3077 /// 0x07: Ordered (non-signaling) \n
3078 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
// Note: c must be an integer constant expression; use the _CMP_* macros
// defined above for the supported predicates.
#define _mm_cmp_ps(a, b, c) \
  ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), (c)))
3082 /// Compares each of the corresponding scalar values of two 128-bit
3083 /// vectors of [4 x float], using the operation specified by the immediate
3084 /// integer operand.
3086 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
3087 /// If either value in a comparison is NaN, comparisons that are ordered
3088 /// return false, and comparisons that are unordered return true.
3090 /// \headerfile <x86intrin.h>
3092 /// \code
3093 /// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
3094 /// \endcode
3096 /// This intrinsic corresponds to the <c> (V)CMPSS </c> instruction.
3098 /// \param a
3099 /// A 128-bit vector of [4 x float].
3100 /// \param b
3101 /// A 128-bit vector of [4 x float].
3102 /// \param c
3103 /// An immediate integer operand, with bits [4:0] specifying which comparison
3104 /// operation to use: \n
3105 /// 0x00: Equal (ordered, non-signaling) \n
3106 /// 0x01: Less-than (ordered, signaling) \n
3107 /// 0x02: Less-than-or-equal (ordered, signaling) \n
3108 /// 0x03: Unordered (non-signaling) \n
3109 /// 0x04: Not-equal (unordered, non-signaling) \n
3110 /// 0x05: Not-less-than (unordered, signaling) \n
3111 /// 0x06: Not-less-than-or-equal (unordered, signaling) \n
3112 /// 0x07: Ordered (non-signaling) \n
3113 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
// Note: c must be an integer constant expression; use the _CMP_* macros
// defined above for the supported predicates. Only the low lane is compared;
// the upper three lanes of the result are copied from a.
#define _mm_cmp_ss(a, b, c) \
  ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), (c)))
3117 #define _MM_ALIGN16 __attribute__((aligned(16)))
3119 #define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
3121 #define _MM_EXCEPT_INVALID (0x0001U)
3122 #define _MM_EXCEPT_DENORM (0x0002U)
3123 #define _MM_EXCEPT_DIV_ZERO (0x0004U)
3124 #define _MM_EXCEPT_OVERFLOW (0x0008U)
3125 #define _MM_EXCEPT_UNDERFLOW (0x0010U)
3126 #define _MM_EXCEPT_INEXACT (0x0020U)
3127 #define _MM_EXCEPT_MASK (0x003fU)
3129 #define _MM_MASK_INVALID (0x0080U)
3130 #define _MM_MASK_DENORM (0x0100U)
3131 #define _MM_MASK_DIV_ZERO (0x0200U)
3132 #define _MM_MASK_OVERFLOW (0x0400U)
3133 #define _MM_MASK_UNDERFLOW (0x0800U)
3134 #define _MM_MASK_INEXACT (0x1000U)
3135 #define _MM_MASK_MASK (0x1f80U)
3137 #define _MM_ROUND_NEAREST (0x0000U)
3138 #define _MM_ROUND_DOWN (0x2000U)
3139 #define _MM_ROUND_UP (0x4000U)
3140 #define _MM_ROUND_TOWARD_ZERO (0x6000U)
3141 #define _MM_ROUND_MASK (0x6000U)
3143 #define _MM_FLUSH_ZERO_MASK (0x8000U)
3144 #define _MM_FLUSH_ZERO_ON (0x8000U)
3145 #define _MM_FLUSH_ZERO_OFF (0x0000U)
3147 #define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
3148 #define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
3149 #define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
3150 #define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)
3152 #define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
3153 #define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
3154 #define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
3155 #define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
/// Transposes, in place, the 4x4 matrix of floats whose rows are \a row0
/// through \a row3 (each a __m128 of [4 x float]).
///
/// Implemented as two unpack passes followed by move-half recombination; the
/// do/while(0) wrapper makes the macro behave as a single statement.
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 tmp3, tmp2, tmp1, tmp0; \
  tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(tmp0, tmp2); \
  (row1) = _mm_movehl_ps(tmp2, tmp0); \
  (row2) = _mm_movelh_ps(tmp1, tmp3); \
  (row3) = _mm_movehl_ps(tmp3, tmp1); \
} while (0)
3170 /* Aliases for compatibility. */
3171 #define _m_pextrw _mm_extract_pi16
3172 #define _m_pinsrw _mm_insert_pi16
3173 #define _m_pmaxsw _mm_max_pi16
3174 #define _m_pmaxub _mm_max_pu8
3175 #define _m_pminsw _mm_min_pi16
3176 #define _m_pminub _mm_min_pu8
3177 #define _m_pmovmskb _mm_movemask_pi8
3178 #define _m_pmulhuw _mm_mulhi_pu16
3179 #define _m_pshufw _mm_shuffle_pi16
3180 #define _m_maskmovq _mm_maskmove_si64
3181 #define _m_pavgb _mm_avg_pu8
3182 #define _m_pavgw _mm_avg_pu16
3183 #define _m_psadbw _mm_sad_pu8
3184 #define _m_ _mm_
3186 #undef __trunc64
3187 #undef __zext128
3188 #undef __anyext128
3189 #undef __zeroupper64
3190 #undef __DEFAULT_FN_ATTRS
3191 #undef __DEFAULT_FN_ATTRS_CONSTEXPR
3192 #undef __DEFAULT_FN_ATTRS_SSE2
3193 #undef __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
3195 /* Ugly hack for backwards-compatibility (compatible with gcc) */
3196 #if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
3197 #include <emmintrin.h>
3198 #endif
3200 #endif /* __XMMINTRIN_H */