/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Refuse to build on targets that are neither 32-bit nor 64-bit x86. */
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif

#include <xmmintrin.h>
/* Public 128-bit vector types, aligned to 16 bytes. */
typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));

/* Variants of the public types with an alignment requirement of 1 byte. */
typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __aligned__(1)));

/* Internal element-typed views of a 128-bit vector. */
typedef double __v2df __attribute__((__vector_size__(16)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

/* Unsigned element-typed views. */
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
typedef unsigned char __v16qu __attribute__((__vector_size__(16)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v16qs __attribute__((__vector_size__(16)));
42 /* Both _Float16 and __bf16 require SSE2 being enabled. */
43 typedef _Float16 __v8hf
__attribute__((__vector_size__(16), __aligned__(16)));
44 typedef _Float16 __m128h
__attribute__((__vector_size__(16), __aligned__(16)));
45 typedef _Float16 __m128h_u
__attribute__((__vector_size__(16), __aligned__(1)));
47 typedef __bf16 __v8bf
__attribute__((__vector_size__(16), __aligned__(16)));
48 typedef __bf16 __m128bh
__attribute__((__vector_size__(16), __aligned__(16)));
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("sse2,no-evex512"), __min_vector_width__(128)))
/* Same as above, additionally enabling the "mmx" target feature and using a
 * minimum vector width of 64. */
#define __DEFAULT_FN_ATTRS_MMX                                                 \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("mmx,sse2,no-evex512"), __min_vector_width__(64)))
59 /// Adds lower double-precision values in both operands and returns the
60 /// sum in the lower 64 bits of the result. The upper 64 bits of the result
61 /// are copied from the upper double-precision value of the first operand.
63 /// \headerfile <x86intrin.h>
65 /// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
68 /// A 128-bit vector of [2 x double] containing one of the source operands.
70 /// A 128-bit vector of [2 x double] containing one of the source operands.
71 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
72 /// sum of the lower 64 bits of both operands. The upper 64 bits are copied
73 /// from the upper 64 bits of the first source operand.
74 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_add_sd(__m128d __a
,
80 /// Adds two 128-bit vectors of [2 x double].
82 /// \headerfile <x86intrin.h>
84 /// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
87 /// A 128-bit vector of [2 x double] containing one of the source operands.
89 /// A 128-bit vector of [2 x double] containing one of the source operands.
90 /// \returns A 128-bit vector of [2 x double] containing the sums of both
92 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_add_pd(__m128d __a
,
94 return (__m128d
)((__v2df
)__a
+ (__v2df
)__b
);
97 /// Subtracts the lower double-precision value of the second operand
98 /// from the lower double-precision value of the first operand and returns
99 /// the difference in the lower 64 bits of the result. The upper 64 bits of
100 /// the result are copied from the upper double-precision value of the first
103 /// \headerfile <x86intrin.h>
105 /// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
108 /// A 128-bit vector of [2 x double] containing the minuend.
110 /// A 128-bit vector of [2 x double] containing the subtrahend.
111 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
112 /// difference of the lower 64 bits of both operands. The upper 64 bits are
113 /// copied from the upper 64 bits of the first source operand.
114 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sub_sd(__m128d __a
,
120 /// Subtracts two 128-bit vectors of [2 x double].
122 /// \headerfile <x86intrin.h>
124 /// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
127 /// A 128-bit vector of [2 x double] containing the minuend.
129 /// A 128-bit vector of [2 x double] containing the subtrahend.
130 /// \returns A 128-bit vector of [2 x double] containing the differences between
132 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sub_pd(__m128d __a
,
134 return (__m128d
)((__v2df
)__a
- (__v2df
)__b
);
137 /// Multiplies lower double-precision values in both operands and returns
138 /// the product in the lower 64 bits of the result. The upper 64 bits of the
139 /// result are copied from the upper double-precision value of the first
142 /// \headerfile <x86intrin.h>
144 /// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
147 /// A 128-bit vector of [2 x double] containing one of the source operands.
149 /// A 128-bit vector of [2 x double] containing one of the source operands.
150 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
151 /// product of the lower 64 bits of both operands. The upper 64 bits are
152 /// copied from the upper 64 bits of the first source operand.
153 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mul_sd(__m128d __a
,
159 /// Multiplies two 128-bit vectors of [2 x double].
161 /// \headerfile <x86intrin.h>
163 /// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
166 /// A 128-bit vector of [2 x double] containing one of the operands.
168 /// A 128-bit vector of [2 x double] containing one of the operands.
169 /// \returns A 128-bit vector of [2 x double] containing the products of both
171 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mul_pd(__m128d __a
,
173 return (__m128d
)((__v2df
)__a
* (__v2df
)__b
);
176 /// Divides the lower double-precision value of the first operand by the
177 /// lower double-precision value of the second operand and returns the
178 /// quotient in the lower 64 bits of the result. The upper 64 bits of the
179 /// result are copied from the upper double-precision value of the first
182 /// \headerfile <x86intrin.h>
184 /// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
187 /// A 128-bit vector of [2 x double] containing the dividend.
189 /// A 128-bit vector of [2 x double] containing divisor.
190 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
191 /// quotient of the lower 64 bits of both operands. The upper 64 bits are
192 /// copied from the upper 64 bits of the first source operand.
193 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_div_sd(__m128d __a
,
199 /// Performs an element-by-element division of two 128-bit vectors of
202 /// \headerfile <x86intrin.h>
204 /// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
207 /// A 128-bit vector of [2 x double] containing the dividend.
209 /// A 128-bit vector of [2 x double] containing the divisor.
210 /// \returns A 128-bit vector of [2 x double] containing the quotients of both
212 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_div_pd(__m128d __a
,
214 return (__m128d
)((__v2df
)__a
/ (__v2df
)__b
);
217 /// Calculates the square root of the lower double-precision value of
218 /// the second operand and returns it in the lower 64 bits of the result.
219 /// The upper 64 bits of the result are copied from the upper
220 /// double-precision value of the first operand.
222 /// \headerfile <x86intrin.h>
224 /// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
227 /// A 128-bit vector of [2 x double] containing one of the operands. The
228 /// upper 64 bits of this operand are copied to the upper 64 bits of the
231 /// A 128-bit vector of [2 x double] containing one of the operands. The
232 /// square root is calculated using the lower 64 bits of this operand.
233 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
234 /// square root of the lower 64 bits of operand \a __b, and whose upper 64
235 /// bits are copied from the upper 64 bits of operand \a __a.
236 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sqrt_sd(__m128d __a
,
238 __m128d __c
= __builtin_ia32_sqrtsd((__v2df
)__b
);
239 return __extension__(__m128d
){__c
[0], __a
[1]};
242 /// Calculates the square root of the each of two values stored in a
243 /// 128-bit vector of [2 x double].
245 /// \headerfile <x86intrin.h>
247 /// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
250 /// A 128-bit vector of [2 x double].
251 /// \returns A 128-bit vector of [2 x double] containing the square roots of the
252 /// values in the operand.
253 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sqrt_pd(__m128d __a
) {
254 return __builtin_ia32_sqrtpd((__v2df
)__a
);
257 /// Compares lower 64-bit double-precision values of both operands, and
258 /// returns the lesser of the pair of values in the lower 64-bits of the
259 /// result. The upper 64 bits of the result are copied from the upper
260 /// double-precision value of the first operand.
262 /// \headerfile <x86intrin.h>
264 /// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
267 /// A 128-bit vector of [2 x double] containing one of the operands. The
268 /// lower 64 bits of this operand are used in the comparison.
270 /// A 128-bit vector of [2 x double] containing one of the operands. The
271 /// lower 64 bits of this operand are used in the comparison.
272 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
273 /// minimum value between both operands. The upper 64 bits are copied from
274 /// the upper 64 bits of the first source operand.
275 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_min_sd(__m128d __a
,
277 return __builtin_ia32_minsd((__v2df
)__a
, (__v2df
)__b
);
280 /// Performs element-by-element comparison of the two 128-bit vectors of
281 /// [2 x double] and returns the vector containing the lesser of each pair of
284 /// \headerfile <x86intrin.h>
286 /// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
289 /// A 128-bit vector of [2 x double] containing one of the operands.
291 /// A 128-bit vector of [2 x double] containing one of the operands.
292 /// \returns A 128-bit vector of [2 x double] containing the minimum values
293 /// between both operands.
294 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_min_pd(__m128d __a
,
296 return __builtin_ia32_minpd((__v2df
)__a
, (__v2df
)__b
);
299 /// Compares lower 64-bit double-precision values of both operands, and
300 /// returns the greater of the pair of values in the lower 64-bits of the
301 /// result. The upper 64 bits of the result are copied from the upper
302 /// double-precision value of the first operand.
304 /// \headerfile <x86intrin.h>
306 /// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
309 /// A 128-bit vector of [2 x double] containing one of the operands. The
310 /// lower 64 bits of this operand are used in the comparison.
312 /// A 128-bit vector of [2 x double] containing one of the operands. The
313 /// lower 64 bits of this operand are used in the comparison.
314 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
315 /// maximum value between both operands. The upper 64 bits are copied from
316 /// the upper 64 bits of the first source operand.
317 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_max_sd(__m128d __a
,
319 return __builtin_ia32_maxsd((__v2df
)__a
, (__v2df
)__b
);
322 /// Performs element-by-element comparison of the two 128-bit vectors of
323 /// [2 x double] and returns the vector containing the greater of each pair
326 /// \headerfile <x86intrin.h>
328 /// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
331 /// A 128-bit vector of [2 x double] containing one of the operands.
333 /// A 128-bit vector of [2 x double] containing one of the operands.
334 /// \returns A 128-bit vector of [2 x double] containing the maximum values
335 /// between both operands.
336 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_max_pd(__m128d __a
,
338 return __builtin_ia32_maxpd((__v2df
)__a
, (__v2df
)__b
);
341 /// Performs a bitwise AND of two 128-bit vectors of [2 x double].
343 /// \headerfile <x86intrin.h>
345 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
348 /// A 128-bit vector of [2 x double] containing one of the source operands.
350 /// A 128-bit vector of [2 x double] containing one of the source operands.
351 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
352 /// values between both operands.
353 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_and_pd(__m128d __a
,
355 return (__m128d
)((__v2du
)__a
& (__v2du
)__b
);
358 /// Performs a bitwise AND of two 128-bit vectors of [2 x double], using
359 /// the one's complement of the values contained in the first source operand.
361 /// \headerfile <x86intrin.h>
363 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
366 /// A 128-bit vector of [2 x double] containing the left source operand. The
367 /// one's complement of this value is used in the bitwise AND.
369 /// A 128-bit vector of [2 x double] containing the right source operand.
370 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
371 /// values in the second operand and the one's complement of the first
373 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_andnot_pd(__m128d __a
,
375 return (__m128d
)(~(__v2du
)__a
& (__v2du
)__b
);
378 /// Performs a bitwise OR of two 128-bit vectors of [2 x double].
380 /// \headerfile <x86intrin.h>
382 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
385 /// A 128-bit vector of [2 x double] containing one of the source operands.
387 /// A 128-bit vector of [2 x double] containing one of the source operands.
388 /// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
389 /// values between both operands.
390 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_or_pd(__m128d __a
,
392 return (__m128d
)((__v2du
)__a
| (__v2du
)__b
);
395 /// Performs a bitwise XOR of two 128-bit vectors of [2 x double].
397 /// \headerfile <x86intrin.h>
399 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
402 /// A 128-bit vector of [2 x double] containing one of the source operands.
404 /// A 128-bit vector of [2 x double] containing one of the source operands.
405 /// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
406 /// values between both operands.
407 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_xor_pd(__m128d __a
,
409 return (__m128d
)((__v2du
)__a
^ (__v2du
)__b
);
412 /// Compares each of the corresponding double-precision values of the
413 /// 128-bit vectors of [2 x double] for equality. Each comparison yields 0x0
414 /// for false, 0xFFFFFFFFFFFFFFFF for true.
416 /// \headerfile <x86intrin.h>
418 /// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
421 /// A 128-bit vector of [2 x double].
423 /// A 128-bit vector of [2 x double].
424 /// \returns A 128-bit vector containing the comparison results.
425 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpeq_pd(__m128d __a
,
427 return (__m128d
)__builtin_ia32_cmpeqpd((__v2df
)__a
, (__v2df
)__b
);
430 /// Compares each of the corresponding double-precision values of the
431 /// 128-bit vectors of [2 x double] to determine if the values in the first
432 /// operand are less than those in the second operand. Each comparison
433 /// yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
435 /// \headerfile <x86intrin.h>
437 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
440 /// A 128-bit vector of [2 x double].
442 /// A 128-bit vector of [2 x double].
443 /// \returns A 128-bit vector containing the comparison results.
444 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmplt_pd(__m128d __a
,
446 return (__m128d
)__builtin_ia32_cmpltpd((__v2df
)__a
, (__v2df
)__b
);
449 /// Compares each of the corresponding double-precision values of the
450 /// 128-bit vectors of [2 x double] to determine if the values in the first
451 /// operand are less than or equal to those in the second operand.
453 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
455 /// \headerfile <x86intrin.h>
457 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
460 /// A 128-bit vector of [2 x double].
462 /// A 128-bit vector of [2 x double].
463 /// \returns A 128-bit vector containing the comparison results.
464 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmple_pd(__m128d __a
,
466 return (__m128d
)__builtin_ia32_cmplepd((__v2df
)__a
, (__v2df
)__b
);
469 /// Compares each of the corresponding double-precision values of the
470 /// 128-bit vectors of [2 x double] to determine if the values in the first
471 /// operand are greater than those in the second operand.
473 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
475 /// \headerfile <x86intrin.h>
477 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
480 /// A 128-bit vector of [2 x double].
482 /// A 128-bit vector of [2 x double].
483 /// \returns A 128-bit vector containing the comparison results.
484 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpgt_pd(__m128d __a
,
486 return (__m128d
)__builtin_ia32_cmpltpd((__v2df
)__b
, (__v2df
)__a
);
489 /// Compares each of the corresponding double-precision values of the
490 /// 128-bit vectors of [2 x double] to determine if the values in the first
491 /// operand are greater than or equal to those in the second operand.
493 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
495 /// \headerfile <x86intrin.h>
497 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
500 /// A 128-bit vector of [2 x double].
502 /// A 128-bit vector of [2 x double].
503 /// \returns A 128-bit vector containing the comparison results.
504 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpge_pd(__m128d __a
,
506 return (__m128d
)__builtin_ia32_cmplepd((__v2df
)__b
, (__v2df
)__a
);
509 /// Compares each of the corresponding double-precision values of the
510 /// 128-bit vectors of [2 x double] to determine if the values in the first
511 /// operand are ordered with respect to those in the second operand.
513 /// A pair of double-precision values are "ordered" with respect to each
514 /// other if neither value is a NaN. Each comparison yields 0x0 for false,
515 /// 0xFFFFFFFFFFFFFFFF for true.
517 /// \headerfile <x86intrin.h>
519 /// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
522 /// A 128-bit vector of [2 x double].
524 /// A 128-bit vector of [2 x double].
525 /// \returns A 128-bit vector containing the comparison results.
526 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpord_pd(__m128d __a
,
528 return (__m128d
)__builtin_ia32_cmpordpd((__v2df
)__a
, (__v2df
)__b
);
531 /// Compares each of the corresponding double-precision values of the
532 /// 128-bit vectors of [2 x double] to determine if the values in the first
533 /// operand are unordered with respect to those in the second operand.
535 /// A pair of double-precision values are "unordered" with respect to each
536 /// other if one or both values are NaN. Each comparison yields 0x0 for
537 /// false, 0xFFFFFFFFFFFFFFFF for true.
539 /// \headerfile <x86intrin.h>
541 /// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
545 /// A 128-bit vector of [2 x double].
547 /// A 128-bit vector of [2 x double].
548 /// \returns A 128-bit vector containing the comparison results.
549 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpunord_pd(__m128d __a
,
551 return (__m128d
)__builtin_ia32_cmpunordpd((__v2df
)__a
, (__v2df
)__b
);
554 /// Compares each of the corresponding double-precision values of the
555 /// 128-bit vectors of [2 x double] to determine if the values in the first
556 /// operand are unequal to those in the second operand.
558 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
560 /// \headerfile <x86intrin.h>
562 /// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
565 /// A 128-bit vector of [2 x double].
567 /// A 128-bit vector of [2 x double].
568 /// \returns A 128-bit vector containing the comparison results.
569 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpneq_pd(__m128d __a
,
571 return (__m128d
)__builtin_ia32_cmpneqpd((__v2df
)__a
, (__v2df
)__b
);
574 /// Compares each of the corresponding double-precision values of the
575 /// 128-bit vectors of [2 x double] to determine if the values in the first
576 /// operand are not less than those in the second operand.
578 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
580 /// \headerfile <x86intrin.h>
582 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
585 /// A 128-bit vector of [2 x double].
587 /// A 128-bit vector of [2 x double].
588 /// \returns A 128-bit vector containing the comparison results.
589 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnlt_pd(__m128d __a
,
591 return (__m128d
)__builtin_ia32_cmpnltpd((__v2df
)__a
, (__v2df
)__b
);
594 /// Compares each of the corresponding double-precision values of the
595 /// 128-bit vectors of [2 x double] to determine if the values in the first
596 /// operand are not less than or equal to those in the second operand.
598 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
600 /// \headerfile <x86intrin.h>
602 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
605 /// A 128-bit vector of [2 x double].
607 /// A 128-bit vector of [2 x double].
608 /// \returns A 128-bit vector containing the comparison results.
609 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnle_pd(__m128d __a
,
611 return (__m128d
)__builtin_ia32_cmpnlepd((__v2df
)__a
, (__v2df
)__b
);
614 /// Compares each of the corresponding double-precision values of the
615 /// 128-bit vectors of [2 x double] to determine if the values in the first
616 /// operand are not greater than those in the second operand.
618 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
620 /// \headerfile <x86intrin.h>
622 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
625 /// A 128-bit vector of [2 x double].
627 /// A 128-bit vector of [2 x double].
628 /// \returns A 128-bit vector containing the comparison results.
629 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpngt_pd(__m128d __a
,
631 return (__m128d
)__builtin_ia32_cmpnltpd((__v2df
)__b
, (__v2df
)__a
);
634 /// Compares each of the corresponding double-precision values of the
635 /// 128-bit vectors of [2 x double] to determine if the values in the first
636 /// operand are not greater than or equal to those in the second operand.
638 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
640 /// \headerfile <x86intrin.h>
642 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
645 /// A 128-bit vector of [2 x double].
647 /// A 128-bit vector of [2 x double].
648 /// \returns A 128-bit vector containing the comparison results.
649 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnge_pd(__m128d __a
,
651 return (__m128d
)__builtin_ia32_cmpnlepd((__v2df
)__b
, (__v2df
)__a
);
654 /// Compares the lower double-precision floating-point values in each of
655 /// the two 128-bit floating-point vectors of [2 x double] for equality.
657 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
659 /// \headerfile <x86intrin.h>
661 /// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
664 /// A 128-bit vector of [2 x double]. The lower double-precision value is
665 /// compared to the lower double-precision value of \a __b.
667 /// A 128-bit vector of [2 x double]. The lower double-precision value is
668 /// compared to the lower double-precision value of \a __a.
669 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
670 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
671 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpeq_sd(__m128d __a
,
673 return (__m128d
)__builtin_ia32_cmpeqsd((__v2df
)__a
, (__v2df
)__b
);
676 /// Compares the lower double-precision floating-point values in each of
677 /// the two 128-bit floating-point vectors of [2 x double] to determine if
678 /// the value in the first parameter is less than the corresponding value in
679 /// the second parameter.
681 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
683 /// \headerfile <x86intrin.h>
685 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
688 /// A 128-bit vector of [2 x double]. The lower double-precision value is
689 /// compared to the lower double-precision value of \a __b.
691 /// A 128-bit vector of [2 x double]. The lower double-precision value is
692 /// compared to the lower double-precision value of \a __a.
693 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
694 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
695 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmplt_sd(__m128d __a
,
697 return (__m128d
)__builtin_ia32_cmpltsd((__v2df
)__a
, (__v2df
)__b
);
700 /// Compares the lower double-precision floating-point values in each of
701 /// the two 128-bit floating-point vectors of [2 x double] to determine if
702 /// the value in the first parameter is less than or equal to the
703 /// corresponding value in the second parameter.
705 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
707 /// \headerfile <x86intrin.h>
709 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
712 /// A 128-bit vector of [2 x double]. The lower double-precision value is
713 /// compared to the lower double-precision value of \a __b.
715 /// A 128-bit vector of [2 x double]. The lower double-precision value is
716 /// compared to the lower double-precision value of \a __a.
717 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
718 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
719 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmple_sd(__m128d __a
,
721 return (__m128d
)__builtin_ia32_cmplesd((__v2df
)__a
, (__v2df
)__b
);
724 /// Compares the lower double-precision floating-point values in each of
725 /// the two 128-bit floating-point vectors of [2 x double] to determine if
726 /// the value in the first parameter is greater than the corresponding value
727 /// in the second parameter.
729 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
731 /// \headerfile <x86intrin.h>
733 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
736 /// A 128-bit vector of [2 x double]. The lower double-precision value is
737 /// compared to the lower double-precision value of \a __b.
739 /// A 128-bit vector of [2 x double]. The lower double-precision value is
740 /// compared to the lower double-precision value of \a __a.
741 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
742 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
743 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpgt_sd(__m128d __a
,
745 __m128d __c
= __builtin_ia32_cmpltsd((__v2df
)__b
, (__v2df
)__a
);
746 return __extension__(__m128d
){__c
[0], __a
[1]};
749 /// Compares the lower double-precision floating-point values in each of
750 /// the two 128-bit floating-point vectors of [2 x double] to determine if
751 /// the value in the first parameter is greater than or equal to the
752 /// corresponding value in the second parameter.
754 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
756 /// \headerfile <x86intrin.h>
758 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
761 /// A 128-bit vector of [2 x double]. The lower double-precision value is
762 /// compared to the lower double-precision value of \a __b.
764 /// A 128-bit vector of [2 x double]. The lower double-precision value is
765 /// compared to the lower double-precision value of \a __a.
766 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
767 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
768 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpge_sd(__m128d __a
,
770 __m128d __c
= __builtin_ia32_cmplesd((__v2df
)__b
, (__v2df
)__a
);
771 return __extension__(__m128d
){__c
[0], __a
[1]};
774 /// Compares the lower double-precision floating-point values in each of
775 /// the two 128-bit floating-point vectors of [2 x double] to determine if
776 /// the value in the first parameter is "ordered" with respect to the
777 /// corresponding value in the second parameter.
779 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
780 /// of double-precision values are "ordered" with respect to each other if
781 /// neither value is a NaN.
783 /// \headerfile <x86intrin.h>
785 /// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
788 /// A 128-bit vector of [2 x double]. The lower double-precision value is
789 /// compared to the lower double-precision value of \a __b.
791 /// A 128-bit vector of [2 x double]. The lower double-precision value is
792 /// compared to the lower double-precision value of \a __a.
793 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
794 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
795 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpord_sd(__m128d __a
,
797 return (__m128d
)__builtin_ia32_cmpordsd((__v2df
)__a
, (__v2df
)__b
);
800 /// Compares the lower double-precision floating-point values in each of
801 /// the two 128-bit floating-point vectors of [2 x double] to determine if
802 /// the value in the first parameter is "unordered" with respect to the
803 /// corresponding value in the second parameter.
805 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
806 /// of double-precision values are "unordered" with respect to each other if
807 /// one or both values are NaN.
809 /// \headerfile <x86intrin.h>
811 /// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
815 /// A 128-bit vector of [2 x double]. The lower double-precision value is
816 /// compared to the lower double-precision value of \a __b.
818 /// A 128-bit vector of [2 x double]. The lower double-precision value is
819 /// compared to the lower double-precision value of \a __a.
820 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
821 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
822 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpunord_sd(__m128d __a
,
824 return (__m128d
)__builtin_ia32_cmpunordsd((__v2df
)__a
, (__v2df
)__b
);
827 /// Compares the lower double-precision floating-point values in each of
828 /// the two 128-bit floating-point vectors of [2 x double] to determine if
829 /// the value in the first parameter is unequal to the corresponding value in
830 /// the second parameter.
832 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
834 /// \headerfile <x86intrin.h>
836 /// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
839 /// A 128-bit vector of [2 x double]. The lower double-precision value is
840 /// compared to the lower double-precision value of \a __b.
842 /// A 128-bit vector of [2 x double]. The lower double-precision value is
843 /// compared to the lower double-precision value of \a __a.
844 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
845 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
846 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpneq_sd(__m128d __a
,
848 return (__m128d
)__builtin_ia32_cmpneqsd((__v2df
)__a
, (__v2df
)__b
);
851 /// Compares the lower double-precision floating-point values in each of
852 /// the two 128-bit floating-point vectors of [2 x double] to determine if
853 /// the value in the first parameter is not less than the corresponding
854 /// value in the second parameter.
856 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
858 /// \headerfile <x86intrin.h>
860 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
863 /// A 128-bit vector of [2 x double]. The lower double-precision value is
864 /// compared to the lower double-precision value of \a __b.
866 /// A 128-bit vector of [2 x double]. The lower double-precision value is
867 /// compared to the lower double-precision value of \a __a.
868 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
869 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
870 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnlt_sd(__m128d __a
,
872 return (__m128d
)__builtin_ia32_cmpnltsd((__v2df
)__a
, (__v2df
)__b
);
875 /// Compares the lower double-precision floating-point values in each of
876 /// the two 128-bit floating-point vectors of [2 x double] to determine if
877 /// the value in the first parameter is not less than or equal to the
878 /// corresponding value in the second parameter.
880 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
882 /// \headerfile <x86intrin.h>
884 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
887 /// A 128-bit vector of [2 x double]. The lower double-precision value is
888 /// compared to the lower double-precision value of \a __b.
890 /// A 128-bit vector of [2 x double]. The lower double-precision value is
891 /// compared to the lower double-precision value of \a __a.
892 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
893 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
894 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnle_sd(__m128d __a
,
896 return (__m128d
)__builtin_ia32_cmpnlesd((__v2df
)__a
, (__v2df
)__b
);
899 /// Compares the lower double-precision floating-point values in each of
900 /// the two 128-bit floating-point vectors of [2 x double] to determine if
901 /// the value in the first parameter is not greater than the corresponding
902 /// value in the second parameter.
904 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
906 /// \headerfile <x86intrin.h>
908 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
911 /// A 128-bit vector of [2 x double]. The lower double-precision value is
912 /// compared to the lower double-precision value of \a __b.
914 /// A 128-bit vector of [2 x double]. The lower double-precision value is
915 /// compared to the lower double-precision value of \a __a.
916 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
917 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
918 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpngt_sd(__m128d __a
,
920 __m128d __c
= __builtin_ia32_cmpnltsd((__v2df
)__b
, (__v2df
)__a
);
921 return __extension__(__m128d
){__c
[0], __a
[1]};
924 /// Compares the lower double-precision floating-point values in each of
925 /// the two 128-bit floating-point vectors of [2 x double] to determine if
926 /// the value in the first parameter is not greater than or equal to the
927 /// corresponding value in the second parameter.
929 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
931 /// \headerfile <x86intrin.h>
933 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
936 /// A 128-bit vector of [2 x double]. The lower double-precision value is
937 /// compared to the lower double-precision value of \a __b.
939 /// A 128-bit vector of [2 x double]. The lower double-precision value is
940 /// compared to the lower double-precision value of \a __a.
941 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
942 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
943 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnge_sd(__m128d __a
,
945 __m128d __c
= __builtin_ia32_cmpnlesd((__v2df
)__b
, (__v2df
)__a
);
946 return __extension__(__m128d
){__c
[0], __a
[1]};
949 /// Compares the lower double-precision floating-point values in each of
950 /// the two 128-bit floating-point vectors of [2 x double] for equality.
952 /// The comparison yields 0 for false, 1 for true. If either of the two
953 /// lower double-precision values is NaN, 0 is returned.
955 /// \headerfile <x86intrin.h>
957 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
960 /// A 128-bit vector of [2 x double]. The lower double-precision value is
961 /// compared to the lower double-precision value of \a __b.
963 /// A 128-bit vector of [2 x double]. The lower double-precision value is
964 /// compared to the lower double-precision value of \a __a.
965 /// \returns An integer containing the comparison results. If either of the two
966 /// lower double-precision values is NaN, 0 is returned.
967 static __inline__
int __DEFAULT_FN_ATTRS
_mm_comieq_sd(__m128d __a
,
969 return __builtin_ia32_comisdeq((__v2df
)__a
, (__v2df
)__b
);
972 /// Compares the lower double-precision floating-point values in each of
973 /// the two 128-bit floating-point vectors of [2 x double] to determine if
974 /// the value in the first parameter is less than the corresponding value in
975 /// the second parameter.
977 /// The comparison yields 0 for false, 1 for true. If either of the two
978 /// lower double-precision values is NaN, 0 is returned.
980 /// \headerfile <x86intrin.h>
982 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
985 /// A 128-bit vector of [2 x double]. The lower double-precision value is
986 /// compared to the lower double-precision value of \a __b.
988 /// A 128-bit vector of [2 x double]. The lower double-precision value is
989 /// compared to the lower double-precision value of \a __a.
990 /// \returns An integer containing the comparison results. If either of the two
991 /// lower double-precision values is NaN, 0 is returned.
992 static __inline__
int __DEFAULT_FN_ATTRS
_mm_comilt_sd(__m128d __a
,
994 return __builtin_ia32_comisdlt((__v2df
)__a
, (__v2df
)__b
);
997 /// Compares the lower double-precision floating-point values in each of
998 /// the two 128-bit floating-point vectors of [2 x double] to determine if
999 /// the value in the first parameter is less than or equal to the
1000 /// corresponding value in the second parameter.
1002 /// The comparison yields 0 for false, 1 for true. If either of the two
1003 /// lower double-precision values is NaN, 0 is returned.
1005 /// \headerfile <x86intrin.h>
1007 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1010 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1011 /// compared to the lower double-precision value of \a __b.
1013 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1014 /// compared to the lower double-precision value of \a __a.
1015 /// \returns An integer containing the comparison results. If either of the two
1016 /// lower double-precision values is NaN, 0 is returned.
1017 static __inline__
int __DEFAULT_FN_ATTRS
_mm_comile_sd(__m128d __a
,
1019 return __builtin_ia32_comisdle((__v2df
)__a
, (__v2df
)__b
);
1022 /// Compares the lower double-precision floating-point values in each of
1023 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1024 /// the value in the first parameter is greater than the corresponding value
1025 /// in the second parameter.
1027 /// The comparison yields 0 for false, 1 for true. If either of the two
1028 /// lower double-precision values is NaN, 0 is returned.
1030 /// \headerfile <x86intrin.h>
1032 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1035 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1036 /// compared to the lower double-precision value of \a __b.
1038 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1039 /// compared to the lower double-precision value of \a __a.
1040 /// \returns An integer containing the comparison results. If either of the two
1041 /// lower double-precision values is NaN, 0 is returned.
1042 static __inline__
int __DEFAULT_FN_ATTRS
_mm_comigt_sd(__m128d __a
,
1044 return __builtin_ia32_comisdgt((__v2df
)__a
, (__v2df
)__b
);
1047 /// Compares the lower double-precision floating-point values in each of
1048 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1049 /// the value in the first parameter is greater than or equal to the
1050 /// corresponding value in the second parameter.
1052 /// The comparison yields 0 for false, 1 for true. If either of the two
1053 /// lower double-precision values is NaN, 0 is returned.
1055 /// \headerfile <x86intrin.h>
1057 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1060 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1061 /// compared to the lower double-precision value of \a __b.
1063 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1064 /// compared to the lower double-precision value of \a __a.
1065 /// \returns An integer containing the comparison results. If either of the two
1066 /// lower double-precision values is NaN, 0 is returned.
1067 static __inline__
int __DEFAULT_FN_ATTRS
_mm_comige_sd(__m128d __a
,
1069 return __builtin_ia32_comisdge((__v2df
)__a
, (__v2df
)__b
);
1072 /// Compares the lower double-precision floating-point values in each of
1073 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1074 /// the value in the first parameter is unequal to the corresponding value in
1075 /// the second parameter.
1077 /// The comparison yields 0 for false, 1 for true. If either of the two
1078 /// lower double-precision values is NaN, 1 is returned.
1080 /// \headerfile <x86intrin.h>
1082 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1085 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1086 /// compared to the lower double-precision value of \a __b.
1088 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1089 /// compared to the lower double-precision value of \a __a.
1090 /// \returns An integer containing the comparison results. If either of the two
1091 /// lower double-precision values is NaN, 1 is returned.
1092 static __inline__
int __DEFAULT_FN_ATTRS
_mm_comineq_sd(__m128d __a
,
1094 return __builtin_ia32_comisdneq((__v2df
)__a
, (__v2df
)__b
);
1097 /// Compares the lower double-precision floating-point values in each of
1098 /// the two 128-bit floating-point vectors of [2 x double] for equality. The
1099 /// comparison yields 0 for false, 1 for true.
1101 /// If either of the two lower double-precision values is NaN, 0 is returned.
1103 /// \headerfile <x86intrin.h>
1105 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1108 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1109 /// compared to the lower double-precision value of \a __b.
1111 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1112 /// compared to the lower double-precision value of \a __a.
1113 /// \returns An integer containing the comparison results. If either of the two
1114 /// lower double-precision values is NaN, 0 is returned.
1115 static __inline__
int __DEFAULT_FN_ATTRS
_mm_ucomieq_sd(__m128d __a
,
1117 return __builtin_ia32_ucomisdeq((__v2df
)__a
, (__v2df
)__b
);
1120 /// Compares the lower double-precision floating-point values in each of
1121 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1122 /// the value in the first parameter is less than the corresponding value in
1123 /// the second parameter.
1125 /// The comparison yields 0 for false, 1 for true. If either of the two lower
1126 /// double-precision values is NaN, 0 is returned.
1128 /// \headerfile <x86intrin.h>
1130 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1133 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1134 /// compared to the lower double-precision value of \a __b.
1136 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1137 /// compared to the lower double-precision value of \a __a.
1138 /// \returns An integer containing the comparison results. If either of the two
1139 /// lower double-precision values is NaN, 0 is returned.
1140 static __inline__
int __DEFAULT_FN_ATTRS
_mm_ucomilt_sd(__m128d __a
,
1142 return __builtin_ia32_ucomisdlt((__v2df
)__a
, (__v2df
)__b
);
1145 /// Compares the lower double-precision floating-point values in each of
1146 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1147 /// the value in the first parameter is less than or equal to the
1148 /// corresponding value in the second parameter.
1150 /// The comparison yields 0 for false, 1 for true. If either of the two lower
1151 /// double-precision values is NaN, 0 is returned.
1153 /// \headerfile <x86intrin.h>
1155 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1158 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1159 /// compared to the lower double-precision value of \a __b.
1161 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1162 /// compared to the lower double-precision value of \a __a.
1163 /// \returns An integer containing the comparison results. If either of the two
1164 /// lower double-precision values is NaN, 0 is returned.
1165 static __inline__
int __DEFAULT_FN_ATTRS
_mm_ucomile_sd(__m128d __a
,
1167 return __builtin_ia32_ucomisdle((__v2df
)__a
, (__v2df
)__b
);
1170 /// Compares the lower double-precision floating-point values in each of
1171 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1172 /// the value in the first parameter is greater than the corresponding value
1173 /// in the second parameter.
1175 /// The comparison yields 0 for false, 1 for true. If either of the two lower
1176 /// double-precision values is NaN, 0 is returned.
1178 /// \headerfile <x86intrin.h>
1180 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1183 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1184 /// compared to the lower double-precision value of \a __b.
1186 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1187 /// compared to the lower double-precision value of \a __a.
1188 /// \returns An integer containing the comparison results. If either of the two
1189 /// lower double-precision values is NaN, 0 is returned.
1190 static __inline__
int __DEFAULT_FN_ATTRS
_mm_ucomigt_sd(__m128d __a
,
1192 return __builtin_ia32_ucomisdgt((__v2df
)__a
, (__v2df
)__b
);
1195 /// Compares the lower double-precision floating-point values in each of
1196 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1197 /// the value in the first parameter is greater than or equal to the
1198 /// corresponding value in the second parameter.
1200 /// The comparison yields 0 for false, 1 for true. If either of the two
1201 /// lower double-precision values is NaN, 0 is returned.
1203 /// \headerfile <x86intrin.h>
1205 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1208 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1209 /// compared to the lower double-precision value of \a __b.
1211 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1212 /// compared to the lower double-precision value of \a __a.
1213 /// \returns An integer containing the comparison results. If either of the two
1214 /// lower double-precision values is NaN, 0 is returned.
1215 static __inline__
int __DEFAULT_FN_ATTRS
_mm_ucomige_sd(__m128d __a
,
1217 return __builtin_ia32_ucomisdge((__v2df
)__a
, (__v2df
)__b
);
1220 /// Compares the lower double-precision floating-point values in each of
1221 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1222 /// the value in the first parameter is unequal to the corresponding value in
1223 /// the second parameter.
1225 /// The comparison yields 0 for false, 1 for true. If either of the two lower
1226 /// double-precision values is NaN, 1 is returned.
1228 /// \headerfile <x86intrin.h>
1230 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1233 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1234 /// compared to the lower double-precision value of \a __b.
1236 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1237 /// compared to the lower double-precision value of \a __a.
1238 /// \returns An integer containing the comparison result. If either of the two
1239 /// lower double-precision values is NaN, 1 is returned.
1240 static __inline__
int __DEFAULT_FN_ATTRS
_mm_ucomineq_sd(__m128d __a
,
1242 return __builtin_ia32_ucomisdneq((__v2df
)__a
, (__v2df
)__b
);
1245 /// Converts the two double-precision floating-point elements of a
1246 /// 128-bit vector of [2 x double] into two single-precision floating-point
1247 /// values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
1248 /// The upper 64 bits of the result vector are set to zero.
1250 /// \headerfile <x86intrin.h>
1252 /// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
1255 /// A 128-bit vector of [2 x double].
1256 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1257 /// converted values. The upper 64 bits are set to zero.
1258 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvtpd_ps(__m128d __a
) {
1259 return __builtin_ia32_cvtpd2ps((__v2df
)__a
);
1262 /// Converts the lower two single-precision floating-point elements of a
1263 /// 128-bit vector of [4 x float] into two double-precision floating-point
1264 /// values, returned in a 128-bit vector of [2 x double]. The upper two
1265 /// elements of the input vector are unused.
1267 /// \headerfile <x86intrin.h>
1269 /// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
1272 /// A 128-bit vector of [4 x float]. The lower two single-precision
1273 /// floating-point elements are converted to double-precision values. The
1274 /// upper two elements are unused.
1275 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1276 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtps_pd(__m128 __a
) {
1277 return (__m128d
) __builtin_convertvector(
1278 __builtin_shufflevector((__v4sf
)__a
, (__v4sf
)__a
, 0, 1), __v2df
);
1281 /// Converts the lower two integer elements of a 128-bit vector of
1282 /// [4 x i32] into two double-precision floating-point values, returned in a
1283 /// 128-bit vector of [2 x double].
1285 /// The upper two elements of the input vector are unused.
1287 /// \headerfile <x86intrin.h>
1289 /// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
1292 /// A 128-bit integer vector of [4 x i32]. The lower two integer elements are
1293 /// converted to double-precision values.
1295 /// The upper two elements are unused.
1296 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1297 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtepi32_pd(__m128i __a
) {
1298 return (__m128d
) __builtin_convertvector(
1299 __builtin_shufflevector((__v4si
)__a
, (__v4si
)__a
, 0, 1), __v2df
);
1302 /// Converts the two double-precision floating-point elements of a
1303 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1304 /// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
1305 /// 64 bits of the result vector are set to zero.
1307 /// \headerfile <x86intrin.h>
1309 /// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
1312 /// A 128-bit vector of [2 x double].
1313 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1314 /// converted values. The upper 64 bits are set to zero.
1315 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtpd_epi32(__m128d __a
) {
1316 return __builtin_ia32_cvtpd2dq((__v2df
)__a
);
1319 /// Converts the low-order element of a 128-bit vector of [2 x double]
1320 /// into a 32-bit signed integer value.
1322 /// \headerfile <x86intrin.h>
1324 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
1327 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1329 /// \returns A 32-bit signed integer containing the converted value.
1330 static __inline__
int __DEFAULT_FN_ATTRS
_mm_cvtsd_si32(__m128d __a
) {
1331 return __builtin_ia32_cvtsd2si((__v2df
)__a
);
1334 /// Converts the lower double-precision floating-point element of a
1335 /// 128-bit vector of [2 x double], in the second parameter, into a
1336 /// single-precision floating-point value, returned in the lower 32 bits of a
1337 /// 128-bit vector of [4 x float]. The upper 96 bits of the result vector are
1338 /// copied from the upper 96 bits of the first parameter.
1340 /// \headerfile <x86intrin.h>
1342 /// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
1345 /// A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
1346 /// copied to the upper 96 bits of the result.
1348 /// A 128-bit vector of [2 x double]. The lower double-precision
1349 /// floating-point element is used in the conversion.
1350 /// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
1351 /// converted value from the second parameter. The upper 96 bits are copied
1352 /// from the upper 96 bits of the first parameter.
1353 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvtsd_ss(__m128 __a
,
1355 return (__m128
)__builtin_ia32_cvtsd2ss((__v4sf
)__a
, (__v2df
)__b
);
1358 /// Converts a 32-bit signed integer value, in the second parameter, into
1359 /// a double-precision floating-point value, returned in the lower 64 bits of
1360 /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1361 /// are copied from the upper 64 bits of the first parameter.
1363 /// \headerfile <x86intrin.h>
1365 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
1368 /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1369 /// copied to the upper 64 bits of the result.
1371 /// A 32-bit signed integer containing the value to be converted.
1372 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1373 /// converted value from the second parameter. The upper 64 bits are copied
1374 /// from the upper 64 bits of the first parameter.
1375 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtsi32_sd(__m128d __a
,
1381 /// Converts the lower single-precision floating-point element of a
1382 /// 128-bit vector of [4 x float], in the second parameter, into a
1383 /// double-precision floating-point value, returned in the lower 64 bits of
1384 /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1385 /// are copied from the upper 64 bits of the first parameter.
1387 /// \headerfile <x86intrin.h>
1389 /// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
1392 /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1393 /// copied to the upper 64 bits of the result.
1395 /// A 128-bit vector of [4 x float]. The lower single-precision
1396 /// floating-point element is used in the conversion.
1397 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1398 /// converted value from the second parameter. The upper 64 bits are copied
1399 /// from the upper 64 bits of the first parameter.
1400 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtss_sd(__m128d __a
,
1406 /// Converts the two double-precision floating-point elements of a
1407 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1408 /// returned in the lower 64 bits of a 128-bit vector of [4 x i32].
1410 /// If the result of either conversion is inexact, the result is truncated
1411 /// (rounded towards zero) regardless of the current MXCSR setting. The upper
1412 /// 64 bits of the result vector are set to zero.
1414 /// \headerfile <x86intrin.h>
1416 /// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
1420 /// A 128-bit vector of [2 x double].
1421 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1422 /// converted values. The upper 64 bits are set to zero.
1423 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvttpd_epi32(__m128d __a
) {
1424 return (__m128i
)__builtin_ia32_cvttpd2dq((__v2df
)__a
);
1427 /// Converts the low-order element of a [2 x double] vector into a 32-bit
1428 /// signed integer value, truncating the result when it is inexact.
1430 /// \headerfile <x86intrin.h>
1432 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
1436 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1438 /// \returns A 32-bit signed integer containing the converted value.
1439 static __inline__
int __DEFAULT_FN_ATTRS
_mm_cvttsd_si32(__m128d __a
) {
1440 return __builtin_ia32_cvttsd2si((__v2df
)__a
);
1443 /// Converts the two double-precision floating-point elements of a
1444 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1445 /// returned in a 64-bit vector of [2 x i32].
1447 /// \headerfile <x86intrin.h>
1449 /// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
1452 /// A 128-bit vector of [2 x double].
1453 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
1454 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_cvtpd_pi32(__m128d __a
) {
1455 return (__m64
)__builtin_ia32_cvtpd2pi((__v2df
)__a
);
1458 /// Converts the two double-precision floating-point elements of a
1459 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1460 /// returned in a 64-bit vector of [2 x i32].
1462 /// If the result of either conversion is inexact, the result is truncated
1463 /// (rounded towards zero) regardless of the current MXCSR setting.
1465 /// \headerfile <x86intrin.h>
1467 /// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
1470 /// A 128-bit vector of [2 x double].
1471 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
1472 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_cvttpd_pi32(__m128d __a
) {
1473 return (__m64
)__builtin_ia32_cvttpd2pi((__v2df
)__a
);
1476 /// Converts the two signed 32-bit integer elements of a 64-bit vector of
1477 /// [2 x i32] into two double-precision floating-point values, returned in a
1478 /// 128-bit vector of [2 x double].
1480 /// \headerfile <x86intrin.h>
1482 /// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
1485 /// A 64-bit vector of [2 x i32].
1486 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1487 static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX
_mm_cvtpi32_pd(__m64 __a
) {
1488 return __builtin_ia32_cvtpi2pd((__v2si
)__a
);
1491 /// Returns the low-order element of a 128-bit vector of [2 x double] as
1492 /// a double-precision floating-point value.
1494 /// \headerfile <x86intrin.h>
1496 /// This intrinsic has no corresponding instruction.
1499 /// A 128-bit vector of [2 x double]. The lower 64 bits are returned.
1500 /// \returns A double-precision floating-point value copied from the lower 64
1502 static __inline__
double __DEFAULT_FN_ATTRS
_mm_cvtsd_f64(__m128d __a
) {
1506 /// Loads a 128-bit floating-point vector of [2 x double] from an aligned
1507 /// memory location.
1509 /// \headerfile <x86intrin.h>
1511 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
1514 /// A pointer to a 128-bit memory location. The address of the memory
1515 /// location has to be 16-byte aligned.
1516 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
1517 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_load_pd(double const *__dp
) {
1518 return *(const __m128d
*)__dp
;
1521 /// Loads a double-precision floating-point value from a specified memory
1522 /// location and duplicates it to both vector elements of a 128-bit vector of
1525 /// \headerfile <x86intrin.h>
1527 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
1530 /// A pointer to a memory location containing a double-precision value.
1531 /// \returns A 128-bit vector of [2 x double] containing the loaded and
1532 /// duplicated values.
1533 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_load1_pd(double const *__dp
) {
1534 struct __mm_load1_pd_struct
{
1536 } __attribute__((__packed__
, __may_alias__
));
1537 double __u
= ((const struct __mm_load1_pd_struct
*)__dp
)->__u
;
1538 return __extension__(__m128d
){__u
, __u
};
1541 #define _mm_load_pd1(dp) _mm_load1_pd(dp)
1543 /// Loads two double-precision values, in reverse order, from an aligned
1544 /// memory location into a 128-bit vector of [2 x double].
1546 /// \headerfile <x86intrin.h>
1548 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
1549 /// needed shuffling instructions. In AVX mode, the shuffling may be combined
1550 /// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
1553 /// A 16-byte aligned pointer to an array of double-precision values to be
1554 /// loaded in reverse order.
1555 /// \returns A 128-bit vector of [2 x double] containing the reversed loaded
1557 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_loadr_pd(double const *__dp
) {
1558 __m128d __u
= *(const __m128d
*)__dp
;
1559 return __builtin_shufflevector((__v2df
)__u
, (__v2df
)__u
, 1, 0);
1562 /// Loads a 128-bit floating-point vector of [2 x double] from an
1563 /// unaligned memory location.
1565 /// \headerfile <x86intrin.h>
1567 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1570 /// A pointer to a 128-bit memory location. The address of the memory
1571 /// location does not have to be aligned.
1572 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
1573 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_loadu_pd(double const *__dp
) {
1576 } __attribute__((__packed__
, __may_alias__
));
1577 return ((const struct __loadu_pd
*)__dp
)->__v
;
1580 /// Loads a 64-bit integer value to the low element of a 128-bit integer
1581 /// vector and clears the upper element.
1583 /// \headerfile <x86intrin.h>
1585 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1588 /// A pointer to a 64-bit memory location. The address of the memory
1589 /// location does not have to be aligned.
1590 /// \returns A 128-bit vector of [2 x i64] containing the loaded value.
1591 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_loadu_si64(void const *__a
) {
1592 struct __loadu_si64
{
1594 } __attribute__((__packed__
, __may_alias__
));
1595 long long __u
= ((const struct __loadu_si64
*)__a
)->__v
;
1596 return __extension__(__m128i
)(__v2di
){__u
, 0LL};
1599 /// Loads a 32-bit integer value to the low element of a 128-bit integer
1600 /// vector and clears the upper element.
1602 /// \headerfile <x86intrin.h>
1604 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
1607 /// A pointer to a 32-bit memory location. The address of the memory
1608 /// location does not have to be aligned.
1609 /// \returns A 128-bit vector of [4 x i32] containing the loaded value.
1610 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_loadu_si32(void const *__a
) {
1611 struct __loadu_si32
{
1613 } __attribute__((__packed__
, __may_alias__
));
1614 int __u
= ((const struct __loadu_si32
*)__a
)->__v
;
1615 return __extension__(__m128i
)(__v4si
){__u
, 0, 0, 0};
1618 /// Loads a 16-bit integer value to the low element of a 128-bit integer
1619 /// vector and clears the upper element.
1621 /// \headerfile <x86intrin.h>
1623 /// This intrinsic does not correspond to a specific instruction.
1626 /// A pointer to a 16-bit memory location. The address of the memory
1627 /// location does not have to be aligned.
1628 /// \returns A 128-bit vector of [8 x i16] containing the loaded value.
1629 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_loadu_si16(void const *__a
) {
1630 struct __loadu_si16
{
1632 } __attribute__((__packed__
, __may_alias__
));
1633 short __u
= ((const struct __loadu_si16
*)__a
)->__v
;
1634 return __extension__(__m128i
)(__v8hi
){__u
, 0, 0, 0, 0, 0, 0, 0};
1637 /// Loads a 64-bit double-precision value to the low element of a
1638 /// 128-bit integer vector and clears the upper element.
1640 /// \headerfile <x86intrin.h>
1642 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1645 /// A pointer to a memory location containing a double-precision value.
1646 /// The address of the memory location does not have to be aligned.
1647 /// \returns A 128-bit vector of [2 x double] containing the loaded value.
1648 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_load_sd(double const *__dp
) {
1649 struct __mm_load_sd_struct
{
1651 } __attribute__((__packed__
, __may_alias__
));
1652 double __u
= ((const struct __mm_load_sd_struct
*)__dp
)->__u
;
1653 return __extension__(__m128d
){__u
, 0};
1656 /// Loads a double-precision value into the high-order bits of a 128-bit
1657 /// vector of [2 x double]. The low-order bits are copied from the low-order
1658 /// bits of the first operand.
1660 /// \headerfile <x86intrin.h>
1662 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1665 /// A 128-bit vector of [2 x double]. \n
1666 /// Bits [63:0] are written to bits [63:0] of the result.
1668 /// A pointer to a 64-bit memory location containing a double-precision
1669 /// floating-point value that is loaded. The loaded value is written to bits
1670 /// [127:64] of the result. The address of the memory location does not have
1672 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1673 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_loadh_pd(__m128d __a
,
1674 double const *__dp
) {
1675 struct __mm_loadh_pd_struct
{
1677 } __attribute__((__packed__
, __may_alias__
));
1678 double __u
= ((const struct __mm_loadh_pd_struct
*)__dp
)->__u
;
1679 return __extension__(__m128d
){__a
[0], __u
};
1682 /// Loads a double-precision value into the low-order bits of a 128-bit
1683 /// vector of [2 x double]. The high-order bits are copied from the
1684 /// high-order bits of the first operand.
1686 /// \headerfile <x86intrin.h>
1688 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1691 /// A 128-bit vector of [2 x double]. \n
1692 /// Bits [127:64] are written to bits [127:64] of the result.
1694 /// A pointer to a 64-bit memory location containing a double-precision
1695 /// floating-point value that is loaded. The loaded value is written to bits
1696 /// [63:0] of the result. The address of the memory location does not have to
1698 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1699 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_loadl_pd(__m128d __a
,
1700 double const *__dp
) {
1701 struct __mm_loadl_pd_struct
{
1703 } __attribute__((__packed__
, __may_alias__
));
1704 double __u
= ((const struct __mm_loadl_pd_struct
*)__dp
)->__u
;
1705 return __extension__(__m128d
){__u
, __a
[1]};
1708 /// Constructs a 128-bit floating-point vector of [2 x double] with
1709 /// unspecified content. This could be used as an argument to another
1710 /// intrinsic function where the argument is required but the value is not
1713 /// \headerfile <x86intrin.h>
1715 /// This intrinsic has no corresponding instruction.
1717 /// \returns A 128-bit floating-point vector of [2 x double] with unspecified
1719 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_undefined_pd(void) {
1720 return (__m128d
)__builtin_ia32_undef128();
1723 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1724 /// 64 bits of the vector are initialized with the specified double-precision
1725 /// floating-point value. The upper 64 bits are set to zero.
1727 /// \headerfile <x86intrin.h>
1729 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1732 /// A double-precision floating-point value used to initialize the lower 64
1733 /// bits of the result.
1734 /// \returns An initialized 128-bit floating-point vector of [2 x double]. The
1735 /// lower 64 bits contain the value of the parameter. The upper 64 bits are
1737 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_set_sd(double __w
) {
1738 return __extension__(__m128d
){__w
, 0};
1741 /// Constructs a 128-bit floating-point vector of [2 x double], with each
1742 /// of the two double-precision floating-point vector elements set to the
1743 /// specified double-precision floating-point value.
1745 /// \headerfile <x86intrin.h>
1747 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1750 /// A double-precision floating-point value used to initialize each vector
1751 /// element of the result.
1752 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1753 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_set1_pd(double __w
) {
1754 return __extension__(__m128d
){__w
, __w
};
1757 /// Constructs a 128-bit floating-point vector of [2 x double], with each
1758 /// of the two double-precision floating-point vector elements set to the
1759 /// specified double-precision floating-point value.
1761 /// \headerfile <x86intrin.h>
1763 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1766 /// A double-precision floating-point value used to initialize each vector
1767 /// element of the result.
1768 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1769 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_set_pd1(double __w
) {
1770 return _mm_set1_pd(__w
);
1773 /// Constructs a 128-bit floating-point vector of [2 x double]
1774 /// initialized with the specified double-precision floating-point values.
1776 /// \headerfile <x86intrin.h>
1778 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1781 /// A double-precision floating-point value used to initialize the upper 64
1782 /// bits of the result.
1784 /// A double-precision floating-point value used to initialize the lower 64
1785 /// bits of the result.
1786 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1787 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_set_pd(double __w
,
1789 return __extension__(__m128d
){__x
, __w
};
1792 /// Constructs a 128-bit floating-point vector of [2 x double],
1793 /// initialized in reverse order with the specified double-precision
1794 /// floating-point values.
1796 /// \headerfile <x86intrin.h>
1798 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1801 /// A double-precision floating-point value used to initialize the lower 64
1802 /// bits of the result.
1804 /// A double-precision floating-point value used to initialize the upper 64
1805 /// bits of the result.
1806 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1807 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_setr_pd(double __w
,
1809 return __extension__(__m128d
){__w
, __x
};
1812 /// Constructs a 128-bit floating-point vector of [2 x double]
1813 /// initialized to zero.
1815 /// \headerfile <x86intrin.h>
1817 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1819 /// \returns An initialized 128-bit floating-point vector of [2 x double] with
1820 /// all elements set to zero.
1821 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_setzero_pd(void) {
1822 return __extension__(__m128d
){0.0, 0.0};
1825 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1826 /// 64 bits are set to the lower 64 bits of the second parameter. The upper
1827 /// 64 bits are set to the upper 64 bits of the first parameter.
1829 /// \headerfile <x86intrin.h>
1831 /// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
1834 /// A 128-bit vector of [2 x double]. The upper 64 bits are written to the
1835 /// upper 64 bits of the result.
1837 /// A 128-bit vector of [2 x double]. The lower 64 bits are written to the
1838 /// lower 64 bits of the result.
1839 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1840 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_move_sd(__m128d __a
,
1846 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1847 /// memory location.
1849 /// \headerfile <x86intrin.h>
1851 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1854 /// A pointer to a 64-bit memory location.
1856 /// A 128-bit vector of [2 x double] containing the value to be stored.
1857 static __inline__
void __DEFAULT_FN_ATTRS
_mm_store_sd(double *__dp
,
1859 struct __mm_store_sd_struct
{
1861 } __attribute__((__packed__
, __may_alias__
));
1862 ((struct __mm_store_sd_struct
*)__dp
)->__u
= __a
[0];
1865 /// Moves packed double-precision values from a 128-bit vector of
1866 /// [2 x double] to a memory location.
1868 /// \headerfile <x86intrin.h>
1870 /// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction.
1873 /// A pointer to an aligned memory location that can store two
1874 /// double-precision values.
1876 /// A packed 128-bit vector of [2 x double] containing the values to be
1878 static __inline__
void __DEFAULT_FN_ATTRS
_mm_store_pd(double *__dp
,
1880 *(__m128d
*)__dp
= __a
;
1883 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1884 /// the upper and lower 64 bits of a memory location.
1886 /// \headerfile <x86intrin.h>
1888 /// This intrinsic corresponds to the
1889 /// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1892 /// A pointer to a memory location that can store two double-precision
1895 /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1896 /// of the values in \a __dp.
1897 static __inline__
void __DEFAULT_FN_ATTRS
_mm_store1_pd(double *__dp
,
1899 __a
= __builtin_shufflevector((__v2df
)__a
, (__v2df
)__a
, 0, 0);
1900 _mm_store_pd(__dp
, __a
);
1903 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1904 /// the upper and lower 64 bits of a memory location.
1906 /// \headerfile <x86intrin.h>
1908 /// This intrinsic corresponds to the
1909 /// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1912 /// A pointer to a memory location that can store two double-precision
1915 /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1916 /// of the values in \a __dp.
1917 static __inline__
void __DEFAULT_FN_ATTRS
_mm_store_pd1(double *__dp
,
1919 _mm_store1_pd(__dp
, __a
);
1922 /// Stores a 128-bit vector of [2 x double] into an unaligned memory
1925 /// \headerfile <x86intrin.h>
1927 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1930 /// A pointer to a 128-bit memory location. The address of the memory
1931 /// location does not have to be aligned.
1933 /// A 128-bit vector of [2 x double] containing the values to be stored.
1934 static __inline__
void __DEFAULT_FN_ATTRS
_mm_storeu_pd(double *__dp
,
1936 struct __storeu_pd
{
1938 } __attribute__((__packed__
, __may_alias__
));
1939 ((struct __storeu_pd
*)__dp
)->__v
= __a
;
1942 /// Stores two double-precision values, in reverse order, from a 128-bit
1943 /// vector of [2 x double] to a 16-byte aligned memory location.
1945 /// \headerfile <x86intrin.h>
1947 /// This intrinsic corresponds to a shuffling instruction followed by a
1948 /// <c> VMOVAPD / MOVAPD </c> instruction.
1951 /// A pointer to a 16-byte aligned memory location that can store two
1952 /// double-precision values.
1954 /// A 128-bit vector of [2 x double] containing the values to be reversed and
1956 static __inline__
void __DEFAULT_FN_ATTRS
_mm_storer_pd(double *__dp
,
1958 __a
= __builtin_shufflevector((__v2df
)__a
, (__v2df
)__a
, 1, 0);
1959 *(__m128d
*)__dp
= __a
;
1962 /// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
1963 /// memory location.
1965 /// \headerfile <x86intrin.h>
1967 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1970 /// A pointer to a 64-bit memory location.
1972 /// A 128-bit vector of [2 x double] containing the value to be stored.
1973 static __inline__
void __DEFAULT_FN_ATTRS
_mm_storeh_pd(double *__dp
,
1975 struct __mm_storeh_pd_struct
{
1977 } __attribute__((__packed__
, __may_alias__
));
1978 ((struct __mm_storeh_pd_struct
*)__dp
)->__u
= __a
[1];
1981 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1982 /// memory location.
1984 /// \headerfile <x86intrin.h>
1986 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1989 /// A pointer to a 64-bit memory location.
1991 /// A 128-bit vector of [2 x double] containing the value to be stored.
1992 static __inline__
void __DEFAULT_FN_ATTRS
_mm_storel_pd(double *__dp
,
1994 struct __mm_storeh_pd_struct
{
1996 } __attribute__((__packed__
, __may_alias__
));
1997 ((struct __mm_storeh_pd_struct
*)__dp
)->__u
= __a
[0];
2000 /// Adds the corresponding elements of two 128-bit vectors of [16 x i8],
2001 /// saving the lower 8 bits of each sum in the corresponding element of a
2002 /// 128-bit result vector of [16 x i8].
2004 /// The integer elements of both parameters can be either signed or unsigned.
2006 /// \headerfile <x86intrin.h>
2008 /// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
2011 /// A 128-bit vector of [16 x i8].
2013 /// A 128-bit vector of [16 x i8].
2014 /// \returns A 128-bit vector of [16 x i8] containing the sums of both
2016 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_add_epi8(__m128i __a
,
2018 return (__m128i
)((__v16qu
)__a
+ (__v16qu
)__b
);
2021 /// Adds the corresponding elements of two 128-bit vectors of [8 x i16],
2022 /// saving the lower 16 bits of each sum in the corresponding element of a
2023 /// 128-bit result vector of [8 x i16].
2025 /// The integer elements of both parameters can be either signed or unsigned.
2027 /// \headerfile <x86intrin.h>
2029 /// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
2032 /// A 128-bit vector of [8 x i16].
2034 /// A 128-bit vector of [8 x i16].
2035 /// \returns A 128-bit vector of [8 x i16] containing the sums of both
2037 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_add_epi16(__m128i __a
,
2039 return (__m128i
)((__v8hu
)__a
+ (__v8hu
)__b
);
2042 /// Adds the corresponding elements of two 128-bit vectors of [4 x i32],
2043 /// saving the lower 32 bits of each sum in the corresponding element of a
2044 /// 128-bit result vector of [4 x i32].
2046 /// The integer elements of both parameters can be either signed or unsigned.
2048 /// \headerfile <x86intrin.h>
2050 /// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
2053 /// A 128-bit vector of [4 x i32].
2055 /// A 128-bit vector of [4 x i32].
2056 /// \returns A 128-bit vector of [4 x i32] containing the sums of both
2058 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_add_epi32(__m128i __a
,
2060 return (__m128i
)((__v4su
)__a
+ (__v4su
)__b
);
2063 /// Adds two signed or unsigned 64-bit integer values, returning the
2064 /// lower 64 bits of the sum.
2066 /// \headerfile <x86intrin.h>
2068 /// This intrinsic corresponds to the <c> PADDQ </c> instruction.
2071 /// A 64-bit integer.
2073 /// A 64-bit integer.
2074 /// \returns A 64-bit integer containing the sum of both parameters.
2075 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_add_si64(__m64 __a
,
2077 return (__m64
)__builtin_ia32_paddq((__v1di
)__a
, (__v1di
)__b
);
2080 /// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
2081 /// saving the lower 64 bits of each sum in the corresponding element of a
2082 /// 128-bit result vector of [2 x i64].
2084 /// The integer elements of both parameters can be either signed or unsigned.
2086 /// \headerfile <x86intrin.h>
2088 /// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
2091 /// A 128-bit vector of [2 x i64].
2093 /// A 128-bit vector of [2 x i64].
2094 /// \returns A 128-bit vector of [2 x i64] containing the sums of both
2096 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_add_epi64(__m128i __a
,
2098 return (__m128i
)((__v2du
)__a
+ (__v2du
)__b
);
2101 /// Adds, with saturation, the corresponding elements of two 128-bit
2102 /// signed [16 x i8] vectors, saving each sum in the corresponding element of
2103 /// a 128-bit result vector of [16 x i8]. Positive sums greater than 0x7F are
2104 /// saturated to 0x7F. Negative sums less than 0x80 are saturated to 0x80.
2106 /// \headerfile <x86intrin.h>
2108 /// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
2111 /// A 128-bit signed [16 x i8] vector.
2113 /// A 128-bit signed [16 x i8] vector.
2114 /// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
2115 /// both parameters.
2116 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epi8(__m128i __a
,
2118 return (__m128i
)__builtin_elementwise_add_sat((__v16qs
)__a
, (__v16qs
)__b
);
2121 /// Adds, with saturation, the corresponding elements of two 128-bit
2122 /// signed [8 x i16] vectors, saving each sum in the corresponding element of
2123 /// a 128-bit result vector of [8 x i16]. Positive sums greater than 0x7FFF
2124 /// are saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
2127 /// \headerfile <x86intrin.h>
2129 /// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
2132 /// A 128-bit signed [8 x i16] vector.
2134 /// A 128-bit signed [8 x i16] vector.
2135 /// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
2136 /// both parameters.
2137 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epi16(__m128i __a
,
2139 return (__m128i
)__builtin_elementwise_add_sat((__v8hi
)__a
, (__v8hi
)__b
);
2142 /// Adds, with saturation, the corresponding elements of two 128-bit
2143 /// unsigned [16 x i8] vectors, saving each sum in the corresponding element
2144 /// of a 128-bit result vector of [16 x i8]. Positive sums greater than 0xFF
2145 /// are saturated to 0xFF. Negative sums are saturated to 0x00.
2147 /// \headerfile <x86intrin.h>
2149 /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2152 /// A 128-bit unsigned [16 x i8] vector.
2154 /// A 128-bit unsigned [16 x i8] vector.
2155 /// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
2156 /// of both parameters.
2157 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epu8(__m128i __a
,
2159 return (__m128i
)__builtin_elementwise_add_sat((__v16qu
)__a
, (__v16qu
)__b
);
2162 /// Adds, with saturation, the corresponding elements of two 128-bit
2163 /// unsigned [8 x i16] vectors, saving each sum in the corresponding element
2164 /// of a 128-bit result vector of [8 x i16]. Positive sums greater than
2165 /// 0xFFFF are saturated to 0xFFFF. Negative sums are saturated to 0x0000.
2167 /// \headerfile <x86intrin.h>
2169 /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2172 /// A 128-bit unsigned [8 x i16] vector.
2174 /// A 128-bit unsigned [8 x i16] vector.
2175 /// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
2176 /// of both parameters.
2177 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epu16(__m128i __a
,
2179 return (__m128i
)__builtin_elementwise_add_sat((__v8hu
)__a
, (__v8hu
)__b
);
2182 /// Computes the rounded averages of corresponding elements of two
2183 /// 128-bit unsigned [16 x i8] vectors, saving each result in the
2184 /// corresponding element of a 128-bit result vector of [16 x i8].
2186 /// \headerfile <x86intrin.h>
2188 /// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
2191 /// A 128-bit unsigned [16 x i8] vector.
2193 /// A 128-bit unsigned [16 x i8] vector.
2194 /// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
2195 /// averages of both parameters.
2196 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_avg_epu8(__m128i __a
,
2198 return (__m128i
)__builtin_ia32_pavgb128((__v16qi
)__a
, (__v16qi
)__b
);
2201 /// Computes the rounded averages of corresponding elements of two
2202 /// 128-bit unsigned [8 x i16] vectors, saving each result in the
2203 /// corresponding element of a 128-bit result vector of [8 x i16].
2205 /// \headerfile <x86intrin.h>
2207 /// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
2210 /// A 128-bit unsigned [8 x i16] vector.
2212 /// A 128-bit unsigned [8 x i16] vector.
2213 /// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
2214 /// averages of both parameters.
2215 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_avg_epu16(__m128i __a
,
2217 return (__m128i
)__builtin_ia32_pavgw128((__v8hi
)__a
, (__v8hi
)__b
);
2220 /// Multiplies the corresponding elements of two 128-bit signed [8 x i16]
2221 /// vectors, producing eight intermediate 32-bit signed integer products, and
2222 /// adds the consecutive pairs of 32-bit products to form a 128-bit signed
2223 /// [4 x i32] vector.
2225 /// For example, bits [15:0] of both parameters are multiplied producing a
2226 /// 32-bit product, bits [31:16] of both parameters are multiplied producing
2227 /// a 32-bit product, and the sum of those two products becomes bits [31:0]
2230 /// \headerfile <x86intrin.h>
2232 /// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
2235 /// A 128-bit signed [8 x i16] vector.
2237 /// A 128-bit signed [8 x i16] vector.
2238 /// \returns A 128-bit signed [4 x i32] vector containing the sums of products
2239 /// of both parameters.
2240 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_madd_epi16(__m128i __a
,
2242 return (__m128i
)__builtin_ia32_pmaddwd128((__v8hi
)__a
, (__v8hi
)__b
);
2245 /// Compares corresponding elements of two 128-bit signed [8 x i16]
2246 /// vectors, saving the greater value from each comparison in the
2247 /// corresponding element of a 128-bit result vector of [8 x i16].
2249 /// \headerfile <x86intrin.h>
2251 /// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
2254 /// A 128-bit signed [8 x i16] vector.
2256 /// A 128-bit signed [8 x i16] vector.
2257 /// \returns A 128-bit signed [8 x i16] vector containing the greater value of
2258 /// each comparison.
2259 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epi16(__m128i __a
,
2261 return (__m128i
)__builtin_elementwise_max((__v8hi
)__a
, (__v8hi
)__b
);
2264 /// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2265 /// vectors, saving the greater value from each comparison in the
2266 /// corresponding element of a 128-bit result vector of [16 x i8].
2268 /// \headerfile <x86intrin.h>
2270 /// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
2273 /// A 128-bit unsigned [16 x i8] vector.
2275 /// A 128-bit unsigned [16 x i8] vector.
2276 /// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
2277 /// each comparison.
2278 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epu8(__m128i __a
,
2280 return (__m128i
)__builtin_elementwise_max((__v16qu
)__a
, (__v16qu
)__b
);
2283 /// Compares corresponding elements of two 128-bit signed [8 x i16]
2284 /// vectors, saving the smaller value from each comparison in the
2285 /// corresponding element of a 128-bit result vector of [8 x i16].
2287 /// \headerfile <x86intrin.h>
2289 /// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
2292 /// A 128-bit signed [8 x i16] vector.
2294 /// A 128-bit signed [8 x i16] vector.
2295 /// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
2296 /// each comparison.
2297 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epi16(__m128i __a
,
2299 return (__m128i
)__builtin_elementwise_min((__v8hi
)__a
, (__v8hi
)__b
);
2302 /// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2303 /// vectors, saving the smaller value from each comparison in the
2304 /// corresponding element of a 128-bit result vector of [16 x i8].
2306 /// \headerfile <x86intrin.h>
2308 /// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
2311 /// A 128-bit unsigned [16 x i8] vector.
2313 /// A 128-bit unsigned [16 x i8] vector.
2314 /// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
2315 /// each comparison.
2316 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epu8(__m128i __a
,
2318 return (__m128i
)__builtin_elementwise_min((__v16qu
)__a
, (__v16qu
)__b
);
2321 /// Multiplies the corresponding elements of two signed [8 x i16]
2322 /// vectors, saving the upper 16 bits of each 32-bit product in the
2323 /// corresponding element of a 128-bit signed [8 x i16] result vector.
2325 /// \headerfile <x86intrin.h>
2327 /// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
2330 /// A 128-bit signed [8 x i16] vector.
2332 /// A 128-bit signed [8 x i16] vector.
2333 /// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
2334 /// each of the eight 32-bit products.
2335 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mulhi_epi16(__m128i __a
,
2337 return (__m128i
)__builtin_ia32_pmulhw128((__v8hi
)__a
, (__v8hi
)__b
);
2340 /// Multiplies the corresponding elements of two unsigned [8 x i16]
2341 /// vectors, saving the upper 16 bits of each 32-bit product in the
2342 /// corresponding element of a 128-bit unsigned [8 x i16] result vector.
2344 /// \headerfile <x86intrin.h>
2346 /// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
2349 /// A 128-bit unsigned [8 x i16] vector.
2351 /// A 128-bit unsigned [8 x i16] vector.
2352 /// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
2353 /// of each of the eight 32-bit products.
2354 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mulhi_epu16(__m128i __a
,
2356 return (__m128i
)__builtin_ia32_pmulhuw128((__v8hi
)__a
, (__v8hi
)__b
);
2359 /// Multiplies the corresponding elements of two signed [8 x i16]
2360 /// vectors, saving the lower 16 bits of each 32-bit product in the
2361 /// corresponding element of a 128-bit signed [8 x i16] result vector.
2363 /// \headerfile <x86intrin.h>
2365 /// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
2368 /// A 128-bit signed [8 x i16] vector.
2370 /// A 128-bit signed [8 x i16] vector.
2371 /// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
2372 /// each of the eight 32-bit products.
2373 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mullo_epi16(__m128i __a
,
2375 return (__m128i
)((__v8hu
)__a
* (__v8hu
)__b
);
2378 /// Multiplies 32-bit unsigned integer values contained in the lower bits
2379 /// of the two 64-bit integer vectors and returns the 64-bit unsigned
2382 /// \headerfile <x86intrin.h>
2384 /// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
2387 /// A 64-bit integer containing one of the source operands.
2389 /// A 64-bit integer containing one of the source operands.
2390 /// \returns A 64-bit integer vector containing the product of both operands.
2391 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_mul_su32(__m64 __a
,
2393 return __builtin_ia32_pmuludq((__v2si
)__a
, (__v2si
)__b
);
2396 /// Multiplies 32-bit unsigned integer values contained in the lower
2397 /// bits of the corresponding elements of two [2 x i64] vectors, and returns
2398 /// the 64-bit products in the corresponding elements of a [2 x i64] vector.
2400 /// \headerfile <x86intrin.h>
2402 /// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
2405 /// A [2 x i64] vector containing one of the source operands.
2407 /// A [2 x i64] vector containing one of the source operands.
2408 /// \returns A [2 x i64] vector containing the product of both operands.
2409 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mul_epu32(__m128i __a
,
2411 return __builtin_ia32_pmuludq128((__v4si
)__a
, (__v4si
)__b
);
2414 /// Computes the absolute differences of corresponding 8-bit integer
2415 /// values in two 128-bit vectors. Sums the first 8 absolute differences, and
2416 /// separately sums the second 8 absolute differences. Packs these two
2417 /// unsigned 16-bit integer sums into the upper and lower elements of a
2418 /// [2 x i64] vector.
2420 /// \headerfile <x86intrin.h>
2422 /// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
2425 /// A 128-bit integer vector containing one of the source operands.
2427 /// A 128-bit integer vector containing one of the source operands.
2428 /// \returns A [2 x i64] vector containing the sums of the sets of absolute
2429 /// differences between both operands.
2430 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sad_epu8(__m128i __a
,
2432 return __builtin_ia32_psadbw128((__v16qi
)__a
, (__v16qi
)__b
);
2435 /// Subtracts the corresponding 8-bit integer values in the operands.
2437 /// \headerfile <x86intrin.h>
2439 /// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
2442 /// A 128-bit integer vector containing the minuends.
2444 /// A 128-bit integer vector containing the subtrahends.
2445 /// \returns A 128-bit integer vector containing the differences of the values
2446 /// in the operands.
2447 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sub_epi8(__m128i __a
,
2449 return (__m128i
)((__v16qu
)__a
- (__v16qu
)__b
);
2452 /// Subtracts the corresponding 16-bit integer values in the operands.
2454 /// \headerfile <x86intrin.h>
2456 /// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
2459 /// A 128-bit integer vector containing the minuends.
2461 /// A 128-bit integer vector containing the subtrahends.
2462 /// \returns A 128-bit integer vector containing the differences of the values
2463 /// in the operands.
2464 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sub_epi16(__m128i __a
,
2466 return (__m128i
)((__v8hu
)__a
- (__v8hu
)__b
);
2469 /// Subtracts the corresponding 32-bit integer values in the operands.
2471 /// \headerfile <x86intrin.h>
2473 /// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
2476 /// A 128-bit integer vector containing the minuends.
2478 /// A 128-bit integer vector containing the subtrahends.
2479 /// \returns A 128-bit integer vector containing the differences of the values
2480 /// in the operands.
2481 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sub_epi32(__m128i __a
,
2483 return (__m128i
)((__v4su
)__a
- (__v4su
)__b
);
2486 /// Subtracts signed or unsigned 64-bit integer values and writes the
2487 /// difference to the corresponding bits in the destination.
2489 /// \headerfile <x86intrin.h>
2491 /// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
2494 /// A 64-bit integer vector containing the minuend.
2496 /// A 64-bit integer vector containing the subtrahend.
2497 /// \returns A 64-bit integer vector containing the difference of the values in
2499 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_sub_si64(__m64 __a
,
2501 return (__m64
)__builtin_ia32_psubq((__v1di
)__a
, (__v1di
)__b
);
2504 /// Subtracts the corresponding elements of two [2 x i64] vectors.
2506 /// \headerfile <x86intrin.h>
2508 /// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
2511 /// A 128-bit integer vector containing the minuends.
2513 /// A 128-bit integer vector containing the subtrahends.
2514 /// \returns A 128-bit integer vector containing the differences of the values
2515 /// in the operands.
2516 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sub_epi64(__m128i __a
,
2518 return (__m128i
)((__v2du
)__a
- (__v2du
)__b
);
2521 /// Subtracts corresponding 8-bit signed integer values in the input and
2522 /// returns the differences in the corresponding bytes in the destination.
2523 /// Differences greater than 0x7F are saturated to 0x7F, and differences less
2524 /// than 0x80 are saturated to 0x80.
2526 /// \headerfile <x86intrin.h>
2528 /// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
2531 /// A 128-bit integer vector containing the minuends.
2533 /// A 128-bit integer vector containing the subtrahends.
2534 /// \returns A 128-bit integer vector containing the differences of the values
2535 /// in the operands.
2536 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_subs_epi8(__m128i __a
,
2538 return (__m128i
)__builtin_elementwise_sub_sat((__v16qs
)__a
, (__v16qs
)__b
);
2541 /// Subtracts corresponding 16-bit signed integer values in the input and
2542 /// returns the differences in the corresponding bytes in the destination.
2543 /// Differences greater than 0x7FFF are saturated to 0x7FFF, and values less
2544 /// than 0x8000 are saturated to 0x8000.
2546 /// \headerfile <x86intrin.h>
2548 /// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
2551 /// A 128-bit integer vector containing the minuends.
2553 /// A 128-bit integer vector containing the subtrahends.
2554 /// \returns A 128-bit integer vector containing the differences of the values
2555 /// in the operands.
2556 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_subs_epi16(__m128i __a
,
2558 return (__m128i
)__builtin_elementwise_sub_sat((__v8hi
)__a
, (__v8hi
)__b
);
2561 /// Subtracts corresponding 8-bit unsigned integer values in the input
2562 /// and returns the differences in the corresponding bytes in the
2563 /// destination. Differences less than 0x00 are saturated to 0x00.
2565 /// \headerfile <x86intrin.h>
2567 /// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
2570 /// A 128-bit integer vector containing the minuends.
2572 /// A 128-bit integer vector containing the subtrahends.
2573 /// \returns A 128-bit integer vector containing the unsigned integer
2574 /// differences of the values in the operands.
2575 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_subs_epu8(__m128i __a
,
2577 return (__m128i
)__builtin_elementwise_sub_sat((__v16qu
)__a
, (__v16qu
)__b
);
2580 /// Subtracts corresponding 16-bit unsigned integer values in the input
2581 /// and returns the differences in the corresponding bytes in the
2582 /// destination. Differences less than 0x0000 are saturated to 0x0000.
2584 /// \headerfile <x86intrin.h>
2586 /// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
2589 /// A 128-bit integer vector containing the minuends.
2591 /// A 128-bit integer vector containing the subtrahends.
2592 /// \returns A 128-bit integer vector containing the unsigned integer
2593 /// differences of the values in the operands.
2594 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_subs_epu16(__m128i __a
,
2596 return (__m128i
)__builtin_elementwise_sub_sat((__v8hu
)__a
, (__v8hu
)__b
);
2599 /// Performs a bitwise AND of two 128-bit integer vectors.
2601 /// \headerfile <x86intrin.h>
2603 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
2606 /// A 128-bit integer vector containing one of the source operands.
2608 /// A 128-bit integer vector containing one of the source operands.
2609 /// \returns A 128-bit integer vector containing the bitwise AND of the values
2610 /// in both operands.
2611 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_and_si128(__m128i __a
,
2613 return (__m128i
)((__v2du
)__a
& (__v2du
)__b
);
2616 /// Performs a bitwise AND of two 128-bit integer vectors, using the
2617 /// one's complement of the values contained in the first source operand.
2619 /// \headerfile <x86intrin.h>
2621 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
2624 /// A 128-bit vector containing the left source operand. The one's complement
2625 /// of this value is used in the bitwise AND.
2627 /// A 128-bit vector containing the right source operand.
2628 /// \returns A 128-bit integer vector containing the bitwise AND of the one's
2629 /// complement of the first operand and the values in the second operand.
2630 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_andnot_si128(__m128i __a
,
2632 return (__m128i
)(~(__v2du
)__a
& (__v2du
)__b
);
2634 /// Performs a bitwise OR of two 128-bit integer vectors.
2636 /// \headerfile <x86intrin.h>
2638 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
2641 /// A 128-bit integer vector containing one of the source operands.
2643 /// A 128-bit integer vector containing one of the source operands.
2644 /// \returns A 128-bit integer vector containing the bitwise OR of the values
2645 /// in both operands.
2646 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_or_si128(__m128i __a
,
2648 return (__m128i
)((__v2du
)__a
| (__v2du
)__b
);
2651 /// Performs a bitwise exclusive OR of two 128-bit integer vectors.
2653 /// \headerfile <x86intrin.h>
2655 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
2658 /// A 128-bit integer vector containing one of the source operands.
2660 /// A 128-bit integer vector containing one of the source operands.
2661 /// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
2662 /// values in both operands.
2663 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_xor_si128(__m128i __a
,
2665 return (__m128i
)((__v2du
)__a
^ (__v2du
)__b
);
/// Left-shifts the 128-bit integer vector operand by the specified
///    number of bytes. Low-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_slli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to left-shift operand
///    \a a.
/// \returns A 128-bit integer vector containing the left-shifted value.
#define _mm_slli_si128(a, imm)                                                 \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a),          \
                                                (int)(imm)))
/* Byte-wise left shift of a 128-bit integer vector by \a imm bytes; expands
   to the same builtin call as _mm_slli_si128. */
#define _mm_bslli_si128(a, imm)                                                \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a),          \
                                                (int)(imm)))
2693 /// Left-shifts each 16-bit value in the 128-bit integer vector operand
2694 /// by the specified number of bits. Low-order bits are cleared.
2696 /// \headerfile <x86intrin.h>
2698 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2701 /// A 128-bit integer vector containing the source operand.
2703 /// An integer value specifying the number of bits to left-shift each value
2704 /// in operand \a __a.
2705 /// \returns A 128-bit integer vector containing the left-shifted values.
2706 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_slli_epi16(__m128i __a
,
2708 return (__m128i
)__builtin_ia32_psllwi128((__v8hi
)__a
, __count
);
2711 /// Left-shifts each 16-bit value in the 128-bit integer vector operand
2712 /// by the specified number of bits. Low-order bits are cleared.
2714 /// \headerfile <x86intrin.h>
2716 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2719 /// A 128-bit integer vector containing the source operand.
2721 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2722 /// to left-shift each value in operand \a __a.
2723 /// \returns A 128-bit integer vector containing the left-shifted values.
2724 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sll_epi16(__m128i __a
,
2726 return (__m128i
)__builtin_ia32_psllw128((__v8hi
)__a
, (__v8hi
)__count
);
2729 /// Left-shifts each 32-bit value in the 128-bit integer vector operand
2730 /// by the specified number of bits. Low-order bits are cleared.
2732 /// \headerfile <x86intrin.h>
2734 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2737 /// A 128-bit integer vector containing the source operand.
2739 /// An integer value specifying the number of bits to left-shift each value
2740 /// in operand \a __a.
2741 /// \returns A 128-bit integer vector containing the left-shifted values.
2742 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_slli_epi32(__m128i __a
,
2744 return (__m128i
)__builtin_ia32_pslldi128((__v4si
)__a
, __count
);
2747 /// Left-shifts each 32-bit value in the 128-bit integer vector operand
2748 /// by the specified number of bits. Low-order bits are cleared.
2750 /// \headerfile <x86intrin.h>
2752 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2755 /// A 128-bit integer vector containing the source operand.
2757 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2758 /// to left-shift each value in operand \a __a.
2759 /// \returns A 128-bit integer vector containing the left-shifted values.
2760 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sll_epi32(__m128i __a
,
2762 return (__m128i
)__builtin_ia32_pslld128((__v4si
)__a
, (__v4si
)__count
);
2765 /// Left-shifts each 64-bit value in the 128-bit integer vector operand
2766 /// by the specified number of bits. Low-order bits are cleared.
2768 /// \headerfile <x86intrin.h>
2770 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2773 /// A 128-bit integer vector containing the source operand.
2775 /// An integer value specifying the number of bits to left-shift each value
2776 /// in operand \a __a.
2777 /// \returns A 128-bit integer vector containing the left-shifted values.
2778 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_slli_epi64(__m128i __a
,
2780 return __builtin_ia32_psllqi128((__v2di
)__a
, __count
);
2783 /// Left-shifts each 64-bit value in the 128-bit integer vector operand
2784 /// by the specified number of bits. Low-order bits are cleared.
2786 /// \headerfile <x86intrin.h>
2788 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2791 /// A 128-bit integer vector containing the source operand.
2793 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2794 /// to left-shift each value in operand \a __a.
2795 /// \returns A 128-bit integer vector containing the left-shifted values.
2796 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sll_epi64(__m128i __a
,
2798 return __builtin_ia32_psllq128((__v2di
)__a
, (__v2di
)__count
);
2801 /// Right-shifts each 16-bit value in the 128-bit integer vector operand
2802 /// by the specified number of bits. High-order bits are filled with the sign
2803 /// bit of the initial value.
2805 /// \headerfile <x86intrin.h>
2807 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2810 /// A 128-bit integer vector containing the source operand.
2812 /// An integer value specifying the number of bits to right-shift each value
2813 /// in operand \a __a.
2814 /// \returns A 128-bit integer vector containing the right-shifted values.
2815 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srai_epi16(__m128i __a
,
2817 return (__m128i
)__builtin_ia32_psrawi128((__v8hi
)__a
, __count
);
2820 /// Right-shifts each 16-bit value in the 128-bit integer vector operand
2821 /// by the specified number of bits. High-order bits are filled with the sign
2822 /// bit of the initial value.
2824 /// \headerfile <x86intrin.h>
2826 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2829 /// A 128-bit integer vector containing the source operand.
2831 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2832 /// to right-shift each value in operand \a __a.
2833 /// \returns A 128-bit integer vector containing the right-shifted values.
2834 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sra_epi16(__m128i __a
,
2836 return (__m128i
)__builtin_ia32_psraw128((__v8hi
)__a
, (__v8hi
)__count
);
2839 /// Right-shifts each 32-bit value in the 128-bit integer vector operand
2840 /// by the specified number of bits. High-order bits are filled with the sign
2841 /// bit of the initial value.
2843 /// \headerfile <x86intrin.h>
2845 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2848 /// A 128-bit integer vector containing the source operand.
2850 /// An integer value specifying the number of bits to right-shift each value
2851 /// in operand \a __a.
2852 /// \returns A 128-bit integer vector containing the right-shifted values.
2853 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srai_epi32(__m128i __a
,
2855 return (__m128i
)__builtin_ia32_psradi128((__v4si
)__a
, __count
);
2858 /// Right-shifts each 32-bit value in the 128-bit integer vector operand
2859 /// by the specified number of bits. High-order bits are filled with the sign
2860 /// bit of the initial value.
2862 /// \headerfile <x86intrin.h>
2864 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2867 /// A 128-bit integer vector containing the source operand.
2869 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2870 /// to right-shift each value in operand \a __a.
2871 /// \returns A 128-bit integer vector containing the right-shifted values.
2872 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sra_epi32(__m128i __a
,
2874 return (__m128i
)__builtin_ia32_psrad128((__v4si
)__a
, (__v4si
)__count
);
/// Right-shifts the 128-bit integer vector operand by the specified
///    number of bytes. High-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_srli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to right-shift operand
///    \a a.
/// \returns A 128-bit integer vector containing the right-shifted value.
#define _mm_srli_si128(a, imm)                                                 \
  ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a),          \
                                                (int)(imm)))
/* Byte-wise right shift of a 128-bit integer vector by \a imm bytes; expands
   to the same builtin call as _mm_srli_si128. */
#define _mm_bsrli_si128(a, imm)                                                \
  ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a),          \
                                                (int)(imm)))
2902 /// Right-shifts each of 16-bit values in the 128-bit integer vector
2903 /// operand by the specified number of bits. High-order bits are cleared.
2905 /// \headerfile <x86intrin.h>
2907 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2910 /// A 128-bit integer vector containing the source operand.
2912 /// An integer value specifying the number of bits to right-shift each value
2913 /// in operand \a __a.
2914 /// \returns A 128-bit integer vector containing the right-shifted values.
2915 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srli_epi16(__m128i __a
,
2917 return (__m128i
)__builtin_ia32_psrlwi128((__v8hi
)__a
, __count
);
2920 /// Right-shifts each of 16-bit values in the 128-bit integer vector
2921 /// operand by the specified number of bits. High-order bits are cleared.
2923 /// \headerfile <x86intrin.h>
2925 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2928 /// A 128-bit integer vector containing the source operand.
2930 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2931 /// to right-shift each value in operand \a __a.
2932 /// \returns A 128-bit integer vector containing the right-shifted values.
2933 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srl_epi16(__m128i __a
,
2935 return (__m128i
)__builtin_ia32_psrlw128((__v8hi
)__a
, (__v8hi
)__count
);
2938 /// Right-shifts each of 32-bit values in the 128-bit integer vector
2939 /// operand by the specified number of bits. High-order bits are cleared.
2941 /// \headerfile <x86intrin.h>
2943 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
2946 /// A 128-bit integer vector containing the source operand.
2948 /// An integer value specifying the number of bits to right-shift each value
2949 /// in operand \a __a.
2950 /// \returns A 128-bit integer vector containing the right-shifted values.
2951 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srli_epi32(__m128i __a
,
2953 return (__m128i
)__builtin_ia32_psrldi128((__v4si
)__a
, __count
);
2956 /// Right-shifts each of 32-bit values in the 128-bit integer vector
2957 /// operand by the specified number of bits. High-order bits are cleared.
2959 /// \headerfile <x86intrin.h>
2961 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
2964 /// A 128-bit integer vector containing the source operand.
2966 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2967 /// to right-shift each value in operand \a __a.
2968 /// \returns A 128-bit integer vector containing the right-shifted values.
2969 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srl_epi32(__m128i __a
,
2971 return (__m128i
)__builtin_ia32_psrld128((__v4si
)__a
, (__v4si
)__count
);
2974 /// Right-shifts each of 64-bit values in the 128-bit integer vector
2975 /// operand by the specified number of bits. High-order bits are cleared.
2977 /// \headerfile <x86intrin.h>
2979 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
2982 /// A 128-bit integer vector containing the source operand.
2984 /// An integer value specifying the number of bits to right-shift each value
2985 /// in operand \a __a.
2986 /// \returns A 128-bit integer vector containing the right-shifted values.
2987 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srli_epi64(__m128i __a
,
2989 return __builtin_ia32_psrlqi128((__v2di
)__a
, __count
);
2992 /// Right-shifts each of 64-bit values in the 128-bit integer vector
2993 /// operand by the specified number of bits. High-order bits are cleared.
2995 /// \headerfile <x86intrin.h>
2997 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
3000 /// A 128-bit integer vector containing the source operand.
3002 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
3003 /// to right-shift each value in operand \a __a.
3004 /// \returns A 128-bit integer vector containing the right-shifted values.
3005 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srl_epi64(__m128i __a
,
3007 return __builtin_ia32_psrlq128((__v2di
)__a
, (__v2di
)__count
);
3010 /// Compares each of the corresponding 8-bit values of the 128-bit
3011 /// integer vectors for equality. Each comparison yields 0x0 for false, 0xFF
3014 /// \headerfile <x86intrin.h>
3016 /// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
3019 /// A 128-bit integer vector.
3021 /// A 128-bit integer vector.
3022 /// \returns A 128-bit integer vector containing the comparison results.
3023 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpeq_epi8(__m128i __a
,
3025 return (__m128i
)((__v16qi
)__a
== (__v16qi
)__b
);
3028 /// Compares each of the corresponding 16-bit values of the 128-bit
3029 /// integer vectors for equality. Each comparison yields 0x0 for false,
3030 /// 0xFFFF for true.
3032 /// \headerfile <x86intrin.h>
3034 /// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
3037 /// A 128-bit integer vector.
3039 /// A 128-bit integer vector.
3040 /// \returns A 128-bit integer vector containing the comparison results.
3041 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpeq_epi16(__m128i __a
,
3043 return (__m128i
)((__v8hi
)__a
== (__v8hi
)__b
);
3046 /// Compares each of the corresponding 32-bit values of the 128-bit
3047 /// integer vectors for equality. Each comparison yields 0x0 for false,
3048 /// 0xFFFFFFFF for true.
3050 /// \headerfile <x86intrin.h>
3052 /// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
3055 /// A 128-bit integer vector.
3057 /// A 128-bit integer vector.
3058 /// \returns A 128-bit integer vector containing the comparison results.
3059 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpeq_epi32(__m128i __a
,
3061 return (__m128i
)((__v4si
)__a
== (__v4si
)__b
);
3064 /// Compares each of the corresponding signed 8-bit values of the 128-bit
3065 /// integer vectors to determine if the values in the first operand are
3066 /// greater than those in the second operand. Each comparison yields 0x0 for
3067 /// false, 0xFF for true.
3069 /// \headerfile <x86intrin.h>
3071 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3074 /// A 128-bit integer vector.
3076 /// A 128-bit integer vector.
3077 /// \returns A 128-bit integer vector containing the comparison results.
3078 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpgt_epi8(__m128i __a
,
3080 /* This function always performs a signed comparison, but __v16qi is a char
3081 which may be signed or unsigned, so use __v16qs. */
3082 return (__m128i
)((__v16qs
)__a
> (__v16qs
)__b
);
3085 /// Compares each of the corresponding signed 16-bit values of the
3086 /// 128-bit integer vectors to determine if the values in the first operand
3087 /// are greater than those in the second operand.
3089 /// Each comparison yields 0x0 for false, 0xFFFF for true.
3091 /// \headerfile <x86intrin.h>
3093 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3096 /// A 128-bit integer vector.
3098 /// A 128-bit integer vector.
3099 /// \returns A 128-bit integer vector containing the comparison results.
3100 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpgt_epi16(__m128i __a
,
3102 return (__m128i
)((__v8hi
)__a
> (__v8hi
)__b
);
3105 /// Compares each of the corresponding signed 32-bit values of the
3106 /// 128-bit integer vectors to determine if the values in the first operand
3107 /// are greater than those in the second operand.
3109 /// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
3111 /// \headerfile <x86intrin.h>
3113 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3116 /// A 128-bit integer vector.
3118 /// A 128-bit integer vector.
3119 /// \returns A 128-bit integer vector containing the comparison results.
3120 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpgt_epi32(__m128i __a
,
3122 return (__m128i
)((__v4si
)__a
> (__v4si
)__b
);
3125 /// Compares each of the corresponding signed 8-bit values of the 128-bit
3126 /// integer vectors to determine if the values in the first operand are less
3127 /// than those in the second operand.
3129 /// Each comparison yields 0x0 for false, 0xFF for true.
3131 /// \headerfile <x86intrin.h>
3133 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3136 /// A 128-bit integer vector.
3138 /// A 128-bit integer vector.
3139 /// \returns A 128-bit integer vector containing the comparison results.
3140 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmplt_epi8(__m128i __a
,
3142 return _mm_cmpgt_epi8(__b
, __a
);
3145 /// Compares each of the corresponding signed 16-bit values of the
3146 /// 128-bit integer vectors to determine if the values in the first operand
3147 /// are less than those in the second operand.
3149 /// Each comparison yields 0x0 for false, 0xFFFF for true.
3151 /// \headerfile <x86intrin.h>
3153 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3156 /// A 128-bit integer vector.
3158 /// A 128-bit integer vector.
3159 /// \returns A 128-bit integer vector containing the comparison results.
3160 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmplt_epi16(__m128i __a
,
3162 return _mm_cmpgt_epi16(__b
, __a
);
3165 /// Compares each of the corresponding signed 32-bit values of the
3166 /// 128-bit integer vectors to determine if the values in the first operand
3167 /// are less than those in the second operand.
3169 /// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
3171 /// \headerfile <x86intrin.h>
3173 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3176 /// A 128-bit integer vector.
3178 /// A 128-bit integer vector.
3179 /// \returns A 128-bit integer vector containing the comparison results.
3180 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmplt_epi32(__m128i __a
,
3182 return _mm_cmpgt_epi32(__b
, __a
);
3186 /// Converts a 64-bit signed integer value from the second operand into a
3187 /// double-precision value and returns it in the lower element of a [2 x
3188 /// double] vector; the upper element of the returned vector is copied from
3189 /// the upper element of the first operand.
3191 /// \headerfile <x86intrin.h>
3193 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
3196 /// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
3197 /// copied to the upper 64 bits of the destination.
3199 /// A 64-bit signed integer operand containing the value to be converted.
3200 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
3201 /// converted value of the second operand. The upper 64 bits are copied from
3202 /// the upper 64 bits of the first operand.
3203 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtsi64_sd(__m128d __a
,
3209 /// Converts the first (lower) element of a vector of [2 x double] into a
3210 /// 64-bit signed integer value, according to the current rounding mode.
3212 /// \headerfile <x86intrin.h>
3214 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
3217 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3219 /// \returns A 64-bit signed integer containing the converted value.
3220 static __inline__
long long __DEFAULT_FN_ATTRS
_mm_cvtsd_si64(__m128d __a
) {
3221 return __builtin_ia32_cvtsd2si64((__v2df
)__a
);
3224 /// Converts the first (lower) element of a vector of [2 x double] into a
3225 /// 64-bit signed integer value, truncating the result when it is inexact.
3227 /// \headerfile <x86intrin.h>
3229 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
3233 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3235 /// \returns A 64-bit signed integer containing the converted value.
3236 static __inline__
long long __DEFAULT_FN_ATTRS
_mm_cvttsd_si64(__m128d __a
) {
3237 return __builtin_ia32_cvttsd2si64((__v2df
)__a
);
3241 /// Converts a vector of [4 x i32] into a vector of [4 x float].
3243 /// \headerfile <x86intrin.h>
3245 /// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
3248 /// A 128-bit integer vector.
3249 /// \returns A 128-bit vector of [4 x float] containing the converted values.
3250 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvtepi32_ps(__m128i __a
) {
3251 return (__m128
) __builtin_convertvector((__v4si
)__a
, __v4sf
);
3254 /// Converts a vector of [4 x float] into a vector of [4 x i32].
3256 /// \headerfile <x86intrin.h>
3258 /// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
3261 /// A 128-bit vector of [4 x float].
3262 /// \returns A 128-bit integer vector of [4 x i32] containing the converted
3264 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtps_epi32(__m128 __a
) {
3265 return (__m128i
)__builtin_ia32_cvtps2dq((__v4sf
)__a
);
3268 /// Converts a vector of [4 x float] into a vector of [4 x i32],
3269 /// truncating the result when it is inexact.
3271 /// \headerfile <x86intrin.h>
3273 /// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
3277 /// A 128-bit vector of [4 x float].
3278 /// \returns A 128-bit vector of [4 x i32] containing the converted values.
3279 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvttps_epi32(__m128 __a
) {
3280 return (__m128i
)__builtin_ia32_cvttps2dq((__v4sf
)__a
);
3283 /// Returns a vector of [4 x i32] where the lowest element is the input
3284 /// operand and the remaining elements are zero.
3286 /// \headerfile <x86intrin.h>
3288 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3291 /// A 32-bit signed integer operand.
3292 /// \returns A 128-bit vector of [4 x i32].
3293 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtsi32_si128(int __a
) {
3294 return __extension__(__m128i
)(__v4si
){__a
, 0, 0, 0};
3297 /// Returns a vector of [2 x i64] where the lower element is the input
3298 /// operand and the upper element is zero.
3300 /// \headerfile <x86intrin.h>
3302 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction
3306 /// A 64-bit signed integer operand containing the value to be converted.
3307 /// \returns A 128-bit vector of [2 x i64] containing the converted value.
3308 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtsi64_si128(long long __a
) {
3309 return __extension__(__m128i
)(__v2di
){__a
, 0};
3312 /// Moves the least significant 32 bits of a vector of [4 x i32] to a
3313 /// 32-bit signed integer value.
3315 /// \headerfile <x86intrin.h>
3317 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3320 /// A vector of [4 x i32]. The least significant 32 bits are moved to the
3322 /// \returns A 32-bit signed integer containing the moved value.
3323 static __inline__
int __DEFAULT_FN_ATTRS
_mm_cvtsi128_si32(__m128i __a
) {
3324 __v4si __b
= (__v4si
)__a
;
3328 /// Moves the least significant 64 bits of a vector of [2 x i64] to a
3329 /// 64-bit signed integer value.
3331 /// \headerfile <x86intrin.h>
3333 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3336 /// A vector of [2 x i64]. The least significant 64 bits are moved to the
3338 /// \returns A 64-bit signed integer containing the moved value.
3339 static __inline__
long long __DEFAULT_FN_ATTRS
_mm_cvtsi128_si64(__m128i __a
) {
3343 /// Moves packed integer values from an aligned 128-bit memory location
3344 /// to elements in a 128-bit integer vector.
3346 /// \headerfile <x86intrin.h>
3348 /// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
3351 /// An aligned pointer to a memory location containing integer values.
3352 /// \returns A 128-bit integer vector containing the moved values.
3353 static __inline__ __m128i __DEFAULT_FN_ATTRS
3354 _mm_load_si128(__m128i
const *__p
) {
3358 /// Moves packed integer values from an unaligned 128-bit memory location
3359 /// to elements in a 128-bit integer vector.
3361 /// \headerfile <x86intrin.h>
3363 /// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
3366 /// A pointer to a memory location containing integer values.
3367 /// \returns A 128-bit integer vector containing the moved values.
3368 static __inline__ __m128i __DEFAULT_FN_ATTRS
3369 _mm_loadu_si128(__m128i_u
const *__p
) {
3370 struct __loadu_si128
{
3372 } __attribute__((__packed__
, __may_alias__
));
3373 return ((const struct __loadu_si128
*)__p
)->__v
;
3376 /// Returns a vector of [2 x i64] where the lower element is taken from
3377 /// the lower element of the operand, and the upper element is zero.
3379 /// \headerfile <x86intrin.h>
3381 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3384 /// A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of
3385 /// the destination.
3386 /// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
3387 /// moved value. The higher order bits are cleared.
3388 static __inline__ __m128i __DEFAULT_FN_ATTRS
3389 _mm_loadl_epi64(__m128i_u
const *__p
) {
3390 struct __mm_loadl_epi64_struct
{
3392 } __attribute__((__packed__
, __may_alias__
));
3393 return __extension__(__m128i
){
3394 ((const struct __mm_loadl_epi64_struct
*)__p
)->__u
, 0};
3397 /// Generates a 128-bit vector of [4 x i32] with unspecified content.
3398 /// This could be used as an argument to another intrinsic function where the
3399 /// argument is required but the value is not actually used.
3401 /// \headerfile <x86intrin.h>
3403 /// This intrinsic has no corresponding instruction.
3405 /// \returns A 128-bit vector of [4 x i32] with unspecified content.
3406 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_undefined_si128(void) {
3407 return (__m128i
)__builtin_ia32_undef128();
3410 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3411 /// the specified 64-bit integer values.
3413 /// \headerfile <x86intrin.h>
3415 /// This intrinsic is a utility function and does not correspond to a specific
3419 /// A 64-bit integer value used to initialize the upper 64 bits of the
3420 /// destination vector of [2 x i64].
3422 /// A 64-bit integer value used to initialize the lower 64 bits of the
3423 /// destination vector of [2 x i64].
3424 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
3425 /// provided in the operands.
3426 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set_epi64x(long long __q1
,
3428 return __extension__(__m128i
)(__v2di
){__q0
, __q1
};
3431 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3432 /// the specified 64-bit integer values.
3434 /// \headerfile <x86intrin.h>
3436 /// This intrinsic is a utility function and does not correspond to a specific
3440 /// A 64-bit integer value used to initialize the upper 64 bits of the
3441 /// destination vector of [2 x i64].
3443 /// A 64-bit integer value used to initialize the lower 64 bits of the
3444 /// destination vector of [2 x i64].
3445 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
3446 /// provided in the operands.
3447 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set_epi64(__m64 __q1
,
3449 return _mm_set_epi64x((long long)__q1
, (long long)__q0
);
3452 /// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
3453 /// the specified 32-bit integer values.
3455 /// \headerfile <x86intrin.h>
3457 /// This intrinsic is a utility function and does not correspond to a specific
3461 /// A 32-bit integer value used to initialize bits [127:96] of the
3462 /// destination vector.
3464 /// A 32-bit integer value used to initialize bits [95:64] of the destination
3467 /// A 32-bit integer value used to initialize bits [63:32] of the destination
3470 /// A 32-bit integer value used to initialize bits [31:0] of the destination
3472 /// \returns An initialized 128-bit vector of [4 x i32] containing the values
3473 /// provided in the operands.
3474 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set_epi32(int __i3
, int __i2
,
3475 int __i1
, int __i0
) {
3476 return __extension__(__m128i
)(__v4si
){__i0
, __i1
, __i2
, __i3
};
3479 /// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
3480 /// the specified 16-bit integer values.
3482 /// \headerfile <x86intrin.h>
3484 /// This intrinsic is a utility function and does not correspond to a specific
3488 /// A 16-bit integer value used to initialize bits [127:112] of the
3489 /// destination vector.
3491 /// A 16-bit integer value used to initialize bits [111:96] of the
3492 /// destination vector.
3494 /// A 16-bit integer value used to initialize bits [95:80] of the destination
3497 /// A 16-bit integer value used to initialize bits [79:64] of the destination
3500 /// A 16-bit integer value used to initialize bits [63:48] of the destination
3503 /// A 16-bit integer value used to initialize bits [47:32] of the destination
3506 /// A 16-bit integer value used to initialize bits [31:16] of the destination
3509 /// A 16-bit integer value used to initialize bits [15:0] of the destination
3511 /// \returns An initialized 128-bit vector of [8 x i16] containing the values
3512 /// provided in the operands.
3513 static __inline__ __m128i __DEFAULT_FN_ATTRS
3514 _mm_set_epi16(short __w7
, short __w6
, short __w5
, short __w4
, short __w3
,
3515 short __w2
, short __w1
, short __w0
) {
3516 return __extension__(__m128i
)(__v8hi
){__w0
, __w1
, __w2
, __w3
,
3517 __w4
, __w5
, __w6
, __w7
};
3520 /// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
3521 /// the specified 8-bit integer values.
3523 /// \headerfile <x86intrin.h>
3525 /// This intrinsic is a utility function and does not correspond to a specific
3529 /// Initializes bits [127:120] of the destination vector.
3531 /// Initializes bits [119:112] of the destination vector.
3533 /// Initializes bits [111:104] of the destination vector.
3535 /// Initializes bits [103:96] of the destination vector.
3537 /// Initializes bits [95:88] of the destination vector.
3539 /// Initializes bits [87:80] of the destination vector.
3541 /// Initializes bits [79:72] of the destination vector.
3543 /// Initializes bits [71:64] of the destination vector.
3545 /// Initializes bits [63:56] of the destination vector.
3547 /// Initializes bits [55:48] of the destination vector.
3549 /// Initializes bits [47:40] of the destination vector.
3551 /// Initializes bits [39:32] of the destination vector.
3553 /// Initializes bits [31:24] of the destination vector.
3555 /// Initializes bits [23:16] of the destination vector.
3557 /// Initializes bits [15:8] of the destination vector.
3559 /// Initializes bits [7:0] of the destination vector.
3560 /// \returns An initialized 128-bit vector of [16 x i8] containing the values
3561 /// provided in the operands.
3562 static __inline__ __m128i __DEFAULT_FN_ATTRS
3563 _mm_set_epi8(char __b15
, char __b14
, char __b13
, char __b12
, char __b11
,
3564 char __b10
, char __b9
, char __b8
, char __b7
, char __b6
, char __b5
,
3565 char __b4
, char __b3
, char __b2
, char __b1
, char __b0
) {
3566 return __extension__(__m128i
)(__v16qi
){
3567 __b0
, __b1
, __b2
, __b3
, __b4
, __b5
, __b6
, __b7
,
3568 __b8
, __b9
, __b10
, __b11
, __b12
, __b13
, __b14
, __b15
};
3571 /// Initializes both values in a 128-bit integer vector with the
3572 /// specified 64-bit integer value.
3574 /// \headerfile <x86intrin.h>
3576 /// This intrinsic is a utility function and does not correspond to a specific
3580 /// Integer value used to initialize the elements of the destination integer
3582 /// \returns An initialized 128-bit integer vector of [2 x i64] with both
3583 /// elements containing the value provided in the operand.
3584 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi64x(long long __q
) {
3585 return _mm_set_epi64x(__q
, __q
);
3588 /// Initializes both values in a 128-bit vector of [2 x i64] with the
3589 /// specified 64-bit value.
3591 /// \headerfile <x86intrin.h>
3593 /// This intrinsic is a utility function and does not correspond to a specific
3597 /// A 64-bit value used to initialize the elements of the destination integer
3599 /// \returns An initialized 128-bit vector of [2 x i64] with all elements
3600 /// containing the value provided in the operand.
3601 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi64(__m64 __q
) {
3602 return _mm_set_epi64(__q
, __q
);
3605 /// Initializes all values in a 128-bit vector of [4 x i32] with the
3606 /// specified 32-bit value.
3608 /// \headerfile <x86intrin.h>
3610 /// This intrinsic is a utility function and does not correspond to a specific
3614 /// A 32-bit value used to initialize the elements of the destination integer
3616 /// \returns An initialized 128-bit vector of [4 x i32] with all elements
3617 /// containing the value provided in the operand.
3618 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi32(int __i
) {
3619 return _mm_set_epi32(__i
, __i
, __i
, __i
);
3622 /// Initializes all values in a 128-bit vector of [8 x i16] with the
3623 /// specified 16-bit value.
3625 /// \headerfile <x86intrin.h>
3627 /// This intrinsic is a utility function and does not correspond to a specific
3631 /// A 16-bit value used to initialize the elements of the destination integer
3633 /// \returns An initialized 128-bit vector of [8 x i16] with all elements
3634 /// containing the value provided in the operand.
3635 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi16(short __w
) {
3636 return _mm_set_epi16(__w
, __w
, __w
, __w
, __w
, __w
, __w
, __w
);
3639 /// Initializes all values in a 128-bit vector of [16 x i8] with the
3640 /// specified 8-bit value.
3642 /// \headerfile <x86intrin.h>
3644 /// This intrinsic is a utility function and does not correspond to a specific
3648 /// An 8-bit value used to initialize the elements of the destination integer
3650 /// \returns An initialized 128-bit vector of [16 x i8] with all elements
3651 /// containing the value provided in the operand.
3652 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi8(char __b
) {
3653 return _mm_set_epi8(__b
, __b
, __b
, __b
, __b
, __b
, __b
, __b
, __b
, __b
, __b
,
3654 __b
, __b
, __b
, __b
, __b
);
3657 /// Constructs a 128-bit integer vector, initialized in reverse order
3658 /// with the specified 64-bit integral values.
3660 /// \headerfile <x86intrin.h>
3662 /// This intrinsic does not correspond to a specific instruction.
3665 /// A 64-bit integral value used to initialize the lower 64 bits of the
3668 /// A 64-bit integral value used to initialize the upper 64 bits of the
3670 /// \returns An initialized 128-bit integer vector.
3671 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi64(__m64 __q0
,
3673 return _mm_set_epi64(__q1
, __q0
);
3676 /// Constructs a 128-bit integer vector, initialized in reverse order
3677 /// with the specified 32-bit integral values.
3679 /// \headerfile <x86intrin.h>
3681 /// This intrinsic is a utility function and does not correspond to a specific
3685 /// A 32-bit integral value used to initialize bits [31:0] of the result.
3687 /// A 32-bit integral value used to initialize bits [63:32] of the result.
3689 /// A 32-bit integral value used to initialize bits [95:64] of the result.
3691 /// A 32-bit integral value used to initialize bits [127:96] of the result.
3692 /// \returns An initialized 128-bit integer vector.
3693 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi32(int __i0
, int __i1
,
3696 return _mm_set_epi32(__i3
, __i2
, __i1
, __i0
);
3699 /// Constructs a 128-bit integer vector, initialized in reverse order
3700 /// with the specified 16-bit integral values.
3702 /// \headerfile <x86intrin.h>
3704 /// This intrinsic is a utility function and does not correspond to a specific
3708 /// A 16-bit integral value used to initialize bits [15:0] of the result.
3710 /// A 16-bit integral value used to initialize bits [31:16] of the result.
3712 /// A 16-bit integral value used to initialize bits [47:32] of the result.
3714 /// A 16-bit integral value used to initialize bits [63:48] of the result.
3716 /// A 16-bit integral value used to initialize bits [79:64] of the result.
3718 /// A 16-bit integral value used to initialize bits [95:80] of the result.
3720 /// A 16-bit integral value used to initialize bits [111:96] of the result.
3722 /// A 16-bit integral value used to initialize bits [127:112] of the result.
3723 /// \returns An initialized 128-bit integer vector.
3724 static __inline__ __m128i __DEFAULT_FN_ATTRS
3725 _mm_setr_epi16(short __w0
, short __w1
, short __w2
, short __w3
, short __w4
,
3726 short __w5
, short __w6
, short __w7
) {
3727 return _mm_set_epi16(__w7
, __w6
, __w5
, __w4
, __w3
, __w2
, __w1
, __w0
);
3730 /// Constructs a 128-bit integer vector, initialized in reverse order
3731 /// with the specified 8-bit integral values.
3733 /// \headerfile <x86intrin.h>
3735 /// This intrinsic is a utility function and does not correspond to a specific
3739 /// An 8-bit integral value used to initialize bits [7:0] of the result.
3741 /// An 8-bit integral value used to initialize bits [15:8] of the result.
3743 /// An 8-bit integral value used to initialize bits [23:16] of the result.
3745 /// An 8-bit integral value used to initialize bits [31:24] of the result.
3747 /// An 8-bit integral value used to initialize bits [39:32] of the result.
3749 /// An 8-bit integral value used to initialize bits [47:40] of the result.
3751 /// An 8-bit integral value used to initialize bits [55:48] of the result.
3753 /// An 8-bit integral value used to initialize bits [63:56] of the result.
3755 /// An 8-bit integral value used to initialize bits [71:64] of the result.
3757 /// An 8-bit integral value used to initialize bits [79:72] of the result.
3759 /// An 8-bit integral value used to initialize bits [87:80] of the result.
3761 /// An 8-bit integral value used to initialize bits [95:88] of the result.
3763 /// An 8-bit integral value used to initialize bits [103:96] of the result.
3765 /// An 8-bit integral value used to initialize bits [111:104] of the result.
3767 /// An 8-bit integral value used to initialize bits [119:112] of the result.
3769 /// An 8-bit integral value used to initialize bits [127:120] of the result.
3770 /// \returns An initialized 128-bit integer vector.
3771 static __inline__ __m128i __DEFAULT_FN_ATTRS
3772 _mm_setr_epi8(char __b0
, char __b1
, char __b2
, char __b3
, char __b4
, char __b5
,
3773 char __b6
, char __b7
, char __b8
, char __b9
, char __b10
,
3774 char __b11
, char __b12
, char __b13
, char __b14
, char __b15
) {
3775 return _mm_set_epi8(__b15
, __b14
, __b13
, __b12
, __b11
, __b10
, __b9
, __b8
,
3776 __b7
, __b6
, __b5
, __b4
, __b3
, __b2
, __b1
, __b0
);
3779 /// Creates a 128-bit integer vector initialized to zero.
3781 /// \headerfile <x86intrin.h>
3783 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
3785 /// \returns An initialized 128-bit integer vector with all elements set to
3787 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setzero_si128(void) {
3788 return __extension__(__m128i
)(__v2di
){0LL, 0LL};
3791 /// Stores a 128-bit integer vector to a memory location aligned on a
3792 /// 128-bit boundary.
3794 /// \headerfile <x86intrin.h>
3796 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
3799 /// A pointer to an aligned memory location that will receive the integer
3802 /// A 128-bit integer vector containing the values to be moved.
3803 static __inline__
void __DEFAULT_FN_ATTRS
_mm_store_si128(__m128i
*__p
,
3808 /// Stores a 128-bit integer vector to an unaligned memory location.
3810 /// \headerfile <x86intrin.h>
3812 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
3815 /// A pointer to a memory location that will receive the integer values.
3817 /// A 128-bit integer vector containing the values to be moved.
3818 static __inline__
void __DEFAULT_FN_ATTRS
_mm_storeu_si128(__m128i_u
*__p
,
3820 struct __storeu_si128
{
3822 } __attribute__((__packed__
, __may_alias__
));
3823 ((struct __storeu_si128
*)__p
)->__v
= __b
;
3826 /// Stores a 64-bit integer value from the low element of a 128-bit integer
3829 /// \headerfile <x86intrin.h>
3831 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3834 /// A pointer to a 64-bit memory location. The address of the memory
3835 /// location does not have to be aligned.
3837 /// A 128-bit integer vector containing the value to be stored.
3838 static __inline__
void __DEFAULT_FN_ATTRS
_mm_storeu_si64(void *__p
,
3840 struct __storeu_si64
{
3842 } __attribute__((__packed__
, __may_alias__
));
3843 ((struct __storeu_si64
*)__p
)->__v
= ((__v2di
)__b
)[0];
3846 /// Stores a 32-bit integer value from the low element of a 128-bit integer
3849 /// \headerfile <x86intrin.h>
3851 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3854 /// A pointer to a 32-bit memory location. The address of the memory
3855 /// location does not have to be aligned.
3857 /// A 128-bit integer vector containing the value to be stored.
3858 static __inline__
void __DEFAULT_FN_ATTRS
_mm_storeu_si32(void *__p
,
3860 struct __storeu_si32
{
3862 } __attribute__((__packed__
, __may_alias__
));
3863 ((struct __storeu_si32
*)__p
)->__v
= ((__v4si
)__b
)[0];
3866 /// Stores a 16-bit integer value from the low element of a 128-bit integer
3869 /// \headerfile <x86intrin.h>
3871 /// This intrinsic does not correspond to a specific instruction.
3874 /// A pointer to a 16-bit memory location. The address of the memory
3875 /// location does not have to be aligned.
3877 /// A 128-bit integer vector containing the value to be stored.
3878 static __inline__
void __DEFAULT_FN_ATTRS
_mm_storeu_si16(void *__p
,
3880 struct __storeu_si16
{
3882 } __attribute__((__packed__
, __may_alias__
));
3883 ((struct __storeu_si16
*)__p
)->__v
= ((__v8hi
)__b
)[0];
3886 /// Moves bytes selected by the mask from the first operand to the
3887 /// specified unaligned memory location. When a mask bit is 1, the
3888 /// corresponding byte is written, otherwise it is not written.
3890 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
3891 /// used again soon). Exception and trap behavior for elements not selected
3892 /// for storage to memory are implementation dependent.
3894 /// \headerfile <x86intrin.h>
3896 /// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
3900 /// A 128-bit integer vector containing the values to be moved.
3902 /// A 128-bit integer vector containing the mask. The most significant bit of
3903 /// each byte represents the mask bits.
3905 /// A pointer to an unaligned 128-bit memory location where the specified
3906 /// values are moved.
3907 static __inline__
void __DEFAULT_FN_ATTRS
_mm_maskmoveu_si128(__m128i __d
,
3910 __builtin_ia32_maskmovdqu((__v16qi
)__d
, (__v16qi
)__n
, __p
);
3913 /// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
3914 /// a memory location.
3916 /// \headerfile <x86intrin.h>
3918 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
3921 /// A pointer to a 64-bit memory location that will receive the lower 64 bits
3922 /// of the integer vector parameter.
3924 /// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
3925 /// value to be stored.
3926 static __inline__
void __DEFAULT_FN_ATTRS
_mm_storel_epi64(__m128i_u
*__p
,
3928 struct __mm_storel_epi64_struct
{
3930 } __attribute__((__packed__
, __may_alias__
));
3931 ((struct __mm_storel_epi64_struct
*)__p
)->__u
= __a
[0];
3934 /// Stores a 128-bit floating point vector of [2 x double] to a 128-bit
3935 /// aligned memory location.
3937 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
3938 /// used again soon).
3940 /// \headerfile <x86intrin.h>
3942 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
3945 /// A pointer to the 128-bit aligned memory location used to store the value.
3947 /// A vector of [2 x double] containing the 64-bit values to be stored.
3948 static __inline__
void __DEFAULT_FN_ATTRS
_mm_stream_pd(void *__p
,
3950 __builtin_nontemporal_store((__v2df
)__a
, (__v2df
*)__p
);
3953 /// Stores a 128-bit integer vector to a 128-bit aligned memory location.
3955 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
3956 /// used again soon).
3958 /// \headerfile <x86intrin.h>
3960 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
3963 /// A pointer to the 128-bit aligned memory location used to store the value.
3965 /// A 128-bit integer vector containing the values to be stored.
3966 static __inline__
void __DEFAULT_FN_ATTRS
_mm_stream_si128(void *__p
,
3968 __builtin_nontemporal_store((__v2di
)__a
, (__v2di
*)__p
);
/// Stores a 32-bit integer value in the specified memory location.
///
///    To minimize caching, the data is flagged as non-temporal (unlikely to be
///    used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
///
/// \param __p
///    A pointer to the 32-bit memory location used to store the value.
/// \param __a
///    A 32-bit integer containing the value to be stored.
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
    _mm_stream_si32(void *__p, int __a) {
  __builtin_ia32_movnti((int *)__p, __a);
}
/* The 64-bit form of MOVNTI exists only in 64-bit mode, so this intrinsic is
   not offered on 32-bit x86 targets. */
#ifdef __x86_64__
/// Stores a 64-bit integer value in the specified memory location.
///
///    To minimize caching, the data is flagged as non-temporal (unlikely to be
///    used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
///
/// \param __p
///    A pointer to the 64-bit memory location used to store the value.
/// \param __a
///    A 64-bit integer containing the value to be stored.
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
    _mm_stream_si64(void *__p, long long __a) {
  __builtin_ia32_movnti64((long long *)__p, __a);
}
#endif
/* The following three intrinsics are declared only (no inline body here);
   give them C linkage when the header is included from C++. */
#if defined(__cplusplus)
extern "C" {
#endif

/// The cache line containing \a __p is flushed and invalidated from all
///    caches in the coherency domain.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
///
/// \param __p
///    A pointer to the memory location used to identify the cache line to be
///    flushed.
void _mm_clflush(void const *__p);

/// Forces strong memory ordering (serialization) between load
///    instructions preceding this instruction and load instructions following
///    this instruction, ensuring the system completes all previous loads before
///    executing subsequent loads.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
///
void _mm_lfence(void);

/// Forces strong memory ordering (serialization) between load and store
///    instructions preceding this instruction and load and store instructions
///    following this instruction, ensuring that the system completes all
///    previous memory accesses before executing subsequent memory accesses.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MFENCE </c> instruction.
///
void _mm_mfence(void);

#if defined(__cplusplus)
} // extern "C"
#endif
4053 /// Converts 16-bit signed integers from both 128-bit integer vector
4054 /// operands into 8-bit signed integers, and packs the results into the
4055 /// destination. Positive values greater than 0x7F are saturated to 0x7F.
4056 /// Negative values less than 0x80 are saturated to 0x80.
4058 /// \headerfile <x86intrin.h>
4060 /// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
4063 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4064 /// a signed integer and is converted to a 8-bit signed integer with
4065 /// saturation. Values greater than 0x7F are saturated to 0x7F. Values less
4066 /// than 0x80 are saturated to 0x80. The converted [8 x i8] values are
4067 /// written to the lower 64 bits of the result.
4069 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4070 /// a signed integer and is converted to a 8-bit signed integer with
4071 /// saturation. Values greater than 0x7F are saturated to 0x7F. Values less
4072 /// than 0x80 are saturated to 0x80. The converted [8 x i8] values are
4073 /// written to the higher 64 bits of the result.
4074 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
4075 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_packs_epi16(__m128i __a
,
4077 return (__m128i
)__builtin_ia32_packsswb128((__v8hi
)__a
, (__v8hi
)__b
);
4080 /// Converts 32-bit signed integers from both 128-bit integer vector
4081 /// operands into 16-bit signed integers, and packs the results into the
4082 /// destination. Positive values greater than 0x7FFF are saturated to 0x7FFF.
4083 /// Negative values less than 0x8000 are saturated to 0x8000.
4085 /// \headerfile <x86intrin.h>
4087 /// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
4090 /// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
4091 /// a signed integer and is converted to a 16-bit signed integer with
4092 /// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
4093 /// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
4094 /// are written to the lower 64 bits of the result.
4096 /// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
4097 /// a signed integer and is converted to a 16-bit signed integer with
4098 /// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
4099 /// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
4100 /// are written to the higher 64 bits of the result.
4101 /// \returns A 128-bit vector of [8 x i16] containing the converted values.
4102 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_packs_epi32(__m128i __a
,
4104 return (__m128i
)__builtin_ia32_packssdw128((__v4si
)__a
, (__v4si
)__b
);
4107 /// Converts 16-bit signed integers from both 128-bit integer vector
4108 /// operands into 8-bit unsigned integers, and packs the results into the
4109 /// destination. Values greater than 0xFF are saturated to 0xFF. Values less
4110 /// than 0x00 are saturated to 0x00.
4112 /// \headerfile <x86intrin.h>
4114 /// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
4117 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4118 /// a signed integer and is converted to an 8-bit unsigned integer with
4119 /// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
4120 /// than 0x00 are saturated to 0x00. The converted [8 x i8] values are
4121 /// written to the lower 64 bits of the result.
4123 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4124 /// a signed integer and is converted to an 8-bit unsigned integer with
4125 /// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
4126 /// than 0x00 are saturated to 0x00. The converted [8 x i8] values are
4127 /// written to the higher 64 bits of the result.
4128 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
4129 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_packus_epi16(__m128i __a
,
4131 return (__m128i
)__builtin_ia32_packuswb128((__v8hi
)__a
, (__v8hi
)__b
);
/// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
///    the immediate-value parameter as a selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_extract_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
///
/// \param a
///    A 128-bit integer vector.
/// \param imm
///    An immediate value. Bits [2:0] selects values from \a a to be assigned
///    to bits[15:0] of the result. \n
///    000: assign values from bits [15:0] of \a a. \n
///    001: assign values from bits [31:16] of \a a. \n
///    010: assign values from bits [47:32] of \a a. \n
///    011: assign values from bits [63:48] of \a a. \n
///    100: assign values from bits [79:64] of \a a. \n
///    101: assign values from bits [95:80] of \a a. \n
///    110: assign values from bits [111:96] of \a a. \n
///    111: assign values from bits [127:112] of \a a.
/// \returns An integer, whose lower 16 bits are selected from the 128-bit
///    integer vector parameter and the remaining bits are assigned zeros.
#define _mm_extract_epi16(a, imm)                                              \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a),      \
                                                    (int)(imm)))
/// Constructs a 128-bit integer vector by first making a copy of the
///    128-bit integer vector parameter, and then inserting the lower 16 bits
///    of an integer parameter into an offset specified by the immediate-value
///    parameter.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi16(__m128i a, int b, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. This vector is copied to the
///    result and then one of the eight elements in the result is replaced by
///    the lower 16 bits of \a b.
/// \param b
///    An integer. The lower 16 bits of this parameter are written to the
///    result beginning at an offset specified by \a imm.
/// \param imm
///    An immediate value specifying the bit offset in the result at which the
///    lower 16 bits of \a b are written.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi16(a, b, imm)                                            \
  ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b),        \
                                        (int)(imm)))
4192 /// Copies the values of the most significant bits from each 8-bit
4193 /// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
4194 /// value, zero-extends the value, and writes it to the destination.
4196 /// \headerfile <x86intrin.h>
4198 /// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
4201 /// A 128-bit integer vector containing the values with bits to be extracted.
4202 /// \returns The most significant bits from each 8-bit element in \a __a,
4203 /// written to bits [15:0]. The other bits are assigned zeros.
4204 static __inline__
int __DEFAULT_FN_ATTRS
_mm_movemask_epi8(__m128i __a
) {
4205 return __builtin_ia32_pmovmskb128((__v16qi
)__a
);
4208 /// Constructs a 128-bit integer vector by shuffling four 32-bit
4209 /// elements of a 128-bit integer vector parameter, using the immediate-value
4210 /// parameter as a specifier.
4212 /// \headerfile <x86intrin.h>
4215 /// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
4218 /// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
4221 /// A 128-bit integer vector containing the values to be copied.
/// An immediate value containing an 8-bit value specifying which elements to
/// copy from \a a. The destinations within the 128-bit destination are assigned
4225 /// values as follows: \n
4226 /// Bits [1:0] are used to assign values to bits [31:0] of the result. \n
4227 /// Bits [3:2] are used to assign values to bits [63:32] of the result. \n
4228 /// Bits [5:4] are used to assign values to bits [95:64] of the result. \n
4229 /// Bits [7:6] are used to assign values to bits [127:96] of the result. \n
4230 /// Bit value assignments: \n
4231 /// 00: assign values from bits [31:0] of \a a. \n
4232 /// 01: assign values from bits [63:32] of \a a. \n
4233 /// 10: assign values from bits [95:64] of \a a. \n
4234 /// 11: assign values from bits [127:96] of \a a. \n
4235 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4236 /// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4237 /// <c>[b6, b4, b2, b0]</c>.
4238 /// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shuffle_epi32(a, imm)                                              \
  ((__m128i)__builtin_ia32_pshufd(                                             \
      (__v4si)(__m128i)(a), /* source viewed as [4 x i32] */                   \
      (int)(imm)))          /* element selector, forwarded as int */
4242 /// Constructs a 128-bit integer vector by shuffling four lower 16-bit
4243 /// elements of a 128-bit integer vector of [8 x i16], using the immediate
4244 /// value parameter as a specifier.
4246 /// \headerfile <x86intrin.h>
4249 /// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
4252 /// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
4255 /// A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
4256 /// [127:64] of the result.
4258 /// An 8-bit immediate value specifying which elements to copy from \a a. \n
4259 /// Bits[1:0] are used to assign values to bits [15:0] of the result. \n
4260 /// Bits[3:2] are used to assign values to bits [31:16] of the result. \n
4261 /// Bits[5:4] are used to assign values to bits [47:32] of the result. \n
4262 /// Bits[7:6] are used to assign values to bits [63:48] of the result. \n
4263 /// Bit value assignments: \n
4264 /// 00: assign values from bits [15:0] of \a a. \n
4265 /// 01: assign values from bits [31:16] of \a a. \n
4266 /// 10: assign values from bits [47:32] of \a a. \n
4267 /// 11: assign values from bits [63:48] of \a a. \n
4268 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4269 /// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4270 /// <c>[b6, b4, b2, b0]</c>.
4271 /// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflelo_epi16(a, imm)                                            \
  ((__m128i)__builtin_ia32_pshuflw(                                            \
      (__v8hi)(__m128i)(a), /* source viewed as [8 x i16] */                   \
      (int)(imm)))          /* element selector, forwarded as int */
4275 /// Constructs a 128-bit integer vector by shuffling four upper 16-bit
4276 /// elements of a 128-bit integer vector of [8 x i16], using the immediate
4277 /// value parameter as a specifier.
4279 /// \headerfile <x86intrin.h>
4282 /// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
4285 /// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
4288 /// A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
4289 /// [63:0] of the result.
4291 /// An 8-bit immediate value specifying which elements to copy from \a a. \n
4292 /// Bits[1:0] are used to assign values to bits [79:64] of the result. \n
4293 /// Bits[3:2] are used to assign values to bits [95:80] of the result. \n
4294 /// Bits[5:4] are used to assign values to bits [111:96] of the result. \n
4295 /// Bits[7:6] are used to assign values to bits [127:112] of the result. \n
4296 /// Bit value assignments: \n
4297 /// 00: assign values from bits [79:64] of \a a. \n
4298 /// 01: assign values from bits [95:80] of \a a. \n
4299 /// 10: assign values from bits [111:96] of \a a. \n
4300 /// 11: assign values from bits [127:112] of \a a. \n
4301 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4302 /// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4303 /// <c>[b6, b4, b2, b0]</c>.
4304 /// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflehi_epi16(a, imm)                                            \
  ((__m128i)__builtin_ia32_pshufhw(                                            \
      (__v8hi)(__m128i)(a), /* source viewed as [8 x i16] */                   \
      (int)(imm)))          /* element selector, forwarded as int */
4308 /// Unpacks the high-order (index 8-15) values from two 128-bit vectors
4309 /// of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4311 /// \headerfile <x86intrin.h>
4313 /// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
4317 /// A 128-bit vector of [16 x i8].
4318 /// Bits [71:64] are written to bits [7:0] of the result. \n
4319 /// Bits [79:72] are written to bits [23:16] of the result. \n
4320 /// Bits [87:80] are written to bits [39:32] of the result. \n
4321 /// Bits [95:88] are written to bits [55:48] of the result. \n
4322 /// Bits [103:96] are written to bits [71:64] of the result. \n
4323 /// Bits [111:104] are written to bits [87:80] of the result. \n
4324 /// Bits [119:112] are written to bits [103:96] of the result. \n
4325 /// Bits [127:120] are written to bits [119:112] of the result.
4327 /// A 128-bit vector of [16 x i8]. \n
4328 /// Bits [71:64] are written to bits [15:8] of the result. \n
4329 /// Bits [79:72] are written to bits [31:24] of the result. \n
4330 /// Bits [87:80] are written to bits [47:40] of the result. \n
4331 /// Bits [95:88] are written to bits [63:56] of the result. \n
4332 /// Bits [103:96] are written to bits [79:72] of the result. \n
4333 /// Bits [111:104] are written to bits [95:88] of the result. \n
4334 /// Bits [119:112] are written to bits [111:104] of the result. \n
4335 /// Bits [127:120] are written to bits [127:120] of the result.
4336 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4337 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi8(__m128i __a
,
4339 return (__m128i
)__builtin_shufflevector(
4340 (__v16qi
)__a
, (__v16qi
)__b
, 8, 16 + 8, 9, 16 + 9, 10, 16 + 10, 11,
4341 16 + 11, 12, 16 + 12, 13, 16 + 13, 14, 16 + 14, 15, 16 + 15);
4344 /// Unpacks the high-order (index 4-7) values from two 128-bit vectors of
4345 /// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
4347 /// \headerfile <x86intrin.h>
4349 /// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
4353 /// A 128-bit vector of [8 x i16].
4354 /// Bits [79:64] are written to bits [15:0] of the result. \n
4355 /// Bits [95:80] are written to bits [47:32] of the result. \n
4356 /// Bits [111:96] are written to bits [79:64] of the result. \n
4357 /// Bits [127:112] are written to bits [111:96] of the result.
4359 /// A 128-bit vector of [8 x i16].
4360 /// Bits [79:64] are written to bits [31:16] of the result. \n
4361 /// Bits [95:80] are written to bits [63:48] of the result. \n
4362 /// Bits [111:96] are written to bits [95:80] of the result. \n
4363 /// Bits [127:112] are written to bits [127:112] of the result.
4364 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4365 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi16(__m128i __a
,
4367 return (__m128i
)__builtin_shufflevector((__v8hi
)__a
, (__v8hi
)__b
, 4, 8 + 4, 5,
4368 8 + 5, 6, 8 + 6, 7, 8 + 7);
4371 /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
4372 /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4374 /// \headerfile <x86intrin.h>
4376 /// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
4380 /// A 128-bit vector of [4 x i32]. \n
4381 /// Bits [95:64] are written to bits [31:0] of the destination. \n
4382 /// Bits [127:96] are written to bits [95:64] of the destination.
4384 /// A 128-bit vector of [4 x i32]. \n
/// Bits [95:64] are written to bits [63:32] of the destination. \n
4386 /// Bits [127:96] are written to bits [127:96] of the destination.
4387 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4388 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi32(__m128i __a
,
4390 return (__m128i
)__builtin_shufflevector((__v4si
)__a
, (__v4si
)__b
, 2, 4 + 2, 3,
4394 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4395 /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4397 /// \headerfile <x86intrin.h>
4399 /// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
4403 /// A 128-bit vector of [2 x i64]. \n
4404 /// Bits [127:64] are written to bits [63:0] of the destination.
4406 /// A 128-bit vector of [2 x i64]. \n
4407 /// Bits [127:64] are written to bits [127:64] of the destination.
4408 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4409 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi64(__m128i __a
,
4411 return (__m128i
)__builtin_shufflevector((__v2di
)__a
, (__v2di
)__b
, 1, 2 + 1);
4414 /// Unpacks the low-order (index 0-7) values from two 128-bit vectors of
4415 /// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4417 /// \headerfile <x86intrin.h>
4419 /// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
4423 /// A 128-bit vector of [16 x i8]. \n
4424 /// Bits [7:0] are written to bits [7:0] of the result. \n
4425 /// Bits [15:8] are written to bits [23:16] of the result. \n
4426 /// Bits [23:16] are written to bits [39:32] of the result. \n
4427 /// Bits [31:24] are written to bits [55:48] of the result. \n
4428 /// Bits [39:32] are written to bits [71:64] of the result. \n
4429 /// Bits [47:40] are written to bits [87:80] of the result. \n
4430 /// Bits [55:48] are written to bits [103:96] of the result. \n
4431 /// Bits [63:56] are written to bits [119:112] of the result.
4433 /// A 128-bit vector of [16 x i8].
4434 /// Bits [7:0] are written to bits [15:8] of the result. \n
4435 /// Bits [15:8] are written to bits [31:24] of the result. \n
4436 /// Bits [23:16] are written to bits [47:40] of the result. \n
4437 /// Bits [31:24] are written to bits [63:56] of the result. \n
4438 /// Bits [39:32] are written to bits [79:72] of the result. \n
4439 /// Bits [47:40] are written to bits [95:88] of the result. \n
4440 /// Bits [55:48] are written to bits [111:104] of the result. \n
4441 /// Bits [63:56] are written to bits [127:120] of the result.
4442 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4443 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi8(__m128i __a
,
4445 return (__m128i
)__builtin_shufflevector(
4446 (__v16qi
)__a
, (__v16qi
)__b
, 0, 16 + 0, 1, 16 + 1, 2, 16 + 2, 3, 16 + 3, 4,
4447 16 + 4, 5, 16 + 5, 6, 16 + 6, 7, 16 + 7);
4450 /// Unpacks the low-order (index 0-3) values from each of the two 128-bit
4451 /// vectors of [8 x i16] and interleaves them into a 128-bit vector of
4454 /// \headerfile <x86intrin.h>
4456 /// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
4460 /// A 128-bit vector of [8 x i16].
4461 /// Bits [15:0] are written to bits [15:0] of the result. \n
4462 /// Bits [31:16] are written to bits [47:32] of the result. \n
4463 /// Bits [47:32] are written to bits [79:64] of the result. \n
4464 /// Bits [63:48] are written to bits [111:96] of the result.
4466 /// A 128-bit vector of [8 x i16].
4467 /// Bits [15:0] are written to bits [31:16] of the result. \n
4468 /// Bits [31:16] are written to bits [63:48] of the result. \n
4469 /// Bits [47:32] are written to bits [95:80] of the result. \n
4470 /// Bits [63:48] are written to bits [127:112] of the result.
4471 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4472 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi16(__m128i __a
,
4474 return (__m128i
)__builtin_shufflevector((__v8hi
)__a
, (__v8hi
)__b
, 0, 8 + 0, 1,
4475 8 + 1, 2, 8 + 2, 3, 8 + 3);
4478 /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
4479 /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4481 /// \headerfile <x86intrin.h>
4483 /// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
4487 /// A 128-bit vector of [4 x i32]. \n
4488 /// Bits [31:0] are written to bits [31:0] of the destination. \n
4489 /// Bits [63:32] are written to bits [95:64] of the destination.
4491 /// A 128-bit vector of [4 x i32]. \n
/// Bits [31:0] are written to bits [63:32] of the destination. \n
4493 /// Bits [63:32] are written to bits [127:96] of the destination.
4494 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4495 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi32(__m128i __a
,
4497 return (__m128i
)__builtin_shufflevector((__v4si
)__a
, (__v4si
)__b
, 0, 4 + 0, 1,
4501 /// Unpacks the low-order 64-bit elements from two 128-bit vectors of
4502 /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4504 /// \headerfile <x86intrin.h>
4506 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
4510 /// A 128-bit vector of [2 x i64]. \n
4511 /// Bits [63:0] are written to bits [63:0] of the destination. \n
4513 /// A 128-bit vector of [2 x i64]. \n
4514 /// Bits [63:0] are written to bits [127:64] of the destination. \n
4515 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4516 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi64(__m128i __a
,
4518 return (__m128i
)__builtin_shufflevector((__v2di
)__a
, (__v2di
)__b
, 0, 2 + 0);
4521 /// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
4524 /// \headerfile <x86intrin.h>
4526 /// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
4529 /// A 128-bit integer vector operand. The lower 64 bits are moved to the
4531 /// \returns A 64-bit integer containing the lower 64 bits of the parameter.
4532 static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_movepi64_pi64(__m128i __a
) {
4533 return (__m64
)__a
[0];
4536 /// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
4539 /// \headerfile <x86intrin.h>
4541 /// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
4545 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4546 /// the operand. The upper 64 bits are assigned zeros.
4547 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_movpi64_epi64(__m64 __a
) {
4548 return __extension__(__m128i
)(__v2di
){(long long)__a
, 0};
4551 /// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
4552 /// integer vector, zeroing the upper bits.
4554 /// \headerfile <x86intrin.h>
4556 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
4559 /// A 128-bit integer vector operand. The lower 64 bits are moved to the
4561 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4562 /// the operand. The upper 64 bits are assigned zeros.
4563 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_move_epi64(__m128i __a
) {
4564 return __builtin_shufflevector((__v2di
)__a
, _mm_setzero_si128(), 0, 2);
4567 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4568 /// [2 x double] and interleaves them into a 128-bit vector of [2 x
4571 /// \headerfile <x86intrin.h>
4573 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
4576 /// A 128-bit vector of [2 x double]. \n
4577 /// Bits [127:64] are written to bits [63:0] of the destination.
4579 /// A 128-bit vector of [2 x double]. \n
4580 /// Bits [127:64] are written to bits [127:64] of the destination.
4581 /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4582 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_unpackhi_pd(__m128d __a
,
4584 return __builtin_shufflevector((__v2df
)__a
, (__v2df
)__b
, 1, 2 + 1);
4587 /// Unpacks the low-order 64-bit elements from two 128-bit vectors
4588 /// of [2 x double] and interleaves them into a 128-bit vector of [2 x
4591 /// \headerfile <x86intrin.h>
4593 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
4596 /// A 128-bit vector of [2 x double]. \n
4597 /// Bits [63:0] are written to bits [63:0] of the destination.
4599 /// A 128-bit vector of [2 x double]. \n
4600 /// Bits [63:0] are written to bits [127:64] of the destination.
4601 /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4602 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_unpacklo_pd(__m128d __a
,
4604 return __builtin_shufflevector((__v2df
)__a
, (__v2df
)__b
, 0, 2 + 0);
4607 /// Extracts the sign bits of the double-precision values in the 128-bit
4608 /// vector of [2 x double], zero-extends the value, and writes it to the
4609 /// low-order bits of the destination.
4611 /// \headerfile <x86intrin.h>
4613 /// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
4616 /// A 128-bit vector of [2 x double] containing the values with sign bits to
4618 /// \returns The sign bits from each of the double-precision elements in \a __a,
4619 /// written to bits [1:0]. The remaining bits are assigned values of zero.
4620 static __inline__
int __DEFAULT_FN_ATTRS
_mm_movemask_pd(__m128d __a
) {
4621 return __builtin_ia32_movmskpd((__v2df
)__a
);
4624 /// Constructs a 128-bit floating-point vector of [2 x double] from two
4625 /// 128-bit vector parameters of [2 x double], using the immediate-value
4626 /// parameter as a specifier.
4628 /// \headerfile <x86intrin.h>
4631 /// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
4634 /// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
4637 /// A 128-bit vector of [2 x double].
4639 /// A 128-bit vector of [2 x double].
4641 /// An 8-bit immediate value. The least significant two bits specify which
4642 /// elements to copy from \a a and \a b: \n
4643 /// Bit[0] = 0: lower element of \a a copied to lower element of result. \n
4644 /// Bit[0] = 1: upper element of \a a copied to lower element of result. \n
4645 /// Bit[1] = 0: lower element of \a b copied to upper element of result. \n
4646 /// Bit[1] = 1: upper element of \a b copied to upper element of result. \n
4647 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE2 macro.
4648 /// <c>_MM_SHUFFLE2(b1, b0)</c> can create a 2-bit mask of the form
4649 /// <c>[b1, b0]</c>.
4650 /// \returns A 128-bit vector of [2 x double] containing the shuffled values.
/* A macro rather than an inline function, presumably so that i reaches the
 * builtin as a compile-time immediate. Both sources are viewed as
 * [2 x double]; the selector is forwarded as int. */
#define _mm_shuffle_pd(a, b, i)                                                \
  ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b),  \
                                  (int)(i)))
4655 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4656 /// floating-point vector of [4 x float].
4658 /// \headerfile <x86intrin.h>
4660 /// This intrinsic has no corresponding instruction.
4663 /// A 128-bit floating-point vector of [2 x double].
4664 /// \returns A 128-bit floating-point vector of [4 x float] containing the same
4665 /// bitwise pattern as the parameter.
4666 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_castpd_ps(__m128d __a
) {
4670 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4673 /// \headerfile <x86intrin.h>
4675 /// This intrinsic has no corresponding instruction.
4678 /// A 128-bit floating-point vector of [2 x double].
4679 /// \returns A 128-bit integer vector containing the same bitwise pattern as the
4681 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_castpd_si128(__m128d __a
) {
4682 return (__m128i
)__a
;
4685 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4686 /// floating-point vector of [2 x double].
4688 /// \headerfile <x86intrin.h>
4690 /// This intrinsic has no corresponding instruction.
4693 /// A 128-bit floating-point vector of [4 x float].
4694 /// \returns A 128-bit floating-point vector of [2 x double] containing the same
4695 /// bitwise pattern as the parameter.
4696 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_castps_pd(__m128 __a
) {
4697 return (__m128d
)__a
;
4700 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4703 /// \headerfile <x86intrin.h>
4705 /// This intrinsic has no corresponding instruction.
4708 /// A 128-bit floating-point vector of [4 x float].
4709 /// \returns A 128-bit integer vector containing the same bitwise pattern as the
4711 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_castps_si128(__m128 __a
) {
4712 return (__m128i
)__a
;
4715 /// Casts a 128-bit integer vector into a 128-bit floating-point vector
4718 /// \headerfile <x86intrin.h>
4720 /// This intrinsic has no corresponding instruction.
4723 /// A 128-bit integer vector.
4724 /// \returns A 128-bit floating-point vector of [4 x float] containing the same
4725 /// bitwise pattern as the parameter.
4726 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_castsi128_ps(__m128i __a
) {
4730 /// Casts a 128-bit integer vector into a 128-bit floating-point vector
4731 /// of [2 x double].
4733 /// \headerfile <x86intrin.h>
4735 /// This intrinsic has no corresponding instruction.
4738 /// A 128-bit integer vector.
4739 /// \returns A 128-bit floating-point vector of [2 x double] containing the same
4740 /// bitwise pattern as the parameter.
4741 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_castsi128_pd(__m128i __a
) {
4742 return (__m128d
)__a
;
/* _mm_pause is only declared here (there is no inline body), so it is given C
 * linkage when the header is compiled as C++. */
#if defined(__cplusplus)
extern "C" {
#endif

/// Indicates that a spin loop is being executed for the purposes of
///    optimizing power consumption during the loop.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PAUSE </c> instruction.
///
void _mm_pause(void);

#if defined(__cplusplus)
} // extern "C"
#endif
/* The attribute shorthands are internal to this header; remove them so they
 * are not visible to code that includes it. */
#undef __DEFAULT_FN_ATTRS_MMX
#undef __DEFAULT_FN_ATTRS
/* Packs two selector bits into one value: x goes to bit 1, y to bit 0. */
#define _MM_SHUFFLE2(x, y) ((y) | ((x) << 1))
/* Denormals-zero mode: the mask selecting the mode field, the two values the
 * field can take, and helpers that read or rewrite it in the control/status
 * word accessed through _mm_getcsr()/_mm_setcsr(). */
#define _MM_DENORMALS_ZERO_MASK (0x0040U)
#define _MM_DENORMALS_ZERO_OFF (0x0000U)
#define _MM_DENORMALS_ZERO_ON (0x0040U)

/* Clears the mode field, then ORs in x; all other bits are written back
 * unchanged. */
#define _MM_SET_DENORMALS_ZERO_MODE(x)                                         \
  (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
4775 #endif /* __EMMINTRIN_H */