/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
/* SSE2 intrinsics exist only on x86; fail early on any other target. */
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif

#include <xmmintrin.h>
/* Public 128-bit vector types, aligned to 16 bytes. */
typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));

/* Variants of the public types with the alignment requirement relaxed to a
 * single byte. */
typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __aligned__(1)));

/* Internal element-typed views of a 16-byte vector. */
typedef double __v2df __attribute__((__vector_size__(16)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

/* Unsigned element types. */
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
typedef unsigned char __v16qu __attribute__((__vector_size__(16)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v16qs __attribute__((__vector_size__(16)));
/* Define the default attributes for the functions in this file.
 * __DEFAULT_FN_ATTRS targets "sse2" with a minimum vector width of 128;
 * __DEFAULT_FN_ATTRS_MMX targets "mmx,sse2" with a minimum vector width of 64.
 */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("sse2"),           \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_MMX                                                 \
  __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse2"),       \
                 __min_vector_width__(64)))
49 /// Adds lower double-precision values in both operands and returns the
50 /// sum in the lower 64 bits of the result. The upper 64 bits of the result
51 /// are copied from the upper double-precision value of the first operand.
53 /// \headerfile <x86intrin.h>
55 /// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
58 /// A 128-bit vector of [2 x double] containing one of the source operands.
60 /// A 128-bit vector of [2 x double] containing one of the source operands.
61 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
62 /// sum of the lower 64 bits of both operands. The upper 64 bits are copied
63 /// from the upper 64 bits of the first source operand.
64 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_add_sd(__m128d __a
,
70 /// Adds two 128-bit vectors of [2 x double].
72 /// \headerfile <x86intrin.h>
74 /// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
77 /// A 128-bit vector of [2 x double] containing one of the source operands.
79 /// A 128-bit vector of [2 x double] containing one of the source operands.
80 /// \returns A 128-bit vector of [2 x double] containing the sums of both
82 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_add_pd(__m128d __a
,
84 return (__m128d
)((__v2df
)__a
+ (__v2df
)__b
);
87 /// Subtracts the lower double-precision value of the second operand
88 /// from the lower double-precision value of the first operand and returns
89 /// the difference in the lower 64 bits of the result. The upper 64 bits of
90 /// the result are copied from the upper double-precision value of the first
93 /// \headerfile <x86intrin.h>
95 /// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
98 /// A 128-bit vector of [2 x double] containing the minuend.
100 /// A 128-bit vector of [2 x double] containing the subtrahend.
101 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
102 /// difference of the lower 64 bits of both operands. The upper 64 bits are
103 /// copied from the upper 64 bits of the first source operand.
104 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sub_sd(__m128d __a
,
110 /// Subtracts two 128-bit vectors of [2 x double].
112 /// \headerfile <x86intrin.h>
114 /// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
117 /// A 128-bit vector of [2 x double] containing the minuend.
119 /// A 128-bit vector of [2 x double] containing the subtrahend.
120 /// \returns A 128-bit vector of [2 x double] containing the differences between
122 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sub_pd(__m128d __a
,
124 return (__m128d
)((__v2df
)__a
- (__v2df
)__b
);
127 /// Multiplies lower double-precision values in both operands and returns
128 /// the product in the lower 64 bits of the result. The upper 64 bits of the
129 /// result are copied from the upper double-precision value of the first
132 /// \headerfile <x86intrin.h>
134 /// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
137 /// A 128-bit vector of [2 x double] containing one of the source operands.
139 /// A 128-bit vector of [2 x double] containing one of the source operands.
140 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
141 /// product of the lower 64 bits of both operands. The upper 64 bits are
142 /// copied from the upper 64 bits of the first source operand.
143 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mul_sd(__m128d __a
,
149 /// Multiplies two 128-bit vectors of [2 x double].
151 /// \headerfile <x86intrin.h>
153 /// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
156 /// A 128-bit vector of [2 x double] containing one of the operands.
158 /// A 128-bit vector of [2 x double] containing one of the operands.
159 /// \returns A 128-bit vector of [2 x double] containing the products of both
161 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mul_pd(__m128d __a
,
163 return (__m128d
)((__v2df
)__a
* (__v2df
)__b
);
166 /// Divides the lower double-precision value of the first operand by the
167 /// lower double-precision value of the second operand and returns the
168 /// quotient in the lower 64 bits of the result. The upper 64 bits of the
169 /// result are copied from the upper double-precision value of the first
172 /// \headerfile <x86intrin.h>
174 /// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
177 /// A 128-bit vector of [2 x double] containing the dividend.
179 /// A 128-bit vector of [2 x double] containing divisor.
180 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
181 /// quotient of the lower 64 bits of both operands. The upper 64 bits are
182 /// copied from the upper 64 bits of the first source operand.
183 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_div_sd(__m128d __a
,
189 /// Performs an element-by-element division of two 128-bit vectors of
192 /// \headerfile <x86intrin.h>
194 /// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
197 /// A 128-bit vector of [2 x double] containing the dividend.
199 /// A 128-bit vector of [2 x double] containing the divisor.
200 /// \returns A 128-bit vector of [2 x double] containing the quotients of both
202 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_div_pd(__m128d __a
,
204 return (__m128d
)((__v2df
)__a
/ (__v2df
)__b
);
207 /// Calculates the square root of the lower double-precision value of
208 /// the second operand and returns it in the lower 64 bits of the result.
209 /// The upper 64 bits of the result are copied from the upper
210 /// double-precision value of the first operand.
212 /// \headerfile <x86intrin.h>
214 /// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
217 /// A 128-bit vector of [2 x double] containing one of the operands. The
218 /// upper 64 bits of this operand are copied to the upper 64 bits of the
221 /// A 128-bit vector of [2 x double] containing one of the operands. The
222 /// square root is calculated using the lower 64 bits of this operand.
223 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
224 /// square root of the lower 64 bits of operand \a __b, and whose upper 64
225 /// bits are copied from the upper 64 bits of operand \a __a.
226 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sqrt_sd(__m128d __a
,
228 __m128d __c
= __builtin_ia32_sqrtsd((__v2df
)__b
);
229 return __extension__(__m128d
){__c
[0], __a
[1]};
232 /// Calculates the square root of the each of two values stored in a
233 /// 128-bit vector of [2 x double].
235 /// \headerfile <x86intrin.h>
237 /// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
240 /// A 128-bit vector of [2 x double].
241 /// \returns A 128-bit vector of [2 x double] containing the square roots of the
242 /// values in the operand.
243 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sqrt_pd(__m128d __a
) {
244 return __builtin_ia32_sqrtpd((__v2df
)__a
);
247 /// Compares lower 64-bit double-precision values of both operands, and
248 /// returns the lesser of the pair of values in the lower 64-bits of the
249 /// result. The upper 64 bits of the result are copied from the upper
250 /// double-precision value of the first operand.
252 /// \headerfile <x86intrin.h>
254 /// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
257 /// A 128-bit vector of [2 x double] containing one of the operands. The
258 /// lower 64 bits of this operand are used in the comparison.
260 /// A 128-bit vector of [2 x double] containing one of the operands. The
261 /// lower 64 bits of this operand are used in the comparison.
262 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
263 /// minimum value between both operands. The upper 64 bits are copied from
264 /// the upper 64 bits of the first source operand.
265 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_min_sd(__m128d __a
,
267 return __builtin_ia32_minsd((__v2df
)__a
, (__v2df
)__b
);
270 /// Performs element-by-element comparison of the two 128-bit vectors of
271 /// [2 x double] and returns the vector containing the lesser of each pair of
274 /// \headerfile <x86intrin.h>
276 /// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
279 /// A 128-bit vector of [2 x double] containing one of the operands.
281 /// A 128-bit vector of [2 x double] containing one of the operands.
282 /// \returns A 128-bit vector of [2 x double] containing the minimum values
283 /// between both operands.
284 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_min_pd(__m128d __a
,
286 return __builtin_ia32_minpd((__v2df
)__a
, (__v2df
)__b
);
289 /// Compares lower 64-bit double-precision values of both operands, and
290 /// returns the greater of the pair of values in the lower 64-bits of the
291 /// result. The upper 64 bits of the result are copied from the upper
292 /// double-precision value of the first operand.
294 /// \headerfile <x86intrin.h>
296 /// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
299 /// A 128-bit vector of [2 x double] containing one of the operands. The
300 /// lower 64 bits of this operand are used in the comparison.
302 /// A 128-bit vector of [2 x double] containing one of the operands. The
303 /// lower 64 bits of this operand are used in the comparison.
304 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
305 /// maximum value between both operands. The upper 64 bits are copied from
306 /// the upper 64 bits of the first source operand.
307 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_max_sd(__m128d __a
,
309 return __builtin_ia32_maxsd((__v2df
)__a
, (__v2df
)__b
);
312 /// Performs element-by-element comparison of the two 128-bit vectors of
313 /// [2 x double] and returns the vector containing the greater of each pair
316 /// \headerfile <x86intrin.h>
318 /// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
321 /// A 128-bit vector of [2 x double] containing one of the operands.
323 /// A 128-bit vector of [2 x double] containing one of the operands.
324 /// \returns A 128-bit vector of [2 x double] containing the maximum values
325 /// between both operands.
326 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_max_pd(__m128d __a
,
328 return __builtin_ia32_maxpd((__v2df
)__a
, (__v2df
)__b
);
331 /// Performs a bitwise AND of two 128-bit vectors of [2 x double].
333 /// \headerfile <x86intrin.h>
335 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
338 /// A 128-bit vector of [2 x double] containing one of the source operands.
340 /// A 128-bit vector of [2 x double] containing one of the source operands.
341 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
342 /// values between both operands.
343 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_and_pd(__m128d __a
,
345 return (__m128d
)((__v2du
)__a
& (__v2du
)__b
);
348 /// Performs a bitwise AND of two 128-bit vectors of [2 x double], using
349 /// the one's complement of the values contained in the first source operand.
351 /// \headerfile <x86intrin.h>
353 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
356 /// A 128-bit vector of [2 x double] containing the left source operand. The
357 /// one's complement of this value is used in the bitwise AND.
359 /// A 128-bit vector of [2 x double] containing the right source operand.
360 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
361 /// values in the second operand and the one's complement of the first
363 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_andnot_pd(__m128d __a
,
365 return (__m128d
)(~(__v2du
)__a
& (__v2du
)__b
);
368 /// Performs a bitwise OR of two 128-bit vectors of [2 x double].
370 /// \headerfile <x86intrin.h>
372 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
375 /// A 128-bit vector of [2 x double] containing one of the source operands.
377 /// A 128-bit vector of [2 x double] containing one of the source operands.
378 /// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
379 /// values between both operands.
380 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_or_pd(__m128d __a
,
382 return (__m128d
)((__v2du
)__a
| (__v2du
)__b
);
385 /// Performs a bitwise XOR of two 128-bit vectors of [2 x double].
387 /// \headerfile <x86intrin.h>
389 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
392 /// A 128-bit vector of [2 x double] containing one of the source operands.
394 /// A 128-bit vector of [2 x double] containing one of the source operands.
395 /// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
396 /// values between both operands.
397 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_xor_pd(__m128d __a
,
399 return (__m128d
)((__v2du
)__a
^ (__v2du
)__b
);
402 /// Compares each of the corresponding double-precision values of the
403 /// 128-bit vectors of [2 x double] for equality. Each comparison yields 0x0
404 /// for false, 0xFFFFFFFFFFFFFFFF for true.
406 /// \headerfile <x86intrin.h>
408 /// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
411 /// A 128-bit vector of [2 x double].
413 /// A 128-bit vector of [2 x double].
414 /// \returns A 128-bit vector containing the comparison results.
415 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpeq_pd(__m128d __a
,
417 return (__m128d
)__builtin_ia32_cmpeqpd((__v2df
)__a
, (__v2df
)__b
);
420 /// Compares each of the corresponding double-precision values of the
421 /// 128-bit vectors of [2 x double] to determine if the values in the first
422 /// operand are less than those in the second operand. Each comparison
423 /// yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
425 /// \headerfile <x86intrin.h>
427 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
430 /// A 128-bit vector of [2 x double].
432 /// A 128-bit vector of [2 x double].
433 /// \returns A 128-bit vector containing the comparison results.
434 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmplt_pd(__m128d __a
,
436 return (__m128d
)__builtin_ia32_cmpltpd((__v2df
)__a
, (__v2df
)__b
);
439 /// Compares each of the corresponding double-precision values of the
440 /// 128-bit vectors of [2 x double] to determine if the values in the first
441 /// operand are less than or equal to those in the second operand.
443 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
445 /// \headerfile <x86intrin.h>
447 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
450 /// A 128-bit vector of [2 x double].
452 /// A 128-bit vector of [2 x double].
453 /// \returns A 128-bit vector containing the comparison results.
454 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmple_pd(__m128d __a
,
456 return (__m128d
)__builtin_ia32_cmplepd((__v2df
)__a
, (__v2df
)__b
);
459 /// Compares each of the corresponding double-precision values of the
460 /// 128-bit vectors of [2 x double] to determine if the values in the first
461 /// operand are greater than those in the second operand.
463 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
465 /// \headerfile <x86intrin.h>
467 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
470 /// A 128-bit vector of [2 x double].
472 /// A 128-bit vector of [2 x double].
473 /// \returns A 128-bit vector containing the comparison results.
474 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpgt_pd(__m128d __a
,
476 return (__m128d
)__builtin_ia32_cmpltpd((__v2df
)__b
, (__v2df
)__a
);
479 /// Compares each of the corresponding double-precision values of the
480 /// 128-bit vectors of [2 x double] to determine if the values in the first
481 /// operand are greater than or equal to those in the second operand.
483 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
485 /// \headerfile <x86intrin.h>
487 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
490 /// A 128-bit vector of [2 x double].
492 /// A 128-bit vector of [2 x double].
493 /// \returns A 128-bit vector containing the comparison results.
494 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpge_pd(__m128d __a
,
496 return (__m128d
)__builtin_ia32_cmplepd((__v2df
)__b
, (__v2df
)__a
);
499 /// Compares each of the corresponding double-precision values of the
500 /// 128-bit vectors of [2 x double] to determine if the values in the first
501 /// operand are ordered with respect to those in the second operand.
503 /// A pair of double-precision values are "ordered" with respect to each
504 /// other if neither value is a NaN. Each comparison yields 0x0 for false,
505 /// 0xFFFFFFFFFFFFFFFF for true.
507 /// \headerfile <x86intrin.h>
509 /// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
512 /// A 128-bit vector of [2 x double].
514 /// A 128-bit vector of [2 x double].
515 /// \returns A 128-bit vector containing the comparison results.
516 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpord_pd(__m128d __a
,
518 return (__m128d
)__builtin_ia32_cmpordpd((__v2df
)__a
, (__v2df
)__b
);
521 /// Compares each of the corresponding double-precision values of the
522 /// 128-bit vectors of [2 x double] to determine if the values in the first
523 /// operand are unordered with respect to those in the second operand.
525 /// A pair of double-precision values are "unordered" with respect to each
526 /// other if one or both values are NaN. Each comparison yields 0x0 for
527 /// false, 0xFFFFFFFFFFFFFFFF for true.
529 /// \headerfile <x86intrin.h>
531 /// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
535 /// A 128-bit vector of [2 x double].
537 /// A 128-bit vector of [2 x double].
538 /// \returns A 128-bit vector containing the comparison results.
539 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpunord_pd(__m128d __a
,
541 return (__m128d
)__builtin_ia32_cmpunordpd((__v2df
)__a
, (__v2df
)__b
);
544 /// Compares each of the corresponding double-precision values of the
545 /// 128-bit vectors of [2 x double] to determine if the values in the first
546 /// operand are unequal to those in the second operand.
548 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
550 /// \headerfile <x86intrin.h>
552 /// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
555 /// A 128-bit vector of [2 x double].
557 /// A 128-bit vector of [2 x double].
558 /// \returns A 128-bit vector containing the comparison results.
559 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpneq_pd(__m128d __a
,
561 return (__m128d
)__builtin_ia32_cmpneqpd((__v2df
)__a
, (__v2df
)__b
);
564 /// Compares each of the corresponding double-precision values of the
565 /// 128-bit vectors of [2 x double] to determine if the values in the first
566 /// operand are not less than those in the second operand.
568 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
570 /// \headerfile <x86intrin.h>
572 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
575 /// A 128-bit vector of [2 x double].
577 /// A 128-bit vector of [2 x double].
578 /// \returns A 128-bit vector containing the comparison results.
579 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnlt_pd(__m128d __a
,
581 return (__m128d
)__builtin_ia32_cmpnltpd((__v2df
)__a
, (__v2df
)__b
);
584 /// Compares each of the corresponding double-precision values of the
585 /// 128-bit vectors of [2 x double] to determine if the values in the first
586 /// operand are not less than or equal to those in the second operand.
588 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
590 /// \headerfile <x86intrin.h>
592 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
595 /// A 128-bit vector of [2 x double].
597 /// A 128-bit vector of [2 x double].
598 /// \returns A 128-bit vector containing the comparison results.
599 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnle_pd(__m128d __a
,
601 return (__m128d
)__builtin_ia32_cmpnlepd((__v2df
)__a
, (__v2df
)__b
);
604 /// Compares each of the corresponding double-precision values of the
605 /// 128-bit vectors of [2 x double] to determine if the values in the first
606 /// operand are not greater than those in the second operand.
608 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
610 /// \headerfile <x86intrin.h>
612 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
615 /// A 128-bit vector of [2 x double].
617 /// A 128-bit vector of [2 x double].
618 /// \returns A 128-bit vector containing the comparison results.
619 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpngt_pd(__m128d __a
,
621 return (__m128d
)__builtin_ia32_cmpnltpd((__v2df
)__b
, (__v2df
)__a
);
624 /// Compares each of the corresponding double-precision values of the
625 /// 128-bit vectors of [2 x double] to determine if the values in the first
626 /// operand are not greater than or equal to those in the second operand.
628 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
630 /// \headerfile <x86intrin.h>
632 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
635 /// A 128-bit vector of [2 x double].
637 /// A 128-bit vector of [2 x double].
638 /// \returns A 128-bit vector containing the comparison results.
639 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnge_pd(__m128d __a
,
641 return (__m128d
)__builtin_ia32_cmpnlepd((__v2df
)__b
, (__v2df
)__a
);
644 /// Compares the lower double-precision floating-point values in each of
645 /// the two 128-bit floating-point vectors of [2 x double] for equality.
647 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
649 /// \headerfile <x86intrin.h>
651 /// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
654 /// A 128-bit vector of [2 x double]. The lower double-precision value is
655 /// compared to the lower double-precision value of \a __b.
657 /// A 128-bit vector of [2 x double]. The lower double-precision value is
658 /// compared to the lower double-precision value of \a __a.
659 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
660 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
661 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpeq_sd(__m128d __a
,
663 return (__m128d
)__builtin_ia32_cmpeqsd((__v2df
)__a
, (__v2df
)__b
);
666 /// Compares the lower double-precision floating-point values in each of
667 /// the two 128-bit floating-point vectors of [2 x double] to determine if
668 /// the value in the first parameter is less than the corresponding value in
669 /// the second parameter.
671 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
673 /// \headerfile <x86intrin.h>
675 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
678 /// A 128-bit vector of [2 x double]. The lower double-precision value is
679 /// compared to the lower double-precision value of \a __b.
681 /// A 128-bit vector of [2 x double]. The lower double-precision value is
682 /// compared to the lower double-precision value of \a __a.
683 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
684 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
685 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmplt_sd(__m128d __a
,
687 return (__m128d
)__builtin_ia32_cmpltsd((__v2df
)__a
, (__v2df
)__b
);
690 /// Compares the lower double-precision floating-point values in each of
691 /// the two 128-bit floating-point vectors of [2 x double] to determine if
692 /// the value in the first parameter is less than or equal to the
693 /// corresponding value in the second parameter.
695 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
697 /// \headerfile <x86intrin.h>
699 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
702 /// A 128-bit vector of [2 x double]. The lower double-precision value is
703 /// compared to the lower double-precision value of \a __b.
705 /// A 128-bit vector of [2 x double]. The lower double-precision value is
706 /// compared to the lower double-precision value of \a __a.
707 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
708 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
709 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmple_sd(__m128d __a
,
711 return (__m128d
)__builtin_ia32_cmplesd((__v2df
)__a
, (__v2df
)__b
);
714 /// Compares the lower double-precision floating-point values in each of
715 /// the two 128-bit floating-point vectors of [2 x double] to determine if
716 /// the value in the first parameter is greater than the corresponding value
717 /// in the second parameter.
719 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
721 /// \headerfile <x86intrin.h>
723 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
726 /// A 128-bit vector of [2 x double]. The lower double-precision value is
727 /// compared to the lower double-precision value of \a __b.
729 /// A 128-bit vector of [2 x double]. The lower double-precision value is
730 /// compared to the lower double-precision value of \a __a.
731 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
732 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
733 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpgt_sd(__m128d __a
,
735 __m128d __c
= __builtin_ia32_cmpltsd((__v2df
)__b
, (__v2df
)__a
);
736 return __extension__(__m128d
){__c
[0], __a
[1]};
739 /// Compares the lower double-precision floating-point values in each of
740 /// the two 128-bit floating-point vectors of [2 x double] to determine if
741 /// the value in the first parameter is greater than or equal to the
742 /// corresponding value in the second parameter.
744 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
746 /// \headerfile <x86intrin.h>
748 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
751 /// A 128-bit vector of [2 x double]. The lower double-precision value is
752 /// compared to the lower double-precision value of \a __b.
754 /// A 128-bit vector of [2 x double]. The lower double-precision value is
755 /// compared to the lower double-precision value of \a __a.
756 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
757 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
758 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpge_sd(__m128d __a
,
760 __m128d __c
= __builtin_ia32_cmplesd((__v2df
)__b
, (__v2df
)__a
);
761 return __extension__(__m128d
){__c
[0], __a
[1]};
764 /// Compares the lower double-precision floating-point values in each of
765 /// the two 128-bit floating-point vectors of [2 x double] to determine if
766 /// the value in the first parameter is "ordered" with respect to the
767 /// corresponding value in the second parameter.
769 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
770 /// of double-precision values are "ordered" with respect to each other if
771 /// neither value is a NaN.
773 /// \headerfile <x86intrin.h>
775 /// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
778 /// A 128-bit vector of [2 x double]. The lower double-precision value is
779 /// compared to the lower double-precision value of \a __b.
781 /// A 128-bit vector of [2 x double]. The lower double-precision value is
782 /// compared to the lower double-precision value of \a __a.
783 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
784 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
785 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpord_sd(__m128d __a
,
787 return (__m128d
)__builtin_ia32_cmpordsd((__v2df
)__a
, (__v2df
)__b
);
790 /// Compares the lower double-precision floating-point values in each of
791 /// the two 128-bit floating-point vectors of [2 x double] to determine if
792 /// the value in the first parameter is "unordered" with respect to the
793 /// corresponding value in the second parameter.
795 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
796 /// of double-precision values are "unordered" with respect to each other if
797 /// one or both values are NaN.
799 /// \headerfile <x86intrin.h>
801 /// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
805 /// A 128-bit vector of [2 x double]. The lower double-precision value is
806 /// compared to the lower double-precision value of \a __b.
808 /// A 128-bit vector of [2 x double]. The lower double-precision value is
809 /// compared to the lower double-precision value of \a __a.
810 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
811 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
812 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpunord_sd(__m128d __a
,
814 return (__m128d
)__builtin_ia32_cmpunordsd((__v2df
)__a
, (__v2df
)__b
);
817 /// Compares the lower double-precision floating-point values in each of
818 /// the two 128-bit floating-point vectors of [2 x double] to determine if
819 /// the value in the first parameter is unequal to the corresponding value in
820 /// the second parameter.
822 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
824 /// \headerfile <x86intrin.h>
826 /// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
829 /// A 128-bit vector of [2 x double]. The lower double-precision value is
830 /// compared to the lower double-precision value of \a __b.
832 /// A 128-bit vector of [2 x double]. The lower double-precision value is
833 /// compared to the lower double-precision value of \a __a.
834 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
835 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
836 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpneq_sd(__m128d __a
,
838 return (__m128d
)__builtin_ia32_cmpneqsd((__v2df
)__a
, (__v2df
)__b
);
841 /// Compares the lower double-precision floating-point values in each of
842 /// the two 128-bit floating-point vectors of [2 x double] to determine if
843 /// the value in the first parameter is not less than the corresponding
844 /// value in the second parameter.
846 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
848 /// \headerfile <x86intrin.h>
850 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
853 /// A 128-bit vector of [2 x double]. The lower double-precision value is
854 /// compared to the lower double-precision value of \a __b.
856 /// A 128-bit vector of [2 x double]. The lower double-precision value is
857 /// compared to the lower double-precision value of \a __a.
858 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
859 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
860 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnlt_sd(__m128d __a
,
862 return (__m128d
)__builtin_ia32_cmpnltsd((__v2df
)__a
, (__v2df
)__b
);
865 /// Compares the lower double-precision floating-point values in each of
866 /// the two 128-bit floating-point vectors of [2 x double] to determine if
867 /// the value in the first parameter is not less than or equal to the
868 /// corresponding value in the second parameter.
870 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
872 /// \headerfile <x86intrin.h>
874 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
877 /// A 128-bit vector of [2 x double]. The lower double-precision value is
878 /// compared to the lower double-precision value of \a __b.
880 /// A 128-bit vector of [2 x double]. The lower double-precision value is
881 /// compared to the lower double-precision value of \a __a.
882 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
883 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
884 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnle_sd(__m128d __a
,
886 return (__m128d
)__builtin_ia32_cmpnlesd((__v2df
)__a
, (__v2df
)__b
);
889 /// Compares the lower double-precision floating-point values in each of
890 /// the two 128-bit floating-point vectors of [2 x double] to determine if
891 /// the value in the first parameter is not greater than the corresponding
892 /// value in the second parameter.
894 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
896 /// \headerfile <x86intrin.h>
898 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
901 /// A 128-bit vector of [2 x double]. The lower double-precision value is
902 /// compared to the lower double-precision value of \a __b.
904 /// A 128-bit vector of [2 x double]. The lower double-precision value is
905 /// compared to the lower double-precision value of \a __a.
906 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
907 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
908 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpngt_sd(__m128d __a
,
910 __m128d __c
= __builtin_ia32_cmpnltsd((__v2df
)__b
, (__v2df
)__a
);
911 return __extension__(__m128d
){__c
[0], __a
[1]};
914 /// Compares the lower double-precision floating-point values in each of
915 /// the two 128-bit floating-point vectors of [2 x double] to determine if
916 /// the value in the first parameter is not greater than or equal to the
917 /// corresponding value in the second parameter.
919 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
921 /// \headerfile <x86intrin.h>
923 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
926 /// A 128-bit vector of [2 x double]. The lower double-precision value is
927 /// compared to the lower double-precision value of \a __b.
929 /// A 128-bit vector of [2 x double]. The lower double-precision value is
930 /// compared to the lower double-precision value of \a __a.
931 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
932 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
933 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnge_sd(__m128d __a
,
935 __m128d __c
= __builtin_ia32_cmpnlesd((__v2df
)__b
, (__v2df
)__a
);
936 return __extension__(__m128d
){__c
[0], __a
[1]};
939 /// Compares the lower double-precision floating-point values in each of
940 /// the two 128-bit floating-point vectors of [2 x double] for equality.
942 /// The comparison yields 0 for false, 1 for true. If either of the two
943 /// lower double-precision values is NaN, 0 is returned.
945 /// \headerfile <x86intrin.h>
947 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
950 /// A 128-bit vector of [2 x double]. The lower double-precision value is
951 /// compared to the lower double-precision value of \a __b.
953 /// A 128-bit vector of [2 x double]. The lower double-precision value is
954 /// compared to the lower double-precision value of \a __a.
955 /// \returns An integer containing the comparison results. If either of the two
956 /// lower double-precision values is NaN, 0 is returned.
957 static __inline__
int __DEFAULT_FN_ATTRS
_mm_comieq_sd(__m128d __a
,
959 return __builtin_ia32_comisdeq((__v2df
)__a
, (__v2df
)__b
);
962 /// Compares the lower double-precision floating-point values in each of
963 /// the two 128-bit floating-point vectors of [2 x double] to determine if
964 /// the value in the first parameter is less than the corresponding value in
965 /// the second parameter.
967 /// The comparison yields 0 for false, 1 for true. If either of the two
968 /// lower double-precision values is NaN, 0 is returned.
970 /// \headerfile <x86intrin.h>
972 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
975 /// A 128-bit vector of [2 x double]. The lower double-precision value is
976 /// compared to the lower double-precision value of \a __b.
978 /// A 128-bit vector of [2 x double]. The lower double-precision value is
979 /// compared to the lower double-precision value of \a __a.
980 /// \returns An integer containing the comparison results. If either of the two
981 /// lower double-precision values is NaN, 0 is returned.
982 static __inline__
int __DEFAULT_FN_ATTRS
_mm_comilt_sd(__m128d __a
,
984 return __builtin_ia32_comisdlt((__v2df
)__a
, (__v2df
)__b
);
987 /// Compares the lower double-precision floating-point values in each of
988 /// the two 128-bit floating-point vectors of [2 x double] to determine if
989 /// the value in the first parameter is less than or equal to the
990 /// corresponding value in the second parameter.
992 /// The comparison yields 0 for false, 1 for true. If either of the two
993 /// lower double-precision values is NaN, 0 is returned.
995 /// \headerfile <x86intrin.h>
997 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1000 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1001 /// compared to the lower double-precision value of \a __b.
1003 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1004 /// compared to the lower double-precision value of \a __a.
1005 /// \returns An integer containing the comparison results. If either of the two
1006 /// lower double-precision values is NaN, 0 is returned.
1007 static __inline__
int __DEFAULT_FN_ATTRS
_mm_comile_sd(__m128d __a
,
1009 return __builtin_ia32_comisdle((__v2df
)__a
, (__v2df
)__b
);
1012 /// Compares the lower double-precision floating-point values in each of
1013 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1014 /// the value in the first parameter is greater than the corresponding value
1015 /// in the second parameter.
1017 /// The comparison yields 0 for false, 1 for true. If either of the two
1018 /// lower double-precision values is NaN, 0 is returned.
1020 /// \headerfile <x86intrin.h>
1022 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1025 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1026 /// compared to the lower double-precision value of \a __b.
1028 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1029 /// compared to the lower double-precision value of \a __a.
1030 /// \returns An integer containing the comparison results. If either of the two
1031 /// lower double-precision values is NaN, 0 is returned.
1032 static __inline__
int __DEFAULT_FN_ATTRS
_mm_comigt_sd(__m128d __a
,
1034 return __builtin_ia32_comisdgt((__v2df
)__a
, (__v2df
)__b
);
1037 /// Compares the lower double-precision floating-point values in each of
1038 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1039 /// the value in the first parameter is greater than or equal to the
1040 /// corresponding value in the second parameter.
1042 /// The comparison yields 0 for false, 1 for true. If either of the two
1043 /// lower double-precision values is NaN, 0 is returned.
1045 /// \headerfile <x86intrin.h>
1047 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1050 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1051 /// compared to the lower double-precision value of \a __b.
1053 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1054 /// compared to the lower double-precision value of \a __a.
1055 /// \returns An integer containing the comparison results. If either of the two
1056 /// lower double-precision values is NaN, 0 is returned.
1057 static __inline__
int __DEFAULT_FN_ATTRS
_mm_comige_sd(__m128d __a
,
1059 return __builtin_ia32_comisdge((__v2df
)__a
, (__v2df
)__b
);
1062 /// Compares the lower double-precision floating-point values in each of
1063 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1064 /// the value in the first parameter is unequal to the corresponding value in
1065 /// the second parameter.
1067 /// The comparison yields 0 for false, 1 for true. If either of the two
1068 /// lower double-precision values is NaN, 1 is returned.
1070 /// \headerfile <x86intrin.h>
1072 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1075 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1076 /// compared to the lower double-precision value of \a __b.
1078 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1079 /// compared to the lower double-precision value of \a __a.
1080 /// \returns An integer containing the comparison results. If either of the two
1081 /// lower double-precision values is NaN, 1 is returned.
1082 static __inline__
int __DEFAULT_FN_ATTRS
_mm_comineq_sd(__m128d __a
,
1084 return __builtin_ia32_comisdneq((__v2df
)__a
, (__v2df
)__b
);
1087 /// Compares the lower double-precision floating-point values in each of
1088 /// the two 128-bit floating-point vectors of [2 x double] for equality. The
1089 /// comparison yields 0 for false, 1 for true.
1091 /// If either of the two lower double-precision values is NaN, 0 is returned.
1093 /// \headerfile <x86intrin.h>
1095 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1098 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1099 /// compared to the lower double-precision value of \a __b.
1101 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1102 /// compared to the lower double-precision value of \a __a.
1103 /// \returns An integer containing the comparison results. If either of the two
1104 /// lower double-precision values is NaN, 0 is returned.
1105 static __inline__
int __DEFAULT_FN_ATTRS
_mm_ucomieq_sd(__m128d __a
,
1107 return __builtin_ia32_ucomisdeq((__v2df
)__a
, (__v2df
)__b
);
1110 /// Compares the lower double-precision floating-point values in each of
1111 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1112 /// the value in the first parameter is less than the corresponding value in
1113 /// the second parameter.
1115 /// The comparison yields 0 for false, 1 for true. If either of the two lower
1116 /// double-precision values is NaN, 0 is returned.
1118 /// \headerfile <x86intrin.h>
1120 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1123 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1124 /// compared to the lower double-precision value of \a __b.
1126 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1127 /// compared to the lower double-precision value of \a __a.
1128 /// \returns An integer containing the comparison results. If either of the two
1129 /// lower double-precision values is NaN, 0 is returned.
1130 static __inline__
int __DEFAULT_FN_ATTRS
_mm_ucomilt_sd(__m128d __a
,
1132 return __builtin_ia32_ucomisdlt((__v2df
)__a
, (__v2df
)__b
);
1135 /// Compares the lower double-precision floating-point values in each of
1136 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1137 /// the value in the first parameter is less than or equal to the
1138 /// corresponding value in the second parameter.
1140 /// The comparison yields 0 for false, 1 for true. If either of the two lower
1141 /// double-precision values is NaN, 0 is returned.
1143 /// \headerfile <x86intrin.h>
1145 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1148 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1149 /// compared to the lower double-precision value of \a __b.
1151 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1152 /// compared to the lower double-precision value of \a __a.
1153 /// \returns An integer containing the comparison results. If either of the two
1154 /// lower double-precision values is NaN, 0 is returned.
1155 static __inline__
int __DEFAULT_FN_ATTRS
_mm_ucomile_sd(__m128d __a
,
1157 return __builtin_ia32_ucomisdle((__v2df
)__a
, (__v2df
)__b
);
1160 /// Compares the lower double-precision floating-point values in each of
1161 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1162 /// the value in the first parameter is greater than the corresponding value
1163 /// in the second parameter.
1165 /// The comparison yields 0 for false, 1 for true. If either of the two lower
1166 /// double-precision values is NaN, 0 is returned.
1168 /// \headerfile <x86intrin.h>
1170 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1173 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1174 /// compared to the lower double-precision value of \a __b.
1176 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1177 /// compared to the lower double-precision value of \a __a.
1178 /// \returns An integer containing the comparison results. If either of the two
1179 /// lower double-precision values is NaN, 0 is returned.
1180 static __inline__
int __DEFAULT_FN_ATTRS
_mm_ucomigt_sd(__m128d __a
,
1182 return __builtin_ia32_ucomisdgt((__v2df
)__a
, (__v2df
)__b
);
1185 /// Compares the lower double-precision floating-point values in each of
1186 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1187 /// the value in the first parameter is greater than or equal to the
1188 /// corresponding value in the second parameter.
1190 /// The comparison yields 0 for false, 1 for true. If either of the two
1191 /// lower double-precision values is NaN, 0 is returned.
1193 /// \headerfile <x86intrin.h>
1195 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1198 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1199 /// compared to the lower double-precision value of \a __b.
1201 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1202 /// compared to the lower double-precision value of \a __a.
1203 /// \returns An integer containing the comparison results. If either of the two
1204 /// lower double-precision values is NaN, 0 is returned.
1205 static __inline__
int __DEFAULT_FN_ATTRS
_mm_ucomige_sd(__m128d __a
,
1207 return __builtin_ia32_ucomisdge((__v2df
)__a
, (__v2df
)__b
);
1210 /// Compares the lower double-precision floating-point values in each of
1211 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1212 /// the value in the first parameter is unequal to the corresponding value in
1213 /// the second parameter.
1215 /// The comparison yields 0 for false, 1 for true. If either of the two lower
1216 /// double-precision values is NaN, 1 is returned.
1218 /// \headerfile <x86intrin.h>
1220 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1223 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1224 /// compared to the lower double-precision value of \a __b.
1226 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1227 /// compared to the lower double-precision value of \a __a.
1228 /// \returns An integer containing the comparison result. If either of the two
1229 /// lower double-precision values is NaN, 1 is returned.
1230 static __inline__
int __DEFAULT_FN_ATTRS
_mm_ucomineq_sd(__m128d __a
,
1232 return __builtin_ia32_ucomisdneq((__v2df
)__a
, (__v2df
)__b
);
1235 /// Converts the two double-precision floating-point elements of a
1236 /// 128-bit vector of [2 x double] into two single-precision floating-point
1237 /// values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
1238 /// The upper 64 bits of the result vector are set to zero.
1240 /// \headerfile <x86intrin.h>
1242 /// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
1245 /// A 128-bit vector of [2 x double].
1246 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1247 /// converted values. The upper 64 bits are set to zero.
1248 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvtpd_ps(__m128d __a
) {
1249 return __builtin_ia32_cvtpd2ps((__v2df
)__a
);
1252 /// Converts the lower two single-precision floating-point elements of a
1253 /// 128-bit vector of [4 x float] into two double-precision floating-point
1254 /// values, returned in a 128-bit vector of [2 x double]. The upper two
1255 /// elements of the input vector are unused.
1257 /// \headerfile <x86intrin.h>
1259 /// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
1262 /// A 128-bit vector of [4 x float]. The lower two single-precision
1263 /// floating-point elements are converted to double-precision values. The
1264 /// upper two elements are unused.
1265 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1266 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtps_pd(__m128 __a
) {
1267 return (__m128d
) __builtin_convertvector(
1268 __builtin_shufflevector((__v4sf
)__a
, (__v4sf
)__a
, 0, 1), __v2df
);
1271 /// Converts the lower two integer elements of a 128-bit vector of
1272 /// [4 x i32] into two double-precision floating-point values, returned in a
1273 /// 128-bit vector of [2 x double].
1275 /// The upper two elements of the input vector are unused.
1277 /// \headerfile <x86intrin.h>
1279 /// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
1282 /// A 128-bit integer vector of [4 x i32]. The lower two integer elements are
1283 /// converted to double-precision values.
1285 /// The upper two elements are unused.
1286 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1287 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtepi32_pd(__m128i __a
) {
1288 return (__m128d
) __builtin_convertvector(
1289 __builtin_shufflevector((__v4si
)__a
, (__v4si
)__a
, 0, 1), __v2df
);
1292 /// Converts the two double-precision floating-point elements of a
1293 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1294 /// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
1295 /// 64 bits of the result vector are set to zero.
1297 /// \headerfile <x86intrin.h>
1299 /// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
1302 /// A 128-bit vector of [2 x double].
1303 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1304 /// converted values. The upper 64 bits are set to zero.
1305 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtpd_epi32(__m128d __a
) {
1306 return __builtin_ia32_cvtpd2dq((__v2df
)__a
);
1309 /// Converts the low-order element of a 128-bit vector of [2 x double]
1310 /// into a 32-bit signed integer value.
1312 /// \headerfile <x86intrin.h>
1314 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
1317 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1319 /// \returns A 32-bit signed integer containing the converted value.
1320 static __inline__
int __DEFAULT_FN_ATTRS
_mm_cvtsd_si32(__m128d __a
) {
1321 return __builtin_ia32_cvtsd2si((__v2df
)__a
);
1324 /// Converts the lower double-precision floating-point element of a
1325 /// 128-bit vector of [2 x double], in the second parameter, into a
1326 /// single-precision floating-point value, returned in the lower 32 bits of a
1327 /// 128-bit vector of [4 x float]. The upper 96 bits of the result vector are
1328 /// copied from the upper 96 bits of the first parameter.
1330 /// \headerfile <x86intrin.h>
1332 /// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
1335 /// A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
1336 /// copied to the upper 96 bits of the result.
1338 /// A 128-bit vector of [2 x double]. The lower double-precision
1339 /// floating-point element is used in the conversion.
1340 /// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
1341 /// converted value from the second parameter. The upper 96 bits are copied
1342 /// from the upper 96 bits of the first parameter.
1343 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvtsd_ss(__m128 __a
,
1345 return (__m128
)__builtin_ia32_cvtsd2ss((__v4sf
)__a
, (__v2df
)__b
);
1348 /// Converts a 32-bit signed integer value, in the second parameter, into
1349 /// a double-precision floating-point value, returned in the lower 64 bits of
1350 /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1351 /// are copied from the upper 64 bits of the first parameter.
1353 /// \headerfile <x86intrin.h>
1355 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
1358 /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1359 /// copied to the upper 64 bits of the result.
1361 /// A 32-bit signed integer containing the value to be converted.
1362 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1363 /// converted value from the second parameter. The upper 64 bits are copied
1364 /// from the upper 64 bits of the first parameter.
1365 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtsi32_sd(__m128d __a
,
1371 /// Converts the lower single-precision floating-point element of a
1372 /// 128-bit vector of [4 x float], in the second parameter, into a
1373 /// double-precision floating-point value, returned in the lower 64 bits of
1374 /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1375 /// are copied from the upper 64 bits of the first parameter.
1377 /// \headerfile <x86intrin.h>
1379 /// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
1382 /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1383 /// copied to the upper 64 bits of the result.
1385 /// A 128-bit vector of [4 x float]. The lower single-precision
1386 /// floating-point element is used in the conversion.
1387 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1388 /// converted value from the second parameter. The upper 64 bits are copied
1389 /// from the upper 64 bits of the first parameter.
1390 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtss_sd(__m128d __a
,
1396 /// Converts the two double-precision floating-point elements of a
1397 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1398 /// returned in the lower 64 bits of a 128-bit vector of [4 x i32].
1400 /// If the result of either conversion is inexact, the result is truncated
1401 /// (rounded towards zero) regardless of the current MXCSR setting. The upper
1402 /// 64 bits of the result vector are set to zero.
1404 /// \headerfile <x86intrin.h>
1406 /// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
1410 /// A 128-bit vector of [2 x double].
1411 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1412 /// converted values. The upper 64 bits are set to zero.
1413 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvttpd_epi32(__m128d __a
) {
1414 return (__m128i
)__builtin_ia32_cvttpd2dq((__v2df
)__a
);
1417 /// Converts the low-order element of a [2 x double] vector into a 32-bit
1418 /// signed integer value, truncating the result when it is inexact.
1420 /// \headerfile <x86intrin.h>
1422 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
1426 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1428 /// \returns A 32-bit signed integer containing the converted value.
1429 static __inline__
int __DEFAULT_FN_ATTRS
_mm_cvttsd_si32(__m128d __a
) {
1430 return __builtin_ia32_cvttsd2si((__v2df
)__a
);
1433 /// Converts the two double-precision floating-point elements of a
1434 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1435 /// returned in a 64-bit vector of [2 x i32].
1437 /// \headerfile <x86intrin.h>
1439 /// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
1442 /// A 128-bit vector of [2 x double].
1443 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
1444 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_cvtpd_pi32(__m128d __a
) {
1445 return (__m64
)__builtin_ia32_cvtpd2pi((__v2df
)__a
);
1448 /// Converts the two double-precision floating-point elements of a
1449 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1450 /// returned in a 64-bit vector of [2 x i32].
1452 /// If the result of either conversion is inexact, the result is truncated
1453 /// (rounded towards zero) regardless of the current MXCSR setting.
1455 /// \headerfile <x86intrin.h>
1457 /// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
1460 /// A 128-bit vector of [2 x double].
1461 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
1462 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_cvttpd_pi32(__m128d __a
) {
1463 return (__m64
)__builtin_ia32_cvttpd2pi((__v2df
)__a
);
1466 /// Converts the two signed 32-bit integer elements of a 64-bit vector of
1467 /// [2 x i32] into two double-precision floating-point values, returned in a
1468 /// 128-bit vector of [2 x double].
1470 /// \headerfile <x86intrin.h>
1472 /// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
1475 /// A 64-bit vector of [2 x i32].
1476 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1477 static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX
_mm_cvtpi32_pd(__m64 __a
) {
1478 return __builtin_ia32_cvtpi2pd((__v2si
)__a
);
1481 /// Returns the low-order element of a 128-bit vector of [2 x double] as
1482 /// a double-precision floating-point value.
1484 /// \headerfile <x86intrin.h>
1486 /// This intrinsic has no corresponding instruction.
1489 /// A 128-bit vector of [2 x double]. The lower 64 bits are returned.
1490 /// \returns A double-precision floating-point value copied from the lower 64
1492 static __inline__
double __DEFAULT_FN_ATTRS
_mm_cvtsd_f64(__m128d __a
) {
1496 /// Loads a 128-bit floating-point vector of [2 x double] from an aligned
1497 /// memory location.
1499 /// \headerfile <x86intrin.h>
1501 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
1504 /// A pointer to a 128-bit memory location. The address of the memory
1505 /// location has to be 16-byte aligned.
1506 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
1507 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_load_pd(double const *__dp
) {
1508 return *(const __m128d
*)__dp
;
1511 /// Loads a double-precision floating-point value from a specified memory
1512 /// location and duplicates it to both vector elements of a 128-bit vector of
1515 /// \headerfile <x86intrin.h>
1517 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
1520 /// A pointer to a memory location containing a double-precision value.
1521 /// \returns A 128-bit vector of [2 x double] containing the loaded and
1522 /// duplicated values.
1523 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_load1_pd(double const *__dp
) {
1524 struct __mm_load1_pd_struct
{
1526 } __attribute__((__packed__
, __may_alias__
));
1527 double __u
= ((const struct __mm_load1_pd_struct
*)__dp
)->__u
;
1528 return __extension__(__m128d
){__u
, __u
};
1531 #define _mm_load_pd1(dp) _mm_load1_pd(dp)
1533 /// Loads two double-precision values, in reverse order, from an aligned
1534 /// memory location into a 128-bit vector of [2 x double].
1536 /// \headerfile <x86intrin.h>
1538 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
1539 /// needed shuffling instructions. In AVX mode, the shuffling may be combined
1540 /// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
1543 /// A 16-byte aligned pointer to an array of double-precision values to be
1544 /// loaded in reverse order.
1545 /// \returns A 128-bit vector of [2 x double] containing the reversed loaded
1547 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_loadr_pd(double const *__dp
) {
1548 __m128d __u
= *(const __m128d
*)__dp
;
1549 return __builtin_shufflevector((__v2df
)__u
, (__v2df
)__u
, 1, 0);
1552 /// Loads a 128-bit floating-point vector of [2 x double] from an
1553 /// unaligned memory location.
1555 /// \headerfile <x86intrin.h>
1557 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1560 /// A pointer to a 128-bit memory location. The address of the memory
1561 /// location does not have to be aligned.
1562 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
1563 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_loadu_pd(double const *__dp
) {
1566 } __attribute__((__packed__
, __may_alias__
));
1567 return ((const struct __loadu_pd
*)__dp
)->__v
;
1570 /// Loads a 64-bit integer value to the low element of a 128-bit integer
1571 /// vector and clears the upper element.
1573 /// \headerfile <x86intrin.h>
1575 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1578 /// A pointer to a 64-bit memory location. The address of the memory
1579 /// location does not have to be aligned.
1580 /// \returns A 128-bit vector of [2 x i64] containing the loaded value.
1581 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_loadu_si64(void const *__a
) {
1582 struct __loadu_si64
{
1584 } __attribute__((__packed__
, __may_alias__
));
1585 long long __u
= ((const struct __loadu_si64
*)__a
)->__v
;
1586 return __extension__(__m128i
)(__v2di
){__u
, 0LL};
1589 /// Loads a 32-bit integer value to the low element of a 128-bit integer
1590 /// vector and clears the upper element.
1592 /// \headerfile <x86intrin.h>
1594 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
1597 /// A pointer to a 32-bit memory location. The address of the memory
1598 /// location does not have to be aligned.
1599 /// \returns A 128-bit vector of [4 x i32] containing the loaded value.
1600 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_loadu_si32(void const *__a
) {
1601 struct __loadu_si32
{
1603 } __attribute__((__packed__
, __may_alias__
));
1604 int __u
= ((const struct __loadu_si32
*)__a
)->__v
;
1605 return __extension__(__m128i
)(__v4si
){__u
, 0, 0, 0};
1608 /// Loads a 16-bit integer value to the low element of a 128-bit integer
1609 /// vector and clears the upper element.
1611 /// \headerfile <x86intrin.h>
1613 /// This intrinsic does not correspond to a specific instruction.
1616 /// A pointer to a 16-bit memory location. The address of the memory
1617 /// location does not have to be aligned.
1618 /// \returns A 128-bit vector of [8 x i16] containing the loaded value.
1619 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_loadu_si16(void const *__a
) {
1620 struct __loadu_si16
{
1622 } __attribute__((__packed__
, __may_alias__
));
1623 short __u
= ((const struct __loadu_si16
*)__a
)->__v
;
1624 return __extension__(__m128i
)(__v8hi
){__u
, 0, 0, 0, 0, 0, 0, 0};
1627 /// Loads a 64-bit double-precision value to the low element of a
1628 /// 128-bit integer vector and clears the upper element.
1630 /// \headerfile <x86intrin.h>
1632 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1635 /// A pointer to a memory location containing a double-precision value.
1636 /// The address of the memory location does not have to be aligned.
1637 /// \returns A 128-bit vector of [2 x double] containing the loaded value.
1638 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_load_sd(double const *__dp
) {
1639 struct __mm_load_sd_struct
{
1641 } __attribute__((__packed__
, __may_alias__
));
1642 double __u
= ((const struct __mm_load_sd_struct
*)__dp
)->__u
;
1643 return __extension__(__m128d
){__u
, 0};
1646 /// Loads a double-precision value into the high-order bits of a 128-bit
1647 /// vector of [2 x double]. The low-order bits are copied from the low-order
1648 /// bits of the first operand.
1650 /// \headerfile <x86intrin.h>
1652 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1655 /// A 128-bit vector of [2 x double]. \n
1656 /// Bits [63:0] are written to bits [63:0] of the result.
1658 /// A pointer to a 64-bit memory location containing a double-precision
1659 /// floating-point value that is loaded. The loaded value is written to bits
1660 /// [127:64] of the result. The address of the memory location does not have
1662 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1663 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_loadh_pd(__m128d __a
,
1664 double const *__dp
) {
1665 struct __mm_loadh_pd_struct
{
1667 } __attribute__((__packed__
, __may_alias__
));
1668 double __u
= ((const struct __mm_loadh_pd_struct
*)__dp
)->__u
;
1669 return __extension__(__m128d
){__a
[0], __u
};
1672 /// Loads a double-precision value into the low-order bits of a 128-bit
1673 /// vector of [2 x double]. The high-order bits are copied from the
1674 /// high-order bits of the first operand.
1676 /// \headerfile <x86intrin.h>
1678 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1681 /// A 128-bit vector of [2 x double]. \n
1682 /// Bits [127:64] are written to bits [127:64] of the result.
1684 /// A pointer to a 64-bit memory location containing a double-precision
1685 /// floating-point value that is loaded. The loaded value is written to bits
1686 /// [63:0] of the result. The address of the memory location does not have to
1688 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1689 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_loadl_pd(__m128d __a
,
1690 double const *__dp
) {
1691 struct __mm_loadl_pd_struct
{
1693 } __attribute__((__packed__
, __may_alias__
));
1694 double __u
= ((const struct __mm_loadl_pd_struct
*)__dp
)->__u
;
1695 return __extension__(__m128d
){__u
, __a
[1]};
1698 /// Constructs a 128-bit floating-point vector of [2 x double] with
1699 /// unspecified content. This could be used as an argument to another
1700 /// intrinsic function where the argument is required but the value is not
1703 /// \headerfile <x86intrin.h>
1705 /// This intrinsic has no corresponding instruction.
1707 /// \returns A 128-bit floating-point vector of [2 x double] with unspecified
1709 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_undefined_pd(void) {
1710 return (__m128d
)__builtin_ia32_undef128();
1713 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1714 /// 64 bits of the vector are initialized with the specified double-precision
1715 /// floating-point value. The upper 64 bits are set to zero.
1717 /// \headerfile <x86intrin.h>
1719 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1722 /// A double-precision floating-point value used to initialize the lower 64
1723 /// bits of the result.
1724 /// \returns An initialized 128-bit floating-point vector of [2 x double]. The
1725 /// lower 64 bits contain the value of the parameter. The upper 64 bits are
1727 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_set_sd(double __w
) {
1728 return __extension__(__m128d
){__w
, 0};
1731 /// Constructs a 128-bit floating-point vector of [2 x double], with each
1732 /// of the two double-precision floating-point vector elements set to the
1733 /// specified double-precision floating-point value.
1735 /// \headerfile <x86intrin.h>
1737 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1740 /// A double-precision floating-point value used to initialize each vector
1741 /// element of the result.
1742 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1743 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_set1_pd(double __w
) {
1744 return __extension__(__m128d
){__w
, __w
};
1747 /// Constructs a 128-bit floating-point vector of [2 x double], with each
1748 /// of the two double-precision floating-point vector elements set to the
1749 /// specified double-precision floating-point value.
1751 /// \headerfile <x86intrin.h>
1753 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1756 /// A double-precision floating-point value used to initialize each vector
1757 /// element of the result.
1758 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1759 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_set_pd1(double __w
) {
1760 return _mm_set1_pd(__w
);
1763 /// Constructs a 128-bit floating-point vector of [2 x double]
1764 /// initialized with the specified double-precision floating-point values.
1766 /// \headerfile <x86intrin.h>
1768 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1771 /// A double-precision floating-point value used to initialize the upper 64
1772 /// bits of the result.
1774 /// A double-precision floating-point value used to initialize the lower 64
1775 /// bits of the result.
1776 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1777 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_set_pd(double __w
,
1779 return __extension__(__m128d
){__x
, __w
};
1782 /// Constructs a 128-bit floating-point vector of [2 x double],
1783 /// initialized in reverse order with the specified double-precision
1784 /// floating-point values.
1786 /// \headerfile <x86intrin.h>
1788 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1791 /// A double-precision floating-point value used to initialize the lower 64
1792 /// bits of the result.
1794 /// A double-precision floating-point value used to initialize the upper 64
1795 /// bits of the result.
1796 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1797 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_setr_pd(double __w
,
1799 return __extension__(__m128d
){__w
, __x
};
1802 /// Constructs a 128-bit floating-point vector of [2 x double]
1803 /// initialized to zero.
1805 /// \headerfile <x86intrin.h>
1807 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1809 /// \returns An initialized 128-bit floating-point vector of [2 x double] with
1810 /// all elements set to zero.
1811 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_setzero_pd(void) {
1812 return __extension__(__m128d
){0, 0};
1815 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1816 /// 64 bits are set to the lower 64 bits of the second parameter. The upper
1817 /// 64 bits are set to the upper 64 bits of the first parameter.
1819 /// \headerfile <x86intrin.h>
1821 /// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
1824 /// A 128-bit vector of [2 x double]. The upper 64 bits are written to the
1825 /// upper 64 bits of the result.
1827 /// A 128-bit vector of [2 x double]. The lower 64 bits are written to the
1828 /// lower 64 bits of the result.
1829 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1830 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_move_sd(__m128d __a
,
1836 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1837 /// memory location.
1839 /// \headerfile <x86intrin.h>
1841 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1844 /// A pointer to a 64-bit memory location.
1846 /// A 128-bit vector of [2 x double] containing the value to be stored.
1847 static __inline__
void __DEFAULT_FN_ATTRS
_mm_store_sd(double *__dp
,
1849 struct __mm_store_sd_struct
{
1851 } __attribute__((__packed__
, __may_alias__
));
1852 ((struct __mm_store_sd_struct
*)__dp
)->__u
= __a
[0];
1855 /// Moves packed double-precision values from a 128-bit vector of
1856 /// [2 x double] to a memory location.
1858 /// \headerfile <x86intrin.h>
1860 /// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction.
1863 /// A pointer to an aligned memory location that can store two
1864 /// double-precision values.
1866 /// A packed 128-bit vector of [2 x double] containing the values to be
1868 static __inline__
void __DEFAULT_FN_ATTRS
_mm_store_pd(double *__dp
,
1870 *(__m128d
*)__dp
= __a
;
1873 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1874 /// the upper and lower 64 bits of a memory location.
1876 /// \headerfile <x86intrin.h>
1878 /// This intrinsic corresponds to the
1879 /// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1882 /// A pointer to a memory location that can store two double-precision
1885 /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1886 /// of the values in \a __dp.
1887 static __inline__
void __DEFAULT_FN_ATTRS
_mm_store1_pd(double *__dp
,
1889 __a
= __builtin_shufflevector((__v2df
)__a
, (__v2df
)__a
, 0, 0);
1890 _mm_store_pd(__dp
, __a
);
1893 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1894 /// the upper and lower 64 bits of a memory location.
1896 /// \headerfile <x86intrin.h>
1898 /// This intrinsic corresponds to the
1899 /// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1902 /// A pointer to a memory location that can store two double-precision
1905 /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1906 /// of the values in \a __dp.
1907 static __inline__
void __DEFAULT_FN_ATTRS
_mm_store_pd1(double *__dp
,
1909 _mm_store1_pd(__dp
, __a
);
1912 /// Stores a 128-bit vector of [2 x double] into an unaligned memory
1915 /// \headerfile <x86intrin.h>
1917 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1920 /// A pointer to a 128-bit memory location. The address of the memory
1921 /// location does not have to be aligned.
1923 /// A 128-bit vector of [2 x double] containing the values to be stored.
1924 static __inline__
void __DEFAULT_FN_ATTRS
_mm_storeu_pd(double *__dp
,
1926 struct __storeu_pd
{
1928 } __attribute__((__packed__
, __may_alias__
));
1929 ((struct __storeu_pd
*)__dp
)->__v
= __a
;
1932 /// Stores two double-precision values, in reverse order, from a 128-bit
1933 /// vector of [2 x double] to a 16-byte aligned memory location.
1935 /// \headerfile <x86intrin.h>
1937 /// This intrinsic corresponds to a shuffling instruction followed by a
1938 /// <c> VMOVAPD / MOVAPD </c> instruction.
1941 /// A pointer to a 16-byte aligned memory location that can store two
1942 /// double-precision values.
1944 /// A 128-bit vector of [2 x double] containing the values to be reversed and
1946 static __inline__
void __DEFAULT_FN_ATTRS
_mm_storer_pd(double *__dp
,
1948 __a
= __builtin_shufflevector((__v2df
)__a
, (__v2df
)__a
, 1, 0);
1949 *(__m128d
*)__dp
= __a
;
1952 /// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
1953 /// memory location.
1955 /// \headerfile <x86intrin.h>
1957 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1960 /// A pointer to a 64-bit memory location.
1962 /// A 128-bit vector of [2 x double] containing the value to be stored.
1963 static __inline__
void __DEFAULT_FN_ATTRS
_mm_storeh_pd(double *__dp
,
1965 struct __mm_storeh_pd_struct
{
1967 } __attribute__((__packed__
, __may_alias__
));
1968 ((struct __mm_storeh_pd_struct
*)__dp
)->__u
= __a
[1];
1971 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1972 /// memory location.
1974 /// \headerfile <x86intrin.h>
1976 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1979 /// A pointer to a 64-bit memory location.
1981 /// A 128-bit vector of [2 x double] containing the value to be stored.
1982 static __inline__
void __DEFAULT_FN_ATTRS
_mm_storel_pd(double *__dp
,
1984 struct __mm_storeh_pd_struct
{
1986 } __attribute__((__packed__
, __may_alias__
));
1987 ((struct __mm_storeh_pd_struct
*)__dp
)->__u
= __a
[0];
1990 /// Adds the corresponding elements of two 128-bit vectors of [16 x i8],
1991 /// saving the lower 8 bits of each sum in the corresponding element of a
1992 /// 128-bit result vector of [16 x i8].
1994 /// The integer elements of both parameters can be either signed or unsigned.
1996 /// \headerfile <x86intrin.h>
1998 /// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
2001 /// A 128-bit vector of [16 x i8].
2003 /// A 128-bit vector of [16 x i8].
2004 /// \returns A 128-bit vector of [16 x i8] containing the sums of both
2006 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_add_epi8(__m128i __a
,
2008 return (__m128i
)((__v16qu
)__a
+ (__v16qu
)__b
);
2011 /// Adds the corresponding elements of two 128-bit vectors of [8 x i16],
2012 /// saving the lower 16 bits of each sum in the corresponding element of a
2013 /// 128-bit result vector of [8 x i16].
2015 /// The integer elements of both parameters can be either signed or unsigned.
2017 /// \headerfile <x86intrin.h>
2019 /// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
2022 /// A 128-bit vector of [8 x i16].
2024 /// A 128-bit vector of [8 x i16].
2025 /// \returns A 128-bit vector of [8 x i16] containing the sums of both
2027 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_add_epi16(__m128i __a
,
2029 return (__m128i
)((__v8hu
)__a
+ (__v8hu
)__b
);
2032 /// Adds the corresponding elements of two 128-bit vectors of [4 x i32],
2033 /// saving the lower 32 bits of each sum in the corresponding element of a
2034 /// 128-bit result vector of [4 x i32].
2036 /// The integer elements of both parameters can be either signed or unsigned.
2038 /// \headerfile <x86intrin.h>
2040 /// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
2043 /// A 128-bit vector of [4 x i32].
2045 /// A 128-bit vector of [4 x i32].
2046 /// \returns A 128-bit vector of [4 x i32] containing the sums of both
2048 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_add_epi32(__m128i __a
,
2050 return (__m128i
)((__v4su
)__a
+ (__v4su
)__b
);
2053 /// Adds two signed or unsigned 64-bit integer values, returning the
2054 /// lower 64 bits of the sum.
2056 /// \headerfile <x86intrin.h>
2058 /// This intrinsic corresponds to the <c> PADDQ </c> instruction.
2061 /// A 64-bit integer.
2063 /// A 64-bit integer.
2064 /// \returns A 64-bit integer containing the sum of both parameters.
2065 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_add_si64(__m64 __a
,
2067 return (__m64
)__builtin_ia32_paddq((__v1di
)__a
, (__v1di
)__b
);
2070 /// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
2071 /// saving the lower 64 bits of each sum in the corresponding element of a
2072 /// 128-bit result vector of [2 x i64].
2074 /// The integer elements of both parameters can be either signed or unsigned.
2076 /// \headerfile <x86intrin.h>
2078 /// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
2081 /// A 128-bit vector of [2 x i64].
2083 /// A 128-bit vector of [2 x i64].
2084 /// \returns A 128-bit vector of [2 x i64] containing the sums of both
2086 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_add_epi64(__m128i __a
,
2088 return (__m128i
)((__v2du
)__a
+ (__v2du
)__b
);
2091 /// Adds, with saturation, the corresponding elements of two 128-bit
2092 /// signed [16 x i8] vectors, saving each sum in the corresponding element of
2093 /// a 128-bit result vector of [16 x i8]. Positive sums greater than 0x7F are
2094 /// saturated to 0x7F. Negative sums less than 0x80 are saturated to 0x80.
2096 /// \headerfile <x86intrin.h>
2098 /// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
2101 /// A 128-bit signed [16 x i8] vector.
2103 /// A 128-bit signed [16 x i8] vector.
2104 /// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
2105 /// both parameters.
2106 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epi8(__m128i __a
,
2108 return (__m128i
)__builtin_elementwise_add_sat((__v16qs
)__a
, (__v16qs
)__b
);
2111 /// Adds, with saturation, the corresponding elements of two 128-bit
2112 /// signed [8 x i16] vectors, saving each sum in the corresponding element of
2113 /// a 128-bit result vector of [8 x i16]. Positive sums greater than 0x7FFF
2114 /// are saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
2117 /// \headerfile <x86intrin.h>
2119 /// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
2122 /// A 128-bit signed [8 x i16] vector.
2124 /// A 128-bit signed [8 x i16] vector.
2125 /// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
2126 /// both parameters.
2127 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epi16(__m128i __a
,
2129 return (__m128i
)__builtin_elementwise_add_sat((__v8hi
)__a
, (__v8hi
)__b
);
2132 /// Adds, with saturation, the corresponding elements of two 128-bit
2133 /// unsigned [16 x i8] vectors, saving each sum in the corresponding element
2134 /// of a 128-bit result vector of [16 x i8]. Positive sums greater than 0xFF
2135 /// are saturated to 0xFF. Negative sums are saturated to 0x00.
2137 /// \headerfile <x86intrin.h>
2139 /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2142 /// A 128-bit unsigned [16 x i8] vector.
2144 /// A 128-bit unsigned [16 x i8] vector.
2145 /// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
2146 /// of both parameters.
2147 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epu8(__m128i __a
,
2149 return (__m128i
)__builtin_elementwise_add_sat((__v16qu
)__a
, (__v16qu
)__b
);
2152 /// Adds, with saturation, the corresponding elements of two 128-bit
2153 /// unsigned [8 x i16] vectors, saving each sum in the corresponding element
2154 /// of a 128-bit result vector of [8 x i16]. Positive sums greater than
2155 /// 0xFFFF are saturated to 0xFFFF. Negative sums are saturated to 0x0000.
2157 /// \headerfile <x86intrin.h>
2159 /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2162 /// A 128-bit unsigned [8 x i16] vector.
2164 /// A 128-bit unsigned [8 x i16] vector.
2165 /// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
2166 /// of both parameters.
2167 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epu16(__m128i __a
,
2169 return (__m128i
)__builtin_elementwise_add_sat((__v8hu
)__a
, (__v8hu
)__b
);
2172 /// Computes the rounded averages of corresponding elements of two
2173 /// 128-bit unsigned [16 x i8] vectors, saving each result in the
2174 /// corresponding element of a 128-bit result vector of [16 x i8].
2176 /// \headerfile <x86intrin.h>
2178 /// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
2181 /// A 128-bit unsigned [16 x i8] vector.
2183 /// A 128-bit unsigned [16 x i8] vector.
2184 /// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
2185 /// averages of both parameters.
2186 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_avg_epu8(__m128i __a
,
2188 return (__m128i
)__builtin_ia32_pavgb128((__v16qi
)__a
, (__v16qi
)__b
);
2191 /// Computes the rounded averages of corresponding elements of two
2192 /// 128-bit unsigned [8 x i16] vectors, saving each result in the
2193 /// corresponding element of a 128-bit result vector of [8 x i16].
2195 /// \headerfile <x86intrin.h>
2197 /// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
2200 /// A 128-bit unsigned [8 x i16] vector.
2202 /// A 128-bit unsigned [8 x i16] vector.
2203 /// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
2204 /// averages of both parameters.
2205 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_avg_epu16(__m128i __a
,
2207 return (__m128i
)__builtin_ia32_pavgw128((__v8hi
)__a
, (__v8hi
)__b
);
2210 /// Multiplies the corresponding elements of two 128-bit signed [8 x i16]
2211 /// vectors, producing eight intermediate 32-bit signed integer products, and
2212 /// adds the consecutive pairs of 32-bit products to form a 128-bit signed
2213 /// [4 x i32] vector.
2215 /// For example, bits [15:0] of both parameters are multiplied producing a
2216 /// 32-bit product, bits [31:16] of both parameters are multiplied producing
2217 /// a 32-bit product, and the sum of those two products becomes bits [31:0]
2220 /// \headerfile <x86intrin.h>
2222 /// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
2225 /// A 128-bit signed [8 x i16] vector.
2227 /// A 128-bit signed [8 x i16] vector.
2228 /// \returns A 128-bit signed [4 x i32] vector containing the sums of products
2229 /// of both parameters.
2230 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_madd_epi16(__m128i __a
,
2232 return (__m128i
)__builtin_ia32_pmaddwd128((__v8hi
)__a
, (__v8hi
)__b
);
2235 /// Compares corresponding elements of two 128-bit signed [8 x i16]
2236 /// vectors, saving the greater value from each comparison in the
2237 /// corresponding element of a 128-bit result vector of [8 x i16].
2239 /// \headerfile <x86intrin.h>
2241 /// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
2244 /// A 128-bit signed [8 x i16] vector.
2246 /// A 128-bit signed [8 x i16] vector.
2247 /// \returns A 128-bit signed [8 x i16] vector containing the greater value of
2248 /// each comparison.
2249 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epi16(__m128i __a
,
2251 return (__m128i
)__builtin_elementwise_max((__v8hi
)__a
, (__v8hi
)__b
);
2254 /// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2255 /// vectors, saving the greater value from each comparison in the
2256 /// corresponding element of a 128-bit result vector of [16 x i8].
2258 /// \headerfile <x86intrin.h>
2260 /// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
2263 /// A 128-bit unsigned [16 x i8] vector.
2265 /// A 128-bit unsigned [16 x i8] vector.
2266 /// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
2267 /// each comparison.
2268 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epu8(__m128i __a
,
2270 return (__m128i
)__builtin_elementwise_max((__v16qu
)__a
, (__v16qu
)__b
);
2273 /// Compares corresponding elements of two 128-bit signed [8 x i16]
2274 /// vectors, saving the smaller value from each comparison in the
2275 /// corresponding element of a 128-bit result vector of [8 x i16].
2277 /// \headerfile <x86intrin.h>
2279 /// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
2282 /// A 128-bit signed [8 x i16] vector.
2284 /// A 128-bit signed [8 x i16] vector.
2285 /// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
2286 /// each comparison.
2287 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epi16(__m128i __a
,
2289 return (__m128i
)__builtin_elementwise_min((__v8hi
)__a
, (__v8hi
)__b
);
2292 /// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2293 /// vectors, saving the smaller value from each comparison in the
2294 /// corresponding element of a 128-bit result vector of [16 x i8].
2296 /// \headerfile <x86intrin.h>
2298 /// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
2301 /// A 128-bit unsigned [16 x i8] vector.
2303 /// A 128-bit unsigned [16 x i8] vector.
2304 /// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
2305 /// each comparison.
2306 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epu8(__m128i __a
,
2308 return (__m128i
)__builtin_elementwise_min((__v16qu
)__a
, (__v16qu
)__b
);
2311 /// Multiplies the corresponding elements of two signed [8 x i16]
2312 /// vectors, saving the upper 16 bits of each 32-bit product in the
2313 /// corresponding element of a 128-bit signed [8 x i16] result vector.
2315 /// \headerfile <x86intrin.h>
2317 /// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
2320 /// A 128-bit signed [8 x i16] vector.
2322 /// A 128-bit signed [8 x i16] vector.
2323 /// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
2324 /// each of the eight 32-bit products.
2325 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mulhi_epi16(__m128i __a
,
2327 return (__m128i
)__builtin_ia32_pmulhw128((__v8hi
)__a
, (__v8hi
)__b
);
2330 /// Multiplies the corresponding elements of two unsigned [8 x i16]
2331 /// vectors, saving the upper 16 bits of each 32-bit product in the
2332 /// corresponding element of a 128-bit unsigned [8 x i16] result vector.
2334 /// \headerfile <x86intrin.h>
2336 /// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
2339 /// A 128-bit unsigned [8 x i16] vector.
2341 /// A 128-bit unsigned [8 x i16] vector.
2342 /// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
2343 /// of each of the eight 32-bit products.
2344 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mulhi_epu16(__m128i __a
,
2346 return (__m128i
)__builtin_ia32_pmulhuw128((__v8hi
)__a
, (__v8hi
)__b
);
2349 /// Multiplies the corresponding elements of two signed [8 x i16]
2350 /// vectors, saving the lower 16 bits of each 32-bit product in the
2351 /// corresponding element of a 128-bit signed [8 x i16] result vector.
2353 /// \headerfile <x86intrin.h>
2355 /// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
2358 /// A 128-bit signed [8 x i16] vector.
2360 /// A 128-bit signed [8 x i16] vector.
2361 /// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
2362 /// each of the eight 32-bit products.
2363 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mullo_epi16(__m128i __a
,
2365 return (__m128i
)((__v8hu
)__a
* (__v8hu
)__b
);
2368 /// Multiplies 32-bit unsigned integer values contained in the lower bits
2369 /// of the two 64-bit integer vectors and returns the 64-bit unsigned
2372 /// \headerfile <x86intrin.h>
2374 /// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
2377 /// A 64-bit integer containing one of the source operands.
2379 /// A 64-bit integer containing one of the source operands.
2380 /// \returns A 64-bit integer vector containing the product of both operands.
2381 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_mul_su32(__m64 __a
,
2383 return __builtin_ia32_pmuludq((__v2si
)__a
, (__v2si
)__b
);
2386 /// Multiplies 32-bit unsigned integer values contained in the lower
2387 /// bits of the corresponding elements of two [2 x i64] vectors, and returns
2388 /// the 64-bit products in the corresponding elements of a [2 x i64] vector.
2390 /// \headerfile <x86intrin.h>
2392 /// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
2395 /// A [2 x i64] vector containing one of the source operands.
2397 /// A [2 x i64] vector containing one of the source operands.
2398 /// \returns A [2 x i64] vector containing the product of both operands.
2399 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mul_epu32(__m128i __a
,
2401 return __builtin_ia32_pmuludq128((__v4si
)__a
, (__v4si
)__b
);
2404 /// Computes the absolute differences of corresponding 8-bit integer
2405 /// values in two 128-bit vectors. Sums the first 8 absolute differences, and
2406 /// separately sums the second 8 absolute differences. Packs these two
2407 /// unsigned 16-bit integer sums into the upper and lower elements of a
2408 /// [2 x i64] vector.
2410 /// \headerfile <x86intrin.h>
2412 /// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
2415 /// A 128-bit integer vector containing one of the source operands.
2417 /// A 128-bit integer vector containing one of the source operands.
2418 /// \returns A [2 x i64] vector containing the sums of the sets of absolute
2419 /// differences between both operands.
2420 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sad_epu8(__m128i __a
,
2422 return __builtin_ia32_psadbw128((__v16qi
)__a
, (__v16qi
)__b
);
2425 /// Subtracts the corresponding 8-bit integer values in the operands.
2427 /// \headerfile <x86intrin.h>
2429 /// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
2432 /// A 128-bit integer vector containing the minuends.
2434 /// A 128-bit integer vector containing the subtrahends.
2435 /// \returns A 128-bit integer vector containing the differences of the values
2436 /// in the operands.
2437 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sub_epi8(__m128i __a
,
2439 return (__m128i
)((__v16qu
)__a
- (__v16qu
)__b
);
2442 /// Subtracts the corresponding 16-bit integer values in the operands.
2444 /// \headerfile <x86intrin.h>
2446 /// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
2449 /// A 128-bit integer vector containing the minuends.
2451 /// A 128-bit integer vector containing the subtrahends.
2452 /// \returns A 128-bit integer vector containing the differences of the values
2453 /// in the operands.
2454 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sub_epi16(__m128i __a
,
2456 return (__m128i
)((__v8hu
)__a
- (__v8hu
)__b
);
2459 /// Subtracts the corresponding 32-bit integer values in the operands.
2461 /// \headerfile <x86intrin.h>
2463 /// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
2466 /// A 128-bit integer vector containing the minuends.
2468 /// A 128-bit integer vector containing the subtrahends.
2469 /// \returns A 128-bit integer vector containing the differences of the values
2470 /// in the operands.
2471 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sub_epi32(__m128i __a
,
2473 return (__m128i
)((__v4su
)__a
- (__v4su
)__b
);
2476 /// Subtracts signed or unsigned 64-bit integer values and writes the
2477 /// difference to the corresponding bits in the destination.
2479 /// \headerfile <x86intrin.h>
2481 /// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
2484 /// A 64-bit integer vector containing the minuend.
2486 /// A 64-bit integer vector containing the subtrahend.
2487 /// \returns A 64-bit integer vector containing the difference of the values in
2489 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_sub_si64(__m64 __a
,
2491 return (__m64
)__builtin_ia32_psubq((__v1di
)__a
, (__v1di
)__b
);
2494 /// Subtracts the corresponding elements of two [2 x i64] vectors.
2496 /// \headerfile <x86intrin.h>
2498 /// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
2501 /// A 128-bit integer vector containing the minuends.
2503 /// A 128-bit integer vector containing the subtrahends.
2504 /// \returns A 128-bit integer vector containing the differences of the values
2505 /// in the operands.
2506 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sub_epi64(__m128i __a
,
2508 return (__m128i
)((__v2du
)__a
- (__v2du
)__b
);
2511 /// Subtracts corresponding 8-bit signed integer values in the input and
2512 /// returns the differences in the corresponding bytes in the destination.
2513 /// Differences greater than 0x7F are saturated to 0x7F, and differences less
2514 /// than 0x80 are saturated to 0x80.
2516 /// \headerfile <x86intrin.h>
2518 /// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
2521 /// A 128-bit integer vector containing the minuends.
2523 /// A 128-bit integer vector containing the subtrahends.
2524 /// \returns A 128-bit integer vector containing the differences of the values
2525 /// in the operands.
2526 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_subs_epi8(__m128i __a
,
2528 return (__m128i
)__builtin_elementwise_sub_sat((__v16qs
)__a
, (__v16qs
)__b
);
2531 /// Subtracts corresponding 16-bit signed integer values in the input and
2532 /// returns the differences in the corresponding bytes in the destination.
2533 /// Differences greater than 0x7FFF are saturated to 0x7FFF, and values less
2534 /// than 0x8000 are saturated to 0x8000.
2536 /// \headerfile <x86intrin.h>
2538 /// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
2541 /// A 128-bit integer vector containing the minuends.
2543 /// A 128-bit integer vector containing the subtrahends.
2544 /// \returns A 128-bit integer vector containing the differences of the values
2545 /// in the operands.
2546 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_subs_epi16(__m128i __a
,
2548 return (__m128i
)__builtin_elementwise_sub_sat((__v8hi
)__a
, (__v8hi
)__b
);
2551 /// Subtracts corresponding 8-bit unsigned integer values in the input
2552 /// and returns the differences in the corresponding bytes in the
2553 /// destination. Differences less than 0x00 are saturated to 0x00.
2555 /// \headerfile <x86intrin.h>
2557 /// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
2560 /// A 128-bit integer vector containing the minuends.
2562 /// A 128-bit integer vector containing the subtrahends.
2563 /// \returns A 128-bit integer vector containing the unsigned integer
2564 /// differences of the values in the operands.
2565 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_subs_epu8(__m128i __a
,
2567 return (__m128i
)__builtin_elementwise_sub_sat((__v16qu
)__a
, (__v16qu
)__b
);
2570 /// Subtracts corresponding 16-bit unsigned integer values in the input
2571 /// and returns the differences in the corresponding bytes in the
2572 /// destination. Differences less than 0x0000 are saturated to 0x0000.
2574 /// \headerfile <x86intrin.h>
2576 /// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
2579 /// A 128-bit integer vector containing the minuends.
2581 /// A 128-bit integer vector containing the subtrahends.
2582 /// \returns A 128-bit integer vector containing the unsigned integer
2583 /// differences of the values in the operands.
2584 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_subs_epu16(__m128i __a
,
2586 return (__m128i
)__builtin_elementwise_sub_sat((__v8hu
)__a
, (__v8hu
)__b
);
2589 /// Performs a bitwise AND of two 128-bit integer vectors.
2591 /// \headerfile <x86intrin.h>
2593 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
2596 /// A 128-bit integer vector containing one of the source operands.
2598 /// A 128-bit integer vector containing one of the source operands.
2599 /// \returns A 128-bit integer vector containing the bitwise AND of the values
2600 /// in both operands.
2601 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_and_si128(__m128i __a
,
2603 return (__m128i
)((__v2du
)__a
& (__v2du
)__b
);
2606 /// Performs a bitwise AND of two 128-bit integer vectors, using the
2607 /// one's complement of the values contained in the first source operand.
2609 /// \headerfile <x86intrin.h>
2611 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
2614 /// A 128-bit vector containing the left source operand. The one's complement
2615 /// of this value is used in the bitwise AND.
2617 /// A 128-bit vector containing the right source operand.
2618 /// \returns A 128-bit integer vector containing the bitwise AND of the one's
2619 /// complement of the first operand and the values in the second operand.
2620 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_andnot_si128(__m128i __a
,
2622 return (__m128i
)(~(__v2du
)__a
& (__v2du
)__b
);
2624 /// Performs a bitwise OR of two 128-bit integer vectors.
2626 /// \headerfile <x86intrin.h>
2628 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
2631 /// A 128-bit integer vector containing one of the source operands.
2633 /// A 128-bit integer vector containing one of the source operands.
2634 /// \returns A 128-bit integer vector containing the bitwise OR of the values
2635 /// in both operands.
2636 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_or_si128(__m128i __a
,
2638 return (__m128i
)((__v2du
)__a
| (__v2du
)__b
);
2641 /// Performs a bitwise exclusive OR of two 128-bit integer vectors.
2643 /// \headerfile <x86intrin.h>
2645 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
2648 /// A 128-bit integer vector containing one of the source operands.
2650 /// A 128-bit integer vector containing one of the source operands.
2651 /// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
2652 /// values in both operands.
2653 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_xor_si128(__m128i __a
,
2655 return (__m128i
)((__v2du
)__a
^ (__v2du
)__b
);
2658 /// Left-shifts the 128-bit integer vector operand by the specified
2659 /// number of bytes. Low-order bits are cleared.
///
2661 /// \headerfile <x86intrin.h>
///
/// \code
2664 /// __m128i _mm_slli_si128(__m128i a, const int imm);
/// \endcode
///
2667 /// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
///
/// \param a
2670 /// A 128-bit integer vector containing the source operand.
/// \param imm
2672 /// An immediate value specifying the number of bytes to left-shift operand
/// \a a.
2674 /// \returns A 128-bit integer vector containing the left-shifted value.
///
/// _mm_bslli_si128 is defined right after _mm_slli_si128 with the same visible
/// expansion (the same byte-shift builtin applied to \a a).
/// NOTE(review): the last line of each macro (the argument that follows
/// \a a) is not visible in this excerpt; presumably it forwards \a imm to the
/// builtin -- confirm against the full file.
2675 #define _mm_slli_si128(a, imm) \
2676 ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), \
2679 #define _mm_bslli_si128(a, imm) \
2680 ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), \
2683 /// Left-shifts each 16-bit value in the 128-bit integer vector operand
2684 /// by the specified number of bits. Low-order bits are cleared.
2686 /// \headerfile <x86intrin.h>
2688 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2691 /// A 128-bit integer vector containing the source operand.
2693 /// An integer value specifying the number of bits to left-shift each value
2694 /// in operand \a __a.
2695 /// \returns A 128-bit integer vector containing the left-shifted values.
2696 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_slli_epi16(__m128i __a
,
2698 return (__m128i
)__builtin_ia32_psllwi128((__v8hi
)__a
, __count
);
2701 /// Left-shifts each 16-bit value in the 128-bit integer vector operand
2702 /// by the specified number of bits. Low-order bits are cleared.
2704 /// \headerfile <x86intrin.h>
2706 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2709 /// A 128-bit integer vector containing the source operand.
2711 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2712 /// to left-shift each value in operand \a __a.
2713 /// \returns A 128-bit integer vector containing the left-shifted values.
2714 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sll_epi16(__m128i __a
,
2716 return (__m128i
)__builtin_ia32_psllw128((__v8hi
)__a
, (__v8hi
)__count
);
2719 /// Left-shifts each 32-bit value in the 128-bit integer vector operand
2720 /// by the specified number of bits. Low-order bits are cleared.
2722 /// \headerfile <x86intrin.h>
2724 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2727 /// A 128-bit integer vector containing the source operand.
2729 /// An integer value specifying the number of bits to left-shift each value
2730 /// in operand \a __a.
2731 /// \returns A 128-bit integer vector containing the left-shifted values.
2732 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_slli_epi32(__m128i __a
,
2734 return (__m128i
)__builtin_ia32_pslldi128((__v4si
)__a
, __count
);
2737 /// Left-shifts each 32-bit value in the 128-bit integer vector operand
2738 /// by the specified number of bits. Low-order bits are cleared.
2740 /// \headerfile <x86intrin.h>
2742 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2745 /// A 128-bit integer vector containing the source operand.
2747 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2748 /// to left-shift each value in operand \a __a.
2749 /// \returns A 128-bit integer vector containing the left-shifted values.
2750 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sll_epi32(__m128i __a
,
2752 return (__m128i
)__builtin_ia32_pslld128((__v4si
)__a
, (__v4si
)__count
);
2755 /// Left-shifts each 64-bit value in the 128-bit integer vector operand
2756 /// by the specified number of bits. Low-order bits are cleared.
2758 /// \headerfile <x86intrin.h>
2760 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2763 /// A 128-bit integer vector containing the source operand.
2765 /// An integer value specifying the number of bits to left-shift each value
2766 /// in operand \a __a.
2767 /// \returns A 128-bit integer vector containing the left-shifted values.
2768 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_slli_epi64(__m128i __a
,
2770 return __builtin_ia32_psllqi128((__v2di
)__a
, __count
);
2773 /// Left-shifts each 64-bit value in the 128-bit integer vector operand
2774 /// by the specified number of bits. Low-order bits are cleared.
2776 /// \headerfile <x86intrin.h>
2778 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2781 /// A 128-bit integer vector containing the source operand.
2783 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2784 /// to left-shift each value in operand \a __a.
2785 /// \returns A 128-bit integer vector containing the left-shifted values.
2786 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sll_epi64(__m128i __a
,
2788 return __builtin_ia32_psllq128((__v2di
)__a
, (__v2di
)__count
);
2791 /// Right-shifts each 16-bit value in the 128-bit integer vector operand
2792 /// by the specified number of bits. High-order bits are filled with the sign
2793 /// bit of the initial value.
2795 /// \headerfile <x86intrin.h>
2797 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2800 /// A 128-bit integer vector containing the source operand.
2802 /// An integer value specifying the number of bits to right-shift each value
2803 /// in operand \a __a.
2804 /// \returns A 128-bit integer vector containing the right-shifted values.
2805 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srai_epi16(__m128i __a
,
2807 return (__m128i
)__builtin_ia32_psrawi128((__v8hi
)__a
, __count
);
2810 /// Right-shifts each 16-bit value in the 128-bit integer vector operand
2811 /// by the specified number of bits. High-order bits are filled with the sign
2812 /// bit of the initial value.
2814 /// \headerfile <x86intrin.h>
2816 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2819 /// A 128-bit integer vector containing the source operand.
2821 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2822 /// to right-shift each value in operand \a __a.
2823 /// \returns A 128-bit integer vector containing the right-shifted values.
2824 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sra_epi16(__m128i __a
,
2826 return (__m128i
)__builtin_ia32_psraw128((__v8hi
)__a
, (__v8hi
)__count
);
2829 /// Right-shifts each 32-bit value in the 128-bit integer vector operand
2830 /// by the specified number of bits. High-order bits are filled with the sign
2831 /// bit of the initial value.
2833 /// \headerfile <x86intrin.h>
2835 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2838 /// A 128-bit integer vector containing the source operand.
2840 /// An integer value specifying the number of bits to right-shift each value
2841 /// in operand \a __a.
2842 /// \returns A 128-bit integer vector containing the right-shifted values.
2843 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srai_epi32(__m128i __a
,
2845 return (__m128i
)__builtin_ia32_psradi128((__v4si
)__a
, __count
);
2848 /// Right-shifts each 32-bit value in the 128-bit integer vector operand
2849 /// by the specified number of bits. High-order bits are filled with the sign
2850 /// bit of the initial value.
2852 /// \headerfile <x86intrin.h>
2854 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2857 /// A 128-bit integer vector containing the source operand.
2859 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2860 /// to right-shift each value in operand \a __a.
2861 /// \returns A 128-bit integer vector containing the right-shifted values.
2862 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sra_epi32(__m128i __a
,
2864 return (__m128i
)__builtin_ia32_psrad128((__v4si
)__a
, (__v4si
)__count
);
2867 /// Right-shifts the 128-bit integer vector operand by the specified
2868 /// number of bytes. High-order bits are cleared.
///
2870 /// \headerfile <x86intrin.h>
///
/// \code
2873 /// __m128i _mm_srli_si128(__m128i a, const int imm);
/// \endcode
///
2876 /// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
///
/// \param a
2879 /// A 128-bit integer vector containing the source operand.
/// \param imm
2881 /// An immediate value specifying the number of bytes to right-shift operand
/// \a a.
2883 /// \returns A 128-bit integer vector containing the right-shifted value.
///
/// _mm_bsrli_si128 is defined right after _mm_srli_si128 with the same visible
/// expansion (the same byte-shift builtin applied to \a a).
/// NOTE(review): the last line of each macro (the argument that follows
/// \a a) is not visible in this excerpt; presumably it forwards \a imm to the
/// builtin -- confirm against the full file.
2884 #define _mm_srli_si128(a, imm) \
2885 ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), \
2888 #define _mm_bsrli_si128(a, imm) \
2889 ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), \
2892 /// Right-shifts each of 16-bit values in the 128-bit integer vector
2893 /// operand by the specified number of bits. High-order bits are cleared.
2895 /// \headerfile <x86intrin.h>
2897 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2900 /// A 128-bit integer vector containing the source operand.
2902 /// An integer value specifying the number of bits to right-shift each value
2903 /// in operand \a __a.
2904 /// \returns A 128-bit integer vector containing the right-shifted values.
2905 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srli_epi16(__m128i __a
,
2907 return (__m128i
)__builtin_ia32_psrlwi128((__v8hi
)__a
, __count
);
2910 /// Right-shifts each of 16-bit values in the 128-bit integer vector
2911 /// operand by the specified number of bits. High-order bits are cleared.
2913 /// \headerfile <x86intrin.h>
2915 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2918 /// A 128-bit integer vector containing the source operand.
2920 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2921 /// to right-shift each value in operand \a __a.
2922 /// \returns A 128-bit integer vector containing the right-shifted values.
2923 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srl_epi16(__m128i __a
,
2925 return (__m128i
)__builtin_ia32_psrlw128((__v8hi
)__a
, (__v8hi
)__count
);
2928 /// Right-shifts each of 32-bit values in the 128-bit integer vector
2929 /// operand by the specified number of bits. High-order bits are cleared.
2931 /// \headerfile <x86intrin.h>
2933 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
2936 /// A 128-bit integer vector containing the source operand.
2938 /// An integer value specifying the number of bits to right-shift each value
2939 /// in operand \a __a.
2940 /// \returns A 128-bit integer vector containing the right-shifted values.
2941 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srli_epi32(__m128i __a
,
2943 return (__m128i
)__builtin_ia32_psrldi128((__v4si
)__a
, __count
);
2946 /// Right-shifts each of 32-bit values in the 128-bit integer vector
2947 /// operand by the specified number of bits. High-order bits are cleared.
2949 /// \headerfile <x86intrin.h>
2951 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
2954 /// A 128-bit integer vector containing the source operand.
2956 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2957 /// to right-shift each value in operand \a __a.
2958 /// \returns A 128-bit integer vector containing the right-shifted values.
2959 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srl_epi32(__m128i __a
,
2961 return (__m128i
)__builtin_ia32_psrld128((__v4si
)__a
, (__v4si
)__count
);
2964 /// Right-shifts each of 64-bit values in the 128-bit integer vector
2965 /// operand by the specified number of bits. High-order bits are cleared.
2967 /// \headerfile <x86intrin.h>
2969 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
2972 /// A 128-bit integer vector containing the source operand.
2974 /// An integer value specifying the number of bits to right-shift each value
2975 /// in operand \a __a.
2976 /// \returns A 128-bit integer vector containing the right-shifted values.
2977 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srli_epi64(__m128i __a
,
2979 return __builtin_ia32_psrlqi128((__v2di
)__a
, __count
);
2982 /// Right-shifts each of 64-bit values in the 128-bit integer vector
2983 /// operand by the specified number of bits. High-order bits are cleared.
2985 /// \headerfile <x86intrin.h>
2987 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
2990 /// A 128-bit integer vector containing the source operand.
2992 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2993 /// to right-shift each value in operand \a __a.
2994 /// \returns A 128-bit integer vector containing the right-shifted values.
2995 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srl_epi64(__m128i __a
,
2997 return __builtin_ia32_psrlq128((__v2di
)__a
, (__v2di
)__count
);
3000 /// Compares each of the corresponding 8-bit values of the 128-bit
3001 /// integer vectors for equality. Each comparison yields 0x0 for false, 0xFF
3004 /// \headerfile <x86intrin.h>
3006 /// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
3009 /// A 128-bit integer vector.
3011 /// A 128-bit integer vector.
3012 /// \returns A 128-bit integer vector containing the comparison results.
3013 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpeq_epi8(__m128i __a
,
3015 return (__m128i
)((__v16qi
)__a
== (__v16qi
)__b
);
3018 /// Compares each of the corresponding 16-bit values of the 128-bit
3019 /// integer vectors for equality. Each comparison yields 0x0 for false,
3020 /// 0xFFFF for true.
3022 /// \headerfile <x86intrin.h>
3024 /// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
3027 /// A 128-bit integer vector.
3029 /// A 128-bit integer vector.
3030 /// \returns A 128-bit integer vector containing the comparison results.
3031 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpeq_epi16(__m128i __a
,
3033 return (__m128i
)((__v8hi
)__a
== (__v8hi
)__b
);
3036 /// Compares each of the corresponding 32-bit values of the 128-bit
3037 /// integer vectors for equality. Each comparison yields 0x0 for false,
3038 /// 0xFFFFFFFF for true.
3040 /// \headerfile <x86intrin.h>
3042 /// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
3045 /// A 128-bit integer vector.
3047 /// A 128-bit integer vector.
3048 /// \returns A 128-bit integer vector containing the comparison results.
3049 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpeq_epi32(__m128i __a
,
3051 return (__m128i
)((__v4si
)__a
== (__v4si
)__b
);
3054 /// Compares each of the corresponding signed 8-bit values of the 128-bit
3055 /// integer vectors to determine if the values in the first operand are
3056 /// greater than those in the second operand. Each comparison yields 0x0 for
3057 /// false, 0xFF for true.
3059 /// \headerfile <x86intrin.h>
3061 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3064 /// A 128-bit integer vector.
3066 /// A 128-bit integer vector.
3067 /// \returns A 128-bit integer vector containing the comparison results.
3068 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpgt_epi8(__m128i __a
,
3070 /* This function always performs a signed comparison, but __v16qi is a char
3071 which may be signed or unsigned, so use __v16qs. */
3072 return (__m128i
)((__v16qs
)__a
> (__v16qs
)__b
);
3075 /// Compares each of the corresponding signed 16-bit values of the
3076 /// 128-bit integer vectors to determine if the values in the first operand
3077 /// are greater than those in the second operand.
3079 /// Each comparison yields 0x0 for false, 0xFFFF for true.
3081 /// \headerfile <x86intrin.h>
3083 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3086 /// A 128-bit integer vector.
3088 /// A 128-bit integer vector.
3089 /// \returns A 128-bit integer vector containing the comparison results.
3090 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpgt_epi16(__m128i __a
,
3092 return (__m128i
)((__v8hi
)__a
> (__v8hi
)__b
);
3095 /// Compares each of the corresponding signed 32-bit values of the
3096 /// 128-bit integer vectors to determine if the values in the first operand
3097 /// are greater than those in the second operand.
3099 /// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
3101 /// \headerfile <x86intrin.h>
3103 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3106 /// A 128-bit integer vector.
3108 /// A 128-bit integer vector.
3109 /// \returns A 128-bit integer vector containing the comparison results.
3110 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpgt_epi32(__m128i __a
,
3112 return (__m128i
)((__v4si
)__a
> (__v4si
)__b
);
3115 /// Compares each of the corresponding signed 8-bit values of the 128-bit
3116 /// integer vectors to determine if the values in the first operand are less
3117 /// than those in the second operand.
3119 /// Each comparison yields 0x0 for false, 0xFF for true.
3121 /// \headerfile <x86intrin.h>
3123 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3126 /// A 128-bit integer vector.
3128 /// A 128-bit integer vector.
3129 /// \returns A 128-bit integer vector containing the comparison results.
3130 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmplt_epi8(__m128i __a
,
3132 return _mm_cmpgt_epi8(__b
, __a
);
3135 /// Compares each of the corresponding signed 16-bit values of the
3136 /// 128-bit integer vectors to determine if the values in the first operand
3137 /// are less than those in the second operand.
3139 /// Each comparison yields 0x0 for false, 0xFFFF for true.
3141 /// \headerfile <x86intrin.h>
3143 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3146 /// A 128-bit integer vector.
3148 /// A 128-bit integer vector.
3149 /// \returns A 128-bit integer vector containing the comparison results.
3150 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmplt_epi16(__m128i __a
,
3152 return _mm_cmpgt_epi16(__b
, __a
);
3155 /// Compares each of the corresponding signed 32-bit values of the
3156 /// 128-bit integer vectors to determine if the values in the first operand
3157 /// are less than those in the second operand.
3159 /// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
3161 /// \headerfile <x86intrin.h>
3163 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3166 /// A 128-bit integer vector.
3168 /// A 128-bit integer vector.
3169 /// \returns A 128-bit integer vector containing the comparison results.
3170 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmplt_epi32(__m128i __a
,
3172 return _mm_cmpgt_epi32(__b
, __a
);
3176 /// Converts a 64-bit signed integer value from the second operand into a
3177 /// double-precision value and returns it in the lower element of a [2 x
3178 /// double] vector; the upper element of the returned vector is copied from
3179 /// the upper element of the first operand.
///
3181 /// \headerfile <x86intrin.h>
///
3183 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
///
/// \param __a
3186 /// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
3187 /// copied to the upper 64 bits of the destination.
/// (second operand)
3189 /// A 64-bit signed integer operand containing the value to be converted.
3190 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
3191 /// converted value of the second operand. The upper 64 bits are copied from
3192 /// the upper 64 bits of the first operand.
///
/// NOTE(review): the declaration of the second parameter and the function body
/// are not visible in this excerpt; the description above is the documented
/// contract only -- confirm against the full file.
3193 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtsi64_sd(__m128d __a
,
3199 /// Converts the first (lower) element of a vector of [2 x double] into a
3200 /// 64-bit signed integer value, according to the current rounding mode.
3202 /// \headerfile <x86intrin.h>
3204 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
3207 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3209 /// \returns A 64-bit signed integer containing the converted value.
3210 static __inline__
long long __DEFAULT_FN_ATTRS
_mm_cvtsd_si64(__m128d __a
) {
3211 return __builtin_ia32_cvtsd2si64((__v2df
)__a
);
3214 /// Converts the first (lower) element of a vector of [2 x double] into a
3215 /// 64-bit signed integer value, truncating the result when it is inexact.
3217 /// \headerfile <x86intrin.h>
3219 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
3223 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3225 /// \returns A 64-bit signed integer containing the converted value.
3226 static __inline__
long long __DEFAULT_FN_ATTRS
_mm_cvttsd_si64(__m128d __a
) {
3227 return __builtin_ia32_cvttsd2si64((__v2df
)__a
);
3231 /// Converts a vector of [4 x i32] into a vector of [4 x float].
3233 /// \headerfile <x86intrin.h>
3235 /// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
3238 /// A 128-bit integer vector.
3239 /// \returns A 128-bit vector of [4 x float] containing the converted values.
3240 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvtepi32_ps(__m128i __a
) {
3241 return (__m128
) __builtin_convertvector((__v4si
)__a
, __v4sf
);
3244 /// Converts a vector of [4 x float] into a vector of [4 x i32].
3246 /// \headerfile <x86intrin.h>
3248 /// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
3251 /// A 128-bit vector of [4 x float].
3252 /// \returns A 128-bit integer vector of [4 x i32] containing the converted
3254 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtps_epi32(__m128 __a
) {
3255 return (__m128i
)__builtin_ia32_cvtps2dq((__v4sf
)__a
);
3258 /// Converts a vector of [4 x float] into a vector of [4 x i32],
3259 /// truncating the result when it is inexact.
3261 /// \headerfile <x86intrin.h>
3263 /// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
3267 /// A 128-bit vector of [4 x float].
3268 /// \returns A 128-bit vector of [4 x i32] containing the converted values.
3269 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvttps_epi32(__m128 __a
) {
3270 return (__m128i
)__builtin_ia32_cvttps2dq((__v4sf
)__a
);
3273 /// Returns a vector of [4 x i32] where the lowest element is the input
3274 /// operand and the remaining elements are zero.
3276 /// \headerfile <x86intrin.h>
3278 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3281 /// A 32-bit signed integer operand.
3282 /// \returns A 128-bit vector of [4 x i32].
3283 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtsi32_si128(int __a
) {
3284 return __extension__(__m128i
)(__v4si
){__a
, 0, 0, 0};
3287 /// Returns a vector of [2 x i64] where the lower element is the input
3288 /// operand and the upper element is zero.
3290 /// \headerfile <x86intrin.h>
3292 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction
3296 /// A 64-bit signed integer operand containing the value to be converted.
3297 /// \returns A 128-bit vector of [2 x i64] containing the converted value.
3298 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtsi64_si128(long long __a
) {
3299 return __extension__(__m128i
)(__v2di
){__a
, 0};
3302 /// Moves the least significant 32 bits of a vector of [4 x i32] to a
3303 /// 32-bit signed integer value.
///
3305 /// \headerfile <x86intrin.h>
///
3307 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
///
/// \param __a
3310 /// A vector of [4 x i32]. The least significant 32 bits are moved to the
/// destination.
3312 /// \returns A 32-bit signed integer containing the moved value.
3313 static __inline__
int __DEFAULT_FN_ATTRS
_mm_cvtsi128_si32(__m128i __a
) {
/* Reinterpret the operand as four 32-bit lanes. */
3314 __v4si __b
= (__v4si
)__a
;
/* NOTE(review): the return statement is not visible in this excerpt;
   presumably it yields lane 0 of __b -- confirm against the full file. */
3318 /// Moves the least significant 64 bits of a vector of [2 x i64] to a
3319 /// 64-bit signed integer value.
3321 /// \headerfile <x86intrin.h>
3323 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3326 /// A vector of [2 x i64]. The least significant 64 bits are moved to the
3328 /// \returns A 64-bit signed integer containing the moved value.
3329 static __inline__
long long __DEFAULT_FN_ATTRS
_mm_cvtsi128_si64(__m128i __a
) {
3333 /// Moves packed integer values from an aligned 128-bit memory location
3334 /// to elements in a 128-bit integer vector.
3336 /// \headerfile <x86intrin.h>
3338 /// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
3341 /// An aligned pointer to a memory location containing integer values.
3342 /// \returns A 128-bit integer vector containing the moved values.
3343 static __inline__ __m128i __DEFAULT_FN_ATTRS
3344 _mm_load_si128(__m128i
const *__p
) {
3348 /// Moves packed integer values from an unaligned 128-bit memory location
3349 /// to elements in a 128-bit integer vector.
3351 /// \headerfile <x86intrin.h>
3353 /// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
3356 /// A pointer to a memory location containing integer values.
3357 /// \returns A 128-bit integer vector containing the moved values.
3358 static __inline__ __m128i __DEFAULT_FN_ATTRS
3359 _mm_loadu_si128(__m128i_u
const *__p
) {
3360 struct __loadu_si128
{
3362 } __attribute__((__packed__
, __may_alias__
));
3363 return ((const struct __loadu_si128
*)__p
)->__v
;
3366 /// Returns a vector of [2 x i64] where the lower element is taken from
3367 /// the lower element of the operand, and the upper element is zero.
3369 /// \headerfile <x86intrin.h>
3371 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3374 /// A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of
3375 /// the destination.
3376 /// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
3377 /// moved value. The higher order bits are cleared.
3378 static __inline__ __m128i __DEFAULT_FN_ATTRS
3379 _mm_loadl_epi64(__m128i_u
const *__p
) {
3380 struct __mm_loadl_epi64_struct
{
3382 } __attribute__((__packed__
, __may_alias__
));
3383 return __extension__(__m128i
){
3384 ((const struct __mm_loadl_epi64_struct
*)__p
)->__u
, 0};
3387 /// Generates a 128-bit vector of [4 x i32] with unspecified content.
3388 /// This could be used as an argument to another intrinsic function where the
3389 /// argument is required but the value is not actually used.
3391 /// \headerfile <x86intrin.h>
3393 /// This intrinsic has no corresponding instruction.
3395 /// \returns A 128-bit vector of [4 x i32] with unspecified content.
3396 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_undefined_si128(void) {
3397 return (__m128i
)__builtin_ia32_undef128();
3400 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3401 /// the specified 64-bit integer values.
3403 /// \headerfile <x86intrin.h>
3405 /// This intrinsic is a utility function and does not correspond to a specific
3409 /// A 64-bit integer value used to initialize the upper 64 bits of the
3410 /// destination vector of [2 x i64].
3412 /// A 64-bit integer value used to initialize the lower 64 bits of the
3413 /// destination vector of [2 x i64].
3414 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
3415 /// provided in the operands.
3416 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set_epi64x(long long __q1
,
3418 return __extension__(__m128i
)(__v2di
){__q0
, __q1
};
3421 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3422 /// the specified 64-bit integer values.
3424 /// \headerfile <x86intrin.h>
3426 /// This intrinsic is a utility function and does not correspond to a specific
3430 /// A 64-bit integer value used to initialize the upper 64 bits of the
3431 /// destination vector of [2 x i64].
3433 /// A 64-bit integer value used to initialize the lower 64 bits of the
3434 /// destination vector of [2 x i64].
3435 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
3436 /// provided in the operands.
3437 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set_epi64(__m64 __q1
,
3439 return _mm_set_epi64x((long long)__q1
, (long long)__q0
);
3442 /// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
3443 /// the specified 32-bit integer values.
3445 /// \headerfile <x86intrin.h>
3447 /// This intrinsic is a utility function and does not correspond to a specific
3451 /// A 32-bit integer value used to initialize bits [127:96] of the
3452 /// destination vector.
3454 /// A 32-bit integer value used to initialize bits [95:64] of the destination
3457 /// A 32-bit integer value used to initialize bits [63:32] of the destination
3460 /// A 32-bit integer value used to initialize bits [31:0] of the destination
3462 /// \returns An initialized 128-bit vector of [4 x i32] containing the values
3463 /// provided in the operands.
3464 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set_epi32(int __i3
, int __i2
,
3465 int __i1
, int __i0
) {
3466 return __extension__(__m128i
)(__v4si
){__i0
, __i1
, __i2
, __i3
};
3469 /// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
3470 /// the specified 16-bit integer values.
3472 /// \headerfile <x86intrin.h>
3474 /// This intrinsic is a utility function and does not correspond to a specific
3478 /// A 16-bit integer value used to initialize bits [127:112] of the
3479 /// destination vector.
3481 /// A 16-bit integer value used to initialize bits [111:96] of the
3482 /// destination vector.
3484 /// A 16-bit integer value used to initialize bits [95:80] of the destination
3487 /// A 16-bit integer value used to initialize bits [79:64] of the destination
3490 /// A 16-bit integer value used to initialize bits [63:48] of the destination
3493 /// A 16-bit integer value used to initialize bits [47:32] of the destination
3496 /// A 16-bit integer value used to initialize bits [31:16] of the destination
3499 /// A 16-bit integer value used to initialize bits [15:0] of the destination
3501 /// \returns An initialized 128-bit vector of [8 x i16] containing the values
3502 /// provided in the operands.
3503 static __inline__ __m128i __DEFAULT_FN_ATTRS
3504 _mm_set_epi16(short __w7
, short __w6
, short __w5
, short __w4
, short __w3
,
3505 short __w2
, short __w1
, short __w0
) {
3506 return __extension__(__m128i
)(__v8hi
){__w0
, __w1
, __w2
, __w3
,
3507 __w4
, __w5
, __w6
, __w7
};
3510 /// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
3511 /// the specified 8-bit integer values.
3513 /// \headerfile <x86intrin.h>
3515 /// This intrinsic is a utility function and does not correspond to a specific
3519 /// Initializes bits [127:120] of the destination vector.
3521 /// Initializes bits [119:112] of the destination vector.
3523 /// Initializes bits [111:104] of the destination vector.
3525 /// Initializes bits [103:96] of the destination vector.
3527 /// Initializes bits [95:88] of the destination vector.
3529 /// Initializes bits [87:80] of the destination vector.
3531 /// Initializes bits [79:72] of the destination vector.
3533 /// Initializes bits [71:64] of the destination vector.
3535 /// Initializes bits [63:56] of the destination vector.
3537 /// Initializes bits [55:48] of the destination vector.
3539 /// Initializes bits [47:40] of the destination vector.
3541 /// Initializes bits [39:32] of the destination vector.
3543 /// Initializes bits [31:24] of the destination vector.
3545 /// Initializes bits [23:16] of the destination vector.
3547 /// Initializes bits [15:8] of the destination vector.
3549 /// Initializes bits [7:0] of the destination vector.
3550 /// \returns An initialized 128-bit vector of [16 x i8] containing the values
3551 /// provided in the operands.
3552 static __inline__ __m128i __DEFAULT_FN_ATTRS
3553 _mm_set_epi8(char __b15
, char __b14
, char __b13
, char __b12
, char __b11
,
3554 char __b10
, char __b9
, char __b8
, char __b7
, char __b6
, char __b5
,
3555 char __b4
, char __b3
, char __b2
, char __b1
, char __b0
) {
3556 return __extension__(__m128i
)(__v16qi
){
3557 __b0
, __b1
, __b2
, __b3
, __b4
, __b5
, __b6
, __b7
,
3558 __b8
, __b9
, __b10
, __b11
, __b12
, __b13
, __b14
, __b15
};
3561 /// Initializes both values in a 128-bit integer vector with the
3562 /// specified 64-bit integer value.
3564 /// \headerfile <x86intrin.h>
3566 /// This intrinsic is a utility function and does not correspond to a specific
3570 /// Integer value used to initialize the elements of the destination integer
3572 /// \returns An initialized 128-bit integer vector of [2 x i64] with both
3573 /// elements containing the value provided in the operand.
3574 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi64x(long long __q
) {
3575 return _mm_set_epi64x(__q
, __q
);
3578 /// Initializes both values in a 128-bit vector of [2 x i64] with the
3579 /// specified 64-bit value.
3581 /// \headerfile <x86intrin.h>
3583 /// This intrinsic is a utility function and does not correspond to a specific
3587 /// A 64-bit value used to initialize the elements of the destination integer
3589 /// \returns An initialized 128-bit vector of [2 x i64] with all elements
3590 /// containing the value provided in the operand.
3591 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi64(__m64 __q
) {
3592 return _mm_set_epi64(__q
, __q
);
3595 /// Initializes all values in a 128-bit vector of [4 x i32] with the
3596 /// specified 32-bit value.
3598 /// \headerfile <x86intrin.h>
3600 /// This intrinsic is a utility function and does not correspond to a specific
3604 /// A 32-bit value used to initialize the elements of the destination integer
3606 /// \returns An initialized 128-bit vector of [4 x i32] with all elements
3607 /// containing the value provided in the operand.
3608 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi32(int __i
) {
3609 return _mm_set_epi32(__i
, __i
, __i
, __i
);
3612 /// Initializes all values in a 128-bit vector of [8 x i16] with the
3613 /// specified 16-bit value.
3615 /// \headerfile <x86intrin.h>
3617 /// This intrinsic is a utility function and does not correspond to a specific
3621 /// A 16-bit value used to initialize the elements of the destination integer
3623 /// \returns An initialized 128-bit vector of [8 x i16] with all elements
3624 /// containing the value provided in the operand.
3625 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi16(short __w
) {
3626 return _mm_set_epi16(__w
, __w
, __w
, __w
, __w
, __w
, __w
, __w
);
3629 /// Initializes all values in a 128-bit vector of [16 x i8] with the
3630 /// specified 8-bit value.
3632 /// \headerfile <x86intrin.h>
3634 /// This intrinsic is a utility function and does not correspond to a specific
3638 /// An 8-bit value used to initialize the elements of the destination integer
3640 /// \returns An initialized 128-bit vector of [16 x i8] with all elements
3641 /// containing the value provided in the operand.
3642 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi8(char __b
) {
3643 return _mm_set_epi8(__b
, __b
, __b
, __b
, __b
, __b
, __b
, __b
, __b
, __b
, __b
,
3644 __b
, __b
, __b
, __b
, __b
);
3647 /// Constructs a 128-bit integer vector, initialized in reverse order
3648 /// with the specified 64-bit integral values.
3650 /// \headerfile <x86intrin.h>
3652 /// This intrinsic does not correspond to a specific instruction.
3655 /// A 64-bit integral value used to initialize the lower 64 bits of the
3658 /// A 64-bit integral value used to initialize the upper 64 bits of the
3660 /// \returns An initialized 128-bit integer vector.
3661 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi64(__m64 __q0
,
3663 return _mm_set_epi64(__q1
, __q0
);
3666 /// Constructs a 128-bit integer vector, initialized in reverse order
3667 /// with the specified 32-bit integral values.
3669 /// \headerfile <x86intrin.h>
3671 /// This intrinsic is a utility function and does not correspond to a specific
3675 /// A 32-bit integral value used to initialize bits [31:0] of the result.
3677 /// A 32-bit integral value used to initialize bits [63:32] of the result.
3679 /// A 32-bit integral value used to initialize bits [95:64] of the result.
3681 /// A 32-bit integral value used to initialize bits [127:96] of the result.
3682 /// \returns An initialized 128-bit integer vector.
3683 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi32(int __i0
, int __i1
,
3686 return _mm_set_epi32(__i3
, __i2
, __i1
, __i0
);
3689 /// Constructs a 128-bit integer vector, initialized in reverse order
3690 /// with the specified 16-bit integral values.
3692 /// \headerfile <x86intrin.h>
3694 /// This intrinsic is a utility function and does not correspond to a specific
3698 /// A 16-bit integral value used to initialize bits [15:0] of the result.
3700 /// A 16-bit integral value used to initialize bits [31:16] of the result.
3702 /// A 16-bit integral value used to initialize bits [47:32] of the result.
3704 /// A 16-bit integral value used to initialize bits [63:48] of the result.
3706 /// A 16-bit integral value used to initialize bits [79:64] of the result.
3708 /// A 16-bit integral value used to initialize bits [95:80] of the result.
3710 /// A 16-bit integral value used to initialize bits [111:96] of the result.
3712 /// A 16-bit integral value used to initialize bits [127:112] of the result.
3713 /// \returns An initialized 128-bit integer vector.
3714 static __inline__ __m128i __DEFAULT_FN_ATTRS
3715 _mm_setr_epi16(short __w0
, short __w1
, short __w2
, short __w3
, short __w4
,
3716 short __w5
, short __w6
, short __w7
) {
3717 return _mm_set_epi16(__w7
, __w6
, __w5
, __w4
, __w3
, __w2
, __w1
, __w0
);
3720 /// Constructs a 128-bit integer vector, initialized in reverse order
3721 /// with the specified 8-bit integral values.
3723 /// \headerfile <x86intrin.h>
3725 /// This intrinsic is a utility function and does not correspond to a specific
3729 /// An 8-bit integral value used to initialize bits [7:0] of the result.
3731 /// An 8-bit integral value used to initialize bits [15:8] of the result.
3733 /// An 8-bit integral value used to initialize bits [23:16] of the result.
3735 /// An 8-bit integral value used to initialize bits [31:24] of the result.
3737 /// An 8-bit integral value used to initialize bits [39:32] of the result.
3739 /// An 8-bit integral value used to initialize bits [47:40] of the result.
3741 /// An 8-bit integral value used to initialize bits [55:48] of the result.
3743 /// An 8-bit integral value used to initialize bits [63:56] of the result.
3745 /// An 8-bit integral value used to initialize bits [71:64] of the result.
3747 /// An 8-bit integral value used to initialize bits [79:72] of the result.
3749 /// An 8-bit integral value used to initialize bits [87:80] of the result.
3751 /// An 8-bit integral value used to initialize bits [95:88] of the result.
3753 /// An 8-bit integral value used to initialize bits [103:96] of the result.
3755 /// An 8-bit integral value used to initialize bits [111:104] of the result.
3757 /// An 8-bit integral value used to initialize bits [119:112] of the result.
3759 /// An 8-bit integral value used to initialize bits [127:120] of the result.
3760 /// \returns An initialized 128-bit integer vector.
3761 static __inline__ __m128i __DEFAULT_FN_ATTRS
3762 _mm_setr_epi8(char __b0
, char __b1
, char __b2
, char __b3
, char __b4
, char __b5
,
3763 char __b6
, char __b7
, char __b8
, char __b9
, char __b10
,
3764 char __b11
, char __b12
, char __b13
, char __b14
, char __b15
) {
3765 return _mm_set_epi8(__b15
, __b14
, __b13
, __b12
, __b11
, __b10
, __b9
, __b8
,
3766 __b7
, __b6
, __b5
, __b4
, __b3
, __b2
, __b1
, __b0
);
3769 /// Creates a 128-bit integer vector initialized to zero.
3771 /// \headerfile <x86intrin.h>
3773 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
3775 /// \returns An initialized 128-bit integer vector with all elements set to
3777 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setzero_si128(void) {
3778 return __extension__(__m128i
)(__v2di
){0LL, 0LL};
3781 /// Stores a 128-bit integer vector to a memory location aligned on a
3782 /// 128-bit boundary.
3784 /// \headerfile <x86intrin.h>
3786 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
3789 /// A pointer to an aligned memory location that will receive the integer
3792 /// A 128-bit integer vector containing the values to be moved.
3793 static __inline__
void __DEFAULT_FN_ATTRS
_mm_store_si128(__m128i
*__p
,
3798 /// Stores a 128-bit integer vector to an unaligned memory location.
3800 /// \headerfile <x86intrin.h>
3802 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
3805 /// A pointer to a memory location that will receive the integer values.
3807 /// A 128-bit integer vector containing the values to be moved.
3808 static __inline__
void __DEFAULT_FN_ATTRS
_mm_storeu_si128(__m128i_u
*__p
,
3810 struct __storeu_si128
{
3812 } __attribute__((__packed__
, __may_alias__
));
3813 ((struct __storeu_si128
*)__p
)->__v
= __b
;
3816 /// Stores a 64-bit integer value from the low element of a 128-bit integer
3819 /// \headerfile <x86intrin.h>
3821 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3824 /// A pointer to a 64-bit memory location. The address of the memory
3825 /// location does not have to be aligned.
3827 /// A 128-bit integer vector containing the value to be stored.
3828 static __inline__
void __DEFAULT_FN_ATTRS
_mm_storeu_si64(void *__p
,
3830 struct __storeu_si64
{
3832 } __attribute__((__packed__
, __may_alias__
));
3833 ((struct __storeu_si64
*)__p
)->__v
= ((__v2di
)__b
)[0];
3836 /// Stores a 32-bit integer value from the low element of a 128-bit integer
3839 /// \headerfile <x86intrin.h>
3841 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3844 /// A pointer to a 32-bit memory location. The address of the memory
3845 /// location does not have to be aligned.
3847 /// A 128-bit integer vector containing the value to be stored.
3848 static __inline__
void __DEFAULT_FN_ATTRS
_mm_storeu_si32(void *__p
,
3850 struct __storeu_si32
{
3852 } __attribute__((__packed__
, __may_alias__
));
3853 ((struct __storeu_si32
*)__p
)->__v
= ((__v4si
)__b
)[0];
3856 /// Stores a 16-bit integer value from the low element of a 128-bit integer
3859 /// \headerfile <x86intrin.h>
3861 /// This intrinsic does not correspond to a specific instruction.
3864 /// A pointer to a 16-bit memory location. The address of the memory
3865 /// location does not have to be aligned.
3867 /// A 128-bit integer vector containing the value to be stored.
3868 static __inline__
void __DEFAULT_FN_ATTRS
_mm_storeu_si16(void *__p
,
3870 struct __storeu_si16
{
3872 } __attribute__((__packed__
, __may_alias__
));
3873 ((struct __storeu_si16
*)__p
)->__v
= ((__v8hi
)__b
)[0];
3876 /// Moves bytes selected by the mask from the first operand to the
3877 /// specified unaligned memory location. When a mask bit is 1, the
3878 /// corresponding byte is written, otherwise it is not written.
3880 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
3881 /// used again soon). Exception and trap behavior for elements not selected
3882 /// for storage to memory are implementation dependent.
3884 /// \headerfile <x86intrin.h>
3886 /// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
3890 /// A 128-bit integer vector containing the values to be moved.
3892 /// A 128-bit integer vector containing the mask. The most significant bit of
3893 /// each byte represents the mask bits.
3895 /// A pointer to an unaligned 128-bit memory location where the specified
3896 /// values are moved.
3897 static __inline__
void __DEFAULT_FN_ATTRS
_mm_maskmoveu_si128(__m128i __d
,
3900 __builtin_ia32_maskmovdqu((__v16qi
)__d
, (__v16qi
)__n
, __p
);
3903 /// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
3904 /// a memory location.
3906 /// \headerfile <x86intrin.h>
3908 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
3911 /// A pointer to a 64-bit memory location that will receive the lower 64 bits
3912 /// of the integer vector parameter.
3914 /// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
3915 /// value to be stored.
3916 static __inline__
void __DEFAULT_FN_ATTRS
_mm_storel_epi64(__m128i_u
*__p
,
3918 struct __mm_storel_epi64_struct
{
3920 } __attribute__((__packed__
, __may_alias__
));
3921 ((struct __mm_storel_epi64_struct
*)__p
)->__u
= __a
[0];
3924 /// Stores a 128-bit floating point vector of [2 x double] to a 128-bit
3925 /// aligned memory location.
3927 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
3928 /// used again soon).
3930 /// \headerfile <x86intrin.h>
3932 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
3935 /// A pointer to the 128-bit aligned memory location used to store the value.
3937 /// A vector of [2 x double] containing the 64-bit values to be stored.
3938 static __inline__
void __DEFAULT_FN_ATTRS
_mm_stream_pd(double *__p
,
3940 __builtin_nontemporal_store((__v2df
)__a
, (__v2df
*)__p
);
3943 /// Stores a 128-bit integer vector to a 128-bit aligned memory location.
3945 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
3946 /// used again soon).
3948 /// \headerfile <x86intrin.h>
3950 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
3953 /// A pointer to the 128-bit aligned memory location used to store the value.
3955 /// A 128-bit integer vector containing the values to be stored.
3956 static __inline__
void __DEFAULT_FN_ATTRS
_mm_stream_si128(__m128i
*__p
,
3958 __builtin_nontemporal_store((__v2di
)__a
, (__v2di
*)__p
);
/// Stores a 32-bit integer value in the specified memory location.
///
///    To minimize caching, the data is flagged as non-temporal (unlikely to be
///    used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
///
/// \param __p
///    A pointer to the 32-bit memory location used to store the value.
/// \param __a
///    A 32-bit integer containing the value to be stored.
/* The attributes are spelled out instead of using __DEFAULT_FN_ATTRS, so no
 * __min_vector_width__ is attached to this scalar-only function. */
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
    _mm_stream_si32(int *__p, int __a) {
  __builtin_ia32_movnti(__p, __a);
}
/* __builtin_ia32_movnti64 is a 64-bit-only builtin, so this intrinsic is only
 * defined when targeting x86-64. */
#ifdef __x86_64__
/// Stores a 64-bit integer value in the specified memory location.
///
///    To minimize caching, the data is flagged as non-temporal (unlikely to be
///    used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
///
/// \param __p
///    A pointer to the 64-bit memory location used to store the value.
/// \param __a
///    A 64-bit integer containing the value to be stored.
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
    _mm_stream_si64(long long *__p, long long __a) {
  __builtin_ia32_movnti64(__p, __a);
}
#endif
/* The following three intrinsics are only declared here; giving them C
 * linkage keeps the declarations consistent when this header is included
 * from C++.
 * NOTE(review): the `extern "C" {` opener, its closing brace and both
 * `#endif` lines were missing from this copy of the file and are restored
 * here - confirm. */
#if defined(__cplusplus)
extern "C" {
#endif

/// The cache line containing \a __p is flushed and invalidated from all
///    caches in the coherency domain.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
///
/// \param __p
///    A pointer to the memory location used to identify the cache line to be
///    flushed.
void _mm_clflush(void const *__p);

/// Forces strong memory ordering (serialization) between load
///    instructions preceding this instruction and load instructions following
///    this instruction, ensuring the system completes all previous loads before
///    executing subsequent loads.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
///
void _mm_lfence(void);

/// Forces strong memory ordering (serialization) between load and store
///    instructions preceding this instruction and load and store instructions
///    following this instruction, ensuring that the system completes all
///    previous memory accesses before executing subsequent memory accesses.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MFENCE </c> instruction.
///
void _mm_mfence(void);

#if defined(__cplusplus)
} // extern "C"
#endif
4043 /// Converts 16-bit signed integers from both 128-bit integer vector
4044 /// operands into 8-bit signed integers, and packs the results into the
4045 /// destination. Positive values greater than 0x7F are saturated to 0x7F.
4046 /// Negative values less than 0x80 are saturated to 0x80.
4048 /// \headerfile <x86intrin.h>
4050 /// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
4053 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4054 /// a signed integer and is converted to a 8-bit signed integer with
4055 /// saturation. Values greater than 0x7F are saturated to 0x7F. Values less
4056 /// than 0x80 are saturated to 0x80. The converted [8 x i8] values are
4057 /// written to the lower 64 bits of the result.
4059 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4060 /// a signed integer and is converted to a 8-bit signed integer with
4061 /// saturation. Values greater than 0x7F are saturated to 0x7F. Values less
4062 /// than 0x80 are saturated to 0x80. The converted [8 x i8] values are
4063 /// written to the higher 64 bits of the result.
4064 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
4065 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_packs_epi16(__m128i __a
,
4067 return (__m128i
)__builtin_ia32_packsswb128((__v8hi
)__a
, (__v8hi
)__b
);
4070 /// Converts 32-bit signed integers from both 128-bit integer vector
4071 /// operands into 16-bit signed integers, and packs the results into the
4072 /// destination. Positive values greater than 0x7FFF are saturated to 0x7FFF.
4073 /// Negative values less than 0x8000 are saturated to 0x8000.
4075 /// \headerfile <x86intrin.h>
4077 /// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
4080 /// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
4081 /// a signed integer and is converted to a 16-bit signed integer with
4082 /// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
4083 /// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
4084 /// are written to the lower 64 bits of the result.
4086 /// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
4087 /// a signed integer and is converted to a 16-bit signed integer with
4088 /// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
4089 /// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
4090 /// are written to the higher 64 bits of the result.
4091 /// \returns A 128-bit vector of [8 x i16] containing the converted values.
4092 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_packs_epi32(__m128i __a
,
4094 return (__m128i
)__builtin_ia32_packssdw128((__v4si
)__a
, (__v4si
)__b
);
4097 /// Converts 16-bit signed integers from both 128-bit integer vector
4098 /// operands into 8-bit unsigned integers, and packs the results into the
4099 /// destination. Values greater than 0xFF are saturated to 0xFF. Values less
4100 /// than 0x00 are saturated to 0x00.
4102 /// \headerfile <x86intrin.h>
4104 /// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
4107 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4108 /// a signed integer and is converted to an 8-bit unsigned integer with
4109 /// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
4110 /// than 0x00 are saturated to 0x00. The converted [8 x i8] values are
4111 /// written to the lower 64 bits of the result.
4113 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4114 /// a signed integer and is converted to an 8-bit unsigned integer with
4115 /// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
4116 /// than 0x00 are saturated to 0x00. The converted [8 x i8] values are
4117 /// written to the higher 64 bits of the result.
4118 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
4119 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_packus_epi16(__m128i __a
,
4121 return (__m128i
)__builtin_ia32_packuswb128((__v8hi
)__a
, (__v8hi
)__b
);
/// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
///    the immediate-value parameter as a selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
///
/// \param a
///    A 128-bit integer vector.
/// \param imm
///    An immediate value. Bits [2:0] selects values from \a a to be assigned
///    to bits[15:0] of the result. \n
///    000: assign values from bits [15:0] of \a a. \n
///    001: assign values from bits [31:16] of \a a. \n
///    010: assign values from bits [47:32] of \a a. \n
///    011: assign values from bits [63:48] of \a a. \n
///    100: assign values from bits [79:64] of \a a. \n
///    101: assign values from bits [95:80] of \a a. \n
///    110: assign values from bits [111:96] of \a a. \n
///    111: assign values from bits [127:112] of \a a.
/// \returns An integer, whose lower 16 bits are selected from the 128-bit
///    integer vector parameter and the remaining bits are assigned zeros.
/* The (unsigned short) cast zero-extends the extracted element to int.
 * NOTE(review): the macro's continuation line was missing from this copy of
 * the file; the `(int)(imm)` argument is reconstructed by analogy with
 * _mm_shuffle_epi32 - confirm. */
#define _mm_extract_epi16(a, imm)                                              \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a),      \
                                                    (int)(imm)))
/// Constructs a 128-bit integer vector by first making a copy of the
///    128-bit integer vector parameter, and then inserting the lower 16 bits
///    of an integer parameter into an offset specified by the immediate-value
///    parameter.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi16(__m128i a, int b, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. This vector is copied to the
///    result and then one of the eight elements in the result is replaced by
///    the lower 16 bits of \a b.
/// \param b
///    An integer. The lower 16 bits of this parameter are written to the
///    result at the element selected by \a imm.
/// \param imm
///    An immediate value selecting which of the eight 16-bit elements of the
///    result receives the lower 16 bits of \a b.
/// \returns A 128-bit integer vector containing the constructed values.
/* NOTE(review): the macro's continuation line was missing from this copy of
 * the file; the `(int)(imm)` argument is reconstructed by analogy with
 * _mm_shuffle_epi32 - confirm. */
#define _mm_insert_epi16(a, b, imm)                                            \
  ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b),        \
                                        (int)(imm)))
4182 /// Copies the values of the most significant bits from each 8-bit
4183 /// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
4184 /// value, zero-extends the value, and writes it to the destination.
4186 /// \headerfile <x86intrin.h>
4188 /// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
4191 /// A 128-bit integer vector containing the values with bits to be extracted.
4192 /// \returns The most significant bits from each 8-bit element in \a __a,
4193 /// written to bits [15:0]. The other bits are assigned zeros.
4194 static __inline__
int __DEFAULT_FN_ATTRS
_mm_movemask_epi8(__m128i __a
) {
4195 return __builtin_ia32_pmovmskb128((__v16qi
)__a
);
/// Constructs a 128-bit integer vector by shuffling four 32-bit
///    elements of a 128-bit integer vector parameter, using the immediate-value
///    parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the values to be copied.
/// \param imm
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a. The destinations within the 128-bit destination are
///    assigned values as follows: \n
///    Bits [1:0] are used to assign values to bits [31:0] of the result. \n
///    Bits [3:2] are used to assign values to bits [63:32] of the result. \n
///    Bits [5:4] are used to assign values to bits [95:64] of the result. \n
///    Bits [7:6] are used to assign values to bits [127:96] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [31:0] of \a a. \n
///    01: assign values from bits [63:32] of \a a. \n
///    10: assign values from bits [95:64] of \a a. \n
///    11: assign values from bits [127:96] of \a a. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shuffle_epi32(a, imm)                                              \
  ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))
/// Constructs a 128-bit integer vector by shuffling four lower 16-bit
///    elements of a 128-bit integer vector of [8 x i16], using the immediate
///    value parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
///    [127:64] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from \a a. \n
///    Bits[1:0] are used to assign values to bits [15:0] of the result. \n
///    Bits[3:2] are used to assign values to bits [31:16] of the result. \n
///    Bits[5:4] are used to assign values to bits [47:32] of the result. \n
///    Bits[7:6] are used to assign values to bits [63:48] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [15:0] of \a a. \n
///    01: assign values from bits [31:16] of \a a. \n
///    10: assign values from bits [47:32] of \a a. \n
///    11: assign values from bits [63:48] of \a a. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflelo_epi16(a, imm)                                            \
  ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))
/// Constructs a 128-bit integer vector by shuffling four upper 16-bit
///    elements of a 128-bit integer vector of [8 x i16], using the immediate
///    value parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
///    [63:0] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from \a a. \n
///    Bits[1:0] are used to assign values to bits [79:64] of the result. \n
///    Bits[3:2] are used to assign values to bits [95:80] of the result. \n
///    Bits[5:4] are used to assign values to bits [111:96] of the result. \n
///    Bits[7:6] are used to assign values to bits [127:112] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [79:64] of \a a. \n
///    01: assign values from bits [95:80] of \a a. \n
///    10: assign values from bits [111:96] of \a a. \n
///    11: assign values from bits [127:112] of \a a. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflehi_epi16(a, imm)                                            \
  ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))
4298 /// Unpacks the high-order (index 8-15) values from two 128-bit vectors
4299 /// of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4301 /// \headerfile <x86intrin.h>
4303 /// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
4307 /// A 128-bit vector of [16 x i8].
4308 /// Bits [71:64] are written to bits [7:0] of the result. \n
4309 /// Bits [79:72] are written to bits [23:16] of the result. \n
4310 /// Bits [87:80] are written to bits [39:32] of the result. \n
4311 /// Bits [95:88] are written to bits [55:48] of the result. \n
4312 /// Bits [103:96] are written to bits [71:64] of the result. \n
4313 /// Bits [111:104] are written to bits [87:80] of the result. \n
4314 /// Bits [119:112] are written to bits [103:96] of the result. \n
4315 /// Bits [127:120] are written to bits [119:112] of the result.
4317 /// A 128-bit vector of [16 x i8]. \n
4318 /// Bits [71:64] are written to bits [15:8] of the result. \n
4319 /// Bits [79:72] are written to bits [31:24] of the result. \n
4320 /// Bits [87:80] are written to bits [47:40] of the result. \n
4321 /// Bits [95:88] are written to bits [63:56] of the result. \n
4322 /// Bits [103:96] are written to bits [79:72] of the result. \n
4323 /// Bits [111:104] are written to bits [95:88] of the result. \n
4324 /// Bits [119:112] are written to bits [111:104] of the result. \n
4325 /// Bits [127:120] are written to bits [127:120] of the result.
4326 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4327 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi8(__m128i __a
,
4329 return (__m128i
)__builtin_shufflevector(
4330 (__v16qi
)__a
, (__v16qi
)__b
, 8, 16 + 8, 9, 16 + 9, 10, 16 + 10, 11,
4331 16 + 11, 12, 16 + 12, 13, 16 + 13, 14, 16 + 14, 15, 16 + 15);
4334 /// Unpacks the high-order (index 4-7) values from two 128-bit vectors of
4335 /// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
4337 /// \headerfile <x86intrin.h>
4339 /// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
4343 /// A 128-bit vector of [8 x i16].
4344 /// Bits [79:64] are written to bits [15:0] of the result. \n
4345 /// Bits [95:80] are written to bits [47:32] of the result. \n
4346 /// Bits [111:96] are written to bits [79:64] of the result. \n
4347 /// Bits [127:112] are written to bits [111:96] of the result.
4349 /// A 128-bit vector of [8 x i16].
4350 /// Bits [79:64] are written to bits [31:16] of the result. \n
4351 /// Bits [95:80] are written to bits [63:48] of the result. \n
4352 /// Bits [111:96] are written to bits [95:80] of the result. \n
4353 /// Bits [127:112] are written to bits [127:112] of the result.
4354 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4355 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi16(__m128i __a
,
4357 return (__m128i
)__builtin_shufflevector((__v8hi
)__a
, (__v8hi
)__b
, 4, 8 + 4, 5,
4358 8 + 5, 6, 8 + 6, 7, 8 + 7);
4361 /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
4362 /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4364 /// \headerfile <x86intrin.h>
4366 /// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
4370 /// A 128-bit vector of [4 x i32]. \n
4371 /// Bits [95:64] are written to bits [31:0] of the destination. \n
4372 /// Bits [127:96] are written to bits [95:64] of the destination.
4374 /// A 128-bit vector of [4 x i32]. \n
4375 /// Bits [95:64] are written to bits [64:32] of the destination. \n
4376 /// Bits [127:96] are written to bits [127:96] of the destination.
4377 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4378 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi32(__m128i __a
,
4380 return (__m128i
)__builtin_shufflevector((__v4si
)__a
, (__v4si
)__b
, 2, 4 + 2, 3,
4384 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4385 /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4387 /// \headerfile <x86intrin.h>
4389 /// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
4393 /// A 128-bit vector of [2 x i64]. \n
4394 /// Bits [127:64] are written to bits [63:0] of the destination.
4396 /// A 128-bit vector of [2 x i64]. \n
4397 /// Bits [127:64] are written to bits [127:64] of the destination.
4398 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4399 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi64(__m128i __a
,
4401 return (__m128i
)__builtin_shufflevector((__v2di
)__a
, (__v2di
)__b
, 1, 2 + 1);
4404 /// Unpacks the low-order (index 0-7) values from two 128-bit vectors of
4405 /// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4407 /// \headerfile <x86intrin.h>
4409 /// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
4413 /// A 128-bit vector of [16 x i8]. \n
4414 /// Bits [7:0] are written to bits [7:0] of the result. \n
4415 /// Bits [15:8] are written to bits [23:16] of the result. \n
4416 /// Bits [23:16] are written to bits [39:32] of the result. \n
4417 /// Bits [31:24] are written to bits [55:48] of the result. \n
4418 /// Bits [39:32] are written to bits [71:64] of the result. \n
4419 /// Bits [47:40] are written to bits [87:80] of the result. \n
4420 /// Bits [55:48] are written to bits [103:96] of the result. \n
4421 /// Bits [63:56] are written to bits [119:112] of the result.
4423 /// A 128-bit vector of [16 x i8].
4424 /// Bits [7:0] are written to bits [15:8] of the result. \n
4425 /// Bits [15:8] are written to bits [31:24] of the result. \n
4426 /// Bits [23:16] are written to bits [47:40] of the result. \n
4427 /// Bits [31:24] are written to bits [63:56] of the result. \n
4428 /// Bits [39:32] are written to bits [79:72] of the result. \n
4429 /// Bits [47:40] are written to bits [95:88] of the result. \n
4430 /// Bits [55:48] are written to bits [111:104] of the result. \n
4431 /// Bits [63:56] are written to bits [127:120] of the result.
4432 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4433 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi8(__m128i __a
,
4435 return (__m128i
)__builtin_shufflevector(
4436 (__v16qi
)__a
, (__v16qi
)__b
, 0, 16 + 0, 1, 16 + 1, 2, 16 + 2, 3, 16 + 3, 4,
4437 16 + 4, 5, 16 + 5, 6, 16 + 6, 7, 16 + 7);
4440 /// Unpacks the low-order (index 0-3) values from each of the two 128-bit
4441 /// vectors of [8 x i16] and interleaves them into a 128-bit vector of
4444 /// \headerfile <x86intrin.h>
4446 /// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
4450 /// A 128-bit vector of [8 x i16].
4451 /// Bits [15:0] are written to bits [15:0] of the result. \n
4452 /// Bits [31:16] are written to bits [47:32] of the result. \n
4453 /// Bits [47:32] are written to bits [79:64] of the result. \n
4454 /// Bits [63:48] are written to bits [111:96] of the result.
4456 /// A 128-bit vector of [8 x i16].
4457 /// Bits [15:0] are written to bits [31:16] of the result. \n
4458 /// Bits [31:16] are written to bits [63:48] of the result. \n
4459 /// Bits [47:32] are written to bits [95:80] of the result. \n
4460 /// Bits [63:48] are written to bits [127:112] of the result.
4461 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4462 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi16(__m128i __a
,
4464 return (__m128i
)__builtin_shufflevector((__v8hi
)__a
, (__v8hi
)__b
, 0, 8 + 0, 1,
4465 8 + 1, 2, 8 + 2, 3, 8 + 3);
4468 /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
4469 /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4471 /// \headerfile <x86intrin.h>
4473 /// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
4477 /// A 128-bit vector of [4 x i32]. \n
4478 /// Bits [31:0] are written to bits [31:0] of the destination. \n
4479 /// Bits [63:32] are written to bits [95:64] of the destination.
4481 /// A 128-bit vector of [4 x i32]. \n
4482 /// Bits [31:0] are written to bits [64:32] of the destination. \n
4483 /// Bits [63:32] are written to bits [127:96] of the destination.
4484 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4485 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi32(__m128i __a
,
4487 return (__m128i
)__builtin_shufflevector((__v4si
)__a
, (__v4si
)__b
, 0, 4 + 0, 1,
4491 /// Unpacks the low-order 64-bit elements from two 128-bit vectors of
4492 /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4494 /// \headerfile <x86intrin.h>
4496 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
4500 /// A 128-bit vector of [2 x i64]. \n
4501 /// Bits [63:0] are written to bits [63:0] of the destination. \n
4503 /// A 128-bit vector of [2 x i64]. \n
4504 /// Bits [63:0] are written to bits [127:64] of the destination. \n
4505 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4506 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi64(__m128i __a
,
4508 return (__m128i
)__builtin_shufflevector((__v2di
)__a
, (__v2di
)__b
, 0, 2 + 0);
4511 /// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
4514 /// \headerfile <x86intrin.h>
4516 /// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
4519 /// A 128-bit integer vector operand. The lower 64 bits are moved to the
4521 /// \returns A 64-bit integer containing the lower 64 bits of the parameter.
4522 static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_movepi64_pi64(__m128i __a
) {
4523 return (__m64
)__a
[0];
4526 /// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
4529 /// \headerfile <x86intrin.h>
4531 /// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
4535 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4536 /// the operand. The upper 64 bits are assigned zeros.
4537 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_movpi64_epi64(__m64 __a
) {
4538 return __extension__(__m128i
)(__v2di
){(long long)__a
, 0};
4541 /// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
4542 /// integer vector, zeroing the upper bits.
4544 /// \headerfile <x86intrin.h>
4546 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
4549 /// A 128-bit integer vector operand. The lower 64 bits are moved to the
4551 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4552 /// the operand. The upper 64 bits are assigned zeros.
4553 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_move_epi64(__m128i __a
) {
4554 return __builtin_shufflevector((__v2di
)__a
, _mm_setzero_si128(), 0, 2);
4557 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4558 /// [2 x double] and interleaves them into a 128-bit vector of [2 x
4561 /// \headerfile <x86intrin.h>
4563 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
4566 /// A 128-bit vector of [2 x double]. \n
4567 /// Bits [127:64] are written to bits [63:0] of the destination.
4569 /// A 128-bit vector of [2 x double]. \n
4570 /// Bits [127:64] are written to bits [127:64] of the destination.
4571 /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4572 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_unpackhi_pd(__m128d __a
,
4574 return __builtin_shufflevector((__v2df
)__a
, (__v2df
)__b
, 1, 2 + 1);
4577 /// Unpacks the low-order 64-bit elements from two 128-bit vectors
4578 /// of [2 x double] and interleaves them into a 128-bit vector of [2 x
4581 /// \headerfile <x86intrin.h>
4583 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
4586 /// A 128-bit vector of [2 x double]. \n
4587 /// Bits [63:0] are written to bits [63:0] of the destination.
4589 /// A 128-bit vector of [2 x double]. \n
4590 /// Bits [63:0] are written to bits [127:64] of the destination.
4591 /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4592 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_unpacklo_pd(__m128d __a
,
4594 return __builtin_shufflevector((__v2df
)__a
, (__v2df
)__b
, 0, 2 + 0);
4597 /// Extracts the sign bits of the double-precision values in the 128-bit
4598 /// vector of [2 x double], zero-extends the value, and writes it to the
4599 /// low-order bits of the destination.
4601 /// \headerfile <x86intrin.h>
4603 /// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
4606 /// A 128-bit vector of [2 x double] containing the values with sign bits to
4608 /// \returns The sign bits from each of the double-precision elements in \a __a,
4609 /// written to bits [1:0]. The remaining bits are assigned values of zero.
4610 static __inline__
int __DEFAULT_FN_ATTRS
_mm_movemask_pd(__m128d __a
) {
4611 return __builtin_ia32_movmskpd((__v2df
)__a
);
/// Constructs a 128-bit floating-point vector of [2 x double] from two
///    128-bit vector parameters of [2 x double], using the immediate-value
///     parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param i
///    An 8-bit immediate value. The least significant two bits specify which
///    elements to copy from \a a and \a b: \n
///    Bit[0] = 0: lower element of \a a copied to lower element of result. \n
///    Bit[0] = 1: upper element of \a a copied to lower element of result. \n
///    Bit[1] = 0: lower element of \a b copied to upper element of result. \n
///    Bit[1] = 1: upper element of \a b copied to upper element of result. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE2 macro.
///    <c>_MM_SHUFFLE2(b1, b0)</c> can create a 2-bit mask of the form
///    <c>[b1, b0]</c>.
/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
#define _mm_shuffle_pd(a, b, i)                                                \
  ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b),  \
                                  (int)(i)))
4645 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4646 /// floating-point vector of [4 x float].
4648 /// \headerfile <x86intrin.h>
4650 /// This intrinsic has no corresponding instruction.
4653 /// A 128-bit floating-point vector of [2 x double].
4654 /// \returns A 128-bit floating-point vector of [4 x float] containing the same
4655 /// bitwise pattern as the parameter.
4656 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_castpd_ps(__m128d __a
) {
4660 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4663 /// \headerfile <x86intrin.h>
4665 /// This intrinsic has no corresponding instruction.
4668 /// A 128-bit floating-point vector of [2 x double].
4669 /// \returns A 128-bit integer vector containing the same bitwise pattern as the
4671 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_castpd_si128(__m128d __a
) {
4672 return (__m128i
)__a
;
4675 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4676 /// floating-point vector of [2 x double].
4678 /// \headerfile <x86intrin.h>
4680 /// This intrinsic has no corresponding instruction.
4683 /// A 128-bit floating-point vector of [4 x float].
4684 /// \returns A 128-bit floating-point vector of [2 x double] containing the same
4685 /// bitwise pattern as the parameter.
4686 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_castps_pd(__m128 __a
) {
4687 return (__m128d
)__a
;
4690 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4693 /// \headerfile <x86intrin.h>
4695 /// This intrinsic has no corresponding instruction.
4698 /// A 128-bit floating-point vector of [4 x float].
4699 /// \returns A 128-bit integer vector containing the same bitwise pattern as the
4701 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_castps_si128(__m128 __a
) {
4702 return (__m128i
)__a
;
4705 /// Casts a 128-bit integer vector into a 128-bit floating-point vector
4708 /// \headerfile <x86intrin.h>
4710 /// This intrinsic has no corresponding instruction.
4713 /// A 128-bit integer vector.
4714 /// \returns A 128-bit floating-point vector of [4 x float] containing the same
4715 /// bitwise pattern as the parameter.
4716 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_castsi128_ps(__m128i __a
) {
4720 /// Casts a 128-bit integer vector into a 128-bit floating-point vector
4721 /// of [2 x double].
4723 /// \headerfile <x86intrin.h>
4725 /// This intrinsic has no corresponding instruction.
4728 /// A 128-bit integer vector.
4729 /// \returns A 128-bit floating-point vector of [2 x double] containing the same
4730 /// bitwise pattern as the parameter.
4731 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_castsi128_pd(__m128i __a
) {
4732 return (__m128d
)__a
;
#if defined(__cplusplus)
extern "C" {
#endif

/// Indicates that a spin loop is being executed for the purposes of
///    optimizing power consumption during the loop.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PAUSE </c> instruction.
///
void _mm_pause(void);

#if defined(__cplusplus)
} // extern "C"
#endif
/* The function-attribute helper macros are private to this header. */
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_MMX

/* Builds the 2-bit selector for _mm_shuffle_pd: x goes to bit 1, y to bit 0. */
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))

/* Values for the bit selected by _MM_DENORMALS_ZERO_MASK in the value
 * returned by _mm_getcsr(). */
#define _MM_DENORMALS_ZERO_ON (0x0040U)
#define _MM_DENORMALS_ZERO_OFF (0x0000U)

#define _MM_DENORMALS_ZERO_MASK (0x0040U)

/* Read the denormals-are-zero bit, or replace it with x while leaving every
 * other bit of the _mm_getcsr() value unchanged. */
#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
#define _MM_SET_DENORMALS_ZERO_MODE(x)                                         \
  (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
4765 #endif /* __EMMINTRIN_H */