1 /*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 *===-----------------------------------------------------------------------===
8 */
10 #ifndef __XMMINTRIN_H
11 #define __XMMINTRIN_H
13 #if !defined(__i386__) && !defined(__x86_64__)
14 #error "This header is only meant to be used on x86 and x64 architecture"
15 #endif
17 #include <mmintrin.h>
19 typedef int __v4si __attribute__((__vector_size__(16)));
20 typedef float __v4sf __attribute__((__vector_size__(16)));
21 typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
23 typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));
25 /* Unsigned types */
26 typedef unsigned int __v4su __attribute__((__vector_size__(16)));
28 /* This header should only be included in a hosted environment as it depends on
29 * a standard library to provide allocation routines. */
30 #if __STDC_HOSTED__
31 #include <mm_malloc.h>
32 #endif
34 /* Define the default attributes for the functions in this file. */
35 #if defined(__EVEX512__) && !defined(__AVX10_1_512__)
36 #define __DEFAULT_FN_ATTRS \
37 __attribute__((__always_inline__, __nodebug__, __target__("sse,no-evex512"), \
38 __min_vector_width__(128)))
39 #define __DEFAULT_FN_ATTRS_SSE2 \
40 __attribute__((__always_inline__, __nodebug__, \
41 __target__("sse2,no-evex512"), __min_vector_width__(128)))
42 #else
43 #define __DEFAULT_FN_ATTRS \
44 __attribute__((__always_inline__, __nodebug__, __target__("sse"), \
45 __min_vector_width__(128)))
46 #define __DEFAULT_FN_ATTRS_SSE2 \
47 __attribute__((__always_inline__, __nodebug__, __target__("sse2"), \
48 __min_vector_width__(128)))
49 #endif
51 #if defined(__cplusplus) && (__cplusplus >= 201103L)
52 #define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
53 #define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2 constexpr
54 #else
55 #define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
56 #define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2
57 #endif
59 #define __trunc64(x) \
60 (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
61 #define __zext128(x) \
62 (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \
63 1, 2, 3)
64 #define __anyext128(x) \
65 (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \
66 1, -1, -1)
67 #define __zeroupper64(x) \
68 (__m128i) __builtin_shufflevector((__v4si)(x), __extension__(__v4si){}, 0, \
69 1, 4, 5)
71 /// Adds the 32-bit float values in the low-order bits of the operands.
72 ///
73 /// \headerfile <x86intrin.h>
74 ///
75 /// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
76 ///
77 /// \param __a
78 /// A 128-bit vector of [4 x float] containing one of the source operands.
79 /// The lower 32 bits of this operand are used in the calculation.
80 /// \param __b
81 /// A 128-bit vector of [4 x float] containing one of the source operands.
82 /// The lower 32 bits of this operand are used in the calculation.
83 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
84 /// of the lower 32 bits of both operands. The upper 96 bits are copied from
85 /// the upper 96 bits of the first source operand.
86 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
87 _mm_add_ss(__m128 __a, __m128 __b) {
88 __a[0] += __b[0];
89 return __a;
92 /// Adds two 128-bit vectors of [4 x float], and returns the results of
93 /// the addition.
94 ///
95 /// \headerfile <x86intrin.h>
96 ///
97 /// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
98 ///
99 /// \param __a
100 /// A 128-bit vector of [4 x float] containing one of the source operands.
101 /// \param __b
102 /// A 128-bit vector of [4 x float] containing one of the source operands.
103 /// \returns A 128-bit vector of [4 x float] containing the sums of both
104 /// operands.
105 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
106 _mm_add_ps(__m128 __a, __m128 __b) {
107 return (__m128)((__v4sf)__a + (__v4sf)__b);
110 /// Subtracts the 32-bit float value in the low-order bits of the second
111 /// operand from the corresponding value in the first operand.
113 /// \headerfile <x86intrin.h>
115 /// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
117 /// \param __a
118 /// A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
119 /// of this operand are used in the calculation.
120 /// \param __b
121 /// A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
122 /// bits of this operand are used in the calculation.
123 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
124 /// difference of the lower 32 bits of both operands. The upper 96 bits are
125 /// copied from the upper 96 bits of the first source operand.
126 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
127 _mm_sub_ss(__m128 __a, __m128 __b) {
128 __a[0] -= __b[0];
129 return __a;
132 /// Subtracts each of the values of the second operand from the first
133 /// operand, both of which are 128-bit vectors of [4 x float] and returns
134 /// the results of the subtraction.
136 /// \headerfile <x86intrin.h>
138 /// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
140 /// \param __a
141 /// A 128-bit vector of [4 x float] containing the minuend.
142 /// \param __b
143 /// A 128-bit vector of [4 x float] containing the subtrahend.
144 /// \returns A 128-bit vector of [4 x float] containing the differences between
145 /// both operands.
146 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
147 _mm_sub_ps(__m128 __a, __m128 __b) {
148 return (__m128)((__v4sf)__a - (__v4sf)__b);
151 /// Multiplies two 32-bit float values in the low-order bits of the
152 /// operands.
154 /// \headerfile <x86intrin.h>
156 /// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
158 /// \param __a
159 /// A 128-bit vector of [4 x float] containing one of the source operands.
160 /// The lower 32 bits of this operand are used in the calculation.
161 /// \param __b
162 /// A 128-bit vector of [4 x float] containing one of the source operands.
163 /// The lower 32 bits of this operand are used in the calculation.
164 /// \returns A 128-bit vector of [4 x float] containing the product of the lower
165 /// 32 bits of both operands. The upper 96 bits are copied from the upper 96
166 /// bits of the first source operand.
167 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
168 _mm_mul_ss(__m128 __a, __m128 __b) {
169 __a[0] *= __b[0];
170 return __a;
173 /// Multiplies two 128-bit vectors of [4 x float] and returns the
174 /// results of the multiplication.
176 /// \headerfile <x86intrin.h>
178 /// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
180 /// \param __a
181 /// A 128-bit vector of [4 x float] containing one of the source operands.
182 /// \param __b
183 /// A 128-bit vector of [4 x float] containing one of the source operands.
184 /// \returns A 128-bit vector of [4 x float] containing the products of both
185 /// operands.
186 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
187 _mm_mul_ps(__m128 __a, __m128 __b) {
188 return (__m128)((__v4sf)__a * (__v4sf)__b);
191 /// Divides the value in the low-order 32 bits of the first operand by
192 /// the corresponding value in the second operand.
194 /// \headerfile <x86intrin.h>
196 /// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
198 /// \param __a
199 /// A 128-bit vector of [4 x float] containing the dividend. The lower 32
200 /// bits of this operand are used in the calculation.
201 /// \param __b
202 /// A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
203 /// of this operand are used in the calculation.
204 /// \returns A 128-bit vector of [4 x float] containing the quotients of the
205 /// lower 32 bits of both operands. The upper 96 bits are copied from the
206 /// upper 96 bits of the first source operand.
207 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
208 _mm_div_ss(__m128 __a, __m128 __b) {
209 __a[0] /= __b[0];
210 return __a;
213 /// Divides two 128-bit vectors of [4 x float].
215 /// \headerfile <x86intrin.h>
217 /// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
219 /// \param __a
220 /// A 128-bit vector of [4 x float] containing the dividend.
221 /// \param __b
222 /// A 128-bit vector of [4 x float] containing the divisor.
223 /// \returns A 128-bit vector of [4 x float] containing the quotients of both
224 /// operands.
225 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
226 _mm_div_ps(__m128 __a, __m128 __b) {
227 return (__m128)((__v4sf)__a / (__v4sf)__b);
230 /// Calculates the square root of the value stored in the low-order bits
231 /// of a 128-bit vector of [4 x float].
233 /// \headerfile <x86intrin.h>
235 /// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
237 /// \param __a
238 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
239 /// used in the calculation.
240 /// \returns A 128-bit vector of [4 x float] containing the square root of the
241 /// value in the low-order bits of the operand.
242 static __inline__ __m128 __DEFAULT_FN_ATTRS
243 _mm_sqrt_ss(__m128 __a)
245 return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
248 /// Calculates the square roots of the values stored in a 128-bit vector
249 /// of [4 x float].
251 /// \headerfile <x86intrin.h>
253 /// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
255 /// \param __a
256 /// A 128-bit vector of [4 x float].
257 /// \returns A 128-bit vector of [4 x float] containing the square roots of the
258 /// values in the operand.
259 static __inline__ __m128 __DEFAULT_FN_ATTRS
260 _mm_sqrt_ps(__m128 __a)
262 return __builtin_ia32_sqrtps((__v4sf)__a);
265 /// Calculates the approximate reciprocal of the value stored in the
266 /// low-order bits of a 128-bit vector of [4 x float].
268 /// \headerfile <x86intrin.h>
270 /// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
272 /// \param __a
273 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
274 /// used in the calculation.
275 /// \returns A 128-bit vector of [4 x float] containing the approximate
276 /// reciprocal of the value in the low-order bits of the operand.
277 static __inline__ __m128 __DEFAULT_FN_ATTRS
278 _mm_rcp_ss(__m128 __a)
280 return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
283 /// Calculates the approximate reciprocals of the values stored in a
284 /// 128-bit vector of [4 x float].
286 /// \headerfile <x86intrin.h>
288 /// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
290 /// \param __a
291 /// A 128-bit vector of [4 x float].
292 /// \returns A 128-bit vector of [4 x float] containing the approximate
293 /// reciprocals of the values in the operand.
294 static __inline__ __m128 __DEFAULT_FN_ATTRS
295 _mm_rcp_ps(__m128 __a)
297 return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
300 /// Calculates the approximate reciprocal of the square root of the value
301 /// stored in the low-order bits of a 128-bit vector of [4 x float].
303 /// \headerfile <x86intrin.h>
305 /// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
307 /// \param __a
308 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
309 /// used in the calculation.
310 /// \returns A 128-bit vector of [4 x float] containing the approximate
311 /// reciprocal of the square root of the value in the low-order bits of the
312 /// operand.
313 static __inline__ __m128 __DEFAULT_FN_ATTRS
314 _mm_rsqrt_ss(__m128 __a)
316 return __builtin_ia32_rsqrtss((__v4sf)__a);
319 /// Calculates the approximate reciprocals of the square roots of the
320 /// values stored in a 128-bit vector of [4 x float].
322 /// \headerfile <x86intrin.h>
324 /// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
326 /// \param __a
327 /// A 128-bit vector of [4 x float].
328 /// \returns A 128-bit vector of [4 x float] containing the approximate
329 /// reciprocals of the square roots of the values in the operand.
330 static __inline__ __m128 __DEFAULT_FN_ATTRS
331 _mm_rsqrt_ps(__m128 __a)
333 return __builtin_ia32_rsqrtps((__v4sf)__a);
336 /// Compares two 32-bit float values in the low-order bits of both
337 /// operands and returns the lesser value in the low-order bits of the
338 /// vector of [4 x float].
340 /// If either value in a comparison is NaN, returns the value from \a __b.
342 /// \headerfile <x86intrin.h>
344 /// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
346 /// \param __a
347 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
348 /// 32 bits of this operand are used in the comparison.
349 /// \param __b
350 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
351 /// 32 bits of this operand are used in the comparison.
352 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
353 /// minimum value between both operands. The upper 96 bits are copied from
354 /// the upper 96 bits of the first source operand.
355 static __inline__ __m128 __DEFAULT_FN_ATTRS
356 _mm_min_ss(__m128 __a, __m128 __b)
358 return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
361 /// Compares two 128-bit vectors of [4 x float] and returns the lesser
362 /// of each pair of values.
364 /// If either value in a comparison is NaN, returns the value from \a __b.
366 /// \headerfile <x86intrin.h>
368 /// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
370 /// \param __a
371 /// A 128-bit vector of [4 x float] containing one of the operands.
372 /// \param __b
373 /// A 128-bit vector of [4 x float] containing one of the operands.
374 /// \returns A 128-bit vector of [4 x float] containing the minimum values
375 /// between both operands.
376 static __inline__ __m128 __DEFAULT_FN_ATTRS
377 _mm_min_ps(__m128 __a, __m128 __b)
379 return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
382 /// Compares two 32-bit float values in the low-order bits of both
383 /// operands and returns the greater value in the low-order bits of a 128-bit
384 /// vector of [4 x float].
386 /// If either value in a comparison is NaN, returns the value from \a __b.
388 /// \headerfile <x86intrin.h>
390 /// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
392 /// \param __a
393 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
394 /// 32 bits of this operand are used in the comparison.
395 /// \param __b
396 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
397 /// 32 bits of this operand are used in the comparison.
398 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
399 /// maximum value between both operands. The upper 96 bits are copied from
400 /// the upper 96 bits of the first source operand.
401 static __inline__ __m128 __DEFAULT_FN_ATTRS
402 _mm_max_ss(__m128 __a, __m128 __b)
404 return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
407 /// Compares two 128-bit vectors of [4 x float] and returns the greater
408 /// of each pair of values.
410 /// If either value in a comparison is NaN, returns the value from \a __b.
412 /// \headerfile <x86intrin.h>
414 /// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
416 /// \param __a
417 /// A 128-bit vector of [4 x float] containing one of the operands.
418 /// \param __b
419 /// A 128-bit vector of [4 x float] containing one of the operands.
420 /// \returns A 128-bit vector of [4 x float] containing the maximum values
421 /// between both operands.
422 static __inline__ __m128 __DEFAULT_FN_ATTRS
423 _mm_max_ps(__m128 __a, __m128 __b)
425 return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
428 /// Performs a bitwise AND of two 128-bit vectors of [4 x float].
430 /// \headerfile <x86intrin.h>
432 /// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
434 /// \param __a
435 /// A 128-bit vector containing one of the source operands.
436 /// \param __b
437 /// A 128-bit vector containing one of the source operands.
438 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
439 /// values between both operands.
440 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
441 _mm_and_ps(__m128 __a, __m128 __b) {
442 return (__m128)((__v4su)__a & (__v4su)__b);
445 /// Performs a bitwise AND of two 128-bit vectors of [4 x float], using
446 /// the one's complement of the values contained in the first source
447 /// operand.
449 /// \headerfile <x86intrin.h>
451 /// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
453 /// \param __a
454 /// A 128-bit vector of [4 x float] containing the first source operand. The
455 /// one's complement of this value is used in the bitwise AND.
456 /// \param __b
457 /// A 128-bit vector of [4 x float] containing the second source operand.
458 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
459 /// one's complement of the first operand and the values in the second
460 /// operand.
461 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
462 _mm_andnot_ps(__m128 __a, __m128 __b) {
463 return (__m128)(~(__v4su)__a & (__v4su)__b);
466 /// Performs a bitwise OR of two 128-bit vectors of [4 x float].
468 /// \headerfile <x86intrin.h>
470 /// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
472 /// \param __a
473 /// A 128-bit vector of [4 x float] containing one of the source operands.
474 /// \param __b
475 /// A 128-bit vector of [4 x float] containing one of the source operands.
476 /// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
477 /// values between both operands.
478 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
479 _mm_or_ps(__m128 __a, __m128 __b) {
480 return (__m128)((__v4su)__a | (__v4su)__b);
483 /// Performs a bitwise exclusive OR of two 128-bit vectors of
484 /// [4 x float].
486 /// \headerfile <x86intrin.h>
488 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
490 /// \param __a
491 /// A 128-bit vector of [4 x float] containing one of the source operands.
492 /// \param __b
493 /// A 128-bit vector of [4 x float] containing one of the source operands.
494 /// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
495 /// of the values between both operands.
496 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
497 _mm_xor_ps(__m128 __a, __m128 __b) {
498 return (__m128)((__v4su)__a ^ (__v4su)__b);
501 /// Compares two 32-bit float values in the low-order bits of both
502 /// operands for equality.
504 /// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
505 /// low-order bits of a vector [4 x float].
506 /// If either value in a comparison is NaN, returns false.
508 /// \headerfile <x86intrin.h>
510 /// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
512 /// \param __a
513 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
514 /// 32 bits of this operand are used in the comparison.
515 /// \param __b
516 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
517 /// 32 bits of this operand are used in the comparison.
518 /// \returns A 128-bit vector of [4 x float] containing the comparison results
519 /// in the low-order bits.
520 static __inline__ __m128 __DEFAULT_FN_ATTRS
521 _mm_cmpeq_ss(__m128 __a, __m128 __b)
523 return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
526 /// Compares each of the corresponding 32-bit float values of the
527 /// 128-bit vectors of [4 x float] for equality.
529 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
530 /// If either value in a comparison is NaN, returns false.
532 /// \headerfile <x86intrin.h>
534 /// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
536 /// \param __a
537 /// A 128-bit vector of [4 x float].
538 /// \param __b
539 /// A 128-bit vector of [4 x float].
540 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
541 static __inline__ __m128 __DEFAULT_FN_ATTRS
542 _mm_cmpeq_ps(__m128 __a, __m128 __b)
544 return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
547 /// Compares two 32-bit float values in the low-order bits of both
548 /// operands to determine if the value in the first operand is less than the
549 /// corresponding value in the second operand.
551 /// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
552 /// low-order bits of a vector of [4 x float].
553 /// If either value in a comparison is NaN, returns false.
555 /// \headerfile <x86intrin.h>
557 /// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
559 /// \param __a
560 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
561 /// 32 bits of this operand are used in the comparison.
562 /// \param __b
563 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
564 /// 32 bits of this operand are used in the comparison.
565 /// \returns A 128-bit vector of [4 x float] containing the comparison results
566 /// in the low-order bits.
567 static __inline__ __m128 __DEFAULT_FN_ATTRS
568 _mm_cmplt_ss(__m128 __a, __m128 __b)
570 return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
573 /// Compares each of the corresponding 32-bit float values of the
574 /// 128-bit vectors of [4 x float] to determine if the values in the first
575 /// operand are less than those in the second operand.
577 /// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
578 /// If either value in a comparison is NaN, returns false.
580 /// \headerfile <x86intrin.h>
582 /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
584 /// \param __a
585 /// A 128-bit vector of [4 x float].
586 /// \param __b
587 /// A 128-bit vector of [4 x float].
588 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
589 static __inline__ __m128 __DEFAULT_FN_ATTRS
590 _mm_cmplt_ps(__m128 __a, __m128 __b)
592 return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
595 /// Compares two 32-bit float values in the low-order bits of both
596 /// operands to determine if the value in the first operand is less than or
597 /// equal to the corresponding value in the second operand.
599 /// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true, in
600 /// the low-order bits of a vector of [4 x float].
601 /// If either value in a comparison is NaN, returns false.
603 /// \headerfile <x86intrin.h>
605 /// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
607 /// \param __a
608 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
609 /// 32 bits of this operand are used in the comparison.
610 /// \param __b
611 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
612 /// 32 bits of this operand are used in the comparison.
613 /// \returns A 128-bit vector of [4 x float] containing the comparison results
614 /// in the low-order bits.
615 static __inline__ __m128 __DEFAULT_FN_ATTRS
616 _mm_cmple_ss(__m128 __a, __m128 __b)
618 return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
621 /// Compares each of the corresponding 32-bit float values of the
622 /// 128-bit vectors of [4 x float] to determine if the values in the first
623 /// operand are less than or equal to those in the second operand.
625 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
626 /// If either value in a comparison is NaN, returns false.
628 /// \headerfile <x86intrin.h>
630 /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
632 /// \param __a
633 /// A 128-bit vector of [4 x float].
634 /// \param __b
635 /// A 128-bit vector of [4 x float].
636 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
637 static __inline__ __m128 __DEFAULT_FN_ATTRS
638 _mm_cmple_ps(__m128 __a, __m128 __b)
640 return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
643 /// Compares two 32-bit float values in the low-order bits of both
644 /// operands to determine if the value in the first operand is greater than
645 /// the corresponding value in the second operand.
647 /// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
648 /// low-order bits of a vector of [4 x float].
649 /// If either value in a comparison is NaN, returns false.
651 /// \headerfile <x86intrin.h>
653 /// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
655 /// \param __a
656 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
657 /// 32 bits of this operand are used in the comparison.
658 /// \param __b
659 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
660 /// 32 bits of this operand are used in the comparison.
661 /// \returns A 128-bit vector of [4 x float] containing the comparison results
662 /// in the low-order bits.
663 static __inline__ __m128 __DEFAULT_FN_ATTRS
664 _mm_cmpgt_ss(__m128 __a, __m128 __b)
666 return (__m128)__builtin_shufflevector((__v4sf)__a,
667 (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
668 4, 1, 2, 3);
671 /// Compares each of the corresponding 32-bit float values of the
672 /// 128-bit vectors of [4 x float] to determine if the values in the first
673 /// operand are greater than those in the second operand.
675 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
676 /// If either value in a comparison is NaN, returns false.
678 /// \headerfile <x86intrin.h>
680 /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
682 /// \param __a
683 /// A 128-bit vector of [4 x float].
684 /// \param __b
685 /// A 128-bit vector of [4 x float].
686 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
687 static __inline__ __m128 __DEFAULT_FN_ATTRS
688 _mm_cmpgt_ps(__m128 __a, __m128 __b)
690 return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
693 /// Compares two 32-bit float values in the low-order bits of both
694 /// operands to determine if the value in the first operand is greater than
695 /// or equal to the corresponding value in the second operand.
697 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
698 /// low-order bits of a vector of [4 x float].
699 /// If either value in a comparison is NaN, returns false.
701 /// \headerfile <x86intrin.h>
703 /// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
705 /// \param __a
706 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
707 /// 32 bits of this operand are used in the comparison.
708 /// \param __b
709 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
710 /// 32 bits of this operand are used in the comparison.
711 /// \returns A 128-bit vector of [4 x float] containing the comparison results
712 /// in the low-order bits.
713 static __inline__ __m128 __DEFAULT_FN_ATTRS
714 _mm_cmpge_ss(__m128 __a, __m128 __b)
716 return (__m128)__builtin_shufflevector((__v4sf)__a,
717 (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
718 4, 1, 2, 3);
721 /// Compares each of the corresponding 32-bit float values of the
722 /// 128-bit vectors of [4 x float] to determine if the values in the first
723 /// operand are greater than or equal to those in the second operand.
725 /// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
726 /// If either value in a comparison is NaN, returns false.
728 /// \headerfile <x86intrin.h>
730 /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
732 /// \param __a
733 /// A 128-bit vector of [4 x float].
734 /// \param __b
735 /// A 128-bit vector of [4 x float].
736 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
737 static __inline__ __m128 __DEFAULT_FN_ATTRS
738 _mm_cmpge_ps(__m128 __a, __m128 __b)
740 return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
743 /// Compares two 32-bit float values in the low-order bits of both operands
744 /// for inequality.
746 /// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
747 /// low-order bits of a vector of [4 x float].
748 /// If either value in a comparison is NaN, returns true.
750 /// \headerfile <x86intrin.h>
752 /// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
753 /// instructions.
755 /// \param __a
756 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
757 /// 32 bits of this operand are used in the comparison.
758 /// \param __b
759 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
760 /// 32 bits of this operand are used in the comparison.
761 /// \returns A 128-bit vector of [4 x float] containing the comparison results
762 /// in the low-order bits.
763 static __inline__ __m128 __DEFAULT_FN_ATTRS
764 _mm_cmpneq_ss(__m128 __a, __m128 __b)
766 return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
769 /// Compares each of the corresponding 32-bit float values of the
770 /// 128-bit vectors of [4 x float] for inequality.
772 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
773 /// If either value in a comparison is NaN, returns true.
775 /// \headerfile <x86intrin.h>
777 /// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
778 /// instructions.
780 /// \param __a
781 /// A 128-bit vector of [4 x float].
782 /// \param __b
783 /// A 128-bit vector of [4 x float].
784 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
785 static __inline__ __m128 __DEFAULT_FN_ATTRS
786 _mm_cmpneq_ps(__m128 __a, __m128 __b)
788 return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
791 /// Compares two 32-bit float values in the low-order bits of both
792 /// operands to determine if the value in the first operand is not less than
793 /// the corresponding value in the second operand.
795 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
796 /// low-order bits of a vector of [4 x float].
797 /// If either value in a comparison is NaN, returns true.
799 /// \headerfile <x86intrin.h>
801 /// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
802 /// instructions.
804 /// \param __a
805 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
806 /// 32 bits of this operand are used in the comparison.
807 /// \param __b
808 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
809 /// 32 bits of this operand are used in the comparison.
810 /// \returns A 128-bit vector of [4 x float] containing the comparison results
811 /// in the low-order bits.
812 static __inline__ __m128 __DEFAULT_FN_ATTRS
813 _mm_cmpnlt_ss(__m128 __a, __m128 __b)
815 return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
818 /// Compares each of the corresponding 32-bit float values of the
819 /// 128-bit vectors of [4 x float] to determine if the values in the first
820 /// operand are not less than those in the second operand.
822 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
823 /// If either value in a comparison is NaN, returns true.
825 /// \headerfile <x86intrin.h>
827 /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
828 /// instructions.
830 /// \param __a
831 /// A 128-bit vector of [4 x float].
832 /// \param __b
833 /// A 128-bit vector of [4 x float].
834 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
835 static __inline__ __m128 __DEFAULT_FN_ATTRS
836 _mm_cmpnlt_ps(__m128 __a, __m128 __b)
838 return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
841 /// Compares two 32-bit float values in the low-order bits of both
842 /// operands to determine if the value in the first operand is not less than
843 /// or equal to the corresponding value in the second operand.
845 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
846 /// low-order bits of a vector of [4 x float].
847 /// If either value in a comparison is NaN, returns true.
849 /// \headerfile <x86intrin.h>
851 /// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
852 /// instructions.
854 /// \param __a
855 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
856 /// 32 bits of this operand are used in the comparison.
857 /// \param __b
858 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
859 /// 32 bits of this operand are used in the comparison.
860 /// \returns A 128-bit vector of [4 x float] containing the comparison results
861 /// in the low-order bits.
862 static __inline__ __m128 __DEFAULT_FN_ATTRS
863 _mm_cmpnle_ss(__m128 __a, __m128 __b)
865 return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
868 /// Compares each of the corresponding 32-bit float values of the
869 /// 128-bit vectors of [4 x float] to determine if the values in the first
870 /// operand are not less than or equal to those in the second operand.
872 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
873 /// If either value in a comparison is NaN, returns true.
875 /// \headerfile <x86intrin.h>
877 /// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
878 /// instructions.
880 /// \param __a
881 /// A 128-bit vector of [4 x float].
882 /// \param __b
883 /// A 128-bit vector of [4 x float].
884 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
885 static __inline__ __m128 __DEFAULT_FN_ATTRS
886 _mm_cmpnle_ps(__m128 __a, __m128 __b)
888 return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
891 /// Compares two 32-bit float values in the low-order bits of both
892 /// operands to determine if the value in the first operand is not greater
893 /// than the corresponding value in the second operand.
895 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
896 /// low-order bits of a vector of [4 x float].
897 /// If either value in a comparison is NaN, returns true.
899 /// \headerfile <x86intrin.h>
901 /// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
902 /// instructions.
904 /// \param __a
905 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
906 /// 32 bits of this operand are used in the comparison.
907 /// \param __b
908 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
909 /// 32 bits of this operand are used in the comparison.
910 /// \returns A 128-bit vector of [4 x float] containing the comparison results
911 /// in the low-order bits.
912 static __inline__ __m128 __DEFAULT_FN_ATTRS
913 _mm_cmpngt_ss(__m128 __a, __m128 __b)
915 return (__m128)__builtin_shufflevector((__v4sf)__a,
916 (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
917 4, 1, 2, 3);
920 /// Compares each of the corresponding 32-bit float values of the
921 /// 128-bit vectors of [4 x float] to determine if the values in the first
922 /// operand are not greater than those in the second operand.
924 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
925 /// If either value in a comparison is NaN, returns true.
927 /// \headerfile <x86intrin.h>
929 /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
930 /// instructions.
932 /// \param __a
933 /// A 128-bit vector of [4 x float].
934 /// \param __b
935 /// A 128-bit vector of [4 x float].
936 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
937 static __inline__ __m128 __DEFAULT_FN_ATTRS
938 _mm_cmpngt_ps(__m128 __a, __m128 __b)
940 return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
943 /// Compares two 32-bit float values in the low-order bits of both
944 /// operands to determine if the value in the first operand is not greater
945 /// than or equal to the corresponding value in the second operand.
947 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
948 /// low-order bits of a vector of [4 x float].
949 /// If either value in a comparison is NaN, returns true.
951 /// \headerfile <x86intrin.h>
953 /// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
954 /// instructions.
956 /// \param __a
957 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
958 /// 32 bits of this operand are used in the comparison.
959 /// \param __b
960 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
961 /// 32 bits of this operand are used in the comparison.
962 /// \returns A 128-bit vector of [4 x float] containing the comparison results
963 /// in the low-order bits.
964 static __inline__ __m128 __DEFAULT_FN_ATTRS
965 _mm_cmpnge_ss(__m128 __a, __m128 __b)
967 return (__m128)__builtin_shufflevector((__v4sf)__a,
968 (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
969 4, 1, 2, 3);
972 /// Compares each of the corresponding 32-bit float values of the
973 /// 128-bit vectors of [4 x float] to determine if the values in the first
974 /// operand are not greater than or equal to those in the second operand.
976 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
977 /// If either value in a comparison is NaN, returns true.
979 /// \headerfile <x86intrin.h>
981 /// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
982 /// instructions.
984 /// \param __a
985 /// A 128-bit vector of [4 x float].
986 /// \param __b
987 /// A 128-bit vector of [4 x float].
988 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
989 static __inline__ __m128 __DEFAULT_FN_ATTRS
990 _mm_cmpnge_ps(__m128 __a, __m128 __b)
992 return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
995 /// Compares two 32-bit float values in the low-order bits of both
996 /// operands to determine if the value in the first operand is ordered with
997 /// respect to the corresponding value in the second operand.
999 /// A pair of floating-point values are ordered with respect to each
1000 /// other if neither value is a NaN. Each comparison returns 0x0 for false,
1001 /// 0xFFFFFFFF for true.
1003 /// \headerfile <x86intrin.h>
1005 /// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
1006 /// instructions.
1008 /// \param __a
1009 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
1010 /// 32 bits of this operand are used in the comparison.
1011 /// \param __b
1012 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
1013 /// 32 bits of this operand are used in the comparison.
1014 /// \returns A 128-bit vector of [4 x float] containing the comparison results
1015 /// in the low-order bits.
1016 static __inline__ __m128 __DEFAULT_FN_ATTRS
1017 _mm_cmpord_ss(__m128 __a, __m128 __b)
1019 return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
1022 /// Compares each of the corresponding 32-bit float values of the
1023 /// 128-bit vectors of [4 x float] to determine if the values in the first
1024 /// operand are ordered with respect to those in the second operand.
1026 /// A pair of floating-point values are ordered with respect to each
1027 /// other if neither value is a NaN. Each comparison returns 0x0 for false,
1028 /// 0xFFFFFFFF for true.
1030 /// \headerfile <x86intrin.h>
1032 /// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
1033 /// instructions.
1035 /// \param __a
1036 /// A 128-bit vector of [4 x float].
1037 /// \param __b
1038 /// A 128-bit vector of [4 x float].
1039 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
1040 static __inline__ __m128 __DEFAULT_FN_ATTRS
1041 _mm_cmpord_ps(__m128 __a, __m128 __b)
1043 return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
1046 /// Compares two 32-bit float values in the low-order bits of both
1047 /// operands to determine if the value in the first operand is unordered
1048 /// with respect to the corresponding value in the second operand.
1050 /// A pair of double-precision values are unordered with respect to each
1051 /// other if one or both values are NaN. Each comparison returns 0x0 for
1052 /// false, 0xFFFFFFFF for true.
1054 /// \headerfile <x86intrin.h>
1056 /// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
1057 /// instructions.
1059 /// \param __a
1060 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
1061 /// 32 bits of this operand are used in the comparison.
1062 /// \param __b
1063 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
1064 /// 32 bits of this operand are used in the comparison.
1065 /// \returns A 128-bit vector of [4 x float] containing the comparison results
1066 /// in the low-order bits.
1067 static __inline__ __m128 __DEFAULT_FN_ATTRS
1068 _mm_cmpunord_ss(__m128 __a, __m128 __b)
1070 return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
1073 /// Compares each of the corresponding 32-bit float values of the
1074 /// 128-bit vectors of [4 x float] to determine if the values in the first
1075 /// operand are unordered with respect to those in the second operand.
1077 /// A pair of double-precision values are unordered with respect to each
1078 /// other if one or both values are NaN. Each comparison returns 0x0 for
1079 /// false, 0xFFFFFFFFFFFFFFFF for true.
1081 /// \headerfile <x86intrin.h>
1083 /// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
1084 /// instructions.
1086 /// \param __a
1087 /// A 128-bit vector of [4 x float].
1088 /// \param __b
1089 /// A 128-bit vector of [4 x float].
1090 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
1091 static __inline__ __m128 __DEFAULT_FN_ATTRS
1092 _mm_cmpunord_ps(__m128 __a, __m128 __b)
1094 return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
1097 /// Compares two 32-bit float values in the low-order bits of both
1098 /// operands for equality.
1100 /// The comparison returns 0 for false, 1 for true. If either value in a
1101 /// comparison is NaN, returns 0.
1103 /// \headerfile <x86intrin.h>
1105 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1106 /// instructions.
1108 /// \param __a
1109 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1110 /// used in the comparison.
1111 /// \param __b
1112 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1113 /// used in the comparison.
1114 /// \returns An integer containing the comparison results.
1115 static __inline__ int __DEFAULT_FN_ATTRS
1116 _mm_comieq_ss(__m128 __a, __m128 __b)
1118 return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
1121 /// Compares two 32-bit float values in the low-order bits of both
1122 /// operands to determine if the first operand is less than the second
1123 /// operand.
1125 /// The comparison returns 0 for false, 1 for true. If either value in a
1126 /// comparison is NaN, returns 0.
1128 /// \headerfile <x86intrin.h>
1130 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1131 /// instructions.
1133 /// \param __a
1134 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1135 /// used in the comparison.
1136 /// \param __b
1137 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1138 /// used in the comparison.
1139 /// \returns An integer containing the comparison results.
1140 static __inline__ int __DEFAULT_FN_ATTRS
1141 _mm_comilt_ss(__m128 __a, __m128 __b)
1143 return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
1146 /// Compares two 32-bit float values in the low-order bits of both
1147 /// operands to determine if the first operand is less than or equal to the
1148 /// second operand.
1150 /// The comparison returns 0 for false, 1 for true. If either value in a
1151 /// comparison is NaN, returns 0.
1153 /// \headerfile <x86intrin.h>
1155 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1157 /// \param __a
1158 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1159 /// used in the comparison.
1160 /// \param __b
1161 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1162 /// used in the comparison.
1163 /// \returns An integer containing the comparison results.
1164 static __inline__ int __DEFAULT_FN_ATTRS
1165 _mm_comile_ss(__m128 __a, __m128 __b)
1167 return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
1170 /// Compares two 32-bit float values in the low-order bits of both
1171 /// operands to determine if the first operand is greater than the second
1172 /// operand.
1174 /// The comparison returns 0 for false, 1 for true. If either value in a
1175 /// comparison is NaN, returns 0.
1177 /// \headerfile <x86intrin.h>
1179 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1181 /// \param __a
1182 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1183 /// used in the comparison.
1184 /// \param __b
1185 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1186 /// used in the comparison.
1187 /// \returns An integer containing the comparison results.
1188 static __inline__ int __DEFAULT_FN_ATTRS
1189 _mm_comigt_ss(__m128 __a, __m128 __b)
1191 return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
1194 /// Compares two 32-bit float values in the low-order bits of both
1195 /// operands to determine if the first operand is greater than or equal to
1196 /// the second operand.
1198 /// The comparison returns 0 for false, 1 for true. If either value in a
1199 /// comparison is NaN, returns 0.
1201 /// \headerfile <x86intrin.h>
1203 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1205 /// \param __a
1206 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1207 /// used in the comparison.
1208 /// \param __b
1209 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1210 /// used in the comparison.
1211 /// \returns An integer containing the comparison results.
1212 static __inline__ int __DEFAULT_FN_ATTRS
1213 _mm_comige_ss(__m128 __a, __m128 __b)
1215 return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
1218 /// Compares two 32-bit float values in the low-order bits of both
1219 /// operands to determine if the first operand is not equal to the second
1220 /// operand.
1222 /// The comparison returns 0 for false, 1 for true. If either value in a
1223 /// comparison is NaN, returns 1.
1225 /// \headerfile <x86intrin.h>
1227 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1229 /// \param __a
1230 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1231 /// used in the comparison.
1232 /// \param __b
1233 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1234 /// used in the comparison.
1235 /// \returns An integer containing the comparison results.
1236 static __inline__ int __DEFAULT_FN_ATTRS
1237 _mm_comineq_ss(__m128 __a, __m128 __b)
1239 return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
1242 /// Performs an unordered comparison of two 32-bit float values using
1243 /// the low-order bits of both operands to determine equality.
1245 /// The comparison returns 0 for false, 1 for true. If either value in a
1246 /// comparison is NaN, returns 0.
1248 /// \headerfile <x86intrin.h>
1250 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1252 /// \param __a
1253 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1254 /// used in the comparison.
1255 /// \param __b
1256 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1257 /// used in the comparison.
1258 /// \returns An integer containing the comparison results.
1259 static __inline__ int __DEFAULT_FN_ATTRS
1260 _mm_ucomieq_ss(__m128 __a, __m128 __b)
1262 return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
1265 /// Performs an unordered comparison of two 32-bit float values using
1266 /// the low-order bits of both operands to determine if the first operand is
1267 /// less than the second operand.
1269 /// The comparison returns 0 for false, 1 for true. If either value in a
1270 /// comparison is NaN, returns 0.
1272 /// \headerfile <x86intrin.h>
1274 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1276 /// \param __a
1277 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1278 /// used in the comparison.
1279 /// \param __b
1280 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1281 /// used in the comparison.
1282 /// \returns An integer containing the comparison results.
1283 static __inline__ int __DEFAULT_FN_ATTRS
1284 _mm_ucomilt_ss(__m128 __a, __m128 __b)
1286 return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
1289 /// Performs an unordered comparison of two 32-bit float values using
1290 /// the low-order bits of both operands to determine if the first operand is
1291 /// less than or equal to the second operand.
1293 /// The comparison returns 0 for false, 1 for true. If either value in a
1294 /// comparison is NaN, returns 0.
1296 /// \headerfile <x86intrin.h>
1298 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1300 /// \param __a
1301 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1302 /// used in the comparison.
1303 /// \param __b
1304 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1305 /// used in the comparison.
1306 /// \returns An integer containing the comparison results.
1307 static __inline__ int __DEFAULT_FN_ATTRS
1308 _mm_ucomile_ss(__m128 __a, __m128 __b)
1310 return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
1313 /// Performs an unordered comparison of two 32-bit float values using
1314 /// the low-order bits of both operands to determine if the first operand is
1315 /// greater than the second operand.
1317 /// The comparison returns 0 for false, 1 for true. If either value in a
1318 /// comparison is NaN, returns 0.
1320 /// \headerfile <x86intrin.h>
1322 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1324 /// \param __a
1325 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1326 /// used in the comparison.
1327 /// \param __b
1328 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1329 /// used in the comparison.
1330 /// \returns An integer containing the comparison results.
1331 static __inline__ int __DEFAULT_FN_ATTRS
1332 _mm_ucomigt_ss(__m128 __a, __m128 __b)
1334 return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
1337 /// Performs an unordered comparison of two 32-bit float values using
1338 /// the low-order bits of both operands to determine if the first operand is
1339 /// greater than or equal to the second operand.
1341 /// The comparison returns 0 for false, 1 for true. If either value in a
1342 /// comparison is NaN, returns 0.
1344 /// \headerfile <x86intrin.h>
1346 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1348 /// \param __a
1349 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1350 /// used in the comparison.
1351 /// \param __b
1352 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1353 /// used in the comparison.
1354 /// \returns An integer containing the comparison results.
1355 static __inline__ int __DEFAULT_FN_ATTRS
1356 _mm_ucomige_ss(__m128 __a, __m128 __b)
1358 return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
1361 /// Performs an unordered comparison of two 32-bit float values using
1362 /// the low-order bits of both operands to determine inequality.
1364 /// The comparison returns 0 for false, 1 for true. If either value in a
1365 /// comparison is NaN, returns 0.
1367 /// \headerfile <x86intrin.h>
1369 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1371 /// \param __a
1372 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1373 /// used in the comparison.
1374 /// \param __b
1375 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1376 /// used in the comparison.
1377 /// \returns An integer containing the comparison results.
1378 static __inline__ int __DEFAULT_FN_ATTRS
1379 _mm_ucomineq_ss(__m128 __a, __m128 __b)
1381 return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
1384 /// Converts a float value contained in the lower 32 bits of a vector of
1385 /// [4 x float] into a 32-bit integer.
1387 /// If the converted value does not fit in a 32-bit integer, raises a
1388 /// floating-point invalid exception. If the exception is masked, returns
1389 /// the most negative integer.
1391 /// \headerfile <x86intrin.h>
1393 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1394 /// instructions.
1396 /// \param __a
1397 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1398 /// used in the conversion.
1399 /// \returns A 32-bit integer containing the converted value.
1400 static __inline__ int __DEFAULT_FN_ATTRS
1401 _mm_cvtss_si32(__m128 __a)
1403 return __builtin_ia32_cvtss2si((__v4sf)__a);
1406 /// Converts a float value contained in the lower 32 bits of a vector of
1407 /// [4 x float] into a 32-bit integer.
1409 /// If the converted value does not fit in a 32-bit integer, raises a
1410 /// floating-point invalid exception. If the exception is masked, returns
1411 /// the most negative integer.
1413 /// \headerfile <x86intrin.h>
1415 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1416 /// instructions.
1418 /// \param __a
1419 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1420 /// used in the conversion.
1421 /// \returns A 32-bit integer containing the converted value.
1422 static __inline__ int __DEFAULT_FN_ATTRS
1423 _mm_cvt_ss2si(__m128 __a)
1425 return _mm_cvtss_si32(__a);
1428 #ifdef __x86_64__
1430 /// Converts a float value contained in the lower 32 bits of a vector of
1431 /// [4 x float] into a 64-bit integer.
1433 /// If the converted value does not fit in a 32-bit integer, raises a
1434 /// floating-point invalid exception. If the exception is masked, returns
1435 /// the most negative integer.
1437 /// \headerfile <x86intrin.h>
1439 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1440 /// instructions.
1442 /// \param __a
1443 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1444 /// used in the conversion.
1445 /// \returns A 64-bit integer containing the converted value.
1446 static __inline__ long long __DEFAULT_FN_ATTRS
1447 _mm_cvtss_si64(__m128 __a)
1449 return __builtin_ia32_cvtss2si64((__v4sf)__a);
1452 #endif
1454 /// Converts two low-order float values in a 128-bit vector of
1455 /// [4 x float] into a 64-bit vector of [2 x i32].
1457 /// If a converted value does not fit in a 32-bit integer, raises a
1458 /// floating-point invalid exception. If the exception is masked, returns
1459 /// the most negative integer.
1461 /// \headerfile <x86intrin.h>
1463 /// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1465 /// \param __a
1466 /// A 128-bit vector of [4 x float].
1467 /// \returns A 64-bit integer vector containing the converted values.
1468 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
1469 _mm_cvtps_pi32(__m128 __a)
1471 return __trunc64(__builtin_ia32_cvtps2dq((__v4sf)__zeroupper64(__a)));
1474 /// Converts two low-order float values in a 128-bit vector of
1475 /// [4 x float] into a 64-bit vector of [2 x i32].
1477 /// If a converted value does not fit in a 32-bit integer, raises a
1478 /// floating-point invalid exception. If the exception is masked, returns
1479 /// the most negative integer.
1481 /// \headerfile <x86intrin.h>
1483 /// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1485 /// \param __a
1486 /// A 128-bit vector of [4 x float].
1487 /// \returns A 64-bit integer vector containing the converted values.
1488 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
1489 _mm_cvt_ps2pi(__m128 __a)
1491 return _mm_cvtps_pi32(__a);
1494 /// Converts the lower (first) element of a vector of [4 x float] into a signed
1495 /// truncated (rounded toward zero) 32-bit integer.
1497 /// If the converted value does not fit in a 32-bit integer, raises a
1498 /// floating-point invalid exception. If the exception is masked, returns
1499 /// the most negative integer.
1501 /// \headerfile <x86intrin.h>
1503 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1504 /// instructions.
1506 /// \param __a
1507 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1508 /// used in the conversion.
1509 /// \returns A 32-bit integer containing the converted value.
1510 static __inline__ int __DEFAULT_FN_ATTRS
1511 _mm_cvttss_si32(__m128 __a)
1513 return __builtin_ia32_cvttss2si((__v4sf)__a);
1516 /// Converts the lower (first) element of a vector of [4 x float] into a signed
1517 /// truncated (rounded toward zero) 32-bit integer.
1519 /// If the converted value does not fit in a 32-bit integer, raises a
1520 /// floating-point invalid exception. If the exception is masked, returns
1521 /// the most negative integer.
1523 /// \headerfile <x86intrin.h>
1525 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1526 /// instructions.
1528 /// \param __a
1529 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1530 /// used in the conversion.
1531 /// \returns A 32-bit integer containing the converted value.
1532 static __inline__ int __DEFAULT_FN_ATTRS
1533 _mm_cvtt_ss2si(__m128 __a)
1535 return _mm_cvttss_si32(__a);
1538 #ifdef __x86_64__
1539 /// Converts the lower (first) element of a vector of [4 x float] into a signed
1540 /// truncated (rounded toward zero) 64-bit integer.
1542 /// If the converted value does not fit in a 64-bit integer, raises a
1543 /// floating-point invalid exception. If the exception is masked, returns
1544 /// the most negative integer.
1546 /// \headerfile <x86intrin.h>
1548 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1549 /// instructions.
1551 /// \param __a
1552 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1553 /// used in the conversion.
1554 /// \returns A 64-bit integer containing the converted value.
1555 static __inline__ long long __DEFAULT_FN_ATTRS
1556 _mm_cvttss_si64(__m128 __a)
1558 return __builtin_ia32_cvttss2si64((__v4sf)__a);
1560 #endif
1562 /// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
1563 /// into two signed truncated (rounded toward zero) 32-bit integers,
1564 /// returned in a 64-bit vector of [2 x i32].
1566 /// If a converted value does not fit in a 32-bit integer, raises a
1567 /// floating-point invalid exception. If the exception is masked, returns
1568 /// the most negative integer.
1570 /// \headerfile <x86intrin.h>
1572 /// This intrinsic corresponds to the <c> CVTTPS2PI / VTTPS2PI </c>
1573 /// instructions.
1575 /// \param __a
1576 /// A 128-bit vector of [4 x float].
1577 /// \returns A 64-bit integer vector containing the converted values.
1578 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
1579 _mm_cvttps_pi32(__m128 __a)
1581 return __trunc64(__builtin_ia32_cvttps2dq((__v4sf)__zeroupper64(__a)));
1584 /// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
1585 /// into two signed truncated (rounded toward zero) 64-bit integers,
1586 /// returned in a 64-bit vector of [2 x i32].
1588 /// If a converted value does not fit in a 32-bit integer, raises a
1589 /// floating-point invalid exception. If the exception is masked, returns
1590 /// the most negative integer.
1592 /// \headerfile <x86intrin.h>
1594 /// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
1596 /// \param __a
1597 /// A 128-bit vector of [4 x float].
1598 /// \returns A 64-bit integer vector containing the converted values.
1599 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
1600 _mm_cvtt_ps2pi(__m128 __a)
1602 return _mm_cvttps_pi32(__a);
1605 /// Converts a 32-bit signed integer value into a floating point value
1606 /// and writes it to the lower 32 bits of the destination. The remaining
1607 /// higher order elements of the destination vector are copied from the
1608 /// corresponding elements in the first operand.
1610 /// \headerfile <x86intrin.h>
1612 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1614 /// \param __a
1615 /// A 128-bit vector of [4 x float].
1616 /// \param __b
1617 /// A 32-bit signed integer operand containing the value to be converted.
1618 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1619 /// converted value of the second operand. The upper 96 bits are copied from
1620 /// the upper 96 bits of the first operand.
1621 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtsi32_ss(__m128 __a,
1622 int __b) {
1623 __a[0] = __b;
1624 return __a;
1627 /// Converts a 32-bit signed integer value into a floating point value
1628 /// and writes it to the lower 32 bits of the destination. The remaining
1629 /// higher order elements of the destination are copied from the
1630 /// corresponding elements in the first operand.
1632 /// \headerfile <x86intrin.h>
1634 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1636 /// \param __a
1637 /// A 128-bit vector of [4 x float].
1638 /// \param __b
1639 /// A 32-bit signed integer operand containing the value to be converted.
1640 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1641 /// converted value of the second operand. The upper 96 bits are copied from
1642 /// the upper 96 bits of the first operand.
1643 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvt_si2ss(__m128 __a,
1644 int __b) {
1645 return _mm_cvtsi32_ss(__a, __b);
1648 #ifdef __x86_64__
1650 /// Converts a 64-bit signed integer value into a floating point value
1651 /// and writes it to the lower 32 bits of the destination. The remaining
1652 /// higher order elements of the destination are copied from the
1653 /// corresponding elements in the first operand.
1655 /// \headerfile <x86intrin.h>
1657 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1659 /// \param __a
1660 /// A 128-bit vector of [4 x float].
1661 /// \param __b
1662 /// A 64-bit signed integer operand containing the value to be converted.
1663 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1664 /// converted value of the second operand. The upper 96 bits are copied from
1665 /// the upper 96 bits of the first operand.
1666 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
1667 _mm_cvtsi64_ss(__m128 __a, long long __b) {
1668 __a[0] = __b;
1669 return __a;
1672 #endif
/// Converts two elements of a 64-bit vector of [2 x i32] into two
///    floating point values and writes them to the lower 64-bits of the
///    destination. The remaining higher order elements of the destination are
///    copied from the corresponding elements in the first operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \param __b
///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
///    and written to the corresponding low-order elements in the destination.
/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
///    converted value of the second operand. The upper 64 bits are copied from
///    the upper 64 bits of the first operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtpi32_ps(__m128 __a, __m64 __b)
{
  // Widen __b to 128 bits, convert its i32 lanes to float, then shuffle:
  // indices 4,5 select the two converted lanes (second operand), 2,3 keep
  // the high two lanes of __a.
  return (__m128)__builtin_shufflevector(
      (__v4sf)__a,
      __builtin_convertvector((__v4si)__zext128(__b), __v4sf),
      4, 5, 2, 3);
}
/// Converts two elements of a 64-bit vector of [2 x i32] into two
///    floating point values and writes them to the lower 64-bits of the
///    destination. The remaining higher order elements of the destination are
///    copied from the corresponding elements in the first operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \param __b
///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
///    and written to the corresponding low-order elements in the destination.
/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
///    converted value from the second operand. The upper 64 bits are copied
///    from the upper 64 bits of the first operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvt_pi2ps(__m128 __a, __m64 __b)
{
  // Legacy alias for _mm_cvtpi32_ps.
  return _mm_cvtpi32_ps(__a, __b);
}
/// Extracts a float value contained in the lower 32 bits of a vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the extraction.
/// \returns A 32-bit float containing the extracted value.
static __inline__ float __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_cvtss_f32(__m128 __a) {
  // Vector subscripting reads element 0 (the low 32 bits) directly.
  return __a[0];
}
/// Loads two packed float values from the address \a __p into the
///    high-order bits of a 128-bit vector of [4 x float]. The low-order bits
///    are copied from the low-order bits of the first operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
///    of the destination.
/// \param __p
///    A pointer to two packed float values. Bits [63:0] are written to bits
///    [127:64] of the destination.
/// \returns A 128-bit vector of [4 x float] containing the moved values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_loadh_pi(__m128 __a, const __m64 *__p)
{
  // Load through a packed, may_alias struct so the 64-bit read is safe for
  // any alignment and any actual pointee type behind __p.
  typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
  struct __mm_loadh_pi_struct {
    __mm_loadh_pi_v2f32 __u;
  } __attribute__((__packed__, __may_alias__));
  __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u;
  // Widen the pair to 128 bits, then splice it into the high half of __a.
  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
  return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
}
/// Loads two packed float values from the address \a __p into the
///    low-order bits of a 128-bit vector of [4 x float]. The high-order bits
///    are copied from the high-order bits of the first operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
///    [127:64] of the destination.
/// \param __p
///    A pointer to two packed float values. Bits [63:0] are written to bits
///    [63:0] of the destination.
/// \returns A 128-bit vector of [4 x float] containing the moved values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_loadl_pi(__m128 __a, const __m64 *__p)
{
  // Load through a packed, may_alias struct so the 64-bit read is safe for
  // any alignment and any actual pointee type behind __p.
  typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
  struct __mm_loadl_pi_struct {
    __mm_loadl_pi_v2f32 __u;
  } __attribute__((__packed__, __may_alias__));
  __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u;
  // Widen the pair to 128 bits, then splice it into the low half of __a.
  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
  return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
}
/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
///    32 bits of the vector are initialized with the single-precision
///    floating-point value loaded from a specified memory location. The upper
///    96 bits are set to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
///
/// \param __p
///    A pointer to a 32-bit memory location containing a single-precision
///    floating-point value.
/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
///    lower 32 bits contain the value loaded from the memory location. The
///    upper 96 bits are set to zero.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_load_ss(const float *__p)
{
  // Read through a packed, may_alias struct: alignment- and alias-safe.
  struct __mm_load_ss_struct {
    float __u;
  } __attribute__((__packed__, __may_alias__));
  float __u = ((const struct __mm_load_ss_struct*)__p)->__u;
  return __extension__ (__m128){ __u, 0, 0, 0 };
}
/// Loads a 32-bit float value and duplicates it to all four vector
///    elements of a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c>
///    instruction.
///
/// \param __p
///    A pointer to a float value to be loaded and duplicated.
/// \returns A 128-bit vector of [4 x float] containing the loaded and
///    duplicated values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_load1_ps(const float *__p)
{
  // Read through a packed, may_alias struct: alignment- and alias-safe.
  struct __mm_load1_ps_struct {
    float __u;
  } __attribute__((__packed__, __may_alias__));
  float __u = ((const struct __mm_load1_ps_struct*)__p)->__u;
  return __extension__ (__m128){ __u, __u, __u, __u };
}

/* Alternate (legacy) spelling of _mm_load1_ps. */
#define _mm_load_ps1(p) _mm_load1_ps(p)
/// Loads a 128-bit floating-point vector of [4 x float] from an aligned
///    memory location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
///
/// \param __p
///    A pointer to a 128-bit memory location. The address of the memory
///    location has to be 128-bit aligned.
/// \returns A 128-bit vector of [4 x float] containing the loaded values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_load_ps(const float *__p)
{
  // Direct dereference: __m128 carries 16-byte alignment, so this is the
  // aligned-load form.
  return *(const __m128*)__p;
}
/// Loads a 128-bit floating-point vector of [4 x float] from an
///    unaligned memory location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
///
/// \param __p
///    A pointer to a 128-bit memory location. The address of the memory
///    location does not have to be aligned.
/// \returns A 128-bit vector of [4 x float] containing the loaded values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_loadu_ps(const float *__p)
{
  // __m128_u is the 1-byte-aligned variant; the packed, may_alias wrapper
  // makes the unaligned load well-defined.
  struct __loadu_ps {
    __m128_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ps*)__p)->__v;
}
/// Loads four packed float values, in reverse order, from an aligned
///    memory location to 32-bit elements in a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
///    instruction.
///
/// \param __p
///    A pointer to a 128-bit memory location. The address of the memory
///    location has to be 128-bit aligned.
/// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
///    in reverse order.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_loadr_ps(const float *__p)
{
  // Aligned load followed by a full lane reversal.
  __m128 __a = _mm_load_ps(__p);
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
}
/// Create a 128-bit vector of [4 x float] with undefined values.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \returns A 128-bit vector of [4 x float] containing undefined values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_undefined_ps(void)
{
  // The undef builtin materializes an uninitialized value without reading
  // any actual memory.
  return (__m128)__builtin_ia32_undef128();
}
/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
///    32 bits of the vector are initialized with the specified single-precision
///    floating-point value. The upper 96 bits are set to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
///
/// \param __w
///    A single-precision floating-point value used to initialize the lower 32
///    bits of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
///    lower 32 bits contain the value provided in the source operand. The
///    upper 96 bits are set to zero.
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set_ss(float __w) {
  return __extension__ (__m128){ __w, 0.0f, 0.0f, 0.0f };
}
/// Constructs a 128-bit floating-point vector of [4 x float], with each
///    of the four single-precision floating-point vector elements set to the
///    specified single-precision floating-point value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
///
/// \param __w
///    A single-precision floating-point value used to initialize each vector
///    element of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set1_ps(float __w) {
  return __extension__ (__m128){ __w, __w, __w, __w };
}
/* Microsoft specific. */
/// Constructs a 128-bit floating-point vector of [4 x float], with each
///    of the four single-precision floating-point vector elements set to the
///    specified single-precision floating-point value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
///
/// \param __w
///    A single-precision floating-point value used to initialize each vector
///    element of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set_ps1(float __w) {
  // Alias for _mm_set1_ps kept for Microsoft compatibility.
  return _mm_set1_ps(__w);
}
/// Constructs a 128-bit floating-point vector of [4 x float]
///    initialized with the specified single-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic is a utility function and does not correspond to a specific
///    instruction.
///
/// \param __z
///    A single-precision floating-point value used to initialize bits [127:96]
///    of the result.
/// \param __y
///    A single-precision floating-point value used to initialize bits [95:64]
///    of the result.
/// \param __x
///    A single-precision floating-point value used to initialize bits [63:32]
///    of the result.
/// \param __w
///    A single-precision floating-point value used to initialize bits [31:0]
///    of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set_ps(float __z, float __y, float __x, float __w) {
  // Note the argument order: __w lands in element 0 (bits [31:0]).
  return __extension__ (__m128){ __w, __x, __y, __z };
}
/// Constructs a 128-bit floating-point vector of [4 x float],
///    initialized in reverse order with the specified 32-bit single-precision
///    float-point values.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic is a utility function and does not correspond to a specific
///    instruction.
///
/// \param __z
///    A single-precision floating-point value used to initialize bits [31:0]
///    of the result.
/// \param __y
///    A single-precision floating-point value used to initialize bits [63:32]
///    of the result.
/// \param __x
///    A single-precision floating-point value used to initialize bits [95:64]
///    of the result.
/// \param __w
///    A single-precision floating-point value used to initialize bits [127:96]
///    of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_setr_ps(float __z, float __y, float __x, float __w) {
  // "r" (reversed) order: first argument lands in element 0.
  return __extension__ (__m128){ __z, __y, __x, __w };
}
/// Constructs a 128-bit floating-point vector of [4 x float] initialized
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
///
/// \returns An initialized 128-bit floating-point vector of [4 x float] with
///    all elements set to zero.
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_setzero_ps(void) {
  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
}
/// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
///    memory location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
///
/// \param __p
///    A pointer to a 64-bit memory location.
/// \param __a
///    A 128-bit vector of [4 x float] containing the values to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeh_pi(__m64 *__p, __m128 __a)
{
  // Write through a packed, may_alias struct so the 64-bit store is safe for
  // any alignment and any actual pointee type behind __p.
  typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
  struct __mm_storeh_pi_struct {
    __mm_storeh_pi_v2f32 __u;
  } __attribute__((__packed__, __may_alias__));
  // Elements 2 and 3 are the high 64 bits of __a.
  ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3);
}
2054 /// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
2055 /// memory location.
2057 /// \headerfile <x86intrin.h>
2059 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
2061 /// \param __p
2062 /// A pointer to a memory location that will receive the float values.
2063 /// \param __a
2064 /// A 128-bit vector of [4 x float] containing the values to be stored.
2065 static __inline__ void __DEFAULT_FN_ATTRS
2066 _mm_storel_pi(__m64 *__p, __m128 __a)
2068 typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
2069 struct __mm_storeh_pi_struct {
2070 __mm_storeh_pi_v2f32 __u;
2071 } __attribute__((__packed__, __may_alias__));
2072 ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1);
/// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
///    memory location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
///
/// \param __p
///    A pointer to a 32-bit memory location.
/// \param __a
///    A 128-bit vector of [4 x float] containing the value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_ss(float *__p, __m128 __a)
{
  // Write through a packed, may_alias struct: alignment- and alias-safe.
  struct __mm_store_ss_struct {
    float __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
}
/// Stores a 128-bit vector of [4 x float] to an unaligned memory
///    location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
///
/// \param __p
///    A pointer to a 128-bit memory location. The address of the memory
///    location does not have to be aligned.
/// \param __a
///    A 128-bit vector of [4 x float] containing the values to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeu_ps(float *__p, __m128 __a)
{
  // __m128_u is the 1-byte-aligned variant; the packed, may_alias wrapper
  // makes the unaligned store well-defined.
  struct __storeu_ps {
    __m128_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ps*)__p)->__v = __a;
}
/// Stores a 128-bit vector of [4 x float] into an aligned memory
///    location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
///
/// \param __p
///    A pointer to a 128-bit memory location. The address of the memory
///    location has to be 16-byte aligned.
/// \param __a
///    A 128-bit vector of [4 x float] containing the values to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_ps(float *__p, __m128 __a)
{
  // Direct store: __m128 carries 16-byte alignment, so this is the
  // aligned-store form.
  *(__m128*)__p = __a;
}
/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
///    four contiguous elements in an aligned memory location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
///    instruction.
///
/// \param __p
///    A pointer to a 128-bit memory location.
/// \param __a
///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
///    of the four contiguous elements pointed by \a __p.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store1_ps(float *__p, __m128 __a)
{
  // Broadcast element 0 to all four lanes, then do one aligned store.
  __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
  _mm_store_ps(__p, __a);
}
/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
///    four contiguous elements in an aligned memory location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
///    instruction.
///
/// \param __p
///    A pointer to a 128-bit memory location.
/// \param __a
///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
///    of the four contiguous elements pointed by \a __p.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_ps1(float *__p, __m128 __a)
{
  // Legacy alias for _mm_store1_ps.
  _mm_store1_ps(__p, __a);
}
/// Stores float values from a 128-bit vector of [4 x float] to an
///    aligned memory location in reverse order.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
///    instruction.
///
/// \param __p
///    A pointer to a 128-bit memory location. The address of the memory
///    location has to be 128-bit aligned.
/// \param __a
///    A 128-bit vector of [4 x float] containing the values to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storer_ps(float *__p, __m128 __a)
{
  // Reverse the lanes, then do one aligned store.
  __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
  _mm_store_ps(__p, __a);
}
/* Prefetch hint constants for _mm_prefetch. In the encoding consumed by the
   _mm_prefetch macro below, bit 2 selects the "ET" (prefetch-with-intent-to-
   write) variants and bits [1:0] give the temporal-locality level. */
#define _MM_HINT_ET0 7
#define _MM_HINT_ET1 6
#define _MM_HINT_T0  3
#define _MM_HINT_T1  2
#define _MM_HINT_T2  1
#define _MM_HINT_NTA 0
#ifndef _MSC_VER
/* FIXME: We have to #define this because "sel" must be a constant integer, and
   Sema doesn't do any form of constant propagation yet. */

/// Loads one cache line of data from the specified address to a location
///    closer to the processor.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _mm_prefetch(const void *a, const int sel);
/// \endcode
///
/// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
///
/// \param a
///    A pointer to a memory location containing a cache line of data.
/// \param sel
///    A predefined integer constant specifying the type of prefetch
///    operation: \n
///    _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
///    PREFETCHNTA instruction will be generated. \n
///    _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
///    be generated. \n
///    _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
///    be generated. \n
///    _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
///    be generated.
/* (sel >> 2) & 1 is the read/write hint; sel & 0x3 is the locality level. */
#define _mm_prefetch(a, sel) (__builtin_prefetch((const void *)(a), \
                                                 ((sel) >> 2) & 1, (sel) & 0x3))
#endif
/// Stores a 64-bit integer in the specified aligned memory location. To
///    minimize caching, the data is flagged as non-temporal (unlikely to be
///    used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
///
/// \param __p
///    A pointer to an aligned memory location used to store the register value.
/// \param __a
///    A 64-bit integer containing the value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_pi(void *__p, __m64 __a)
{
  __builtin_nontemporal_store(__a, (__m64 *)__p);
}
/// Moves packed float values from a 128-bit vector of [4 x float] to a
///    128-bit aligned memory location. To minimize caching, the data is flagged
///    as non-temporal (unlikely to be used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
///
/// \param __p
///    A pointer to a 128-bit aligned memory location that will receive the
///    single-precision floating-point values.
/// \param __a
///    A 128-bit vector of [4 x float] containing the values to be moved.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_ps(void *__p, __m128 __a)
{
  __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
}
#if defined(__cplusplus)
extern "C" {
#endif

/// Forces strong memory ordering (serialization) between store
///    instructions preceding this instruction and store instructions following
///    this instruction, ensuring the system completes all previous stores
///    before executing subsequent stores.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> SFENCE </c> instruction.
///
/* Declared (not defined) here; recognized and lowered by the compiler. */
void _mm_sfence(void);

#if defined(__cplusplus)
} // extern "C"
#endif
/// Extracts 16-bit element from a 64-bit vector of [4 x i16] and
///    returns it, as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_pi16(__m64 a, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param n
///    An immediate integer operand that determines which bits are extracted: \n
///    0: Bits [15:0] are copied to the destination. \n
///    1: Bits [31:16] are copied to the destination. \n
///    2: Bits [47:32] are copied to the destination. \n
///    3: Bits [63:48] are copied to the destination.
/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
/* The (unsigned short) cast zero-extends the extracted element into an int. */
#define _mm_extract_pi16(a, n) \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n))
/// Copies data from the 64-bit vector of [4 x i16] to the destination,
///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
///    specified by the immediate operand \a n.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PINSRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param d
///    An integer. The lower 16-bit value from this operand is written to the
///    destination at the offset specified by operand \a n.
/// \param n
///    An immediate integer operand that determines which bits are to be used
///    in the destination. \n
///    0: Bits [15:0] are copied to the destination. \n
///    1: Bits [31:16] are copied to the destination. \n
///    2: Bits [47:32] are copied to the destination. \n
///    3: Bits [63:48] are copied to the destination. \n
///    The remaining bits in the destination are copied from the corresponding
///    bits in operand \a a.
/// \returns A 64-bit integer vector containing the copied packed data from the
///    operands.
#define _mm_insert_pi16(a, d, n) \
  ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)a, (int)d, (int)n))
/// Compares each of the corresponding packed 16-bit integer values of
///    the 64-bit integer vectors, and writes the greater value to the
///    corresponding bits in the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
///
/// \param __a
///    A 64-bit integer vector containing one of the source operands.
/// \param __b
///    A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the comparison results.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_max_pi16(__m64 __a, __m64 __b)
{
  // __v4hi is signed, so this is the signed 16-bit max (PMAXSW semantics).
  return (__m64)__builtin_elementwise_max((__v4hi)__a, (__v4hi)__b);
}
/// Compares each of the corresponding packed 8-bit unsigned integer
///    values of the 64-bit integer vectors, and writes the greater value to the
///    corresponding bits in the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
///
/// \param __a
///    A 64-bit integer vector containing one of the source operands.
/// \param __b
///    A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the comparison results.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_max_pu8(__m64 __a, __m64 __b)
{
  // __v8qu is unsigned, so this is the unsigned 8-bit max (PMAXUB semantics).
  return (__m64)__builtin_elementwise_max((__v8qu)__a, (__v8qu)__b);
}
/// Compares each of the corresponding packed 16-bit integer values of
///    the 64-bit integer vectors, and writes the lesser value to the
///    corresponding bits in the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PMINSW </c> instruction.
///
/// \param __a
///    A 64-bit integer vector containing one of the source operands.
/// \param __b
///    A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the comparison results.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_min_pi16(__m64 __a, __m64 __b)
{
  // __v4hi is signed, so this is the signed 16-bit min (PMINSW semantics).
  return (__m64)__builtin_elementwise_min((__v4hi)__a, (__v4hi)__b);
}
/// Compares each of the corresponding packed 8-bit unsigned integer
///    values of the 64-bit integer vectors, and writes the lesser value to the
///    corresponding bits in the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PMINUB </c> instruction.
///
/// \param __a
///    A 64-bit integer vector containing one of the source operands.
/// \param __b
///    A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the comparison results.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_min_pu8(__m64 __a, __m64 __b)
{
  // __v8qu is unsigned, so this is the unsigned 8-bit min (PMINUB semantics).
  return (__m64)__builtin_elementwise_min((__v8qu)__a, (__v8qu)__b);
}
/// Takes the most significant bit from each 8-bit element in a 64-bit
///    integer vector to create an 8-bit mask value. Zero-extends the value to
///    32-bit integer and writes it to the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
///
/// \param __a
///    A 64-bit integer vector containing the values with bits to be extracted.
/// \returns The most significant bit from each 8-bit element in \a __a,
///    written to bits [7:0].
static __inline__ int __DEFAULT_FN_ATTRS_SSE2
_mm_movemask_pi8(__m64 __a)
{
  // Zero-extending to 128 bits guarantees the high 8 lanes contribute zero
  // mask bits, so the result fits in bits [7:0].
  return __builtin_ia32_pmovmskb128((__v16qi)__zext128(__a));
}
/// Multiplies packed 16-bit unsigned integer values and writes the
///    high-order 16 bits of each 32-bit product to the corresponding bits in
///    the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
///
/// \param __a
///    A 64-bit integer vector containing one of the source operands.
/// \param __b
///    A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the products of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_mulhi_pu16(__m64 __a, __m64 __b)
{
  // Widen both operands to 128 bits (upper lanes are don't-care), multiply,
  // then truncate the result back to the low 64 bits.
  return __trunc64(__builtin_ia32_pmulhuw128((__v8hi)__anyext128(__a),
                                             (__v8hi)__anyext128(__b)));
}
/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
///    destination, as specified by the immediate value operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
///
/// \param a
///    A 64-bit integer vector containing the values to be shuffled.
/// \param n
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a. The destinations within the 64-bit destination are
///    assigned values as follows: \n
///    Bits [1:0] are used to assign values to bits [15:0] in the
///    destination. \n
///    Bits [3:2] are used to assign values to bits [31:16] in the
///    destination. \n
///    Bits [5:4] are used to assign values to bits [47:32] in the
///    destination. \n
///    Bits [7:6] are used to assign values to bits [63:48] in the
///    destination. \n
///    Bit value assignments: \n
///    00: assigned from bits [15:0] of \a a. \n
///    01: assigned from bits [31:16] of \a a. \n
///    10: assigned from bits [47:32] of \a a. \n
///    11: assigned from bits [63:48] of \a a. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 64-bit integer vector containing the shuffled values.
/* The empty __v4hi{} second operand only satisfies the builtin's arity; all
   indices select from (a). */
#define _mm_shuffle_pi16(a, n) \
  ((__m64)__builtin_shufflevector((__v4hi)(__m64)(a), __extension__(__v4hi){}, \
                                  (n) & 0x3, ((n) >> 2) & 0x3, \
                                  ((n) >> 4) & 0x3, ((n) >> 6) & 0x3))
/// Conditionally copies the values from each 8-bit element in the first
///    64-bit integer vector operand to the specified memory location, as
///    specified by the most significant bit in the corresponding element in the
///    second 64-bit integer vector operand.
///
/// To minimize caching, the data is flagged as non-temporal
/// (unlikely to be used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
///
/// \param __d
///    A 64-bit integer vector containing the values with elements to be copied.
/// \param __n
///    A 64-bit integer vector operand. The most significant bit from each 8-bit
///    element determines whether the corresponding element in operand \a __d
///    is copied. If the most significant bit of a given element is 1, the
///    corresponding element in operand \a __d is copied.
/// \param __p
///    A pointer to a 64-bit memory location that will receive the conditionally
///    copied integer values. The address of the memory location does not have
///    to be aligned.
static __inline__ void __DEFAULT_FN_ATTRS_SSE2
_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
{
  // This is complex, because we need to support the case where __p is pointing
  // within the last 15 to 8 bytes of a page. In that case, using a 128-bit
  // write might cause a trap where a 64-bit maskmovq would not. (Memory
  // locations not selected by the mask bits might still cause traps.)
  // __n is zero-extended so the upper 8 mask lanes never select a write.
  __m128i __d128 = __anyext128(__d);
  __m128i __n128 = __zext128(__n);
  if (((__SIZE_TYPE__)__p & 0xfff) >= 4096-15 &&
      ((__SIZE_TYPE__)__p & 0xfff) <= 4096-8) {
    // If there's a risk of spurious trap due to a 128-bit write, back up the
    // pointer by 8 bytes and shift values in registers to match.
    __p -= 8;
    __d128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__d128, 8);
    __n128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__n128, 8);
  }

  __builtin_ia32_maskmovdqu((__v16qi)__d128, (__v16qi)__n128, __p);
}
2539 /// Computes the rounded averages of the packed unsigned 8-bit integer
2540 /// values and writes the averages to the corresponding bits in the
2541 /// destination.
2543 /// \headerfile <x86intrin.h>
2545 /// This intrinsic corresponds to the <c> PAVGB </c> instruction.
2547 /// \param __a
2548 /// A 64-bit integer vector containing one of the source operands.
2549 /// \param __b
2550 /// A 64-bit integer vector containing one of the source operands.
2551 /// \returns A 64-bit integer vector containing the averages of both operands.
2552 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
2553 _mm_avg_pu8(__m64 __a, __m64 __b)
2555 return __trunc64(__builtin_ia32_pavgb128((__v16qi)__anyext128(__a),
2556 (__v16qi)__anyext128(__b)));
2559 /// Computes the rounded averages of the packed unsigned 16-bit integer
2560 /// values and writes the averages to the corresponding bits in the
2561 /// destination.
2563 /// \headerfile <x86intrin.h>
2565 /// This intrinsic corresponds to the <c> PAVGW </c> instruction.
2567 /// \param __a
2568 /// A 64-bit integer vector containing one of the source operands.
2569 /// \param __b
2570 /// A 64-bit integer vector containing one of the source operands.
2571 /// \returns A 64-bit integer vector containing the averages of both operands.
2572 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
2573 _mm_avg_pu16(__m64 __a, __m64 __b)
2575 return __trunc64(__builtin_ia32_pavgw128((__v8hi)__anyext128(__a),
2576 (__v8hi)__anyext128(__b)));
2579 /// Subtracts the corresponding 8-bit unsigned integer values of the two
2580 /// 64-bit vector operands and computes the absolute value for each of the
2581 /// difference. Then sum of the 8 absolute differences is written to the
2582 /// bits [15:0] of the destination; the remaining bits [63:16] are cleared.
2584 /// \headerfile <x86intrin.h>
2586 /// This intrinsic corresponds to the <c> PSADBW </c> instruction.
2588 /// \param __a
2589 /// A 64-bit integer vector containing one of the source operands.
2590 /// \param __b
2591 /// A 64-bit integer vector containing one of the source operands.
2592 /// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
2593 /// sets of absolute differences between both operands. The upper bits are
2594 /// cleared.
2595 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
2596 _mm_sad_pu8(__m64 __a, __m64 __b)
2598 return __trunc64(__builtin_ia32_psadbw128((__v16qi)__zext128(__a),
2599 (__v16qi)__zext128(__b)));
2602 #if defined(__cplusplus)
2603 extern "C" {
2604 #endif
2606 /// Returns the contents of the MXCSR register as a 32-bit unsigned
2607 /// integer value.
2609 /// There are several groups of macros associated with this
2610 /// intrinsic, including:
2611 /// <ul>
2612 /// <li>
2613 /// For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
2614 /// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
2615 /// _MM_EXCEPT_INEXACT. There is a convenience wrapper
2616 /// _MM_GET_EXCEPTION_STATE().
2617 /// </li>
2618 /// <li>
2619 /// For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
2620 /// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
2621 /// There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
2622 /// </li>
2623 /// <li>
2624 /// For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
2625 /// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
2626 /// _MM_GET_ROUNDING_MODE().
2627 /// </li>
2628 /// <li>
2629 /// For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
2630 /// There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
2631 /// </li>
2632 /// <li>
2633 /// For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
2634 /// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
2635 /// _MM_GET_DENORMALS_ZERO_MODE().
2636 /// </li>
2637 /// </ul>
2639 /// For example, the following expression checks if an overflow exception has
2640 /// occurred:
2641 /// \code
2642 /// ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
2643 /// \endcode
2645 /// The following expression gets the current rounding mode:
2646 /// \code
2647 /// _MM_GET_ROUNDING_MODE()
2648 /// \endcode
2650 /// \headerfile <x86intrin.h>
2652 /// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
2654 /// \returns A 32-bit unsigned integer containing the contents of the MXCSR
2655 /// register.
2656 unsigned int _mm_getcsr(void);
2658 /// Sets the MXCSR register with the 32-bit unsigned integer value.
2660 /// There are several groups of macros associated with this intrinsic,
2661 /// including:
2662 /// <ul>
2663 /// <li>
2664 /// For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
2665 /// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
2666 /// _MM_EXCEPT_INEXACT. There is a convenience wrapper
2667 /// _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
2668 /// </li>
2669 /// <li>
2670 /// For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
2671 /// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
2672 /// There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
2673 /// of these macros.
2674 /// </li>
2675 /// <li>
2676 /// For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
2677 /// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
2678 /// _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
2679 /// </li>
2680 /// <li>
2681 /// For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
2682 /// There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
2683 /// one of these macros.
2684 /// </li>
2685 /// <li>
2686 /// For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
2687 /// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
2688 /// _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
2689 /// </li>
2690 /// </ul>
/// For example, the following expression sets the rounding mode to round up
/// (the wrapper first clears the existing rounding-mode bits, so it is
/// correct regardless of the current mode):
/// \code
///   _MM_SET_ROUNDING_MODE(_MM_ROUND_UP)
/// \endcode
2696 /// The following example sets the DAZ and FTZ flags:
2697 /// \code
2698 /// void setFlags() {
2699 /// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
2700 /// _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
2701 /// }
2702 /// \endcode
2704 /// \headerfile <x86intrin.h>
2706 /// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
2708 /// \param __i
2709 /// A 32-bit unsigned integer value to be written to the MXCSR register.
2710 void _mm_setcsr(unsigned int __i);
2712 #if defined(__cplusplus)
2713 } // extern "C"
2714 #endif
2716 /// Selects 4 float values from the 128-bit operands of [4 x float], as
2717 /// specified by the immediate value operand.
2719 /// \headerfile <x86intrin.h>
2721 /// \code
2722 /// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
2723 /// \endcode
2725 /// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
2727 /// \param a
2728 /// A 128-bit vector of [4 x float].
2729 /// \param b
2730 /// A 128-bit vector of [4 x float].
2731 /// \param mask
2732 /// An immediate value containing an 8-bit value specifying which elements to
2733 /// copy from \a a and \a b. \n
2734 /// Bits [3:0] specify the values copied from operand \a a. \n
2735 /// Bits [7:4] specify the values copied from operand \a b. \n
2736 /// The destinations within the 128-bit destination are assigned values as
2737 /// follows: \n
2738 /// Bits [1:0] are used to assign values to bits [31:0] in the
2739 /// destination. \n
2740 /// Bits [3:2] are used to assign values to bits [63:32] in the
2741 /// destination. \n
2742 /// Bits [5:4] are used to assign values to bits [95:64] in the
2743 /// destination. \n
2744 /// Bits [7:6] are used to assign values to bits [127:96] in the
2745 /// destination. \n
2746 /// Bit value assignments: \n
2747 /// 00: Bits [31:0] copied from the specified operand. \n
2748 /// 01: Bits [63:32] copied from the specified operand. \n
2749 /// 10: Bits [95:64] copied from the specified operand. \n
2750 /// 11: Bits [127:96] copied from the specified operand. \n
2751 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
2752 /// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
2753 /// <c>[b6, b4, b2, b0]</c>.
2754 /// \returns A 128-bit vector of [4 x float] containing the shuffled values.
// Note: mask must be an integer constant expression (it becomes the immediate
// operand of SHUFPS); each 2-bit field selects one source element.
#define _mm_shuffle_ps(a, b, mask) \
  ((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
                                 (int)(mask)))
2759 /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
2760 /// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2762 /// \headerfile <x86intrin.h>
2764 /// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
2766 /// \param __a
2767 /// A 128-bit vector of [4 x float]. \n
2768 /// Bits [95:64] are written to bits [31:0] of the destination. \n
2769 /// Bits [127:96] are written to bits [95:64] of the destination.
2770 /// \param __b
2771 /// A 128-bit vector of [4 x float].
2772 /// Bits [95:64] are written to bits [63:32] of the destination. \n
2773 /// Bits [127:96] are written to bits [127:96] of the destination.
2774 /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2775 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
2776 _mm_unpackhi_ps(__m128 __a, __m128 __b) {
2777 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
2780 /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
2781 /// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2783 /// \headerfile <x86intrin.h>
2785 /// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
2787 /// \param __a
2788 /// A 128-bit vector of [4 x float]. \n
2789 /// Bits [31:0] are written to bits [31:0] of the destination. \n
2790 /// Bits [63:32] are written to bits [95:64] of the destination.
2791 /// \param __b
2792 /// A 128-bit vector of [4 x float]. \n
2793 /// Bits [31:0] are written to bits [63:32] of the destination. \n
2794 /// Bits [63:32] are written to bits [127:96] of the destination.
2795 /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2796 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
2797 _mm_unpacklo_ps(__m128 __a, __m128 __b) {
2798 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
2801 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2802 /// 32 bits are set to the lower 32 bits of the second parameter. The upper
2803 /// 96 bits are set to the upper 96 bits of the first parameter.
2805 /// \headerfile <x86intrin.h>
2807 /// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c>
2808 /// instruction.
2810 /// \param __a
2811 /// A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
2812 /// written to the upper 96 bits of the result.
2813 /// \param __b
2814 /// A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
2815 /// written to the lower 32 bits of the result.
2816 /// \returns A 128-bit floating-point vector of [4 x float].
2817 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
2818 _mm_move_ss(__m128 __a, __m128 __b) {
2819 __a[0] = __b[0];
2820 return __a;
2823 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2824 /// 64 bits are set to the upper 64 bits of the second parameter. The upper
2825 /// 64 bits are set to the upper 64 bits of the first parameter.
2827 /// \headerfile <x86intrin.h>
2829 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
2831 /// \param __a
2832 /// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2833 /// written to the upper 64 bits of the result.
2834 /// \param __b
2835 /// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2836 /// written to the lower 64 bits of the result.
2837 /// \returns A 128-bit floating-point vector of [4 x float].
2838 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
2839 _mm_movehl_ps(__m128 __a, __m128 __b) {
2840 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
2843 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2844 /// 64 bits are set to the lower 64 bits of the first parameter. The upper
2845 /// 64 bits are set to the lower 64 bits of the second parameter.
2847 /// \headerfile <x86intrin.h>
2849 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
2851 /// \param __a
2852 /// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2853 /// written to the lower 64 bits of the result.
2854 /// \param __b
2855 /// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2856 /// written to the upper 64 bits of the result.
2857 /// \returns A 128-bit floating-point vector of [4 x float].
2858 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
2859 _mm_movelh_ps(__m128 __a, __m128 __b) {
2860 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
2863 /// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
2864 /// float].
2866 /// \headerfile <x86intrin.h>
2868 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2870 /// \param __a
2871 /// A 64-bit vector of [4 x i16]. The elements of the destination are copied
2872 /// from the corresponding elements in this operand.
2873 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2874 /// values from the operand.
2875 static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
2876 _mm_cvtpi16_ps(__m64 __a)
2878 return __builtin_convertvector((__v4hi)__a, __v4sf);
2881 /// Converts a 64-bit vector of 16-bit unsigned integer values into a
2882 /// 128-bit vector of [4 x float].
2884 /// \headerfile <x86intrin.h>
2886 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2888 /// \param __a
2889 /// A 64-bit vector of 16-bit unsigned integer values. The elements of the
2890 /// destination are copied from the corresponding elements in this operand.
2891 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2892 /// values from the operand.
2893 static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
2894 _mm_cvtpu16_ps(__m64 __a)
2896 return __builtin_convertvector((__v4hu)__a, __v4sf);
2899 /// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
2900 /// into a 128-bit vector of [4 x float].
2902 /// \headerfile <x86intrin.h>
2904 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2906 /// \param __a
2907 /// A 64-bit vector of [8 x i8]. The elements of the destination are copied
2908 /// from the corresponding lower 4 elements in this operand.
2909 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2910 /// values from the operand.
2911 static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
2912 _mm_cvtpi8_ps(__m64 __a)
2914 return __builtin_convertvector(
2915 __builtin_shufflevector((__v8qs)__a, __extension__ (__v8qs){},
2916 0, 1, 2, 3), __v4sf);
2919 /// Converts the lower four unsigned 8-bit integer values from a 64-bit
2920 /// vector of [8 x u8] into a 128-bit vector of [4 x float].
2922 /// \headerfile <x86intrin.h>
2924 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2926 /// \param __a
2927 /// A 64-bit vector of unsigned 8-bit integer values. The elements of the
2928 /// destination are copied from the corresponding lower 4 elements in this
2929 /// operand.
2930 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2931 /// values from the source operand.
2932 static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
2933 _mm_cvtpu8_ps(__m64 __a)
2935 return __builtin_convertvector(
2936 __builtin_shufflevector((__v8qu)__a, __extension__ (__v8qu){},
2937 0, 1, 2, 3), __v4sf);
2940 /// Converts the two 32-bit signed integer values from each 64-bit vector
2941 /// operand of [2 x i32] into a 128-bit vector of [4 x float].
2943 /// \headerfile <x86intrin.h>
2945 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2947 /// \param __a
2948 /// A 64-bit vector of [2 x i32]. The lower elements of the destination are
2949 /// copied from the elements in this operand.
2950 /// \param __b
2951 /// A 64-bit vector of [2 x i32]. The upper elements of the destination are
2952 /// copied from the elements in this operand.
2953 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
2954 /// copied and converted values from the first operand. The upper 64 bits
2955 /// contain the copied and converted values from the second operand.
2956 static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
2957 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
2959 return __builtin_convertvector(
2960 __builtin_shufflevector((__v2si)__a, (__v2si)__b,
2961 0, 1, 2, 3), __v4sf);
2964 /// Converts each single-precision floating-point element of a 128-bit
2965 /// floating-point vector of [4 x float] into a 16-bit signed integer, and
2966 /// packs the results into a 64-bit integer vector of [4 x i16].
2968 /// If the floating-point element is NaN or infinity, or if the
2969 /// floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
2970 /// it is converted to 0x8000. Otherwise if the floating-point element is
2971 /// greater than 0x7FFF, it is converted to 0x7FFF.
2973 /// \headerfile <x86intrin.h>
2975 /// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2977 /// \param __a
2978 /// A 128-bit floating-point vector of [4 x float].
2979 /// \returns A 64-bit integer vector of [4 x i16] containing the converted
2980 /// values.
2981 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
2982 _mm_cvtps_pi16(__m128 __a)
2984 return __trunc64(__builtin_ia32_packssdw128(
2985 (__v4si)__builtin_ia32_cvtps2dq((__v4sf)__a), (__v4si)_mm_setzero_ps()));
2988 /// Converts each single-precision floating-point element of a 128-bit
2989 /// floating-point vector of [4 x float] into an 8-bit signed integer, and
2990 /// packs the results into the lower 32 bits of a 64-bit integer vector of
2991 /// [8 x i8]. The upper 32 bits of the vector are set to 0.
2993 /// If the floating-point element is NaN or infinity, or if the
2994 /// floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
2995 /// is converted to 0x80. Otherwise if the floating-point element is greater
2996 /// than 0x7F, it is converted to 0x7F.
2998 /// \headerfile <x86intrin.h>
3000 /// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
3002 /// \param __a
3003 /// 128-bit floating-point vector of [4 x float].
3004 /// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
///    converted values and the upper 32 bits are set to zero.
3006 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
3007 _mm_cvtps_pi8(__m128 __a)
3009 __m64 __b, __c;
3011 __b = _mm_cvtps_pi16(__a);
3012 __c = _mm_setzero_si64();
3014 return _mm_packs_pi16(__b, __c);
3017 /// Extracts the sign bits from each single-precision floating-point
3018 /// element of a 128-bit floating-point vector of [4 x float] and returns the
3019 /// sign bits in bits [0:3] of the result. Bits [31:4] of the result are set
3020 /// to zero.
3022 /// \headerfile <x86intrin.h>
3024 /// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
3026 /// \param __a
3027 /// A 128-bit floating-point vector of [4 x float].
3028 /// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
3029 /// single-precision floating-point element of the parameter. Bits [31:4] are
3030 /// set to zero.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_movemask_ps(__m128 __a)
{
  // MOVMSKPS: bit i of the result is the sign bit of float element i of __a.
  return __builtin_ia32_movmskps((__v4sf)__a);
}
3037 /* Compare */
3038 #define _CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */
3039 #define _CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */
3040 #define _CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */
3041 #define _CMP_UNORD_Q 0x03 /* Unordered (non-signaling) */
3042 #define _CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */
3043 #define _CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */
3044 #define _CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */
3045 #define _CMP_ORD_Q 0x07 /* Ordered (non-signaling) */
3047 /// Compares each of the corresponding values of two 128-bit vectors of
3048 /// [4 x float], using the operation specified by the immediate integer
3049 /// operand.
3051 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
3052 /// If either value in a comparison is NaN, comparisons that are ordered
3053 /// return false, and comparisons that are unordered return true.
3055 /// \headerfile <x86intrin.h>
3057 /// \code
3058 /// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
3059 /// \endcode
3061 /// This intrinsic corresponds to the <c> (V)CMPPS </c> instruction.
3063 /// \param a
3064 /// A 128-bit vector of [4 x float].
3065 /// \param b
3066 /// A 128-bit vector of [4 x float].
3067 /// \param c
3068 /// An immediate integer operand, with bits [4:0] specifying which comparison
3069 /// operation to use: \n
3070 /// 0x00: Equal (ordered, non-signaling) \n
3071 /// 0x01: Less-than (ordered, signaling) \n
3072 /// 0x02: Less-than-or-equal (ordered, signaling) \n
3073 /// 0x03: Unordered (non-signaling) \n
3074 /// 0x04: Not-equal (unordered, non-signaling) \n
3075 /// 0x05: Not-less-than (unordered, signaling) \n
3076 /// 0x06: Not-less-than-or-equal (unordered, signaling) \n
3077 /// 0x07: Ordered (non-signaling) \n
3078 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
// Note: c must be an integer constant expression; use the _CMP_* macros
// defined above for the supported predicates.
#define _mm_cmp_ps(a, b, c) \
  ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), (c)))
3082 /// Compares each of the corresponding scalar values of two 128-bit
3083 /// vectors of [4 x float], using the operation specified by the immediate
3084 /// integer operand.
3086 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
3087 /// If either value in a comparison is NaN, comparisons that are ordered
3088 /// return false, and comparisons that are unordered return true.
3090 /// \headerfile <x86intrin.h>
3092 /// \code
3093 /// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
3094 /// \endcode
3096 /// This intrinsic corresponds to the <c> (V)CMPSS </c> instruction.
3098 /// \param a
3099 /// A 128-bit vector of [4 x float].
3100 /// \param b
3101 /// A 128-bit vector of [4 x float].
3102 /// \param c
3103 /// An immediate integer operand, with bits [4:0] specifying which comparison
3104 /// operation to use: \n
3105 /// 0x00: Equal (ordered, non-signaling) \n
3106 /// 0x01: Less-than (ordered, signaling) \n
3107 /// 0x02: Less-than-or-equal (ordered, signaling) \n
3108 /// 0x03: Unordered (non-signaling) \n
3109 /// 0x04: Not-equal (unordered, non-signaling) \n
3110 /// 0x05: Not-less-than (unordered, signaling) \n
3111 /// 0x06: Not-less-than-or-equal (unordered, signaling) \n
3112 /// 0x07: Ordered (non-signaling) \n
3113 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
// Note: c must be an integer constant expression; use the _CMP_* macros
// defined above for the supported predicates. Only the low lane is compared;
// the upper three lanes of the result are copied from a.
#define _mm_cmp_ss(a, b, c) \
  ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), (c)))
3117 #define _MM_ALIGN16 __attribute__((aligned(16)))
3119 #define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
3121 #define _MM_EXCEPT_INVALID (0x0001U)
3122 #define _MM_EXCEPT_DENORM (0x0002U)
3123 #define _MM_EXCEPT_DIV_ZERO (0x0004U)
3124 #define _MM_EXCEPT_OVERFLOW (0x0008U)
3125 #define _MM_EXCEPT_UNDERFLOW (0x0010U)
3126 #define _MM_EXCEPT_INEXACT (0x0020U)
3127 #define _MM_EXCEPT_MASK (0x003fU)
3129 #define _MM_MASK_INVALID (0x0080U)
3130 #define _MM_MASK_DENORM (0x0100U)
3131 #define _MM_MASK_DIV_ZERO (0x0200U)
3132 #define _MM_MASK_OVERFLOW (0x0400U)
3133 #define _MM_MASK_UNDERFLOW (0x0800U)
3134 #define _MM_MASK_INEXACT (0x1000U)
3135 #define _MM_MASK_MASK (0x1f80U)
3137 #define _MM_ROUND_NEAREST (0x0000U)
3138 #define _MM_ROUND_DOWN (0x2000U)
3139 #define _MM_ROUND_UP (0x4000U)
3140 #define _MM_ROUND_TOWARD_ZERO (0x6000U)
3141 #define _MM_ROUND_MASK (0x6000U)
3143 #define _MM_FLUSH_ZERO_MASK (0x8000U)
3144 #define _MM_FLUSH_ZERO_ON (0x8000U)
3145 #define _MM_FLUSH_ZERO_OFF (0x0000U)
3147 #define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
3148 #define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
3149 #define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
3150 #define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)
3152 #define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
3153 #define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
3154 #define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
3155 #define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
/// Transposes, in place, the 4x4 matrix of floats whose rows are \a row0
/// through \a row3 (each a __m128 of [4 x float]).
///
/// Implemented as two unpack passes followed by move-half recombination; the
/// do/while(0) wrapper makes the macro behave as a single statement.
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 tmp3, tmp2, tmp1, tmp0; \
  tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(tmp0, tmp2); \
  (row1) = _mm_movehl_ps(tmp2, tmp0); \
  (row2) = _mm_movelh_ps(tmp1, tmp3); \
  (row3) = _mm_movehl_ps(tmp3, tmp1); \
} while (0)
3170 /* Aliases for compatibility. */
3171 #define _m_pextrw _mm_extract_pi16
3172 #define _m_pinsrw _mm_insert_pi16
3173 #define _m_pmaxsw _mm_max_pi16
3174 #define _m_pmaxub _mm_max_pu8
3175 #define _m_pminsw _mm_min_pi16
3176 #define _m_pminub _mm_min_pu8
3177 #define _m_pmovmskb _mm_movemask_pi8
3178 #define _m_pmulhuw _mm_mulhi_pu16
3179 #define _m_pshufw _mm_shuffle_pi16
3180 #define _m_maskmovq _mm_maskmove_si64
3181 #define _m_pavgb _mm_avg_pu8
3182 #define _m_pavgw _mm_avg_pu16
3183 #define _m_psadbw _mm_sad_pu8
3184 #define _m_ _mm_
3186 #undef __trunc64
3187 #undef __zext128
3188 #undef __anyext128
3189 #undef __zeroupper64
3190 #undef __DEFAULT_FN_ATTRS
3191 #undef __DEFAULT_FN_ATTRS_CONSTEXPR
3192 #undef __DEFAULT_FN_ATTRS_SSE2
3193 #undef __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
3195 /* Ugly hack for backwards-compatibility (compatible with gcc) */
3196 #if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
3197 #include <emmintrin.h>
3198 #endif
3200 #endif /* __XMMINTRIN_H */