[clang-format] Fix a bug in aligning comments above PPDirective (#72791)
[llvm-project.git] / clang / lib / Headers / emmintrin.h
blob96e3ebdecbdf83c206ceb8a9a82590eb98eb8c3c
/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
10 #ifndef __EMMINTRIN_H
11 #define __EMMINTRIN_H
/* Refuse to compile when neither the 32-bit nor the 64-bit x86 target macro is
 * defined. */
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif

#include <xmmintrin.h>
/* 128-bit vector types exposed by this header, declared with 16-byte
 * alignment. */
typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));

/* The same vector layouts declared with an alignment of 1. */
typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __aligned__(1)));

/* Type defines.  */
typedef double __v2df __attribute__((__vector_size__(16)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

/* Unsigned types */
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
typedef unsigned char __v16qu __attribute__((__vector_size__(16)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v16qs __attribute__((__vector_size__(16)));
41 #ifdef __SSE2__
42 /* Both _Float16 and __bf16 require SSE2 being enabled. */
43 typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
44 typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
45 typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));
47 typedef __bf16 __v8bf __attribute__((__vector_size__(16), __aligned__(16)));
48 typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
49 #endif
/* Define the default attributes for the functions in this file. */
/* __DEFAULT_FN_ATTRS: always-inline, no debug info, target features
 * "sse2,no-evex512", minimum vector width 128. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("sse2,no-evex512"), __min_vector_width__(128)))
/* __DEFAULT_FN_ATTRS_MMX: as above, but the target list also names "mmx" and
 * the minimum vector width is 64. */
#define __DEFAULT_FN_ATTRS_MMX                                                 \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("mmx,sse2,no-evex512"), __min_vector_width__(64)))
59 /// Adds lower double-precision values in both operands and returns the
60 /// sum in the lower 64 bits of the result. The upper 64 bits of the result
61 /// are copied from the upper double-precision value of the first operand.
62 ///
63 /// \headerfile <x86intrin.h>
64 ///
65 /// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
66 ///
67 /// \param __a
68 /// A 128-bit vector of [2 x double] containing one of the source operands.
69 /// \param __b
70 /// A 128-bit vector of [2 x double] containing one of the source operands.
71 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
72 /// sum of the lower 64 bits of both operands. The upper 64 bits are copied
73 /// from the upper 64 bits of the first source operand.
74 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a,
75 __m128d __b) {
76 __a[0] += __b[0];
77 return __a;
80 /// Adds two 128-bit vectors of [2 x double].
81 ///
82 /// \headerfile <x86intrin.h>
83 ///
84 /// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
85 ///
86 /// \param __a
87 /// A 128-bit vector of [2 x double] containing one of the source operands.
88 /// \param __b
89 /// A 128-bit vector of [2 x double] containing one of the source operands.
90 /// \returns A 128-bit vector of [2 x double] containing the sums of both
91 /// operands.
92 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a,
93 __m128d __b) {
94 return (__m128d)((__v2df)__a + (__v2df)__b);
97 /// Subtracts the lower double-precision value of the second operand
98 /// from the lower double-precision value of the first operand and returns
99 /// the difference in the lower 64 bits of the result. The upper 64 bits of
100 /// the result are copied from the upper double-precision value of the first
101 /// operand.
103 /// \headerfile <x86intrin.h>
105 /// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
107 /// \param __a
108 /// A 128-bit vector of [2 x double] containing the minuend.
109 /// \param __b
110 /// A 128-bit vector of [2 x double] containing the subtrahend.
111 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
112 /// difference of the lower 64 bits of both operands. The upper 64 bits are
113 /// copied from the upper 64 bits of the first source operand.
114 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a,
115 __m128d __b) {
116 __a[0] -= __b[0];
117 return __a;
120 /// Subtracts two 128-bit vectors of [2 x double].
122 /// \headerfile <x86intrin.h>
124 /// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
126 /// \param __a
127 /// A 128-bit vector of [2 x double] containing the minuend.
128 /// \param __b
129 /// A 128-bit vector of [2 x double] containing the subtrahend.
130 /// \returns A 128-bit vector of [2 x double] containing the differences between
131 /// both operands.
132 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a,
133 __m128d __b) {
134 return (__m128d)((__v2df)__a - (__v2df)__b);
137 /// Multiplies lower double-precision values in both operands and returns
138 /// the product in the lower 64 bits of the result. The upper 64 bits of the
139 /// result are copied from the upper double-precision value of the first
140 /// operand.
142 /// \headerfile <x86intrin.h>
144 /// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
146 /// \param __a
147 /// A 128-bit vector of [2 x double] containing one of the source operands.
148 /// \param __b
149 /// A 128-bit vector of [2 x double] containing one of the source operands.
150 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
151 /// product of the lower 64 bits of both operands. The upper 64 bits are
152 /// copied from the upper 64 bits of the first source operand.
153 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a,
154 __m128d __b) {
155 __a[0] *= __b[0];
156 return __a;
159 /// Multiplies two 128-bit vectors of [2 x double].
161 /// \headerfile <x86intrin.h>
163 /// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
165 /// \param __a
166 /// A 128-bit vector of [2 x double] containing one of the operands.
167 /// \param __b
168 /// A 128-bit vector of [2 x double] containing one of the operands.
169 /// \returns A 128-bit vector of [2 x double] containing the products of both
170 /// operands.
171 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a,
172 __m128d __b) {
173 return (__m128d)((__v2df)__a * (__v2df)__b);
176 /// Divides the lower double-precision value of the first operand by the
177 /// lower double-precision value of the second operand and returns the
178 /// quotient in the lower 64 bits of the result. The upper 64 bits of the
179 /// result are copied from the upper double-precision value of the first
180 /// operand.
182 /// \headerfile <x86intrin.h>
184 /// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
186 /// \param __a
187 /// A 128-bit vector of [2 x double] containing the dividend.
188 /// \param __b
189 /// A 128-bit vector of [2 x double] containing divisor.
190 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
191 /// quotient of the lower 64 bits of both operands. The upper 64 bits are
192 /// copied from the upper 64 bits of the first source operand.
193 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a,
194 __m128d __b) {
195 __a[0] /= __b[0];
196 return __a;
199 /// Performs an element-by-element division of two 128-bit vectors of
200 /// [2 x double].
202 /// \headerfile <x86intrin.h>
204 /// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
206 /// \param __a
207 /// A 128-bit vector of [2 x double] containing the dividend.
208 /// \param __b
209 /// A 128-bit vector of [2 x double] containing the divisor.
210 /// \returns A 128-bit vector of [2 x double] containing the quotients of both
211 /// operands.
212 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a,
213 __m128d __b) {
214 return (__m128d)((__v2df)__a / (__v2df)__b);
217 /// Calculates the square root of the lower double-precision value of
218 /// the second operand and returns it in the lower 64 bits of the result.
219 /// The upper 64 bits of the result are copied from the upper
220 /// double-precision value of the first operand.
222 /// \headerfile <x86intrin.h>
224 /// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
226 /// \param __a
227 /// A 128-bit vector of [2 x double] containing one of the operands. The
228 /// upper 64 bits of this operand are copied to the upper 64 bits of the
229 /// result.
230 /// \param __b
231 /// A 128-bit vector of [2 x double] containing one of the operands. The
232 /// square root is calculated using the lower 64 bits of this operand.
233 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
234 /// square root of the lower 64 bits of operand \a __b, and whose upper 64
235 /// bits are copied from the upper 64 bits of operand \a __a.
236 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
237 __m128d __b) {
238 __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
239 return __extension__(__m128d){__c[0], __a[1]};
242 /// Calculates the square root of the each of two values stored in a
243 /// 128-bit vector of [2 x double].
245 /// \headerfile <x86intrin.h>
247 /// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
249 /// \param __a
250 /// A 128-bit vector of [2 x double].
251 /// \returns A 128-bit vector of [2 x double] containing the square roots of the
252 /// values in the operand.
253 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) {
254 return __builtin_ia32_sqrtpd((__v2df)__a);
257 /// Compares lower 64-bit double-precision values of both operands, and
258 /// returns the lesser of the pair of values in the lower 64-bits of the
259 /// result. The upper 64 bits of the result are copied from the upper
260 /// double-precision value of the first operand.
262 /// \headerfile <x86intrin.h>
264 /// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
266 /// \param __a
267 /// A 128-bit vector of [2 x double] containing one of the operands. The
268 /// lower 64 bits of this operand are used in the comparison.
269 /// \param __b
270 /// A 128-bit vector of [2 x double] containing one of the operands. The
271 /// lower 64 bits of this operand are used in the comparison.
272 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
273 /// minimum value between both operands. The upper 64 bits are copied from
274 /// the upper 64 bits of the first source operand.
275 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a,
276 __m128d __b) {
277 return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
280 /// Performs element-by-element comparison of the two 128-bit vectors of
281 /// [2 x double] and returns the vector containing the lesser of each pair of
282 /// values.
284 /// \headerfile <x86intrin.h>
286 /// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
288 /// \param __a
289 /// A 128-bit vector of [2 x double] containing one of the operands.
290 /// \param __b
291 /// A 128-bit vector of [2 x double] containing one of the operands.
292 /// \returns A 128-bit vector of [2 x double] containing the minimum values
293 /// between both operands.
294 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a,
295 __m128d __b) {
296 return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
299 /// Compares lower 64-bit double-precision values of both operands, and
300 /// returns the greater of the pair of values in the lower 64-bits of the
301 /// result. The upper 64 bits of the result are copied from the upper
302 /// double-precision value of the first operand.
304 /// \headerfile <x86intrin.h>
306 /// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
308 /// \param __a
309 /// A 128-bit vector of [2 x double] containing one of the operands. The
310 /// lower 64 bits of this operand are used in the comparison.
311 /// \param __b
312 /// A 128-bit vector of [2 x double] containing one of the operands. The
313 /// lower 64 bits of this operand are used in the comparison.
314 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
315 /// maximum value between both operands. The upper 64 bits are copied from
316 /// the upper 64 bits of the first source operand.
317 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a,
318 __m128d __b) {
319 return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
322 /// Performs element-by-element comparison of the two 128-bit vectors of
323 /// [2 x double] and returns the vector containing the greater of each pair
324 /// of values.
326 /// \headerfile <x86intrin.h>
328 /// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
330 /// \param __a
331 /// A 128-bit vector of [2 x double] containing one of the operands.
332 /// \param __b
333 /// A 128-bit vector of [2 x double] containing one of the operands.
334 /// \returns A 128-bit vector of [2 x double] containing the maximum values
335 /// between both operands.
336 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a,
337 __m128d __b) {
338 return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
341 /// Performs a bitwise AND of two 128-bit vectors of [2 x double].
343 /// \headerfile <x86intrin.h>
345 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
347 /// \param __a
348 /// A 128-bit vector of [2 x double] containing one of the source operands.
349 /// \param __b
350 /// A 128-bit vector of [2 x double] containing one of the source operands.
351 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
352 /// values between both operands.
353 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a,
354 __m128d __b) {
355 return (__m128d)((__v2du)__a & (__v2du)__b);
358 /// Performs a bitwise AND of two 128-bit vectors of [2 x double], using
359 /// the one's complement of the values contained in the first source operand.
361 /// \headerfile <x86intrin.h>
363 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
365 /// \param __a
366 /// A 128-bit vector of [2 x double] containing the left source operand. The
367 /// one's complement of this value is used in the bitwise AND.
368 /// \param __b
369 /// A 128-bit vector of [2 x double] containing the right source operand.
370 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
371 /// values in the second operand and the one's complement of the first
372 /// operand.
373 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a,
374 __m128d __b) {
375 return (__m128d)(~(__v2du)__a & (__v2du)__b);
378 /// Performs a bitwise OR of two 128-bit vectors of [2 x double].
380 /// \headerfile <x86intrin.h>
382 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
384 /// \param __a
385 /// A 128-bit vector of [2 x double] containing one of the source operands.
386 /// \param __b
387 /// A 128-bit vector of [2 x double] containing one of the source operands.
388 /// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
389 /// values between both operands.
390 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a,
391 __m128d __b) {
392 return (__m128d)((__v2du)__a | (__v2du)__b);
395 /// Performs a bitwise XOR of two 128-bit vectors of [2 x double].
397 /// \headerfile <x86intrin.h>
399 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
401 /// \param __a
402 /// A 128-bit vector of [2 x double] containing one of the source operands.
403 /// \param __b
404 /// A 128-bit vector of [2 x double] containing one of the source operands.
405 /// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
406 /// values between both operands.
407 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a,
408 __m128d __b) {
409 return (__m128d)((__v2du)__a ^ (__v2du)__b);
412 /// Compares each of the corresponding double-precision values of the
413 /// 128-bit vectors of [2 x double] for equality. Each comparison yields 0x0
414 /// for false, 0xFFFFFFFFFFFFFFFF for true.
416 /// \headerfile <x86intrin.h>
418 /// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
420 /// \param __a
421 /// A 128-bit vector of [2 x double].
422 /// \param __b
423 /// A 128-bit vector of [2 x double].
424 /// \returns A 128-bit vector containing the comparison results.
425 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a,
426 __m128d __b) {
427 return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
430 /// Compares each of the corresponding double-precision values of the
431 /// 128-bit vectors of [2 x double] to determine if the values in the first
432 /// operand are less than those in the second operand. Each comparison
433 /// yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
435 /// \headerfile <x86intrin.h>
437 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
439 /// \param __a
440 /// A 128-bit vector of [2 x double].
441 /// \param __b
442 /// A 128-bit vector of [2 x double].
443 /// \returns A 128-bit vector containing the comparison results.
444 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a,
445 __m128d __b) {
446 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
449 /// Compares each of the corresponding double-precision values of the
450 /// 128-bit vectors of [2 x double] to determine if the values in the first
451 /// operand are less than or equal to those in the second operand.
453 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
455 /// \headerfile <x86intrin.h>
457 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
459 /// \param __a
460 /// A 128-bit vector of [2 x double].
461 /// \param __b
462 /// A 128-bit vector of [2 x double].
463 /// \returns A 128-bit vector containing the comparison results.
464 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a,
465 __m128d __b) {
466 return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
469 /// Compares each of the corresponding double-precision values of the
470 /// 128-bit vectors of [2 x double] to determine if the values in the first
471 /// operand are greater than those in the second operand.
473 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
475 /// \headerfile <x86intrin.h>
477 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
479 /// \param __a
480 /// A 128-bit vector of [2 x double].
481 /// \param __b
482 /// A 128-bit vector of [2 x double].
483 /// \returns A 128-bit vector containing the comparison results.
484 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a,
485 __m128d __b) {
486 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
489 /// Compares each of the corresponding double-precision values of the
490 /// 128-bit vectors of [2 x double] to determine if the values in the first
491 /// operand are greater than or equal to those in the second operand.
493 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
495 /// \headerfile <x86intrin.h>
497 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
499 /// \param __a
500 /// A 128-bit vector of [2 x double].
501 /// \param __b
502 /// A 128-bit vector of [2 x double].
503 /// \returns A 128-bit vector containing the comparison results.
504 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a,
505 __m128d __b) {
506 return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
509 /// Compares each of the corresponding double-precision values of the
510 /// 128-bit vectors of [2 x double] to determine if the values in the first
511 /// operand are ordered with respect to those in the second operand.
513 /// A pair of double-precision values are "ordered" with respect to each
514 /// other if neither value is a NaN. Each comparison yields 0x0 for false,
515 /// 0xFFFFFFFFFFFFFFFF for true.
517 /// \headerfile <x86intrin.h>
519 /// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
521 /// \param __a
522 /// A 128-bit vector of [2 x double].
523 /// \param __b
524 /// A 128-bit vector of [2 x double].
525 /// \returns A 128-bit vector containing the comparison results.
526 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a,
527 __m128d __b) {
528 return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
531 /// Compares each of the corresponding double-precision values of the
532 /// 128-bit vectors of [2 x double] to determine if the values in the first
533 /// operand are unordered with respect to those in the second operand.
535 /// A pair of double-precision values are "unordered" with respect to each
536 /// other if one or both values are NaN. Each comparison yields 0x0 for
537 /// false, 0xFFFFFFFFFFFFFFFF for true.
539 /// \headerfile <x86intrin.h>
541 /// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
542 /// instruction.
544 /// \param __a
545 /// A 128-bit vector of [2 x double].
546 /// \param __b
547 /// A 128-bit vector of [2 x double].
548 /// \returns A 128-bit vector containing the comparison results.
549 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a,
550 __m128d __b) {
551 return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
554 /// Compares each of the corresponding double-precision values of the
555 /// 128-bit vectors of [2 x double] to determine if the values in the first
556 /// operand are unequal to those in the second operand.
558 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
560 /// \headerfile <x86intrin.h>
562 /// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
564 /// \param __a
565 /// A 128-bit vector of [2 x double].
566 /// \param __b
567 /// A 128-bit vector of [2 x double].
568 /// \returns A 128-bit vector containing the comparison results.
569 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a,
570 __m128d __b) {
571 return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
574 /// Compares each of the corresponding double-precision values of the
575 /// 128-bit vectors of [2 x double] to determine if the values in the first
576 /// operand are not less than those in the second operand.
578 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
580 /// \headerfile <x86intrin.h>
582 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
584 /// \param __a
585 /// A 128-bit vector of [2 x double].
586 /// \param __b
587 /// A 128-bit vector of [2 x double].
588 /// \returns A 128-bit vector containing the comparison results.
589 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a,
590 __m128d __b) {
591 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
594 /// Compares each of the corresponding double-precision values of the
595 /// 128-bit vectors of [2 x double] to determine if the values in the first
596 /// operand are not less than or equal to those in the second operand.
598 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
600 /// \headerfile <x86intrin.h>
602 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
604 /// \param __a
605 /// A 128-bit vector of [2 x double].
606 /// \param __b
607 /// A 128-bit vector of [2 x double].
608 /// \returns A 128-bit vector containing the comparison results.
609 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a,
610 __m128d __b) {
611 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
614 /// Compares each of the corresponding double-precision values of the
615 /// 128-bit vectors of [2 x double] to determine if the values in the first
616 /// operand are not greater than those in the second operand.
618 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
620 /// \headerfile <x86intrin.h>
622 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
624 /// \param __a
625 /// A 128-bit vector of [2 x double].
626 /// \param __b
627 /// A 128-bit vector of [2 x double].
628 /// \returns A 128-bit vector containing the comparison results.
629 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a,
630 __m128d __b) {
631 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
634 /// Compares each of the corresponding double-precision values of the
635 /// 128-bit vectors of [2 x double] to determine if the values in the first
636 /// operand are not greater than or equal to those in the second operand.
638 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
640 /// \headerfile <x86intrin.h>
642 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
644 /// \param __a
645 /// A 128-bit vector of [2 x double].
646 /// \param __b
647 /// A 128-bit vector of [2 x double].
648 /// \returns A 128-bit vector containing the comparison results.
649 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a,
650 __m128d __b) {
651 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
654 /// Compares the lower double-precision floating-point values in each of
655 /// the two 128-bit floating-point vectors of [2 x double] for equality.
657 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
659 /// \headerfile <x86intrin.h>
661 /// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
663 /// \param __a
664 /// A 128-bit vector of [2 x double]. The lower double-precision value is
665 /// compared to the lower double-precision value of \a __b.
666 /// \param __b
667 /// A 128-bit vector of [2 x double]. The lower double-precision value is
668 /// compared to the lower double-precision value of \a __a.
669 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
670 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
671 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a,
672 __m128d __b) {
673 return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
676 /// Compares the lower double-precision floating-point values in each of
677 /// the two 128-bit floating-point vectors of [2 x double] to determine if
678 /// the value in the first parameter is less than the corresponding value in
679 /// the second parameter.
681 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
683 /// \headerfile <x86intrin.h>
685 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
687 /// \param __a
688 /// A 128-bit vector of [2 x double]. The lower double-precision value is
689 /// compared to the lower double-precision value of \a __b.
690 /// \param __b
691 /// A 128-bit vector of [2 x double]. The lower double-precision value is
692 /// compared to the lower double-precision value of \a __a.
693 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
694 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
695 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a,
696 __m128d __b) {
697 return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
700 /// Compares the lower double-precision floating-point values in each of
701 /// the two 128-bit floating-point vectors of [2 x double] to determine if
702 /// the value in the first parameter is less than or equal to the
703 /// corresponding value in the second parameter.
705 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
707 /// \headerfile <x86intrin.h>
709 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
711 /// \param __a
712 /// A 128-bit vector of [2 x double]. The lower double-precision value is
713 /// compared to the lower double-precision value of \a __b.
714 /// \param __b
715 /// A 128-bit vector of [2 x double]. The lower double-precision value is
716 /// compared to the lower double-precision value of \a __a.
717 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
718 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
719 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a,
720 __m128d __b) {
721 return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
724 /// Compares the lower double-precision floating-point values in each of
725 /// the two 128-bit floating-point vectors of [2 x double] to determine if
726 /// the value in the first parameter is greater than the corresponding value
727 /// in the second parameter.
729 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
731 /// \headerfile <x86intrin.h>
733 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
735 /// \param __a
736 /// A 128-bit vector of [2 x double]. The lower double-precision value is
737 /// compared to the lower double-precision value of \a __b.
738 /// \param __b
739 /// A 128-bit vector of [2 x double]. The lower double-precision value is
740 /// compared to the lower double-precision value of \a __a.
741 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
742 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
743 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a,
744 __m128d __b) {
745 __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
746 return __extension__(__m128d){__c[0], __a[1]};
749 /// Compares the lower double-precision floating-point values in each of
750 /// the two 128-bit floating-point vectors of [2 x double] to determine if
751 /// the value in the first parameter is greater than or equal to the
752 /// corresponding value in the second parameter.
754 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
756 /// \headerfile <x86intrin.h>
758 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
760 /// \param __a
761 /// A 128-bit vector of [2 x double]. The lower double-precision value is
762 /// compared to the lower double-precision value of \a __b.
763 /// \param __b
764 /// A 128-bit vector of [2 x double]. The lower double-precision value is
765 /// compared to the lower double-precision value of \a __a.
766 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
767 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
768 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a,
769 __m128d __b) {
770 __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
771 return __extension__(__m128d){__c[0], __a[1]};
774 /// Compares the lower double-precision floating-point values in each of
775 /// the two 128-bit floating-point vectors of [2 x double] to determine if
776 /// the value in the first parameter is "ordered" with respect to the
777 /// corresponding value in the second parameter.
779 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
780 /// of double-precision values are "ordered" with respect to each other if
781 /// neither value is a NaN.
783 /// \headerfile <x86intrin.h>
785 /// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
787 /// \param __a
788 /// A 128-bit vector of [2 x double]. The lower double-precision value is
789 /// compared to the lower double-precision value of \a __b.
790 /// \param __b
791 /// A 128-bit vector of [2 x double]. The lower double-precision value is
792 /// compared to the lower double-precision value of \a __a.
793 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
794 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
795 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a,
796 __m128d __b) {
797 return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
800 /// Compares the lower double-precision floating-point values in each of
801 /// the two 128-bit floating-point vectors of [2 x double] to determine if
802 /// the value in the first parameter is "unordered" with respect to the
803 /// corresponding value in the second parameter.
805 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
806 /// of double-precision values are "unordered" with respect to each other if
807 /// one or both values are NaN.
809 /// \headerfile <x86intrin.h>
811 /// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
812 /// instruction.
814 /// \param __a
815 /// A 128-bit vector of [2 x double]. The lower double-precision value is
816 /// compared to the lower double-precision value of \a __b.
817 /// \param __b
818 /// A 128-bit vector of [2 x double]. The lower double-precision value is
819 /// compared to the lower double-precision value of \a __a.
820 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
821 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
822 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a,
823 __m128d __b) {
824 return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
827 /// Compares the lower double-precision floating-point values in each of
828 /// the two 128-bit floating-point vectors of [2 x double] to determine if
829 /// the value in the first parameter is unequal to the corresponding value in
830 /// the second parameter.
832 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
834 /// \headerfile <x86intrin.h>
836 /// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
838 /// \param __a
839 /// A 128-bit vector of [2 x double]. The lower double-precision value is
840 /// compared to the lower double-precision value of \a __b.
841 /// \param __b
842 /// A 128-bit vector of [2 x double]. The lower double-precision value is
843 /// compared to the lower double-precision value of \a __a.
844 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
845 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
846 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a,
847 __m128d __b) {
848 return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
851 /// Compares the lower double-precision floating-point values in each of
852 /// the two 128-bit floating-point vectors of [2 x double] to determine if
853 /// the value in the first parameter is not less than the corresponding
854 /// value in the second parameter.
856 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
858 /// \headerfile <x86intrin.h>
860 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
862 /// \param __a
863 /// A 128-bit vector of [2 x double]. The lower double-precision value is
864 /// compared to the lower double-precision value of \a __b.
865 /// \param __b
866 /// A 128-bit vector of [2 x double]. The lower double-precision value is
867 /// compared to the lower double-precision value of \a __a.
868 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
869 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
870 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a,
871 __m128d __b) {
872 return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
875 /// Compares the lower double-precision floating-point values in each of
876 /// the two 128-bit floating-point vectors of [2 x double] to determine if
877 /// the value in the first parameter is not less than or equal to the
878 /// corresponding value in the second parameter.
880 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
882 /// \headerfile <x86intrin.h>
884 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
886 /// \param __a
887 /// A 128-bit vector of [2 x double]. The lower double-precision value is
888 /// compared to the lower double-precision value of \a __b.
889 /// \param __b
890 /// A 128-bit vector of [2 x double]. The lower double-precision value is
891 /// compared to the lower double-precision value of \a __a.
892 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
893 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
894 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a,
895 __m128d __b) {
896 return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
899 /// Compares the lower double-precision floating-point values in each of
900 /// the two 128-bit floating-point vectors of [2 x double] to determine if
901 /// the value in the first parameter is not greater than the corresponding
902 /// value in the second parameter.
904 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
906 /// \headerfile <x86intrin.h>
908 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
910 /// \param __a
911 /// A 128-bit vector of [2 x double]. The lower double-precision value is
912 /// compared to the lower double-precision value of \a __b.
913 /// \param __b
914 /// A 128-bit vector of [2 x double]. The lower double-precision value is
915 /// compared to the lower double-precision value of \a __a.
916 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
917 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
918 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a,
919 __m128d __b) {
920 __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
921 return __extension__(__m128d){__c[0], __a[1]};
924 /// Compares the lower double-precision floating-point values in each of
925 /// the two 128-bit floating-point vectors of [2 x double] to determine if
926 /// the value in the first parameter is not greater than or equal to the
927 /// corresponding value in the second parameter.
929 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
931 /// \headerfile <x86intrin.h>
933 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
935 /// \param __a
936 /// A 128-bit vector of [2 x double]. The lower double-precision value is
937 /// compared to the lower double-precision value of \a __b.
938 /// \param __b
939 /// A 128-bit vector of [2 x double]. The lower double-precision value is
940 /// compared to the lower double-precision value of \a __a.
941 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
942 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
943 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a,
944 __m128d __b) {
945 __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
946 return __extension__(__m128d){__c[0], __a[1]};
949 /// Compares the lower double-precision floating-point values in each of
950 /// the two 128-bit floating-point vectors of [2 x double] for equality.
952 /// The comparison yields 0 for false, 1 for true. If either of the two
953 /// lower double-precision values is NaN, 0 is returned.
955 /// \headerfile <x86intrin.h>
957 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
959 /// \param __a
960 /// A 128-bit vector of [2 x double]. The lower double-precision value is
961 /// compared to the lower double-precision value of \a __b.
962 /// \param __b
963 /// A 128-bit vector of [2 x double]. The lower double-precision value is
964 /// compared to the lower double-precision value of \a __a.
965 /// \returns An integer containing the comparison results. If either of the two
966 /// lower double-precision values is NaN, 0 is returned.
967 static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a,
968 __m128d __b) {
969 return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
972 /// Compares the lower double-precision floating-point values in each of
973 /// the two 128-bit floating-point vectors of [2 x double] to determine if
974 /// the value in the first parameter is less than the corresponding value in
975 /// the second parameter.
977 /// The comparison yields 0 for false, 1 for true. If either of the two
978 /// lower double-precision values is NaN, 0 is returned.
980 /// \headerfile <x86intrin.h>
982 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
984 /// \param __a
985 /// A 128-bit vector of [2 x double]. The lower double-precision value is
986 /// compared to the lower double-precision value of \a __b.
987 /// \param __b
988 /// A 128-bit vector of [2 x double]. The lower double-precision value is
989 /// compared to the lower double-precision value of \a __a.
990 /// \returns An integer containing the comparison results. If either of the two
991 /// lower double-precision values is NaN, 0 is returned.
992 static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a,
993 __m128d __b) {
994 return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
997 /// Compares the lower double-precision floating-point values in each of
998 /// the two 128-bit floating-point vectors of [2 x double] to determine if
999 /// the value in the first parameter is less than or equal to the
1000 /// corresponding value in the second parameter.
1002 /// The comparison yields 0 for false, 1 for true. If either of the two
1003 /// lower double-precision values is NaN, 0 is returned.
1005 /// \headerfile <x86intrin.h>
1007 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1009 /// \param __a
1010 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1011 /// compared to the lower double-precision value of \a __b.
1012 /// \param __b
1013 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1014 /// compared to the lower double-precision value of \a __a.
1015 /// \returns An integer containing the comparison results. If either of the two
1016 /// lower double-precision values is NaN, 0 is returned.
1017 static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a,
1018 __m128d __b) {
1019 return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
1022 /// Compares the lower double-precision floating-point values in each of
1023 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1024 /// the value in the first parameter is greater than the corresponding value
1025 /// in the second parameter.
1027 /// The comparison yields 0 for false, 1 for true. If either of the two
1028 /// lower double-precision values is NaN, 0 is returned.
1030 /// \headerfile <x86intrin.h>
1032 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1034 /// \param __a
1035 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1036 /// compared to the lower double-precision value of \a __b.
1037 /// \param __b
1038 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1039 /// compared to the lower double-precision value of \a __a.
1040 /// \returns An integer containing the comparison results. If either of the two
1041 /// lower double-precision values is NaN, 0 is returned.
1042 static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a,
1043 __m128d __b) {
1044 return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
1047 /// Compares the lower double-precision floating-point values in each of
1048 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1049 /// the value in the first parameter is greater than or equal to the
1050 /// corresponding value in the second parameter.
1052 /// The comparison yields 0 for false, 1 for true. If either of the two
1053 /// lower double-precision values is NaN, 0 is returned.
1055 /// \headerfile <x86intrin.h>
1057 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1059 /// \param __a
1060 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1061 /// compared to the lower double-precision value of \a __b.
1062 /// \param __b
1063 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1064 /// compared to the lower double-precision value of \a __a.
1065 /// \returns An integer containing the comparison results. If either of the two
1066 /// lower double-precision values is NaN, 0 is returned.
1067 static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a,
1068 __m128d __b) {
1069 return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
1072 /// Compares the lower double-precision floating-point values in each of
1073 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1074 /// the value in the first parameter is unequal to the corresponding value in
1075 /// the second parameter.
1077 /// The comparison yields 0 for false, 1 for true. If either of the two
1078 /// lower double-precision values is NaN, 1 is returned.
1080 /// \headerfile <x86intrin.h>
1082 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1084 /// \param __a
1085 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1086 /// compared to the lower double-precision value of \a __b.
1087 /// \param __b
1088 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1089 /// compared to the lower double-precision value of \a __a.
1090 /// \returns An integer containing the comparison results. If either of the two
1091 /// lower double-precision values is NaN, 1 is returned.
1092 static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a,
1093 __m128d __b) {
1094 return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
1097 /// Compares the lower double-precision floating-point values in each of
1098 /// the two 128-bit floating-point vectors of [2 x double] for equality. The
1099 /// comparison yields 0 for false, 1 for true.
1101 /// If either of the two lower double-precision values is NaN, 0 is returned.
1103 /// \headerfile <x86intrin.h>
1105 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1107 /// \param __a
1108 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1109 /// compared to the lower double-precision value of \a __b.
1110 /// \param __b
1111 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1112 /// compared to the lower double-precision value of \a __a.
1113 /// \returns An integer containing the comparison results. If either of the two
1114 /// lower double-precision values is NaN, 0 is returned.
1115 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a,
1116 __m128d __b) {
1117 return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
1120 /// Compares the lower double-precision floating-point values in each of
1121 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1122 /// the value in the first parameter is less than the corresponding value in
1123 /// the second parameter.
1125 /// The comparison yields 0 for false, 1 for true. If either of the two lower
1126 /// double-precision values is NaN, 0 is returned.
1128 /// \headerfile <x86intrin.h>
1130 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1132 /// \param __a
1133 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1134 /// compared to the lower double-precision value of \a __b.
1135 /// \param __b
1136 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1137 /// compared to the lower double-precision value of \a __a.
1138 /// \returns An integer containing the comparison results. If either of the two
1139 /// lower double-precision values is NaN, 0 is returned.
1140 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a,
1141 __m128d __b) {
1142 return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
1145 /// Compares the lower double-precision floating-point values in each of
1146 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1147 /// the value in the first parameter is less than or equal to the
1148 /// corresponding value in the second parameter.
1150 /// The comparison yields 0 for false, 1 for true. If either of the two lower
1151 /// double-precision values is NaN, 0 is returned.
1153 /// \headerfile <x86intrin.h>
1155 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1157 /// \param __a
1158 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1159 /// compared to the lower double-precision value of \a __b.
1160 /// \param __b
1161 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1162 /// compared to the lower double-precision value of \a __a.
1163 /// \returns An integer containing the comparison results. If either of the two
1164 /// lower double-precision values is NaN, 0 is returned.
1165 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a,
1166 __m128d __b) {
1167 return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
1170 /// Compares the lower double-precision floating-point values in each of
1171 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1172 /// the value in the first parameter is greater than the corresponding value
1173 /// in the second parameter.
1175 /// The comparison yields 0 for false, 1 for true. If either of the two lower
1176 /// double-precision values is NaN, 0 is returned.
1178 /// \headerfile <x86intrin.h>
1180 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1182 /// \param __a
1183 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1184 /// compared to the lower double-precision value of \a __b.
1185 /// \param __b
1186 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1187 /// compared to the lower double-precision value of \a __a.
1188 /// \returns An integer containing the comparison results. If either of the two
1189 /// lower double-precision values is NaN, 0 is returned.
1190 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a,
1191 __m128d __b) {
1192 return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
1195 /// Compares the lower double-precision floating-point values in each of
1196 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1197 /// the value in the first parameter is greater than or equal to the
1198 /// corresponding value in the second parameter.
1200 /// The comparison yields 0 for false, 1 for true. If either of the two
1201 /// lower double-precision values is NaN, 0 is returned.
1203 /// \headerfile <x86intrin.h>
1205 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1207 /// \param __a
1208 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1209 /// compared to the lower double-precision value of \a __b.
1210 /// \param __b
1211 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1212 /// compared to the lower double-precision value of \a __a.
1213 /// \returns An integer containing the comparison results. If either of the two
1214 /// lower double-precision values is NaN, 0 is returned.
1215 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a,
1216 __m128d __b) {
1217 return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
1220 /// Compares the lower double-precision floating-point values in each of
1221 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1222 /// the value in the first parameter is unequal to the corresponding value in
1223 /// the second parameter.
1225 /// The comparison yields 0 for false, 1 for true. If either of the two lower
1226 /// double-precision values is NaN, 1 is returned.
1228 /// \headerfile <x86intrin.h>
1230 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1232 /// \param __a
1233 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1234 /// compared to the lower double-precision value of \a __b.
1235 /// \param __b
1236 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1237 /// compared to the lower double-precision value of \a __a.
1238 /// \returns An integer containing the comparison result. If either of the two
1239 /// lower double-precision values is NaN, 1 is returned.
1240 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a,
1241 __m128d __b) {
1242 return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
1245 /// Converts the two double-precision floating-point elements of a
1246 /// 128-bit vector of [2 x double] into two single-precision floating-point
1247 /// values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
1248 /// The upper 64 bits of the result vector are set to zero.
1250 /// \headerfile <x86intrin.h>
1252 /// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
1254 /// \param __a
1255 /// A 128-bit vector of [2 x double].
1256 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1257 /// converted values. The upper 64 bits are set to zero.
1258 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a) {
1259 return __builtin_ia32_cvtpd2ps((__v2df)__a);
1262 /// Converts the lower two single-precision floating-point elements of a
1263 /// 128-bit vector of [4 x float] into two double-precision floating-point
1264 /// values, returned in a 128-bit vector of [2 x double]. The upper two
1265 /// elements of the input vector are unused.
1267 /// \headerfile <x86intrin.h>
1269 /// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
1271 /// \param __a
1272 /// A 128-bit vector of [4 x float]. The lower two single-precision
1273 /// floating-point elements are converted to double-precision values. The
1274 /// upper two elements are unused.
1275 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1276 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a) {
1277 return (__m128d) __builtin_convertvector(
1278 __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
1281 /// Converts the lower two integer elements of a 128-bit vector of
1282 /// [4 x i32] into two double-precision floating-point values, returned in a
1283 /// 128-bit vector of [2 x double].
1285 /// The upper two elements of the input vector are unused.
1287 /// \headerfile <x86intrin.h>
1289 /// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
1291 /// \param __a
1292 /// A 128-bit integer vector of [4 x i32]. The lower two integer elements are
1293 /// converted to double-precision values.
1295 /// The upper two elements are unused.
1296 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1297 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a) {
1298 return (__m128d) __builtin_convertvector(
1299 __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
1302 /// Converts the two double-precision floating-point elements of a
1303 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1304 /// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
1305 /// 64 bits of the result vector are set to zero.
1307 /// \headerfile <x86intrin.h>
1309 /// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
1311 /// \param __a
1312 /// A 128-bit vector of [2 x double].
1313 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1314 /// converted values. The upper 64 bits are set to zero.
1315 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a) {
1316 return __builtin_ia32_cvtpd2dq((__v2df)__a);
1319 /// Converts the low-order element of a 128-bit vector of [2 x double]
1320 /// into a 32-bit signed integer value.
1322 /// \headerfile <x86intrin.h>
1324 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
1326 /// \param __a
1327 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1328 /// conversion.
1329 /// \returns A 32-bit signed integer containing the converted value.
1330 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a) {
1331 return __builtin_ia32_cvtsd2si((__v2df)__a);
1334 /// Converts the lower double-precision floating-point element of a
1335 /// 128-bit vector of [2 x double], in the second parameter, into a
1336 /// single-precision floating-point value, returned in the lower 32 bits of a
1337 /// 128-bit vector of [4 x float]. The upper 96 bits of the result vector are
1338 /// copied from the upper 96 bits of the first parameter.
1340 /// \headerfile <x86intrin.h>
1342 /// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
1344 /// \param __a
1345 /// A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
1346 /// copied to the upper 96 bits of the result.
1347 /// \param __b
1348 /// A 128-bit vector of [2 x double]. The lower double-precision
1349 /// floating-point element is used in the conversion.
1350 /// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
1351 /// converted value from the second parameter. The upper 96 bits are copied
1352 /// from the upper 96 bits of the first parameter.
1353 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a,
1354 __m128d __b) {
1355 return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
1358 /// Converts a 32-bit signed integer value, in the second parameter, into
1359 /// a double-precision floating-point value, returned in the lower 64 bits of
1360 /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1361 /// are copied from the upper 64 bits of the first parameter.
1363 /// \headerfile <x86intrin.h>
1365 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
1367 /// \param __a
1368 /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1369 /// copied to the upper 64 bits of the result.
1370 /// \param __b
1371 /// A 32-bit signed integer containing the value to be converted.
1372 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1373 /// converted value from the second parameter. The upper 64 bits are copied
1374 /// from the upper 64 bits of the first parameter.
1375 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a,
1376 int __b) {
1377 __a[0] = __b;
1378 return __a;
1381 /// Converts the lower single-precision floating-point element of a
1382 /// 128-bit vector of [4 x float], in the second parameter, into a
1383 /// double-precision floating-point value, returned in the lower 64 bits of
1384 /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1385 /// are copied from the upper 64 bits of the first parameter.
1387 /// \headerfile <x86intrin.h>
1389 /// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
1391 /// \param __a
1392 /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1393 /// copied to the upper 64 bits of the result.
1394 /// \param __b
1395 /// A 128-bit vector of [4 x float]. The lower single-precision
1396 /// floating-point element is used in the conversion.
1397 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1398 /// converted value from the second parameter. The upper 64 bits are copied
1399 /// from the upper 64 bits of the first parameter.
1400 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a,
1401 __m128 __b) {
1402 __a[0] = __b[0];
1403 return __a;
1406 /// Converts the two double-precision floating-point elements of a
1407 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1408 /// returned in the lower 64 bits of a 128-bit vector of [4 x i32].
1410 /// If the result of either conversion is inexact, the result is truncated
1411 /// (rounded towards zero) regardless of the current MXCSR setting. The upper
1412 /// 64 bits of the result vector are set to zero.
1414 /// \headerfile <x86intrin.h>
1416 /// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
1417 /// instruction.
1419 /// \param __a
1420 /// A 128-bit vector of [2 x double].
1421 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1422 /// converted values. The upper 64 bits are set to zero.
1423 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a) {
1424 return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
1427 /// Converts the low-order element of a [2 x double] vector into a 32-bit
1428 /// signed integer value, truncating the result when it is inexact.
1430 /// \headerfile <x86intrin.h>
1432 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
1433 /// instruction.
1435 /// \param __a
1436 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1437 /// conversion.
1438 /// \returns A 32-bit signed integer containing the converted value.
1439 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a) {
1440 return __builtin_ia32_cvttsd2si((__v2df)__a);
1443 /// Converts the two double-precision floating-point elements of a
1444 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1445 /// returned in a 64-bit vector of [2 x i32].
1447 /// \headerfile <x86intrin.h>
1449 /// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
1451 /// \param __a
1452 /// A 128-bit vector of [2 x double].
1453 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
1454 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a) {
1455 return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
1458 /// Converts the two double-precision floating-point elements of a
1459 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1460 /// returned in a 64-bit vector of [2 x i32].
1462 /// If the result of either conversion is inexact, the result is truncated
1463 /// (rounded towards zero) regardless of the current MXCSR setting.
1465 /// \headerfile <x86intrin.h>
1467 /// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
1469 /// \param __a
1470 /// A 128-bit vector of [2 x double].
1471 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
1472 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a) {
1473 return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
1476 /// Converts the two signed 32-bit integer elements of a 64-bit vector of
1477 /// [2 x i32] into two double-precision floating-point values, returned in a
1478 /// 128-bit vector of [2 x double].
1480 /// \headerfile <x86intrin.h>
1482 /// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
1484 /// \param __a
1485 /// A 64-bit vector of [2 x i32].
1486 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1487 static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a) {
1488 return __builtin_ia32_cvtpi2pd((__v2si)__a);
1491 /// Returns the low-order element of a 128-bit vector of [2 x double] as
1492 /// a double-precision floating-point value.
1494 /// \headerfile <x86intrin.h>
1496 /// This intrinsic has no corresponding instruction.
1498 /// \param __a
1499 /// A 128-bit vector of [2 x double]. The lower 64 bits are returned.
1500 /// \returns A double-precision floating-point value copied from the lower 64
1501 /// bits of \a __a.
1502 static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a) {
1503 return __a[0];
1506 /// Loads a 128-bit floating-point vector of [2 x double] from an aligned
1507 /// memory location.
1509 /// \headerfile <x86intrin.h>
1511 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
1513 /// \param __dp
1514 /// A pointer to a 128-bit memory location. The address of the memory
1515 /// location has to be 16-byte aligned.
1516 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
1517 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp) {
1518 return *(const __m128d *)__dp;
1521 /// Loads a double-precision floating-point value from a specified memory
1522 /// location and duplicates it to both vector elements of a 128-bit vector of
1523 /// [2 x double].
1525 /// \headerfile <x86intrin.h>
1527 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
1529 /// \param __dp
1530 /// A pointer to a memory location containing a double-precision value.
1531 /// \returns A 128-bit vector of [2 x double] containing the loaded and
1532 /// duplicated values.
1533 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp) {
1534 struct __mm_load1_pd_struct {
1535 double __u;
1536 } __attribute__((__packed__, __may_alias__));
1537 double __u = ((const struct __mm_load1_pd_struct *)__dp)->__u;
1538 return __extension__(__m128d){__u, __u};
1541 #define _mm_load_pd1(dp) _mm_load1_pd(dp)
1543 /// Loads two double-precision values, in reverse order, from an aligned
1544 /// memory location into a 128-bit vector of [2 x double].
1546 /// \headerfile <x86intrin.h>
1548 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
1549 /// needed shuffling instructions. In AVX mode, the shuffling may be combined
1550 /// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
1552 /// \param __dp
1553 /// A 16-byte aligned pointer to an array of double-precision values to be
1554 /// loaded in reverse order.
1555 /// \returns A 128-bit vector of [2 x double] containing the reversed loaded
1556 /// values.
1557 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp) {
1558 __m128d __u = *(const __m128d *)__dp;
1559 return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
1562 /// Loads a 128-bit floating-point vector of [2 x double] from an
1563 /// unaligned memory location.
1565 /// \headerfile <x86intrin.h>
1567 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1569 /// \param __dp
1570 /// A pointer to a 128-bit memory location. The address of the memory
1571 /// location does not have to be aligned.
1572 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
1573 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp) {
1574 struct __loadu_pd {
1575 __m128d_u __v;
1576 } __attribute__((__packed__, __may_alias__));
1577 return ((const struct __loadu_pd *)__dp)->__v;
1580 /// Loads a 64-bit integer value to the low element of a 128-bit integer
1581 /// vector and clears the upper element.
1583 /// \headerfile <x86intrin.h>
1585 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1587 /// \param __a
1588 /// A pointer to a 64-bit memory location. The address of the memory
1589 /// location does not have to be aligned.
1590 /// \returns A 128-bit vector of [2 x i64] containing the loaded value.
1591 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si64(void const *__a) {
1592 struct __loadu_si64 {
1593 long long __v;
1594 } __attribute__((__packed__, __may_alias__));
1595 long long __u = ((const struct __loadu_si64 *)__a)->__v;
1596 return __extension__(__m128i)(__v2di){__u, 0LL};
1599 /// Loads a 32-bit integer value to the low element of a 128-bit integer
1600 /// vector and clears the upper element.
1602 /// \headerfile <x86intrin.h>
1604 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
1606 /// \param __a
1607 /// A pointer to a 32-bit memory location. The address of the memory
1608 /// location does not have to be aligned.
1609 /// \returns A 128-bit vector of [4 x i32] containing the loaded value.
1610 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si32(void const *__a) {
1611 struct __loadu_si32 {
1612 int __v;
1613 } __attribute__((__packed__, __may_alias__));
1614 int __u = ((const struct __loadu_si32 *)__a)->__v;
1615 return __extension__(__m128i)(__v4si){__u, 0, 0, 0};
1618 /// Loads a 16-bit integer value to the low element of a 128-bit integer
1619 /// vector and clears the upper element.
1621 /// \headerfile <x86intrin.h>
1623 /// This intrinsic does not correspond to a specific instruction.
1625 /// \param __a
1626 /// A pointer to a 16-bit memory location. The address of the memory
1627 /// location does not have to be aligned.
1628 /// \returns A 128-bit vector of [8 x i16] containing the loaded value.
1629 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si16(void const *__a) {
1630 struct __loadu_si16 {
1631 short __v;
1632 } __attribute__((__packed__, __may_alias__));
1633 short __u = ((const struct __loadu_si16 *)__a)->__v;
1634 return __extension__(__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
1637 /// Loads a 64-bit double-precision value to the low element of a
1638 /// 128-bit integer vector and clears the upper element.
1640 /// \headerfile <x86intrin.h>
1642 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1644 /// \param __dp
1645 /// A pointer to a memory location containing a double-precision value.
1646 /// The address of the memory location does not have to be aligned.
1647 /// \returns A 128-bit vector of [2 x double] containing the loaded value.
1648 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp) {
1649 struct __mm_load_sd_struct {
1650 double __u;
1651 } __attribute__((__packed__, __may_alias__));
1652 double __u = ((const struct __mm_load_sd_struct *)__dp)->__u;
1653 return __extension__(__m128d){__u, 0};
1656 /// Loads a double-precision value into the high-order bits of a 128-bit
1657 /// vector of [2 x double]. The low-order bits are copied from the low-order
1658 /// bits of the first operand.
1660 /// \headerfile <x86intrin.h>
1662 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1664 /// \param __a
1665 /// A 128-bit vector of [2 x double]. \n
1666 /// Bits [63:0] are written to bits [63:0] of the result.
1667 /// \param __dp
1668 /// A pointer to a 64-bit memory location containing a double-precision
1669 /// floating-point value that is loaded. The loaded value is written to bits
1670 /// [127:64] of the result. The address of the memory location does not have
1671 /// to be aligned.
1672 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1673 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a,
1674 double const *__dp) {
1675 struct __mm_loadh_pd_struct {
1676 double __u;
1677 } __attribute__((__packed__, __may_alias__));
1678 double __u = ((const struct __mm_loadh_pd_struct *)__dp)->__u;
1679 return __extension__(__m128d){__a[0], __u};
1682 /// Loads a double-precision value into the low-order bits of a 128-bit
1683 /// vector of [2 x double]. The high-order bits are copied from the
1684 /// high-order bits of the first operand.
1686 /// \headerfile <x86intrin.h>
1688 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1690 /// \param __a
1691 /// A 128-bit vector of [2 x double]. \n
1692 /// Bits [127:64] are written to bits [127:64] of the result.
1693 /// \param __dp
1694 /// A pointer to a 64-bit memory location containing a double-precision
1695 /// floating-point value that is loaded. The loaded value is written to bits
1696 /// [63:0] of the result. The address of the memory location does not have to
1697 /// be aligned.
1698 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1699 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a,
1700 double const *__dp) {
1701 struct __mm_loadl_pd_struct {
1702 double __u;
1703 } __attribute__((__packed__, __may_alias__));
1704 double __u = ((const struct __mm_loadl_pd_struct *)__dp)->__u;
1705 return __extension__(__m128d){__u, __a[1]};
1708 /// Constructs a 128-bit floating-point vector of [2 x double] with
1709 /// unspecified content. This could be used as an argument to another
1710 /// intrinsic function where the argument is required but the value is not
1711 /// actually used.
1713 /// \headerfile <x86intrin.h>
1715 /// This intrinsic has no corresponding instruction.
1717 /// \returns A 128-bit floating-point vector of [2 x double] with unspecified
1718 /// content.
1719 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void) {
1720 return (__m128d)__builtin_ia32_undef128();
1723 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1724 /// 64 bits of the vector are initialized with the specified double-precision
1725 /// floating-point value. The upper 64 bits are set to zero.
1727 /// \headerfile <x86intrin.h>
1729 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1731 /// \param __w
1732 /// A double-precision floating-point value used to initialize the lower 64
1733 /// bits of the result.
1734 /// \returns An initialized 128-bit floating-point vector of [2 x double]. The
1735 /// lower 64 bits contain the value of the parameter. The upper 64 bits are
1736 /// set to zero.
1737 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w) {
1738 return __extension__(__m128d){__w, 0};
1741 /// Constructs a 128-bit floating-point vector of [2 x double], with each
1742 /// of the two double-precision floating-point vector elements set to the
1743 /// specified double-precision floating-point value.
1745 /// \headerfile <x86intrin.h>
1747 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1749 /// \param __w
1750 /// A double-precision floating-point value used to initialize each vector
1751 /// element of the result.
1752 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1753 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w) {
1754 return __extension__(__m128d){__w, __w};
1757 /// Constructs a 128-bit floating-point vector of [2 x double], with each
1758 /// of the two double-precision floating-point vector elements set to the
1759 /// specified double-precision floating-point value.
1761 /// \headerfile <x86intrin.h>
1763 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1765 /// \param __w
1766 /// A double-precision floating-point value used to initialize each vector
1767 /// element of the result.
1768 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1769 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w) {
1770 return _mm_set1_pd(__w);
1773 /// Constructs a 128-bit floating-point vector of [2 x double]
1774 /// initialized with the specified double-precision floating-point values.
1776 /// \headerfile <x86intrin.h>
1778 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1780 /// \param __w
1781 /// A double-precision floating-point value used to initialize the upper 64
1782 /// bits of the result.
1783 /// \param __x
1784 /// A double-precision floating-point value used to initialize the lower 64
1785 /// bits of the result.
1786 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1787 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w,
1788 double __x) {
1789 return __extension__(__m128d){__x, __w};
1792 /// Constructs a 128-bit floating-point vector of [2 x double],
1793 /// initialized in reverse order with the specified double-precision
1794 /// floating-point values.
1796 /// \headerfile <x86intrin.h>
1798 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1800 /// \param __w
1801 /// A double-precision floating-point value used to initialize the lower 64
1802 /// bits of the result.
1803 /// \param __x
1804 /// A double-precision floating-point value used to initialize the upper 64
1805 /// bits of the result.
1806 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1807 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w,
1808 double __x) {
1809 return __extension__(__m128d){__w, __x};
1812 /// Constructs a 128-bit floating-point vector of [2 x double]
1813 /// initialized to zero.
1815 /// \headerfile <x86intrin.h>
1817 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1819 /// \returns An initialized 128-bit floating-point vector of [2 x double] with
1820 /// all elements set to zero.
1821 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void) {
1822 return __extension__(__m128d){0.0, 0.0};
1825 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1826 /// 64 bits are set to the lower 64 bits of the second parameter. The upper
1827 /// 64 bits are set to the upper 64 bits of the first parameter.
1829 /// \headerfile <x86intrin.h>
1831 /// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
1833 /// \param __a
1834 /// A 128-bit vector of [2 x double]. The upper 64 bits are written to the
1835 /// upper 64 bits of the result.
1836 /// \param __b
1837 /// A 128-bit vector of [2 x double]. The lower 64 bits are written to the
1838 /// lower 64 bits of the result.
1839 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1840 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a,
1841 __m128d __b) {
1842 __a[0] = __b[0];
1843 return __a;
1846 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1847 /// memory location.
1849 /// \headerfile <x86intrin.h>
1851 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1853 /// \param __dp
1854 /// A pointer to a 64-bit memory location.
1855 /// \param __a
1856 /// A 128-bit vector of [2 x double] containing the value to be stored.
1857 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp,
1858 __m128d __a) {
1859 struct __mm_store_sd_struct {
1860 double __u;
1861 } __attribute__((__packed__, __may_alias__));
1862 ((struct __mm_store_sd_struct *)__dp)->__u = __a[0];
1865 /// Moves packed double-precision values from a 128-bit vector of
1866 /// [2 x double] to a memory location.
1868 /// \headerfile <x86intrin.h>
1870 /// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction.
1872 /// \param __dp
1873 /// A pointer to an aligned memory location that can store two
1874 /// double-precision values.
1875 /// \param __a
1876 /// A packed 128-bit vector of [2 x double] containing the values to be
1877 /// moved.
1878 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp,
1879 __m128d __a) {
1880 *(__m128d *)__dp = __a;
1883 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1884 /// the upper and lower 64 bits of a memory location.
1886 /// \headerfile <x86intrin.h>
1888 /// This intrinsic corresponds to the
1889 /// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1891 /// \param __dp
1892 /// A pointer to a memory location that can store two double-precision
1893 /// values.
1894 /// \param __a
1895 /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1896 /// of the values in \a __dp.
1897 static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp,
1898 __m128d __a) {
1899 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
1900 _mm_store_pd(__dp, __a);
1903 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1904 /// the upper and lower 64 bits of a memory location.
1906 /// \headerfile <x86intrin.h>
1908 /// This intrinsic corresponds to the
1909 /// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1911 /// \param __dp
1912 /// A pointer to a memory location that can store two double-precision
1913 /// values.
1914 /// \param __a
1915 /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1916 /// of the values in \a __dp.
1917 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp,
1918 __m128d __a) {
1919 _mm_store1_pd(__dp, __a);
1922 /// Stores a 128-bit vector of [2 x double] into an unaligned memory
1923 /// location.
1925 /// \headerfile <x86intrin.h>
1927 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1929 /// \param __dp
1930 /// A pointer to a 128-bit memory location. The address of the memory
1931 /// location does not have to be aligned.
1932 /// \param __a
1933 /// A 128-bit vector of [2 x double] containing the values to be stored.
1934 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp,
1935 __m128d __a) {
1936 struct __storeu_pd {
1937 __m128d_u __v;
1938 } __attribute__((__packed__, __may_alias__));
1939 ((struct __storeu_pd *)__dp)->__v = __a;
1942 /// Stores two double-precision values, in reverse order, from a 128-bit
1943 /// vector of [2 x double] to a 16-byte aligned memory location.
1945 /// \headerfile <x86intrin.h>
1947 /// This intrinsic corresponds to a shuffling instruction followed by a
1948 /// <c> VMOVAPD / MOVAPD </c> instruction.
1950 /// \param __dp
1951 /// A pointer to a 16-byte aligned memory location that can store two
1952 /// double-precision values.
1953 /// \param __a
1954 /// A 128-bit vector of [2 x double] containing the values to be reversed and
1955 /// stored.
1956 static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp,
1957 __m128d __a) {
1958 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
1959 *(__m128d *)__dp = __a;
1962 /// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
1963 /// memory location.
1965 /// \headerfile <x86intrin.h>
1967 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1969 /// \param __dp
1970 /// A pointer to a 64-bit memory location.
1971 /// \param __a
1972 /// A 128-bit vector of [2 x double] containing the value to be stored.
1973 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp,
1974 __m128d __a) {
1975 struct __mm_storeh_pd_struct {
1976 double __u;
1977 } __attribute__((__packed__, __may_alias__));
1978 ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[1];
1981 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1982 /// memory location.
1984 /// \headerfile <x86intrin.h>
1986 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1988 /// \param __dp
1989 /// A pointer to a 64-bit memory location.
1990 /// \param __a
1991 /// A 128-bit vector of [2 x double] containing the value to be stored.
1992 static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp,
1993 __m128d __a) {
1994 struct __mm_storeh_pd_struct {
1995 double __u;
1996 } __attribute__((__packed__, __may_alias__));
1997 ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[0];
2000 /// Adds the corresponding elements of two 128-bit vectors of [16 x i8],
2001 /// saving the lower 8 bits of each sum in the corresponding element of a
2002 /// 128-bit result vector of [16 x i8].
2004 /// The integer elements of both parameters can be either signed or unsigned.
2006 /// \headerfile <x86intrin.h>
2008 /// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
2010 /// \param __a
2011 /// A 128-bit vector of [16 x i8].
2012 /// \param __b
2013 /// A 128-bit vector of [16 x i8].
2014 /// \returns A 128-bit vector of [16 x i8] containing the sums of both
2015 /// parameters.
2016 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a,
2017 __m128i __b) {
2018 return (__m128i)((__v16qu)__a + (__v16qu)__b);
2021 /// Adds the corresponding elements of two 128-bit vectors of [8 x i16],
2022 /// saving the lower 16 bits of each sum in the corresponding element of a
2023 /// 128-bit result vector of [8 x i16].
2025 /// The integer elements of both parameters can be either signed or unsigned.
2027 /// \headerfile <x86intrin.h>
2029 /// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
2031 /// \param __a
2032 /// A 128-bit vector of [8 x i16].
2033 /// \param __b
2034 /// A 128-bit vector of [8 x i16].
2035 /// \returns A 128-bit vector of [8 x i16] containing the sums of both
2036 /// parameters.
2037 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a,
2038 __m128i __b) {
2039 return (__m128i)((__v8hu)__a + (__v8hu)__b);
2042 /// Adds the corresponding elements of two 128-bit vectors of [4 x i32],
2043 /// saving the lower 32 bits of each sum in the corresponding element of a
2044 /// 128-bit result vector of [4 x i32].
2046 /// The integer elements of both parameters can be either signed or unsigned.
2048 /// \headerfile <x86intrin.h>
2050 /// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
2052 /// \param __a
2053 /// A 128-bit vector of [4 x i32].
2054 /// \param __b
2055 /// A 128-bit vector of [4 x i32].
2056 /// \returns A 128-bit vector of [4 x i32] containing the sums of both
2057 /// parameters.
2058 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a,
2059 __m128i __b) {
2060 return (__m128i)((__v4su)__a + (__v4su)__b);
2063 /// Adds two signed or unsigned 64-bit integer values, returning the
2064 /// lower 64 bits of the sum.
2066 /// \headerfile <x86intrin.h>
2068 /// This intrinsic corresponds to the <c> PADDQ </c> instruction.
2070 /// \param __a
2071 /// A 64-bit integer.
2072 /// \param __b
2073 /// A 64-bit integer.
2074 /// \returns A 64-bit integer containing the sum of both parameters.
2075 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a,
2076 __m64 __b) {
2077 return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
2080 /// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
2081 /// saving the lower 64 bits of each sum in the corresponding element of a
2082 /// 128-bit result vector of [2 x i64].
2084 /// The integer elements of both parameters can be either signed or unsigned.
2086 /// \headerfile <x86intrin.h>
2088 /// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
2090 /// \param __a
2091 /// A 128-bit vector of [2 x i64].
2092 /// \param __b
2093 /// A 128-bit vector of [2 x i64].
2094 /// \returns A 128-bit vector of [2 x i64] containing the sums of both
2095 /// parameters.
2096 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a,
2097 __m128i __b) {
2098 return (__m128i)((__v2du)__a + (__v2du)__b);
2101 /// Adds, with saturation, the corresponding elements of two 128-bit
2102 /// signed [16 x i8] vectors, saving each sum in the corresponding element of
2103 /// a 128-bit result vector of [16 x i8]. Positive sums greater than 0x7F are
2104 /// saturated to 0x7F. Negative sums less than 0x80 are saturated to 0x80.
2106 /// \headerfile <x86intrin.h>
2108 /// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
2110 /// \param __a
2111 /// A 128-bit signed [16 x i8] vector.
2112 /// \param __b
2113 /// A 128-bit signed [16 x i8] vector.
2114 /// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
2115 /// both parameters.
2116 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a,
2117 __m128i __b) {
2118 return (__m128i)__builtin_elementwise_add_sat((__v16qs)__a, (__v16qs)__b);
2121 /// Adds, with saturation, the corresponding elements of two 128-bit
2122 /// signed [8 x i16] vectors, saving each sum in the corresponding element of
2123 /// a 128-bit result vector of [8 x i16]. Positive sums greater than 0x7FFF
2124 /// are saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
2125 /// 0x8000.
2127 /// \headerfile <x86intrin.h>
2129 /// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
2131 /// \param __a
2132 /// A 128-bit signed [8 x i16] vector.
2133 /// \param __b
2134 /// A 128-bit signed [8 x i16] vector.
2135 /// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
2136 /// both parameters.
2137 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a,
2138 __m128i __b) {
2139 return (__m128i)__builtin_elementwise_add_sat((__v8hi)__a, (__v8hi)__b);
2142 /// Adds, with saturation, the corresponding elements of two 128-bit
2143 /// unsigned [16 x i8] vectors, saving each sum in the corresponding element
2144 /// of a 128-bit result vector of [16 x i8]. Positive sums greater than 0xFF
2145 /// are saturated to 0xFF. Negative sums are saturated to 0x00.
2147 /// \headerfile <x86intrin.h>
2149 /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2151 /// \param __a
2152 /// A 128-bit unsigned [16 x i8] vector.
2153 /// \param __b
2154 /// A 128-bit unsigned [16 x i8] vector.
2155 /// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
2156 /// of both parameters.
2157 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a,
2158 __m128i __b) {
2159 return (__m128i)__builtin_elementwise_add_sat((__v16qu)__a, (__v16qu)__b);
2162 /// Adds, with saturation, the corresponding elements of two 128-bit
2163 /// unsigned [8 x i16] vectors, saving each sum in the corresponding element
2164 /// of a 128-bit result vector of [8 x i16]. Positive sums greater than
2165 /// 0xFFFF are saturated to 0xFFFF. Negative sums are saturated to 0x0000.
2167 /// \headerfile <x86intrin.h>
2169 /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2171 /// \param __a
2172 /// A 128-bit unsigned [8 x i16] vector.
2173 /// \param __b
2174 /// A 128-bit unsigned [8 x i16] vector.
2175 /// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
2176 /// of both parameters.
2177 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a,
2178 __m128i __b) {
2179 return (__m128i)__builtin_elementwise_add_sat((__v8hu)__a, (__v8hu)__b);
2182 /// Computes the rounded averages of corresponding elements of two
2183 /// 128-bit unsigned [16 x i8] vectors, saving each result in the
2184 /// corresponding element of a 128-bit result vector of [16 x i8].
2186 /// \headerfile <x86intrin.h>
2188 /// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
2190 /// \param __a
2191 /// A 128-bit unsigned [16 x i8] vector.
2192 /// \param __b
2193 /// A 128-bit unsigned [16 x i8] vector.
2194 /// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
2195 /// averages of both parameters.
2196 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a,
2197 __m128i __b) {
2198 return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
2201 /// Computes the rounded averages of corresponding elements of two
2202 /// 128-bit unsigned [8 x i16] vectors, saving each result in the
2203 /// corresponding element of a 128-bit result vector of [8 x i16].
2205 /// \headerfile <x86intrin.h>
2207 /// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
2209 /// \param __a
2210 /// A 128-bit unsigned [8 x i16] vector.
2211 /// \param __b
2212 /// A 128-bit unsigned [8 x i16] vector.
2213 /// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
2214 /// averages of both parameters.
2215 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a,
2216 __m128i __b) {
2217 return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
2220 /// Multiplies the corresponding elements of two 128-bit signed [8 x i16]
2221 /// vectors, producing eight intermediate 32-bit signed integer products, and
2222 /// adds the consecutive pairs of 32-bit products to form a 128-bit signed
2223 /// [4 x i32] vector.
2225 /// For example, bits [15:0] of both parameters are multiplied producing a
2226 /// 32-bit product, bits [31:16] of both parameters are multiplied producing
2227 /// a 32-bit product, and the sum of those two products becomes bits [31:0]
2228 /// of the result.
2230 /// \headerfile <x86intrin.h>
2232 /// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
2234 /// \param __a
2235 /// A 128-bit signed [8 x i16] vector.
2236 /// \param __b
2237 /// A 128-bit signed [8 x i16] vector.
2238 /// \returns A 128-bit signed [4 x i32] vector containing the sums of products
2239 /// of both parameters.
2240 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a,
2241 __m128i __b) {
2242 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
2245 /// Compares corresponding elements of two 128-bit signed [8 x i16]
2246 /// vectors, saving the greater value from each comparison in the
2247 /// corresponding element of a 128-bit result vector of [8 x i16].
2249 /// \headerfile <x86intrin.h>
2251 /// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
2253 /// \param __a
2254 /// A 128-bit signed [8 x i16] vector.
2255 /// \param __b
2256 /// A 128-bit signed [8 x i16] vector.
2257 /// \returns A 128-bit signed [8 x i16] vector containing the greater value of
2258 /// each comparison.
2259 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a,
2260 __m128i __b) {
2261 return (__m128i)__builtin_elementwise_max((__v8hi)__a, (__v8hi)__b);
2264 /// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2265 /// vectors, saving the greater value from each comparison in the
2266 /// corresponding element of a 128-bit result vector of [16 x i8].
2268 /// \headerfile <x86intrin.h>
2270 /// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
2272 /// \param __a
2273 /// A 128-bit unsigned [16 x i8] vector.
2274 /// \param __b
2275 /// A 128-bit unsigned [16 x i8] vector.
2276 /// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
2277 /// each comparison.
2278 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a,
2279 __m128i __b) {
2280 return (__m128i)__builtin_elementwise_max((__v16qu)__a, (__v16qu)__b);
2283 /// Compares corresponding elements of two 128-bit signed [8 x i16]
2284 /// vectors, saving the smaller value from each comparison in the
2285 /// corresponding element of a 128-bit result vector of [8 x i16].
2287 /// \headerfile <x86intrin.h>
2289 /// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
2291 /// \param __a
2292 /// A 128-bit signed [8 x i16] vector.
2293 /// \param __b
2294 /// A 128-bit signed [8 x i16] vector.
2295 /// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
2296 /// each comparison.
2297 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a,
2298 __m128i __b) {
2299 return (__m128i)__builtin_elementwise_min((__v8hi)__a, (__v8hi)__b);
2302 /// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2303 /// vectors, saving the smaller value from each comparison in the
2304 /// corresponding element of a 128-bit result vector of [16 x i8].
2306 /// \headerfile <x86intrin.h>
2308 /// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
2310 /// \param __a
2311 /// A 128-bit unsigned [16 x i8] vector.
2312 /// \param __b
2313 /// A 128-bit unsigned [16 x i8] vector.
2314 /// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
2315 /// each comparison.
2316 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a,
2317 __m128i __b) {
2318 return (__m128i)__builtin_elementwise_min((__v16qu)__a, (__v16qu)__b);
2321 /// Multiplies the corresponding elements of two signed [8 x i16]
2322 /// vectors, saving the upper 16 bits of each 32-bit product in the
2323 /// corresponding element of a 128-bit signed [8 x i16] result vector.
2325 /// \headerfile <x86intrin.h>
2327 /// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
2329 /// \param __a
2330 /// A 128-bit signed [8 x i16] vector.
2331 /// \param __b
2332 /// A 128-bit signed [8 x i16] vector.
2333 /// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
2334 /// each of the eight 32-bit products.
2335 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a,
2336 __m128i __b) {
2337 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
2340 /// Multiplies the corresponding elements of two unsigned [8 x i16]
2341 /// vectors, saving the upper 16 bits of each 32-bit product in the
2342 /// corresponding element of a 128-bit unsigned [8 x i16] result vector.
2344 /// \headerfile <x86intrin.h>
2346 /// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
2348 /// \param __a
2349 /// A 128-bit unsigned [8 x i16] vector.
2350 /// \param __b
2351 /// A 128-bit unsigned [8 x i16] vector.
2352 /// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
2353 /// of each of the eight 32-bit products.
2354 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a,
2355 __m128i __b) {
2356 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
2359 /// Multiplies the corresponding elements of two signed [8 x i16]
2360 /// vectors, saving the lower 16 bits of each 32-bit product in the
2361 /// corresponding element of a 128-bit signed [8 x i16] result vector.
2363 /// \headerfile <x86intrin.h>
2365 /// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
2367 /// \param __a
2368 /// A 128-bit signed [8 x i16] vector.
2369 /// \param __b
2370 /// A 128-bit signed [8 x i16] vector.
2371 /// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
2372 /// each of the eight 32-bit products.
2373 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a,
2374 __m128i __b) {
2375 return (__m128i)((__v8hu)__a * (__v8hu)__b);
2378 /// Multiplies 32-bit unsigned integer values contained in the lower bits
2379 /// of the two 64-bit integer vectors and returns the 64-bit unsigned
2380 /// product.
2382 /// \headerfile <x86intrin.h>
2384 /// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
2386 /// \param __a
2387 /// A 64-bit integer containing one of the source operands.
2388 /// \param __b
2389 /// A 64-bit integer containing one of the source operands.
2390 /// \returns A 64-bit integer vector containing the product of both operands.
2391 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a,
2392 __m64 __b) {
2393 return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
2396 /// Multiplies 32-bit unsigned integer values contained in the lower
2397 /// bits of the corresponding elements of two [2 x i64] vectors, and returns
2398 /// the 64-bit products in the corresponding elements of a [2 x i64] vector.
2400 /// \headerfile <x86intrin.h>
2402 /// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
2404 /// \param __a
2405 /// A [2 x i64] vector containing one of the source operands.
2406 /// \param __b
2407 /// A [2 x i64] vector containing one of the source operands.
2408 /// \returns A [2 x i64] vector containing the product of both operands.
2409 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a,
2410 __m128i __b) {
2411 return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
2414 /// Computes the absolute differences of corresponding 8-bit integer
2415 /// values in two 128-bit vectors. Sums the first 8 absolute differences, and
2416 /// separately sums the second 8 absolute differences. Packs these two
2417 /// unsigned 16-bit integer sums into the upper and lower elements of a
2418 /// [2 x i64] vector.
2420 /// \headerfile <x86intrin.h>
2422 /// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
2424 /// \param __a
2425 /// A 128-bit integer vector containing one of the source operands.
2426 /// \param __b
2427 /// A 128-bit integer vector containing one of the source operands.
2428 /// \returns A [2 x i64] vector containing the sums of the sets of absolute
2429 /// differences between both operands.
2430 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a,
2431 __m128i __b) {
2432 return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
2435 /// Subtracts the corresponding 8-bit integer values in the operands.
2437 /// \headerfile <x86intrin.h>
2439 /// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
2441 /// \param __a
2442 /// A 128-bit integer vector containing the minuends.
2443 /// \param __b
2444 /// A 128-bit integer vector containing the subtrahends.
2445 /// \returns A 128-bit integer vector containing the differences of the values
2446 /// in the operands.
2447 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a,
2448 __m128i __b) {
2449 return (__m128i)((__v16qu)__a - (__v16qu)__b);
2452 /// Subtracts the corresponding 16-bit integer values in the operands.
2454 /// \headerfile <x86intrin.h>
2456 /// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
2458 /// \param __a
2459 /// A 128-bit integer vector containing the minuends.
2460 /// \param __b
2461 /// A 128-bit integer vector containing the subtrahends.
2462 /// \returns A 128-bit integer vector containing the differences of the values
2463 /// in the operands.
2464 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a,
2465 __m128i __b) {
2466 return (__m128i)((__v8hu)__a - (__v8hu)__b);
2469 /// Subtracts the corresponding 32-bit integer values in the operands.
2471 /// \headerfile <x86intrin.h>
2473 /// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
2475 /// \param __a
2476 /// A 128-bit integer vector containing the minuends.
2477 /// \param __b
2478 /// A 128-bit integer vector containing the subtrahends.
2479 /// \returns A 128-bit integer vector containing the differences of the values
2480 /// in the operands.
2481 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a,
2482 __m128i __b) {
2483 return (__m128i)((__v4su)__a - (__v4su)__b);
2486 /// Subtracts signed or unsigned 64-bit integer values and writes the
2487 /// difference to the corresponding bits in the destination.
2489 /// \headerfile <x86intrin.h>
2491 /// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
2493 /// \param __a
2494 /// A 64-bit integer vector containing the minuend.
2495 /// \param __b
2496 /// A 64-bit integer vector containing the subtrahend.
2497 /// \returns A 64-bit integer vector containing the difference of the values in
2498 /// the operands.
2499 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a,
2500 __m64 __b) {
2501 return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
2504 /// Subtracts the corresponding elements of two [2 x i64] vectors.
2506 /// \headerfile <x86intrin.h>
2508 /// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
2510 /// \param __a
2511 /// A 128-bit integer vector containing the minuends.
2512 /// \param __b
2513 /// A 128-bit integer vector containing the subtrahends.
2514 /// \returns A 128-bit integer vector containing the differences of the values
2515 /// in the operands.
2516 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a,
2517 __m128i __b) {
2518 return (__m128i)((__v2du)__a - (__v2du)__b);
2521 /// Subtracts corresponding 8-bit signed integer values in the input and
2522 /// returns the differences in the corresponding bytes in the destination.
2523 /// Differences greater than 0x7F are saturated to 0x7F, and differences less
2524 /// than 0x80 are saturated to 0x80.
2526 /// \headerfile <x86intrin.h>
2528 /// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
2530 /// \param __a
2531 /// A 128-bit integer vector containing the minuends.
2532 /// \param __b
2533 /// A 128-bit integer vector containing the subtrahends.
2534 /// \returns A 128-bit integer vector containing the differences of the values
2535 /// in the operands.
2536 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a,
2537 __m128i __b) {
2538 return (__m128i)__builtin_elementwise_sub_sat((__v16qs)__a, (__v16qs)__b);
2541 /// Subtracts corresponding 16-bit signed integer values in the input and
2542 /// returns the differences in the corresponding bytes in the destination.
2543 /// Differences greater than 0x7FFF are saturated to 0x7FFF, and values less
2544 /// than 0x8000 are saturated to 0x8000.
2546 /// \headerfile <x86intrin.h>
2548 /// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
2550 /// \param __a
2551 /// A 128-bit integer vector containing the minuends.
2552 /// \param __b
2553 /// A 128-bit integer vector containing the subtrahends.
2554 /// \returns A 128-bit integer vector containing the differences of the values
2555 /// in the operands.
2556 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a,
2557 __m128i __b) {
2558 return (__m128i)__builtin_elementwise_sub_sat((__v8hi)__a, (__v8hi)__b);
2561 /// Subtracts corresponding 8-bit unsigned integer values in the input
2562 /// and returns the differences in the corresponding bytes in the
2563 /// destination. Differences less than 0x00 are saturated to 0x00.
2565 /// \headerfile <x86intrin.h>
2567 /// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
2569 /// \param __a
2570 /// A 128-bit integer vector containing the minuends.
2571 /// \param __b
2572 /// A 128-bit integer vector containing the subtrahends.
2573 /// \returns A 128-bit integer vector containing the unsigned integer
2574 /// differences of the values in the operands.
2575 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a,
2576 __m128i __b) {
2577 return (__m128i)__builtin_elementwise_sub_sat((__v16qu)__a, (__v16qu)__b);
2580 /// Subtracts corresponding 16-bit unsigned integer values in the input
2581 /// and returns the differences in the corresponding bytes in the
2582 /// destination. Differences less than 0x0000 are saturated to 0x0000.
2584 /// \headerfile <x86intrin.h>
2586 /// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
2588 /// \param __a
2589 /// A 128-bit integer vector containing the minuends.
2590 /// \param __b
2591 /// A 128-bit integer vector containing the subtrahends.
2592 /// \returns A 128-bit integer vector containing the unsigned integer
2593 /// differences of the values in the operands.
2594 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a,
2595 __m128i __b) {
2596 return (__m128i)__builtin_elementwise_sub_sat((__v8hu)__a, (__v8hu)__b);
2599 /// Performs a bitwise AND of two 128-bit integer vectors.
2601 /// \headerfile <x86intrin.h>
2603 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
2605 /// \param __a
2606 /// A 128-bit integer vector containing one of the source operands.
2607 /// \param __b
2608 /// A 128-bit integer vector containing one of the source operands.
2609 /// \returns A 128-bit integer vector containing the bitwise AND of the values
2610 /// in both operands.
2611 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a,
2612 __m128i __b) {
2613 return (__m128i)((__v2du)__a & (__v2du)__b);
2616 /// Performs a bitwise AND of two 128-bit integer vectors, using the
2617 /// one's complement of the values contained in the first source operand.
2619 /// \headerfile <x86intrin.h>
2621 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
2623 /// \param __a
2624 /// A 128-bit vector containing the left source operand. The one's complement
2625 /// of this value is used in the bitwise AND.
2626 /// \param __b
2627 /// A 128-bit vector containing the right source operand.
2628 /// \returns A 128-bit integer vector containing the bitwise AND of the one's
2629 /// complement of the first operand and the values in the second operand.
2630 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a,
2631 __m128i __b) {
2632 return (__m128i)(~(__v2du)__a & (__v2du)__b);
2634 /// Performs a bitwise OR of two 128-bit integer vectors.
2636 /// \headerfile <x86intrin.h>
2638 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
2640 /// \param __a
2641 /// A 128-bit integer vector containing one of the source operands.
2642 /// \param __b
2643 /// A 128-bit integer vector containing one of the source operands.
2644 /// \returns A 128-bit integer vector containing the bitwise OR of the values
2645 /// in both operands.
2646 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a,
2647 __m128i __b) {
2648 return (__m128i)((__v2du)__a | (__v2du)__b);
2651 /// Performs a bitwise exclusive OR of two 128-bit integer vectors.
2653 /// \headerfile <x86intrin.h>
2655 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
2657 /// \param __a
2658 /// A 128-bit integer vector containing one of the source operands.
2659 /// \param __b
2660 /// A 128-bit integer vector containing one of the source operands.
2661 /// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
2662 /// values in both operands.
2663 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a,
2664 __m128i __b) {
2665 return (__m128i)((__v2du)__a ^ (__v2du)__b);
/// Shifts the 128-bit integer vector operand left by the specified number of
///    bytes. Low-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_slli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to left-shift operand
///    \a a.
/// \returns A 128-bit integer vector containing the left-shifted value.
#define _mm_slli_si128(a, imm) \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift( \
      (__v2di)(__m128i)(a), (int)(imm)))

/* _mm_bslli_si128 expands to exactly the same byte-shift call as
   _mm_slli_si128 above. */
#define _mm_bslli_si128(a, imm) \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift( \
      (__v2di)(__m128i)(a), (int)(imm)))
2693 /// Left-shifts each 16-bit value in the 128-bit integer vector operand
2694 /// by the specified number of bits. Low-order bits are cleared.
2696 /// \headerfile <x86intrin.h>
2698 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2700 /// \param __a
2701 /// A 128-bit integer vector containing the source operand.
2702 /// \param __count
2703 /// An integer value specifying the number of bits to left-shift each value
2704 /// in operand \a __a.
2705 /// \returns A 128-bit integer vector containing the left-shifted values.
2706 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a,
2707 int __count) {
2708 return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
2711 /// Left-shifts each 16-bit value in the 128-bit integer vector operand
2712 /// by the specified number of bits. Low-order bits are cleared.
2714 /// \headerfile <x86intrin.h>
2716 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2718 /// \param __a
2719 /// A 128-bit integer vector containing the source operand.
2720 /// \param __count
2721 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2722 /// to left-shift each value in operand \a __a.
2723 /// \returns A 128-bit integer vector containing the left-shifted values.
2724 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a,
2725 __m128i __count) {
2726 return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
2729 /// Left-shifts each 32-bit value in the 128-bit integer vector operand
2730 /// by the specified number of bits. Low-order bits are cleared.
2732 /// \headerfile <x86intrin.h>
2734 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2736 /// \param __a
2737 /// A 128-bit integer vector containing the source operand.
2738 /// \param __count
2739 /// An integer value specifying the number of bits to left-shift each value
2740 /// in operand \a __a.
2741 /// \returns A 128-bit integer vector containing the left-shifted values.
2742 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a,
2743 int __count) {
2744 return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
2747 /// Left-shifts each 32-bit value in the 128-bit integer vector operand
2748 /// by the specified number of bits. Low-order bits are cleared.
2750 /// \headerfile <x86intrin.h>
2752 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2754 /// \param __a
2755 /// A 128-bit integer vector containing the source operand.
2756 /// \param __count
2757 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2758 /// to left-shift each value in operand \a __a.
2759 /// \returns A 128-bit integer vector containing the left-shifted values.
2760 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a,
2761 __m128i __count) {
2762 return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
2765 /// Left-shifts each 64-bit value in the 128-bit integer vector operand
2766 /// by the specified number of bits. Low-order bits are cleared.
2768 /// \headerfile <x86intrin.h>
2770 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2772 /// \param __a
2773 /// A 128-bit integer vector containing the source operand.
2774 /// \param __count
2775 /// An integer value specifying the number of bits to left-shift each value
2776 /// in operand \a __a.
2777 /// \returns A 128-bit integer vector containing the left-shifted values.
2778 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a,
2779 int __count) {
2780 return __builtin_ia32_psllqi128((__v2di)__a, __count);
2783 /// Left-shifts each 64-bit value in the 128-bit integer vector operand
2784 /// by the specified number of bits. Low-order bits are cleared.
2786 /// \headerfile <x86intrin.h>
2788 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2790 /// \param __a
2791 /// A 128-bit integer vector containing the source operand.
2792 /// \param __count
2793 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2794 /// to left-shift each value in operand \a __a.
2795 /// \returns A 128-bit integer vector containing the left-shifted values.
2796 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a,
2797 __m128i __count) {
2798 return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
2801 /// Right-shifts each 16-bit value in the 128-bit integer vector operand
2802 /// by the specified number of bits. High-order bits are filled with the sign
2803 /// bit of the initial value.
2805 /// \headerfile <x86intrin.h>
2807 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2809 /// \param __a
2810 /// A 128-bit integer vector containing the source operand.
2811 /// \param __count
2812 /// An integer value specifying the number of bits to right-shift each value
2813 /// in operand \a __a.
2814 /// \returns A 128-bit integer vector containing the right-shifted values.
2815 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a,
2816 int __count) {
2817 return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
2820 /// Right-shifts each 16-bit value in the 128-bit integer vector operand
2821 /// by the specified number of bits. High-order bits are filled with the sign
2822 /// bit of the initial value.
2824 /// \headerfile <x86intrin.h>
2826 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2828 /// \param __a
2829 /// A 128-bit integer vector containing the source operand.
2830 /// \param __count
2831 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2832 /// to right-shift each value in operand \a __a.
2833 /// \returns A 128-bit integer vector containing the right-shifted values.
2834 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a,
2835 __m128i __count) {
2836 return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
2839 /// Right-shifts each 32-bit value in the 128-bit integer vector operand
2840 /// by the specified number of bits. High-order bits are filled with the sign
2841 /// bit of the initial value.
2843 /// \headerfile <x86intrin.h>
2845 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2847 /// \param __a
2848 /// A 128-bit integer vector containing the source operand.
2849 /// \param __count
2850 /// An integer value specifying the number of bits to right-shift each value
2851 /// in operand \a __a.
2852 /// \returns A 128-bit integer vector containing the right-shifted values.
2853 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a,
2854 int __count) {
2855 return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
2858 /// Right-shifts each 32-bit value in the 128-bit integer vector operand
2859 /// by the specified number of bits. High-order bits are filled with the sign
2860 /// bit of the initial value.
2862 /// \headerfile <x86intrin.h>
2864 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2866 /// \param __a
2867 /// A 128-bit integer vector containing the source operand.
2868 /// \param __count
2869 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2870 /// to right-shift each value in operand \a __a.
2871 /// \returns A 128-bit integer vector containing the right-shifted values.
2872 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a,
2873 __m128i __count) {
2874 return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
/// Shifts the 128-bit integer vector operand right by the specified number
///    of bytes. High-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_srli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to right-shift operand
///    \a a.
/// \returns A 128-bit integer vector containing the right-shifted value.
#define _mm_srli_si128(a, imm) \
  ((__m128i)__builtin_ia32_psrldqi128_byteshift( \
      (__v2di)(__m128i)(a), (int)(imm)))

/* _mm_bsrli_si128 expands to exactly the same byte-shift call as
   _mm_srli_si128 above. */
#define _mm_bsrli_si128(a, imm) \
  ((__m128i)__builtin_ia32_psrldqi128_byteshift( \
      (__v2di)(__m128i)(a), (int)(imm)))
2902 /// Right-shifts each of 16-bit values in the 128-bit integer vector
2903 /// operand by the specified number of bits. High-order bits are cleared.
2905 /// \headerfile <x86intrin.h>
2907 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2909 /// \param __a
2910 /// A 128-bit integer vector containing the source operand.
2911 /// \param __count
2912 /// An integer value specifying the number of bits to right-shift each value
2913 /// in operand \a __a.
2914 /// \returns A 128-bit integer vector containing the right-shifted values.
2915 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a,
2916 int __count) {
2917 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
2920 /// Right-shifts each of 16-bit values in the 128-bit integer vector
2921 /// operand by the specified number of bits. High-order bits are cleared.
2923 /// \headerfile <x86intrin.h>
2925 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2927 /// \param __a
2928 /// A 128-bit integer vector containing the source operand.
2929 /// \param __count
2930 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2931 /// to right-shift each value in operand \a __a.
2932 /// \returns A 128-bit integer vector containing the right-shifted values.
2933 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a,
2934 __m128i __count) {
2935 return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
2938 /// Right-shifts each of 32-bit values in the 128-bit integer vector
2939 /// operand by the specified number of bits. High-order bits are cleared.
2941 /// \headerfile <x86intrin.h>
2943 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
2945 /// \param __a
2946 /// A 128-bit integer vector containing the source operand.
2947 /// \param __count
2948 /// An integer value specifying the number of bits to right-shift each value
2949 /// in operand \a __a.
2950 /// \returns A 128-bit integer vector containing the right-shifted values.
2951 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a,
2952 int __count) {
2953 return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
2956 /// Right-shifts each of 32-bit values in the 128-bit integer vector
2957 /// operand by the specified number of bits. High-order bits are cleared.
2959 /// \headerfile <x86intrin.h>
2961 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
2963 /// \param __a
2964 /// A 128-bit integer vector containing the source operand.
2965 /// \param __count
2966 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2967 /// to right-shift each value in operand \a __a.
2968 /// \returns A 128-bit integer vector containing the right-shifted values.
2969 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a,
2970 __m128i __count) {
2971 return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
2974 /// Right-shifts each of 64-bit values in the 128-bit integer vector
2975 /// operand by the specified number of bits. High-order bits are cleared.
2977 /// \headerfile <x86intrin.h>
2979 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
2981 /// \param __a
2982 /// A 128-bit integer vector containing the source operand.
2983 /// \param __count
2984 /// An integer value specifying the number of bits to right-shift each value
2985 /// in operand \a __a.
2986 /// \returns A 128-bit integer vector containing the right-shifted values.
2987 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a,
2988 int __count) {
2989 return __builtin_ia32_psrlqi128((__v2di)__a, __count);
2992 /// Right-shifts each of 64-bit values in the 128-bit integer vector
2993 /// operand by the specified number of bits. High-order bits are cleared.
2995 /// \headerfile <x86intrin.h>
2997 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
2999 /// \param __a
3000 /// A 128-bit integer vector containing the source operand.
3001 /// \param __count
3002 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
3003 /// to right-shift each value in operand \a __a.
3004 /// \returns A 128-bit integer vector containing the right-shifted values.
3005 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a,
3006 __m128i __count) {
3007 return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
3010 /// Compares each of the corresponding 8-bit values of the 128-bit
3011 /// integer vectors for equality. Each comparison yields 0x0 for false, 0xFF
3012 /// for true.
3014 /// \headerfile <x86intrin.h>
3016 /// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
3018 /// \param __a
3019 /// A 128-bit integer vector.
3020 /// \param __b
3021 /// A 128-bit integer vector.
3022 /// \returns A 128-bit integer vector containing the comparison results.
3023 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a,
3024 __m128i __b) {
3025 return (__m128i)((__v16qi)__a == (__v16qi)__b);
3028 /// Compares each of the corresponding 16-bit values of the 128-bit
3029 /// integer vectors for equality. Each comparison yields 0x0 for false,
3030 /// 0xFFFF for true.
3032 /// \headerfile <x86intrin.h>
3034 /// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
3036 /// \param __a
3037 /// A 128-bit integer vector.
3038 /// \param __b
3039 /// A 128-bit integer vector.
3040 /// \returns A 128-bit integer vector containing the comparison results.
3041 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a,
3042 __m128i __b) {
3043 return (__m128i)((__v8hi)__a == (__v8hi)__b);
3046 /// Compares each of the corresponding 32-bit values of the 128-bit
3047 /// integer vectors for equality. Each comparison yields 0x0 for false,
3048 /// 0xFFFFFFFF for true.
3050 /// \headerfile <x86intrin.h>
3052 /// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
3054 /// \param __a
3055 /// A 128-bit integer vector.
3056 /// \param __b
3057 /// A 128-bit integer vector.
3058 /// \returns A 128-bit integer vector containing the comparison results.
3059 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a,
3060 __m128i __b) {
3061 return (__m128i)((__v4si)__a == (__v4si)__b);
3064 /// Compares each of the corresponding signed 8-bit values of the 128-bit
3065 /// integer vectors to determine if the values in the first operand are
3066 /// greater than those in the second operand. Each comparison yields 0x0 for
3067 /// false, 0xFF for true.
3069 /// \headerfile <x86intrin.h>
3071 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3073 /// \param __a
3074 /// A 128-bit integer vector.
3075 /// \param __b
3076 /// A 128-bit integer vector.
3077 /// \returns A 128-bit integer vector containing the comparison results.
3078 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a,
3079 __m128i __b) {
3080 /* This function always performs a signed comparison, but __v16qi is a char
3081 which may be signed or unsigned, so use __v16qs. */
3082 return (__m128i)((__v16qs)__a > (__v16qs)__b);
3085 /// Compares each of the corresponding signed 16-bit values of the
3086 /// 128-bit integer vectors to determine if the values in the first operand
3087 /// are greater than those in the second operand.
3089 /// Each comparison yields 0x0 for false, 0xFFFF for true.
3091 /// \headerfile <x86intrin.h>
3093 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3095 /// \param __a
3096 /// A 128-bit integer vector.
3097 /// \param __b
3098 /// A 128-bit integer vector.
3099 /// \returns A 128-bit integer vector containing the comparison results.
3100 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a,
3101 __m128i __b) {
3102 return (__m128i)((__v8hi)__a > (__v8hi)__b);
3105 /// Compares each of the corresponding signed 32-bit values of the
3106 /// 128-bit integer vectors to determine if the values in the first operand
3107 /// are greater than those in the second operand.
3109 /// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
3111 /// \headerfile <x86intrin.h>
3113 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3115 /// \param __a
3116 /// A 128-bit integer vector.
3117 /// \param __b
3118 /// A 128-bit integer vector.
3119 /// \returns A 128-bit integer vector containing the comparison results.
3120 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a,
3121 __m128i __b) {
3122 return (__m128i)((__v4si)__a > (__v4si)__b);
3125 /// Compares each of the corresponding signed 8-bit values of the 128-bit
3126 /// integer vectors to determine if the values in the first operand are less
3127 /// than those in the second operand.
3129 /// Each comparison yields 0x0 for false, 0xFF for true.
3131 /// \headerfile <x86intrin.h>
3133 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3135 /// \param __a
3136 /// A 128-bit integer vector.
3137 /// \param __b
3138 /// A 128-bit integer vector.
3139 /// \returns A 128-bit integer vector containing the comparison results.
3140 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a,
3141 __m128i __b) {
3142 return _mm_cmpgt_epi8(__b, __a);
3145 /// Compares each of the corresponding signed 16-bit values of the
3146 /// 128-bit integer vectors to determine if the values in the first operand
3147 /// are less than those in the second operand.
3149 /// Each comparison yields 0x0 for false, 0xFFFF for true.
3151 /// \headerfile <x86intrin.h>
3153 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3155 /// \param __a
3156 /// A 128-bit integer vector.
3157 /// \param __b
3158 /// A 128-bit integer vector.
3159 /// \returns A 128-bit integer vector containing the comparison results.
3160 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a,
3161 __m128i __b) {
3162 return _mm_cmpgt_epi16(__b, __a);
3165 /// Compares each of the corresponding signed 32-bit values of the
3166 /// 128-bit integer vectors to determine if the values in the first operand
3167 /// are less than those in the second operand.
3169 /// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
3171 /// \headerfile <x86intrin.h>
3173 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3175 /// \param __a
3176 /// A 128-bit integer vector.
3177 /// \param __b
3178 /// A 128-bit integer vector.
3179 /// \returns A 128-bit integer vector containing the comparison results.
3180 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a,
3181 __m128i __b) {
3182 return _mm_cmpgt_epi32(__b, __a);
3185 #ifdef __x86_64__
3186 /// Converts a 64-bit signed integer value from the second operand into a
3187 /// double-precision value and returns it in the lower element of a [2 x
3188 /// double] vector; the upper element of the returned vector is copied from
3189 /// the upper element of the first operand.
3191 /// \headerfile <x86intrin.h>
3193 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
3195 /// \param __a
3196 /// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
3197 /// copied to the upper 64 bits of the destination.
3198 /// \param __b
3199 /// A 64-bit signed integer operand containing the value to be converted.
3200 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
3201 /// converted value of the second operand. The upper 64 bits are copied from
3202 /// the upper 64 bits of the first operand.
3203 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi64_sd(__m128d __a,
3204 long long __b) {
3205 __a[0] = __b;
3206 return __a;
3209 /// Converts the first (lower) element of a vector of [2 x double] into a
3210 /// 64-bit signed integer value, according to the current rounding mode.
3212 /// \headerfile <x86intrin.h>
3214 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
3216 /// \param __a
3217 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3218 /// conversion.
3219 /// \returns A 64-bit signed integer containing the converted value.
3220 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsd_si64(__m128d __a) {
3221 return __builtin_ia32_cvtsd2si64((__v2df)__a);
3224 /// Converts the first (lower) element of a vector of [2 x double] into a
3225 /// 64-bit signed integer value, truncating the result when it is inexact.
3227 /// \headerfile <x86intrin.h>
3229 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
3230 /// instruction.
3232 /// \param __a
3233 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3234 /// conversion.
3235 /// \returns A 64-bit signed integer containing the converted value.
3236 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvttsd_si64(__m128d __a) {
3237 return __builtin_ia32_cvttsd2si64((__v2df)__a);
3239 #endif
3241 /// Converts a vector of [4 x i32] into a vector of [4 x float].
3243 /// \headerfile <x86intrin.h>
3245 /// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
3247 /// \param __a
3248 /// A 128-bit integer vector.
3249 /// \returns A 128-bit vector of [4 x float] containing the converted values.
3250 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a) {
3251 return (__m128) __builtin_convertvector((__v4si)__a, __v4sf);
3254 /// Converts a vector of [4 x float] into a vector of [4 x i32].
3256 /// \headerfile <x86intrin.h>
3258 /// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
3260 /// \param __a
3261 /// A 128-bit vector of [4 x float].
3262 /// \returns A 128-bit integer vector of [4 x i32] containing the converted
3263 /// values.
3264 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a) {
3265 return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
3268 /// Converts a vector of [4 x float] into a vector of [4 x i32],
3269 /// truncating the result when it is inexact.
3271 /// \headerfile <x86intrin.h>
3273 /// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
3274 /// instruction.
3276 /// \param __a
3277 /// A 128-bit vector of [4 x float].
3278 /// \returns A 128-bit vector of [4 x i32] containing the converted values.
3279 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a) {
3280 return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
3283 /// Returns a vector of [4 x i32] where the lowest element is the input
3284 /// operand and the remaining elements are zero.
3286 /// \headerfile <x86intrin.h>
3288 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3290 /// \param __a
3291 /// A 32-bit signed integer operand.
3292 /// \returns A 128-bit vector of [4 x i32].
3293 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a) {
3294 return __extension__(__m128i)(__v4si){__a, 0, 0, 0};
3297 /// Returns a vector of [2 x i64] where the lower element is the input
3298 /// operand and the upper element is zero.
3300 /// \headerfile <x86intrin.h>
3302 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction
3303 /// in 64-bit mode.
3305 /// \param __a
3306 /// A 64-bit signed integer operand containing the value to be converted.
3307 /// \returns A 128-bit vector of [2 x i64] containing the converted value.
3308 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a) {
3309 return __extension__(__m128i)(__v2di){__a, 0};
3312 /// Moves the least significant 32 bits of a vector of [4 x i32] to a
3313 /// 32-bit signed integer value.
3315 /// \headerfile <x86intrin.h>
3317 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3319 /// \param __a
3320 /// A vector of [4 x i32]. The least significant 32 bits are moved to the
3321 /// destination.
3322 /// \returns A 32-bit signed integer containing the moved value.
3323 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a) {
3324 __v4si __b = (__v4si)__a;
3325 return __b[0];
3328 /// Moves the least significant 64 bits of a vector of [2 x i64] to a
3329 /// 64-bit signed integer value.
3331 /// \headerfile <x86intrin.h>
3333 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3335 /// \param __a
3336 /// A vector of [2 x i64]. The least significant 64 bits are moved to the
3337 /// destination.
3338 /// \returns A 64-bit signed integer containing the moved value.
3339 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a) {
3340 return __a[0];
3343 /// Moves packed integer values from an aligned 128-bit memory location
3344 /// to elements in a 128-bit integer vector.
3346 /// \headerfile <x86intrin.h>
3348 /// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
3350 /// \param __p
3351 /// An aligned pointer to a memory location containing integer values.
3352 /// \returns A 128-bit integer vector containing the moved values.
3353 static __inline__ __m128i __DEFAULT_FN_ATTRS
3354 _mm_load_si128(__m128i const *__p) {
3355 return *__p;
3358 /// Moves packed integer values from an unaligned 128-bit memory location
3359 /// to elements in a 128-bit integer vector.
3361 /// \headerfile <x86intrin.h>
3363 /// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
3365 /// \param __p
3366 /// A pointer to a memory location containing integer values.
3367 /// \returns A 128-bit integer vector containing the moved values.
3368 static __inline__ __m128i __DEFAULT_FN_ATTRS
3369 _mm_loadu_si128(__m128i_u const *__p) {
3370 struct __loadu_si128 {
3371 __m128i_u __v;
3372 } __attribute__((__packed__, __may_alias__));
3373 return ((const struct __loadu_si128 *)__p)->__v;
3376 /// Returns a vector of [2 x i64] where the lower element is taken from
3377 /// the lower element of the operand, and the upper element is zero.
3379 /// \headerfile <x86intrin.h>
3381 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3383 /// \param __p
3384 /// A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of
3385 /// the destination.
3386 /// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
3387 /// moved value. The higher order bits are cleared.
3388 static __inline__ __m128i __DEFAULT_FN_ATTRS
3389 _mm_loadl_epi64(__m128i_u const *__p) {
3390 struct __mm_loadl_epi64_struct {
3391 long long __u;
3392 } __attribute__((__packed__, __may_alias__));
3393 return __extension__(__m128i){
3394 ((const struct __mm_loadl_epi64_struct *)__p)->__u, 0};
3397 /// Generates a 128-bit vector of [4 x i32] with unspecified content.
3398 /// This could be used as an argument to another intrinsic function where the
3399 /// argument is required but the value is not actually used.
3401 /// \headerfile <x86intrin.h>
3403 /// This intrinsic has no corresponding instruction.
3405 /// \returns A 128-bit vector of [4 x i32] with unspecified content.
3406 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void) {
3407 return (__m128i)__builtin_ia32_undef128();
3410 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3411 /// the specified 64-bit integer values.
3413 /// \headerfile <x86intrin.h>
3415 /// This intrinsic is a utility function and does not correspond to a specific
3416 /// instruction.
3418 /// \param __q1
3419 /// A 64-bit integer value used to initialize the upper 64 bits of the
3420 /// destination vector of [2 x i64].
3421 /// \param __q0
3422 /// A 64-bit integer value used to initialize the lower 64 bits of the
3423 /// destination vector of [2 x i64].
3424 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
3425 /// provided in the operands.
3426 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1,
3427 long long __q0) {
3428 return __extension__(__m128i)(__v2di){__q0, __q1};
3431 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3432 /// the specified 64-bit integer values.
3434 /// \headerfile <x86intrin.h>
3436 /// This intrinsic is a utility function and does not correspond to a specific
3437 /// instruction.
3439 /// \param __q1
3440 /// A 64-bit integer value used to initialize the upper 64 bits of the
3441 /// destination vector of [2 x i64].
3442 /// \param __q0
3443 /// A 64-bit integer value used to initialize the lower 64 bits of the
3444 /// destination vector of [2 x i64].
3445 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
3446 /// provided in the operands.
3447 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1,
3448 __m64 __q0) {
3449 return _mm_set_epi64x((long long)__q1, (long long)__q0);
3452 /// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
3453 /// the specified 32-bit integer values.
3455 /// \headerfile <x86intrin.h>
3457 /// This intrinsic is a utility function and does not correspond to a specific
3458 /// instruction.
3460 /// \param __i3
3461 /// A 32-bit integer value used to initialize bits [127:96] of the
3462 /// destination vector.
3463 /// \param __i2
3464 /// A 32-bit integer value used to initialize bits [95:64] of the destination
3465 /// vector.
3466 /// \param __i1
3467 /// A 32-bit integer value used to initialize bits [63:32] of the destination
3468 /// vector.
3469 /// \param __i0
3470 /// A 32-bit integer value used to initialize bits [31:0] of the destination
3471 /// vector.
3472 /// \returns An initialized 128-bit vector of [4 x i32] containing the values
3473 /// provided in the operands.
3474 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2,
3475 int __i1, int __i0) {
3476 return __extension__(__m128i)(__v4si){__i0, __i1, __i2, __i3};
3479 /// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
3480 /// the specified 16-bit integer values.
3482 /// \headerfile <x86intrin.h>
3484 /// This intrinsic is a utility function and does not correspond to a specific
3485 /// instruction.
3487 /// \param __w7
3488 /// A 16-bit integer value used to initialize bits [127:112] of the
3489 /// destination vector.
3490 /// \param __w6
3491 /// A 16-bit integer value used to initialize bits [111:96] of the
3492 /// destination vector.
3493 /// \param __w5
3494 /// A 16-bit integer value used to initialize bits [95:80] of the destination
3495 /// vector.
3496 /// \param __w4
3497 /// A 16-bit integer value used to initialize bits [79:64] of the destination
3498 /// vector.
3499 /// \param __w3
3500 /// A 16-bit integer value used to initialize bits [63:48] of the destination
3501 /// vector.
3502 /// \param __w2
3503 /// A 16-bit integer value used to initialize bits [47:32] of the destination
3504 /// vector.
3505 /// \param __w1
3506 /// A 16-bit integer value used to initialize bits [31:16] of the destination
3507 /// vector.
3508 /// \param __w0
3509 /// A 16-bit integer value used to initialize bits [15:0] of the destination
3510 /// vector.
3511 /// \returns An initialized 128-bit vector of [8 x i16] containing the values
3512 /// provided in the operands.
3513 static __inline__ __m128i __DEFAULT_FN_ATTRS
3514 _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3,
3515 short __w2, short __w1, short __w0) {
3516 return __extension__(__m128i)(__v8hi){__w0, __w1, __w2, __w3,
3517 __w4, __w5, __w6, __w7};
3520 /// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
3521 /// the specified 8-bit integer values.
3523 /// \headerfile <x86intrin.h>
3525 /// This intrinsic is a utility function and does not correspond to a specific
3526 /// instruction.
3528 /// \param __b15
3529 /// Initializes bits [127:120] of the destination vector.
3530 /// \param __b14
3531 /// Initializes bits [119:112] of the destination vector.
3532 /// \param __b13
3533 /// Initializes bits [111:104] of the destination vector.
3534 /// \param __b12
3535 /// Initializes bits [103:96] of the destination vector.
3536 /// \param __b11
3537 /// Initializes bits [95:88] of the destination vector.
3538 /// \param __b10
3539 /// Initializes bits [87:80] of the destination vector.
3540 /// \param __b9
3541 /// Initializes bits [79:72] of the destination vector.
3542 /// \param __b8
3543 /// Initializes bits [71:64] of the destination vector.
3544 /// \param __b7
3545 /// Initializes bits [63:56] of the destination vector.
3546 /// \param __b6
3547 /// Initializes bits [55:48] of the destination vector.
3548 /// \param __b5
3549 /// Initializes bits [47:40] of the destination vector.
3550 /// \param __b4
3551 /// Initializes bits [39:32] of the destination vector.
3552 /// \param __b3
3553 /// Initializes bits [31:24] of the destination vector.
3554 /// \param __b2
3555 /// Initializes bits [23:16] of the destination vector.
3556 /// \param __b1
3557 /// Initializes bits [15:8] of the destination vector.
3558 /// \param __b0
3559 /// Initializes bits [7:0] of the destination vector.
3560 /// \returns An initialized 128-bit vector of [16 x i8] containing the values
3561 /// provided in the operands.
3562 static __inline__ __m128i __DEFAULT_FN_ATTRS
3563 _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11,
3564 char __b10, char __b9, char __b8, char __b7, char __b6, char __b5,
3565 char __b4, char __b3, char __b2, char __b1, char __b0) {
3566 return __extension__(__m128i)(__v16qi){
3567 __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7,
3568 __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15};
3571 /// Initializes both values in a 128-bit integer vector with the
3572 /// specified 64-bit integer value.
3574 /// \headerfile <x86intrin.h>
3576 /// This intrinsic is a utility function and does not correspond to a specific
3577 /// instruction.
3579 /// \param __q
3580 /// Integer value used to initialize the elements of the destination integer
3581 /// vector.
3582 /// \returns An initialized 128-bit integer vector of [2 x i64] with both
3583 /// elements containing the value provided in the operand.
3584 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q) {
3585 return _mm_set_epi64x(__q, __q);
3588 /// Initializes both values in a 128-bit vector of [2 x i64] with the
3589 /// specified 64-bit value.
3591 /// \headerfile <x86intrin.h>
3593 /// This intrinsic is a utility function and does not correspond to a specific
3594 /// instruction.
3596 /// \param __q
3597 /// A 64-bit value used to initialize the elements of the destination integer
3598 /// vector.
3599 /// \returns An initialized 128-bit vector of [2 x i64] with all elements
3600 /// containing the value provided in the operand.
3601 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q) {
3602 return _mm_set_epi64(__q, __q);
3605 /// Initializes all values in a 128-bit vector of [4 x i32] with the
3606 /// specified 32-bit value.
3608 /// \headerfile <x86intrin.h>
3610 /// This intrinsic is a utility function and does not correspond to a specific
3611 /// instruction.
3613 /// \param __i
3614 /// A 32-bit value used to initialize the elements of the destination integer
3615 /// vector.
3616 /// \returns An initialized 128-bit vector of [4 x i32] with all elements
3617 /// containing the value provided in the operand.
3618 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i) {
3619 return _mm_set_epi32(__i, __i, __i, __i);
3622 /// Initializes all values in a 128-bit vector of [8 x i16] with the
3623 /// specified 16-bit value.
3625 /// \headerfile <x86intrin.h>
3627 /// This intrinsic is a utility function and does not correspond to a specific
3628 /// instruction.
3630 /// \param __w
3631 /// A 16-bit value used to initialize the elements of the destination integer
3632 /// vector.
3633 /// \returns An initialized 128-bit vector of [8 x i16] with all elements
3634 /// containing the value provided in the operand.
3635 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w) {
3636 return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w);
3639 /// Initializes all values in a 128-bit vector of [16 x i8] with the
3640 /// specified 8-bit value.
3642 /// \headerfile <x86intrin.h>
3644 /// This intrinsic is a utility function and does not correspond to a specific
3645 /// instruction.
3647 /// \param __b
3648 /// An 8-bit value used to initialize the elements of the destination integer
3649 /// vector.
3650 /// \returns An initialized 128-bit vector of [16 x i8] with all elements
3651 /// containing the value provided in the operand.
3652 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b) {
3653 return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
3654 __b, __b, __b, __b, __b);
3657 /// Constructs a 128-bit integer vector, initialized in reverse order
3658 /// with the specified 64-bit integral values.
3660 /// \headerfile <x86intrin.h>
3662 /// This intrinsic does not correspond to a specific instruction.
3664 /// \param __q0
3665 /// A 64-bit integral value used to initialize the lower 64 bits of the
3666 /// result.
3667 /// \param __q1
3668 /// A 64-bit integral value used to initialize the upper 64 bits of the
3669 /// result.
3670 /// \returns An initialized 128-bit integer vector.
3671 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0,
3672 __m64 __q1) {
3673 return _mm_set_epi64(__q1, __q0);
3676 /// Constructs a 128-bit integer vector, initialized in reverse order
3677 /// with the specified 32-bit integral values.
3679 /// \headerfile <x86intrin.h>
3681 /// This intrinsic is a utility function and does not correspond to a specific
3682 /// instruction.
3684 /// \param __i0
3685 /// A 32-bit integral value used to initialize bits [31:0] of the result.
3686 /// \param __i1
3687 /// A 32-bit integral value used to initialize bits [63:32] of the result.
3688 /// \param __i2
3689 /// A 32-bit integral value used to initialize bits [95:64] of the result.
3690 /// \param __i3
3691 /// A 32-bit integral value used to initialize bits [127:96] of the result.
3692 /// \returns An initialized 128-bit integer vector.
3693 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1,
3694 int __i2,
3695 int __i3) {
3696 return _mm_set_epi32(__i3, __i2, __i1, __i0);
3699 /// Constructs a 128-bit integer vector, initialized in reverse order
3700 /// with the specified 16-bit integral values.
3702 /// \headerfile <x86intrin.h>
3704 /// This intrinsic is a utility function and does not correspond to a specific
3705 /// instruction.
3707 /// \param __w0
3708 /// A 16-bit integral value used to initialize bits [15:0] of the result.
3709 /// \param __w1
3710 /// A 16-bit integral value used to initialize bits [31:16] of the result.
3711 /// \param __w2
3712 /// A 16-bit integral value used to initialize bits [47:32] of the result.
3713 /// \param __w3
3714 /// A 16-bit integral value used to initialize bits [63:48] of the result.
3715 /// \param __w4
3716 /// A 16-bit integral value used to initialize bits [79:64] of the result.
3717 /// \param __w5
3718 /// A 16-bit integral value used to initialize bits [95:80] of the result.
3719 /// \param __w6
3720 /// A 16-bit integral value used to initialize bits [111:96] of the result.
3721 /// \param __w7
3722 /// A 16-bit integral value used to initialize bits [127:112] of the result.
3723 /// \returns An initialized 128-bit integer vector.
3724 static __inline__ __m128i __DEFAULT_FN_ATTRS
3725 _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4,
3726 short __w5, short __w6, short __w7) {
3727 return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0);
3730 /// Constructs a 128-bit integer vector, initialized in reverse order
3731 /// with the specified 8-bit integral values.
3733 /// \headerfile <x86intrin.h>
3735 /// This intrinsic is a utility function and does not correspond to a specific
3736 /// instruction.
3738 /// \param __b0
3739 /// An 8-bit integral value used to initialize bits [7:0] of the result.
3740 /// \param __b1
3741 /// An 8-bit integral value used to initialize bits [15:8] of the result.
3742 /// \param __b2
3743 /// An 8-bit integral value used to initialize bits [23:16] of the result.
3744 /// \param __b3
3745 /// An 8-bit integral value used to initialize bits [31:24] of the result.
3746 /// \param __b4
3747 /// An 8-bit integral value used to initialize bits [39:32] of the result.
3748 /// \param __b5
3749 /// An 8-bit integral value used to initialize bits [47:40] of the result.
3750 /// \param __b6
3751 /// An 8-bit integral value used to initialize bits [55:48] of the result.
3752 /// \param __b7
3753 /// An 8-bit integral value used to initialize bits [63:56] of the result.
3754 /// \param __b8
3755 /// An 8-bit integral value used to initialize bits [71:64] of the result.
3756 /// \param __b9
3757 /// An 8-bit integral value used to initialize bits [79:72] of the result.
3758 /// \param __b10
3759 /// An 8-bit integral value used to initialize bits [87:80] of the result.
3760 /// \param __b11
3761 /// An 8-bit integral value used to initialize bits [95:88] of the result.
3762 /// \param __b12
3763 /// An 8-bit integral value used to initialize bits [103:96] of the result.
3764 /// \param __b13
3765 /// An 8-bit integral value used to initialize bits [111:104] of the result.
3766 /// \param __b14
3767 /// An 8-bit integral value used to initialize bits [119:112] of the result.
3768 /// \param __b15
3769 /// An 8-bit integral value used to initialize bits [127:120] of the result.
3770 /// \returns An initialized 128-bit integer vector.
3771 static __inline__ __m128i __DEFAULT_FN_ATTRS
3772 _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
3773 char __b6, char __b7, char __b8, char __b9, char __b10,
3774 char __b11, char __b12, char __b13, char __b14, char __b15) {
3775 return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8,
3776 __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
3779 /// Creates a 128-bit integer vector initialized to zero.
3781 /// \headerfile <x86intrin.h>
3783 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
3785 /// \returns An initialized 128-bit integer vector with all elements set to
3786 /// zero.
3787 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void) {
3788 return __extension__(__m128i)(__v2di){0LL, 0LL};
3791 /// Stores a 128-bit integer vector to a memory location aligned on a
3792 /// 128-bit boundary.
3794 /// \headerfile <x86intrin.h>
3796 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
3798 /// \param __p
3799 /// A pointer to an aligned memory location that will receive the integer
3800 /// values.
3801 /// \param __b
3802 /// A 128-bit integer vector containing the values to be moved.
3803 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p,
3804 __m128i __b) {
3805 *__p = __b;
3808 /// Stores a 128-bit integer vector to an unaligned memory location.
3810 /// \headerfile <x86intrin.h>
3812 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
3814 /// \param __p
3815 /// A pointer to a memory location that will receive the integer values.
3816 /// \param __b
3817 /// A 128-bit integer vector containing the values to be moved.
3818 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p,
3819 __m128i __b) {
3820 struct __storeu_si128 {
3821 __m128i_u __v;
3822 } __attribute__((__packed__, __may_alias__));
3823 ((struct __storeu_si128 *)__p)->__v = __b;
3826 /// Stores a 64-bit integer value from the low element of a 128-bit integer
3827 /// vector.
3829 /// \headerfile <x86intrin.h>
3831 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3833 /// \param __p
3834 /// A pointer to a 64-bit memory location. The address of the memory
3835 /// location does not have to be aligned.
3836 /// \param __b
3837 /// A 128-bit integer vector containing the value to be stored.
3838 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si64(void *__p,
3839 __m128i __b) {
3840 struct __storeu_si64 {
3841 long long __v;
3842 } __attribute__((__packed__, __may_alias__));
3843 ((struct __storeu_si64 *)__p)->__v = ((__v2di)__b)[0];
3846 /// Stores a 32-bit integer value from the low element of a 128-bit integer
3847 /// vector.
3849 /// \headerfile <x86intrin.h>
3851 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3853 /// \param __p
3854 /// A pointer to a 32-bit memory location. The address of the memory
3855 /// location does not have to be aligned.
3856 /// \param __b
3857 /// A 128-bit integer vector containing the value to be stored.
3858 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si32(void *__p,
3859 __m128i __b) {
3860 struct __storeu_si32 {
3861 int __v;
3862 } __attribute__((__packed__, __may_alias__));
3863 ((struct __storeu_si32 *)__p)->__v = ((__v4si)__b)[0];
3866 /// Stores a 16-bit integer value from the low element of a 128-bit integer
3867 /// vector.
3869 /// \headerfile <x86intrin.h>
3871 /// This intrinsic does not correspond to a specific instruction.
3873 /// \param __p
3874 /// A pointer to a 16-bit memory location. The address of the memory
3875 /// location does not have to be aligned.
3876 /// \param __b
3877 /// A 128-bit integer vector containing the value to be stored.
3878 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si16(void *__p,
3879 __m128i __b) {
3880 struct __storeu_si16 {
3881 short __v;
3882 } __attribute__((__packed__, __may_alias__));
3883 ((struct __storeu_si16 *)__p)->__v = ((__v8hi)__b)[0];
3886 /// Moves bytes selected by the mask from the first operand to the
3887 /// specified unaligned memory location. When a mask bit is 1, the
3888 /// corresponding byte is written, otherwise it is not written.
3890 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
3891 /// used again soon). Exception and trap behavior for elements not selected
3892 /// for storage to memory are implementation dependent.
3894 /// \headerfile <x86intrin.h>
3896 /// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
3897 /// instruction.
3899 /// \param __d
3900 /// A 128-bit integer vector containing the values to be moved.
3901 /// \param __n
3902 /// A 128-bit integer vector containing the mask. The most significant bit of
3903 /// each byte represents the mask bits.
3904 /// \param __p
3905 /// A pointer to an unaligned 128-bit memory location where the specified
3906 /// values are moved.
3907 static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d,
3908 __m128i __n,
3909 char *__p) {
3910 __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
3913 /// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
3914 /// a memory location.
3916 /// \headerfile <x86intrin.h>
3918 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
3920 /// \param __p
3921 /// A pointer to a 64-bit memory location that will receive the lower 64 bits
3922 /// of the integer vector parameter.
3923 /// \param __a
3924 /// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
3925 /// value to be stored.
3926 static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p,
3927 __m128i __a) {
3928 struct __mm_storel_epi64_struct {
3929 long long __u;
3930 } __attribute__((__packed__, __may_alias__));
3931 ((struct __mm_storel_epi64_struct *)__p)->__u = __a[0];
3934 /// Stores a 128-bit floating point vector of [2 x double] to a 128-bit
3935 /// aligned memory location.
3937 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
3938 /// used again soon).
3940 /// \headerfile <x86intrin.h>
3942 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
3944 /// \param __p
3945 /// A pointer to the 128-bit aligned memory location used to store the value.
3946 /// \param __a
3947 /// A vector of [2 x double] containing the 64-bit values to be stored.
3948 static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(void *__p,
3949 __m128d __a) {
3950 __builtin_nontemporal_store((__v2df)__a, (__v2df *)__p);
3953 /// Stores a 128-bit integer vector to a 128-bit aligned memory location.
3955 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
3956 /// used again soon).
3958 /// \headerfile <x86intrin.h>
3960 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
3962 /// \param __p
3963 /// A pointer to the 128-bit aligned memory location used to store the value.
3964 /// \param __a
3965 /// A 128-bit integer vector containing the values to be stored.
3966 static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(void *__p,
3967 __m128i __a) {
3968 __builtin_nontemporal_store((__v2di)__a, (__v2di *)__p);
3971 /// Stores a 32-bit integer value in the specified memory location.
3973 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
3974 /// used again soon).
3976 /// \headerfile <x86intrin.h>
3978 /// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
3980 /// \param __p
3981 /// A pointer to the 32-bit memory location used to store the value.
3982 /// \param __a
3983 /// A 32-bit integer containing the value to be stored.
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
    _mm_stream_si32(void *__p, int __a) {
  __builtin_ia32_movnti((int *)__p, __a);
}
3990 #ifdef __x86_64__
3991 /// Stores a 64-bit integer value in the specified memory location.
3993 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
3994 /// used again soon).
3996 /// \headerfile <x86intrin.h>
3998 /// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
4000 /// \param __p
4001 /// A pointer to the 64-bit memory location used to store the value.
4002 /// \param __a
4003 /// A 64-bit integer containing the value to be stored.
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
    _mm_stream_si64(void *__p, long long __a) {
  __builtin_ia32_movnti64((long long *)__p, __a);
}
4009 #endif
4011 #if defined(__cplusplus)
4012 extern "C" {
4013 #endif
4015 /// The cache line containing \a __p is flushed and invalidated from all
4016 /// caches in the coherency domain.
4018 /// \headerfile <x86intrin.h>
4020 /// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
4022 /// \param __p
4023 /// A pointer to the memory location used to identify the cache line to be
4024 /// flushed.
4025 void _mm_clflush(void const *__p);
4027 /// Forces strong memory ordering (serialization) between load
4028 /// instructions preceding this instruction and load instructions following
4029 /// this instruction, ensuring the system completes all previous loads before
4030 /// executing subsequent loads.
4032 /// \headerfile <x86intrin.h>
4034 /// This intrinsic corresponds to the <c> LFENCE </c> instruction.
4036 void _mm_lfence(void);
4038 /// Forces strong memory ordering (serialization) between load and store
4039 /// instructions preceding this instruction and load and store instructions
4040 /// following this instruction, ensuring that the system completes all
4041 /// previous memory accesses before executing subsequent memory accesses.
4043 /// \headerfile <x86intrin.h>
4045 /// This intrinsic corresponds to the <c> MFENCE </c> instruction.
4047 void _mm_mfence(void);
4049 #if defined(__cplusplus)
4050 } // extern "C"
4051 #endif
4053 /// Converts 16-bit signed integers from both 128-bit integer vector
4054 /// operands into 8-bit signed integers, and packs the results into the
4055 /// destination. Positive values greater than 0x7F are saturated to 0x7F.
4056 /// Negative values less than 0x80 are saturated to 0x80.
4058 /// \headerfile <x86intrin.h>
4060 /// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
4062 /// \param __a
4063 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4064 /// a signed integer and is converted to a 8-bit signed integer with
4065 /// saturation. Values greater than 0x7F are saturated to 0x7F. Values less
4066 /// than 0x80 are saturated to 0x80. The converted [8 x i8] values are
4067 /// written to the lower 64 bits of the result.
4068 /// \param __b
4069 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4070 /// a signed integer and is converted to a 8-bit signed integer with
4071 /// saturation. Values greater than 0x7F are saturated to 0x7F. Values less
4072 /// than 0x80 are saturated to 0x80. The converted [8 x i8] values are
4073 /// written to the higher 64 bits of the result.
4074 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
4075 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a,
4076 __m128i __b) {
4077 return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
4080 /// Converts 32-bit signed integers from both 128-bit integer vector
4081 /// operands into 16-bit signed integers, and packs the results into the
4082 /// destination. Positive values greater than 0x7FFF are saturated to 0x7FFF.
4083 /// Negative values less than 0x8000 are saturated to 0x8000.
4085 /// \headerfile <x86intrin.h>
4087 /// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
4089 /// \param __a
4090 /// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
4091 /// a signed integer and is converted to a 16-bit signed integer with
4092 /// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
4093 /// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
4094 /// are written to the lower 64 bits of the result.
4095 /// \param __b
4096 /// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
4097 /// a signed integer and is converted to a 16-bit signed integer with
4098 /// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
4099 /// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
4100 /// are written to the higher 64 bits of the result.
4101 /// \returns A 128-bit vector of [8 x i16] containing the converted values.
4102 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a,
4103 __m128i __b) {
4104 return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
4107 /// Converts 16-bit signed integers from both 128-bit integer vector
4108 /// operands into 8-bit unsigned integers, and packs the results into the
4109 /// destination. Values greater than 0xFF are saturated to 0xFF. Values less
4110 /// than 0x00 are saturated to 0x00.
4112 /// \headerfile <x86intrin.h>
4114 /// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
4116 /// \param __a
4117 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4118 /// a signed integer and is converted to an 8-bit unsigned integer with
4119 /// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
4120 /// than 0x00 are saturated to 0x00. The converted [8 x i8] values are
4121 /// written to the lower 64 bits of the result.
4122 /// \param __b
4123 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4124 /// a signed integer and is converted to an 8-bit unsigned integer with
4125 /// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
4126 /// than 0x00 are saturated to 0x00. The converted [8 x i8] values are
4127 /// written to the higher 64 bits of the result.
4128 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
4129 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a,
4130 __m128i __b) {
4131 return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
4134 /// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
4135 /// the immediate-value parameter as a selector.
4137 /// \headerfile <x86intrin.h>
4139 /// \code
4140 /// __m128i _mm_extract_epi16(__m128i a, const int imm);
4141 /// \endcode
4143 /// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
4145 /// \param a
4146 /// A 128-bit integer vector.
4147 /// \param imm
///    An immediate value. Bits [2:0] select values from \a a to be assigned
///    to bits[15:0] of the result. \n
4150 /// 000: assign values from bits [15:0] of \a a. \n
4151 /// 001: assign values from bits [31:16] of \a a. \n
4152 /// 010: assign values from bits [47:32] of \a a. \n
4153 /// 011: assign values from bits [63:48] of \a a. \n
4154 /// 100: assign values from bits [79:64] of \a a. \n
4155 /// 101: assign values from bits [95:80] of \a a. \n
4156 /// 110: assign values from bits [111:96] of \a a. \n
4157 /// 111: assign values from bits [127:112] of \a a.
4158 /// \returns An integer, whose lower 16 bits are selected from the 128-bit
4159 /// integer vector parameter and the remaining bits are assigned zeros.
#define _mm_extract_epi16(a, imm)                                              \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a),      \
                                                    (int)(imm)))
4164 /// Constructs a 128-bit integer vector by first making a copy of the
4165 /// 128-bit integer vector parameter, and then inserting the lower 16 bits
4166 /// of an integer parameter into an offset specified by the immediate-value
4167 /// parameter.
4169 /// \headerfile <x86intrin.h>
4171 /// \code
4172 /// __m128i _mm_insert_epi16(__m128i a, int b, const int imm);
4173 /// \endcode
4175 /// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
4177 /// \param a
4178 /// A 128-bit integer vector of [8 x i16]. This vector is copied to the
4179 /// result and then one of the eight elements in the result is replaced by
4180 /// the lower 16 bits of \a b.
4181 /// \param b
4182 /// An integer. The lower 16 bits of this parameter are written to the
4183 /// result beginning at an offset specified by \a imm.
4184 /// \param imm
///    An immediate value selecting which of the eight 16-bit elements of the
///    result is replaced by the lower 16 bits of \a b.
4187 /// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi16(a, b, imm)                                            \
  ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b),        \
                                        (int)(imm)))
4192 /// Copies the values of the most significant bits from each 8-bit
4193 /// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
4194 /// value, zero-extends the value, and writes it to the destination.
4196 /// \headerfile <x86intrin.h>
4198 /// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
4200 /// \param __a
4201 /// A 128-bit integer vector containing the values with bits to be extracted.
4202 /// \returns The most significant bits from each 8-bit element in \a __a,
4203 /// written to bits [15:0]. The other bits are assigned zeros.
4204 static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a) {
4205 return __builtin_ia32_pmovmskb128((__v16qi)__a);
4208 /// Constructs a 128-bit integer vector by shuffling four 32-bit
4209 /// elements of a 128-bit integer vector parameter, using the immediate-value
4210 /// parameter as a specifier.
4212 /// \headerfile <x86intrin.h>
4214 /// \code
4215 /// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
4216 /// \endcode
4218 /// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
4220 /// \param a
4221 /// A 128-bit integer vector containing the values to be copied.
4222 /// \param imm
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a. The destinations within the 128-bit destination are
///    assigned
4225 /// values as follows: \n
4226 /// Bits [1:0] are used to assign values to bits [31:0] of the result. \n
4227 /// Bits [3:2] are used to assign values to bits [63:32] of the result. \n
4228 /// Bits [5:4] are used to assign values to bits [95:64] of the result. \n
4229 /// Bits [7:6] are used to assign values to bits [127:96] of the result. \n
4230 /// Bit value assignments: \n
4231 /// 00: assign values from bits [31:0] of \a a. \n
4232 /// 01: assign values from bits [63:32] of \a a. \n
4233 /// 10: assign values from bits [95:64] of \a a. \n
4234 /// 11: assign values from bits [127:96] of \a a. \n
4235 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4236 /// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4237 /// <c>[b6, b4, b2, b0]</c>.
4238 /// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shuffle_epi32(a, imm)                                              \
  ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))
4242 /// Constructs a 128-bit integer vector by shuffling four lower 16-bit
4243 /// elements of a 128-bit integer vector of [8 x i16], using the immediate
4244 /// value parameter as a specifier.
4246 /// \headerfile <x86intrin.h>
4248 /// \code
4249 /// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
4250 /// \endcode
4252 /// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
4254 /// \param a
4255 /// A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
4256 /// [127:64] of the result.
4257 /// \param imm
4258 /// An 8-bit immediate value specifying which elements to copy from \a a. \n
4259 /// Bits[1:0] are used to assign values to bits [15:0] of the result. \n
4260 /// Bits[3:2] are used to assign values to bits [31:16] of the result. \n
4261 /// Bits[5:4] are used to assign values to bits [47:32] of the result. \n
4262 /// Bits[7:6] are used to assign values to bits [63:48] of the result. \n
4263 /// Bit value assignments: \n
4264 /// 00: assign values from bits [15:0] of \a a. \n
4265 /// 01: assign values from bits [31:16] of \a a. \n
4266 /// 10: assign values from bits [47:32] of \a a. \n
4267 /// 11: assign values from bits [63:48] of \a a. \n
4268 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4269 /// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4270 /// <c>[b6, b4, b2, b0]</c>.
4271 /// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflelo_epi16(a, imm)                                            \
  ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))
4275 /// Constructs a 128-bit integer vector by shuffling four upper 16-bit
4276 /// elements of a 128-bit integer vector of [8 x i16], using the immediate
4277 /// value parameter as a specifier.
4279 /// \headerfile <x86intrin.h>
4281 /// \code
4282 /// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
4283 /// \endcode
4285 /// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
4287 /// \param a
4288 /// A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
4289 /// [63:0] of the result.
4290 /// \param imm
4291 /// An 8-bit immediate value specifying which elements to copy from \a a. \n
4292 /// Bits[1:0] are used to assign values to bits [79:64] of the result. \n
4293 /// Bits[3:2] are used to assign values to bits [95:80] of the result. \n
4294 /// Bits[5:4] are used to assign values to bits [111:96] of the result. \n
4295 /// Bits[7:6] are used to assign values to bits [127:112] of the result. \n
4296 /// Bit value assignments: \n
4297 /// 00: assign values from bits [79:64] of \a a. \n
4298 /// 01: assign values from bits [95:80] of \a a. \n
4299 /// 10: assign values from bits [111:96] of \a a. \n
4300 /// 11: assign values from bits [127:112] of \a a. \n
4301 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4302 /// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4303 /// <c>[b6, b4, b2, b0]</c>.
4304 /// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflehi_epi16(a, imm)                                            \
  ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))
4308 /// Unpacks the high-order (index 8-15) values from two 128-bit vectors
4309 /// of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4311 /// \headerfile <x86intrin.h>
4313 /// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
4314 /// instruction.
4316 /// \param __a
4317 /// A 128-bit vector of [16 x i8].
4318 /// Bits [71:64] are written to bits [7:0] of the result. \n
4319 /// Bits [79:72] are written to bits [23:16] of the result. \n
4320 /// Bits [87:80] are written to bits [39:32] of the result. \n
4321 /// Bits [95:88] are written to bits [55:48] of the result. \n
4322 /// Bits [103:96] are written to bits [71:64] of the result. \n
4323 /// Bits [111:104] are written to bits [87:80] of the result. \n
4324 /// Bits [119:112] are written to bits [103:96] of the result. \n
4325 /// Bits [127:120] are written to bits [119:112] of the result.
4326 /// \param __b
4327 /// A 128-bit vector of [16 x i8]. \n
4328 /// Bits [71:64] are written to bits [15:8] of the result. \n
4329 /// Bits [79:72] are written to bits [31:24] of the result. \n
4330 /// Bits [87:80] are written to bits [47:40] of the result. \n
4331 /// Bits [95:88] are written to bits [63:56] of the result. \n
4332 /// Bits [103:96] are written to bits [79:72] of the result. \n
4333 /// Bits [111:104] are written to bits [95:88] of the result. \n
4334 /// Bits [119:112] are written to bits [111:104] of the result. \n
4335 /// Bits [127:120] are written to bits [127:120] of the result.
4336 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4337 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a,
4338 __m128i __b) {
4339 return (__m128i)__builtin_shufflevector(
4340 (__v16qi)__a, (__v16qi)__b, 8, 16 + 8, 9, 16 + 9, 10, 16 + 10, 11,
4341 16 + 11, 12, 16 + 12, 13, 16 + 13, 14, 16 + 14, 15, 16 + 15);
4344 /// Unpacks the high-order (index 4-7) values from two 128-bit vectors of
4345 /// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
4347 /// \headerfile <x86intrin.h>
4349 /// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
4350 /// instruction.
4352 /// \param __a
4353 /// A 128-bit vector of [8 x i16].
4354 /// Bits [79:64] are written to bits [15:0] of the result. \n
4355 /// Bits [95:80] are written to bits [47:32] of the result. \n
4356 /// Bits [111:96] are written to bits [79:64] of the result. \n
4357 /// Bits [127:112] are written to bits [111:96] of the result.
4358 /// \param __b
4359 /// A 128-bit vector of [8 x i16].
4360 /// Bits [79:64] are written to bits [31:16] of the result. \n
4361 /// Bits [95:80] are written to bits [63:48] of the result. \n
4362 /// Bits [111:96] are written to bits [95:80] of the result. \n
4363 /// Bits [127:112] are written to bits [127:112] of the result.
4364 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4365 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a,
4366 __m128i __b) {
4367 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8 + 4, 5,
4368 8 + 5, 6, 8 + 6, 7, 8 + 7);
4371 /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
4372 /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4374 /// \headerfile <x86intrin.h>
4376 /// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
4377 /// instruction.
4379 /// \param __a
4380 /// A 128-bit vector of [4 x i32]. \n
4381 /// Bits [95:64] are written to bits [31:0] of the destination. \n
4382 /// Bits [127:96] are written to bits [95:64] of the destination.
4383 /// \param __b
4384 /// A 128-bit vector of [4 x i32]. \n
///    Bits [95:64] are written to bits [63:32] of the destination. \n
4386 /// Bits [127:96] are written to bits [127:96] of the destination.
4387 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4388 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a,
4389 __m128i __b) {
4390 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4 + 2, 3,
4391 4 + 3);
4394 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4395 /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4397 /// \headerfile <x86intrin.h>
4399 /// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
4400 /// instruction.
4402 /// \param __a
4403 /// A 128-bit vector of [2 x i64]. \n
4404 /// Bits [127:64] are written to bits [63:0] of the destination.
4405 /// \param __b
4406 /// A 128-bit vector of [2 x i64]. \n
4407 /// Bits [127:64] are written to bits [127:64] of the destination.
4408 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4409 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a,
4410 __m128i __b) {
4411 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2 + 1);
4414 /// Unpacks the low-order (index 0-7) values from two 128-bit vectors of
4415 /// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4417 /// \headerfile <x86intrin.h>
4419 /// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
4420 /// instruction.
4422 /// \param __a
4423 /// A 128-bit vector of [16 x i8]. \n
4424 /// Bits [7:0] are written to bits [7:0] of the result. \n
4425 /// Bits [15:8] are written to bits [23:16] of the result. \n
4426 /// Bits [23:16] are written to bits [39:32] of the result. \n
4427 /// Bits [31:24] are written to bits [55:48] of the result. \n
4428 /// Bits [39:32] are written to bits [71:64] of the result. \n
4429 /// Bits [47:40] are written to bits [87:80] of the result. \n
4430 /// Bits [55:48] are written to bits [103:96] of the result. \n
4431 /// Bits [63:56] are written to bits [119:112] of the result.
4432 /// \param __b
4433 /// A 128-bit vector of [16 x i8].
4434 /// Bits [7:0] are written to bits [15:8] of the result. \n
4435 /// Bits [15:8] are written to bits [31:24] of the result. \n
4436 /// Bits [23:16] are written to bits [47:40] of the result. \n
4437 /// Bits [31:24] are written to bits [63:56] of the result. \n
4438 /// Bits [39:32] are written to bits [79:72] of the result. \n
4439 /// Bits [47:40] are written to bits [95:88] of the result. \n
4440 /// Bits [55:48] are written to bits [111:104] of the result. \n
4441 /// Bits [63:56] are written to bits [127:120] of the result.
4442 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4443 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a,
4444 __m128i __b) {
4445 return (__m128i)__builtin_shufflevector(
4446 (__v16qi)__a, (__v16qi)__b, 0, 16 + 0, 1, 16 + 1, 2, 16 + 2, 3, 16 + 3, 4,
4447 16 + 4, 5, 16 + 5, 6, 16 + 6, 7, 16 + 7);
4450 /// Unpacks the low-order (index 0-3) values from each of the two 128-bit
4451 /// vectors of [8 x i16] and interleaves them into a 128-bit vector of
4452 /// [8 x i16].
4454 /// \headerfile <x86intrin.h>
4456 /// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
4457 /// instruction.
4459 /// \param __a
4460 /// A 128-bit vector of [8 x i16].
4461 /// Bits [15:0] are written to bits [15:0] of the result. \n
4462 /// Bits [31:16] are written to bits [47:32] of the result. \n
4463 /// Bits [47:32] are written to bits [79:64] of the result. \n
4464 /// Bits [63:48] are written to bits [111:96] of the result.
4465 /// \param __b
4466 /// A 128-bit vector of [8 x i16].
4467 /// Bits [15:0] are written to bits [31:16] of the result. \n
4468 /// Bits [31:16] are written to bits [63:48] of the result. \n
4469 /// Bits [47:32] are written to bits [95:80] of the result. \n
4470 /// Bits [63:48] are written to bits [127:112] of the result.
4471 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4472 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a,
4473 __m128i __b) {
4474 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8 + 0, 1,
4475 8 + 1, 2, 8 + 2, 3, 8 + 3);
4478 /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
4479 /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4481 /// \headerfile <x86intrin.h>
4483 /// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
4484 /// instruction.
4486 /// \param __a
4487 /// A 128-bit vector of [4 x i32]. \n
4488 /// Bits [31:0] are written to bits [31:0] of the destination. \n
4489 /// Bits [63:32] are written to bits [95:64] of the destination.
4490 /// \param __b
4491 /// A 128-bit vector of [4 x i32]. \n
///    Bits [31:0] are written to bits [63:32] of the destination. \n
4493 /// Bits [63:32] are written to bits [127:96] of the destination.
4494 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4495 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a,
4496 __m128i __b) {
4497 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4 + 0, 1,
4498 4 + 1);
4501 /// Unpacks the low-order 64-bit elements from two 128-bit vectors of
4502 /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4504 /// \headerfile <x86intrin.h>
4506 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
4507 /// instruction.
4509 /// \param __a
4510 /// A 128-bit vector of [2 x i64]. \n
4511 /// Bits [63:0] are written to bits [63:0] of the destination. \n
4512 /// \param __b
4513 /// A 128-bit vector of [2 x i64]. \n
4514 /// Bits [63:0] are written to bits [127:64] of the destination. \n
4515 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4516 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a,
4517 __m128i __b) {
4518 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2 + 0);
4521 /// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
4522 /// integer.
4524 /// \headerfile <x86intrin.h>
4526 /// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
4528 /// \param __a
4529 /// A 128-bit integer vector operand. The lower 64 bits are moved to the
4530 /// destination.
4531 /// \returns A 64-bit integer containing the lower 64 bits of the parameter.
4532 static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a) {
4533 return (__m64)__a[0];
4536 /// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
4537 /// upper bits.
4539 /// \headerfile <x86intrin.h>
4541 /// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
4543 /// \param __a
4544 /// A 64-bit value.
4545 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4546 /// the operand. The upper 64 bits are assigned zeros.
4547 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a) {
4548 return __extension__(__m128i)(__v2di){(long long)__a, 0};
4551 /// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
4552 /// integer vector, zeroing the upper bits.
4554 /// \headerfile <x86intrin.h>
4556 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
4558 /// \param __a
4559 /// A 128-bit integer vector operand. The lower 64 bits are moved to the
4560 /// destination.
4561 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4562 /// the operand. The upper 64 bits are assigned zeros.
4563 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a) {
4564 return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2);
4567 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4568 /// [2 x double] and interleaves them into a 128-bit vector of [2 x
4569 /// double].
4571 /// \headerfile <x86intrin.h>
4573 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
4575 /// \param __a
4576 /// A 128-bit vector of [2 x double]. \n
4577 /// Bits [127:64] are written to bits [63:0] of the destination.
4578 /// \param __b
4579 /// A 128-bit vector of [2 x double]. \n
4580 /// Bits [127:64] are written to bits [127:64] of the destination.
4581 /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4582 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a,
4583 __m128d __b) {
4584 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2 + 1);
4587 /// Unpacks the low-order 64-bit elements from two 128-bit vectors
4588 /// of [2 x double] and interleaves them into a 128-bit vector of [2 x
4589 /// double].
4591 /// \headerfile <x86intrin.h>
4593 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
4595 /// \param __a
4596 /// A 128-bit vector of [2 x double]. \n
4597 /// Bits [63:0] are written to bits [63:0] of the destination.
4598 /// \param __b
4599 /// A 128-bit vector of [2 x double]. \n
4600 /// Bits [63:0] are written to bits [127:64] of the destination.
4601 /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4602 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a,
4603 __m128d __b) {
4604 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2 + 0);
4607 /// Extracts the sign bits of the double-precision values in the 128-bit
4608 /// vector of [2 x double], zero-extends the value, and writes it to the
4609 /// low-order bits of the destination.
4611 /// \headerfile <x86intrin.h>
4613 /// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
4615 /// \param __a
4616 /// A 128-bit vector of [2 x double] containing the values with sign bits to
4617 /// be extracted.
4618 /// \returns The sign bits from each of the double-precision elements in \a __a,
4619 /// written to bits [1:0]. The remaining bits are assigned values of zero.
4620 static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a) {
4621 return __builtin_ia32_movmskpd((__v2df)__a);
4624 /// Constructs a 128-bit floating-point vector of [2 x double] from two
4625 /// 128-bit vector parameters of [2 x double], using the immediate-value
4626 /// parameter as a specifier.
4628 /// \headerfile <x86intrin.h>
4630 /// \code
4631 /// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
4632 /// \endcode
4634 /// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
4636 /// \param a
4637 /// A 128-bit vector of [2 x double].
4638 /// \param b
4639 /// A 128-bit vector of [2 x double].
4640 /// \param i
4641 /// An 8-bit immediate value. The least significant two bits specify which
4642 /// elements to copy from \a a and \a b: \n
4643 /// Bit[0] = 0: lower element of \a a copied to lower element of result. \n
4644 /// Bit[0] = 1: upper element of \a a copied to lower element of result. \n
4645 /// Bit[1] = 0: lower element of \a b copied to upper element of result. \n
4646 /// Bit[1] = 1: upper element of \a b copied to upper element of result. \n
4647 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE2 macro.
4648 /// <c>_MM_SHUFFLE2(b1, b0)</c> can create a 2-bit mask of the form
4649 /// <c>[b1, b0]</c>.
4650 /// \returns A 128-bit vector of [2 x double] containing the shuffled values.
#define _mm_shuffle_pd(a, b, i)                                                \
  ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b),  \
                                  (int)(i)))
4655 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4656 /// floating-point vector of [4 x float].
4658 /// \headerfile <x86intrin.h>
4660 /// This intrinsic has no corresponding instruction.
4662 /// \param __a
4663 /// A 128-bit floating-point vector of [2 x double].
4664 /// \returns A 128-bit floating-point vector of [4 x float] containing the same
4665 /// bitwise pattern as the parameter.
4666 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a) {
4667 return (__m128)__a;
4670 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4671 /// integer vector.
4673 /// \headerfile <x86intrin.h>
4675 /// This intrinsic has no corresponding instruction.
4677 /// \param __a
4678 /// A 128-bit floating-point vector of [2 x double].
4679 /// \returns A 128-bit integer vector containing the same bitwise pattern as the
4680 /// parameter.
4681 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a) {
4682 return (__m128i)__a;
4685 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4686 /// floating-point vector of [2 x double].
4688 /// \headerfile <x86intrin.h>
4690 /// This intrinsic has no corresponding instruction.
4692 /// \param __a
4693 /// A 128-bit floating-point vector of [4 x float].
4694 /// \returns A 128-bit floating-point vector of [2 x double] containing the same
4695 /// bitwise pattern as the parameter.
4696 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a) {
4697 return (__m128d)__a;
4700 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4701 /// integer vector.
4703 /// \headerfile <x86intrin.h>
4705 /// This intrinsic has no corresponding instruction.
4707 /// \param __a
4708 /// A 128-bit floating-point vector of [4 x float].
4709 /// \returns A 128-bit integer vector containing the same bitwise pattern as the
4710 /// parameter.
4711 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a) {
4712 return (__m128i)__a;
4715 /// Casts a 128-bit integer vector into a 128-bit floating-point vector
4716 /// of [4 x float].
4718 /// \headerfile <x86intrin.h>
4720 /// This intrinsic has no corresponding instruction.
4722 /// \param __a
4723 /// A 128-bit integer vector.
4724 /// \returns A 128-bit floating-point vector of [4 x float] containing the same
4725 /// bitwise pattern as the parameter.
4726 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a) {
4727 return (__m128)__a;
4730 /// Casts a 128-bit integer vector into a 128-bit floating-point vector
4731 /// of [2 x double].
4733 /// \headerfile <x86intrin.h>
4735 /// This intrinsic has no corresponding instruction.
4737 /// \param __a
4738 /// A 128-bit integer vector.
4739 /// \returns A 128-bit floating-point vector of [2 x double] containing the same
4740 /// bitwise pattern as the parameter.
4741 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a) {
4742 return (__m128d)__a;
#ifdef __cplusplus
extern "C" {
#endif

/// Signals that the caller is executing a spin loop, so that power
///    consumption can be optimized for the duration of the loop.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PAUSE </c> instruction.
///
void _mm_pause(void);

#ifdef __cplusplus
} // extern "C"
#endif
/* The function-attribute helper macros are internal to this header; remove
 * them so they are not visible to code that includes it. */
#undef __DEFAULT_FN_ATTRS_MMX
#undef __DEFAULT_FN_ATTRS
/* Builds the 2-bit selector [x, y] (x in bit 1, y in bit 0), suitable as the
 * immediate operand of _mm_shuffle_pd. */
#define _MM_SHUFFLE2(x, y) ((y) | ((x) << 1))
/* Bit mask (bit 6) of the denormals-zero field within the control/status
 * value handled by _mm_getcsr() / _mm_setcsr(). */
#define _MM_DENORMALS_ZERO_MASK (0x0040U)

/* Values of that field: mode enabled / mode disabled. */
#define _MM_DENORMALS_ZERO_ON (0x0040U)
#define _MM_DENORMALS_ZERO_OFF (0x0000U)
/* Yields the denormals-zero field of the value returned by _mm_getcsr(); the
 * result equals _MM_DENORMALS_ZERO_ON or _MM_DENORMALS_ZERO_OFF. */
#define _MM_GET_DENORMALS_ZERO_MODE() \
  (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)

/* Clears the denormals-zero field in the value from _mm_getcsr(), ORs in x,
 * and writes the result back with _mm_setcsr(). All other bits are carried
 * over unchanged. x is not masked, so pass _MM_DENORMALS_ZERO_ON or
 * _MM_DENORMALS_ZERO_OFF. */
#define _MM_SET_DENORMALS_ZERO_MODE(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
4775 #endif /* __EMMINTRIN_H */