[docs] Fix build-docs.sh
[llvm-project.git] / clang / lib / Headers / emmintrin.h
bloba3f56e832b32d44c714beda379a4c55c4ebca8c9
1 /*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 *===-----------------------------------------------------------------------===
8 */
10 #ifndef __EMMINTRIN_H
11 #define __EMMINTRIN_H
13 #if !defined(__i386__) && !defined(__x86_64__)
14 #error "This header is only meant to be used on x86 and x64 architecture"
15 #endif
17 #include <xmmintrin.h>
/* Public 128-bit vector types: 16 bytes wide and 16-byte aligned. */
19 typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
20 typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));
/* Same 16-byte vectors, but declared with __aligned__(1), i.e. no alignment
 * requirement beyond a single byte. */
22 typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));
23 typedef long long __m128i_u
24 __attribute__((__vector_size__(16), __aligned__(1)));
26 /* Type defines: element-typed 16-byte vectors with no explicit alignment
 * attribute. The intrinsics below cast their __m128d operands to these
 * (e.g. __v2df, __v2du) to select the lane type an operation works on. */
27 typedef double __v2df __attribute__((__vector_size__(16)));
28 typedef long long __v2di __attribute__((__vector_size__(16)));
29 typedef short __v8hi __attribute__((__vector_size__(16)));
30 typedef char __v16qi __attribute__((__vector_size__(16)));
32 /* Unsigned types */
33 typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
34 typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
35 typedef unsigned char __v16qu __attribute__((__vector_size__(16)));
37 /* We need an explicitly signed variant for char. Note that this shouldn't
38 * appear in the interface though. */
39 typedef signed char __v16qs __attribute__((__vector_size__(16)));
41 /* Define the default attributes for the functions in this file:
 * always inlined, no debug info, compiled for the "sse2" target feature, and
 * a minimum vector width of 128 bits. */
42 #define __DEFAULT_FN_ATTRS \
43 __attribute__((__always_inline__, __nodebug__, __target__("sse2"), \
44 __min_vector_width__(128)))
/* Variant for functions that also need the "mmx" target feature; its minimum
 * vector width is 64 bits. */
45 #define __DEFAULT_FN_ATTRS_MMX \
46 __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse2"), \
47 __min_vector_width__(64)))
49 /// Adds lower double-precision values in both operands and returns the
50 /// sum in the lower 64 bits of the result. The upper 64 bits of the result
51 /// are copied from the upper double-precision value of the first operand.
52 ///
53 /// \headerfile <x86intrin.h>
54 ///
55 /// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
56 ///
57 /// \param __a
58 /// A 128-bit vector of [2 x double] containing one of the source operands.
59 /// \param __b
60 /// A 128-bit vector of [2 x double] containing one of the source operands.
61 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
62 /// sum of the lower 64 bits of both operands. The upper 64 bits are copied
63 /// from the upper 64 bits of the first source operand.
64 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a,
65 __m128d __b) {
66 __a[0] += __b[0];
67 return __a;
70 /// Adds two 128-bit vectors of [2 x double].
71 ///
72 /// \headerfile <x86intrin.h>
73 ///
74 /// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
75 ///
76 /// \param __a
77 /// A 128-bit vector of [2 x double] containing one of the source operands.
78 /// \param __b
79 /// A 128-bit vector of [2 x double] containing one of the source operands.
80 /// \returns A 128-bit vector of [2 x double] containing the sums of both
81 /// operands.
82 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a,
83 __m128d __b) {
84 return (__m128d)((__v2df)__a + (__v2df)__b);
87 /// Subtracts the lower double-precision value of the second operand
88 /// from the lower double-precision value of the first operand and returns
89 /// the difference in the lower 64 bits of the result. The upper 64 bits of
90 /// the result are copied from the upper double-precision value of the first
91 /// operand.
92 ///
93 /// \headerfile <x86intrin.h>
94 ///
95 /// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
96 ///
97 /// \param __a
98 /// A 128-bit vector of [2 x double] containing the minuend.
99 /// \param __b
100 /// A 128-bit vector of [2 x double] containing the subtrahend.
101 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
102 /// difference of the lower 64 bits of both operands. The upper 64 bits are
103 /// copied from the upper 64 bits of the first source operand.
104 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a,
105 __m128d __b) {
106 __a[0] -= __b[0];
107 return __a;
110 /// Subtracts two 128-bit vectors of [2 x double].
112 /// \headerfile <x86intrin.h>
114 /// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
116 /// \param __a
117 /// A 128-bit vector of [2 x double] containing the minuend.
118 /// \param __b
119 /// A 128-bit vector of [2 x double] containing the subtrahend.
120 /// \returns A 128-bit vector of [2 x double] containing the differences between
121 /// both operands.
122 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a,
123 __m128d __b) {
124 return (__m128d)((__v2df)__a - (__v2df)__b);
127 /// Multiplies lower double-precision values in both operands and returns
128 /// the product in the lower 64 bits of the result. The upper 64 bits of the
129 /// result are copied from the upper double-precision value of the first
130 /// operand.
132 /// \headerfile <x86intrin.h>
134 /// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
136 /// \param __a
137 /// A 128-bit vector of [2 x double] containing one of the source operands.
138 /// \param __b
139 /// A 128-bit vector of [2 x double] containing one of the source operands.
140 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
141 /// product of the lower 64 bits of both operands. The upper 64 bits are
142 /// copied from the upper 64 bits of the first source operand.
143 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a,
144 __m128d __b) {
145 __a[0] *= __b[0];
146 return __a;
149 /// Multiplies two 128-bit vectors of [2 x double].
151 /// \headerfile <x86intrin.h>
153 /// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
155 /// \param __a
156 /// A 128-bit vector of [2 x double] containing one of the operands.
157 /// \param __b
158 /// A 128-bit vector of [2 x double] containing one of the operands.
159 /// \returns A 128-bit vector of [2 x double] containing the products of both
160 /// operands.
161 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a,
162 __m128d __b) {
163 return (__m128d)((__v2df)__a * (__v2df)__b);
166 /// Divides the lower double-precision value of the first operand by the
167 /// lower double-precision value of the second operand and returns the
168 /// quotient in the lower 64 bits of the result. The upper 64 bits of the
169 /// result are copied from the upper double-precision value of the first
170 /// operand.
172 /// \headerfile <x86intrin.h>
174 /// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
176 /// \param __a
177 /// A 128-bit vector of [2 x double] containing the dividend.
178 /// \param __b
179 /// A 128-bit vector of [2 x double] containing the divisor.
180 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
181 /// quotient of the lower 64 bits of both operands. The upper 64 bits are
182 /// copied from the upper 64 bits of the first source operand.
183 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a,
184 __m128d __b) {
185 __a[0] /= __b[0];
186 return __a;
189 /// Performs an element-by-element division of two 128-bit vectors of
190 /// [2 x double].
192 /// \headerfile <x86intrin.h>
194 /// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
196 /// \param __a
197 /// A 128-bit vector of [2 x double] containing the dividend.
198 /// \param __b
199 /// A 128-bit vector of [2 x double] containing the divisor.
200 /// \returns A 128-bit vector of [2 x double] containing the quotients of both
201 /// operands.
202 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a,
203 __m128d __b) {
204 return (__m128d)((__v2df)__a / (__v2df)__b);
207 /// Calculates the square root of the lower double-precision value of
208 /// the second operand and returns it in the lower 64 bits of the result.
209 /// The upper 64 bits of the result are copied from the upper
210 /// double-precision value of the first operand.
212 /// \headerfile <x86intrin.h>
214 /// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
216 /// \param __a
217 /// A 128-bit vector of [2 x double] containing one of the operands. The
218 /// upper 64 bits of this operand are copied to the upper 64 bits of the
219 /// result.
220 /// \param __b
221 /// A 128-bit vector of [2 x double] containing one of the operands. The
222 /// square root is calculated using the lower 64 bits of this operand.
223 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
224 /// square root of the lower 64 bits of operand \a __b, and whose upper 64
225 /// bits are copied from the upper 64 bits of operand \a __a.
226 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
227 __m128d __b) {
228 __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
229 return __extension__(__m128d){__c[0], __a[1]};
232 /// Calculates the square root of each of the two values stored in a
233 /// 128-bit vector of [2 x double].
235 /// \headerfile <x86intrin.h>
237 /// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
239 /// \param __a
240 /// A 128-bit vector of [2 x double].
241 /// \returns A 128-bit vector of [2 x double] containing the square roots of the
242 /// values in the operand.
243 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) {
244 return __builtin_ia32_sqrtpd((__v2df)__a);
247 /// Compares lower 64-bit double-precision values of both operands, and
248 /// returns the lesser of the pair of values in the lower 64-bits of the
249 /// result. The upper 64 bits of the result are copied from the upper
250 /// double-precision value of the first operand.
252 /// \headerfile <x86intrin.h>
254 /// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
256 /// \param __a
257 /// A 128-bit vector of [2 x double] containing one of the operands. The
258 /// lower 64 bits of this operand are used in the comparison.
259 /// \param __b
260 /// A 128-bit vector of [2 x double] containing one of the operands. The
261 /// lower 64 bits of this operand are used in the comparison.
262 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
263 /// minimum value between both operands. The upper 64 bits are copied from
264 /// the upper 64 bits of the first source operand.
265 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a,
266 __m128d __b) {
267 return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
270 /// Performs element-by-element comparison of the two 128-bit vectors of
271 /// [2 x double] and returns the vector containing the lesser of each pair of
272 /// values.
274 /// \headerfile <x86intrin.h>
276 /// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
278 /// \param __a
279 /// A 128-bit vector of [2 x double] containing one of the operands.
280 /// \param __b
281 /// A 128-bit vector of [2 x double] containing one of the operands.
282 /// \returns A 128-bit vector of [2 x double] containing the minimum values
283 /// between both operands.
284 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a,
285 __m128d __b) {
286 return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
289 /// Compares lower 64-bit double-precision values of both operands, and
290 /// returns the greater of the pair of values in the lower 64-bits of the
291 /// result. The upper 64 bits of the result are copied from the upper
292 /// double-precision value of the first operand.
294 /// \headerfile <x86intrin.h>
296 /// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
298 /// \param __a
299 /// A 128-bit vector of [2 x double] containing one of the operands. The
300 /// lower 64 bits of this operand are used in the comparison.
301 /// \param __b
302 /// A 128-bit vector of [2 x double] containing one of the operands. The
303 /// lower 64 bits of this operand are used in the comparison.
304 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
305 /// maximum value between both operands. The upper 64 bits are copied from
306 /// the upper 64 bits of the first source operand.
307 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a,
308 __m128d __b) {
309 return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
312 /// Performs element-by-element comparison of the two 128-bit vectors of
313 /// [2 x double] and returns the vector containing the greater of each pair
314 /// of values.
316 /// \headerfile <x86intrin.h>
318 /// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
320 /// \param __a
321 /// A 128-bit vector of [2 x double] containing one of the operands.
322 /// \param __b
323 /// A 128-bit vector of [2 x double] containing one of the operands.
324 /// \returns A 128-bit vector of [2 x double] containing the maximum values
325 /// between both operands.
326 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a,
327 __m128d __b) {
328 return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
331 /// Performs a bitwise AND of two 128-bit vectors of [2 x double].
333 /// \headerfile <x86intrin.h>
335 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
337 /// \param __a
338 /// A 128-bit vector of [2 x double] containing one of the source operands.
339 /// \param __b
340 /// A 128-bit vector of [2 x double] containing one of the source operands.
341 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
342 /// values between both operands.
343 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a,
344 __m128d __b) {
345 return (__m128d)((__v2du)__a & (__v2du)__b);
348 /// Performs a bitwise AND of two 128-bit vectors of [2 x double], using
349 /// the one's complement of the values contained in the first source operand.
351 /// \headerfile <x86intrin.h>
353 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
355 /// \param __a
356 /// A 128-bit vector of [2 x double] containing the left source operand. The
357 /// one's complement of this value is used in the bitwise AND.
358 /// \param __b
359 /// A 128-bit vector of [2 x double] containing the right source operand.
360 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
361 /// values in the second operand and the one's complement of the first
362 /// operand.
363 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a,
364 __m128d __b) {
365 return (__m128d)(~(__v2du)__a & (__v2du)__b);
368 /// Performs a bitwise OR of two 128-bit vectors of [2 x double].
370 /// \headerfile <x86intrin.h>
372 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
374 /// \param __a
375 /// A 128-bit vector of [2 x double] containing one of the source operands.
376 /// \param __b
377 /// A 128-bit vector of [2 x double] containing one of the source operands.
378 /// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
379 /// values between both operands.
380 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a,
381 __m128d __b) {
382 return (__m128d)((__v2du)__a | (__v2du)__b);
385 /// Performs a bitwise XOR of two 128-bit vectors of [2 x double].
387 /// \headerfile <x86intrin.h>
389 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
391 /// \param __a
392 /// A 128-bit vector of [2 x double] containing one of the source operands.
393 /// \param __b
394 /// A 128-bit vector of [2 x double] containing one of the source operands.
395 /// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
396 /// values between both operands.
397 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a,
398 __m128d __b) {
399 return (__m128d)((__v2du)__a ^ (__v2du)__b);
402 /// Compares each of the corresponding double-precision values of the
403 /// 128-bit vectors of [2 x double] for equality. Each comparison yields 0x0
404 /// for false, 0xFFFFFFFFFFFFFFFF for true.
406 /// \headerfile <x86intrin.h>
408 /// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
410 /// \param __a
411 /// A 128-bit vector of [2 x double].
412 /// \param __b
413 /// A 128-bit vector of [2 x double].
414 /// \returns A 128-bit vector containing the comparison results.
415 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a,
416 __m128d __b) {
417 return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
420 /// Compares each of the corresponding double-precision values of the
421 /// 128-bit vectors of [2 x double] to determine if the values in the first
422 /// operand are less than those in the second operand. Each comparison
423 /// yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
425 /// \headerfile <x86intrin.h>
427 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
429 /// \param __a
430 /// A 128-bit vector of [2 x double].
431 /// \param __b
432 /// A 128-bit vector of [2 x double].
433 /// \returns A 128-bit vector containing the comparison results.
434 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a,
435 __m128d __b) {
436 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
439 /// Compares each of the corresponding double-precision values of the
440 /// 128-bit vectors of [2 x double] to determine if the values in the first
441 /// operand are less than or equal to those in the second operand.
443 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
445 /// \headerfile <x86intrin.h>
447 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
449 /// \param __a
450 /// A 128-bit vector of [2 x double].
451 /// \param __b
452 /// A 128-bit vector of [2 x double].
453 /// \returns A 128-bit vector containing the comparison results.
454 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a,
455 __m128d __b) {
456 return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
459 /// Compares each of the corresponding double-precision values of the
460 /// 128-bit vectors of [2 x double] to determine if the values in the first
461 /// operand are greater than those in the second operand.
463 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
465 /// \headerfile <x86intrin.h>
467 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
469 /// \param __a
470 /// A 128-bit vector of [2 x double].
471 /// \param __b
472 /// A 128-bit vector of [2 x double].
473 /// \returns A 128-bit vector containing the comparison results.
474 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a,
475 __m128d __b) {
476 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
479 /// Compares each of the corresponding double-precision values of the
480 /// 128-bit vectors of [2 x double] to determine if the values in the first
481 /// operand are greater than or equal to those in the second operand.
483 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
485 /// \headerfile <x86intrin.h>
487 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
489 /// \param __a
490 /// A 128-bit vector of [2 x double].
491 /// \param __b
492 /// A 128-bit vector of [2 x double].
493 /// \returns A 128-bit vector containing the comparison results.
494 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a,
495 __m128d __b) {
496 return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
499 /// Compares each of the corresponding double-precision values of the
500 /// 128-bit vectors of [2 x double] to determine if the values in the first
501 /// operand are ordered with respect to those in the second operand.
503 /// A pair of double-precision values are "ordered" with respect to each
504 /// other if neither value is a NaN. Each comparison yields 0x0 for false,
505 /// 0xFFFFFFFFFFFFFFFF for true.
507 /// \headerfile <x86intrin.h>
509 /// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
511 /// \param __a
512 /// A 128-bit vector of [2 x double].
513 /// \param __b
514 /// A 128-bit vector of [2 x double].
515 /// \returns A 128-bit vector containing the comparison results.
516 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a,
517 __m128d __b) {
518 return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
521 /// Compares each of the corresponding double-precision values of the
522 /// 128-bit vectors of [2 x double] to determine if the values in the first
523 /// operand are unordered with respect to those in the second operand.
525 /// A pair of double-precision values are "unordered" with respect to each
526 /// other if one or both values are NaN. Each comparison yields 0x0 for
527 /// false, 0xFFFFFFFFFFFFFFFF for true.
529 /// \headerfile <x86intrin.h>
531 /// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
532 /// instruction.
534 /// \param __a
535 /// A 128-bit vector of [2 x double].
536 /// \param __b
537 /// A 128-bit vector of [2 x double].
538 /// \returns A 128-bit vector containing the comparison results.
539 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a,
540 __m128d __b) {
541 return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
544 /// Compares each of the corresponding double-precision values of the
545 /// 128-bit vectors of [2 x double] to determine if the values in the first
546 /// operand are unequal to those in the second operand.
548 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
550 /// \headerfile <x86intrin.h>
552 /// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
554 /// \param __a
555 /// A 128-bit vector of [2 x double].
556 /// \param __b
557 /// A 128-bit vector of [2 x double].
558 /// \returns A 128-bit vector containing the comparison results.
559 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a,
560 __m128d __b) {
561 return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
564 /// Compares each of the corresponding double-precision values of the
565 /// 128-bit vectors of [2 x double] to determine if the values in the first
566 /// operand are not less than those in the second operand.
568 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
570 /// \headerfile <x86intrin.h>
572 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
574 /// \param __a
575 /// A 128-bit vector of [2 x double].
576 /// \param __b
577 /// A 128-bit vector of [2 x double].
578 /// \returns A 128-bit vector containing the comparison results.
579 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a,
580 __m128d __b) {
581 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
584 /// Compares each of the corresponding double-precision values of the
585 /// 128-bit vectors of [2 x double] to determine if the values in the first
586 /// operand are not less than or equal to those in the second operand.
588 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
590 /// \headerfile <x86intrin.h>
592 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
594 /// \param __a
595 /// A 128-bit vector of [2 x double].
596 /// \param __b
597 /// A 128-bit vector of [2 x double].
598 /// \returns A 128-bit vector containing the comparison results.
599 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a,
600 __m128d __b) {
601 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
604 /// Compares each of the corresponding double-precision values of the
605 /// 128-bit vectors of [2 x double] to determine if the values in the first
606 /// operand are not greater than those in the second operand.
608 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
610 /// \headerfile <x86intrin.h>
612 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
614 /// \param __a
615 /// A 128-bit vector of [2 x double].
616 /// \param __b
617 /// A 128-bit vector of [2 x double].
618 /// \returns A 128-bit vector containing the comparison results.
619 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a,
620 __m128d __b) {
621 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
624 /// Compares each of the corresponding double-precision values of the
625 /// 128-bit vectors of [2 x double] to determine if the values in the first
626 /// operand are not greater than or equal to those in the second operand.
628 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
630 /// \headerfile <x86intrin.h>
632 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
634 /// \param __a
635 /// A 128-bit vector of [2 x double].
636 /// \param __b
637 /// A 128-bit vector of [2 x double].
638 /// \returns A 128-bit vector containing the comparison results.
639 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a,
640 __m128d __b) {
641 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
644 /// Compares the lower double-precision floating-point values in each of
645 /// the two 128-bit floating-point vectors of [2 x double] for equality.
647 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
649 /// \headerfile <x86intrin.h>
651 /// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
653 /// \param __a
654 /// A 128-bit vector of [2 x double]. The lower double-precision value is
655 /// compared to the lower double-precision value of \a __b.
656 /// \param __b
657 /// A 128-bit vector of [2 x double]. The lower double-precision value is
658 /// compared to the lower double-precision value of \a __a.
659 /// \returns A 128-bit vector. The lower 64 bits contain the comparison
660 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
661 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a,
662 __m128d __b) {
663 return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
666 /// Compares the lower double-precision floating-point values in each of
667 /// the two 128-bit floating-point vectors of [2 x double] to determine if
668 /// the value in the first parameter is less than the corresponding value in
669 /// the second parameter.
671 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
673 /// \headerfile <x86intrin.h>
675 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
677 /// \param __a
678 /// A 128-bit vector of [2 x double]. The lower double-precision value is
679 /// compared to the lower double-precision value of \a __b.
680 /// \param __b
681 /// A 128-bit vector of [2 x double]. The lower double-precision value is
682 /// compared to the lower double-precision value of \a __a.
683 /// \returns A 128-bit vector. The lower 64 bits contain the comparison
684 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
685 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a,
686 __m128d __b) {
687 return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
690 /// Compares the lower double-precision floating-point values in each of
691 /// the two 128-bit floating-point vectors of [2 x double] to determine if
692 /// the value in the first parameter is less than or equal to the
693 /// corresponding value in the second parameter.
695 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
697 /// \headerfile <x86intrin.h>
699 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
701 /// \param __a
702 /// A 128-bit vector of [2 x double]. The lower double-precision value is
703 /// compared to the lower double-precision value of \a __b.
704 /// \param __b
705 /// A 128-bit vector of [2 x double]. The lower double-precision value is
706 /// compared to the lower double-precision value of \a __a.
707 /// \returns A 128-bit vector. The lower 64 bits contain the comparison
708 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
709 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a,
710 __m128d __b) {
711 return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
714 /// Compares the lower double-precision floating-point values in each of
715 /// the two 128-bit floating-point vectors of [2 x double] to determine if
716 /// the value in the first parameter is greater than the corresponding value
717 /// in the second parameter.
719 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
721 /// \headerfile <x86intrin.h>
723 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
725 /// \param __a
726 /// A 128-bit vector of [2 x double]. The lower double-precision value is
727 /// compared to the lower double-precision value of \a __b.
728 /// \param __b
729 /// A 128-bit vector of [2 x double]. The lower double-precision value is
730 /// compared to the lower double-precision value of \a __a.
731 /// \returns A 128-bit vector. The lower 64 bits contain the comparison
732 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
733 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a,
734 __m128d __b) {
735 __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
736 return __extension__(__m128d){__c[0], __a[1]};
739 /// Compares the lower double-precision floating-point values in each of
740 /// the two 128-bit floating-point vectors of [2 x double] to determine if
741 /// the value in the first parameter is greater than or equal to the
742 /// corresponding value in the second parameter.
744 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
746 /// \headerfile <x86intrin.h>
748 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
750 /// \param __a
751 /// A 128-bit vector of [2 x double]. The lower double-precision value is
752 /// compared to the lower double-precision value of \a __b.
753 /// \param __b
754 /// A 128-bit vector of [2 x double]. The lower double-precision value is
755 /// compared to the lower double-precision value of \a __a.
756 /// \returns A 128-bit vector. The lower 64 bits contain the comparison
757 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
758 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a,
759 __m128d __b) {
760 __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
761 return __extension__(__m128d){__c[0], __a[1]};
764 /// Compares the lower double-precision floating-point values in each of
765 /// the two 128-bit floating-point vectors of [2 x double] to determine if
766 /// the value in the first parameter is "ordered" with respect to the
767 /// corresponding value in the second parameter.
769 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
770 /// of double-precision values are "ordered" with respect to each other if
771 /// neither value is a NaN.
773 /// \headerfile <x86intrin.h>
775 /// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
777 /// \param __a
778 /// A 128-bit vector of [2 x double]. The lower double-precision value is
779 /// compared to the lower double-precision value of \a __b.
780 /// \param __b
781 /// A 128-bit vector of [2 x double]. The lower double-precision value is
782 /// compared to the lower double-precision value of \a __a.
783 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
784 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
785 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a,
786 __m128d __b) {
787 return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
790 /// Compares the lower double-precision floating-point values in each of
791 /// the two 128-bit floating-point vectors of [2 x double] to determine if
792 /// the value in the first parameter is "unordered" with respect to the
793 /// corresponding value in the second parameter.
795 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
796 /// of double-precision values are "unordered" with respect to each other if
797 /// one or both values are NaN.
799 /// \headerfile <x86intrin.h>
801 /// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
802 /// instruction.
804 /// \param __a
805 /// A 128-bit vector of [2 x double]. The lower double-precision value is
806 /// compared to the lower double-precision value of \a __b.
807 /// \param __b
808 /// A 128-bit vector of [2 x double]. The lower double-precision value is
809 /// compared to the lower double-precision value of \a __a.
810 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
811 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
812 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a,
813 __m128d __b) {
814 return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
817 /// Compares the lower double-precision floating-point values in each of
818 /// the two 128-bit floating-point vectors of [2 x double] to determine if
819 /// the value in the first parameter is unequal to the corresponding value in
820 /// the second parameter.
822 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
824 /// \headerfile <x86intrin.h>
826 /// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
828 /// \param __a
829 /// A 128-bit vector of [2 x double]. The lower double-precision value is
830 /// compared to the lower double-precision value of \a __b.
831 /// \param __b
832 /// A 128-bit vector of [2 x double]. The lower double-precision value is
833 /// compared to the lower double-precision value of \a __a.
834 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
835 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
836 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a,
837 __m128d __b) {
838 return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
841 /// Compares the lower double-precision floating-point values in each of
842 /// the two 128-bit floating-point vectors of [2 x double] to determine if
843 /// the value in the first parameter is not less than the corresponding
844 /// value in the second parameter.
846 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
848 /// \headerfile <x86intrin.h>
850 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
852 /// \param __a
853 /// A 128-bit vector of [2 x double]. The lower double-precision value is
854 /// compared to the lower double-precision value of \a __b.
855 /// \param __b
856 /// A 128-bit vector of [2 x double]. The lower double-precision value is
857 /// compared to the lower double-precision value of \a __a.
858 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
859 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
860 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a,
861 __m128d __b) {
862 return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
865 /// Compares the lower double-precision floating-point values in each of
866 /// the two 128-bit floating-point vectors of [2 x double] to determine if
867 /// the value in the first parameter is not less than or equal to the
868 /// corresponding value in the second parameter.
870 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
872 /// \headerfile <x86intrin.h>
874 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
876 /// \param __a
877 /// A 128-bit vector of [2 x double]. The lower double-precision value is
878 /// compared to the lower double-precision value of \a __b.
879 /// \param __b
880 /// A 128-bit vector of [2 x double]. The lower double-precision value is
881 /// compared to the lower double-precision value of \a __a.
882 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
883 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
884 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a,
885 __m128d __b) {
886 return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
889 /// Compares the lower double-precision floating-point values in each of
890 /// the two 128-bit floating-point vectors of [2 x double] to determine if
891 /// the value in the first parameter is not greater than the corresponding
892 /// value in the second parameter.
894 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
896 /// \headerfile <x86intrin.h>
898 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
900 /// \param __a
901 /// A 128-bit vector of [2 x double]. The lower double-precision value is
902 /// compared to the lower double-precision value of \a __b.
903 /// \param __b
904 /// A 128-bit vector of [2 x double]. The lower double-precision value is
905 /// compared to the lower double-precision value of \a __a.
906 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
907 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
908 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a,
909 __m128d __b) {
910 __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
911 return __extension__(__m128d){__c[0], __a[1]};
914 /// Compares the lower double-precision floating-point values in each of
915 /// the two 128-bit floating-point vectors of [2 x double] to determine if
916 /// the value in the first parameter is not greater than or equal to the
917 /// corresponding value in the second parameter.
919 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
921 /// \headerfile <x86intrin.h>
923 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
925 /// \param __a
926 /// A 128-bit vector of [2 x double]. The lower double-precision value is
927 /// compared to the lower double-precision value of \a __b.
928 /// \param __b
929 /// A 128-bit vector of [2 x double]. The lower double-precision value is
930 /// compared to the lower double-precision value of \a __a.
931 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
932 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
933 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a,
934 __m128d __b) {
935 __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
936 return __extension__(__m128d){__c[0], __a[1]};
939 /// Compares the lower double-precision floating-point values in each of
940 /// the two 128-bit floating-point vectors of [2 x double] for equality.
942 /// The comparison yields 0 for false, 1 for true. If either of the two
943 /// lower double-precision values is NaN, 0 is returned.
945 /// \headerfile <x86intrin.h>
947 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
949 /// \param __a
950 /// A 128-bit vector of [2 x double]. The lower double-precision value is
951 /// compared to the lower double-precision value of \a __b.
952 /// \param __b
953 /// A 128-bit vector of [2 x double]. The lower double-precision value is
954 /// compared to the lower double-precision value of \a __a.
955 /// \returns An integer containing the comparison results. If either of the two
956 /// lower double-precision values is NaN, 0 is returned.
957 static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a,
958 __m128d __b) {
959 return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
962 /// Compares the lower double-precision floating-point values in each of
963 /// the two 128-bit floating-point vectors of [2 x double] to determine if
964 /// the value in the first parameter is less than the corresponding value in
965 /// the second parameter.
967 /// The comparison yields 0 for false, 1 for true. If either of the two
968 /// lower double-precision values is NaN, 0 is returned.
970 /// \headerfile <x86intrin.h>
972 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
974 /// \param __a
975 /// A 128-bit vector of [2 x double]. The lower double-precision value is
976 /// compared to the lower double-precision value of \a __b.
977 /// \param __b
978 /// A 128-bit vector of [2 x double]. The lower double-precision value is
979 /// compared to the lower double-precision value of \a __a.
980 /// \returns An integer containing the comparison results. If either of the two
981 /// lower double-precision values is NaN, 0 is returned.
982 static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a,
983 __m128d __b) {
984 return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
987 /// Compares the lower double-precision floating-point values in each of
988 /// the two 128-bit floating-point vectors of [2 x double] to determine if
989 /// the value in the first parameter is less than or equal to the
990 /// corresponding value in the second parameter.
992 /// The comparison yields 0 for false, 1 for true. If either of the two
993 /// lower double-precision values is NaN, 0 is returned.
995 /// \headerfile <x86intrin.h>
997 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
999 /// \param __a
1000 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1001 /// compared to the lower double-precision value of \a __b.
1002 /// \param __b
1003 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1004 /// compared to the lower double-precision value of \a __a.
1005 /// \returns An integer containing the comparison results. If either of the two
1006 /// lower double-precision values is NaN, 0 is returned.
1007 static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a,
1008 __m128d __b) {
1009 return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
1012 /// Compares the lower double-precision floating-point values in each of
1013 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1014 /// the value in the first parameter is greater than the corresponding value
1015 /// in the second parameter.
1017 /// The comparison yields 0 for false, 1 for true. If either of the two
1018 /// lower double-precision values is NaN, 0 is returned.
1020 /// \headerfile <x86intrin.h>
1022 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1024 /// \param __a
1025 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1026 /// compared to the lower double-precision value of \a __b.
1027 /// \param __b
1028 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1029 /// compared to the lower double-precision value of \a __a.
1030 /// \returns An integer containing the comparison results. If either of the two
1031 /// lower double-precision values is NaN, 0 is returned.
1032 static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a,
1033 __m128d __b) {
1034 return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
1037 /// Compares the lower double-precision floating-point values in each of
1038 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1039 /// the value in the first parameter is greater than or equal to the
1040 /// corresponding value in the second parameter.
1042 /// The comparison yields 0 for false, 1 for true. If either of the two
1043 /// lower double-precision values is NaN, 0 is returned.
1045 /// \headerfile <x86intrin.h>
1047 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1049 /// \param __a
1050 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1051 /// compared to the lower double-precision value of \a __b.
1052 /// \param __b
1053 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1054 /// compared to the lower double-precision value of \a __a.
1055 /// \returns An integer containing the comparison results. If either of the two
1056 /// lower double-precision values is NaN, 0 is returned.
1057 static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a,
1058 __m128d __b) {
1059 return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
1062 /// Compares the lower double-precision floating-point values in each of
1063 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1064 /// the value in the first parameter is unequal to the corresponding value in
1065 /// the second parameter.
1067 /// The comparison yields 0 for false, 1 for true. If either of the two
1068 /// lower double-precision values is NaN, 1 is returned.
1070 /// \headerfile <x86intrin.h>
1072 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1074 /// \param __a
1075 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1076 /// compared to the lower double-precision value of \a __b.
1077 /// \param __b
1078 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1079 /// compared to the lower double-precision value of \a __a.
1080 /// \returns An integer containing the comparison results. If either of the two
1081 /// lower double-precision values is NaN, 1 is returned.
1082 static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a,
1083 __m128d __b) {
1084 return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
1087 /// Compares the lower double-precision floating-point values in each of
1088 /// the two 128-bit floating-point vectors of [2 x double] for equality. The
1089 /// comparison yields 0 for false, 1 for true.
1091 /// If either of the two lower double-precision values is NaN, 0 is returned.
1093 /// \headerfile <x86intrin.h>
1095 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1097 /// \param __a
1098 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1099 /// compared to the lower double-precision value of \a __b.
1100 /// \param __b
1101 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1102 /// compared to the lower double-precision value of \a __a.
1103 /// \returns An integer containing the comparison results. If either of the two
1104 /// lower double-precision values is NaN, 0 is returned.
1105 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a,
1106 __m128d __b) {
1107 return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
1110 /// Compares the lower double-precision floating-point values in each of
1111 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1112 /// the value in the first parameter is less than the corresponding value in
1113 /// the second parameter.
1115 /// The comparison yields 0 for false, 1 for true. If either of the two lower
1116 /// double-precision values is NaN, 0 is returned.
1118 /// \headerfile <x86intrin.h>
1120 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1122 /// \param __a
1123 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1124 /// compared to the lower double-precision value of \a __b.
1125 /// \param __b
1126 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1127 /// compared to the lower double-precision value of \a __a.
1128 /// \returns An integer containing the comparison results. If either of the two
1129 /// lower double-precision values is NaN, 0 is returned.
1130 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a,
1131 __m128d __b) {
1132 return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
1135 /// Compares the lower double-precision floating-point values in each of
1136 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1137 /// the value in the first parameter is less than or equal to the
1138 /// corresponding value in the second parameter.
1140 /// The comparison yields 0 for false, 1 for true. If either of the two lower
1141 /// double-precision values is NaN, 0 is returned.
1143 /// \headerfile <x86intrin.h>
1145 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1147 /// \param __a
1148 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1149 /// compared to the lower double-precision value of \a __b.
1150 /// \param __b
1151 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1152 /// compared to the lower double-precision value of \a __a.
1153 /// \returns An integer containing the comparison results. If either of the two
1154 /// lower double-precision values is NaN, 0 is returned.
1155 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a,
1156 __m128d __b) {
1157 return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
1160 /// Compares the lower double-precision floating-point values in each of
1161 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1162 /// the value in the first parameter is greater than the corresponding value
1163 /// in the second parameter.
1165 /// The comparison yields 0 for false, 1 for true. If either of the two lower
1166 /// double-precision values is NaN, 0 is returned.
1168 /// \headerfile <x86intrin.h>
1170 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1172 /// \param __a
1173 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1174 /// compared to the lower double-precision value of \a __b.
1175 /// \param __b
1176 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1177 /// compared to the lower double-precision value of \a __a.
1178 /// \returns An integer containing the comparison results. If either of the two
1179 /// lower double-precision values is NaN, 0 is returned.
1180 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a,
1181 __m128d __b) {
1182 return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
1185 /// Compares the lower double-precision floating-point values in each of
1186 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1187 /// the value in the first parameter is greater than or equal to the
1188 /// corresponding value in the second parameter.
1190 /// The comparison yields 0 for false, 1 for true. If either of the two
1191 /// lower double-precision values is NaN, 0 is returned.
1193 /// \headerfile <x86intrin.h>
1195 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1197 /// \param __a
1198 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1199 /// compared to the lower double-precision value of \a __b.
1200 /// \param __b
1201 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1202 /// compared to the lower double-precision value of \a __a.
1203 /// \returns An integer containing the comparison results. If either of the two
1204 /// lower double-precision values is NaN, 0 is returned.
1205 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a,
1206 __m128d __b) {
1207 return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
1210 /// Compares the lower double-precision floating-point values in each of
1211 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1212 /// the value in the first parameter is unequal to the corresponding value in
1213 /// the second parameter.
1215 /// The comparison yields 0 for false, 1 for true. If either of the two lower
1216 /// double-precision values is NaN, 1 is returned.
1218 /// \headerfile <x86intrin.h>
1220 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1222 /// \param __a
1223 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1224 /// compared to the lower double-precision value of \a __b.
1225 /// \param __b
1226 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1227 /// compared to the lower double-precision value of \a __a.
1228 /// \returns An integer containing the comparison result. If either of the two
1229 /// lower double-precision values is NaN, 1 is returned.
1230 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a,
1231 __m128d __b) {
1232 return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
1235 /// Converts the two double-precision floating-point elements of a
1236 /// 128-bit vector of [2 x double] into two single-precision floating-point
1237 /// values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
1238 /// The upper 64 bits of the result vector are set to zero.
1240 /// \headerfile <x86intrin.h>
1242 /// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
1244 /// \param __a
1245 /// A 128-bit vector of [2 x double].
1246 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1247 /// converted values. The upper 64 bits are set to zero.
1248 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a) {
1249 return __builtin_ia32_cvtpd2ps((__v2df)__a);
1252 /// Converts the lower two single-precision floating-point elements of a
1253 /// 128-bit vector of [4 x float] into two double-precision floating-point
1254 /// values, returned in a 128-bit vector of [2 x double]. The upper two
1255 /// elements of the input vector are unused.
1257 /// \headerfile <x86intrin.h>
1259 /// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
1261 /// \param __a
1262 /// A 128-bit vector of [4 x float]. The lower two single-precision
1263 /// floating-point elements are converted to double-precision values. The
1264 /// upper two elements are unused.
1265 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1266 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a) {
1267 return (__m128d) __builtin_convertvector(
1268 __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
1271 /// Converts the lower two integer elements of a 128-bit vector of
1272 /// [4 x i32] into two double-precision floating-point values, returned in a
1273 /// 128-bit vector of [2 x double].
1275 /// The upper two elements of the input vector are unused.
1277 /// \headerfile <x86intrin.h>
1279 /// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
1281 /// \param __a
1282 /// A 128-bit integer vector of [4 x i32]. The lower two integer elements are
1283 /// converted to double-precision values.
1285 /// The upper two elements are unused.
1286 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1287 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a) {
1288 return (__m128d) __builtin_convertvector(
1289 __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
1292 /// Converts the two double-precision floating-point elements of a
1293 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1294 /// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
1295 /// 64 bits of the result vector are set to zero.
1297 /// \headerfile <x86intrin.h>
1299 /// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
1301 /// \param __a
1302 /// A 128-bit vector of [2 x double].
1303 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1304 /// converted values. The upper 64 bits are set to zero.
1305 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a) {
1306 return __builtin_ia32_cvtpd2dq((__v2df)__a);
1309 /// Converts the low-order element of a 128-bit vector of [2 x double]
1310 /// into a 32-bit signed integer value.
1312 /// \headerfile <x86intrin.h>
1314 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
1316 /// \param __a
1317 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1318 /// conversion.
1319 /// \returns A 32-bit signed integer containing the converted value.
1320 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a) {
1321 return __builtin_ia32_cvtsd2si((__v2df)__a);
1324 /// Converts the lower double-precision floating-point element of a
1325 /// 128-bit vector of [2 x double], in the second parameter, into a
1326 /// single-precision floating-point value, returned in the lower 32 bits of a
1327 /// 128-bit vector of [4 x float]. The upper 96 bits of the result vector are
1328 /// copied from the upper 96 bits of the first parameter.
1330 /// \headerfile <x86intrin.h>
1332 /// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
1334 /// \param __a
1335 /// A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
1336 /// copied to the upper 96 bits of the result.
1337 /// \param __b
1338 /// A 128-bit vector of [2 x double]. The lower double-precision
1339 /// floating-point element is used in the conversion.
1340 /// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
1341 /// converted value from the second parameter. The upper 96 bits are copied
1342 /// from the upper 96 bits of the first parameter.
1343 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a,
1344 __m128d __b) {
1345 return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
1348 /// Converts a 32-bit signed integer value, in the second parameter, into
1349 /// a double-precision floating-point value, returned in the lower 64 bits of
1350 /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1351 /// are copied from the upper 64 bits of the first parameter.
1353 /// \headerfile <x86intrin.h>
1355 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
1357 /// \param __a
1358 /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1359 /// copied to the upper 64 bits of the result.
1360 /// \param __b
1361 /// A 32-bit signed integer containing the value to be converted.
1362 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1363 /// converted value from the second parameter. The upper 64 bits are copied
1364 /// from the upper 64 bits of the first parameter.
1365 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a,
1366 int __b) {
1367 __a[0] = __b;
1368 return __a;
1371 /// Converts the lower single-precision floating-point element of a
1372 /// 128-bit vector of [4 x float], in the second parameter, into a
1373 /// double-precision floating-point value, returned in the lower 64 bits of
1374 /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1375 /// are copied from the upper 64 bits of the first parameter.
1377 /// \headerfile <x86intrin.h>
1379 /// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
1381 /// \param __a
1382 /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1383 /// copied to the upper 64 bits of the result.
1384 /// \param __b
1385 /// A 128-bit vector of [4 x float]. The lower single-precision
1386 /// floating-point element is used in the conversion.
1387 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1388 /// converted value from the second parameter. The upper 64 bits are copied
1389 /// from the upper 64 bits of the first parameter.
1390 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a,
1391 __m128 __b) {
1392 __a[0] = __b[0];
1393 return __a;
1396 /// Converts the two double-precision floating-point elements of a
1397 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1398 /// returned in the lower 64 bits of a 128-bit vector of [4 x i32].
1400 /// If the result of either conversion is inexact, the result is truncated
1401 /// (rounded towards zero) regardless of the current MXCSR setting. The upper
1402 /// 64 bits of the result vector are set to zero.
1404 /// \headerfile <x86intrin.h>
1406 /// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
1407 /// instruction.
1409 /// \param __a
1410 /// A 128-bit vector of [2 x double].
1411 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1412 /// converted values. The upper 64 bits are set to zero.
1413 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a) {
1414 return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
1417 /// Converts the low-order element of a [2 x double] vector into a 32-bit
1418 /// signed integer value, truncating the result when it is inexact.
1420 /// \headerfile <x86intrin.h>
1422 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
1423 /// instruction.
1425 /// \param __a
1426 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1427 /// conversion.
1428 /// \returns A 32-bit signed integer containing the converted value.
1429 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a) {
1430 return __builtin_ia32_cvttsd2si((__v2df)__a);
1433 /// Converts the two double-precision floating-point elements of a
1434 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1435 /// returned in a 64-bit vector of [2 x i32].
1437 /// \headerfile <x86intrin.h>
1439 /// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
1441 /// \param __a
1442 /// A 128-bit vector of [2 x double].
1443 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
1444 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a) {
1445 return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
1448 /// Converts the two double-precision floating-point elements of a
1449 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1450 /// returned in a 64-bit vector of [2 x i32].
1452 /// If the result of either conversion is inexact, the result is truncated
1453 /// (rounded towards zero) regardless of the current MXCSR setting.
1455 /// \headerfile <x86intrin.h>
1457 /// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
1459 /// \param __a
1460 /// A 128-bit vector of [2 x double].
1461 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
1462 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a) {
1463 return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
1466 /// Converts the two signed 32-bit integer elements of a 64-bit vector of
1467 /// [2 x i32] into two double-precision floating-point values, returned in a
1468 /// 128-bit vector of [2 x double].
1470 /// \headerfile <x86intrin.h>
1472 /// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
1474 /// \param __a
1475 /// A 64-bit vector of [2 x i32].
1476 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1477 static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a) {
1478 return __builtin_ia32_cvtpi2pd((__v2si)__a);
1481 /// Returns the low-order element of a 128-bit vector of [2 x double] as
1482 /// a double-precision floating-point value.
1484 /// \headerfile <x86intrin.h>
1486 /// This intrinsic has no corresponding instruction.
1488 /// \param __a
1489 /// A 128-bit vector of [2 x double]. The lower 64 bits are returned.
1490 /// \returns A double-precision floating-point value copied from the lower 64
1491 /// bits of \a __a.
1492 static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a) {
1493 return __a[0];
1496 /// Loads a 128-bit floating-point vector of [2 x double] from an aligned
1497 /// memory location.
1499 /// \headerfile <x86intrin.h>
1501 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
1503 /// \param __dp
1504 /// A pointer to a 128-bit memory location. The address of the memory
1505 /// location has to be 16-byte aligned.
1506 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
1507 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp) {
1508 return *(const __m128d *)__dp;
1511 /// Loads a double-precision floating-point value from a specified memory
1512 /// location and duplicates it to both vector elements of a 128-bit vector of
1513 /// [2 x double].
1515 /// \headerfile <x86intrin.h>
1517 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
1519 /// \param __dp
1520 /// A pointer to a memory location containing a double-precision value.
1521 /// \returns A 128-bit vector of [2 x double] containing the loaded and
1522 /// duplicated values.
1523 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp) {
1524 struct __mm_load1_pd_struct {
1525 double __u;
1526 } __attribute__((__packed__, __may_alias__));
1527 double __u = ((const struct __mm_load1_pd_struct *)__dp)->__u;
1528 return __extension__(__m128d){__u, __u};
1531 #define _mm_load_pd1(dp) _mm_load1_pd(dp)
1533 /// Loads two double-precision values, in reverse order, from an aligned
1534 /// memory location into a 128-bit vector of [2 x double].
1536 /// \headerfile <x86intrin.h>
1538 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
1539 /// needed shuffling instructions. In AVX mode, the shuffling may be combined
1540 /// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
1542 /// \param __dp
1543 /// A 16-byte aligned pointer to an array of double-precision values to be
1544 /// loaded in reverse order.
1545 /// \returns A 128-bit vector of [2 x double] containing the reversed loaded
1546 /// values.
1547 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp) {
1548 __m128d __u = *(const __m128d *)__dp;
1549 return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
1552 /// Loads a 128-bit floating-point vector of [2 x double] from an
1553 /// unaligned memory location.
1555 /// \headerfile <x86intrin.h>
1557 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1559 /// \param __dp
1560 /// A pointer to a 128-bit memory location. The address of the memory
1561 /// location does not have to be aligned.
1562 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
1563 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp) {
1564 struct __loadu_pd {
1565 __m128d_u __v;
1566 } __attribute__((__packed__, __may_alias__));
1567 return ((const struct __loadu_pd *)__dp)->__v;
1570 /// Loads a 64-bit integer value to the low element of a 128-bit integer
1571 /// vector and clears the upper element.
1573 /// \headerfile <x86intrin.h>
1575 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1577 /// \param __a
1578 /// A pointer to a 64-bit memory location. The address of the memory
1579 /// location does not have to be aligned.
1580 /// \returns A 128-bit vector of [2 x i64] containing the loaded value.
1581 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si64(void const *__a) {
1582 struct __loadu_si64 {
1583 long long __v;
1584 } __attribute__((__packed__, __may_alias__));
1585 long long __u = ((const struct __loadu_si64 *)__a)->__v;
1586 return __extension__(__m128i)(__v2di){__u, 0LL};
1589 /// Loads a 32-bit integer value to the low element of a 128-bit integer
1590 /// vector and clears the upper elements.
1592 /// \headerfile <x86intrin.h>
1594 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
1596 /// \param __a
1597 /// A pointer to a 32-bit memory location. The address of the memory
1598 /// location does not have to be aligned.
1599 /// \returns A 128-bit vector of [4 x i32] containing the loaded value.
1600 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si32(void const *__a) {
1601 struct __loadu_si32 {
1602 int __v;
1603 } __attribute__((__packed__, __may_alias__));
1604 int __u = ((const struct __loadu_si32 *)__a)->__v;
1605 return __extension__(__m128i)(__v4si){__u, 0, 0, 0};
1608 /// Loads a 16-bit integer value to the low element of a 128-bit integer
1609 /// vector and clears the upper elements.
1611 /// \headerfile <x86intrin.h>
1613 /// This intrinsic does not correspond to a specific instruction.
1615 /// \param __a
1616 /// A pointer to a 16-bit memory location. The address of the memory
1617 /// location does not have to be aligned.
1618 /// \returns A 128-bit vector of [8 x i16] containing the loaded value.
1619 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si16(void const *__a) {
1620 struct __loadu_si16 {
1621 short __v;
1622 } __attribute__((__packed__, __may_alias__));
1623 short __u = ((const struct __loadu_si16 *)__a)->__v;
1624 return __extension__(__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
1627 /// Loads a 64-bit double-precision value to the low element of a
1628 /// 128-bit vector of [2 x double] and clears the upper element.
1630 /// \headerfile <x86intrin.h>
1632 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1634 /// \param __dp
1635 /// A pointer to a memory location containing a double-precision value.
1636 /// The address of the memory location does not have to be aligned.
1637 /// \returns A 128-bit vector of [2 x double] containing the loaded value.
1638 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp) {
1639 struct __mm_load_sd_struct {
1640 double __u;
1641 } __attribute__((__packed__, __may_alias__));
1642 double __u = ((const struct __mm_load_sd_struct *)__dp)->__u;
1643 return __extension__(__m128d){__u, 0};
1646 /// Loads a double-precision value into the high-order bits of a 128-bit
1647 /// vector of [2 x double]. The low-order bits are copied from the low-order
1648 /// bits of the first operand.
1650 /// \headerfile <x86intrin.h>
1652 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1654 /// \param __a
1655 /// A 128-bit vector of [2 x double]. \n
1656 /// Bits [63:0] are written to bits [63:0] of the result.
1657 /// \param __dp
1658 /// A pointer to a 64-bit memory location containing a double-precision
1659 /// floating-point value that is loaded. The loaded value is written to bits
1660 /// [127:64] of the result. The address of the memory location does not have
1661 /// to be aligned.
1662 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1663 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a,
1664 double const *__dp) {
1665 struct __mm_loadh_pd_struct {
1666 double __u;
1667 } __attribute__((__packed__, __may_alias__));
1668 double __u = ((const struct __mm_loadh_pd_struct *)__dp)->__u;
1669 return __extension__(__m128d){__a[0], __u};
1672 /// Loads a double-precision value into the low-order bits of a 128-bit
1673 /// vector of [2 x double]. The high-order bits are copied from the
1674 /// high-order bits of the first operand.
1676 /// \headerfile <x86intrin.h>
1678 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1680 /// \param __a
1681 /// A 128-bit vector of [2 x double]. \n
1682 /// Bits [127:64] are written to bits [127:64] of the result.
1683 /// \param __dp
1684 /// A pointer to a 64-bit memory location containing a double-precision
1685 /// floating-point value that is loaded. The loaded value is written to bits
1686 /// [63:0] of the result. The address of the memory location does not have to
1687 /// be aligned.
1688 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1689 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a,
1690 double const *__dp) {
1691 struct __mm_loadl_pd_struct {
1692 double __u;
1693 } __attribute__((__packed__, __may_alias__));
1694 double __u = ((const struct __mm_loadl_pd_struct *)__dp)->__u;
1695 return __extension__(__m128d){__u, __a[1]};
1698 /// Constructs a 128-bit floating-point vector of [2 x double] with
1699 /// unspecified content. This could be used as an argument to another
1700 /// intrinsic function where the argument is required but the value is not
1701 /// actually used.
1703 /// \headerfile <x86intrin.h>
1705 /// This intrinsic has no corresponding instruction.
1707 /// \returns A 128-bit floating-point vector of [2 x double] with unspecified
1708 /// content.
1709 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void) {
1710 return (__m128d)__builtin_ia32_undef128();
1713 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1714 /// 64 bits of the vector are initialized with the specified double-precision
1715 /// floating-point value. The upper 64 bits are set to zero.
1717 /// \headerfile <x86intrin.h>
1719 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1721 /// \param __w
1722 /// A double-precision floating-point value used to initialize the lower 64
1723 /// bits of the result.
1724 /// \returns An initialized 128-bit floating-point vector of [2 x double]. The
1725 /// lower 64 bits contain the value of the parameter. The upper 64 bits are
1726 /// set to zero.
1727 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w) {
1728 return __extension__(__m128d){__w, 0};
1731 /// Constructs a 128-bit floating-point vector of [2 x double], with each
1732 /// of the two double-precision floating-point vector elements set to the
1733 /// specified double-precision floating-point value.
1735 /// \headerfile <x86intrin.h>
1737 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1739 /// \param __w
1740 /// A double-precision floating-point value used to initialize each vector
1741 /// element of the result.
1742 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1743 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w) {
1744 return __extension__(__m128d){__w, __w};
1747 /// Constructs a 128-bit floating-point vector of [2 x double], with each
1748 /// of the two double-precision floating-point vector elements set to the
1749 /// specified double-precision floating-point value.
1751 /// \headerfile <x86intrin.h>
1753 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1755 /// \param __w
1756 /// A double-precision floating-point value used to initialize each vector
1757 /// element of the result.
1758 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1759 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w) {
1760 return _mm_set1_pd(__w);
1763 /// Constructs a 128-bit floating-point vector of [2 x double]
1764 /// initialized with the specified double-precision floating-point values.
1766 /// \headerfile <x86intrin.h>
1768 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1770 /// \param __w
1771 /// A double-precision floating-point value used to initialize the upper 64
1772 /// bits of the result.
1773 /// \param __x
1774 /// A double-precision floating-point value used to initialize the lower 64
1775 /// bits of the result.
1776 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1777 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w,
1778 double __x) {
1779 return __extension__(__m128d){__x, __w};
1782 /// Constructs a 128-bit floating-point vector of [2 x double],
1783 /// initialized in reverse order with the specified double-precision
1784 /// floating-point values.
1786 /// \headerfile <x86intrin.h>
1788 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1790 /// \param __w
1791 /// A double-precision floating-point value used to initialize the lower 64
1792 /// bits of the result.
1793 /// \param __x
1794 /// A double-precision floating-point value used to initialize the upper 64
1795 /// bits of the result.
1796 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1797 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w,
1798 double __x) {
1799 return __extension__(__m128d){__w, __x};
1802 /// Constructs a 128-bit floating-point vector of [2 x double]
1803 /// initialized to zero.
1805 /// \headerfile <x86intrin.h>
1807 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1809 /// \returns An initialized 128-bit floating-point vector of [2 x double] with
1810 /// all elements set to zero.
1811 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void) {
1812 return __extension__(__m128d){0, 0};
1815 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1816 /// 64 bits are set to the lower 64 bits of the second parameter. The upper
1817 /// 64 bits are set to the upper 64 bits of the first parameter.
1819 /// \headerfile <x86intrin.h>
1821 /// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
1823 /// \param __a
1824 /// A 128-bit vector of [2 x double]. The upper 64 bits are written to the
1825 /// upper 64 bits of the result.
1826 /// \param __b
1827 /// A 128-bit vector of [2 x double]. The lower 64 bits are written to the
1828 /// lower 64 bits of the result.
1829 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1830 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a,
1831 __m128d __b) {
1832 __a[0] = __b[0];
1833 return __a;
1836 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1837 /// memory location.
1839 /// \headerfile <x86intrin.h>
1841 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1843 /// \param __dp
1844 /// A pointer to a 64-bit memory location.
1845 /// \param __a
1846 /// A 128-bit vector of [2 x double] containing the value to be stored.
1847 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp,
1848 __m128d __a) {
1849 struct __mm_store_sd_struct {
1850 double __u;
1851 } __attribute__((__packed__, __may_alias__));
1852 ((struct __mm_store_sd_struct *)__dp)->__u = __a[0];
1855 /// Moves packed double-precision values from a 128-bit vector of
1856 /// [2 x double] to a memory location.
1858 /// \headerfile <x86intrin.h>
1860 /// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction.
1862 /// \param __dp
1863 /// A pointer to an aligned memory location that can store two
1864 /// double-precision values.
1865 /// \param __a
1866 /// A packed 128-bit vector of [2 x double] containing the values to be
1867 /// moved.
1868 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp,
1869 __m128d __a) {
1870 *(__m128d *)__dp = __a;
1873 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1874 /// the upper and lower 64 bits of a memory location.
1876 /// \headerfile <x86intrin.h>
1878 /// This intrinsic corresponds to the
1879 /// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1881 /// \param __dp
1882 /// A pointer to a 16-byte aligned memory location that can store two
1883 /// double-precision values.
1884 /// \param __a
1885 /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1886 /// of the values in \a __dp.
1887 static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp,
1888 __m128d __a) {
1889 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
1890 _mm_store_pd(__dp, __a);
1893 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1894 /// the upper and lower 64 bits of a memory location.
1896 /// \headerfile <x86intrin.h>
1898 /// This intrinsic corresponds to the
1899 /// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1901 /// \param __dp
1902 /// A pointer to a 16-byte aligned memory location that can store two
1903 /// double-precision values.
1904 /// \param __a
1905 /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1906 /// of the values in \a __dp.
1907 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp,
1908 __m128d __a) {
1909 _mm_store1_pd(__dp, __a);
1912 /// Stores a 128-bit vector of [2 x double] into an unaligned memory
1913 /// location.
1915 /// \headerfile <x86intrin.h>
1917 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1919 /// \param __dp
1920 /// A pointer to a 128-bit memory location. The address of the memory
1921 /// location does not have to be aligned.
1922 /// \param __a
1923 /// A 128-bit vector of [2 x double] containing the values to be stored.
1924 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp,
1925 __m128d __a) {
1926 struct __storeu_pd {
1927 __m128d_u __v;
1928 } __attribute__((__packed__, __may_alias__));
1929 ((struct __storeu_pd *)__dp)->__v = __a;
1932 /// Stores two double-precision values, in reverse order, from a 128-bit
1933 /// vector of [2 x double] to a 16-byte aligned memory location.
1935 /// \headerfile <x86intrin.h>
1937 /// This intrinsic corresponds to a shuffling instruction followed by a
1938 /// <c> VMOVAPD / MOVAPD </c> instruction.
1940 /// \param __dp
1941 /// A pointer to a 16-byte aligned memory location that can store two
1942 /// double-precision values.
1943 /// \param __a
1944 /// A 128-bit vector of [2 x double] containing the values to be reversed and
1945 /// stored.
1946 static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp,
1947 __m128d __a) {
1948 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
1949 *(__m128d *)__dp = __a;
1952 /// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
1953 /// memory location.
1955 /// \headerfile <x86intrin.h>
1957 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1959 /// \param __dp
1960 /// A pointer to a 64-bit memory location.
1961 /// \param __a
1962 /// A 128-bit vector of [2 x double] containing the value to be stored.
1963 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp,
1964 __m128d __a) {
1965 struct __mm_storeh_pd_struct {
1966 double __u;
1967 } __attribute__((__packed__, __may_alias__));
1968 ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[1];
1971 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1972 /// memory location.
1974 /// \headerfile <x86intrin.h>
1976 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1978 /// \param __dp
1979 /// A pointer to a 64-bit memory location; it does not have to be aligned.
1980 /// \param __a
1981 /// A 128-bit vector of [2 x double] containing the value to be stored.
1982 static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp,
1983 __m128d __a) {
1984 struct __mm_storel_pd_struct {
1985 double __u;
1986 } __attribute__((__packed__, __may_alias__));
1987 ((struct __mm_storel_pd_struct *)__dp)->__u = __a[0];
1990 /// Adds the corresponding elements of two 128-bit vectors of [16 x i8],
1991 /// saving the lower 8 bits of each sum in the corresponding element of a
1992 /// 128-bit result vector of [16 x i8].
1994 /// The integer elements of both parameters can be either signed or unsigned.
1996 /// \headerfile <x86intrin.h>
1998 /// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
2000 /// \param __a
2001 /// A 128-bit vector of [16 x i8].
2002 /// \param __b
2003 /// A 128-bit vector of [16 x i8].
2004 /// \returns A 128-bit vector of [16 x i8] containing the sums of both
2005 /// parameters.
2006 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a,
2007 __m128i __b) {
2008 return (__m128i)((__v16qu)__a + (__v16qu)__b);
2011 /// Adds the corresponding elements of two 128-bit vectors of [8 x i16],
2012 /// saving the lower 16 bits of each sum in the corresponding element of a
2013 /// 128-bit result vector of [8 x i16].
2015 /// The integer elements of both parameters can be either signed or unsigned.
2017 /// \headerfile <x86intrin.h>
2019 /// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
2021 /// \param __a
2022 /// A 128-bit vector of [8 x i16].
2023 /// \param __b
2024 /// A 128-bit vector of [8 x i16].
2025 /// \returns A 128-bit vector of [8 x i16] containing the sums of both
2026 /// parameters.
2027 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a,
2028 __m128i __b) {
2029 return (__m128i)((__v8hu)__a + (__v8hu)__b);
2032 /// Adds the corresponding elements of two 128-bit vectors of [4 x i32],
2033 /// saving the lower 32 bits of each sum in the corresponding element of a
2034 /// 128-bit result vector of [4 x i32].
2036 /// The integer elements of both parameters can be either signed or unsigned.
2038 /// \headerfile <x86intrin.h>
2040 /// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
2042 /// \param __a
2043 /// A 128-bit vector of [4 x i32].
2044 /// \param __b
2045 /// A 128-bit vector of [4 x i32].
2046 /// \returns A 128-bit vector of [4 x i32] containing the sums of both
2047 /// parameters.
2048 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a,
2049 __m128i __b) {
2050 return (__m128i)((__v4su)__a + (__v4su)__b);
2053 /// Adds two signed or unsigned 64-bit integer values, returning the
2054 /// lower 64 bits of the sum.
2056 /// \headerfile <x86intrin.h>
2058 /// This intrinsic corresponds to the <c> PADDQ </c> instruction.
2060 /// \param __a
2061 /// A 64-bit integer.
2062 /// \param __b
2063 /// A 64-bit integer.
2064 /// \returns A 64-bit integer containing the sum of both parameters.
2065 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a,
2066 __m64 __b) {
2067 return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
2070 /// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
2071 /// saving the lower 64 bits of each sum in the corresponding element of a
2072 /// 128-bit result vector of [2 x i64].
2074 /// The integer elements of both parameters can be either signed or unsigned.
2076 /// \headerfile <x86intrin.h>
2078 /// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
2080 /// \param __a
2081 /// A 128-bit vector of [2 x i64].
2082 /// \param __b
2083 /// A 128-bit vector of [2 x i64].
2084 /// \returns A 128-bit vector of [2 x i64] containing the sums of both
2085 /// parameters.
2086 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a,
2087 __m128i __b) {
2088 return (__m128i)((__v2du)__a + (__v2du)__b);
2091 /// Adds, with saturation, the corresponding elements of two 128-bit
2092 /// signed [16 x i8] vectors, saving each sum in the corresponding element of
2093 /// a 128-bit result vector of [16 x i8]. Positive sums greater than 0x7F are
2094 /// saturated to 0x7F. Negative sums less than 0x80 are saturated to 0x80.
2096 /// \headerfile <x86intrin.h>
2098 /// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
2100 /// \param __a
2101 /// A 128-bit signed [16 x i8] vector.
2102 /// \param __b
2103 /// A 128-bit signed [16 x i8] vector.
2104 /// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
2105 /// both parameters.
2106 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a,
2107 __m128i __b) {
2108 return (__m128i)__builtin_elementwise_add_sat((__v16qs)__a, (__v16qs)__b);
2111 /// Adds, with saturation, the corresponding elements of two 128-bit
2112 /// signed [8 x i16] vectors, saving each sum in the corresponding element of
2113 /// a 128-bit result vector of [8 x i16]. Positive sums greater than 0x7FFF
2114 /// are saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
2115 /// 0x8000.
2117 /// \headerfile <x86intrin.h>
2119 /// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
2121 /// \param __a
2122 /// A 128-bit signed [8 x i16] vector.
2123 /// \param __b
2124 /// A 128-bit signed [8 x i16] vector.
2125 /// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
2126 /// both parameters.
2127 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a,
2128 __m128i __b) {
2129 return (__m128i)__builtin_elementwise_add_sat((__v8hi)__a, (__v8hi)__b);
2132 /// Adds, with saturation, the corresponding elements of two 128-bit
2133 /// unsigned [16 x i8] vectors, saving each sum in the corresponding element
2134 /// of a 128-bit result vector of [16 x i8]. Sums greater than 0xFF are
2135 /// saturated to 0xFF. (Unsigned sums are never negative.)
2137 /// \headerfile <x86intrin.h>
2139 /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2141 /// \param __a
2142 /// A 128-bit unsigned [16 x i8] vector.
2143 /// \param __b
2144 /// A 128-bit unsigned [16 x i8] vector.
2145 /// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
2146 /// of both parameters.
2147 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a,
2148 __m128i __b) {
2149 return (__m128i)__builtin_elementwise_add_sat((__v16qu)__a, (__v16qu)__b);
2152 /// Adds, with saturation, the corresponding elements of two 128-bit
2153 /// unsigned [8 x i16] vectors, saving each sum in the corresponding element
2154 /// of a 128-bit result vector of [8 x i16]. Sums greater than 0xFFFF are
2155 /// saturated to 0xFFFF. (Unsigned sums are never negative.)
2157 /// \headerfile <x86intrin.h>
2159 /// This intrinsic corresponds to the <c> VPADDUSW / PADDUSW </c> instruction.
2161 /// \param __a
2162 /// A 128-bit unsigned [8 x i16] vector.
2163 /// \param __b
2164 /// A 128-bit unsigned [8 x i16] vector.
2165 /// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
2166 /// of both parameters.
2167 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a,
2168 __m128i __b) {
2169 return (__m128i)__builtin_elementwise_add_sat((__v8hu)__a, (__v8hu)__b);
2172 /// Computes the rounded averages of corresponding elements of two
2173 /// 128-bit unsigned [16 x i8] vectors, saving each result in the
2174 /// corresponding element of a 128-bit result vector of [16 x i8].
2176 /// \headerfile <x86intrin.h>
2178 /// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
2180 /// \param __a
2181 /// A 128-bit unsigned [16 x i8] vector.
2182 /// \param __b
2183 /// A 128-bit unsigned [16 x i8] vector.
2184 /// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
2185 /// averages of both parameters.
2186 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a,
2187 __m128i __b) {
2188 return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
2191 /// Computes the rounded averages of corresponding elements of two
2192 /// 128-bit unsigned [8 x i16] vectors, saving each result in the
2193 /// corresponding element of a 128-bit result vector of [8 x i16].
2195 /// \headerfile <x86intrin.h>
2197 /// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
2199 /// \param __a
2200 /// A 128-bit unsigned [8 x i16] vector.
2201 /// \param __b
2202 /// A 128-bit unsigned [8 x i16] vector.
2203 /// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
2204 /// averages of both parameters.
2205 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a,
2206 __m128i __b) {
2207 return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
2210 /// Multiplies the corresponding elements of two 128-bit signed [8 x i16]
2211 /// vectors, producing eight intermediate 32-bit signed integer products, and
2212 /// adds the consecutive pairs of 32-bit products to form a 128-bit signed
2213 /// [4 x i32] vector.
2215 /// For example, bits [15:0] of both parameters are multiplied producing a
2216 /// 32-bit product, bits [31:16] of both parameters are multiplied producing
2217 /// a 32-bit product, and the sum of those two products becomes bits [31:0]
2218 /// of the result.
2220 /// \headerfile <x86intrin.h>
2222 /// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
2224 /// \param __a
2225 /// A 128-bit signed [8 x i16] vector.
2226 /// \param __b
2227 /// A 128-bit signed [8 x i16] vector.
2228 /// \returns A 128-bit signed [4 x i32] vector containing the sums of products
2229 /// of both parameters.
2230 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a,
2231 __m128i __b) {
2232 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
2235 /// Compares corresponding elements of two 128-bit signed [8 x i16]
2236 /// vectors, saving the greater value from each comparison in the
2237 /// corresponding element of a 128-bit result vector of [8 x i16].
2239 /// \headerfile <x86intrin.h>
2241 /// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
2243 /// \param __a
2244 /// A 128-bit signed [8 x i16] vector.
2245 /// \param __b
2246 /// A 128-bit signed [8 x i16] vector.
2247 /// \returns A 128-bit signed [8 x i16] vector containing the greater value of
2248 /// each comparison.
2249 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a,
2250 __m128i __b) {
2251 return (__m128i)__builtin_elementwise_max((__v8hi)__a, (__v8hi)__b);
2254 /// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2255 /// vectors, saving the greater value from each comparison in the
2256 /// corresponding element of a 128-bit result vector of [16 x i8].
2258 /// \headerfile <x86intrin.h>
2260 /// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
2262 /// \param __a
2263 /// A 128-bit unsigned [16 x i8] vector.
2264 /// \param __b
2265 /// A 128-bit unsigned [16 x i8] vector.
2266 /// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
2267 /// each comparison.
2268 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a,
2269 __m128i __b) {
2270 return (__m128i)__builtin_elementwise_max((__v16qu)__a, (__v16qu)__b);
2273 /// Compares corresponding elements of two 128-bit signed [8 x i16]
2274 /// vectors, saving the smaller value from each comparison in the
2275 /// corresponding element of a 128-bit result vector of [8 x i16].
2277 /// \headerfile <x86intrin.h>
2279 /// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
2281 /// \param __a
2282 /// A 128-bit signed [8 x i16] vector.
2283 /// \param __b
2284 /// A 128-bit signed [8 x i16] vector.
2285 /// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
2286 /// each comparison.
2287 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a,
2288 __m128i __b) {
2289 return (__m128i)__builtin_elementwise_min((__v8hi)__a, (__v8hi)__b);
2292 /// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2293 /// vectors, saving the smaller value from each comparison in the
2294 /// corresponding element of a 128-bit result vector of [16 x i8].
2296 /// \headerfile <x86intrin.h>
2298 /// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
2300 /// \param __a
2301 /// A 128-bit unsigned [16 x i8] vector.
2302 /// \param __b
2303 /// A 128-bit unsigned [16 x i8] vector.
2304 /// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
2305 /// each comparison.
2306 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a,
2307 __m128i __b) {
2308 return (__m128i)__builtin_elementwise_min((__v16qu)__a, (__v16qu)__b);
2311 /// Multiplies the corresponding elements of two signed [8 x i16]
2312 /// vectors, saving the upper 16 bits of each 32-bit product in the
2313 /// corresponding element of a 128-bit signed [8 x i16] result vector.
2315 /// \headerfile <x86intrin.h>
2317 /// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
2319 /// \param __a
2320 /// A 128-bit signed [8 x i16] vector.
2321 /// \param __b
2322 /// A 128-bit signed [8 x i16] vector.
2323 /// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
2324 /// each of the eight 32-bit products.
2325 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a,
2326 __m128i __b) {
2327 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
2330 /// Multiplies the corresponding elements of two unsigned [8 x i16]
2331 /// vectors, saving the upper 16 bits of each 32-bit product in the
2332 /// corresponding element of a 128-bit unsigned [8 x i16] result vector.
2334 /// \headerfile <x86intrin.h>
2336 /// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
2338 /// \param __a
2339 /// A 128-bit unsigned [8 x i16] vector.
2340 /// \param __b
2341 /// A 128-bit unsigned [8 x i16] vector.
2342 /// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
2343 /// of each of the eight 32-bit products.
2344 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a,
2345 __m128i __b) {
2346 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
2349 /// Multiplies the corresponding elements of two signed [8 x i16]
2350 /// vectors, saving the lower 16 bits of each 32-bit product in the
2351 /// corresponding element of a 128-bit signed [8 x i16] result vector.
2353 /// \headerfile <x86intrin.h>
2355 /// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
2357 /// \param __a
2358 /// A 128-bit signed [8 x i16] vector.
2359 /// \param __b
2360 /// A 128-bit signed [8 x i16] vector.
2361 /// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
2362 /// each of the eight 32-bit products.
2363 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a,
2364 __m128i __b) {
2365 return (__m128i)((__v8hu)__a * (__v8hu)__b);
2368 /// Multiplies 32-bit unsigned integer values contained in the lower bits
2369 /// of the two 64-bit integer vectors and returns the 64-bit unsigned
2370 /// product.
2372 /// \headerfile <x86intrin.h>
2374 /// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
2376 /// \param __a
2377 /// A 64-bit integer containing one of the source operands.
2378 /// \param __b
2379 /// A 64-bit integer containing one of the source operands.
2380 /// \returns A 64-bit integer vector containing the product of both operands.
2381 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a,
2382 __m64 __b) {
2383 return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
2386 /// Multiplies 32-bit unsigned integer values contained in the lower
2387 /// bits of the corresponding elements of two [2 x i64] vectors, and returns
2388 /// the 64-bit products in the corresponding elements of a [2 x i64] vector.
2390 /// \headerfile <x86intrin.h>
2392 /// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
2394 /// \param __a
2395 /// A [2 x i64] vector containing one of the source operands.
2396 /// \param __b
2397 /// A [2 x i64] vector containing one of the source operands.
2398 /// \returns A [2 x i64] vector containing the product of both operands.
2399 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a,
2400 __m128i __b) {
2401 return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
2404 /// Computes the absolute differences of corresponding 8-bit integer
2405 /// values in two 128-bit vectors. Sums the first 8 absolute differences, and
2406 /// separately sums the second 8 absolute differences. Packs these two
2407 /// unsigned 16-bit integer sums into the upper and lower elements of a
2408 /// [2 x i64] vector.
2410 /// \headerfile <x86intrin.h>
2412 /// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
2414 /// \param __a
2415 /// A 128-bit integer vector containing one of the source operands.
2416 /// \param __b
2417 /// A 128-bit integer vector containing one of the source operands.
2418 /// \returns A [2 x i64] vector containing the sums of the sets of absolute
2419 /// differences between both operands.
2420 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a,
2421 __m128i __b) {
2422 return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
2425 /// Subtracts the corresponding 8-bit integer values in the operands.
2427 /// \headerfile <x86intrin.h>
2429 /// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
2431 /// \param __a
2432 /// A 128-bit integer vector containing the minuends.
2433 /// \param __b
2434 /// A 128-bit integer vector containing the subtrahends.
2435 /// \returns A 128-bit integer vector containing the differences of the values
2436 /// in the operands.
2437 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a,
2438 __m128i __b) {
2439 return (__m128i)((__v16qu)__a - (__v16qu)__b);
2442 /// Subtracts the corresponding 16-bit integer values in the operands.
2444 /// \headerfile <x86intrin.h>
2446 /// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
2448 /// \param __a
2449 /// A 128-bit integer vector containing the minuends.
2450 /// \param __b
2451 /// A 128-bit integer vector containing the subtrahends.
2452 /// \returns A 128-bit integer vector containing the differences of the values
2453 /// in the operands.
2454 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a,
2455 __m128i __b) {
2456 return (__m128i)((__v8hu)__a - (__v8hu)__b);
2459 /// Subtracts the corresponding 32-bit integer values in the operands.
2461 /// \headerfile <x86intrin.h>
2463 /// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
2465 /// \param __a
2466 /// A 128-bit integer vector containing the minuends.
2467 /// \param __b
2468 /// A 128-bit integer vector containing the subtrahends.
2469 /// \returns A 128-bit integer vector containing the differences of the values
2470 /// in the operands.
2471 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a,
2472 __m128i __b) {
2473 return (__m128i)((__v4su)__a - (__v4su)__b);
2476 /// Subtracts signed or unsigned 64-bit integer values and writes the
2477 /// difference to the corresponding bits in the destination.
2479 /// \headerfile <x86intrin.h>
2481 /// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
2483 /// \param __a
2484 /// A 64-bit integer vector containing the minuend.
2485 /// \param __b
2486 /// A 64-bit integer vector containing the subtrahend.
2487 /// \returns A 64-bit integer vector containing the difference of the values in
2488 /// the operands.
2489 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a,
2490 __m64 __b) {
2491 return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
2494 /// Subtracts the corresponding elements of two [2 x i64] vectors.
2496 /// \headerfile <x86intrin.h>
2498 /// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
2500 /// \param __a
2501 /// A 128-bit integer vector containing the minuends.
2502 /// \param __b
2503 /// A 128-bit integer vector containing the subtrahends.
2504 /// \returns A 128-bit integer vector containing the differences of the values
2505 /// in the operands.
2506 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a,
2507 __m128i __b) {
2508 return (__m128i)((__v2du)__a - (__v2du)__b);
2511 /// Subtracts corresponding 8-bit signed integer values in the input and
2512 /// returns the differences in the corresponding bytes in the destination.
2513 /// Differences greater than 0x7F are saturated to 0x7F, and differences less
2514 /// than 0x80 are saturated to 0x80.
2516 /// \headerfile <x86intrin.h>
2518 /// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
2520 /// \param __a
2521 /// A 128-bit integer vector containing the minuends.
2522 /// \param __b
2523 /// A 128-bit integer vector containing the subtrahends.
2524 /// \returns A 128-bit integer vector containing the differences of the values
2525 /// in the operands.
2526 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a,
2527 __m128i __b) {
2528 return (__m128i)__builtin_elementwise_sub_sat((__v16qs)__a, (__v16qs)__b);
2531 /// Subtracts corresponding 16-bit signed integer values in the input and
2532 /// returns the differences in the corresponding bytes in the destination.
2533 /// Differences greater than 0x7FFF are saturated to 0x7FFF, and values less
2534 /// than 0x8000 are saturated to 0x8000.
2536 /// \headerfile <x86intrin.h>
2538 /// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
2540 /// \param __a
2541 /// A 128-bit integer vector containing the minuends.
2542 /// \param __b
2543 /// A 128-bit integer vector containing the subtrahends.
2544 /// \returns A 128-bit integer vector containing the differences of the values
2545 /// in the operands.
2546 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a,
2547 __m128i __b) {
2548 return (__m128i)__builtin_elementwise_sub_sat((__v8hi)__a, (__v8hi)__b);
2551 /// Subtracts corresponding 8-bit unsigned integer values in the input
2552 /// and returns the differences in the corresponding bytes in the
2553 /// destination. Differences less than 0x00 are saturated to 0x00.
2555 /// \headerfile <x86intrin.h>
2557 /// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
2559 /// \param __a
2560 /// A 128-bit integer vector containing the minuends.
2561 /// \param __b
2562 /// A 128-bit integer vector containing the subtrahends.
2563 /// \returns A 128-bit integer vector containing the unsigned integer
2564 /// differences of the values in the operands.
2565 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a,
2566 __m128i __b) {
2567 return (__m128i)__builtin_elementwise_sub_sat((__v16qu)__a, (__v16qu)__b);
2570 /// Subtracts corresponding 16-bit unsigned integer values in the input
2571 /// and returns the differences in the corresponding bytes in the
2572 /// destination. Differences less than 0x0000 are saturated to 0x0000.
2574 /// \headerfile <x86intrin.h>
2576 /// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
2578 /// \param __a
2579 /// A 128-bit integer vector containing the minuends.
2580 /// \param __b
2581 /// A 128-bit integer vector containing the subtrahends.
2582 /// \returns A 128-bit integer vector containing the unsigned integer
2583 /// differences of the values in the operands.
2584 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a,
2585 __m128i __b) {
2586 return (__m128i)__builtin_elementwise_sub_sat((__v8hu)__a, (__v8hu)__b);
2589 /// Performs a bitwise AND of two 128-bit integer vectors.
2591 /// \headerfile <x86intrin.h>
2593 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
2595 /// \param __a
2596 /// A 128-bit integer vector containing one of the source operands.
2597 /// \param __b
2598 /// A 128-bit integer vector containing one of the source operands.
2599 /// \returns A 128-bit integer vector containing the bitwise AND of the values
2600 /// in both operands.
2601 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a,
2602 __m128i __b) {
2603 return (__m128i)((__v2du)__a & (__v2du)__b);
2606 /// Performs a bitwise AND of two 128-bit integer vectors, using the
2607 /// one's complement of the values contained in the first source operand.
2609 /// \headerfile <x86intrin.h>
2611 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
2613 /// \param __a
2614 /// A 128-bit vector containing the left source operand. The one's complement
2615 /// of this value is used in the bitwise AND.
2616 /// \param __b
2617 /// A 128-bit vector containing the right source operand.
2618 /// \returns A 128-bit integer vector containing the bitwise AND of the one's
2619 /// complement of the first operand and the values in the second operand.
2620 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a,
2621 __m128i __b) {
2622 return (__m128i)(~(__v2du)__a & (__v2du)__b);
2624 /// Performs a bitwise OR of two 128-bit integer vectors.
2626 /// \headerfile <x86intrin.h>
2628 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
2630 /// \param __a
2631 /// A 128-bit integer vector containing one of the source operands.
2632 /// \param __b
2633 /// A 128-bit integer vector containing one of the source operands.
2634 /// \returns A 128-bit integer vector containing the bitwise OR of the values
2635 /// in both operands.
2636 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a,
2637 __m128i __b) {
2638 return (__m128i)((__v2du)__a | (__v2du)__b);
2641 /// Performs a bitwise exclusive OR of two 128-bit integer vectors.
2643 /// \headerfile <x86intrin.h>
2645 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
2647 /// \param __a
2648 /// A 128-bit integer vector containing one of the source operands.
2649 /// \param __b
2650 /// A 128-bit integer vector containing one of the source operands.
2651 /// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
2652 /// values in both operands.
2653 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a,
2654 __m128i __b) {
2655 return (__m128i)((__v2du)__a ^ (__v2du)__b);
/// Shifts the whole 128-bit integer vector operand left by the given number
///    of bytes. Low-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_slli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to left-shift operand
///    \a a.
/// \returns A 128-bit integer vector containing the left-shifted value.
#define _mm_slli_si128(a, imm)                                                 \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift(                               \
      (__v2di)(__m128i)(a), (int)(imm)))
/// Byte-wise left shift of a 128-bit integer vector. This macro expands to
///    the same builtin call, with the same argument casts, as
///    \c _mm_slli_si128.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    The shift amount passed to the builtin, cast to \c int.
#define _mm_bslli_si128(a, imm)                                                \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift(                               \
      (__v2di)(__m128i)(a), (int)(imm)))
2683 /// Left-shifts each 16-bit value in the 128-bit integer vector operand
2684 /// by the specified number of bits. Low-order bits are cleared.
2686 /// \headerfile <x86intrin.h>
2688 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2690 /// \param __a
2691 /// A 128-bit integer vector containing the source operand.
2692 /// \param __count
2693 /// An integer value specifying the number of bits to left-shift each value
2694 /// in operand \a __a.
2695 /// \returns A 128-bit integer vector containing the left-shifted values.
2696 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a,
2697 int __count) {
2698 return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
2701 /// Left-shifts each 16-bit value in the 128-bit integer vector operand
2702 /// by the specified number of bits. Low-order bits are cleared.
2704 /// \headerfile <x86intrin.h>
2706 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2708 /// \param __a
2709 /// A 128-bit integer vector containing the source operand.
2710 /// \param __count
2711 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2712 /// to left-shift each value in operand \a __a.
2713 /// \returns A 128-bit integer vector containing the left-shifted values.
2714 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a,
2715 __m128i __count) {
2716 return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
2719 /// Left-shifts each 32-bit value in the 128-bit integer vector operand
2720 /// by the specified number of bits. Low-order bits are cleared.
2722 /// \headerfile <x86intrin.h>
2724 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2726 /// \param __a
2727 /// A 128-bit integer vector containing the source operand.
2728 /// \param __count
2729 /// An integer value specifying the number of bits to left-shift each value
2730 /// in operand \a __a.
2731 /// \returns A 128-bit integer vector containing the left-shifted values.
2732 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a,
2733 int __count) {
2734 return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
2737 /// Left-shifts each 32-bit value in the 128-bit integer vector operand
2738 /// by the specified number of bits. Low-order bits are cleared.
2740 /// \headerfile <x86intrin.h>
2742 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2744 /// \param __a
2745 /// A 128-bit integer vector containing the source operand.
2746 /// \param __count
2747 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2748 /// to left-shift each value in operand \a __a.
2749 /// \returns A 128-bit integer vector containing the left-shifted values.
2750 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a,
2751 __m128i __count) {
2752 return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
2755 /// Left-shifts each 64-bit value in the 128-bit integer vector operand
2756 /// by the specified number of bits. Low-order bits are cleared.
2758 /// \headerfile <x86intrin.h>
2760 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2762 /// \param __a
2763 /// A 128-bit integer vector containing the source operand.
2764 /// \param __count
2765 /// An integer value specifying the number of bits to left-shift each value
2766 /// in operand \a __a.
2767 /// \returns A 128-bit integer vector containing the left-shifted values.
2768 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a,
2769 int __count) {
2770 return __builtin_ia32_psllqi128((__v2di)__a, __count);
2773 /// Left-shifts each 64-bit value in the 128-bit integer vector operand
2774 /// by the specified number of bits. Low-order bits are cleared.
2776 /// \headerfile <x86intrin.h>
2778 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2780 /// \param __a
2781 /// A 128-bit integer vector containing the source operand.
2782 /// \param __count
2783 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2784 /// to left-shift each value in operand \a __a.
2785 /// \returns A 128-bit integer vector containing the left-shifted values.
2786 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a,
2787 __m128i __count) {
2788 return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
2791 /// Right-shifts each 16-bit value in the 128-bit integer vector operand
2792 /// by the specified number of bits. High-order bits are filled with the sign
2793 /// bit of the initial value.
2795 /// \headerfile <x86intrin.h>
2797 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2799 /// \param __a
2800 /// A 128-bit integer vector containing the source operand.
2801 /// \param __count
2802 /// An integer value specifying the number of bits to right-shift each value
2803 /// in operand \a __a.
2804 /// \returns A 128-bit integer vector containing the right-shifted values.
2805 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a,
2806 int __count) {
2807 return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
2810 /// Right-shifts each 16-bit value in the 128-bit integer vector operand
2811 /// by the specified number of bits. High-order bits are filled with the sign
2812 /// bit of the initial value.
2814 /// \headerfile <x86intrin.h>
2816 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2818 /// \param __a
2819 /// A 128-bit integer vector containing the source operand.
2820 /// \param __count
2821 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2822 /// to right-shift each value in operand \a __a.
2823 /// \returns A 128-bit integer vector containing the right-shifted values.
2824 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a,
2825 __m128i __count) {
2826 return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
2829 /// Right-shifts each 32-bit value in the 128-bit integer vector operand
2830 /// by the specified number of bits. High-order bits are filled with the sign
2831 /// bit of the initial value.
2833 /// \headerfile <x86intrin.h>
2835 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2837 /// \param __a
2838 /// A 128-bit integer vector containing the source operand.
2839 /// \param __count
2840 /// An integer value specifying the number of bits to right-shift each value
2841 /// in operand \a __a.
2842 /// \returns A 128-bit integer vector containing the right-shifted values.
2843 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a,
2844 int __count) {
2845 return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
2848 /// Right-shifts each 32-bit value in the 128-bit integer vector operand
2849 /// by the specified number of bits. High-order bits are filled with the sign
2850 /// bit of the initial value.
2852 /// \headerfile <x86intrin.h>
2854 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2856 /// \param __a
2857 /// A 128-bit integer vector containing the source operand.
2858 /// \param __count
2859 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2860 /// to right-shift each value in operand \a __a.
2861 /// \returns A 128-bit integer vector containing the right-shifted values.
2862 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a,
2863 __m128i __count) {
2864 return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
/// Shifts the whole 128-bit integer vector operand right by the given number
///    of bytes. High-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_srli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to right-shift
///    operand \a a.
/// \returns A 128-bit integer vector containing the right-shifted value.
#define _mm_srli_si128(a, imm)                                                 \
  ((__m128i)__builtin_ia32_psrldqi128_byteshift(                               \
      (__v2di)(__m128i)(a), (int)(imm)))
/// Byte-wise right shift of a 128-bit integer vector. This macro expands to
///    the same builtin call, with the same argument casts, as
///    \c _mm_srli_si128.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    The shift amount passed to the builtin, cast to \c int.
#define _mm_bsrli_si128(a, imm)                                                \
  ((__m128i)__builtin_ia32_psrldqi128_byteshift(                               \
      (__v2di)(__m128i)(a), (int)(imm)))
2892 /// Right-shifts each of 16-bit values in the 128-bit integer vector
2893 /// operand by the specified number of bits. High-order bits are cleared.
2895 /// \headerfile <x86intrin.h>
2897 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2899 /// \param __a
2900 /// A 128-bit integer vector containing the source operand.
2901 /// \param __count
2902 /// An integer value specifying the number of bits to right-shift each value
2903 /// in operand \a __a.
2904 /// \returns A 128-bit integer vector containing the right-shifted values.
2905 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a,
2906 int __count) {
2907 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
2910 /// Right-shifts each of 16-bit values in the 128-bit integer vector
2911 /// operand by the specified number of bits. High-order bits are cleared.
2913 /// \headerfile <x86intrin.h>
2915 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2917 /// \param __a
2918 /// A 128-bit integer vector containing the source operand.
2919 /// \param __count
2920 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2921 /// to right-shift each value in operand \a __a.
2922 /// \returns A 128-bit integer vector containing the right-shifted values.
2923 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a,
2924 __m128i __count) {
2925 return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
2928 /// Right-shifts each of 32-bit values in the 128-bit integer vector
2929 /// operand by the specified number of bits. High-order bits are cleared.
2931 /// \headerfile <x86intrin.h>
2933 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
2935 /// \param __a
2936 /// A 128-bit integer vector containing the source operand.
2937 /// \param __count
2938 /// An integer value specifying the number of bits to right-shift each value
2939 /// in operand \a __a.
2940 /// \returns A 128-bit integer vector containing the right-shifted values.
2941 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a,
2942 int __count) {
2943 return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
2946 /// Right-shifts each of 32-bit values in the 128-bit integer vector
2947 /// operand by the specified number of bits. High-order bits are cleared.
2949 /// \headerfile <x86intrin.h>
2951 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
2953 /// \param __a
2954 /// A 128-bit integer vector containing the source operand.
2955 /// \param __count
2956 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2957 /// to right-shift each value in operand \a __a.
2958 /// \returns A 128-bit integer vector containing the right-shifted values.
2959 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a,
2960 __m128i __count) {
2961 return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
2964 /// Right-shifts each of 64-bit values in the 128-bit integer vector
2965 /// operand by the specified number of bits. High-order bits are cleared.
2967 /// \headerfile <x86intrin.h>
2969 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
2971 /// \param __a
2972 /// A 128-bit integer vector containing the source operand.
2973 /// \param __count
2974 /// An integer value specifying the number of bits to right-shift each value
2975 /// in operand \a __a.
2976 /// \returns A 128-bit integer vector containing the right-shifted values.
2977 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a,
2978 int __count) {
2979 return __builtin_ia32_psrlqi128((__v2di)__a, __count);
2982 /// Right-shifts each of 64-bit values in the 128-bit integer vector
2983 /// operand by the specified number of bits. High-order bits are cleared.
2985 /// \headerfile <x86intrin.h>
2987 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
2989 /// \param __a
2990 /// A 128-bit integer vector containing the source operand.
2991 /// \param __count
2992 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2993 /// to right-shift each value in operand \a __a.
2994 /// \returns A 128-bit integer vector containing the right-shifted values.
2995 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a,
2996 __m128i __count) {
2997 return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
3000 /// Compares each of the corresponding 8-bit values of the 128-bit
3001 /// integer vectors for equality. Each comparison yields 0x0 for false, 0xFF
3002 /// for true.
3004 /// \headerfile <x86intrin.h>
3006 /// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
3008 /// \param __a
3009 /// A 128-bit integer vector.
3010 /// \param __b
3011 /// A 128-bit integer vector.
3012 /// \returns A 128-bit integer vector containing the comparison results.
3013 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a,
3014 __m128i __b) {
3015 return (__m128i)((__v16qi)__a == (__v16qi)__b);
3018 /// Compares each of the corresponding 16-bit values of the 128-bit
3019 /// integer vectors for equality. Each comparison yields 0x0 for false,
3020 /// 0xFFFF for true.
3022 /// \headerfile <x86intrin.h>
3024 /// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
3026 /// \param __a
3027 /// A 128-bit integer vector.
3028 /// \param __b
3029 /// A 128-bit integer vector.
3030 /// \returns A 128-bit integer vector containing the comparison results.
3031 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a,
3032 __m128i __b) {
3033 return (__m128i)((__v8hi)__a == (__v8hi)__b);
3036 /// Compares each of the corresponding 32-bit values of the 128-bit
3037 /// integer vectors for equality. Each comparison yields 0x0 for false,
3038 /// 0xFFFFFFFF for true.
3040 /// \headerfile <x86intrin.h>
3042 /// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
3044 /// \param __a
3045 /// A 128-bit integer vector.
3046 /// \param __b
3047 /// A 128-bit integer vector.
3048 /// \returns A 128-bit integer vector containing the comparison results.
3049 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a,
3050 __m128i __b) {
3051 return (__m128i)((__v4si)__a == (__v4si)__b);
3054 /// Compares each of the corresponding signed 8-bit values of the 128-bit
3055 /// integer vectors to determine if the values in the first operand are
3056 /// greater than those in the second operand. Each comparison yields 0x0 for
3057 /// false, 0xFF for true.
3059 /// \headerfile <x86intrin.h>
3061 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3063 /// \param __a
3064 /// A 128-bit integer vector.
3065 /// \param __b
3066 /// A 128-bit integer vector.
3067 /// \returns A 128-bit integer vector containing the comparison results.
3068 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a,
3069 __m128i __b) {
3070 /* This function always performs a signed comparison, but __v16qi is a char
3071 which may be signed or unsigned, so use __v16qs. */
3072 return (__m128i)((__v16qs)__a > (__v16qs)__b);
3075 /// Compares each of the corresponding signed 16-bit values of the
3076 /// 128-bit integer vectors to determine if the values in the first operand
3077 /// are greater than those in the second operand.
3079 /// Each comparison yields 0x0 for false, 0xFFFF for true.
3081 /// \headerfile <x86intrin.h>
3083 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3085 /// \param __a
3086 /// A 128-bit integer vector.
3087 /// \param __b
3088 /// A 128-bit integer vector.
3089 /// \returns A 128-bit integer vector containing the comparison results.
3090 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a,
3091 __m128i __b) {
3092 return (__m128i)((__v8hi)__a > (__v8hi)__b);
3095 /// Compares each of the corresponding signed 32-bit values of the
3096 /// 128-bit integer vectors to determine if the values in the first operand
3097 /// are greater than those in the second operand.
3099 /// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
3101 /// \headerfile <x86intrin.h>
3103 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3105 /// \param __a
3106 /// A 128-bit integer vector.
3107 /// \param __b
3108 /// A 128-bit integer vector.
3109 /// \returns A 128-bit integer vector containing the comparison results.
3110 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a,
3111 __m128i __b) {
3112 return (__m128i)((__v4si)__a > (__v4si)__b);
3115 /// Compares each of the corresponding signed 8-bit values of the 128-bit
3116 /// integer vectors to determine if the values in the first operand are less
3117 /// than those in the second operand.
3119 /// Each comparison yields 0x0 for false, 0xFF for true.
3121 /// \headerfile <x86intrin.h>
3123 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3125 /// \param __a
3126 /// A 128-bit integer vector.
3127 /// \param __b
3128 /// A 128-bit integer vector.
3129 /// \returns A 128-bit integer vector containing the comparison results.
3130 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a,
3131 __m128i __b) {
3132 return _mm_cmpgt_epi8(__b, __a);
3135 /// Compares each of the corresponding signed 16-bit values of the
3136 /// 128-bit integer vectors to determine if the values in the first operand
3137 /// are less than those in the second operand.
3139 /// Each comparison yields 0x0 for false, 0xFFFF for true.
3141 /// \headerfile <x86intrin.h>
3143 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3145 /// \param __a
3146 /// A 128-bit integer vector.
3147 /// \param __b
3148 /// A 128-bit integer vector.
3149 /// \returns A 128-bit integer vector containing the comparison results.
3150 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a,
3151 __m128i __b) {
3152 return _mm_cmpgt_epi16(__b, __a);
3155 /// Compares each of the corresponding signed 32-bit values of the
3156 /// 128-bit integer vectors to determine if the values in the first operand
3157 /// are less than those in the second operand.
3159 /// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
3161 /// \headerfile <x86intrin.h>
3163 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3165 /// \param __a
3166 /// A 128-bit integer vector.
3167 /// \param __b
3168 /// A 128-bit integer vector.
3169 /// \returns A 128-bit integer vector containing the comparison results.
3170 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a,
3171 __m128i __b) {
3172 return _mm_cmpgt_epi32(__b, __a);
3175 #ifdef __x86_64__
3176 /// Converts a 64-bit signed integer value from the second operand into a
3177 /// double-precision value and returns it in the lower element of a [2 x
3178 /// double] vector; the upper element of the returned vector is copied from
3179 /// the upper element of the first operand.
3181 /// \headerfile <x86intrin.h>
3183 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
3185 /// \param __a
3186 /// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
3187 /// copied to the upper 64 bits of the destination.
3188 /// \param __b
3189 /// A 64-bit signed integer operand containing the value to be converted.
3190 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
3191 /// converted value of the second operand. The upper 64 bits are copied from
3192 /// the upper 64 bits of the first operand.
3193 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi64_sd(__m128d __a,
3194 long long __b) {
3195 __a[0] = __b;
3196 return __a;
3199 /// Converts the first (lower) element of a vector of [2 x double] into a
3200 /// 64-bit signed integer value, according to the current rounding mode.
3202 /// \headerfile <x86intrin.h>
3204 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
3206 /// \param __a
3207 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3208 /// conversion.
3209 /// \returns A 64-bit signed integer containing the converted value.
3210 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsd_si64(__m128d __a) {
3211 return __builtin_ia32_cvtsd2si64((__v2df)__a);
3214 /// Converts the first (lower) element of a vector of [2 x double] into a
3215 /// 64-bit signed integer value, truncating the result when it is inexact.
3217 /// \headerfile <x86intrin.h>
3219 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
3220 /// instruction.
3222 /// \param __a
3223 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3224 /// conversion.
3225 /// \returns A 64-bit signed integer containing the converted value.
3226 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvttsd_si64(__m128d __a) {
3227 return __builtin_ia32_cvttsd2si64((__v2df)__a);
3229 #endif
3231 /// Converts a vector of [4 x i32] into a vector of [4 x float].
3233 /// \headerfile <x86intrin.h>
3235 /// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
3237 /// \param __a
3238 /// A 128-bit integer vector.
3239 /// \returns A 128-bit vector of [4 x float] containing the converted values.
3240 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a) {
3241 return (__m128) __builtin_convertvector((__v4si)__a, __v4sf);
3244 /// Converts a vector of [4 x float] into a vector of [4 x i32].
3246 /// \headerfile <x86intrin.h>
3248 /// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
3250 /// \param __a
3251 /// A 128-bit vector of [4 x float].
3252 /// \returns A 128-bit integer vector of [4 x i32] containing the converted
3253 /// values.
3254 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a) {
3255 return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
3258 /// Converts a vector of [4 x float] into a vector of [4 x i32],
3259 /// truncating the result when it is inexact.
3261 /// \headerfile <x86intrin.h>
3263 /// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
3264 /// instruction.
3266 /// \param __a
3267 /// A 128-bit vector of [4 x float].
3268 /// \returns A 128-bit vector of [4 x i32] containing the converted values.
3269 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a) {
3270 return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
3273 /// Returns a vector of [4 x i32] where the lowest element is the input
3274 /// operand and the remaining elements are zero.
3276 /// \headerfile <x86intrin.h>
3278 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3280 /// \param __a
3281 /// A 32-bit signed integer operand.
3282 /// \returns A 128-bit vector of [4 x i32].
3283 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a) {
3284 return __extension__(__m128i)(__v4si){__a, 0, 0, 0};
3287 /// Returns a vector of [2 x i64] where the lower element is the input
3288 /// operand and the upper element is zero.
3290 /// \headerfile <x86intrin.h>
3292 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction
3293 /// in 64-bit mode.
3295 /// \param __a
3296 /// A 64-bit signed integer operand containing the value to be converted.
3297 /// \returns A 128-bit vector of [2 x i64] containing the converted value.
3298 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a) {
3299 return __extension__(__m128i)(__v2di){__a, 0};
3302 /// Moves the least significant 32 bits of a vector of [4 x i32] to a
3303 /// 32-bit signed integer value.
3305 /// \headerfile <x86intrin.h>
3307 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3309 /// \param __a
3310 /// A vector of [4 x i32]. The least significant 32 bits are moved to the
3311 /// destination.
3312 /// \returns A 32-bit signed integer containing the moved value.
3313 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a) {
3314 __v4si __b = (__v4si)__a;
3315 return __b[0];
3318 /// Moves the least significant 64 bits of a vector of [2 x i64] to a
3319 /// 64-bit signed integer value.
3321 /// \headerfile <x86intrin.h>
3323 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3325 /// \param __a
3326 /// A vector of [2 x i64]. The least significant 64 bits are moved to the
3327 /// destination.
3328 /// \returns A 64-bit signed integer containing the moved value.
3329 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a) {
3330 return __a[0];
3333 /// Moves packed integer values from an aligned 128-bit memory location
3334 /// to elements in a 128-bit integer vector.
3336 /// \headerfile <x86intrin.h>
3338 /// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
3340 /// \param __p
3341 /// An aligned pointer to a memory location containing integer values.
3342 /// \returns A 128-bit integer vector containing the moved values.
3343 static __inline__ __m128i __DEFAULT_FN_ATTRS
3344 _mm_load_si128(__m128i const *__p) {
3345 return *__p;
3348 /// Moves packed integer values from an unaligned 128-bit memory location
3349 /// to elements in a 128-bit integer vector.
3351 /// \headerfile <x86intrin.h>
3353 /// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
3355 /// \param __p
3356 /// A pointer to a memory location containing integer values.
3357 /// \returns A 128-bit integer vector containing the moved values.
3358 static __inline__ __m128i __DEFAULT_FN_ATTRS
3359 _mm_loadu_si128(__m128i_u const *__p) {
3360 struct __loadu_si128 {
3361 __m128i_u __v;
3362 } __attribute__((__packed__, __may_alias__));
3363 return ((const struct __loadu_si128 *)__p)->__v;
3366 /// Returns a vector of [2 x i64] where the lower element is taken from
3367 /// the lower element of the operand, and the upper element is zero.
3369 /// \headerfile <x86intrin.h>
3371 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3373 /// \param __p
3374 /// A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of
3375 /// the destination.
3376 /// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
3377 /// moved value. The higher order bits are cleared.
3378 static __inline__ __m128i __DEFAULT_FN_ATTRS
3379 _mm_loadl_epi64(__m128i_u const *__p) {
3380 struct __mm_loadl_epi64_struct {
3381 long long __u;
3382 } __attribute__((__packed__, __may_alias__));
3383 return __extension__(__m128i){
3384 ((const struct __mm_loadl_epi64_struct *)__p)->__u, 0};
3387 /// Generates a 128-bit vector of [4 x i32] with unspecified content.
3388 /// This could be used as an argument to another intrinsic function where the
3389 /// argument is required but the value is not actually used.
3391 /// \headerfile <x86intrin.h>
3393 /// This intrinsic has no corresponding instruction.
3395 /// \returns A 128-bit vector of [4 x i32] with unspecified content.
3396 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void) {
3397 return (__m128i)__builtin_ia32_undef128();
3400 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3401 /// the specified 64-bit integer values.
3403 /// \headerfile <x86intrin.h>
3405 /// This intrinsic is a utility function and does not correspond to a specific
3406 /// instruction.
3408 /// \param __q1
3409 /// A 64-bit integer value used to initialize the upper 64 bits of the
3410 /// destination vector of [2 x i64].
3411 /// \param __q0
3412 /// A 64-bit integer value used to initialize the lower 64 bits of the
3413 /// destination vector of [2 x i64].
3414 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
3415 /// provided in the operands.
3416 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1,
3417 long long __q0) {
3418 return __extension__(__m128i)(__v2di){__q0, __q1};
3421 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3422 /// the specified 64-bit integer values.
3424 /// \headerfile <x86intrin.h>
3426 /// This intrinsic is a utility function and does not correspond to a specific
3427 /// instruction.
3429 /// \param __q1
3430 /// A 64-bit integer value used to initialize the upper 64 bits of the
3431 /// destination vector of [2 x i64].
3432 /// \param __q0
3433 /// A 64-bit integer value used to initialize the lower 64 bits of the
3434 /// destination vector of [2 x i64].
3435 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
3436 /// provided in the operands.
3437 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1,
3438 __m64 __q0) {
3439 return _mm_set_epi64x((long long)__q1, (long long)__q0);
3442 /// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
3443 /// the specified 32-bit integer values.
3445 /// \headerfile <x86intrin.h>
3447 /// This intrinsic is a utility function and does not correspond to a specific
3448 /// instruction.
3450 /// \param __i3
3451 /// A 32-bit integer value used to initialize bits [127:96] of the
3452 /// destination vector.
3453 /// \param __i2
3454 /// A 32-bit integer value used to initialize bits [95:64] of the destination
3455 /// vector.
3456 /// \param __i1
3457 /// A 32-bit integer value used to initialize bits [63:32] of the destination
3458 /// vector.
3459 /// \param __i0
3460 /// A 32-bit integer value used to initialize bits [31:0] of the destination
3461 /// vector.
3462 /// \returns An initialized 128-bit vector of [4 x i32] containing the values
3463 /// provided in the operands.
3464 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2,
3465 int __i1, int __i0) {
3466 return __extension__(__m128i)(__v4si){__i0, __i1, __i2, __i3};
3469 /// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
3470 /// the specified 16-bit integer values.
3472 /// \headerfile <x86intrin.h>
3474 /// This intrinsic is a utility function and does not correspond to a specific
3475 /// instruction.
3477 /// \param __w7
3478 /// A 16-bit integer value used to initialize bits [127:112] of the
3479 /// destination vector.
3480 /// \param __w6
3481 /// A 16-bit integer value used to initialize bits [111:96] of the
3482 /// destination vector.
3483 /// \param __w5
3484 /// A 16-bit integer value used to initialize bits [95:80] of the destination
3485 /// vector.
3486 /// \param __w4
3487 /// A 16-bit integer value used to initialize bits [79:64] of the destination
3488 /// vector.
3489 /// \param __w3
3490 /// A 16-bit integer value used to initialize bits [63:48] of the destination
3491 /// vector.
3492 /// \param __w2
3493 /// A 16-bit integer value used to initialize bits [47:32] of the destination
3494 /// vector.
3495 /// \param __w1
3496 /// A 16-bit integer value used to initialize bits [31:16] of the destination
3497 /// vector.
3498 /// \param __w0
3499 /// A 16-bit integer value used to initialize bits [15:0] of the destination
3500 /// vector.
3501 /// \returns An initialized 128-bit vector of [8 x i16] containing the values
3502 /// provided in the operands.
3503 static __inline__ __m128i __DEFAULT_FN_ATTRS
3504 _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3,
3505 short __w2, short __w1, short __w0) {
3506 return __extension__(__m128i)(__v8hi){__w0, __w1, __w2, __w3,
3507 __w4, __w5, __w6, __w7};
3510 /// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
3511 /// the specified 8-bit integer values.
3513 /// \headerfile <x86intrin.h>
3515 /// This intrinsic is a utility function and does not correspond to a specific
3516 /// instruction.
3518 /// \param __b15
3519 /// Initializes bits [127:120] of the destination vector.
3520 /// \param __b14
3521 /// Initializes bits [119:112] of the destination vector.
3522 /// \param __b13
3523 /// Initializes bits [111:104] of the destination vector.
3524 /// \param __b12
3525 /// Initializes bits [103:96] of the destination vector.
3526 /// \param __b11
3527 /// Initializes bits [95:88] of the destination vector.
3528 /// \param __b10
3529 /// Initializes bits [87:80] of the destination vector.
3530 /// \param __b9
3531 /// Initializes bits [79:72] of the destination vector.
3532 /// \param __b8
3533 /// Initializes bits [71:64] of the destination vector.
3534 /// \param __b7
3535 /// Initializes bits [63:56] of the destination vector.
3536 /// \param __b6
3537 /// Initializes bits [55:48] of the destination vector.
3538 /// \param __b5
3539 /// Initializes bits [47:40] of the destination vector.
3540 /// \param __b4
3541 /// Initializes bits [39:32] of the destination vector.
3542 /// \param __b3
3543 /// Initializes bits [31:24] of the destination vector.
3544 /// \param __b2
3545 /// Initializes bits [23:16] of the destination vector.
3546 /// \param __b1
3547 /// Initializes bits [15:8] of the destination vector.
3548 /// \param __b0
3549 /// Initializes bits [7:0] of the destination vector.
3550 /// \returns An initialized 128-bit vector of [16 x i8] containing the values
3551 /// provided in the operands.
3552 static __inline__ __m128i __DEFAULT_FN_ATTRS
3553 _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11,
3554 char __b10, char __b9, char __b8, char __b7, char __b6, char __b5,
3555 char __b4, char __b3, char __b2, char __b1, char __b0) {
3556 return __extension__(__m128i)(__v16qi){
3557 __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7,
3558 __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15};
3561 /// Initializes both values in a 128-bit integer vector with the
3562 /// specified 64-bit integer value.
3564 /// \headerfile <x86intrin.h>
3566 /// This intrinsic is a utility function and does not correspond to a specific
3567 /// instruction.
3569 /// \param __q
3570 /// Integer value used to initialize the elements of the destination integer
3571 /// vector.
3572 /// \returns An initialized 128-bit integer vector of [2 x i64] with both
3573 /// elements containing the value provided in the operand.
3574 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q) {
3575 return _mm_set_epi64x(__q, __q);
3578 /// Initializes both values in a 128-bit vector of [2 x i64] with the
3579 /// specified 64-bit value.
3581 /// \headerfile <x86intrin.h>
3583 /// This intrinsic is a utility function and does not correspond to a specific
3584 /// instruction.
3586 /// \param __q
3587 /// A 64-bit value used to initialize the elements of the destination integer
3588 /// vector.
3589 /// \returns An initialized 128-bit vector of [2 x i64] with all elements
3590 /// containing the value provided in the operand.
3591 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q) {
3592 return _mm_set_epi64(__q, __q);
3595 /// Initializes all values in a 128-bit vector of [4 x i32] with the
3596 /// specified 32-bit value.
3598 /// \headerfile <x86intrin.h>
3600 /// This intrinsic is a utility function and does not correspond to a specific
3601 /// instruction.
3603 /// \param __i
3604 /// A 32-bit value used to initialize the elements of the destination integer
3605 /// vector.
3606 /// \returns An initialized 128-bit vector of [4 x i32] with all elements
3607 /// containing the value provided in the operand.
3608 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i) {
3609 return _mm_set_epi32(__i, __i, __i, __i);
3612 /// Initializes all values in a 128-bit vector of [8 x i16] with the
3613 /// specified 16-bit value.
3615 /// \headerfile <x86intrin.h>
3617 /// This intrinsic is a utility function and does not correspond to a specific
3618 /// instruction.
3620 /// \param __w
3621 /// A 16-bit value used to initialize the elements of the destination integer
3622 /// vector.
3623 /// \returns An initialized 128-bit vector of [8 x i16] with all elements
3624 /// containing the value provided in the operand.
3625 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w) {
3626 return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w);
3629 /// Initializes all values in a 128-bit vector of [16 x i8] with the
3630 /// specified 8-bit value.
3632 /// \headerfile <x86intrin.h>
3634 /// This intrinsic is a utility function and does not correspond to a specific
3635 /// instruction.
3637 /// \param __b
3638 /// An 8-bit value used to initialize the elements of the destination integer
3639 /// vector.
3640 /// \returns An initialized 128-bit vector of [16 x i8] with all elements
3641 /// containing the value provided in the operand.
3642 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b) {
3643 return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
3644 __b, __b, __b, __b, __b);
3647 /// Constructs a 128-bit integer vector, initialized in reverse order
3648 /// with the specified 64-bit integral values.
3650 /// \headerfile <x86intrin.h>
3652 /// This intrinsic does not correspond to a specific instruction.
3654 /// \param __q0
3655 /// A 64-bit integral value used to initialize the lower 64 bits of the
3656 /// result.
3657 /// \param __q1
3658 /// A 64-bit integral value used to initialize the upper 64 bits of the
3659 /// result.
3660 /// \returns An initialized 128-bit integer vector.
3661 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0,
3662 __m64 __q1) {
3663 return _mm_set_epi64(__q1, __q0);
3666 /// Constructs a 128-bit integer vector, initialized in reverse order
3667 /// with the specified 32-bit integral values.
3669 /// \headerfile <x86intrin.h>
3671 /// This intrinsic is a utility function and does not correspond to a specific
3672 /// instruction.
3674 /// \param __i0
3675 /// A 32-bit integral value used to initialize bits [31:0] of the result.
3676 /// \param __i1
3677 /// A 32-bit integral value used to initialize bits [63:32] of the result.
3678 /// \param __i2
3679 /// A 32-bit integral value used to initialize bits [95:64] of the result.
3680 /// \param __i3
3681 /// A 32-bit integral value used to initialize bits [127:96] of the result.
3682 /// \returns An initialized 128-bit integer vector.
3683 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1,
3684 int __i2,
3685 int __i3) {
3686 return _mm_set_epi32(__i3, __i2, __i1, __i0);
3689 /// Constructs a 128-bit integer vector, initialized in reverse order
3690 /// with the specified 16-bit integral values.
3692 /// \headerfile <x86intrin.h>
3694 /// This intrinsic is a utility function and does not correspond to a specific
3695 /// instruction.
3697 /// \param __w0
3698 /// A 16-bit integral value used to initialize bits [15:0] of the result.
3699 /// \param __w1
3700 /// A 16-bit integral value used to initialize bits [31:16] of the result.
3701 /// \param __w2
3702 /// A 16-bit integral value used to initialize bits [47:32] of the result.
3703 /// \param __w3
3704 /// A 16-bit integral value used to initialize bits [63:48] of the result.
3705 /// \param __w4
3706 /// A 16-bit integral value used to initialize bits [79:64] of the result.
3707 /// \param __w5
3708 /// A 16-bit integral value used to initialize bits [95:80] of the result.
3709 /// \param __w6
3710 /// A 16-bit integral value used to initialize bits [111:96] of the result.
3711 /// \param __w7
3712 /// A 16-bit integral value used to initialize bits [127:112] of the result.
3713 /// \returns An initialized 128-bit integer vector.
3714 static __inline__ __m128i __DEFAULT_FN_ATTRS
3715 _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4,
3716 short __w5, short __w6, short __w7) {
3717 return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0);
3720 /// Constructs a 128-bit integer vector, initialized in reverse order
3721 /// with the specified 8-bit integral values.
3723 /// \headerfile <x86intrin.h>
3725 /// This intrinsic is a utility function and does not correspond to a specific
3726 /// instruction.
3728 /// \param __b0
3729 /// An 8-bit integral value used to initialize bits [7:0] of the result.
3730 /// \param __b1
3731 /// An 8-bit integral value used to initialize bits [15:8] of the result.
3732 /// \param __b2
3733 /// An 8-bit integral value used to initialize bits [23:16] of the result.
3734 /// \param __b3
3735 /// An 8-bit integral value used to initialize bits [31:24] of the result.
3736 /// \param __b4
3737 /// An 8-bit integral value used to initialize bits [39:32] of the result.
3738 /// \param __b5
3739 /// An 8-bit integral value used to initialize bits [47:40] of the result.
3740 /// \param __b6
3741 /// An 8-bit integral value used to initialize bits [55:48] of the result.
3742 /// \param __b7
3743 /// An 8-bit integral value used to initialize bits [63:56] of the result.
3744 /// \param __b8
3745 /// An 8-bit integral value used to initialize bits [71:64] of the result.
3746 /// \param __b9
3747 /// An 8-bit integral value used to initialize bits [79:72] of the result.
3748 /// \param __b10
3749 /// An 8-bit integral value used to initialize bits [87:80] of the result.
3750 /// \param __b11
3751 /// An 8-bit integral value used to initialize bits [95:88] of the result.
3752 /// \param __b12
3753 /// An 8-bit integral value used to initialize bits [103:96] of the result.
3754 /// \param __b13
3755 /// An 8-bit integral value used to initialize bits [111:104] of the result.
3756 /// \param __b14
3757 /// An 8-bit integral value used to initialize bits [119:112] of the result.
3758 /// \param __b15
3759 /// An 8-bit integral value used to initialize bits [127:120] of the result.
3760 /// \returns An initialized 128-bit integer vector.
3761 static __inline__ __m128i __DEFAULT_FN_ATTRS
3762 _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
3763 char __b6, char __b7, char __b8, char __b9, char __b10,
3764 char __b11, char __b12, char __b13, char __b14, char __b15) {
3765 return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8,
3766 __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
3769 /// Creates a 128-bit integer vector initialized to zero.
3771 /// \headerfile <x86intrin.h>
3773 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
3775 /// \returns An initialized 128-bit integer vector with all elements set to
3776 /// zero.
3777 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void) {
3778 return __extension__(__m128i)(__v2di){0LL, 0LL};
3781 /// Stores a 128-bit integer vector to a memory location aligned on a
3782 /// 128-bit boundary.
3784 /// \headerfile <x86intrin.h>
3786 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
3788 /// \param __p
3789 /// A pointer to an aligned memory location that will receive the integer
3790 /// values.
3791 /// \param __b
3792 /// A 128-bit integer vector containing the values to be moved.
3793 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p,
3794 __m128i __b) {
3795 *__p = __b;
3798 /// Stores a 128-bit integer vector to an unaligned memory location.
3800 /// \headerfile <x86intrin.h>
3802 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
3804 /// \param __p
3805 /// A pointer to a memory location that will receive the integer values.
3806 /// \param __b
3807 /// A 128-bit integer vector containing the values to be moved.
3808 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p,
3809 __m128i __b) {
3810 struct __storeu_si128 {
3811 __m128i_u __v;
3812 } __attribute__((__packed__, __may_alias__));
3813 ((struct __storeu_si128 *)__p)->__v = __b;
3816 /// Stores a 64-bit integer value from the low element of a 128-bit integer
3817 /// vector.
3819 /// \headerfile <x86intrin.h>
3821 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3823 /// \param __p
3824 /// A pointer to a 64-bit memory location. The address of the memory
3825 /// location does not have to be aligned.
3826 /// \param __b
3827 /// A 128-bit integer vector containing the value to be stored.
3828 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si64(void *__p,
3829 __m128i __b) {
3830 struct __storeu_si64 {
3831 long long __v;
3832 } __attribute__((__packed__, __may_alias__));
3833 ((struct __storeu_si64 *)__p)->__v = ((__v2di)__b)[0];
3836 /// Stores a 32-bit integer value from the low element of a 128-bit integer
3837 /// vector.
3839 /// \headerfile <x86intrin.h>
3841 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3843 /// \param __p
3844 /// A pointer to a 32-bit memory location. The address of the memory
3845 /// location does not have to be aligned.
3846 /// \param __b
3847 /// A 128-bit integer vector containing the value to be stored.
3848 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si32(void *__p,
3849 __m128i __b) {
3850 struct __storeu_si32 {
3851 int __v;
3852 } __attribute__((__packed__, __may_alias__));
3853 ((struct __storeu_si32 *)__p)->__v = ((__v4si)__b)[0];
3856 /// Stores a 16-bit integer value from the low element of a 128-bit integer
3857 /// vector.
3859 /// \headerfile <x86intrin.h>
3861 /// This intrinsic does not correspond to a specific instruction.
3863 /// \param __p
3864 /// A pointer to a 16-bit memory location. The address of the memory
3865 /// location does not have to be aligned.
3866 /// \param __b
3867 /// A 128-bit integer vector containing the value to be stored.
3868 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si16(void *__p,
3869 __m128i __b) {
3870 struct __storeu_si16 {
3871 short __v;
3872 } __attribute__((__packed__, __may_alias__));
3873 ((struct __storeu_si16 *)__p)->__v = ((__v8hi)__b)[0];
3876 /// Moves bytes selected by the mask from the first operand to the
3877 /// specified unaligned memory location. When a mask bit is 1, the
3878 /// corresponding byte is written, otherwise it is not written.
3880 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
3881 /// used again soon). Exception and trap behavior for elements not selected
3882 /// for storage to memory are implementation dependent.
3884 /// \headerfile <x86intrin.h>
3886 /// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
3887 /// instruction.
3889 /// \param __d
3890 /// A 128-bit integer vector containing the values to be moved.
3891 /// \param __n
3892 /// A 128-bit integer vector containing the mask. The most significant bit of
3893 /// each byte represents the mask bits.
3894 /// \param __p
3895 /// A pointer to an unaligned 128-bit memory location where the specified
3896 /// values are moved.
3897 static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d,
3898 __m128i __n,
3899 char *__p) {
3900 __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
3903 /// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
3904 /// a memory location.
3906 /// \headerfile <x86intrin.h>
3908 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
3910 /// \param __p
3911 /// A pointer to a 64-bit memory location that will receive the lower 64 bits
3912 /// of the integer vector parameter.
3913 /// \param __a
3914 /// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
3915 /// value to be stored.
3916 static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p,
3917 __m128i __a) {
3918 struct __mm_storel_epi64_struct {
3919 long long __u;
3920 } __attribute__((__packed__, __may_alias__));
3921 ((struct __mm_storel_epi64_struct *)__p)->__u = __a[0];
3924 /// Stores a 128-bit floating point vector of [2 x double] to a 128-bit
3925 /// aligned memory location.
3927 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
3928 /// used again soon).
3930 /// \headerfile <x86intrin.h>
3932 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
3934 /// \param __p
3935 /// A pointer to the 128-bit aligned memory location used to store the value.
3936 /// \param __a
3937 /// A vector of [2 x double] containing the 64-bit values to be stored.
3938 static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(double *__p,
3939 __m128d __a) {
3940 __builtin_nontemporal_store((__v2df)__a, (__v2df *)__p);
3943 /// Stores a 128-bit integer vector to a 128-bit aligned memory location.
3945 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
3946 /// used again soon).
3948 /// \headerfile <x86intrin.h>
3950 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
3952 /// \param __p
3953 /// A pointer to the 128-bit aligned memory location used to store the value.
3954 /// \param __a
3955 /// A 128-bit integer vector containing the values to be stored.
3956 static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(__m128i *__p,
3957 __m128i __a) {
3958 __builtin_nontemporal_store((__v2di)__a, (__v2di *)__p);
3961 /// Stores a 32-bit integer value in the specified memory location.
3963 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
3964 /// used again soon).
3966 /// \headerfile <x86intrin.h>
3968 /// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
3970 /// \param __p
3971 /// A pointer to the 32-bit memory location used to store the value.
3972 /// \param __a
3973 /// A 32-bit integer containing the value to be stored.
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
    _mm_stream_si32(int *__p, int __a) {
  /* Scalar store, so the attribute list is spelled out here instead of
     using the vector-oriented __DEFAULT_FN_ATTRS. */
  int *__dst = __p;
  int __val = __a;
  __builtin_ia32_movnti(__dst, __val);
}
3980 #ifdef __x86_64__
3981 /// Stores a 64-bit integer value in the specified memory location.
3983 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
3984 /// used again soon).
3986 /// \headerfile <x86intrin.h>
3988 /// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
3990 /// \param __p
3991 /// A pointer to the 64-bit memory location used to store the value.
3992 /// \param __a
3993 /// A 64-bit integer containing the value to be stored.
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
    _mm_stream_si64(long long *__p, long long __a) {
  /* 64-bit counterpart of the 32-bit streaming store; the enclosing
     preprocessor guard restricts it to x86-64 builds. */
  long long *__dst = __p;
  long long __val = __a;
  __builtin_ia32_movnti64(__dst, __val);
}
3999 #endif
#ifdef __cplusplus
extern "C" {
#endif

/// Flushes and invalidates, from every cache in the coherency domain, the
/// cache line that contains \a __p.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
///
/// \param __p
///    A pointer to the memory location that identifies the cache line to be
///    flushed.
void _mm_clflush(void const *__p);

/// Serializes load instructions: the system completes every load that
/// precedes this instruction before it executes any load that follows it.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
///
void _mm_lfence(void);

/// Serializes memory accesses: the system completes every load and store
/// that precedes this instruction before it executes any load or store that
/// follows it.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MFENCE </c> instruction.
///
void _mm_mfence(void);

#ifdef __cplusplus
} // extern "C"
#endif
4043 /// Converts 16-bit signed integers from both 128-bit integer vector
4044 /// operands into 8-bit signed integers, and packs the results into the
4045 /// destination. Positive values greater than 0x7F are saturated to 0x7F.
4046 /// Negative values less than 0x80 are saturated to 0x80.
4048 /// \headerfile <x86intrin.h>
4050 /// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
4052 /// \param __a
4053 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
/// a signed integer and is converted to an 8-bit signed integer with
4055 /// saturation. Values greater than 0x7F are saturated to 0x7F. Values less
4056 /// than 0x80 are saturated to 0x80. The converted [8 x i8] values are
4057 /// written to the lower 64 bits of the result.
4058 /// \param __b
4059 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
/// a signed integer and is converted to an 8-bit signed integer with
4061 /// saturation. Values greater than 0x7F are saturated to 0x7F. Values less
4062 /// than 0x80 are saturated to 0x80. The converted [8 x i8] values are
4063 /// written to the higher 64 bits of the result.
4064 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
4065 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a,
4066 __m128i __b) {
4067 return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
4070 /// Converts 32-bit signed integers from both 128-bit integer vector
4071 /// operands into 16-bit signed integers, and packs the results into the
4072 /// destination. Positive values greater than 0x7FFF are saturated to 0x7FFF.
4073 /// Negative values less than 0x8000 are saturated to 0x8000.
4075 /// \headerfile <x86intrin.h>
4077 /// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
4079 /// \param __a
4080 /// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
4081 /// a signed integer and is converted to a 16-bit signed integer with
4082 /// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
4083 /// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
4084 /// are written to the lower 64 bits of the result.
4085 /// \param __b
4086 /// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
4087 /// a signed integer and is converted to a 16-bit signed integer with
4088 /// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
4089 /// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
4090 /// are written to the higher 64 bits of the result.
4091 /// \returns A 128-bit vector of [8 x i16] containing the converted values.
4092 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a,
4093 __m128i __b) {
4094 return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
4097 /// Converts 16-bit signed integers from both 128-bit integer vector
4098 /// operands into 8-bit unsigned integers, and packs the results into the
4099 /// destination. Values greater than 0xFF are saturated to 0xFF. Values less
4100 /// than 0x00 are saturated to 0x00.
4102 /// \headerfile <x86intrin.h>
4104 /// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
4106 /// \param __a
4107 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4108 /// a signed integer and is converted to an 8-bit unsigned integer with
4109 /// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
4110 /// than 0x00 are saturated to 0x00. The converted [8 x i8] values are
4111 /// written to the lower 64 bits of the result.
4112 /// \param __b
4113 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4114 /// a signed integer and is converted to an 8-bit unsigned integer with
4115 /// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
4116 /// than 0x00 are saturated to 0x00. The converted [8 x i8] values are
4117 /// written to the higher 64 bits of the result.
4118 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
4119 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a,
4120 __m128i __b) {
4121 return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
4124 /// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
4125 /// the immediate-value parameter as a selector.
4127 /// \headerfile <x86intrin.h>
4129 /// \code
/// int _mm_extract_epi16(__m128i a, const int imm);
4131 /// \endcode
4133 /// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
4135 /// \param a
4136 /// A 128-bit integer vector.
4137 /// \param imm
/// An immediate value. Bits [2:0] select the value from \a a that is assigned
/// to bits [15:0] of the result. \n
4140 /// 000: assign values from bits [15:0] of \a a. \n
4141 /// 001: assign values from bits [31:16] of \a a. \n
4142 /// 010: assign values from bits [47:32] of \a a. \n
4143 /// 011: assign values from bits [63:48] of \a a. \n
4144 /// 100: assign values from bits [79:64] of \a a. \n
4145 /// 101: assign values from bits [95:80] of \a a. \n
4146 /// 110: assign values from bits [111:96] of \a a. \n
4147 /// 111: assign values from bits [127:112] of \a a.
4148 /// \returns An integer, whose lower 16 bits are selected from the 128-bit
4149 /// integer vector parameter and the remaining bits are assigned zeros.
#define _mm_extract_epi16(a, imm) \
  /* The (unsigned short) step zero-extends the element into the int. */ \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi( \
      (__v8hi)(__m128i)(a), \
      (int)(imm)))
4154 /// Constructs a 128-bit integer vector by first making a copy of the
4155 /// 128-bit integer vector parameter, and then inserting the lower 16 bits
4156 /// of an integer parameter into an offset specified by the immediate-value
4157 /// parameter.
4159 /// \headerfile <x86intrin.h>
4161 /// \code
4162 /// __m128i _mm_insert_epi16(__m128i a, int b, const int imm);
4163 /// \endcode
4165 /// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
4167 /// \param a
4168 /// A 128-bit integer vector of [8 x i16]. This vector is copied to the
4169 /// result and then one of the eight elements in the result is replaced by
4170 /// the lower 16 bits of \a b.
4171 /// \param b
4172 /// An integer. The lower 16 bits of this parameter are written to the
4173 /// result beginning at an offset specified by \a imm.
4174 /// \param imm
/// An immediate value selecting which of the eight 16-bit elements of the
/// result is replaced by the lower 16 bits of \a b.
4177 /// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi16(a, b, imm) \
  /* a is viewed as [8 x i16]; b and imm reach the builtin as ints. */ \
  ((__m128i)__builtin_ia32_vec_set_v8hi( \
      (__v8hi)(__m128i)(a), \
      (int)(b), \
      (int)(imm)))
4182 /// Copies the values of the most significant bits from each 8-bit
4183 /// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
4184 /// value, zero-extends the value, and writes it to the destination.
4186 /// \headerfile <x86intrin.h>
4188 /// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
4190 /// \param __a
4191 /// A 128-bit integer vector containing the values with bits to be extracted.
4192 /// \returns The most significant bits from each 8-bit element in \a __a,
4193 /// written to bits [15:0]. The other bits are assigned zeros.
4194 static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a) {
4195 return __builtin_ia32_pmovmskb128((__v16qi)__a);
4198 /// Constructs a 128-bit integer vector by shuffling four 32-bit
4199 /// elements of a 128-bit integer vector parameter, using the immediate-value
4200 /// parameter as a specifier.
4202 /// \headerfile <x86intrin.h>
4204 /// \code
4205 /// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
4206 /// \endcode
4208 /// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
4210 /// \param a
4211 /// A 128-bit integer vector containing the values to be copied.
4212 /// \param imm
4213 /// An immediate value containing an 8-bit value specifying which elements to
4214 /// copy from a. The destinations within the 128-bit destination are assigned
4215 /// values as follows: \n
4216 /// Bits [1:0] are used to assign values to bits [31:0] of the result. \n
4217 /// Bits [3:2] are used to assign values to bits [63:32] of the result. \n
4218 /// Bits [5:4] are used to assign values to bits [95:64] of the result. \n
4219 /// Bits [7:6] are used to assign values to bits [127:96] of the result. \n
4220 /// Bit value assignments: \n
4221 /// 00: assign values from bits [31:0] of \a a. \n
4222 /// 01: assign values from bits [63:32] of \a a. \n
4223 /// 10: assign values from bits [95:64] of \a a. \n
4224 /// 11: assign values from bits [127:96] of \a a. \n
4225 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4226 /// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4227 /// <c>[b6, b4, b2, b0]</c>.
4228 /// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shuffle_epi32(a, imm) \
  /* a is viewed as [4 x i32]; imm reaches the builtin as an int. */ \
  ((__m128i)__builtin_ia32_pshufd( \
      (__v4si)(__m128i)(a), \
      (int)(imm)))
4232 /// Constructs a 128-bit integer vector by shuffling four lower 16-bit
4233 /// elements of a 128-bit integer vector of [8 x i16], using the immediate
4234 /// value parameter as a specifier.
4236 /// \headerfile <x86intrin.h>
4238 /// \code
4239 /// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
4240 /// \endcode
4242 /// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
4244 /// \param a
4245 /// A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
4246 /// [127:64] of the result.
4247 /// \param imm
4248 /// An 8-bit immediate value specifying which elements to copy from \a a. \n
4249 /// Bits[1:0] are used to assign values to bits [15:0] of the result. \n
4250 /// Bits[3:2] are used to assign values to bits [31:16] of the result. \n
4251 /// Bits[5:4] are used to assign values to bits [47:32] of the result. \n
4252 /// Bits[7:6] are used to assign values to bits [63:48] of the result. \n
4253 /// Bit value assignments: \n
4254 /// 00: assign values from bits [15:0] of \a a. \n
4255 /// 01: assign values from bits [31:16] of \a a. \n
4256 /// 10: assign values from bits [47:32] of \a a. \n
4257 /// 11: assign values from bits [63:48] of \a a. \n
4258 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4259 /// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4260 /// <c>[b6, b4, b2, b0]</c>.
4261 /// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflelo_epi16(a, imm) \
  /* a is viewed as [8 x i16]; imm reaches the builtin as an int. */ \
  ((__m128i)__builtin_ia32_pshuflw( \
      (__v8hi)(__m128i)(a), \
      (int)(imm)))
4265 /// Constructs a 128-bit integer vector by shuffling four upper 16-bit
4266 /// elements of a 128-bit integer vector of [8 x i16], using the immediate
4267 /// value parameter as a specifier.
4269 /// \headerfile <x86intrin.h>
4271 /// \code
4272 /// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
4273 /// \endcode
4275 /// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
4277 /// \param a
4278 /// A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
4279 /// [63:0] of the result.
4280 /// \param imm
4281 /// An 8-bit immediate value specifying which elements to copy from \a a. \n
4282 /// Bits[1:0] are used to assign values to bits [79:64] of the result. \n
4283 /// Bits[3:2] are used to assign values to bits [95:80] of the result. \n
4284 /// Bits[5:4] are used to assign values to bits [111:96] of the result. \n
4285 /// Bits[7:6] are used to assign values to bits [127:112] of the result. \n
4286 /// Bit value assignments: \n
4287 /// 00: assign values from bits [79:64] of \a a. \n
4288 /// 01: assign values from bits [95:80] of \a a. \n
4289 /// 10: assign values from bits [111:96] of \a a. \n
4290 /// 11: assign values from bits [127:112] of \a a. \n
4291 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4292 /// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4293 /// <c>[b6, b4, b2, b0]</c>.
4294 /// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflehi_epi16(a, imm) \
  /* a is viewed as [8 x i16]; imm reaches the builtin as an int. */ \
  ((__m128i)__builtin_ia32_pshufhw( \
      (__v8hi)(__m128i)(a), \
      (int)(imm)))
4298 /// Unpacks the high-order (index 8-15) values from two 128-bit vectors
4299 /// of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4301 /// \headerfile <x86intrin.h>
4303 /// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
4304 /// instruction.
4306 /// \param __a
4307 /// A 128-bit vector of [16 x i8].
4308 /// Bits [71:64] are written to bits [7:0] of the result. \n
4309 /// Bits [79:72] are written to bits [23:16] of the result. \n
4310 /// Bits [87:80] are written to bits [39:32] of the result. \n
4311 /// Bits [95:88] are written to bits [55:48] of the result. \n
4312 /// Bits [103:96] are written to bits [71:64] of the result. \n
4313 /// Bits [111:104] are written to bits [87:80] of the result. \n
4314 /// Bits [119:112] are written to bits [103:96] of the result. \n
4315 /// Bits [127:120] are written to bits [119:112] of the result.
4316 /// \param __b
4317 /// A 128-bit vector of [16 x i8]. \n
4318 /// Bits [71:64] are written to bits [15:8] of the result. \n
4319 /// Bits [79:72] are written to bits [31:24] of the result. \n
4320 /// Bits [87:80] are written to bits [47:40] of the result. \n
4321 /// Bits [95:88] are written to bits [63:56] of the result. \n
4322 /// Bits [103:96] are written to bits [79:72] of the result. \n
4323 /// Bits [111:104] are written to bits [95:88] of the result. \n
4324 /// Bits [119:112] are written to bits [111:104] of the result. \n
4325 /// Bits [127:120] are written to bits [127:120] of the result.
4326 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4327 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a,
4328 __m128i __b) {
4329 return (__m128i)__builtin_shufflevector(
4330 (__v16qi)__a, (__v16qi)__b, 8, 16 + 8, 9, 16 + 9, 10, 16 + 10, 11,
4331 16 + 11, 12, 16 + 12, 13, 16 + 13, 14, 16 + 14, 15, 16 + 15);
4334 /// Unpacks the high-order (index 4-7) values from two 128-bit vectors of
4335 /// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
4337 /// \headerfile <x86intrin.h>
4339 /// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
4340 /// instruction.
4342 /// \param __a
4343 /// A 128-bit vector of [8 x i16].
4344 /// Bits [79:64] are written to bits [15:0] of the result. \n
4345 /// Bits [95:80] are written to bits [47:32] of the result. \n
4346 /// Bits [111:96] are written to bits [79:64] of the result. \n
4347 /// Bits [127:112] are written to bits [111:96] of the result.
4348 /// \param __b
4349 /// A 128-bit vector of [8 x i16].
4350 /// Bits [79:64] are written to bits [31:16] of the result. \n
4351 /// Bits [95:80] are written to bits [63:48] of the result. \n
4352 /// Bits [111:96] are written to bits [95:80] of the result. \n
4353 /// Bits [127:112] are written to bits [127:112] of the result.
4354 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4355 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a,
4356 __m128i __b) {
4357 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8 + 4, 5,
4358 8 + 5, 6, 8 + 6, 7, 8 + 7);
4361 /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
4362 /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4364 /// \headerfile <x86intrin.h>
4366 /// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
4367 /// instruction.
4369 /// \param __a
4370 /// A 128-bit vector of [4 x i32]. \n
4371 /// Bits [95:64] are written to bits [31:0] of the destination. \n
4372 /// Bits [127:96] are written to bits [95:64] of the destination.
4373 /// \param __b
4374 /// A 128-bit vector of [4 x i32]. \n
/// Bits [95:64] are written to bits [63:32] of the destination. \n
4376 /// Bits [127:96] are written to bits [127:96] of the destination.
4377 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4378 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a,
4379 __m128i __b) {
4380 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4 + 2, 3,
4381 4 + 3);
4384 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4385 /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4387 /// \headerfile <x86intrin.h>
4389 /// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
4390 /// instruction.
4392 /// \param __a
4393 /// A 128-bit vector of [2 x i64]. \n
4394 /// Bits [127:64] are written to bits [63:0] of the destination.
4395 /// \param __b
4396 /// A 128-bit vector of [2 x i64]. \n
4397 /// Bits [127:64] are written to bits [127:64] of the destination.
4398 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4399 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a,
4400 __m128i __b) {
4401 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2 + 1);
4404 /// Unpacks the low-order (index 0-7) values from two 128-bit vectors of
4405 /// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4407 /// \headerfile <x86intrin.h>
4409 /// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
4410 /// instruction.
4412 /// \param __a
4413 /// A 128-bit vector of [16 x i8]. \n
4414 /// Bits [7:0] are written to bits [7:0] of the result. \n
4415 /// Bits [15:8] are written to bits [23:16] of the result. \n
4416 /// Bits [23:16] are written to bits [39:32] of the result. \n
4417 /// Bits [31:24] are written to bits [55:48] of the result. \n
4418 /// Bits [39:32] are written to bits [71:64] of the result. \n
4419 /// Bits [47:40] are written to bits [87:80] of the result. \n
4420 /// Bits [55:48] are written to bits [103:96] of the result. \n
4421 /// Bits [63:56] are written to bits [119:112] of the result.
4422 /// \param __b
4423 /// A 128-bit vector of [16 x i8].
4424 /// Bits [7:0] are written to bits [15:8] of the result. \n
4425 /// Bits [15:8] are written to bits [31:24] of the result. \n
4426 /// Bits [23:16] are written to bits [47:40] of the result. \n
4427 /// Bits [31:24] are written to bits [63:56] of the result. \n
4428 /// Bits [39:32] are written to bits [79:72] of the result. \n
4429 /// Bits [47:40] are written to bits [95:88] of the result. \n
4430 /// Bits [55:48] are written to bits [111:104] of the result. \n
4431 /// Bits [63:56] are written to bits [127:120] of the result.
4432 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4433 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a,
4434 __m128i __b) {
4435 return (__m128i)__builtin_shufflevector(
4436 (__v16qi)__a, (__v16qi)__b, 0, 16 + 0, 1, 16 + 1, 2, 16 + 2, 3, 16 + 3, 4,
4437 16 + 4, 5, 16 + 5, 6, 16 + 6, 7, 16 + 7);
4440 /// Unpacks the low-order (index 0-3) values from each of the two 128-bit
4441 /// vectors of [8 x i16] and interleaves them into a 128-bit vector of
4442 /// [8 x i16].
4444 /// \headerfile <x86intrin.h>
4446 /// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
4447 /// instruction.
4449 /// \param __a
4450 /// A 128-bit vector of [8 x i16].
4451 /// Bits [15:0] are written to bits [15:0] of the result. \n
4452 /// Bits [31:16] are written to bits [47:32] of the result. \n
4453 /// Bits [47:32] are written to bits [79:64] of the result. \n
4454 /// Bits [63:48] are written to bits [111:96] of the result.
4455 /// \param __b
4456 /// A 128-bit vector of [8 x i16].
4457 /// Bits [15:0] are written to bits [31:16] of the result. \n
4458 /// Bits [31:16] are written to bits [63:48] of the result. \n
4459 /// Bits [47:32] are written to bits [95:80] of the result. \n
4460 /// Bits [63:48] are written to bits [127:112] of the result.
4461 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4462 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a,
4463 __m128i __b) {
4464 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8 + 0, 1,
4465 8 + 1, 2, 8 + 2, 3, 8 + 3);
4468 /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
4469 /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4471 /// \headerfile <x86intrin.h>
4473 /// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
4474 /// instruction.
4476 /// \param __a
4477 /// A 128-bit vector of [4 x i32]. \n
4478 /// Bits [31:0] are written to bits [31:0] of the destination. \n
4479 /// Bits [63:32] are written to bits [95:64] of the destination.
4480 /// \param __b
4481 /// A 128-bit vector of [4 x i32]. \n
/// Bits [31:0] are written to bits [63:32] of the destination. \n
4483 /// Bits [63:32] are written to bits [127:96] of the destination.
4484 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4485 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a,
4486 __m128i __b) {
4487 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4 + 0, 1,
4488 4 + 1);
4491 /// Unpacks the low-order 64-bit elements from two 128-bit vectors of
4492 /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4494 /// \headerfile <x86intrin.h>
4496 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
4497 /// instruction.
4499 /// \param __a
4500 /// A 128-bit vector of [2 x i64]. \n
/// Bits [63:0] are written to bits [63:0] of the destination.
4502 /// \param __b
4503 /// A 128-bit vector of [2 x i64]. \n
/// Bits [63:0] are written to bits [127:64] of the destination.
4505 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4506 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a,
4507 __m128i __b) {
4508 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2 + 0);
4511 /// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
4512 /// integer.
4514 /// \headerfile <x86intrin.h>
4516 /// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
4518 /// \param __a
4519 /// A 128-bit integer vector operand. The lower 64 bits are moved to the
4520 /// destination.
4521 /// \returns A 64-bit integer containing the lower 64 bits of the parameter.
4522 static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a) {
4523 return (__m64)__a[0];
4526 /// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
4527 /// upper bits.
4529 /// \headerfile <x86intrin.h>
4531 /// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
4533 /// \param __a
4534 /// A 64-bit value.
4535 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4536 /// the operand. The upper 64 bits are assigned zeros.
4537 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a) {
4538 return __extension__(__m128i)(__v2di){(long long)__a, 0};
4541 /// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
4542 /// integer vector, zeroing the upper bits.
4544 /// \headerfile <x86intrin.h>
4546 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
4548 /// \param __a
4549 /// A 128-bit integer vector operand. The lower 64 bits are moved to the
4550 /// destination.
4551 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4552 /// the operand. The upper 64 bits are assigned zeros.
4553 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a) {
4554 return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2);
4557 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4558 /// [2 x double] and interleaves them into a 128-bit vector of [2 x
4559 /// double].
4561 /// \headerfile <x86intrin.h>
4563 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
4565 /// \param __a
4566 /// A 128-bit vector of [2 x double]. \n
4567 /// Bits [127:64] are written to bits [63:0] of the destination.
4568 /// \param __b
4569 /// A 128-bit vector of [2 x double]. \n
4570 /// Bits [127:64] are written to bits [127:64] of the destination.
4571 /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4572 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a,
4573 __m128d __b) {
4574 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2 + 1);
4577 /// Unpacks the low-order 64-bit elements from two 128-bit vectors
4578 /// of [2 x double] and interleaves them into a 128-bit vector of [2 x
4579 /// double].
4581 /// \headerfile <x86intrin.h>
4583 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
4585 /// \param __a
4586 /// A 128-bit vector of [2 x double]. \n
4587 /// Bits [63:0] are written to bits [63:0] of the destination.
4588 /// \param __b
4589 /// A 128-bit vector of [2 x double]. \n
4590 /// Bits [63:0] are written to bits [127:64] of the destination.
4591 /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4592 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a,
4593 __m128d __b) {
4594 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2 + 0);
4597 /// Extracts the sign bits of the double-precision values in the 128-bit
4598 /// vector of [2 x double], zero-extends the value, and writes it to the
4599 /// low-order bits of the destination.
4601 /// \headerfile <x86intrin.h>
4603 /// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
4605 /// \param __a
4606 /// A 128-bit vector of [2 x double] containing the values with sign bits to
4607 /// be extracted.
4608 /// \returns The sign bits from each of the double-precision elements in \a __a,
4609 /// written to bits [1:0]. The remaining bits are assigned values of zero.
4610 static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a) {
4611 return __builtin_ia32_movmskpd((__v2df)__a);
4614 /// Constructs a 128-bit floating-point vector of [2 x double] from two
4615 /// 128-bit vector parameters of [2 x double], using the immediate-value
4616 /// parameter as a specifier.
4618 /// \headerfile <x86intrin.h>
4620 /// \code
4621 /// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
4622 /// \endcode
4624 /// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
4626 /// \param a
4627 /// A 128-bit vector of [2 x double].
4628 /// \param b
4629 /// A 128-bit vector of [2 x double].
4630 /// \param i
4631 /// An 8-bit immediate value. The least significant two bits specify which
4632 /// elements to copy from \a a and \a b: \n
4633 /// Bit[0] = 0: lower element of \a a copied to lower element of result. \n
4634 /// Bit[0] = 1: upper element of \a a copied to lower element of result. \n
4635 /// Bit[1] = 0: lower element of \a b copied to upper element of result. \n
4636 /// Bit[1] = 1: upper element of \a b copied to upper element of result. \n
4637 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE2 macro.
4638 /// <c>_MM_SHUFFLE2(b1, b0)</c> can create a 2-bit mask of the form
4639 /// <c>[b1, b0]</c>.
4640 /// \returns A 128-bit vector of [2 x double] containing the shuffled values.
#define _mm_shuffle_pd(a, b, i) \
  /* Both sources are viewed as [2 x double]; i reaches the builtin as int. */ \
  ((__m128d)__builtin_ia32_shufpd( \
      (__v2df)(__m128d)(a), \
      (__v2df)(__m128d)(b), \
      (int)(i)))
4645 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4646 /// floating-point vector of [4 x float].
4648 /// \headerfile <x86intrin.h>
4650 /// This intrinsic has no corresponding instruction.
4652 /// \param __a
4653 /// A 128-bit floating-point vector of [2 x double].
4654 /// \returns A 128-bit floating-point vector of [4 x float] containing the same
4655 /// bitwise pattern as the parameter.
4656 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a) {
4657 return (__m128)__a;
4660 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4661 /// integer vector.
4663 /// \headerfile <x86intrin.h>
4665 /// This intrinsic has no corresponding instruction.
4667 /// \param __a
4668 /// A 128-bit floating-point vector of [2 x double].
4669 /// \returns A 128-bit integer vector containing the same bitwise pattern as the
4670 /// parameter.
4671 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a) {
4672 return (__m128i)__a;
4675 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4676 /// floating-point vector of [2 x double].
4678 /// \headerfile <x86intrin.h>
4680 /// This intrinsic has no corresponding instruction.
4682 /// \param __a
4683 /// A 128-bit floating-point vector of [4 x float].
4684 /// \returns A 128-bit floating-point vector of [2 x double] containing the same
4685 /// bitwise pattern as the parameter.
4686 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a) {
4687 return (__m128d)__a;
4690 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4691 /// integer vector.
4693 /// \headerfile <x86intrin.h>
4695 /// This intrinsic has no corresponding instruction.
4697 /// \param __a
4698 /// A 128-bit floating-point vector of [4 x float].
4699 /// \returns A 128-bit integer vector containing the same bitwise pattern as the
4700 /// parameter.
4701 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a) {
4702 return (__m128i)__a;
4705 /// Casts a 128-bit integer vector into a 128-bit floating-point vector
4706 /// of [4 x float].
4708 /// \headerfile <x86intrin.h>
4710 /// This intrinsic has no corresponding instruction.
4712 /// \param __a
4713 /// A 128-bit integer vector.
4714 /// \returns A 128-bit floating-point vector of [4 x float] containing the same
4715 /// bitwise pattern as the parameter.
4716 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a) {
4717 return (__m128)__a;
4720 /// Casts a 128-bit integer vector into a 128-bit floating-point vector
4721 /// of [2 x double].
4723 /// \headerfile <x86intrin.h>
4725 /// This intrinsic has no corresponding instruction.
4727 /// \param __a
4728 /// A 128-bit integer vector.
4729 /// \returns A 128-bit floating-point vector of [2 x double] containing the same
4730 /// bitwise pattern as the parameter.
4731 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a) {
4732 return (__m128d)__a;
/* Give the declaration below C linkage when this header is compiled as C++. */
#if defined(__cplusplus)
extern "C" {
#endif

/// Indicates that a spin loop is being executed for the purposes of
///    optimizing power consumption during the loop.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PAUSE </c> instruction.
///
void _mm_pause(void);

#if defined(__cplusplus)
} // extern "C"
#endif
/* The default-attribute helper macros are defined for the functions in this
 * file only; remove them so they do not leak into code that includes it. */
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_MMX
/* Builds a 2-bit shuffle mask of the form [x, y]: x is placed in bit 1 and y
 * is OR-ed into the low bits. Both arguments are parenthesized, so arbitrary
 * expressions may be passed. Usable as the mask argument of _mm_shuffle_pd. */
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
/* Values for the denormals-are-zero mode field of the control/status value
 * read by _mm_getcsr(): the field is the single bit 0x0040. */
#define _MM_DENORMALS_ZERO_ON (0x0040U)
#define _MM_DENORMALS_ZERO_OFF (0x0000U)

/* Mask that isolates the denormals-are-zero field. */
#define _MM_DENORMALS_ZERO_MASK (0x0040U)
/* Returns the current denormals-are-zero field: the value of _mm_getcsr()
 * masked with _MM_DENORMALS_ZERO_MASK. */
#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)

/* Read-modify-write of the control/status value: clears the
 * denormals-are-zero field, ORs in x, and writes the result back with
 * _mm_setcsr(). x is OR-ed in unmasked, so it is expected to be
 * _MM_DENORMALS_ZERO_ON or _MM_DENORMALS_ZERO_OFF. */
#define _MM_SET_DENORMALS_ZERO_MODE(x)                                         \
  (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
4765 #endif /* __EMMINTRIN_H */