1 /*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to deal
5 * in the Software without restriction, including without limitation the rights
6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 * copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 *===-----------------------------------------------------------------------===
28 #error "SSE2 instruction set not enabled"
31 #include <xmmintrin.h>
/* Public SSE2 vector types: 16-byte vectors of double and of long long. */
typedef double __m128d __attribute__((__vector_size__(16)));
typedef long long __m128i __attribute__((__vector_size__(16)));

/* Element-typed 16-byte vector aliases; __v8hi and __v16qi are used for the
   lane-wise casts in the integer intrinsics of this header. */
typedef double __v2df __attribute__ ((__vector_size__ (16)));
typedef long long __v2di __attribute__ ((__vector_size__ (16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));
42 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
43 _mm_add_sd(__m128d __a
, __m128d __b
)
49 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
50 _mm_add_pd(__m128d __a
, __m128d __b
)
55 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
56 _mm_sub_sd(__m128d __a
, __m128d __b
)
62 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
63 _mm_sub_pd(__m128d __a
, __m128d __b
)
68 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
69 _mm_mul_sd(__m128d __a
, __m128d __b
)
75 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
76 _mm_mul_pd(__m128d __a
, __m128d __b
)
81 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
82 _mm_div_sd(__m128d __a
, __m128d __b
)
88 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
89 _mm_div_pd(__m128d __a
, __m128d __b
)
94 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
95 _mm_sqrt_sd(__m128d __a
, __m128d __b
)
97 __m128d __c
= __builtin_ia32_sqrtsd(__b
);
98 return (__m128d
) { __c
[0], __a
[1] };
101 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
102 _mm_sqrt_pd(__m128d __a
)
104 return __builtin_ia32_sqrtpd(__a
);
107 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
108 _mm_min_sd(__m128d __a
, __m128d __b
)
110 return __builtin_ia32_minsd(__a
, __b
);
113 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
114 _mm_min_pd(__m128d __a
, __m128d __b
)
116 return __builtin_ia32_minpd(__a
, __b
);
119 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
120 _mm_max_sd(__m128d __a
, __m128d __b
)
122 return __builtin_ia32_maxsd(__a
, __b
);
125 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
126 _mm_max_pd(__m128d __a
, __m128d __b
)
128 return __builtin_ia32_maxpd(__a
, __b
);
131 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
132 _mm_and_pd(__m128d __a
, __m128d __b
)
134 return (__m128d
)((__v4si
)__a
& (__v4si
)__b
);
137 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
138 _mm_andnot_pd(__m128d __a
, __m128d __b
)
140 return (__m128d
)(~(__v4si
)__a
& (__v4si
)__b
);
143 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
144 _mm_or_pd(__m128d __a
, __m128d __b
)
146 return (__m128d
)((__v4si
)__a
| (__v4si
)__b
);
149 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
150 _mm_xor_pd(__m128d __a
, __m128d __b
)
152 return (__m128d
)((__v4si
)__a
^ (__v4si
)__b
);
155 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
156 _mm_cmpeq_pd(__m128d __a
, __m128d __b
)
158 return (__m128d
)__builtin_ia32_cmpeqpd(__a
, __b
);
161 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
162 _mm_cmplt_pd(__m128d __a
, __m128d __b
)
164 return (__m128d
)__builtin_ia32_cmpltpd(__a
, __b
);
167 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
168 _mm_cmple_pd(__m128d __a
, __m128d __b
)
170 return (__m128d
)__builtin_ia32_cmplepd(__a
, __b
);
173 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
174 _mm_cmpgt_pd(__m128d __a
, __m128d __b
)
176 return (__m128d
)__builtin_ia32_cmpltpd(__b
, __a
);
179 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
180 _mm_cmpge_pd(__m128d __a
, __m128d __b
)
182 return (__m128d
)__builtin_ia32_cmplepd(__b
, __a
);
185 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
186 _mm_cmpord_pd(__m128d __a
, __m128d __b
)
188 return (__m128d
)__builtin_ia32_cmpordpd(__a
, __b
);
191 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
192 _mm_cmpunord_pd(__m128d __a
, __m128d __b
)
194 return (__m128d
)__builtin_ia32_cmpunordpd(__a
, __b
);
197 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
198 _mm_cmpneq_pd(__m128d __a
, __m128d __b
)
200 return (__m128d
)__builtin_ia32_cmpneqpd(__a
, __b
);
203 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
204 _mm_cmpnlt_pd(__m128d __a
, __m128d __b
)
206 return (__m128d
)__builtin_ia32_cmpnltpd(__a
, __b
);
209 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
210 _mm_cmpnle_pd(__m128d __a
, __m128d __b
)
212 return (__m128d
)__builtin_ia32_cmpnlepd(__a
, __b
);
215 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
216 _mm_cmpngt_pd(__m128d __a
, __m128d __b
)
218 return (__m128d
)__builtin_ia32_cmpnltpd(__b
, __a
);
221 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
222 _mm_cmpnge_pd(__m128d __a
, __m128d __b
)
224 return (__m128d
)__builtin_ia32_cmpnlepd(__b
, __a
);
227 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
228 _mm_cmpeq_sd(__m128d __a
, __m128d __b
)
230 return (__m128d
)__builtin_ia32_cmpeqsd(__a
, __b
);
233 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
234 _mm_cmplt_sd(__m128d __a
, __m128d __b
)
236 return (__m128d
)__builtin_ia32_cmpltsd(__a
, __b
);
239 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
240 _mm_cmple_sd(__m128d __a
, __m128d __b
)
242 return (__m128d
)__builtin_ia32_cmplesd(__a
, __b
);
245 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
246 _mm_cmpgt_sd(__m128d __a
, __m128d __b
)
248 __m128d __c
= __builtin_ia32_cmpltsd(__b
, __a
);
249 return (__m128d
) { __c
[0], __a
[1] };
252 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
253 _mm_cmpge_sd(__m128d __a
, __m128d __b
)
255 __m128d __c
= __builtin_ia32_cmplesd(__b
, __a
);
256 return (__m128d
) { __c
[0], __a
[1] };
259 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
260 _mm_cmpord_sd(__m128d __a
, __m128d __b
)
262 return (__m128d
)__builtin_ia32_cmpordsd(__a
, __b
);
265 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
266 _mm_cmpunord_sd(__m128d __a
, __m128d __b
)
268 return (__m128d
)__builtin_ia32_cmpunordsd(__a
, __b
);
271 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
272 _mm_cmpneq_sd(__m128d __a
, __m128d __b
)
274 return (__m128d
)__builtin_ia32_cmpneqsd(__a
, __b
);
277 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
278 _mm_cmpnlt_sd(__m128d __a
, __m128d __b
)
280 return (__m128d
)__builtin_ia32_cmpnltsd(__a
, __b
);
283 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
284 _mm_cmpnle_sd(__m128d __a
, __m128d __b
)
286 return (__m128d
)__builtin_ia32_cmpnlesd(__a
, __b
);
289 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
290 _mm_cmpngt_sd(__m128d __a
, __m128d __b
)
292 __m128d __c
= __builtin_ia32_cmpnltsd(__b
, __a
);
293 return (__m128d
) { __c
[0], __a
[1] };
296 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
297 _mm_cmpnge_sd(__m128d __a
, __m128d __b
)
299 __m128d __c
= __builtin_ia32_cmpnlesd(__b
, __a
);
300 return (__m128d
) { __c
[0], __a
[1] };
303 static __inline__
int __attribute__((__always_inline__
, __nodebug__
))
304 _mm_comieq_sd(__m128d __a
, __m128d __b
)
306 return __builtin_ia32_comisdeq(__a
, __b
);
309 static __inline__
int __attribute__((__always_inline__
, __nodebug__
))
310 _mm_comilt_sd(__m128d __a
, __m128d __b
)
312 return __builtin_ia32_comisdlt(__a
, __b
);
315 static __inline__
int __attribute__((__always_inline__
, __nodebug__
))
316 _mm_comile_sd(__m128d __a
, __m128d __b
)
318 return __builtin_ia32_comisdle(__a
, __b
);
321 static __inline__
int __attribute__((__always_inline__
, __nodebug__
))
322 _mm_comigt_sd(__m128d __a
, __m128d __b
)
324 return __builtin_ia32_comisdgt(__a
, __b
);
327 static __inline__
int __attribute__((__always_inline__
, __nodebug__
))
328 _mm_comige_sd(__m128d __a
, __m128d __b
)
330 return __builtin_ia32_comisdge(__a
, __b
);
333 static __inline__
int __attribute__((__always_inline__
, __nodebug__
))
334 _mm_comineq_sd(__m128d __a
, __m128d __b
)
336 return __builtin_ia32_comisdneq(__a
, __b
);
339 static __inline__
int __attribute__((__always_inline__
, __nodebug__
))
340 _mm_ucomieq_sd(__m128d __a
, __m128d __b
)
342 return __builtin_ia32_ucomisdeq(__a
, __b
);
345 static __inline__
int __attribute__((__always_inline__
, __nodebug__
))
346 _mm_ucomilt_sd(__m128d __a
, __m128d __b
)
348 return __builtin_ia32_ucomisdlt(__a
, __b
);
351 static __inline__
int __attribute__((__always_inline__
, __nodebug__
))
352 _mm_ucomile_sd(__m128d __a
, __m128d __b
)
354 return __builtin_ia32_ucomisdle(__a
, __b
);
357 static __inline__
int __attribute__((__always_inline__
, __nodebug__
))
358 _mm_ucomigt_sd(__m128d __a
, __m128d __b
)
360 return __builtin_ia32_ucomisdgt(__a
, __b
);
363 static __inline__
int __attribute__((__always_inline__
, __nodebug__
))
364 _mm_ucomige_sd(__m128d __a
, __m128d __b
)
366 return __builtin_ia32_ucomisdge(__a
, __b
);
369 static __inline__
int __attribute__((__always_inline__
, __nodebug__
))
370 _mm_ucomineq_sd(__m128d __a
, __m128d __b
)
372 return __builtin_ia32_ucomisdneq(__a
, __b
);
375 static __inline__ __m128
__attribute__((__always_inline__
, __nodebug__
))
376 _mm_cvtpd_ps(__m128d __a
)
378 return __builtin_ia32_cvtpd2ps(__a
);
381 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
382 _mm_cvtps_pd(__m128 __a
)
384 return __builtin_ia32_cvtps2pd(__a
);
387 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
388 _mm_cvtepi32_pd(__m128i __a
)
390 return __builtin_ia32_cvtdq2pd((__v4si
)__a
);
393 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
394 _mm_cvtpd_epi32(__m128d __a
)
396 return __builtin_ia32_cvtpd2dq(__a
);
399 static __inline__
int __attribute__((__always_inline__
, __nodebug__
))
400 _mm_cvtsd_si32(__m128d __a
)
402 return __builtin_ia32_cvtsd2si(__a
);
405 static __inline__ __m128
__attribute__((__always_inline__
, __nodebug__
))
406 _mm_cvtsd_ss(__m128 __a
, __m128d __b
)
412 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
413 _mm_cvtsi32_sd(__m128d __a
, int __b
)
419 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
420 _mm_cvtss_sd(__m128d __a
, __m128 __b
)
426 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
427 _mm_cvttpd_epi32(__m128d __a
)
429 return (__m128i
)__builtin_ia32_cvttpd2dq(__a
);
432 static __inline__
int __attribute__((__always_inline__
, __nodebug__
))
433 _mm_cvttsd_si32(__m128d __a
)
438 static __inline__ __m64
__attribute__((__always_inline__
, __nodebug__
))
439 _mm_cvtpd_pi32(__m128d __a
)
441 return (__m64
)__builtin_ia32_cvtpd2pi(__a
);
444 static __inline__ __m64
__attribute__((__always_inline__
, __nodebug__
))
445 _mm_cvttpd_pi32(__m128d __a
)
447 return (__m64
)__builtin_ia32_cvttpd2pi(__a
);
450 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
451 _mm_cvtpi32_pd(__m64 __a
)
453 return __builtin_ia32_cvtpi2pd((__v2si
)__a
);
456 static __inline__
double __attribute__((__always_inline__
, __nodebug__
))
457 _mm_cvtsd_f64(__m128d __a
)
462 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
463 _mm_load_pd(double const *__dp
)
465 return *(__m128d
*)__dp
;
468 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
469 _mm_load1_pd(double const *__dp
)
471 struct __mm_load1_pd_struct
{
473 } __attribute__((__packed__
, __may_alias__
));
474 double __u
= ((struct __mm_load1_pd_struct
*)__dp
)->__u
;
475 return (__m128d
){ __u
, __u
};
/* Alternate spelling: expands to a call of _mm_load1_pd with the same argument. */
#define _mm_load_pd1(dp) _mm_load1_pd(dp)
480 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
481 _mm_loadr_pd(double const *__dp
)
483 __m128d __u
= *(__m128d
*)__dp
;
484 return __builtin_shufflevector(__u
, __u
, 1, 0);
487 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
488 _mm_loadu_pd(double const *__dp
)
492 } __attribute__((packed
, may_alias
));
493 return ((struct __loadu_pd
*)__dp
)->__v
;
496 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
497 _mm_load_sd(double const *__dp
)
499 struct __mm_load_sd_struct
{
501 } __attribute__((__packed__
, __may_alias__
));
502 double __u
= ((struct __mm_load_sd_struct
*)__dp
)->__u
;
503 return (__m128d
){ __u
, 0 };
506 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
507 _mm_loadh_pd(__m128d __a
, double const *__dp
)
509 struct __mm_loadh_pd_struct
{
511 } __attribute__((__packed__
, __may_alias__
));
512 double __u
= ((struct __mm_loadh_pd_struct
*)__dp
)->__u
;
513 return (__m128d
){ __a
[0], __u
};
516 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
517 _mm_loadl_pd(__m128d __a
, double const *__dp
)
519 struct __mm_loadl_pd_struct
{
521 } __attribute__((__packed__
, __may_alias__
));
522 double __u
= ((struct __mm_loadl_pd_struct
*)__dp
)->__u
;
523 return (__m128d
){ __u
, __a
[1] };
526 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
527 _mm_set_sd(double __w
)
529 return (__m128d
){ __w
, 0 };
532 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
533 _mm_set1_pd(double __w
)
535 return (__m128d
){ __w
, __w
};
538 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
539 _mm_set_pd(double __w
, double __x
)
541 return (__m128d
){ __x
, __w
};
544 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
545 _mm_setr_pd(double __w
, double __x
)
547 return (__m128d
){ __w
, __x
};
550 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
553 return (__m128d
){ 0, 0 };
556 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
557 _mm_move_sd(__m128d __a
, __m128d __b
)
559 return (__m128d
){ __b
[0], __a
[1] };
562 static __inline__
void __attribute__((__always_inline__
, __nodebug__
))
563 _mm_store_sd(double *__dp
, __m128d __a
)
565 struct __mm_store_sd_struct
{
567 } __attribute__((__packed__
, __may_alias__
));
568 ((struct __mm_store_sd_struct
*)__dp
)->__u
= __a
[0];
571 static __inline__
void __attribute__((__always_inline__
, __nodebug__
))
572 _mm_store1_pd(double *__dp
, __m128d __a
)
574 struct __mm_store1_pd_struct
{
576 } __attribute__((__packed__
, __may_alias__
));
577 ((struct __mm_store1_pd_struct
*)__dp
)->__u
[0] = __a
[0];
578 ((struct __mm_store1_pd_struct
*)__dp
)->__u
[1] = __a
[0];
581 static __inline__
void __attribute__((__always_inline__
, __nodebug__
))
582 _mm_store_pd(double *__dp
, __m128d __a
)
584 *(__m128d
*)__dp
= __a
;
587 static __inline__
void __attribute__((__always_inline__
, __nodebug__
))
588 _mm_storeu_pd(double *__dp
, __m128d __a
)
590 __builtin_ia32_storeupd(__dp
, __a
);
593 static __inline__
void __attribute__((__always_inline__
, __nodebug__
))
594 _mm_storer_pd(double *__dp
, __m128d __a
)
596 __a
= __builtin_shufflevector(__a
, __a
, 1, 0);
597 *(__m128d
*)__dp
= __a
;
600 static __inline__
void __attribute__((__always_inline__
, __nodebug__
))
601 _mm_storeh_pd(double *__dp
, __m128d __a
)
603 struct __mm_storeh_pd_struct
{
605 } __attribute__((__packed__
, __may_alias__
));
606 ((struct __mm_storeh_pd_struct
*)__dp
)->__u
= __a
[1];
609 static __inline__
void __attribute__((__always_inline__
, __nodebug__
))
610 _mm_storel_pd(double *__dp
, __m128d __a
)
612 struct __mm_storeh_pd_struct
{
614 } __attribute__((__packed__
, __may_alias__
));
615 ((struct __mm_storeh_pd_struct
*)__dp
)->__u
= __a
[0];
618 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
619 _mm_add_epi8(__m128i __a
, __m128i __b
)
621 return (__m128i
)((__v16qi
)__a
+ (__v16qi
)__b
);
624 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
625 _mm_add_epi16(__m128i __a
, __m128i __b
)
627 return (__m128i
)((__v8hi
)__a
+ (__v8hi
)__b
);
630 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
631 _mm_add_epi32(__m128i __a
, __m128i __b
)
633 return (__m128i
)((__v4si
)__a
+ (__v4si
)__b
);
636 static __inline__ __m64
__attribute__((__always_inline__
, __nodebug__
))
637 _mm_add_si64(__m64 __a
, __m64 __b
)
642 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
643 _mm_add_epi64(__m128i __a
, __m128i __b
)
648 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
649 _mm_adds_epi8(__m128i __a
, __m128i __b
)
651 return (__m128i
)__builtin_ia32_paddsb128((__v16qi
)__a
, (__v16qi
)__b
);
654 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
655 _mm_adds_epi16(__m128i __a
, __m128i __b
)
657 return (__m128i
)__builtin_ia32_paddsw128((__v8hi
)__a
, (__v8hi
)__b
);
660 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
661 _mm_adds_epu8(__m128i __a
, __m128i __b
)
663 return (__m128i
)__builtin_ia32_paddusb128((__v16qi
)__a
, (__v16qi
)__b
);
666 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
667 _mm_adds_epu16(__m128i __a
, __m128i __b
)
669 return (__m128i
)__builtin_ia32_paddusw128((__v8hi
)__a
, (__v8hi
)__b
);
672 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
673 _mm_avg_epu8(__m128i __a
, __m128i __b
)
675 return (__m128i
)__builtin_ia32_pavgb128((__v16qi
)__a
, (__v16qi
)__b
);
678 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
679 _mm_avg_epu16(__m128i __a
, __m128i __b
)
681 return (__m128i
)__builtin_ia32_pavgw128((__v8hi
)__a
, (__v8hi
)__b
);
684 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
685 _mm_madd_epi16(__m128i __a
, __m128i __b
)
687 return (__m128i
)__builtin_ia32_pmaddwd128((__v8hi
)__a
, (__v8hi
)__b
);
690 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
691 _mm_max_epi16(__m128i __a
, __m128i __b
)
693 return (__m128i
)__builtin_ia32_pmaxsw128((__v8hi
)__a
, (__v8hi
)__b
);
696 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
697 _mm_max_epu8(__m128i __a
, __m128i __b
)
699 return (__m128i
)__builtin_ia32_pmaxub128((__v16qi
)__a
, (__v16qi
)__b
);
702 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
703 _mm_min_epi16(__m128i __a
, __m128i __b
)
705 return (__m128i
)__builtin_ia32_pminsw128((__v8hi
)__a
, (__v8hi
)__b
);
708 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
709 _mm_min_epu8(__m128i __a
, __m128i __b
)
711 return (__m128i
)__builtin_ia32_pminub128((__v16qi
)__a
, (__v16qi
)__b
);
714 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
715 _mm_mulhi_epi16(__m128i __a
, __m128i __b
)
717 return (__m128i
)__builtin_ia32_pmulhw128((__v8hi
)__a
, (__v8hi
)__b
);
720 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
721 _mm_mulhi_epu16(__m128i __a
, __m128i __b
)
723 return (__m128i
)__builtin_ia32_pmulhuw128((__v8hi
)__a
, (__v8hi
)__b
);
726 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
727 _mm_mullo_epi16(__m128i __a
, __m128i __b
)
729 return (__m128i
)((__v8hi
)__a
* (__v8hi
)__b
);
732 static __inline__ __m64
__attribute__((__always_inline__
, __nodebug__
))
733 _mm_mul_su32(__m64 __a
, __m64 __b
)
735 return __builtin_ia32_pmuludq((__v2si
)__a
, (__v2si
)__b
);
738 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
739 _mm_mul_epu32(__m128i __a
, __m128i __b
)
741 return __builtin_ia32_pmuludq128((__v4si
)__a
, (__v4si
)__b
);
744 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
745 _mm_sad_epu8(__m128i __a
, __m128i __b
)
747 return __builtin_ia32_psadbw128((__v16qi
)__a
, (__v16qi
)__b
);
750 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
751 _mm_sub_epi8(__m128i __a
, __m128i __b
)
753 return (__m128i
)((__v16qi
)__a
- (__v16qi
)__b
);
756 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
757 _mm_sub_epi16(__m128i __a
, __m128i __b
)
759 return (__m128i
)((__v8hi
)__a
- (__v8hi
)__b
);
762 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
763 _mm_sub_epi32(__m128i __a
, __m128i __b
)
765 return (__m128i
)((__v4si
)__a
- (__v4si
)__b
);
768 static __inline__ __m64
__attribute__((__always_inline__
, __nodebug__
))
769 _mm_sub_si64(__m64 __a
, __m64 __b
)
774 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
775 _mm_sub_epi64(__m128i __a
, __m128i __b
)
780 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
781 _mm_subs_epi8(__m128i __a
, __m128i __b
)
783 return (__m128i
)__builtin_ia32_psubsb128((__v16qi
)__a
, (__v16qi
)__b
);
786 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
787 _mm_subs_epi16(__m128i __a
, __m128i __b
)
789 return (__m128i
)__builtin_ia32_psubsw128((__v8hi
)__a
, (__v8hi
)__b
);
792 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
793 _mm_subs_epu8(__m128i __a
, __m128i __b
)
795 return (__m128i
)__builtin_ia32_psubusb128((__v16qi
)__a
, (__v16qi
)__b
);
798 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
799 _mm_subs_epu16(__m128i __a
, __m128i __b
)
801 return (__m128i
)__builtin_ia32_psubusw128((__v8hi
)__a
, (__v8hi
)__b
);
804 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
805 _mm_and_si128(__m128i __a
, __m128i __b
)
810 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
811 _mm_andnot_si128(__m128i __a
, __m128i __b
)
816 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
817 _mm_or_si128(__m128i __a
, __m128i __b
)
822 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
823 _mm_xor_si128(__m128i __a
, __m128i __b
)
828 #define _mm_slli_si128(a, count) __extension__ ({ \
829 _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \
831 _Pragma("clang diagnostic pop"); \
832 (__m128i)__builtin_ia32_pslldqi128(__a, (count)*8); })
834 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
835 _mm_slli_epi16(__m128i __a
, int __count
)
837 return (__m128i
)__builtin_ia32_psllwi128((__v8hi
)__a
, __count
);
840 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
841 _mm_sll_epi16(__m128i __a
, __m128i __count
)
843 return (__m128i
)__builtin_ia32_psllw128((__v8hi
)__a
, (__v8hi
)__count
);
846 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
847 _mm_slli_epi32(__m128i __a
, int __count
)
849 return (__m128i
)__builtin_ia32_pslldi128((__v4si
)__a
, __count
);
852 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
853 _mm_sll_epi32(__m128i __a
, __m128i __count
)
855 return (__m128i
)__builtin_ia32_pslld128((__v4si
)__a
, (__v4si
)__count
);
858 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
859 _mm_slli_epi64(__m128i __a
, int __count
)
861 return __builtin_ia32_psllqi128(__a
, __count
);
864 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
865 _mm_sll_epi64(__m128i __a
, __m128i __count
)
867 return __builtin_ia32_psllq128(__a
, __count
);
870 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
871 _mm_srai_epi16(__m128i __a
, int __count
)
873 return (__m128i
)__builtin_ia32_psrawi128((__v8hi
)__a
, __count
);
876 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
877 _mm_sra_epi16(__m128i __a
, __m128i __count
)
879 return (__m128i
)__builtin_ia32_psraw128((__v8hi
)__a
, (__v8hi
)__count
);
882 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
883 _mm_srai_epi32(__m128i __a
, int __count
)
885 return (__m128i
)__builtin_ia32_psradi128((__v4si
)__a
, __count
);
888 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
889 _mm_sra_epi32(__m128i __a
, __m128i __count
)
891 return (__m128i
)__builtin_ia32_psrad128((__v4si
)__a
, (__v4si
)__count
);
895 #define _mm_srli_si128(a, count) __extension__ ({ \
896 _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \
898 _Pragma("clang diagnostic pop"); \
899 (__m128i)__builtin_ia32_psrldqi128(__a, (count)*8); })
901 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
902 _mm_srli_epi16(__m128i __a
, int __count
)
904 return (__m128i
)__builtin_ia32_psrlwi128((__v8hi
)__a
, __count
);
907 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
908 _mm_srl_epi16(__m128i __a
, __m128i __count
)
910 return (__m128i
)__builtin_ia32_psrlw128((__v8hi
)__a
, (__v8hi
)__count
);
913 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
914 _mm_srli_epi32(__m128i __a
, int __count
)
916 return (__m128i
)__builtin_ia32_psrldi128((__v4si
)__a
, __count
);
919 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
920 _mm_srl_epi32(__m128i __a
, __m128i __count
)
922 return (__m128i
)__builtin_ia32_psrld128((__v4si
)__a
, (__v4si
)__count
);
925 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
926 _mm_srli_epi64(__m128i __a
, int __count
)
928 return __builtin_ia32_psrlqi128(__a
, __count
);
931 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
932 _mm_srl_epi64(__m128i __a
, __m128i __count
)
934 return __builtin_ia32_psrlq128(__a
, __count
);
937 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
938 _mm_cmpeq_epi8(__m128i __a
, __m128i __b
)
940 return (__m128i
)((__v16qi
)__a
== (__v16qi
)__b
);
943 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
944 _mm_cmpeq_epi16(__m128i __a
, __m128i __b
)
946 return (__m128i
)((__v8hi
)__a
== (__v8hi
)__b
);
949 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
950 _mm_cmpeq_epi32(__m128i __a
, __m128i __b
)
952 return (__m128i
)((__v4si
)__a
== (__v4si
)__b
);
955 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
956 _mm_cmpgt_epi8(__m128i __a
, __m128i __b
)
958 /* This function always performs a signed comparison, but __v16qi is a char
959 which may be signed or unsigned. */
960 typedef signed char __v16qs
__attribute__((__vector_size__(16)));
961 return (__m128i
)((__v16qs
)__a
> (__v16qs
)__b
);
964 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
965 _mm_cmpgt_epi16(__m128i __a
, __m128i __b
)
967 return (__m128i
)((__v8hi
)__a
> (__v8hi
)__b
);
970 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
971 _mm_cmpgt_epi32(__m128i __a
, __m128i __b
)
973 return (__m128i
)((__v4si
)__a
> (__v4si
)__b
);
976 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
977 _mm_cmplt_epi8(__m128i __a
, __m128i __b
)
979 return _mm_cmpgt_epi8(__b
, __a
);
982 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
983 _mm_cmplt_epi16(__m128i __a
, __m128i __b
)
985 return _mm_cmpgt_epi16(__b
, __a
);
988 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
989 _mm_cmplt_epi32(__m128i __a
, __m128i __b
)
991 return _mm_cmpgt_epi32(__b
, __a
);
995 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
996 _mm_cvtsi64_sd(__m128d __a
, long long __b
)
1002 static __inline__
long long __attribute__((__always_inline__
, __nodebug__
))
1003 _mm_cvtsd_si64(__m128d __a
)
1005 return __builtin_ia32_cvtsd2si64(__a
);
1008 static __inline__
long long __attribute__((__always_inline__
, __nodebug__
))
1009 _mm_cvttsd_si64(__m128d __a
)
1015 static __inline__ __m128
__attribute__((__always_inline__
, __nodebug__
))
1016 _mm_cvtepi32_ps(__m128i __a
)
1018 return __builtin_ia32_cvtdq2ps((__v4si
)__a
);
1021 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
1022 _mm_cvtps_epi32(__m128 __a
)
1024 return (__m128i
)__builtin_ia32_cvtps2dq(__a
);
1027 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
1028 _mm_cvttps_epi32(__m128 __a
)
1030 return (__m128i
)__builtin_ia32_cvttps2dq(__a
);
1033 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
1034 _mm_cvtsi32_si128(int __a
)
1036 return (__m128i
)(__v4si
){ __a
, 0, 0, 0 };
1040 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
1041 _mm_cvtsi64_si128(long long __a
)
1043 return (__m128i
){ __a
, 0 };
1047 static __inline__
int __attribute__((__always_inline__
, __nodebug__
))
1048 _mm_cvtsi128_si32(__m128i __a
)
1050 __v4si __b
= (__v4si
)__a
;
1055 static __inline__
long long __attribute__((__always_inline__
, __nodebug__
))
1056 _mm_cvtsi128_si64(__m128i __a
)
1062 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
1063 _mm_load_si128(__m128i
const *__p
)
1068 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
1069 _mm_loadu_si128(__m128i
const *__p
)
1071 struct __loadu_si128
{
1073 } __attribute__((packed
, may_alias
));
1074 return ((struct __loadu_si128
*)__p
)->__v
;
/* _mm_loadl_epi64: reads the __u member through a packed, may_alias struct
 * view of __p and returns a vector with that value in element 0 and zero in
 * element 1.
 * NOTE(review): presumably __u is a 64-bit integer (long long) -- confirm
 * against the struct member declaration. */
1077 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
1078 _mm_loadl_epi64(__m128i
const *__p
)
1080 struct __mm_loadl_epi64_struct
{
1082 } __attribute__((__packed__
, __may_alias__
));
1083 return (__m128i
) { ((struct __mm_loadl_epi64_struct
*)__p
)->__u
, 0};
1086 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
1087 _mm_set_epi64x(long long q1
, long long q0
)
1089 return (__m128i
){ q0
, q1
};
1092 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
1093 _mm_set_epi64(__m64 q1
, __m64 q0
)
1095 return (__m128i
){ (long long)q0
, (long long)q1
};
1098 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
1099 _mm_set_epi32(int i3
, int i2
, int i1
, int i0
)
1101 return (__m128i
)(__v4si
){ i0
, i1
, i2
, i3
};
1104 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
1105 _mm_set_epi16(short w7
, short w6
, short w5
, short w4
, short w3
, short w2
, short w1
, short w0
)
1107 return (__m128i
)(__v8hi
){ w0
, w1
, w2
, w3
, w4
, w5
, w6
, w7
};
1110 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
1111 _mm_set_epi8(char b15
, char b14
, char b13
, char b12
, char b11
, char b10
, char b9
, char b8
, char b7
, char b6
, char b5
, char b4
, char b3
, char b2
, char b1
, char b0
)
1113 return (__m128i
)(__v16qi
){ b0
, b1
, b2
, b3
, b4
, b5
, b6
, b7
, b8
, b9
, b10
, b11
, b12
, b13
, b14
, b15
};
1116 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
1117 _mm_set1_epi64x(long long __q
)
1119 return (__m128i
){ __q
, __q
};
1122 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
1123 _mm_set1_epi64(__m64 __q
)
1125 return (__m128i
){ (long long)__q
, (long long)__q
};
1128 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
1129 _mm_set1_epi32(int __i
)
1131 return (__m128i
)(__v4si
){ __i
, __i
, __i
, __i
};
1134 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
1135 _mm_set1_epi16(short __w
)
1137 return (__m128i
)(__v8hi
){ __w
, __w
, __w
, __w
, __w
, __w
, __w
, __w
};
1140 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
1141 _mm_set1_epi8(char __b
)
1143 return (__m128i
)(__v16qi
){ __b
, __b
, __b
, __b
, __b
, __b
, __b
, __b
, __b
, __b
, __b
, __b
, __b
, __b
, __b
, __b
};
1146 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
1147 _mm_setr_epi64(__m64 q0
, __m64 q1
)
1149 return (__m128i
){ (long long)q0
, (long long)q1
};
1152 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
1153 _mm_setr_epi32(int i0
, int i1
, int i2
, int i3
)
1155 return (__m128i
)(__v4si
){ i0
, i1
, i2
, i3
};
1158 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
1159 _mm_setr_epi16(short w0
, short w1
, short w2
, short w3
, short w4
, short w5
, short w6
, short w7
)
1161 return (__m128i
)(__v8hi
){ w0
, w1
, w2
, w3
, w4
, w5
, w6
, w7
};
1164 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
1165 _mm_setr_epi8(char b0
, char b1
, char b2
, char b3
, char b4
, char b5
, char b6
, char b7
, char b8
, char b9
, char b10
, char b11
, char b12
, char b13
, char b14
, char b15
)
1167 return (__m128i
)(__v16qi
){ b0
, b1
, b2
, b3
, b4
, b5
, b6
, b7
, b8
, b9
, b10
, b11
, b12
, b13
, b14
, b15
};
1170 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
1171 _mm_setzero_si128(void)
1173 return (__m128i
){ 0LL, 0LL };
/* _mm_store_si128: takes a destination __m128i pointer and a __m128i value;
 * returns nothing.
 * NOTE(review): presumably a plain assignment through __p, which would
 * require 16-byte alignment -- confirm against the function body. */
1176 static __inline__
void __attribute__((__always_inline__
, __nodebug__
))
1177 _mm_store_si128(__m128i
*__p
, __m128i __b
)
1182 static __inline__
void __attribute__((__always_inline__
, __nodebug__
))
1183 _mm_storeu_si128(__m128i
*__p
, __m128i __b
)
1185 __builtin_ia32_storedqu((char *)__p
, (__v16qi
)__b
);
1188 static __inline__
void __attribute__((__always_inline__
, __nodebug__
))
1189 _mm_maskmoveu_si128(__m128i __d
, __m128i __n
, char *__p
)
1191 __builtin_ia32_maskmovdqu((__v16qi
)__d
, (__v16qi
)__n
, __p
);
/* _mm_storel_epi64: writes 64-bit element 0 of __a to the __u member of a
 * packed, may_alias struct view of __p.
 * NOTE(review): presumably __u is a 64-bit integer (long long), so only the
 * first 8 bytes at __p are written -- confirm against the struct member
 * declaration. */
1194 static __inline__
void __attribute__((__always_inline__
, __nodebug__
))
1195 _mm_storel_epi64(__m128i
*__p
, __m128i __a
)
1197 struct __mm_storel_epi64_struct
{
1199 } __attribute__((__packed__
, __may_alias__
));
1200 ((struct __mm_storel_epi64_struct
*)__p
)->__u
= __a
[0];
1203 static __inline__
void __attribute__((__always_inline__
, __nodebug__
))
1204 _mm_stream_pd(double *__p
, __m128d __a
)
1206 __builtin_ia32_movntpd(__p
, __a
);
1209 static __inline__
void __attribute__((__always_inline__
, __nodebug__
))
1210 _mm_stream_si128(__m128i
*__p
, __m128i __a
)
1212 __builtin_ia32_movntdq(__p
, __a
);
/* Hands the destination __p and the 32-bit value __a to the movnti builtin. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si32(int *__p, int __a)
{
  int *__dst = __p;
  __builtin_ia32_movnti(__dst, __a);
}
/* Hands the destination __p and the 64-bit value __a to the movnti64
 * builtin. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si64(long long *__p, long long __a)
{
  long long *__dst = __p;
  __builtin_ia32_movnti64(__dst, __a);
}
/* Passes the address __p to the clflush builtin. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_clflush(void const *__p)
{
  void const *__addr = __p;
  __builtin_ia32_clflush(__addr);
}
/* Void always-inline wrapper whose visible statement invokes the lfence
 * builtin with no arguments.
 * NOTE(review): presumably this is _mm_lfence(void) -- confirm against the
 * declarator line. */
1235 static __inline__
void __attribute__((__always_inline__
, __nodebug__
))
1238 __builtin_ia32_lfence();
/* Void always-inline wrapper whose visible statement invokes the mfence
 * builtin with no arguments.
 * NOTE(review): presumably this is _mm_mfence(void) -- confirm against the
 * declarator line. */
1241 static __inline__
void __attribute__((__always_inline__
, __nodebug__
))
1244 __builtin_ia32_mfence();
1247 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
1248 _mm_packs_epi16(__m128i __a
, __m128i __b
)
1250 return (__m128i
)__builtin_ia32_packsswb128((__v8hi
)__a
, (__v8hi
)__b
);
1253 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
1254 _mm_packs_epi32(__m128i __a
, __m128i __b
)
1256 return (__m128i
)__builtin_ia32_packssdw128((__v4si
)__a
, (__v4si
)__b
);
1259 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
1260 _mm_packus_epi16(__m128i __a
, __m128i __b
)
1262 return (__m128i
)__builtin_ia32_packuswb128((__v8hi
)__a
, (__v8hi
)__b
);
1265 static __inline__
int __attribute__((__always_inline__
, __nodebug__
))
1266 _mm_extract_epi16(__m128i __a
, int __imm
)
1268 __v8hi __b
= (__v8hi
)__a
;
1269 return (unsigned short)__b
[__imm
& 7];
1272 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
1273 _mm_insert_epi16(__m128i __a
, int __b
, int __imm
)
1275 __v8hi __c
= (__v8hi
)__a
;
1276 __c
[__imm
& 7] = __b
;
1277 return (__m128i
)__c
;
1280 static __inline__
int __attribute__((__always_inline__
, __nodebug__
))
1281 _mm_movemask_epi8(__m128i __a
)
1283 return __builtin_ia32_pmovmskb128((__v16qi
)__a
);
/* _mm_shuffle_epi32(a, imm): statement-expression macro that permutes the four
 * 32-bit elements of `a`.
 *
 * `a` is evaluated once into a local __m128i; -Wshadow is silenced around that
 * declaration because the local is named __a. Result element k is the source
 * element selected by bits [2k+1:2k] of imm. Every index is in 0..3, so the
 * zero vector passed as second shuffle operand is never selected from.
 * `imm` is expanded four times. */
1286 #define _mm_shuffle_epi32(a, imm) __extension__ ({ \
1287 _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \
1288 __m128i __a = (a); \
1289 _Pragma("clang diagnostic pop"); \
1290 (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si) _mm_set1_epi32(0), \
1291 (imm) & 0x3, ((imm) & 0xc) >> 2, \
1292 ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); })
/* _mm_shufflelo_epi16(a, imm): statement-expression macro that shuffles `a`
 * viewed as eight 16-bit elements.
 *
 * `a` is evaluated once into a local __m128i with -Wshadow silenced around the
 * declaration. The first four shuffle indices are the 2-bit fields of imm
 * (bits 1:0, 3:2, 5:4, 7:6), i.e. they select among elements 0..3.
 * NOTE(review): presumably the remaining four indices are 4, 5, 6, 7 so the
 * upper half passes through unchanged -- confirm against the end of the
 * macro. */
1294 #define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
1295 _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \
1296 __m128i __a = (a); \
1297 _Pragma("clang diagnostic pop"); \
1298 (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi) _mm_set1_epi16(0), \
1299 (imm) & 0x3, ((imm) & 0xc) >> 2, \
1300 ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
/* _mm_shufflehi_epi16(a, imm): statement-expression macro that shuffles `a`
 * viewed as eight 16-bit elements.
 *
 * `a` is evaluated once into a local __m128i with -Wshadow silenced around the
 * declaration. The last four shuffle indices are 4 plus each 2-bit field of
 * imm (bits 1:0, 3:2, 5:4, 7:6), i.e. they select among elements 4..7.
 * NOTE(review): presumably the first four indices are 0, 1, 2, 3 so the lower
 * half passes through unchanged -- confirm against the full macro text. */
1303 #define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
1304 _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \
1305 __m128i __a = (a); \
1306 _Pragma("clang diagnostic pop"); \
1307 (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi) _mm_set1_epi16(0), \
1309 4 + (((imm) & 0x03) >> 0), \
1310 4 + (((imm) & 0x0c) >> 2), \
1311 4 + (((imm) & 0x30) >> 4), \
1312 4 + (((imm) & 0xc0) >> 6)); })
1314 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
1315 _mm_unpackhi_epi8(__m128i __a
, __m128i __b
)
1317 return (__m128i
)__builtin_shufflevector((__v16qi
)__a
, (__v16qi
)__b
, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
1320 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
1321 _mm_unpackhi_epi16(__m128i __a
, __m128i __b
)
1323 return (__m128i
)__builtin_shufflevector((__v8hi
)__a
, (__v8hi
)__b
, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
1326 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
1327 _mm_unpackhi_epi32(__m128i __a
, __m128i __b
)
1329 return (__m128i
)__builtin_shufflevector((__v4si
)__a
, (__v4si
)__b
, 2, 4+2, 3, 4+3);
1332 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
1333 _mm_unpackhi_epi64(__m128i __a
, __m128i __b
)
1335 return (__m128i
)__builtin_shufflevector(__a
, __b
, 1, 2+1);
1338 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
1339 _mm_unpacklo_epi8(__m128i __a
, __m128i __b
)
1341 return (__m128i
)__builtin_shufflevector((__v16qi
)__a
, (__v16qi
)__b
, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
1344 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
1345 _mm_unpacklo_epi16(__m128i __a
, __m128i __b
)
1347 return (__m128i
)__builtin_shufflevector((__v8hi
)__a
, (__v8hi
)__b
, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
1350 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
1351 _mm_unpacklo_epi32(__m128i __a
, __m128i __b
)
1353 return (__m128i
)__builtin_shufflevector((__v4si
)__a
, (__v4si
)__b
, 0, 4+0, 1, 4+1);
1356 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
1357 _mm_unpacklo_epi64(__m128i __a
, __m128i __b
)
1359 return (__m128i
)__builtin_shufflevector(__a
, __b
, 0, 2+0);
1362 static __inline__ __m64
__attribute__((__always_inline__
, __nodebug__
))
1363 _mm_movepi64_pi64(__m128i __a
)
1365 return (__m64
)__a
[0];
1368 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
1369 _mm_movpi64_epi64(__m64 __a
)
1371 return (__m128i
){ (long long)__a
, 0 };
1374 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
1375 _mm_move_epi64(__m128i __a
)
1377 return __builtin_shufflevector(__a
, (__m128i
){ 0 }, 0, 2);
1380 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
1381 _mm_unpackhi_pd(__m128d __a
, __m128d __b
)
1383 return __builtin_shufflevector(__a
, __b
, 1, 2+1);
1386 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
1387 _mm_unpacklo_pd(__m128d __a
, __m128d __b
)
1389 return __builtin_shufflevector(__a
, __b
, 0, 2+0);
1392 static __inline__
int __attribute__((__always_inline__
, __nodebug__
))
1393 _mm_movemask_pd(__m128d __a
)
1395 return __builtin_ia32_movmskpd(__a
);
/* _mm_shuffle_pd(a, b, i): statement-expression macro that builds a two-double
 * vector from `a` and `b`.
 *
 * `a` and `b` are each evaluated once into local __m128d variables, with
 * -Wshadow silenced around those declarations. Result element 0 is a[i & 1];
 * result element 1 is b[(i >> 1) & 1] (the "+ 2" moves the index into the
 * second shuffle operand). `i` is expanded twice. */
1398 #define _mm_shuffle_pd(a, b, i) __extension__ ({ \
1399 _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \
1400 __m128d __a = (a); \
1401 __m128d __b = (b); \
1402 _Pragma("clang diagnostic pop"); \
1403 __builtin_shufflevector(__a, __b, (i) & 1, (((i) & 2) >> 1) + 2); })
/* _mm_castpd_ps: takes a __m128d and returns a __m128.
 * NOTE(review): presumably a plain vector cast that reinterprets the bits
 * without converting values -- confirm against the function body. */
1405 static __inline__ __m128
__attribute__((__always_inline__
, __nodebug__
))
1406 _mm_castpd_ps(__m128d __a
)
1411 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
1412 _mm_castpd_si128(__m128d __a
)
1414 return (__m128i
)__a
;
1417 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
1418 _mm_castps_pd(__m128 __a
)
1420 return (__m128d
)__a
;
1423 static __inline__ __m128i
__attribute__((__always_inline__
, __nodebug__
))
1424 _mm_castps_si128(__m128 __a
)
1426 return (__m128i
)__a
;
/* _mm_castsi128_ps: takes a __m128i and returns a __m128.
 * NOTE(review): presumably a plain vector cast that reinterprets the bits
 * without converting values -- confirm against the function body. */
1429 static __inline__ __m128
__attribute__((__always_inline__
, __nodebug__
))
1430 _mm_castsi128_ps(__m128i __a
)
1435 static __inline__ __m128d
__attribute__((__always_inline__
, __nodebug__
))
1436 _mm_castsi128_pd(__m128i __a
)
1438 return (__m128d
)__a
;
/* Void always-inline wrapper whose visible statement emits the "pause"
 * instruction through a volatile inline-asm statement.
 * NOTE(review): presumably this is _mm_pause(void) -- confirm against the
 * declarator line. */
1441 static __inline__
void __attribute__((__always_inline__
, __nodebug__
))
1444 __asm__
volatile ("pause");
/* Combines two selectors into one immediate: x is placed in bit 1 and y is
 * OR-ed into the low bits. */
#define _MM_SHUFFLE2(x, y) ((y) | ((x) << 1))
1449 #endif /* __SSE2__ */
1451 #endif /* __EMMINTRIN_H */