/*===---- emmintrin.h - Implementation of SSE2 intrinsics on PowerPC -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header file is to help porting code using Intel intrinsics
   explicitly from x86_64 to powerpc64/powerpc64le.

   Since the X86 SSE2 intrinsics mainly handle the __m128i and __m128d types,
   the PowerPC VMX/VSX ISA is a good match for vector float SIMD operations.
   However, scalar float operations in vector (XMM) registers require
   the POWER8 VSX ISA (2.07) level.  There are differences in the data
   format and placement of float scalars in the vector register, which
   require extra steps to match SSE2 scalar float semantics on POWER.

   It should be noted that there are significant differences between X86_64's
   MXCSR and PowerISA's FPSCR/VSCR registers.  It is recommended to use
   portable <fenv.h> instead of accessing the MXCSR directly (see the
   illustrative sketch following the includes below).

   Most SSE2 scalar float intrinsic operations can be performed more
   efficiently as C language float scalar operations or optimized to
   use vector SIMD operations.  We recommend this for new applications.  */
#error                                                                         \
    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#if defined(__ppc64__) &&                                                      \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

/* We need definitions from the SSE header files.  */
#include <xmmintrin.h>
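
/* Editorial sketch (not part of the original header): where x86 code pokes
   the MXCSR for rounding control, portable code can use the standard C
   <fenv.h> interfaces instead, for example:

     #include <fenv.h>
     int __saved = fegetround();
     fesetround(FE_TOWARDZERO);
     ...
     fesetround(__saved);
*/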
typedef __vector double __v2df;
typedef __vector long long __v2di;
typedef __vector unsigned long long __v2du;
typedef __vector int __v4si;
typedef __vector unsigned int __v4su;
typedef __vector short __v8hi;
typedef __vector unsigned short __v8hu;
typedef __vector signed char __v16qi;
typedef __vector unsigned char __v16qu;
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
typedef double __m128d __attribute__((__vector_size__(16), __may_alias__));

/* Unaligned version of the same types.  */
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
typedef double __m128d_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
/* Define two value permute mask.  */
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
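
/* Illustrative usage (editorial sketch, not part of the original header):
   _MM_SHUFFLE2(1, 0) evaluates to ((1 << 1) | 0) == 2, so

     __m128d __r = _mm_shuffle_pd(__a, __b, _MM_SHUFFLE2(1, 0));

   selects element [0] of __a for the low half and element [1] of __b for
   the high half.  */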
/* Create a vector with element 0 as F and the rest zero.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_sd(double __F) {
  return __extension__(__m128d){__F, 0.0};
}

/* Create a vector with both elements equal to F.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pd(double __F) {
  return __extension__(__m128d){__F, __F};
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pd1(double __F) {
  return _mm_set1_pd(__F);
}

/* Create a vector with the lower value X and upper value W.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pd(double __W, double __X) {
  return __extension__(__m128d){__X, __W};
}

/* Create a vector with the lower value W and upper value X.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pd(double __W, double __X) {
  return __extension__(__m128d){__W, __X};
}
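
/* Editorial sketch (not part of the original header): the "set" form takes
   the high element first while the "setr" form takes memory order, so both
   of the following produce a vector with element [0] == 1.0 and
   element [1] == 2.0:

     __m128d __v1 = _mm_set_pd(2.0, 1.0);
     __m128d __v2 = _mm_setr_pd(1.0, 2.0);
*/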
106 /* Create an undefined vector. */
107 extern __inline __m128d
108 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
109 _mm_undefined_pd(void) {
114 /* Create a vector of zeros. */
115 extern __inline __m128d
116 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
117 _mm_setzero_pd(void) {
118 return (__m128d
)vec_splats(0);
121 /* Sets the low DPFP value of A from the low value of B. */
122 extern __inline __m128d
123 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
124 _mm_move_sd(__m128d __A
, __m128d __B
) {
125 __v2df __result
= (__v2df
)__A
;
126 __result
[0] = ((__v2df
)__B
)[0];
127 return (__m128d
)__result
;
130 /* Load two DPFP values from P. The address must be 16-byte aligned. */
131 extern __inline __m128d
132 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
133 _mm_load_pd(double const *__P
) {
134 return ((__m128d
)vec_ld(0, (__v16qu
*)__P
));
137 /* Load two DPFP values from P. The address need not be 16-byte aligned. */
138 extern __inline __m128d
139 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
140 _mm_loadu_pd(double const *__P
) {
141 return (vec_vsx_ld(0, __P
));
/* Create a vector with both elements equal to *P.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load1_pd(double const *__P) {
  return (vec_splats(*__P));
}
151 /* Create a vector with element 0 as *P and the rest zero. */
152 extern __inline __m128d
153 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
154 _mm_load_sd(double const *__P
) {
155 return _mm_set_sd(*__P
);
158 extern __inline __m128d
159 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
160 _mm_load_pd1(double const *__P
) {
161 return _mm_load1_pd(__P
);
/* Load two DPFP values in reverse order.  The address must be aligned.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadr_pd(double const *__P) {
  __v2df __tmp = _mm_load_pd(__P);
  return (__m128d)vec_xxpermdi(__tmp, __tmp, 2);
}
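
/* Editorial sketch (not part of the original header): the doubleword permute
   above swaps the two halves of the aligned load, so

     double __d[2] __attribute__((aligned(16))) = {1.0, 2.0};
     __m128d __r = _mm_loadr_pd(__d);

   yields element [0] == 2.0 and element [1] == 1.0.  */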
172 /* Store two DPFP values. The address must be 16-byte aligned. */
174 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
175 _mm_store_pd(double *__P
, __m128d __A
) {
176 vec_st((__v16qu
)__A
, 0, (__v16qu
*)__P
);
179 /* Store two DPFP values. The address need not be 16-byte aligned. */
181 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
182 _mm_storeu_pd(double *__P
, __m128d __A
) {
183 *(__m128d_u
*)__P
= __A
;
186 /* Stores the lower DPFP value. */
188 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
189 _mm_store_sd(double *__P
, __m128d __A
) {
190 *__P
= ((__v2df
)__A
)[0];
193 extern __inline
double
194 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
195 _mm_cvtsd_f64(__m128d __A
) {
196 return ((__v2df
)__A
)[0];
200 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
201 _mm_storel_pd(double *__P
, __m128d __A
) {
202 _mm_store_sd(__P
, __A
);
205 /* Stores the upper DPFP value. */
207 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
208 _mm_storeh_pd(double *__P
, __m128d __A
) {
209 *__P
= ((__v2df
)__A
)[1];
211 /* Store the lower DPFP value across two words.
212 The address must be 16-byte aligned. */
214 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
215 _mm_store1_pd(double *__P
, __m128d __A
) {
216 _mm_store_pd(__P
, vec_splat(__A
, 0));
220 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
221 _mm_store_pd1(double *__P
, __m128d __A
) {
222 _mm_store1_pd(__P
, __A
);
225 /* Store two DPFP values in reverse order. The address must be aligned. */
227 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
228 _mm_storer_pd(double *__P
, __m128d __A
) {
229 _mm_store_pd(__P
, vec_xxpermdi(__A
, __A
, 2));
232 /* Intel intrinsic. */
233 extern __inline
long long
234 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
235 _mm_cvtsi128_si64(__m128i __A
) {
236 return ((__v2di
)__A
)[0];
239 /* Microsoft intrinsic. */
240 extern __inline
long long
241 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
242 _mm_cvtsi128_si64x(__m128i __A
) {
243 return ((__v2di
)__A
)[0];
246 extern __inline __m128d
247 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
248 _mm_add_pd(__m128d __A
, __m128d __B
) {
249 return (__m128d
)((__v2df
)__A
+ (__v2df
)__B
);
252 /* Add the lower double-precision (64-bit) floating-point element in
253 a and b, store the result in the lower element of dst, and copy
254 the upper element from a to the upper element of dst. */
255 extern __inline __m128d
256 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
257 _mm_add_sd(__m128d __A
, __m128d __B
) {
258 __A
[0] = __A
[0] + __B
[0];
262 extern __inline __m128d
263 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
264 _mm_sub_pd(__m128d __A
, __m128d __B
) {
265 return (__m128d
)((__v2df
)__A
- (__v2df
)__B
);
268 extern __inline __m128d
269 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
270 _mm_sub_sd(__m128d __A
, __m128d __B
) {
271 __A
[0] = __A
[0] - __B
[0];
275 extern __inline __m128d
276 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
277 _mm_mul_pd(__m128d __A
, __m128d __B
) {
278 return (__m128d
)((__v2df
)__A
* (__v2df
)__B
);
281 extern __inline __m128d
282 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
283 _mm_mul_sd(__m128d __A
, __m128d __B
) {
284 __A
[0] = __A
[0] * __B
[0];
288 extern __inline __m128d
289 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
290 _mm_div_pd(__m128d __A
, __m128d __B
) {
291 return (__m128d
)((__v2df
)__A
/ (__v2df
)__B
);
294 extern __inline __m128d
295 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
296 _mm_div_sd(__m128d __A
, __m128d __B
) {
297 __A
[0] = __A
[0] / __B
[0];
301 extern __inline __m128d
302 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
303 _mm_sqrt_pd(__m128d __A
) {
304 return (vec_sqrt(__A
));
307 /* Return pair {sqrt (B[0]), A[1]}. */
308 extern __inline __m128d
309 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
310 _mm_sqrt_sd(__m128d __A
, __m128d __B
) {
312 __c
= vec_sqrt((__v2df
)_mm_set1_pd(__B
[0]));
313 return (__m128d
)_mm_setr_pd(__c
[0], __A
[1]);
316 extern __inline __m128d
317 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
318 _mm_min_pd(__m128d __A
, __m128d __B
) {
319 return (vec_min(__A
, __B
));
322 extern __inline __m128d
323 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
324 _mm_min_sd(__m128d __A
, __m128d __B
) {
325 __v2df __a
, __b
, __c
;
326 __a
= vec_splats(__A
[0]);
327 __b
= vec_splats(__B
[0]);
328 __c
= vec_min(__a
, __b
);
329 return (__m128d
)_mm_setr_pd(__c
[0], __A
[1]);
332 extern __inline __m128d
333 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
334 _mm_max_pd(__m128d __A
, __m128d __B
) {
335 return (vec_max(__A
, __B
));
338 extern __inline __m128d
339 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
340 _mm_max_sd(__m128d __A
, __m128d __B
) {
341 __v2df __a
, __b
, __c
;
342 __a
= vec_splats(__A
[0]);
343 __b
= vec_splats(__B
[0]);
344 __c
= vec_max(__a
, __b
);
345 return (__m128d
)_mm_setr_pd(__c
[0], __A
[1]);
348 extern __inline __m128d
349 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
350 _mm_cmpeq_pd(__m128d __A
, __m128d __B
) {
351 return ((__m128d
)vec_cmpeq((__v2df
)__A
, (__v2df
)__B
));
354 extern __inline __m128d
355 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
356 _mm_cmplt_pd(__m128d __A
, __m128d __B
) {
357 return ((__m128d
)vec_cmplt((__v2df
)__A
, (__v2df
)__B
));
360 extern __inline __m128d
361 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
362 _mm_cmple_pd(__m128d __A
, __m128d __B
) {
363 return ((__m128d
)vec_cmple((__v2df
)__A
, (__v2df
)__B
));
366 extern __inline __m128d
367 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
368 _mm_cmpgt_pd(__m128d __A
, __m128d __B
) {
369 return ((__m128d
)vec_cmpgt((__v2df
)__A
, (__v2df
)__B
));
372 extern __inline __m128d
373 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
374 _mm_cmpge_pd(__m128d __A
, __m128d __B
) {
375 return ((__m128d
)vec_cmpge((__v2df
)__A
, (__v2df
)__B
));
378 extern __inline __m128d
379 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
380 _mm_cmpneq_pd(__m128d __A
, __m128d __B
) {
381 __v2df __temp
= (__v2df
)vec_cmpeq((__v2df
)__A
, (__v2df
)__B
);
382 return ((__m128d
)vec_nor(__temp
, __temp
));
385 extern __inline __m128d
386 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
387 _mm_cmpnlt_pd(__m128d __A
, __m128d __B
) {
388 return ((__m128d
)vec_cmpge((__v2df
)__A
, (__v2df
)__B
));
391 extern __inline __m128d
392 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
393 _mm_cmpnle_pd(__m128d __A
, __m128d __B
) {
394 return ((__m128d
)vec_cmpgt((__v2df
)__A
, (__v2df
)__B
));
397 extern __inline __m128d
398 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
399 _mm_cmpngt_pd(__m128d __A
, __m128d __B
) {
400 return ((__m128d
)vec_cmple((__v2df
)__A
, (__v2df
)__B
));
403 extern __inline __m128d
404 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
405 _mm_cmpnge_pd(__m128d __A
, __m128d __B
) {
406 return ((__m128d
)vec_cmplt((__v2df
)__A
, (__v2df
)__B
));
409 extern __inline __m128d
410 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
411 _mm_cmpord_pd(__m128d __A
, __m128d __B
) {
413 /* Compare against self will return false (0's) if NAN. */
414 __c
= (__v2du
)vec_cmpeq(__A
, __A
);
415 __d
= (__v2du
)vec_cmpeq(__B
, __B
);
416 /* A != NAN and B != NAN. */
417 return ((__m128d
)vec_and(__c
, __d
));
420 extern __inline __m128d
421 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
422 _mm_cmpunord_pd(__m128d __A
, __m128d __B
) {
425 /* Compare against self will return false (0's) if NAN. */
426 __c
= (__v2du
)vec_cmpeq((__v2df
)__A
, (__v2df
)__A
);
427 __d
= (__v2du
)vec_cmpeq((__v2df
)__B
, (__v2df
)__B
);
  /* A == NAN OR B == NAN converts to:
     NOT(A != NAN) OR NOT(B != NAN).  */
430 __c
= vec_nor(__c
, __c
);
431 return ((__m128d
)vec_orc(__c
, __d
));
434 /* Compare against self will return false (0's) if NAN. */
435 __c
= (__v2du
)vec_cmpeq((__v2df
)__A
, (__v2df
)__A
);
436 __d
= (__v2du
)vec_cmpeq((__v2df
)__B
, (__v2df
)__B
);
  /* Invert the results so that '1's indicate NaN.  */
438 __c
= vec_nor(__c
, __c
);
439 __d
= vec_nor(__d
, __d
);
440 return ((__m128d
)vec_or(__c
, __d
));
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  /* PowerISA VSX does not allow partial (for just the lower double)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper double values) we splat the lower double
     before we do the operation.  */
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmpeq(__a, __b);
  /* Then we merge the lower double result with the original upper
     double from __A.  */
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}
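
/* Editorial sketch (not part of the original header) of the splat/compare/
   merge pattern used by this and the following *_sd compares:

     __m128d __x = _mm_set_pd(99.0, 1.0);
     __m128d __y = _mm_set_pd(42.0, 1.0);
     __m128d __m = _mm_cmpeq_sd(__x, __y);

   Element [0] of __m is the all-ones mask (1.0 == 1.0) and element [1] is
   99.0, carried over unchanged from __x.  */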
460 extern __inline __m128d
461 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
462 _mm_cmplt_sd(__m128d __A
, __m128d __B
) {
463 __v2df __a
, __b
, __c
;
464 __a
= vec_splats(__A
[0]);
465 __b
= vec_splats(__B
[0]);
466 __c
= (__v2df
)vec_cmplt(__a
, __b
);
467 return (__m128d
)_mm_setr_pd(__c
[0], __A
[1]);
470 extern __inline __m128d
471 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
472 _mm_cmple_sd(__m128d __A
, __m128d __B
) {
473 __v2df __a
, __b
, __c
;
474 __a
= vec_splats(__A
[0]);
475 __b
= vec_splats(__B
[0]);
476 __c
= (__v2df
)vec_cmple(__a
, __b
);
477 return (__m128d
)_mm_setr_pd(__c
[0], __A
[1]);
480 extern __inline __m128d
481 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
482 _mm_cmpgt_sd(__m128d __A
, __m128d __B
) {
483 __v2df __a
, __b
, __c
;
484 __a
= vec_splats(__A
[0]);
485 __b
= vec_splats(__B
[0]);
486 __c
= (__v2df
)vec_cmpgt(__a
, __b
);
487 return (__m128d
)_mm_setr_pd(__c
[0], __A
[1]);
490 extern __inline __m128d
491 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
492 _mm_cmpge_sd(__m128d __A
, __m128d __B
) {
493 __v2df __a
, __b
, __c
;
494 __a
= vec_splats(__A
[0]);
495 __b
= vec_splats(__B
[0]);
496 __c
= (__v2df
)vec_cmpge(__a
, __b
);
497 return (__m128d
)_mm_setr_pd(__c
[0], __A
[1]);
500 extern __inline __m128d
501 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
502 _mm_cmpneq_sd(__m128d __A
, __m128d __B
) {
503 __v2df __a
, __b
, __c
;
504 __a
= vec_splats(__A
[0]);
505 __b
= vec_splats(__B
[0]);
506 __c
= (__v2df
)vec_cmpeq(__a
, __b
);
507 __c
= vec_nor(__c
, __c
);
508 return (__m128d
)_mm_setr_pd(__c
[0], __A
[1]);
511 extern __inline __m128d
512 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
513 _mm_cmpnlt_sd(__m128d __A
, __m128d __B
) {
514 __v2df __a
, __b
, __c
;
515 __a
= vec_splats(__A
[0]);
516 __b
= vec_splats(__B
[0]);
517 /* Not less than is just greater than or equal. */
518 __c
= (__v2df
)vec_cmpge(__a
, __b
);
519 return (__m128d
)_mm_setr_pd(__c
[0], __A
[1]);
522 extern __inline __m128d
523 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
524 _mm_cmpnle_sd(__m128d __A
, __m128d __B
) {
525 __v2df __a
, __b
, __c
;
526 __a
= vec_splats(__A
[0]);
527 __b
= vec_splats(__B
[0]);
  /* Not less than or equal is just greater than.  */
  __c = (__v2df)vec_cmpgt(__a, __b);
530 return (__m128d
)_mm_setr_pd(__c
[0], __A
[1]);
533 extern __inline __m128d
534 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
535 _mm_cmpngt_sd(__m128d __A
, __m128d __B
) {
536 __v2df __a
, __b
, __c
;
537 __a
= vec_splats(__A
[0]);
538 __b
= vec_splats(__B
[0]);
539 /* Not greater than is just less than or equal. */
540 __c
= (__v2df
)vec_cmple(__a
, __b
);
541 return (__m128d
)_mm_setr_pd(__c
[0], __A
[1]);
544 extern __inline __m128d
545 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
546 _mm_cmpnge_sd(__m128d __A
, __m128d __B
) {
547 __v2df __a
, __b
, __c
;
548 __a
= vec_splats(__A
[0]);
549 __b
= vec_splats(__B
[0]);
550 /* Not greater than or equal is just less than. */
551 __c
= (__v2df
)vec_cmplt(__a
, __b
);
552 return (__m128d
)_mm_setr_pd(__c
[0], __A
[1]);
555 extern __inline __m128d
556 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
557 _mm_cmpord_sd(__m128d __A
, __m128d __B
) {
559 __r
= (__v2df
)_mm_cmpord_pd(vec_splats(__A
[0]), vec_splats(__B
[0]));
560 return (__m128d
)_mm_setr_pd(__r
[0], ((__v2df
)__A
)[1]);
563 extern __inline __m128d
564 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
565 _mm_cmpunord_sd(__m128d __A
, __m128d __B
) {
567 __r
= _mm_cmpunord_pd(vec_splats(__A
[0]), vec_splats(__B
[0]));
568 return (__m128d
)_mm_setr_pd(__r
[0], __A
[1]);
/*
   The __mm_comi??_sd and __mm_ucomi??_sd implementations below are
   exactly the same because GCC for PowerPC only generates unordered
   compares (scalar and vector).
   Technically __mm_comieq_sd et al. should be using the ordered
   compare and signal for QNaNs.  The __mm_ucomieq_sd et al. should
   be OK as-is, since those are defined as the quiet (unordered)
   compares.  */
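
/* Editorial note (sketch, not part of the original header): with the scalar
   C compares used below, a NaN operand makes ==, <, <=, > and >= return 0
   and != return 1 for both the comi and ucomi forms, e.g.

     __m128d __n = _mm_set_sd(__builtin_nan(""));
     int __eq = _mm_comieq_sd(__n, __n);   // 0 on this implementation
*/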
579 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
580 _mm_comieq_sd(__m128d __A
, __m128d __B
) {
581 return (__A
[0] == __B
[0]);
585 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
586 _mm_comilt_sd(__m128d __A
, __m128d __B
) {
587 return (__A
[0] < __B
[0]);
591 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
592 _mm_comile_sd(__m128d __A
, __m128d __B
) {
593 return (__A
[0] <= __B
[0]);
597 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
598 _mm_comigt_sd(__m128d __A
, __m128d __B
) {
599 return (__A
[0] > __B
[0]);
603 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
604 _mm_comige_sd(__m128d __A
, __m128d __B
) {
605 return (__A
[0] >= __B
[0]);
609 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
610 _mm_comineq_sd(__m128d __A
, __m128d __B
) {
611 return (__A
[0] != __B
[0]);
615 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
616 _mm_ucomieq_sd(__m128d __A
, __m128d __B
) {
617 return (__A
[0] == __B
[0]);
621 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
622 _mm_ucomilt_sd(__m128d __A
, __m128d __B
) {
623 return (__A
[0] < __B
[0]);
627 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
628 _mm_ucomile_sd(__m128d __A
, __m128d __B
) {
629 return (__A
[0] <= __B
[0]);
633 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
634 _mm_ucomigt_sd(__m128d __A
, __m128d __B
) {
635 return (__A
[0] > __B
[0]);
639 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
640 _mm_ucomige_sd(__m128d __A
, __m128d __B
) {
641 return (__A
[0] >= __B
[0]);
645 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
646 _mm_ucomineq_sd(__m128d __A
, __m128d __B
) {
647 return (__A
[0] != __B
[0]);
650 /* Create a vector of Qi, where i is the element number. */
651 extern __inline __m128i
652 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
653 _mm_set_epi64x(long long __q1
, long long __q0
) {
654 return __extension__(__m128i
)(__v2di
){__q0
, __q1
};
657 extern __inline __m128i
658 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
659 _mm_set_epi64(__m64 __q1
, __m64 __q0
) {
660 return _mm_set_epi64x((long long)__q1
, (long long)__q0
);
663 extern __inline __m128i
664 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
665 _mm_set_epi32(int __q3
, int __q2
, int __q1
, int __q0
) {
666 return __extension__(__m128i
)(__v4si
){__q0
, __q1
, __q2
, __q3
};
669 extern __inline __m128i
670 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
671 _mm_set_epi16(short __q7
, short __q6
, short __q5
, short __q4
, short __q3
,
672 short __q2
, short __q1
, short __q0
) {
673 return __extension__(__m128i
)(__v8hi
){__q0
, __q1
, __q2
, __q3
,
674 __q4
, __q5
, __q6
, __q7
};
677 extern __inline __m128i
678 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
679 _mm_set_epi8(char __q15
, char __q14
, char __q13
, char __q12
, char __q11
,
680 char __q10
, char __q09
, char __q08
, char __q07
, char __q06
,
681 char __q05
, char __q04
, char __q03
, char __q02
, char __q01
,
683 return __extension__(__m128i
)(__v16qi
){
684 __q00
, __q01
, __q02
, __q03
, __q04
, __q05
, __q06
, __q07
,
685 __q08
, __q09
, __q10
, __q11
, __q12
, __q13
, __q14
, __q15
};
688 /* Set all of the elements of the vector to A. */
689 extern __inline __m128i
690 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
691 _mm_set1_epi64x(long long __A
) {
692 return _mm_set_epi64x(__A
, __A
);
695 extern __inline __m128i
696 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
697 _mm_set1_epi64(__m64 __A
) {
698 return _mm_set_epi64(__A
, __A
);
701 extern __inline __m128i
702 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
703 _mm_set1_epi32(int __A
) {
704 return _mm_set_epi32(__A
, __A
, __A
, __A
);
707 extern __inline __m128i
708 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
709 _mm_set1_epi16(short __A
) {
710 return _mm_set_epi16(__A
, __A
, __A
, __A
, __A
, __A
, __A
, __A
);
713 extern __inline __m128i
714 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
715 _mm_set1_epi8(char __A
) {
716 return _mm_set_epi8(__A
, __A
, __A
, __A
, __A
, __A
, __A
, __A
, __A
, __A
, __A
,
717 __A
, __A
, __A
, __A
, __A
);
/* Create a vector of Qi, where i is the element number.
   The parameter order is reversed from the _mm_set_epi* functions.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_epi64(__m64 __q0, __m64 __q1) {
  return _mm_set_epi64(__q1, __q0);
}
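
/* Editorial sketch (not part of the original header):

     __m128i __v1 = _mm_set_epi32(3, 2, 1, 0);
     __m128i __v2 = _mm_setr_epi32(0, 1, 2, 3);

   both produce a vector whose 32-bit element [0] is 0 and element [3] is 3.  */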
728 extern __inline __m128i
729 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
730 _mm_setr_epi32(int __q0
, int __q1
, int __q2
, int __q3
) {
731 return _mm_set_epi32(__q3
, __q2
, __q1
, __q0
);
734 extern __inline __m128i
735 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
736 _mm_setr_epi16(short __q0
, short __q1
, short __q2
, short __q3
, short __q4
,
737 short __q5
, short __q6
, short __q7
) {
738 return _mm_set_epi16(__q7
, __q6
, __q5
, __q4
, __q3
, __q2
, __q1
, __q0
);
741 extern __inline __m128i
742 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
743 _mm_setr_epi8(char __q00
, char __q01
, char __q02
, char __q03
, char __q04
,
744 char __q05
, char __q06
, char __q07
, char __q08
, char __q09
,
745 char __q10
, char __q11
, char __q12
, char __q13
, char __q14
,
747 return _mm_set_epi8(__q15
, __q14
, __q13
, __q12
, __q11
, __q10
, __q09
, __q08
,
748 __q07
, __q06
, __q05
, __q04
, __q03
, __q02
, __q01
, __q00
);
751 /* Create a vector with element 0 as *P and the rest zero. */
752 extern __inline __m128i
753 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
754 _mm_load_si128(__m128i
const *__P
) {
758 extern __inline __m128i
759 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
760 _mm_loadu_si128(__m128i_u
const *__P
) {
761 return (__m128i
)(vec_vsx_ld(0, (signed int const *)__P
));
764 extern __inline __m128i
765 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
766 _mm_loadl_epi64(__m128i_u
const *__P
) {
767 return _mm_set_epi64((__m64
)0LL, *(__m64
*)__P
);
771 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
772 _mm_store_si128(__m128i
*__P
, __m128i __B
) {
773 vec_st((__v16qu
)__B
, 0, (__v16qu
*)__P
);
777 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
778 _mm_storeu_si128(__m128i_u
*__P
, __m128i __B
) {
783 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
784 _mm_storel_epi64(__m128i_u
*__P
, __m128i __B
) {
785 *(long long *)__P
= ((__v2di
)__B
)[0];
788 extern __inline __m64
789 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
790 _mm_movepi64_pi64(__m128i_u __B
) {
791 return (__m64
)((__v2di
)__B
)[0];
794 extern __inline __m128i
795 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
796 _mm_movpi64_epi64(__m64 __A
) {
797 return _mm_set_epi64((__m64
)0LL, __A
);
800 extern __inline __m128i
801 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
802 _mm_move_epi64(__m128i __A
) {
803 return _mm_set_epi64((__m64
)0LL, (__m64
)__A
[0]);
806 /* Create an undefined vector. */
807 extern __inline __m128i
808 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
809 _mm_undefined_si128(void) {
814 /* Create a vector of zeros. */
815 extern __inline __m128i
816 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
817 _mm_setzero_si128(void) {
818 return __extension__(__m128i
)(__v4si
){0, 0, 0, 0};
822 extern __inline __m128d
823 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
824 _mm_cvtepi32_pd(__m128i __A
) {
  /* For LE we need to generate the Vector Unpack Low Signed Word
     instruction, which is produced here by vec_unpackh.  */
828 __val
= (__v2di
)vec_unpackh((__v4si
)__A
);
830 return (__m128d
)vec_ctf(__val
, 0);
834 extern __inline __m128
835 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
836 _mm_cvtepi32_ps(__m128i __A
) {
837 return ((__m128
)vec_ctf((__v4si
)__A
, 0));
840 extern __inline __m128i
841 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
842 _mm_cvtpd_epi32(__m128d __A
) {
843 __v2df __rounded
= vec_rint(__A
);
844 __v4si __result
, __temp
;
845 const __v4si __vzero
= {0, 0, 0, 0};
847 /* VSX Vector truncate Double-Precision to integer and Convert to
848 Signed Integer Word format with Saturate. */
849 __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp
) : "wa"(__rounded
) :);
852 #ifdef __LITTLE_ENDIAN__
853 __temp
= vec_mergeo(__temp
, __temp
);
855 __temp
= vec_mergee(__temp
, __temp
);
857 __result
= (__v4si
)vec_vpkudum((__vector
long long)__temp
,
858 (__vector
long long)__vzero
);
861 const __v16qu __pkperm
= {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
862 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
863 __result
= (__v4si
)vec_perm((__v16qu
)__temp
, (__v16qu
)__vzero
, __pkperm
);
866 return (__m128i
)__result
;
869 extern __inline __m64
870 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
871 _mm_cvtpd_pi32(__m128d __A
) {
872 __m128i __result
= _mm_cvtpd_epi32(__A
);
874 return (__m64
)__result
[0];
877 extern __inline __m128
878 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
879 _mm_cvtpd_ps(__m128d __A
) {
882 const __v4si __vzero
= {0, 0, 0, 0};
884 __asm__("xvcvdpsp %x0,%x1" : "=wa"(__temp
) : "wa"(__A
) :);
887 #ifdef __LITTLE_ENDIAN__
888 __temp
= vec_mergeo(__temp
, __temp
);
890 __temp
= vec_mergee(__temp
, __temp
);
892 __result
= (__v4sf
)vec_vpkudum((__vector
long long)__temp
,
893 (__vector
long long)__vzero
);
896 const __v16qu __pkperm
= {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
897 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
898 __result
= (__v4sf
)vec_perm((__v16qu
)__temp
, (__v16qu
)__vzero
, __pkperm
);
901 return ((__m128
)__result
);
904 extern __inline __m128i
905 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
906 _mm_cvttpd_epi32(__m128d __A
) {
909 const __v4si __vzero
= {0, 0, 0, 0};
911 /* VSX Vector truncate Double-Precision to integer and Convert to
912 Signed Integer Word format with Saturate. */
913 __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp
) : "wa"(__A
) :);
916 #ifdef __LITTLE_ENDIAN__
917 __temp
= vec_mergeo(__temp
, __temp
);
919 __temp
= vec_mergee(__temp
, __temp
);
921 __result
= (__v4si
)vec_vpkudum((__vector
long long)__temp
,
922 (__vector
long long)__vzero
);
925 const __v16qu __pkperm
= {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
926 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
927 __result
= (__v4si
)vec_perm((__v16qu
)__temp
, (__v16qu
)__vzero
, __pkperm
);
931 return ((__m128i
)__result
);
934 extern __inline __m64
935 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
936 _mm_cvttpd_pi32(__m128d __A
) {
937 __m128i __result
= _mm_cvttpd_epi32(__A
);
939 return (__m64
)__result
[0];
943 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
944 _mm_cvtsi128_si32(__m128i __A
) {
945 return ((__v4si
)__A
)[0];
949 extern __inline __m128d
950 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
951 _mm_cvtpi32_pd(__m64 __A
) {
956 __temp
= (__v4si
)vec_splats(__A
);
957 __tmp2
= (__v2di
)vec_unpackl(__temp
);
958 __result
= vec_ctf((__vector
signed long long)__tmp2
, 0);
959 return (__m128d
)__result
;
963 extern __inline __m128i
964 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
965 _mm_cvtps_epi32(__m128 __A
) {
969 __rounded
= vec_rint((__v4sf
)__A
);
970 __result
= vec_cts(__rounded
, 0);
971 return (__m128i
)__result
;
974 extern __inline __m128i
975 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
976 _mm_cvttps_epi32(__m128 __A
) {
979 __result
= vec_cts((__v4sf
)__A
, 0);
980 return (__m128i
)__result
;
983 extern __inline __m128d
984 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
985 _mm_cvtps_pd(__m128 __A
) {
986 /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */
988 return (__m128d
)vec_doubleh((__v4sf
)__A
);
  /* Otherwise the compiler is not current, so we need to generate the
     two-step conversion below.  */
992 __v4sf __a
= (__v4sf
)__A
;
995 #ifdef __LITTLE_ENDIAN__
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[1], [3]}, so we use two
     shift-left-double vector word immediates to get the elements
     lined up.  */
1000 __temp
= __builtin_vsx_xxsldwi(__a
, __a
, 3);
1001 __temp
= __builtin_vsx_xxsldwi(__a
, __temp
, 2);
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[0], [2]}, so we merge the
     words to get the elements lined up.  */
1007 __temp
= vec_vmrghw(__a
, __a
);
1009 __asm__(" xvcvspdp %x0,%x1" : "=wa"(__result
) : "wa"(__temp
) :);
1010 return (__m128d
)__result
;
1015 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1016 _mm_cvtsd_si32(__m128d __A
) {
1017 __v2df __rounded
= vec_rint((__v2df
)__A
);
1018 int __result
= ((__v2df
)__rounded
)[0];
1022 /* Intel intrinsic. */
1023 extern __inline
long long
1024 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1025 _mm_cvtsd_si64(__m128d __A
) {
1026 __v2df __rounded
= vec_rint((__v2df
)__A
);
1027 long long __result
= ((__v2df
)__rounded
)[0];
1032 /* Microsoft intrinsic. */
1033 extern __inline
long long
1034 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1035 _mm_cvtsd_si64x(__m128d __A
) {
1036 return _mm_cvtsd_si64((__v2df
)__A
);
1040 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1041 _mm_cvttsd_si32(__m128d __A
) {
1042 int __result
= ((__v2df
)__A
)[0];
1047 /* Intel intrinsic. */
1048 extern __inline
long long
1049 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1050 _mm_cvttsd_si64(__m128d __A
) {
1051 long long __result
= ((__v2df
)__A
)[0];
1056 /* Microsoft intrinsic. */
1057 extern __inline
long long
1058 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1059 _mm_cvttsd_si64x(__m128d __A
) {
1060 return _mm_cvttsd_si64(__A
);
1063 extern __inline __m128
1064 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1065 _mm_cvtsd_ss(__m128 __A
, __m128d __B
) {
1066 __v4sf __result
= (__v4sf
)__A
;
1068 #ifdef __LITTLE_ENDIAN__
1070 /* Copy double element[0] to element [1] for conversion. */
1071 __v2df __temp_b
= vec_splat((__v2df
)__B
, 0);
1073 /* Pre-rotate __A left 3 (logically right 1) elements. */
1074 __result
= __builtin_vsx_xxsldwi(__result
, __result
, 3);
1075 /* Convert double to single float scalar in a vector. */
1076 __asm__("xscvdpsp %x0,%x1" : "=wa"(__temp_s
) : "wa"(__temp_b
) :);
1077 /* Shift the resulting scalar into vector element [0]. */
1078 __result
= __builtin_vsx_xxsldwi(__result
, __temp_s
, 1);
1080 __result
[0] = ((__v2df
)__B
)[0];
1082 return (__m128
)__result
;
1085 extern __inline __m128d
1086 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1087 _mm_cvtsi32_sd(__m128d __A
, int __B
) {
1088 __v2df __result
= (__v2df
)__A
;
1091 return (__m128d
)__result
;
1094 /* Intel intrinsic. */
1095 extern __inline __m128d
1096 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1097 _mm_cvtsi64_sd(__m128d __A
, long long __B
) {
1098 __v2df __result
= (__v2df
)__A
;
1101 return (__m128d
)__result
;
1104 /* Microsoft intrinsic. */
1105 extern __inline __m128d
1106 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1107 _mm_cvtsi64x_sd(__m128d __A
, long long __B
) {
1108 return _mm_cvtsi64_sd(__A
, __B
);
1111 extern __inline __m128d
1112 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1113 _mm_cvtss_sd(__m128d __A
, __m128 __B
) {
1114 #ifdef __LITTLE_ENDIAN__
1115 /* Use splat to move element [0] into position for the convert. */
1116 __v4sf __temp
= vec_splat((__v4sf
)__B
, 0);
1118 /* Convert single float scalar to double in a vector. */
1119 __asm__("xscvspdp %x0,%x1" : "=wa"(__res
) : "wa"(__temp
) :);
1120 return (__m128d
)vec_mergel(__res
, (__v2df
)__A
);
1122 __v2df __res
= (__v2df
)__A
;
1123 __res
[0] = ((__v4sf
)__B
)[0];
1124 return (__m128d
)__res
;
1128 extern __inline __m128d
1129 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1130 _mm_shuffle_pd(__m128d __A
, __m128d __B
, const int __mask
) {
1131 __vector
double __result
;
1132 const int __litmsk
= __mask
& 0x3;
1135 __result
= vec_mergeh(__A
, __B
);
1137 else if (__litmsk
== 1)
1138 __result
= vec_xxpermdi(__B
, __A
, 2);
1139 else if (__litmsk
== 2)
1140 __result
= vec_xxpermdi(__B
, __A
, 1);
1142 else if (__litmsk
== 1)
1143 __result
= vec_xxpermdi(__A
, __B
, 2);
1144 else if (__litmsk
== 2)
1145 __result
= vec_xxpermdi(__A
, __B
, 1);
1148 __result
= vec_mergel(__A
, __B
);
1153 extern __inline __m128d
1154 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1155 _mm_unpackhi_pd(__m128d __A
, __m128d __B
) {
1156 return (__m128d
)vec_mergel((__v2df
)__A
, (__v2df
)__B
);
1159 extern __inline __m128d
1160 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1161 _mm_unpacklo_pd(__m128d __A
, __m128d __B
) {
1162 return (__m128d
)vec_mergeh((__v2df
)__A
, (__v2df
)__B
);
1165 extern __inline __m128d
1166 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1167 _mm_loadh_pd(__m128d __A
, double const *__B
) {
1168 __v2df __result
= (__v2df
)__A
;
1170 return (__m128d
)__result
;
1173 extern __inline __m128d
1174 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1175 _mm_loadl_pd(__m128d __A
, double const *__B
) {
1176 __v2df __result
= (__v2df
)__A
;
1178 return (__m128d
)__result
;
1182 /* Intrinsic functions that require PowerISA 2.07 minimum. */
1184 /* Creates a 2-bit mask from the most significant bits of the DPFP values. */
1186 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1187 _mm_movemask_pd(__m128d __A
) {
1189 return vec_extractm((__v2du
)__A
);
1191 __vector
unsigned long long __result
;
1192 static const __vector
unsigned int __perm_mask
= {
1193 #ifdef __LITTLE_ENDIAN__
1194 0x80800040, 0x80808080, 0x80808080, 0x80808080
1196 0x80808080, 0x80808080, 0x80808080, 0x80804000
1200 __result
= ((__vector
unsigned long long)vec_vbpermq(
1201 (__vector
unsigned char)__A
, (__vector
unsigned char)__perm_mask
));
1203 #ifdef __LITTLE_ENDIAN__
1208 #endif /* !_ARCH_PWR10 */
1210 #endif /* _ARCH_PWR8 */
1212 extern __inline __m128i
1213 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1214 _mm_packs_epi16(__m128i __A
, __m128i __B
) {
1215 return (__m128i
)vec_packs((__v8hi
)__A
, (__v8hi
)__B
);
1218 extern __inline __m128i
1219 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1220 _mm_packs_epi32(__m128i __A
, __m128i __B
) {
1221 return (__m128i
)vec_packs((__v4si
)__A
, (__v4si
)__B
);
1224 extern __inline __m128i
1225 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1226 _mm_packus_epi16(__m128i __A
, __m128i __B
) {
1227 return (__m128i
)vec_packsu((__v8hi
)__A
, (__v8hi
)__B
);
1230 extern __inline __m128i
1231 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1232 _mm_unpackhi_epi8(__m128i __A
, __m128i __B
) {
1233 return (__m128i
)vec_mergel((__v16qu
)__A
, (__v16qu
)__B
);
1236 extern __inline __m128i
1237 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1238 _mm_unpackhi_epi16(__m128i __A
, __m128i __B
) {
1239 return (__m128i
)vec_mergel((__v8hu
)__A
, (__v8hu
)__B
);
1242 extern __inline __m128i
1243 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1244 _mm_unpackhi_epi32(__m128i __A
, __m128i __B
) {
1245 return (__m128i
)vec_mergel((__v4su
)__A
, (__v4su
)__B
);
1248 extern __inline __m128i
1249 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1250 _mm_unpackhi_epi64(__m128i __A
, __m128i __B
) {
1251 return (__m128i
)vec_mergel((__vector
long long)__A
, (__vector
long long)__B
);
1254 extern __inline __m128i
1255 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1256 _mm_unpacklo_epi8(__m128i __A
, __m128i __B
) {
1257 return (__m128i
)vec_mergeh((__v16qu
)__A
, (__v16qu
)__B
);
1260 extern __inline __m128i
1261 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1262 _mm_unpacklo_epi16(__m128i __A
, __m128i __B
) {
1263 return (__m128i
)vec_mergeh((__v8hi
)__A
, (__v8hi
)__B
);
1266 extern __inline __m128i
1267 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1268 _mm_unpacklo_epi32(__m128i __A
, __m128i __B
) {
1269 return (__m128i
)vec_mergeh((__v4si
)__A
, (__v4si
)__B
);
1272 extern __inline __m128i
1273 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1274 _mm_unpacklo_epi64(__m128i __A
, __m128i __B
) {
1275 return (__m128i
)vec_mergeh((__vector
long long)__A
, (__vector
long long)__B
);
1278 extern __inline __m128i
1279 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1280 _mm_add_epi8(__m128i __A
, __m128i __B
) {
1281 return (__m128i
)((__v16qu
)__A
+ (__v16qu
)__B
);
1284 extern __inline __m128i
1285 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1286 _mm_add_epi16(__m128i __A
, __m128i __B
) {
1287 return (__m128i
)((__v8hu
)__A
+ (__v8hu
)__B
);
1290 extern __inline __m128i
1291 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1292 _mm_add_epi32(__m128i __A
, __m128i __B
) {
1293 return (__m128i
)((__v4su
)__A
+ (__v4su
)__B
);
1296 extern __inline __m128i
1297 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1298 _mm_add_epi64(__m128i __A
, __m128i __B
) {
1299 return (__m128i
)((__v2du
)__A
+ (__v2du
)__B
);
1302 extern __inline __m128i
1303 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1304 _mm_adds_epi8(__m128i __A
, __m128i __B
) {
1305 return (__m128i
)vec_adds((__v16qi
)__A
, (__v16qi
)__B
);
1308 extern __inline __m128i
1309 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1310 _mm_adds_epi16(__m128i __A
, __m128i __B
) {
1311 return (__m128i
)vec_adds((__v8hi
)__A
, (__v8hi
)__B
);
1314 extern __inline __m128i
1315 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1316 _mm_adds_epu8(__m128i __A
, __m128i __B
) {
1317 return (__m128i
)vec_adds((__v16qu
)__A
, (__v16qu
)__B
);
1320 extern __inline __m128i
1321 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1322 _mm_adds_epu16(__m128i __A
, __m128i __B
) {
1323 return (__m128i
)vec_adds((__v8hu
)__A
, (__v8hu
)__B
);
1326 extern __inline __m128i
1327 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1328 _mm_sub_epi8(__m128i __A
, __m128i __B
) {
1329 return (__m128i
)((__v16qu
)__A
- (__v16qu
)__B
);
1332 extern __inline __m128i
1333 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1334 _mm_sub_epi16(__m128i __A
, __m128i __B
) {
1335 return (__m128i
)((__v8hu
)__A
- (__v8hu
)__B
);
1338 extern __inline __m128i
1339 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1340 _mm_sub_epi32(__m128i __A
, __m128i __B
) {
1341 return (__m128i
)((__v4su
)__A
- (__v4su
)__B
);
1344 extern __inline __m128i
1345 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1346 _mm_sub_epi64(__m128i __A
, __m128i __B
) {
1347 return (__m128i
)((__v2du
)__A
- (__v2du
)__B
);
1350 extern __inline __m128i
1351 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1352 _mm_subs_epi8(__m128i __A
, __m128i __B
) {
1353 return (__m128i
)vec_subs((__v16qi
)__A
, (__v16qi
)__B
);
1356 extern __inline __m128i
1357 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1358 _mm_subs_epi16(__m128i __A
, __m128i __B
) {
1359 return (__m128i
)vec_subs((__v8hi
)__A
, (__v8hi
)__B
);
1362 extern __inline __m128i
1363 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1364 _mm_subs_epu8(__m128i __A
, __m128i __B
) {
1365 return (__m128i
)vec_subs((__v16qu
)__A
, (__v16qu
)__B
);
1368 extern __inline __m128i
1369 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1370 _mm_subs_epu16(__m128i __A
, __m128i __B
) {
1371 return (__m128i
)vec_subs((__v8hu
)__A
, (__v8hu
)__B
);
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_madd_epi16(__m128i __A, __m128i __B) {
  __vector signed int __zero = {0, 0, 0, 0};

  return (__m128i)vec_vmsumshm((__v8hi)__A, (__v8hi)__B, __zero);
}
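
/* Editorial sketch (not part of the original header): each 32-bit result
   element is __A[2i] * __B[2i] + __A[2i+1] * __B[2i+1], so

     __m128i __r = _mm_madd_epi16(_mm_set1_epi16(2), _mm_set1_epi16(3));

   sets every 32-bit element of __r to 12.  */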
1382 extern __inline __m128i
1383 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1384 _mm_mulhi_epi16(__m128i __A
, __m128i __B
) {
1385 __vector
signed int __w0
, __w1
;
1387 __vector
unsigned char __xform1
= {
1388 #ifdef __LITTLE_ENDIAN__
1389 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
1390 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1392 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
1393 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
1397 __w0
= vec_vmulesh((__v8hi
)__A
, (__v8hi
)__B
);
1398 __w1
= vec_vmulosh((__v8hi
)__A
, (__v8hi
)__B
);
1399 return (__m128i
)vec_perm(__w0
, __w1
, __xform1
);
1402 extern __inline __m128i
1403 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1404 _mm_mullo_epi16(__m128i __A
, __m128i __B
) {
1405 return (__m128i
)((__v8hi
)__A
* (__v8hi
)__B
);
1408 extern __inline __m64
1409 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1410 _mm_mul_su32(__m64 __A
, __m64 __B
) {
1411 unsigned int __a
= __A
;
1412 unsigned int __b
= __B
;
1414 return ((__m64
)__a
* (__m64
)__b
);
1418 extern __inline __m128i
1419 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1420 _mm_mul_epu32(__m128i __A
, __m128i __B
) {
1424 #ifdef __LITTLE_ENDIAN__
1425 /* VMX Vector Multiply Odd Unsigned Word. */
1426 __asm__("vmulouw %0,%1,%2" : "=v"(__result
) : "v"(__A
), "v"(__B
) :);
1428 /* VMX Vector Multiply Even Unsigned Word. */
1429 __asm__("vmuleuw %0,%1,%2" : "=v"(__result
) : "v"(__A
), "v"(__B
) :);
1431 return (__m128i
)__result
;
1433 return (__m128i
)vec_mule((__v4su
)__A
, (__v4su
)__B
);
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_epi16(__m128i __A, int __B) {
  __v8hu __lshift;
  __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0};

  if (__B >= 0 && __B < 16) {
    if (__builtin_constant_p(__B))
      __lshift = (__v8hu)vec_splat_s16(__B);
    else
      __lshift = vec_splats((unsigned short)__B);

    __result = vec_sl((__v8hi)__A, __lshift);
  }

  return (__m128i)__result;
}
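
/* Editorial sketch (not part of the original header): as with the x86
   instruction, a count outside 0..15 yields all zeros:

     __m128i __v = _mm_set1_epi16(0x0101);
     __m128i __r1 = _mm_slli_epi16(__v, 8);    // each element == 0x0100
     __m128i __r2 = _mm_slli_epi16(__v, 16);   // each element == 0
*/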
1456 extern __inline __m128i
1457 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1458 _mm_slli_epi32(__m128i __A
, int __B
) {
1460 __v4si __result
= {0, 0, 0, 0};
1462 if (__B
>= 0 && __B
< 32) {
1463 if (__builtin_constant_p(__B
) && __B
< 16)
1464 __lshift
= (__v4su
)vec_splat_s32(__B
);
1466 __lshift
= vec_splats((unsigned int)__B
);
1468 __result
= vec_sl((__v4si
)__A
, __lshift
);
1471 return (__m128i
)__result
;
1475 extern __inline __m128i
1476 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1477 _mm_slli_epi64(__m128i __A
, int __B
) {
1479 __v2di __result
= {0, 0};
1481 if (__B
>= 0 && __B
< 64) {
1482 if (__builtin_constant_p(__B
) && __B
< 16)
1483 __lshift
= (__v2du
)vec_splat_s32(__B
);
1485 __lshift
= (__v2du
)vec_splats((unsigned int)__B
);
1487 __result
= vec_sl((__v2di
)__A
, __lshift
);
1490 return (__m128i
)__result
;
1494 extern __inline __m128i
1495 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1496 _mm_srai_epi16(__m128i __A
, int __B
) {
1497 __v8hu __rshift
= {15, 15, 15, 15, 15, 15, 15, 15};
1501 if (__builtin_constant_p(__B
))
1502 __rshift
= (__v8hu
)vec_splat_s16(__B
);
1504 __rshift
= vec_splats((unsigned short)__B
);
1506 __result
= vec_sra((__v8hi
)__A
, __rshift
);
1508 return (__m128i
)__result
;
1511 extern __inline __m128i
1512 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1513 _mm_srai_epi32(__m128i __A
, int __B
) {
1514 __v4su __rshift
= {31, 31, 31, 31};
1518 if (__builtin_constant_p(__B
)) {
1520 __rshift
= (__v4su
)vec_splat_s32(__B
);
1522 __rshift
= (__v4su
)vec_splats((unsigned int)__B
);
1524 __rshift
= vec_splats((unsigned int)__B
);
1526 __result
= vec_sra((__v4si
)__A
, __rshift
);
1528 return (__m128i
)__result
;
1531 extern __inline __m128i
1532 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1533 _mm_bslli_si128(__m128i __A
, const int __N
) {
1535 const __v16qu __zeros
= {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1538 __result
= vec_sld((__v16qu
)__A
, __zeros
, __N
);
1542 return (__m128i
)__result
;
1545 extern __inline __m128i
1546 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1547 _mm_bsrli_si128(__m128i __A
, const int __N
) {
1549 const __v16qu __zeros
= {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1552 #ifdef __LITTLE_ENDIAN__
1553 if (__builtin_constant_p(__N
))
1554 /* Would like to use Vector Shift Left Double by Octet
1555 Immediate here to use the immediate form and avoid
1556 load of __N * 8 value into a separate VR. */
1557 __result
= vec_sld(__zeros
, (__v16qu
)__A
, (16 - __N
));
1561 __v16qu __shift
= vec_splats((unsigned char)(__N
* 8));
1562 #ifdef __LITTLE_ENDIAN__
1563 __result
= vec_sro((__v16qu
)__A
, __shift
);
1565 __result
= vec_slo((__v16qu
)__A
, __shift
);
1571 return (__m128i
)__result
;
1574 extern __inline __m128i
1575 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1576 _mm_srli_si128(__m128i __A
, const int __N
) {
1577 return _mm_bsrli_si128(__A
, __N
);
1580 extern __inline __m128i
1581 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1582 _mm_slli_si128(__m128i __A
, const int _imm5
) {
1584 const __v16qu __zeros
= {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1587 #ifdef __LITTLE_ENDIAN__
1588 __result
= vec_sld((__v16qu
)__A
, __zeros
, _imm5
);
1590 __result
= vec_sld(__zeros
, (__v16qu
)__A
, (16 - _imm5
));
1595 return (__m128i
)__result
;
1598 extern __inline __m128i
1599 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1601 _mm_srli_epi16(__m128i __A
, int __B
) {
1603 __v8hi __result
= {0, 0, 0, 0, 0, 0, 0, 0};
1606 if (__builtin_constant_p(__B
))
1607 __rshift
= (__v8hu
)vec_splat_s16(__B
);
1609 __rshift
= vec_splats((unsigned short)__B
);
1611 __result
= vec_sr((__v8hi
)__A
, __rshift
);
1614 return (__m128i
)__result
;
1617 extern __inline __m128i
1618 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1619 _mm_srli_epi32(__m128i __A
, int __B
) {
1621 __v4si __result
= {0, 0, 0, 0};
1624 if (__builtin_constant_p(__B
)) {
1626 __rshift
= (__v4su
)vec_splat_s32(__B
);
1628 __rshift
= (__v4su
)vec_splats((unsigned int)__B
);
1630 __rshift
= vec_splats((unsigned int)__B
);
1632 __result
= vec_sr((__v4si
)__A
, __rshift
);
1635 return (__m128i
)__result
;
1639 extern __inline __m128i
1640 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1641 _mm_srli_epi64(__m128i __A
, int __B
) {
1643 __v2di __result
= {0, 0};
1646 if (__builtin_constant_p(__B
)) {
1648 __rshift
= (__v2du
)vec_splat_s32(__B
);
1650 __rshift
= (__v2du
)vec_splats((unsigned long long)__B
);
1652 __rshift
= (__v2du
)vec_splats((unsigned int)__B
);
1654 __result
= vec_sr((__v2di
)__A
, __rshift
);
1657 return (__m128i
)__result
;
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_epi16(__m128i __A, __m128i __B) {
  __v8hu __lshift;
  __vector __bool short __shmask;
  const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
  __v8hu __result;

#ifdef __LITTLE_ENDIAN__
  __lshift = vec_splat((__v8hu)__B, 0);
#else
  __lshift = vec_splat((__v8hu)__B, 3);
#endif
  __shmask = vec_cmple(__lshift, __shmax);
  __result = vec_sl((__v8hu)__A, __lshift);
  __result = vec_sel((__v8hu)__shmask, __result, __shmask);

  return (__m128i)__result;
}
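
/* Editorial sketch (not part of the original header): the shift count comes
   from the low doubleword of __B, and counts above 15 fall through to the
   all-zero select above:

     __m128i __v = _mm_set1_epi16(1);
     __m128i __r = _mm_sll_epi16(__v, _mm_set_epi64x(0, 3));  // each element == 8
*/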
1681 extern __inline __m128i
1682 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1683 _mm_sll_epi32(__m128i __A
, __m128i __B
) {
1685 __vector __bool
int __shmask
;
1686 const __v4su __shmax
= {32, 32, 32, 32};
1688 #ifdef __LITTLE_ENDIAN__
1689 __lshift
= vec_splat((__v4su
)__B
, 0);
1691 __lshift
= vec_splat((__v4su
)__B
, 1);
1693 __shmask
= vec_cmplt(__lshift
, __shmax
);
1694 __result
= vec_sl((__v4su
)__A
, __lshift
);
1695 __result
= vec_sel((__v4su
)__shmask
, __result
, __shmask
);
1697 return (__m128i
)__result
;
1701 extern __inline __m128i
1702 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1703 _mm_sll_epi64(__m128i __A
, __m128i __B
) {
1705 __vector __bool
long long __shmask
;
1706 const __v2du __shmax
= {64, 64};
1709 __lshift
= vec_splat((__v2du
)__B
, 0);
1710 __shmask
= vec_cmplt(__lshift
, __shmax
);
1711 __result
= vec_sl((__v2du
)__A
, __lshift
);
1712 __result
= vec_sel((__v2du
)__shmask
, __result
, __shmask
);
1714 return (__m128i
)__result
;
1718 extern __inline __m128i
1719 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1720 _mm_sra_epi16(__m128i __A
, __m128i __B
) {
1721 const __v8hu __rshmax
= {15, 15, 15, 15, 15, 15, 15, 15};
1725 #ifdef __LITTLE_ENDIAN__
1726 __rshift
= vec_splat((__v8hu
)__B
, 0);
1728 __rshift
= vec_splat((__v8hu
)__B
, 3);
1730 __rshift
= vec_min(__rshift
, __rshmax
);
1731 __result
= vec_sra((__v8hi
)__A
, __rshift
);
1733 return (__m128i
)__result
;
1736 extern __inline __m128i
1737 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1738 _mm_sra_epi32(__m128i __A
, __m128i __B
) {
1739 const __v4su __rshmax
= {31, 31, 31, 31};
1743 #ifdef __LITTLE_ENDIAN__
1744 __rshift
= vec_splat((__v4su
)__B
, 0);
1746 __rshift
= vec_splat((__v4su
)__B
, 1);
1748 __rshift
= vec_min(__rshift
, __rshmax
);
1749 __result
= vec_sra((__v4si
)__A
, __rshift
);
1751 return (__m128i
)__result
;
1754 extern __inline __m128i
1755 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1756 _mm_srl_epi16(__m128i __A
, __m128i __B
) {
1758 __vector __bool
short __shmask
;
1759 const __v8hu __shmax
= {15, 15, 15, 15, 15, 15, 15, 15};
1762 #ifdef __LITTLE_ENDIAN__
1763 __rshift
= vec_splat((__v8hu
)__B
, 0);
1765 __rshift
= vec_splat((__v8hu
)__B
, 3);
1767 __shmask
= vec_cmple(__rshift
, __shmax
);
1768 __result
= vec_sr((__v8hu
)__A
, __rshift
);
1769 __result
= vec_sel((__v8hu
)__shmask
, __result
, __shmask
);
1771 return (__m128i
)__result
;
1774 extern __inline __m128i
1775 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1776 _mm_srl_epi32(__m128i __A
, __m128i __B
) {
1778 __vector __bool
int __shmask
;
1779 const __v4su __shmax
= {32, 32, 32, 32};
1782 #ifdef __LITTLE_ENDIAN__
1783 __rshift
= vec_splat((__v4su
)__B
, 0);
1785 __rshift
= vec_splat((__v4su
)__B
, 1);
1787 __shmask
= vec_cmplt(__rshift
, __shmax
);
1788 __result
= vec_sr((__v4su
)__A
, __rshift
);
1789 __result
= vec_sel((__v4su
)__shmask
, __result
, __shmask
);
1791 return (__m128i
)__result
;
1795 extern __inline __m128i
1796 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1797 _mm_srl_epi64(__m128i __A
, __m128i __B
) {
1799 __vector __bool
long long __shmask
;
1800 const __v2du __shmax
= {64, 64};
1803 __rshift
= vec_splat((__v2du
)__B
, 0);
1804 __shmask
= vec_cmplt(__rshift
, __shmax
);
1805 __result
= vec_sr((__v2du
)__A
, __rshift
);
1806 __result
= vec_sel((__v2du
)__shmask
, __result
, __shmask
);
1808 return (__m128i
)__result
;
1812 extern __inline __m128d
1813 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1814 _mm_and_pd(__m128d __A
, __m128d __B
) {
1815 return (vec_and((__v2df
)__A
, (__v2df
)__B
));
1818 extern __inline __m128d
1819 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1820 _mm_andnot_pd(__m128d __A
, __m128d __B
) {
1821 return (vec_andc((__v2df
)__B
, (__v2df
)__A
));
1824 extern __inline __m128d
1825 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1826 _mm_or_pd(__m128d __A
, __m128d __B
) {
1827 return (vec_or((__v2df
)__A
, (__v2df
)__B
));
1830 extern __inline __m128d
1831 __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1832 _mm_xor_pd(__m128d __A
, __m128d __B
) {
1833 return (vec_xor((__v2df
)__A
, (__v2df
)__B
));
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_and_si128(__m128i __A, __m128i __B) {
  return (__m128i)vec_and((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_andnot_si128(__m128i __A, __m128i __B) {
  return (__m128i)vec_andc((__v2di)__B, (__v2di)__A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_or_si128(__m128i __A, __m128i __B) {
  return (__m128i)vec_or((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_xor_si128(__m128i __A, __m128i __B) {
  return (__m128i)vec_xor((__v2di)__A, (__v2di)__B);
}
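
/* Illustrative usage sketch (the helper below is hypothetical, not part of
   the Intel API): a bitwise select, (__a & __mask) | (__b & ~__mask).
   Note the operand order of _mm_andnot_si128: the FIRST argument is the
   one that is complemented.  */
static __inline __m128i __example_select_si128(__m128i __mask, __m128i __a,
                                               __m128i __b) {
  return _mm_or_si128(_mm_and_si128(__mask, __a),
                      _mm_andnot_si128(__mask, __b));
}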
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpeq((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpeq((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpeq((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmplt((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmplt((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmplt((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpgt((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpgt((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpgt((__v4si)__A, (__v4si)__B);
}
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi16(__m128i const __A, int const __N) {
  return (unsigned short)((__v8hi)__A)[__N & 7];
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi16(__m128i const __A, int const __D, int const __N) {
  __v8hi __result = (__v8hi)__A;

  __result[(__N & 7)] = __D;

  return (__m128i)__result;
}
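
/* Illustrative usage sketch (hypothetical helper): read lane 3 of a vector
   of eight 16-bit elements and write an incremented value back into the
   same lane.  The extracted halfword is zero-extended to int, matching the
   SSE2 PEXTRW semantics.  */
static __inline __m128i __example_bump_lane3_epi16(__m128i __v) {
  int __lane3 = _mm_extract_epi16(__v, 3); /* lanes are numbered 0..7 */
  return _mm_insert_epi16(__v, __lane3 + 1, 3);
}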
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_max((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epu8(__m128i __A, __m128i __B) {
  return (__m128i)vec_max((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_min((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epu8(__m128i __A, __m128i __B) {
  return (__m128i)vec_min((__v16qu)__A, (__v16qu)__B);
}
#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum.  */

/* Return a mask created from the most significant bit of each 8-bit
   element in A.  */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movemask_epi8(__m128i __A) {
#ifdef _ARCH_PWR10
  return vec_extractm((__v16qu)__A);
#else
  __vector unsigned long long __result;
  static const __vector unsigned char __perm_mask = {
      0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
      0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00};

  __result = ((__vector unsigned long long)vec_vbpermq(
      (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));

#ifdef __LITTLE_ENDIAN__
  return __result[1];
#else
  return __result[0];
#endif
#endif /* !_ARCH_PWR10 */
}
#endif /* _ARCH_PWR8 */
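
/* Illustrative usage sketch (the helper below is hypothetical, not part of
   the Intel API): combine _mm_cmpeq_epi8 with _mm_movemask_epi8 to test two
   16-byte blocks for equality.  Equal bytes compare to 0xFF, so the packed
   sign-bit mask is 0xFFFF only when all 16 bytes match.  */
#ifdef _ARCH_PWR8
static __inline int __example_bytes_equal(__m128i __x, __m128i __y) {
  return _mm_movemask_epi8(_mm_cmpeq_epi8(__x, __y)) == 0xFFFF;
}
#endif /* _ARCH_PWR8 (example) */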
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhi_epu16(__m128i __A, __m128i __B) {
  __v4su __w0, __w1;
  __v16qu __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
      0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
#endif
  };

  __w0 = vec_vmuleuh((__v8hu)__A, (__v8hu)__B);
  __w1 = vec_vmulouh((__v8hu)__A, (__v8hu)__B);
  return (__m128i)vec_perm(__w0, __w1, __xform1);
}
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shufflehi_epi16(__m128i __A, const int __mask) {
  unsigned long __element_selector_98 = __mask & 0x03;
  unsigned long __element_selector_BA = (__mask >> 2) & 0x03;
  unsigned long __element_selector_DC = (__mask >> 4) & 0x03;
  unsigned long __element_selector_FE = (__mask >> 6) & 0x03;
  static const unsigned short __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#else
      0x0809, 0x0A0B, 0x0C0D, 0x0E0F
#endif
  };
  __v2du __pmask =
#ifdef __LITTLE_ENDIAN__
      {0x1716151413121110UL, 0UL};
#else
      {0x1011121314151617UL, 0UL};
#endif
  __m64_union __t;
  __v2du __a, __r;

  __t.as_short[0] = __permute_selectors[__element_selector_98];
  __t.as_short[1] = __permute_selectors[__element_selector_BA];
  __t.as_short[2] = __permute_selectors[__element_selector_DC];
  __t.as_short[3] = __permute_selectors[__element_selector_FE];
  __pmask[1] = __t.as_m64;
  __a = (__v2du)__A;
  __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
  return (__m128i)__r;
}
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shufflelo_epi16(__m128i __A, const int __mask) {
  unsigned long __element_selector_10 = __mask & 0x03;
  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned short __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
      0x0100, 0x0302, 0x0504, 0x0706
#else
      0x0001, 0x0203, 0x0405, 0x0607
#endif
  };
  __v2du __pmask =
#ifdef __LITTLE_ENDIAN__
      {0UL, 0x1f1e1d1c1b1a1918UL};
#else
      {0UL, 0x18191a1b1c1d1e1fUL};
#endif
  __m64_union __t;
  __v2du __a, __r;
  __t.as_short[0] = __permute_selectors[__element_selector_10];
  __t.as_short[1] = __permute_selectors[__element_selector_32];
  __t.as_short[2] = __permute_selectors[__element_selector_54];
  __t.as_short[3] = __permute_selectors[__element_selector_76];
  __pmask[0] = __t.as_m64;
  __a = (__v2du)__A;
  __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
  return (__m128i)__r;
}
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shuffle_epi32(__m128i __A, const int __mask) {
  unsigned long __element_selector_10 = __mask & 0x03;
  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned int __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
      0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#else
      0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
#endif
  };
  __v4su __t;

  __t[0] = __permute_selectors[__element_selector_10];
  __t[1] = __permute_selectors[__element_selector_32];
  __t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
  __t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
  return (__m128i)vec_perm((__v4si)__A, (__v4si)__A,
                           (__vector unsigned char)__t);
}
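
/* Illustrative usage sketch (hypothetical helper): broadcast element 0 of a
   vector of four 32-bit integers.  The 8-bit mask packs four 2-bit lane
   selectors, lowest result lane first, so 0x00 (_MM_SHUFFLE(0, 0, 0, 0))
   replicates element 0 into every lane.  */
static __inline __m128i __example_broadcast_lane0_epi32(__m128i __v) {
  return _mm_shuffle_epi32(__v, 0x00);
}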
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_maskmoveu_si128(__m128i __A, __m128i __B, char *__C) {
  __v2du __hibit = {0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
  __v16qu __mask, __tmp;
  __m128i_u *__p = (__m128i_u *)__C;

  __tmp = (__v16qu)_mm_loadu_si128(__p);
  __mask = (__v16qu)vec_cmpgt((__v16qu)__B, (__v16qu)__hibit);
  __tmp = vec_sel(__tmp, (__v16qu)__A, __mask);
  _mm_storeu_si128(__p, (__m128i)__tmp);
}
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_avg_epu8(__m128i __A, __m128i __B) {
  return (__m128i)vec_avg((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_avg_epu16(__m128i __A, __m128i __B) {
  return (__m128i)vec_avg((__v8hu)__A, (__v8hu)__B);
}
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sad_epu8(__m128i __A, __m128i __B) {
  __v16qu __a, __b;
  __v16qu __vabsdiff;
  __v4si __vsum;
  const __v4su __zero = {0, 0, 0, 0};
  __v4si __result;

  __a = (__v16qu)__A;
  __b = (__v16qu)__B;
#ifndef _ARCH_PWR9
  __v16qu __vmin = vec_min(__a, __b);
  __v16qu __vmax = vec_max(__a, __b);
  __vabsdiff = vec_sub(__vmax, __vmin);
#else
  __vabsdiff = vec_absd(__a, __b);
#endif
  /* Sum four groups of bytes into integers.  */
  __vsum = (__vector signed int)vec_sum4s(__vabsdiff, __zero);
#ifdef __LITTLE_ENDIAN__
  /* Sum across four integers with two integer results.  */
  __asm__("vsum2sws %0,%1,%2" : "=v"(__result) : "v"(__vsum), "v"(__zero));
  /* Note: vec_sum2s could be used here, but on little-endian, vector
     shifts are added that are not needed for this use-case.
     A vector shift to correctly position the 32-bit integer results
     (currently at [0] and [2]) to [1] and [3] would then need to be
     swapped back again since the desired results are two 64-bit
     integers ([1]|[0] and [3]|[2]).  Thus, no shift is performed.  */
#else
  /* Sum across four integers with two integer results.  */
  __result = vec_sum2s(__vsum, (__vector signed int)__zero);
  /* Rotate the sums into the correct position.  */
  __result = vec_sld(__result, __result, 6);
#endif
  return (__m128i)__result;
}
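
/* Illustrative usage sketch (hypothetical helper): total sum of absolute
   byte differences over a 16-byte block.  _mm_sad_epu8 leaves one partial
   sum in each 64-bit lane, so the two lanes are added to obtain the full
   SAD, a common building block in motion-estimation kernels.  */
static __inline unsigned int __example_block_sad(__m128i __x, __m128i __y) {
  __v2du __sums = (__v2du)_mm_sad_epu8(__x, __y);
  return (unsigned int)(__sums[0] + __sums[1]);
}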
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_si32(int *__A, int __B) {
  /* Use the data cache block touch for store transient.  */
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  *__A = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_si64(long long int *__A, long long int __B) {
  /* Use the data cache block touch for store transient.  */
  __asm__(" dcbtstt 0,%0" : : "b"(__A) : "memory");
  *__A = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_si128(__m128i *__A, __m128i __B) {
  /* Use the data cache block touch for store transient.  */
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  *__A = __B;
}
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_pd(double *__A, __m128d __B) {
  /* Use the data cache block touch for store transient.  */
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  *(__m128d *)__A = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_clflush(void const *__A) {
  /* Use the data cache block flush.  */
  __asm__("dcbf 0,%0" : : "b"(__A) : "memory");
}
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_lfence(void) {
  /* Use light weight sync for load to load ordering.  */
  __atomic_thread_fence(__ATOMIC_RELEASE);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mfence(void) {
  /* Use heavy weight sync for any to any ordering.  */
  __atomic_thread_fence(__ATOMIC_SEQ_CST);
}
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi32_si128(int __A) {
  return _mm_set_epi32(0, 0, 0, __A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_si128(long long __A) {
  return __extension__(__m128i)(__v2di){__A, 0LL};
}

/* Microsoft intrinsic.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64x_si128(long long __A) {
  return __extension__(__m128i)(__v2di){__A, 0LL};
}
/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castpd_ps(__m128d __A) {
  return (__m128)__A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castpd_si128(__m128d __A) {
  return (__m128i)__A;
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castps_pd(__m128 __A) {
  return (__m128d)__A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castps_si128(__m128 __A) {
  return (__m128i)__A;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castsi128_ps(__m128i __A) {
  return (__m128)__A;
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castsi128_pd(__m128i __A) {
  return (__m128d)__A;
}
#else
#include_next <emmintrin.h>
#endif /* defined(__ppc64__) &&
        * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* EMMINTRIN_H_ */