/*===---- emmintrin.h - Implementation of SSE2 intrinsics on PowerPC -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */
#ifndef NO_WARN_X86_INTRINSICS
/* This header file is to help porting code using Intel intrinsics
   explicitly from x86_64 to powerpc64/powerpc64le.

   Since the X86 SSE2 intrinsics mainly handle the __m128i and __m128d
   types, the PowerPC VMX/VSX ISA is a good match for these vector SIMD
   operations. However, scalar float operations in vector (XMM) registers
   require the POWER8 VSX ISA (2.07) level. There are differences in the
   data format and placement of float scalars in the vector register, which
   require extra steps to match SSE2 scalar float semantics on POWER.

   Note that the X86_64 MXCSR and the PowerISA FPSCR/VSCR registers differ
   significantly. It is recommended to use portable <fenv.h> instead of
   accessing the MXCSR directly.

   Most SSE2 scalar float intrinsic operations can be performed more
   efficiently as C language float scalar operations or optimized to
   use vector SIMD operations. We recommend this for new applications.
*/
#error                                                                         \
    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
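
/* An illustrative porting sketch (not part of this header; the function name
   and build flags below are assumptions for the example): existing x86 SSE2
   code can usually be rebuilt for powerpc64le simply by acknowledging the
   note above on the command line, e.g.

     clang -O2 -mcpu=power8 -DNO_WARN_X86_INTRINSICS demo.c

   while the source itself stays unchanged:

     #include <emmintrin.h>

     double sum2(const double *p) {   // p assumed 16-byte aligned
       __m128d v = _mm_load_pd(p);                        // {p[0], p[1]}
       __m128d s = _mm_add_sd(v, _mm_unpackhi_pd(v, v));  // low = p[0] + p[1]
       return _mm_cvtsd_f64(s);
     }
*/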
#ifndef EMMINTRIN_H_
#define EMMINTRIN_H_

#if defined(__powerpc64__) &&                                                  \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>

/* We need definitions from the SSE header files. */
#include <xmmintrin.h>

/* SSE2 */
typedef __vector double __v2df;
typedef __vector float __v4f;
typedef __vector long long __v2di;
typedef __vector unsigned long long __v2du;
typedef __vector int __v4si;
typedef __vector unsigned int __v4su;
typedef __vector short __v8hi;
typedef __vector unsigned short __v8hu;
typedef __vector signed char __v16qi;
typedef __vector unsigned char __v16qu;
59 /* The Intel API is flexible enough that we must allow aliasing with other
60 vector types, and their scalar components. */
61 typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
62 typedef double __m128d __attribute__((__vector_size__(16), __may_alias__));
64 /* Unaligned version of the same types. */
65 typedef long long __m128i_u
66 __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
67 typedef double __m128d_u
68 __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
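
/* A brief aliasing sketch (illustrative only): the __may_alias__ attribute is
   what lets the implementations below reinterpret one vector type as another
   without violating strict-aliasing rules, e.g.

     __m128i x = _mm_set1_epi16(7);
     short lane0 = ((__v8hi)x)[0];   // per-element access through a cast
*/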
70 /* Define two value permute mask. */
71 #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
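
/* For example (illustrative only): _MM_SHUFFLE2(1, 0) == 2, which with
   _mm_shuffle_pd selects element 0 of the first operand and element 1 of
   the second operand:

     __m128d a = _mm_set_pd(11.0, 10.0);   // a = {10.0, 11.0}
     __m128d b = _mm_set_pd(21.0, 20.0);   // b = {20.0, 21.0}
     __m128d r = _mm_shuffle_pd(a, b, _MM_SHUFFLE2(1, 0));
     // r = {a[0], b[1]} = {10.0, 21.0}
*/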
73 /* Create a vector with element 0 as F and the rest zero. */
74 extern __inline __m128d
75 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
76 _mm_set_sd(double __F) {
77 return __extension__(__m128d){__F, 0.0};
80 /* Create a vector with both elements equal to F. */
81 extern __inline __m128d
82 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
83 _mm_set1_pd(double __F) {
84 return __extension__(__m128d){__F, __F};
87 extern __inline __m128d
88 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
89 _mm_set_pd1(double __F) {
90 return _mm_set1_pd(__F);
93 /* Create a vector with the lower value X and upper value W. */
94 extern __inline __m128d
95 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
96 _mm_set_pd(double __W, double __X) {
97 return __extension__(__m128d){__X, __W};
100 /* Create a vector with the lower value W and upper value X. */
101 extern __inline __m128d
102 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
103 _mm_setr_pd(double __W, double __X) {
104 return __extension__(__m128d){__W, __X};
107 /* Create an undefined vector. */
108 extern __inline __m128d
109 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
110 _mm_undefined_pd(void) {
111 __m128d __Y = __Y;
112 return __Y;
115 /* Create a vector of zeros. */
116 extern __inline __m128d
117 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
118 _mm_setzero_pd(void) {
119 return (__m128d)vec_splats(0);
122 /* Sets the low DPFP value of A from the low value of B. */
123 extern __inline __m128d
124 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
125 _mm_move_sd(__m128d __A, __m128d __B) {
126 __v2df __result = (__v2df)__A;
127 __result[0] = ((__v2df)__B)[0];
128 return (__m128d)__result;
131 /* Load two DPFP values from P. The address must be 16-byte aligned. */
132 extern __inline __m128d
133 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
134 _mm_load_pd(double const *__P) {
135 return ((__m128d)vec_ld(0, (__v16qu *)__P));
138 /* Load two DPFP values from P. The address need not be 16-byte aligned. */
139 extern __inline __m128d
140 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
141 _mm_loadu_pd(double const *__P) {
142 return (vec_vsx_ld(0, __P));
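
/* Usage sketch (illustrative only): the aligned form may only be used when
   the pointer is 16-byte aligned; the unaligned form is always safe:

     double buf[3] = {1.0, 2.0, 3.0};
     __m128d lo = _mm_loadu_pd(&buf[1]);   // unaligned load of {2.0, 3.0}
*/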
145 /* Create a vector with all two elements equal to *P. */
146 extern __inline __m128d
147 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
148 _mm_load1_pd(double const *__P) {
149 return (vec_splats(*__P));
152 /* Create a vector with element 0 as *P and the rest zero. */
153 extern __inline __m128d
154 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
155 _mm_load_sd(double const *__P) {
156 return _mm_set_sd(*__P);
159 extern __inline __m128d
160 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
161 _mm_load_pd1(double const *__P) {
162 return _mm_load1_pd(__P);
165 /* Load two DPFP values in reverse order. The address must be aligned. */
166 extern __inline __m128d
167 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
168 _mm_loadr_pd(double const *__P) {
169 __v2df __tmp = _mm_load_pd(__P);
170 return (__m128d)vec_xxpermdi(__tmp, __tmp, 2);
173 /* Store two DPFP values. The address must be 16-byte aligned. */
174 extern __inline void
175 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
176 _mm_store_pd(double *__P, __m128d __A) {
177 vec_st((__v16qu)__A, 0, (__v16qu *)__P);
180 /* Store two DPFP values. The address need not be 16-byte aligned. */
181 extern __inline void
182 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
183 _mm_storeu_pd(double *__P, __m128d __A) {
184 *(__m128d_u *)__P = __A;
187 /* Stores the lower DPFP value. */
188 extern __inline void
189 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
190 _mm_store_sd(double *__P, __m128d __A) {
191 *__P = ((__v2df)__A)[0];
194 extern __inline double
195 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
196 _mm_cvtsd_f64(__m128d __A) {
197 return ((__v2df)__A)[0];
200 extern __inline void
201 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
202 _mm_storel_pd(double *__P, __m128d __A) {
203 _mm_store_sd(__P, __A);
206 /* Stores the upper DPFP value. */
207 extern __inline void
208 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
209 _mm_storeh_pd(double *__P, __m128d __A) {
210 *__P = ((__v2df)__A)[1];
212 /* Store the lower DPFP value across two words.
213 The address must be 16-byte aligned. */
214 extern __inline void
215 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
216 _mm_store1_pd(double *__P, __m128d __A) {
217 _mm_store_pd(__P, vec_splat(__A, 0));
220 extern __inline void
221 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
222 _mm_store_pd1(double *__P, __m128d __A) {
223 _mm_store1_pd(__P, __A);
226 /* Store two DPFP values in reverse order. The address must be aligned. */
227 extern __inline void
228 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
229 _mm_storer_pd(double *__P, __m128d __A) {
230 _mm_store_pd(__P, vec_xxpermdi(__A, __A, 2));
233 /* Intel intrinsic. */
234 extern __inline long long
235 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
236 _mm_cvtsi128_si64(__m128i __A) {
237 return ((__v2di)__A)[0];
240 /* Microsoft intrinsic. */
241 extern __inline long long
242 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
243 _mm_cvtsi128_si64x(__m128i __A) {
244 return ((__v2di)__A)[0];
247 extern __inline __m128d
248 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
249 _mm_add_pd(__m128d __A, __m128d __B) {
250 return (__m128d)((__v2df)__A + (__v2df)__B);
253 /* Add the lower double-precision (64-bit) floating-point element in
254 a and b, store the result in the lower element of dst, and copy
255 the upper element from a to the upper element of dst. */
256 extern __inline __m128d
257 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
258 _mm_add_sd(__m128d __A, __m128d __B) {
259 __A[0] = __A[0] + __B[0];
260 return (__A);
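
/* For instance (illustrative only):

     __m128d a = _mm_set_pd(3.0, 1.0);   // a = {1.0, 3.0}
     __m128d b = _mm_set_pd(7.0, 2.0);   // b = {2.0, 7.0}
     __m128d r = _mm_add_sd(a, b);       // r = {1.0 + 2.0, 3.0} = {3.0, 3.0}
*/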
263 extern __inline __m128d
264 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
265 _mm_sub_pd(__m128d __A, __m128d __B) {
266 return (__m128d)((__v2df)__A - (__v2df)__B);
269 extern __inline __m128d
270 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
271 _mm_sub_sd(__m128d __A, __m128d __B) {
272 __A[0] = __A[0] - __B[0];
273 return (__A);
276 extern __inline __m128d
277 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
278 _mm_mul_pd(__m128d __A, __m128d __B) {
279 return (__m128d)((__v2df)__A * (__v2df)__B);
282 extern __inline __m128d
283 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
284 _mm_mul_sd(__m128d __A, __m128d __B) {
285 __A[0] = __A[0] * __B[0];
286 return (__A);
289 extern __inline __m128d
290 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
291 _mm_div_pd(__m128d __A, __m128d __B) {
292 return (__m128d)((__v2df)__A / (__v2df)__B);
295 extern __inline __m128d
296 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
297 _mm_div_sd(__m128d __A, __m128d __B) {
298 __A[0] = __A[0] / __B[0];
299 return (__A);
302 extern __inline __m128d
303 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
304 _mm_sqrt_pd(__m128d __A) {
305 return (vec_sqrt(__A));
308 /* Return pair {sqrt (B[0]), A[1]}. */
309 extern __inline __m128d
310 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
311 _mm_sqrt_sd(__m128d __A, __m128d __B) {
312 __v2df __c;
313 __c = vec_sqrt((__v2df)_mm_set1_pd(__B[0]));
314 return (__m128d)_mm_setr_pd(__c[0], __A[1]);
317 extern __inline __m128d
318 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
319 _mm_min_pd(__m128d __A, __m128d __B) {
320 return (vec_min(__A, __B));
323 extern __inline __m128d
324 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
325 _mm_min_sd(__m128d __A, __m128d __B) {
326 __v2df __a, __b, __c;
327 __a = vec_splats(__A[0]);
328 __b = vec_splats(__B[0]);
329 __c = vec_min(__a, __b);
330 return (__m128d)_mm_setr_pd(__c[0], __A[1]);
333 extern __inline __m128d
334 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
335 _mm_max_pd(__m128d __A, __m128d __B) {
336 return (vec_max(__A, __B));
339 extern __inline __m128d
340 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
341 _mm_max_sd(__m128d __A, __m128d __B) {
342 __v2df __a, __b, __c;
343 __a = vec_splats(__A[0]);
344 __b = vec_splats(__B[0]);
345 __c = vec_max(__a, __b);
346 return (__m128d)_mm_setr_pd(__c[0], __A[1]);
349 extern __inline __m128d
350 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
351 _mm_cmpeq_pd(__m128d __A, __m128d __B) {
352 return ((__m128d)vec_cmpeq((__v2df)__A, (__v2df)__B));
355 extern __inline __m128d
356 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
357 _mm_cmplt_pd(__m128d __A, __m128d __B) {
358 return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
361 extern __inline __m128d
362 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
363 _mm_cmple_pd(__m128d __A, __m128d __B) {
364 return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
367 extern __inline __m128d
368 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
369 _mm_cmpgt_pd(__m128d __A, __m128d __B) {
370 return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
373 extern __inline __m128d
374 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
375 _mm_cmpge_pd(__m128d __A, __m128d __B) {
376 return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
379 extern __inline __m128d
380 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
381 _mm_cmpneq_pd(__m128d __A, __m128d __B) {
382 __v2df __temp = (__v2df)vec_cmpeq((__v2df)__A, (__v2df)__B);
383 return ((__m128d)vec_nor(__temp, __temp));
386 extern __inline __m128d
387 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
388 _mm_cmpnlt_pd(__m128d __A, __m128d __B) {
389 return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
392 extern __inline __m128d
393 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
394 _mm_cmpnle_pd(__m128d __A, __m128d __B) {
395 return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
398 extern __inline __m128d
399 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
400 _mm_cmpngt_pd(__m128d __A, __m128d __B) {
401 return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
404 extern __inline __m128d
405 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
406 _mm_cmpnge_pd(__m128d __A, __m128d __B) {
407 return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
410 extern __inline __m128d
411 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
412 _mm_cmpord_pd(__m128d __A, __m128d __B) {
413 __v2du __c, __d;
414 /* Compare against self will return false (0's) if NAN. */
415 __c = (__v2du)vec_cmpeq(__A, __A);
416 __d = (__v2du)vec_cmpeq(__B, __B);
417 /* A != NAN and B != NAN. */
418 return ((__m128d)vec_and(__c, __d));
421 extern __inline __m128d
422 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
423 _mm_cmpunord_pd(__m128d __A, __m128d __B) {
424 #if _ARCH_PWR8
425 __v2du __c, __d;
426 /* Compare against self will return false (0's) if NAN. */
427 __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
428 __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
  /* A == NAN OR B == NAN converts to:
     NOT(A != NAN) OR NOT(B != NAN). */
431 __c = vec_nor(__c, __c);
432 return ((__m128d)vec_orc(__c, __d));
433 #else
434 __v2du __c, __d;
435 /* Compare against self will return false (0's) if NAN. */
436 __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
437 __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
  /* Invert so that true ('1's) marks a NAN input. */
439 __c = vec_nor(__c, __c);
440 __d = vec_nor(__d, __d);
441 return ((__m128d)vec_or(__c, __d));
442 #endif
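
/* A worked case (illustrative only):

     __m128d a = _mm_set_pd(1.0, __builtin_nan(""));   // a = {NaN, 1.0}
     __m128d r = _mm_cmpunord_pd(a, _mm_set1_pd(2.0));
     // element 0 of r is all-ones (a[0] is unordered with 2.0);
     // element 1 is all-zeros (1.0 and 2.0 are ordered).
*/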
445 extern __inline __m128d
446 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
447 _mm_cmpeq_sd(__m128d __A, __m128d __B) {
448 __v2df __a, __b, __c;
  /* PowerISA VSX does not allow partial (for just the lower double)
     results. So to ensure we don't generate spurious exceptions
     (from the upper double values) we splat the lower double
     before we do the operation. */
453 __a = vec_splats(__A[0]);
454 __b = vec_splats(__B[0]);
455 __c = (__v2df)vec_cmpeq(__a, __b);
456 /* Then we merge the lower double result with the original upper
457 double from __A. */
458 return (__m128d)_mm_setr_pd(__c[0], __A[1]);
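
/* The splats matter when an upper element holds a value that would signal.
   An illustrative sketch:

     __m128d a = _mm_set_pd(__builtin_nans(""), 4.0);   // upper lane is sNaN
     __m128d b = _mm_set_pd(8.0, 4.0);
     __m128d r = _mm_cmpeq_sd(a, b);
     // r = {all-ones, sNaN}: the upper lanes of a and b never enter the
     // compare, so no spurious invalid-operation exception is raised, and
     // the upper element of the result is copied from a.
*/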
461 extern __inline __m128d
462 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
463 _mm_cmplt_sd(__m128d __A, __m128d __B) {
464 __v2df __a, __b, __c;
465 __a = vec_splats(__A[0]);
466 __b = vec_splats(__B[0]);
467 __c = (__v2df)vec_cmplt(__a, __b);
468 return (__m128d)_mm_setr_pd(__c[0], __A[1]);
471 extern __inline __m128d
472 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
473 _mm_cmple_sd(__m128d __A, __m128d __B) {
474 __v2df __a, __b, __c;
475 __a = vec_splats(__A[0]);
476 __b = vec_splats(__B[0]);
477 __c = (__v2df)vec_cmple(__a, __b);
478 return (__m128d)_mm_setr_pd(__c[0], __A[1]);
481 extern __inline __m128d
482 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
483 _mm_cmpgt_sd(__m128d __A, __m128d __B) {
484 __v2df __a, __b, __c;
485 __a = vec_splats(__A[0]);
486 __b = vec_splats(__B[0]);
487 __c = (__v2df)vec_cmpgt(__a, __b);
488 return (__m128d)_mm_setr_pd(__c[0], __A[1]);
491 extern __inline __m128d
492 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
493 _mm_cmpge_sd(__m128d __A, __m128d __B) {
494 __v2df __a, __b, __c;
495 __a = vec_splats(__A[0]);
496 __b = vec_splats(__B[0]);
497 __c = (__v2df)vec_cmpge(__a, __b);
498 return (__m128d)_mm_setr_pd(__c[0], __A[1]);
501 extern __inline __m128d
502 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
503 _mm_cmpneq_sd(__m128d __A, __m128d __B) {
504 __v2df __a, __b, __c;
505 __a = vec_splats(__A[0]);
506 __b = vec_splats(__B[0]);
507 __c = (__v2df)vec_cmpeq(__a, __b);
508 __c = vec_nor(__c, __c);
509 return (__m128d)_mm_setr_pd(__c[0], __A[1]);
512 extern __inline __m128d
513 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
514 _mm_cmpnlt_sd(__m128d __A, __m128d __B) {
515 __v2df __a, __b, __c;
516 __a = vec_splats(__A[0]);
517 __b = vec_splats(__B[0]);
518 /* Not less than is just greater than or equal. */
519 __c = (__v2df)vec_cmpge(__a, __b);
520 return (__m128d)_mm_setr_pd(__c[0], __A[1]);
523 extern __inline __m128d
524 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
525 _mm_cmpnle_sd(__m128d __A, __m128d __B) {
526 __v2df __a, __b, __c;
527 __a = vec_splats(__A[0]);
528 __b = vec_splats(__B[0]);
  /* Not less than or equal is just greater than. */
  __c = (__v2df)vec_cmpgt(__a, __b);
531 return (__m128d)_mm_setr_pd(__c[0], __A[1]);
534 extern __inline __m128d
535 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
536 _mm_cmpngt_sd(__m128d __A, __m128d __B) {
537 __v2df __a, __b, __c;
538 __a = vec_splats(__A[0]);
539 __b = vec_splats(__B[0]);
540 /* Not greater than is just less than or equal. */
541 __c = (__v2df)vec_cmple(__a, __b);
542 return (__m128d)_mm_setr_pd(__c[0], __A[1]);
545 extern __inline __m128d
546 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
547 _mm_cmpnge_sd(__m128d __A, __m128d __B) {
548 __v2df __a, __b, __c;
549 __a = vec_splats(__A[0]);
550 __b = vec_splats(__B[0]);
551 /* Not greater than or equal is just less than. */
552 __c = (__v2df)vec_cmplt(__a, __b);
553 return (__m128d)_mm_setr_pd(__c[0], __A[1]);
556 extern __inline __m128d
557 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
558 _mm_cmpord_sd(__m128d __A, __m128d __B) {
559 __v2df __r;
560 __r = (__v2df)_mm_cmpord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
561 return (__m128d)_mm_setr_pd(__r[0], ((__v2df)__A)[1]);
564 extern __inline __m128d
565 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
566 _mm_cmpunord_sd(__m128d __A, __m128d __B) {
567 __v2df __r;
568 __r = _mm_cmpunord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
569 return (__m128d)_mm_setr_pd(__r[0], __A[1]);
/* FIXME
   The _mm_comi??_sd and _mm_ucomi??_sd implementations below are
   exactly the same because GCC for PowerPC only generates unordered
   compares (scalar and vector).
   Technically _mm_comieq_sd et al. should be using the ordered
   compare and signal for QNaNs. The _mm_ucomieq_sd et al. should
   be OK. */
579 extern __inline int
580 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
581 _mm_comieq_sd(__m128d __A, __m128d __B) {
582 return (__A[0] == __B[0]);
585 extern __inline int
586 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
587 _mm_comilt_sd(__m128d __A, __m128d __B) {
588 return (__A[0] < __B[0]);
591 extern __inline int
592 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
593 _mm_comile_sd(__m128d __A, __m128d __B) {
594 return (__A[0] <= __B[0]);
597 extern __inline int
598 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
599 _mm_comigt_sd(__m128d __A, __m128d __B) {
600 return (__A[0] > __B[0]);
603 extern __inline int
604 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
605 _mm_comige_sd(__m128d __A, __m128d __B) {
606 return (__A[0] >= __B[0]);
609 extern __inline int
610 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
611 _mm_comineq_sd(__m128d __A, __m128d __B) {
612 return (__A[0] != __B[0]);
615 extern __inline int
616 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
617 _mm_ucomieq_sd(__m128d __A, __m128d __B) {
618 return (__A[0] == __B[0]);
621 extern __inline int
622 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
623 _mm_ucomilt_sd(__m128d __A, __m128d __B) {
624 return (__A[0] < __B[0]);
627 extern __inline int
628 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
629 _mm_ucomile_sd(__m128d __A, __m128d __B) {
630 return (__A[0] <= __B[0]);
633 extern __inline int
634 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
635 _mm_ucomigt_sd(__m128d __A, __m128d __B) {
636 return (__A[0] > __B[0]);
639 extern __inline int
640 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
641 _mm_ucomige_sd(__m128d __A, __m128d __B) {
642 return (__A[0] >= __B[0]);
645 extern __inline int
646 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
647 _mm_ucomineq_sd(__m128d __A, __m128d __B) {
648 return (__A[0] != __B[0]);
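
/* Illustrative note: with the C scalar comparisons above, a NaN operand makes
   every relation compare false except "not equal":

     __m128d n = _mm_set_sd(__builtin_nan(""));
     int lt = _mm_comilt_sd(n, _mm_set_sd(1.0));    // 0: unordered
     int ne = _mm_comineq_sd(n, _mm_set_sd(1.0));   // 1: unordered implies !=
*/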
651 /* Create a vector of Qi, where i is the element number. */
652 extern __inline __m128i
653 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
654 _mm_set_epi64x(long long __q1, long long __q0) {
655 return __extension__(__m128i)(__v2di){__q0, __q1};
658 extern __inline __m128i
659 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
660 _mm_set_epi64(__m64 __q1, __m64 __q0) {
661 return _mm_set_epi64x((long long)__q1, (long long)__q0);
664 extern __inline __m128i
665 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
666 _mm_set_epi32(int __q3, int __q2, int __q1, int __q0) {
667 return __extension__(__m128i)(__v4si){__q0, __q1, __q2, __q3};
670 extern __inline __m128i
671 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
672 _mm_set_epi16(short __q7, short __q6, short __q5, short __q4, short __q3,
673 short __q2, short __q1, short __q0) {
674 return __extension__(__m128i)(__v8hi){__q0, __q1, __q2, __q3,
675 __q4, __q5, __q6, __q7};
678 extern __inline __m128i
679 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
680 _mm_set_epi8(char __q15, char __q14, char __q13, char __q12, char __q11,
681 char __q10, char __q09, char __q08, char __q07, char __q06,
682 char __q05, char __q04, char __q03, char __q02, char __q01,
683 char __q00) {
684 return __extension__(__m128i)(__v16qi){
685 __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
686 __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15};
689 /* Set all of the elements of the vector to A. */
690 extern __inline __m128i
691 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
692 _mm_set1_epi64x(long long __A) {
693 return _mm_set_epi64x(__A, __A);
696 extern __inline __m128i
697 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
698 _mm_set1_epi64(__m64 __A) {
699 return _mm_set_epi64(__A, __A);
702 extern __inline __m128i
703 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
704 _mm_set1_epi32(int __A) {
705 return _mm_set_epi32(__A, __A, __A, __A);
708 extern __inline __m128i
709 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
710 _mm_set1_epi16(short __A) {
711 return _mm_set_epi16(__A, __A, __A, __A, __A, __A, __A, __A);
714 extern __inline __m128i
715 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
716 _mm_set1_epi8(char __A) {
717 return _mm_set_epi8(__A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A,
718 __A, __A, __A, __A, __A);
721 /* Create a vector of Qi, where i is the element number.
722 The parameter order is reversed from the _mm_set_epi* functions. */
723 extern __inline __m128i
724 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
725 _mm_setr_epi64(__m64 __q0, __m64 __q1) {
726 return _mm_set_epi64(__q1, __q0);
729 extern __inline __m128i
730 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
731 _mm_setr_epi32(int __q0, int __q1, int __q2, int __q3) {
732 return _mm_set_epi32(__q3, __q2, __q1, __q0);
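
/* Illustrative sketch: the setr forms simply reverse the argument order of
   the corresponding set forms, so both of these produce elements {1, 2, 3, 4}
   in element order 0..3:

     __m128i a = _mm_setr_epi32(1, 2, 3, 4);
     __m128i b = _mm_set_epi32(4, 3, 2, 1);
*/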
735 extern __inline __m128i
736 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
737 _mm_setr_epi16(short __q0, short __q1, short __q2, short __q3, short __q4,
738 short __q5, short __q6, short __q7) {
739 return _mm_set_epi16(__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
742 extern __inline __m128i
743 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
744 _mm_setr_epi8(char __q00, char __q01, char __q02, char __q03, char __q04,
745 char __q05, char __q06, char __q07, char __q08, char __q09,
746 char __q10, char __q11, char __q12, char __q13, char __q14,
747 char __q15) {
748 return _mm_set_epi8(__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
749 __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
/* Load 128 bits of integer data. The address must be 16-byte aligned. */
753 extern __inline __m128i
754 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
755 _mm_load_si128(__m128i const *__P) {
756 return *__P;
759 extern __inline __m128i
760 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
761 _mm_loadu_si128(__m128i_u const *__P) {
762 return (__m128i)(vec_vsx_ld(0, (signed int const *)__P));
765 extern __inline __m128i
766 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
767 _mm_loadl_epi64(__m128i_u const *__P) {
768 return _mm_set_epi64((__m64)0LL, *(__m64 *)__P);
771 extern __inline void
772 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
773 _mm_store_si128(__m128i *__P, __m128i __B) {
774 vec_st((__v16qu)__B, 0, (__v16qu *)__P);
777 extern __inline void
778 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
779 _mm_storeu_si128(__m128i_u *__P, __m128i __B) {
780 *__P = __B;
783 extern __inline void
784 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
785 _mm_storel_epi64(__m128i_u *__P, __m128i __B) {
786 *(long long *)__P = ((__v2di)__B)[0];
789 extern __inline __m64
790 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
791 _mm_movepi64_pi64(__m128i_u __B) {
792 return (__m64)((__v2di)__B)[0];
795 extern __inline __m128i
796 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
797 _mm_movpi64_epi64(__m64 __A) {
798 return _mm_set_epi64((__m64)0LL, __A);
801 extern __inline __m128i
802 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
803 _mm_move_epi64(__m128i __A) {
804 return _mm_set_epi64((__m64)0LL, (__m64)__A[0]);
807 /* Create an undefined vector. */
808 extern __inline __m128i
809 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
810 _mm_undefined_si128(void) {
811 __m128i __Y = __Y;
812 return __Y;
815 /* Create a vector of zeros. */
816 extern __inline __m128i
817 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
818 _mm_setzero_si128(void) {
819 return __extension__(__m128i)(__v4si){0, 0, 0, 0};
822 #ifdef _ARCH_PWR8
823 extern __inline __m128d
824 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
825 _mm_cvtepi32_pd(__m128i __A) {
826 __v2di __val;
  /* For LE, the Vector Unpack Low Signed Word instruction is what we need;
     it is generated from vec_unpackh here. */
829 __val = (__v2di)vec_unpackh((__v4si)__A);
831 return (__m128d)vec_ctf(__val, 0);
833 #endif
835 extern __inline __m128
836 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
837 _mm_cvtepi32_ps(__m128i __A) {
838 return ((__m128)vec_ctf((__v4si)__A, 0));
841 extern __inline __m128i
842 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
843 _mm_cvtpd_epi32(__m128d __A) {
844 __v2df __rounded = vec_rint(__A);
845 __v4si __result, __temp;
846 const __v4si __vzero = {0, 0, 0, 0};
848 /* VSX Vector truncate Double-Precision to integer and Convert to
849 Signed Integer Word format with Saturate. */
850 __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__rounded) :);
852 #ifdef _ARCH_PWR8
853 #ifdef __LITTLE_ENDIAN__
854 __temp = vec_mergeo(__temp, __temp);
855 #else
856 __temp = vec_mergee(__temp, __temp);
857 #endif
858 __result = (__v4si)vec_vpkudum((__vector long long)__temp,
859 (__vector long long)__vzero);
860 #else
862 const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
863 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
864 __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
866 #endif
867 return (__m128i)__result;
870 extern __inline __m64
871 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
872 _mm_cvtpd_pi32(__m128d __A) {
873 __m128i __result = _mm_cvtpd_epi32(__A);
875 return (__m64)__result[0];
878 extern __inline __m128
879 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
880 _mm_cvtpd_ps(__m128d __A) {
881 __v4sf __result;
882 __v4si __temp;
883 const __v4si __vzero = {0, 0, 0, 0};
885 __asm__("xvcvdpsp %x0,%x1" : "=wa"(__temp) : "wa"(__A) :);
887 #ifdef _ARCH_PWR8
888 #ifdef __LITTLE_ENDIAN__
889 __temp = vec_mergeo(__temp, __temp);
890 #else
891 __temp = vec_mergee(__temp, __temp);
892 #endif
893 __result = (__v4sf)vec_vpkudum((__vector long long)__temp,
894 (__vector long long)__vzero);
895 #else
897 const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
898 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
899 __result = (__v4sf)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
901 #endif
902 return ((__m128)__result);
905 extern __inline __m128i
906 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
907 _mm_cvttpd_epi32(__m128d __A) {
908 __v4si __result;
909 __v4si __temp;
910 const __v4si __vzero = {0, 0, 0, 0};
912 /* VSX Vector truncate Double-Precision to integer and Convert to
913 Signed Integer Word format with Saturate. */
914 __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__A) :);
916 #ifdef _ARCH_PWR8
917 #ifdef __LITTLE_ENDIAN__
918 __temp = vec_mergeo(__temp, __temp);
919 #else
920 __temp = vec_mergee(__temp, __temp);
921 #endif
922 __result = (__v4si)vec_vpkudum((__vector long long)__temp,
923 (__vector long long)__vzero);
924 #else
926 const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
927 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
928 __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
930 #endif
932 return ((__m128i)__result);
935 extern __inline __m64
936 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
937 _mm_cvttpd_pi32(__m128d __A) {
938 __m128i __result = _mm_cvttpd_epi32(__A);
940 return (__m64)__result[0];
943 extern __inline int
944 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
945 _mm_cvtsi128_si32(__m128i __A) {
946 return ((__v4si)__A)[0];
949 #ifdef _ARCH_PWR8
950 extern __inline __m128d
951 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
952 _mm_cvtpi32_pd(__m64 __A) {
953 __v4si __temp;
954 __v2di __tmp2;
955 __v4f __result;
957 __temp = (__v4si)vec_splats(__A);
958 __tmp2 = (__v2di)vec_unpackl(__temp);
959 __result = vec_ctf((__vector signed long long)__tmp2, 0);
960 return (__m128d)__result;
962 #endif
964 extern __inline __m128i
965 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
966 _mm_cvtps_epi32(__m128 __A) {
967 __v4sf __rounded;
968 __v4si __result;
970 __rounded = vec_rint((__v4sf)__A);
971 __result = vec_cts(__rounded, 0);
972 return (__m128i)__result;
975 extern __inline __m128i
976 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
977 _mm_cvttps_epi32(__m128 __A) {
978 __v4si __result;
980 __result = vec_cts((__v4sf)__A, 0);
981 return (__m128i)__result;
984 extern __inline __m128d
985 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
986 _mm_cvtps_pd(__m128 __A) {
987 /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */
988 #ifdef vec_doubleh
989 return (__m128d)vec_doubleh((__v4sf)__A);
990 #else
  /* Otherwise the compiler is not current, so we need to generate the
     equivalent code. */
993 __v4sf __a = (__v4sf)__A;
994 __v4sf __temp;
995 __v2df __result;
996 #ifdef __LITTLE_ENDIAN__
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[1], [3]}, so we use two
     shift left double vector word immediates to get the elements
     lined up. */
1001 __temp = __builtin_vsx_xxsldwi(__a, __a, 3);
1002 __temp = __builtin_vsx_xxsldwi(__a, __temp, 2);
1003 #else
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[0], [2]}, so we use a
     merge-high to get the elements lined up. */
1008 __temp = vec_vmrghw(__a, __a);
1009 #endif
1010 __asm__(" xvcvspdp %x0,%x1" : "=wa"(__result) : "wa"(__temp) :);
1011 return (__m128d)__result;
1012 #endif
1015 extern __inline int
1016 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1017 _mm_cvtsd_si32(__m128d __A) {
1018 __v2df __rounded = vec_rint((__v2df)__A);
1019 int __result = ((__v2df)__rounded)[0];
1021 return __result;
1023 /* Intel intrinsic. */
1024 extern __inline long long
1025 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1026 _mm_cvtsd_si64(__m128d __A) {
1027 __v2df __rounded = vec_rint((__v2df)__A);
1028 long long __result = ((__v2df)__rounded)[0];
1030 return __result;
1033 /* Microsoft intrinsic. */
1034 extern __inline long long
1035 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1036 _mm_cvtsd_si64x(__m128d __A) {
1037 return _mm_cvtsd_si64((__v2df)__A);
1040 extern __inline int
1041 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1042 _mm_cvttsd_si32(__m128d __A) {
1043 int __result = ((__v2df)__A)[0];
1045 return __result;
1048 /* Intel intrinsic. */
1049 extern __inline long long
1050 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1051 _mm_cvttsd_si64(__m128d __A) {
1052 long long __result = ((__v2df)__A)[0];
1054 return __result;
1057 /* Microsoft intrinsic. */
1058 extern __inline long long
1059 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1060 _mm_cvttsd_si64x(__m128d __A) {
1061 return _mm_cvttsd_si64(__A);
1064 extern __inline __m128
1065 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1066 _mm_cvtsd_ss(__m128 __A, __m128d __B) {
1067 __v4sf __result = (__v4sf)__A;
1069 #ifdef __LITTLE_ENDIAN__
1070 __v4sf __temp_s;
1071 /* Copy double element[0] to element [1] for conversion. */
1072 __v2df __temp_b = vec_splat((__v2df)__B, 0);
1074 /* Pre-rotate __A left 3 (logically right 1) elements. */
1075 __result = __builtin_vsx_xxsldwi(__result, __result, 3);
1076 /* Convert double to single float scalar in a vector. */
1077 __asm__("xscvdpsp %x0,%x1" : "=wa"(__temp_s) : "wa"(__temp_b) :);
1078 /* Shift the resulting scalar into vector element [0]. */
1079 __result = __builtin_vsx_xxsldwi(__result, __temp_s, 1);
1080 #else
1081 __result[0] = ((__v2df)__B)[0];
1082 #endif
1083 return (__m128)__result;
1086 extern __inline __m128d
1087 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1088 _mm_cvtsi32_sd(__m128d __A, int __B) {
1089 __v2df __result = (__v2df)__A;
1090 double __db = __B;
1091 __result[0] = __db;
1092 return (__m128d)__result;
1095 /* Intel intrinsic. */
1096 extern __inline __m128d
1097 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1098 _mm_cvtsi64_sd(__m128d __A, long long __B) {
1099 __v2df __result = (__v2df)__A;
1100 double __db = __B;
1101 __result[0] = __db;
1102 return (__m128d)__result;
1105 /* Microsoft intrinsic. */
1106 extern __inline __m128d
1107 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1108 _mm_cvtsi64x_sd(__m128d __A, long long __B) {
1109 return _mm_cvtsi64_sd(__A, __B);
1112 extern __inline __m128d
1113 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1114 _mm_cvtss_sd(__m128d __A, __m128 __B) {
1115 #ifdef __LITTLE_ENDIAN__
1116 /* Use splat to move element [0] into position for the convert. */
1117 __v4sf __temp = vec_splat((__v4sf)__B, 0);
1118 __v2df __res;
1119 /* Convert single float scalar to double in a vector. */
1120 __asm__("xscvspdp %x0,%x1" : "=wa"(__res) : "wa"(__temp) :);
1121 return (__m128d)vec_mergel(__res, (__v2df)__A);
1122 #else
1123 __v2df __res = (__v2df)__A;
1124 __res[0] = ((__v4sf)__B)[0];
1125 return (__m128d)__res;
1126 #endif
1129 extern __inline __m128d
1130 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1131 _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) {
1132 __vector double __result;
1133 const int __litmsk = __mask & 0x3;
1135 if (__litmsk == 0)
1136 __result = vec_mergeh(__A, __B);
1137 #if __GNUC__ < 6
1138 else if (__litmsk == 1)
1139 __result = vec_xxpermdi(__B, __A, 2);
1140 else if (__litmsk == 2)
1141 __result = vec_xxpermdi(__B, __A, 1);
1142 #else
1143 else if (__litmsk == 1)
1144 __result = vec_xxpermdi(__A, __B, 2);
1145 else if (__litmsk == 2)
1146 __result = vec_xxpermdi(__A, __B, 1);
1147 #endif
1148 else
1149 __result = vec_mergel(__A, __B);
1151 return __result;
1154 extern __inline __m128d
1155 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1156 _mm_unpackhi_pd(__m128d __A, __m128d __B) {
1157 return (__m128d)vec_mergel((__v2df)__A, (__v2df)__B);
1160 extern __inline __m128d
1161 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1162 _mm_unpacklo_pd(__m128d __A, __m128d __B) {
1163 return (__m128d)vec_mergeh((__v2df)__A, (__v2df)__B);
1166 extern __inline __m128d
1167 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1168 _mm_loadh_pd(__m128d __A, double const *__B) {
1169 __v2df __result = (__v2df)__A;
1170 __result[1] = *__B;
1171 return (__m128d)__result;
1174 extern __inline __m128d
1175 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1176 _mm_loadl_pd(__m128d __A, double const *__B) {
1177 __v2df __result = (__v2df)__A;
1178 __result[0] = *__B;
1179 return (__m128d)__result;
1182 #ifdef _ARCH_PWR8
1183 /* Intrinsic functions that require PowerISA 2.07 minimum. */
1185 /* Creates a 2-bit mask from the most significant bits of the DPFP values. */
1186 extern __inline int
1187 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1188 _mm_movemask_pd(__m128d __A) {
1189 #ifdef _ARCH_PWR10
1190 return vec_extractm((__v2du)__A);
1191 #else
1192 __vector unsigned long long __result;
1193 static const __vector unsigned int __perm_mask = {
1194 #ifdef __LITTLE_ENDIAN__
1195 0x80800040, 0x80808080, 0x80808080, 0x80808080
1196 #else
1197 0x80808080, 0x80808080, 0x80808080, 0x80804000
1198 #endif
1201 __result = ((__vector unsigned long long)vec_vbpermq(
1202 (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));
1204 #ifdef __LITTLE_ENDIAN__
1205 return __result[1];
1206 #else
1207 return __result[0];
1208 #endif
1209 #endif /* !_ARCH_PWR10 */
1211 #endif /* _ARCH_PWR8 */
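
/* Illustrative sketch: the sign bits of elements 0 and 1 become bits 0 and 1
   of the result:

     __m128d v = _mm_set_pd(-2.0, 1.0);   // v = {1.0, -2.0}
     int m = _mm_movemask_pd(v);          // m == 2: only element 1 is negative
*/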
1213 extern __inline __m128i
1214 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1215 _mm_packs_epi16(__m128i __A, __m128i __B) {
1216 return (__m128i)vec_packs((__v8hi)__A, (__v8hi)__B);
1219 extern __inline __m128i
1220 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1221 _mm_packs_epi32(__m128i __A, __m128i __B) {
1222 return (__m128i)vec_packs((__v4si)__A, (__v4si)__B);
1225 extern __inline __m128i
1226 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1227 _mm_packus_epi16(__m128i __A, __m128i __B) {
1228 return (__m128i)vec_packsu((__v8hi)__A, (__v8hi)__B);
1231 extern __inline __m128i
1232 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1233 _mm_unpackhi_epi8(__m128i __A, __m128i __B) {
1234 return (__m128i)vec_mergel((__v16qu)__A, (__v16qu)__B);
1237 extern __inline __m128i
1238 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1239 _mm_unpackhi_epi16(__m128i __A, __m128i __B) {
1240 return (__m128i)vec_mergel((__v8hu)__A, (__v8hu)__B);
1243 extern __inline __m128i
1244 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1245 _mm_unpackhi_epi32(__m128i __A, __m128i __B) {
1246 return (__m128i)vec_mergel((__v4su)__A, (__v4su)__B);
1249 extern __inline __m128i
1250 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1251 _mm_unpackhi_epi64(__m128i __A, __m128i __B) {
1252 return (__m128i)vec_mergel((__vector long long)__A, (__vector long long)__B);
1255 extern __inline __m128i
1256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1257 _mm_unpacklo_epi8(__m128i __A, __m128i __B) {
1258 return (__m128i)vec_mergeh((__v16qu)__A, (__v16qu)__B);
1261 extern __inline __m128i
1262 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1263 _mm_unpacklo_epi16(__m128i __A, __m128i __B) {
1264 return (__m128i)vec_mergeh((__v8hi)__A, (__v8hi)__B);
1267 extern __inline __m128i
1268 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1269 _mm_unpacklo_epi32(__m128i __A, __m128i __B) {
1270 return (__m128i)vec_mergeh((__v4si)__A, (__v4si)__B);
1273 extern __inline __m128i
1274 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1275 _mm_unpacklo_epi64(__m128i __A, __m128i __B) {
1276 return (__m128i)vec_mergeh((__vector long long)__A, (__vector long long)__B);
1279 extern __inline __m128i
1280 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1281 _mm_add_epi8(__m128i __A, __m128i __B) {
1282 return (__m128i)((__v16qu)__A + (__v16qu)__B);
1285 extern __inline __m128i
1286 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1287 _mm_add_epi16(__m128i __A, __m128i __B) {
1288 return (__m128i)((__v8hu)__A + (__v8hu)__B);
1291 extern __inline __m128i
1292 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1293 _mm_add_epi32(__m128i __A, __m128i __B) {
1294 return (__m128i)((__v4su)__A + (__v4su)__B);
1297 extern __inline __m128i
1298 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1299 _mm_add_epi64(__m128i __A, __m128i __B) {
1300 return (__m128i)((__v2du)__A + (__v2du)__B);
1303 extern __inline __m128i
1304 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1305 _mm_adds_epi8(__m128i __A, __m128i __B) {
1306 return (__m128i)vec_adds((__v16qi)__A, (__v16qi)__B);
1309 extern __inline __m128i
1310 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1311 _mm_adds_epi16(__m128i __A, __m128i __B) {
1312 return (__m128i)vec_adds((__v8hi)__A, (__v8hi)__B);
1315 extern __inline __m128i
1316 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1317 _mm_adds_epu8(__m128i __A, __m128i __B) {
1318 return (__m128i)vec_adds((__v16qu)__A, (__v16qu)__B);
1321 extern __inline __m128i
1322 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1323 _mm_adds_epu16(__m128i __A, __m128i __B) {
1324 return (__m128i)vec_adds((__v8hu)__A, (__v8hu)__B);
1327 extern __inline __m128i
1328 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1329 _mm_sub_epi8(__m128i __A, __m128i __B) {
1330 return (__m128i)((__v16qu)__A - (__v16qu)__B);
1333 extern __inline __m128i
1334 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1335 _mm_sub_epi16(__m128i __A, __m128i __B) {
1336 return (__m128i)((__v8hu)__A - (__v8hu)__B);
1339 extern __inline __m128i
1340 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1341 _mm_sub_epi32(__m128i __A, __m128i __B) {
1342 return (__m128i)((__v4su)__A - (__v4su)__B);
1345 extern __inline __m128i
1346 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1347 _mm_sub_epi64(__m128i __A, __m128i __B) {
1348 return (__m128i)((__v2du)__A - (__v2du)__B);
1351 extern __inline __m128i
1352 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1353 _mm_subs_epi8(__m128i __A, __m128i __B) {
1354 return (__m128i)vec_subs((__v16qi)__A, (__v16qi)__B);
1357 extern __inline __m128i
1358 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1359 _mm_subs_epi16(__m128i __A, __m128i __B) {
1360 return (__m128i)vec_subs((__v8hi)__A, (__v8hi)__B);
1363 extern __inline __m128i
1364 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1365 _mm_subs_epu8(__m128i __A, __m128i __B) {
1366 return (__m128i)vec_subs((__v16qu)__A, (__v16qu)__B);
1369 extern __inline __m128i
1370 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1371 _mm_subs_epu16(__m128i __A, __m128i __B) {
1372 return (__m128i)vec_subs((__v8hu)__A, (__v8hu)__B);
1375 extern __inline __m128i
1376 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1377 _mm_madd_epi16(__m128i __A, __m128i __B) {
1378 __vector signed int __zero = {0, 0, 0, 0};
1380 return (__m128i)vec_vmsumshm((__v8hi)__A, (__v8hi)__B, __zero);
1383 extern __inline __m128i
1384 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1385 _mm_mulhi_epi16(__m128i __A, __m128i __B) {
1386 __vector signed int __w0, __w1;
1388 __vector unsigned char __xform1 = {
1389 #ifdef __LITTLE_ENDIAN__
1390 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
1391 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1392 #else
1393 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
1394 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
1395 #endif
1398 __w0 = vec_vmulesh((__v8hi)__A, (__v8hi)__B);
1399 __w1 = vec_vmulosh((__v8hi)__A, (__v8hi)__B);
1400 return (__m128i)vec_perm(__w0, __w1, __xform1);
1403 extern __inline __m128i
1404 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1405 _mm_mullo_epi16(__m128i __A, __m128i __B) {
1406 return (__m128i)((__v8hi)__A * (__v8hi)__B);
1409 extern __inline __m64
1410 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1411 _mm_mul_su32(__m64 __A, __m64 __B) {
1412 unsigned int __a = __A;
1413 unsigned int __b = __B;
1415 return ((__m64)__a * (__m64)__b);
1418 #ifdef _ARCH_PWR8
1419 extern __inline __m128i
1420 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1421 _mm_mul_epu32(__m128i __A, __m128i __B) {
1422 #if __GNUC__ < 8
1423 __v2du __result;
1425 #ifdef __LITTLE_ENDIAN__
1426 /* VMX Vector Multiply Odd Unsigned Word. */
1427 __asm__("vmulouw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :);
1428 #else
1429 /* VMX Vector Multiply Even Unsigned Word. */
1430 __asm__("vmuleuw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :);
1431 #endif
1432 return (__m128i)__result;
1433 #else
1434 return (__m128i)vec_mule((__v4su)__A, (__v4su)__B);
1435 #endif
1437 #endif
1439 extern __inline __m128i
1440 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1441 _mm_slli_epi16(__m128i __A, int __B) {
1442 __v8hu __lshift;
1443 __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0};
1445 if (__B >= 0 && __B < 16) {
1446 if (__builtin_constant_p(__B))
1447 __lshift = (__v8hu)vec_splat_s16(__B);
1448 else
1449 __lshift = vec_splats((unsigned short)__B);
1451 __result = vec_sl((__v8hi)__A, __lshift);
1454 return (__m128i)__result;
1457 extern __inline __m128i
1458 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1459 _mm_slli_epi32(__m128i __A, int __B) {
1460 __v4su __lshift;
1461 __v4si __result = {0, 0, 0, 0};
1463 if (__B >= 0 && __B < 32) {
1464 if (__builtin_constant_p(__B) && __B < 16)
1465 __lshift = (__v4su)vec_splat_s32(__B);
1466 else
1467 __lshift = vec_splats((unsigned int)__B);
1469 __result = vec_sl((__v4si)__A, __lshift);
1472 return (__m128i)__result;
1475 #ifdef _ARCH_PWR8
1476 extern __inline __m128i
1477 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1478 _mm_slli_epi64(__m128i __A, int __B) {
1479 __v2du __lshift;
1480 __v2di __result = {0, 0};
1482 if (__B >= 0 && __B < 64) {
1483 if (__builtin_constant_p(__B) && __B < 16)
1484 __lshift = (__v2du)vec_splat_s32(__B);
1485 else
1486 __lshift = (__v2du)vec_splats((unsigned int)__B);
1488 __result = vec_sl((__v2di)__A, __lshift);
1491 return (__m128i)__result;
1493 #endif
1495 extern __inline __m128i
1496 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1497 _mm_srai_epi16(__m128i __A, int __B) {
1498 __v8hu __rshift = {15, 15, 15, 15, 15, 15, 15, 15};
1499 __v8hi __result;
1501 if (__B < 16) {
1502 if (__builtin_constant_p(__B))
1503 __rshift = (__v8hu)vec_splat_s16(__B);
1504 else
1505 __rshift = vec_splats((unsigned short)__B);
1507 __result = vec_sra((__v8hi)__A, __rshift);
1509 return (__m128i)__result;
1512 extern __inline __m128i
1513 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1514 _mm_srai_epi32(__m128i __A, int __B) {
1515 __v4su __rshift = {31, 31, 31, 31};
1516 __v4si __result;
1518 if (__B < 32) {
1519 if (__builtin_constant_p(__B)) {
1520 if (__B < 16)
1521 __rshift = (__v4su)vec_splat_s32(__B);
1522 else
1523 __rshift = (__v4su)vec_splats((unsigned int)__B);
1524 } else
1525 __rshift = vec_splats((unsigned int)__B);
1527 __result = vec_sra((__v4si)__A, __rshift);
1529 return (__m128i)__result;
1532 extern __inline __m128i
1533 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1534 _mm_bslli_si128(__m128i __A, const int __N) {
1535 __v16qu __result;
1536 const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1538 if (__N < 16)
1539 __result = vec_sld((__v16qu)__A, __zeros, __N);
1540 else
1541 __result = __zeros;
1543 return (__m128i)__result;
1546 extern __inline __m128i
1547 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1548 _mm_bsrli_si128(__m128i __A, const int __N) {
1549 __v16qu __result;
1550 const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1552 if (__N < 16)
1553 #ifdef __LITTLE_ENDIAN__
1554 if (__builtin_constant_p(__N))
1555 /* Would like to use Vector Shift Left Double by Octet
1556 Immediate here to use the immediate form and avoid
1557 load of __N * 8 value into a separate VR. */
1558 __result = vec_sld(__zeros, (__v16qu)__A, (16 - __N));
1559 else
1560 #endif
1562 __v16qu __shift = vec_splats((unsigned char)(__N * 8));
1563 #ifdef __LITTLE_ENDIAN__
1564 __result = vec_sro((__v16qu)__A, __shift);
1565 #else
1566 __result = vec_slo((__v16qu)__A, __shift);
1567 #endif
1569 else
1570 __result = __zeros;
1572 return (__m128i)__result;
1575 extern __inline __m128i
1576 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1577 _mm_srli_si128(__m128i __A, const int __N) {
1578 return _mm_bsrli_si128(__A, __N);
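
/* Illustrative sketch: these are byte-granular shifts of the whole 128-bit
   value, not per-element shifts:

     __m128i v = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8,
                              7, 6, 5, 4, 3, 2, 1, 0);   // byte i == i
     __m128i r = _mm_bsrli_si128(v, 4);   // shift right by 4 bytes
     // ((__v16qi)r)[0] == 4, and bytes 12..15 of r are zero.
*/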
1581 extern __inline __m128i
1582 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1583 _mm_slli_si128(__m128i __A, const int _imm5) {
1584 __v16qu __result;
1585 const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1587 if (_imm5 < 16)
1588 #ifdef __LITTLE_ENDIAN__
1589 __result = vec_sld((__v16qu)__A, __zeros, _imm5);
1590 #else
1591 __result = vec_sld(__zeros, (__v16qu)__A, (16 - _imm5));
1592 #endif
1593 else
1594 __result = __zeros;
1596 return (__m128i)__result;
1599 extern __inline __m128i
1600 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1602 _mm_srli_epi16(__m128i __A, int __B) {
1603 __v8hu __rshift;
1604 __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0};
1606 if (__B < 16) {
1607 if (__builtin_constant_p(__B))
1608 __rshift = (__v8hu)vec_splat_s16(__B);
1609 else
1610 __rshift = vec_splats((unsigned short)__B);
1612 __result = vec_sr((__v8hi)__A, __rshift);
1615 return (__m128i)__result;
1618 extern __inline __m128i
1619 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1620 _mm_srli_epi32(__m128i __A, int __B) {
1621 __v4su __rshift;
1622 __v4si __result = {0, 0, 0, 0};
1624 if (__B < 32) {
1625 if (__builtin_constant_p(__B)) {
1626 if (__B < 16)
1627 __rshift = (__v4su)vec_splat_s32(__B);
1628 else
1629 __rshift = (__v4su)vec_splats((unsigned int)__B);
1630 } else
1631 __rshift = vec_splats((unsigned int)__B);
1633 __result = vec_sr((__v4si)__A, __rshift);
1636 return (__m128i)__result;
1639 #ifdef _ARCH_PWR8
1640 extern __inline __m128i
1641 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1642 _mm_srli_epi64(__m128i __A, int __B) {
1643 __v2du __rshift;
1644 __v2di __result = {0, 0};
1646 if (__B < 64) {
1647 if (__builtin_constant_p(__B)) {
1648 if (__B < 16)
1649 __rshift = (__v2du)vec_splat_s32(__B);
1650 else
1651 __rshift = (__v2du)vec_splats((unsigned long long)__B);
1652 } else
1653 __rshift = (__v2du)vec_splats((unsigned int)__B);
1655 __result = vec_sr((__v2di)__A, __rshift);
1658 return (__m128i)__result;
1660 #endif
1662 extern __inline __m128i
1663 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1664 _mm_sll_epi16(__m128i __A, __m128i __B) {
1665 __v8hu __lshift;
1666 __vector __bool short __shmask;
1667 const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
1668 __v8hu __result;
1670 #ifdef __LITTLE_ENDIAN__
1671 __lshift = vec_splat((__v8hu)__B, 0);
1672 #else
1673 __lshift = vec_splat((__v8hu)__B, 3);
1674 #endif
1675 __shmask = vec_cmple(__lshift, __shmax);
1676 __result = vec_sl((__v8hu)__A, __lshift);
1677 __result = vec_sel((__v8hu)__shmask, __result, __shmask);
1679 return (__m128i)__result;
1682 extern __inline __m128i
1683 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1684 _mm_sll_epi32(__m128i __A, __m128i __B) {
1685 __v4su __lshift;
1686 __vector __bool int __shmask;
1687 const __v4su __shmax = {32, 32, 32, 32};
1688 __v4su __result;
1689 #ifdef __LITTLE_ENDIAN__
1690 __lshift = vec_splat((__v4su)__B, 0);
1691 #else
1692 __lshift = vec_splat((__v4su)__B, 1);
1693 #endif
1694 __shmask = vec_cmplt(__lshift, __shmax);
1695 __result = vec_sl((__v4su)__A, __lshift);
1696 __result = vec_sel((__v4su)__shmask, __result, __shmask);
1698 return (__m128i)__result;
1701 #ifdef _ARCH_PWR8
1702 extern __inline __m128i
1703 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1704 _mm_sll_epi64(__m128i __A, __m128i __B) {
1705 __v2du __lshift;
1706 __vector __bool long long __shmask;
1707 const __v2du __shmax = {64, 64};
1708 __v2du __result;
1710 __lshift = vec_splat((__v2du)__B, 0);
1711 __shmask = vec_cmplt(__lshift, __shmax);
1712 __result = vec_sl((__v2du)__A, __lshift);
1713 __result = vec_sel((__v2du)__shmask, __result, __shmask);
1715 return (__m128i)__result;
1717 #endif
1719 extern __inline __m128i
1720 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1721 _mm_sra_epi16(__m128i __A, __m128i __B) {
1722 const __v8hu __rshmax = {15, 15, 15, 15, 15, 15, 15, 15};
1723 __v8hu __rshift;
1724 __v8hi __result;
1726 #ifdef __LITTLE_ENDIAN__
1727 __rshift = vec_splat((__v8hu)__B, 0);
1728 #else
1729 __rshift = vec_splat((__v8hu)__B, 3);
1730 #endif
1731 __rshift = vec_min(__rshift, __rshmax);
1732 __result = vec_sra((__v8hi)__A, __rshift);
1734 return (__m128i)__result;
1737 extern __inline __m128i
1738 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1739 _mm_sra_epi32(__m128i __A, __m128i __B) {
1740 const __v4su __rshmax = {31, 31, 31, 31};
1741 __v4su __rshift;
1742 __v4si __result;
1744 #ifdef __LITTLE_ENDIAN__
1745 __rshift = vec_splat((__v4su)__B, 0);
1746 #else
1747 __rshift = vec_splat((__v4su)__B, 1);
1748 #endif
1749 __rshift = vec_min(__rshift, __rshmax);
1750 __result = vec_sra((__v4si)__A, __rshift);
1752 return (__m128i)__result;
1755 extern __inline __m128i
1756 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1757 _mm_srl_epi16(__m128i __A, __m128i __B) {
1758 __v8hu __rshift;
1759 __vector __bool short __shmask;
1760 const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
1761 __v8hu __result;
1763 #ifdef __LITTLE_ENDIAN__
1764 __rshift = vec_splat((__v8hu)__B, 0);
1765 #else
1766 __rshift = vec_splat((__v8hu)__B, 3);
1767 #endif
1768 __shmask = vec_cmple(__rshift, __shmax);
1769 __result = vec_sr((__v8hu)__A, __rshift);
1770 __result = vec_sel((__v8hu)__shmask, __result, __shmask);
1772 return (__m128i)__result;
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_epi32(__m128i __A, __m128i __B) {
  __v4su __rshift;
  __vector __bool int __shmask;
  const __v4su __shmax = {32, 32, 32, 32};
  __v4su __result;

#ifdef __LITTLE_ENDIAN__
  __rshift = vec_splat((__v4su)__B, 0);
#else
  __rshift = vec_splat((__v4su)__B, 1);
#endif
  __shmask = vec_cmplt(__rshift, __shmax);
  __result = vec_sr((__v4su)__A, __rshift);
  __result = vec_sel((__v4su)__shmask, __result, __shmask);

  return (__m128i)__result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_epi64(__m128i __A, __m128i __B) {
  __v2du __rshift;
  __vector __bool long long __shmask;
  const __v2du __shmax = {64, 64};
  __v2du __result;

  __rshift = vec_splat((__v2du)__B, 0);
  __shmask = vec_cmplt(__rshift, __shmax);
  __result = vec_sr((__v2du)__A, __rshift);
  __result = vec_sel((__v2du)__shmask, __result, __shmask);

  return (__m128i)__result;
}
#endif

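/* Bitwise AND, ANDNOT, OR and XOR over the full 128 bits.  The _pd
   variants operate on the raw bit patterns of the double lanes.  */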
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_and_pd(__m128d __A, __m128d __B) {
  return (vec_and((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_andnot_pd(__m128d __A, __m128d __B) {
  return (vec_andc((__v2df)__B, (__v2df)__A));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_or_pd(__m128d __A, __m128d __B) {
  return (vec_or((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_xor_pd(__m128d __A, __m128d __B) {
  return (vec_xor((__v2df)__A, (__v2df)__B));
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_and_si128(__m128i __A, __m128i __B) {
  return (__m128i)vec_and((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_andnot_si128(__m128i __A, __m128i __B) {
  return (__m128i)vec_andc((__v2di)__B, (__v2di)__A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_or_si128(__m128i __A, __m128i __B) {
  return (__m128i)vec_or((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_xor_si128(__m128i __A, __m128i __B) {
  return (__m128i)vec_xor((__v2di)__A, (__v2di)__B);
}

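/* Element-wise equality and signed ordering compares.  Each result
   element is all ones when the predicate holds and all zeros otherwise.  */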
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpeq((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpeq((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpeq((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmplt((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmplt((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmplt((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpgt((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpgt((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpgt((__v4si)__A, (__v4si)__B);
}

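/* Read or replace a single 16-bit element.  The element index is taken
   modulo 8, and the extracted value is zero-extended to int.  */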
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi16(__m128i const __A, int const __N) {
  return (unsigned short)((__v8hi)__A)[__N & 7];
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi16(__m128i const __A, int const __D, int const __N) {
  __v8hi __result = (__v8hi)__A;

  __result[(__N & 7)] = __D;

  return (__m128i)__result;
}

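/* Element-wise maxima and minima: signed for 16-bit elements,
   unsigned for 8-bit elements.  */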
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_max((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epu8(__m128i __A, __m128i __B) {
  return (__m128i)vec_max((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_min((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epu8(__m128i __A, __m128i __B) {
  return (__m128i)vec_min((__v16qu)__A, (__v16qu)__B);
}

#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum.  */

/* Return a mask created from the most significant bit of each 8-bit
   element in A.  */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movemask_epi8(__m128i __A) {
#ifdef _ARCH_PWR10
  return vec_extractm((__v16qu)__A);
#else
  __vector unsigned long long __result;
  static const __vector unsigned char __perm_mask = {
      0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
      0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00};

  __result = ((__vector unsigned long long)vec_vbpermq(
      (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));

#ifdef __LITTLE_ENDIAN__
  return __result[1];
#else
  return __result[0];
#endif
#endif /* !_ARCH_PWR10 */
}
#endif /* _ARCH_PWR8 */

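/* Multiply corresponding unsigned 16-bit elements and keep the high
   16 bits of each 32-bit product; the even/odd partial products are
   re-interleaved with a permute.  */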
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhi_epu16(__m128i __A, __m128i __B) {
  __v4su __w0, __w1;
  __v16qu __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
      0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
#endif
  };

  __w0 = vec_vmuleuh((__v8hu)__A, (__v8hu)__B);
  __w1 = vec_vmulouh((__v8hu)__A, (__v8hu)__B);
  return (__m128i)vec_perm(__w0, __w1, __xform1);
}

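/* Shuffles driven by 2-bit selectors packed into the mask: the high or
   low four 16-bit elements of __A (the other half is copied through),
   or all four 32-bit elements.  */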
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shufflehi_epi16(__m128i __A, const int __mask) {
  unsigned long __element_selector_98 = __mask & 0x03;
  unsigned long __element_selector_BA = (__mask >> 2) & 0x03;
  unsigned long __element_selector_DC = (__mask >> 4) & 0x03;
  unsigned long __element_selector_FE = (__mask >> 6) & 0x03;
  static const unsigned short __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#else
      0x0809, 0x0A0B, 0x0C0D, 0x0E0F
#endif
  };
  __v2du __pmask =
#ifdef __LITTLE_ENDIAN__
      {0x1716151413121110UL, 0UL};
#else
      {0x1011121314151617UL, 0UL};
#endif
  __m64_union __t;
  __v2du __a, __r;

  __t.as_short[0] = __permute_selectors[__element_selector_98];
  __t.as_short[1] = __permute_selectors[__element_selector_BA];
  __t.as_short[2] = __permute_selectors[__element_selector_DC];
  __t.as_short[3] = __permute_selectors[__element_selector_FE];
  __pmask[1] = __t.as_m64;
  __a = (__v2du)__A;
  __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
  return (__m128i)__r;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shufflelo_epi16(__m128i __A, const int __mask) {
  unsigned long __element_selector_10 = __mask & 0x03;
  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned short __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
      0x0100, 0x0302, 0x0504, 0x0706
#else
      0x0001, 0x0203, 0x0405, 0x0607
#endif
  };
  __v2du __pmask =
#ifdef __LITTLE_ENDIAN__
      {0UL, 0x1f1e1d1c1b1a1918UL};
#else
      {0UL, 0x18191a1b1c1d1e1fUL};
#endif
  __m64_union __t;
  __v2du __a, __r;
  __t.as_short[0] = __permute_selectors[__element_selector_10];
  __t.as_short[1] = __permute_selectors[__element_selector_32];
  __t.as_short[2] = __permute_selectors[__element_selector_54];
  __t.as_short[3] = __permute_selectors[__element_selector_76];
  __pmask[0] = __t.as_m64;
  __a = (__v2du)__A;
  __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
  return (__m128i)__r;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shuffle_epi32(__m128i __A, const int __mask) {
  unsigned long __element_selector_10 = __mask & 0x03;
  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned int __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
      0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#else
      0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
#endif
  };
  __v4su __t;

  __t[0] = __permute_selectors[__element_selector_10];
  __t[1] = __permute_selectors[__element_selector_32];
  __t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
  __t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
  return (__m128i)vec_perm((__v4si)__A, (__v4si)__A,
                           (__vector unsigned char)__t);
}

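/* Conditionally store bytes of __A to the possibly unaligned address
   __C: only bytes whose corresponding mask byte in __B has its most
   significant bit set are written.  */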
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_maskmoveu_si128(__m128i __A, __m128i __B, char *__C) {
  __v2du __hibit = {0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
  __v16qu __mask, __tmp;
  __m128i_u *__p = (__m128i_u *)__C;

  __tmp = (__v16qu)_mm_loadu_si128(__p);
  __mask = (__v16qu)vec_cmpgt((__v16qu)__B, (__v16qu)__hibit);
  __tmp = vec_sel(__tmp, (__v16qu)__A, __mask);
  _mm_storeu_si128(__p, (__m128i)__tmp);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_avg_epu8(__m128i __A, __m128i __B) {
  return (__m128i)vec_avg((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_avg_epu16(__m128i __A, __m128i __B) {
  return (__m128i)vec_avg((__v8hu)__A, (__v8hu)__B);
}

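/* Sum of absolute differences: for each 8-byte half, the absolute
   differences of the unsigned bytes of __A and __B are summed into the
   low 16 bits of the corresponding 64-bit lane.  */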
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sad_epu8(__m128i __A, __m128i __B) {
  __v16qu __a, __b;
  __v16qu __vabsdiff;
  __v4si __vsum;
  const __v4su __zero = {0, 0, 0, 0};
  __v4si __result;

  __a = (__v16qu)__A;
  __b = (__v16qu)__B;
#ifndef _ARCH_PWR9
  __v16qu __vmin = vec_min(__a, __b);
  __v16qu __vmax = vec_max(__a, __b);
  __vabsdiff = vec_sub(__vmax, __vmin);
#else
  __vabsdiff = vec_absd(__a, __b);
#endif
  /* Sum four groups of bytes into integers.  */
  __vsum = (__vector signed int)vec_sum4s(__vabsdiff, __zero);
#ifdef __LITTLE_ENDIAN__
  /* Sum across four integers with two integer results.  */
  __asm__("vsum2sws %0,%1,%2" : "=v"(__result) : "v"(__vsum), "v"(__zero));
  /* Note: vec_sum2s could be used here, but on little-endian, vector
     shifts are added that are not needed for this use-case.
     A vector shift to correctly position the 32-bit integer results
     (currently at [0] and [2]) to [1] and [3] would then need to be
     swapped back again since the desired results are two 64-bit
     integers ([1]|[0] and [3]|[2]).  Thus, no shift is performed.  */
#else
  /* Sum across four integers with two integer results.  */
  __result = vec_sum2s(__vsum, (__vector signed int)__zero);
  /* Rotate the sums into the correct position.  */
  __result = vec_sld(__result, __result, 6);
#endif
  return (__m128i)__result;
}

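/* Non-temporal stores and cache control.  On POWER these are ordinary
   stores preceded by a dcbtstt (store transient) cache hint; _mm_clflush
   maps to dcbf and the fences to sync-based atomic fences.  */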
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_si32(int *__A, int __B) {
  /* Use the data cache block touch for store transient.  */
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  *__A = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_si64(long long int *__A, long long int __B) {
  /* Use the data cache block touch for store transient.  */
  __asm__(" dcbtstt 0,%0" : : "b"(__A) : "memory");
  *__A = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_si128(__m128i *__A, __m128i __B) {
  /* Use the data cache block touch for store transient.  */
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  *__A = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_pd(double *__A, __m128d __B) {
  /* Use the data cache block touch for store transient.  */
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  *(__m128d *)__A = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_clflush(void const *__A) {
  /* Use the data cache block flush.  */
  __asm__("dcbf 0,%0" : : "b"(__A) : "memory");
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_lfence(void) {
  /* Use light weight sync for load to load ordering.  */
  __atomic_thread_fence(__ATOMIC_RELEASE);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mfence(void) {
  /* Use heavy weight sync for any to any ordering.  */
  __atomic_thread_fence(__ATOMIC_SEQ_CST);
}

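/* Move a 32- or 64-bit integer into the low element of a vector,
   zeroing the remaining elements.  */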
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi32_si128(int __A) {
  return _mm_set_epi32(0, 0, 0, __A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_si128(long long __A) {
  return __extension__(__m128i)(__v2di){__A, 0LL};
}

/* Microsoft intrinsic.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64x_si128(long long __A) {
  return __extension__(__m128i)(__v2di){__A, 0LL};
}

/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castpd_ps(__m128d __A) {
  return (__m128)__A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castpd_si128(__m128d __A) {
  return (__m128i)__A;
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castps_pd(__m128 __A) {
  return (__m128d)__A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castps_si128(__m128 __A) {
  return (__m128i)__A;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castsi128_ps(__m128i __A) {
  return (__m128)__A;
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castsi128_pd(__m128i __A) {
  return (__m128d)__A;
}

#else
#include_next <emmintrin.h>
#endif /* defined(__powerpc64__) &&                                            \
        * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* EMMINTRIN_H_ */