/*===---- emmintrin.h - Implementation of SSE2 intrinsics on PowerPC -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header file is to help porting code using Intel intrinsics
   explicitly from x86_64 to powerpc64/powerpc64le.

   Since the X86 SSE2 intrinsics mainly handle the __m128i and __m128d
   types, the PowerPC VMX/VSX ISA is a good match for vector float SIMD
   operations.  However, scalar float operations in vector (XMM)
   registers require the POWER8 VSX ISA (2.07) level.  There are
   differences in data format and placement of float scalars in the
   vector register, which require extra steps to match SSE2 scalar
   float semantics on POWER.

   It should be noted that there are significant differences between
   the X86_64 MXCSR and the PowerISA FPSCR/VSCR registers.  It is
   recommended to use portable <fenv.h> instead of accessing the MXCSR
   directly.

   Most SSE2 scalar float intrinsic operations can be performed more
   efficiently as C language float scalar operations or optimized to
   use vector SIMD operations.  We recommend this for new applications.
*/
#error                                                                         \
    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef EMMINTRIN_H_
#define EMMINTRIN_H_

#if defined(__ppc64__) &&                                                      \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>

/* We need definitions from the SSE header files.  */
#include <xmmintrin.h>

/* SSE2 */
typedef __vector double __v2df;
typedef __vector long long __v2di;
typedef __vector unsigned long long __v2du;
typedef __vector int __v4si;
typedef __vector unsigned int __v4su;
typedef __vector short __v8hi;
typedef __vector unsigned short __v8hu;
typedef __vector signed char __v16qi;
typedef __vector unsigned char __v16qu;

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
typedef double __m128d __attribute__((__vector_size__(16), __may_alias__));

/* Unaligned version of the same types.  */
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
typedef double __m128d_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));

/* Define a two-value permute mask.  */
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))

/* Create a vector with element 0 as F and the rest zero.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_sd(double __F) {
  return __extension__(__m128d){__F, 0.0};
}

/* Create a vector with both elements equal to F.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pd(double __F) {
  return __extension__(__m128d){__F, __F};
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pd1(double __F) {
  return _mm_set1_pd(__F);
}

/* Create a vector with the lower value X and upper value W.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pd(double __W, double __X) {
  return __extension__(__m128d){__X, __W};
}

/* Create a vector with the lower value W and upper value X.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pd(double __W, double __X) {
  return __extension__(__m128d){__W, __X};
}

/* Create an undefined vector.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_undefined_pd(void) {
  __m128d __Y = __Y;
  return __Y;
}

/* Create a vector of zeros.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setzero_pd(void) {
  return (__m128d)vec_splats(0);
}

/* Sets the low DPFP value of A from the low value of B.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_move_sd(__m128d __A, __m128d __B) {
  __v2df __result = (__v2df)__A;
  __result[0] = ((__v2df)__B)[0];
  return (__m128d)__result;
}

/* Load two DPFP values from P.  The address must be 16-byte aligned.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load_pd(double const *__P) {
  return ((__m128d)vec_ld(0, (__v16qu *)__P));
}

/* Load two DPFP values from P.  The address need not be 16-byte aligned.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadu_pd(double const *__P) {
  return (vec_vsx_ld(0, __P));
}

/* Create a vector with both elements equal to *P.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load1_pd(double const *__P) {
  return (vec_splats(*__P));
}

/* Create a vector with element 0 as *P and the rest zero.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load_sd(double const *__P) {
  return _mm_set_sd(*__P);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load_pd1(double const *__P) {
  return _mm_load1_pd(__P);
}

/* Load two DPFP values in reverse order.  The address must be aligned.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadr_pd(double const *__P) {
  __v2df __tmp = _mm_load_pd(__P);
  return (__m128d)vec_xxpermdi(__tmp, __tmp, 2);
}

/* Store two DPFP values.  The address must be 16-byte aligned.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store_pd(double *__P, __m128d __A) {
  vec_st((__v16qu)__A, 0, (__v16qu *)__P);
}

/* Store two DPFP values.  The address need not be 16-byte aligned.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storeu_pd(double *__P, __m128d __A) {
  *(__m128d_u *)__P = __A;
}

/* Stores the lower DPFP value.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store_sd(double *__P, __m128d __A) {
  *__P = ((__v2df)__A)[0];
}
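
/* Return the lower DPFP value of A as a scalar double.  */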
extern __inline double
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsd_f64(__m128d __A) {
  return ((__v2df)__A)[0];
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storel_pd(double *__P, __m128d __A) {
  _mm_store_sd(__P, __A);
}

/* Stores the upper DPFP value.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storeh_pd(double *__P, __m128d __A) {
  *__P = ((__v2df)__A)[1];
}

/* Store the lower DPFP value across two words.
   The address must be 16-byte aligned.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store1_pd(double *__P, __m128d __A) {
  _mm_store_pd(__P, vec_splat(__A, 0));
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store_pd1(double *__P, __m128d __A) {
  _mm_store1_pd(__P, __A);
}

/* Store two DPFP values in reverse order.  The address must be aligned.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storer_pd(double *__P, __m128d __A) {
  _mm_store_pd(__P, vec_xxpermdi(__A, __A, 2));
}

/* Intel intrinsic.  */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi128_si64(__m128i __A) {
  return ((__v2di)__A)[0];
}

/* Microsoft intrinsic.  */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi128_si64x(__m128i __A) {
  return ((__v2di)__A)[0];
}
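
/* Add the corresponding DPFP values of A and B.  */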
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pd(__m128d __A, __m128d __B) {
  return (__m128d)((__v2df)__A + (__v2df)__B);
}

/* Add the lower double-precision (64-bit) floating-point element in
   a and b, store the result in the lower element of dst, and copy
   the upper element from a to the upper element of dst.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_sd(__m128d __A, __m128d __B) {
  __A[0] = __A[0] + __B[0];
  return (__A);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pd(__m128d __A, __m128d __B) {
  return (__m128d)((__v2df)__A - (__v2df)__B);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_sd(__m128d __A, __m128d __B) {
  __A[0] = __A[0] - __B[0];
  return (__A);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_pd(__m128d __A, __m128d __B) {
  return (__m128d)((__v2df)__A * (__v2df)__B);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_sd(__m128d __A, __m128d __B) {
  __A[0] = __A[0] * __B[0];
  return (__A);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_div_pd(__m128d __A, __m128d __B) {
  return (__m128d)((__v2df)__A / (__v2df)__B);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_div_sd(__m128d __A, __m128d __B) {
  __A[0] = __A[0] / __B[0];
  return (__A);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sqrt_pd(__m128d __A) {
  return (vec_sqrt(__A));
}

/* Return pair {sqrt (B[0]), A[1]}.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sqrt_sd(__m128d __A, __m128d __B) {
  __v2df __c;
  __c = vec_sqrt((__v2df)_mm_set1_pd(__B[0]));
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_pd(__m128d __A, __m128d __B) {
  return (vec_min(__A, __B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = vec_min(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_pd(__m128d __A, __m128d __B) {
  return (vec_max(__A, __B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = vec_max(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}
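
/* Double-precision compares: each element of the result is all 1's if
   the comparison is true for that element and all 0's otherwise.  */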
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmpeq((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmple_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpge_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpneq_pd(__m128d __A, __m128d __B) {
  __v2df __temp = (__v2df)vec_cmpeq((__v2df)__A, (__v2df)__B);
  return ((__m128d)vec_nor(__temp, __temp));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnlt_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnle_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpngt_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnge_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpord_pd(__m128d __A, __m128d __B) {
  __v2du __c, __d;
  /* Compare against self will return false (0's) if NAN.  */
  __c = (__v2du)vec_cmpeq(__A, __A);
  __d = (__v2du)vec_cmpeq(__B, __B);
  /* A != NAN and B != NAN.  */
  return ((__m128d)vec_and(__c, __d));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpunord_pd(__m128d __A, __m128d __B) {
#if _ARCH_PWR8
  __v2du __c, __d;
  /* Compare against self will return false (0's) if NAN.  */
  __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
  __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
  /* A == NAN OR B == NAN converts to:
     NOT(A != NAN) OR NOT(B != NAN).  */
  __c = vec_nor(__c, __c);
  return ((__m128d)vec_orc(__c, __d));
#else
  __v2du __c, __d;
  /* Compare against self will return false (0's) if NAN.  */
  __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
  __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
  /* Invert so that true ('1's) marks the NaN elements.  */
  __c = vec_nor(__c, __c);
  __d = vec_nor(__d, __d);
  return ((__m128d)vec_or(__c, __d));
#endif
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  /* PowerISA VSX does not allow partial (for just lower double)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper double values) we splat the lower double
     before we do the operation.  */
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmpeq(__a, __b);
  /* Then we merge the lower double result with the original upper
     double from __A.  */
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmplt(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmple_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmple(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmpgt(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpge_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmpge(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpneq_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmpeq(__a, __b);
  __c = vec_nor(__c, __c);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnlt_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  /* Not less than is just greater than or equal.  */
  __c = (__v2df)vec_cmpge(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnle_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  /* Not less than or equal is just greater than.  */
  __c = (__v2df)vec_cmpgt(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpngt_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  /* Not greater than is just less than or equal.  */
  __c = (__v2df)vec_cmple(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnge_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  /* Not greater than or equal is just less than.  */
  __c = (__v2df)vec_cmplt(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpord_sd(__m128d __A, __m128d __B) {
  __v2df __r;
  __r = (__v2df)_mm_cmpord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
  return (__m128d)_mm_setr_pd(__r[0], ((__v2df)__A)[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpunord_sd(__m128d __A, __m128d __B) {
  __v2df __r;
  __r = _mm_cmpunord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
  return (__m128d)_mm_setr_pd(__r[0], __A[1]);
}

/* FIXME
   The _mm_comi??_sd and _mm_ucomi??_sd implementations below are
   exactly the same because GCC for PowerPC only generates unordered
   compares (scalar and vector).
   Technically _mm_comieq_sd et al. should be using the ordered
   compare and signal for QNaNs.  The _mm_ucomieq_sd et al. should
   be OK.  */

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comieq_sd(__m128d __A, __m128d __B) {
  return (__A[0] == __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comilt_sd(__m128d __A, __m128d __B) {
  return (__A[0] < __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comile_sd(__m128d __A, __m128d __B) {
  return (__A[0] <= __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comigt_sd(__m128d __A, __m128d __B) {
  return (__A[0] > __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comige_sd(__m128d __A, __m128d __B) {
  return (__A[0] >= __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comineq_sd(__m128d __A, __m128d __B) {
  return (__A[0] != __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomieq_sd(__m128d __A, __m128d __B) {
  return (__A[0] == __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomilt_sd(__m128d __A, __m128d __B) {
  return (__A[0] < __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomile_sd(__m128d __A, __m128d __B) {
  return (__A[0] <= __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomigt_sd(__m128d __A, __m128d __B) {
  return (__A[0] > __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomige_sd(__m128d __A, __m128d __B) {
  return (__A[0] >= __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomineq_sd(__m128d __A, __m128d __B) {
  return (__A[0] != __B[0]);
}

/* Create a vector of Qi, where i is the element number.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_epi64x(long long __q1, long long __q0) {
  return __extension__(__m128i)(__v2di){__q0, __q1};
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_epi64(__m64 __q1, __m64 __q0) {
  return _mm_set_epi64x((long long)__q1, (long long)__q0);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_epi32(int __q3, int __q2, int __q1, int __q0) {
  return __extension__(__m128i)(__v4si){__q0, __q1, __q2, __q3};
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_epi16(short __q7, short __q6, short __q5, short __q4, short __q3,
                  short __q2, short __q1, short __q0) {
  return __extension__(__m128i)(__v8hi){__q0, __q1, __q2, __q3,
                                        __q4, __q5, __q6, __q7};
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_epi8(char __q15, char __q14, char __q13, char __q12, char __q11,
                 char __q10, char __q09, char __q08, char __q07, char __q06,
                 char __q05, char __q04, char __q03, char __q02, char __q01,
                 char __q00) {
  return __extension__(__m128i)(__v16qi){
      __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
      __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15};
}

/* Set all of the elements of the vector to A.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_epi64x(long long __A) {
  return _mm_set_epi64x(__A, __A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_epi64(__m64 __A) {
  return _mm_set_epi64(__A, __A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_epi32(int __A) {
  return _mm_set_epi32(__A, __A, __A, __A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_epi16(short __A) {
  return _mm_set_epi16(__A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_epi8(char __A) {
  return _mm_set_epi8(__A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A,
                      __A, __A, __A, __A, __A);
}

/* Create a vector of Qi, where i is the element number.
   The parameter order is reversed from the _mm_set_epi* functions.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_epi64(__m64 __q0, __m64 __q1) {
  return _mm_set_epi64(__q1, __q0);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_epi32(int __q0, int __q1, int __q2, int __q3) {
  return _mm_set_epi32(__q3, __q2, __q1, __q0);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_epi16(short __q0, short __q1, short __q2, short __q3, short __q4,
                   short __q5, short __q6, short __q7) {
  return _mm_set_epi16(__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_epi8(char __q00, char __q01, char __q02, char __q03, char __q04,
                  char __q05, char __q06, char __q07, char __q08, char __q09,
                  char __q10, char __q11, char __q12, char __q13, char __q14,
                  char __q15) {
  return _mm_set_epi8(__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
                      __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
}

/* Load 128 bits of integer data.  The address must be 16-byte aligned.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load_si128(__m128i const *__P) {
  return *__P;
}
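
/* Load 128 bits of integer data.  The address need not be aligned.  */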
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadu_si128(__m128i_u const *__P) {
  return (__m128i)(vec_vsx_ld(0, (signed int const *)__P));
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadl_epi64(__m128i_u const *__P) {
  return _mm_set_epi64((__m64)0LL, *(__m64 *)__P);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store_si128(__m128i *__P, __m128i __B) {
  vec_st((__v16qu)__B, 0, (__v16qu *)__P);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storeu_si128(__m128i_u *__P, __m128i __B) {
  *__P = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storel_epi64(__m128i_u *__P, __m128i __B) {
  *(long long *)__P = ((__v2di)__B)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movepi64_pi64(__m128i_u __B) {
  return (__m64)((__v2di)__B)[0];
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movpi64_epi64(__m64 __A) {
  return _mm_set_epi64((__m64)0LL, __A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_move_epi64(__m128i __A) {
  return _mm_set_epi64((__m64)0LL, (__m64)__A[0]);
}

/* Create an undefined vector.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_undefined_si128(void) {
  __m128i __Y = __Y;
  return __Y;
}

/* Create a vector of zeros.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setzero_si128(void) {
  return __extension__(__m128i)(__v4si){0, 0, 0, 0};
}

#ifdef _ARCH_PWR8
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi32_pd(__m128i __A) {
  __v2di __val;
  /* For LE we need the Vector Unpack Low Signed Word instruction,
     which vec_unpackh generates.  */
  __val = (__v2di)vec_unpackh((__v4si)__A);

  return (__m128d)vec_ctf(__val, 0);
}
#endif

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi32_ps(__m128i __A) {
  return ((__m128)vec_ctf((__v4si)__A, 0));
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpd_epi32(__m128d __A) {
  __v2df __rounded = vec_rint(__A);
  __v4si __result, __temp;
  const __v4si __vzero = {0, 0, 0, 0};

  /* VSX Vector truncate Double-Precision to integer and Convert to
     Signed Integer Word format with Saturate.  */
  __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__rounded) :);

#ifdef _ARCH_PWR8
#ifdef __LITTLE_ENDIAN__
  __temp = vec_mergeo(__temp, __temp);
#else
  __temp = vec_mergee(__temp, __temp);
#endif
  __result = (__v4si)vec_vpkudum((__vector long long)__temp,
                                 (__vector long long)__vzero);
#else
  {
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
                              0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
    __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
  }
#endif
  return (__m128i)__result;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpd_pi32(__m128d __A) {
  __m128i __result = _mm_cvtpd_epi32(__A);

  return (__m64)__result[0];
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpd_ps(__m128d __A) {
  __v4sf __result;
  __v4si __temp;
  const __v4si __vzero = {0, 0, 0, 0};

  __asm__("xvcvdpsp %x0,%x1" : "=wa"(__temp) : "wa"(__A) :);

#ifdef _ARCH_PWR8
#ifdef __LITTLE_ENDIAN__
  __temp = vec_mergeo(__temp, __temp);
#else
  __temp = vec_mergee(__temp, __temp);
#endif
  __result = (__v4sf)vec_vpkudum((__vector long long)__temp,
                                 (__vector long long)__vzero);
#else
  {
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
                              0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
    __result = (__v4sf)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
  }
#endif
  return ((__m128)__result);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttpd_epi32(__m128d __A) {
  __v4si __result;
  __v4si __temp;
  const __v4si __vzero = {0, 0, 0, 0};

  /* VSX Vector truncate Double-Precision to integer and Convert to
     Signed Integer Word format with Saturate.  */
  __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__A) :);

#ifdef _ARCH_PWR8
#ifdef __LITTLE_ENDIAN__
  __temp = vec_mergeo(__temp, __temp);
#else
  __temp = vec_mergee(__temp, __temp);
#endif
  __result = (__v4si)vec_vpkudum((__vector long long)__temp,
                                 (__vector long long)__vzero);
#else
  {
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
                              0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
    __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
  }
#endif

  return ((__m128i)__result);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttpd_pi32(__m128d __A) {
  __m128i __result = _mm_cvttpd_epi32(__A);

  return (__m64)__result[0];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi128_si32(__m128i __A) {
  return ((__v4si)__A)[0];
}

#ifdef _ARCH_PWR8
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpi32_pd(__m64 __A) {
  __v4si __temp;
  __v2di __tmp2;
  __v2df __result;

  __temp = (__v4si)vec_splats(__A);
  __tmp2 = (__v2di)vec_unpackl(__temp);
  __result = vec_ctf((__vector signed long long)__tmp2, 0);
  return (__m128d)__result;
}
#endif
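
/* Convert the four SPFP values in A to signed 32-bit integers using the
   current rounding mode.  */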
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtps_epi32(__m128 __A) {
  __v4sf __rounded;
  __v4si __result;

  __rounded = vec_rint((__v4sf)__A);
  __result = vec_cts(__rounded, 0);
  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttps_epi32(__m128 __A) {
  __v4si __result;

  __result = vec_cts((__v4sf)__A, 0);
  return (__m128i)__result;
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtps_pd(__m128 __A) {
  /* Check if vec_doubleh is defined by <altivec.h>.  If so use that.  */
#ifdef vec_doubleh
  return (__m128d)vec_doubleh((__v4sf)__A);
#else
  /* Otherwise the compiler is not current and so we need to generate the
     equivalent code.  */
  __v4sf __a = (__v4sf)__A;
  __v4sf __temp;
  __v2df __result;
#ifdef __LITTLE_ENDIAN__
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[1], [3]}, so we use two
     shift left double vector word immediates to get the elements
     lined up.  */
  __temp = __builtin_vsx_xxsldwi(__a, __a, 3);
  __temp = __builtin_vsx_xxsldwi(__a, __temp, 2);
#else
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[0], [2]}, so we use a merge
     to get the elements lined up.  */
  __temp = vec_vmrghw(__a, __a);
#endif
  __asm__(" xvcvspdp %x0,%x1" : "=wa"(__result) : "wa"(__temp) :);
  return (__m128d)__result;
#endif
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsd_si32(__m128d __A) {
  __v2df __rounded = vec_rint((__v2df)__A);
  int __result = ((__v2df)__rounded)[0];

  return __result;
}

/* Intel intrinsic.  */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsd_si64(__m128d __A) {
  __v2df __rounded = vec_rint((__v2df)__A);
  long long __result = ((__v2df)__rounded)[0];

  return __result;
}

/* Microsoft intrinsic.  */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsd_si64x(__m128d __A) {
  return _mm_cvtsd_si64((__v2df)__A);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttsd_si32(__m128d __A) {
  int __result = ((__v2df)__A)[0];

  return __result;
}

/* Intel intrinsic.  */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttsd_si64(__m128d __A) {
  long long __result = ((__v2df)__A)[0];

  return __result;
}

/* Microsoft intrinsic.  */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttsd_si64x(__m128d __A) {
  return _mm_cvttsd_si64(__A);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsd_ss(__m128 __A, __m128d __B) {
  __v4sf __result = (__v4sf)__A;

#ifdef __LITTLE_ENDIAN__
  __v4sf __temp_s;
  /* Copy double element[0] to element [1] for conversion.  */
  __v2df __temp_b = vec_splat((__v2df)__B, 0);

  /* Pre-rotate __A left 3 (logically right 1) elements.  */
  __result = __builtin_vsx_xxsldwi(__result, __result, 3);
  /* Convert double to single float scalar in a vector.  */
  __asm__("xscvdpsp %x0,%x1" : "=wa"(__temp_s) : "wa"(__temp_b) :);
  /* Shift the resulting scalar into vector element [0].  */
  __result = __builtin_vsx_xxsldwi(__result, __temp_s, 1);
#else
  __result[0] = ((__v2df)__B)[0];
#endif
  return (__m128)__result;
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi32_sd(__m128d __A, int __B) {
  __v2df __result = (__v2df)__A;
  double __db = __B;
  __result[0] = __db;
  return (__m128d)__result;
}

/* Intel intrinsic.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_sd(__m128d __A, long long __B) {
  __v2df __result = (__v2df)__A;
  double __db = __B;
  __result[0] = __db;
  return (__m128d)__result;
}

/* Microsoft intrinsic.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64x_sd(__m128d __A, long long __B) {
  return _mm_cvtsi64_sd(__A, __B);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtss_sd(__m128d __A, __m128 __B) {
#ifdef __LITTLE_ENDIAN__
  /* Use splat to move element [0] into position for the convert.  */
  __v4sf __temp = vec_splat((__v4sf)__B, 0);
  __v2df __res;
  /* Convert single float scalar to double in a vector.  */
  __asm__("xscvspdp %x0,%x1" : "=wa"(__res) : "wa"(__temp) :);
  return (__m128d)vec_mergel(__res, (__v2df)__A);
#else
  __v2df __res = (__v2df)__A;
  __res[0] = ((__v4sf)__B)[0];
  return (__m128d)__res;
#endif
}
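
/* Select dst[0] from __A and dst[1] from __B, each picked by the
   corresponding bit of the 2-bit mask (see _MM_SHUFFLE2).  The
   __GNUC__ < 6 paths account for the reversed vec_xxpermdi operand
   order used by older GCC.  */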
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) {
  __vector double __result;
  const int __litmsk = __mask & 0x3;

  if (__litmsk == 0)
    __result = vec_mergeh(__A, __B);
#if __GNUC__ < 6
  else if (__litmsk == 1)
    __result = vec_xxpermdi(__B, __A, 2);
  else if (__litmsk == 2)
    __result = vec_xxpermdi(__B, __A, 1);
#else
  else if (__litmsk == 1)
    __result = vec_xxpermdi(__A, __B, 2);
  else if (__litmsk == 2)
    __result = vec_xxpermdi(__A, __B, 1);
#endif
  else
    __result = vec_mergel(__A, __B);

  return __result;
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pd(__m128d __A, __m128d __B) {
  return (__m128d)vec_mergel((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pd(__m128d __A, __m128d __B) {
  return (__m128d)vec_mergeh((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadh_pd(__m128d __A, double const *__B) {
  __v2df __result = (__v2df)__A;
  __result[1] = *__B;
  return (__m128d)__result;
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadl_pd(__m128d __A, double const *__B) {
  __v2df __result = (__v2df)__A;
  __result[0] = *__B;
  return (__m128d)__result;
}

#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum.  */

/* Creates a 2-bit mask from the most significant bits of the DPFP values.  */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movemask_pd(__m128d __A) {
#ifdef _ARCH_PWR10
  return vec_extractm((__v2du)__A);
#else
  __vector unsigned long long __result;
  static const __vector unsigned int __perm_mask = {
#ifdef __LITTLE_ENDIAN__
      0x80800040, 0x80808080, 0x80808080, 0x80808080
#else
      0x80808080, 0x80808080, 0x80808080, 0x80804000
#endif
  };

  __result = ((__vector unsigned long long)vec_vbpermq(
      (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));

#ifdef __LITTLE_ENDIAN__
  return __result[1];
#else
  return __result[0];
#endif
#endif /* !_ARCH_PWR10 */
}
#endif /* _ARCH_PWR8 */
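
/* Pack the 16-bit elements of A and B into 8-bit elements using signed
   saturation.  */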
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_packs((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_packs((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packus_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_packsu((__v8hi)__A, (__v8hi)__B);
}
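
/* Interleave elements from the high (unpackhi) or low (unpacklo) halves
   of A and B.  */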
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_mergel((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_mergel((__v8hu)__A, (__v8hu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_mergel((__v4su)__A, (__v4su)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_epi64(__m128i __A, __m128i __B) {
  return (__m128i)vec_mergel((__vector long long)__A, (__vector long long)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_mergeh((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_mergeh((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_mergeh((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_epi64(__m128i __A, __m128i __B) {
  return (__m128i)vec_mergeh((__vector long long)__A, (__vector long long)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_epi8(__m128i __A, __m128i __B) {
  return (__m128i)((__v16qu)__A + (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_epi16(__m128i __A, __m128i __B) {
  return (__m128i)((__v8hu)__A + (__v8hu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_epi32(__m128i __A, __m128i __B) {
  return (__m128i)((__v4su)__A + (__v4su)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_epi64(__m128i __A, __m128i __B) {
  return (__m128i)((__v2du)__A + (__v2du)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_adds((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_adds((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_epu8(__m128i __A, __m128i __B) {
  return (__m128i)vec_adds((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_epu16(__m128i __A, __m128i __B) {
  return (__m128i)vec_adds((__v8hu)__A, (__v8hu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_epi8(__m128i __A, __m128i __B) {
  return (__m128i)((__v16qu)__A - (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_epi16(__m128i __A, __m128i __B) {
  return (__m128i)((__v8hu)__A - (__v8hu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_epi32(__m128i __A, __m128i __B) {
  return (__m128i)((__v4su)__A - (__v4su)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_epi64(__m128i __A, __m128i __B) {
  return (__m128i)((__v2du)__A - (__v2du)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_subs((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_subs((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_epu8(__m128i __A, __m128i __B) {
  return (__m128i)vec_subs((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_epu16(__m128i __A, __m128i __B) {
  return (__m128i)vec_subs((__v8hu)__A, (__v8hu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_madd_epi16(__m128i __A, __m128i __B) {
  __vector signed int __zero = {0, 0, 0, 0};

  return (__m128i)vec_vmsumshm((__v8hi)__A, (__v8hi)__B, __zero);
}
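
/* Multiply the signed 16-bit elements of A and B and keep the high 16
   bits of each 32-bit product; the vec_perm re-interleaves the even and
   odd products into element order.  */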
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhi_epi16(__m128i __A, __m128i __B) {
  __vector signed int __w0, __w1;

  __vector unsigned char __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
      0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
#endif
  };

  __w0 = vec_vmulesh((__v8hi)__A, (__v8hi)__B);
  __w1 = vec_vmulosh((__v8hi)__A, (__v8hi)__B);
  return (__m128i)vec_perm(__w0, __w1, __xform1);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mullo_epi16(__m128i __A, __m128i __B) {
  return (__m128i)((__v8hi)__A * (__v8hi)__B);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_su32(__m64 __A, __m64 __B) {
  unsigned int __a = __A;
  unsigned int __b = __B;

  return ((__m64)__a * (__m64)__b);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_epu32(__m128i __A, __m128i __B) {
#if __GNUC__ < 8
  __v2du __result;

#ifdef __LITTLE_ENDIAN__
  /* VMX Vector Multiply Odd Unsigned Word.  */
  __asm__("vmulouw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :);
#else
  /* VMX Vector Multiply Even Unsigned Word.  */
  __asm__("vmuleuw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :);
#endif
  return (__m128i)__result;
#else
  return (__m128i)vec_mule((__v4su)__A, (__v4su)__B);
#endif
}
#endif
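
/* Shift each 16-bit element of A left by __B bits; counts outside the
   range 0-15 produce a vector of zeros.  */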
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_epi16(__m128i __A, int __B) {
  __v8hu __lshift;
  __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0};

  if (__B >= 0 && __B < 16) {
    if (__builtin_constant_p(__B))
      __lshift = (__v8hu)vec_splat_s16(__B);
    else
      __lshift = vec_splats((unsigned short)__B);

    __result = vec_sl((__v8hi)__A, __lshift);
  }

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_epi32(__m128i __A, int __B) {
  __v4su __lshift;
  __v4si __result = {0, 0, 0, 0};

  if (__B >= 0 && __B < 32) {
    if (__builtin_constant_p(__B) && __B < 16)
      __lshift = (__v4su)vec_splat_s32(__B);
    else
      __lshift = vec_splats((unsigned int)__B);

    __result = vec_sl((__v4si)__A, __lshift);
  }

  return (__m128i)__result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_epi64(__m128i __A, int __B) {
  __v2du __lshift;
  __v2di __result = {0, 0};

  if (__B >= 0 && __B < 64) {
    if (__builtin_constant_p(__B) && __B < 16)
      __lshift = (__v2du)vec_splat_s32(__B);
    else
      __lshift = (__v2du)vec_splats((unsigned int)__B);

    __result = vec_sl((__v2di)__A, __lshift);
  }

  return (__m128i)__result;
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srai_epi16(__m128i __A, int __B) {
  __v8hu __rshift = {15, 15, 15, 15, 15, 15, 15, 15};
  __v8hi __result;

  if (__B < 16) {
    if (__builtin_constant_p(__B))
      __rshift = (__v8hu)vec_splat_s16(__B);
    else
      __rshift = vec_splats((unsigned short)__B);
  }
  __result = vec_sra((__v8hi)__A, __rshift);

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srai_epi32(__m128i __A, int __B) {
  __v4su __rshift = {31, 31, 31, 31};
  __v4si __result;

  if (__B < 32) {
    if (__builtin_constant_p(__B)) {
      if (__B < 16)
        __rshift = (__v4su)vec_splat_s32(__B);
      else
        __rshift = (__v4su)vec_splats((unsigned int)__B);
    } else
      __rshift = vec_splats((unsigned int)__B);
  }
  __result = vec_sra((__v4si)__A, __rshift);

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_bslli_si128(__m128i __A, const int __N) {
  __v16qu __result;
  const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

  if (__N < 16)
    __result = vec_sld((__v16qu)__A, __zeros, __N);
  else
    __result = __zeros;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_bsrli_si128(__m128i __A, const int __N) {
  __v16qu __result;
  const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

  if (__N < 16)
#ifdef __LITTLE_ENDIAN__
    if (__builtin_constant_p(__N))
      /* Would like to use Vector Shift Left Double by Octet
         Immediate here to use the immediate form and avoid
         load of __N * 8 value into a separate VR.  */
      __result = vec_sld(__zeros, (__v16qu)__A, (16 - __N));
    else
#endif
    {
      __v16qu __shift = vec_splats((unsigned char)(__N * 8));
#ifdef __LITTLE_ENDIAN__
      __result = vec_sro((__v16qu)__A, __shift);
#else
      __result = vec_slo((__v16qu)__A, __shift);
#endif
    }
  else
    __result = __zeros;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_si128(__m128i __A, const int __N) {
  return _mm_bsrli_si128(__A, __N);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_si128(__m128i __A, const int _imm5) {
  __v16qu __result;
  const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

  if (_imm5 < 16)
#ifdef __LITTLE_ENDIAN__
    __result = vec_sld((__v16qu)__A, __zeros, _imm5);
#else
    __result = vec_sld(__zeros, (__v16qu)__A, (16 - _imm5));
#endif
  else
    __result = __zeros;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_epi16(__m128i __A, int __B) {
  __v8hu __rshift;
  __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0};

  if (__B < 16) {
    if (__builtin_constant_p(__B))
      __rshift = (__v8hu)vec_splat_s16(__B);
    else
      __rshift = vec_splats((unsigned short)__B);

    __result = vec_sr((__v8hi)__A, __rshift);
  }

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_epi32(__m128i __A, int __B) {
  __v4su __rshift;
  __v4si __result = {0, 0, 0, 0};

  if (__B < 32) {
    if (__builtin_constant_p(__B)) {
      if (__B < 16)
        __rshift = (__v4su)vec_splat_s32(__B);
      else
        __rshift = (__v4su)vec_splats((unsigned int)__B);
    } else
      __rshift = vec_splats((unsigned int)__B);

    __result = vec_sr((__v4si)__A, __rshift);
  }

  return (__m128i)__result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_epi64(__m128i __A, int __B) {
  __v2du __rshift;
  __v2di __result = {0, 0};

  if (__B < 64) {
    if (__builtin_constant_p(__B)) {
      if (__B < 16)
        __rshift = (__v2du)vec_splat_s32(__B);
      else
        __rshift = (__v2du)vec_splats((unsigned long long)__B);
    } else
      __rshift = (__v2du)vec_splats((unsigned int)__B);

    __result = vec_sr((__v2di)__A, __rshift);
  }

  return (__m128i)__result;
}
#endif
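
/* Shift left by the count held in __B; counts larger than the element
   width produce zero via the select against __shmask.  */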
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_epi16(__m128i __A, __m128i __B) {
  __v8hu __lshift;
  __vector __bool short __shmask;
  const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
  __v8hu __result;

#ifdef __LITTLE_ENDIAN__
  __lshift = vec_splat((__v8hu)__B, 0);
#else
  __lshift = vec_splat((__v8hu)__B, 3);
#endif
  __shmask = vec_cmple(__lshift, __shmax);
  __result = vec_sl((__v8hu)__A, __lshift);
  __result = vec_sel((__v8hu)__shmask, __result, __shmask);

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_epi32(__m128i __A, __m128i __B) {
  __v4su __lshift;
  __vector __bool int __shmask;
  const __v4su __shmax = {32, 32, 32, 32};
  __v4su __result;
#ifdef __LITTLE_ENDIAN__
  __lshift = vec_splat((__v4su)__B, 0);
#else
  __lshift = vec_splat((__v4su)__B, 1);
#endif
  __shmask = vec_cmplt(__lshift, __shmax);
  __result = vec_sl((__v4su)__A, __lshift);
  __result = vec_sel((__v4su)__shmask, __result, __shmask);

  return (__m128i)__result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_epi64(__m128i __A, __m128i __B) {
  __v2du __lshift;
  __vector __bool long long __shmask;
  const __v2du __shmax = {64, 64};
  __v2du __result;

  __lshift = vec_splat((__v2du)__B, 0);
  __shmask = vec_cmplt(__lshift, __shmax);
  __result = vec_sl((__v2du)__A, __lshift);
  __result = vec_sel((__v2du)__shmask, __result, __shmask);

  return (__m128i)__result;
}
#endif
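
/* Arithmetic right shifts: the count is clamped to the element width
   minus one, so large counts replicate the sign bit.  */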
1718 extern __inline __m128i
1719 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1720 _mm_sra_epi16(__m128i __A, __m128i __B) {
1721 const __v8hu __rshmax = {15, 15, 15, 15, 15, 15, 15, 15};
1722 __v8hu __rshift;
1723 __v8hi __result;
1725 #ifdef __LITTLE_ENDIAN__
1726 __rshift = vec_splat((__v8hu)__B, 0);
1727 #else
1728 __rshift = vec_splat((__v8hu)__B, 3);
1729 #endif
1730 __rshift = vec_min(__rshift, __rshmax);
1731 __result = vec_sra((__v8hi)__A, __rshift);
1733 return (__m128i)__result;
1736 extern __inline __m128i
1737 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1738 _mm_sra_epi32(__m128i __A, __m128i __B) {
1739 const __v4su __rshmax = {31, 31, 31, 31};
1740 __v4su __rshift;
1741 __v4si __result;
1743 #ifdef __LITTLE_ENDIAN__
1744 __rshift = vec_splat((__v4su)__B, 0);
1745 #else
1746 __rshift = vec_splat((__v4su)__B, 1);
1747 #endif
1748 __rshift = vec_min(__rshift, __rshmax);
1749 __result = vec_sra((__v4si)__A, __rshift);
1751 return (__m128i)__result;
1754 extern __inline __m128i
1755 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1756 _mm_srl_epi16(__m128i __A, __m128i __B) {
1757 __v8hu __rshift;
1758 __vector __bool short __shmask;
1759 const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
1760 __v8hu __result;
1762 #ifdef __LITTLE_ENDIAN__
1763 __rshift = vec_splat((__v8hu)__B, 0);
1764 #else
1765 __rshift = vec_splat((__v8hu)__B, 3);
1766 #endif
1767 __shmask = vec_cmple(__rshift, __shmax);
1768 __result = vec_sr((__v8hu)__A, __rshift);
1769 __result = vec_sel((__v8hu)__shmask, __result, __shmask);
1771 return (__m128i)__result;
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_epi32(__m128i __A, __m128i __B) {
  __v4su __rshift;
  __vector __bool int __shmask;
  const __v4su __shmax = {32, 32, 32, 32};
  __v4su __result;

#ifdef __LITTLE_ENDIAN__
  __rshift = vec_splat((__v4su)__B, 0);
#else
  __rshift = vec_splat((__v4su)__B, 1);
#endif
  __shmask = vec_cmplt(__rshift, __shmax);
  __result = vec_sr((__v4su)__A, __rshift);
  __result = vec_sel((__v4su)__shmask, __result, __shmask);

  return (__m128i)__result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_epi64(__m128i __A, __m128i __B) {
  __v2du __rshift;
  __vector __bool long long __shmask;
  const __v2du __shmax = {64, 64};
  __v2du __result;

  __rshift = vec_splat((__v2du)__B, 0);
  __shmask = vec_cmplt(__rshift, __shmax);
  __result = vec_sr((__v2du)__A, __rshift);
  __result = vec_sel((__v2du)__shmask, __result, __shmask);

  return (__m128i)__result;
}
#endif

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_and_pd(__m128d __A, __m128d __B) {
  return (vec_and((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_andnot_pd(__m128d __A, __m128d __B) {
  return (vec_andc((__v2df)__B, (__v2df)__A));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_or_pd(__m128d __A, __m128d __B) {
  return (vec_or((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_xor_pd(__m128d __A, __m128d __B) {
  return (vec_xor((__v2df)__A, (__v2df)__B));
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_and_si128(__m128i __A, __m128i __B) {
  return (__m128i)vec_and((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_andnot_si128(__m128i __A, __m128i __B) {
  return (__m128i)vec_andc((__v2di)__B, (__v2di)__A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_or_si128(__m128i __A, __m128i __B) {
  return (__m128i)vec_or((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_xor_si128(__m128i __A, __m128i __B) {
  return (__m128i)vec_xor((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpeq((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpeq((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpeq((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmplt((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmplt((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmplt((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpgt((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpgt((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpgt((__v4si)__A, (__v4si)__B);
}

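/* _mm_extract_epi16 and _mm_insert_epi16 below use the GCC/Clang vector
   subscript extension; the extract form zero-extends the selected
   halfword, as the SSE2 instruction does.  */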
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi16(__m128i const __A, int const __N) {
  return (unsigned short)((__v8hi)__A)[__N & 7];
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi16(__m128i const __A, int const __D, int const __N) {
  __v8hi __result = (__v8hi)__A;

  __result[(__N & 7)] = __D;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_max((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epu8(__m128i __A, __m128i __B) {
  return (__m128i)vec_max((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_min((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epu8(__m128i __A, __m128i __B) {
  return (__m128i)vec_min((__v16qu)__A, (__v16qu)__B);
}

#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum.  */

/* Return a mask created from the most significant bit of each 8-bit
   element in A.  */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movemask_epi8(__m128i __A) {
#ifdef _ARCH_PWR10
  return vec_extractm((__v16qu)__A);
#else
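  /* vec_vbpermq gathers one bit per byte of __A, selected by __perm_mask
     as each byte's most significant bit, leaving the 16-bit movemask in
     one halfword of the result.  */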
  __vector unsigned long long __result;
  static const __vector unsigned char __perm_mask = {
      0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
      0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00};

  __result = ((__vector unsigned long long)vec_vbpermq(
      (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));

#ifdef __LITTLE_ENDIAN__
  return __result[1];
#else
  return __result[0];
#endif
#endif /* !_ARCH_PWR10 */
}
#endif /* _ARCH_PWR8 */

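/* _mm_mulhi_epu16 is built from the even/odd unsigned halfword multiplies,
   which produce full 32-bit products; the permute selects the high
   halfword of each product and restores the original element order.  */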
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhi_epu16(__m128i __A, __m128i __B) {
  __v4su __w0, __w1;
  __v16qu __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
      0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
#endif
  };

  __w0 = vec_vmuleuh((__v8hu)__A, (__v8hu)__B);
  __w1 = vec_vmulouh((__v8hu)__A, (__v8hu)__B);
  return (__m128i)vec_perm(__w0, __w1, __xform1);
}

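/* The three shuffle intrinsics below build a vec_perm control vector at
   run time from the 2-bit fields of __mask; for the 16-bit shuffles the
   half of the vector that is not shuffled is passed through by the
   constant part of the control vector.  */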
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shufflehi_epi16(__m128i __A, const int __mask) {
  unsigned long __element_selector_98 = __mask & 0x03;
  unsigned long __element_selector_BA = (__mask >> 2) & 0x03;
  unsigned long __element_selector_DC = (__mask >> 4) & 0x03;
  unsigned long __element_selector_FE = (__mask >> 6) & 0x03;
  static const unsigned short __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#else
      0x0809, 0x0A0B, 0x0C0D, 0x0E0F
#endif
  };
  __v2du __pmask =
#ifdef __LITTLE_ENDIAN__
      {0x1716151413121110UL, 0UL};
#else
      {0x1011121314151617UL, 0UL};
#endif
  __m64_union __t;
  __v2du __a, __r;

  __t.as_short[0] = __permute_selectors[__element_selector_98];
  __t.as_short[1] = __permute_selectors[__element_selector_BA];
  __t.as_short[2] = __permute_selectors[__element_selector_DC];
  __t.as_short[3] = __permute_selectors[__element_selector_FE];
  __pmask[1] = __t.as_m64;
  __a = (__v2du)__A;
  __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
  return (__m128i)__r;
}

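/* _mm_shufflelo_epi16 mirrors _mm_shufflehi_epi16, permuting the four low
   halfwords and passing the high doubleword through unchanged.  */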
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shufflelo_epi16(__m128i __A, const int __mask) {
  unsigned long __element_selector_10 = __mask & 0x03;
  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned short __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
      0x0100, 0x0302, 0x0504, 0x0706
#else
      0x0001, 0x0203, 0x0405, 0x0607
#endif
  };
  __v2du __pmask =
#ifdef __LITTLE_ENDIAN__
      {0UL, 0x1f1e1d1c1b1a1918UL};
#else
      {0UL, 0x18191a1b1c1d1e1fUL};
#endif
  __m64_union __t;
  __v2du __a, __r;
  __t.as_short[0] = __permute_selectors[__element_selector_10];
  __t.as_short[1] = __permute_selectors[__element_selector_32];
  __t.as_short[2] = __permute_selectors[__element_selector_54];
  __t.as_short[3] = __permute_selectors[__element_selector_76];
  __pmask[0] = __t.as_m64;
  __a = (__v2du)__A;
  __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
  return (__m128i)__r;
}

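/* For _mm_shuffle_epi32 both vec_perm inputs are __A; selector bytes of
   0x10 and above index the second input, so the 0x10101010 bias on the
   upper two words still reads from __A.  */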
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shuffle_epi32(__m128i __A, const int __mask) {
  unsigned long __element_selector_10 = __mask & 0x03;
  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned int __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
      0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#else
      0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
#endif
  };
  __v4su __t;

  __t[0] = __permute_selectors[__element_selector_10];
  __t[1] = __permute_selectors[__element_selector_32];
  __t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
  __t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
  return (__m128i)vec_perm((__v4si)__A, (__v4si)__A,
                           (__vector unsigned char)__t);
}

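/* Unlike the x86 maskmovdqu instruction, this is a 16-byte
   load/select/store sequence: bytes whose mask bit is clear are rewritten
   with their original memory values.  */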
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_maskmoveu_si128(__m128i __A, __m128i __B, char *__C) {
  __v2du __hibit = {0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
  __v16qu __mask, __tmp;
  __m128i_u *__p = (__m128i_u *)__C;

  __tmp = (__v16qu)_mm_loadu_si128(__p);
  __mask = (__v16qu)vec_cmpgt((__v16qu)__B, (__v16qu)__hibit);
  __tmp = vec_sel(__tmp, (__v16qu)__A, __mask);
  _mm_storeu_si128(__p, (__m128i)__tmp);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_avg_epu8(__m128i __A, __m128i __B) {
  return (__m128i)vec_avg((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_avg_epu16(__m128i __A, __m128i __B) {
  return (__m128i)vec_avg((__v8hu)__A, (__v8hu)__B);
}

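/* _mm_sad_epu8 computes |__A - __B| per byte (min/max/subtract, or
   vec_absd on POWER9), sums the absolute differences in groups of four
   with vec_sum4s, then folds them into the two 16-bit sums that psadbw
   leaves in the low 16 bits of each doubleword.  */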
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sad_epu8(__m128i __A, __m128i __B) {
  __v16qu __a, __b;
  __v16qu __vabsdiff;
  __v4si __vsum;
  const __v4su __zero = {0, 0, 0, 0};
  __v4si __result;

  __a = (__v16qu)__A;
  __b = (__v16qu)__B;
#ifndef _ARCH_PWR9
  __v16qu __vmin = vec_min(__a, __b);
  __v16qu __vmax = vec_max(__a, __b);
  __vabsdiff = vec_sub(__vmax, __vmin);
#else
  __vabsdiff = vec_absd(__a, __b);
#endif
  /* Sum four groups of bytes into integers.  */
  __vsum = (__vector signed int)vec_sum4s(__vabsdiff, __zero);
#ifdef __LITTLE_ENDIAN__
  /* Sum across four integers with two integer results.  */
  __asm__("vsum2sws %0,%1,%2" : "=v"(__result) : "v"(__vsum), "v"(__zero));
  /* Note: vec_sum2s could be used here, but on little-endian, vector
     shifts are added that are not needed for this use-case.
     A vector shift to correctly position the 32-bit integer results
     (currently at [0] and [2]) to [1] and [3] would then need to be
     swapped back again since the desired results are two 64-bit
     integers ([1]|[0] and [3]|[2]).  Thus, no shift is performed.  */
#else
  /* Sum across four integers with two integer results.  */
  __result = vec_sum2s(__vsum, (__vector signed int)__zero);
  /* Rotate the sums into the correct position.  */
  __result = vec_sld(__result, __result, 6);
#endif
  return (__m128i)__result;
}

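/* The streaming stores below are ordinary stores preceded by a dcbtstt
   cache hint ("data cache block touch for store, transient"); the hint
   only suggests the line will not be reused, it does not make the store
   non-temporal in the x86 sense.  */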
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_si32(int *__A, int __B) {
  /* Use the data cache block touch for store transient.  */
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  *__A = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_si64(long long int *__A, long long int __B) {
  /* Use the data cache block touch for store transient.  */
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  *__A = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_si128(__m128i *__A, __m128i __B) {
  /* Use the data cache block touch for store transient.  */
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  *__A = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_pd(double *__A, __m128d __B) {
  /* Use the data cache block touch for store transient.  */
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  *(__m128d *)__A = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_clflush(void const *__A) {
  /* Use the data cache block flush.  */
  __asm__("dcbf 0,%0" : : "b"(__A) : "memory");
}

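/* Both fences are expressed as C11 atomic fences, which on POWER are
   expected to lower to lwsync (release) and hwsync (sequentially
   consistent) respectively.  */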
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_lfence(void) {
  /* Use light weight sync for load to load ordering.  */
  __atomic_thread_fence(__ATOMIC_RELEASE);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mfence(void) {
  /* Use heavy weight sync for any to any ordering.  */
  __atomic_thread_fence(__ATOMIC_SEQ_CST);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi32_si128(int __A) {
  return _mm_set_epi32(0, 0, 0, __A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_si128(long long __A) {
  return __extension__(__m128i)(__v2di){__A, 0LL};
}

/* Microsoft intrinsic.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64x_si128(long long __A) {
  return __extension__(__m128i)(__v2di){__A, 0LL};
}

/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castpd_ps(__m128d __A) {
  return (__m128)__A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castpd_si128(__m128d __A) {
  return (__m128i)__A;
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castps_pd(__m128 __A) {
  return (__m128d)__A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castps_si128(__m128 __A) {
  return (__m128i)__A;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castsi128_ps(__m128i __A) {
  return (__m128)__A;
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castsi128_pd(__m128i __A) {
  return (__m128d)__A;
}

#else
#include_next <emmintrin.h>
#endif /* defined(__ppc64__) &&
        * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* EMMINTRIN_H_ */