/*===---- emmintrin.h - Implementation of SSE2 intrinsics on PowerPC -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header file is to help porting code using Intel intrinsics
   explicitly from x86_64 to powerpc64/powerpc64le.

   Since the X86 SSE2 intrinsics mainly handle the __m128i and __m128d
   types, the PowerPC VMX/VSX ISA is a good match for vector float SIMD
   operations.  However, scalar float operations in vector (XMM)
   registers require the POWER8 VSX ISA (2.07) level.  There are
   differences in data format and placement of float scalars in the
   vector register, which require extra steps to match SSE2 scalar
   float semantics on POWER.

   It should be noted that there are significant differences between
   the X86_64 MXCSR and the PowerISA FPSCR/VSCR registers.  It is
   recommended to use portable <fenv.h> instead of accessing the MXCSR
   directly.

   Most SSE2 scalar float intrinsic operations can be performed more
   efficiently as C language float scalar operations or optimized to
   use vector SIMD operations.  We recommend this for new applications.
*/
#error                                                                         \
    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef EMMINTRIN_H_
#define EMMINTRIN_H_

#if defined(__ppc64__) &&                                                      \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>

/* We need definitions from the SSE header files.  */
#include <xmmintrin.h>

/* SSE2 */
typedef __vector double __v2df;
typedef __vector long long __v2di;
typedef __vector unsigned long long __v2du;
typedef __vector int __v4si;
typedef __vector unsigned int __v4su;
typedef __vector short __v8hi;
typedef __vector unsigned short __v8hu;
typedef __vector signed char __v16qi;
typedef __vector unsigned char __v16qu;

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
typedef double __m128d __attribute__((__vector_size__(16), __may_alias__));

/* Unaligned version of the same types.  */
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
typedef double __m128d_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));

/* Define a two-value permute mask.  */
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))

/* Create a vector with element 0 as F and the rest zero.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_sd(double __F) {
  return __extension__(__m128d){__F, 0.0};
}

/* Create a vector with both elements equal to F.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pd(double __F) {
  return __extension__(__m128d){__F, __F};
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pd1(double __F) {
  return _mm_set1_pd(__F);
}

/* Create a vector with the lower value X and upper value W.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pd(double __W, double __X) {
  return __extension__(__m128d){__X, __W};
}

/* Create a vector with the lower value W and upper value X.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pd(double __W, double __X) {
  return __extension__(__m128d){__W, __X};
}

/* Create an undefined vector.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_undefined_pd(void) {
  __m128d __Y = __Y;
  return __Y;
}

/* Create a vector of zeros.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setzero_pd(void) {
  return (__m128d)vec_splats(0);
}

/* Sets the low DPFP value of A from the low value of B.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_move_sd(__m128d __A, __m128d __B) {
  __v2df __result = (__v2df)__A;
  __result[0] = ((__v2df)__B)[0];
  return (__m128d)__result;
}

/* Load two DPFP values from P.  The address must be 16-byte aligned.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load_pd(double const *__P) {
  return ((__m128d)vec_ld(0, (__v16qu *)__P));
}

/* Load two DPFP values from P.  The address need not be 16-byte aligned.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadu_pd(double const *__P) {
  return (vec_vsx_ld(0, __P));
}

/* Create a vector with both elements equal to *P.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load1_pd(double const *__P) {
  return (vec_splats(*__P));
}

/* Create a vector with element 0 as *P and the rest zero.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load_sd(double const *__P) {
  return _mm_set_sd(*__P);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load_pd1(double const *__P) {
  return _mm_load1_pd(__P);
}

/* Load two DPFP values in reverse order.  The address must be aligned.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadr_pd(double const *__P) {
  __v2df __tmp = _mm_load_pd(__P);
  return (__m128d)vec_xxpermdi(__tmp, __tmp, 2);
}

/* Store two DPFP values.  The address must be 16-byte aligned.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store_pd(double *__P, __m128d __A) {
  vec_st((__v16qu)__A, 0, (__v16qu *)__P);
}

/* Store two DPFP values.  The address need not be 16-byte aligned.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storeu_pd(double *__P, __m128d __A) {
  *(__m128d_u *)__P = __A;
}

/* Stores the lower DPFP value.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store_sd(double *__P, __m128d __A) {
  *__P = ((__v2df)__A)[0];
}
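
/* Return the lower DPFP value of A as a scalar double.  */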
extern __inline double
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsd_f64(__m128d __A) {
  return ((__v2df)__A)[0];
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storel_pd(double *__P, __m128d __A) {
  _mm_store_sd(__P, __A);
}

/* Stores the upper DPFP value.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storeh_pd(double *__P, __m128d __A) {
  *__P = ((__v2df)__A)[1];
}

/* Store the lower DPFP value across two words.
   The address must be 16-byte aligned.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store1_pd(double *__P, __m128d __A) {
  _mm_store_pd(__P, vec_splat(__A, 0));
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store_pd1(double *__P, __m128d __A) {
  _mm_store1_pd(__P, __A);
}

/* Store two DPFP values in reverse order.  The address must be aligned.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storer_pd(double *__P, __m128d __A) {
  _mm_store_pd(__P, vec_xxpermdi(__A, __A, 2));
}

/* Intel intrinsic.  */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi128_si64(__m128i __A) {
  return ((__v2di)__A)[0];
}

/* Microsoft intrinsic.  */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi128_si64x(__m128i __A) {
  return ((__v2di)__A)[0];
}
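
/* Add the corresponding DPFP values of A and B.  */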
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pd(__m128d __A, __m128d __B) {
  return (__m128d)((__v2df)__A + (__v2df)__B);
}

/* Add the lower double-precision (64-bit) floating-point element in
   a and b, store the result in the lower element of dst, and copy
   the upper element from a to the upper element of dst.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_sd(__m128d __A, __m128d __B) {
  __A[0] = __A[0] + __B[0];
  return (__A);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pd(__m128d __A, __m128d __B) {
  return (__m128d)((__v2df)__A - (__v2df)__B);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_sd(__m128d __A, __m128d __B) {
  __A[0] = __A[0] - __B[0];
  return (__A);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_pd(__m128d __A, __m128d __B) {
  return (__m128d)((__v2df)__A * (__v2df)__B);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_sd(__m128d __A, __m128d __B) {
  __A[0] = __A[0] * __B[0];
  return (__A);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_div_pd(__m128d __A, __m128d __B) {
  return (__m128d)((__v2df)__A / (__v2df)__B);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_div_sd(__m128d __A, __m128d __B) {
  __A[0] = __A[0] / __B[0];
  return (__A);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sqrt_pd(__m128d __A) {
  return (vec_sqrt(__A));
}

/* Return pair {sqrt (B[0]), A[1]}.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sqrt_sd(__m128d __A, __m128d __B) {
  __v2df __c;
  __c = vec_sqrt((__v2df)_mm_set1_pd(__B[0]));
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_pd(__m128d __A, __m128d __B) {
  return (vec_min(__A, __B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = vec_min(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_pd(__m128d __A, __m128d __B) {
  return (vec_max(__A, __B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = vec_max(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}
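
/* Double-precision compares: each element of the result is all 1's if
   the comparison is true for that element and all 0's otherwise.  */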
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmpeq((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmple_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpge_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpneq_pd(__m128d __A, __m128d __B) {
  __v2df __temp = (__v2df)vec_cmpeq((__v2df)__A, (__v2df)__B);
  return ((__m128d)vec_nor(__temp, __temp));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnlt_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnle_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpngt_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnge_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpord_pd(__m128d __A, __m128d __B) {
  __v2du __c, __d;
  /* Compare against self will return false (0's) if NAN.  */
  __c = (__v2du)vec_cmpeq(__A, __A);
  __d = (__v2du)vec_cmpeq(__B, __B);
  /* A != NAN and B != NAN.  */
  return ((__m128d)vec_and(__c, __d));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpunord_pd(__m128d __A, __m128d __B) {
#if _ARCH_PWR8
  __v2du __c, __d;
  /* Compare against self will return false (0's) if NAN.  */
  __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
  __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
  /* A == NAN OR B == NAN converts to:
     NOT(A != NAN) OR NOT(B != NAN).  */
  __c = vec_nor(__c, __c);
  return ((__m128d)vec_orc(__c, __d));
#else
  __v2du __c, __d;
  /* Compare against self will return false (0's) if NAN.  */
  __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
  __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
  /* Invert so that true ('1's) marks the NaN elements.  */
  __c = vec_nor(__c, __c);
  __d = vec_nor(__d, __d);
  return ((__m128d)vec_or(__c, __d));
#endif
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  /* PowerISA VSX does not allow partial (for just lower double)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper double values) we splat the lower double
     before we do the operation.  */
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmpeq(__a, __b);
  /* Then we merge the lower double result with the original upper
     double from __A.  */
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmplt(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmple_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmple(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmpgt(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpge_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmpge(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpneq_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmpeq(__a, __b);
  __c = vec_nor(__c, __c);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnlt_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  /* Not less than is just greater than or equal.  */
  __c = (__v2df)vec_cmpge(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnle_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  /* Not less than or equal is just greater than.  */
  __c = (__v2df)vec_cmpgt(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpngt_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  /* Not greater than is just less than or equal.  */
  __c = (__v2df)vec_cmple(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnge_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  /* Not greater than or equal is just less than.  */
  __c = (__v2df)vec_cmplt(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpord_sd(__m128d __A, __m128d __B) {
  __v2df __r;
  __r = (__v2df)_mm_cmpord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
  return (__m128d)_mm_setr_pd(__r[0], ((__v2df)__A)[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpunord_sd(__m128d __A, __m128d __B) {
  __v2df __r;
  __r = _mm_cmpunord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
  return (__m128d)_mm_setr_pd(__r[0], __A[1]);
}

/* FIXME
   The _mm_comi??_sd and _mm_ucomi??_sd implementations below are
   exactly the same because GCC for PowerPC only generates unordered
   compares (scalar and vector).
   Technically _mm_comieq_sd et al. should be using the ordered
   compare and signal for QNaNs.  The _mm_ucomieq_sd et al. should
   be OK.  */

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comieq_sd(__m128d __A, __m128d __B) {
  return (__A[0] == __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comilt_sd(__m128d __A, __m128d __B) {
  return (__A[0] < __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comile_sd(__m128d __A, __m128d __B) {
  return (__A[0] <= __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comigt_sd(__m128d __A, __m128d __B) {
  return (__A[0] > __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comige_sd(__m128d __A, __m128d __B) {
  return (__A[0] >= __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comineq_sd(__m128d __A, __m128d __B) {
  return (__A[0] != __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomieq_sd(__m128d __A, __m128d __B) {
  return (__A[0] == __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomilt_sd(__m128d __A, __m128d __B) {
  return (__A[0] < __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomile_sd(__m128d __A, __m128d __B) {
  return (__A[0] <= __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomigt_sd(__m128d __A, __m128d __B) {
  return (__A[0] > __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomige_sd(__m128d __A, __m128d __B) {
  return (__A[0] >= __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomineq_sd(__m128d __A, __m128d __B) {
  return (__A[0] != __B[0]);
}

/* Create a vector of Qi, where i is the element number.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_epi64x(long long __q1, long long __q0) {
  return __extension__(__m128i)(__v2di){__q0, __q1};
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_epi64(__m64 __q1, __m64 __q0) {
  return _mm_set_epi64x((long long)__q1, (long long)__q0);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_epi32(int __q3, int __q2, int __q1, int __q0) {
  return __extension__(__m128i)(__v4si){__q0, __q1, __q2, __q3};
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_epi16(short __q7, short __q6, short __q5, short __q4, short __q3,
                  short __q2, short __q1, short __q0) {
  return __extension__(__m128i)(__v8hi){__q0, __q1, __q2, __q3,
                                        __q4, __q5, __q6, __q7};
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_epi8(char __q15, char __q14, char __q13, char __q12, char __q11,
                 char __q10, char __q09, char __q08, char __q07, char __q06,
                 char __q05, char __q04, char __q03, char __q02, char __q01,
                 char __q00) {
  return __extension__(__m128i)(__v16qi){
      __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
      __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15};
}

/* Set all of the elements of the vector to A.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_epi64x(long long __A) {
  return _mm_set_epi64x(__A, __A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_epi64(__m64 __A) {
  return _mm_set_epi64(__A, __A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_epi32(int __A) {
  return _mm_set_epi32(__A, __A, __A, __A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_epi16(short __A) {
  return _mm_set_epi16(__A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_epi8(char __A) {
  return _mm_set_epi8(__A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A,
                      __A, __A, __A, __A, __A);
}

/* Create a vector of Qi, where i is the element number.
   The parameter order is reversed from the _mm_set_epi* functions.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_epi64(__m64 __q0, __m64 __q1) {
  return _mm_set_epi64(__q1, __q0);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_epi32(int __q0, int __q1, int __q2, int __q3) {
  return _mm_set_epi32(__q3, __q2, __q1, __q0);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_epi16(short __q0, short __q1, short __q2, short __q3, short __q4,
                   short __q5, short __q6, short __q7) {
  return _mm_set_epi16(__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_epi8(char __q00, char __q01, char __q02, char __q03, char __q04,
                  char __q05, char __q06, char __q07, char __q08, char __q09,
                  char __q10, char __q11, char __q12, char __q13, char __q14,
                  char __q15) {
  return _mm_set_epi8(__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
                      __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
}

/* Load 128 bits of integer data.  The address must be 16-byte aligned.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load_si128(__m128i const *__P) {
  return *__P;
}
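
/* Load 128 bits of integer data.  The address need not be aligned.  */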
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadu_si128(__m128i_u const *__P) {
  return (__m128i)(vec_vsx_ld(0, (signed int const *)__P));
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadl_epi64(__m128i_u const *__P) {
  return _mm_set_epi64((__m64)0LL, *(__m64 *)__P);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store_si128(__m128i *__P, __m128i __B) {
  vec_st((__v16qu)__B, 0, (__v16qu *)__P);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storeu_si128(__m128i_u *__P, __m128i __B) {
  *__P = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storel_epi64(__m128i_u *__P, __m128i __B) {
  *(long long *)__P = ((__v2di)__B)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movepi64_pi64(__m128i_u __B) {
  return (__m64)((__v2di)__B)[0];
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movpi64_epi64(__m64 __A) {
  return _mm_set_epi64((__m64)0LL, __A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_move_epi64(__m128i __A) {
  return _mm_set_epi64((__m64)0LL, (__m64)__A[0]);
}

/* Create an undefined vector.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_undefined_si128(void) {
  __m128i __Y = __Y;
  return __Y;
}

/* Create a vector of zeros.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setzero_si128(void) {
  return __extension__(__m128i)(__v4si){0, 0, 0, 0};
}

#ifdef _ARCH_PWR8
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi32_pd(__m128i __A) {
  __v2di __val;
  /* For LE we need the Vector Unpack Low Signed Word instruction,
     which vec_unpackh generates.  */
  __val = (__v2di)vec_unpackh((__v4si)__A);

  return (__m128d)vec_ctf(__val, 0);
}
#endif

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi32_ps(__m128i __A) {
  return ((__m128)vec_ctf((__v4si)__A, 0));
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpd_epi32(__m128d __A) {
  __v2df __rounded = vec_rint(__A);
  __v4si __result, __temp;
  const __v4si __vzero = {0, 0, 0, 0};

  /* VSX Vector truncate Double-Precision to integer and Convert to
     Signed Integer Word format with Saturate.  */
  __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__rounded) :);

#ifdef _ARCH_PWR8
#ifdef __LITTLE_ENDIAN__
  __temp = vec_mergeo(__temp, __temp);
#else
  __temp = vec_mergee(__temp, __temp);
#endif
  __result = (__v4si)vec_vpkudum((__vector long long)__temp,
                                 (__vector long long)__vzero);
#else
  {
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
                              0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
    __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
  }
#endif
  return (__m128i)__result;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpd_pi32(__m128d __A) {
  __m128i __result = _mm_cvtpd_epi32(__A);

  return (__m64)__result[0];
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpd_ps(__m128d __A) {
  __v4sf __result;
  __v4si __temp;
  const __v4si __vzero = {0, 0, 0, 0};

  __asm__("xvcvdpsp %x0,%x1" : "=wa"(__temp) : "wa"(__A) :);

#ifdef _ARCH_PWR8
#ifdef __LITTLE_ENDIAN__
  __temp = vec_mergeo(__temp, __temp);
#else
  __temp = vec_mergee(__temp, __temp);
#endif
  __result = (__v4sf)vec_vpkudum((__vector long long)__temp,
                                 (__vector long long)__vzero);
#else
  {
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
                              0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
    __result = (__v4sf)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
  }
#endif
  return ((__m128)__result);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttpd_epi32(__m128d __A) {
  __v4si __result;
  __v4si __temp;
  const __v4si __vzero = {0, 0, 0, 0};

  /* VSX Vector truncate Double-Precision to integer and Convert to
     Signed Integer Word format with Saturate.  */
  __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__A) :);

#ifdef _ARCH_PWR8
#ifdef __LITTLE_ENDIAN__
  __temp = vec_mergeo(__temp, __temp);
#else
  __temp = vec_mergee(__temp, __temp);
#endif
  __result = (__v4si)vec_vpkudum((__vector long long)__temp,
                                 (__vector long long)__vzero);
#else
  {
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
                              0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
    __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
  }
#endif

  return ((__m128i)__result);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttpd_pi32(__m128d __A) {
  __m128i __result = _mm_cvttpd_epi32(__A);

  return (__m64)__result[0];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi128_si32(__m128i __A) {
  return ((__v4si)__A)[0];
}

#ifdef _ARCH_PWR8
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpi32_pd(__m64 __A) {
  __v4si __temp;
  __v2di __tmp2;
  __v2df __result;

  __temp = (__v4si)vec_splats(__A);
  __tmp2 = (__v2di)vec_unpackl(__temp);
  __result = vec_ctf((__vector signed long long)__tmp2, 0);
  return (__m128d)__result;
}
#endif
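
/* Convert the four SPFP values in A to signed 32-bit integers using the
   current rounding mode.  */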
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtps_epi32(__m128 __A) {
  __v4sf __rounded;
  __v4si __result;

  __rounded = vec_rint((__v4sf)__A);
  __result = vec_cts(__rounded, 0);
  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttps_epi32(__m128 __A) {
  __v4si __result;

  __result = vec_cts((__v4sf)__A, 0);
  return (__m128i)__result;
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtps_pd(__m128 __A) {
  /* Check if vec_doubleh is defined by <altivec.h>.  If so use that.  */
#ifdef vec_doubleh
  return (__m128d)vec_doubleh((__v4sf)__A);
#else
  /* Otherwise the compiler is not current and so we need to generate the
     equivalent code.  */
  __v4sf __a = (__v4sf)__A;
  __v4sf __temp;
  __v2df __result;
#ifdef __LITTLE_ENDIAN__
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[1], [3]}, so we use two
     shift left double vector word immediates to get the elements
     lined up.  */
  __temp = __builtin_vsx_xxsldwi(__a, __a, 3);
  __temp = __builtin_vsx_xxsldwi(__a, __temp, 2);
#else
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[0], [2]}, so we use a merge
     to get the elements lined up.  */
  __temp = vec_vmrghw(__a, __a);
#endif
  __asm__(" xvcvspdp %x0,%x1" : "=wa"(__result) : "wa"(__temp) :);
  return (__m128d)__result;
#endif
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsd_si32(__m128d __A) {
  __v2df __rounded = vec_rint((__v2df)__A);
  int __result = ((__v2df)__rounded)[0];

  return __result;
}

/* Intel intrinsic.  */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsd_si64(__m128d __A) {
  __v2df __rounded = vec_rint((__v2df)__A);
  long long __result = ((__v2df)__rounded)[0];

  return __result;
}

/* Microsoft intrinsic.  */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsd_si64x(__m128d __A) {
  return _mm_cvtsd_si64((__v2df)__A);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttsd_si32(__m128d __A) {
  int __result = ((__v2df)__A)[0];

  return __result;
}

/* Intel intrinsic.  */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttsd_si64(__m128d __A) {
  long long __result = ((__v2df)__A)[0];

  return __result;
}

/* Microsoft intrinsic.  */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttsd_si64x(__m128d __A) {
  return _mm_cvttsd_si64(__A);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsd_ss(__m128 __A, __m128d __B) {
  __v4sf __result = (__v4sf)__A;

#ifdef __LITTLE_ENDIAN__
  __v4sf __temp_s;
  /* Copy double element[0] to element [1] for conversion.  */
  __v2df __temp_b = vec_splat((__v2df)__B, 0);

  /* Pre-rotate __A left 3 (logically right 1) elements.  */
  __result = __builtin_vsx_xxsldwi(__result, __result, 3);
  /* Convert double to single float scalar in a vector.  */
  __asm__("xscvdpsp %x0,%x1" : "=wa"(__temp_s) : "wa"(__temp_b) :);
  /* Shift the resulting scalar into vector element [0].  */
  __result = __builtin_vsx_xxsldwi(__result, __temp_s, 1);
#else
  __result[0] = ((__v2df)__B)[0];
#endif
  return (__m128)__result;
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi32_sd(__m128d __A, int __B) {
  __v2df __result = (__v2df)__A;
  double __db = __B;
  __result[0] = __db;
  return (__m128d)__result;
}

/* Intel intrinsic.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_sd(__m128d __A, long long __B) {
  __v2df __result = (__v2df)__A;
  double __db = __B;
  __result[0] = __db;
  return (__m128d)__result;
}

/* Microsoft intrinsic.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64x_sd(__m128d __A, long long __B) {
  return _mm_cvtsi64_sd(__A, __B);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtss_sd(__m128d __A, __m128 __B) {
#ifdef __LITTLE_ENDIAN__
  /* Use splat to move element [0] into position for the convert.  */
  __v4sf __temp = vec_splat((__v4sf)__B, 0);
  __v2df __res;
  /* Convert single float scalar to double in a vector.  */
  __asm__("xscvspdp %x0,%x1" : "=wa"(__res) : "wa"(__temp) :);
  return (__m128d)vec_mergel(__res, (__v2df)__A);
#else
  __v2df __res = (__v2df)__A;
  __res[0] = ((__v4sf)__B)[0];
  return (__m128d)__res;
#endif
}
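
/* Select dst[0] from __A and dst[1] from __B, each picked by the
   corresponding bit of the 2-bit mask (see _MM_SHUFFLE2).  The
   __GNUC__ < 6 paths account for the reversed vec_xxpermdi operand
   order used by older GCC.  */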
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) {
  __vector double __result;
  const int __litmsk = __mask & 0x3;

  if (__litmsk == 0)
    __result = vec_mergeh(__A, __B);
#if __GNUC__ < 6
  else if (__litmsk == 1)
    __result = vec_xxpermdi(__B, __A, 2);
  else if (__litmsk == 2)
    __result = vec_xxpermdi(__B, __A, 1);
#else
  else if (__litmsk == 1)
    __result = vec_xxpermdi(__A, __B, 2);
  else if (__litmsk == 2)
    __result = vec_xxpermdi(__A, __B, 1);
#endif
  else
    __result = vec_mergel(__A, __B);

  return __result;
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pd(__m128d __A, __m128d __B) {
  return (__m128d)vec_mergel((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pd(__m128d __A, __m128d __B) {
  return (__m128d)vec_mergeh((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadh_pd(__m128d __A, double const *__B) {
  __v2df __result = (__v2df)__A;
  __result[1] = *__B;
  return (__m128d)__result;
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadl_pd(__m128d __A, double const *__B) {
  __v2df __result = (__v2df)__A;
  __result[0] = *__B;
  return (__m128d)__result;
}

#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum.  */

/* Creates a 2-bit mask from the most significant bits of the DPFP values.  */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movemask_pd(__m128d __A) {
#ifdef _ARCH_PWR10
  return vec_extractm((__v2du)__A);
#else
  __vector unsigned long long __result;
  static const __vector unsigned int __perm_mask = {
#ifdef __LITTLE_ENDIAN__
      0x80800040, 0x80808080, 0x80808080, 0x80808080
#else
      0x80808080, 0x80808080, 0x80808080, 0x80804000
#endif
  };

  __result = ((__vector unsigned long long)vec_vbpermq(
      (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));

#ifdef __LITTLE_ENDIAN__
  return __result[1];
#else
  return __result[0];
#endif
#endif /* !_ARCH_PWR10 */
}
#endif /* _ARCH_PWR8 */
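
/* Pack the 16-bit elements of A and B into 8-bit elements using signed
   saturation.  */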
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_packs((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_packs((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packus_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_packsu((__v8hi)__A, (__v8hi)__B);
}
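
/* Interleave elements from the high (unpackhi) or low (unpacklo) halves
   of A and B.  */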
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_mergel((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_mergel((__v8hu)__A, (__v8hu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_mergel((__v4su)__A, (__v4su)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_epi64(__m128i __A, __m128i __B) {
  return (__m128i)vec_mergel((__vector long long)__A, (__vector long long)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_mergeh((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_mergeh((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_mergeh((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_epi64(__m128i __A, __m128i __B) {
  return (__m128i)vec_mergeh((__vector long long)__A, (__vector long long)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_epi8(__m128i __A, __m128i __B) {
  return (__m128i)((__v16qu)__A + (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_epi16(__m128i __A, __m128i __B) {
  return (__m128i)((__v8hu)__A + (__v8hu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_epi32(__m128i __A, __m128i __B) {
  return (__m128i)((__v4su)__A + (__v4su)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_epi64(__m128i __A, __m128i __B) {
  return (__m128i)((__v2du)__A + (__v2du)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_adds((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_adds((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_epu8(__m128i __A, __m128i __B) {
  return (__m128i)vec_adds((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_epu16(__m128i __A, __m128i __B) {
  return (__m128i)vec_adds((__v8hu)__A, (__v8hu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_epi8(__m128i __A, __m128i __B) {
  return (__m128i)((__v16qu)__A - (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_epi16(__m128i __A, __m128i __B) {
  return (__m128i)((__v8hu)__A - (__v8hu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_epi32(__m128i __A, __m128i __B) {
  return (__m128i)((__v4su)__A - (__v4su)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_epi64(__m128i __A, __m128i __B) {
  return (__m128i)((__v2du)__A - (__v2du)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_subs((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_subs((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_epu8(__m128i __A, __m128i __B) {
  return (__m128i)vec_subs((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_epu16(__m128i __A, __m128i __B) {
  return (__m128i)vec_subs((__v8hu)__A, (__v8hu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_madd_epi16(__m128i __A, __m128i __B) {
  __vector signed int __zero = {0, 0, 0, 0};

  return (__m128i)vec_vmsumshm((__v8hi)__A, (__v8hi)__B, __zero);
}
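
/* Multiply the signed 16-bit elements of A and B and keep the high 16
   bits of each 32-bit product; the vec_perm re-interleaves the even and
   odd products into element order.  */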
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhi_epi16(__m128i __A, __m128i __B) {
  __vector signed int __w0, __w1;

  __vector unsigned char __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
      0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
#endif
  };

  __w0 = vec_vmulesh((__v8hi)__A, (__v8hi)__B);
  __w1 = vec_vmulosh((__v8hi)__A, (__v8hi)__B);
  return (__m128i)vec_perm(__w0, __w1, __xform1);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mullo_epi16(__m128i __A, __m128i __B) {
  return (__m128i)((__v8hi)__A * (__v8hi)__B);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_su32(__m64 __A, __m64 __B) {
  unsigned int __a = __A;
  unsigned int __b = __B;

  return ((__m64)__a * (__m64)__b);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_epu32(__m128i __A, __m128i __B) {
#if __GNUC__ < 8
  __v2du __result;

#ifdef __LITTLE_ENDIAN__
  /* VMX Vector Multiply Odd Unsigned Word.  */
  __asm__("vmulouw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :);
#else
  /* VMX Vector Multiply Even Unsigned Word.  */
  __asm__("vmuleuw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :);
#endif
  return (__m128i)__result;
#else
  return (__m128i)vec_mule((__v4su)__A, (__v4su)__B);
#endif
}
#endif
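
/* Shift each 16-bit element of A left by __B bits; counts outside the
   range 0-15 produce a vector of zeros.  */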
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_epi16(__m128i __A, int __B) {
  __v8hu __lshift;
  __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0};

  if (__B >= 0 && __B < 16) {
    if (__builtin_constant_p(__B))
      __lshift = (__v8hu)vec_splat_s16(__B);
    else
      __lshift = vec_splats((unsigned short)__B);

    __result = vec_sl((__v8hi)__A, __lshift);
  }

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_epi32(__m128i __A, int __B) {
  __v4su __lshift;
  __v4si __result = {0, 0, 0, 0};

  if (__B >= 0 && __B < 32) {
    if (__builtin_constant_p(__B) && __B < 16)
      __lshift = (__v4su)vec_splat_s32(__B);
    else
      __lshift = vec_splats((unsigned int)__B);

    __result = vec_sl((__v4si)__A, __lshift);
  }

  return (__m128i)__result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_epi64(__m128i __A, int __B) {
  __v2du __lshift;
  __v2di __result = {0, 0};

  if (__B >= 0 && __B < 64) {
    if (__builtin_constant_p(__B) && __B < 16)
      __lshift = (__v2du)vec_splat_s32(__B);
    else
      __lshift = (__v2du)vec_splats((unsigned int)__B);

    __result = vec_sl((__v2di)__A, __lshift);
  }

  return (__m128i)__result;
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srai_epi16(__m128i __A, int __B) {
  __v8hu __rshift = {15, 15, 15, 15, 15, 15, 15, 15};
  __v8hi __result;

  if (__B < 16) {
    if (__builtin_constant_p(__B))
      __rshift = (__v8hu)vec_splat_s16(__B);
    else
      __rshift = vec_splats((unsigned short)__B);
  }
  __result = vec_sra((__v8hi)__A, __rshift);

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srai_epi32(__m128i __A, int __B) {
  __v4su __rshift = {31, 31, 31, 31};
  __v4si __result;

  if (__B < 32) {
    if (__builtin_constant_p(__B)) {
      if (__B < 16)
        __rshift = (__v4su)vec_splat_s32(__B);
      else
        __rshift = (__v4su)vec_splats((unsigned int)__B);
    } else
      __rshift = vec_splats((unsigned int)__B);
  }
  __result = vec_sra((__v4si)__A, __rshift);

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_bslli_si128(__m128i __A, const int __N) {
  __v16qu __result;
  const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

  if (__N < 16)
    __result = vec_sld((__v16qu)__A, __zeros, __N);
  else
    __result = __zeros;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_bsrli_si128(__m128i __A, const int __N) {
  __v16qu __result;
  const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

  if (__N < 16)
#ifdef __LITTLE_ENDIAN__
    if (__builtin_constant_p(__N))
      /* Would like to use Vector Shift Left Double by Octet
         Immediate here to use the immediate form and avoid
         load of __N * 8 value into a separate VR.  */
      __result = vec_sld(__zeros, (__v16qu)__A, (16 - __N));
    else
#endif
    {
      __v16qu __shift = vec_splats((unsigned char)(__N * 8));
#ifdef __LITTLE_ENDIAN__
      __result = vec_sro((__v16qu)__A, __shift);
#else
      __result = vec_slo((__v16qu)__A, __shift);
#endif
    }
  else
    __result = __zeros;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_si128(__m128i __A, const int __N) {
  return _mm_bsrli_si128(__A, __N);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_si128(__m128i __A, const int _imm5) {
  __v16qu __result;
  const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

  if (_imm5 < 16)
#ifdef __LITTLE_ENDIAN__
    __result = vec_sld((__v16qu)__A, __zeros, _imm5);
#else
    __result = vec_sld(__zeros, (__v16qu)__A, (16 - _imm5));
#endif
  else
    __result = __zeros;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_epi16(__m128i __A, int __B) {
  __v8hu __rshift;
  __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0};

  if (__B < 16) {
    if (__builtin_constant_p(__B))
      __rshift = (__v8hu)vec_splat_s16(__B);
    else
      __rshift = vec_splats((unsigned short)__B);

    __result = vec_sr((__v8hi)__A, __rshift);
  }

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_epi32(__m128i __A, int __B) {
  __v4su __rshift;
  __v4si __result = {0, 0, 0, 0};

  if (__B < 32) {
    if (__builtin_constant_p(__B)) {
      if (__B < 16)
        __rshift = (__v4su)vec_splat_s32(__B);
      else
        __rshift = (__v4su)vec_splats((unsigned int)__B);
    } else
      __rshift = vec_splats((unsigned int)__B);

    __result = vec_sr((__v4si)__A, __rshift);
  }

  return (__m128i)__result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_epi64(__m128i __A, int __B) {
  __v2du __rshift;
  __v2di __result = {0, 0};

  if (__B < 64) {
    if (__builtin_constant_p(__B)) {
      if (__B < 16)
        __rshift = (__v2du)vec_splat_s32(__B);
      else
        __rshift = (__v2du)vec_splats((unsigned long long)__B);
    } else
      __rshift = (__v2du)vec_splats((unsigned int)__B);

    __result = vec_sr((__v2di)__A, __rshift);
  }

  return (__m128i)__result;
}
#endif
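
/* Shift left by the count held in __B; counts larger than the element
   width produce zero via the select against __shmask.  */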
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_epi16(__m128i __A, __m128i __B) {
  __v8hu __lshift;
  __vector __bool short __shmask;
  const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
  __v8hu __result;

#ifdef __LITTLE_ENDIAN__
  __lshift = vec_splat((__v8hu)__B, 0);
#else
  __lshift = vec_splat((__v8hu)__B, 3);
#endif
  __shmask = vec_cmple(__lshift, __shmax);
  __result = vec_sl((__v8hu)__A, __lshift);
  __result = vec_sel((__v8hu)__shmask, __result, __shmask);

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_epi32(__m128i __A, __m128i __B) {
  __v4su __lshift;
  __vector __bool int __shmask;
  const __v4su __shmax = {32, 32, 32, 32};
  __v4su __result;
#ifdef __LITTLE_ENDIAN__
  __lshift = vec_splat((__v4su)__B, 0);
#else
  __lshift = vec_splat((__v4su)__B, 1);
#endif
  __shmask = vec_cmplt(__lshift, __shmax);
  __result = vec_sl((__v4su)__A, __lshift);
  __result = vec_sel((__v4su)__shmask, __result, __shmask);

  return (__m128i)__result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_epi64(__m128i __A, __m128i __B) {
  __v2du __lshift;
  __vector __bool long long __shmask;
  const __v2du __shmax = {64, 64};
  __v2du __result;

  __lshift = vec_splat((__v2du)__B, 0);
  __shmask = vec_cmplt(__lshift, __shmax);
  __result = vec_sl((__v2du)__A, __lshift);
  __result = vec_sel((__v2du)__shmask, __result, __shmask);

  return (__m128i)__result;
}
#endif
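
/* Arithmetic right shifts: the count is clamped to the element width
   minus one, so large counts replicate the sign bit.  */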
1718 extern __inline __m128i
1719 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1720 _mm_sra_epi16(__m128i __A, __m128i __B) {
1721 const __v8hu __rshmax = {15, 15, 15, 15, 15, 15, 15, 15};
1722 __v8hu __rshift;
1723 __v8hi __result;
1725 #ifdef __LITTLE_ENDIAN__
1726 __rshift = vec_splat((__v8hu)__B, 0);
1727 #else
1728 __rshift = vec_splat((__v8hu)__B, 3);
1729 #endif
1730 __rshift = vec_min(__rshift, __rshmax);
1731 __result = vec_sra((__v8hi)__A, __rshift);
1733 return (__m128i)__result;
1736 extern __inline __m128i
1737 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1738 _mm_sra_epi32(__m128i __A, __m128i __B) {
1739 const __v4su __rshmax = {31, 31, 31, 31};
1740 __v4su __rshift;
1741 __v4si __result;
1743 #ifdef __LITTLE_ENDIAN__
1744 __rshift = vec_splat((__v4su)__B, 0);
1745 #else
1746 __rshift = vec_splat((__v4su)__B, 1);
1747 #endif
1748 __rshift = vec_min(__rshift, __rshmax);
1749 __result = vec_sra((__v4si)__A, __rshift);
1751 return (__m128i)__result;
1754 extern __inline __m128i
1755 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1756 _mm_srl_epi16(__m128i __A, __m128i __B) {
1757 __v8hu __rshift;
1758 __vector __bool short __shmask;
1759 const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
1760 __v8hu __result;
1762 #ifdef __LITTLE_ENDIAN__
1763 __rshift = vec_splat((__v8hu)__B, 0);
1764 #else
1765 __rshift = vec_splat((__v8hu)__B, 3);
1766 #endif
1767 __shmask = vec_cmple(__rshift, __shmax);
1768 __result = vec_sr((__v8hu)__A, __rshift);
1769 __result = vec_sel((__v8hu)__shmask, __result, __shmask);
1771 return (__m128i)__result;
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_epi32(__m128i __A, __m128i __B) {
  __v4su __rshift;
  __vector __bool int __shmask;
  const __v4su __shmax = {32, 32, 32, 32};
  __v4su __result;

#ifdef __LITTLE_ENDIAN__
  __rshift = vec_splat((__v4su)__B, 0);
#else
  __rshift = vec_splat((__v4su)__B, 1);
#endif
  __shmask = vec_cmplt(__rshift, __shmax);
  __result = vec_sr((__v4su)__A, __rshift);
  __result = vec_sel((__v4su)__shmask, __result, __shmask);

  return (__m128i)__result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_epi64(__m128i __A, __m128i __B) {
  __v2du __rshift;
  __vector __bool long long __shmask;
  const __v2du __shmax = {64, 64};
  __v2du __result;

  __rshift = vec_splat((__v2du)__B, 0);
  __shmask = vec_cmplt(__rshift, __shmax);
  __result = vec_sr((__v2du)__A, __rshift);
  __result = vec_sel((__v2du)__shmask, __result, __shmask);

  return (__m128i)__result;
}
#endif

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_and_pd(__m128d __A, __m128d __B) {
  return (vec_and((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_andnot_pd(__m128d __A, __m128d __B) {
  return (vec_andc((__v2df)__B, (__v2df)__A));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_or_pd(__m128d __A, __m128d __B) {
  return (vec_or((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_xor_pd(__m128d __A, __m128d __B) {
  return (vec_xor((__v2df)__A, (__v2df)__B));
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_and_si128(__m128i __A, __m128i __B) {
  return (__m128i)vec_and((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_andnot_si128(__m128i __A, __m128i __B) {
  return (__m128i)vec_andc((__v2di)__B, (__v2di)__A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_or_si128(__m128i __A, __m128i __B) {
  return (__m128i)vec_or((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_xor_si128(__m128i __A, __m128i __B) {
  return (__m128i)vec_xor((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpeq((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpeq((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpeq((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmplt((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmplt((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmplt((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpgt((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpgt((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpgt((__v4si)__A, (__v4si)__B);
}

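/* _mm_extract_epi16 and _mm_insert_epi16 below use the GCC/Clang vector
   subscript extension; the extract form zero-extends the selected
   halfword, as the SSE2 instruction does.  */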
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi16(__m128i const __A, int const __N) {
  return (unsigned short)((__v8hi)__A)[__N & 7];
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi16(__m128i const __A, int const __D, int const __N) {
  __v8hi __result = (__v8hi)__A;

  __result[(__N & 7)] = __D;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_max((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epu8(__m128i __A, __m128i __B) {
  return (__m128i)vec_max((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_min((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epu8(__m128i __A, __m128i __B) {
  return (__m128i)vec_min((__v16qu)__A, (__v16qu)__B);
}

#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum.  */

/* Return a mask created from the most significant bit of each 8-bit
   element in A.  */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movemask_epi8(__m128i __A) {
#ifdef _ARCH_PWR10
  return vec_extractm((__v16qu)__A);
#else
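  /* vec_vbpermq gathers one bit per byte of __A, selected by __perm_mask
     as each byte's most significant bit, leaving the 16-bit movemask in
     one halfword of the result.  */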
  __vector unsigned long long __result;
  static const __vector unsigned char __perm_mask = {
      0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
      0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00};

  __result = ((__vector unsigned long long)vec_vbpermq(
      (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));

#ifdef __LITTLE_ENDIAN__
  return __result[1];
#else
  return __result[0];
#endif
#endif /* !_ARCH_PWR10 */
}
#endif /* _ARCH_PWR8 */

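/* _mm_mulhi_epu16 is built from the even/odd unsigned halfword multiplies,
   which produce full 32-bit products; the permute selects the high
   halfword of each product and restores the original element order.  */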
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhi_epu16(__m128i __A, __m128i __B) {
  __v4su __w0, __w1;
  __v16qu __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
      0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
#endif
  };

  __w0 = vec_vmuleuh((__v8hu)__A, (__v8hu)__B);
  __w1 = vec_vmulouh((__v8hu)__A, (__v8hu)__B);
  return (__m128i)vec_perm(__w0, __w1, __xform1);
}

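/* The three shuffle intrinsics below build a vec_perm control vector at
   run time from the 2-bit fields of __mask; for the 16-bit shuffles the
   half of the vector that is not shuffled is passed through by the
   constant part of the control vector.  */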
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shufflehi_epi16(__m128i __A, const int __mask) {
  unsigned long __element_selector_98 = __mask & 0x03;
  unsigned long __element_selector_BA = (__mask >> 2) & 0x03;
  unsigned long __element_selector_DC = (__mask >> 4) & 0x03;
  unsigned long __element_selector_FE = (__mask >> 6) & 0x03;
  static const unsigned short __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#else
      0x0809, 0x0A0B, 0x0C0D, 0x0E0F
#endif
  };
  __v2du __pmask =
#ifdef __LITTLE_ENDIAN__
      {0x1716151413121110UL, 0UL};
#else
      {0x1011121314151617UL, 0UL};
#endif
  __m64_union __t;
  __v2du __a, __r;

  __t.as_short[0] = __permute_selectors[__element_selector_98];
  __t.as_short[1] = __permute_selectors[__element_selector_BA];
  __t.as_short[2] = __permute_selectors[__element_selector_DC];
  __t.as_short[3] = __permute_selectors[__element_selector_FE];
  __pmask[1] = __t.as_m64;
  __a = (__v2du)__A;
  __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
  return (__m128i)__r;
}

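/* _mm_shufflelo_epi16 mirrors _mm_shufflehi_epi16, permuting the four low
   halfwords and passing the high doubleword through unchanged.  */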
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shufflelo_epi16(__m128i __A, const int __mask) {
  unsigned long __element_selector_10 = __mask & 0x03;
  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned short __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
      0x0100, 0x0302, 0x0504, 0x0706
#else
      0x0001, 0x0203, 0x0405, 0x0607
#endif
  };
  __v2du __pmask =
#ifdef __LITTLE_ENDIAN__
      {0UL, 0x1f1e1d1c1b1a1918UL};
#else
      {0UL, 0x18191a1b1c1d1e1fUL};
#endif
  __m64_union __t;
  __v2du __a, __r;
  __t.as_short[0] = __permute_selectors[__element_selector_10];
  __t.as_short[1] = __permute_selectors[__element_selector_32];
  __t.as_short[2] = __permute_selectors[__element_selector_54];
  __t.as_short[3] = __permute_selectors[__element_selector_76];
  __pmask[0] = __t.as_m64;
  __a = (__v2du)__A;
  __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
  return (__m128i)__r;
}

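/* For _mm_shuffle_epi32 both vec_perm inputs are __A; selector bytes of
   0x10 and above index the second input, so the 0x10101010 bias on the
   upper two words still reads from __A.  */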
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shuffle_epi32(__m128i __A, const int __mask) {
  unsigned long __element_selector_10 = __mask & 0x03;
  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned int __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
      0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#else
      0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
#endif
  };
  __v4su __t;

  __t[0] = __permute_selectors[__element_selector_10];
  __t[1] = __permute_selectors[__element_selector_32];
  __t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
  __t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
  return (__m128i)vec_perm((__v4si)__A, (__v4si)__A,
                           (__vector unsigned char)__t);
}

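/* Unlike the x86 maskmovdqu instruction, this is a 16-byte
   load/select/store sequence: bytes whose mask bit is clear are rewritten
   with their original memory values.  */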
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_maskmoveu_si128(__m128i __A, __m128i __B, char *__C) {
  __v2du __hibit = {0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
  __v16qu __mask, __tmp;
  __m128i_u *__p = (__m128i_u *)__C;

  __tmp = (__v16qu)_mm_loadu_si128(__p);
  __mask = (__v16qu)vec_cmpgt((__v16qu)__B, (__v16qu)__hibit);
  __tmp = vec_sel(__tmp, (__v16qu)__A, __mask);
  _mm_storeu_si128(__p, (__m128i)__tmp);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_avg_epu8(__m128i __A, __m128i __B) {
  return (__m128i)vec_avg((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_avg_epu16(__m128i __A, __m128i __B) {
  return (__m128i)vec_avg((__v8hu)__A, (__v8hu)__B);
}

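/* _mm_sad_epu8 computes |__A - __B| per byte (min/max/subtract, or
   vec_absd on POWER9), sums the absolute differences in groups of four
   with vec_sum4s, then folds them into the two 16-bit sums that psadbw
   leaves in the low 16 bits of each doubleword.  */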
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sad_epu8(__m128i __A, __m128i __B) {
  __v16qu __a, __b;
  __v16qu __vabsdiff;
  __v4si __vsum;
  const __v4su __zero = {0, 0, 0, 0};
  __v4si __result;

  __a = (__v16qu)__A;
  __b = (__v16qu)__B;
#ifndef _ARCH_PWR9
  __v16qu __vmin = vec_min(__a, __b);
  __v16qu __vmax = vec_max(__a, __b);
  __vabsdiff = vec_sub(__vmax, __vmin);
#else
  __vabsdiff = vec_absd(__a, __b);
#endif
  /* Sum four groups of bytes into integers.  */
  __vsum = (__vector signed int)vec_sum4s(__vabsdiff, __zero);
#ifdef __LITTLE_ENDIAN__
  /* Sum across four integers with two integer results.  */
  __asm__("vsum2sws %0,%1,%2" : "=v"(__result) : "v"(__vsum), "v"(__zero));
  /* Note: vec_sum2s could be used here, but on little-endian, vector
     shifts are added that are not needed for this use-case.
     A vector shift to correctly position the 32-bit integer results
     (currently at [0] and [2]) to [1] and [3] would then need to be
     swapped back again since the desired results are two 64-bit
     integers ([1]|[0] and [3]|[2]).  Thus, no shift is performed.  */
#else
  /* Sum across four integers with two integer results.  */
  __result = vec_sum2s(__vsum, (__vector signed int)__zero);
  /* Rotate the sums into the correct position.  */
  __result = vec_sld(__result, __result, 6);
#endif
  return (__m128i)__result;
}

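/* The streaming stores below are ordinary stores preceded by a dcbtstt
   cache hint ("data cache block touch for store, transient"); the hint
   only suggests the line will not be reused, it does not make the store
   non-temporal in the x86 sense.  */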
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_si32(int *__A, int __B) {
  /* Use the data cache block touch for store transient.  */
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  *__A = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_si64(long long int *__A, long long int __B) {
  /* Use the data cache block touch for store transient.  */
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  *__A = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_si128(__m128i *__A, __m128i __B) {
  /* Use the data cache block touch for store transient.  */
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  *__A = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_pd(double *__A, __m128d __B) {
  /* Use the data cache block touch for store transient.  */
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  *(__m128d *)__A = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_clflush(void const *__A) {
  /* Use the data cache block flush.  */
  __asm__("dcbf 0,%0" : : "b"(__A) : "memory");
}

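/* Both fences are expressed as C11 atomic fences, which on POWER are
   expected to lower to lwsync (release) and hwsync (sequentially
   consistent) respectively.  */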
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_lfence(void) {
  /* Use light weight sync for load to load ordering.  */
  __atomic_thread_fence(__ATOMIC_RELEASE);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mfence(void) {
  /* Use heavy weight sync for any to any ordering.  */
  __atomic_thread_fence(__ATOMIC_SEQ_CST);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi32_si128(int __A) {
  return _mm_set_epi32(0, 0, 0, __A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_si128(long long __A) {
  return __extension__(__m128i)(__v2di){__A, 0LL};
}

/* Microsoft intrinsic.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64x_si128(long long __A) {
  return __extension__(__m128i)(__v2di){__A, 0LL};
}

/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castpd_ps(__m128d __A) {
  return (__m128)__A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castpd_si128(__m128d __A) {
  return (__m128i)__A;
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castps_pd(__m128 __A) {
  return (__m128d)__A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castps_si128(__m128 __A) {
  return (__m128i)__A;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castsi128_ps(__m128i __A) {
  return (__m128)__A;
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castsi128_pd(__m128i __A) {
  return (__m128d)__A;
}

#else
#include_next <emmintrin.h>
#endif /* defined(__ppc64__) &&
        * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* EMMINTRIN_H_ */