/*===---- emmintrin.h - Implementation of SSE2 intrinsics on PowerPC -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */
#ifndef NO_WARN_X86_INTRINSICS
/* This header file is to help porting code using Intel intrinsics
   explicitly from x86_64 to powerpc64/powerpc64le.

   Since the X86 SSE2 intrinsics mainly handle the __m128i and __m128d
   types, the PowerPC VMX/VSX ISA is a good match for these vector SIMD
   operations. However, scalar float operations in vector (XMM) registers
   require the POWER8 VSX ISA (2.07) level. There are differences in the
   data format and placement of float scalars in the vector register, which
   require extra steps to match SSE2 scalar float semantics on POWER.

   Note that the X86_64 MXCSR and the PowerISA FPSCR/VSCR registers differ
   significantly. It is recommended to use portable <fenv.h> instead of
   accessing the MXCSR directly.

   Most SSE2 scalar float intrinsic operations can be performed more
   efficiently as C language float scalar operations or optimized to
   use vector SIMD operations. We recommend this for new applications.
*/
#error                                                                         \
    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
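
/* An illustrative porting sketch (not part of this header; the function name
   and build flags below are assumptions for the example): existing x86 SSE2
   code can usually be rebuilt for powerpc64le simply by acknowledging the
   note above on the command line, e.g.

     clang -O2 -mcpu=power8 -DNO_WARN_X86_INTRINSICS demo.c

   while the source itself stays unchanged:

     #include <emmintrin.h>

     double sum2(const double *p) {   // p assumed 16-byte aligned
       __m128d v = _mm_load_pd(p);                        // {p[0], p[1]}
       __m128d s = _mm_add_sd(v, _mm_unpackhi_pd(v, v));  // low = p[0] + p[1]
       return _mm_cvtsd_f64(s);
     }
*/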
#ifndef EMMINTRIN_H_
#define EMMINTRIN_H_

#if defined(__powerpc64__) &&                                                  \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>

/* We need definitions from the SSE header files. */
#include <xmmintrin.h>

/* SSE2 */
typedef __vector double __v2df;
typedef __vector float __v4f;
typedef __vector long long __v2di;
typedef __vector unsigned long long __v2du;
typedef __vector int __v4si;
typedef __vector unsigned int __v4su;
typedef __vector short __v8hi;
typedef __vector unsigned short __v8hu;
typedef __vector signed char __v16qi;
typedef __vector unsigned char __v16qu;
59 /* The Intel API is flexible enough that we must allow aliasing with other
60 vector types, and their scalar components. */
61 typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
62 typedef double __m128d __attribute__((__vector_size__(16), __may_alias__));
64 /* Unaligned version of the same types. */
65 typedef long long __m128i_u
66 __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
67 typedef double __m128d_u
68 __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
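
/* A brief aliasing sketch (illustrative only): the __may_alias__ attribute is
   what lets the implementations below reinterpret one vector type as another
   without violating strict-aliasing rules, e.g.

     __m128i x = _mm_set1_epi16(7);
     short lane0 = ((__v8hi)x)[0];   // per-element access through a cast
*/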
70 /* Define two value permute mask. */
71 #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
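
/* For example (illustrative only): _MM_SHUFFLE2(1, 0) == 2, which with
   _mm_shuffle_pd selects element 0 of the first operand and element 1 of
   the second operand:

     __m128d a = _mm_set_pd(11.0, 10.0);   // a = {10.0, 11.0}
     __m128d b = _mm_set_pd(21.0, 20.0);   // b = {20.0, 21.0}
     __m128d r = _mm_shuffle_pd(a, b, _MM_SHUFFLE2(1, 0));
     // r = {a[0], b[1]} = {10.0, 21.0}
*/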
73 /* Create a vector with element 0 as F and the rest zero. */
74 extern __inline __m128d
75 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
76 _mm_set_sd(double __F) {
77 return __extension__(__m128d){__F, 0.0};
80 /* Create a vector with both elements equal to F. */
81 extern __inline __m128d
82 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
83 _mm_set1_pd(double __F) {
84 return __extension__(__m128d){__F, __F};
87 extern __inline __m128d
88 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
89 _mm_set_pd1(double __F) {
90 return _mm_set1_pd(__F);
93 /* Create a vector with the lower value X and upper value W. */
94 extern __inline __m128d
95 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
96 _mm_set_pd(double __W, double __X) {
97 return __extension__(__m128d){__X, __W};
100 /* Create a vector with the lower value W and upper value X. */
101 extern __inline __m128d
102 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
103 _mm_setr_pd(double __W, double __X) {
104 return __extension__(__m128d){__W, __X};
107 /* Create an undefined vector. */
108 extern __inline __m128d
109 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
110 _mm_undefined_pd(void) {
111 __m128d __Y = __Y;
112 return __Y;
115 /* Create a vector of zeros. */
116 extern __inline __m128d
117 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
118 _mm_setzero_pd(void) {
119 return (__m128d)vec_splats(0);
122 /* Sets the low DPFP value of A from the low value of B. */
123 extern __inline __m128d
124 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
125 _mm_move_sd(__m128d __A, __m128d __B) {
126 __v2df __result = (__v2df)__A;
127 __result[0] = ((__v2df)__B)[0];
128 return (__m128d)__result;
131 /* Load two DPFP values from P. The address must be 16-byte aligned. */
132 extern __inline __m128d
133 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
134 _mm_load_pd(double const *__P) {
135 return ((__m128d)vec_ld(0, (__v16qu *)__P));
138 /* Load two DPFP values from P. The address need not be 16-byte aligned. */
139 extern __inline __m128d
140 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
141 _mm_loadu_pd(double const *__P) {
142 return (vec_vsx_ld(0, __P));
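
/* Usage sketch (illustrative only): the aligned form may only be used when
   the pointer is 16-byte aligned; the unaligned form is always safe:

     double buf[3] = {1.0, 2.0, 3.0};
     __m128d lo = _mm_loadu_pd(&buf[1]);   // unaligned load of {2.0, 3.0}
*/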
145 /* Create a vector with all two elements equal to *P. */
146 extern __inline __m128d
147 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
148 _mm_load1_pd(double const *__P) {
149 return (vec_splats(*__P));
152 /* Create a vector with element 0 as *P and the rest zero. */
153 extern __inline __m128d
154 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
155 _mm_load_sd(double const *__P) {
156 return _mm_set_sd(*__P);
159 extern __inline __m128d
160 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
161 _mm_load_pd1(double const *__P) {
162 return _mm_load1_pd(__P);
165 /* Load two DPFP values in reverse order. The address must be aligned. */
166 extern __inline __m128d
167 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
168 _mm_loadr_pd(double const *__P) {
169 __v2df __tmp = _mm_load_pd(__P);
170 return (__m128d)vec_xxpermdi(__tmp, __tmp, 2);
173 /* Store two DPFP values. The address must be 16-byte aligned. */
174 extern __inline void
175 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
176 _mm_store_pd(double *__P, __m128d __A) {
177 vec_st((__v16qu)__A, 0, (__v16qu *)__P);
180 /* Store two DPFP values. The address need not be 16-byte aligned. */
181 extern __inline void
182 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
183 _mm_storeu_pd(double *__P, __m128d __A) {
184 *(__m128d_u *)__P = __A;
187 /* Stores the lower DPFP value. */
188 extern __inline void
189 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
190 _mm_store_sd(double *__P, __m128d __A) {
191 *__P = ((__v2df)__A)[0];
194 extern __inline double
195 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
196 _mm_cvtsd_f64(__m128d __A) {
197 return ((__v2df)__A)[0];
200 extern __inline void
201 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
202 _mm_storel_pd(double *__P, __m128d __A) {
203 _mm_store_sd(__P, __A);
206 /* Stores the upper DPFP value. */
207 extern __inline void
208 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
209 _mm_storeh_pd(double *__P, __m128d __A) {
210 *__P = ((__v2df)__A)[1];
212 /* Store the lower DPFP value across two words.
213 The address must be 16-byte aligned. */
214 extern __inline void
215 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
216 _mm_store1_pd(double *__P, __m128d __A) {
217 _mm_store_pd(__P, vec_splat(__A, 0));
220 extern __inline void
221 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
222 _mm_store_pd1(double *__P, __m128d __A) {
223 _mm_store1_pd(__P, __A);
226 /* Store two DPFP values in reverse order. The address must be aligned. */
227 extern __inline void
228 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
229 _mm_storer_pd(double *__P, __m128d __A) {
230 _mm_store_pd(__P, vec_xxpermdi(__A, __A, 2));
233 /* Intel intrinsic. */
234 extern __inline long long
235 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
236 _mm_cvtsi128_si64(__m128i __A) {
237 return ((__v2di)__A)[0];
240 /* Microsoft intrinsic. */
241 extern __inline long long
242 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
243 _mm_cvtsi128_si64x(__m128i __A) {
244 return ((__v2di)__A)[0];
247 extern __inline __m128d
248 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
249 _mm_add_pd(__m128d __A, __m128d __B) {
250 return (__m128d)((__v2df)__A + (__v2df)__B);
253 /* Add the lower double-precision (64-bit) floating-point element in
254 a and b, store the result in the lower element of dst, and copy
255 the upper element from a to the upper element of dst. */
256 extern __inline __m128d
257 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
258 _mm_add_sd(__m128d __A, __m128d __B) {
259 __A[0] = __A[0] + __B[0];
260 return (__A);
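
/* For instance (illustrative only):

     __m128d a = _mm_set_pd(3.0, 1.0);   // a = {1.0, 3.0}
     __m128d b = _mm_set_pd(7.0, 2.0);   // b = {2.0, 7.0}
     __m128d r = _mm_add_sd(a, b);       // r = {1.0 + 2.0, 3.0} = {3.0, 3.0}
*/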
263 extern __inline __m128d
264 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
265 _mm_sub_pd(__m128d __A, __m128d __B) {
266 return (__m128d)((__v2df)__A - (__v2df)__B);
269 extern __inline __m128d
270 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
271 _mm_sub_sd(__m128d __A, __m128d __B) {
272 __A[0] = __A[0] - __B[0];
273 return (__A);
276 extern __inline __m128d
277 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
278 _mm_mul_pd(__m128d __A, __m128d __B) {
279 return (__m128d)((__v2df)__A * (__v2df)__B);
282 extern __inline __m128d
283 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
284 _mm_mul_sd(__m128d __A, __m128d __B) {
285 __A[0] = __A[0] * __B[0];
286 return (__A);
289 extern __inline __m128d
290 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
291 _mm_div_pd(__m128d __A, __m128d __B) {
292 return (__m128d)((__v2df)__A / (__v2df)__B);
295 extern __inline __m128d
296 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
297 _mm_div_sd(__m128d __A, __m128d __B) {
298 __A[0] = __A[0] / __B[0];
299 return (__A);
302 extern __inline __m128d
303 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
304 _mm_sqrt_pd(__m128d __A) {
305 return (vec_sqrt(__A));
308 /* Return pair {sqrt (B[0]), A[1]}. */
309 extern __inline __m128d
310 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
311 _mm_sqrt_sd(__m128d __A, __m128d __B) {
312 __v2df __c;
313 __c = vec_sqrt((__v2df)_mm_set1_pd(__B[0]));
314 return (__m128d)_mm_setr_pd(__c[0], __A[1]);
317 extern __inline __m128d
318 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
319 _mm_min_pd(__m128d __A, __m128d __B) {
320 return (vec_min(__A, __B));
323 extern __inline __m128d
324 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
325 _mm_min_sd(__m128d __A, __m128d __B) {
326 __v2df __a, __b, __c;
327 __a = vec_splats(__A[0]);
328 __b = vec_splats(__B[0]);
329 __c = vec_min(__a, __b);
330 return (__m128d)_mm_setr_pd(__c[0], __A[1]);
333 extern __inline __m128d
334 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
335 _mm_max_pd(__m128d __A, __m128d __B) {
336 return (vec_max(__A, __B));
339 extern __inline __m128d
340 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
341 _mm_max_sd(__m128d __A, __m128d __B) {
342 __v2df __a, __b, __c;
343 __a = vec_splats(__A[0]);
344 __b = vec_splats(__B[0]);
345 __c = vec_max(__a, __b);
346 return (__m128d)_mm_setr_pd(__c[0], __A[1]);
349 extern __inline __m128d
350 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
351 _mm_cmpeq_pd(__m128d __A, __m128d __B) {
352 return ((__m128d)vec_cmpeq((__v2df)__A, (__v2df)__B));
355 extern __inline __m128d
356 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
357 _mm_cmplt_pd(__m128d __A, __m128d __B) {
358 return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
361 extern __inline __m128d
362 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
363 _mm_cmple_pd(__m128d __A, __m128d __B) {
364 return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
367 extern __inline __m128d
368 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
369 _mm_cmpgt_pd(__m128d __A, __m128d __B) {
370 return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
373 extern __inline __m128d
374 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
375 _mm_cmpge_pd(__m128d __A, __m128d __B) {
376 return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
379 extern __inline __m128d
380 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
381 _mm_cmpneq_pd(__m128d __A, __m128d __B) {
382 __v2df __temp = (__v2df)vec_cmpeq((__v2df)__A, (__v2df)__B);
383 return ((__m128d)vec_nor(__temp, __temp));
386 extern __inline __m128d
387 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
388 _mm_cmpnlt_pd(__m128d __A, __m128d __B) {
389 return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
392 extern __inline __m128d
393 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
394 _mm_cmpnle_pd(__m128d __A, __m128d __B) {
395 return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
398 extern __inline __m128d
399 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
400 _mm_cmpngt_pd(__m128d __A, __m128d __B) {
401 return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
404 extern __inline __m128d
405 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
406 _mm_cmpnge_pd(__m128d __A, __m128d __B) {
407 return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
410 extern __inline __m128d
411 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
412 _mm_cmpord_pd(__m128d __A, __m128d __B) {
413 __v2du __c, __d;
414 /* Compare against self will return false (0's) if NAN. */
415 __c = (__v2du)vec_cmpeq(__A, __A);
416 __d = (__v2du)vec_cmpeq(__B, __B);
417 /* A != NAN and B != NAN. */
418 return ((__m128d)vec_and(__c, __d));
421 extern __inline __m128d
422 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
423 _mm_cmpunord_pd(__m128d __A, __m128d __B) {
424 #if _ARCH_PWR8
425 __v2du __c, __d;
426 /* Compare against self will return false (0's) if NAN. */
427 __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
428 __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
  /* A == NAN OR B == NAN converts to:
     NOT(A != NAN) OR NOT(B != NAN). */
431 __c = vec_nor(__c, __c);
432 return ((__m128d)vec_orc(__c, __d));
433 #else
434 __v2du __c, __d;
435 /* Compare against self will return false (0's) if NAN. */
436 __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
437 __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
  /* Invert so that true ('1's) marks a NAN input. */
439 __c = vec_nor(__c, __c);
440 __d = vec_nor(__d, __d);
441 return ((__m128d)vec_or(__c, __d));
442 #endif
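
/* A worked case (illustrative only):

     __m128d a = _mm_set_pd(1.0, __builtin_nan(""));   // a = {NaN, 1.0}
     __m128d r = _mm_cmpunord_pd(a, _mm_set1_pd(2.0));
     // element 0 of r is all-ones (a[0] is unordered with 2.0);
     // element 1 is all-zeros (1.0 and 2.0 are ordered).
*/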
445 extern __inline __m128d
446 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
447 _mm_cmpeq_sd(__m128d __A, __m128d __B) {
448 __v2df __a, __b, __c;
  /* PowerISA VSX does not allow partial (for just the lower double)
     results. So to ensure we don't generate spurious exceptions
     (from the upper double values) we splat the lower double
     before we do the operation. */
453 __a = vec_splats(__A[0]);
454 __b = vec_splats(__B[0]);
455 __c = (__v2df)vec_cmpeq(__a, __b);
456 /* Then we merge the lower double result with the original upper
457 double from __A. */
458 return (__m128d)_mm_setr_pd(__c[0], __A[1]);
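
/* The splats matter when an upper element holds a value that would signal.
   An illustrative sketch:

     __m128d a = _mm_set_pd(__builtin_nans(""), 4.0);   // upper lane is sNaN
     __m128d b = _mm_set_pd(8.0, 4.0);
     __m128d r = _mm_cmpeq_sd(a, b);
     // r = {all-ones, sNaN}: the upper lanes of a and b never enter the
     // compare, so no spurious invalid-operation exception is raised, and
     // the upper element of the result is copied from a.
*/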
461 extern __inline __m128d
462 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
463 _mm_cmplt_sd(__m128d __A, __m128d __B) {
464 __v2df __a, __b, __c;
465 __a = vec_splats(__A[0]);
466 __b = vec_splats(__B[0]);
467 __c = (__v2df)vec_cmplt(__a, __b);
468 return (__m128d)_mm_setr_pd(__c[0], __A[1]);
471 extern __inline __m128d
472 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
473 _mm_cmple_sd(__m128d __A, __m128d __B) {
474 __v2df __a, __b, __c;
475 __a = vec_splats(__A[0]);
476 __b = vec_splats(__B[0]);
477 __c = (__v2df)vec_cmple(__a, __b);
478 return (__m128d)_mm_setr_pd(__c[0], __A[1]);
481 extern __inline __m128d
482 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
483 _mm_cmpgt_sd(__m128d __A, __m128d __B) {
484 __v2df __a, __b, __c;
485 __a = vec_splats(__A[0]);
486 __b = vec_splats(__B[0]);
487 __c = (__v2df)vec_cmpgt(__a, __b);
488 return (__m128d)_mm_setr_pd(__c[0], __A[1]);
491 extern __inline __m128d
492 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
493 _mm_cmpge_sd(__m128d __A, __m128d __B) {
494 __v2df __a, __b, __c;
495 __a = vec_splats(__A[0]);
496 __b = vec_splats(__B[0]);
497 __c = (__v2df)vec_cmpge(__a, __b);
498 return (__m128d)_mm_setr_pd(__c[0], __A[1]);
501 extern __inline __m128d
502 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
503 _mm_cmpneq_sd(__m128d __A, __m128d __B) {
504 __v2df __a, __b, __c;
505 __a = vec_splats(__A[0]);
506 __b = vec_splats(__B[0]);
507 __c = (__v2df)vec_cmpeq(__a, __b);
508 __c = vec_nor(__c, __c);
509 return (__m128d)_mm_setr_pd(__c[0], __A[1]);
512 extern __inline __m128d
513 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
514 _mm_cmpnlt_sd(__m128d __A, __m128d __B) {
515 __v2df __a, __b, __c;
516 __a = vec_splats(__A[0]);
517 __b = vec_splats(__B[0]);
518 /* Not less than is just greater than or equal. */
519 __c = (__v2df)vec_cmpge(__a, __b);
520 return (__m128d)_mm_setr_pd(__c[0], __A[1]);
523 extern __inline __m128d
524 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
525 _mm_cmpnle_sd(__m128d __A, __m128d __B) {
526 __v2df __a, __b, __c;
527 __a = vec_splats(__A[0]);
528 __b = vec_splats(__B[0]);
  /* Not less than or equal is just greater than. */
  __c = (__v2df)vec_cmpgt(__a, __b);
531 return (__m128d)_mm_setr_pd(__c[0], __A[1]);
534 extern __inline __m128d
535 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
536 _mm_cmpngt_sd(__m128d __A, __m128d __B) {
537 __v2df __a, __b, __c;
538 __a = vec_splats(__A[0]);
539 __b = vec_splats(__B[0]);
540 /* Not greater than is just less than or equal. */
541 __c = (__v2df)vec_cmple(__a, __b);
542 return (__m128d)_mm_setr_pd(__c[0], __A[1]);
545 extern __inline __m128d
546 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
547 _mm_cmpnge_sd(__m128d __A, __m128d __B) {
548 __v2df __a, __b, __c;
549 __a = vec_splats(__A[0]);
550 __b = vec_splats(__B[0]);
551 /* Not greater than or equal is just less than. */
552 __c = (__v2df)vec_cmplt(__a, __b);
553 return (__m128d)_mm_setr_pd(__c[0], __A[1]);
556 extern __inline __m128d
557 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
558 _mm_cmpord_sd(__m128d __A, __m128d __B) {
559 __v2df __r;
560 __r = (__v2df)_mm_cmpord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
561 return (__m128d)_mm_setr_pd(__r[0], ((__v2df)__A)[1]);
564 extern __inline __m128d
565 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
566 _mm_cmpunord_sd(__m128d __A, __m128d __B) {
567 __v2df __r;
568 __r = _mm_cmpunord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
569 return (__m128d)_mm_setr_pd(__r[0], __A[1]);
/* FIXME
   The _mm_comi??_sd and _mm_ucomi??_sd implementations below are
   exactly the same because GCC for PowerPC only generates unordered
   compares (scalar and vector).
   Technically _mm_comieq_sd et al. should be using the ordered
   compare and signal for QNaNs. The _mm_ucomieq_sd et al. should
   be OK. */
579 extern __inline int
580 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
581 _mm_comieq_sd(__m128d __A, __m128d __B) {
582 return (__A[0] == __B[0]);
585 extern __inline int
586 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
587 _mm_comilt_sd(__m128d __A, __m128d __B) {
588 return (__A[0] < __B[0]);
591 extern __inline int
592 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
593 _mm_comile_sd(__m128d __A, __m128d __B) {
594 return (__A[0] <= __B[0]);
597 extern __inline int
598 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
599 _mm_comigt_sd(__m128d __A, __m128d __B) {
600 return (__A[0] > __B[0]);
603 extern __inline int
604 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
605 _mm_comige_sd(__m128d __A, __m128d __B) {
606 return (__A[0] >= __B[0]);
609 extern __inline int
610 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
611 _mm_comineq_sd(__m128d __A, __m128d __B) {
612 return (__A[0] != __B[0]);
615 extern __inline int
616 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
617 _mm_ucomieq_sd(__m128d __A, __m128d __B) {
618 return (__A[0] == __B[0]);
621 extern __inline int
622 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
623 _mm_ucomilt_sd(__m128d __A, __m128d __B) {
624 return (__A[0] < __B[0]);
627 extern __inline int
628 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
629 _mm_ucomile_sd(__m128d __A, __m128d __B) {
630 return (__A[0] <= __B[0]);
633 extern __inline int
634 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
635 _mm_ucomigt_sd(__m128d __A, __m128d __B) {
636 return (__A[0] > __B[0]);
639 extern __inline int
640 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
641 _mm_ucomige_sd(__m128d __A, __m128d __B) {
642 return (__A[0] >= __B[0]);
645 extern __inline int
646 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
647 _mm_ucomineq_sd(__m128d __A, __m128d __B) {
648 return (__A[0] != __B[0]);
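
/* Illustrative note: with the C scalar comparisons above, a NaN operand makes
   every relation compare false except "not equal":

     __m128d n = _mm_set_sd(__builtin_nan(""));
     int lt = _mm_comilt_sd(n, _mm_set_sd(1.0));    // 0: unordered
     int ne = _mm_comineq_sd(n, _mm_set_sd(1.0));   // 1: unordered implies !=
*/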
651 /* Create a vector of Qi, where i is the element number. */
652 extern __inline __m128i
653 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
654 _mm_set_epi64x(long long __q1, long long __q0) {
655 return __extension__(__m128i)(__v2di){__q0, __q1};
658 extern __inline __m128i
659 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
660 _mm_set_epi64(__m64 __q1, __m64 __q0) {
661 return _mm_set_epi64x((long long)__q1, (long long)__q0);
664 extern __inline __m128i
665 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
666 _mm_set_epi32(int __q3, int __q2, int __q1, int __q0) {
667 return __extension__(__m128i)(__v4si){__q0, __q1, __q2, __q3};
670 extern __inline __m128i
671 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
672 _mm_set_epi16(short __q7, short __q6, short __q5, short __q4, short __q3,
673 short __q2, short __q1, short __q0) {
674 return __extension__(__m128i)(__v8hi){__q0, __q1, __q2, __q3,
675 __q4, __q5, __q6, __q7};
678 extern __inline __m128i
679 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
680 _mm_set_epi8(char __q15, char __q14, char __q13, char __q12, char __q11,
681 char __q10, char __q09, char __q08, char __q07, char __q06,
682 char __q05, char __q04, char __q03, char __q02, char __q01,
683 char __q00) {
684 return __extension__(__m128i)(__v16qi){
685 __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
686 __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15};
689 /* Set all of the elements of the vector to A. */
690 extern __inline __m128i
691 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
692 _mm_set1_epi64x(long long __A) {
693 return _mm_set_epi64x(__A, __A);
696 extern __inline __m128i
697 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
698 _mm_set1_epi64(__m64 __A) {
699 return _mm_set_epi64(__A, __A);
702 extern __inline __m128i
703 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
704 _mm_set1_epi32(int __A) {
705 return _mm_set_epi32(__A, __A, __A, __A);
708 extern __inline __m128i
709 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
710 _mm_set1_epi16(short __A) {
711 return _mm_set_epi16(__A, __A, __A, __A, __A, __A, __A, __A);
714 extern __inline __m128i
715 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
716 _mm_set1_epi8(char __A) {
717 return _mm_set_epi8(__A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A,
718 __A, __A, __A, __A, __A);
721 /* Create a vector of Qi, where i is the element number.
722 The parameter order is reversed from the _mm_set_epi* functions. */
723 extern __inline __m128i
724 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
725 _mm_setr_epi64(__m64 __q0, __m64 __q1) {
726 return _mm_set_epi64(__q1, __q0);
729 extern __inline __m128i
730 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
731 _mm_setr_epi32(int __q0, int __q1, int __q2, int __q3) {
732 return _mm_set_epi32(__q3, __q2, __q1, __q0);
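
/* Illustrative sketch: the setr forms simply reverse the argument order of
   the corresponding set forms, so both of these produce elements {1, 2, 3, 4}
   in element order 0..3:

     __m128i a = _mm_setr_epi32(1, 2, 3, 4);
     __m128i b = _mm_set_epi32(4, 3, 2, 1);
*/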
735 extern __inline __m128i
736 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
737 _mm_setr_epi16(short __q0, short __q1, short __q2, short __q3, short __q4,
738 short __q5, short __q6, short __q7) {
739 return _mm_set_epi16(__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
742 extern __inline __m128i
743 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
744 _mm_setr_epi8(char __q00, char __q01, char __q02, char __q03, char __q04,
745 char __q05, char __q06, char __q07, char __q08, char __q09,
746 char __q10, char __q11, char __q12, char __q13, char __q14,
747 char __q15) {
748 return _mm_set_epi8(__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
749 __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
/* Load 128 bits of integer data. The address must be 16-byte aligned. */
753 extern __inline __m128i
754 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
755 _mm_load_si128(__m128i const *__P) {
756 return *__P;
759 extern __inline __m128i
760 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
761 _mm_loadu_si128(__m128i_u const *__P) {
762 return (__m128i)(vec_vsx_ld(0, (signed int const *)__P));
765 extern __inline __m128i
766 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
767 _mm_loadl_epi64(__m128i_u const *__P) {
768 return _mm_set_epi64((__m64)0LL, *(__m64 *)__P);
771 extern __inline void
772 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
773 _mm_store_si128(__m128i *__P, __m128i __B) {
774 vec_st((__v16qu)__B, 0, (__v16qu *)__P);
777 extern __inline void
778 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
779 _mm_storeu_si128(__m128i_u *__P, __m128i __B) {
780 *__P = __B;
783 extern __inline void
784 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
785 _mm_storel_epi64(__m128i_u *__P, __m128i __B) {
786 *(long long *)__P = ((__v2di)__B)[0];
789 extern __inline __m64
790 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
791 _mm_movepi64_pi64(__m128i_u __B) {
792 return (__m64)((__v2di)__B)[0];
795 extern __inline __m128i
796 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
797 _mm_movpi64_epi64(__m64 __A) {
798 return _mm_set_epi64((__m64)0LL, __A);
801 extern __inline __m128i
802 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
803 _mm_move_epi64(__m128i __A) {
804 return _mm_set_epi64((__m64)0LL, (__m64)__A[0]);
807 /* Create an undefined vector. */
808 extern __inline __m128i
809 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
810 _mm_undefined_si128(void) {
811 __m128i __Y = __Y;
812 return __Y;
815 /* Create a vector of zeros. */
816 extern __inline __m128i
817 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
818 _mm_setzero_si128(void) {
819 return __extension__(__m128i)(__v4si){0, 0, 0, 0};
822 #ifdef _ARCH_PWR8
823 extern __inline __m128d
824 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
825 _mm_cvtepi32_pd(__m128i __A) {
826 __v2di __val;
  /* For LE, the Vector Unpack Low Signed Word instruction is what we need;
     it is generated from vec_unpackh here. */
829 __val = (__v2di)vec_unpackh((__v4si)__A);
831 return (__m128d)vec_ctf(__val, 0);
833 #endif
835 extern __inline __m128
836 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
837 _mm_cvtepi32_ps(__m128i __A) {
838 return ((__m128)vec_ctf((__v4si)__A, 0));
841 extern __inline __m128i
842 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
843 _mm_cvtpd_epi32(__m128d __A) {
844 __v2df __rounded = vec_rint(__A);
845 __v4si __result, __temp;
846 const __v4si __vzero = {0, 0, 0, 0};
848 /* VSX Vector truncate Double-Precision to integer and Convert to
849 Signed Integer Word format with Saturate. */
850 __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__rounded) :);
852 #ifdef _ARCH_PWR8
853 #ifdef __LITTLE_ENDIAN__
854 __temp = vec_mergeo(__temp, __temp);
855 #else
856 __temp = vec_mergee(__temp, __temp);
857 #endif
858 __result = (__v4si)vec_vpkudum((__vector long long)__temp,
859 (__vector long long)__vzero);
860 #else
862 const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
863 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
864 __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
866 #endif
867 return (__m128i)__result;
870 extern __inline __m64
871 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
872 _mm_cvtpd_pi32(__m128d __A) {
873 __m128i __result = _mm_cvtpd_epi32(__A);
875 return (__m64)__result[0];
878 extern __inline __m128
879 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
880 _mm_cvtpd_ps(__m128d __A) {
881 __v4sf __result;
882 __v4si __temp;
883 const __v4si __vzero = {0, 0, 0, 0};
885 __asm__("xvcvdpsp %x0,%x1" : "=wa"(__temp) : "wa"(__A) :);
887 #ifdef _ARCH_PWR8
888 #ifdef __LITTLE_ENDIAN__
889 __temp = vec_mergeo(__temp, __temp);
890 #else
891 __temp = vec_mergee(__temp, __temp);
892 #endif
893 __result = (__v4sf)vec_vpkudum((__vector long long)__temp,
894 (__vector long long)__vzero);
895 #else
897 const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
898 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
899 __result = (__v4sf)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
901 #endif
902 return ((__m128)__result);
905 extern __inline __m128i
906 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
907 _mm_cvttpd_epi32(__m128d __A) {
908 __v4si __result;
909 __v4si __temp;
910 const __v4si __vzero = {0, 0, 0, 0};
912 /* VSX Vector truncate Double-Precision to integer and Convert to
913 Signed Integer Word format with Saturate. */
914 __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__A) :);
916 #ifdef _ARCH_PWR8
917 #ifdef __LITTLE_ENDIAN__
918 __temp = vec_mergeo(__temp, __temp);
919 #else
920 __temp = vec_mergee(__temp, __temp);
921 #endif
922 __result = (__v4si)vec_vpkudum((__vector long long)__temp,
923 (__vector long long)__vzero);
924 #else
926 const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
927 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
928 __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
930 #endif
932 return ((__m128i)__result);
935 extern __inline __m64
936 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
937 _mm_cvttpd_pi32(__m128d __A) {
938 __m128i __result = _mm_cvttpd_epi32(__A);
940 return (__m64)__result[0];
943 extern __inline int
944 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
945 _mm_cvtsi128_si32(__m128i __A) {
946 return ((__v4si)__A)[0];
949 #ifdef _ARCH_PWR8
950 extern __inline __m128d
951 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
952 _mm_cvtpi32_pd(__m64 __A) {
953 __v4si __temp;
954 __v2di __tmp2;
955 __v4f __result;
957 __temp = (__v4si)vec_splats(__A);
958 __tmp2 = (__v2di)vec_unpackl(__temp);
959 __result = vec_ctf((__vector signed long long)__tmp2, 0);
960 return (__m128d)__result;
962 #endif
964 extern __inline __m128i
965 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
966 _mm_cvtps_epi32(__m128 __A) {
967 __v4sf __rounded;
968 __v4si __result;
970 __rounded = vec_rint((__v4sf)__A);
971 __result = vec_cts(__rounded, 0);
972 return (__m128i)__result;
975 extern __inline __m128i
976 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
977 _mm_cvttps_epi32(__m128 __A) {
978 __v4si __result;
980 __result = vec_cts((__v4sf)__A, 0);
981 return (__m128i)__result;
984 extern __inline __m128d
985 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
986 _mm_cvtps_pd(__m128 __A) {
987 /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */
988 #ifdef vec_doubleh
989 return (__m128d)vec_doubleh((__v4sf)__A);
990 #else
  /* Otherwise the compiler is not current, so we need to generate the
     equivalent code. */
993 __v4sf __a = (__v4sf)__A;
994 __v4sf __temp;
995 __v2df __result;
996 #ifdef __LITTLE_ENDIAN__
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[1], [3]}, so we use two
     shift left double vector word immediates to get the elements
     lined up. */
1001 __temp = __builtin_vsx_xxsldwi(__a, __a, 3);
1002 __temp = __builtin_vsx_xxsldwi(__a, __temp, 2);
1003 #else
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[0], [2]}, so we use a
     merge-high to get the elements lined up. */
1008 __temp = vec_vmrghw(__a, __a);
1009 #endif
1010 __asm__(" xvcvspdp %x0,%x1" : "=wa"(__result) : "wa"(__temp) :);
1011 return (__m128d)__result;
1012 #endif
1015 extern __inline int
1016 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1017 _mm_cvtsd_si32(__m128d __A) {
1018 __v2df __rounded = vec_rint((__v2df)__A);
1019 int __result = ((__v2df)__rounded)[0];
1021 return __result;
1023 /* Intel intrinsic. */
1024 extern __inline long long
1025 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1026 _mm_cvtsd_si64(__m128d __A) {
1027 __v2df __rounded = vec_rint((__v2df)__A);
1028 long long __result = ((__v2df)__rounded)[0];
1030 return __result;
1033 /* Microsoft intrinsic. */
1034 extern __inline long long
1035 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1036 _mm_cvtsd_si64x(__m128d __A) {
1037 return _mm_cvtsd_si64((__v2df)__A);
1040 extern __inline int
1041 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1042 _mm_cvttsd_si32(__m128d __A) {
1043 int __result = ((__v2df)__A)[0];
1045 return __result;
1048 /* Intel intrinsic. */
1049 extern __inline long long
1050 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1051 _mm_cvttsd_si64(__m128d __A) {
1052 long long __result = ((__v2df)__A)[0];
1054 return __result;
1057 /* Microsoft intrinsic. */
1058 extern __inline long long
1059 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1060 _mm_cvttsd_si64x(__m128d __A) {
1061 return _mm_cvttsd_si64(__A);
1064 extern __inline __m128
1065 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1066 _mm_cvtsd_ss(__m128 __A, __m128d __B) {
1067 __v4sf __result = (__v4sf)__A;
1069 #ifdef __LITTLE_ENDIAN__
1070 __v4sf __temp_s;
1071 /* Copy double element[0] to element [1] for conversion. */
1072 __v2df __temp_b = vec_splat((__v2df)__B, 0);
1074 /* Pre-rotate __A left 3 (logically right 1) elements. */
1075 __result = __builtin_vsx_xxsldwi(__result, __result, 3);
1076 /* Convert double to single float scalar in a vector. */
1077 __asm__("xscvdpsp %x0,%x1" : "=wa"(__temp_s) : "wa"(__temp_b) :);
1078 /* Shift the resulting scalar into vector element [0]. */
1079 __result = __builtin_vsx_xxsldwi(__result, __temp_s, 1);
1080 #else
1081 __result[0] = ((__v2df)__B)[0];
1082 #endif
1083 return (__m128)__result;
1086 extern __inline __m128d
1087 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1088 _mm_cvtsi32_sd(__m128d __A, int __B) {
1089 __v2df __result = (__v2df)__A;
1090 double __db = __B;
1091 __result[0] = __db;
1092 return (__m128d)__result;
1095 /* Intel intrinsic. */
1096 extern __inline __m128d
1097 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1098 _mm_cvtsi64_sd(__m128d __A, long long __B) {
1099 __v2df __result = (__v2df)__A;
1100 double __db = __B;
1101 __result[0] = __db;
1102 return (__m128d)__result;
1105 /* Microsoft intrinsic. */
1106 extern __inline __m128d
1107 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1108 _mm_cvtsi64x_sd(__m128d __A, long long __B) {
1109 return _mm_cvtsi64_sd(__A, __B);
1112 extern __inline __m128d
1113 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1114 _mm_cvtss_sd(__m128d __A, __m128 __B) {
1115 #ifdef __LITTLE_ENDIAN__
1116 /* Use splat to move element [0] into position for the convert. */
1117 __v4sf __temp = vec_splat((__v4sf)__B, 0);
1118 __v2df __res;
1119 /* Convert single float scalar to double in a vector. */
1120 __asm__("xscvspdp %x0,%x1" : "=wa"(__res) : "wa"(__temp) :);
1121 return (__m128d)vec_mergel(__res, (__v2df)__A);
1122 #else
1123 __v2df __res = (__v2df)__A;
1124 __res[0] = ((__v4sf)__B)[0];
1125 return (__m128d)__res;
1126 #endif
1129 extern __inline __m128d
1130 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1131 _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) {
1132 __vector double __result;
1133 const int __litmsk = __mask & 0x3;
1135 if (__litmsk == 0)
1136 __result = vec_mergeh(__A, __B);
1137 #if __GNUC__ < 6
1138 else if (__litmsk == 1)
1139 __result = vec_xxpermdi(__B, __A, 2);
1140 else if (__litmsk == 2)
1141 __result = vec_xxpermdi(__B, __A, 1);
1142 #else
1143 else if (__litmsk == 1)
1144 __result = vec_xxpermdi(__A, __B, 2);
1145 else if (__litmsk == 2)
1146 __result = vec_xxpermdi(__A, __B, 1);
1147 #endif
1148 else
1149 __result = vec_mergel(__A, __B);
1151 return __result;
1154 extern __inline __m128d
1155 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1156 _mm_unpackhi_pd(__m128d __A, __m128d __B) {
1157 return (__m128d)vec_mergel((__v2df)__A, (__v2df)__B);
1160 extern __inline __m128d
1161 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1162 _mm_unpacklo_pd(__m128d __A, __m128d __B) {
1163 return (__m128d)vec_mergeh((__v2df)__A, (__v2df)__B);
1166 extern __inline __m128d
1167 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1168 _mm_loadh_pd(__m128d __A, double const *__B) {
1169 __v2df __result = (__v2df)__A;
1170 __result[1] = *__B;
1171 return (__m128d)__result;
1174 extern __inline __m128d
1175 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1176 _mm_loadl_pd(__m128d __A, double const *__B) {
1177 __v2df __result = (__v2df)__A;
1178 __result[0] = *__B;
1179 return (__m128d)__result;
1182 #ifdef _ARCH_PWR8
1183 /* Intrinsic functions that require PowerISA 2.07 minimum. */
1185 /* Creates a 2-bit mask from the most significant bits of the DPFP values. */
1186 extern __inline int
1187 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1188 _mm_movemask_pd(__m128d __A) {
1189 #ifdef _ARCH_PWR10
1190 return vec_extractm((__v2du)__A);
1191 #else
1192 __vector unsigned long long __result;
1193 static const __vector unsigned int __perm_mask = {
1194 #ifdef __LITTLE_ENDIAN__
1195 0x80800040, 0x80808080, 0x80808080, 0x80808080
1196 #else
1197 0x80808080, 0x80808080, 0x80808080, 0x80804000
1198 #endif
1201 __result = ((__vector unsigned long long)vec_vbpermq(
1202 (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));
1204 #ifdef __LITTLE_ENDIAN__
1205 return __result[1];
1206 #else
1207 return __result[0];
1208 #endif
1209 #endif /* !_ARCH_PWR10 */
1211 #endif /* _ARCH_PWR8 */
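
/* Illustrative sketch: the sign bits of elements 0 and 1 become bits 0 and 1
   of the result:

     __m128d v = _mm_set_pd(-2.0, 1.0);   // v = {1.0, -2.0}
     int m = _mm_movemask_pd(v);          // m == 2: only element 1 is negative
*/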
1213 extern __inline __m128i
1214 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1215 _mm_packs_epi16(__m128i __A, __m128i __B) {
1216 return (__m128i)vec_packs((__v8hi)__A, (__v8hi)__B);
1219 extern __inline __m128i
1220 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1221 _mm_packs_epi32(__m128i __A, __m128i __B) {
1222 return (__m128i)vec_packs((__v4si)__A, (__v4si)__B);
1225 extern __inline __m128i
1226 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1227 _mm_packus_epi16(__m128i __A, __m128i __B) {
1228 return (__m128i)vec_packsu((__v8hi)__A, (__v8hi)__B);
1231 extern __inline __m128i
1232 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1233 _mm_unpackhi_epi8(__m128i __A, __m128i __B) {
1234 return (__m128i)vec_mergel((__v16qu)__A, (__v16qu)__B);
1237 extern __inline __m128i
1238 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1239 _mm_unpackhi_epi16(__m128i __A, __m128i __B) {
1240 return (__m128i)vec_mergel((__v8hu)__A, (__v8hu)__B);
1243 extern __inline __m128i
1244 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1245 _mm_unpackhi_epi32(__m128i __A, __m128i __B) {
1246 return (__m128i)vec_mergel((__v4su)__A, (__v4su)__B);
1249 extern __inline __m128i
1250 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1251 _mm_unpackhi_epi64(__m128i __A, __m128i __B) {
1252 return (__m128i)vec_mergel((__vector long long)__A, (__vector long long)__B);
1255 extern __inline __m128i
1256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1257 _mm_unpacklo_epi8(__m128i __A, __m128i __B) {
1258 return (__m128i)vec_mergeh((__v16qu)__A, (__v16qu)__B);
1261 extern __inline __m128i
1262 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1263 _mm_unpacklo_epi16(__m128i __A, __m128i __B) {
1264 return (__m128i)vec_mergeh((__v8hi)__A, (__v8hi)__B);
1267 extern __inline __m128i
1268 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1269 _mm_unpacklo_epi32(__m128i __A, __m128i __B) {
1270 return (__m128i)vec_mergeh((__v4si)__A, (__v4si)__B);
1273 extern __inline __m128i
1274 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1275 _mm_unpacklo_epi64(__m128i __A, __m128i __B) {
1276 return (__m128i)vec_mergeh((__vector long long)__A, (__vector long long)__B);
1279 extern __inline __m128i
1280 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1281 _mm_add_epi8(__m128i __A, __m128i __B) {
1282 return (__m128i)((__v16qu)__A + (__v16qu)__B);
1285 extern __inline __m128i
1286 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1287 _mm_add_epi16(__m128i __A, __m128i __B) {
1288 return (__m128i)((__v8hu)__A + (__v8hu)__B);
1291 extern __inline __m128i
1292 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1293 _mm_add_epi32(__m128i __A, __m128i __B) {
1294 return (__m128i)((__v4su)__A + (__v4su)__B);
1297 extern __inline __m128i
1298 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1299 _mm_add_epi64(__m128i __A, __m128i __B) {
1300 return (__m128i)((__v2du)__A + (__v2du)__B);
1303 extern __inline __m128i
1304 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1305 _mm_adds_epi8(__m128i __A, __m128i __B) {
1306 return (__m128i)vec_adds((__v16qi)__A, (__v16qi)__B);
1309 extern __inline __m128i
1310 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1311 _mm_adds_epi16(__m128i __A, __m128i __B) {
1312 return (__m128i)vec_adds((__v8hi)__A, (__v8hi)__B);
1315 extern __inline __m128i
1316 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1317 _mm_adds_epu8(__m128i __A, __m128i __B) {
1318 return (__m128i)vec_adds((__v16qu)__A, (__v16qu)__B);
1321 extern __inline __m128i
1322 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1323 _mm_adds_epu16(__m128i __A, __m128i __B) {
1324 return (__m128i)vec_adds((__v8hu)__A, (__v8hu)__B);
1327 extern __inline __m128i
1328 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1329 _mm_sub_epi8(__m128i __A, __m128i __B) {
1330 return (__m128i)((__v16qu)__A - (__v16qu)__B);
1333 extern __inline __m128i
1334 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1335 _mm_sub_epi16(__m128i __A, __m128i __B) {
1336 return (__m128i)((__v8hu)__A - (__v8hu)__B);
1339 extern __inline __m128i
1340 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1341 _mm_sub_epi32(__m128i __A, __m128i __B) {
1342 return (__m128i)((__v4su)__A - (__v4su)__B);
1345 extern __inline __m128i
1346 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1347 _mm_sub_epi64(__m128i __A, __m128i __B) {
1348 return (__m128i)((__v2du)__A - (__v2du)__B);
1351 extern __inline __m128i
1352 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1353 _mm_subs_epi8(__m128i __A, __m128i __B) {
1354 return (__m128i)vec_subs((__v16qi)__A, (__v16qi)__B);
1357 extern __inline __m128i
1358 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1359 _mm_subs_epi16(__m128i __A, __m128i __B) {
1360 return (__m128i)vec_subs((__v8hi)__A, (__v8hi)__B);
1363 extern __inline __m128i
1364 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1365 _mm_subs_epu8(__m128i __A, __m128i __B) {
1366 return (__m128i)vec_subs((__v16qu)__A, (__v16qu)__B);
1369 extern __inline __m128i
1370 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1371 _mm_subs_epu16(__m128i __A, __m128i __B) {
1372 return (__m128i)vec_subs((__v8hu)__A, (__v8hu)__B);
1375 extern __inline __m128i
1376 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1377 _mm_madd_epi16(__m128i __A, __m128i __B) {
1378 __vector signed int __zero = {0, 0, 0, 0};
1380 return (__m128i)vec_vmsumshm((__v8hi)__A, (__v8hi)__B, __zero);
1383 extern __inline __m128i
1384 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1385 _mm_mulhi_epi16(__m128i __A, __m128i __B) {
1386 __vector signed int __w0, __w1;
1388 __vector unsigned char __xform1 = {
1389 #ifdef __LITTLE_ENDIAN__
1390 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
1391 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1392 #else
1393 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
1394 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
1395 #endif
1398 __w0 = vec_vmulesh((__v8hi)__A, (__v8hi)__B);
1399 __w1 = vec_vmulosh((__v8hi)__A, (__v8hi)__B);
1400 return (__m128i)vec_perm(__w0, __w1, __xform1);
1403 extern __inline __m128i
1404 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1405 _mm_mullo_epi16(__m128i __A, __m128i __B) {
1406 return (__m128i)((__v8hi)__A * (__v8hi)__B);
1409 extern __inline __m64
1410 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1411 _mm_mul_su32(__m64 __A, __m64 __B) {
1412 unsigned int __a = __A;
1413 unsigned int __b = __B;
1415 return ((__m64)__a * (__m64)__b);
1418 #ifdef _ARCH_PWR8
1419 extern __inline __m128i
1420 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1421 _mm_mul_epu32(__m128i __A, __m128i __B) {
1422 #if __GNUC__ < 8
1423 __v2du __result;
1425 #ifdef __LITTLE_ENDIAN__
1426 /* VMX Vector Multiply Odd Unsigned Word. */
1427 __asm__("vmulouw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :);
1428 #else
1429 /* VMX Vector Multiply Even Unsigned Word. */
1430 __asm__("vmuleuw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :);
1431 #endif
1432 return (__m128i)__result;
1433 #else
1434 return (__m128i)vec_mule((__v4su)__A, (__v4su)__B);
1435 #endif
1437 #endif
1439 extern __inline __m128i
1440 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1441 _mm_slli_epi16(__m128i __A, int __B) {
1442 __v8hu __lshift;
1443 __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0};
1445 if (__B >= 0 && __B < 16) {
1446 if (__builtin_constant_p(__B))
1447 __lshift = (__v8hu)vec_splat_s16(__B);
1448 else
1449 __lshift = vec_splats((unsigned short)__B);
1451 __result = vec_sl((__v8hi)__A, __lshift);
1454 return (__m128i)__result;
1457 extern __inline __m128i
1458 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1459 _mm_slli_epi32(__m128i __A, int __B) {
1460 __v4su __lshift;
1461 __v4si __result = {0, 0, 0, 0};
1463 if (__B >= 0 && __B < 32) {
1464 if (__builtin_constant_p(__B) && __B < 16)
1465 __lshift = (__v4su)vec_splat_s32(__B);
1466 else
1467 __lshift = vec_splats((unsigned int)__B);
1469 __result = vec_sl((__v4si)__A, __lshift);
1472 return (__m128i)__result;
1475 #ifdef _ARCH_PWR8
1476 extern __inline __m128i
1477 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1478 _mm_slli_epi64(__m128i __A, int __B) {
1479 __v2du __lshift;
1480 __v2di __result = {0, 0};
1482 if (__B >= 0 && __B < 64) {
1483 if (__builtin_constant_p(__B) && __B < 16)
1484 __lshift = (__v2du)vec_splat_s32(__B);
1485 else
1486 __lshift = (__v2du)vec_splats((unsigned int)__B);
1488 __result = vec_sl((__v2di)__A, __lshift);
1491 return (__m128i)__result;
1493 #endif
1495 extern __inline __m128i
1496 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1497 _mm_srai_epi16(__m128i __A, int __B) {
1498 __v8hu __rshift = {15, 15, 15, 15, 15, 15, 15, 15};
1499 __v8hi __result;
1501 if (__B < 16) {
1502 if (__builtin_constant_p(__B))
1503 __rshift = (__v8hu)vec_splat_s16(__B);
1504 else
1505 __rshift = vec_splats((unsigned short)__B);
1507 __result = vec_sra((__v8hi)__A, __rshift);
1509 return (__m128i)__result;
1512 extern __inline __m128i
1513 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1514 _mm_srai_epi32(__m128i __A, int __B) {
1515 __v4su __rshift = {31, 31, 31, 31};
1516 __v4si __result;
1518 if (__B < 32) {
1519 if (__builtin_constant_p(__B)) {
1520 if (__B < 16)
1521 __rshift = (__v4su)vec_splat_s32(__B);
1522 else
1523 __rshift = (__v4su)vec_splats((unsigned int)__B);
1524 } else
1525 __rshift = vec_splats((unsigned int)__B);
1527 __result = vec_sra((__v4si)__A, __rshift);
1529 return (__m128i)__result;
1532 extern __inline __m128i
1533 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1534 _mm_bslli_si128(__m128i __A, const int __N) {
1535 __v16qu __result;
1536 const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1538 if (__N < 16)
1539 __result = vec_sld((__v16qu)__A, __zeros, __N);
1540 else
1541 __result = __zeros;
1543 return (__m128i)__result;
1546 extern __inline __m128i
1547 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1548 _mm_bsrli_si128(__m128i __A, const int __N) {
1549 __v16qu __result;
1550 const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1552 if (__N < 16)
1553 #ifdef __LITTLE_ENDIAN__
1554 if (__builtin_constant_p(__N))
1555 /* Would like to use Vector Shift Left Double by Octet
1556 Immediate here to use the immediate form and avoid
1557 load of __N * 8 value into a separate VR. */
1558 __result = vec_sld(__zeros, (__v16qu)__A, (16 - __N));
1559 else
1560 #endif
1562 __v16qu __shift = vec_splats((unsigned char)(__N * 8));
1563 #ifdef __LITTLE_ENDIAN__
1564 __result = vec_sro((__v16qu)__A, __shift);
1565 #else
1566 __result = vec_slo((__v16qu)__A, __shift);
1567 #endif
1569 else
1570 __result = __zeros;
1572 return (__m128i)__result;
1575 extern __inline __m128i
1576 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1577 _mm_srli_si128(__m128i __A, const int __N) {
1578 return _mm_bsrli_si128(__A, __N);
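
/* Illustrative sketch: these are byte-granular shifts of the whole 128-bit
   value, not per-element shifts:

     __m128i v = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8,
                              7, 6, 5, 4, 3, 2, 1, 0);   // byte i == i
     __m128i r = _mm_bsrli_si128(v, 4);   // shift right by 4 bytes
     // ((__v16qi)r)[0] == 4, and bytes 12..15 of r are zero.
*/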
1581 extern __inline __m128i
1582 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1583 _mm_slli_si128(__m128i __A, const int _imm5) {
1584 __v16qu __result;
1585 const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1587 if (_imm5 < 16)
1588 #ifdef __LITTLE_ENDIAN__
1589 __result = vec_sld((__v16qu)__A, __zeros, _imm5);
1590 #else
1591 __result = vec_sld(__zeros, (__v16qu)__A, (16 - _imm5));
1592 #endif
1593 else
1594 __result = __zeros;
1596 return (__m128i)__result;
1599 extern __inline __m128i
1600 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1602 _mm_srli_epi16(__m128i __A, int __B) {
1603 __v8hu __rshift;
1604 __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0};
1606 if (__B < 16) {
1607 if (__builtin_constant_p(__B))
1608 __rshift = (__v8hu)vec_splat_s16(__B);
1609 else
1610 __rshift = vec_splats((unsigned short)__B);
1612 __result = vec_sr((__v8hi)__A, __rshift);
1615 return (__m128i)__result;
1618 extern __inline __m128i
1619 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1620 _mm_srli_epi32(__m128i __A, int __B) {
1621 __v4su __rshift;
1622 __v4si __result = {0, 0, 0, 0};
1624 if (__B < 32) {
1625 if (__builtin_constant_p(__B)) {
1626 if (__B < 16)
1627 __rshift = (__v4su)vec_splat_s32(__B);
1628 else
1629 __rshift = (__v4su)vec_splats((unsigned int)__B);
1630 } else
1631 __rshift = vec_splats((unsigned int)__B);
1633 __result = vec_sr((__v4si)__A, __rshift);
1636 return (__m128i)__result;
1639 #ifdef _ARCH_PWR8
1640 extern __inline __m128i
1641 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1642 _mm_srli_epi64(__m128i __A, int __B) {
1643 __v2du __rshift;
1644 __v2di __result = {0, 0};
1646 if (__B < 64) {
1647 if (__builtin_constant_p(__B)) {
1648 if (__B < 16)
1649 __rshift = (__v2du)vec_splat_s32(__B);
1650 else
1651 __rshift = (__v2du)vec_splats((unsigned long long)__B);
1652 } else
1653 __rshift = (__v2du)vec_splats((unsigned int)__B);
1655 __result = vec_sr((__v2di)__A, __rshift);
1658 return (__m128i)__result;
1660 #endif
1662 extern __inline __m128i
1663 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1664 _mm_sll_epi16(__m128i __A, __m128i __B) {
1665 __v8hu __lshift;
1666 __vector __bool short __shmask;
1667 const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
1668 __v8hu __result;
1670 #ifdef __LITTLE_ENDIAN__
1671 __lshift = vec_splat((__v8hu)__B, 0);
1672 #else
1673 __lshift = vec_splat((__v8hu)__B, 3);
1674 #endif
1675 __shmask = vec_cmple(__lshift, __shmax);
1676 __result = vec_sl((__v8hu)__A, __lshift);
1677 __result = vec_sel((__v8hu)__shmask, __result, __shmask);
1679 return (__m128i)__result;
1682 extern __inline __m128i
1683 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1684 _mm_sll_epi32(__m128i __A, __m128i __B) {
1685 __v4su __lshift;
1686 __vector __bool int __shmask;
1687 const __v4su __shmax = {32, 32, 32, 32};
1688 __v4su __result;
1689 #ifdef __LITTLE_ENDIAN__
1690 __lshift = vec_splat((__v4su)__B, 0);
1691 #else
1692 __lshift = vec_splat((__v4su)__B, 1);
1693 #endif
1694 __shmask = vec_cmplt(__lshift, __shmax);
1695 __result = vec_sl((__v4su)__A, __lshift);
1696 __result = vec_sel((__v4su)__shmask, __result, __shmask);
1698 return (__m128i)__result;
1701 #ifdef _ARCH_PWR8
1702 extern __inline __m128i
1703 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1704 _mm_sll_epi64(__m128i __A, __m128i __B) {
1705 __v2du __lshift;
1706 __vector __bool long long __shmask;
1707 const __v2du __shmax = {64, 64};
1708 __v2du __result;
1710 __lshift = vec_splat((__v2du)__B, 0);
1711 __shmask = vec_cmplt(__lshift, __shmax);
1712 __result = vec_sl((__v2du)__A, __lshift);
1713 __result = vec_sel((__v2du)__shmask, __result, __shmask);
1715 return (__m128i)__result;
1717 #endif
1719 extern __inline __m128i
1720 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1721 _mm_sra_epi16(__m128i __A, __m128i __B) {
1722 const __v8hu __rshmax = {15, 15, 15, 15, 15, 15, 15, 15};
1723 __v8hu __rshift;
1724 __v8hi __result;
1726 #ifdef __LITTLE_ENDIAN__
1727 __rshift = vec_splat((__v8hu)__B, 0);
1728 #else
1729 __rshift = vec_splat((__v8hu)__B, 3);
1730 #endif
1731 __rshift = vec_min(__rshift, __rshmax);
1732 __result = vec_sra((__v8hi)__A, __rshift);
1734 return (__m128i)__result;
1737 extern __inline __m128i
1738 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1739 _mm_sra_epi32(__m128i __A, __m128i __B) {
1740 const __v4su __rshmax = {31, 31, 31, 31};
1741 __v4su __rshift;
1742 __v4si __result;
1744 #ifdef __LITTLE_ENDIAN__
1745 __rshift = vec_splat((__v4su)__B, 0);
1746 #else
1747 __rshift = vec_splat((__v4su)__B, 1);
1748 #endif
1749 __rshift = vec_min(__rshift, __rshmax);
1750 __result = vec_sra((__v4si)__A, __rshift);
1752 return (__m128i)__result;
1755 extern __inline __m128i
1756 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1757 _mm_srl_epi16(__m128i __A, __m128i __B) {
1758 __v8hu __rshift;
1759 __vector __bool short __shmask;
1760 const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
1761 __v8hu __result;
1763 #ifdef __LITTLE_ENDIAN__
1764 __rshift = vec_splat((__v8hu)__B, 0);
1765 #else
1766 __rshift = vec_splat((__v8hu)__B, 3);
1767 #endif
1768 __shmask = vec_cmple(__rshift, __shmax);
1769 __result = vec_sr((__v8hu)__A, __rshift);
1770 __result = vec_sel((__v8hu)__shmask, __result, __shmask);
1772 return (__m128i)__result;
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_epi32(__m128i __A, __m128i __B) {
  __v4su __rshift;
  __vector __bool int __shmask;
  const __v4su __shmax = {32, 32, 32, 32};
  __v4su __result;

#ifdef __LITTLE_ENDIAN__
  __rshift = vec_splat((__v4su)__B, 0);
#else
  __rshift = vec_splat((__v4su)__B, 1);
#endif
  __shmask = vec_cmplt(__rshift, __shmax);
  __result = vec_sr((__v4su)__A, __rshift);
  __result = vec_sel((__v4su)__shmask, __result, __shmask);

  return (__m128i)__result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_epi64(__m128i __A, __m128i __B) {
  __v2du __rshift;
  __vector __bool long long __shmask;
  const __v2du __shmax = {64, 64};
  __v2du __result;

  __rshift = vec_splat((__v2du)__B, 0);
  __shmask = vec_cmplt(__rshift, __shmax);
  __result = vec_sr((__v2du)__A, __rshift);
  __result = vec_sel((__v2du)__shmask, __result, __shmask);

  return (__m128i)__result;
}
#endif

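/* Bitwise AND, ANDNOT, OR and XOR over the full 128 bits.  The _pd
   variants operate on the raw bit patterns of the double lanes.  */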
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_and_pd(__m128d __A, __m128d __B) {
  return (vec_and((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_andnot_pd(__m128d __A, __m128d __B) {
  return (vec_andc((__v2df)__B, (__v2df)__A));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_or_pd(__m128d __A, __m128d __B) {
  return (vec_or((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_xor_pd(__m128d __A, __m128d __B) {
  return (vec_xor((__v2df)__A, (__v2df)__B));
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_and_si128(__m128i __A, __m128i __B) {
  return (__m128i)vec_and((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_andnot_si128(__m128i __A, __m128i __B) {
  return (__m128i)vec_andc((__v2di)__B, (__v2di)__A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_or_si128(__m128i __A, __m128i __B) {
  return (__m128i)vec_or((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_xor_si128(__m128i __A, __m128i __B) {
  return (__m128i)vec_xor((__v2di)__A, (__v2di)__B);
}

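/* Element-wise equality and signed ordering compares.  Each result
   element is all ones when the predicate holds and all zeros otherwise.  */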
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpeq((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpeq((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpeq((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmplt((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmplt((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmplt((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpgt((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpgt((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpgt((__v4si)__A, (__v4si)__B);
}

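/* Read or replace a single 16-bit element.  The element index is taken
   modulo 8, and the extracted value is zero-extended to int.  */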
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi16(__m128i const __A, int const __N) {
  return (unsigned short)((__v8hi)__A)[__N & 7];
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi16(__m128i const __A, int const __D, int const __N) {
  __v8hi __result = (__v8hi)__A;

  __result[(__N & 7)] = __D;

  return (__m128i)__result;
}

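/* Element-wise maxima and minima: signed for 16-bit elements,
   unsigned for 8-bit elements.  */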
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_max((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epu8(__m128i __A, __m128i __B) {
  return (__m128i)vec_max((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_min((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epu8(__m128i __A, __m128i __B) {
  return (__m128i)vec_min((__v16qu)__A, (__v16qu)__B);
}

#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum.  */

/* Return a mask created from the most significant bit of each 8-bit
   element in A.  */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movemask_epi8(__m128i __A) {
#ifdef _ARCH_PWR10
  return vec_extractm((__v16qu)__A);
#else
  __vector unsigned long long __result;
  static const __vector unsigned char __perm_mask = {
      0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
      0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00};

  __result = ((__vector unsigned long long)vec_vbpermq(
      (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));

#ifdef __LITTLE_ENDIAN__
  return __result[1];
#else
  return __result[0];
#endif
#endif /* !_ARCH_PWR10 */
}
#endif /* _ARCH_PWR8 */

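/* Multiply corresponding unsigned 16-bit elements and keep the high
   16 bits of each 32-bit product; the even/odd partial products are
   re-interleaved with a permute.  */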
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhi_epu16(__m128i __A, __m128i __B) {
  __v4su __w0, __w1;
  __v16qu __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
      0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
#endif
  };

  __w0 = vec_vmuleuh((__v8hu)__A, (__v8hu)__B);
  __w1 = vec_vmulouh((__v8hu)__A, (__v8hu)__B);
  return (__m128i)vec_perm(__w0, __w1, __xform1);
}

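/* Shuffles driven by 2-bit selectors packed into the mask: the high or
   low four 16-bit elements of __A (the other half is copied through),
   or all four 32-bit elements.  */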
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shufflehi_epi16(__m128i __A, const int __mask) {
  unsigned long __element_selector_98 = __mask & 0x03;
  unsigned long __element_selector_BA = (__mask >> 2) & 0x03;
  unsigned long __element_selector_DC = (__mask >> 4) & 0x03;
  unsigned long __element_selector_FE = (__mask >> 6) & 0x03;
  static const unsigned short __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#else
      0x0809, 0x0A0B, 0x0C0D, 0x0E0F
#endif
  };
  __v2du __pmask =
#ifdef __LITTLE_ENDIAN__
      {0x1716151413121110UL, 0UL};
#else
      {0x1011121314151617UL, 0UL};
#endif
  __m64_union __t;
  __v2du __a, __r;

  __t.as_short[0] = __permute_selectors[__element_selector_98];
  __t.as_short[1] = __permute_selectors[__element_selector_BA];
  __t.as_short[2] = __permute_selectors[__element_selector_DC];
  __t.as_short[3] = __permute_selectors[__element_selector_FE];
  __pmask[1] = __t.as_m64;
  __a = (__v2du)__A;
  __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
  return (__m128i)__r;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shufflelo_epi16(__m128i __A, const int __mask) {
  unsigned long __element_selector_10 = __mask & 0x03;
  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned short __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
      0x0100, 0x0302, 0x0504, 0x0706
#else
      0x0001, 0x0203, 0x0405, 0x0607
#endif
  };
  __v2du __pmask =
#ifdef __LITTLE_ENDIAN__
      {0UL, 0x1f1e1d1c1b1a1918UL};
#else
      {0UL, 0x18191a1b1c1d1e1fUL};
#endif
  __m64_union __t;
  __v2du __a, __r;
  __t.as_short[0] = __permute_selectors[__element_selector_10];
  __t.as_short[1] = __permute_selectors[__element_selector_32];
  __t.as_short[2] = __permute_selectors[__element_selector_54];
  __t.as_short[3] = __permute_selectors[__element_selector_76];
  __pmask[0] = __t.as_m64;
  __a = (__v2du)__A;
  __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
  return (__m128i)__r;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shuffle_epi32(__m128i __A, const int __mask) {
  unsigned long __element_selector_10 = __mask & 0x03;
  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned int __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
      0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#else
      0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
#endif
  };
  __v4su __t;

  __t[0] = __permute_selectors[__element_selector_10];
  __t[1] = __permute_selectors[__element_selector_32];
  __t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
  __t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
  return (__m128i)vec_perm((__v4si)__A, (__v4si)__A,
                           (__vector unsigned char)__t);
}

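/* Conditionally store bytes of __A to the possibly unaligned address
   __C: only bytes whose corresponding mask byte in __B has its most
   significant bit set are written.  */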
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_maskmoveu_si128(__m128i __A, __m128i __B, char *__C) {
  __v2du __hibit = {0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
  __v16qu __mask, __tmp;
  __m128i_u *__p = (__m128i_u *)__C;

  __tmp = (__v16qu)_mm_loadu_si128(__p);
  __mask = (__v16qu)vec_cmpgt((__v16qu)__B, (__v16qu)__hibit);
  __tmp = vec_sel(__tmp, (__v16qu)__A, __mask);
  _mm_storeu_si128(__p, (__m128i)__tmp);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_avg_epu8(__m128i __A, __m128i __B) {
  return (__m128i)vec_avg((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_avg_epu16(__m128i __A, __m128i __B) {
  return (__m128i)vec_avg((__v8hu)__A, (__v8hu)__B);
}

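/* Sum of absolute differences: for each 8-byte half, the absolute
   differences of the unsigned bytes of __A and __B are summed into the
   low 16 bits of the corresponding 64-bit lane.  */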
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sad_epu8(__m128i __A, __m128i __B) {
  __v16qu __a, __b;
  __v16qu __vabsdiff;
  __v4si __vsum;
  const __v4su __zero = {0, 0, 0, 0};
  __v4si __result;

  __a = (__v16qu)__A;
  __b = (__v16qu)__B;
#ifndef _ARCH_PWR9
  __v16qu __vmin = vec_min(__a, __b);
  __v16qu __vmax = vec_max(__a, __b);
  __vabsdiff = vec_sub(__vmax, __vmin);
#else
  __vabsdiff = vec_absd(__a, __b);
#endif
  /* Sum four groups of bytes into integers.  */
  __vsum = (__vector signed int)vec_sum4s(__vabsdiff, __zero);
#ifdef __LITTLE_ENDIAN__
  /* Sum across four integers with two integer results.  */
  __asm__("vsum2sws %0,%1,%2" : "=v"(__result) : "v"(__vsum), "v"(__zero));
  /* Note: vec_sum2s could be used here, but on little-endian, vector
     shifts are added that are not needed for this use-case.
     A vector shift to correctly position the 32-bit integer results
     (currently at [0] and [2]) to [1] and [3] would then need to be
     swapped back again since the desired results are two 64-bit
     integers ([1]|[0] and [3]|[2]).  Thus, no shift is performed.  */
#else
  /* Sum across four integers with two integer results.  */
  __result = vec_sum2s(__vsum, (__vector signed int)__zero);
  /* Rotate the sums into the correct position.  */
  __result = vec_sld(__result, __result, 6);
#endif
  return (__m128i)__result;
}

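/* Non-temporal stores and cache control.  On POWER these are ordinary
   stores preceded by a dcbtstt (store transient) cache hint; _mm_clflush
   maps to dcbf and the fences to sync-based atomic fences.  */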
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_si32(int *__A, int __B) {
  /* Use the data cache block touch for store transient.  */
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  *__A = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_si64(long long int *__A, long long int __B) {
  /* Use the data cache block touch for store transient.  */
  __asm__(" dcbtstt 0,%0" : : "b"(__A) : "memory");
  *__A = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_si128(__m128i *__A, __m128i __B) {
  /* Use the data cache block touch for store transient.  */
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  *__A = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_pd(double *__A, __m128d __B) {
  /* Use the data cache block touch for store transient.  */
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  *(__m128d *)__A = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_clflush(void const *__A) {
  /* Use the data cache block flush.  */
  __asm__("dcbf 0,%0" : : "b"(__A) : "memory");
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_lfence(void) {
  /* Use light weight sync for load to load ordering.  */
  __atomic_thread_fence(__ATOMIC_RELEASE);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mfence(void) {
  /* Use heavy weight sync for any to any ordering.  */
  __atomic_thread_fence(__ATOMIC_SEQ_CST);
}

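/* Move a 32- or 64-bit integer into the low element of a vector,
   zeroing the remaining elements.  */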
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi32_si128(int __A) {
  return _mm_set_epi32(0, 0, 0, __A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_si128(long long __A) {
  return __extension__(__m128i)(__v2di){__A, 0LL};
}

/* Microsoft intrinsic.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64x_si128(long long __A) {
  return __extension__(__m128i)(__v2di){__A, 0LL};
}

/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castpd_ps(__m128d __A) {
  return (__m128)__A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castpd_si128(__m128d __A) {
  return (__m128i)__A;
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castps_pd(__m128 __A) {
  return (__m128d)__A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castps_si128(__m128 __A) {
  return (__m128i)__A;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castsi128_ps(__m128i __A) {
  return (__m128)__A;
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castsi128_pd(__m128i __A) {
  return (__m128d)__A;
}

#else
#include_next <emmintrin.h>
#endif /* defined(__powerpc64__) &&                                            \
        * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* EMMINTRIN_H_ */