clang/lib/Headers/ppc_wrappers/mmintrin.h

   1 /*===---- mmintrin.h - Implementation of MMX intrinsics on PowerPC ---------===
   2  *
   3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4  * See https://llvm.org/LICENSE.txt for license information.
   5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6  *
   7  *===-----------------------------------------------------------------------===
   8  */
   9
  10 /* Implemented from the specification included in the Intel C++ Compiler
  11    User Guide and Reference, version 9.0.  */
  12
  13 #ifndef NO_WARN_X86_INTRINSICS
  14 /* This header file is to help porting code using Intel intrinsics
  15    explicitly from x86_64 to powerpc64/powerpc64le.
  16
   Since the PowerPC target doesn't support a native 64-bit vector type, we
   typedef __m64 to 64-bit unsigned long long in MMX intrinsics, which
   works well for _si64 and some _pi32 operations.

   For _pi16 and _pi8 operations, it's better to transfer __m64 into a
   128-bit PowerPC vector first. Power8 introduced direct register
   move instructions, which help make the implementation more efficient.
  24
  25    It's user's responsibility to determine if the results of such port
  26    are acceptable or further changes are needed. Please note that much
  27    code using Intel intrinsics CAN BE REWRITTEN in more portable and
  28    efficient standard C or GNU C extensions with 64-bit scalar
  29    operations, or 128-bit SSE/Altivec operations, which are more
  30    recommended. */
  31 #error                                                                         \
  32     "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
  33 #endif
  34
  35 #ifndef _MMINTRIN_H_INCLUDED
  36 #define _MMINTRIN_H_INCLUDED
  37
  38 #if defined(__powerpc64__) &&                                                  \
  39     (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
  40
  41 #include <altivec.h>
  42 /* The Intel API is flexible enough that we must allow aliasing with other
  43    vector types, and their scalar components.  */
  44 typedef __attribute__((__aligned__(8))) unsigned long long __m64;
  45
  46 typedef __attribute__((__aligned__(8))) union {
  47   __m64 as_m64;
  48   char as_char[8];
  49   signed char as_signed_char[8];
  50   short as_short[4];
  51   int as_int[2];
  52   long long as_long_long;
  53   float as_float[2];
  54   double as_double;
  55 } __m64_union;
  56
  57 /* Empty the multimedia state.  */
  58 extern __inline void
  59     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  60     _mm_empty(void) {
  61   /* nothing to do on PowerPC.  */
  62 }
  63
  64 extern __inline void
  65     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  66     _m_empty(void) {
  67   /* nothing to do on PowerPC.  */
  68 }
  69
  70 /* Convert I to a __m64 object.  The integer is zero-extended to 64-bits.  */
  71 extern __inline __m64
  72     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  73     _mm_cvtsi32_si64(int __i) {
  74   return (__m64)(unsigned int)__i;
  75 }
  76
  77 extern __inline __m64
  78     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  79     _m_from_int(int __i) {
  80   return _mm_cvtsi32_si64(__i);
  81 }
  82
  83 /* Convert the lower 32 bits of the __m64 object into an integer.  */
  84 extern __inline int
  85     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  86     _mm_cvtsi64_si32(__m64 __i) {
  87   return ((int)__i);
  88 }
  89
  90 extern __inline int
  91     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  92     _m_to_int(__m64 __i) {
  93   return _mm_cvtsi64_si32(__i);
  94 }
  95
  96 /* Convert I to a __m64 object.  */
  97
  98 /* Intel intrinsic.  */
  99 extern __inline __m64
 100     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 101     _m_from_int64(long long __i) {
 102   return (__m64)__i;
 103 }
 104
 105 extern __inline __m64
 106     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 107     _mm_cvtsi64_m64(long long __i) {
 108   return (__m64)__i;
 109 }
 110
 111 /* Microsoft intrinsic.  */
 112 extern __inline __m64
 113     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 114     _mm_cvtsi64x_si64(long long __i) {
 115   return (__m64)__i;
 116 }
 117
 118 extern __inline __m64
 119     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 120     _mm_set_pi64x(long long __i) {
 121   return (__m64)__i;
 122 }
 123
 124 /* Convert the __m64 object to a 64bit integer.  */
 125
 126 /* Intel intrinsic.  */
 127 extern __inline long long
 128     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 129     _m_to_int64(__m64 __i) {
 130   return (long long)__i;
 131 }
 132
 133 extern __inline long long
 134     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 135     _mm_cvtm64_si64(__m64 __i) {
 136   return (long long)__i;
 137 }
 138
 139 /* Microsoft intrinsic.  */
 140 extern __inline long long
 141     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 142     _mm_cvtsi64_si64x(__m64 __i) {
 143   return (long long)__i;
 144 }
 145
 146 #ifdef _ARCH_PWR8
 147 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
 148    the result, and the four 16-bit values from M2 into the upper four 8-bit
 149    values of the result, all with signed saturation.  */
 150 extern __inline __m64
 151     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 152     _mm_packs_pi16(__m64 __m1, __m64 __m2) {
 153   __vector signed short __vm1;
 154   __vector signed char __vresult;
 155
 156   __vm1 = (__vector signed short)(__vector unsigned long long)
 157 #ifdef __LITTLE_ENDIAN__
 158       {__m1, __m2};
 159 #else
 160       {__m2, __m1};
 161 #endif
 162   __vresult = vec_packs(__vm1, __vm1);
 163   return (__m64)((__vector long long)__vresult)[0];
 164 }
 165
 166 extern __inline __m64
 167     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 168     _m_packsswb(__m64 __m1, __m64 __m2) {
 169   return _mm_packs_pi16(__m1, __m2);
 170 }
 171
 172 /* Pack the two 32-bit values from M1 in to the lower two 16-bit values of
 173    the result, and the two 32-bit values from M2 into the upper two 16-bit
 174    values of the result, all with signed saturation.  */
 175 extern __inline __m64
 176     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 177     _mm_packs_pi32(__m64 __m1, __m64 __m2) {
 178   __vector signed int __vm1;
 179   __vector signed short __vresult;
 180
 181   __vm1 = (__vector signed int)(__vector unsigned long long)
 182 #ifdef __LITTLE_ENDIAN__
 183       {__m1, __m2};
 184 #else
 185       {__m2, __m1};
 186 #endif
 187   __vresult = vec_packs(__vm1, __vm1);
 188   return (__m64)((__vector long long)__vresult)[0];
 189 }
 190
 191 extern __inline __m64
 192     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 193     _m_packssdw(__m64 __m1, __m64 __m2) {
 194   return _mm_packs_pi32(__m1, __m2);
 195 }
 196
 197 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
 198    the result, and the four 16-bit values from M2 into the upper four 8-bit
 199    values of the result, all with unsigned saturation.  */
 200 extern __inline __m64
 201     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 202     _mm_packs_pu16(__m64 __m1, __m64 __m2) {
 203   __vector unsigned char __r;
 204   __vector signed short __vm1 = (__vector signed short)(__vector long long)
 205 #ifdef __LITTLE_ENDIAN__
 206       {__m1, __m2};
 207 #else
 208       {__m2, __m1};
 209 #endif
 210   const __vector signed short __zero = {0};
 211   __vector __bool short __select = vec_cmplt(__vm1, __zero);
 212   __r =
 213       vec_packs((__vector unsigned short)__vm1, (__vector unsigned short)__vm1);
 214   __vector __bool char __packsel = vec_pack(__select, __select);
 215   __r = vec_sel(__r, (const __vector unsigned char)__zero, __packsel);
 216   return (__m64)((__vector long long)__r)[0];
 217 }
 218
 219 extern __inline __m64
 220     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 221     _m_packuswb(__m64 __m1, __m64 __m2) {
 222   return _mm_packs_pu16(__m1, __m2);
 223 }
 224 #endif /* end ARCH_PWR8 */
 225
 226 /* Interleave the four 8-bit values from the high half of M1 with the four
 227    8-bit values from the high half of M2.  */
 228 extern __inline __m64
 229     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 230     _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) {
 231 #if _ARCH_PWR8
 232   __vector unsigned char __a, __b, __c;
 233
 234   __a = (__vector unsigned char)vec_splats(__m1);
 235   __b = (__vector unsigned char)vec_splats(__m2);
 236   __c = vec_mergel(__a, __b);
 237   return (__m64)((__vector long long)__c)[1];
 238 #else
 239   __m64_union __mu1, __mu2, __res;
 240
 241   __mu1.as_m64 = __m1;
 242   __mu2.as_m64 = __m2;
 243
 244   __res.as_char[0] = __mu1.as_char[4];
 245   __res.as_char[1] = __mu2.as_char[4];
 246   __res.as_char[2] = __mu1.as_char[5];
 247   __res.as_char[3] = __mu2.as_char[5];
 248   __res.as_char[4] = __mu1.as_char[6];
 249   __res.as_char[5] = __mu2.as_char[6];
 250   __res.as_char[6] = __mu1.as_char[7];
 251   __res.as_char[7] = __mu2.as_char[7];
 252
 253   return (__m64)__res.as_m64;
 254 #endif
 255 }
 256
 257 extern __inline __m64
 258     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 259     _m_punpckhbw(__m64 __m1, __m64 __m2) {
 260   return _mm_unpackhi_pi8(__m1, __m2);
 261 }
 262
 263 /* Interleave the two 16-bit values from the high half of M1 with the two
 264    16-bit values from the high half of M2.  */
 265 extern __inline __m64
 266     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 267     _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) {
 268   __m64_union __mu1, __mu2, __res;
 269
 270   __mu1.as_m64 = __m1;
 271   __mu2.as_m64 = __m2;
 272
 273   __res.as_short[0] = __mu1.as_short[2];
 274   __res.as_short[1] = __mu2.as_short[2];
 275   __res.as_short[2] = __mu1.as_short[3];
 276   __res.as_short[3] = __mu2.as_short[3];
 277
 278   return (__m64)__res.as_m64;
 279 }
 280
 281 extern __inline __m64
 282     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 283     _m_punpckhwd(__m64 __m1, __m64 __m2) {
 284   return _mm_unpackhi_pi16(__m1, __m2);
 285 }
 286 /* Interleave the 32-bit value from the high half of M1 with the 32-bit
 287    value from the high half of M2.  */
 288 extern __inline __m64
 289     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 290     _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) {
 291   __m64_union __mu1, __mu2, __res;
 292
 293   __mu1.as_m64 = __m1;
 294   __mu2.as_m64 = __m2;
 295
 296   __res.as_int[0] = __mu1.as_int[1];
 297   __res.as_int[1] = __mu2.as_int[1];
 298
 299   return (__m64)__res.as_m64;
 300 }
 301
 302 extern __inline __m64
 303     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 304     _m_punpckhdq(__m64 __m1, __m64 __m2) {
 305   return _mm_unpackhi_pi32(__m1, __m2);
 306 }
 307 /* Interleave the four 8-bit values from the low half of M1 with the four
 308    8-bit values from the low half of M2.  */
 309 extern __inline __m64
 310     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 311     _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) {
 312 #if _ARCH_PWR8
 313   __vector unsigned char __a, __b, __c;
 314
 315   __a = (__vector unsigned char)vec_splats(__m1);
 316   __b = (__vector unsigned char)vec_splats(__m2);
 317   __c = vec_mergel(__a, __b);
 318   return (__m64)((__vector long long)__c)[0];
 319 #else
 320   __m64_union __mu1, __mu2, __res;
 321
 322   __mu1.as_m64 = __m1;
 323   __mu2.as_m64 = __m2;
 324
 325   __res.as_char[0] = __mu1.as_char[0];
 326   __res.as_char[1] = __mu2.as_char[0];
 327   __res.as_char[2] = __mu1.as_char[1];
 328   __res.as_char[3] = __mu2.as_char[1];
 329   __res.as_char[4] = __mu1.as_char[2];
 330   __res.as_char[5] = __mu2.as_char[2];
 331   __res.as_char[6] = __mu1.as_char[3];
 332   __res.as_char[7] = __mu2.as_char[3];
 333
 334   return (__m64)__res.as_m64;
 335 #endif
 336 }
 337
 338 extern __inline __m64
 339     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 340     _m_punpcklbw(__m64 __m1, __m64 __m2) {
 341   return _mm_unpacklo_pi8(__m1, __m2);
 342 }
 343 /* Interleave the two 16-bit values from the low half of M1 with the two
 344    16-bit values from the low half of M2.  */
 345 extern __inline __m64
 346     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 347     _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) {
 348   __m64_union __mu1, __mu2, __res;
 349
 350   __mu1.as_m64 = __m1;
 351   __mu2.as_m64 = __m2;
 352
 353   __res.as_short[0] = __mu1.as_short[0];
 354   __res.as_short[1] = __mu2.as_short[0];
 355   __res.as_short[2] = __mu1.as_short[1];
 356   __res.as_short[3] = __mu2.as_short[1];
 357
 358   return (__m64)__res.as_m64;
 359 }
 360
 361 extern __inline __m64
 362     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 363     _m_punpcklwd(__m64 __m1, __m64 __m2) {
 364   return _mm_unpacklo_pi16(__m1, __m2);
 365 }
 366
 367 /* Interleave the 32-bit value from the low half of M1 with the 32-bit
 368    value from the low half of M2.  */
 369 extern __inline __m64
 370     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 371     _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) {
 372   __m64_union __mu1, __mu2, __res;
 373
 374   __mu1.as_m64 = __m1;
 375   __mu2.as_m64 = __m2;
 376
 377   __res.as_int[0] = __mu1.as_int[0];
 378   __res.as_int[1] = __mu2.as_int[0];
 379
 380   return (__m64)__res.as_m64;
 381 }
 382
 383 extern __inline __m64
 384     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 385     _m_punpckldq(__m64 __m1, __m64 __m2) {
 386   return _mm_unpacklo_pi32(__m1, __m2);
 387 }
 388
 389 /* Add the 8-bit values in M1 to the 8-bit values in M2.  */
 390 extern __inline __m64
 391     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 392     _mm_add_pi8(__m64 __m1, __m64 __m2) {
 393 #if _ARCH_PWR8
 394   __vector signed char __a, __b, __c;
 395
 396   __a = (__vector signed char)vec_splats(__m1);
 397   __b = (__vector signed char)vec_splats(__m2);
 398   __c = vec_add(__a, __b);
 399   return (__m64)((__vector long long)__c)[0];
 400 #else
 401   __m64_union __mu1, __mu2, __res;
 402
 403   __mu1.as_m64 = __m1;
 404   __mu2.as_m64 = __m2;
 405
 406   __res.as_char[0] = __mu1.as_char[0] + __mu2.as_char[0];
 407   __res.as_char[1] = __mu1.as_char[1] + __mu2.as_char[1];
 408   __res.as_char[2] = __mu1.as_char[2] + __mu2.as_char[2];
 409   __res.as_char[3] = __mu1.as_char[3] + __mu2.as_char[3];
 410   __res.as_char[4] = __mu1.as_char[4] + __mu2.as_char[4];
 411   __res.as_char[5] = __mu1.as_char[5] + __mu2.as_char[5];
 412   __res.as_char[6] = __mu1.as_char[6] + __mu2.as_char[6];
 413   __res.as_char[7] = __mu1.as_char[7] + __mu2.as_char[7];
 414
 415   return (__m64)__res.as_m64;
 416 #endif
 417 }
 418
 419 extern __inline __m64
 420     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 421     _m_paddb(__m64 __m1, __m64 __m2) {
 422   return _mm_add_pi8(__m1, __m2);
 423 }
 424
 425 /* Add the 16-bit values in M1 to the 16-bit values in M2.  */
 426 extern __inline __m64
 427     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 428     _mm_add_pi16(__m64 __m1, __m64 __m2) {
 429 #if _ARCH_PWR8
 430   __vector signed short __a, __b, __c;
 431
 432   __a = (__vector signed short)vec_splats(__m1);
 433   __b = (__vector signed short)vec_splats(__m2);
 434   __c = vec_add(__a, __b);
 435   return (__m64)((__vector long long)__c)[0];
 436 #else
 437   __m64_union __mu1, __mu2, __res;
 438
 439   __mu1.as_m64 = __m1;
 440   __mu2.as_m64 = __m2;
 441
 442   __res.as_short[0] = __mu1.as_short[0] + __mu2.as_short[0];
 443   __res.as_short[1] = __mu1.as_short[1] + __mu2.as_short[1];
 444   __res.as_short[2] = __mu1.as_short[2] + __mu2.as_short[2];
 445   __res.as_short[3] = __mu1.as_short[3] + __mu2.as_short[3];
 446
 447   return (__m64)__res.as_m64;
 448 #endif
 449 }
 450
 451 extern __inline __m64
 452     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 453     _m_paddw(__m64 __m1, __m64 __m2) {
 454   return _mm_add_pi16(__m1, __m2);
 455 }
 456
 457 /* Add the 32-bit values in M1 to the 32-bit values in M2.  */
 458 extern __inline __m64
 459     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 460     _mm_add_pi32(__m64 __m1, __m64 __m2) {
 461 #if _ARCH_PWR9
 462   __vector signed int __a, __b, __c;
 463
 464   __a = (__vector signed int)vec_splats(__m1);
 465   __b = (__vector signed int)vec_splats(__m2);
 466   __c = vec_add(__a, __b);
 467   return (__m64)((__vector long long)__c)[0];
 468 #else
 469   __m64_union __mu1, __mu2, __res;
 470
 471   __mu1.as_m64 = __m1;
 472   __mu2.as_m64 = __m2;
 473
 474   __res.as_int[0] = __mu1.as_int[0] + __mu2.as_int[0];
 475   __res.as_int[1] = __mu1.as_int[1] + __mu2.as_int[1];
 476
 477   return (__m64)__res.as_m64;
 478 #endif
 479 }
 480
 481 extern __inline __m64
 482     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 483     _m_paddd(__m64 __m1, __m64 __m2) {
 484   return _mm_add_pi32(__m1, __m2);
 485 }
 486
 487 /* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
 488 extern __inline __m64
 489     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 490     _mm_sub_pi8(__m64 __m1, __m64 __m2) {
 491 #if _ARCH_PWR8
 492   __vector signed char __a, __b, __c;
 493
 494   __a = (__vector signed char)vec_splats(__m1);
 495   __b = (__vector signed char)vec_splats(__m2);
 496   __c = vec_sub(__a, __b);
 497   return (__m64)((__vector long long)__c)[0];
 498 #else
 499   __m64_union __mu1, __mu2, __res;
 500
 501   __mu1.as_m64 = __m1;
 502   __mu2.as_m64 = __m2;
 503
 504   __res.as_char[0] = __mu1.as_char[0] - __mu2.as_char[0];
 505   __res.as_char[1] = __mu1.as_char[1] - __mu2.as_char[1];
 506   __res.as_char[2] = __mu1.as_char[2] - __mu2.as_char[2];
 507   __res.as_char[3] = __mu1.as_char[3] - __mu2.as_char[3];
 508   __res.as_char[4] = __mu1.as_char[4] - __mu2.as_char[4];
 509   __res.as_char[5] = __mu1.as_char[5] - __mu2.as_char[5];
 510   __res.as_char[6] = __mu1.as_char[6] - __mu2.as_char[6];
 511   __res.as_char[7] = __mu1.as_char[7] - __mu2.as_char[7];
 512
 513   return (__m64)__res.as_m64;
 514 #endif
 515 }
 516
 517 extern __inline __m64
 518     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 519     _m_psubb(__m64 __m1, __m64 __m2) {
 520   return _mm_sub_pi8(__m1, __m2);
 521 }
 522
 523 /* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
 524 extern __inline __m64
 525     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 526     _mm_sub_pi16(__m64 __m1, __m64 __m2) {
 527 #if _ARCH_PWR8
 528   __vector signed short __a, __b, __c;
 529
 530   __a = (__vector signed short)vec_splats(__m1);
 531   __b = (__vector signed short)vec_splats(__m2);
 532   __c = vec_sub(__a, __b);
 533   return (__m64)((__vector long long)__c)[0];
 534 #else
 535   __m64_union __mu1, __mu2, __res;
 536
 537   __mu1.as_m64 = __m1;
 538   __mu2.as_m64 = __m2;
 539
 540   __res.as_short[0] = __mu1.as_short[0] - __mu2.as_short[0];
 541   __res.as_short[1] = __mu1.as_short[1] - __mu2.as_short[1];
 542   __res.as_short[2] = __mu1.as_short[2] - __mu2.as_short[2];
 543   __res.as_short[3] = __mu1.as_short[3] - __mu2.as_short[3];
 544
 545   return (__m64)__res.as_m64;
 546 #endif
 547 }
 548
 549 extern __inline __m64
 550     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 551     _m_psubw(__m64 __m1, __m64 __m2) {
 552   return _mm_sub_pi16(__m1, __m2);
 553 }
 554
 555 /* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
 556 extern __inline __m64
 557     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 558     _mm_sub_pi32(__m64 __m1, __m64 __m2) {
 559 #if _ARCH_PWR9
 560   __vector signed int __a, __b, __c;
 561
 562   __a = (__vector signed int)vec_splats(__m1);
 563   __b = (__vector signed int)vec_splats(__m2);
 564   __c = vec_sub(__a, __b);
 565   return (__m64)((__vector long long)__c)[0];
 566 #else
 567   __m64_union __mu1, __mu2, __res;
 568
 569   __mu1.as_m64 = __m1;
 570   __mu2.as_m64 = __m2;
 571
 572   __res.as_int[0] = __mu1.as_int[0] - __mu2.as_int[0];
 573   __res.as_int[1] = __mu1.as_int[1] - __mu2.as_int[1];
 574
 575   return (__m64)__res.as_m64;
 576 #endif
 577 }
 578
 579 extern __inline __m64
 580     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 581     _m_psubd(__m64 __m1, __m64 __m2) {
 582   return _mm_sub_pi32(__m1, __m2);
 583 }
 584
 585 extern __inline __m64
 586     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 587     _mm_add_si64(__m64 __m1, __m64 __m2) {
 588   return (__m1 + __m2);
 589 }
 590
 591 extern __inline __m64
 592     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 593     _mm_sub_si64(__m64 __m1, __m64 __m2) {
 594   return (__m1 - __m2);
 595 }
 596
 597 /* Shift the 64-bit value in M left by COUNT.  */
 598 extern __inline __m64
 599     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 600     _mm_sll_si64(__m64 __m, __m64 __count) {
 601   return (__m << __count);
 602 }
 603
 604 extern __inline __m64
 605     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 606     _m_psllq(__m64 __m, __m64 __count) {
 607   return _mm_sll_si64(__m, __count);
 608 }
 609
 610 extern __inline __m64
 611     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 612     _mm_slli_si64(__m64 __m, const int __count) {
 613   return (__m << __count);
 614 }
 615
 616 extern __inline __m64
 617     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 618     _m_psllqi(__m64 __m, const int __count) {
 619   return _mm_slli_si64(__m, __count);
 620 }
 621
 622 /* Shift the 64-bit value in M left by COUNT; shift in zeros.  */
 623 extern __inline __m64
 624     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 625     _mm_srl_si64(__m64 __m, __m64 __count) {
 626   return (__m >> __count);
 627 }
 628
 629 extern __inline __m64
 630     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 631     _m_psrlq(__m64 __m, __m64 __count) {
 632   return _mm_srl_si64(__m, __count);
 633 }
 634
 635 extern __inline __m64
 636     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 637     _mm_srli_si64(__m64 __m, const int __count) {
 638   return (__m >> __count);
 639 }
 640
 641 extern __inline __m64
 642     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 643     _m_psrlqi(__m64 __m, const int __count) {
 644   return _mm_srli_si64(__m, __count);
 645 }
 646
 647 /* Bit-wise AND the 64-bit values in M1 and M2.  */
 648 extern __inline __m64
 649     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 650     _mm_and_si64(__m64 __m1, __m64 __m2) {
 651   return (__m1 & __m2);
 652 }
 653
 654 extern __inline __m64
 655     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 656     _m_pand(__m64 __m1, __m64 __m2) {
 657   return _mm_and_si64(__m1, __m2);
 658 }
 659
 660 /* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
 661    64-bit value in M2.  */
 662 extern __inline __m64
 663     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 664     _mm_andnot_si64(__m64 __m1, __m64 __m2) {
 665   return (~__m1 & __m2);
 666 }
 667
 668 extern __inline __m64
 669     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 670     _m_pandn(__m64 __m1, __m64 __m2) {
 671   return _mm_andnot_si64(__m1, __m2);
 672 }
 673
 674 /* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
 675 extern __inline __m64
 676     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 677     _mm_or_si64(__m64 __m1, __m64 __m2) {
 678   return (__m1 | __m2);
 679 }
 680
 681 extern __inline __m64
 682     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 683     _m_por(__m64 __m1, __m64 __m2) {
 684   return _mm_or_si64(__m1, __m2);
 685 }
 686
 687 /* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
 688 extern __inline __m64
 689     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 690     _mm_xor_si64(__m64 __m1, __m64 __m2) {
 691   return (__m1 ^ __m2);
 692 }
 693
 694 extern __inline __m64
 695     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 696     _m_pxor(__m64 __m1, __m64 __m2) {
 697   return _mm_xor_si64(__m1, __m2);
 698 }
 699
 700 /* Creates a 64-bit zero.  */
 701 extern __inline __m64
 702     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 703     _mm_setzero_si64(void) {
 704   return (__m64)0;
 705 }
 706
 707 /* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
 708    test is true and zero if false.  */
 709 extern __inline __m64
 710     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 711     _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) {
 712 #if defined(_ARCH_PWR6) && defined(__powerpc64__)
 713   __m64 __res;
 714   __asm__("cmpb %0,%1,%2;\n" : "=r"(__res) : "r"(__m1), "r"(__m2) :);
 715   return (__res);
 716 #else
 717   __m64_union __mu1, __mu2, __res;
 718
 719   __mu1.as_m64 = __m1;
 720   __mu2.as_m64 = __m2;
 721
 722   __res.as_char[0] = (__mu1.as_char[0] == __mu2.as_char[0]) ? -1 : 0;
 723   __res.as_char[1] = (__mu1.as_char[1] == __mu2.as_char[1]) ? -1 : 0;
 724   __res.as_char[2] = (__mu1.as_char[2] == __mu2.as_char[2]) ? -1 : 0;
 725   __res.as_char[3] = (__mu1.as_char[3] == __mu2.as_char[3]) ? -1 : 0;
 726   __res.as_char[4] = (__mu1.as_char[4] == __mu2.as_char[4]) ? -1 : 0;
 727   __res.as_char[5] = (__mu1.as_char[5] == __mu2.as_char[5]) ? -1 : 0;
 728   __res.as_char[6] = (__mu1.as_char[6] == __mu2.as_char[6]) ? -1 : 0;
 729   __res.as_char[7] = (__mu1.as_char[7] == __mu2.as_char[7]) ? -1 : 0;
 730
 731   return (__m64)__res.as_m64;
 732 #endif
 733 }
 734
 735 extern __inline __m64
 736     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 737     _m_pcmpeqb(__m64 __m1, __m64 __m2) {
 738   return _mm_cmpeq_pi8(__m1, __m2);
 739 }
 740
 741 extern __inline __m64
 742     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 743     _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) {
 744 #if _ARCH_PWR8
 745   __vector signed char __a, __b, __c;
 746
 747   __a = (__vector signed char)vec_splats(__m1);
 748   __b = (__vector signed char)vec_splats(__m2);
 749   __c = (__vector signed char)vec_cmpgt(__a, __b);
 750   return (__m64)((__vector long long)__c)[0];
 751 #else
 752   __m64_union __mu1, __mu2, __res;
 753
 754   __mu1.as_m64 = __m1;
 755   __mu2.as_m64 = __m2;
 756
 757   __res.as_char[0] = (__mu1.as_char[0] > __mu2.as_char[0]) ? -1 : 0;
 758   __res.as_char[1] = (__mu1.as_char[1] > __mu2.as_char[1]) ? -1 : 0;
 759   __res.as_char[2] = (__mu1.as_char[2] > __mu2.as_char[2]) ? -1 : 0;
 760   __res.as_char[3] = (__mu1.as_char[3] > __mu2.as_char[3]) ? -1 : 0;
 761   __res.as_char[4] = (__mu1.as_char[4] > __mu2.as_char[4]) ? -1 : 0;
 762   __res.as_char[5] = (__mu1.as_char[5] > __mu2.as_char[5]) ? -1 : 0;
 763   __res.as_char[6] = (__mu1.as_char[6] > __mu2.as_char[6]) ? -1 : 0;
 764   __res.as_char[7] = (__mu1.as_char[7] > __mu2.as_char[7]) ? -1 : 0;
 765
 766   return (__m64)__res.as_m64;
 767 #endif
 768 }
 769
 770 extern __inline __m64
 771     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 772     _m_pcmpgtb(__m64 __m1, __m64 __m2) {
 773   return _mm_cmpgt_pi8(__m1, __m2);
 774 }
 775
 776 /* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
 777    the test is true and zero if false.  */
 778 extern __inline __m64
 779     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 780     _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) {
 781 #if _ARCH_PWR8
 782   __vector signed short __a, __b, __c;
 783
 784   __a = (__vector signed short)vec_splats(__m1);
 785   __b = (__vector signed short)vec_splats(__m2);
 786   __c = (__vector signed short)vec_cmpeq(__a, __b);
 787   return (__m64)((__vector long long)__c)[0];
 788 #else
 789   __m64_union __mu1, __mu2, __res;
 790
 791   __mu1.as_m64 = __m1;
 792   __mu2.as_m64 = __m2;
 793
 794   __res.as_short[0] = (__mu1.as_short[0] == __mu2.as_short[0]) ? -1 : 0;
 795   __res.as_short[1] = (__mu1.as_short[1] == __mu2.as_short[1]) ? -1 : 0;
 796   __res.as_short[2] = (__mu1.as_short[2] == __mu2.as_short[2]) ? -1 : 0;
 797   __res.as_short[3] = (__mu1.as_short[3] == __mu2.as_short[3]) ? -1 : 0;
 798
 799   return (__m64)__res.as_m64;
 800 #endif
 801 }
 802
 803 extern __inline __m64
 804     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 805     _m_pcmpeqw(__m64 __m1, __m64 __m2) {
 806   return _mm_cmpeq_pi16(__m1, __m2);
 807 }
 808
 809 extern __inline __m64
 810     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 811     _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) {
 812 #if _ARCH_PWR8
 813   __vector signed short __a, __b, __c;
 814
 815   __a = (__vector signed short)vec_splats(__m1);
 816   __b = (__vector signed short)vec_splats(__m2);
 817   __c = (__vector signed short)vec_cmpgt(__a, __b);
 818   return (__m64)((__vector long long)__c)[0];
 819 #else
 820   __m64_union __mu1, __mu2, __res;
 821
 822   __mu1.as_m64 = __m1;
 823   __mu2.as_m64 = __m2;
 824
 825   __res.as_short[0] = (__mu1.as_short[0] > __mu2.as_short[0]) ? -1 : 0;
 826   __res.as_short[1] = (__mu1.as_short[1] > __mu2.as_short[1]) ? -1 : 0;
 827   __res.as_short[2] = (__mu1.as_short[2] > __mu2.as_short[2]) ? -1 : 0;
 828   __res.as_short[3] = (__mu1.as_short[3] > __mu2.as_short[3]) ? -1 : 0;
 829
 830   return (__m64)__res.as_m64;
 831 #endif
 832 }
 833
 834 extern __inline __m64
 835     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 836     _m_pcmpgtw(__m64 __m1, __m64 __m2) {
 837   return _mm_cmpgt_pi16(__m1, __m2);
 838 }
 839
 840 /* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
 841    the test is true and zero if false.  */
 842 extern __inline __m64
 843     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 844     _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) {
 845 #if _ARCH_PWR9
 846   __vector signed int __a, __b, __c;
 847
 848   __a = (__vector signed int)vec_splats(__m1);
 849   __b = (__vector signed int)vec_splats(__m2);
 850   __c = (__vector signed int)vec_cmpeq(__a, __b);
 851   return (__m64)((__vector long long)__c)[0];
 852 #else
 853   __m64_union __mu1, __mu2, __res;
 854
 855   __mu1.as_m64 = __m1;
 856   __mu2.as_m64 = __m2;
 857
 858   __res.as_int[0] = (__mu1.as_int[0] == __mu2.as_int[0]) ? -1 : 0;
 859   __res.as_int[1] = (__mu1.as_int[1] == __mu2.as_int[1]) ? -1 : 0;
 860
 861   return (__m64)__res.as_m64;
 862 #endif
 863 }
 864
 865 extern __inline __m64
 866     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 867     _m_pcmpeqd(__m64 __m1, __m64 __m2) {
 868   return _mm_cmpeq_pi32(__m1, __m2);
 869 }
 870
 871 extern __inline __m64
 872     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 873     _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) {
 874 #if _ARCH_PWR9
 875   __vector signed int __a, __b, __c;
 876
 877   __a = (__vector signed int)vec_splats(__m1);
 878   __b = (__vector signed int)vec_splats(__m2);
 879   __c = (__vector signed int)vec_cmpgt(__a, __b);
 880   return (__m64)((__vector long long)__c)[0];
 881 #else
 882   __m64_union __mu1, __mu2, __res;
 883
 884   __mu1.as_m64 = __m1;
 885   __mu2.as_m64 = __m2;
 886
 887   __res.as_int[0] = (__mu1.as_int[0] > __mu2.as_int[0]) ? -1 : 0;
 888   __res.as_int[1] = (__mu1.as_int[1] > __mu2.as_int[1]) ? -1 : 0;
 889
 890   return (__m64)__res.as_m64;
 891 #endif
 892 }
 893
 894 extern __inline __m64
 895     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 896     _m_pcmpgtd(__m64 __m1, __m64 __m2) {
 897   return _mm_cmpgt_pi32(__m1, __m2);
 898 }
 899
 900 #if _ARCH_PWR8
 901 /* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
 902    saturated arithmetic.  */
 903 extern __inline __m64
 904     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 905     _mm_adds_pi8(__m64 __m1, __m64 __m2) {
 906   __vector signed char __a, __b, __c;
 907
 908   __a = (__vector signed char)vec_splats(__m1);
 909   __b = (__vector signed char)vec_splats(__m2);
 910   __c = vec_adds(__a, __b);
 911   return (__m64)((__vector long long)__c)[0];
 912 }
 913
 914 extern __inline __m64
 915     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 916     _m_paddsb(__m64 __m1, __m64 __m2) {
 917   return _mm_adds_pi8(__m1, __m2);
 918 }
 919 /* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
 920    saturated arithmetic.  */
 921 extern __inline __m64
 922     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 923     _mm_adds_pi16(__m64 __m1, __m64 __m2) {
 924   __vector signed short __a, __b, __c;
 925
 926   __a = (__vector signed short)vec_splats(__m1);
 927   __b = (__vector signed short)vec_splats(__m2);
 928   __c = vec_adds(__a, __b);
 929   return (__m64)((__vector long long)__c)[0];
 930 }
 931
 932 extern __inline __m64
 933     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 934     _m_paddsw(__m64 __m1, __m64 __m2) {
 935   return _mm_adds_pi16(__m1, __m2);
 936 }
 937 /* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
 938    saturated arithmetic.  */
 939 extern __inline __m64
 940     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 941     _mm_adds_pu8(__m64 __m1, __m64 __m2) {
 942   __vector unsigned char __a, __b, __c;
 943
 944   __a = (__vector unsigned char)vec_splats(__m1);
 945   __b = (__vector unsigned char)vec_splats(__m2);
 946   __c = vec_adds(__a, __b);
 947   return (__m64)((__vector long long)__c)[0];
 948 }
 949
 950 extern __inline __m64
 951     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 952     _m_paddusb(__m64 __m1, __m64 __m2) {
 953   return _mm_adds_pu8(__m1, __m2);
 954 }
 955
 956 /* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
 957    saturated arithmetic.  */
 958 extern __inline __m64
 959     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 960     _mm_adds_pu16(__m64 __m1, __m64 __m2) {
 961   __vector unsigned short __a, __b, __c;
 962
 963   __a = (__vector unsigned short)vec_splats(__m1);
 964   __b = (__vector unsigned short)vec_splats(__m2);
 965   __c = vec_adds(__a, __b);
 966   return (__m64)((__vector long long)__c)[0];
 967 }
 968
 969 extern __inline __m64
 970     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 971     _m_paddusw(__m64 __m1, __m64 __m2) {
 972   return _mm_adds_pu16(__m1, __m2);
 973 }
 974
 975 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
 976    saturating arithmetic.  */
 977 extern __inline __m64
 978     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 979     _mm_subs_pi8(__m64 __m1, __m64 __m2) {
 980   __vector signed char __a, __b, __c;
 981
 982   __a = (__vector signed char)vec_splats(__m1);
 983   __b = (__vector signed char)vec_splats(__m2);
 984   __c = vec_subs(__a, __b);
 985   return (__m64)((__vector long long)__c)[0];
 986 }
 987
 988 extern __inline __m64
 989     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 990     _m_psubsb(__m64 __m1, __m64 __m2) {
 991   return _mm_subs_pi8(__m1, __m2);
 992 }
 993
 994 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
 995    signed saturating arithmetic.  */
 996 extern __inline __m64
 997     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 998     _mm_subs_pi16(__m64 __m1, __m64 __m2) {
 999   __vector signed short __a, __b, __c;
1000
1001   __a = (__vector signed short)vec_splats(__m1);
1002   __b = (__vector signed short)vec_splats(__m2);
1003   __c = vec_subs(__a, __b);
1004   return (__m64)((__vector long long)__c)[0];
1005 }
1006
1007 extern __inline __m64
1008     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1009     _m_psubsw(__m64 __m1, __m64 __m2) {
1010   return _mm_subs_pi16(__m1, __m2);
1011 }
1012
1013 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
1014    unsigned saturating arithmetic.  */
1015 extern __inline __m64
1016     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1017     _mm_subs_pu8(__m64 __m1, __m64 __m2) {
1018   __vector unsigned char __a, __b, __c;
1019
1020   __a = (__vector unsigned char)vec_splats(__m1);
1021   __b = (__vector unsigned char)vec_splats(__m2);
1022   __c = vec_subs(__a, __b);
1023   return (__m64)((__vector long long)__c)[0];
1024 }
1025
1026 extern __inline __m64
1027     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1028     _m_psubusb(__m64 __m1, __m64 __m2) {
1029   return _mm_subs_pu8(__m1, __m2);
1030 }
1031
1032 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
1033    unsigned saturating arithmetic.  */
1034 extern __inline __m64
1035     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1036     _mm_subs_pu16(__m64 __m1, __m64 __m2) {
1037   __vector unsigned short __a, __b, __c;
1038
1039   __a = (__vector unsigned short)vec_splats(__m1);
1040   __b = (__vector unsigned short)vec_splats(__m2);
1041   __c = vec_subs(__a, __b);
1042   return (__m64)((__vector long long)__c)[0];
1043 }
1044
1045 extern __inline __m64
1046     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1047     _m_psubusw(__m64 __m1, __m64 __m2) {
1048   return _mm_subs_pu16(__m1, __m2);
1049 }
1050
1051 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
1052    four 32-bit intermediate results, which are then summed by pairs to
1053    produce two 32-bit results.  */
1054 extern __inline __m64
1055     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1056     _mm_madd_pi16(__m64 __m1, __m64 __m2) {
1057   __vector signed short __a, __b;
1058   __vector signed int __c;
1059   __vector signed int __zero = {0, 0, 0, 0};
1060
1061   __a = (__vector signed short)vec_splats(__m1);
1062   __b = (__vector signed short)vec_splats(__m2);
1063   __c = vec_vmsumshm(__a, __b, __zero);
1064   return (__m64)((__vector long long)__c)[0];
1065 }
1066
1067 extern __inline __m64
1068     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1069     _m_pmaddwd(__m64 __m1, __m64 __m2) {
1070   return _mm_madd_pi16(__m1, __m2);
1071 }
1072 /* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
1073    M2 and produce the high 16 bits of the 32-bit results.  */
1074 extern __inline __m64
1075     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1076     _mm_mulhi_pi16(__m64 __m1, __m64 __m2) {
1077   __vector signed short __a, __b;
1078   __vector signed short __c;
1079   __vector signed int __w0, __w1;
1080   __vector unsigned char __xform1 = {
1081 #ifdef __LITTLE_ENDIAN__
1082       0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
1083       0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1084 #else
1085       0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00,
1086       0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
1087 #endif
1088   };
1089
1090   __a = (__vector signed short)vec_splats(__m1);
1091   __b = (__vector signed short)vec_splats(__m2);
1092
1093   __w0 = vec_vmulesh(__a, __b);
1094   __w1 = vec_vmulosh(__a, __b);
1095   __c = (__vector signed short)vec_perm(__w0, __w1, __xform1);
1096
1097   return (__m64)((__vector long long)__c)[0];
1098 }
1099
1100 extern __inline __m64
1101     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1102     _m_pmulhw(__m64 __m1, __m64 __m2) {
1103   return _mm_mulhi_pi16(__m1, __m2);
1104 }
1105
1106 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
1107    the low 16 bits of the results.  */
1108 extern __inline __m64
1109     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1110     _mm_mullo_pi16(__m64 __m1, __m64 __m2) {
1111   __vector signed short __a, __b, __c;
1112
1113   __a = (__vector signed short)vec_splats(__m1);
1114   __b = (__vector signed short)vec_splats(__m2);
1115   __c = __a * __b;
1116   return (__m64)((__vector long long)__c)[0];
1117 }
1118
1119 extern __inline __m64
1120     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1121     _m_pmullw(__m64 __m1, __m64 __m2) {
1122   return _mm_mullo_pi16(__m1, __m2);
1123 }
1124
1125 /* Shift four 16-bit values in M left by COUNT.  */
1126 extern __inline __m64
1127     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1128     _mm_sll_pi16(__m64 __m, __m64 __count) {
1129   __vector signed short __r;
1130   __vector unsigned short __c;
1131
1132   if (__count <= 15) {
1133     __r = (__vector signed short)vec_splats(__m);
1134     __c = (__vector unsigned short)vec_splats((unsigned short)__count);
1135     __r = vec_sl(__r, (__vector unsigned short)__c);
1136     return (__m64)((__vector long long)__r)[0];
1137   } else
1138     return (0);
1139 }
1140
1141 extern __inline __m64
1142     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1143     _m_psllw(__m64 __m, __m64 __count) {
1144   return _mm_sll_pi16(__m, __count);
1145 }
1146
1147 extern __inline __m64
1148     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1149     _mm_slli_pi16(__m64 __m, int __count) {
1150   /* Promote int to long then invoke mm_sll_pi16.  */
1151   return _mm_sll_pi16(__m, __count);
1152 }
1153
1154 extern __inline __m64
1155     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1156     _m_psllwi(__m64 __m, int __count) {
1157   return _mm_slli_pi16(__m, __count);
1158 }
1159
1160 /* Shift two 32-bit values in M left by COUNT.  */
1161 extern __inline __m64
1162     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1163     _mm_sll_pi32(__m64 __m, __m64 __count) {
1164   __m64_union __res;
1165
1166   __res.as_m64 = __m;
1167
1168   __res.as_int[0] = __res.as_int[0] << __count;
1169   __res.as_int[1] = __res.as_int[1] << __count;
1170   return (__res.as_m64);
1171 }
1172
1173 extern __inline __m64
1174     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1175     _m_pslld(__m64 __m, __m64 __count) {
1176   return _mm_sll_pi32(__m, __count);
1177 }
1178
1179 extern __inline __m64
1180     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1181     _mm_slli_pi32(__m64 __m, int __count) {
1182   /* Promote int to long then invoke mm_sll_pi32.  */
1183   return _mm_sll_pi32(__m, __count);
1184 }
1185
1186 extern __inline __m64
1187     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1188     _m_pslldi(__m64 __m, int __count) {
1189   return _mm_slli_pi32(__m, __count);
1190 }
1191
1192 /* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
1193 extern __inline __m64
1194     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1195     _mm_sra_pi16(__m64 __m, __m64 __count) {
1196   __vector signed short __r;
1197   __vector unsigned short __c;
1198
1199   if (__count <= 15) {
1200     __r = (__vector signed short)vec_splats(__m);
1201     __c = (__vector unsigned short)vec_splats((unsigned short)__count);
1202     __r = vec_sra(__r, (__vector unsigned short)__c);
1203     return (__m64)((__vector long long)__r)[0];
1204   } else
1205     return (0);
1206 }
1207
1208 extern __inline __m64
1209     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1210     _m_psraw(__m64 __m, __m64 __count) {
1211   return _mm_sra_pi16(__m, __count);
1212 }
1213
1214 extern __inline __m64
1215     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1216     _mm_srai_pi16(__m64 __m, int __count) {
1217   /* Promote int to long then invoke mm_sra_pi32.  */
1218   return _mm_sra_pi16(__m, __count);
1219 }
1220
1221 extern __inline __m64
1222     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1223     _m_psrawi(__m64 __m, int __count) {
1224   return _mm_srai_pi16(__m, __count);
1225 }
1226
1227 /* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
1228 extern __inline __m64
1229     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1230     _mm_sra_pi32(__m64 __m, __m64 __count) {
1231   __m64_union __res;
1232
1233   __res.as_m64 = __m;
1234
1235   __res.as_int[0] = __res.as_int[0] >> __count;
1236   __res.as_int[1] = __res.as_int[1] >> __count;
1237   return (__res.as_m64);
1238 }
1239
1240 extern __inline __m64
1241     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1242     _m_psrad(__m64 __m, __m64 __count) {
1243   return _mm_sra_pi32(__m, __count);
1244 }
1245
1246 extern __inline __m64
1247     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1248     _mm_srai_pi32(__m64 __m, int __count) {
1249   /* Promote int to long then invoke mm_sra_pi32.  */
1250   return _mm_sra_pi32(__m, __count);
1251 }
1252
1253 extern __inline __m64
1254     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1255     _m_psradi(__m64 __m, int __count) {
1256   return _mm_srai_pi32(__m, __count);
1257 }
1258
1259 /* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
1260 extern __inline __m64
1261     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1262     _mm_srl_pi16(__m64 __m, __m64 __count) {
1263   __vector unsigned short __r;
1264   __vector unsigned short __c;
1265
1266   if (__count <= 15) {
1267     __r = (__vector unsigned short)vec_splats(__m);
1268     __c = (__vector unsigned short)vec_splats((unsigned short)__count);
1269     __r = vec_sr(__r, (__vector unsigned short)__c);
1270     return (__m64)((__vector long long)__r)[0];
1271   } else
1272     return (0);
1273 }
1274
1275 extern __inline __m64
1276     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1277     _m_psrlw(__m64 __m, __m64 __count) {
1278   return _mm_srl_pi16(__m, __count);
1279 }
1280
1281 extern __inline __m64
1282     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1283     _mm_srli_pi16(__m64 __m, int __count) {
1284   /* Promote int to long then invoke mm_sra_pi32.  */
1285   return _mm_srl_pi16(__m, __count);
1286 }
1287
1288 extern __inline __m64
1289     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1290     _m_psrlwi(__m64 __m, int __count) {
1291   return _mm_srli_pi16(__m, __count);
1292 }
1293
1294 /* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
1295 extern __inline __m64
1296     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1297     _mm_srl_pi32(__m64 __m, __m64 __count) {
1298   __m64_union __res;
1299
1300   __res.as_m64 = __m;
1301
1302   __res.as_int[0] = (unsigned int)__res.as_int[0] >> __count;
1303   __res.as_int[1] = (unsigned int)__res.as_int[1] >> __count;
1304   return (__res.as_m64);
1305 }
1306
1307 extern __inline __m64
1308     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1309     _m_psrld(__m64 __m, __m64 __count) {
1310   return _mm_srl_pi32(__m, __count);
1311 }
1312
1313 extern __inline __m64
1314     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1315     _mm_srli_pi32(__m64 __m, int __count) {
1316   /* Promote int to long then invoke mm_srl_pi32.  */
1317   return _mm_srl_pi32(__m, __count);
1318 }
1319
1320 extern __inline __m64
1321     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1322     _m_psrldi(__m64 __m, int __count) {
1323   return _mm_srli_pi32(__m, __count);
1324 }
1325 #endif /* _ARCH_PWR8 */
1326
1327 /* Creates a vector of two 32-bit values; I0 is least significant.  */
1328 extern __inline __m64
1329     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1330     _mm_set_pi32(int __i1, int __i0) {
1331   __m64_union __res;
1332
1333   __res.as_int[0] = __i0;
1334   __res.as_int[1] = __i1;
1335   return (__res.as_m64);
1336 }
1337
1338 /* Creates a vector of four 16-bit values; W0 is least significant.  */
1339 extern __inline __m64
1340     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1341     _mm_set_pi16(short __w3, short __w2, short __w1, short __w0) {
1342   __m64_union __res;
1343
1344   __res.as_short[0] = __w0;
1345   __res.as_short[1] = __w1;
1346   __res.as_short[2] = __w2;
1347   __res.as_short[3] = __w3;
1348   return (__res.as_m64);
1349 }
1350
1351 /* Creates a vector of eight 8-bit values; B0 is least significant.  */
1352 extern __inline __m64
1353     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1354     _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3,
1355                 char __b2, char __b1, char __b0) {
1356   __m64_union __res;
1357
1358   __res.as_char[0] = __b0;
1359   __res.as_char[1] = __b1;
1360   __res.as_char[2] = __b2;
1361   __res.as_char[3] = __b3;
1362   __res.as_char[4] = __b4;
1363   __res.as_char[5] = __b5;
1364   __res.as_char[6] = __b6;
1365   __res.as_char[7] = __b7;
1366   return (__res.as_m64);
1367 }
1368
1369 /* Similar, but with the arguments in reverse order.  */
1370 extern __inline __m64
1371     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1372     _mm_setr_pi32(int __i0, int __i1) {
1373   __m64_union __res;
1374
1375   __res.as_int[0] = __i0;
1376   __res.as_int[1] = __i1;
1377   return (__res.as_m64);
1378 }
1379
1380 extern __inline __m64
1381     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1382     _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) {
1383   return _mm_set_pi16(__w3, __w2, __w1, __w0);
1384 }
1385
1386 extern __inline __m64
1387     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1388     _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4,
1389                  char __b5, char __b6, char __b7) {
1390   return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
1391 }
1392
1393 /* Creates a vector of two 32-bit values, both elements containing I.  */
1394 extern __inline __m64
1395     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1396     _mm_set1_pi32(int __i) {
1397   __m64_union __res;
1398
1399   __res.as_int[0] = __i;
1400   __res.as_int[1] = __i;
1401   return (__res.as_m64);
1402 }
1403
1404 /* Creates a vector of four 16-bit values, all elements containing W.  */
1405 extern __inline __m64
1406     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1407     _mm_set1_pi16(short __w) {
1408 #if _ARCH_PWR9
1409   __vector signed short w;
1410
1411   w = (__vector signed short)vec_splats(__w);
1412   return (__m64)((__vector long long)w)[0];
1413 #else
1414   __m64_union __res;
1415
1416   __res.as_short[0] = __w;
1417   __res.as_short[1] = __w;
1418   __res.as_short[2] = __w;
1419   __res.as_short[3] = __w;
1420   return (__res.as_m64);
1421 #endif
1422 }
1423
1424 /* Creates a vector of eight 8-bit values, all elements containing B.  */
1425 extern __inline __m64
1426     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1427     _mm_set1_pi8(signed char __b) {
1428 #if _ARCH_PWR8
1429   __vector signed char __res;
1430
1431   __res = (__vector signed char)vec_splats(__b);
1432   return (__m64)((__vector long long)__res)[0];
1433 #else
1434   __m64_union __res;
1435
1436   __res.as_char[0] = __b;
1437   __res.as_char[1] = __b;
1438   __res.as_char[2] = __b;
1439   __res.as_char[3] = __b;
1440   __res.as_char[4] = __b;
1441   __res.as_char[5] = __b;
1442   __res.as_char[6] = __b;
1443   __res.as_char[7] = __b;
1444   return (__res.as_m64);
1445 #endif
1446 }
1447
1448 #else
1449 #include_next <mmintrin.h>
1450 #endif /* defined(__powerpc64__) &&                                            \
1451         *   (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
1452
1453 #endif /* _MMINTRIN_H_INCLUDED */