/*===---- smmintrin.h - Implementation of SSE4 intrinsics on PowerPC ------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.

   NOTE: This is NOT a complete implementation of the SSE4 intrinsics!  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64/powerpc64le.

   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.

   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.  */
#error                                                                         \
    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
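
/* Illustrative sketch of the rewriting suggested above (assuming GNU C
   vector extensions, which both GCC and Clang provide; the helper name is
   arbitrary): an element-wise 32-bit integer add needs no intrinsic at all.

     typedef int v4si_example __attribute__((vector_size(16)));

     static inline v4si_example add_epi32_example(v4si_example a,
                                                  v4si_example b) {
       return a + b;  // portable across x86, POWER, and other targets
     }
*/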

#ifndef SMMINTRIN_H_
#define SMMINTRIN_H_

#if defined(__ppc64__) &&                                                      \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>
#include <tmmintrin.h>

/* Rounding mode macros. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_ZERO 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_NEG_INF 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04

#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)

#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NO_EXC 0x08
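
/* Illustrative use of the rounding-control macros above: the composite
   macros simply OR a rounding direction with an exception-control flag, so
   these two calls request the same floor operation (assuming _mm_set_pd is
   available from the companion <emmintrin.h> wrapper):

     __m128d __v = _mm_set_pd(2.5, -1.5);
     __m128d __a = _mm_round_pd(__v, _MM_FROUND_FLOOR);
     __m128d __b =
         _mm_round_pd(__v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC);
*/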

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_pd(__m128d __A, int __rounding) {
  __v2df __r;
  union {
    double __fr;
    long long __fpscr;
  } __enables_save, __fpscr_save;

  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Save enabled exceptions, disable all exceptions,
       and preserve the rounding mode.  */
#ifdef _ARCH_PWR9
    __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
    __fpscr_save.__fr = __builtin_mffs();
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
    __fpscr_save.__fpscr &= ~0xf8;
    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
#endif
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));
  }

  switch (__rounding) {
  case _MM_FROUND_TO_NEAREST_INT:
    __fpscr_save.__fr = __builtin_mffsl();
    __attribute__((fallthrough));
  case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
    __builtin_set_fpscr_rn(0b00);
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));

    __r = vec_rint((__v2df)__A);

    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    __builtin_set_fpscr_rn(__fpscr_save.__fpscr);
    break;
  case _MM_FROUND_TO_NEG_INF:
  case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
    __r = vec_floor((__v2df)__A);
    break;
  case _MM_FROUND_TO_POS_INF:
  case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
    __r = vec_ceil((__v2df)__A);
    break;
  case _MM_FROUND_TO_ZERO:
  case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
    __r = vec_trunc((__v2df)__A);
    break;
  case _MM_FROUND_CUR_DIRECTION:
    __r = vec_rint((__v2df)__A);
    break;
  }
  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    /* Restore enabled exceptions.  */
    __fpscr_save.__fr = __builtin_mffsl();
    __fpscr_save.__fpscr |= __enables_save.__fpscr;
    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
  }
  return (__m128d)__r;
}
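
/* Illustrative use of _mm_round_pd above: passing _MM_FROUND_NO_EXC makes
   the helper save the FPSCR exception-enable bits, round with traps
   disabled, and restore the enables afterwards, mirroring the "suppress
   exceptions" behaviour of the x86 rounding immediate (example assumes
   _mm_set_pd from the companion <emmintrin.h> wrapper):

     __m128d __x = _mm_set_pd(3.7, -0.2);
     __m128d __n =
         _mm_round_pd(__x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
*/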

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_sd(__m128d __A, __m128d __B, int __rounding) {
  __B = _mm_round_pd(__B, __rounding);
  __v2df __r = {((__v2df)__B)[0], ((__v2df)__A)[1]};
  return (__m128d)__r;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_ps(__m128 __A, int __rounding) {
  __v4sf __r;
  union {
    double __fr;
    long long __fpscr;
  } __enables_save, __fpscr_save;

  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Save enabled exceptions, disable all exceptions,
       and preserve the rounding mode.  */
#ifdef _ARCH_PWR9
    __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
    __fpscr_save.__fr = __builtin_mffs();
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
    __fpscr_save.__fpscr &= ~0xf8;
    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
#endif
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));
  }

  switch (__rounding) {
  case _MM_FROUND_TO_NEAREST_INT:
    __fpscr_save.__fr = __builtin_mffsl();
    __attribute__((fallthrough));
  case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
    __builtin_set_fpscr_rn(0b00);
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));

    __r = vec_rint((__v4sf)__A);

    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    __builtin_set_fpscr_rn(__fpscr_save.__fpscr);
    break;
  case _MM_FROUND_TO_NEG_INF:
  case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
    __r = vec_floor((__v4sf)__A);
    break;
  case _MM_FROUND_TO_POS_INF:
  case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
    __r = vec_ceil((__v4sf)__A);
    break;
  case _MM_FROUND_TO_ZERO:
  case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
    __r = vec_trunc((__v4sf)__A);
    break;
  case _MM_FROUND_CUR_DIRECTION:
    __r = vec_rint((__v4sf)__A);
    break;
  }
  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    /* Restore enabled exceptions.  */
    __fpscr_save.__fr = __builtin_mffsl();
    __fpscr_save.__fpscr |= __enables_save.__fpscr;
    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
  }
  return (__m128)__r;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_ss(__m128 __A, __m128 __B, int __rounding) {
  __B = _mm_round_ps(__B, __rounding);
  __v4sf __r = (__v4sf)__A;
  __r[0] = ((__v4sf)__B)[0];
  return (__m128)__r;
}

#define _mm_ceil_pd(V) _mm_round_pd((V), _MM_FROUND_CEIL)
#define _mm_ceil_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_CEIL)

#define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR)
#define _mm_floor_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_FLOOR)

#define _mm_ceil_ps(V) _mm_round_ps((V), _MM_FROUND_CEIL)
#define _mm_ceil_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_CEIL)

#define _mm_floor_ps(V) _mm_round_ps((V), _MM_FROUND_FLOOR)
#define _mm_floor_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_FLOOR)

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi8(__m128i const __A, int const __D, int const __N) {
  __v16qi __result = (__v16qi)__A;

  __result[__N & 0xf] = __D;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi32(__m128i const __A, int const __D, int const __N) {
  __v4si __result = (__v4si)__A;

  __result[__N & 3] = __D;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi64(__m128i const __A, long long const __D, int const __N) {
  __v2di __result = (__v2di)__A;

  __result[__N & 1] = __D;

  return (__m128i)__result;
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi8(__m128i __X, const int __N) {
  return (unsigned char)((__v16qi)__X)[__N & 15];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi32(__m128i __X, const int __N) {
  return ((__v4si)__X)[__N & 3];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi64(__m128i __X, const int __N) {
  return ((__v2di)__X)[__N & 1];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_ps(__m128 __X, const int __N) {
  return ((__v4si)__X)[__N & 3];
}
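
/* Illustrative use of _mm_extract_ps above: the intrinsic returns the raw
   32-bit pattern of the selected float as an int, so recovering the float
   value takes a bit-cast (example assumes _mm_set_ps from the companion
   <xmmintrin.h> wrapper):

     __m128 __v = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
     int __bits = _mm_extract_ps(__v, 2);  // bit pattern of 3.0f
     float __f;
     __builtin_memcpy(&__f, &__bits, sizeof(__f));
*/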

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) {
  __v16qi __charmask = vec_splats((signed char)__imm8);
  __charmask = vec_gb(__charmask);
  __v8hu __shortmask = (__v8hu)vec_unpackh(__charmask);
#ifdef __BIG_ENDIAN__
  __shortmask = vec_reve(__shortmask);
#endif
  return (__m128i)vec_sel((__v8hu)__A, (__v8hu)__B, __shortmask);
}
#endif
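
/* Illustrative use of _mm_blend_epi16 above: bit i of the immediate selects
   16-bit lane i of the second operand, which the vec_gb/vec_unpackh sequence
   expands into the per-lane select mask consumed by vec_sel (example assumes
   _mm_set1_epi16 from the companion <emmintrin.h> wrapper):

     __m128i __a = _mm_set1_epi16(0);
     __m128i __b = _mm_set1_epi16(-1);
     __m128i __r = _mm_blend_epi16(__a, __b, 0x05); // lanes 0 and 2 from __b
*/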

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_epi8(__m128i __A, __m128i __B, __m128i __mask) {
#ifdef _ARCH_PWR10
  return (__m128i)vec_blendv((__v16qi)__A, (__v16qi)__B, (__v16qu)__mask);
#else
  const __v16qu __seven = vec_splats((unsigned char)0x07);
  __v16qu __lmask = vec_sra((__v16qu)__mask, __seven);
  return (__m128i)vec_sel((__v16qi)__A, (__v16qi)__B, __lmask);
#endif
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_ps(__m128 __A, __m128 __B, const int __imm8) {
  __v16qu __pcv[] = {
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
      {16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
      {0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
      {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
      {0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
  };
  __v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
  return (__m128)__r;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_ps(__m128 __A, __m128 __B, __m128 __mask) {
#ifdef _ARCH_PWR10
  return (__m128)vec_blendv((__v4sf)__A, (__v4sf)__B, (__v4su)__mask);
#else
  const __v4si __zero = {0};
  const __vector __bool int __boolmask = vec_cmplt((__v4si)__mask, __zero);
  return (__m128)vec_sel((__v4su)__A, (__v4su)__B, (__v4su)__boolmask);
#endif
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_pd(__m128d __A, __m128d __B, const int __imm8) {
  __v16qu __pcv[] = {
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}};
  __v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
  return (__m128d)__r;
}

#ifdef _ARCH_PWR8
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_pd(__m128d __A, __m128d __B, __m128d __mask) {
#ifdef _ARCH_PWR10
  return (__m128d)vec_blendv((__v2df)__A, (__v2df)__B, (__v2du)__mask);
#else
  const __v2di __zero = {0};
  const __vector __bool long long __boolmask =
      vec_cmplt((__v2di)__mask, __zero);
  return (__m128d)vec_sel((__v2du)__A, (__v2du)__B, (__v2du)__boolmask);
#endif
}
#endif

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testz_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags.  */
  const __v16qu __zero = {0};
  return vec_all_eq(vec_and((__v16qu)__A, (__v16qu)__B), __zero);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testc_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags.  */
  const __v16qu __zero = {0};
  const __v16qu __notA = vec_nor((__v16qu)__A, (__v16qu)__A);
  return vec_all_eq(vec_and((__v16qu)__notA, (__v16qu)__B), __zero);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testnzc_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags.  */
  return _mm_testz_si128(__A, __B) == 0 && _mm_testc_si128(__A, __B) == 0;
}

#define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V))

#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))

#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
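
/* Illustrative use of the test intrinsics above: only the integer results
   are reproduced here; unlike x86 PTEST, no condition flags are set (example
   assumes _mm_set1_epi32 from the companion <emmintrin.h> wrapper):

     __m128i __mask = _mm_set1_epi32(0x0000ffff);
     __m128i __val = _mm_set1_epi32(0x00ff0000);
     int __disjoint = _mm_testz_si128(__mask, __val); // 1: (__mask & __val) == 0
     int __covered = _mm_testc_si128(__mask, __val);  // 0: __val has bits
                                                      //    outside __mask
*/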

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi64(__m128i __X, __m128i __Y) {
  return (__m128i)vec_cmpeq((__v2di)__X, (__v2di)__Y);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epi8(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v16qi)__X, (__v16qi)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epu16(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v8hu)__X, (__v8hu)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v4si)__X, (__v4si)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epu32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v4su)__X, (__v4su)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epi8(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v16qi)__X, (__v16qi)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epu16(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v8hu)__X, (__v8hu)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v4si)__X, (__v4si)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epu32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v4su)__X, (__v4su)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mullo_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_mul((__v4su)__X, (__v4su)__Y);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_mule((__v4si)__X, (__v4si)__Y);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi16(__m128i __A) {
  return (__m128i)vec_unpackh((__v16qi)__A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi32(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v16qi)__A);
  return (__m128i)vec_unpackh((__v8hi)__A);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi64(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v16qi)__A);
  __A = (__m128i)vec_unpackh((__v8hi)__A);
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi16_epi32(__m128i __A) {
  return (__m128i)vec_unpackh((__v8hi)__A);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi16_epi64(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v8hi)__A);
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi32_epi64(__m128i __A) {
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi16(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
#else /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi32(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
#else /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
  __A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi64(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
  __A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
#else /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
  __A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
  __A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu16_epi32(__m128i __A) {
  const __v8hu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
#else /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu16_epi64(__m128i __A) {
  const __v8hu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
#else /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
  __A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu32_epi64(__m128i __A) {
  const __v4su __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v4su)__A, __zero);
#else /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

/* Return horizontal packed word minimum and its index in bits [15:0]
   and bits [18:16] respectively.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_minpos_epu16(__m128i __A) {
  union __u {
    __m128i __m;
    __v8hu __uh;
  };
  union __u __u = {.__m = __A}, __r = {.__m = {0}};
  unsigned short __ridx = 0;
  unsigned short __rmin = __u.__uh[__ridx];
  unsigned long __i;
  for (__i = 1; __i < 8; __i++) {
    if (__u.__uh[__i] < __rmin) {
      __rmin = __u.__uh[__i];
      __ridx = __i;
    }
  }
  __r.__uh[0] = __rmin;
  __r.__uh[1] = __ridx;
  return __r.__m;
}
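
/* Illustrative use of _mm_minpos_epu16 above: lane 0 of the result holds the
   minimum 16-bit value and lane 1 its index; the remaining lanes are zero
   (example assumes _mm_set_epi16 and _mm_extract_epi16 from the companion
   <emmintrin.h> wrapper):

     __m128i __v = _mm_set_epi16(9, 8, 7, 3, 6, 5, 4, 10);
     __m128i __m = _mm_minpos_epu16(__v);
     int __min = _mm_extract_epi16(__m, 0); // 3
     int __idx = _mm_extract_epi16(__m, 1); // 4
*/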

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packus_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_packsu((__v4si)__X, (__v4si)__Y);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi64(__m128i __X, __m128i __Y) {
  return (__m128i)vec_cmpgt((__v2di)__X, (__v2di)__Y);
}
#endif

#else
#include_next <smmintrin.h>
#endif /* defined(__ppc64__) &&
        * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* SMMINTRIN_H_ */