clang/lib/Headers/tmmintrin.h

   1 /*===---- tmmintrin.h - SSSE3 intrinsics -----------------------------------===
   2  *
   3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4  * See https://llvm.org/LICENSE.txt for license information.
   5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6  *
   7  *===-----------------------------------------------------------------------===
   8  */
   9
  10 #ifndef __TMMINTRIN_H
  11 #define __TMMINTRIN_H
  12
  13 #if !defined(__i386__) && !defined(__x86_64__)
  14 #error "This header is only meant to be used on x86 and x64 architecture"
  15 #endif
  16
  17 #include <pmmintrin.h>
  18
  19 /* Default attributes for the functions in this file; the _MMX variant also enables "mmx" and is used by the __m64 intrinsics. */
  20 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("ssse3"), __min_vector_width__(64)))
  21 #define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,ssse3"), __min_vector_width__(64)))
  22
  23 /// Computes the absolute value of each packed 8-bit signed integer in
  24 ///    the source operand and returns the results as packed 8-bit unsigned
  25 ///    integers.
  26 ///
  27 /// \headerfile <x86intrin.h>
  28 ///
  29 /// This intrinsic corresponds to the \c PABSB instruction.
  30 ///
  31 /// \param __a
  32 ///    A 64-bit vector of [8 x i8].
  33 /// \returns A 64-bit vector of [8 x i8] containing the absolute value of
  34 ///    each element of \a __a.
  35 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
  36 _mm_abs_pi8(__m64 __a)
  37 {
  38     return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
  39 }
  40
  41 /// Computes the absolute value of each packed 8-bit signed integer in
  42 ///    the source operand and returns the results as packed 8-bit unsigned
  43 ///    integers.
  44 ///
  45 /// \headerfile <x86intrin.h>
  46 ///
  47 /// This intrinsic corresponds to the \c VPABSB instruction.
  48 ///
  49 /// \param __a
  50 ///    A 128-bit vector of [16 x i8].
  51 /// \returns A 128-bit vector of [16 x i8] containing the absolute value of
  52 ///    each element of \a __a.
  53 static __inline__ __m128i __DEFAULT_FN_ATTRS
  54 _mm_abs_epi8(__m128i __a)
  55 {
  56     return (__m128i)__builtin_elementwise_abs((__v16qs)__a);
  57 }
  58
  59 /// Computes the absolute value of each packed 16-bit signed integer in
  60 ///    the source operand and returns the results as packed 16-bit unsigned
  61 ///    integers.
  62 ///
  63 /// \headerfile <x86intrin.h>
  64 ///
  65 /// This intrinsic corresponds to the \c PABSW instruction.
  66 ///
  67 /// \param __a
  68 ///    A 64-bit vector of [4 x i16].
  69 /// \returns A 64-bit vector of [4 x i16] containing the absolute value of
  70 ///    each element of \a __a.
  71 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
  72 _mm_abs_pi16(__m64 __a)
  73 {
  74     return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
  75 }
  76
  77 /// Computes the absolute value of each packed 16-bit signed integer in
  78 ///    the source operand and returns the results as packed 16-bit unsigned
  79 ///    integers.
  80 ///
  81 /// \headerfile <x86intrin.h>
  82 ///
  83 /// This intrinsic corresponds to the \c VPABSW instruction.
  84 ///
  85 /// \param __a
  86 ///    A 128-bit vector of [8 x i16].
  87 /// \returns A 128-bit vector of [8 x i16] containing the absolute value of
  88 ///    each element of \a __a.
  89 static __inline__ __m128i __DEFAULT_FN_ATTRS
  90 _mm_abs_epi16(__m128i __a)
  91 {
  92     return (__m128i)__builtin_elementwise_abs((__v8hi)__a);
  93 }
  94
  95 /// Computes the absolute value of each packed 32-bit signed integer in
  96 ///    the source operand and returns the results as packed 32-bit unsigned
  97 ///    integers.
  98 ///
  99 /// \headerfile <x86intrin.h>
 100 ///
 101 /// This intrinsic corresponds to the \c PABSD instruction.
 102 ///
 103 /// \param __a
 104 ///    A 64-bit vector of [2 x i32].
 105 /// \returns A 64-bit vector of [2 x i32] containing the absolute value of
 106 ///    each element of \a __a.
 107 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 108 _mm_abs_pi32(__m64 __a)
 109 {
 110     return (__m64)__builtin_ia32_pabsd((__v2si)__a);
 111 }
 112
 113 /// Computes the absolute value of each packed 32-bit signed integer in
 114 ///    the source operand and returns the results as packed 32-bit unsigned
 115 ///    integers.
 116 ///
 117 /// \headerfile <x86intrin.h>
 118 ///
 119 /// This intrinsic corresponds to the \c VPABSD instruction.
 120 ///
 121 /// \param __a
 122 ///    A 128-bit vector of [4 x i32].
 123 /// \returns A 128-bit vector of [4 x i32] containing the absolute value of
 124 ///    each element of \a __a.
 125 static __inline__ __m128i __DEFAULT_FN_ATTRS
 126 _mm_abs_epi32(__m128i __a)
 127 {
 128     return (__m128i)__builtin_elementwise_abs((__v4si)__a);
 129 }
 130
 131 /// Concatenates the two 128-bit integer vector operands, right-shifts
 132 ///    the concatenated value by the number of bytes specified in the
 133 ///    immediate operand, and returns the result as a 128-bit vector.
 134 ///
 135 /// \headerfile <x86intrin.h>
 136 ///
 137 /// \code
 138 /// __m128i _mm_alignr_epi8(__m128i a, __m128i b, const int n);
 139 /// \endcode
 140 ///
 141 /// This intrinsic corresponds to the \c PALIGNR instruction.
 142 ///
 143 /// \param a
 144 ///    A 128-bit vector of [16 x i8] containing one of the source operands.
 145 /// \param b
 146 ///    A 128-bit vector of [16 x i8] containing one of the source operands.
 147 /// \param n
 148 ///    An immediate (constant) operand specifying the right-shift count in bytes.
 149 /// \returns A 128-bit integer vector containing the concatenated right-shifted
 150 ///    value.
 151 #define _mm_alignr_epi8(a, b, n) \
 152   ((__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
 153                                       (__v16qi)(__m128i)(b), (n)))
 154
 155 /// Concatenates the two 64-bit integer vector operands, right-shifts the
 156 ///    concatenated value by the number of bytes in the immediate operand.
 157 ///
 158 /// \headerfile <x86intrin.h>
 159 ///
 160 /// \code
 161 /// __m64 _mm_alignr_pi8(__m64 a, __m64 b, const int n);
 162 /// \endcode
 163 ///
 164 /// This intrinsic corresponds to the \c PALIGNR instruction.
 165 ///
 166 /// \param a
 167 ///    A 64-bit vector of [8 x i8] containing one of the source operands.
 168 /// \param b
 169 ///    A 64-bit vector of [8 x i8] containing one of the source operands.
 170 /// \param n
 171 ///    An immediate (constant) operand specifying the right-shift count in bytes.
 172 /// \returns A 64-bit integer vector containing the concatenated right-shifted
 173 ///    value.
 174 #define _mm_alignr_pi8(a, b, n) \
 175   ((__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)))
 176
 177 /// Horizontally adds the adjacent pairs of 16-bit values contained in two
 178 ///    packed 128-bit vectors of [8 x i16].
 179 ///
 180 /// \headerfile <x86intrin.h>
 181 ///
 182 /// This intrinsic corresponds to the \c VPHADDW instruction.
 183 ///
 184 /// \param __a
 185 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
 186 ///    sums of its adjacent element pairs are stored in the lower half of the
 187 ///    result.
 188 /// \param __b
 189 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
 190 ///    sums of its adjacent element pairs are stored in the upper half of the
 191 ///    result.
 192 /// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of
 193 ///    both operands.
 194 static __inline__ __m128i __DEFAULT_FN_ATTRS
 195 _mm_hadd_epi16(__m128i __a, __m128i __b)
 196 {
 197     return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
 198 }
 199
 200 /// Horizontally adds the adjacent pairs of 32-bit values contained in two
 201 ///    packed 128-bit vectors of [4 x i32].
 202 ///
 203 /// \headerfile <x86intrin.h>
 204 ///
 205 /// This intrinsic corresponds to the \c VPHADDD instruction.
 206 ///
 207 /// \param __a
 208 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
 209 ///    sums of its adjacent element pairs are stored in the lower half of the
 210 ///    result.
 211 /// \param __b
 212 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
 213 ///    sums of its adjacent element pairs are stored in the upper half of the
 214 ///    result.
 215 /// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of
 216 ///    both operands.
 217 static __inline__ __m128i __DEFAULT_FN_ATTRS
 218 _mm_hadd_epi32(__m128i __a, __m128i __b)
 219 {
 220     return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
 221 }
 222
 223 /// Horizontally adds the adjacent pairs of 16-bit values contained in two
 224 ///    packed 64-bit vectors of [4 x i16].
 225 ///
 226 /// \headerfile <x86intrin.h>
 227 ///
 228 /// This intrinsic corresponds to the \c PHADDW instruction.
 229 ///
 230 /// \param __a
 231 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
 232 ///    sums of its adjacent element pairs are stored in the lower half of the
 233 ///    result.
 234 /// \param __b
 235 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
 236 ///    sums of its adjacent element pairs are stored in the upper half of the
 237 ///    result.
 238 /// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
 239 ///    operands.
 240 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 241 _mm_hadd_pi16(__m64 __a, __m64 __b)
 242 {
 243     return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
 244 }
 245
 246 /// Horizontally adds the adjacent pairs of 32-bit values contained in two
 247 ///    packed 64-bit vectors of [2 x i32].
 248 ///
 249 /// \headerfile <x86intrin.h>
 250 ///
 251 /// This intrinsic corresponds to the \c PHADDD instruction.
 252 ///
 253 /// \param __a
 254 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
 255 ///    sum of its two elements is stored in the lower half of the
 256 ///    result.
 257 /// \param __b
 258 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
 259 ///    sum of its two elements is stored in the upper half of the
 260 ///    result.
 261 /// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
 262 ///    operands.
 263 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 264 _mm_hadd_pi32(__m64 __a, __m64 __b)
 265 {
 266     return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
 267 }
 268
 269 /// Horizontally adds the adjacent pairs of 16-bit values contained in two
 270 ///    packed 128-bit vectors of [8 x i16], using signed saturation: positive
 271 ///    sums greater than 0x7FFF are saturated to 0x7FFF, and negative sums
 272 ///    less than 0x8000 are saturated to 0x8000.
 273 ///
 274 /// \headerfile <x86intrin.h>
 275 ///
 276 /// This intrinsic corresponds to the \c VPHADDSW instruction.
 277 ///
 278 /// \param __a
 279 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
 280 ///    saturated sums of its adjacent element pairs are stored in the lower
 281 ///    half of the result.
 282 /// \param __b
 283 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
 284 ///    saturated sums of its adjacent element pairs are stored in the upper
 285 ///    half of the result.
 286 /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
 287 ///    sums of both operands.
 288 static __inline__ __m128i __DEFAULT_FN_ATTRS
 289 _mm_hadds_epi16(__m128i __a, __m128i __b)
 290 {
 291     return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
 292 }
 293
 294 /// Horizontally adds the adjacent pairs of 16-bit values contained in two
 295 ///    packed 64-bit vectors of [4 x i16], using signed saturation: positive
 296 ///    sums greater than 0x7FFF are saturated to 0x7FFF, and negative sums
 297 ///    less than 0x8000 are saturated to 0x8000.
 298 ///
 299 /// \headerfile <x86intrin.h>
 300 ///
 301 /// This intrinsic corresponds to the \c PHADDSW instruction.
 302 ///
 303 /// \param __a
 304 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
 305 ///    saturated sums of its adjacent element pairs are stored in the lower
 306 ///    half of the result.
 307 /// \param __b
 308 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
 309 ///    saturated sums of its adjacent element pairs are stored in the upper
 310 ///    half of the result.
 311 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
 312 ///    sums of both operands.
 313 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 314 _mm_hadds_pi16(__m64 __a, __m64 __b)
 315 {
 316     return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
 317 }
 318
 319 /// Horizontally subtracts the adjacent pairs of 16-bit values contained in
 320 ///    two packed 128-bit vectors of [8 x i16].
 321 ///
 322 /// \headerfile <x86intrin.h>
 323 ///
 324 /// This intrinsic corresponds to the \c VPHSUBW instruction.
 325 ///
 326 /// \param __a
 327 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
 328 ///    differences of its adjacent element pairs are stored in the lower half
 329 ///    of the result.
 330 /// \param __b
 331 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
 332 ///    differences of its adjacent element pairs are stored in the upper half
 333 ///    of the result.
 334 /// \returns A 128-bit vector of [8 x i16] containing the horizontal differences
 335 ///    of both operands.
 336 static __inline__ __m128i __DEFAULT_FN_ATTRS
 337 _mm_hsub_epi16(__m128i __a, __m128i __b)
 338 {
 339     return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
 340 }
 341
 342 /// Horizontally subtracts the adjacent pairs of 32-bit values contained in
 343 ///    two packed 128-bit vectors of [4 x i32].
 344 ///
 345 /// \headerfile <x86intrin.h>
 346 ///
 347 /// This intrinsic corresponds to the \c VPHSUBD instruction.
 348 ///
 349 /// \param __a
 350 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
 351 ///    differences of its adjacent element pairs are stored in the lower half
 352 ///    of the result.
 353 /// \param __b
 354 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
 355 ///    differences of its adjacent element pairs are stored in the upper half
 356 ///    of the result.
 357 /// \returns A 128-bit vector of [4 x i32] containing the horizontal differences
 358 ///    of both operands.
 359 static __inline__ __m128i __DEFAULT_FN_ATTRS
 360 _mm_hsub_epi32(__m128i __a, __m128i __b)
 361 {
 362     return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
 363 }
 364
 365 /// Horizontally subtracts the adjacent pairs of 16-bit values contained in
 366 ///    two packed 64-bit vectors of [4 x i16].
 367 ///
 368 /// \headerfile <x86intrin.h>
 369 ///
 370 /// This intrinsic corresponds to the \c PHSUBW instruction.
 371 ///
 372 /// \param __a
 373 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
 374 ///    differences of its adjacent element pairs are stored in the lower half
 375 ///    of the result.
 376 /// \param __b
 377 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
 378 ///    differences of its adjacent element pairs are stored in the upper half
 379 ///    of the result.
 380 /// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
 381 ///    of both operands.
 382 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 383 _mm_hsub_pi16(__m64 __a, __m64 __b)
 384 {
 385     return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
 386 }
 387
 388 /// Horizontally subtracts the adjacent pairs of 32-bit values contained in
 389 ///    two packed 64-bit vectors of [2 x i32].
 390 ///
 391 /// \headerfile <x86intrin.h>
 392 ///
 393 /// This intrinsic corresponds to the \c PHSUBD instruction.
 394 ///
 395 /// \param __a
 396 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
 397 ///    difference of its two elements is stored in the lower half of the
 398 ///    result.
 399 /// \param __b
 400 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
 401 ///    difference of its two elements is stored in the upper half of the
 402 ///    result.
 403 /// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
 404 ///    of both operands.
 405 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 406 _mm_hsub_pi32(__m64 __a, __m64 __b)
 407 {
 408     return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
 409 }
 410
 411 /// Horizontally subtracts the adjacent pairs of 16-bit values contained in
 412 ///    two packed 128-bit vectors of [8 x i16], using signed saturation:
 413 ///    positive differences greater than 0x7FFF are saturated to 0x7FFF, and
 414 ///    negative differences less than 0x8000 are saturated to 0x8000.
 415 ///
 416 /// \headerfile <x86intrin.h>
 417 ///
 418 /// This intrinsic corresponds to the \c VPHSUBSW instruction.
 419 ///
 420 /// \param __a
 421 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
 422 ///    saturated differences of its adjacent element pairs are stored in the
 423 ///    lower half of the result.
 424 /// \param __b
 425 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
 426 ///    saturated differences of its adjacent element pairs are stored in the
 427 ///    upper half of the result.
 428 /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
 429 ///    differences of both operands.
 430 static __inline__ __m128i __DEFAULT_FN_ATTRS
 431 _mm_hsubs_epi16(__m128i __a, __m128i __b)
 432 {
 433     return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
 434 }
 435
 436 /// Horizontally subtracts the adjacent pairs of 16-bit values contained in
 437 ///    two packed 64-bit vectors of [4 x i16], using signed saturation:
 438 ///    positive differences greater than 0x7FFF are saturated to 0x7FFF, and
 439 ///    negative differences less than 0x8000 are saturated to 0x8000.
 440 ///
 441 /// \headerfile <x86intrin.h>
 442 ///
 443 /// This intrinsic corresponds to the \c PHSUBSW instruction.
 444 ///
 445 /// \param __a
 446 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
 447 ///    saturated differences of its adjacent element pairs are stored in the
 448 ///    lower half of the result.
 449 /// \param __b
 450 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
 451 ///    saturated differences of its adjacent element pairs are stored in the
 452 ///    upper half of the result.
 453 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
 454 ///    differences of both operands.
 455 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 456 _mm_hsubs_pi16(__m64 __a, __m64 __b)
 457 {
 458     return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
 459 }
 460
 461 /// Multiplies corresponding pairs of packed 8-bit unsigned integer
 462 ///    values contained in the first source operand and packed 8-bit signed
 463 ///    integer values contained in the second source operand, adds pairs of
 464 ///    contiguous products with signed saturation, and writes the 16-bit sums to
 465 ///    the corresponding bits in the destination.
 466 ///
 467 ///    For example, bits [7:0] of both operands are multiplied, bits [15:8] of
 468 ///    both operands are multiplied, and the sum of both results is written to
 469 ///    bits [15:0] of the destination.
 470 ///
 471 /// \headerfile <x86intrin.h>
 472 ///
 473 /// This intrinsic corresponds to the \c VPMADDUBSW instruction.
 474 ///
 475 /// \param __a
 476 ///    A 128-bit integer vector containing the first source operand (unsigned).
 477 /// \param __b
 478 ///    A 128-bit integer vector containing the second source operand (signed).
 479 /// \returns A 128-bit integer vector containing the sums of products of both
 480 ///    operands: \n
 481 ///    \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
 482 ///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
 483 ///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
 484 ///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) \n
 485 ///    \a R4 := (\a __a8 * \a __b8) + (\a __a9 * \a __b9) \n
 486 ///    \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n
 487 ///    \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n
 488 ///    \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15)
 489 static __inline__ __m128i __DEFAULT_FN_ATTRS
 490 _mm_maddubs_epi16(__m128i __a, __m128i __b)
 491 {
 492     return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
 493 }
 494
 495 /// Multiplies corresponding pairs of packed 8-bit unsigned integer
 496 ///    values contained in the first source operand and packed 8-bit signed
 497 ///    integer values contained in the second source operand, adds pairs of
 498 ///    contiguous products with signed saturation, and writes the 16-bit sums to
 499 ///    the corresponding bits in the destination.
 500 ///
 501 ///    For example, bits [7:0] of both operands are multiplied, bits [15:8] of
 502 ///    both operands are multiplied, and the sum of both results is written to
 503 ///    bits [15:0] of the destination.
 504 ///
 505 /// \headerfile <x86intrin.h>
 506 ///
 507 /// This intrinsic corresponds to the \c PMADDUBSW instruction.
 508 ///
 509 /// \param __a
 510 ///    A 64-bit integer vector containing the first source operand (unsigned).
 511 /// \param __b
 512 ///    A 64-bit integer vector containing the second source operand (signed).
 513 /// \returns A 64-bit integer vector containing the sums of products of both
 514 ///    operands: \n
 515 ///    \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
 516 ///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
 517 ///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
 518 ///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
 519 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 520 _mm_maddubs_pi16(__m64 __a, __m64 __b)
 521 {
 522     return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
 523 }
 524
 525 /// Multiplies corresponding packed 16-bit signed integer values, truncates
 526 ///    each 32-bit product to its 18 most significant bits by right-shifting,
 527 ///    rounds the truncated value by adding 1, and writes bits [16:1] to the destination.
 528 ///
 529 /// \headerfile <x86intrin.h>
 530 ///
 531 /// This intrinsic corresponds to the \c VPMULHRSW instruction.
 532 ///
 533 /// \param __a
 534 ///    A 128-bit vector of [8 x i16] containing one of the source operands.
 535 /// \param __b
 536 ///    A 128-bit vector of [8 x i16] containing one of the source operands.
 537 /// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
 538 ///    products of both operands.
 539 static __inline__ __m128i __DEFAULT_FN_ATTRS
 540 _mm_mulhrs_epi16(__m128i __a, __m128i __b)
 541 {
 542     return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
 543 }
 544
 545 /// Multiplies corresponding packed 16-bit signed integer values, truncates
 546 ///    each 32-bit product to its 18 most significant bits by right-shifting,
 547 ///    rounds the truncated value by adding 1, and writes bits [16:1] to the destination.
 548 ///
 549 /// \headerfile <x86intrin.h>
 550 ///
 551 /// This intrinsic corresponds to the \c PMULHRSW instruction.
 552 ///
 553 /// \param __a
 554 ///    A 64-bit vector of [4 x i16] containing one of the source operands.
 555 /// \param __b
 556 ///    A 64-bit vector of [4 x i16] containing one of the source operands.
 557 /// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
 558 ///    products of both operands.
 559 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 560 _mm_mulhrs_pi16(__m64 __a, __m64 __b)
 561 {
 562     return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
 563 }
 564
 565 /// Copies the 8-bit integers from a 128-bit integer vector to the
 566 ///    destination or clears 8-bit values in the destination, as specified by
 567 ///    the second source operand.
 568 ///
 569 /// \headerfile <x86intrin.h>
 570 ///
 571 /// This intrinsic corresponds to the \c VPSHUFB instruction.
 572 ///
 573 /// \param __a
 574 ///    A 128-bit integer vector containing the values to be copied.
 575 /// \param __b
 576 ///    A 128-bit integer vector containing control bytes corresponding to
 577 ///    positions in the destination: \n
 578 ///    Bit 7: \n
 579 ///    1: Clear the corresponding byte in the destination. \n
 580 ///    0: Copy the selected source byte to the corresponding byte in the
 581 ///    destination. \n
 582 ///    Bits [6:4] Reserved.  \n
 583 ///    Bits [3:0] select the source byte to be copied.
 584 /// \returns A 128-bit integer vector containing the copied or cleared values.
 585 static __inline__ __m128i __DEFAULT_FN_ATTRS
 586 _mm_shuffle_epi8(__m128i __a, __m128i __b)
 587 {
 588     return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
 589 }
 590
 591 /// Copies the 8-bit integers from a 64-bit integer vector to the
 592 ///    destination or clears 8-bit values in the destination, as specified by
 593 ///    the second source operand.
 594 ///
 595 /// \headerfile <x86intrin.h>
 596 ///
 597 /// This intrinsic corresponds to the \c PSHUFB instruction.
 598 ///
 599 /// \param __a
 600 ///    A 64-bit integer vector containing the values to be copied.
 601 /// \param __b
 602 ///    A 64-bit integer vector containing control bytes corresponding to
 603 ///    positions in the destination: \n
 604 ///    Bit 7: \n
 605 ///    1: Clear the corresponding byte in the destination. \n
 606 ///    0: Copy the selected source byte to the corresponding byte in the
 607 ///    destination. \n
 608 ///    Bits [2:0] select the source byte to be copied.
 609 /// \returns A 64-bit integer vector containing the copied or cleared values.
 610 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 611 _mm_shuffle_pi8(__m64 __a, __m64 __b)
 612 {
 613     return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
 614 }
 615
 616 /// For each 8-bit integer in the first source operand, perform one of
 617 ///    the following actions as specified by the second source operand.
 618 ///
 619 ///    If the byte in the second source is negative, calculate the two's
 620 ///    complement of the corresponding byte in the first source, and write that
 621 ///    value to the destination. If the byte in the second source is positive,
 622 ///    copy the corresponding byte from the first source to the destination. If
 623 ///    the byte in the second source is zero, clear the corresponding byte in
 624 ///    the destination.
 625 ///
 626 /// \headerfile <x86intrin.h>
 627 ///
 628 /// This intrinsic corresponds to the \c VPSIGNB instruction.
 629 ///
 630 /// \param __a
 631 ///    A 128-bit integer vector containing the bytes to be copied or negated.
 632 /// \param __b
 633 ///    A 128-bit integer vector containing control bytes corresponding to
 634 ///    positions in the destination.
 635 /// \returns A 128-bit integer vector containing the resultant values.
 636 static __inline__ __m128i __DEFAULT_FN_ATTRS
 637 _mm_sign_epi8(__m128i __a, __m128i __b)
 638 {
 639     return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
 640 }
 641
 642 /// For each 16-bit integer in the first source operand, perform one of
 643 ///    the following actions as specified by the second source operand.
 644 ///
 645 ///    If the word in the second source is negative, calculate the two's
 646 ///    complement of the corresponding word in the first source, and write that
 647 ///    value to the destination. If the word in the second source is positive,
 648 ///    copy the corresponding word from the first source to the destination. If
 649 ///    the word in the second source is zero, clear the corresponding word in
 650 ///    the destination.
 651 ///
 652 /// \headerfile <x86intrin.h>
 653 ///
 654 /// This intrinsic corresponds to the \c VPSIGNW instruction.
 655 ///
 656 /// \param __a
 657 ///    A 128-bit integer vector containing the words to be copied or negated.
 658 /// \param __b
 659 ///    A 128-bit integer vector containing control words corresponding to
 660 ///    positions in the destination.
 661 /// \returns A 128-bit integer vector containing the resultant values.
 662 static __inline__ __m128i __DEFAULT_FN_ATTRS
 663 _mm_sign_epi16(__m128i __a, __m128i __b)
 664 {
 665     return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
 666 }
 667
 668 /// For each 32-bit integer in the first source operand, perform one of
 669 ///    the following actions as specified by the second source operand.
 670 ///
 671 ///    If the doubleword in the second source is negative, calculate the two's
 672 ///    complement of the corresponding doubleword in the first source, and
 673 ///    write that value to the destination. If the doubleword in the second
 674 ///    source is positive, copy the corresponding doubleword from the first
 675 ///    source to the destination. If the doubleword in the second source is
 676 ///    zero, clear the corresponding doubleword in the destination.
 677 ///
 678 /// \headerfile <x86intrin.h>
 679 ///
 680 /// This intrinsic corresponds to the \c VPSIGND instruction.
 681 ///
 682 /// \param __a
 683 ///    A 128-bit integer vector containing the doublewords to be copied or negated.
 684 /// \param __b
 685 ///    A 128-bit integer vector containing control doublewords corresponding to
 686 ///    positions in the destination.
 687 /// \returns A 128-bit integer vector containing the resultant values.
 688 static __inline__ __m128i __DEFAULT_FN_ATTRS
 689 _mm_sign_epi32(__m128i __a, __m128i __b)
 690 {
 691     return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
 692 }
 693
 694 /// For each 8-bit integer in the first source operand, perform one of
 695 ///    the following actions as specified by the second source operand.
 696 ///
 697 ///    If the byte in the second source is negative, calculate the two's
 698 ///    complement of the corresponding byte in the first source, and write that
 699 ///    value to the destination. If the byte in the second source is positive,
 700 ///    copy the corresponding byte from the first source to the destination. If
 701 ///    the byte in the second source is zero, clear the corresponding byte in
 702 ///    the destination.
 703 ///
 704 /// \headerfile <x86intrin.h>
 705 ///
 706 /// This intrinsic corresponds to the \c PSIGNB instruction.
 707 ///
 708 /// \param __a
 709 ///    A 64-bit integer vector containing the bytes to be copied or negated.
 710 /// \param __b
 711 ///    A 64-bit integer vector containing control bytes corresponding to
 712 ///    positions in the destination.
 713 /// \returns A 64-bit integer vector containing the resultant values.
 714 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 715 _mm_sign_pi8(__m64 __a, __m64 __b)
 716 {
 717     return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
 718 }
 719
 720 /// For each 16-bit integer in the first source operand, perform one of
 721 ///    the following actions as specified by the second source operand.
 722 ///
 723 ///    If the word in the second source is negative, calculate the two's
 724 ///    complement of the corresponding word in the first source, and write that
 725 ///    value to the destination. If the word in the second source is positive,
 726 ///    copy the corresponding word from the first source to the destination. If
 727 ///    the word in the second source is zero, clear the corresponding word in
 728 ///    the destination.
 729 ///
 730 /// \headerfile <x86intrin.h>
 731 ///
 732 /// This intrinsic corresponds to the \c PSIGNW instruction.
 733 ///
 734 /// \param __a
 735 ///    A 64-bit integer vector containing the words to be copied or negated.
 736 /// \param __b
 737 ///    A 64-bit integer vector containing control words corresponding to
 738 ///    positions in the destination.
 739 /// \returns A 64-bit integer vector containing the resultant values.
 740 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 741 _mm_sign_pi16(__m64 __a, __m64 __b)
 742 {
 743     return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
 744 }
 745
 746 /// For each 32-bit integer in the first source operand, perform one of
 747 ///    the following actions as specified by the second source operand.
 748 ///
 749 ///    If the doubleword in the second source is negative, calculate the two's
 750 ///    complement of the corresponding doubleword in the first source, and
 751 ///    write that value to the destination. If the doubleword in the second
 752 ///    source is positive, copy the corresponding doubleword from the first
 753 ///    source to the destination. If the doubleword in the second source is
 754 ///    zero, clear the corresponding doubleword in the destination.
 755 ///
 756 /// \headerfile <x86intrin.h>
 757 ///
 758 /// This intrinsic corresponds to the \c PSIGND instruction.
 759 ///
 760 /// \param __a
 761 ///    A 64-bit integer vector containing the doublewords to be copied or negated.
 762 /// \param __b
 763 ///    A 64-bit integer vector containing two control doublewords corresponding
 764 ///    to positions in the destination.
 765 /// \returns A 64-bit integer vector containing the resultant values.
 766 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 767 _mm_sign_pi32(__m64 __a, __m64 __b)
 768 {
 769     return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
 770 }
 771
 772 #undef __DEFAULT_FN_ATTRS /* attribute helper macros are local to this header */
 773 #undef __DEFAULT_FN_ATTRS_MMX
 774
 775 #endif /* __TMMINTRIN_H */