clang/lib/Headers/tmmintrin.h

   1 /*===---- tmmintrin.h - SSSE3 intrinsics -----------------------------------===
   2  *
   3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4  * See https://llvm.org/LICENSE.txt for license information.
   5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6  *
   7  *===-----------------------------------------------------------------------===
   8  */
   9
  10 #ifndef __TMMINTRIN_H
  11 #define __TMMINTRIN_H
  12
  13 #include <pmmintrin.h>
  14
  15 /* Define the default attributes for the functions in this file. */
  16 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("ssse3"), __min_vector_width__(64)))
  17 #define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,ssse3"), __min_vector_width__(64)))
  18
  19 /// Computes the absolute value of each of the packed 8-bit signed
  20 ///    integers in the source operand and stores the 8-bit unsigned integer
  21 ///    results in the destination.
  22 ///
  23 /// \headerfile <x86intrin.h>
  24 ///
  25 /// This intrinsic corresponds to the \c PABSB instruction.
  26 ///
  27 /// \param __a
  28 ///    A 64-bit vector of [8 x i8].
  29 /// \returns A 64-bit integer vector containing the absolute values of the
  30 ///    elements in the operand.
  31 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
  32 _mm_abs_pi8(__m64 __a)
  33 {
  34     return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
  35 }
  36
  37 /// Computes the absolute value of each of the packed 8-bit signed
  38 ///    integers in the source operand and stores the 8-bit unsigned integer
  39 ///    results in the destination.
  40 ///
  41 /// \headerfile <x86intrin.h>
  42 ///
  43 /// This intrinsic corresponds to the \c VPABSB instruction.
  44 ///
  45 /// \param __a
  46 ///    A 128-bit vector of [16 x i8].
  47 /// \returns A 128-bit integer vector containing the absolute values of the
  48 ///    elements in the operand.
  49 static __inline__ __m128i __DEFAULT_FN_ATTRS
  50 _mm_abs_epi8(__m128i __a)
  51 {
  52     return (__m128i)__builtin_ia32_pabsb128((__v16qi)__a);
  53 }
  54
  55 /// Computes the absolute value of each of the packed 16-bit signed
  56 ///    integers in the source operand and stores the 16-bit unsigned integer
  57 ///    results in the destination.
  58 ///
  59 /// \headerfile <x86intrin.h>
  60 ///
  61 /// This intrinsic corresponds to the \c PABSW instruction.
  62 ///
  63 /// \param __a
  64 ///    A 64-bit vector of [4 x i16].
  65 /// \returns A 64-bit integer vector containing the absolute values of the
  66 ///    elements in the operand.
  67 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
  68 _mm_abs_pi16(__m64 __a)
  69 {
  70     return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
  71 }
  72
  73 /// Computes the absolute value of each of the packed 16-bit signed
  74 ///    integers in the source operand and stores the 16-bit unsigned integer
  75 ///    results in the destination.
  76 ///
  77 /// \headerfile <x86intrin.h>
  78 ///
  79 /// This intrinsic corresponds to the \c VPABSW instruction.
  80 ///
  81 /// \param __a
  82 ///    A 128-bit vector of [8 x i16].
  83 /// \returns A 128-bit integer vector containing the absolute values of the
  84 ///    elements in the operand.
  85 static __inline__ __m128i __DEFAULT_FN_ATTRS
  86 _mm_abs_epi16(__m128i __a)
  87 {
  88     return (__m128i)__builtin_ia32_pabsw128((__v8hi)__a);
  89 }
  90
  91 /// Computes the absolute value of each of the packed 32-bit signed
  92 ///    integers in the source operand and stores the 32-bit unsigned integer
  93 ///    results in the destination.
  94 ///
  95 /// \headerfile <x86intrin.h>
  96 ///
  97 /// This intrinsic corresponds to the \c PABSD instruction.
  98 ///
  99 /// \param __a
 100 ///    A 64-bit vector of [2 x i32].
 101 /// \returns A 64-bit integer vector containing the absolute values of the
 102 ///    elements in the operand.
 103 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 104 _mm_abs_pi32(__m64 __a)
 105 {
 106     return (__m64)__builtin_ia32_pabsd((__v2si)__a);
 107 }
 108
 109 /// Computes the absolute value of each of the packed 32-bit signed
 110 ///    integers in the source operand and stores the 32-bit unsigned integer
 111 ///    results in the destination.
 112 ///
 113 /// \headerfile <x86intrin.h>
 114 ///
 115 /// This intrinsic corresponds to the \c VPABSD instruction.
 116 ///
 117 /// \param __a
 118 ///    A 128-bit vector of [4 x i32].
 119 /// \returns A 128-bit integer vector containing the absolute values of the
 120 ///    elements in the operand.
 121 static __inline__ __m128i __DEFAULT_FN_ATTRS
 122 _mm_abs_epi32(__m128i __a)
 123 {
 124     return (__m128i)__builtin_ia32_pabsd128((__v4si)__a);
 125 }
 126
 127 /// Concatenates the two 128-bit integer vector operands, and
 128 ///    right-shifts the result by the number of bytes specified in the immediate
 129 ///    operand.
 130 ///
 131 /// \headerfile <x86intrin.h>
 132 ///
 133 /// \code
 134 /// __m128i _mm_alignr_epi8(__m128i a, __m128i b, const int n);
 135 /// \endcode
 136 ///
 137 /// This intrinsic corresponds to the \c PALIGNR instruction.
 138 ///
 139 /// \param a
 140 ///    A 128-bit vector of [16 x i8] containing one of the source operands.
 141 /// \param b
 142 ///    A 128-bit vector of [16 x i8] containing one of the source operands.
 143 /// \param n
 144 ///    An immediate operand specifying how many bytes to right-shift the result.
 145 /// \returns A 128-bit integer vector containing the concatenated right-shifted
 146 ///    value.
 147 #define _mm_alignr_epi8(a, b, n) \
 148   (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
 149                                      (__v16qi)(__m128i)(b), (n))
 150
 151 /// Concatenates the two 64-bit integer vector operands, and right-shifts
 152 ///    the result by the number of bytes specified in the immediate operand.
 153 ///
 154 /// \headerfile <x86intrin.h>
 155 ///
 156 /// \code
 157 /// __m64 _mm_alignr_pi8(__m64 a, __m64 b, const int n);
 158 /// \endcode
 159 ///
 160 /// This intrinsic corresponds to the \c PALIGNR instruction.
 161 ///
 162 /// \param a
 163 ///    A 64-bit vector of [8 x i8] containing one of the source operands.
 164 /// \param b
 165 ///    A 64-bit vector of [8 x i8] containing one of the source operands.
 166 /// \param n
 167 ///    An immediate operand specifying how many bytes to right-shift the result.
 168 /// \returns A 64-bit integer vector containing the concatenated right-shifted
 169 ///    value.
 170 #define _mm_alignr_pi8(a, b, n) \
 171   (__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n))
 172
 173 /// Horizontally adds the adjacent pairs of values contained in 2 packed
 174 ///    128-bit vectors of [8 x i16].
 175 ///
 176 /// \headerfile <x86intrin.h>
 177 ///
 178 /// This intrinsic corresponds to the \c VPHADDW instruction.
 179 ///
 180 /// \param __a
 181 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
 182 ///    horizontal sums of the values are stored in the lower bits of the
 183 ///    destination.
 184 /// \param __b
 185 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
 186 ///    horizontal sums of the values are stored in the upper bits of the
 187 ///    destination.
 188 /// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of
 189 ///    both operands.
 190 static __inline__ __m128i __DEFAULT_FN_ATTRS
 191 _mm_hadd_epi16(__m128i __a, __m128i __b)
 192 {
 193     return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
 194 }
 195
 196 /// Horizontally adds the adjacent pairs of values contained in 2 packed
 197 ///    128-bit vectors of [4 x i32].
 198 ///
 199 /// \headerfile <x86intrin.h>
 200 ///
 201 /// This intrinsic corresponds to the \c VPHADDD instruction.
 202 ///
 203 /// \param __a
 204 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
 205 ///    horizontal sums of the values are stored in the lower bits of the
 206 ///    destination.
 207 /// \param __b
 208 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
 209 ///    horizontal sums of the values are stored in the upper bits of the
 210 ///    destination.
 211 /// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of
 212 ///    both operands.
 213 static __inline__ __m128i __DEFAULT_FN_ATTRS
 214 _mm_hadd_epi32(__m128i __a, __m128i __b)
 215 {
 216     return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
 217 }
 218
 219 /// Horizontally adds the adjacent pairs of values contained in 2 packed
 220 ///    64-bit vectors of [4 x i16].
 221 ///
 222 /// \headerfile <x86intrin.h>
 223 ///
 224 /// This intrinsic corresponds to the \c PHADDW instruction.
 225 ///
 226 /// \param __a
 227 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
 228 ///    horizontal sums of the values are stored in the lower bits of the
 229 ///    destination.
 230 /// \param __b
 231 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
 232 ///    horizontal sums of the values are stored in the upper bits of the
 233 ///    destination.
 234 /// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
 235 ///    operands.
 236 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 237 _mm_hadd_pi16(__m64 __a, __m64 __b)
 238 {
 239     return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
 240 }
 241
 242 /// Horizontally adds the adjacent pairs of values contained in 2 packed
 243 ///    64-bit vectors of [2 x i32].
 244 ///
 245 /// \headerfile <x86intrin.h>
 246 ///
 247 /// This intrinsic corresponds to the \c PHADDD instruction.
 248 ///
 249 /// \param __a
 250 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
 251 ///    horizontal sums of the values are stored in the lower bits of the
 252 ///    destination.
 253 /// \param __b
 254 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
 255 ///    horizontal sums of the values are stored in the upper bits of the
 256 ///    destination.
 257 /// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
 258 ///    operands.
 259 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 260 _mm_hadd_pi32(__m64 __a, __m64 __b)
 261 {
 262     return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
 263 }
 264
 265 /// Horizontally adds the adjacent pairs of values contained in 2 packed
 266 ///    128-bit vectors of [8 x i16]. Positive sums greater than 0x7FFF are
 267 ///    saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
 268 ///    0x8000.
 269 ///
 270 /// \headerfile <x86intrin.h>
 271 ///
 272 /// This intrinsic corresponds to the \c VPHADDSW instruction.
 273 ///
 274 /// \param __a
 275 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
 276 ///    horizontal sums of the values are stored in the lower bits of the
 277 ///    destination.
 278 /// \param __b
 279 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
 280 ///    horizontal sums of the values are stored in the upper bits of the
 281 ///    destination.
 282 /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
 283 ///    sums of both operands.
 284 static __inline__ __m128i __DEFAULT_FN_ATTRS
 285 _mm_hadds_epi16(__m128i __a, __m128i __b)
 286 {
 287     return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
 288 }
 289
 290 /// Horizontally adds the adjacent pairs of values contained in 2 packed
 291 ///    64-bit vectors of [4 x i16]. Positive sums greater than 0x7FFF are
 292 ///    saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
 293 ///    0x8000.
 294 ///
 295 /// \headerfile <x86intrin.h>
 296 ///
 297 /// This intrinsic corresponds to the \c PHADDSW instruction.
 298 ///
 299 /// \param __a
 300 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
 301 ///    horizontal sums of the values are stored in the lower bits of the
 302 ///    destination.
 303 /// \param __b
 304 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
 305 ///    horizontal sums of the values are stored in the upper bits of the
 306 ///    destination.
 307 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
 308 ///    sums of both operands.
 309 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 310 _mm_hadds_pi16(__m64 __a, __m64 __b)
 311 {
 312     return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
 313 }
 314
 315 /// Horizontally subtracts the adjacent pairs of values contained in 2
 316 ///    packed 128-bit vectors of [8 x i16].
 317 ///
 318 /// \headerfile <x86intrin.h>
 319 ///
 320 /// This intrinsic corresponds to the \c VPHSUBW instruction.
 321 ///
 322 /// \param __a
 323 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
 324 ///    horizontal differences between the values are stored in the lower bits of
 325 ///    the destination.
 326 /// \param __b
 327 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
 328 ///    horizontal differences between the values are stored in the upper bits of
 329 ///    the destination.
 330 /// \returns A 128-bit vector of [8 x i16] containing the horizontal differences
 331 ///    of both operands.
 332 static __inline__ __m128i __DEFAULT_FN_ATTRS
 333 _mm_hsub_epi16(__m128i __a, __m128i __b)
 334 {
 335     return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
 336 }
 337
 338 /// Horizontally subtracts the adjacent pairs of values contained in 2
 339 ///    packed 128-bit vectors of [4 x i32].
 340 ///
 341 /// \headerfile <x86intrin.h>
 342 ///
 343 /// This intrinsic corresponds to the \c VPHSUBD instruction.
 344 ///
 345 /// \param __a
 346 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
 347 ///    horizontal differences between the values are stored in the lower bits of
 348 ///    the destination.
 349 /// \param __b
 350 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
 351 ///    horizontal differences between the values are stored in the upper bits of
 352 ///    the destination.
 353 /// \returns A 128-bit vector of [4 x i32] containing the horizontal differences
 354 ///    of both operands.
 355 static __inline__ __m128i __DEFAULT_FN_ATTRS
 356 _mm_hsub_epi32(__m128i __a, __m128i __b)
 357 {
 358     return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
 359 }
 360
 361 /// Horizontally subtracts the adjacent pairs of values contained in 2
 362 ///    packed 64-bit vectors of [4 x i16].
 363 ///
 364 /// \headerfile <x86intrin.h>
 365 ///
 366 /// This intrinsic corresponds to the \c PHSUBW instruction.
 367 ///
 368 /// \param __a
 369 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
 370 ///    horizontal differences between the values are stored in the lower bits of
 371 ///    the destination.
 372 /// \param __b
 373 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
 374 ///    horizontal differences between the values are stored in the upper bits of
 375 ///    the destination.
 376 /// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
 377 ///    of both operands.
 378 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 379 _mm_hsub_pi16(__m64 __a, __m64 __b)
 380 {
 381     return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
 382 }
 383
 384 /// Horizontally subtracts the adjacent pairs of values contained in 2
 385 ///    packed 64-bit vectors of [2 x i32].
 386 ///
 387 /// \headerfile <x86intrin.h>
 388 ///
 389 /// This intrinsic corresponds to the \c PHSUBD instruction.
 390 ///
 391 /// \param __a
 392 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
 393 ///    horizontal differences between the values are stored in the lower bits of
 394 ///    the destination.
 395 /// \param __b
 396 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
 397 ///    horizontal differences between the values are stored in the upper bits of
 398 ///    the destination.
 399 /// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
 400 ///    of both operands.
 401 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 402 _mm_hsub_pi32(__m64 __a, __m64 __b)
 403 {
 404     return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
 405 }
 406
 407 /// Horizontally subtracts the adjacent pairs of values contained in 2
 408 ///    packed 128-bit vectors of [8 x i16]. Positive differences greater than
 409 ///    0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are
 410 ///    saturated to 0x8000.
 411 ///
 412 /// \headerfile <x86intrin.h>
 413 ///
 414 /// This intrinsic corresponds to the \c VPHSUBSW instruction.
 415 ///
 416 /// \param __a
 417 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
 418 ///    horizontal differences between the values are stored in the lower bits of
 419 ///    the destination.
 420 /// \param __b
 421 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
 422 ///    horizontal differences between the values are stored in the upper bits of
 423 ///    the destination.
 424 /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
 425 ///    differences of both operands.
 426 static __inline__ __m128i __DEFAULT_FN_ATTRS
 427 _mm_hsubs_epi16(__m128i __a, __m128i __b)
 428 {
 429     return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
 430 }
 431
 432 /// Horizontally subtracts the adjacent pairs of values contained in 2
 433 ///    packed 64-bit vectors of [4 x i16]. Positive differences greater than
 434 ///    0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are
 435 ///    saturated to 0x8000.
 436 ///
 437 /// \headerfile <x86intrin.h>
 438 ///
 439 /// This intrinsic corresponds to the \c PHSUBSW instruction.
 440 ///
 441 /// \param __a
 442 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
 443 ///    horizontal differences between the values are stored in the lower bits of
 444 ///    the destination.
 445 /// \param __b
 446 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
 447 ///    horizontal differences between the values are stored in the upper bits of
 448 ///    the destination.
 449 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
 450 ///    differences of both operands.
 451 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 452 _mm_hsubs_pi16(__m64 __a, __m64 __b)
 453 {
 454     return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
 455 }
 456
 457 /// Multiplies corresponding pairs of packed 8-bit unsigned integer
 458 ///    values contained in the first source operand and packed 8-bit signed
 459 ///    integer values contained in the second source operand, adds pairs of
 460 ///    contiguous products with signed saturation, and writes the 16-bit sums to
 461 ///    the corresponding bits in the destination.
 462 ///
 463 ///    For example, bits [7:0] of both operands are multiplied, bits [15:8] of
 464 ///    both operands are multiplied, and the sum of both results is written to
 465 ///    bits [15:0] of the destination.
 466 ///
 467 /// \headerfile <x86intrin.h>
 468 ///
 469 /// This intrinsic corresponds to the \c VPMADDUBSW instruction.
 470 ///
 471 /// \param __a
 472 ///    A 128-bit integer vector containing the first source operand.
 473 /// \param __b
 474 ///    A 128-bit integer vector containing the second source operand.
 475 /// \returns A 128-bit integer vector containing the sums of products of both
 476 ///    operands: \n
 477 ///    \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
 478 ///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
 479 ///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
 480 ///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) \n
 481 ///    \a R4 := (\a __a8 * \a __b8) + (\a __a9 * \a __b9) \n
 482 ///    \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n
 483 ///    \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n
 484 ///    \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15)
 485 static __inline__ __m128i __DEFAULT_FN_ATTRS
 486 _mm_maddubs_epi16(__m128i __a, __m128i __b)
 487 {
 488     return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
 489 }
 490
 491 /// Multiplies corresponding pairs of packed 8-bit unsigned integer
 492 ///    values contained in the first source operand and packed 8-bit signed
 493 ///    integer values contained in the second source operand, adds pairs of
 494 ///    contiguous products with signed saturation, and writes the 16-bit sums to
 495 ///    the corresponding bits in the destination.
 496 ///
 497 ///    For example, bits [7:0] of both operands are multiplied, bits [15:8] of
 498 ///    both operands are multiplied, and the sum of both results is written to
 499 ///    bits [15:0] of the destination.
 500 ///
 501 /// \headerfile <x86intrin.h>
 502 ///
 503 /// This intrinsic corresponds to the \c PMADDUBSW instruction.
 504 ///
 505 /// \param __a
 506 ///    A 64-bit integer vector containing the first source operand.
 507 /// \param __b
 508 ///    A 64-bit integer vector containing the second source operand.
 509 /// \returns A 64-bit integer vector containing the sums of products of both
 510 ///    operands: \n
 511 ///    \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
 512 ///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
 513 ///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
 514 ///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
 515 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 516 _mm_maddubs_pi16(__m64 __a, __m64 __b)
 517 {
 518     return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
 519 }
 520
 521 /// Multiplies packed 16-bit signed integer values, truncates the 32-bit
 522 ///    products to the 18 most significant bits by right-shifting, rounds the
 523 ///    truncated value by adding 1, and writes bits [16:1] to the destination.
 524 ///
 525 /// \headerfile <x86intrin.h>
 526 ///
 527 /// This intrinsic corresponds to the \c VPMULHRSW instruction.
 528 ///
 529 /// \param __a
 530 ///    A 128-bit vector of [8 x i16] containing one of the source operands.
 531 /// \param __b
 532 ///    A 128-bit vector of [8 x i16] containing one of the source operands.
 533 /// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
 534 ///    products of both operands.
 535 static __inline__ __m128i __DEFAULT_FN_ATTRS
 536 _mm_mulhrs_epi16(__m128i __a, __m128i __b)
 537 {
 538     return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
 539 }
 540
 541 /// Multiplies packed 16-bit signed integer values, truncates the 32-bit
 542 ///    products to the 18 most significant bits by right-shifting, rounds the
 543 ///    truncated value by adding 1, and writes bits [16:1] to the destination.
 544 ///
 545 /// \headerfile <x86intrin.h>
 546 ///
 547 /// This intrinsic corresponds to the \c PMULHRSW instruction.
 548 ///
 549 /// \param __a
 550 ///    A 64-bit vector of [4 x i16] containing one of the source operands.
 551 /// \param __b
 552 ///    A 64-bit vector of [4 x i16] containing one of the source operands.
 553 /// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
 554 ///    products of both operands.
 555 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 556 _mm_mulhrs_pi16(__m64 __a, __m64 __b)
 557 {
 558     return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
 559 }
 560
 561 /// Copies the 8-bit integers from a 128-bit integer vector to the
 562 ///    destination or clears 8-bit values in the destination, as specified by
 563 ///    the second source operand.
 564 ///
 565 /// \headerfile <x86intrin.h>
 566 ///
 567 /// This intrinsic corresponds to the \c VPSHUFB instruction.
 568 ///
 569 /// \param __a
 570 ///    A 128-bit integer vector containing the values to be copied.
 571 /// \param __b
 572 ///    A 128-bit integer vector containing control bytes corresponding to
 573 ///    positions in the destination:
 574 ///    Bit 7: \n
 575 ///    1: Clear the corresponding byte in the destination. \n
 576 ///    0: Copy the selected source byte to the corresponding byte in the
 577 ///    destination. \n
 578 ///    Bits [6:4] Reserved.  \n
 579 ///    Bits [3:0] select the source byte to be copied.
 580 /// \returns A 128-bit integer vector containing the copied or cleared values.
 581 static __inline__ __m128i __DEFAULT_FN_ATTRS
 582 _mm_shuffle_epi8(__m128i __a, __m128i __b)
 583 {
 584     return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
 585 }
 586
 587 /// Copies the 8-bit integers from a 64-bit integer vector to the
 588 ///    destination or clears 8-bit values in the destination, as specified by
 589 ///    the second source operand.
 590 ///
 591 /// \headerfile <x86intrin.h>
 592 ///
 593 /// This intrinsic corresponds to the \c PSHUFB instruction.
 594 ///
 595 /// \param __a
 596 ///    A 64-bit integer vector containing the values to be copied.
 597 /// \param __b
 598 ///    A 64-bit integer vector containing control bytes corresponding to
 599 ///    positions in the destination:
 600 ///    Bit 7: \n
 601 ///    1: Clear the corresponding byte in the destination. \n
 602 ///    0: Copy the selected source byte to the corresponding byte in the
 603 ///    destination. \n
 604 ///    Bits [3:0] select the source byte to be copied.
 605 /// \returns A 64-bit integer vector containing the copied or cleared values.
 606 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 607 _mm_shuffle_pi8(__m64 __a, __m64 __b)
 608 {
 609     return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
 610 }
 611
 612 /// For each 8-bit integer in the first source operand, perform one of
 613 ///    the following actions as specified by the second source operand.
 614 ///
 615 ///    If the byte in the second source is negative, calculate the two's
 616 ///    complement of the corresponding byte in the first source, and write that
 617 ///    value to the destination. If the byte in the second source is positive,
 618 ///    copy the corresponding byte from the first source to the destination. If
 619 ///    the byte in the second source is zero, clear the corresponding byte in
 620 ///    the destination.
 621 ///
 622 /// \headerfile <x86intrin.h>
 623 ///
 624 /// This intrinsic corresponds to the \c VPSIGNB instruction.
 625 ///
 626 /// \param __a
 627 ///    A 128-bit integer vector containing the values to be copied.
 628 /// \param __b
 629 ///    A 128-bit integer vector containing control bytes corresponding to
 630 ///    positions in the destination.
 631 /// \returns A 128-bit integer vector containing the resultant values.
 632 static __inline__ __m128i __DEFAULT_FN_ATTRS
 633 _mm_sign_epi8(__m128i __a, __m128i __b)
 634 {
 635     return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
 636 }
 637
 638 /// For each 16-bit integer in the first source operand, perform one of
 639 ///    the following actions as specified by the second source operand.
 640 ///
 641 ///    If the word in the second source is negative, calculate the two's
 642 ///    complement of the corresponding word in the first source, and write that
 643 ///    value to the destination. If the word in the second source is positive,
 644 ///    copy the corresponding word from the first source to the destination. If
 645 ///    the word in the second source is zero, clear the corresponding word in
 646 ///    the destination.
 647 ///
 648 /// \headerfile <x86intrin.h>
 649 ///
 650 /// This intrinsic corresponds to the \c VPSIGNW instruction.
 651 ///
 652 /// \param __a
 653 ///    A 128-bit integer vector containing the values to be copied.
 654 /// \param __b
 655 ///    A 128-bit integer vector containing control words corresponding to
 656 ///    positions in the destination.
 657 /// \returns A 128-bit integer vector containing the resultant values.
 658 static __inline__ __m128i __DEFAULT_FN_ATTRS
 659 _mm_sign_epi16(__m128i __a, __m128i __b)
 660 {
 661     return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
 662 }
 663
 664 /// For each 32-bit integer in the first source operand, perform one of
 665 ///    the following actions as specified by the second source operand.
 666 ///
 667 ///    If the doubleword in the second source is negative, calculate the two's
 668 ///    complement of the corresponding word in the first source, and write that
 669 ///    value to the destination. If the doubleword in the second source is
 670 ///    positive, copy the corresponding word from the first source to the
 671 ///    destination. If the doubleword in the second source is zero, clear the
 672 ///    corresponding word in the destination.
 673 ///
 674 /// \headerfile <x86intrin.h>
 675 ///
 676 /// This intrinsic corresponds to the \c VPSIGND instruction.
 677 ///
 678 /// \param __a
 679 ///    A 128-bit integer vector containing the values to be copied.
 680 /// \param __b
 681 ///    A 128-bit integer vector containing control doublewords corresponding to
 682 ///    positions in the destination.
 683 /// \returns A 128-bit integer vector containing the resultant values.
 684 static __inline__ __m128i __DEFAULT_FN_ATTRS
 685 _mm_sign_epi32(__m128i __a, __m128i __b)
 686 {
 687     return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
 688 }
 689
 690 /// For each 8-bit integer in the first source operand, perform one of
 691 ///    the following actions as specified by the second source operand.
 692 ///
 693 ///    If the byte in the second source is negative, calculate the two's
 694 ///    complement of the corresponding byte in the first source, and write that
 695 ///    value to the destination. If the byte in the second source is positive,
 696 ///    copy the corresponding byte from the first source to the destination. If
 697 ///    the byte in the second source is zero, clear the corresponding byte in
 698 ///    the destination.
 699 ///
 700 /// \headerfile <x86intrin.h>
 701 ///
 702 /// This intrinsic corresponds to the \c PSIGNB instruction.
 703 ///
 704 /// \param __a
 705 ///    A 64-bit integer vector containing the values to be copied.
 706 /// \param __b
 707 ///    A 64-bit integer vector containing control bytes corresponding to
 708 ///    positions in the destination.
 709 /// \returns A 64-bit integer vector containing the resultant values.
 710 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 711 _mm_sign_pi8(__m64 __a, __m64 __b)
 712 {
 713     return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
 714 }
 715
 716 /// For each 16-bit integer in the first source operand, perform one of
 717 ///    the following actions as specified by the second source operand.
 718 ///
 719 ///    If the word in the second source is negative, calculate the two's
 720 ///    complement of the corresponding word in the first source, and write that
 721 ///    value to the destination. If the word in the second source is positive,
 722 ///    copy the corresponding word from the first source to the destination. If
 723 ///    the word in the second source is zero, clear the corresponding word in
 724 ///    the destination.
 725 ///
 726 /// \headerfile <x86intrin.h>
 727 ///
 728 /// This intrinsic corresponds to the \c PSIGNW instruction.
 729 ///
 730 /// \param __a
 731 ///    A 64-bit integer vector containing the values to be copied.
 732 /// \param __b
 733 ///    A 64-bit integer vector containing control words corresponding to
 734 ///    positions in the destination.
 735 /// \returns A 64-bit integer vector containing the resultant values.
 736 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 737 _mm_sign_pi16(__m64 __a, __m64 __b)
 738 {
 739     return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
 740 }
 741
 742 /// For each 32-bit integer in the first source operand, perform one of
 743 ///    the following actions as specified by the second source operand.
 744 ///
 745 ///    If the doubleword in the second source is negative, calculate the two's
 746 ///    complement of the corresponding doubleword in the first source, and
 747 ///    write that value to the destination. If the doubleword in the second
 748 ///    source is positive, copy the corresponding doubleword from the first
 749 ///    source to the destination. If the doubleword in the second source is
 750 ///    zero, clear the corresponding doubleword in the destination.
 751 ///
 752 /// \headerfile <x86intrin.h>
 753 ///
 754 /// This intrinsic corresponds to the \c PSIGND instruction.
 755 ///
 756 /// \param __a
 757 ///    A 64-bit integer vector containing the values to be copied.
 758 /// \param __b
 759 ///    A 64-bit integer vector containing two control doublewords corresponding
 760 ///    to positions in the destination.
 761 /// \returns A 64-bit integer vector containing the resultant values.
 762 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 763 _mm_sign_pi32(__m64 __a, __m64 __b)
 764 {
 765     return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
 766 }
 767
 768 #undef __DEFAULT_FN_ATTRS
 769 #undef __DEFAULT_FN_ATTRS_MMX
 770
 771 #endif /* __TMMINTRIN_H */