mingw/lib/gcc/mingw32/3.4.2/include/mmintrin.h

   1 /* Copyright (C) 2002, 2003 Free Software Foundation, Inc.
   2
   3    This file is part of GCC.
   4
   5    GCC is free software; you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation; either version 2, or (at your option)
   8    any later version.
   9
  10    GCC is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with GCC; see the file COPYING.  If not, write to
  17    the Free Software Foundation, 59 Temple Place - Suite 330,
  18    Boston, MA 02111-1307, USA.  */
  19
  20 /* As a special exception, if you include this header file into source
  21    files compiled by GCC, this header file does not by itself cause
  22    the resulting executable to be covered by the GNU General Public
  23    License.  This exception does not however invalidate any other
  24    reasons why the executable file might be covered by the GNU General
  25    Public License.  */
  26
  27 /* Implemented from the specification included in the Intel C++ Compiler
  28    User Guide and Reference, version 8.0.  */
  29
  30 #ifndef _MMINTRIN_H_INCLUDED
  31 #define _MMINTRIN_H_INCLUDED
  32
  33 #ifndef __MMX__
  34 # error "MMX instruction set not enabled"
  35 #else
/* The data type intended for user use.  It is declared with the V2SI
   vector machine mode, the same mode as the internal __v2si type.  */
typedef int __m64 __attribute__ ((__mode__ (__V2SI__)));

/* Internal data types for implementing the intrinsics.  Each one names a
   different vector machine mode; the intrinsics cast their __m64
   arguments to whichever of these the builtin being called is given:
   __v2si for the 32-bit element operations, __v4hi for the 16-bit
   element operations and __v8qi for the 8-bit element operations.  */
typedef int __v2si __attribute__ ((__mode__ (__V2SI__)));
typedef int __v4hi __attribute__ ((__mode__ (__V4HI__)));
typedef int __v8qi __attribute__ ((__mode__ (__V8QI__)));
  43
  44 /* Empty the multimedia state.  */
  45 static __inline void
  46 _mm_empty (void)
  47 {
  48   __builtin_ia32_emms ();
  49 }
  50
  51 static __inline void
  52 _m_empty (void)
  53 {
  54   _mm_empty ();
  55 }
  56
  57 /* Convert I to a __m64 object.  The integer is zero-extended to 64-bits.  */
  58 static __inline __m64
  59 _mm_cvtsi32_si64 (int __i)
  60 {
  61   long long __tmp = (unsigned int)__i;
  62   return (__m64) __tmp;
  63 }
  64
  65 static __inline __m64
  66 _m_from_int (int __i)
  67 {
  68   return _mm_cvtsi32_si64 (__i);
  69 }
  70
  71 #ifdef __x86_64__
  72 /* Convert I to a __m64 object.  */
  73 static __inline __m64
  74 _mm_cvtsi64x_si64 (long long __i)
  75 {
  76   return (__m64) __i;
  77 }
  78
  79 /* Convert I to a __m64 object.  */
  80 static __inline __m64
  81 _mm_set_pi64x (long long __i)
  82 {
  83   return (__m64) __i;
  84 }
  85 #endif
  86
  87 /* Convert the lower 32 bits of the __m64 object into an integer.  */
  88 static __inline int
  89 _mm_cvtsi64_si32 (__m64 __i)
  90 {
  91   long long __tmp = (long long)__i;
  92   return __tmp;
  93 }
  94
  95 static __inline int
  96 _m_to_int (__m64 __i)
  97 {
  98   return _mm_cvtsi64_si32 (__i);
  99 }
 100
 101 #ifdef __x86_64__
 102 /* Convert the lower 32 bits of the __m64 object into an integer.  */
 103 static __inline long long
 104 _mm_cvtsi64_si64x (__m64 __i)
 105 {
 106   return (long long)__i;
 107 }
 108 #endif
 109
 110 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
 111    the result, and the four 16-bit values from M2 into the upper four 8-bit
 112    values of the result, all with signed saturation.  */
 113 static __inline __m64
 114 _mm_packs_pi16 (__m64 __m1, __m64 __m2)
 115 {
 116   return (__m64) __builtin_ia32_packsswb ((__v4hi)__m1, (__v4hi)__m2);
 117 }
 118
 119 static __inline __m64
 120 _m_packsswb (__m64 __m1, __m64 __m2)
 121 {
 122   return _mm_packs_pi16 (__m1, __m2);
 123 }
 124
 125 /* Pack the two 32-bit values from M1 in to the lower two 16-bit values of
 126    the result, and the two 32-bit values from M2 into the upper two 16-bit
 127    values of the result, all with signed saturation.  */
 128 static __inline __m64
 129 _mm_packs_pi32 (__m64 __m1, __m64 __m2)
 130 {
 131   return (__m64) __builtin_ia32_packssdw ((__v2si)__m1, (__v2si)__m2);
 132 }
 133
 134 static __inline __m64
 135 _m_packssdw (__m64 __m1, __m64 __m2)
 136 {
 137   return _mm_packs_pi32 (__m1, __m2);
 138 }
 139
 140 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
 141    the result, and the four 16-bit values from M2 into the upper four 8-bit
 142    values of the result, all with unsigned saturation.  */
 143 static __inline __m64
 144 _mm_packs_pu16 (__m64 __m1, __m64 __m2)
 145 {
 146   return (__m64) __builtin_ia32_packuswb ((__v4hi)__m1, (__v4hi)__m2);
 147 }
 148
 149 static __inline __m64
 150 _m_packuswb (__m64 __m1, __m64 __m2)
 151 {
 152   return _mm_packs_pu16 (__m1, __m2);
 153 }
 154
 155 /* Interleave the four 8-bit values from the high half of M1 with the four
 156    8-bit values from the high half of M2.  */
 157 static __inline __m64
 158 _mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
 159 {
 160   return (__m64) __builtin_ia32_punpckhbw ((__v8qi)__m1, (__v8qi)__m2);
 161 }
 162
 163 static __inline __m64
 164 _m_punpckhbw (__m64 __m1, __m64 __m2)
 165 {
 166   return _mm_unpackhi_pi8 (__m1, __m2);
 167 }
 168
 169 /* Interleave the two 16-bit values from the high half of M1 with the two
 170    16-bit values from the high half of M2.  */
 171 static __inline __m64
 172 _mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
 173 {
 174   return (__m64) __builtin_ia32_punpckhwd ((__v4hi)__m1, (__v4hi)__m2);
 175 }
 176
 177 static __inline __m64
 178 _m_punpckhwd (__m64 __m1, __m64 __m2)
 179 {
 180   return _mm_unpackhi_pi16 (__m1, __m2);
 181 }
 182
 183 /* Interleave the 32-bit value from the high half of M1 with the 32-bit
 184    value from the high half of M2.  */
 185 static __inline __m64
 186 _mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
 187 {
 188   return (__m64) __builtin_ia32_punpckhdq ((__v2si)__m1, (__v2si)__m2);
 189 }
 190
 191 static __inline __m64
 192 _m_punpckhdq (__m64 __m1, __m64 __m2)
 193 {
 194   return _mm_unpackhi_pi32 (__m1, __m2);
 195 }
 196
 197 /* Interleave the four 8-bit values from the low half of M1 with the four
 198    8-bit values from the low half of M2.  */
 199 static __inline __m64
 200 _mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
 201 {
 202   return (__m64) __builtin_ia32_punpcklbw ((__v8qi)__m1, (__v8qi)__m2);
 203 }
 204
 205 static __inline __m64
 206 _m_punpcklbw (__m64 __m1, __m64 __m2)
 207 {
 208   return _mm_unpacklo_pi8 (__m1, __m2);
 209 }
 210
 211 /* Interleave the two 16-bit values from the low half of M1 with the two
 212    16-bit values from the low half of M2.  */
 213 static __inline __m64
 214 _mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
 215 {
 216   return (__m64) __builtin_ia32_punpcklwd ((__v4hi)__m1, (__v4hi)__m2);
 217 }
 218
 219 static __inline __m64
 220 _m_punpcklwd (__m64 __m1, __m64 __m2)
 221 {
 222   return _mm_unpacklo_pi16 (__m1, __m2);
 223 }
 224
 225 /* Interleave the 32-bit value from the low half of M1 with the 32-bit
 226    value from the low half of M2.  */
 227 static __inline __m64
 228 _mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
 229 {
 230   return (__m64) __builtin_ia32_punpckldq ((__v2si)__m1, (__v2si)__m2);
 231 }
 232
 233 static __inline __m64
 234 _m_punpckldq (__m64 __m1, __m64 __m2)
 235 {
 236   return _mm_unpacklo_pi32 (__m1, __m2);
 237 }
 238
 239 /* Add the 8-bit values in M1 to the 8-bit values in M2.  */
 240 static __inline __m64
 241 _mm_add_pi8 (__m64 __m1, __m64 __m2)
 242 {
 243   return (__m64) __builtin_ia32_paddb ((__v8qi)__m1, (__v8qi)__m2);
 244 }
 245
 246 static __inline __m64
 247 _m_paddb (__m64 __m1, __m64 __m2)
 248 {
 249   return _mm_add_pi8 (__m1, __m2);
 250 }
 251
 252 /* Add the 16-bit values in M1 to the 16-bit values in M2.  */
 253 static __inline __m64
 254 _mm_add_pi16 (__m64 __m1, __m64 __m2)
 255 {
 256   return (__m64) __builtin_ia32_paddw ((__v4hi)__m1, (__v4hi)__m2);
 257 }
 258
 259 static __inline __m64
 260 _m_paddw (__m64 __m1, __m64 __m2)
 261 {
 262   return _mm_add_pi16 (__m1, __m2);
 263 }
 264
 265 /* Add the 32-bit values in M1 to the 32-bit values in M2.  */
 266 static __inline __m64
 267 _mm_add_pi32 (__m64 __m1, __m64 __m2)
 268 {
 269   return (__m64) __builtin_ia32_paddd ((__v2si)__m1, (__v2si)__m2);
 270 }
 271
 272 static __inline __m64
 273 _m_paddd (__m64 __m1, __m64 __m2)
 274 {
 275   return _mm_add_pi32 (__m1, __m2);
 276 }
 277
 278 /* Add the 64-bit values in M1 to the 64-bit values in M2.  */
 279 static __inline __m64
 280 _mm_add_si64 (__m64 __m1, __m64 __m2)
 281 {
 282   return (__m64) __builtin_ia32_paddq ((long long)__m1, (long long)__m2);
 283 }
 284
 285 /* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
 286    saturated arithmetic.  */
 287 static __inline __m64
 288 _mm_adds_pi8 (__m64 __m1, __m64 __m2)
 289 {
 290   return (__m64) __builtin_ia32_paddsb ((__v8qi)__m1, (__v8qi)__m2);
 291 }
 292
 293 static __inline __m64
 294 _m_paddsb (__m64 __m1, __m64 __m2)
 295 {
 296   return _mm_adds_pi8 (__m1, __m2);
 297 }
 298
 299 /* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
 300    saturated arithmetic.  */
 301 static __inline __m64
 302 _mm_adds_pi16 (__m64 __m1, __m64 __m2)
 303 {
 304   return (__m64) __builtin_ia32_paddsw ((__v4hi)__m1, (__v4hi)__m2);
 305 }
 306
 307 static __inline __m64
 308 _m_paddsw (__m64 __m1, __m64 __m2)
 309 {
 310   return _mm_adds_pi16 (__m1, __m2);
 311 }
 312
 313 /* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
 314    saturated arithmetic.  */
 315 static __inline __m64
 316 _mm_adds_pu8 (__m64 __m1, __m64 __m2)
 317 {
 318   return (__m64) __builtin_ia32_paddusb ((__v8qi)__m1, (__v8qi)__m2);
 319 }
 320
 321 static __inline __m64
 322 _m_paddusb (__m64 __m1, __m64 __m2)
 323 {
 324   return _mm_adds_pu8 (__m1, __m2);
 325 }
 326
 327 /* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
 328    saturated arithmetic.  */
 329 static __inline __m64
 330 _mm_adds_pu16 (__m64 __m1, __m64 __m2)
 331 {
 332   return (__m64) __builtin_ia32_paddusw ((__v4hi)__m1, (__v4hi)__m2);
 333 }
 334
 335 static __inline __m64
 336 _m_paddusw (__m64 __m1, __m64 __m2)
 337 {
 338   return _mm_adds_pu16 (__m1, __m2);
 339 }
 340
 341 /* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
 342 static __inline __m64
 343 _mm_sub_pi8 (__m64 __m1, __m64 __m2)
 344 {
 345   return (__m64) __builtin_ia32_psubb ((__v8qi)__m1, (__v8qi)__m2);
 346 }
 347
 348 static __inline __m64
 349 _m_psubb (__m64 __m1, __m64 __m2)
 350 {
 351   return _mm_sub_pi8 (__m1, __m2);
 352 }
 353
 354 /* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
 355 static __inline __m64
 356 _mm_sub_pi16 (__m64 __m1, __m64 __m2)
 357 {
 358   return (__m64) __builtin_ia32_psubw ((__v4hi)__m1, (__v4hi)__m2);
 359 }
 360
 361 static __inline __m64
 362 _m_psubw (__m64 __m1, __m64 __m2)
 363 {
 364   return _mm_sub_pi16 (__m1, __m2);
 365 }
 366
 367 /* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
 368 static __inline __m64
 369 _mm_sub_pi32 (__m64 __m1, __m64 __m2)
 370 {
 371   return (__m64) __builtin_ia32_psubd ((__v2si)__m1, (__v2si)__m2);
 372 }
 373
 374 static __inline __m64
 375 _m_psubd (__m64 __m1, __m64 __m2)
 376 {
 377   return _mm_sub_pi32 (__m1, __m2);
 378 }
 379
 380 /* Add the 64-bit values in M1 to the 64-bit values in M2.  */
 381 static __inline __m64
 382 _mm_sub_si64 (__m64 __m1, __m64 __m2)
 383 {
 384   return (__m64) __builtin_ia32_psubq ((long long)__m1, (long long)__m2);
 385 }
 386
 387 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
 388    saturating arithmetic.  */
 389 static __inline __m64
 390 _mm_subs_pi8 (__m64 __m1, __m64 __m2)
 391 {
 392   return (__m64) __builtin_ia32_psubsb ((__v8qi)__m1, (__v8qi)__m2);
 393 }
 394
 395 static __inline __m64
 396 _m_psubsb (__m64 __m1, __m64 __m2)
 397 {
 398   return _mm_subs_pi8 (__m1, __m2);
 399 }
 400
 401 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
 402    signed saturating arithmetic.  */
 403 static __inline __m64
 404 _mm_subs_pi16 (__m64 __m1, __m64 __m2)
 405 {
 406   return (__m64) __builtin_ia32_psubsw ((__v4hi)__m1, (__v4hi)__m2);
 407 }
 408
 409 static __inline __m64
 410 _m_psubsw (__m64 __m1, __m64 __m2)
 411 {
 412   return _mm_subs_pi16 (__m1, __m2);
 413 }
 414
 415 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
 416    unsigned saturating arithmetic.  */
 417 static __inline __m64
 418 _mm_subs_pu8 (__m64 __m1, __m64 __m2)
 419 {
 420   return (__m64) __builtin_ia32_psubusb ((__v8qi)__m1, (__v8qi)__m2);
 421 }
 422
 423 static __inline __m64
 424 _m_psubusb (__m64 __m1, __m64 __m2)
 425 {
 426   return _mm_subs_pu8 (__m1, __m2);
 427 }
 428
 429 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
 430    unsigned saturating arithmetic.  */
 431 static __inline __m64
 432 _mm_subs_pu16 (__m64 __m1, __m64 __m2)
 433 {
 434   return (__m64) __builtin_ia32_psubusw ((__v4hi)__m1, (__v4hi)__m2);
 435 }
 436
 437 static __inline __m64
 438 _m_psubusw (__m64 __m1, __m64 __m2)
 439 {
 440   return _mm_subs_pu16 (__m1, __m2);
 441 }
 442
 443 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
 444    four 32-bit intermediate results, which are then summed by pairs to
 445    produce two 32-bit results.  */
 446 static __inline __m64
 447 _mm_madd_pi16 (__m64 __m1, __m64 __m2)
 448 {
 449   return (__m64) __builtin_ia32_pmaddwd ((__v4hi)__m1, (__v4hi)__m2);
 450 }
 451
 452 static __inline __m64
 453 _m_pmaddwd (__m64 __m1, __m64 __m2)
 454 {
 455   return _mm_madd_pi16 (__m1, __m2);
 456 }
 457
 458 /* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
 459    M2 and produce the high 16 bits of the 32-bit results.  */
 460 static __inline __m64
 461 _mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
 462 {
 463   return (__m64) __builtin_ia32_pmulhw ((__v4hi)__m1, (__v4hi)__m2);
 464 }
 465
 466 static __inline __m64
 467 _m_pmulhw (__m64 __m1, __m64 __m2)
 468 {
 469   return _mm_mulhi_pi16 (__m1, __m2);
 470 }
 471
 472 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
 473    the low 16 bits of the results.  */
 474 static __inline __m64
 475 _mm_mullo_pi16 (__m64 __m1, __m64 __m2)
 476 {
 477   return (__m64) __builtin_ia32_pmullw ((__v4hi)__m1, (__v4hi)__m2);
 478 }
 479
 480 static __inline __m64
 481 _m_pmullw (__m64 __m1, __m64 __m2)
 482 {
 483   return _mm_mullo_pi16 (__m1, __m2);
 484 }
 485
 486 /* Shift four 16-bit values in M left by COUNT.  */
 487 static __inline __m64
 488 _mm_sll_pi16 (__m64 __m, __m64 __count)
 489 {
 490   return (__m64) __builtin_ia32_psllw ((__v4hi)__m, (long long)__count);
 491 }
 492
 493 static __inline __m64
 494 _m_psllw (__m64 __m, __m64 __count)
 495 {
 496   return _mm_sll_pi16 (__m, __count);
 497 }
 498
 499 static __inline __m64
 500 _mm_slli_pi16 (__m64 __m, int __count)
 501 {
 502   return (__m64) __builtin_ia32_psllw ((__v4hi)__m, __count);
 503 }
 504
 505 static __inline __m64
 506 _m_psllwi (__m64 __m, int __count)
 507 {
 508   return _mm_slli_pi16 (__m, __count);
 509 }
 510
 511 /* Shift two 32-bit values in M left by COUNT.  */
 512 static __inline __m64
 513 _mm_sll_pi32 (__m64 __m, __m64 __count)
 514 {
 515   return (__m64) __builtin_ia32_pslld ((__v2si)__m, (long long)__count);
 516 }
 517
 518 static __inline __m64
 519 _m_pslld (__m64 __m, __m64 __count)
 520 {
 521   return _mm_sll_pi32 (__m, __count);
 522 }
 523
 524 static __inline __m64
 525 _mm_slli_pi32 (__m64 __m, int __count)
 526 {
 527   return (__m64) __builtin_ia32_pslld ((__v2si)__m, __count);
 528 }
 529
 530 static __inline __m64
 531 _m_pslldi (__m64 __m, int __count)
 532 {
 533   return _mm_slli_pi32 (__m, __count);
 534 }
 535
 536 /* Shift the 64-bit value in M left by COUNT.  */
 537 static __inline __m64
 538 _mm_sll_si64 (__m64 __m, __m64 __count)
 539 {
 540   return (__m64) __builtin_ia32_psllq ((long long)__m, (long long)__count);
 541 }
 542
 543 static __inline __m64
 544 _m_psllq (__m64 __m, __m64 __count)
 545 {
 546   return _mm_sll_si64 (__m, __count);
 547 }
 548
 549 static __inline __m64
 550 _mm_slli_si64 (__m64 __m, int __count)
 551 {
 552   return (__m64) __builtin_ia32_psllq ((long long)__m, (long long)__count);
 553 }
 554
 555 static __inline __m64
 556 _m_psllqi (__m64 __m, int __count)
 557 {
 558   return _mm_slli_si64 (__m, __count);
 559 }
 560
 561 /* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
 562 static __inline __m64
 563 _mm_sra_pi16 (__m64 __m, __m64 __count)
 564 {
 565   return (__m64) __builtin_ia32_psraw ((__v4hi)__m, (long long)__count);
 566 }
 567
 568 static __inline __m64
 569 _m_psraw (__m64 __m, __m64 __count)
 570 {
 571   return _mm_sra_pi16 (__m, __count);
 572 }
 573
 574 static __inline __m64
 575 _mm_srai_pi16 (__m64 __m, int __count)
 576 {
 577   return (__m64) __builtin_ia32_psraw ((__v4hi)__m, __count);
 578 }
 579
 580 static __inline __m64
 581 _m_psrawi (__m64 __m, int __count)
 582 {
 583   return _mm_srai_pi16 (__m, __count);
 584 }
 585
 586 /* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
 587 static __inline __m64
 588 _mm_sra_pi32 (__m64 __m, __m64 __count)
 589 {
 590   return (__m64) __builtin_ia32_psrad ((__v2si)__m, (long long)__count);
 591 }
 592
 593 static __inline __m64
 594 _m_psrad (__m64 __m, __m64 __count)
 595 {
 596   return _mm_sra_pi32 (__m, __count);
 597 }
 598
 599 static __inline __m64
 600 _mm_srai_pi32 (__m64 __m, int __count)
 601 {
 602   return (__m64) __builtin_ia32_psrad ((__v2si)__m, __count);
 603 }
 604
 605 static __inline __m64
 606 _m_psradi (__m64 __m, int __count)
 607 {
 608   return _mm_srai_pi32 (__m, __count);
 609 }
 610
 611 /* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
 612 static __inline __m64
 613 _mm_srl_pi16 (__m64 __m, __m64 __count)
 614 {
 615   return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, (long long)__count);
 616 }
 617
 618 static __inline __m64
 619 _m_psrlw (__m64 __m, __m64 __count)
 620 {
 621   return _mm_srl_pi16 (__m, __count);
 622 }
 623
 624 static __inline __m64
 625 _mm_srli_pi16 (__m64 __m, int __count)
 626 {
 627   return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, __count);
 628 }
 629
 630 static __inline __m64
 631 _m_psrlwi (__m64 __m, int __count)
 632 {
 633   return _mm_srli_pi16 (__m, __count);
 634 }
 635
 636 /* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
 637 static __inline __m64
 638 _mm_srl_pi32 (__m64 __m, __m64 __count)
 639 {
 640   return (__m64) __builtin_ia32_psrld ((__v2si)__m, (long long)__count);
 641 }
 642
 643 static __inline __m64
 644 _m_psrld (__m64 __m, __m64 __count)
 645 {
 646   return _mm_srl_pi32 (__m, __count);
 647 }
 648
 649 static __inline __m64
 650 _mm_srli_pi32 (__m64 __m, int __count)
 651 {
 652   return (__m64) __builtin_ia32_psrld ((__v2si)__m, __count);
 653 }
 654
 655 static __inline __m64
 656 _m_psrldi (__m64 __m, int __count)
 657 {
 658   return _mm_srli_pi32 (__m, __count);
 659 }
 660
 661 /* Shift the 64-bit value in M left by COUNT; shift in zeros.  */
 662 static __inline __m64
 663 _mm_srl_si64 (__m64 __m, __m64 __count)
 664 {
 665   return (__m64) __builtin_ia32_psrlq ((long long)__m, (long long)__count);
 666 }
 667
 668 static __inline __m64
 669 _m_psrlq (__m64 __m, __m64 __count)
 670 {
 671   return _mm_srl_si64 (__m, __count);
 672 }
 673
 674 static __inline __m64
 675 _mm_srli_si64 (__m64 __m, int __count)
 676 {
 677   return (__m64) __builtin_ia32_psrlq ((long long)__m, (long long)__count);
 678 }
 679
 680 static __inline __m64
 681 _m_psrlqi (__m64 __m, int __count)
 682 {
 683   return _mm_srli_si64 (__m, __count);
 684 }
 685
 686 /* Bit-wise AND the 64-bit values in M1 and M2.  */
 687 static __inline __m64
 688 _mm_and_si64 (__m64 __m1, __m64 __m2)
 689 {
 690   return (__m64) __builtin_ia32_pand ((long long)__m1, (long long)__m2);
 691 }
 692
 693 static __inline __m64
 694 _m_pand (__m64 __m1, __m64 __m2)
 695 {
 696   return _mm_and_si64 (__m1, __m2);
 697 }
 698
 699 /* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
 700    64-bit value in M2.  */
 701 static __inline __m64
 702 _mm_andnot_si64 (__m64 __m1, __m64 __m2)
 703 {
 704   return (__m64) __builtin_ia32_pandn ((long long)__m1, (long long)__m2);
 705 }
 706
 707 static __inline __m64
 708 _m_pandn (__m64 __m1, __m64 __m2)
 709 {
 710   return _mm_andnot_si64 (__m1, __m2);
 711 }
 712
 713 /* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
 714 static __inline __m64
 715 _mm_or_si64 (__m64 __m1, __m64 __m2)
 716 {
 717   return (__m64)__builtin_ia32_por ((long long)__m1, (long long)__m2);
 718 }
 719
 720 static __inline __m64
 721 _m_por (__m64 __m1, __m64 __m2)
 722 {
 723   return _mm_or_si64 (__m1, __m2);
 724 }
 725
 726 /* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
 727 static __inline __m64
 728 _mm_xor_si64 (__m64 __m1, __m64 __m2)
 729 {
 730   return (__m64)__builtin_ia32_pxor ((long long)__m1, (long long)__m2);
 731 }
 732
 733 static __inline __m64
 734 _m_pxor (__m64 __m1, __m64 __m2)
 735 {
 736   return _mm_xor_si64 (__m1, __m2);
 737 }
 738
 739 /* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
 740    test is true and zero if false.  */
 741 static __inline __m64
 742 _mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
 743 {
 744   return (__m64) __builtin_ia32_pcmpeqb ((__v8qi)__m1, (__v8qi)__m2);
 745 }
 746
 747 static __inline __m64
 748 _m_pcmpeqb (__m64 __m1, __m64 __m2)
 749 {
 750   return _mm_cmpeq_pi8 (__m1, __m2);
 751 }
 752
 753 static __inline __m64
 754 _mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
 755 {
 756   return (__m64) __builtin_ia32_pcmpgtb ((__v8qi)__m1, (__v8qi)__m2);
 757 }
 758
 759 static __inline __m64
 760 _m_pcmpgtb (__m64 __m1, __m64 __m2)
 761 {
 762   return _mm_cmpgt_pi8 (__m1, __m2);
 763 }
 764
 765 /* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
 766    the test is true and zero if false.  */
 767 static __inline __m64
 768 _mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
 769 {
 770   return (__m64) __builtin_ia32_pcmpeqw ((__v4hi)__m1, (__v4hi)__m2);
 771 }
 772
 773 static __inline __m64
 774 _m_pcmpeqw (__m64 __m1, __m64 __m2)
 775 {
 776   return _mm_cmpeq_pi16 (__m1, __m2);
 777 }
 778
 779 static __inline __m64
 780 _mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
 781 {
 782   return (__m64) __builtin_ia32_pcmpgtw ((__v4hi)__m1, (__v4hi)__m2);
 783 }
 784
 785 static __inline __m64
 786 _m_pcmpgtw (__m64 __m1, __m64 __m2)
 787 {
 788   return _mm_cmpgt_pi16 (__m1, __m2);
 789 }
 790
 791 /* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
 792    the test is true and zero if false.  */
 793 static __inline __m64
 794 _mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
 795 {
 796   return (__m64) __builtin_ia32_pcmpeqd ((__v2si)__m1, (__v2si)__m2);
 797 }
 798
 799 static __inline __m64
 800 _m_pcmpeqd (__m64 __m1, __m64 __m2)
 801 {
 802   return _mm_cmpeq_pi32 (__m1, __m2);
 803 }
 804
 805 static __inline __m64
 806 _mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
 807 {
 808   return (__m64) __builtin_ia32_pcmpgtd ((__v2si)__m1, (__v2si)__m2);
 809 }
 810
 811 static __inline __m64
 812 _m_pcmpgtd (__m64 __m1, __m64 __m2)
 813 {
 814   return _mm_cmpgt_pi32 (__m1, __m2);
 815 }
 816
 817 /* Creates a 64-bit zero.  */
 818 static __inline __m64
 819 _mm_setzero_si64 (void)
 820 {
 821   return (__m64)__builtin_ia32_mmx_zero ();
 822 }
 823
 824 /* Creates a vector of two 32-bit values; I0 is least significant.  */
 825 static __inline __m64
 826 _mm_set_pi32 (int __i1, int __i0)
 827 {
 828   union {
 829     __m64 __q;
 830     struct {
 831       unsigned int __i0;
 832       unsigned int __i1;
 833     } __s;
 834   } __u;
 835
 836   __u.__s.__i0 = __i0;
 837   __u.__s.__i1 = __i1;
 838
 839   return __u.__q;
 840 }
 841
 842 /* Creates a vector of four 16-bit values; W0 is least significant.  */
 843 static __inline __m64
 844 _mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
 845 {
 846   unsigned int __i1 = (unsigned short)__w3 << 16 | (unsigned short)__w2;
 847   unsigned int __i0 = (unsigned short)__w1 << 16 | (unsigned short)__w0;
 848   return _mm_set_pi32 (__i1, __i0);
 849
 850 }
 851
 852 /* Creates a vector of eight 8-bit values; B0 is least significant.  */
 853 static __inline __m64
 854 _mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
 855              char __b3, char __b2, char __b1, char __b0)
 856 {
 857   unsigned int __i1, __i0;
 858
 859   __i1 = (unsigned char)__b7;
 860   __i1 = __i1 << 8 | (unsigned char)__b6;
 861   __i1 = __i1 << 8 | (unsigned char)__b5;
 862   __i1 = __i1 << 8 | (unsigned char)__b4;
 863
 864   __i0 = (unsigned char)__b3;
 865   __i0 = __i0 << 8 | (unsigned char)__b2;
 866   __i0 = __i0 << 8 | (unsigned char)__b1;
 867   __i0 = __i0 << 8 | (unsigned char)__b0;
 868
 869   return _mm_set_pi32 (__i1, __i0);
 870 }
 871
 872 /* Similar, but with the arguments in reverse order.  */
 873 static __inline __m64
 874 _mm_setr_pi32 (int __i0, int __i1)
 875 {
 876   return _mm_set_pi32 (__i1, __i0);
 877 }
 878
 879 static __inline __m64
 880 _mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
 881 {
 882   return _mm_set_pi16 (__w3, __w2, __w1, __w0);
 883 }
 884
 885 static __inline __m64
 886 _mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
 887               char __b4, char __b5, char __b6, char __b7)
 888 {
 889   return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
 890 }
 891
 892 /* Creates a vector of two 32-bit values, both elements containing I.  */
 893 static __inline __m64
 894 _mm_set1_pi32 (int __i)
 895 {
 896   return _mm_set_pi32 (__i, __i);
 897 }
 898
 899 /* Creates a vector of four 16-bit values, all elements containing W.  */
 900 static __inline __m64
 901 _mm_set1_pi16 (short __w)
 902 {
 903   unsigned int __i = (unsigned short)__w << 16 | (unsigned short)__w;
 904   return _mm_set1_pi32 (__i);
 905 }
 906
 907 /* Creates a vector of eight 8-bit values, all elements containing B.  */
 908 static __inline __m64
 909 _mm_set1_pi8 (char __b)
 910 {
 911   unsigned int __w = (unsigned char)__b << 8 | (unsigned char)__b;
 912   unsigned int __i = __w << 16 | __w;
 913   return _mm_set1_pi32 (__i);
 914 }
 915
 916 #endif /* __MMX__ */
 917 #endif /* _MMINTRIN_H_INCLUDED */