1 /* Copyright (C) 2002, 2003, 2004, 2005, 2007 Free Software Foundation, Inc.
3 This file is part of GCC.
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
any later version.
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with GCC; see the file COPYING. If not, write to
17 the Free Software Foundation, 51 Franklin Street, Fifth Floor,
18 Boston, MA 02110-1301, USA. */
20 /* As a special exception, if you include this header file into source
21 files compiled by GCC, this header file does not by itself cause
22 the resulting executable to be covered by the GNU General Public
23 License. This exception does not however invalidate any other
24 reasons why the executable file might be covered by the GNU General
Public License. */
27 /* Implemented from the specification included in the Intel C++ Compiler
28 User Guide and Reference, version 8.0. */
30 #ifndef _MMINTRIN_H_INCLUDED
31 #define _MMINTRIN_H_INCLUDED
/* Refuse to continue only when the MMX instruction set is disabled.
   __MMX__ is the macro the compiler predefines when MMX code generation
   is enabled; an unguarded #error would reject every inclusion of this
   header.  The check is kept self-contained (closed by its own #endif)
   so it does not depend on any directive further down the file.  */
#ifndef __MMX__
# error "MMX instruction set not enabled"
#endif
36 /* The Intel API is flexible enough that we must allow aliasing with other
37 vector types, and their scalar components. */
38 typedef int __m64
__attribute__ ((__vector_size__ (8), __may_alias__
));
40 /* Internal data types for implementing the intrinsics. */
41 typedef int __v2si
__attribute__ ((__vector_size__ (8)));
42 typedef short __v4hi
__attribute__ ((__vector_size__ (8)));
43 typedef char __v8qi
__attribute__ ((__vector_size__ (8)));
45 /* Empty the multimedia state. */
46 static __inline
void __attribute__((__always_inline__
))
49 __builtin_ia32_emms ();
52 static __inline
void __attribute__((__always_inline__
))
58 /* Convert I to a __m64 object. The integer is zero-extended to 64 bits. */
59 static __inline __m64
__attribute__((__always_inline__
))
60 _mm_cvtsi32_si64 (int __i
)
62 return (__m64
) __builtin_ia32_vec_init_v2si (__i
, 0);
65 static __inline __m64
__attribute__((__always_inline__
))
68 return _mm_cvtsi32_si64 (__i
);
72 /* Convert I to a __m64 object. */
73 static __inline __m64
__attribute__((__always_inline__
))
74 _mm_cvtsi64x_si64 (long long __i
)
79 /* Convert I to a __m64 object. */
80 static __inline __m64
__attribute__((__always_inline__
))
81 _mm_set_pi64x (long long __i
)
87 /* Convert the lower 32 bits of the __m64 object into an integer. */
88 static __inline
int __attribute__((__always_inline__
))
89 _mm_cvtsi64_si32 (__m64 __i
)
91 return __builtin_ia32_vec_ext_v2si ((__v2si
)__i
, 0);
94 static __inline
int __attribute__((__always_inline__
))
97 return _mm_cvtsi64_si32 (__i
);
101 /* Convert the __m64 object to a 64-bit integer. */
102 static __inline
long long __attribute__((__always_inline__
))
103 _mm_cvtsi64_si64x (__m64 __i
)
105 return (long long)__i
;
109 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
110 the result, and the four 16-bit values from M2 into the upper four 8-bit
111 values of the result, all with signed saturation. */
112 static __inline __m64
__attribute__((__always_inline__
))
113 _mm_packs_pi16 (__m64 __m1
, __m64 __m2
)
115 return (__m64
) __builtin_ia32_packsswb ((__v4hi
)__m1
, (__v4hi
)__m2
);
118 static __inline __m64
__attribute__((__always_inline__
))
119 _m_packsswb (__m64 __m1
, __m64 __m2
)
121 return _mm_packs_pi16 (__m1
, __m2
);
124 /* Pack the two 32-bit values from M1 into the lower two 16-bit values of
125 the result, and the two 32-bit values from M2 into the upper two 16-bit
126 values of the result, all with signed saturation. */
127 static __inline __m64
__attribute__((__always_inline__
))
128 _mm_packs_pi32 (__m64 __m1
, __m64 __m2
)
130 return (__m64
) __builtin_ia32_packssdw ((__v2si
)__m1
, (__v2si
)__m2
);
133 static __inline __m64
__attribute__((__always_inline__
))
134 _m_packssdw (__m64 __m1
, __m64 __m2
)
136 return _mm_packs_pi32 (__m1
, __m2
);
139 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
140 the result, and the four 16-bit values from M2 into the upper four 8-bit
141 values of the result, all with unsigned saturation. */
142 static __inline __m64
__attribute__((__always_inline__
))
143 _mm_packs_pu16 (__m64 __m1
, __m64 __m2
)
145 return (__m64
) __builtin_ia32_packuswb ((__v4hi
)__m1
, (__v4hi
)__m2
);
148 static __inline __m64
__attribute__((__always_inline__
))
149 _m_packuswb (__m64 __m1
, __m64 __m2
)
151 return _mm_packs_pu16 (__m1
, __m2
);
154 /* Interleave the four 8-bit values from the high half of M1 with the four
155 8-bit values from the high half of M2. */
156 static __inline __m64
__attribute__((__always_inline__
))
157 _mm_unpackhi_pi8 (__m64 __m1
, __m64 __m2
)
159 return (__m64
) __builtin_ia32_punpckhbw ((__v8qi
)__m1
, (__v8qi
)__m2
);
162 static __inline __m64
__attribute__((__always_inline__
))
163 _m_punpckhbw (__m64 __m1
, __m64 __m2
)
165 return _mm_unpackhi_pi8 (__m1
, __m2
);
168 /* Interleave the two 16-bit values from the high half of M1 with the two
169 16-bit values from the high half of M2. */
170 static __inline __m64
__attribute__((__always_inline__
))
171 _mm_unpackhi_pi16 (__m64 __m1
, __m64 __m2
)
173 return (__m64
) __builtin_ia32_punpckhwd ((__v4hi
)__m1
, (__v4hi
)__m2
);
176 static __inline __m64
__attribute__((__always_inline__
))
177 _m_punpckhwd (__m64 __m1
, __m64 __m2
)
179 return _mm_unpackhi_pi16 (__m1
, __m2
);
182 /* Interleave the 32-bit value from the high half of M1 with the 32-bit
183 value from the high half of M2. */
184 static __inline __m64
__attribute__((__always_inline__
))
185 _mm_unpackhi_pi32 (__m64 __m1
, __m64 __m2
)
187 return (__m64
) __builtin_ia32_punpckhdq ((__v2si
)__m1
, (__v2si
)__m2
);
190 static __inline __m64
__attribute__((__always_inline__
))
191 _m_punpckhdq (__m64 __m1
, __m64 __m2
)
193 return _mm_unpackhi_pi32 (__m1
, __m2
);
196 /* Interleave the four 8-bit values from the low half of M1 with the four
197 8-bit values from the low half of M2. */
198 static __inline __m64
__attribute__((__always_inline__
))
199 _mm_unpacklo_pi8 (__m64 __m1
, __m64 __m2
)
201 return (__m64
) __builtin_ia32_punpcklbw ((__v8qi
)__m1
, (__v8qi
)__m2
);
204 static __inline __m64
__attribute__((__always_inline__
))
205 _m_punpcklbw (__m64 __m1
, __m64 __m2
)
207 return _mm_unpacklo_pi8 (__m1
, __m2
);
210 /* Interleave the two 16-bit values from the low half of M1 with the two
211 16-bit values from the low half of M2. */
212 static __inline __m64
__attribute__((__always_inline__
))
213 _mm_unpacklo_pi16 (__m64 __m1
, __m64 __m2
)
215 return (__m64
) __builtin_ia32_punpcklwd ((__v4hi
)__m1
, (__v4hi
)__m2
);
218 static __inline __m64
__attribute__((__always_inline__
))
219 _m_punpcklwd (__m64 __m1
, __m64 __m2
)
221 return _mm_unpacklo_pi16 (__m1
, __m2
);
224 /* Interleave the 32-bit value from the low half of M1 with the 32-bit
225 value from the low half of M2. */
226 static __inline __m64
__attribute__((__always_inline__
))
227 _mm_unpacklo_pi32 (__m64 __m1
, __m64 __m2
)
229 return (__m64
) __builtin_ia32_punpckldq ((__v2si
)__m1
, (__v2si
)__m2
);
232 static __inline __m64
__attribute__((__always_inline__
))
233 _m_punpckldq (__m64 __m1
, __m64 __m2
)
235 return _mm_unpacklo_pi32 (__m1
, __m2
);
238 /* Add the 8-bit values in M1 to the 8-bit values in M2. */
239 static __inline __m64
__attribute__((__always_inline__
))
240 _mm_add_pi8 (__m64 __m1
, __m64 __m2
)
242 return (__m64
) __builtin_ia32_paddb ((__v8qi
)__m1
, (__v8qi
)__m2
);
245 static __inline __m64
__attribute__((__always_inline__
))
246 _m_paddb (__m64 __m1
, __m64 __m2
)
248 return _mm_add_pi8 (__m1
, __m2
);
251 /* Add the 16-bit values in M1 to the 16-bit values in M2. */
252 static __inline __m64
__attribute__((__always_inline__
))
253 _mm_add_pi16 (__m64 __m1
, __m64 __m2
)
255 return (__m64
) __builtin_ia32_paddw ((__v4hi
)__m1
, (__v4hi
)__m2
);
258 static __inline __m64
__attribute__((__always_inline__
))
259 _m_paddw (__m64 __m1
, __m64 __m2
)
261 return _mm_add_pi16 (__m1
, __m2
);
264 /* Add the 32-bit values in M1 to the 32-bit values in M2. */
265 static __inline __m64
__attribute__((__always_inline__
))
266 _mm_add_pi32 (__m64 __m1
, __m64 __m2
)
268 return (__m64
) __builtin_ia32_paddd ((__v2si
)__m1
, (__v2si
)__m2
);
271 static __inline __m64
__attribute__((__always_inline__
))
272 _m_paddd (__m64 __m1
, __m64 __m2
)
274 return _mm_add_pi32 (__m1
, __m2
);
277 /* Add the 64-bit value in M1 to the 64-bit value in M2. */
278 static __inline __m64
__attribute__((__always_inline__
))
279 _mm_add_si64 (__m64 __m1
, __m64 __m2
)
281 return (__m64
) __builtin_ia32_paddq ((long long)__m1
, (long long)__m2
);
284 /* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
285 saturated arithmetic. */
286 static __inline __m64
__attribute__((__always_inline__
))
287 _mm_adds_pi8 (__m64 __m1
, __m64 __m2
)
289 return (__m64
) __builtin_ia32_paddsb ((__v8qi
)__m1
, (__v8qi
)__m2
);
292 static __inline __m64
__attribute__((__always_inline__
))
293 _m_paddsb (__m64 __m1
, __m64 __m2
)
295 return _mm_adds_pi8 (__m1
, __m2
);
298 /* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
299 saturated arithmetic. */
300 static __inline __m64
__attribute__((__always_inline__
))
301 _mm_adds_pi16 (__m64 __m1
, __m64 __m2
)
303 return (__m64
) __builtin_ia32_paddsw ((__v4hi
)__m1
, (__v4hi
)__m2
);
306 static __inline __m64
__attribute__((__always_inline__
))
307 _m_paddsw (__m64 __m1
, __m64 __m2
)
309 return _mm_adds_pi16 (__m1
, __m2
);
312 /* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
313 saturated arithmetic. */
314 static __inline __m64
__attribute__((__always_inline__
))
315 _mm_adds_pu8 (__m64 __m1
, __m64 __m2
)
317 return (__m64
) __builtin_ia32_paddusb ((__v8qi
)__m1
, (__v8qi
)__m2
);
320 static __inline __m64
__attribute__((__always_inline__
))
321 _m_paddusb (__m64 __m1
, __m64 __m2
)
323 return _mm_adds_pu8 (__m1
, __m2
);
326 /* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
327 saturated arithmetic. */
328 static __inline __m64
__attribute__((__always_inline__
))
329 _mm_adds_pu16 (__m64 __m1
, __m64 __m2
)
331 return (__m64
) __builtin_ia32_paddusw ((__v4hi
)__m1
, (__v4hi
)__m2
);
334 static __inline __m64
__attribute__((__always_inline__
))
335 _m_paddusw (__m64 __m1
, __m64 __m2
)
337 return _mm_adds_pu16 (__m1
, __m2
);
340 /* Subtract the 8-bit values in M2 from the 8-bit values in M1. */
341 static __inline __m64
__attribute__((__always_inline__
))
342 _mm_sub_pi8 (__m64 __m1
, __m64 __m2
)
344 return (__m64
) __builtin_ia32_psubb ((__v8qi
)__m1
, (__v8qi
)__m2
);
347 static __inline __m64
__attribute__((__always_inline__
))
348 _m_psubb (__m64 __m1
, __m64 __m2
)
350 return _mm_sub_pi8 (__m1
, __m2
);
353 /* Subtract the 16-bit values in M2 from the 16-bit values in M1. */
354 static __inline __m64
__attribute__((__always_inline__
))
355 _mm_sub_pi16 (__m64 __m1
, __m64 __m2
)
357 return (__m64
) __builtin_ia32_psubw ((__v4hi
)__m1
, (__v4hi
)__m2
);
360 static __inline __m64
__attribute__((__always_inline__
))
361 _m_psubw (__m64 __m1
, __m64 __m2
)
363 return _mm_sub_pi16 (__m1
, __m2
);
366 /* Subtract the 32-bit values in M2 from the 32-bit values in M1. */
367 static __inline __m64
__attribute__((__always_inline__
))
368 _mm_sub_pi32 (__m64 __m1
, __m64 __m2
)
370 return (__m64
) __builtin_ia32_psubd ((__v2si
)__m1
, (__v2si
)__m2
);
373 static __inline __m64
__attribute__((__always_inline__
))
374 _m_psubd (__m64 __m1
, __m64 __m2
)
376 return _mm_sub_pi32 (__m1
, __m2
);
379 /* Subtract the 64-bit value in M2 from the 64-bit value in M1. */
380 static __inline __m64
__attribute__((__always_inline__
))
381 _mm_sub_si64 (__m64 __m1
, __m64 __m2
)
383 return (__m64
) __builtin_ia32_psubq ((long long)__m1
, (long long)__m2
);
386 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
387 saturating arithmetic. */
388 static __inline __m64
__attribute__((__always_inline__
))
389 _mm_subs_pi8 (__m64 __m1
, __m64 __m2
)
391 return (__m64
) __builtin_ia32_psubsb ((__v8qi
)__m1
, (__v8qi
)__m2
);
394 static __inline __m64
__attribute__((__always_inline__
))
395 _m_psubsb (__m64 __m1
, __m64 __m2
)
397 return _mm_subs_pi8 (__m1
, __m2
);
400 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
401 signed saturating arithmetic. */
402 static __inline __m64
__attribute__((__always_inline__
))
403 _mm_subs_pi16 (__m64 __m1
, __m64 __m2
)
405 return (__m64
) __builtin_ia32_psubsw ((__v4hi
)__m1
, (__v4hi
)__m2
);
408 static __inline __m64
__attribute__((__always_inline__
))
409 _m_psubsw (__m64 __m1
, __m64 __m2
)
411 return _mm_subs_pi16 (__m1
, __m2
);
414 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
415 unsigned saturating arithmetic. */
416 static __inline __m64
__attribute__((__always_inline__
))
417 _mm_subs_pu8 (__m64 __m1
, __m64 __m2
)
419 return (__m64
) __builtin_ia32_psubusb ((__v8qi
)__m1
, (__v8qi
)__m2
);
422 static __inline __m64
__attribute__((__always_inline__
))
423 _m_psubusb (__m64 __m1
, __m64 __m2
)
425 return _mm_subs_pu8 (__m1
, __m2
);
428 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
429 unsigned saturating arithmetic. */
430 static __inline __m64
__attribute__((__always_inline__
))
431 _mm_subs_pu16 (__m64 __m1
, __m64 __m2
)
433 return (__m64
) __builtin_ia32_psubusw ((__v4hi
)__m1
, (__v4hi
)__m2
);
436 static __inline __m64
__attribute__((__always_inline__
))
437 _m_psubusw (__m64 __m1
, __m64 __m2
)
439 return _mm_subs_pu16 (__m1
, __m2
);
442 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
443 four 32-bit intermediate results, which are then summed by pairs to
444 produce two 32-bit results. */
445 static __inline __m64
__attribute__((__always_inline__
))
446 _mm_madd_pi16 (__m64 __m1
, __m64 __m2
)
448 return (__m64
) __builtin_ia32_pmaddwd ((__v4hi
)__m1
, (__v4hi
)__m2
);
451 static __inline __m64
__attribute__((__always_inline__
))
452 _m_pmaddwd (__m64 __m1
, __m64 __m2
)
454 return _mm_madd_pi16 (__m1
, __m2
);
457 /* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
458 M2 and produce the high 16 bits of the 32-bit results. */
459 static __inline __m64
__attribute__((__always_inline__
))
460 _mm_mulhi_pi16 (__m64 __m1
, __m64 __m2
)
462 return (__m64
) __builtin_ia32_pmulhw ((__v4hi
)__m1
, (__v4hi
)__m2
);
465 static __inline __m64
__attribute__((__always_inline__
))
466 _m_pmulhw (__m64 __m1
, __m64 __m2
)
468 return _mm_mulhi_pi16 (__m1
, __m2
);
471 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
472 the low 16 bits of the results. */
473 static __inline __m64
__attribute__((__always_inline__
))
474 _mm_mullo_pi16 (__m64 __m1
, __m64 __m2
)
476 return (__m64
) __builtin_ia32_pmullw ((__v4hi
)__m1
, (__v4hi
)__m2
);
479 static __inline __m64
__attribute__((__always_inline__
))
480 _m_pmullw (__m64 __m1
, __m64 __m2
)
482 return _mm_mullo_pi16 (__m1
, __m2
);
485 /* Shift four 16-bit values in M left by COUNT. */
486 static __inline __m64
__attribute__((__always_inline__
))
487 _mm_sll_pi16 (__m64 __m
, __m64 __count
)
489 return (__m64
) __builtin_ia32_psllw ((__v4hi
)__m
, (long long)__count
);
492 static __inline __m64
__attribute__((__always_inline__
))
493 _m_psllw (__m64 __m
, __m64 __count
)
495 return _mm_sll_pi16 (__m
, __count
);
498 static __inline __m64
__attribute__((__always_inline__
))
499 _mm_slli_pi16 (__m64 __m
, int __count
)
501 return (__m64
) __builtin_ia32_psllw ((__v4hi
)__m
, __count
);
504 static __inline __m64
__attribute__((__always_inline__
))
505 _m_psllwi (__m64 __m
, int __count
)
507 return _mm_slli_pi16 (__m
, __count
);
510 /* Shift two 32-bit values in M left by COUNT. */
511 static __inline __m64
__attribute__((__always_inline__
))
512 _mm_sll_pi32 (__m64 __m
, __m64 __count
)
514 return (__m64
) __builtin_ia32_pslld ((__v2si
)__m
, (long long)__count
);
517 static __inline __m64
__attribute__((__always_inline__
))
518 _m_pslld (__m64 __m
, __m64 __count
)
520 return _mm_sll_pi32 (__m
, __count
);
523 static __inline __m64
__attribute__((__always_inline__
))
524 _mm_slli_pi32 (__m64 __m
, int __count
)
526 return (__m64
) __builtin_ia32_pslld ((__v2si
)__m
, __count
);
529 static __inline __m64
__attribute__((__always_inline__
))
530 _m_pslldi (__m64 __m
, int __count
)
532 return _mm_slli_pi32 (__m
, __count
);
535 /* Shift the 64-bit value in M left by COUNT. */
536 static __inline __m64
__attribute__((__always_inline__
))
537 _mm_sll_si64 (__m64 __m
, __m64 __count
)
539 return (__m64
) __builtin_ia32_psllq ((long long)__m
, (long long)__count
);
542 static __inline __m64
__attribute__((__always_inline__
))
543 _m_psllq (__m64 __m
, __m64 __count
)
545 return _mm_sll_si64 (__m
, __count
);
548 static __inline __m64
__attribute__((__always_inline__
))
549 _mm_slli_si64 (__m64 __m
, int __count
)
551 return (__m64
) __builtin_ia32_psllq ((long long)__m
, (long long)__count
);
554 static __inline __m64
__attribute__((__always_inline__
))
555 _m_psllqi (__m64 __m
, int __count
)
557 return _mm_slli_si64 (__m
, __count
);
560 /* Shift four 16-bit values in M right by COUNT; shift in the sign bit. */
561 static __inline __m64
__attribute__((__always_inline__
))
562 _mm_sra_pi16 (__m64 __m
, __m64 __count
)
564 return (__m64
) __builtin_ia32_psraw ((__v4hi
)__m
, (long long)__count
);
567 static __inline __m64
__attribute__((__always_inline__
))
568 _m_psraw (__m64 __m
, __m64 __count
)
570 return _mm_sra_pi16 (__m
, __count
);
573 static __inline __m64
__attribute__((__always_inline__
))
574 _mm_srai_pi16 (__m64 __m
, int __count
)
576 return (__m64
) __builtin_ia32_psraw ((__v4hi
)__m
, __count
);
579 static __inline __m64
__attribute__((__always_inline__
))
580 _m_psrawi (__m64 __m
, int __count
)
582 return _mm_srai_pi16 (__m
, __count
);
585 /* Shift two 32-bit values in M right by COUNT; shift in the sign bit. */
586 static __inline __m64
__attribute__((__always_inline__
))
587 _mm_sra_pi32 (__m64 __m
, __m64 __count
)
589 return (__m64
) __builtin_ia32_psrad ((__v2si
)__m
, (long long)__count
);
592 static __inline __m64
__attribute__((__always_inline__
))
593 _m_psrad (__m64 __m
, __m64 __count
)
595 return _mm_sra_pi32 (__m
, __count
);
598 static __inline __m64
__attribute__((__always_inline__
))
599 _mm_srai_pi32 (__m64 __m
, int __count
)
601 return (__m64
) __builtin_ia32_psrad ((__v2si
)__m
, __count
);
604 static __inline __m64
__attribute__((__always_inline__
))
605 _m_psradi (__m64 __m
, int __count
)
607 return _mm_srai_pi32 (__m
, __count
);
610 /* Shift four 16-bit values in M right by COUNT; shift in zeros. */
611 static __inline __m64
__attribute__((__always_inline__
))
612 _mm_srl_pi16 (__m64 __m
, __m64 __count
)
614 return (__m64
) __builtin_ia32_psrlw ((__v4hi
)__m
, (long long)__count
);
617 static __inline __m64
__attribute__((__always_inline__
))
618 _m_psrlw (__m64 __m
, __m64 __count
)
620 return _mm_srl_pi16 (__m
, __count
);
623 static __inline __m64
__attribute__((__always_inline__
))
624 _mm_srli_pi16 (__m64 __m
, int __count
)
626 return (__m64
) __builtin_ia32_psrlw ((__v4hi
)__m
, __count
);
629 static __inline __m64
__attribute__((__always_inline__
))
630 _m_psrlwi (__m64 __m
, int __count
)
632 return _mm_srli_pi16 (__m
, __count
);
635 /* Shift two 32-bit values in M right by COUNT; shift in zeros. */
636 static __inline __m64
__attribute__((__always_inline__
))
637 _mm_srl_pi32 (__m64 __m
, __m64 __count
)
639 return (__m64
) __builtin_ia32_psrld ((__v2si
)__m
, (long long)__count
);
642 static __inline __m64
__attribute__((__always_inline__
))
643 _m_psrld (__m64 __m
, __m64 __count
)
645 return _mm_srl_pi32 (__m
, __count
);
648 static __inline __m64
__attribute__((__always_inline__
))
649 _mm_srli_pi32 (__m64 __m
, int __count
)
651 return (__m64
) __builtin_ia32_psrld ((__v2si
)__m
, __count
);
654 static __inline __m64
__attribute__((__always_inline__
))
655 _m_psrldi (__m64 __m
, int __count
)
657 return _mm_srli_pi32 (__m
, __count
);
660 /* Shift the 64-bit value in M right by COUNT; shift in zeros. */
661 static __inline __m64
__attribute__((__always_inline__
))
662 _mm_srl_si64 (__m64 __m
, __m64 __count
)
664 return (__m64
) __builtin_ia32_psrlq ((long long)__m
, (long long)__count
);
667 static __inline __m64
__attribute__((__always_inline__
))
668 _m_psrlq (__m64 __m
, __m64 __count
)
670 return _mm_srl_si64 (__m
, __count
);
673 static __inline __m64
__attribute__((__always_inline__
))
674 _mm_srli_si64 (__m64 __m
, int __count
)
676 return (__m64
) __builtin_ia32_psrlq ((long long)__m
, (long long)__count
);
679 static __inline __m64
__attribute__((__always_inline__
))
680 _m_psrlqi (__m64 __m
, int __count
)
682 return _mm_srli_si64 (__m
, __count
);
685 /* Bit-wise AND the 64-bit values in M1 and M2. */
686 static __inline __m64
__attribute__((__always_inline__
))
687 _mm_and_si64 (__m64 __m1
, __m64 __m2
)
689 return __builtin_ia32_pand (__m1
, __m2
);
692 static __inline __m64
__attribute__((__always_inline__
))
693 _m_pand (__m64 __m1
, __m64 __m2
)
695 return _mm_and_si64 (__m1
, __m2
);
698 /* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
699 64-bit value in M2. */
700 static __inline __m64
__attribute__((__always_inline__
))
701 _mm_andnot_si64 (__m64 __m1
, __m64 __m2
)
703 return __builtin_ia32_pandn (__m1
, __m2
);
706 static __inline __m64
__attribute__((__always_inline__
))
707 _m_pandn (__m64 __m1
, __m64 __m2
)
709 return _mm_andnot_si64 (__m1
, __m2
);
712 /* Bit-wise inclusive OR the 64-bit values in M1 and M2. */
713 static __inline __m64
__attribute__((__always_inline__
))
714 _mm_or_si64 (__m64 __m1
, __m64 __m2
)
716 return __builtin_ia32_por (__m1
, __m2
);
719 static __inline __m64
__attribute__((__always_inline__
))
720 _m_por (__m64 __m1
, __m64 __m2
)
722 return _mm_or_si64 (__m1
, __m2
);
725 /* Bit-wise exclusive OR the 64-bit values in M1 and M2. */
726 static __inline __m64
__attribute__((__always_inline__
))
727 _mm_xor_si64 (__m64 __m1
, __m64 __m2
)
729 return __builtin_ia32_pxor (__m1
, __m2
);
732 static __inline __m64
__attribute__((__always_inline__
))
733 _m_pxor (__m64 __m1
, __m64 __m2
)
735 return _mm_xor_si64 (__m1
, __m2
);
738 /* Compare eight 8-bit values. The result of the comparison is 0xFF if the
739 test is true and zero if false. */
740 static __inline __m64
__attribute__((__always_inline__
))
741 _mm_cmpeq_pi8 (__m64 __m1
, __m64 __m2
)
743 return (__m64
) __builtin_ia32_pcmpeqb ((__v8qi
)__m1
, (__v8qi
)__m2
);
746 static __inline __m64
__attribute__((__always_inline__
))
747 _m_pcmpeqb (__m64 __m1
, __m64 __m2
)
749 return _mm_cmpeq_pi8 (__m1
, __m2
);
752 static __inline __m64
__attribute__((__always_inline__
))
753 _mm_cmpgt_pi8 (__m64 __m1
, __m64 __m2
)
755 return (__m64
) __builtin_ia32_pcmpgtb ((__v8qi
)__m1
, (__v8qi
)__m2
);
758 static __inline __m64
__attribute__((__always_inline__
))
759 _m_pcmpgtb (__m64 __m1
, __m64 __m2
)
761 return _mm_cmpgt_pi8 (__m1
, __m2
);
764 /* Compare four 16-bit values. The result of the comparison is 0xFFFF if
765 the test is true and zero if false. */
766 static __inline __m64
__attribute__((__always_inline__
))
767 _mm_cmpeq_pi16 (__m64 __m1
, __m64 __m2
)
769 return (__m64
) __builtin_ia32_pcmpeqw ((__v4hi
)__m1
, (__v4hi
)__m2
);
772 static __inline __m64
__attribute__((__always_inline__
))
773 _m_pcmpeqw (__m64 __m1
, __m64 __m2
)
775 return _mm_cmpeq_pi16 (__m1
, __m2
);
778 static __inline __m64
__attribute__((__always_inline__
))
779 _mm_cmpgt_pi16 (__m64 __m1
, __m64 __m2
)
781 return (__m64
) __builtin_ia32_pcmpgtw ((__v4hi
)__m1
, (__v4hi
)__m2
);
784 static __inline __m64
__attribute__((__always_inline__
))
785 _m_pcmpgtw (__m64 __m1
, __m64 __m2
)
787 return _mm_cmpgt_pi16 (__m1
, __m2
);
790 /* Compare two 32-bit values. The result of the comparison is 0xFFFFFFFF if
791 the test is true and zero if false. */
792 static __inline __m64
__attribute__((__always_inline__
))
793 _mm_cmpeq_pi32 (__m64 __m1
, __m64 __m2
)
795 return (__m64
) __builtin_ia32_pcmpeqd ((__v2si
)__m1
, (__v2si
)__m2
);
798 static __inline __m64
__attribute__((__always_inline__
))
799 _m_pcmpeqd (__m64 __m1
, __m64 __m2
)
801 return _mm_cmpeq_pi32 (__m1
, __m2
);
804 static __inline __m64
__attribute__((__always_inline__
))
805 _mm_cmpgt_pi32 (__m64 __m1
, __m64 __m2
)
807 return (__m64
) __builtin_ia32_pcmpgtd ((__v2si
)__m1
, (__v2si
)__m2
);
810 static __inline __m64
__attribute__((__always_inline__
))
811 _m_pcmpgtd (__m64 __m1
, __m64 __m2
)
813 return _mm_cmpgt_pi32 (__m1
, __m2
);
816 /* Creates a 64-bit zero. */
817 static __inline __m64
__attribute__((__always_inline__
))
818 _mm_setzero_si64 (void)
823 /* Creates a vector of two 32-bit values; I0 is least significant. */
824 static __inline __m64
__attribute__((__always_inline__
))
825 _mm_set_pi32 (int __i1
, int __i0
)
827 return (__m64
) __builtin_ia32_vec_init_v2si (__i0
, __i1
);
830 /* Creates a vector of four 16-bit values; W0 is least significant. */
831 static __inline __m64
__attribute__((__always_inline__
))
832 _mm_set_pi16 (short __w3
, short __w2
, short __w1
, short __w0
)
834 return (__m64
) __builtin_ia32_vec_init_v4hi (__w0
, __w1
, __w2
, __w3
);
837 /* Creates a vector of eight 8-bit values; B0 is least significant. */
838 static __inline __m64
__attribute__((__always_inline__
))
839 _mm_set_pi8 (char __b7
, char __b6
, char __b5
, char __b4
,
840 char __b3
, char __b2
, char __b1
, char __b0
)
842 return (__m64
) __builtin_ia32_vec_init_v8qi (__b0
, __b1
, __b2
, __b3
,
843 __b4
, __b5
, __b6
, __b7
);
846 /* Similar, but with the arguments in reverse order. */
847 static __inline __m64
__attribute__((__always_inline__
))
848 _mm_setr_pi32 (int __i0
, int __i1
)
850 return _mm_set_pi32 (__i1
, __i0
);
853 static __inline __m64
__attribute__((__always_inline__
))
854 _mm_setr_pi16 (short __w0
, short __w1
, short __w2
, short __w3
)
856 return _mm_set_pi16 (__w3
, __w2
, __w1
, __w0
);
859 static __inline __m64
__attribute__((__always_inline__
))
860 _mm_setr_pi8 (char __b0
, char __b1
, char __b2
, char __b3
,
861 char __b4
, char __b5
, char __b6
, char __b7
)
863 return _mm_set_pi8 (__b7
, __b6
, __b5
, __b4
, __b3
, __b2
, __b1
, __b0
);
866 /* Creates a vector of two 32-bit values, both elements containing I. */
867 static __inline __m64
__attribute__((__always_inline__
))
868 _mm_set1_pi32 (int __i
)
870 return _mm_set_pi32 (__i
, __i
);
873 /* Creates a vector of four 16-bit values, all elements containing W. */
874 static __inline __m64
__attribute__((__always_inline__
))
875 _mm_set1_pi16 (short __w
)
877 return _mm_set_pi16 (__w
, __w
, __w
, __w
);
880 /* Creates a vector of eight 8-bit values, all elements containing B. */
881 static __inline __m64
__attribute__((__always_inline__
))
882 _mm_set1_pi8 (char __b
)
884 return _mm_set_pi8 (__b
, __b
, __b
, __b
, __b
, __b
, __b
, __b
);
888 #endif /* _MMINTRIN_H_INCLUDED */