/*
 * MMX/3DNow!/SSE/SSE2/SSE3/PNI support
 *
 * Copyright (c) 2005 Fabrice Bellard
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
#if SHIFT == 0
#define XMM_ONLY(x...)
#else
#define XMM_ONLY(x...) x
#endif
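/* This header is compiled twice: with SHIFT == 0 it generates the 64-bit
   MMX helpers (Reg is the MMX register type, SUFFIX is _mmx), and with
   SHIFT == 1 it generates the 128-bit SSE helpers (Reg is the XMM register
   type, SUFFIX is _xmm).  XMM_ONLY(x) expands to x only in the SSE build,
   so code wrapped in it operates on the upper half that MMX lacks. */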
void glue(helper_psrlw, SUFFIX)(Reg *d, Reg *s)
void glue(helper_psraw, SUFFIX)(Reg *d, Reg *s)
{
    int shift;

    /* arithmetic word shifts saturate the shift count at 15 */
    if (s->Q(0) > 15) {
        shift = 15;
    } else {
        shift = s->B(0);
    }
    d->W(0) = (int16_t)d->W(0) >> shift;
    d->W(1) = (int16_t)d->W(1) >> shift;
    d->W(2) = (int16_t)d->W(2) >> shift;
    d->W(3) = (int16_t)d->W(3) >> shift;
#if SHIFT == 1
    d->W(4) = (int16_t)d->W(4) >> shift;
    d->W(5) = (int16_t)d->W(5) >> shift;
    d->W(6) = (int16_t)d->W(6) >> shift;
    d->W(7) = (int16_t)d->W(7) >> shift;
#endif
}
void glue(helper_psllw, SUFFIX)(Reg *d, Reg *s)

void glue(helper_psrld, SUFFIX)(Reg *d, Reg *s)
void glue(helper_psrad, SUFFIX)(Reg *d, Reg *s)
{
    int shift;

    /* arithmetic doubleword shifts saturate the shift count at 31 */
    if (s->Q(0) > 31) {
        shift = 31;
    } else {
        shift = s->B(0);
    }
    d->L(0) = (int32_t)d->L(0) >> shift;
    d->L(1) = (int32_t)d->L(1) >> shift;
#if SHIFT == 1
    d->L(2) = (int32_t)d->L(2) >> shift;
    d->L(3) = (int32_t)d->L(3) >> shift;
#endif
}
void glue(helper_pslld, SUFFIX)(Reg *d, Reg *s)

void glue(helper_psrlq, SUFFIX)(Reg *d, Reg *s)

void glue(helper_psllq, SUFFIX)(Reg *d, Reg *s)
void glue(helper_psrldq, SUFFIX)(Reg *d, Reg *s)
{
    int shift, i;

    shift = s->L(0);
    if (shift > 16)
        shift = 16;
    for(i = 0; i < 16 - shift; i++)
        d->B(i) = d->B(i + shift);
    for(i = 16 - shift; i < 16; i++)
        d->B(i) = 0;
}
void glue(helper_pslldq, SUFFIX)(Reg *d, Reg *s)
{
    int shift, i;

    shift = s->L(0);
    if (shift > 16)
        shift = 16;
    for(i = 15; i >= shift; i--)
        d->B(i) = d->B(i - shift);
    for(i = 0; i < shift; i++)
        d->B(i) = 0;
}
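/* The SSE_HELPER_* macros below expand to one helper that applies the
   two-operand macro F to every byte (B), word (W), doubleword (L) or
   quadword (Q) lane of d and s; the XMM_ONLY() part covers the upper
   half that only exists in the 128-bit case. */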
#define SSE_HELPER_B(name, F)\
void glue(name, SUFFIX) (Reg *d, Reg *s)\
{\
    d->B(0) = F(d->B(0), s->B(0));\
    d->B(1) = F(d->B(1), s->B(1));\
    d->B(2) = F(d->B(2), s->B(2));\
    d->B(3) = F(d->B(3), s->B(3));\
    d->B(4) = F(d->B(4), s->B(4));\
    d->B(5) = F(d->B(5), s->B(5));\
    d->B(6) = F(d->B(6), s->B(6));\
    d->B(7) = F(d->B(7), s->B(7));\
    XMM_ONLY(\
    d->B(8) = F(d->B(8), s->B(8));\
    d->B(9) = F(d->B(9), s->B(9));\
    d->B(10) = F(d->B(10), s->B(10));\
    d->B(11) = F(d->B(11), s->B(11));\
    d->B(12) = F(d->B(12), s->B(12));\
    d->B(13) = F(d->B(13), s->B(13));\
    d->B(14) = F(d->B(14), s->B(14));\
    d->B(15) = F(d->B(15), s->B(15));\
    )\
}

#define SSE_HELPER_W(name, F)\
void glue(name, SUFFIX) (Reg *d, Reg *s)\
{\
    d->W(0) = F(d->W(0), s->W(0));\
    d->W(1) = F(d->W(1), s->W(1));\
    d->W(2) = F(d->W(2), s->W(2));\
    d->W(3) = F(d->W(3), s->W(3));\
    XMM_ONLY(\
    d->W(4) = F(d->W(4), s->W(4));\
    d->W(5) = F(d->W(5), s->W(5));\
    d->W(6) = F(d->W(6), s->W(6));\
    d->W(7) = F(d->W(7), s->W(7));\
    )\
}

#define SSE_HELPER_L(name, F)\
void glue(name, SUFFIX) (Reg *d, Reg *s)\
{\
    d->L(0) = F(d->L(0), s->L(0));\
    d->L(1) = F(d->L(1), s->L(1));\
    XMM_ONLY(\
    d->L(2) = F(d->L(2), s->L(2));\
    d->L(3) = F(d->L(3), s->L(3));\
    )\
}

#define SSE_HELPER_Q(name, F)\
void glue(name, SUFFIX) (Reg *d, Reg *s)\
{\
    d->Q(0) = F(d->Q(0), s->Q(0));\
    XMM_ONLY(\
    d->Q(1) = F(d->Q(1), s->Q(1));\
    )\
}
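/* Saturation helpers: satub clamps its argument to the unsigned byte range
   [0, 255], satuw to the unsigned word range [0, 65535], satsb to the
   signed byte range [-128, 127] and satsw to the signed word range
   [-32768, 32767]. */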
static inline int satub(int x)

static inline int satuw(int x)

static inline int satsb(int x)

static inline int satsw(int x)

#define FADD(a, b) ((a) + (b))
#define FADDUB(a, b) satub((a) + (b))
#define FADDUW(a, b) satuw((a) + (b))
#define FADDSB(a, b) satsb((int8_t)(a) + (int8_t)(b))
#define FADDSW(a, b) satsw((int16_t)(a) + (int16_t)(b))

#define FSUB(a, b) ((a) - (b))
#define FSUBUB(a, b) satub((a) - (b))
#define FSUBUW(a, b) satuw((a) - (b))
#define FSUBSB(a, b) satsb((int8_t)(a) - (int8_t)(b))
#define FSUBSW(a, b) satsw((int16_t)(a) - (int16_t)(b))
#define FMINUB(a, b) ((a) < (b)) ? (a) : (b)
#define FMINSW(a, b) ((int16_t)(a) < (int16_t)(b)) ? (a) : (b)
#define FMAXUB(a, b) ((a) > (b)) ? (a) : (b)
#define FMAXSW(a, b) ((int16_t)(a) > (int16_t)(b)) ? (a) : (b)

#define FAND(a, b) (a) & (b)
#define FANDN(a, b) ((~(a)) & (b))
#define FOR(a, b) (a) | (b)
#define FXOR(a, b) (a) ^ (b)

#define FCMPGTB(a, b) (int8_t)(a) > (int8_t)(b) ? -1 : 0
#define FCMPGTW(a, b) (int16_t)(a) > (int16_t)(b) ? -1 : 0
#define FCMPGTL(a, b) (int32_t)(a) > (int32_t)(b) ? -1 : 0
#define FCMPEQ(a, b) (a) == (b) ? -1 : 0

#define FMULLW(a, b) (a) * (b)
#define FMULHRW(a, b) ((int16_t)(a) * (int16_t)(b) + 0x8000) >> 16
#define FMULHUW(a, b) (a) * (b) >> 16
#define FMULHW(a, b) (int16_t)(a) * (int16_t)(b) >> 16

#define FAVG(a, b) ((a) + (b) + 1) >> 1
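/* FMULHRW is the 3DNow! PMULHRW rounded high multiply (adds 0x8000 before
   taking the high 16 bits); FAVG is the rounding average used by
   PAVGB/PAVGW. */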
SSE_HELPER_B(helper_paddb, FADD)
SSE_HELPER_W(helper_paddw, FADD)
SSE_HELPER_L(helper_paddl, FADD)
SSE_HELPER_Q(helper_paddq, FADD)

SSE_HELPER_B(helper_psubb, FSUB)
SSE_HELPER_W(helper_psubw, FSUB)
SSE_HELPER_L(helper_psubl, FSUB)
SSE_HELPER_Q(helper_psubq, FSUB)

SSE_HELPER_B(helper_paddusb, FADDUB)
SSE_HELPER_B(helper_paddsb, FADDSB)
SSE_HELPER_B(helper_psubusb, FSUBUB)
SSE_HELPER_B(helper_psubsb, FSUBSB)

SSE_HELPER_W(helper_paddusw, FADDUW)
SSE_HELPER_W(helper_paddsw, FADDSW)
SSE_HELPER_W(helper_psubusw, FSUBUW)
SSE_HELPER_W(helper_psubsw, FSUBSW)

SSE_HELPER_B(helper_pminub, FMINUB)
SSE_HELPER_B(helper_pmaxub, FMAXUB)

SSE_HELPER_W(helper_pminsw, FMINSW)
SSE_HELPER_W(helper_pmaxsw, FMAXSW)

SSE_HELPER_Q(helper_pand, FAND)
SSE_HELPER_Q(helper_pandn, FANDN)
SSE_HELPER_Q(helper_por, FOR)
SSE_HELPER_Q(helper_pxor, FXOR)

SSE_HELPER_B(helper_pcmpgtb, FCMPGTB)
SSE_HELPER_W(helper_pcmpgtw, FCMPGTW)
SSE_HELPER_L(helper_pcmpgtl, FCMPGTL)

SSE_HELPER_B(helper_pcmpeqb, FCMPEQ)
SSE_HELPER_W(helper_pcmpeqw, FCMPEQ)
SSE_HELPER_L(helper_pcmpeql, FCMPEQ)

SSE_HELPER_W(helper_pmullw, FMULLW)
#if SHIFT == 0
SSE_HELPER_W(helper_pmulhrw, FMULHRW)
#endif
SSE_HELPER_W(helper_pmulhuw, FMULHUW)
SSE_HELPER_W(helper_pmulhw, FMULHW)

SSE_HELPER_B(helper_pavgb, FAVG)
SSE_HELPER_W(helper_pavgw, FAVG)

void glue(helper_pmuludq, SUFFIX) (Reg *d, Reg *s)
{
    d->Q(0) = (uint64_t)s->L(0) * (uint64_t)d->L(0);
#if SHIFT == 1
    d->Q(1) = (uint64_t)s->L(2) * (uint64_t)d->L(2);
#endif
}
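/* PMADDWD: multiply pairs of signed 16-bit lanes and add each pair of
   32-bit products into one 32-bit result lane. */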
void glue(helper_pmaddwd, SUFFIX) (Reg *d, Reg *s)
{
    int i;

    for(i = 0; i < (2 << SHIFT); i++) {
        d->L(i) = (int16_t)s->W(2*i) * (int16_t)d->W(2*i) +
            (int16_t)s->W(2*i+1) * (int16_t)d->W(2*i+1);
    }
}

static inline int abs1(int a)

void glue(helper_psadbw, SUFFIX) (Reg *d, Reg *s)
{
    unsigned int val;

    val = 0;
    val += abs1(d->B(0) - s->B(0));
    val += abs1(d->B(1) - s->B(1));
    val += abs1(d->B(2) - s->B(2));
    val += abs1(d->B(3) - s->B(3));
    val += abs1(d->B(4) - s->B(4));
    val += abs1(d->B(5) - s->B(5));
    val += abs1(d->B(6) - s->B(6));
    val += abs1(d->B(7) - s->B(7));
    d->Q(0) = val;
#if SHIFT == 1
    val = 0;
    val += abs1(d->B(8) - s->B(8));
    val += abs1(d->B(9) - s->B(9));
    val += abs1(d->B(10) - s->B(10));
    val += abs1(d->B(11) - s->B(11));
    val += abs1(d->B(12) - s->B(12));
    val += abs1(d->B(13) - s->B(13));
    val += abs1(d->B(14) - s->B(14));
    val += abs1(d->B(15) - s->B(15));
    d->Q(1) = val;
#endif
}
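/* MASKMOVQ/MASKMOVDQU: d holds the data bytes and s the byte mask; only
   bytes whose mask byte has its most significant bit set are written to
   memory at a0. */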
void glue(helper_maskmov, SUFFIX) (Reg *d, Reg *s, target_ulong a0)
{
    int i;
    for(i = 0; i < (8 << SHIFT); i++) {
        if (s->B(i) & 0x80)
            stb(a0 + i, d->B(i));
    }
}
void glue(helper_movl_mm_T0, SUFFIX) (Reg *d, uint32_t val)

void glue(helper_movq_mm_T0, SUFFIX) (Reg *d, uint64_t val)
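/* The shuffle helpers below take lane selectors packed two bits at a time
   into the immediate 'order' operand. */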
void glue(helper_pshufw, SUFFIX) (Reg *d, Reg *s, int order)
{
    Reg r;
    r.W(0) = s->W(order & 3);
    r.W(1) = s->W((order >> 2) & 3);
    r.W(2) = s->W((order >> 4) & 3);
    r.W(3) = s->W((order >> 6) & 3);
    *d = r;
}

void helper_shufps(Reg *d, Reg *s, int order)
{
    Reg r;
    r.L(0) = d->L(order & 3);
    r.L(1) = d->L((order >> 2) & 3);
    r.L(2) = s->L((order >> 4) & 3);
    r.L(3) = s->L((order >> 6) & 3);
    *d = r;
}

void helper_shufpd(Reg *d, Reg *s, int order)
{
    Reg r;
    r.Q(0) = d->Q(order & 1);
    r.Q(1) = s->Q((order >> 1) & 1);
    *d = r;
}

void glue(helper_pshufd, SUFFIX) (Reg *d, Reg *s, int order)
{
    Reg r;
    r.L(0) = s->L(order & 3);
    r.L(1) = s->L((order >> 2) & 3);
    r.L(2) = s->L((order >> 4) & 3);
    r.L(3) = s->L((order >> 6) & 3);
    *d = r;
}

void glue(helper_pshuflw, SUFFIX) (Reg *d, Reg *s, int order)
{
    Reg r;
    r.W(0) = s->W(order & 3);
    r.W(1) = s->W((order >> 2) & 3);
    r.W(2) = s->W((order >> 4) & 3);
    r.W(3) = s->W((order >> 6) & 3);
    r.Q(1) = s->Q(1);
    *d = r;
}

void glue(helper_pshufhw, SUFFIX) (Reg *d, Reg *s, int order)
{
    Reg r;
    r.Q(0) = s->Q(0);
    r.W(4) = s->W(4 + (order & 3));
    r.W(5) = s->W(4 + ((order >> 2) & 3));
    r.W(6) = s->W(4 + ((order >> 4) & 3));
    r.W(7) = s->W(4 + ((order >> 6) & 3));
    *d = r;
}
/* XXX: not accurate */
#define SSE_HELPER_S(name, F)\
void helper_ ## name ## ps (Reg *d, Reg *s)\
{\
    d->XMM_S(0) = F(32, d->XMM_S(0), s->XMM_S(0));\
    d->XMM_S(1) = F(32, d->XMM_S(1), s->XMM_S(1));\
    d->XMM_S(2) = F(32, d->XMM_S(2), s->XMM_S(2));\
    d->XMM_S(3) = F(32, d->XMM_S(3), s->XMM_S(3));\
}\
\
void helper_ ## name ## ss (Reg *d, Reg *s)\
{\
    d->XMM_S(0) = F(32, d->XMM_S(0), s->XMM_S(0));\
}\
\
void helper_ ## name ## pd (Reg *d, Reg *s)\
{\
    d->XMM_D(0) = F(64, d->XMM_D(0), s->XMM_D(0));\
    d->XMM_D(1) = F(64, d->XMM_D(1), s->XMM_D(1));\
}\
\
void helper_ ## name ## sd (Reg *d, Reg *s)\
{\
    d->XMM_D(0) = F(64, d->XMM_D(0), s->XMM_D(0));\
}
#define FPU_ADD(size, a, b) float ## size ## _add(a, b, &env->sse_status)
#define FPU_SUB(size, a, b) float ## size ## _sub(a, b, &env->sse_status)
#define FPU_MUL(size, a, b) float ## size ## _mul(a, b, &env->sse_status)
#define FPU_DIV(size, a, b) float ## size ## _div(a, b, &env->sse_status)
#define FPU_MIN(size, a, b) (a) < (b) ? (a) : (b)
#define FPU_MAX(size, a, b) (a) > (b) ? (a) : (b)
#define FPU_SQRT(size, a, b) float ## size ## _sqrt(b, &env->sse_status)
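/* Note that FPU_MIN/FPU_MAX use plain C comparisons rather than the
   softfloat routines, so NaN handling does not exactly match the hardware
   MINPS/MAXPS behaviour (see the "not accurate" note above). */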
SSE_HELPER_S(add, FPU_ADD)
SSE_HELPER_S(sub, FPU_SUB)
SSE_HELPER_S(mul, FPU_MUL)
SSE_HELPER_S(div, FPU_DIV)
SSE_HELPER_S(min, FPU_MIN)
SSE_HELPER_S(max, FPU_MAX)
SSE_HELPER_S(sqrt, FPU_SQRT)
/* float to float conversions */
void helper_cvtps2pd(Reg *d, Reg *s)
{
    float32 s0, s1;
    /* read both inputs before writing, since d and s may alias */
    s0 = s->XMM_S(0);
    s1 = s->XMM_S(1);
    d->XMM_D(0) = float32_to_float64(s0, &env->sse_status);
    d->XMM_D(1) = float32_to_float64(s1, &env->sse_status);
}

void helper_cvtpd2ps(Reg *d, Reg *s)
{
    d->XMM_S(0) = float64_to_float32(s->XMM_D(0), &env->sse_status);
    d->XMM_S(1) = float64_to_float32(s->XMM_D(1), &env->sse_status);
    d->Q(1) = 0;
}
void helper_cvtss2sd(Reg *d, Reg *s)
{
    d->XMM_D(0) = float32_to_float64(s->XMM_S(0), &env->sse_status);
}

void helper_cvtsd2ss(Reg *d, Reg *s)
{
    d->XMM_S(0) = float64_to_float32(s->XMM_D(0), &env->sse_status);
}

/* integer to float */
void helper_cvtdq2ps(Reg *d, Reg *s)
{
    d->XMM_S(0) = int32_to_float32(s->XMM_L(0), &env->sse_status);
    d->XMM_S(1) = int32_to_float32(s->XMM_L(1), &env->sse_status);
    d->XMM_S(2) = int32_to_float32(s->XMM_L(2), &env->sse_status);
    d->XMM_S(3) = int32_to_float32(s->XMM_L(3), &env->sse_status);
}
void helper_cvtdq2pd(Reg *d, Reg *s)
{
    int32_t l0, l1;
    l0 = (int32_t)s->XMM_L(0);
    l1 = (int32_t)s->XMM_L(1);
    d->XMM_D(0) = int32_to_float64(l0, &env->sse_status);
    d->XMM_D(1) = int32_to_float64(l1, &env->sse_status);
}
void helper_cvtpi2ps(XMMReg *d, MMXReg *s)
{
    d->XMM_S(0) = int32_to_float32(s->MMX_L(0), &env->sse_status);
    d->XMM_S(1) = int32_to_float32(s->MMX_L(1), &env->sse_status);
}

void helper_cvtpi2pd(XMMReg *d, MMXReg *s)
{
    d->XMM_D(0) = int32_to_float64(s->MMX_L(0), &env->sse_status);
    d->XMM_D(1) = int32_to_float64(s->MMX_L(1), &env->sse_status);
}

void helper_cvtsi2ss(XMMReg *d, uint32_t val)
{
    d->XMM_S(0) = int32_to_float32(val, &env->sse_status);
}

void helper_cvtsi2sd(XMMReg *d, uint32_t val)
{
    d->XMM_D(0) = int32_to_float64(val, &env->sse_status);
}

void helper_cvtsq2ss(XMMReg *d, uint64_t val)
{
    d->XMM_S(0) = int64_to_float32(val, &env->sse_status);
}

void helper_cvtsq2sd(XMMReg *d, uint64_t val)
{
    d->XMM_D(0) = int64_to_float64(val, &env->sse_status);
}
/* float to integer */
void helper_cvtps2dq(XMMReg *d, XMMReg *s)
{
    d->XMM_L(0) = float32_to_int32(s->XMM_S(0), &env->sse_status);
    d->XMM_L(1) = float32_to_int32(s->XMM_S(1), &env->sse_status);
    d->XMM_L(2) = float32_to_int32(s->XMM_S(2), &env->sse_status);
    d->XMM_L(3) = float32_to_int32(s->XMM_S(3), &env->sse_status);
}
void helper_cvtpd2dq(XMMReg *d, XMMReg *s)
{
    d->XMM_L(0) = float64_to_int32(s->XMM_D(0), &env->sse_status);
    d->XMM_L(1) = float64_to_int32(s->XMM_D(1), &env->sse_status);
    d->XMM_Q(1) = 0;
}
void helper_cvtps2pi(MMXReg *d, XMMReg *s)
{
    d->MMX_L(0) = float32_to_int32(s->XMM_S(0), &env->sse_status);
    d->MMX_L(1) = float32_to_int32(s->XMM_S(1), &env->sse_status);
}

void helper_cvtpd2pi(MMXReg *d, XMMReg *s)
{
    d->MMX_L(0) = float64_to_int32(s->XMM_D(0), &env->sse_status);
    d->MMX_L(1) = float64_to_int32(s->XMM_D(1), &env->sse_status);
}

int32_t helper_cvtss2si(XMMReg *s)
{
    return float32_to_int32(s->XMM_S(0), &env->sse_status);
}

int32_t helper_cvtsd2si(XMMReg *s)
{
    return float64_to_int32(s->XMM_D(0), &env->sse_status);
}

int64_t helper_cvtss2sq(XMMReg *s)
{
    return float32_to_int64(s->XMM_S(0), &env->sse_status);
}

int64_t helper_cvtsd2sq(XMMReg *s)
{
    return float64_to_int64(s->XMM_D(0), &env->sse_status);
}
/* float to integer truncated */
void helper_cvttps2dq(XMMReg *d, XMMReg *s)
{
    d->XMM_L(0) = float32_to_int32_round_to_zero(s->XMM_S(0), &env->sse_status);
    d->XMM_L(1) = float32_to_int32_round_to_zero(s->XMM_S(1), &env->sse_status);
    d->XMM_L(2) = float32_to_int32_round_to_zero(s->XMM_S(2), &env->sse_status);
    d->XMM_L(3) = float32_to_int32_round_to_zero(s->XMM_S(3), &env->sse_status);
}
void helper_cvttpd2dq(XMMReg *d, XMMReg *s)
{
    d->XMM_L(0) = float64_to_int32_round_to_zero(s->XMM_D(0), &env->sse_status);
    d->XMM_L(1) = float64_to_int32_round_to_zero(s->XMM_D(1), &env->sse_status);
    d->XMM_Q(1) = 0;
}
void helper_cvttps2pi(MMXReg *d, XMMReg *s)
{
    d->MMX_L(0) = float32_to_int32_round_to_zero(s->XMM_S(0), &env->sse_status);
    d->MMX_L(1) = float32_to_int32_round_to_zero(s->XMM_S(1), &env->sse_status);
}

void helper_cvttpd2pi(MMXReg *d, XMMReg *s)
{
    d->MMX_L(0) = float64_to_int32_round_to_zero(s->XMM_D(0), &env->sse_status);
    d->MMX_L(1) = float64_to_int32_round_to_zero(s->XMM_D(1), &env->sse_status);
}

int32_t helper_cvttss2si(XMMReg *s)
{
    return float32_to_int32_round_to_zero(s->XMM_S(0), &env->sse_status);
}

int32_t helper_cvttsd2si(XMMReg *s)
{
    return float64_to_int32_round_to_zero(s->XMM_D(0), &env->sse_status);
}

int64_t helper_cvttss2sq(XMMReg *s)
{
    return float32_to_int64_round_to_zero(s->XMM_S(0), &env->sse_status);
}

int64_t helper_cvttsd2sq(XMMReg *s)
{
    return float64_to_int64_round_to_zero(s->XMM_D(0), &env->sse_status);
}
void helper_rsqrtps(XMMReg *d, XMMReg *s)
{
    d->XMM_S(0) = approx_rsqrt(s->XMM_S(0));
    d->XMM_S(1) = approx_rsqrt(s->XMM_S(1));
    d->XMM_S(2) = approx_rsqrt(s->XMM_S(2));
    d->XMM_S(3) = approx_rsqrt(s->XMM_S(3));
}

void helper_rsqrtss(XMMReg *d, XMMReg *s)
{
    d->XMM_S(0) = approx_rsqrt(s->XMM_S(0));
}

void helper_rcpps(XMMReg *d, XMMReg *s)
{
    d->XMM_S(0) = approx_rcp(s->XMM_S(0));
    d->XMM_S(1) = approx_rcp(s->XMM_S(1));
    d->XMM_S(2) = approx_rcp(s->XMM_S(2));
    d->XMM_S(3) = approx_rcp(s->XMM_S(3));
}

void helper_rcpss(XMMReg *d, XMMReg *s)
{
    d->XMM_S(0) = approx_rcp(s->XMM_S(0));
}
void helper_haddps(XMMReg *d, XMMReg *s)
{
    XMMReg r;
    r.XMM_S(0) = d->XMM_S(0) + d->XMM_S(1);
    r.XMM_S(1) = d->XMM_S(2) + d->XMM_S(3);
    r.XMM_S(2) = s->XMM_S(0) + s->XMM_S(1);
    r.XMM_S(3) = s->XMM_S(2) + s->XMM_S(3);
    *d = r;
}

void helper_haddpd(XMMReg *d, XMMReg *s)
{
    XMMReg r;
    r.XMM_D(0) = d->XMM_D(0) + d->XMM_D(1);
    r.XMM_D(1) = s->XMM_D(0) + s->XMM_D(1);
    *d = r;
}

void helper_hsubps(XMMReg *d, XMMReg *s)
{
    XMMReg r;
    r.XMM_S(0) = d->XMM_S(0) - d->XMM_S(1);
    r.XMM_S(1) = d->XMM_S(2) - d->XMM_S(3);
    r.XMM_S(2) = s->XMM_S(0) - s->XMM_S(1);
    r.XMM_S(3) = s->XMM_S(2) - s->XMM_S(3);
    *d = r;
}

void helper_hsubpd(XMMReg *d, XMMReg *s)
{
    XMMReg r;
    r.XMM_D(0) = d->XMM_D(0) - d->XMM_D(1);
    r.XMM_D(1) = s->XMM_D(0) - s->XMM_D(1);
    *d = r;
}
void helper_addsubps(XMMReg *d, XMMReg *s)
{
    d->XMM_S(0) = d->XMM_S(0) - s->XMM_S(0);
    d->XMM_S(1) = d->XMM_S(1) + s->XMM_S(1);
    d->XMM_S(2) = d->XMM_S(2) - s->XMM_S(2);
    d->XMM_S(3) = d->XMM_S(3) + s->XMM_S(3);
}

void helper_addsubpd(XMMReg *d, XMMReg *s)
{
    d->XMM_D(0) = d->XMM_D(0) - s->XMM_D(0);
    d->XMM_D(1) = d->XMM_D(1) + s->XMM_D(1);
}
#define SSE_HELPER_CMP(name, F)\
void helper_ ## name ## ps (Reg *d, Reg *s)\
{\
    d->XMM_L(0) = F(32, d->XMM_S(0), s->XMM_S(0));\
    d->XMM_L(1) = F(32, d->XMM_S(1), s->XMM_S(1));\
    d->XMM_L(2) = F(32, d->XMM_S(2), s->XMM_S(2));\
    d->XMM_L(3) = F(32, d->XMM_S(3), s->XMM_S(3));\
}\
\
void helper_ ## name ## ss (Reg *d, Reg *s)\
{\
    d->XMM_L(0) = F(32, d->XMM_S(0), s->XMM_S(0));\
}\
\
void helper_ ## name ## pd (Reg *d, Reg *s)\
{\
    d->XMM_Q(0) = F(64, d->XMM_D(0), s->XMM_D(0));\
    d->XMM_Q(1) = F(64, d->XMM_D(1), s->XMM_D(1));\
}\
\
void helper_ ## name ## sd (Reg *d, Reg *s)\
{\
    d->XMM_Q(0) = F(64, d->XMM_D(0), s->XMM_D(0));\
}
#define FPU_CMPEQ(size, a, b) float ## size ## _eq(a, b, &env->sse_status) ? -1 : 0
#define FPU_CMPLT(size, a, b) float ## size ## _lt(a, b, &env->sse_status) ? -1 : 0
#define FPU_CMPLE(size, a, b) float ## size ## _le(a, b, &env->sse_status) ? -1 : 0
#define FPU_CMPUNORD(size, a, b) float ## size ## _unordered(a, b, &env->sse_status) ? -1 : 0
#define FPU_CMPNEQ(size, a, b) float ## size ## _eq(a, b, &env->sse_status) ? 0 : -1
#define FPU_CMPNLT(size, a, b) float ## size ## _lt(a, b, &env->sse_status) ? 0 : -1
#define FPU_CMPNLE(size, a, b) float ## size ## _le(a, b, &env->sse_status) ? 0 : -1
#define FPU_CMPORD(size, a, b) float ## size ## _unordered(a, b, &env->sse_status) ? 0 : -1

SSE_HELPER_CMP(cmpeq, FPU_CMPEQ)
SSE_HELPER_CMP(cmplt, FPU_CMPLT)
SSE_HELPER_CMP(cmple, FPU_CMPLE)
SSE_HELPER_CMP(cmpunord, FPU_CMPUNORD)
SSE_HELPER_CMP(cmpneq, FPU_CMPNEQ)
SSE_HELPER_CMP(cmpnlt, FPU_CMPNLT)
SSE_HELPER_CMP(cmpnle, FPU_CMPNLE)
SSE_HELPER_CMP(cmpord, FPU_CMPORD)

const int comis_eflags[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};
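/* comis_eflags maps the softfloat compare result (-1 less, 0 equal,
   1 greater, 2 unordered) to the EFLAGS pattern defined for
   COMISS/COMISD/UCOMISS/UCOMISD: less sets CF, equal sets ZF, greater
   clears all three, unordered sets ZF, PF and CF.  The helpers below
   index it with ret + 1. */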
void helper_ucomiss(Reg *d, Reg *s)
{
    int ret;
    float32 s0, s1;

    s0 = d->XMM_S(0);
    s1 = s->XMM_S(0);
    ret = float32_compare_quiet(s0, s1, &env->sse_status);
    CC_SRC = comis_eflags[ret + 1];
}

void helper_comiss(Reg *d, Reg *s)
{
    int ret;
    float32 s0, s1;

    s0 = d->XMM_S(0);
    s1 = s->XMM_S(0);
    ret = float32_compare(s0, s1, &env->sse_status);
    CC_SRC = comis_eflags[ret + 1];
}

void helper_ucomisd(Reg *d, Reg *s)
{
    int ret;
    float64 d0, d1;

    d0 = d->XMM_D(0);
    d1 = s->XMM_D(0);
    ret = float64_compare_quiet(d0, d1, &env->sse_status);
    CC_SRC = comis_eflags[ret + 1];
}

void helper_comisd(Reg *d, Reg *s)
{
    int ret;
    float64 d0, d1;

    d0 = d->XMM_D(0);
    d1 = s->XMM_D(0);
    ret = float64_compare(d0, d1, &env->sse_status);
    CC_SRC = comis_eflags[ret + 1];
}
uint32_t helper_movmskps(Reg *s)
{
    int b0, b1, b2, b3;
    b0 = s->XMM_L(0) >> 31;
    b1 = s->XMM_L(1) >> 31;
    b2 = s->XMM_L(2) >> 31;
    b3 = s->XMM_L(3) >> 31;
    return b0 | (b1 << 1) | (b2 << 2) | (b3 << 3);
}

uint32_t helper_movmskpd(Reg *s)
{
    int b0, b1;
    b0 = s->XMM_L(1) >> 31;
    b1 = s->XMM_L(3) >> 31;
    return b0 | (b1 << 1);
}
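/* MOVMSKPS/MOVMSKPD/PMOVMSKB collect the sign (most significant) bit of
   each lane into a small integer bitmask. */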
uint32_t glue(helper_pmovmskb, SUFFIX)(Reg *s)
{
    uint32_t val;
    val = 0;
    val |= (s->XMM_B(0) >> 7);
    val |= (s->XMM_B(1) >> 6) & 0x02;
    val |= (s->XMM_B(2) >> 5) & 0x04;
    val |= (s->XMM_B(3) >> 4) & 0x08;
    val |= (s->XMM_B(4) >> 3) & 0x10;
    val |= (s->XMM_B(5) >> 2) & 0x20;
    val |= (s->XMM_B(6) >> 1) & 0x40;
    val |= (s->XMM_B(7)) & 0x80;
#if SHIFT == 1
    val |= (s->XMM_B(8) << 1) & 0x0100;
    val |= (s->XMM_B(9) << 2) & 0x0200;
    val |= (s->XMM_B(10) << 3) & 0x0400;
    val |= (s->XMM_B(11) << 4) & 0x0800;
    val |= (s->XMM_B(12) << 5) & 0x1000;
    val |= (s->XMM_B(13) << 6) & 0x2000;
    val |= (s->XMM_B(14) << 7) & 0x4000;
    val |= (s->XMM_B(15) << 8) & 0x8000;
#endif
    return val;
}
void glue(helper_packsswb, SUFFIX) (Reg *d, Reg *s)
{
    Reg r;

    r.B(0) = satsb((int16_t)d->W(0));
    r.B(1) = satsb((int16_t)d->W(1));
    r.B(2) = satsb((int16_t)d->W(2));
    r.B(3) = satsb((int16_t)d->W(3));
#if SHIFT == 1
    r.B(4) = satsb((int16_t)d->W(4));
    r.B(5) = satsb((int16_t)d->W(5));
    r.B(6) = satsb((int16_t)d->W(6));
    r.B(7) = satsb((int16_t)d->W(7));
#endif
    r.B((4 << SHIFT) + 0) = satsb((int16_t)s->W(0));
    r.B((4 << SHIFT) + 1) = satsb((int16_t)s->W(1));
    r.B((4 << SHIFT) + 2) = satsb((int16_t)s->W(2));
    r.B((4 << SHIFT) + 3) = satsb((int16_t)s->W(3));
#if SHIFT == 1
    r.B(12) = satsb((int16_t)s->W(4));
    r.B(13) = satsb((int16_t)s->W(5));
    r.B(14) = satsb((int16_t)s->W(6));
    r.B(15) = satsb((int16_t)s->W(7));
#endif
    *d = r;
}

void glue(helper_packuswb, SUFFIX) (Reg *d, Reg *s)
{
    Reg r;

    r.B(0) = satub((int16_t)d->W(0));
    r.B(1) = satub((int16_t)d->W(1));
    r.B(2) = satub((int16_t)d->W(2));
    r.B(3) = satub((int16_t)d->W(3));
#if SHIFT == 1
    r.B(4) = satub((int16_t)d->W(4));
    r.B(5) = satub((int16_t)d->W(5));
    r.B(6) = satub((int16_t)d->W(6));
    r.B(7) = satub((int16_t)d->W(7));
#endif
    r.B((4 << SHIFT) + 0) = satub((int16_t)s->W(0));
    r.B((4 << SHIFT) + 1) = satub((int16_t)s->W(1));
    r.B((4 << SHIFT) + 2) = satub((int16_t)s->W(2));
    r.B((4 << SHIFT) + 3) = satub((int16_t)s->W(3));
#if SHIFT == 1
    r.B(12) = satub((int16_t)s->W(4));
    r.B(13) = satub((int16_t)s->W(5));
    r.B(14) = satub((int16_t)s->W(6));
    r.B(15) = satub((int16_t)s->W(7));
#endif
    *d = r;
}

void glue(helper_packssdw, SUFFIX) (Reg *d, Reg *s)
{
    Reg r;

    r.W(0) = satsw(d->L(0));
    r.W(1) = satsw(d->L(1));
#if SHIFT == 1
    r.W(2) = satsw(d->L(2));
    r.W(3) = satsw(d->L(3));
#endif
    r.W((2 << SHIFT) + 0) = satsw(s->L(0));
    r.W((2 << SHIFT) + 1) = satsw(s->L(1));
#if SHIFT == 1
    r.W(6) = satsw(s->L(2));
    r.W(7) = satsw(s->L(3));
#endif
    *d = r;
}
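/* UNPCK_OP generates the punpckl and punpckh helpers: 'base' selects the
   low (0) or high (1) half of each operand, and the result interleaves
   lanes from d and s. */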
#define UNPCK_OP(base_name, base) \
void glue(helper_punpck ## base_name ## bw, SUFFIX) (Reg *d, Reg *s) \
{ \
    Reg r; \
    r.B(0) = d->B((base << (SHIFT + 2)) + 0); \
    r.B(1) = s->B((base << (SHIFT + 2)) + 0); \
    r.B(2) = d->B((base << (SHIFT + 2)) + 1); \
    r.B(3) = s->B((base << (SHIFT + 2)) + 1); \
    r.B(4) = d->B((base << (SHIFT + 2)) + 2); \
    r.B(5) = s->B((base << (SHIFT + 2)) + 2); \
    r.B(6) = d->B((base << (SHIFT + 2)) + 3); \
    r.B(7) = s->B((base << (SHIFT + 2)) + 3); \
    XMM_ONLY( \
    r.B(8) = d->B((base << (SHIFT + 2)) + 4); \
    r.B(9) = s->B((base << (SHIFT + 2)) + 4); \
    r.B(10) = d->B((base << (SHIFT + 2)) + 5); \
    r.B(11) = s->B((base << (SHIFT + 2)) + 5); \
    r.B(12) = d->B((base << (SHIFT + 2)) + 6); \
    r.B(13) = s->B((base << (SHIFT + 2)) + 6); \
    r.B(14) = d->B((base << (SHIFT + 2)) + 7); \
    r.B(15) = s->B((base << (SHIFT + 2)) + 7); \
    ) \
    *d = r; \
} \
\
void glue(helper_punpck ## base_name ## wd, SUFFIX) (Reg *d, Reg *s) \
{ \
    Reg r; \
    r.W(0) = d->W((base << (SHIFT + 1)) + 0); \
    r.W(1) = s->W((base << (SHIFT + 1)) + 0); \
    r.W(2) = d->W((base << (SHIFT + 1)) + 1); \
    r.W(3) = s->W((base << (SHIFT + 1)) + 1); \
    XMM_ONLY( \
    r.W(4) = d->W((base << (SHIFT + 1)) + 2); \
    r.W(5) = s->W((base << (SHIFT + 1)) + 2); \
    r.W(6) = d->W((base << (SHIFT + 1)) + 3); \
    r.W(7) = s->W((base << (SHIFT + 1)) + 3); \
    ) \
    *d = r; \
} \
\
void glue(helper_punpck ## base_name ## dq, SUFFIX) (Reg *d, Reg *s) \
{ \
    Reg r; \
    r.L(0) = d->L((base << SHIFT) + 0); \
    r.L(1) = s->L((base << SHIFT) + 0); \
    XMM_ONLY( \
    r.L(2) = d->L((base << SHIFT) + 1); \
    r.L(3) = s->L((base << SHIFT) + 1); \
    ) \
    *d = r; \
} \
\
XMM_ONLY( \
void glue(helper_punpck ## base_name ## qdq, SUFFIX) (Reg *d, Reg *s) \
{ \
    Reg r; \
    r.Q(0) = d->Q(base); \
    r.Q(1) = s->Q(base); \
    *d = r; \
} \
)

UNPCK_OP(l, 0)
UNPCK_OP(h, 1)
/* 3DNow! float ops */
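/* These helpers treat an MMX register as two packed float32 values and use
   env->mmx_status for rounding and exception state. */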
void helper_pi2fd(MMXReg *d, MMXReg *s)
{
    d->MMX_S(0) = int32_to_float32(s->MMX_L(0), &env->mmx_status);
    d->MMX_S(1) = int32_to_float32(s->MMX_L(1), &env->mmx_status);
}

void helper_pi2fw(MMXReg *d, MMXReg *s)
{
    d->MMX_S(0) = int32_to_float32((int16_t)s->MMX_W(0), &env->mmx_status);
    d->MMX_S(1) = int32_to_float32((int16_t)s->MMX_W(2), &env->mmx_status);
}

void helper_pf2id(MMXReg *d, MMXReg *s)
{
    d->MMX_L(0) = float32_to_int32_round_to_zero(s->MMX_S(0), &env->mmx_status);
    d->MMX_L(1) = float32_to_int32_round_to_zero(s->MMX_S(1), &env->mmx_status);
}

void helper_pf2iw(MMXReg *d, MMXReg *s)
{
    d->MMX_L(0) = satsw(float32_to_int32_round_to_zero(s->MMX_S(0), &env->mmx_status));
    d->MMX_L(1) = satsw(float32_to_int32_round_to_zero(s->MMX_S(1), &env->mmx_status));
}
void helper_pfacc(MMXReg *d, MMXReg *s)
{
    MMXReg r;
    r.MMX_S(0) = float32_add(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
    r.MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
    *d = r;
}
void helper_pfadd(MMXReg *d, MMXReg *s)
{
    d->MMX_S(0) = float32_add(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
    d->MMX_S(1) = float32_add(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
}

void helper_pfcmpeq(MMXReg *d, MMXReg *s)
{
    d->MMX_L(0) = float32_eq(d->MMX_S(0), s->MMX_S(0), &env->mmx_status) ? -1 : 0;
    d->MMX_L(1) = float32_eq(d->MMX_S(1), s->MMX_S(1), &env->mmx_status) ? -1 : 0;
}

void helper_pfcmpge(MMXReg *d, MMXReg *s)
{
    d->MMX_L(0) = float32_le(s->MMX_S(0), d->MMX_S(0), &env->mmx_status) ? -1 : 0;
    d->MMX_L(1) = float32_le(s->MMX_S(1), d->MMX_S(1), &env->mmx_status) ? -1 : 0;
}

void helper_pfcmpgt(MMXReg *d, MMXReg *s)
{
    d->MMX_L(0) = float32_lt(s->MMX_S(0), d->MMX_S(0), &env->mmx_status) ? -1 : 0;
    d->MMX_L(1) = float32_lt(s->MMX_S(1), d->MMX_S(1), &env->mmx_status) ? -1 : 0;
}
void helper_pfmax(MMXReg *d, MMXReg *s)
{
    if (float32_lt(d->MMX_S(0), s->MMX_S(0), &env->mmx_status))
        d->MMX_S(0) = s->MMX_S(0);
    if (float32_lt(d->MMX_S(1), s->MMX_S(1), &env->mmx_status))
        d->MMX_S(1) = s->MMX_S(1);
}

void helper_pfmin(MMXReg *d, MMXReg *s)
{
    if (float32_lt(s->MMX_S(0), d->MMX_S(0), &env->mmx_status))
        d->MMX_S(0) = s->MMX_S(0);
    if (float32_lt(s->MMX_S(1), d->MMX_S(1), &env->mmx_status))
        d->MMX_S(1) = s->MMX_S(1);
}

void helper_pfmul(MMXReg *d, MMXReg *s)
{
    d->MMX_S(0) = float32_mul(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
    d->MMX_S(1) = float32_mul(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
}
void helper_pfnacc(MMXReg *d, MMXReg *s)
{
    MMXReg r;
    r.MMX_S(0) = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
    r.MMX_S(1) = float32_sub(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
    *d = r;
}

void helper_pfpnacc(MMXReg *d, MMXReg *s)
{
    MMXReg r;
    r.MMX_S(0) = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
    r.MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
    *d = r;
}
void helper_pfrcp(MMXReg *d, MMXReg *s)
{
    d->MMX_S(0) = approx_rcp(s->MMX_S(0));
    d->MMX_S(1) = d->MMX_S(0);
}

void helper_pfrsqrt(MMXReg *d, MMXReg *s)
{
    d->MMX_L(1) = s->MMX_L(0) & 0x7fffffff;
    d->MMX_S(1) = approx_rsqrt(d->MMX_S(1));
    d->MMX_L(1) |= s->MMX_L(0) & 0x80000000;
    d->MMX_L(0) = d->MMX_L(1);
}

void helper_pfsub(MMXReg *d, MMXReg *s)
{
    d->MMX_S(0) = float32_sub(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
    d->MMX_S(1) = float32_sub(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
}

void helper_pfsubr(MMXReg *d, MMXReg *s)
{
    d->MMX_S(0) = float32_sub(s->MMX_S(0), d->MMX_S(0), &env->mmx_status);
    d->MMX_S(1) = float32_sub(s->MMX_S(1), d->MMX_S(1), &env->mmx_status);
}
void helper_pswapd(MMXReg *d, MMXReg *s)
{
    MMXReg r;
    r.MMX_L(0) = s->MMX_L(1);
    r.MMX_L(1) = s->MMX_L(0);
    *d = r;
}