/*---------------------------------------------------------------*/
/*--- begin                                 host_amd64_isel.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2017 OpenWorks LLP

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/
#include "libvex_basictypes.h"
#include "libvex_ir.h"
#include "libvex.h"

#include "ir_match.h"
#include "main_util.h"
#include "main_globals.h"
#include "host_generic_regs.h"
#include "host_generic_simd64.h"
#include "host_generic_simd128.h"
#include "host_generic_simd256.h"
#include "host_amd64_maddf.h"
#include "host_generic_maddf.h"
#include "host_amd64_defs.h"
/*---------------------------------------------------------*/
/*--- x87/SSE control word stuff                        ---*/
/*---------------------------------------------------------*/

/* Vex-generated code expects to run with the FPU set as follows: all
   exceptions masked, round-to-nearest, precision = 53 bits.  This
   corresponds to a FPU control word value of 0x027F.

   Similarly the SSE control word (%mxcsr) should be 0x1F80.

   %fpucw and %mxcsr should have these values on entry to
   Vex-generated code, and those values should be unchanged at exit.
*/

#define DEFAULT_FPUCW 0x027F

#define DEFAULT_MXCSR 0x1F80

/* debugging only, do not use */
/* define DEFAULT_FPUCW 0x037F */
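
/* For reference: in 0x027F, bits 0..5 mask all x87 exceptions, the
   PC field (bits 8..9) is 10b = 53-bit precision, and the RC field
   (bits 10..11) is 00b = round to nearest.  In 0x1F80, bits 7..12
   mask all SSE exceptions, RC (bits 13..14) is 00b, and FTZ/DAZ are
   clear. */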
/*---------------------------------------------------------*/
/*--- misc helpers                                      ---*/
/*---------------------------------------------------------*/

/* These are duplicated in guest-amd64/toIR.c */
static IRExpr* unop ( IROp op, IRExpr* a )
{
   return IRExpr_Unop(op, a);
}

static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
{
   return IRExpr_Binop(op, a1, a2);
}

static IRExpr* bind ( Int binder )
{
   return IRExpr_Binder(binder);
}

static Bool isZeroU8 ( const IRExpr* e )
{
   return e->tag == Iex_Const
          && e->Iex.Const.con->tag == Ico_U8
          && e->Iex.Const.con->Ico.U8 == 0;
}
/*---------------------------------------------------------*/
/*--- ISelEnv                                           ---*/
/*---------------------------------------------------------*/

/* This carries around:

   - A mapping from IRTemp to IRType, giving the type of any IRTemp we
     might encounter.  This is computed before insn selection starts,
     and does not change.

   - A mapping from IRTemp to HReg.  This tells the insn selector
     which virtual register is associated with each IRTemp
     temporary.  This is computed before insn selection starts, and
     does not change.  We expect this mapping to map precisely the
     same set of IRTemps as the type mapping does.

       - vregmap   holds the primary register for the IRTemp.
       - vregmapHI is only used for 128-bit integer-typed
            IRTemps.  It holds the identity of a second
            64-bit virtual HReg, which holds the high half
            of the value.

   - The host subarchitecture we are selecting insns for.
     This is set at the start and does not change.

   - The code array, that is, the insns selected so far.

   - A counter, for generating new virtual registers.

   - A Bool for indicating whether we may generate chain-me
     instructions for control flow transfers, or whether we must use
     XAssisted.

   - The maximum guest address of any guest insn in this block.
     Actually, the address of the highest-addressed byte from any insn
     in this block.  Is set at the start and does not change.  This is
     used for detecting jumps which are definitely forward-edges from
     this block, and therefore can be made (chained) to the fast entry
     point of the destination, thereby avoiding the destination's
     event check.

   Note, this is all host-independent.  (JRS 20050201: well, kinda
   ... not completely.  Compare with ISelEnv for X86.)
*/

typedef
   struct {
      /* Constant -- are set at the start and do not change. */
      IRTypeEnv*   type_env;

      HReg*        vregmap;
      HReg*        vregmapHI;
      Int          n_vregmap;

      UInt         hwcaps;

      Bool         chainingAllowed;
      Addr64       max_ga;

      /* These are modified as we go along. */
      HInstrArray* code;
      Int          vreg_ctr;
   }
   ISelEnv;
static HReg lookupIRTemp ( ISelEnv* env, IRTemp tmp )
{
   vassert(tmp < env->n_vregmap);
   return env->vregmap[tmp];
}

static void lookupIRTempPair ( HReg* vrHI, HReg* vrLO, 
                               ISelEnv* env, IRTemp tmp )
{
   vassert(tmp < env->n_vregmap);
   vassert(! hregIsInvalid(env->vregmapHI[tmp]));
   *vrLO = env->vregmap[tmp];
   *vrHI = env->vregmapHI[tmp];
}

static void addInstr ( ISelEnv* env, AMD64Instr* instr )
{
   addHInstr(env->code, instr);
   if (vex_traceflags & VEX_TRACE_VCODE) {
      ppAMD64Instr(instr, True);
      vex_printf("\n");
   }
}

static HReg newVRegI ( ISelEnv* env )
{
   HReg reg = mkHReg(True/*virtual reg*/, HRcInt64, 0/*enc*/, env->vreg_ctr);
   env->vreg_ctr++;
   return reg;
}

static HReg newVRegV ( ISelEnv* env )
{
   HReg reg = mkHReg(True/*virtual reg*/, HRcVec128, 0/*enc*/, env->vreg_ctr);
   env->vreg_ctr++;
   return reg;
}
/*---------------------------------------------------------*/
/*--- ISEL: Forward declarations                        ---*/
/*---------------------------------------------------------*/

/* These are organised as iselXXX and iselXXX_wrk pairs.  The
   iselXXX_wrk do the real work, but are not to be called directly.
   For each XXX, iselXXX calls its iselXXX_wrk counterpart, then
   checks that all returned registers are virtual.  You should not
   call the _wrk version directly.
*/
static AMD64RMI*     iselIntExpr_RMI_wrk ( ISelEnv* env, const IRExpr* e );
static AMD64RMI*     iselIntExpr_RMI     ( ISelEnv* env, const IRExpr* e );

static AMD64RI*      iselIntExpr_RI_wrk  ( ISelEnv* env, const IRExpr* e );
static AMD64RI*      iselIntExpr_RI      ( ISelEnv* env, const IRExpr* e );

static AMD64RM*      iselIntExpr_RM_wrk  ( ISelEnv* env, const IRExpr* e );
static AMD64RM*      iselIntExpr_RM      ( ISelEnv* env, const IRExpr* e );

static HReg          iselIntExpr_R_wrk   ( ISelEnv* env, const IRExpr* e );
static HReg          iselIntExpr_R       ( ISelEnv* env, const IRExpr* e );

static AMD64AMode*   iselIntExpr_AMode_wrk ( ISelEnv* env, const IRExpr* e );
static AMD64AMode*   iselIntExpr_AMode     ( ISelEnv* env, const IRExpr* e );

static void          iselInt128Expr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
                                          ISelEnv* env, const IRExpr* e );
static void          iselInt128Expr     ( /*OUT*/HReg* rHi, HReg* rLo,
                                          ISelEnv* env, const IRExpr* e );

static AMD64CondCode iselCondCode_C_wrk ( ISelEnv* env, const IRExpr* e );
static AMD64CondCode iselCondCode_C     ( ISelEnv* env, const IRExpr* e );

static HReg          iselCondCode_R_wrk ( ISelEnv* env, const IRExpr* e );
static HReg          iselCondCode_R     ( ISelEnv* env, const IRExpr* e );

static HReg          iselDblExpr_wrk ( ISelEnv* env, const IRExpr* e );
static HReg          iselDblExpr     ( ISelEnv* env, const IRExpr* e );

static HReg          iselFltExpr_wrk ( ISelEnv* env, const IRExpr* e );
static HReg          iselFltExpr     ( ISelEnv* env, const IRExpr* e );

static HReg          iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e );
static HReg          iselVecExpr     ( ISelEnv* env, const IRExpr* e );

static void          iselDVecExpr_wrk ( /*OUT*/HReg* rHi, HReg* rLo, 
                                        ISelEnv* env, const IRExpr* e );
static void          iselDVecExpr     ( /*OUT*/HReg* rHi, HReg* rLo, 
                                        ISelEnv* env, const IRExpr* e );
/*---------------------------------------------------------*/
/*--- ISEL: Misc helpers                                ---*/
/*---------------------------------------------------------*/

static Bool sane_AMode ( AMD64AMode* am )
{
   switch (am->tag) {
      case Aam_IR:
         return 
            toBool( hregClass(am->Aam.IR.reg) == HRcInt64
                    && (hregIsVirtual(am->Aam.IR.reg)
                        || sameHReg(am->Aam.IR.reg, hregAMD64_RBP())) );
      case Aam_IRRS:
         return 
            toBool( hregClass(am->Aam.IRRS.base) == HRcInt64
                    && hregIsVirtual(am->Aam.IRRS.base)
                    && hregClass(am->Aam.IRRS.index) == HRcInt64
                    && hregIsVirtual(am->Aam.IRRS.index) );
      default:
         vpanic("sane_AMode: unknown amd64 amode tag");
   }
}
/* Can the lower 32 bits be signedly widened to produce the whole
   64-bit value?  In other words, are the top 33 bits either all 0 or
   all 1 ? */
static Bool fitsIn32Bits ( ULong x )
{
   Long y1;
   y1 = x << 32;
   y1 >>=/*s*/ 32;
   return toBool(x == y1);
}
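
/* E.g. 0x7FFFFFFFULL and 0xFFFFFFFF80000000ULL both fit (top 33 bits
   all zero, resp. all one), whereas 0x80000000ULL does not, since
   sign-extending its lower 32 bits gives 0xFFFFFFFF80000000ULL. */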
/* Is this a 64-bit zero expression? */

static Bool isZeroU64 ( const IRExpr* e )
{
   return e->tag == Iex_Const
          && e->Iex.Const.con->tag == Ico_U64
          && e->Iex.Const.con->Ico.U64 == 0ULL;
}

static Bool isZeroU32 ( const IRExpr* e )
{
   return e->tag == Iex_Const
          && e->Iex.Const.con->tag == Ico_U32
          && e->Iex.Const.con->Ico.U32 == 0;
}
/* Are both args atoms and the same?  This is a copy of eqIRAtom
   that omits the assertions that the args are indeed atoms. */

static Bool areAtomsAndEqual ( const IRExpr* a1, const IRExpr* a2 )
{
   if (a1->tag == Iex_RdTmp && a2->tag == Iex_RdTmp)
      return toBool(a1->Iex.RdTmp.tmp == a2->Iex.RdTmp.tmp);
   if (a1->tag == Iex_Const && a2->tag == Iex_Const)
      return eqIRConst(a1->Iex.Const.con, a2->Iex.Const.con);
   return False;
}
/* Make an int reg-reg move. */

static AMD64Instr* mk_iMOVsd_RR ( HReg src, HReg dst )
{
   vassert(hregClass(src) == HRcInt64);
   vassert(hregClass(dst) == HRcInt64);
   return AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst);
}

/* Make a vector (128 bit) reg-reg move. */

static AMD64Instr* mk_vMOVsd_RR ( HReg src, HReg dst )
{
   vassert(hregClass(src) == HRcVec128);
   vassert(hregClass(dst) == HRcVec128);
   return AMD64Instr_SseReRg(Asse_MOV, src, dst);
}
/* Advance/retreat %rsp by n. */

static void add_to_rsp ( ISelEnv* env, Int n )
{
   vassert(n > 0 && n < 256 && (n%8) == 0);
   addInstr(env, 
            AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(n), 
                              hregAMD64_RSP()));
}

static void sub_from_rsp ( ISelEnv* env, Int n )
{
   vassert(n > 0 && n < 256 && (n%8) == 0);
   addInstr(env, 
            AMD64Instr_Alu64R(Aalu_SUB, AMD64RMI_Imm(n), 
                              hregAMD64_RSP()));
}
/* Push 64-bit constants on the stack. */
static void push_uimm64( ISelEnv* env, ULong uimm64 )
{
   /* If uimm64 can be expressed as the sign extension of its
      lower 32 bits, we can do it the easy way. */
   Long simm64 = (Long)uimm64;
   if ( simm64 == ((Long)(uimm64 << 32) >> 32) ) {
      addInstr( env, AMD64Instr_Push(AMD64RMI_Imm( (UInt)uimm64 )) );
   } else {
      HReg tmp = newVRegI(env);
      addInstr( env, AMD64Instr_Imm64(uimm64, tmp) );
      addInstr( env, AMD64Instr_Push(AMD64RMI_Reg(tmp)) );
   }
}
/* Used only in doHelperCall.  If possible, produce a single
   instruction which computes 'e' into 'dst'.  If not possible, return
   NULL. */

static AMD64Instr* iselIntExpr_single_instruction ( ISelEnv* env,
                                                    HReg     dst,
                                                    IRExpr*  e )
{
   /* Per comments in doHelperCall below, appearance of
      Iex_VECRET implies ill-formed IR. */
   vassert(e->tag != Iex_VECRET);

   /* In this case we give out a copy of the BaseBlock pointer. */
   if (UNLIKELY(e->tag == Iex_GSPTR)) {
      return mk_iMOVsd_RR( hregAMD64_RBP(), dst );
   }

   vassert(typeOfIRExpr(env->type_env, e) == Ity_I64);

   if (e->tag == Iex_Const) {
      vassert(e->Iex.Const.con->tag == Ico_U64);
      if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
         return AMD64Instr_Alu64R(
                   Aalu_MOV,
                   AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64)),
                   dst
                );
      } else {
         return AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, dst);
      }
   }

   if (e->tag == Iex_RdTmp) {
      HReg src = lookupIRTemp(env, e->Iex.RdTmp.tmp);
      return mk_iMOVsd_RR(src, dst);
   }

   if (e->tag == Iex_Get) {
      vassert(e->Iex.Get.ty == Ity_I64);
      return AMD64Instr_Alu64R(
                Aalu_MOV,
                AMD64RMI_Mem(
                   AMD64AMode_IR(e->Iex.Get.offset,
                                 hregAMD64_RBP())),
                dst);
   }

   if (e->tag == Iex_Unop 
       && e->Iex.Unop.op == Iop_32Uto64 
       && e->Iex.Unop.arg->tag == Iex_RdTmp) {
      HReg src = lookupIRTemp(env, e->Iex.Unop.arg->Iex.RdTmp.tmp);
      return AMD64Instr_MovxLQ(False, src, dst);
   }

   if (0) { ppIRExpr(e); vex_printf("\n"); }

   return NULL;
}
/* Do a complete function call.  |guard| is a Ity_Bit expression
   indicating whether or not the call happens.  If guard==NULL, the
   call is unconditional.  |retloc| is set to indicate where the
   return value is after the call.  The caller (of this fn) must
   generate code to add |stackAdjustAfterCall| to the stack pointer
   after the call is done. */

static
void doHelperCall ( /*OUT*/UInt*   stackAdjustAfterCall,
                    /*OUT*/RetLoc* retloc,
                    ISelEnv* env,
                    IRExpr* guard,
                    IRCallee* cee, IRType retTy, IRExpr** args )
{
   AMD64CondCode cc;
   HReg          argregs[6];
   HReg          tmpregs[6];
   AMD64Instr*   fastinstrs[6];
   UInt          n_args, i;

   /* Set default returns.  We'll update them later if needed. */
   *stackAdjustAfterCall = 0;
   *retloc               = mk_RetLoc_INVALID();

   /* These are used for cross-checking that IR-level constraints on
      the use of IRExpr_VECRET() and IRExpr_GSPTR() are observed. */
   UInt nVECRETs = 0;
   UInt nGSPTRs  = 0;

   /* Marshal args for a call and do the call.

      This function only deals with a tiny set of possibilities, which
      cover all helpers in practice.  The restrictions are that only
      arguments in registers are supported, hence only 6x64 integer
      bits in total can be passed.  In fact the only supported arg
      type is I64.

      The return type can be I{64,32,16,8} or V{128,256}.  In the
      latter two cases, it is expected that |args| will contain the
      special node IRExpr_VECRET(), in which case this routine
      generates code to allocate space on the stack for the vector
      return value.  Since we are not passing any scalars on the
      stack, it is enough to preallocate the return space before
      marshalling any arguments, in this case.

      |args| may also contain IRExpr_GSPTR(), in which case the
      value in %rbp is passed as the corresponding argument.

      Generating code which is both efficient and correct when
      parameters are to be passed in registers is difficult, for the
      reasons elaborated in detail in comments attached to
      doHelperCall() in priv/host-x86/isel.c.  Here, we use a variant
      of the method described in those comments.

      The problem is split into two cases: the fast scheme and the
      slow scheme.  In the fast scheme, arguments are computed
      directly into the target (real) registers.  This is only safe
      when we can be sure that computation of each argument will not
      trash any real registers set by computation of any other
      argument.

      In the slow scheme, all args are first computed into vregs, and
      once they are all done, they are moved to the relevant real
      regs.  This always gives correct code, but it also gives a bunch
      of vreg-to-rreg moves which are usually redundant but are hard
      for the register allocator to get rid of.

      To decide which scheme to use, all argument expressions are
      first examined.  If they are all so simple that it is clear they
      will be evaluated without use of any fixed registers, use the
      fast scheme, else use the slow scheme.  Note also that only
      unconditional calls may use the fast scheme, since having to
      compute a condition expression could itself trash real
      registers.  Note that for simplicity, in the case where
      IRExpr_VECRET() is present, we use the slow scheme.  This is
      motivated by the desire to avoid any possible complexity
      w.r.t. nested calls.

      Note this requires being able to examine an expression and
      determine whether or not evaluation of it might use a fixed
      register.  That requires knowledge of how the rest of this insn
      selector works.  Currently just the following 3 are regarded as
      safe -- hopefully they cover the majority of arguments in
      practice: IRExpr_Tmp IRExpr_Const IRExpr_Get.
   */

   /* Note that the cee->regparms field is meaningless on AMD64 host
      (since there is only one calling convention) and so we always
      ignore it. */
   n_args = 0;
   for (i = 0; args[i]; i++)
      n_args++;

   if (n_args > 6)
      vpanic("doHelperCall(AMD64): cannot currently handle > 6 args");

   argregs[0] = hregAMD64_RDI();
   argregs[1] = hregAMD64_RSI();
   argregs[2] = hregAMD64_RDX();
   argregs[3] = hregAMD64_RCX();
   argregs[4] = hregAMD64_R8();
   argregs[5] = hregAMD64_R9();

   tmpregs[0] = tmpregs[1] = tmpregs[2] =
   tmpregs[3] = tmpregs[4] = tmpregs[5] = INVALID_HREG;

   fastinstrs[0] = fastinstrs[1] = fastinstrs[2] =
   fastinstrs[3] = fastinstrs[4] = fastinstrs[5] = NULL;

   /* First decide which scheme (slow or fast) is to be used.  First
      assume the fast scheme, and select slow if any contraindications
      appear. */

   /* We'll need space on the stack for the return value.  Avoid
      possible complications with nested calls by using the slow
      scheme. */
   if (retTy == Ity_V128 || retTy == Ity_V256)
      goto slowscheme;

   if (guard) {
      if (guard->tag == Iex_Const 
          && guard->Iex.Const.con->tag == Ico_U1
          && guard->Iex.Const.con->Ico.U1 == True) {
         /* unconditional */
      } else {
         /* Not manifestly unconditional -- be conservative. */
         goto slowscheme;
      }
   }

   /* Ok, let's try for the fast scheme.  If it doesn't pan out, we'll
      use the slow scheme.  Because this is tentative, we can't call
      addInstr (that is, commit to) any instructions until we've
      handled all the arguments.  So park the resulting instructions
      in a buffer and emit that if we're successful. */

   /* FAST SCHEME */
   /* In this loop, we process args that can be computed into the
      destination (real) register with a single instruction, without
      using any fixed regs.  That also includes IRExpr_GSPTR(), but
      not IRExpr_VECRET().  Indeed, if the IR is well-formed, we can
      never see IRExpr_VECRET() at this point, since the return-type
      check above should ensure all those cases use the slow scheme
      instead. */
   vassert(n_args <= 6);
   for (i = 0; i < n_args; i++) {
      IRExpr* arg = args[i];
      if (LIKELY(!is_IRExpr_VECRET_or_GSPTR(arg))) {
         vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
      }
      fastinstrs[i] 
         = iselIntExpr_single_instruction( env, argregs[i], args[i] );
      if (fastinstrs[i] == NULL)
         goto slowscheme;
   }

   /* Looks like we're in luck.  Emit the accumulated instructions and
      move on to doing the call itself. */
   for (i = 0; i < n_args; i++)
      addInstr(env, fastinstrs[i]);

   /* Fast scheme only applies for unconditional calls.  Hence: */
   cc = Acc_ALWAYS;

   goto handle_call;


   /* SLOW SCHEME; move via temporaries */
  slowscheme:
   {}
#  if 0 /* debug only */
   if (n_args > 0) {for (i = 0; args[i]; i++) {
   ppIRExpr(args[i]); vex_printf(" "); }
   vex_printf("\n");}
#  endif

   /* If we have a vector return type, allocate a place for it on the
      stack and record its address. */
   HReg r_vecRetAddr = INVALID_HREG;
   if (retTy == Ity_V128) {
      r_vecRetAddr = newVRegI(env);
      sub_from_rsp(env, 16);
      addInstr(env, mk_iMOVsd_RR( hregAMD64_RSP(), r_vecRetAddr ));
   }
   else if (retTy == Ity_V256) {
      r_vecRetAddr = newVRegI(env);
      sub_from_rsp(env, 32);
      addInstr(env, mk_iMOVsd_RR( hregAMD64_RSP(), r_vecRetAddr ));
   }

   vassert(n_args <= 6);
   for (i = 0; i < n_args; i++) {
      IRExpr* arg = args[i];
      if (UNLIKELY(arg->tag == Iex_GSPTR)) {
         tmpregs[i] = newVRegI(env);
         addInstr(env, mk_iMOVsd_RR( hregAMD64_RBP(), tmpregs[i]));
         nGSPTRs++;
      }
      else if (UNLIKELY(arg->tag == Iex_VECRET)) {
         /* We stashed the address of the return slot earlier, so just
            use that. */
         vassert(!hregIsInvalid(r_vecRetAddr));
         tmpregs[i] = r_vecRetAddr;
         nVECRETs++;
      }
      else {
         vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
         tmpregs[i] = iselIntExpr_R(env, args[i]);
      }
   }

   /* Now we can compute the condition.  We can't do it earlier
      because the argument computations could trash the condition
      codes.  Be a bit clever to handle the common case where the
      guard is 1:Bit. */
   cc = Acc_ALWAYS;
   if (guard) {
      if (guard->tag == Iex_Const 
          && guard->Iex.Const.con->tag == Ico_U1
          && guard->Iex.Const.con->Ico.U1 == True) {
         /* unconditional -- do nothing */
      } else {
         cc = iselCondCode_C( env, guard );
      }
   }

   /* Move the args to their final destinations. */
   for (i = 0; i < n_args; i++) {
      /* None of these insns, including any spill code that might
         be generated, may alter the condition codes. */
      addInstr( env, mk_iMOVsd_RR( tmpregs[i], argregs[i] ) );
   }


   /* Do final checks, set the return values, and generate the call
      instruction proper. */
  handle_call:

   if (retTy == Ity_V128 || retTy == Ity_V256) {
      vassert(nVECRETs == 1);
   } else {
      vassert(nVECRETs == 0);
   }

   vassert(nGSPTRs == 0 || nGSPTRs == 1);

   vassert(*stackAdjustAfterCall == 0);
   vassert(is_RetLoc_INVALID(*retloc));
   switch (retTy) {
      case Ity_INVALID:
         /* Function doesn't return a value. */
         *retloc = mk_RetLoc_simple(RLPri_None);
         break;
      case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
         *retloc = mk_RetLoc_simple(RLPri_Int);
         break;
      case Ity_V128:
         *retloc = mk_RetLoc_spRel(RLPri_V128SpRel, 0);
         *stackAdjustAfterCall = 16;
         break;
      case Ity_V256:
         *retloc = mk_RetLoc_spRel(RLPri_V256SpRel, 0);
         *stackAdjustAfterCall = 32;
         break;
      default:
         /* IR can denote other possible return types, but we don't
            handle those here. */
         vassert(0);
   }

   /* Finally, generate the call itself.  This needs the *retloc value
      set in the switch above, which is why it's at the end. */
   addInstr(env,
            AMD64Instr_Call(cc, (Addr)cee->addr, n_args, *retloc));
}
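
/* Illustrative example (an assumed scenario, not taken from the
   original text): for an unconditional call to a helper of type
   ULong h ( ULong, ULong ) whose two args are plain IRTemps, the fast
   scheme applies and the emitted code is just
      movq <vreg of arg0>, %rdi
      movq <vreg of arg1>, %rsi
      call h
   with *retloc set to RLPri_Int and *stackAdjustAfterCall left at 0. */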
/* Given a guest-state array descriptor, an index expression and a
   bias, generate an AMD64AMode holding the relevant guest state
   offset. */

static
AMD64AMode* genGuestArrayOffset ( ISelEnv* env, IRRegArray* descr, 
                                  IRExpr* off, Int bias )
{
   HReg tmp, roff;
   Int  elemSz = sizeofIRType(descr->elemTy);
   Int  nElems = descr->nElems;

   /* Throw out any cases not generated by an amd64 front end.  In
      theory there might be a day where we need to handle them -- if
      we ever run non-amd64-guest on amd64 host. */

   if (nElems != 8 || (elemSz != 1 && elemSz != 8))
      vpanic("genGuestArrayOffset(amd64 host)");

   /* Compute off into a reg, %off.  Then return:

        movq %off, %tmp
        addq $bias, %tmp  (if bias != 0)
        andq $7, %tmp
        ... base(%rbp, %tmp, shift) ...
   */
   tmp  = newVRegI(env);
   roff = iselIntExpr_R(env, off);
   addInstr(env, mk_iMOVsd_RR(roff, tmp));
   if (bias != 0) {
      /* Make sure the bias is sane, in the sense that there are
         no significant bits above bit 30 in it. */
      vassert(-10000 < bias && bias < 10000);
      addInstr(env, 
               AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(bias), tmp));
   }
   addInstr(env, 
            AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(7), tmp));
   vassert(elemSz == 1 || elemSz == 8);
   return
      AMD64AMode_IRRS( descr->base, hregAMD64_RBP(), tmp,
                       elemSz==8 ? 3 : 0 );
}
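
/* E.g. with nElems == 8 and elemSz == 8, an index expression ix and
   bias b produce the amode  descr->base(%rbp, %tmp, 8)  where
   %tmp == (ix + b) & 7. */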
/* Set the SSE unit's rounding mode to default (%mxcsr = 0x1F80) */
static
void set_SSE_rounding_default ( ISelEnv* env )
{
   /* pushq $DEFAULT_MXCSR 
      ldmxcsr 0(%rsp)
      addq $8, %rsp
   */
   AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
   addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(DEFAULT_MXCSR)));
   addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
   add_to_rsp(env, 8);
}
/* Mess with the FPU's rounding mode: set to the default rounding mode
   (DEFAULT_FPUCW). */
static 
void set_FPU_rounding_default ( ISelEnv* env )
{
   /* movq $DEFAULT_FPUCW, -8(%rsp)
      fldcw -8(%rsp)
   */
   AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
   addInstr(env, AMD64Instr_Alu64M(
                    Aalu_MOV, AMD64RI_Imm(DEFAULT_FPUCW), m8_rsp));
   addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
}
/* Mess with the SSE unit's rounding mode: 'mode' is an I32-typed
   expression denoting a value in the range 0 .. 3, indicating a round
   mode encoded as per type IRRoundingMode.  Set the SSE machinery to
   have the same rounding.
*/
static
void set_SSE_rounding_mode ( ISelEnv* env, IRExpr* mode )
{
   /* Note: this sequence only makes sense because DEFAULT_MXCSR has 
      both rounding bits == 0.  If that wasn't the case, we couldn't
      create a new rounding field simply by ORing the new value into
      place. */

   /* movq $3, %reg
      andq [[mode]], %reg  -- shouldn't be needed; paranoia
      shlq $13, %reg
      orq $DEFAULT_MXCSR, %reg
      pushq %reg
      ldmxcsr 0(%rsp)
      addq $8, %rsp
   */
   HReg        reg      = newVRegI(env);
   AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
   addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Imm(3), reg));
   addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
                                   iselIntExpr_RMI(env, mode), reg));
   addInstr(env, AMD64Instr_Sh64(Ash_SHL, 13, reg));
   addInstr(env, AMD64Instr_Alu64R(
                    Aalu_OR, AMD64RMI_Imm(DEFAULT_MXCSR), reg));
   addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(reg)));
   addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
   add_to_rsp(env, 8);
}
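
/* E.g. mode == 2 (Irrm_PosINF) leaves %mxcsr holding
   0x1F80 | (2 << 13) = 0x5F80, i.e. RC = 10b, round towards +infinity. */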
/* Mess with the FPU's rounding mode: 'mode' is an I32-typed
   expression denoting a value in the range 0 .. 3, indicating a round
   mode encoded as per type IRRoundingMode.  Set the x87 FPU to have
   the same rounding.
*/
static
void set_FPU_rounding_mode ( ISelEnv* env, IRExpr* mode )
{
   HReg rrm  = iselIntExpr_R(env, mode);
   HReg rrm2 = newVRegI(env);
   AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());

   /* movq  %rrm, %rrm2
      andq  $3, %rrm2   -- shouldn't be needed; paranoia
      shlq  $10, %rrm2
      orq   $DEFAULT_FPUCW, %rrm2
      movq  %rrm2, -8(%rsp)
      fldcw -8(%rsp)
   */
   addInstr(env, mk_iMOVsd_RR(rrm, rrm2));
   addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(3), rrm2));
   addInstr(env, AMD64Instr_Sh64(Ash_SHL, 10, rrm2));
   addInstr(env, AMD64Instr_Alu64R(Aalu_OR, 
                                   AMD64RMI_Imm(DEFAULT_FPUCW), rrm2));
   addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, 
                                   AMD64RI_Reg(rrm2), m8_rsp));
   addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
}
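
/* E.g. mode == 3 (Irrm_ZERO) writes a control word of
   0x027F | (3 << 10) = 0x0E7F, i.e. RC = 11b, round towards zero. */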
/* Generate all-zeroes into a new vector register.
*/
static HReg generate_zeroes_V128 ( ISelEnv* env )
{
   HReg dst = newVRegV(env);
   addInstr(env, AMD64Instr_SseReRg(Asse_XOR, dst, dst));
   return dst;
}

/* Generate all-ones into a new vector register.
*/
static HReg generate_ones_V128 ( ISelEnv* env )
{
   HReg dst = newVRegV(env);
   addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, dst, dst));
   return dst;
}

/* Generate !src into a new vector register.  Amazing that there isn't
   a less crappy way to do this.
*/
static HReg do_sse_NotV128 ( ISelEnv* env, HReg src )
{
   HReg dst = generate_ones_V128(env);
   addInstr(env, AMD64Instr_SseReRg(Asse_XOR, src, dst));
   return dst;
}
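
/* The all-ones value comes from CMPEQ32 of a register with itself:
   every 32-bit lane compares equal, so every result bit is set.
   XORing the source against that then inverts every bit of it. */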
/* Expand the given byte into a 64-bit word, by cloning each bit
   8 times. */
static ULong bitmask8_to_bytemask64 ( UShort w8 )
{
   vassert(w8 == (w8 & 0xFF));
   ULong w64 = 0;
   Int i;
   for (i = 0; i < 8; i++) {
      if (w8 & (1<<i))
         w64 |= (0xFFULL << (8 * i));
   }
   return w64;
}
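
/* E.g. bitmask8_to_bytemask64(0xA5) == 0xFF00FF0000FF00FFULL. */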
/*---------------------------------------------------------*/
/*--- ISEL: Integer expressions (64/32/16/8 bit)        ---*/
/*---------------------------------------------------------*/

/* Select insns for an integer-typed expression, and add them to the
   code list.  Return a reg holding the result.  This reg will be a
   virtual register.  THE RETURNED REG MUST NOT BE MODIFIED.  If you
   want to modify it, ask for a new vreg, copy it in there, and modify
   the copy.  The register allocator will do its best to map both
   vregs to the same real register, so the copies will often disappear
   later in the game.

   This should handle expressions of 64, 32, 16 and 8-bit type.  All
   results are returned in a 64-bit register.  For 32-, 16- and 8-bit
   expressions, the upper 32/48/56 bits are arbitrary, so you should
   mask or sign extend partial values if necessary.
*/

static HReg iselIntExpr_R ( ISelEnv* env, const IRExpr* e )
{
   HReg r = iselIntExpr_R_wrk(env, e);
   /* sanity checks ... */
#  if 0
   vex_printf("\niselIntExpr_R: "); ppIRExpr(e); vex_printf("\n");
#  endif
   vassert(hregClass(r) == HRcInt64);
   vassert(hregIsVirtual(r));
   return r;
}
/* DO NOT CALL THIS DIRECTLY ! */
static HReg iselIntExpr_R_wrk ( ISelEnv* env, const IRExpr* e )
{
   MatchInfo mi;
   DECLARE_PATTERN(p_1Uto8_64to1);
   DECLARE_PATTERN(p_LDle8_then_8Uto64);
   DECLARE_PATTERN(p_LDle16_then_16Uto64);

   IRType ty = typeOfIRExpr(env->type_env,e);
   switch (ty) {
      case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: break;
      default: vassert(0);
   }

   switch (e->tag) {

   /* --------- TEMP --------- */
   case Iex_RdTmp: {
      return lookupIRTemp(env, e->Iex.RdTmp.tmp);
   }

   /* --------- LOAD --------- */
   case Iex_Load: {
      HReg dst = newVRegI(env);
      AMD64AMode* amode = iselIntExpr_AMode ( env, e->Iex.Load.addr );

      /* We can't handle big-endian loads, nor load-linked. */
      if (e->Iex.Load.end != Iend_LE)
         goto irreducible;

      if (ty == Ity_I64) {
         addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
                                         AMD64RMI_Mem(amode), dst) );
         return dst;
      }
      if (ty == Ity_I32) {
         addInstr(env, AMD64Instr_LoadEX(4,False,amode,dst));
         return dst;
      }
      if (ty == Ity_I16) {
         addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
         return dst;
      }
      if (ty == Ity_I8) {
         addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
         return dst;
      }
      break;
   }
   /* --------- BINARY OP --------- */
   case Iex_Binop: {
      AMD64AluOp   aluOp;
      AMD64ShiftOp shOp;

      /* Pattern: Sub64(0,x) */
      /*     and: Sub32(0,x) */
      if ((e->Iex.Binop.op == Iop_Sub64 && isZeroU64(e->Iex.Binop.arg1))
          || (e->Iex.Binop.op == Iop_Sub32 && isZeroU32(e->Iex.Binop.arg1))) {
         HReg dst = newVRegI(env);
         HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg2);
         addInstr(env, mk_iMOVsd_RR(reg,dst));
         addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
         return dst;
      }
      /* Is it an addition or logical style op? */
      switch (e->Iex.Binop.op) {
         case Iop_Add8: case Iop_Add16: case Iop_Add32: case Iop_Add64: 
            aluOp = Aalu_ADD; break;
         case Iop_Sub8: case Iop_Sub16: case Iop_Sub32: case Iop_Sub64:
            aluOp = Aalu_SUB; break;
         case Iop_And8: case Iop_And16: case Iop_And32: case Iop_And64: 
            aluOp = Aalu_AND; break;
         case Iop_Or8:  case Iop_Or16:  case Iop_Or32:  case Iop_Or64: 
            aluOp = Aalu_OR; break;
         case Iop_Xor8: case Iop_Xor16: case Iop_Xor32: case Iop_Xor64: 
            aluOp = Aalu_XOR; break;
         case Iop_Mul16: case Iop_Mul32: case Iop_Mul64:
            aluOp = Aalu_MUL; break;
         default:
            aluOp = Aalu_INVALID; break;
      }
      /* For commutative ops we assume any literal
         values are on the second operand. */
      if (aluOp != Aalu_INVALID) {
         HReg dst      = newVRegI(env);
         HReg reg      = iselIntExpr_R(env, e->Iex.Binop.arg1);
         AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
         addInstr(env, mk_iMOVsd_RR(reg,dst));
         addInstr(env, AMD64Instr_Alu64R(aluOp, rmi, dst));
         return dst;
      }
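
      /* So, e.g., Add64(t5, 0x42:I64) becomes
            movq <vreg of t5>, <dst>
            addq $0x42, <dst>
         since iselIntExpr_RMI folds the small literal into the RMI
         operand. */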
      /* Perhaps a shift op? */
      switch (e->Iex.Binop.op) {
         case Iop_Shl64: case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
            shOp = Ash_SHL; break;
         case Iop_Shr64: case Iop_Shr32: case Iop_Shr16: case Iop_Shr8: 
            shOp = Ash_SHR; break;
         case Iop_Sar64: case Iop_Sar32: case Iop_Sar16: case Iop_Sar8: 
            shOp = Ash_SAR; break;
         default:
            shOp = Ash_INVALID; break;
      }
      if (shOp != Ash_INVALID) {
         HReg dst = newVRegI(env);

         /* regL = the value to be shifted */
         HReg regL = iselIntExpr_R(env, e->Iex.Binop.arg1);
         addInstr(env, mk_iMOVsd_RR(regL,dst));

         /* Do any necessary widening for 16/8 bit operands.  Also decide on the
            final width at which the shift is to be done. */
         Bool shift64 = False;
         switch (e->Iex.Binop.op) {
            case Iop_Shr64: case Iop_Shl64: case Iop_Sar64:
               shift64 = True;
               break;
            case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
               break;
            case Iop_Shr8:
               addInstr(env, AMD64Instr_Alu64R(
                                Aalu_AND, AMD64RMI_Imm(0xFF), dst));
               break;
            case Iop_Shr16:
               addInstr(env, AMD64Instr_Alu64R(
                                Aalu_AND, AMD64RMI_Imm(0xFFFF), dst));
               break;
            case Iop_Shr32:
               break;
            case Iop_Sar8:
               addInstr(env, AMD64Instr_Sh32(Ash_SHL, 24, dst));
               addInstr(env, AMD64Instr_Sh32(Ash_SAR, 24, dst));
               break;
            case Iop_Sar16:
               addInstr(env, AMD64Instr_Sh32(Ash_SHL, 16, dst));
               addInstr(env, AMD64Instr_Sh32(Ash_SAR, 16, dst));
               break;
            case Iop_Sar32:
               break;
            default:
               ppIROp(e->Iex.Binop.op);
               vassert(0);
         }

         /* Now consider the shift amount.  If it's a literal, we
            can do a much better job than the general case. */
         if (e->Iex.Binop.arg2->tag == Iex_Const) {
            /* assert that the IR is well-typed */
            Int nshift;
            vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
            nshift = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
            vassert(nshift >= 0);
            if (nshift > 0) {
               /* Can't allow nshift==0 since that means %cl */
               if (shift64) {
                  addInstr(env, AMD64Instr_Sh64(shOp, nshift, dst));
               } else {
                  addInstr(env, AMD64Instr_Sh32(shOp, nshift, dst));
               }
            }
         } else {
            /* General case; we have to force the amount into %cl. */
            HReg regR = iselIntExpr_R(env, e->Iex.Binop.arg2);
            addInstr(env, mk_iMOVsd_RR(regR,hregAMD64_RCX()));
            if (shift64) {
               addInstr(env, AMD64Instr_Sh64(shOp, 0/* %cl */, dst));
            } else {
               addInstr(env, AMD64Instr_Sh32(shOp, 0/* %cl */, dst));
            }
         }
         return dst;
      }
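
      /* E.g. Shr16(t1, 3:I8): the operand is first masked with 0xFFFF
         to clear the junk in bits 16..63, and since the shift amount
         is an in-range literal a single 32-bit 'shrl $3' is emitted. */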
      /* Handle misc other scalar ops. */
      if (e->Iex.Binop.op == Iop_Max32U) {
         HReg src1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
         HReg dst  = newVRegI(env);
         HReg src2 = iselIntExpr_R(env, e->Iex.Binop.arg2);
         addInstr(env, mk_iMOVsd_RR(src1, dst));
         addInstr(env, AMD64Instr_Alu32R(Aalu_CMP, AMD64RMI_Reg(src2), dst));
         addInstr(env, AMD64Instr_CMov64(Acc_B, src2, dst));
         return dst;
      }
      if (e->Iex.Binop.op == Iop_DivModS64to32
          || e->Iex.Binop.op == Iop_DivModU64to32) {
         /* 64 x 32 -> (32(rem),32(div)) division */
         /* Get the 64-bit operand into edx:eax, and the other into
            any old R/M. */
         HReg      rax     = hregAMD64_RAX();
         HReg      rdx     = hregAMD64_RDX();
         HReg      dst     = newVRegI(env);
         Bool      syned   = toBool(e->Iex.Binop.op == Iop_DivModS64to32);
         AMD64RM*  rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
         /* Compute the left operand into a reg, and then 
            put the top half in edx and the bottom in eax. */
         HReg left64 = iselIntExpr_R(env, e->Iex.Binop.arg1);
         addInstr(env, mk_iMOVsd_RR(left64, rdx));
         addInstr(env, mk_iMOVsd_RR(left64, rax));
         addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, rdx));
         addInstr(env, AMD64Instr_Div(syned, 4, rmRight));
         addInstr(env, AMD64Instr_MovxLQ(False, rdx, rdx));
         addInstr(env, AMD64Instr_MovxLQ(False, rax, rax));
         addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, rdx));
         addInstr(env, mk_iMOVsd_RR(rax, dst));
         addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(rdx), dst));
         return dst;
      }
      if (e->Iex.Binop.op == Iop_32HLto64) {
         HReg hi32  = newVRegI(env);
         HReg lo32  = newVRegI(env);
         HReg hi32s = iselIntExpr_R(env, e->Iex.Binop.arg1);
         HReg lo32s = iselIntExpr_R(env, e->Iex.Binop.arg2);
         addInstr(env, mk_iMOVsd_RR(hi32s, hi32));
         addInstr(env, mk_iMOVsd_RR(lo32s, lo32));
         addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, hi32));
         addInstr(env, AMD64Instr_MovxLQ(False, lo32, lo32));
         addInstr(env, AMD64Instr_Alu64R(
                          Aalu_OR, AMD64RMI_Reg(lo32), hi32));
         return hi32;
      }
      if (e->Iex.Binop.op == Iop_16HLto32) {
         HReg hi16  = newVRegI(env);
         HReg lo16  = newVRegI(env);
         HReg hi16s = iselIntExpr_R(env, e->Iex.Binop.arg1);
         HReg lo16s = iselIntExpr_R(env, e->Iex.Binop.arg2);
         addInstr(env, mk_iMOVsd_RR(hi16s, hi16));
         addInstr(env, mk_iMOVsd_RR(lo16s, lo16));
         addInstr(env, AMD64Instr_Sh64(Ash_SHL, 16, hi16));
         addInstr(env, AMD64Instr_Alu64R(
                          Aalu_AND, AMD64RMI_Imm(0xFFFF), lo16));
         addInstr(env, AMD64Instr_Alu64R(
                          Aalu_OR, AMD64RMI_Reg(lo16), hi16));
         return hi16;
      }
      if (e->Iex.Binop.op == Iop_8HLto16) {
         HReg hi8  = newVRegI(env);
         HReg lo8  = newVRegI(env);
         HReg hi8s = iselIntExpr_R(env, e->Iex.Binop.arg1);
         HReg lo8s = iselIntExpr_R(env, e->Iex.Binop.arg2);
         addInstr(env, mk_iMOVsd_RR(hi8s, hi8));
         addInstr(env, mk_iMOVsd_RR(lo8s, lo8));
         addInstr(env, AMD64Instr_Sh64(Ash_SHL, 8, hi8));
         addInstr(env, AMD64Instr_Alu64R(
                          Aalu_AND, AMD64RMI_Imm(0xFF), lo8));
         addInstr(env, AMD64Instr_Alu64R(
                          Aalu_OR, AMD64RMI_Reg(lo8), hi8));
         return hi8;
      }
      if (e->Iex.Binop.op == Iop_MullS32
          || e->Iex.Binop.op == Iop_MullS16
          || e->Iex.Binop.op == Iop_MullS8
          || e->Iex.Binop.op == Iop_MullU32
          || e->Iex.Binop.op == Iop_MullU16
          || e->Iex.Binop.op == Iop_MullU8) {
         HReg a32  = newVRegI(env);
         HReg b32  = newVRegI(env);
         HReg a32s = iselIntExpr_R(env, e->Iex.Binop.arg1);
         HReg b32s = iselIntExpr_R(env, e->Iex.Binop.arg2);
         Int          shift  = 0;
         AMD64ShiftOp shr_op = Ash_SHR;
         switch (e->Iex.Binop.op) {
            case Iop_MullS32: shr_op = Ash_SAR; shift = 32; break;
            case Iop_MullS16: shr_op = Ash_SAR; shift = 48; break;
            case Iop_MullS8:  shr_op = Ash_SAR; shift = 56; break;
            case Iop_MullU32: shr_op = Ash_SHR; shift = 32; break;
            case Iop_MullU16: shr_op = Ash_SHR; shift = 48; break;
            case Iop_MullU8:  shr_op = Ash_SHR; shift = 56; break;
            default: vassert(0);
         }

         addInstr(env, mk_iMOVsd_RR(a32s, a32));
         addInstr(env, mk_iMOVsd_RR(b32s, b32));
         addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, a32));
         addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, b32));
         addInstr(env, AMD64Instr_Sh64(shr_op,  shift, a32));
         addInstr(env, AMD64Instr_Sh64(shr_op,  shift, b32));
         addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(a32), b32));
         return b32;
      }
      if (e->Iex.Binop.op == Iop_CmpF64) {
         HReg fL  = iselDblExpr(env, e->Iex.Binop.arg1);
         HReg fR  = iselDblExpr(env, e->Iex.Binop.arg2);
         HReg dst = newVRegI(env);
         addInstr(env, AMD64Instr_SseUComIS(8,fL,fR,dst));
         /* Mask out irrelevant parts of the result so as to conform
            to the CmpF64 definition. */
         addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(0x45), dst));
         return dst;
      }
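
      /* The 0x45 mask keeps just the flag bits ucomisd sets (ZF=0x40,
         PF=0x04, CF=0x01), which matches the IRCmpF64Result encoding:
         0x00 GT, 0x01 LT, 0x40 EQ, 0x45 unordered. */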
      if (e->Iex.Binop.op == Iop_F64toI32S
          || e->Iex.Binop.op == Iop_F64toI64S) {
         Int  szD = e->Iex.Binop.op==Iop_F64toI32S ? 4 : 8;
         HReg rf  = iselDblExpr(env, e->Iex.Binop.arg2);
         HReg dst = newVRegI(env);
         set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
         addInstr(env, AMD64Instr_SseSF2SI( 8, szD, rf, dst ));
         set_SSE_rounding_default(env);
         return dst;
      }
      /* Deal with 64-bit SIMD binary ops.  For the most part these are doable
         by using the equivalent 128-bit operation and ignoring the upper half
         of the result. */
      AMD64SseOp op = Asse_INVALID;
      Bool arg1isEReg = False;
      Bool preShift32R = False;
      switch (e->Iex.Binop.op) {
         // The following 3 could be done with 128 bit insns too, but
         // first require the inputs to be reformatted.
         //case Iop_QNarrowBin32Sto16Sx4:
         //op = Asse_PACKSSD; arg1isEReg = True; break;
         //case Iop_QNarrowBin16Sto8Sx8:
         //op = Asse_PACKSSW; arg1isEReg = True; break;
         //case Iop_QNarrowBin16Sto8Ux8:
         //op = Asse_PACKUSW; arg1isEReg = True; break;

         case Iop_InterleaveHI8x8:
            op = Asse_UNPCKLB; arg1isEReg = True; preShift32R = True;
            break;
         case Iop_InterleaveHI16x4:
            op = Asse_UNPCKLW; arg1isEReg = True; preShift32R = True;
            break;
         case Iop_InterleaveHI32x2:
            op = Asse_UNPCKLD; arg1isEReg = True; preShift32R = True;
            break;
         case Iop_InterleaveLO8x8:
            op = Asse_UNPCKLB; arg1isEReg = True;
            break;
         case Iop_InterleaveLO16x4:
            op = Asse_UNPCKLW; arg1isEReg = True;
            break;
         case Iop_InterleaveLO32x2:
            op = Asse_UNPCKLD; arg1isEReg = True;
            break;

         case Iop_Add8x8:     op = Asse_ADD8;     break;
         case Iop_Add16x4:    op = Asse_ADD16;    break;
         case Iop_Add32x2:    op = Asse_ADD32;    break;
         case Iop_QAdd8Sx8:   op = Asse_QADD8S;   break;
         case Iop_QAdd16Sx4:  op = Asse_QADD16S;  break;
         case Iop_QAdd8Ux8:   op = Asse_QADD8U;   break;
         case Iop_QAdd16Ux4:  op = Asse_QADD16U;  break;
         case Iop_Avg8Ux8:    op = Asse_AVG8U;    break;
         case Iop_Avg16Ux4:   op = Asse_AVG16U;   break;
         case Iop_CmpEQ8x8:   op = Asse_CMPEQ8;   break;
         case Iop_CmpEQ16x4:  op = Asse_CMPEQ16;  break;
         case Iop_CmpEQ32x2:  op = Asse_CMPEQ32;  break;
         case Iop_CmpGT8Sx8:  op = Asse_CMPGT8S;  break;
         case Iop_CmpGT16Sx4: op = Asse_CMPGT16S; break;
         case Iop_CmpGT32Sx2: op = Asse_CMPGT32S; break;
         case Iop_Max16Sx4:   op = Asse_MAX16S;   break;
         case Iop_Max8Ux8:    op = Asse_MAX8U;    break;
         case Iop_Min16Sx4:   op = Asse_MIN16S;   break;
         case Iop_Min8Ux8:    op = Asse_MIN8U;    break;
         case Iop_MulHi16Ux4: op = Asse_MULHI16U; break;
         case Iop_MulHi16Sx4: op = Asse_MULHI16S; break;
         case Iop_Mul16x4:    op = Asse_MUL16;    break;
         case Iop_Sub8x8:     op = Asse_SUB8;     break;
         case Iop_Sub16x4:    op = Asse_SUB16;    break;
         case Iop_Sub32x2:    op = Asse_SUB32;    break;
         case Iop_QSub8Sx8:   op = Asse_QSUB8S;   break;
         case Iop_QSub16Sx4:  op = Asse_QSUB16S;  break;
         case Iop_QSub8Ux8:   op = Asse_QSUB8U;   break;
         case Iop_QSub16Ux4:  op = Asse_QSUB16U;  break;
         default: break;
      }
      if (op != Asse_INVALID) {
         /* This isn't pretty, but .. move each arg to the low half of an XMM
            register, do the operation on the whole register, and move the
            result back to an integer register. */
         const IRExpr* arg1 = e->Iex.Binop.arg1;
         const IRExpr* arg2 = e->Iex.Binop.arg2;
         vassert(typeOfIRExpr(env->type_env, arg1) == Ity_I64);
         vassert(typeOfIRExpr(env->type_env, arg2) == Ity_I64);
         HReg iarg1 = iselIntExpr_R(env, arg1);
         HReg iarg2 = iselIntExpr_R(env, arg2);
         HReg varg1 = newVRegV(env);
         HReg varg2 = newVRegV(env);
         HReg idst  = newVRegI(env);
         addInstr(env, AMD64Instr_SseMOVQ(iarg1, varg1, True/*toXMM*/));
         addInstr(env, AMD64Instr_SseMOVQ(iarg2, varg2, True/*toXMM*/));
         if (arg1isEReg) {
            if (preShift32R) {
               addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 32, varg1));
               addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 32, varg2));
            }
            addInstr(env, AMD64Instr_SseReRg(op, varg1, varg2));
            addInstr(env, AMD64Instr_SseMOVQ(idst, varg2, False/*!toXMM*/));
         } else {
            vassert(!preShift32R);
            addInstr(env, AMD64Instr_SseReRg(op, varg2, varg1));
            addInstr(env, AMD64Instr_SseMOVQ(idst, varg1, False/*!toXMM*/));
         }
         return idst;
      }
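
      /* E.g. Add16x4(a,b): both 64-bit args are movq'd into the low
         halves of fresh XMM registers, a single 128-bit ADD16 (paddw)
         does the work, and the low 64 bits of the result are movq'd
         back to an integer vreg. */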
      /* 64-bit SIMD shifts by a scalar amount. */
      UInt laneBits = 0;
      switch (e->Iex.Binop.op) {
         case Iop_ShlN16x4: laneBits = 16; op = Asse_SHL16; break;
         case Iop_ShlN32x2: laneBits = 32; op = Asse_SHL32; break;
         case Iop_SarN16x4: laneBits = 16; op = Asse_SAR16; break;
         case Iop_SarN32x2: laneBits = 32; op = Asse_SAR32; break;
         case Iop_ShrN16x4: laneBits = 16; op = Asse_SHR16; break;
         case Iop_ShrN32x2: laneBits = 32; op = Asse_SHR32; break;
         default: break;
      }
      if (op != Asse_INVALID) {
         const IRExpr* arg1 = e->Iex.Binop.arg1;
         const IRExpr* arg2 = e->Iex.Binop.arg2;
         vassert(typeOfIRExpr(env->type_env, arg1) == Ity_I64);
         vassert(typeOfIRExpr(env->type_env, arg2) == Ity_I8);
         HReg igreg = iselIntExpr_R(env, arg1);
         HReg vgreg = newVRegV(env);
         HReg idst  = newVRegI(env);
         addInstr(env, AMD64Instr_SseMOVQ(igreg, vgreg, True/*toXMM*/));
         /* If it's a shift by an in-range immediate, generate a single
            instruction. */
         if (arg2->tag == Iex_Const) {
            IRConst* c = arg2->Iex.Const.con;
            vassert(c->tag == Ico_U8);
            UInt shift = c->Ico.U8;
            if (shift < laneBits) {
               addInstr(env, AMD64Instr_SseShiftN(op, shift, vgreg));
               addInstr(env, AMD64Instr_SseMOVQ(idst, vgreg, False/*!toXMM*/));
               return idst;
            }
         }
         /* Otherwise we have to do it the longwinded way. */
         HReg ishift = iselIntExpr_R(env, arg2);
         HReg vshift = newVRegV(env);
         addInstr(env, AMD64Instr_SseMOVQ(ishift, vshift, True/*toXMM*/));
         addInstr(env, AMD64Instr_SseReRg(op, vshift, vgreg));
         addInstr(env, AMD64Instr_SseMOVQ(idst, vgreg, False/*!toXMM*/));
         return idst;
      }
      if (e->Iex.Binop.op == Iop_Mul32x2) {
         const IRExpr* arg1 = e->Iex.Binop.arg1;
         const IRExpr* arg2 = e->Iex.Binop.arg2;
         vassert(typeOfIRExpr(env->type_env, arg1) == Ity_I64);
         vassert(typeOfIRExpr(env->type_env, arg2) == Ity_I64);
         HReg s1 = iselIntExpr_R(env, arg1);
         HReg s2 = iselIntExpr_R(env, arg2);
         HReg resLo = newVRegI(env);
         // resLo = (s1 *64 s2) & 0xFFFF'FFFF
         addInstr(env, mk_iMOVsd_RR(s1, resLo));
         addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(s2), resLo));
         addInstr(env, AMD64Instr_MovxLQ(False, resLo, resLo));

         // resHi = ((s1 >>u 32) *64 (s2 >>u 32)) << 32;
         HReg resHi = newVRegI(env);
         addInstr(env, mk_iMOVsd_RR(s1, resHi));
         addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, resHi));
         HReg tmp = newVRegI(env);
         addInstr(env, mk_iMOVsd_RR(s2, tmp));
         addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, tmp));
         addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(tmp), resHi));
         addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, resHi));

         // final result = resHi | resLo
         addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(resHi), resLo));
         return resLo;
      }
      // A few remaining SIMD64 ops require helper functions, at least for
      // now.
      Bool second_is_UInt = False;
      HWord fn = 0;
      switch (e->Iex.Binop.op) {
         case Iop_CatOddLanes16x4:
            fn = (HWord)h_generic_calc_CatOddLanes16x4; break;
         case Iop_CatEvenLanes16x4:
            fn = (HWord)h_generic_calc_CatEvenLanes16x4; break;
         case Iop_PermOrZero8x8:
            fn = (HWord)h_generic_calc_PermOrZero8x8; break;

         case Iop_QNarrowBin32Sto16Sx4:
            fn = (HWord)h_generic_calc_QNarrowBin32Sto16Sx4; break;
         case Iop_QNarrowBin16Sto8Sx8:
            fn = (HWord)h_generic_calc_QNarrowBin16Sto8Sx8; break;
         case Iop_QNarrowBin16Sto8Ux8:
            fn = (HWord)h_generic_calc_QNarrowBin16Sto8Ux8; break;

         case Iop_NarrowBin16to8x8:
            fn = (HWord)h_generic_calc_NarrowBin16to8x8; break;
         case Iop_NarrowBin32to16x4:
            fn = (HWord)h_generic_calc_NarrowBin32to16x4; break;

         case Iop_SarN8x8:
            fn = (HWord)h_generic_calc_SarN8x8;
            second_is_UInt = True;
            break;

         default:
            fn = (HWord)0; break;
      }
      if (fn != (HWord)0) {
         /* Note: the following assumes all helpers are of signature 
               ULong fn ( ULong, ULong ), and they are
            not marked as regparm functions. 
         */
         HReg dst  = newVRegI(env);
         HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
         HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
         if (second_is_UInt)
            addInstr(env, AMD64Instr_MovxLQ(False, argR, argR));
         addInstr(env, mk_iMOVsd_RR(argL, hregAMD64_RDI()) );
         addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RSI()) );
         addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 2,
                                        mk_RetLoc_simple(RLPri_Int) ));
         addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
         return dst;
      }
      // Half-float vector conversion
      if (e->Iex.Binop.op == Iop_F32toF16x4
          && (env->hwcaps & VEX_HWCAPS_AMD64_F16C)) {
         HReg srcV = iselVecExpr(env, e->Iex.Binop.arg2);
         HReg dstV = newVRegV(env);
         HReg dstI = newVRegI(env);
         set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
         addInstr(env, AMD64Instr_Sse32Fx4(Asse_F32toF16, srcV, dstV));
         set_SSE_rounding_default(env);
         addInstr(env, AMD64Instr_SseMOVQ(dstI, dstV, /*toXMM=*/False));
         return dstI;
      }

      break;
   }
   /* --------- UNARY OP --------- */
   case Iex_Unop: {

      /* 1Uto8(64to1(expr64)) */
      {
         DEFINE_PATTERN( p_1Uto8_64to1,
                         unop(Iop_1Uto8, unop(Iop_64to1, bind(0))) );
         if (matchIRExpr(&mi,p_1Uto8_64to1,e)) {
            const IRExpr* expr64 = mi.bindee[0];
            HReg dst = newVRegI(env);
            HReg src = iselIntExpr_R(env, expr64);
            addInstr(env, mk_iMOVsd_RR(src,dst) );
            addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
                                            AMD64RMI_Imm(1), dst));
            return dst;
         }
      }
      /* 8Uto64(LDle(expr64)) */
      {
         DEFINE_PATTERN(p_LDle8_then_8Uto64,
                        unop(Iop_8Uto64,
                             IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
         if (matchIRExpr(&mi,p_LDle8_then_8Uto64,e)) {
            HReg dst = newVRegI(env);
            AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
            addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
            return dst;
         }
      }
      /* 16Uto64(LDle(expr64)) */
      {
         DEFINE_PATTERN(p_LDle16_then_16Uto64,
                        unop(Iop_16Uto64,
                             IRExpr_Load(Iend_LE,Ity_I16,bind(0))) );
         if (matchIRExpr(&mi,p_LDle16_then_16Uto64,e)) {
            HReg dst = newVRegI(env);
            AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
            addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
            return dst;
         }
      }
      /* 32Uto64( Add32/Sub32/And32/Or32/Xor32(expr32, expr32) )
         Use 32 bit arithmetic and let the default zero-extend rule
         do the 32Uto64 for free. */
      if (e->Iex.Unop.op == Iop_32Uto64 && e->Iex.Unop.arg->tag == Iex_Binop) {
         IROp    opi  = e->Iex.Unop.arg->Iex.Binop.op; /* inner op */
         IRExpr* argL = e->Iex.Unop.arg->Iex.Binop.arg1;
         IRExpr* argR = e->Iex.Unop.arg->Iex.Binop.arg2;
         AMD64AluOp aluOp = Aalu_INVALID;
         switch (opi) {
            case Iop_Add32: aluOp = Aalu_ADD; break;
            case Iop_Sub32: aluOp = Aalu_SUB; break;
            case Iop_And32: aluOp = Aalu_AND; break;
            case Iop_Or32:  aluOp = Aalu_OR;  break;
            case Iop_Xor32: aluOp = Aalu_XOR; break;
            default: break;
         }
         if (aluOp != Aalu_INVALID) {
            /* For commutative ops we assume any literal values are on
               the second operand. */
            HReg dst      = newVRegI(env);
            HReg reg      = iselIntExpr_R(env, argL);
            AMD64RMI* rmi = iselIntExpr_RMI(env, argR);
            addInstr(env, mk_iMOVsd_RR(reg,dst));
            addInstr(env, AMD64Instr_Alu32R(aluOp, rmi, dst));
            return dst;
         }
         /* just fall through to normal handling for Iop_32Uto64 */
      }
      /* Fallback cases */
      switch (e->Iex.Unop.op) {
         case Iop_32Uto64:
         case Iop_32Sto64: {
            HReg dst = newVRegI(env);
            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
            addInstr(env, AMD64Instr_MovxLQ(e->Iex.Unop.op == Iop_32Sto64,
                                            src, dst) );
            return dst;
         }
         case Iop_128HIto64: {
            HReg rHi, rLo;
            iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
            return rHi; /* and abandon rLo */
         }
         case Iop_128to64: {
            HReg rHi, rLo;
            iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
            return rLo; /* and abandon rHi */
         }
         case Iop_8Uto16:
         case Iop_8Uto32:
         case Iop_8Uto64:
         case Iop_16Uto64:
         case Iop_16Uto32: {
            HReg dst     = newVRegI(env);
            HReg src     = iselIntExpr_R(env, e->Iex.Unop.arg);
            Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Uto32
                                   || e->Iex.Unop.op==Iop_16Uto64 );
            UInt mask    = srcIs16 ? 0xFFFF : 0xFF;
            addInstr(env, mk_iMOVsd_RR(src,dst) );
            addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
                                            AMD64RMI_Imm(mask), dst));
            return dst;
         }
         case Iop_8Sto16:
         case Iop_8Sto32:
         case Iop_8Sto64:
         case Iop_16Sto32:
         case Iop_16Sto64: {
            HReg dst     = newVRegI(env);
            HReg src     = iselIntExpr_R(env, e->Iex.Unop.arg);
            Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Sto32
                                   || e->Iex.Unop.op==Iop_16Sto64 );
            UInt amt     = srcIs16 ? 48 : 56;
            addInstr(env, mk_iMOVsd_RR(src,dst) );
            addInstr(env, AMD64Instr_Sh64(Ash_SHL, amt, dst));
            addInstr(env, AMD64Instr_Sh64(Ash_SAR, amt, dst));
            return dst;
         }
         case Iop_Not8:
         case Iop_Not16:
         case Iop_Not32:
         case Iop_Not64: {
            HReg dst = newVRegI(env);
            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
            addInstr(env, mk_iMOVsd_RR(src,dst) );
            addInstr(env, AMD64Instr_Unary64(Aun_NOT,dst));
            return dst;
         }
         case Iop_16HIto8:
         case Iop_32HIto16:
         case Iop_64HIto32: {
            HReg dst  = newVRegI(env);
            HReg src  = iselIntExpr_R(env, e->Iex.Unop.arg);
            Int shift = 0;
            switch (e->Iex.Unop.op) {
               case Iop_16HIto8:  shift = 8;  break;
               case Iop_32HIto16: shift = 16; break;
               case Iop_64HIto32: shift = 32; break;
               default: vassert(0);
            }
            addInstr(env, mk_iMOVsd_RR(src,dst) );
            addInstr(env, AMD64Instr_Sh64(Ash_SHR, shift, dst));
            return dst;
         }
         case Iop_1Uto64:
         case Iop_1Uto32:
         case Iop_1Uto8: {
            HReg dst           = newVRegI(env);
            AMD64CondCode cond = iselCondCode_C(env, e->Iex.Unop.arg);
            addInstr(env, AMD64Instr_Set64(cond,dst));
            return dst;
         }
         case Iop_1Sto8:
         case Iop_1Sto16:
         case Iop_1Sto32:
         case Iop_1Sto64: {
            HReg dst = newVRegI(env);
            HReg tmp = iselCondCode_R(env, e->Iex.Unop.arg);
            addInstr(env, mk_iMOVsd_RR(tmp, dst));
            addInstr(env, AMD64Instr_Sh64(Ash_SHL, 63, dst));
            addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
            return dst;
         }
         case Iop_Ctz64: {
            /* Count trailing zeroes, implemented by amd64 'bsfq' */
            HReg dst = newVRegI(env);
            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
            addInstr(env, AMD64Instr_Bsfr64(True,src,dst));
            return dst;
         }
         case Iop_Clz64: {
            /* Count leading zeroes.  Do 'bsrq' to establish the index
               of the highest set bit, and subtract that value from
               63. */
            HReg tmp = newVRegI(env);
            HReg dst = newVRegI(env);
            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
            addInstr(env, AMD64Instr_Bsfr64(False,src,tmp));
            addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, 
                                            AMD64RMI_Imm(63), dst));
            addInstr(env, AMD64Instr_Alu64R(Aalu_SUB,
                                            AMD64RMI_Reg(tmp), dst));
            return dst;
         }
         case Iop_CmpwNEZ64: {
            HReg dst = newVRegI(env);
            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
            addInstr(env, mk_iMOVsd_RR(src,dst));
            addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
            addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
                                            AMD64RMI_Reg(src), dst));
            addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
            return dst;
         }

         case Iop_CmpwNEZ32: {
            HReg src = newVRegI(env);
            HReg dst = newVRegI(env);
            HReg pre = iselIntExpr_R(env, e->Iex.Unop.arg);
            addInstr(env, mk_iMOVsd_RR(pre,src));
            addInstr(env, AMD64Instr_MovxLQ(False, src, src));
            addInstr(env, mk_iMOVsd_RR(src,dst));
            addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
            addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
                                            AMD64RMI_Reg(src), dst));
            addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
            return dst;
         }

         case Iop_Left8:
         case Iop_Left16:
         case Iop_Left32:
         case Iop_Left64: {
            HReg dst = newVRegI(env);
            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
            addInstr(env, mk_iMOVsd_RR(src, dst));
            addInstr(env, AMD64Instr_Unary64(Aun_NEG, dst));
            addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(src), dst));
            return dst;
         }

         case Iop_V128to32: {
            HReg dst = newVRegI(env);
            HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
            AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
            addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, rsp_m16));
            addInstr(env, AMD64Instr_LoadEX(4, False/*z-widen*/, rsp_m16, dst));
            return dst;
         }

         case Iop_V128to64: {
            HReg dst = newVRegI(env);
            HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
            addInstr(env, AMD64Instr_SseMOVQ(dst, vec, False/*!toXMM*/));
            return dst;
         }
         case Iop_V128HIto64: {
            HReg dst  = newVRegI(env);
            HReg vec  = iselVecExpr(env, e->Iex.Unop.arg);
            HReg vec2 = newVRegV(env);
            addInstr(env, mk_vMOVsd_RR(vec, vec2));
            addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, vec2));
            addInstr(env, AMD64Instr_SseMOVQ(dst, vec2, False/*!toXMM*/));
            return dst;
         }

         /* V256to64_{3,2,1,0} */
         case Iop_V256to64_0: case Iop_V256to64_1:
         case Iop_V256to64_2: case Iop_V256to64_3: {
            HReg vHi, vLo, vec;
            iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg);
            /* Do the first part of the selection by deciding which of
               the 128 bit registers to look at, and second part using
               the same scheme as for V128{HI}to64 above. */
            Bool low64of128 = True;
            switch (e->Iex.Unop.op) {
               case Iop_V256to64_0: vec = vLo; low64of128 = True;  break;
               case Iop_V256to64_1: vec = vLo; low64of128 = False; break;
               case Iop_V256to64_2: vec = vHi; low64of128 = True;  break;
               case Iop_V256to64_3: vec = vHi; low64of128 = False; break;
               default: vassert(0);
            }
            HReg dst = newVRegI(env);
            if (low64of128) {
               addInstr(env, AMD64Instr_SseMOVQ(dst, vec, False/*!toXMM*/));
            } else {
               HReg vec2 = newVRegV(env);
               addInstr(env, mk_vMOVsd_RR(vec, vec2));
               addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, vec2));
               addInstr(env, AMD64Instr_SseMOVQ(dst, vec2, False/*!toXMM*/));
            }
            return dst;
         }
1744 /* ReinterpF64asI64(e) */
1745 /* Given an IEEE754 double, produce an I64 with the same bit
1747 case Iop_ReinterpF64asI64
: {
1748 AMD64AMode
* m8_rsp
= AMD64AMode_IR(-8, hregAMD64_RSP());
1749 HReg dst
= newVRegI(env
);
1750 HReg src
= iselDblExpr(env
, e
->Iex
.Unop
.arg
);
1752 set_SSE_rounding_default(env
);
1753 addInstr(env
, AMD64Instr_SseLdSt(False
/*store*/, 8, src
, m8_rsp
));
1754 addInstr(env
, AMD64Instr_Alu64R(
1755 Aalu_MOV
, AMD64RMI_Mem(m8_rsp
), dst
));
1759 /* ReinterpF32asI32(e) */
1760 /* Given an IEEE754 single, produce an I64 with the same bit
1761 pattern in the lower half. */
1762 case Iop_ReinterpF32asI32
: {
1763 AMD64AMode
* m8_rsp
= AMD64AMode_IR(-8, hregAMD64_RSP());
1764 HReg dst
= newVRegI(env
);
1765 HReg src
= iselFltExpr(env
, e
->Iex
.Unop
.arg
);
1767 set_SSE_rounding_default(env
);
1768 addInstr(env
, AMD64Instr_SseLdSt(False
/*store*/, 4, src
, m8_rsp
));
1769 addInstr(env
, AMD64Instr_LoadEX(4, False
/*unsigned*/, m8_rsp
, dst
));
1779 /* These are no-ops. */
1780 return iselIntExpr_R(env
, e
->Iex
.Unop
.arg
);
1782 case Iop_GetMSBs8x8
: {
1783 /* Note: the following assumes the helper is of
1785 UInt fn ( ULong ), and is not a regparm fn.
1787 HReg dst
= newVRegI(env
);
1788 HReg arg
= iselIntExpr_R(env
, e
->Iex
.Unop
.arg
);
1789 HWord fn
= (HWord
)h_generic_calc_GetMSBs8x8
;
1790 addInstr(env
, mk_iMOVsd_RR(arg
, hregAMD64_RDI()) );
1791 addInstr(env
, AMD64Instr_Call( Acc_ALWAYS
, (ULong
)fn
,
1792 1, mk_RetLoc_simple(RLPri_Int
) ));
1793 /* MovxLQ is not exactly the right thing here. We just
1794 need to get the bottom 8 bits of RAX into dst, and zero
1795 out everything else. Assuming that the helper returns
1796 a UInt with the top 24 bits zeroed out, it'll do,
1798 addInstr(env
, AMD64Instr_MovxLQ(False
, hregAMD64_RAX(), dst
));
         case Iop_GetMSBs8x16: {
            /* Note: the following assumes the helper is of signature
                  UInt fn ( ULong w64hi, ULong w64Lo ),
               and is not a regparm fn. */
            HReg dst = newVRegI(env);
            HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
            HReg rsp = hregAMD64_RSP();
            HWord fn = (HWord)h_generic_calc_GetMSBs8x16;
            AMD64AMode* m8_rsp  = AMD64AMode_IR( -8, rsp);
            AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
            addInstr(env, AMD64Instr_SseLdSt(False/*store*/,
                                             16, vec, m16_rsp));
            /* hi 64 bits into RDI -- the first arg */
            addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
                                             AMD64RMI_Mem(m8_rsp),
                                             hregAMD64_RDI() )); /* 1st arg */
            /* lo 64 bits into RSI -- the 2nd arg */
            addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
                                             AMD64RMI_Mem(m16_rsp),
                                             hregAMD64_RSI() )); /* 2nd arg */
            addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
                                           2, mk_RetLoc_simple(RLPri_Int) ));
            /* MovxLQ is not exactly the right thing here.  We just
               need to get the bottom 16 bits of RAX into dst, and zero
               out everything else.  Assuming that the helper returns
               a UInt with the top 16 bits zeroed out, it'll do,
               nonetheless. */
            addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
            return dst;
         }
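         /* In the case above the 16-byte vector is spilled at -16(%rsp),
            which places its upper 8 bytes at -8(%rsp); the two halves are
            then handed to the helper in %rdi and %rsi, the first two
            integer argument registers of the SysV AMD64 calling
            convention. */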
         default:
            break;
      }

      /* Deal with unary 64-bit SIMD ops. */
      HWord fn = 0;
      switch (e->Iex.Unop.op) {
         case Iop_CmpNEZ32x2:
            fn = (HWord)h_generic_calc_CmpNEZ32x2; break;
         case Iop_CmpNEZ16x4:
            fn = (HWord)h_generic_calc_CmpNEZ16x4; break;
         case Iop_CmpNEZ8x8:
            fn = (HWord)h_generic_calc_CmpNEZ8x8; break;
         default:
            fn = (HWord)0; break;
      }
      if (fn != (HWord)0) {
         /* Note: the following assumes all helpers are of
            signature
               ULong fn ( ULong ), and they are
            not marked as regparm functions.
         */
         HReg dst = newVRegI(env);
         HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
         addInstr(env, mk_iMOVsd_RR(arg, hregAMD64_RDI()) );
         addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 1,
                                        mk_RetLoc_simple(RLPri_Int) ));
         addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
         return dst;
      }
      break;
   }
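      /* The block just above is the generic call-out scheme for the
         64-bit SIMD unary ops listed in the switch: pass the packed
         64-bit value in %rdi, call a helper of type ULong fn(ULong),
         and read the result back from %rax. */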
   /* --------- GET --------- */
   case Iex_Get: {
      if (ty == Ity_I64) {
         HReg dst = newVRegI(env);
         addInstr(env, AMD64Instr_Alu64R(
                          Aalu_MOV,
                          AMD64RMI_Mem(
                             AMD64AMode_IR(e->Iex.Get.offset,
                                           hregAMD64_RBP())),
                          dst));
         return dst;
      }
      if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
         HReg dst = newVRegI(env);
         addInstr(env, AMD64Instr_LoadEX(
                          toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
                          False,
                          AMD64AMode_IR(e->Iex.Get.offset,hregAMD64_RBP()),
                          dst));
         return dst;
      }
      break;
   }

   case Iex_GetI: {
      AMD64AMode* am
         = genGuestArrayOffset(
              env, e->Iex.GetI.descr,
                   e->Iex.GetI.ix, e->Iex.GetI.bias );
      HReg dst = newVRegI(env);
      if (ty == Ity_I8) {
         addInstr(env, AMD64Instr_LoadEX( 1, False, am, dst ));
         return dst;
      }
      if (ty == Ity_I64) {
         addInstr(env, AMD64Instr_Alu64R( Aalu_MOV, AMD64RMI_Mem(am), dst ));
         return dst;
      }
      break;
   }
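   /* Guest state accesses above are addressed off the guest-state
      pointer, which this isel keeps in %rbp: a plain GET at offset k is
      just a load from k(%rbp), widened to 64 bits (zero-extending) when
      the guest type is narrower. */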
   /* --------- CCALL --------- */
   case Iex_CCall: {
      HReg    dst = newVRegI(env);
      vassert(ty == e->Iex.CCall.retty);

      /* be very restrictive for now.  Only 64-bit ints allowed for
         args, and 64 or 32 bits for return type. */
      if (e->Iex.CCall.retty != Ity_I64 && e->Iex.CCall.retty != Ity_I32)
         goto irreducible;

      /* Marshal args, do the call. */
      UInt   addToSp = 0;
      RetLoc rloc    = mk_RetLoc_INVALID();
      doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
                    e->Iex.CCall.cee, e->Iex.CCall.retty, e->Iex.CCall.args );
      vassert(is_sane_RetLoc(rloc));
      vassert(rloc.pri == RLPri_Int);
      vassert(addToSp == 0);

      /* Move to dst, and zero out the top 32 bits if the result type is
         Ity_I32.  Probably overkill, but still .. */
      if (e->Iex.CCall.retty == Ity_I64)
         addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
      else
         addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));

      return dst;
   }
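   /* Helper calls return their value in %rax.  MovxLQ with a False
      (unsigned) flag is a 32->64 zero-extending move, which is how an
      Ity_I32 result ends up with its top 32 bits cleared in dst. */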
   /* --------- LITERAL --------- */
   /* 64/32/16/8-bit literals */
   case Iex_Const:
      if (ty == Ity_I64) {
         HReg r = newVRegI(env);
         addInstr(env, AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, r));
         return r;
      } else {
         AMD64RMI* rmi = iselIntExpr_RMI ( env, e );
         HReg      r   = newVRegI(env);
         addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, rmi, r));
         return r;
      }
   /* --------- MULTIPLEX --------- */
   case Iex_ITE: { // VFD
      if ((ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8)
          && typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1) {
         HReg r1  = iselIntExpr_R(env, e->Iex.ITE.iftrue);
         HReg r0  = iselIntExpr_R(env, e->Iex.ITE.iffalse);
         HReg dst = newVRegI(env);
         addInstr(env, mk_iMOVsd_RR(r1,dst));
         AMD64CondCode cc = iselCondCode_C(env, e->Iex.ITE.cond);
         addInstr(env, AMD64Instr_CMov64(cc ^ 1, r0, dst));
         return dst;
      }
      break;
   }
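   /* The ITE lowering above preloads dst with the iftrue value and then
      conditionally overwrites it with iffalse: xor-ing an AMD64CondCode
      with 1 yields the complementary condition, so the CMov fires
      exactly when the guard is false. */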
   /* --------- TERNARY OP --------- */
   case Iex_Triop: {
      IRTriop *triop = e->Iex.Triop.details;
      /* C3210 flags following FPU partial remainder (fprem), both
         IEEE compliant (PREM1) and non-IEEE compliant (PREM). */
      if (triop->op == Iop_PRemC3210F64
          || triop->op == Iop_PRem1C3210F64) {
         AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
         HReg        arg1   = iselDblExpr(env, triop->arg2);
         HReg        arg2   = iselDblExpr(env, triop->arg3);
         HReg        dst    = newVRegI(env);
         addInstr(env, AMD64Instr_A87Free(2));

         /* one arg -> top of x87 stack */
         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg2, m8_rsp));
         addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));

         /* other arg -> top of x87 stack */
         addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg1, m8_rsp));
         addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));

         switch (triop->op) {
            case Iop_PRemC3210F64:
               addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
               break;
            case Iop_PRem1C3210F64:
               addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
               break;
            default:
               vassert(0);
         }
         /* Ignore the result, and instead make off with the FPU's
            C3210 flags (in the status word). */
         addInstr(env, AMD64Instr_A87StSW(m8_rsp));
         addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Mem(m8_rsp),dst));
         addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0x4700),dst));
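         /* The 0x4700 mask above keeps exactly the C3, C2, C1 and C0
            condition-code bits of the x87 status word, which live at bit
            positions 14, 10, 9 and 8 respectively. */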
         return dst;
      }
      break;
   }

   default:
   break;
   } /* switch (e->tag) */

   /* We get here if no pattern matched. */
  irreducible:
   ppIRExpr(e);
   vpanic("iselIntExpr_R(amd64): cannot reduce tree");
}
/*---------------------------------------------------------*/
/*--- ISEL: Integer expression auxiliaries              ---*/
/*---------------------------------------------------------*/

/* --------------------- AMODEs --------------------- */

/* Return an AMode which computes the value of the specified
   expression, possibly also adding insns to the code list as a
   result.  The expression may only be a 64-bit one.
*/

static AMD64AMode* iselIntExpr_AMode ( ISelEnv* env, const IRExpr* e )
{
   AMD64AMode* am = iselIntExpr_AMode_wrk(env, e);
   vassert(sane_AMode(am));
   return am;
}

/* DO NOT CALL THIS DIRECTLY ! */
static AMD64AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, const IRExpr* e )
{
   MatchInfo mi;
   DECLARE_PATTERN(p_complex);
   IRType ty = typeOfIRExpr(env->type_env,e);
   vassert(ty == Ity_I64);
   /* Add64( Add64(expr1, Shl64(expr2, imm8)), simm32 ) */
   /*              bind0        bind1  bind2   bind3   */
   DEFINE_PATTERN(p_complex,
      binop( Iop_Add64,
             binop( Iop_Add64,
                    bind(0),
                    binop(Iop_Shl64, bind(1), bind(2))
                  ),
             bind(3)
           )
   );
   if (matchIRExpr(&mi, p_complex, e)) {
      const IRExpr* expr1  = mi.bindee[0];
      const IRExpr* expr2  = mi.bindee[1];
      const IRExpr* imm8   = mi.bindee[2];
      const IRExpr* simm32 = mi.bindee[3];
      if (imm8->tag == Iex_Const
          && imm8->Iex.Const.con->tag == Ico_U8
          && imm8->Iex.Const.con->Ico.U8 < 4
          /* imm8 is OK, now check simm32 */
          && simm32->tag == Iex_Const
          && simm32->Iex.Const.con->tag == Ico_U64
          && fitsIn32Bits(simm32->Iex.Const.con->Ico.U64)) {
         UInt shift  = imm8->Iex.Const.con->Ico.U8;
         UInt offset = toUInt(simm32->Iex.Const.con->Ico.U64);
         HReg r1 = iselIntExpr_R(env, expr1);
         HReg r2 = iselIntExpr_R(env, expr2);
         vassert(shift == 0 || shift == 1 || shift == 2 || shift == 3);
         return AMD64AMode_IRRS(offset, r1, r2, shift);
      }
   }
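   /* As an illustration, an IR tree of the shape
         Add64( Add64(e1, Shl64(e2, Ico_U8 3)), Ico_U64 40 )
      matches the pattern above and is selected as the single amode
      40(r1,r2,8), where r1 and r2 hold the values of e1 and e2. */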
   /* Add64(expr1, Shl64(expr2, imm)) */
   if (e->tag == Iex_Binop
       && e->Iex.Binop.op == Iop_Add64
       && e->Iex.Binop.arg2->tag == Iex_Binop
       && e->Iex.Binop.arg2->Iex.Binop.op == Iop_Shl64
       && e->Iex.Binop.arg2->Iex.Binop.arg2->tag == Iex_Const
       && e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8) {
      UInt shift = e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
      if (shift == 1 || shift == 2 || shift == 3) {
         HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
         HReg r2 = iselIntExpr_R(env, e->Iex.Binop.arg2->Iex.Binop.arg1 );
         return AMD64AMode_IRRS(0, r1, r2, shift);
      }
   }
   /* Add64(expr, simm32) */
   if (e->tag == Iex_Binop
       && e->Iex.Binop.op == Iop_Add64
       && e->Iex.Binop.arg2->tag == Iex_Const
       && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U64
       && fitsIn32Bits(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)) {
      HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
      return AMD64AMode_IR(
                toUInt(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64),
                r1
             );
   }

   /* Doesn't match anything in particular.  Generate it into
      a register and use that. */
   {
      HReg r1 = iselIntExpr_R(env, e);
      return AMD64AMode_IR(0, r1);
   }
}
/* --------------------- RMIs --------------------- */

/* Similarly, calculate an expression into an AMD64RMI operand.  As
   with iselIntExpr_R, the expression can have type 64, 32, 16 or 8
   bits. */

static AMD64RMI* iselIntExpr_RMI ( ISelEnv* env, const IRExpr* e )
{
   AMD64RMI* rmi = iselIntExpr_RMI_wrk(env, e);
   /* sanity checks ... */
   switch (rmi->tag) {
      case Armi_Imm:
         return rmi;
      case Armi_Reg:
         vassert(hregClass(rmi->Armi.Reg.reg) == HRcInt64);
         vassert(hregIsVirtual(rmi->Armi.Reg.reg));
         return rmi;
      case Armi_Mem:
         vassert(sane_AMode(rmi->Armi.Mem.am));
         return rmi;
      default:
         vpanic("iselIntExpr_RMI: unknown amd64 RMI tag");
   }
}
/* DO NOT CALL THIS DIRECTLY ! */
static AMD64RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, const IRExpr* e )
{
   IRType ty = typeOfIRExpr(env->type_env,e);
   vassert(ty == Ity_I64 || ty == Ity_I32
           || ty == Ity_I16 || ty == Ity_I8);

   /* special case: immediate 64/32/16/8 */
   if (e->tag == Iex_Const) {
      switch (e->Iex.Const.con->tag) {
        case Ico_U64:
           if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
              return AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
           }
           break;
        case Ico_U32:
           return AMD64RMI_Imm(e->Iex.Const.con->Ico.U32); break;
        case Ico_U16:
           return AMD64RMI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16); break;
        case Ico_U8:
           return AMD64RMI_Imm(0xFF & e->Iex.Const.con->Ico.U8); break;
        default:
           vpanic("iselIntExpr_RMI.Iex_Const(amd64)");
      }
   }

   /* special case: 64-bit GET */
   if (e->tag == Iex_Get && ty == Ity_I64) {
      return AMD64RMI_Mem(AMD64AMode_IR(e->Iex.Get.offset,
                                        hregAMD64_RBP()));
   }

   /* special case: 64-bit load from memory */
   if (e->tag == Iex_Load && ty == Ity_I64
       && e->Iex.Load.end == Iend_LE) {
      AMD64AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
      return AMD64RMI_Mem(am);
   }

   /* default case: calculate into a register and return that */
   {
      HReg r = iselIntExpr_R ( env, e );
      return AMD64RMI_Reg(r);
   }
}
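/* In iselIntExpr_RMI_wrk above, a 64-bit constant only becomes an
   immediate operand when it passes fitsIn32Bits: AMD64 ALU instructions
   accept at most a 32-bit immediate (sign-extended to 64 bits), so
   wider constants fall through to the register case and get
   materialised with AMD64Instr_Imm64 instead. */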
/* --------------------- RIs --------------------- */

/* Calculate an expression into an AMD64RI operand.  As with
   iselIntExpr_R, the expression can have type 64, 32, 16 or 8
   bits. */

static AMD64RI* iselIntExpr_RI ( ISelEnv* env, const IRExpr* e )
{
   AMD64RI* ri = iselIntExpr_RI_wrk(env, e);
   /* sanity checks ... */
   switch (ri->tag) {
      case Ari_Imm:
         return ri;
      case Ari_Reg:
         vassert(hregClass(ri->Ari.Reg.reg) == HRcInt64);
         vassert(hregIsVirtual(ri->Ari.Reg.reg));
         return ri;
      default:
         vpanic("iselIntExpr_RI: unknown amd64 RI tag");
   }
}
/* DO NOT CALL THIS DIRECTLY ! */
static AMD64RI* iselIntExpr_RI_wrk ( ISelEnv* env, const IRExpr* e )
{
   IRType ty = typeOfIRExpr(env->type_env,e);
   vassert(ty == Ity_I64 || ty == Ity_I32
           || ty == Ity_I16 || ty == Ity_I8);

   /* special case: immediate */
   if (e->tag == Iex_Const) {
      switch (e->Iex.Const.con->tag) {
        case Ico_U64:
           if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
              return AMD64RI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
           }
           break;
        case Ico_U32:
           return AMD64RI_Imm(e->Iex.Const.con->Ico.U32);
        case Ico_U16:
           return AMD64RI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16);
        case Ico_U8:
           return AMD64RI_Imm(0xFF & e->Iex.Const.con->Ico.U8);
        default:
           vpanic("iselIntExpr_RI.Iex_Const(amd64)");
      }
   }

   /* default case: calculate into a register and return that */
   {
      HReg r = iselIntExpr_R ( env, e );
      return AMD64RI_Reg(r);
   }
}
/* --------------------- RMs --------------------- */

/* Similarly, calculate an expression into an AMD64RM operand.  As
   with iselIntExpr_R, the expression can have type 64, 32, 16 or 8
   bits.  */

static AMD64RM* iselIntExpr_RM ( ISelEnv* env, const IRExpr* e )
{
   AMD64RM* rm = iselIntExpr_RM_wrk(env, e);
   /* sanity checks ... */
   switch (rm->tag) {
      case Arm_Reg:
         vassert(hregClass(rm->Arm.Reg.reg) == HRcInt64);
         vassert(hregIsVirtual(rm->Arm.Reg.reg));
         return rm;
      case Arm_Mem:
         vassert(sane_AMode(rm->Arm.Mem.am));
         return rm;
      default:
         vpanic("iselIntExpr_RM: unknown amd64 RM tag");
   }
}
/* DO NOT CALL THIS DIRECTLY ! */
static AMD64RM* iselIntExpr_RM_wrk ( ISelEnv* env, const IRExpr* e )
{
   IRType ty = typeOfIRExpr(env->type_env,e);
   vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);

   /* special case: 64-bit GET */
   if (e->tag == Iex_Get && ty == Ity_I64) {
      return AMD64RM_Mem(AMD64AMode_IR(e->Iex.Get.offset,
                                       hregAMD64_RBP()));
   }

   /* special case: load from memory */

   /* default case: calculate into a register and return that */
   {
      HReg r = iselIntExpr_R ( env, e );
      return AMD64RM_Reg(r);
   }
}
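/* The three operand kinds above follow the same shape: RMI allows
   reg/mem/imm, RI allows reg/imm, RM allows reg/mem.  Each _wrk body
   tries the cheaper immediate or in-memory forms first and only then
   falls back to iselIntExpr_R, so for example a 64-bit GET feeding an
   ALU op becomes a memory operand rather than a separate load into a
   fresh vreg. */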
/* --------------------- CONDCODE as %rflag test --------------------- */

/* Generate code to evaluate a bit-typed expression, returning the
   condition code which would be set if the expression had notionally
   returned 1.

   Note that iselCondCode_C and iselCondCode_R are mutually recursive.  For
   future changes to either of them, take care not to introduce an infinite
   loop involving the two of them.
*/

static AMD64CondCode iselCondCode_C ( ISelEnv* env, const IRExpr* e )
{
   /* Uh, there's nothing we can sanity check here, unfortunately. */
   return iselCondCode_C_wrk(env,e);
}

/* DO NOT CALL THIS DIRECTLY ! */
static AMD64CondCode iselCondCode_C_wrk ( ISelEnv* env, const IRExpr* e )
{
   vassert(typeOfIRExpr(env->type_env,e) == Ity_I1);
   /* var */
   if (e->tag == Iex_RdTmp) {
      HReg r64 = lookupIRTemp(env, e->Iex.RdTmp.tmp);
      addInstr(env, AMD64Instr_Test64(1,r64));
      return Acc_NZ;
   }

   /* Constant 1:Bit */
   if (e->tag == Iex_Const) {
      HReg r;
      vassert(e->Iex.Const.con->tag == Ico_U1);
      vassert(e->Iex.Const.con->Ico.U1 == True
              || e->Iex.Const.con->Ico.U1 == False);
      r = newVRegI(env);
      addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Imm(0),r));
      addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,AMD64RMI_Reg(r),r));
      return e->Iex.Const.con->Ico.U1 ? Acc_Z : Acc_NZ;
   }

   /* Not1(e) */
   if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_Not1) {
      /* Generate code for the arg, and negate the test condition */
      return 1 ^ iselCondCode_C(env, e->Iex.Unop.arg);
   }
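   /* In the Iex_Const case above, "xorq %r,%r" always sets the Z flag,
      so returning Acc_Z for a constant True yields a condition that is
      always taken, and Acc_NZ for a constant False yields one that
      never is. */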
   /* --- patterns rooted at: 64to1 --- */

   /* 64to1 */
   if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_64to1) {
      HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
      addInstr(env, AMD64Instr_Test64(1,reg));
      return Acc_NZ;
   }

   /* --- patterns rooted at: 32to1 --- */

   /* 32to1 */
   if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_32to1) {
      HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
      addInstr(env, AMD64Instr_Test64(1,reg));
      return Acc_NZ;
   }

   /* --- patterns rooted at: CmpNEZ8 --- */

   /* CmpNEZ8(x) */
   if (e->tag == Iex_Unop
       && e->Iex.Unop.op == Iop_CmpNEZ8) {
      HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
      addInstr(env, AMD64Instr_Test64(0xFF,r));
      return Acc_NZ;
   }

   /* --- patterns rooted at: CmpNEZ16 --- */

   /* CmpNEZ16(x) */
   if (e->tag == Iex_Unop
       && e->Iex.Unop.op == Iop_CmpNEZ16) {
      HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
      addInstr(env, AMD64Instr_Test64(0xFFFF,r));
      return Acc_NZ;
   }
2370 /* --- patterns rooted at: CmpNEZ32 --- */
2372 if (e
->tag
== Iex_Unop
2373 && e
->Iex
.Unop
.op
== Iop_CmpNEZ32
) {
2374 IRExpr
* arg
= e
->Iex
.Unop
.arg
;
2375 if (arg
->tag
== Iex_Binop
2376 && (arg
->Iex
.Binop
.op
== Iop_Or32
2377 || arg
->Iex
.Binop
.op
== Iop_And32
)) {
2378 /* CmpNEZ32(Or32(x,y)) */
2379 /* CmpNEZ32(And32(x,y)) */
2380 HReg r0
= iselIntExpr_R(env
, arg
->Iex
.Binop
.arg1
);
2381 AMD64RMI
* rmi1
= iselIntExpr_RMI(env
, arg
->Iex
.Binop
.arg2
);
2382 HReg tmp
= newVRegI(env
);
2383 addInstr(env
, mk_iMOVsd_RR(r0
, tmp
));
2384 addInstr(env
, AMD64Instr_Alu32R(
2385 arg
->Iex
.Binop
.op
== Iop_Or32
? Aalu_OR
: Aalu_AND
,
2390 HReg r1
= iselIntExpr_R(env
, arg
);
2391 AMD64RMI
* rmi2
= AMD64RMI_Imm(0);
2392 addInstr(env
, AMD64Instr_Alu32R(Aalu_CMP
,rmi2
,r1
));
2396 /* --- patterns rooted at: CmpNEZ64 --- */
2398 if (e
->tag
== Iex_Unop
2399 && e
->Iex
.Unop
.op
== Iop_CmpNEZ64
) {
2400 IRExpr
* arg
= e
->Iex
.Unop
.arg
;
2401 if (arg
->tag
== Iex_Binop
2402 && (arg
->Iex
.Binop
.op
== Iop_Or64
2403 || arg
->Iex
.Binop
.op
== Iop_And64
)) {
2404 /* CmpNEZ64(Or64(x,y)) */
2405 /* CmpNEZ64(And64(x,y)) */
2406 HReg r0
= iselIntExpr_R(env
, arg
->Iex
.Binop
.arg1
);
2407 AMD64RMI
* rmi1
= iselIntExpr_RMI(env
, arg
->Iex
.Binop
.arg2
);
2408 HReg tmp
= newVRegI(env
);
2409 addInstr(env
, mk_iMOVsd_RR(r0
, tmp
));
2410 addInstr(env
, AMD64Instr_Alu64R(
2411 arg
->Iex
.Binop
.op
== Iop_Or64
? Aalu_OR
: Aalu_AND
,
2416 HReg r1
= iselIntExpr_R(env
, arg
);
2417 AMD64RMI
* rmi2
= AMD64RMI_Imm(0);
2418 addInstr(env
, AMD64Instr_Alu64R(Aalu_CMP
,rmi2
,r1
));
2422 /* --- patterns rooted at: Cmp{EQ,NE}{8,16,32} --- */
2424 /* CmpEQ8 / CmpNE8 */
2425 if (e
->tag
== Iex_Binop
2426 && (e
->Iex
.Binop
.op
== Iop_CmpEQ8
2427 || e
->Iex
.Binop
.op
== Iop_CmpNE8
2428 || e
->Iex
.Binop
.op
== Iop_CasCmpEQ8
2429 || e
->Iex
.Binop
.op
== Iop_CasCmpNE8
)) {
2430 if (isZeroU8(e
->Iex
.Binop
.arg2
)) {
2431 HReg r1
= iselIntExpr_R(env
, e
->Iex
.Binop
.arg1
);
2432 addInstr(env
, AMD64Instr_Test64(0xFF,r1
));
2433 switch (e
->Iex
.Binop
.op
) {
2434 case Iop_CmpEQ8
: case Iop_CasCmpEQ8
: return Acc_Z
;
2435 case Iop_CmpNE8
: case Iop_CasCmpNE8
: return Acc_NZ
;
2436 default: vpanic("iselCondCode_C(amd64): CmpXX8(expr,0:I8)");
2439 HReg r1
= iselIntExpr_R(env
, e
->Iex
.Binop
.arg1
);
2440 AMD64RMI
* rmi2
= iselIntExpr_RMI(env
, e
->Iex
.Binop
.arg2
);
2441 HReg r
= newVRegI(env
);
2442 addInstr(env
, mk_iMOVsd_RR(r1
,r
));
2443 addInstr(env
, AMD64Instr_Alu64R(Aalu_XOR
,rmi2
,r
));
2444 addInstr(env
, AMD64Instr_Alu64R(Aalu_AND
,AMD64RMI_Imm(0xFF),r
));
2445 switch (e
->Iex
.Binop
.op
) {
2446 case Iop_CmpEQ8
: case Iop_CasCmpEQ8
: return Acc_Z
;
2447 case Iop_CmpNE8
: case Iop_CasCmpNE8
: return Acc_NZ
;
2448 default: vpanic("iselCondCode_C(amd64): CmpXX8(expr,expr)");
2453 /* CmpEQ16 / CmpNE16 */
2454 if (e
->tag
== Iex_Binop
2455 && (e
->Iex
.Binop
.op
== Iop_CmpEQ16
2456 || e
->Iex
.Binop
.op
== Iop_CmpNE16
2457 || e
->Iex
.Binop
.op
== Iop_CasCmpEQ16
2458 || e
->Iex
.Binop
.op
== Iop_CasCmpNE16
)) {
2459 HReg r1
= iselIntExpr_R(env
, e
->Iex
.Binop
.arg1
);
2460 AMD64RMI
* rmi2
= iselIntExpr_RMI(env
, e
->Iex
.Binop
.arg2
);
2461 HReg r
= newVRegI(env
);
2462 addInstr(env
, mk_iMOVsd_RR(r1
,r
));
2463 addInstr(env
, AMD64Instr_Alu64R(Aalu_XOR
,rmi2
,r
));
2464 addInstr(env
, AMD64Instr_Alu64R(Aalu_AND
,AMD64RMI_Imm(0xFFFF),r
));
2465 switch (e
->Iex
.Binop
.op
) {
2466 case Iop_CmpEQ16
: case Iop_CasCmpEQ16
: return Acc_Z
;
2467 case Iop_CmpNE16
: case Iop_CasCmpNE16
: return Acc_NZ
;
2468 default: vpanic("iselCondCode_C(amd64): CmpXX16");
2472 /* CmpNE64(ccall, 64-bit constant) (--smc-check=all optimisation).
2473 Saves a "movq %rax, %tmp" compared to the default route. */
2474 if (e
->tag
== Iex_Binop
2475 && e
->Iex
.Binop
.op
== Iop_CmpNE64
2476 && e
->Iex
.Binop
.arg1
->tag
== Iex_CCall
2477 && e
->Iex
.Binop
.arg2
->tag
== Iex_Const
) {
2478 IRExpr
* cal
= e
->Iex
.Binop
.arg1
;
2479 IRExpr
* con
= e
->Iex
.Binop
.arg2
;
2480 HReg tmp
= newVRegI(env
);
2481 /* clone & partial-eval of generic Iex_CCall and Iex_Const cases */
2482 vassert(cal
->Iex
.CCall
.retty
== Ity_I64
); /* else ill-typed IR */
2483 vassert(con
->Iex
.Const
.con
->tag
== Ico_U64
);
2484 /* Marshal args, do the call. */
2486 RetLoc rloc
= mk_RetLoc_INVALID();
2487 doHelperCall( &addToSp
, &rloc
, env
, NULL
/*guard*/,
2489 cal
->Iex
.CCall
.retty
, cal
->Iex
.CCall
.args
);
2490 vassert(is_sane_RetLoc(rloc
));
2491 vassert(rloc
.pri
== RLPri_Int
);
2492 vassert(addToSp
== 0);
2494 addInstr(env
, AMD64Instr_Imm64(con
->Iex
.Const
.con
->Ico
.U64
, tmp
));
2495 addInstr(env
, AMD64Instr_Alu64R(Aalu_CMP
,
2496 AMD64RMI_Reg(hregAMD64_RAX()), tmp
));
2501 if (e
->tag
== Iex_Binop
2502 && (e
->Iex
.Binop
.op
== Iop_CmpEQ64
2503 || e
->Iex
.Binop
.op
== Iop_CmpNE64
2504 || e
->Iex
.Binop
.op
== Iop_CmpLT64S
2505 || e
->Iex
.Binop
.op
== Iop_CmpLT64U
2506 || e
->Iex
.Binop
.op
== Iop_CmpLE64S
2507 || e
->Iex
.Binop
.op
== Iop_CmpLE64U
2508 || e
->Iex
.Binop
.op
== Iop_CasCmpEQ64
2509 || e
->Iex
.Binop
.op
== Iop_CasCmpNE64
2510 || e
->Iex
.Binop
.op
== Iop_ExpCmpNE64
)) {
2511 HReg r1
= iselIntExpr_R(env
, e
->Iex
.Binop
.arg1
);
2512 AMD64RMI
* rmi2
= iselIntExpr_RMI(env
, e
->Iex
.Binop
.arg2
);
2513 addInstr(env
, AMD64Instr_Alu64R(Aalu_CMP
,rmi2
,r1
));
2514 switch (e
->Iex
.Binop
.op
) {
2515 case Iop_CmpEQ64
: case Iop_CasCmpEQ64
: return Acc_Z
;
2517 case Iop_CasCmpNE64
: case Iop_ExpCmpNE64
: return Acc_NZ
;
2518 case Iop_CmpLT64S
: return Acc_L
;
2519 case Iop_CmpLT64U
: return Acc_B
;
2520 case Iop_CmpLE64S
: return Acc_LE
;
2521 case Iop_CmpLE64U
: return Acc_BE
;
2522 default: vpanic("iselCondCode_C(amd64): CmpXX64");
2527 if (e
->tag
== Iex_Binop
2528 && (e
->Iex
.Binop
.op
== Iop_CmpEQ32
2529 || e
->Iex
.Binop
.op
== Iop_CmpNE32
2530 || e
->Iex
.Binop
.op
== Iop_CmpLT32S
2531 || e
->Iex
.Binop
.op
== Iop_CmpLT32U
2532 || e
->Iex
.Binop
.op
== Iop_CmpLE32S
2533 || e
->Iex
.Binop
.op
== Iop_CmpLE32U
2534 || e
->Iex
.Binop
.op
== Iop_CasCmpEQ32
2535 || e
->Iex
.Binop
.op
== Iop_CasCmpNE32
2536 || e
->Iex
.Binop
.op
== Iop_ExpCmpNE32
)) {
2537 HReg r1
= iselIntExpr_R(env
, e
->Iex
.Binop
.arg1
);
2538 AMD64RMI
* rmi2
= iselIntExpr_RMI(env
, e
->Iex
.Binop
.arg2
);
2539 addInstr(env
, AMD64Instr_Alu32R(Aalu_CMP
,rmi2
,r1
));
2540 switch (e
->Iex
.Binop
.op
) {
2541 case Iop_CmpEQ32
: case Iop_CasCmpEQ32
: return Acc_Z
;
2543 case Iop_CasCmpNE32
: case Iop_ExpCmpNE32
: return Acc_NZ
;
2544 case Iop_CmpLT32S
: return Acc_L
;
2545 case Iop_CmpLT32U
: return Acc_B
;
2546 case Iop_CmpLE32S
: return Acc_LE
;
2547 case Iop_CmpLE32U
: return Acc_BE
;
2548 default: vpanic("iselCondCode_C(amd64): CmpXX32");
2552 /* And1(x,y), Or1(x,y) */
2553 if (e
->tag
== Iex_Binop
2554 && (e
->Iex
.Binop
.op
== Iop_And1
|| e
->Iex
.Binop
.op
== Iop_Or1
)) {
2555 // Get the result in an int reg, then test the least significant bit.
2556 HReg tmp
= iselCondCode_R(env
, e
);
2557 addInstr(env
, AMD64Instr_Test64(1, tmp
));
2562 vpanic("iselCondCode_C(amd64)");
2566 /* --------------------- CONDCODE as int reg --------------------- */
2568 /* Generate code to evaluated a bit-typed expression, returning the resulting
2569 value in bit 0 of an integer register. WARNING: all of the other bits in the
2570 register can be arbitrary. Callers must mask them off or otherwise ignore
2573 Note that iselCondCode_C and iselCondCode_R are mutually recursive. For
2574 future changes to either of them, take care not to introduce an infinite
2575 loop involving the two of them.
2577 static HReg
iselCondCode_R ( ISelEnv
* env
, const IRExpr
* e
)
2579 /* Uh, there's nothing we can sanity check here, unfortunately. */
2580 return iselCondCode_R_wrk(env
,e
);
2583 /* DO NOT CALL THIS DIRECTLY ! */
2584 static HReg
iselCondCode_R_wrk ( ISelEnv
* env
, const IRExpr
* e
)
2587 vassert(typeOfIRExpr(env
->type_env
,e
) == Ity_I1
);
2590 if (e
->tag
== Iex_RdTmp
) {
2591 return lookupIRTemp(env
, e
->Iex
.RdTmp
.tmp
);
2594 /* And1(x,y), Or1(x,y) */
2595 if (e
->tag
== Iex_Binop
2596 && (e
->Iex
.Binop
.op
== Iop_And1
|| e
->Iex
.Binop
.op
== Iop_Or1
)) {
2597 HReg x_as_64
= iselCondCode_R(env
, e
->Iex
.Binop
.arg1
);
2598 HReg y_as_64
= iselCondCode_R(env
, e
->Iex
.Binop
.arg2
);
2599 HReg res
= newVRegI(env
);
2600 addInstr(env
, mk_iMOVsd_RR(y_as_64
, res
));
2601 AMD64AluOp aop
= e
->Iex
.Binop
.op
== Iop_And1
? Aalu_AND
: Aalu_OR
;
2602 addInstr(env
, AMD64Instr_Alu64R(aop
, AMD64RMI_Reg(x_as_64
), res
));
2606 /* Anything else, we hand off to iselCondCode_C and force the value into a
2608 HReg res
= newVRegI(env
);
2609 AMD64CondCode cc
= iselCondCode_C(env
, e
);
2610 addInstr(env
, AMD64Instr_Set64(cc
, res
));
2613 // PJF old debug code? - unreachable
2616 vpanic("iselCondCode_R(amd64)");
2621 /*---------------------------------------------------------*/
2622 /*--- ISEL: Integer expressions (128 bit) ---*/
2623 /*---------------------------------------------------------*/
2625 /* Compute a 128-bit value into a register pair, which is returned as
2626 the first two parameters. As with iselIntExpr_R, these may be
2627 either real or virtual regs; in any case they must not be changed
2628 by subsequent code emitted by the caller. */
2630 static void iselInt128Expr ( HReg
* rHi
, HReg
* rLo
,
2631 ISelEnv
* env
, const IRExpr
* e
)
2633 iselInt128Expr_wrk(rHi
, rLo
, env
, e
);
2635 vex_printf("\n"); ppIRExpr(e
); vex_printf("\n");
2637 vassert(hregClass(*rHi
) == HRcInt64
);
2638 vassert(hregIsVirtual(*rHi
));
2639 vassert(hregClass(*rLo
) == HRcInt64
);
2640 vassert(hregIsVirtual(*rLo
));
2643 /* DO NOT CALL THIS DIRECTLY ! */
2644 static void iselInt128Expr_wrk ( HReg
* rHi
, HReg
* rLo
,
2645 ISelEnv
* env
, const IRExpr
* e
)
2648 vassert(typeOfIRExpr(env
->type_env
,e
) == Ity_I128
);
2650 /* read 128-bit IRTemp */
2651 if (e
->tag
== Iex_RdTmp
) {
2652 lookupIRTempPair( rHi
, rLo
, env
, e
->Iex
.RdTmp
.tmp
);
2656 /* --------- BINARY ops --------- */
2657 if (e
->tag
== Iex_Binop
) {
2658 switch (e
->Iex
.Binop
.op
) {
2659 /* 64 x 64 -> 128 multiply */
2662 /* get one operand into %rax, and the other into a R/M.
2663 Need to make an educated guess about which is better in
2665 HReg tLo
= newVRegI(env
);
2666 HReg tHi
= newVRegI(env
);
2667 Bool syned
= toBool(e
->Iex
.Binop
.op
== Iop_MullS64
);
2668 AMD64RM
* rmLeft
= iselIntExpr_RM(env
, e
->Iex
.Binop
.arg1
);
2669 HReg rRight
= iselIntExpr_R(env
, e
->Iex
.Binop
.arg2
);
2670 addInstr(env
, mk_iMOVsd_RR(rRight
, hregAMD64_RAX()));
2671 addInstr(env
, AMD64Instr_MulL(syned
, rmLeft
));
2672 /* Result is now in RDX:RAX. Tell the caller. */
2673 addInstr(env
, mk_iMOVsd_RR(hregAMD64_RDX(), tHi
));
2674 addInstr(env
, mk_iMOVsd_RR(hregAMD64_RAX(), tLo
));
2680 /* 128 x 64 -> (64(rem),64(div)) division */
2681 case Iop_DivModU128to64
:
2682 case Iop_DivModS128to64
: {
2683 /* Get the 128-bit operand into rdx:rax, and the other into
2686 HReg tLo
= newVRegI(env
);
2687 HReg tHi
= newVRegI(env
);
2688 Bool syned
= toBool(e
->Iex
.Binop
.op
== Iop_DivModS128to64
);
2689 AMD64RM
* rmRight
= iselIntExpr_RM(env
, e
->Iex
.Binop
.arg2
);
2690 iselInt128Expr(&sHi
,&sLo
, env
, e
->Iex
.Binop
.arg1
);
2691 addInstr(env
, mk_iMOVsd_RR(sHi
, hregAMD64_RDX()));
2692 addInstr(env
, mk_iMOVsd_RR(sLo
, hregAMD64_RAX()));
2693 addInstr(env
, AMD64Instr_Div(syned
, 8, rmRight
));
2694 addInstr(env
, mk_iMOVsd_RR(hregAMD64_RDX(), tHi
));
2695 addInstr(env
, mk_iMOVsd_RR(hregAMD64_RAX(), tLo
));
2701 /* 64HLto128(e1,e2) */
2703 *rHi
= iselIntExpr_R(env
, e
->Iex
.Binop
.arg1
);
2704 *rLo
= iselIntExpr_R(env
, e
->Iex
.Binop
.arg2
);
2710 } /* if (e->tag == Iex_Binop) */
2713 vpanic("iselInt128Expr");
2717 /*---------------------------------------------------------*/
2718 /*--- ISEL: Floating point expressions (32 bit) ---*/
2719 /*---------------------------------------------------------*/
2721 /* Nothing interesting here; really just wrappers for
2724 static HReg
iselFltExpr ( ISelEnv
* env
, const IRExpr
* e
)
2726 HReg r
= iselFltExpr_wrk( env
, e
);
2728 vex_printf("\n"); ppIRExpr(e
); vex_printf("\n");
2730 vassert(hregClass(r
) == HRcVec128
);
2731 vassert(hregIsVirtual(r
));
2735 /* DO NOT CALL THIS DIRECTLY */
2736 static HReg
iselFltExpr_wrk ( ISelEnv
* env
, const IRExpr
* e
)
2738 IRType ty
= typeOfIRExpr(env
->type_env
,e
);
2739 vassert(ty
== Ity_F32
);
2741 if (e
->tag
== Iex_RdTmp
) {
2742 return lookupIRTemp(env
, e
->Iex
.RdTmp
.tmp
);
2745 if (e
->tag
== Iex_Load
&& e
->Iex
.Load
.end
== Iend_LE
) {
2747 HReg res
= newVRegV(env
);
2748 vassert(e
->Iex
.Load
.ty
== Ity_F32
);
2749 am
= iselIntExpr_AMode(env
, e
->Iex
.Load
.addr
);
2750 addInstr(env
, AMD64Instr_SseLdSt(True
/*load*/, 4, res
, am
));
2754 if (e
->tag
== Iex_Binop
2755 && e
->Iex
.Binop
.op
== Iop_F64toF32
) {
2756 /* Although the result is still held in a standard SSE register,
2757 we need to round it to reflect the loss of accuracy/range
2758 entailed in casting it to a 32-bit float. */
2759 HReg dst
= newVRegV(env
);
2760 HReg src
= iselDblExpr(env
, e
->Iex
.Binop
.arg2
);
2761 set_SSE_rounding_mode( env
, e
->Iex
.Binop
.arg1
);
2762 addInstr(env
, AMD64Instr_SseSDSS(True
/*D->S*/,src
,dst
));
2763 set_SSE_rounding_default( env
);
2767 if (e
->tag
== Iex_Get
) {
2768 AMD64AMode
* am
= AMD64AMode_IR( e
->Iex
.Get
.offset
,
2770 HReg res
= newVRegV(env
);
2771 addInstr(env
, AMD64Instr_SseLdSt( True
/*load*/, 4, res
, am
));
2775 if (e
->tag
== Iex_Unop
2776 && e
->Iex
.Unop
.op
== Iop_ReinterpI32asF32
) {
2777 /* Given an I32, produce an IEEE754 float with the same bit
2779 HReg dst
= newVRegV(env
);
2780 HReg src
= iselIntExpr_R(env
, e
->Iex
.Unop
.arg
);
2781 AMD64AMode
* m4_rsp
= AMD64AMode_IR(-4, hregAMD64_RSP());
2782 addInstr(env
, AMD64Instr_Store(4, src
, m4_rsp
));
2783 addInstr(env
, AMD64Instr_SseLdSt( True
/*load*/, 4, dst
, m4_rsp
));
2787 if (e
->tag
== Iex_Binop
&& e
->Iex
.Binop
.op
== Iop_RoundF32toInt
) {
2788 AMD64AMode
* m8_rsp
= AMD64AMode_IR(-8, hregAMD64_RSP());
2789 HReg arg
= iselFltExpr(env
, e
->Iex
.Binop
.arg2
);
2790 HReg dst
= newVRegV(env
);
2792 /* rf now holds the value to be rounded. The first thing to do
2793 is set the FPU's rounding mode accordingly. */
2795 /* Set host x87 rounding mode */
2796 set_FPU_rounding_mode( env
, e
->Iex
.Binop
.arg1
);
2798 addInstr(env
, AMD64Instr_SseLdSt(False
/*store*/, 4, arg
, m8_rsp
));
2799 addInstr(env
, AMD64Instr_A87Free(1));
2800 addInstr(env
, AMD64Instr_A87PushPop(m8_rsp
, True
/*push*/, 4));
2801 addInstr(env
, AMD64Instr_A87FpOp(Afp_ROUND
));
2802 addInstr(env
, AMD64Instr_A87PushPop(m8_rsp
, False
/*pop*/, 4));
2803 addInstr(env
, AMD64Instr_SseLdSt(True
/*load*/, 4, dst
, m8_rsp
));
2805 /* Restore default x87 rounding. */
2806 set_FPU_rounding_default( env
);
2811 if (e
->tag
== Iex_Unop
&& e
->Iex
.Unop
.op
== Iop_NegF32
) {
2812 /* Sigh ... very rough code. Could do much better. */
2813 /* Get the 128-bit literal 00---0 10---0 into a register
2814 and xor it with the value to be negated. */
2815 HReg r1
= newVRegI(env
);
2816 HReg dst
= newVRegV(env
);
2817 HReg tmp
= newVRegV(env
);
2818 HReg src
= iselFltExpr(env
, e
->Iex
.Unop
.arg
);
2819 AMD64AMode
* rsp0
= AMD64AMode_IR(0, hregAMD64_RSP());
2820 addInstr(env
, mk_vMOVsd_RR(src
,tmp
));
2821 addInstr(env
, AMD64Instr_Push(AMD64RMI_Imm(0)));
2822 addInstr(env
, AMD64Instr_Imm64( 1ULL<<31, r1
));
2823 addInstr(env
, AMD64Instr_Push(AMD64RMI_Reg(r1
)));
2824 addInstr(env
, AMD64Instr_SseLdSt(True
, 16, dst
, rsp0
));
2825 addInstr(env
, AMD64Instr_SseReRg(Asse_XOR
, tmp
, dst
));
2826 add_to_rsp(env
, 16);
2830 if (e
->tag
== Iex_Qop
&& e
->Iex
.Qop
.details
->op
== Iop_MAddF32
) {
2831 IRQop
*qop
= e
->Iex
.Qop
.details
;
2832 HReg dst
= newVRegV(env
);
2833 HReg argX
= iselFltExpr(env
, qop
->arg2
);
2834 HReg argY
= iselFltExpr(env
, qop
->arg3
);
2835 HReg argZ
= iselFltExpr(env
, qop
->arg4
);
2836 if (env
->hwcaps
& VEX_HWCAPS_AMD64_FMA3
) {
2837 vassert(dst
.u32
!= argY
.u32
&& dst
.u32
!= argZ
.u32
);
2838 if (dst
.u32
!= argX
.u32
)
2839 addInstr(env
, AMD64Instr_SseReRg(Asse_MOV
, argX
, dst
));
2840 addInstr(env
, AMD64Instr_Avx32FLo(Asse_VFMADD213
, argY
, argZ
, dst
));
2843 /* XXXROUNDINGFIXME */
2844 /* set roundingmode here */
2845 /* subq $16, %rsp -- make a space*/
2846 sub_from_rsp(env
, 16);
2847 /* Prepare 4 arg regs:
2853 addInstr(env
, AMD64Instr_Lea64(AMD64AMode_IR(0, hregAMD64_RSP()),
2855 addInstr(env
, AMD64Instr_Lea64(AMD64AMode_IR(4, hregAMD64_RSP()),
2857 addInstr(env
, AMD64Instr_Lea64(AMD64AMode_IR(8, hregAMD64_RSP()),
2859 addInstr(env
, AMD64Instr_Lea64(AMD64AMode_IR(12, hregAMD64_RSP()),
2861 /* Store the three args, at (%rsi), (%rdx) and (%rcx):
2862 movss %argX, 0(%rsi)
2863 movss %argY, 0(%rdx)
2864 movss %argZ, 0(%rcx)
2866 addInstr(env
, AMD64Instr_SseLdSt(False
/*!isLoad*/, 4, argX
,
2867 AMD64AMode_IR(0, hregAMD64_RSI())));
2868 addInstr(env
, AMD64Instr_SseLdSt(False
/*!isLoad*/, 4, argY
,
2869 AMD64AMode_IR(0, hregAMD64_RDX())));
2870 addInstr(env
, AMD64Instr_SseLdSt(False
/*!isLoad*/, 4, argZ
,
2871 AMD64AMode_IR(0, hregAMD64_RCX())));
2873 /* call the helper with priority order : fma4 -> fallback generic
2874 remark: the fma3 case is handled before without helper*/
2875 #if defined(VGA_amd64)
2876 if (env
->hwcaps
& VEX_HWCAPS_AMD64_FMA4
) {
2877 addInstr(env
, AMD64Instr_Call( Acc_ALWAYS
,
2878 (ULong
)(HWord
)h_amd64_calc_MAddF32_fma4
,
2879 4, mk_RetLoc_simple(RLPri_None
) ));
2883 addInstr(env
, AMD64Instr_Call( Acc_ALWAYS
,
2884 (ULong
)(HWord
)h_generic_calc_MAddF32
,
2885 4, mk_RetLoc_simple(RLPri_None
) ));
2888 /* fetch the result from memory, using %r_argp, which the
2889 register allocator will keep alive across the call. */
2890 addInstr(env
, AMD64Instr_SseLdSt(True
/*isLoad*/, 4, dst
,
2891 AMD64AMode_IR(0, hregAMD64_RSP())));
2892 /* and finally, clear the space */
2893 add_to_rsp(env
, 16);
2897 if (e
->tag
== Iex_ITE
) { // VFD
2899 vassert(ty
== Ity_F32
);
2900 vassert(typeOfIRExpr(env
->type_env
,e
->Iex
.ITE
.cond
) == Ity_I1
);
2901 r1
= iselFltExpr(env
, e
->Iex
.ITE
.iftrue
);
2902 r0
= iselFltExpr(env
, e
->Iex
.ITE
.iffalse
);
2903 dst
= newVRegV(env
);
2904 addInstr(env
, mk_vMOVsd_RR(r1
,dst
));
2905 AMD64CondCode cc
= iselCondCode_C(env
, e
->Iex
.ITE
.cond
);
2906 addInstr(env
, AMD64Instr_SseCMov(cc
^ 1, r0
, dst
));
2911 vpanic("iselFltExpr_wrk");
/*---------------------------------------------------------*/
/*--- ISEL: Floating point expressions (64 bit)         ---*/
/*---------------------------------------------------------*/

/* Compute a 64-bit floating point value into the lower half of an xmm
   register, the identity of which is returned.  As with
   iselIntExpr_R, the returned reg will be virtual, and it must not be
   changed by subsequent code emitted by the caller.
*/

/* IEEE 754 formats.  From http://www.freesoft.org/CIE/RFC/1832/32.htm:

    Type                  S (1 bit)   E (11 bits)   F (52 bits)
    ----                  ---------   -----------   -----------
    signalling NaN        u           2047 (max)    .0uuuuu---u
                                                    (with at least
                                                     one 1 bit)
    quiet NaN             u           2047 (max)    .1uuuuu---u

    negative infinity     1           2047 (max)    .000000---0

    positive infinity     0           2047 (max)    .000000---0

    negative zero         1           0             .000000---0

    positive zero         0           0             .000000---0
*/
2943 static HReg
iselDblExpr ( ISelEnv
* env
, const IRExpr
* e
)
2945 HReg r
= iselDblExpr_wrk( env
, e
);
2947 vex_printf("\n"); ppIRExpr(e
); vex_printf("\n");
2949 vassert(hregClass(r
) == HRcVec128
);
2950 vassert(hregIsVirtual(r
));
2954 /* DO NOT CALL THIS DIRECTLY */
2955 static HReg
iselDblExpr_wrk ( ISelEnv
* env
, const IRExpr
* e
)
2957 IRType ty
= typeOfIRExpr(env
->type_env
,e
);
2959 vassert(ty
== Ity_F64
);
2961 if (e
->tag
== Iex_RdTmp
) {
2962 return lookupIRTemp(env
, e
->Iex
.RdTmp
.tmp
);
2965 if (e
->tag
== Iex_Const
) {
2966 union { ULong u64
; Double f64
; } u
;
2967 HReg res
= newVRegV(env
);
2968 HReg tmp
= newVRegI(env
);
2969 vassert(sizeof(u
) == 8);
2970 vassert(sizeof(u
.u64
) == 8);
2971 vassert(sizeof(u
.f64
) == 8);
2973 if (e
->Iex
.Const
.con
->tag
== Ico_F64
) {
2974 u
.f64
= e
->Iex
.Const
.con
->Ico
.F64
;
2976 else if (e
->Iex
.Const
.con
->tag
== Ico_F64i
) {
2977 u
.u64
= e
->Iex
.Const
.con
->Ico
.F64i
;
2980 vpanic("iselDblExpr(amd64): const");
2982 addInstr(env
, AMD64Instr_Imm64(u
.u64
, tmp
));
2983 addInstr(env
, AMD64Instr_Push(AMD64RMI_Reg(tmp
)));
2984 addInstr(env
, AMD64Instr_SseLdSt(
2985 True
/*load*/, 8, res
,
2986 AMD64AMode_IR(0, hregAMD64_RSP())
2992 if (e
->tag
== Iex_Load
&& e
->Iex
.Load
.end
== Iend_LE
) {
2994 HReg res
= newVRegV(env
);
2995 vassert(e
->Iex
.Load
.ty
== Ity_F64
);
2996 am
= iselIntExpr_AMode(env
, e
->Iex
.Load
.addr
);
2997 addInstr(env
, AMD64Instr_SseLdSt( True
/*load*/, 8, res
, am
));
3001 if (e
->tag
== Iex_Get
) {
3002 AMD64AMode
* am
= AMD64AMode_IR( e
->Iex
.Get
.offset
,
3004 HReg res
= newVRegV(env
);
3005 addInstr(env
, AMD64Instr_SseLdSt( True
/*load*/, 8, res
, am
));
3009 if (e
->tag
== Iex_GetI
) {
3011 = genGuestArrayOffset(
3012 env
, e
->Iex
.GetI
.descr
,
3013 e
->Iex
.GetI
.ix
, e
->Iex
.GetI
.bias
);
3014 HReg res
= newVRegV(env
);
3015 addInstr(env
, AMD64Instr_SseLdSt( True
/*load*/, 8, res
, am
));
3019 if (e
->tag
== Iex_Triop
) {
3020 IRTriop
*triop
= e
->Iex
.Triop
.details
;
3021 AMD64SseOp op
= Asse_INVALID
;
3022 switch (triop
->op
) {
3023 case Iop_AddF64
: op
= Asse_ADDF
; break;
3024 case Iop_SubF64
: op
= Asse_SUBF
; break;
3025 case Iop_MulF64
: op
= Asse_MULF
; break;
3026 case Iop_DivF64
: op
= Asse_DIVF
; break;
3029 if (op
!= Asse_INVALID
) {
3030 HReg dst
= newVRegV(env
);
3031 HReg argL
= iselDblExpr(env
, triop
->arg2
);
3032 HReg argR
= iselDblExpr(env
, triop
->arg3
);
3033 addInstr(env
, mk_vMOVsd_RR(argL
, dst
));
3034 /* XXXROUNDINGFIXME */
3035 /* set roundingmode here */
3036 addInstr(env
, AMD64Instr_Sse64FLo(op
, argR
, dst
));
3041 if (e
->tag
== Iex_Qop
&& e
->Iex
.Qop
.details
->op
== Iop_MAddF64
) {
3042 IRQop
*qop
= e
->Iex
.Qop
.details
;
3043 HReg dst
= newVRegV(env
);
3044 HReg argX
= iselDblExpr(env
, qop
->arg2
);
3045 HReg argY
= iselDblExpr(env
, qop
->arg3
);
3046 HReg argZ
= iselDblExpr(env
, qop
->arg4
);
3047 if (env
->hwcaps
& VEX_HWCAPS_AMD64_FMA3
) {
3048 vassert(dst
.u32
!= argY
.u32
&& dst
.u32
!= argZ
.u32
);
3049 if (dst
.u32
!= argX
.u32
)
3050 addInstr(env
, AMD64Instr_SseReRg(Asse_MOV
, argX
, dst
));
3051 addInstr(env
, AMD64Instr_Avx64FLo(Asse_VFMADD213
, argY
, argZ
, dst
));
3055 /* XXXROUNDINGFIXME */
3056 /* set roundingmode here */
3057 /* subq $32, %rsp -- make a space*/
3058 sub_from_rsp(env
, 32);
3059 /* Prepare 4 arg regs:
3065 addInstr(env
, AMD64Instr_Lea64(AMD64AMode_IR(0, hregAMD64_RSP()),
3067 addInstr(env
, AMD64Instr_Lea64(AMD64AMode_IR(8, hregAMD64_RSP()),
3069 addInstr(env
, AMD64Instr_Lea64(AMD64AMode_IR(16, hregAMD64_RSP()),
3071 addInstr(env
, AMD64Instr_Lea64(AMD64AMode_IR(24, hregAMD64_RSP()),
3073 /* Store the three args, at (%rsi), (%rdx) and (%rcx):
3074 movsd %argX, 0(%rsi)
3075 movsd %argY, 0(%rdx)
3076 movsd %argZ, 0(%rcx)
3078 addInstr(env
, AMD64Instr_SseLdSt(False
/*!isLoad*/, 8, argX
,
3079 AMD64AMode_IR(0, hregAMD64_RSI())));
3080 addInstr(env
, AMD64Instr_SseLdSt(False
/*!isLoad*/, 8, argY
,
3081 AMD64AMode_IR(0, hregAMD64_RDX())));
3082 addInstr(env
, AMD64Instr_SseLdSt(False
/*!isLoad*/, 8, argZ
,
3083 AMD64AMode_IR(0, hregAMD64_RCX())));
3085 /* call the helper with priority order : fma4 -> fallback generic
3086 remark: the fma3 case is handled before without helper*/
3087 #if defined(VGA_amd64)
3088 if (env
->hwcaps
& VEX_HWCAPS_AMD64_FMA4
) {
3089 addInstr(env
, AMD64Instr_Call( Acc_ALWAYS
,
3090 (ULong
)(HWord
)h_amd64_calc_MAddF64_fma4
,
3091 4, mk_RetLoc_simple(RLPri_None
) ));
3095 addInstr(env
, AMD64Instr_Call( Acc_ALWAYS
,
3096 (ULong
)(HWord
)h_generic_calc_MAddF64
,
3097 4, mk_RetLoc_simple(RLPri_None
) ));
3100 /* fetch the result from memory, using %r_argp, which the
3101 register allocator will keep alive across the call. */
3102 addInstr(env
, AMD64Instr_SseLdSt(True
/*isLoad*/, 8, dst
,
3103 AMD64AMode_IR(0, hregAMD64_RSP())));
3104 /* and finally, clear the space */
3105 add_to_rsp(env
, 32);
3109 if (e
->tag
== Iex_Binop
&& e
->Iex
.Binop
.op
== Iop_RoundF64toInt
) {
3110 AMD64AMode
* m8_rsp
= AMD64AMode_IR(-8, hregAMD64_RSP());
3111 HReg arg
= iselDblExpr(env
, e
->Iex
.Binop
.arg2
);
3112 HReg dst
= newVRegV(env
);
3114 /* rf now holds the value to be rounded. The first thing to do
3115 is set the FPU's rounding mode accordingly. */
3117 /* Set host x87 rounding mode */
3118 set_FPU_rounding_mode( env
, e
->Iex
.Binop
.arg1
);
3120 addInstr(env
, AMD64Instr_SseLdSt(False
/*store*/, 8, arg
, m8_rsp
));
3121 addInstr(env
, AMD64Instr_A87Free(1));
3122 addInstr(env
, AMD64Instr_A87PushPop(m8_rsp
, True
/*push*/, 8));
3123 addInstr(env
, AMD64Instr_A87FpOp(Afp_ROUND
));
3124 addInstr(env
, AMD64Instr_A87PushPop(m8_rsp
, False
/*pop*/, 8));
3125 addInstr(env
, AMD64Instr_SseLdSt(True
/*load*/, 8, dst
, m8_rsp
));
3127 /* Restore default x87 rounding. */
3128 set_FPU_rounding_default( env
);
3133 IRTriop
*triop
= e
->Iex
.Triop
.details
;
3134 if (e
->tag
== Iex_Triop
3135 && (triop
->op
== Iop_ScaleF64
3136 || triop
->op
== Iop_AtanF64
3137 || triop
->op
== Iop_Yl2xF64
3138 || triop
->op
== Iop_Yl2xp1F64
3139 || triop
->op
== Iop_PRemF64
3140 || triop
->op
== Iop_PRem1F64
)
3142 AMD64AMode
* m8_rsp
= AMD64AMode_IR(-8, hregAMD64_RSP());
3143 HReg arg1
= iselDblExpr(env
, triop
->arg2
);
3144 HReg arg2
= iselDblExpr(env
, triop
->arg3
);
3145 HReg dst
= newVRegV(env
);
3146 Bool arg2first
= toBool(triop
->op
== Iop_ScaleF64
3147 || triop
->op
== Iop_PRemF64
3148 || triop
->op
== Iop_PRem1F64
);
3149 addInstr(env
, AMD64Instr_A87Free(2));
3151 /* one arg -> top of x87 stack */
3152 addInstr(env
, AMD64Instr_SseLdSt(
3153 False
/*store*/, 8, arg2first
? arg2
: arg1
, m8_rsp
));
3154 addInstr(env
, AMD64Instr_A87PushPop(m8_rsp
, True
/*push*/, 8));
3156 /* other arg -> top of x87 stack */
3157 addInstr(env
, AMD64Instr_SseLdSt(
3158 False
/*store*/, 8, arg2first
? arg1
: arg2
, m8_rsp
));
3159 addInstr(env
, AMD64Instr_A87PushPop(m8_rsp
, True
/*push*/, 8));
3162 /* XXXROUNDINGFIXME */
3163 /* set roundingmode here */
3164 switch (triop
->op
) {
3166 addInstr(env
, AMD64Instr_A87FpOp(Afp_SCALE
));
3169 addInstr(env
, AMD64Instr_A87FpOp(Afp_ATAN
));
3172 addInstr(env
, AMD64Instr_A87FpOp(Afp_YL2X
));
3175 addInstr(env
, AMD64Instr_A87FpOp(Afp_YL2XP1
));
3178 addInstr(env
, AMD64Instr_A87FpOp(Afp_PREM
));
3181 addInstr(env
, AMD64Instr_A87FpOp(Afp_PREM1
));
3188 addInstr(env
, AMD64Instr_A87PushPop(m8_rsp
, False
/*pop*/, 8));
3189 addInstr(env
, AMD64Instr_SseLdSt(True
/*load*/, 8, dst
, m8_rsp
));
3193 if (e
->tag
== Iex_Binop
&& e
->Iex
.Binop
.op
== Iop_I64StoF64
) {
3194 HReg dst
= newVRegV(env
);
3195 HReg src
= iselIntExpr_R(env
, e
->Iex
.Binop
.arg2
);
3196 set_SSE_rounding_mode( env
, e
->Iex
.Binop
.arg1
);
3197 addInstr(env
, AMD64Instr_SseSI2SF( 8, 8, src
, dst
));
3198 set_SSE_rounding_default( env
);
3202 if (e
->tag
== Iex_Unop
&& e
->Iex
.Unop
.op
== Iop_I32StoF64
) {
3203 HReg dst
= newVRegV(env
);
3204 HReg src
= iselIntExpr_R(env
, e
->Iex
.Unop
.arg
);
3205 set_SSE_rounding_default( env
);
3206 addInstr(env
, AMD64Instr_SseSI2SF( 4, 8, src
, dst
));
3210 if (e
->tag
== Iex_Unop
3211 && (e
->Iex
.Unop
.op
== Iop_NegF64
3212 || e
->Iex
.Unop
.op
== Iop_AbsF64
)) {
3213 /* Sigh ... very rough code. Could do much better. */
3214 /* Get the 128-bit literal 00---0 10---0 into a register
3215 and xor/nand it with the value to be negated. */
3216 HReg r1
= newVRegI(env
);
3217 HReg dst
= newVRegV(env
);
3218 HReg tmp
= newVRegV(env
);
3219 HReg src
= iselDblExpr(env
, e
->Iex
.Unop
.arg
);
3220 AMD64AMode
* rsp0
= AMD64AMode_IR(0, hregAMD64_RSP());
3221 addInstr(env
, mk_vMOVsd_RR(src
,tmp
));
3222 addInstr(env
, AMD64Instr_Push(AMD64RMI_Imm(0)));
3223 addInstr(env
, AMD64Instr_Imm64( 1ULL<<63, r1
));
3224 addInstr(env
, AMD64Instr_Push(AMD64RMI_Reg(r1
)));
3225 addInstr(env
, AMD64Instr_SseLdSt(True
, 16, dst
, rsp0
));
3227 if (e
->Iex
.Unop
.op
== Iop_NegF64
)
3228 addInstr(env
, AMD64Instr_SseReRg(Asse_XOR
, tmp
, dst
));
3230 addInstr(env
, AMD64Instr_SseReRg(Asse_ANDN
, tmp
, dst
));
3232 add_to_rsp(env
, 16);
3236 if (e
->tag
== Iex_Binop
) {
3237 A87FpOp fpop
= Afp_INVALID
;
3238 switch (e
->Iex
.Binop
.op
) {
3239 case Iop_SqrtF64
: fpop
= Afp_SQRT
; break;
3240 case Iop_SinF64
: fpop
= Afp_SIN
; break;
3241 case Iop_CosF64
: fpop
= Afp_COS
; break;
3242 case Iop_TanF64
: fpop
= Afp_TAN
; break;
3243 case Iop_2xm1F64
: fpop
= Afp_2XM1
; break;
3246 if (fpop
!= Afp_INVALID
) {
3247 AMD64AMode
* m8_rsp
= AMD64AMode_IR(-8, hregAMD64_RSP());
3248 HReg arg
= iselDblExpr(env
, e
->Iex
.Binop
.arg2
);
3249 HReg dst
= newVRegV(env
);
3250 Int nNeeded
= e
->Iex
.Binop
.op
==Iop_TanF64
? 2 : 1;
3251 addInstr(env
, AMD64Instr_SseLdSt(False
/*store*/, 8, arg
, m8_rsp
));
3252 addInstr(env
, AMD64Instr_A87Free(nNeeded
));
3253 addInstr(env
, AMD64Instr_A87PushPop(m8_rsp
, True
/*push*/, 8));
3254 /* XXXROUNDINGFIXME */
3255 /* set roundingmode here */
3256 /* Note that AMD64Instr_A87FpOp(Afp_TAN) sets the condition
3257 codes. I don't think that matters, since this insn
3258 selector never generates such an instruction intervening
3259 between an flag-setting instruction and a flag-using
3261 addInstr(env
, AMD64Instr_A87FpOp(fpop
));
3262 addInstr(env
, AMD64Instr_A87PushPop(m8_rsp
, False
/*pop*/, 8));
3263 addInstr(env
, AMD64Instr_SseLdSt(True
/*load*/, 8, dst
, m8_rsp
));
3268 if (e
->tag
== Iex_Unop
) {
3269 switch (e
->Iex
.Unop
.op
) {
3270 //.. case Iop_I32toF64: {
3271 //.. HReg dst = newVRegF(env);
3272 //.. HReg ri = iselIntExpr_R(env, e->Iex.Unop.arg);
3273 //.. addInstr(env, X86Instr_Push(X86RMI_Reg(ri)));
3274 //.. set_FPU_rounding_default(env);
3275 //.. addInstr(env, X86Instr_FpLdStI(
3276 //.. True/*load*/, 4, dst,
3277 //.. X86AMode_IR(0, hregX86_ESP())));
3278 //.. add_to_esp(env, 4);
3281 case Iop_ReinterpI64asF64
: {
3282 /* Given an I64, produce an IEEE754 double with the same
3284 AMD64AMode
* m8_rsp
= AMD64AMode_IR(-8, hregAMD64_RSP());
3285 HReg dst
= newVRegV(env
);
3286 AMD64RI
* src
= iselIntExpr_RI(env
, e
->Iex
.Unop
.arg
);
3288 set_SSE_rounding_default(env
);
3289 addInstr(env
, AMD64Instr_Alu64M(Aalu_MOV
, src
, m8_rsp
));
3290 addInstr(env
, AMD64Instr_SseLdSt(True
/*load*/, 8, dst
, m8_rsp
));
3293 case Iop_F32toF64
: {
3295 HReg f64
= newVRegV(env
);
3296 /* this shouldn't be necessary, but be paranoid ... */
3297 set_SSE_rounding_default(env
);
3298 f32
= iselFltExpr(env
, e
->Iex
.Unop
.arg
);
3299 addInstr(env
, AMD64Instr_SseSDSS(False
/*S->D*/, f32
, f64
));
3307 /* --------- MULTIPLEX --------- */
3308 if (e
->tag
== Iex_ITE
) { // VFD
3310 vassert(ty
== Ity_F64
);
3311 vassert(typeOfIRExpr(env
->type_env
,e
->Iex
.ITE
.cond
) == Ity_I1
);
3312 r1
= iselDblExpr(env
, e
->Iex
.ITE
.iftrue
);
3313 r0
= iselDblExpr(env
, e
->Iex
.ITE
.iffalse
);
3314 dst
= newVRegV(env
);
3315 addInstr(env
, mk_vMOVsd_RR(r1
,dst
));
3316 AMD64CondCode cc
= iselCondCode_C(env
, e
->Iex
.ITE
.cond
);
3317 addInstr(env
, AMD64Instr_SseCMov(cc
^ 1, r0
, dst
));
3322 vpanic("iselDblExpr_wrk");
3326 /*---------------------------------------------------------*/
3327 /*--- ISEL: SIMD (Vector) expressions, 128 bit. ---*/
3328 /*---------------------------------------------------------*/
3330 static HReg
iselVecExpr ( ISelEnv
* env
, const IRExpr
* e
)
3332 HReg r
= iselVecExpr_wrk( env
, e
);
3334 vex_printf("\n"); ppIRExpr(e
); vex_printf("\n");
3336 vassert(hregClass(r
) == HRcVec128
);
3337 vassert(hregIsVirtual(r
));
3342 /* DO NOT CALL THIS DIRECTLY */
3343 static HReg
iselVecExpr_wrk ( ISelEnv
* env
, const IRExpr
* e
)
3345 HWord fn
= 0; /* address of helper fn, if required */
3346 Bool arg1isEReg
= False
;
3347 AMD64SseOp op
= Asse_INVALID
;
3349 IRType ty
= typeOfIRExpr(env
->type_env
, e
);
3350 vassert(ty
== Ity_V128
);
3353 if (e
->tag
== Iex_RdTmp
) {
3354 return lookupIRTemp(env
, e
->Iex
.RdTmp
.tmp
);
3357 if (e
->tag
== Iex_Get
) {
3358 HReg dst
= newVRegV(env
);
3359 addInstr(env
, AMD64Instr_SseLdSt(
3363 AMD64AMode_IR(e
->Iex
.Get
.offset
, hregAMD64_RBP())
3369 if (e
->tag
== Iex_Load
&& e
->Iex
.Load
.end
== Iend_LE
) {
3370 HReg dst
= newVRegV(env
);
3371 AMD64AMode
* am
= iselIntExpr_AMode(env
, e
->Iex
.Load
.addr
);
3372 addInstr(env
, AMD64Instr_SseLdSt( True
/*load*/, 16, dst
, am
));
3376 if (e
->tag
== Iex_Const
) {
3377 HReg dst
= newVRegV(env
);
3378 vassert(e
->Iex
.Const
.con
->tag
== Ico_V128
);
3379 switch (e
->Iex
.Const
.con
->Ico
.V128
) {
3381 dst
= generate_zeroes_V128(env
);
3384 dst
= generate_ones_V128(env
);
3387 AMD64AMode
* rsp0
= AMD64AMode_IR(0, hregAMD64_RSP());
3388 /* do push_uimm64 twice, first time for the high-order half. */
3389 push_uimm64(env
, bitmask8_to_bytemask64(
3390 (e
->Iex
.Const
.con
->Ico
.V128
>> 8) & 0xFF
3392 push_uimm64(env
, bitmask8_to_bytemask64(
3393 (e
->Iex
.Const
.con
->Ico
.V128
>> 0) & 0xFF
3395 addInstr(env
, AMD64Instr_SseLdSt( True
/*load*/, 16, dst
, rsp0
));
3396 add_to_rsp(env
, 16);
3403 if (e
->tag
== Iex_Unop
) {
3404 switch (e
->Iex
.Unop
.op
) {
3407 HReg arg
= iselVecExpr(env
, e
->Iex
.Unop
.arg
);
3408 return do_sse_NotV128(env
, arg
);
3411 case Iop_CmpNEZ64x2
: {
3412 /* We can use SSE2 instructions for this. */
3413 /* Ideally, we want to do a 64Ix2 comparison against zero of
3414 the operand. Problem is no such insn exists. Solution
3415 therefore is to do a 32Ix4 comparison instead, and bitwise-
3416 negate (NOT) the result. Let a,b,c,d be 32-bit lanes, and
3417 let the not'd result of this initial comparison be a:b:c:d.
3418 What we need to compute is (a|b):(a|b):(c|d):(c|d). So, use
3419 pshufd to create a value b:a:d:c, and OR that with a:b:c:d,
3420 giving the required result.
3422 The required selection sequence is 2,3,0,1, which
3423 according to Intel's documentation means the pshufd
3424 literal value is 0xB1, that is,
3425 (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)
3427 HReg arg
= iselVecExpr(env
, e
->Iex
.Unop
.arg
);
3428 HReg tmp
= generate_zeroes_V128(env
);
3429 HReg dst
= newVRegV(env
);
3430 addInstr(env
, AMD64Instr_SseReRg(Asse_CMPEQ32
, arg
, tmp
));
3431 tmp
= do_sse_NotV128(env
, tmp
);
3432 addInstr(env
, AMD64Instr_SseShuf(0xB1, tmp
, dst
));
3433 addInstr(env
, AMD64Instr_SseReRg(Asse_OR
, tmp
, dst
));
3437 case Iop_CmpNEZ32x4
: op
= Asse_CMPEQ32
; goto do_CmpNEZ_vector
;
3438 case Iop_CmpNEZ16x8
: op
= Asse_CMPEQ16
; goto do_CmpNEZ_vector
;
3439 case Iop_CmpNEZ8x16
: op
= Asse_CMPEQ8
; goto do_CmpNEZ_vector
;
3442 HReg arg
= iselVecExpr(env
, e
->Iex
.Unop
.arg
);
3443 HReg tmp
= newVRegV(env
);
3444 HReg zero
= generate_zeroes_V128(env
);
3446 addInstr(env
, mk_vMOVsd_RR(arg
, tmp
));
3447 addInstr(env
, AMD64Instr_SseReRg(op
, zero
, tmp
));
3448 dst
= do_sse_NotV128(env
, tmp
);
3452 case Iop_RecipEst32Fx4
: op
= Asse_RCPF
; goto do_32Fx4_unary
;
3453 case Iop_RSqrtEst32Fx4
: op
= Asse_RSQRTF
; goto do_32Fx4_unary
;
3456 HReg arg
= iselVecExpr(env
, e
->Iex
.Unop
.arg
);
3457 HReg dst
= newVRegV(env
);
3458 addInstr(env
, AMD64Instr_Sse32Fx4(op
, arg
, dst
));
3462 case Iop_RecipEst32F0x4
: op
= Asse_RCPF
; goto do_32F0x4_unary
;
3463 case Iop_RSqrtEst32F0x4
: op
= Asse_RSQRTF
; goto do_32F0x4_unary
;
3464 case Iop_Sqrt32F0x4
: op
= Asse_SQRTF
; goto do_32F0x4_unary
;
3467 /* A bit subtle. We have to copy the arg to the result
3468 register first, because actually doing the SSE scalar insn
3469 leaves the upper 3/4 of the destination register
3470 unchanged. Whereas the required semantics of these
3471 primops is that the upper 3/4 is simply copied in from the
3473 HReg arg
= iselVecExpr(env
, e
->Iex
.Unop
.arg
);
3474 HReg dst
= newVRegV(env
);
3475 addInstr(env
, mk_vMOVsd_RR(arg
, dst
));
3476 addInstr(env
, AMD64Instr_Sse32FLo(op
, arg
, dst
));
3480 case Iop_Sqrt64F0x2
: op
= Asse_SQRTF
; goto do_64F0x2_unary
;
3483 /* A bit subtle. We have to copy the arg to the result
3484 register first, because actually doing the SSE scalar insn
3485 leaves the upper half of the destination register
3486 unchanged. Whereas the required semantics of these
3487 primops is that the upper half is simply copied in from the
3489 HReg arg
= iselVecExpr(env
, e
->Iex
.Unop
.arg
);
3490 HReg dst
= newVRegV(env
);
3491 addInstr(env
, mk_vMOVsd_RR(arg
, dst
));
3492 addInstr(env
, AMD64Instr_Sse64FLo(op
, arg
, dst
));
3496 case Iop_32UtoV128
: {
3497 // FIXME maybe just use MOVQ here?
3498 HReg dst
= newVRegV(env
);
3499 AMD64AMode
* rsp_m32
= AMD64AMode_IR(-32, hregAMD64_RSP());
3500 AMD64RI
* ri
= iselIntExpr_RI(env
, e
->Iex
.Unop
.arg
);
3501 addInstr(env
, AMD64Instr_Alu64M(Aalu_MOV
, ri
, rsp_m32
));
3502 addInstr(env
, AMD64Instr_SseLdzLO(4, dst
, rsp_m32
));
3506 case Iop_64UtoV128
: {
3507 // FIXME maybe just use MOVQ here?
3508 HReg dst
= newVRegV(env
);
3509 AMD64AMode
* rsp0
= AMD64AMode_IR(0, hregAMD64_RSP());
3510 AMD64RMI
* rmi
= iselIntExpr_RMI(env
, e
->Iex
.Unop
.arg
);
3511 addInstr(env
, AMD64Instr_Push(rmi
));
3512 addInstr(env
, AMD64Instr_SseLdzLO(8, dst
, rsp0
));
3517 case Iop_V256toV128_0
:
3518 case Iop_V256toV128_1
: {
3520 iselDVecExpr(&vHi
, &vLo
, env
, e
->Iex
.Unop
.arg
);
3521 return (e
->Iex
.Unop
.op
== Iop_V256toV128_1
) ? vHi
: vLo
;
3524 case Iop_F16toF32x4
: {
3525 if (env
->hwcaps
& VEX_HWCAPS_AMD64_F16C
) {
3526 HReg src
= iselIntExpr_R(env
, e
->Iex
.Unop
.arg
);
3527 HReg dst
= newVRegV(env
);
3528 addInstr(env
, AMD64Instr_SseMOVQ(src
, dst
, /*toXMM=*/True
));
3529 addInstr(env
, AMD64Instr_Sse32Fx4(Asse_F16toF32
, dst
, dst
));
3537 } /* switch (e->Iex.Unop.op) */
3538 } /* if (e->tag == Iex_Unop) */
3540 if (e
->tag
== Iex_Binop
) {
3541 switch (e
->Iex
.Binop
.op
) {
3544 case Iop_Sqrt32Fx4
: {
3545 /* :: (rmode, vec) -> vec */
3546 HReg arg
= iselVecExpr(env
, e
->Iex
.Binop
.arg2
);
3547 HReg dst
= newVRegV(env
);
3548 /* XXXROUNDINGFIXME */
3549 /* set roundingmode here */
3550 addInstr(env
, (e
->Iex
.Binop
.op
== Iop_Sqrt64Fx2
3551 ? AMD64Instr_Sse64Fx2
: AMD64Instr_Sse32Fx4
)
3552 (Asse_SQRTF
, arg
, dst
));
3556 /* FIXME: could we generate MOVQ here? */
3557 case Iop_SetV128lo64
: {
3558 HReg dst
= newVRegV(env
);
3559 HReg srcV
= iselVecExpr(env
, e
->Iex
.Binop
.arg1
);
3560 HReg srcI
= iselIntExpr_R(env
, e
->Iex
.Binop
.arg2
);
3561 AMD64AMode
* rsp_m16
= AMD64AMode_IR(-16, hregAMD64_RSP());
3562 addInstr(env
, AMD64Instr_SseLdSt(False
/*store*/, 16, srcV
, rsp_m16
));
3563 addInstr(env
, AMD64Instr_Alu64M(Aalu_MOV
, AMD64RI_Reg(srcI
), rsp_m16
));
3564 addInstr(env
, AMD64Instr_SseLdSt(True
/*load*/, 16, dst
, rsp_m16
));
3568 /* FIXME: could we generate MOVD here? */
3569 case Iop_SetV128lo32
: {
3570 HReg dst
= newVRegV(env
);
3571 HReg srcV
= iselVecExpr(env
, e
->Iex
.Binop
.arg1
);
3572 HReg srcI
= iselIntExpr_R(env
, e
->Iex
.Binop
.arg2
);
3573 AMD64AMode
* rsp_m16
= AMD64AMode_IR(-16, hregAMD64_RSP());
3574 addInstr(env
, AMD64Instr_SseLdSt(False
/*store*/, 16, srcV
, rsp_m16
));
3575 addInstr(env
, AMD64Instr_Store(4, srcI
, rsp_m16
));
3576 addInstr(env
, AMD64Instr_SseLdSt(True
/*load*/, 16, dst
, rsp_m16
));
3580 case Iop_64HLtoV128
: {
3581 const IRExpr
* arg1
= e
->Iex
.Binop
.arg1
;
3582 const IRExpr
* arg2
= e
->Iex
.Binop
.arg2
;
3583 HReg dst
= newVRegV(env
);
3584 HReg tmp
= newVRegV(env
);
3585 HReg qHi
= iselIntExpr_R(env
, arg1
);
3586 // If the args are trivially the same (tmp or const), use the same
3587 // source register for both, and only one movq since those are
3588 // (relatively) expensive.
3589 if (areAtomsAndEqual(arg1
, arg2
)) {
3590 addInstr(env
, AMD64Instr_SseMOVQ(qHi
, dst
, True
/*toXMM*/));
3591 addInstr(env
, mk_vMOVsd_RR(dst
, tmp
));
3592 addInstr(env
, AMD64Instr_SseShiftN(Asse_SHL128
, 64, dst
));
3593 addInstr(env
, AMD64Instr_SseReRg(Asse_OR
, tmp
, dst
));
3595 HReg qLo
= iselIntExpr_R(env
, arg2
);
3596 addInstr(env
, AMD64Instr_SseMOVQ(qHi
, dst
, True
/*toXMM*/));
3597 addInstr(env
, AMD64Instr_SseShiftN(Asse_SHL128
, 64, dst
));
3598 addInstr(env
, AMD64Instr_SseMOVQ(qLo
, tmp
, True
/*toXMM*/));
3599 addInstr(env
, AMD64Instr_SseReRg(Asse_OR
, tmp
, dst
));
3604 case Iop_CmpEQ32Fx4
: op
= Asse_CMPEQF
; goto do_32Fx4
;
3605 case Iop_CmpLT32Fx4
: op
= Asse_CMPLTF
; goto do_32Fx4
;
3606 case Iop_CmpLE32Fx4
: op
= Asse_CMPLEF
; goto do_32Fx4
;
3607 case Iop_CmpUN32Fx4
: op
= Asse_CMPUNF
; goto do_32Fx4
;
3608 case Iop_Max32Fx4
: op
= Asse_MAXF
; goto do_32Fx4
;
3609 case Iop_Min32Fx4
: op
= Asse_MINF
; goto do_32Fx4
;
3612 HReg argL
= iselVecExpr(env
, e
->Iex
.Binop
.arg1
);
3613 HReg argR
= iselVecExpr(env
, e
->Iex
.Binop
.arg2
);
3614 HReg dst
= newVRegV(env
);
3615 addInstr(env
, mk_vMOVsd_RR(argL
, dst
));
3616 addInstr(env
, AMD64Instr_Sse32Fx4(op
, argR
, dst
));
3620 case Iop_CmpEQ64Fx2
: op
= Asse_CMPEQF
; goto do_64Fx2
;
3621 case Iop_CmpLT64Fx2
: op
= Asse_CMPLTF
; goto do_64Fx2
;
3622 case Iop_CmpLE64Fx2
: op
= Asse_CMPLEF
; goto do_64Fx2
;
3623 case Iop_CmpUN64Fx2
: op
= Asse_CMPUNF
; goto do_64Fx2
;
3624 case Iop_Max64Fx2
: op
= Asse_MAXF
; goto do_64Fx2
;
3625 case Iop_Min64Fx2
: op
= Asse_MINF
; goto do_64Fx2
;
3628 HReg argL
= iselVecExpr(env
, e
->Iex
.Binop
.arg1
);
3629 HReg argR
= iselVecExpr(env
, e
->Iex
.Binop
.arg2
);
3630 HReg dst
= newVRegV(env
);
3631 addInstr(env
, mk_vMOVsd_RR(argL
, dst
));
3632 addInstr(env
, AMD64Instr_Sse64Fx2(op
, argR
, dst
));
      case Iop_CmpEQ32F0x4: op = Asse_CMPEQF; goto do_32F0x4;
      case Iop_CmpLT32F0x4: op = Asse_CMPLTF; goto do_32F0x4;
      case Iop_CmpLE32F0x4: op = Asse_CMPLEF; goto do_32F0x4;
      case Iop_CmpUN32F0x4: op = Asse_CMPUNF; goto do_32F0x4;
      case Iop_Add32F0x4:   op = Asse_ADDF;   goto do_32F0x4;
      case Iop_Div32F0x4:   op = Asse_DIVF;   goto do_32F0x4;
      case Iop_Max32F0x4:   op = Asse_MAXF;   goto do_32F0x4;
      case Iop_Min32F0x4:   op = Asse_MINF;   goto do_32F0x4;
      case Iop_Mul32F0x4:   op = Asse_MULF;   goto do_32F0x4;
      case Iop_Sub32F0x4:   op = Asse_SUBF;   goto do_32F0x4;
      do_32F0x4: {
         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
         HReg dst = newVRegV(env);
         addInstr(env, mk_vMOVsd_RR(argL, dst));
         addInstr(env, AMD64Instr_Sse32FLo(op, argR, dst));
      case Iop_CmpEQ64F0x2: op = Asse_CMPEQF; goto do_64F0x2;
      case Iop_CmpLT64F0x2: op = Asse_CMPLTF; goto do_64F0x2;
      case Iop_CmpLE64F0x2: op = Asse_CMPLEF; goto do_64F0x2;
      case Iop_CmpUN64F0x2: op = Asse_CMPUNF; goto do_64F0x2;
      case Iop_Add64F0x2:   op = Asse_ADDF;   goto do_64F0x2;
      case Iop_Div64F0x2:   op = Asse_DIVF;   goto do_64F0x2;
      case Iop_Max64F0x2:   op = Asse_MAXF;   goto do_64F0x2;
      case Iop_Min64F0x2:   op = Asse_MINF;   goto do_64F0x2;
      case Iop_Mul64F0x2:   op = Asse_MULF;   goto do_64F0x2;
      case Iop_Sub64F0x2:   op = Asse_SUBF;   goto do_64F0x2;
      do_64F0x2: {
         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
         HReg dst = newVRegV(env);
         addInstr(env, mk_vMOVsd_RR(argL, dst));
         addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
      case Iop_PermOrZero8x16:
         if (env->hwcaps & VEX_HWCAPS_AMD64_SSSE3) {
         // Otherwise we'll have to generate a call to
         // h_generic_calc_PermOrZero8x16 (ATK).  But that would only be for a
         // host which doesn't have SSSE3, in which case we don't expect this
         // IROp to enter the compilation pipeline in the first place.

      case Iop_PwExtUSMulQAdd8x16:
         if (env->hwcaps & VEX_HWCAPS_AMD64_SSSE3) {
            op = Asse_PMADDUBSW;
      case Iop_QNarrowBin32Sto16Sx8:
         op = Asse_PACKSSD; arg1isEReg = True; goto do_SseReRg;
      case Iop_QNarrowBin16Sto8Sx16:
         op = Asse_PACKSSW; arg1isEReg = True; goto do_SseReRg;
      case Iop_QNarrowBin16Sto8Ux16:
         op = Asse_PACKUSW; arg1isEReg = True; goto do_SseReRg;

      case Iop_InterleaveHI8x16:
         op = Asse_UNPCKHB; arg1isEReg = True; goto do_SseReRg;
      case Iop_InterleaveHI16x8:
         op = Asse_UNPCKHW; arg1isEReg = True; goto do_SseReRg;
      case Iop_InterleaveHI32x4:
         op = Asse_UNPCKHD; arg1isEReg = True; goto do_SseReRg;
      case Iop_InterleaveHI64x2:
         op = Asse_UNPCKHQ; arg1isEReg = True; goto do_SseReRg;

      case Iop_InterleaveLO8x16:
         op = Asse_UNPCKLB; arg1isEReg = True; goto do_SseReRg;
      case Iop_InterleaveLO16x8:
         op = Asse_UNPCKLW; arg1isEReg = True; goto do_SseReRg;
      case Iop_InterleaveLO32x4:
         op = Asse_UNPCKLD; arg1isEReg = True; goto do_SseReRg;
      case Iop_InterleaveLO64x2:
         op = Asse_UNPCKLQ; arg1isEReg = True; goto do_SseReRg;

      case Iop_AndV128:    op = Asse_AND;      goto do_SseReRg;
      case Iop_OrV128:     op = Asse_OR;       goto do_SseReRg;
      case Iop_XorV128:    op = Asse_XOR;      goto do_SseReRg;
      case Iop_Add8x16:    op = Asse_ADD8;     goto do_SseReRg;
      case Iop_Add16x8:    op = Asse_ADD16;    goto do_SseReRg;
      case Iop_Add32x4:    op = Asse_ADD32;    goto do_SseReRg;
      case Iop_Add64x2:    op = Asse_ADD64;    goto do_SseReRg;
      case Iop_QAdd8Sx16:  op = Asse_QADD8S;   goto do_SseReRg;
      case Iop_QAdd16Sx8:  op = Asse_QADD16S;  goto do_SseReRg;
      case Iop_QAdd8Ux16:  op = Asse_QADD8U;   goto do_SseReRg;
      case Iop_QAdd16Ux8:  op = Asse_QADD16U;  goto do_SseReRg;
      case Iop_Avg8Ux16:   op = Asse_AVG8U;    goto do_SseReRg;
      case Iop_Avg16Ux8:   op = Asse_AVG16U;   goto do_SseReRg;
      case Iop_CmpEQ8x16:  op = Asse_CMPEQ8;   goto do_SseReRg;
      case Iop_CmpEQ16x8:  op = Asse_CMPEQ16;  goto do_SseReRg;
      case Iop_CmpEQ32x4:  op = Asse_CMPEQ32;  goto do_SseReRg;
      case Iop_CmpGT8Sx16: op = Asse_CMPGT8S;  goto do_SseReRg;
      case Iop_CmpGT16Sx8: op = Asse_CMPGT16S; goto do_SseReRg;
      case Iop_CmpGT32Sx4: op = Asse_CMPGT32S; goto do_SseReRg;
      case Iop_Max16Sx8:   op = Asse_MAX16S;   goto do_SseReRg;
      case Iop_Max8Ux16:   op = Asse_MAX8U;    goto do_SseReRg;
      case Iop_Min16Sx8:   op = Asse_MIN16S;   goto do_SseReRg;
      case Iop_Min8Ux16:   op = Asse_MIN8U;    goto do_SseReRg;
      case Iop_MulHi16Ux8: op = Asse_MULHI16U; goto do_SseReRg;
      case Iop_MulHi16Sx8: op = Asse_MULHI16S; goto do_SseReRg;
      case Iop_Mul16x8:    op = Asse_MUL16;    goto do_SseReRg;
      case Iop_Sub8x16:    op = Asse_SUB8;     goto do_SseReRg;
      case Iop_Sub16x8:    op = Asse_SUB16;    goto do_SseReRg;
      case Iop_Sub32x4:    op = Asse_SUB32;    goto do_SseReRg;
      case Iop_Sub64x2:    op = Asse_SUB64;    goto do_SseReRg;
      case Iop_QSub8Sx16:  op = Asse_QSUB8S;   goto do_SseReRg;
      case Iop_QSub16Sx8:  op = Asse_QSUB16S;  goto do_SseReRg;
      case Iop_QSub8Ux16:  op = Asse_QSUB8U;   goto do_SseReRg;
      case Iop_QSub16Ux8:  op = Asse_QSUB16U;  goto do_SseReRg;
      do_SseReRg: {
         HReg arg1 = iselVecExpr(env, e->Iex.Binop.arg1);
         HReg arg2 = iselVecExpr(env, e->Iex.Binop.arg2);
         HReg dst = newVRegV(env);
         if (arg1isEReg) {
            addInstr(env, mk_vMOVsd_RR(arg2, dst));
            addInstr(env, AMD64Instr_SseReRg(op, arg1, dst));
         } else {
            addInstr(env, mk_vMOVsd_RR(arg1, dst));
            addInstr(env, AMD64Instr_SseReRg(op, arg2, dst));
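            /* Note the operand swap when arg1isEReg is set: the pack and
               unpack cases above are not commutative, so the IR operand
               order has to be mapped onto x86's destructive two-operand
               form explicitly -- dst is initialised from arg2 and arg1
               goes in as the E operand, instead of the usual
               arg1-into-dst, arg2-as-E arrangement. */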
      case Iop_ShlN8x16: laneBits = 8;  op = Asse_SHL16; goto do_SseShift;
      case Iop_ShlN16x8: laneBits = 16; op = Asse_SHL16; goto do_SseShift;
      case Iop_ShlN32x4: laneBits = 32; op = Asse_SHL32; goto do_SseShift;
      case Iop_ShlN64x2: laneBits = 64; op = Asse_SHL64; goto do_SseShift;
      case Iop_SarN16x8: laneBits = 16; op = Asse_SAR16; goto do_SseShift;
      case Iop_SarN32x4: laneBits = 32; op = Asse_SAR32; goto do_SseShift;
      case Iop_ShrN16x8: laneBits = 16; op = Asse_SHR16; goto do_SseShift;
      case Iop_ShrN32x4: laneBits = 32; op = Asse_SHR32; goto do_SseShift;
      case Iop_ShrN64x2: laneBits = 64; op = Asse_SHR64; goto do_SseShift;
      do_SseShift: {
         HReg dst  = newVRegV(env);
         HReg greg = iselVecExpr(env, e->Iex.Binop.arg1);
         /* If it's a shift by an in-range immediate, generate a single
            instruction. */
         if (e->Iex.Binop.arg2->tag == Iex_Const) {
            IRConst* c = e->Iex.Binop.arg2->Iex.Const.con;
            vassert(c->tag == Ico_U8);
            UInt shift = c->Ico.U8;
            if (shift < laneBits) {
               if (laneBits == 8) {
                  /* This instruction doesn't exist so we need to fake it using
                     Asse_SHL16 and Asse_SHR16.

                     We'd like to shift every byte in the 16-byte register to
                     the left by some amount.

                     Instead, we will make a copy and shift all the 16-bit words
                     to the *right* by 8 and then to the left by 8 plus the
                     shift amount.  That will get us the correct answer for the
                     upper 8 bits of each 16-bit word and zero elsewhere.

                     Then we will shift all the 16-bit words in the original to
                     the left by 8 plus the shift amount and then to the right
                     by 8.  This will get the correct answer for the lower 8
                     bits of each 16-bit word and zero elsewhere.

                     Finally, we will OR those two results together.

                     Because we don't have a shift by constant in x86, we store
                     the constant 8 into a register and shift by that as needed.
                  */
                  AMD64SseOp reverse_op = op;
                  switch (op) {
                     case Asse_SHL16:
                        reverse_op = Asse_SHR16;
                        break;
                     default:
                        vpanic("Iop_ShlN8x16");
                  }
                  HReg hi = newVRegV(env);
                  addInstr(env, mk_vMOVsd_RR(greg, hi));
                  addInstr(env, AMD64Instr_SseShiftN(reverse_op, 8, hi));
                  addInstr(env, AMD64Instr_SseShiftN(op, 8+shift, hi));
                  addInstr(env, mk_vMOVsd_RR(greg, dst));
                  addInstr(env, AMD64Instr_SseShiftN(op, 8+shift, dst));
                  addInstr(env, AMD64Instr_SseShiftN(reverse_op, 8, dst));
                  addInstr(env, AMD64Instr_SseReRg(Asse_OR, hi, dst));
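                  /* For example, with shift == 3: 'hi' is the source with
                     every 16-bit lane shifted right by 8 and then left by
                     11, so each lane holds its high byte shifted left by 3
                     and a zero low byte; 'dst' is the source shifted left
                     by 11 and then right by 8, so each lane holds a zero
                     high byte and its low byte shifted left by 3.  The OR
                     just issued therefore leaves every byte of dst shifted
                     left by 3 with zeroes shifted in, as required. */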
               } else {
                  addInstr(env, mk_vMOVsd_RR(greg, dst));
                  addInstr(env, AMD64Instr_SseShiftN(op, shift, dst));
         /* Otherwise we have to do it the longwinded way. */
         AMD64RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
         AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
         HReg        ereg = newVRegV(env);
         addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
         addInstr(env, AMD64Instr_Push(rmi));
         addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
         if (laneBits == 8) {
            /* This instruction doesn't exist so we need to fake it, in the same
               way as above. */
            AMD64SseOp reverse_op = op;
            switch (op) {
               case Asse_SHL16:
                  reverse_op = Asse_SHR16;
                  break;
               default:
                  vpanic("Iop_ShlN8x16");
            }
            HReg hi = newVRegV(env);
            addInstr(env, mk_vMOVsd_RR(greg, hi));
            addInstr(env, AMD64Instr_SseShiftN(reverse_op, 8, hi));
            addInstr(env, AMD64Instr_SseShiftN(op, 8, hi));
            addInstr(env, AMD64Instr_SseReRg(op, ereg, hi));
            addInstr(env, mk_vMOVsd_RR(greg, dst));
            addInstr(env, AMD64Instr_SseShiftN(op, 8, dst));
            addInstr(env, AMD64Instr_SseReRg(op, ereg, dst));
            addInstr(env, AMD64Instr_SseShiftN(reverse_op, 8, dst));
            addInstr(env, AMD64Instr_SseReRg(Asse_OR, hi, dst));
         } else {
            addInstr(env, mk_vMOVsd_RR(greg, dst));
            addInstr(env, AMD64Instr_SseReRg(op, ereg, dst));
         }
         add_to_rsp(env, 16);
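         /* The variable shift amount is handed to the SSE shift
            instruction in an XMM register: a zero quadword and then the
            64-bit amount are pushed, the 16 bytes at 0(%rsp) are loaded
            into ereg (so the count sits in its low 64 bits, which is
            where the hardware expects it), and the 16 bytes are popped
            again afterwards via add_to_rsp. */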
      case Iop_Mul32x4:    fn = (HWord)h_generic_calc_Mul32x4;
                           goto do_SseAssistedBinary;
      case Iop_Max32Sx4:   fn = (HWord)h_generic_calc_Max32Sx4;
                           goto do_SseAssistedBinary;
      case Iop_Min32Sx4:   fn = (HWord)h_generic_calc_Min32Sx4;
                           goto do_SseAssistedBinary;
      case Iop_Max32Ux4:   fn = (HWord)h_generic_calc_Max32Ux4;
                           goto do_SseAssistedBinary;
      case Iop_Min32Ux4:   fn = (HWord)h_generic_calc_Min32Ux4;
                           goto do_SseAssistedBinary;
      case Iop_Max16Ux8:   fn = (HWord)h_generic_calc_Max16Ux8;
                           goto do_SseAssistedBinary;
      case Iop_Min16Ux8:   fn = (HWord)h_generic_calc_Min16Ux8;
                           goto do_SseAssistedBinary;
      case Iop_Max8Sx16:   fn = (HWord)h_generic_calc_Max8Sx16;
                           goto do_SseAssistedBinary;
      case Iop_Min8Sx16:   fn = (HWord)h_generic_calc_Min8Sx16;
                           goto do_SseAssistedBinary;
      case Iop_CmpEQ64x2:  fn = (HWord)h_generic_calc_CmpEQ64x2;
                           goto do_SseAssistedBinary;
      case Iop_CmpGT64Sx2: fn = (HWord)h_generic_calc_CmpGT64Sx2;
                           goto do_SseAssistedBinary;
      case Iop_Perm32x4:   fn = (HWord)h_generic_calc_Perm32x4;
                           goto do_SseAssistedBinary;
      case Iop_QNarrowBin32Sto16Ux8:
                           fn = (HWord)h_generic_calc_QNarrowBin32Sto16Ux8;
                           goto do_SseAssistedBinary;
      case Iop_NarrowBin16to8x16:
                           fn = (HWord)h_generic_calc_NarrowBin16to8x16;
                           goto do_SseAssistedBinary;
      case Iop_NarrowBin32to16x8:
                           fn = (HWord)h_generic_calc_NarrowBin32to16x8;
                           goto do_SseAssistedBinary;
      do_SseAssistedBinary: {
         /* RRRufff! RRRufff code is what we're generating here.  Oh
            well. */
         HReg dst = newVRegV(env);
         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
         HReg argp = newVRegI(env);
         /* subq $112, %rsp         -- make a space*/
         sub_from_rsp(env, 112);
         /* leaq 48(%rsp), %r_argp  -- point into it */
         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
                                        argp));
         /* andq $-16, %r_argp      -- 16-align the pointer */
         addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
                                         AMD64RMI_Imm( ~(UInt)15 ),
                                         argp));
         /* Prepare 3 arg regs:
            leaq 0(%r_argp), %rdi
            leaq 16(%r_argp), %rsi
            leaq 32(%r_argp), %rdx
         */
         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
                                        hregAMD64_RDI()));
         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
                                        hregAMD64_RSI()));
         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
                                        hregAMD64_RDX()));
         /* Store the two args, at (%rsi) and (%rdx):
            movupd  %argL, 0(%rsi)
            movupd  %argR, 0(%rdx)
         */
         addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
                                          AMD64AMode_IR(0, hregAMD64_RSI())));
         addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argR,
                                          AMD64AMode_IR(0, hregAMD64_RDX())));
         /* call the helper */
         addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
                                        3, mk_RetLoc_simple(RLPri_None) ));
         /* fetch the result from memory, using %r_argp, which the
            register allocator will keep alive across the call. */
         addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
                                          AMD64AMode_IR(0, argp)));
         /* and finally, clear the space */
         add_to_rsp(env, 112);
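         /* To summarise the frame layout used above: the 112 bytes grabbed
            from the stack leave room for a 16-aligned argp plus three
            16-byte slots.  The h_generic_calc_* helper is called with
            %rdi = &result (0(argp)), %rsi = &argL (16(argp)) and
            %rdx = &argR (32(argp)), following the usual SysV argument
            registers, and the result is read back from 0(argp). */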
      case Iop_SarN64x2: fn = (HWord)h_generic_calc_SarN64x2;
                         goto do_SseAssistedVectorAndScalar;
      case Iop_SarN8x16: fn = (HWord)h_generic_calc_SarN8x16;
                         goto do_SseAssistedVectorAndScalar;
      do_SseAssistedVectorAndScalar: {
         /* RRRufff! RRRufff code is what we're generating here.  Oh
            well. */
         HReg dst = newVRegV(env);
         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
         HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
         HReg argp = newVRegI(env);
         /* subq $112, %rsp         -- make a space*/
         sub_from_rsp(env, 112);
         /* leaq 48(%rsp), %r_argp  -- point into it */
         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
                                        argp));
         /* andq $-16, %r_argp      -- 16-align the pointer */
         addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
                                         AMD64RMI_Imm( ~(UInt)15 ),
                                         argp));
         /* Prepare 2 vector arg regs:
            leaq 0(%r_argp), %rdi
            leaq 16(%r_argp), %rsi
         */
         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
                                        hregAMD64_RDI()));
         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
                                        hregAMD64_RSI()));
         /* Store the vector arg, at (%rsi):
            movupd  %argL, 0(%rsi)
         */
         addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
                                          AMD64AMode_IR(0, hregAMD64_RSI())));
         /* And get the scalar value into rdx */
         addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RDX()));

         /* call the helper */
         addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
                                        3, mk_RetLoc_simple(RLPri_None) ));
         /* fetch the result from memory, using %r_argp, which the
            register allocator will keep alive across the call. */
         addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
                                          AMD64AMode_IR(0, argp)));
         /* and finally, clear the space */
         add_to_rsp(env, 112);
      case Iop_I32StoF32x4:
      case Iop_F32toI32Sx4: {
         HReg arg = iselVecExpr(env, e->Iex.Binop.arg2);
         HReg dst = newVRegV(env);
         AMD64SseOp mop
            = e->Iex.Binop.op == Iop_I32StoF32x4 ? Asse_I2F : Asse_F2I;
         set_SSE_rounding_mode(env, e->Iex.Binop.arg1);
         addInstr(env, AMD64Instr_Sse32Fx4(mop, arg, dst));
         set_SSE_rounding_default(env);
      // Half-float vector conversion
      case Iop_F32toF16x8: {
         if (env->hwcaps & VEX_HWCAPS_AMD64_F16C) {
            HReg srcHi, srcLo;
            iselDVecExpr(&srcHi, &srcLo, env, e->Iex.Binop.arg2);
            HReg dstHi = newVRegV(env);
            HReg dstLo = newVRegV(env);
            set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
            addInstr(env, AMD64Instr_Sse32Fx4(Asse_F32toF16, srcHi, dstHi));
            addInstr(env, AMD64Instr_Sse32Fx4(Asse_F32toF16, srcLo, dstLo));
            set_SSE_rounding_default(env);
            // Now we have the result in dstHi[63:0] and dstLo[63:0], but we
            // need to compact all that into one register.  There's probably a
            // more elegant way to do this, but ..
            addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstHi));
            // dstHi is now 127:64 = useful data, 63:0 = zero
            addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstLo));
            addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, dstLo));
            // dstLo is now 127:64 = zero, 63:0 = useful data
            addInstr(env, AMD64Instr_SseReRg(Asse_OR, dstHi, dstLo));
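            // Each F32toF16 conversion writes its four half-precision
            // results into the low 64 bits of its destination, so the OR
            // just done leaves all eight halves packed into dstLo, with
            // the results derived from srcHi in the upper 64 bits.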
   } /* switch (e->Iex.Binop.op) */
   } /* if (e->tag == Iex_Binop) */
   if (e->tag == Iex_Triop) {
   IRTriop *triop = e->Iex.Triop.details;
   switch (triop->op) {

      case Iop_Add64Fx2: op = Asse_ADDF; goto do_64Fx2_w_rm;
      case Iop_Sub64Fx2: op = Asse_SUBF; goto do_64Fx2_w_rm;
      case Iop_Mul64Fx2: op = Asse_MULF; goto do_64Fx2_w_rm;
      case Iop_Div64Fx2: op = Asse_DIVF; goto do_64Fx2_w_rm;
      do_64Fx2_w_rm: {
         HReg argL = iselVecExpr(env, triop->arg2);
         HReg argR = iselVecExpr(env, triop->arg3);
         HReg dst = newVRegV(env);
         addInstr(env, mk_vMOVsd_RR(argL, dst));
         /* XXXROUNDINGFIXME */
         /* set roundingmode here */
         addInstr(env, AMD64Instr_Sse64Fx2(op, argR, dst));

      case Iop_Add32Fx4: op = Asse_ADDF; goto do_32Fx4_w_rm;
      case Iop_Sub32Fx4: op = Asse_SUBF; goto do_32Fx4_w_rm;
      case Iop_Mul32Fx4: op = Asse_MULF; goto do_32Fx4_w_rm;
      case Iop_Div32Fx4: op = Asse_DIVF; goto do_32Fx4_w_rm;
      do_32Fx4_w_rm: {
         HReg argL = iselVecExpr(env, triop->arg2);
         HReg argR = iselVecExpr(env, triop->arg3);
         HReg dst = newVRegV(env);
         addInstr(env, mk_vMOVsd_RR(argL, dst));
         /* XXXROUNDINGFIXME */
         /* set roundingmode here */
         addInstr(env, AMD64Instr_Sse32Fx4(op, argR, dst));

   } /* switch (triop->op) */
   } /* if (e->tag == Iex_Triop) */

   if (e->tag == Iex_ITE) { // VFD
      HReg r1  = iselVecExpr(env, e->Iex.ITE.iftrue);
      HReg r0  = iselVecExpr(env, e->Iex.ITE.iffalse);
      HReg dst = newVRegV(env);
      addInstr(env, mk_vMOVsd_RR(r1,dst));
      AMD64CondCode cc = iselCondCode_C(env, e->Iex.ITE.cond);
      addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0, dst));
      return dst;
   }

   vex_printf("iselVecExpr (amd64, subarch = %s): can't reduce\n",
              LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps));
   vpanic("iselVecExpr_wrk");
}
/*---------------------------------------------------------*/
/*--- ISEL: SIMD (V256) expressions, into 2 XMM regs.    --*/
/*---------------------------------------------------------*/

static void iselDVecExpr ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
                           ISelEnv* env, const IRExpr* e )
{
   iselDVecExpr_wrk( rHi, rLo, env, e );
#  if 0
   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
#  endif
   vassert(hregClass(*rHi) == HRcVec128);
   vassert(hregClass(*rLo) == HRcVec128);
   vassert(hregIsVirtual(*rHi));
   vassert(hregIsVirtual(*rLo));
}
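/* Note on representation: a V256 value is carried as a pair of V128
   virtual registers, with *rLo holding bits 127:0 and *rHi holding bits
   255:128.  The V256 cases below simply apply the corresponding 128-bit
   strategy to each half. */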
/* DO NOT CALL THIS DIRECTLY */
static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
                               ISelEnv* env, const IRExpr* e )
{
   HWord fn = 0; /* address of helper fn, if required */
   IRType ty = typeOfIRExpr(env->type_env, e);
   vassert(ty == Ity_V256);

   AMD64SseOp op = Asse_INVALID;
   /* read 256-bit IRTemp */
   if (e->tag == Iex_RdTmp) {
      lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);

   if (e->tag == Iex_Get) {
      HReg        vHi  = newVRegV(env);
      HReg        vLo  = newVRegV(env);
      HReg        rbp  = hregAMD64_RBP();
      AMD64AMode* am0  = AMD64AMode_IR(e->Iex.Get.offset + 0,  rbp);
      AMD64AMode* am16 = AMD64AMode_IR(e->Iex.Get.offset + 16, rbp);
      addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, am0));
      addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, am16));

   if (e->tag == Iex_Load) {
      HReg        vHi  = newVRegV(env);
      HReg        vLo  = newVRegV(env);
      HReg        rA   = iselIntExpr_R(env, e->Iex.Load.addr);
      AMD64AMode* am0  = AMD64AMode_IR(0,  rA);
      AMD64AMode* am16 = AMD64AMode_IR(16, rA);
      addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, am0));
      addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, am16));

   if (e->tag == Iex_Const) {
      vassert(e->Iex.Const.con->tag == Ico_V256);
      switch (e->Iex.Const.con->Ico.V256) {
         case 0x00000000: {
            HReg vHi = generate_zeroes_V128(env);
            HReg vLo = newVRegV(env);
            addInstr(env, mk_vMOVsd_RR(vHi, vLo));
         case 0xFFFFFFFF: {
            HReg vHi = generate_ones_V128(env);
            HReg vLo = newVRegV(env);
            addInstr(env, mk_vMOVsd_RR(vHi, vLo));
         default:
            break; /* give up. Until such time as is necessary. */
4183 if (e
->tag
== Iex_Unop
) {
4184 switch (e
->Iex
.Unop
.op
) {
4188 iselDVecExpr(&argHi
, &argLo
, env
, e
->Iex
.Unop
.arg
);
4189 *rHi
= do_sse_NotV128(env
, argHi
);
4190 *rLo
= do_sse_NotV128(env
, argLo
);
4194 case Iop_RecipEst32Fx8
: op
= Asse_RCPF
; goto do_32Fx8_unary
;
4195 case Iop_Sqrt32Fx8
: op
= Asse_SQRTF
; goto do_32Fx8_unary
;
4196 case Iop_RSqrtEst32Fx8
: op
= Asse_RSQRTF
; goto do_32Fx8_unary
;
4200 iselDVecExpr(&argHi
, &argLo
, env
, e
->Iex
.Unop
.arg
);
4201 HReg dstHi
= newVRegV(env
);
4202 HReg dstLo
= newVRegV(env
);
4203 addInstr(env
, AMD64Instr_Sse32Fx4(op
, argHi
, dstHi
));
4204 addInstr(env
, AMD64Instr_Sse32Fx4(op
, argLo
, dstLo
));
4210 case Iop_Sqrt64Fx4
: op
= Asse_SQRTF
; goto do_64Fx4_unary
;
4214 iselDVecExpr(&argHi
, &argLo
, env
, e
->Iex
.Unop
.arg
);
4215 HReg dstHi
= newVRegV(env
);
4216 HReg dstLo
= newVRegV(env
);
4217 addInstr(env
, AMD64Instr_Sse64Fx2(op
, argHi
, dstHi
));
4218 addInstr(env
, AMD64Instr_Sse64Fx2(op
, argLo
, dstLo
));
4224 case Iop_CmpNEZ64x4
: {
4225 /* We can use SSE2 instructions for this. */
4226 /* Same scheme as Iop_CmpNEZ64x2, except twice as wide
4227 (obviously). See comment on Iop_CmpNEZ64x2 for
4228 explanation of what's going on here. */
4230 iselDVecExpr(&argHi
, &argLo
, env
, e
->Iex
.Unop
.arg
);
4231 HReg tmpHi
= generate_zeroes_V128(env
);
4232 HReg tmpLo
= newVRegV(env
);
4233 addInstr(env
, mk_vMOVsd_RR(tmpHi
, tmpLo
));
4234 HReg dstHi
= newVRegV(env
);
4235 HReg dstLo
= newVRegV(env
);
4236 addInstr(env
, AMD64Instr_SseReRg(Asse_CMPEQ32
, argHi
, tmpHi
));
4237 addInstr(env
, AMD64Instr_SseReRg(Asse_CMPEQ32
, argLo
, tmpLo
));
4238 tmpHi
= do_sse_NotV128(env
, tmpHi
);
4239 tmpLo
= do_sse_NotV128(env
, tmpLo
);
4240 addInstr(env
, AMD64Instr_SseShuf(0xB1, tmpHi
, dstHi
));
4241 addInstr(env
, AMD64Instr_SseShuf(0xB1, tmpLo
, dstLo
));
4242 addInstr(env
, AMD64Instr_SseReRg(Asse_OR
, tmpHi
, dstHi
));
4243 addInstr(env
, AMD64Instr_SseReRg(Asse_OR
, tmpLo
, dstLo
));
4249 case Iop_CmpNEZ32x8
: op
= Asse_CMPEQ32
; goto do_CmpNEZ_vector
;
4250 case Iop_CmpNEZ16x16
: op
= Asse_CMPEQ16
; goto do_CmpNEZ_vector
;
4251 case Iop_CmpNEZ8x32
: op
= Asse_CMPEQ8
; goto do_CmpNEZ_vector
;
4255 iselDVecExpr(&argHi
, &argLo
, env
, e
->Iex
.Unop
.arg
);
4256 HReg tmpHi
= newVRegV(env
);
4257 HReg tmpLo
= newVRegV(env
);
4258 HReg zero
= generate_zeroes_V128(env
);
4260 addInstr(env
, mk_vMOVsd_RR(argHi
, tmpHi
));
4261 addInstr(env
, mk_vMOVsd_RR(argLo
, tmpLo
));
4262 addInstr(env
, AMD64Instr_SseReRg(op
, zero
, tmpHi
));
4263 addInstr(env
, AMD64Instr_SseReRg(op
, zero
, tmpLo
));
4264 dstHi
= do_sse_NotV128(env
, tmpHi
);
4265 dstLo
= do_sse_NotV128(env
, tmpLo
);
4271 case Iop_F16toF32x8
: {
4272 if (env
->hwcaps
& VEX_HWCAPS_AMD64_F16C
) {
4273 HReg src
= iselVecExpr(env
, e
->Iex
.Unop
.arg
);
4274 HReg srcCopy
= newVRegV(env
);
4275 HReg dstHi
= newVRegV(env
);
4276 HReg dstLo
= newVRegV(env
);
4277 // Copy src, since we'll need to modify it.
4278 addInstr(env
, mk_vMOVsd_RR(src
, srcCopy
));
4279 addInstr(env
, AMD64Instr_Sse32Fx4(Asse_F16toF32
, srcCopy
, dstLo
));
4280 addInstr(env
, AMD64Instr_SseShiftN(Asse_SHR128
, 64, srcCopy
));
4281 addInstr(env
, AMD64Instr_Sse32Fx4(Asse_F16toF32
, srcCopy
, dstHi
));
4291 } /* switch (e->Iex.Unop.op) */
4292 } /* if (e->tag == Iex_Unop) */
4294 if (e
->tag
== Iex_Binop
) {
4295 switch (e
->Iex
.Binop
.op
) {
4297 case Iop_Max64Fx4
: op
= Asse_MAXF
; goto do_64Fx4
;
4298 case Iop_Min64Fx4
: op
= Asse_MINF
; goto do_64Fx4
;
4301 HReg argLhi
, argLlo
, argRhi
, argRlo
;
4302 iselDVecExpr(&argLhi
, &argLlo
, env
, e
->Iex
.Binop
.arg1
);
4303 iselDVecExpr(&argRhi
, &argRlo
, env
, e
->Iex
.Binop
.arg2
);
4304 HReg dstHi
= newVRegV(env
);
4305 HReg dstLo
= newVRegV(env
);
4306 addInstr(env
, mk_vMOVsd_RR(argLhi
, dstHi
));
4307 addInstr(env
, mk_vMOVsd_RR(argLlo
, dstLo
));
4308 addInstr(env
, AMD64Instr_Sse64Fx2(op
, argRhi
, dstHi
));
4309 addInstr(env
, AMD64Instr_Sse64Fx2(op
, argRlo
, dstLo
));
4315 case Iop_Max32Fx8
: op
= Asse_MAXF
; goto do_32Fx8
;
4316 case Iop_Min32Fx8
: op
= Asse_MINF
; goto do_32Fx8
;
4319 HReg argLhi
, argLlo
, argRhi
, argRlo
;
4320 iselDVecExpr(&argLhi
, &argLlo
, env
, e
->Iex
.Binop
.arg1
);
4321 iselDVecExpr(&argRhi
, &argRlo
, env
, e
->Iex
.Binop
.arg2
);
4322 HReg dstHi
= newVRegV(env
);
4323 HReg dstLo
= newVRegV(env
);
4324 addInstr(env
, mk_vMOVsd_RR(argLhi
, dstHi
));
4325 addInstr(env
, mk_vMOVsd_RR(argLlo
, dstLo
));
4326 addInstr(env
, AMD64Instr_Sse32Fx4(op
, argRhi
, dstHi
));
4327 addInstr(env
, AMD64Instr_Sse32Fx4(op
, argRlo
, dstLo
));
4333 case Iop_AndV256
: op
= Asse_AND
; goto do_SseReRg
;
4334 case Iop_OrV256
: op
= Asse_OR
; goto do_SseReRg
;
4335 case Iop_XorV256
: op
= Asse_XOR
; goto do_SseReRg
;
4336 case Iop_Add8x32
: op
= Asse_ADD8
; goto do_SseReRg
;
4337 case Iop_Add16x16
: op
= Asse_ADD16
; goto do_SseReRg
;
4338 case Iop_Add32x8
: op
= Asse_ADD32
; goto do_SseReRg
;
4339 case Iop_Add64x4
: op
= Asse_ADD64
; goto do_SseReRg
;
4340 case Iop_QAdd8Sx32
: op
= Asse_QADD8S
; goto do_SseReRg
;
4341 case Iop_QAdd16Sx16
: op
= Asse_QADD16S
; goto do_SseReRg
;
4342 case Iop_QAdd8Ux32
: op
= Asse_QADD8U
; goto do_SseReRg
;
4343 case Iop_QAdd16Ux16
: op
= Asse_QADD16U
; goto do_SseReRg
;
4344 case Iop_Avg8Ux32
: op
= Asse_AVG8U
; goto do_SseReRg
;
4345 case Iop_Avg16Ux16
: op
= Asse_AVG16U
; goto do_SseReRg
;
4346 case Iop_CmpEQ8x32
: op
= Asse_CMPEQ8
; goto do_SseReRg
;
4347 case Iop_CmpEQ16x16
: op
= Asse_CMPEQ16
; goto do_SseReRg
;
4348 case Iop_CmpEQ32x8
: op
= Asse_CMPEQ32
; goto do_SseReRg
;
4349 case Iop_CmpGT8Sx32
: op
= Asse_CMPGT8S
; goto do_SseReRg
;
4350 case Iop_CmpGT16Sx16
: op
= Asse_CMPGT16S
; goto do_SseReRg
;
4351 case Iop_CmpGT32Sx8
: op
= Asse_CMPGT32S
; goto do_SseReRg
;
4352 case Iop_Max16Sx16
: op
= Asse_MAX16S
; goto do_SseReRg
;
4353 case Iop_Max8Ux32
: op
= Asse_MAX8U
; goto do_SseReRg
;
4354 case Iop_Min16Sx16
: op
= Asse_MIN16S
; goto do_SseReRg
;
4355 case Iop_Min8Ux32
: op
= Asse_MIN8U
; goto do_SseReRg
;
4356 case Iop_MulHi16Ux16
: op
= Asse_MULHI16U
; goto do_SseReRg
;
4357 case Iop_MulHi16Sx16
: op
= Asse_MULHI16S
; goto do_SseReRg
;
4358 case Iop_Mul16x16
: op
= Asse_MUL16
; goto do_SseReRg
;
4359 case Iop_Sub8x32
: op
= Asse_SUB8
; goto do_SseReRg
;
4360 case Iop_Sub16x16
: op
= Asse_SUB16
; goto do_SseReRg
;
4361 case Iop_Sub32x8
: op
= Asse_SUB32
; goto do_SseReRg
;
4362 case Iop_Sub64x4
: op
= Asse_SUB64
; goto do_SseReRg
;
4363 case Iop_QSub8Sx32
: op
= Asse_QSUB8S
; goto do_SseReRg
;
4364 case Iop_QSub16Sx16
: op
= Asse_QSUB16S
; goto do_SseReRg
;
4365 case Iop_QSub8Ux32
: op
= Asse_QSUB8U
; goto do_SseReRg
;
4366 case Iop_QSub16Ux16
: op
= Asse_QSUB16U
; goto do_SseReRg
;
4369 HReg argLhi
, argLlo
, argRhi
, argRlo
;
4370 iselDVecExpr(&argLhi
, &argLlo
, env
, e
->Iex
.Binop
.arg1
);
4371 iselDVecExpr(&argRhi
, &argRlo
, env
, e
->Iex
.Binop
.arg2
);
4372 HReg dstHi
= newVRegV(env
);
4373 HReg dstLo
= newVRegV(env
);
4374 addInstr(env
, mk_vMOVsd_RR(argLhi
, dstHi
));
4375 addInstr(env
, mk_vMOVsd_RR(argLlo
, dstLo
));
4376 addInstr(env
, AMD64Instr_SseReRg(op
, argRhi
, dstHi
));
4377 addInstr(env
, AMD64Instr_SseReRg(op
, argRlo
, dstLo
));
4383 case Iop_ShlN16x16
: laneBits
= 16; op
= Asse_SHL16
; goto do_SseShift
;
4384 case Iop_ShlN32x8
: laneBits
= 32; op
= Asse_SHL32
; goto do_SseShift
;
4385 case Iop_ShlN64x4
: laneBits
= 64; op
= Asse_SHL64
; goto do_SseShift
;
4386 case Iop_SarN16x16
: laneBits
= 16; op
= Asse_SAR16
; goto do_SseShift
;
4387 case Iop_SarN32x8
: laneBits
= 32; op
= Asse_SAR32
; goto do_SseShift
;
4388 case Iop_ShrN16x16
: laneBits
= 16; op
= Asse_SHR16
; goto do_SseShift
;
4389 case Iop_ShrN32x8
: laneBits
= 32; op
= Asse_SHR32
; goto do_SseShift
;
4390 case Iop_ShrN64x4
: laneBits
= 64; op
= Asse_SHR64
; goto do_SseShift
;
4392 HReg dstHi
= newVRegV(env
);
4393 HReg dstLo
= newVRegV(env
);
4394 HReg gregHi
, gregLo
;
4395 iselDVecExpr(&gregHi
, &gregLo
, env
, e
->Iex
.Binop
.arg1
);
4396 /* If it's a shift by an in-range immediate, generate two single
4398 if (e
->Iex
.Binop
.arg2
->tag
== Iex_Const
) {
4399 IRConst
* c
= e
->Iex
.Binop
.arg2
->Iex
.Const
.con
;
4400 vassert(c
->tag
== Ico_U8
);
4401 UInt shift
= c
->Ico
.U8
;
4402 if (shift
< laneBits
) {
4403 addInstr(env
, mk_vMOVsd_RR(gregHi
, dstHi
));
4404 addInstr(env
, AMD64Instr_SseShiftN(op
, shift
, dstHi
));
4405 addInstr(env
, mk_vMOVsd_RR(gregLo
, dstLo
));
4406 addInstr(env
, AMD64Instr_SseShiftN(op
, shift
, dstLo
));
4412 /* Otherwise we have to do it the longwinded way. */
4413 AMD64RMI
* rmi
= iselIntExpr_RMI(env
, e
->Iex
.Binop
.arg2
);
4414 AMD64AMode
* rsp0
= AMD64AMode_IR(0, hregAMD64_RSP());
4415 HReg ereg
= newVRegV(env
);
4416 addInstr(env
, AMD64Instr_Push(AMD64RMI_Imm(0)));
4417 addInstr(env
, AMD64Instr_Push(rmi
));
4418 addInstr(env
, AMD64Instr_SseLdSt(True
/*load*/, 16, ereg
, rsp0
));
4419 addInstr(env
, mk_vMOVsd_RR(gregHi
, dstHi
));
4420 addInstr(env
, AMD64Instr_SseReRg(op
, ereg
, dstHi
));
4421 addInstr(env
, mk_vMOVsd_RR(gregLo
, dstLo
));
4422 addInstr(env
, AMD64Instr_SseReRg(op
, ereg
, dstLo
));
4423 add_to_rsp(env
, 16);
      case Iop_V128HLtoV256: {
         // Curiously, there doesn't seem to be any benefit to be had here by
         // checking whether arg1 and arg2 are the same, in the style of how
         // (eg) 64HLtoV128 is handled elsewhere in this file.
         *rHi = iselVecExpr(env, e->Iex.Binop.arg1);
         *rLo = iselVecExpr(env, e->Iex.Binop.arg2);
4438 case Iop_Mul32x8
: fn
= (HWord
)h_generic_calc_Mul32x4
;
4439 goto do_SseAssistedBinary
;
4440 case Iop_Max32Sx8
: fn
= (HWord
)h_generic_calc_Max32Sx4
;
4441 goto do_SseAssistedBinary
;
4442 case Iop_Min32Sx8
: fn
= (HWord
)h_generic_calc_Min32Sx4
;
4443 goto do_SseAssistedBinary
;
4444 case Iop_Max32Ux8
: fn
= (HWord
)h_generic_calc_Max32Ux4
;
4445 goto do_SseAssistedBinary
;
4446 case Iop_Min32Ux8
: fn
= (HWord
)h_generic_calc_Min32Ux4
;
4447 goto do_SseAssistedBinary
;
4448 case Iop_Max16Ux16
: fn
= (HWord
)h_generic_calc_Max16Ux8
;
4449 goto do_SseAssistedBinary
;
4450 case Iop_Min16Ux16
: fn
= (HWord
)h_generic_calc_Min16Ux8
;
4451 goto do_SseAssistedBinary
;
4452 case Iop_Max8Sx32
: fn
= (HWord
)h_generic_calc_Max8Sx16
;
4453 goto do_SseAssistedBinary
;
4454 case Iop_Min8Sx32
: fn
= (HWord
)h_generic_calc_Min8Sx16
;
4455 goto do_SseAssistedBinary
;
4456 case Iop_CmpEQ64x4
: fn
= (HWord
)h_generic_calc_CmpEQ64x2
;
4457 goto do_SseAssistedBinary
;
4458 case Iop_CmpGT64Sx4
: fn
= (HWord
)h_generic_calc_CmpGT64Sx2
;
4459 goto do_SseAssistedBinary
;
4460 do_SseAssistedBinary
: {
4461 /* RRRufff! RRRufff code is what we're generating here. Oh
4464 HReg dstHi
= newVRegV(env
);
4465 HReg dstLo
= newVRegV(env
);
4466 HReg argLhi
, argLlo
, argRhi
, argRlo
;
4467 iselDVecExpr(&argLhi
, &argLlo
, env
, e
->Iex
.Binop
.arg1
);
4468 iselDVecExpr(&argRhi
, &argRlo
, env
, e
->Iex
.Binop
.arg2
);
4469 HReg argp
= newVRegI(env
);
4470 /* subq $160, %rsp -- make a space*/
4471 sub_from_rsp(env
, 160);
4472 /* leaq 48(%rsp), %r_argp -- point into it */
4473 addInstr(env
, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
4475 /* andq $-16, %r_argp -- 16-align the pointer */
4476 addInstr(env
, AMD64Instr_Alu64R(Aalu_AND
,
4477 AMD64RMI_Imm( ~(UInt
)15 ),
4479 /* Prepare 3 arg regs:
4480 leaq 0(%r_argp), %rdi
4481 leaq 16(%r_argp), %rsi
4482 leaq 32(%r_argp), %rdx
4484 addInstr(env
, AMD64Instr_Lea64(AMD64AMode_IR(0, argp
),
4486 addInstr(env
, AMD64Instr_Lea64(AMD64AMode_IR(16, argp
),
4488 addInstr(env
, AMD64Instr_Lea64(AMD64AMode_IR(32, argp
),
4490 /* Store the two high args, at (%rsi) and (%rdx):
4491 movupd %argLhi, 0(%rsi)
4492 movupd %argRhi, 0(%rdx)
4494 addInstr(env
, AMD64Instr_SseLdSt(False
/*!isLoad*/, 16, argLhi
,
4495 AMD64AMode_IR(0, hregAMD64_RSI())));
4496 addInstr(env
, AMD64Instr_SseLdSt(False
/*!isLoad*/, 16, argRhi
,
4497 AMD64AMode_IR(0, hregAMD64_RDX())));
4498 /* Store the two low args, at 48(%rsi) and 48(%rdx):
4499 movupd %argLlo, 48(%rsi)
4500 movupd %argRlo, 48(%rdx)
4502 addInstr(env
, AMD64Instr_SseLdSt(False
/*!isLoad*/, 16, argLlo
,
4503 AMD64AMode_IR(48, hregAMD64_RSI())));
4504 addInstr(env
, AMD64Instr_SseLdSt(False
/*!isLoad*/, 16, argRlo
,
4505 AMD64AMode_IR(48, hregAMD64_RDX())));
4506 /* call the helper */
4507 addInstr(env
, AMD64Instr_Call( Acc_ALWAYS
, (ULong
)fn
, 3,
4508 mk_RetLoc_simple(RLPri_None
) ));
4509 /* Prepare 3 arg regs:
4510 leaq 48(%r_argp), %rdi
4511 leaq 64(%r_argp), %rsi
4512 leaq 80(%r_argp), %rdx
4514 addInstr(env
, AMD64Instr_Lea64(AMD64AMode_IR(48, argp
),
4516 addInstr(env
, AMD64Instr_Lea64(AMD64AMode_IR(64, argp
),
4518 addInstr(env
, AMD64Instr_Lea64(AMD64AMode_IR(80, argp
),
4520 /* call the helper */
4521 addInstr(env
, AMD64Instr_Call( Acc_ALWAYS
, (ULong
)fn
, 3,
4522 mk_RetLoc_simple(RLPri_None
) ));
4523 /* fetch the result from memory, using %r_argp, which the
4524 register allocator will keep alive across the call. */
4525 addInstr(env
, AMD64Instr_SseLdSt(True
/*isLoad*/, 16, dstHi
,
4526 AMD64AMode_IR(0, argp
)));
4527 addInstr(env
, AMD64Instr_SseLdSt(True
/*isLoad*/, 16, dstLo
,
4528 AMD64AMode_IR(48, argp
)));
4529 /* and finally, clear the space */
4530 add_to_rsp(env
, 160);
4536 case Iop_Perm32x8
: fn
= (HWord
)h_generic_calc_Perm32x8
;
4537 goto do_SseAssistedBinary256
;
4538 do_SseAssistedBinary256
: {
4539 /* RRRufff! RRRufff code is what we're generating here. Oh
4542 HReg dstHi
= newVRegV(env
);
4543 HReg dstLo
= newVRegV(env
);
4544 HReg argLhi
, argLlo
, argRhi
, argRlo
;
4545 iselDVecExpr(&argLhi
, &argLlo
, env
, e
->Iex
.Binop
.arg1
);
4546 iselDVecExpr(&argRhi
, &argRlo
, env
, e
->Iex
.Binop
.arg2
);
4547 HReg argp
= newVRegI(env
);
4548 /* subq $160, %rsp -- make a space*/
4549 sub_from_rsp(env
, 160);
4550 /* leaq 48(%rsp), %r_argp -- point into it */
4551 addInstr(env
, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
4553 /* andq $-16, %r_argp -- 16-align the pointer */
4554 addInstr(env
, AMD64Instr_Alu64R(Aalu_AND
,
4555 AMD64RMI_Imm( ~(UInt
)15 ),
4557 /* Prepare 3 arg regs:
4558 leaq 0(%r_argp), %rdi
4559 leaq 32(%r_argp), %rsi
4560 leaq 64(%r_argp), %rdx
4562 addInstr(env
, AMD64Instr_Lea64(AMD64AMode_IR(0, argp
),
4564 addInstr(env
, AMD64Instr_Lea64(AMD64AMode_IR(32, argp
),
4566 addInstr(env
, AMD64Instr_Lea64(AMD64AMode_IR(64, argp
),
4568 /* Store the two args, at (%rsi) and (%rdx):
4569 movupd %argLlo, 0(%rsi)
4570 movupd %argLhi, 16(%rsi)
4571 movupd %argRlo, 0(%rdx)
4572 movupd %argRhi, 16(%rdx)
4574 addInstr(env
, AMD64Instr_SseLdSt(False
/*!isLoad*/, 16, argLlo
,
4575 AMD64AMode_IR(0, hregAMD64_RSI())));
4576 addInstr(env
, AMD64Instr_SseLdSt(False
/*!isLoad*/, 16, argLhi
,
4577 AMD64AMode_IR(16, hregAMD64_RSI())));
4578 addInstr(env
, AMD64Instr_SseLdSt(False
/*!isLoad*/, 16, argRlo
,
4579 AMD64AMode_IR(0, hregAMD64_RDX())));
4580 addInstr(env
, AMD64Instr_SseLdSt(False
/*!isLoad*/, 16, argRhi
,
4581 AMD64AMode_IR(16, hregAMD64_RDX())));
4582 /* call the helper */
4583 addInstr(env
, AMD64Instr_Call( Acc_ALWAYS
, (ULong
)fn
, 3,
4584 mk_RetLoc_simple(RLPri_None
) ));
4585 /* fetch the result from memory, using %r_argp, which the
4586 register allocator will keep alive across the call. */
4587 addInstr(env
, AMD64Instr_SseLdSt(True
/*isLoad*/, 16, dstLo
,
4588 AMD64AMode_IR(0, argp
)));
4589 addInstr(env
, AMD64Instr_SseLdSt(True
/*isLoad*/, 16, dstHi
,
4590 AMD64AMode_IR(16, argp
)));
4591 /* and finally, clear the space */
4592 add_to_rsp(env
, 160);
4598 case Iop_I32StoF32x8
:
4599 case Iop_F32toI32Sx8
: {
4601 iselDVecExpr(&argHi
, &argLo
, env
, e
->Iex
.Binop
.arg2
);
4602 HReg dstHi
= newVRegV(env
);
4603 HReg dstLo
= newVRegV(env
);
4605 = e
->Iex
.Binop
.op
== Iop_I32StoF32x8
? Asse_I2F
: Asse_F2I
;
4606 set_SSE_rounding_mode(env
, e
->Iex
.Binop
.arg1
);
4607 addInstr(env
, AMD64Instr_Sse32Fx4(mop
, argHi
, dstHi
));
4608 addInstr(env
, AMD64Instr_Sse32Fx4(mop
, argLo
, dstLo
));
4609 set_SSE_rounding_default(env
);
4617 } /* switch (e->Iex.Binop.op) */
4618 } /* if (e->tag == Iex_Binop) */
4620 if (e
->tag
== Iex_Triop
) {
4621 IRTriop
*triop
= e
->Iex
.Triop
.details
;
4622 switch (triop
->op
) {
4624 case Iop_Add64Fx4
: op
= Asse_ADDF
; goto do_64Fx4_w_rm
;
4625 case Iop_Sub64Fx4
: op
= Asse_SUBF
; goto do_64Fx4_w_rm
;
4626 case Iop_Mul64Fx4
: op
= Asse_MULF
; goto do_64Fx4_w_rm
;
4627 case Iop_Div64Fx4
: op
= Asse_DIVF
; goto do_64Fx4_w_rm
;
4630 HReg argLhi
, argLlo
, argRhi
, argRlo
;
4631 iselDVecExpr(&argLhi
, &argLlo
, env
, triop
->arg2
);
4632 iselDVecExpr(&argRhi
, &argRlo
, env
, triop
->arg3
);
4633 HReg dstHi
= newVRegV(env
);
4634 HReg dstLo
= newVRegV(env
);
4635 addInstr(env
, mk_vMOVsd_RR(argLhi
, dstHi
));
4636 addInstr(env
, mk_vMOVsd_RR(argLlo
, dstLo
));
4637 /* XXXROUNDINGFIXME */
4638 /* set roundingmode here */
4639 addInstr(env
, AMD64Instr_Sse64Fx2(op
, argRhi
, dstHi
));
4640 addInstr(env
, AMD64Instr_Sse64Fx2(op
, argRlo
, dstLo
));
4646 case Iop_Add32Fx8
: op
= Asse_ADDF
; goto do_32Fx8_w_rm
;
4647 case Iop_Sub32Fx8
: op
= Asse_SUBF
; goto do_32Fx8_w_rm
;
4648 case Iop_Mul32Fx8
: op
= Asse_MULF
; goto do_32Fx8_w_rm
;
4649 case Iop_Div32Fx8
: op
= Asse_DIVF
; goto do_32Fx8_w_rm
;
4652 HReg argLhi
, argLlo
, argRhi
, argRlo
;
4653 iselDVecExpr(&argLhi
, &argLlo
, env
, triop
->arg2
);
4654 iselDVecExpr(&argRhi
, &argRlo
, env
, triop
->arg3
);
4655 HReg dstHi
= newVRegV(env
);
4656 HReg dstLo
= newVRegV(env
);
4657 addInstr(env
, mk_vMOVsd_RR(argLhi
, dstHi
));
4658 addInstr(env
, mk_vMOVsd_RR(argLlo
, dstLo
));
4659 /* XXXROUNDINGFIXME */
4660 /* set roundingmode here */
4661 addInstr(env
, AMD64Instr_Sse32Fx4(op
, argRhi
, dstHi
));
4662 addInstr(env
, AMD64Instr_Sse32Fx4(op
, argRlo
, dstLo
));
4670 } /* switch (triop->op) */
4671 } /* if (e->tag == Iex_Triop) */
4674 if (e
->tag
== Iex_Qop
&& e
->Iex
.Qop
.details
->op
== Iop_64x4toV256
) {
4675 const IRExpr
* arg1
= e
->Iex
.Qop
.details
->arg1
;
4676 const IRExpr
* arg2
= e
->Iex
.Qop
.details
->arg2
;
4677 const IRExpr
* arg3
= e
->Iex
.Qop
.details
->arg3
;
4678 const IRExpr
* arg4
= e
->Iex
.Qop
.details
->arg4
;
4679 // If the args are trivially the same (tmp or const), use the same
4680 // source register for all four, and only one movq since those are
4681 // (relatively) expensive.
4682 if (areAtomsAndEqual(arg1
, arg2
)
4683 && areAtomsAndEqual(arg1
, arg3
) && areAtomsAndEqual(arg1
, arg4
)) {
4684 HReg q3
= iselIntExpr_R(env
, e
->Iex
.Qop
.details
->arg1
);
4685 HReg tmp
= newVRegV(env
);
4686 HReg dst
= newVRegV(env
);
4687 addInstr(env
, AMD64Instr_SseMOVQ(q3
, dst
, True
/*toXMM*/));
4688 addInstr(env
, mk_vMOVsd_RR(dst
, tmp
));
4689 addInstr(env
, AMD64Instr_SseShiftN(Asse_SHL128
, 64, dst
));
4690 addInstr(env
, AMD64Instr_SseReRg(Asse_OR
, tmp
, dst
));
4694 /* arg1 is the most significant (Q3), arg4 the least (Q0) */
4695 HReg q3
= iselIntExpr_R(env
, arg1
);
4696 HReg q2
= iselIntExpr_R(env
, arg2
);
4697 HReg q1
= iselIntExpr_R(env
, arg3
);
4698 HReg q0
= iselIntExpr_R(env
, arg4
);
4699 HReg tmp
= newVRegV(env
);
4700 HReg dstHi
= newVRegV(env
);
4701 HReg dstLo
= newVRegV(env
);
4702 addInstr(env
, AMD64Instr_SseMOVQ(q3
, dstHi
, True
/*toXMM*/));
4703 addInstr(env
, AMD64Instr_SseShiftN(Asse_SHL128
, 64, dstHi
));
4704 addInstr(env
, AMD64Instr_SseMOVQ(q2
, tmp
, True
/*toXMM*/));
4705 addInstr(env
, AMD64Instr_SseReRg(Asse_OR
, tmp
, dstHi
));
4706 addInstr(env
, AMD64Instr_SseMOVQ(q1
, dstLo
, True
/*toXMM*/));
4707 addInstr(env
, AMD64Instr_SseShiftN(Asse_SHL128
, 64, dstLo
));
4708 addInstr(env
, AMD64Instr_SseMOVQ(q0
, tmp
, True
/*toXMM*/));
4709 addInstr(env
, AMD64Instr_SseReRg(Asse_OR
, tmp
, dstLo
));
4716 if (e
->tag
== Iex_ITE
) {
4717 HReg r1Hi
, r1Lo
, r0Hi
, r0Lo
;
4718 iselDVecExpr(&r1Hi
, &r1Lo
, env
, e
->Iex
.ITE
.iftrue
);
4719 iselDVecExpr(&r0Hi
, &r0Lo
, env
, e
->Iex
.ITE
.iffalse
);
4720 HReg dstHi
= newVRegV(env
);
4721 HReg dstLo
= newVRegV(env
);
4722 addInstr(env
, mk_vMOVsd_RR(r1Hi
,dstHi
));
4723 addInstr(env
, mk_vMOVsd_RR(r1Lo
,dstLo
));
4724 AMD64CondCode cc
= iselCondCode_C(env
, e
->Iex
.ITE
.cond
);
4725 addInstr(env
, AMD64Instr_SseCMov(cc
^ 1, r0Hi
, dstHi
));
4726 addInstr(env
, AMD64Instr_SseCMov(cc
^ 1, r0Lo
, dstLo
));
4733 vex_printf("iselDVecExpr (amd64, subarch = %s): can't reduce\n",
4734 LibVEX_ppVexHwCaps(VexArchAMD64
, env
->hwcaps
));
4736 vpanic("iselDVecExpr_wrk");
4740 /*---------------------------------------------------------*/
4741 /*--- ISEL: Statements ---*/
4742 /*---------------------------------------------------------*/
4744 static void iselStmt ( ISelEnv
* env
, IRStmt
* stmt
)
4746 if (vex_traceflags
& VEX_TRACE_VCODE
) {
4747 vex_printf("\n-- ");
4752 switch (stmt
->tag
) {
4754 /* --------- LOADG (guarded load) --------- */
4756 IRLoadG
* lg
= stmt
->Ist
.LoadG
.details
;
4757 if (lg
->end
!= Iend_LE
)
4760 UChar szB
= 0; /* invalid */
4762 case ILGop_Ident32
: szB
= 4; break;
4763 case ILGop_Ident64
: szB
= 8; break;
4764 case ILGop_IdentV128
: szB
= 16; break;
4771 = iselIntExpr_AMode(env
, lg
->addr
);
4773 = szB
== 16 ? iselVecExpr(env
, lg
->alt
)
4774 : iselIntExpr_R(env
, lg
->alt
);
4776 = lookupIRTemp(env
, lg
->dst
);
4778 /* Get the alt value into the dst. We'll do a conditional load
4779 which overwrites it -- or not -- with loaded data. */
4781 addInstr(env
, mk_vMOVsd_RR(rAlt
, rDst
));
4783 addInstr(env
, mk_iMOVsd_RR(rAlt
, rDst
));
4785 AMD64CondCode cc
= iselCondCode_C(env
, lg
->guard
);
4787 addInstr(env
, AMD64Instr_SseCLoad(cc
, amAddr
, rDst
));
4789 addInstr(env
, AMD64Instr_CLoad(cc
, szB
, amAddr
, rDst
));
4794 /* --------- STOREG (guarded store) --------- */
4796 IRStoreG
* sg
= stmt
->Ist
.StoreG
.details
;
4797 if (sg
->end
!= Iend_LE
)
4800 UChar szB
= 0; /* invalid */
4801 switch (typeOfIRExpr(env
->type_env
, sg
->data
)) {
4802 case Ity_I32
: szB
= 4; break;
4803 case Ity_I64
: szB
= 8; break;
4804 case Ity_V128
: szB
= 16; break;
4811 = iselIntExpr_AMode(env
, sg
->addr
);
4813 = szB
== 16 ? iselVecExpr(env
, sg
->data
)
4814 : iselIntExpr_R(env
, sg
->data
);
4816 = iselCondCode_C(env
, sg
->guard
);
4818 addInstr(env
, AMD64Instr_SseCStore(cc
, rSrc
, amAddr
));
4820 addInstr(env
, AMD64Instr_CStore(cc
, szB
, rSrc
, amAddr
));
4825 /* --------- STORE --------- */
4827 IRType tya
= typeOfIRExpr(env
->type_env
, stmt
->Ist
.Store
.addr
);
4828 IRType tyd
= typeOfIRExpr(env
->type_env
, stmt
->Ist
.Store
.data
);
4829 IREndness end
= stmt
->Ist
.Store
.end
;
4831 if (tya
!= Ity_I64
|| end
!= Iend_LE
)
4834 if (tyd
== Ity_I64
) {
4835 AMD64AMode
* am
= iselIntExpr_AMode(env
, stmt
->Ist
.Store
.addr
);
4836 AMD64RI
* ri
= iselIntExpr_RI(env
, stmt
->Ist
.Store
.data
);
4837 addInstr(env
, AMD64Instr_Alu64M(Aalu_MOV
,ri
,am
));
4840 if (tyd
== Ity_I8
|| tyd
== Ity_I16
|| tyd
== Ity_I32
) {
4841 AMD64AMode
* am
= iselIntExpr_AMode(env
, stmt
->Ist
.Store
.addr
);
4842 HReg r
= iselIntExpr_R(env
, stmt
->Ist
.Store
.data
);
4843 addInstr(env
, AMD64Instr_Store(
4844 toUChar(tyd
==Ity_I8
? 1 : (tyd
==Ity_I16
? 2 : 4)),
4848 if (tyd
== Ity_F64
) {
4849 AMD64AMode
* am
= iselIntExpr_AMode(env
, stmt
->Ist
.Store
.addr
);
4850 HReg r
= iselDblExpr(env
, stmt
->Ist
.Store
.data
);
4851 addInstr(env
, AMD64Instr_SseLdSt(False
/*store*/, 8, r
, am
));
4854 if (tyd
== Ity_F32
) {
4855 AMD64AMode
* am
= iselIntExpr_AMode(env
, stmt
->Ist
.Store
.addr
);
4856 HReg r
= iselFltExpr(env
, stmt
->Ist
.Store
.data
);
4857 addInstr(env
, AMD64Instr_SseLdSt(False
/*store*/, 4, r
, am
));
4860 if (tyd
== Ity_V128
) {
4861 AMD64AMode
* am
= iselIntExpr_AMode(env
, stmt
->Ist
.Store
.addr
);
4862 HReg r
= iselVecExpr(env
, stmt
->Ist
.Store
.data
);
4863 addInstr(env
, AMD64Instr_SseLdSt(False
/*store*/, 16, r
, am
));
4866 if (tyd
== Ity_V256
) {
4867 HReg rA
= iselIntExpr_R(env
, stmt
->Ist
.Store
.addr
);
4868 AMD64AMode
* am0
= AMD64AMode_IR(0, rA
);
4869 AMD64AMode
* am16
= AMD64AMode_IR(16, rA
);
4871 iselDVecExpr(&vHi
, &vLo
, env
, stmt
->Ist
.Store
.data
);
4872 addInstr(env
, AMD64Instr_SseLdSt(False
/*store*/, 16, vLo
, am0
));
4873 addInstr(env
, AMD64Instr_SseLdSt(False
/*store*/, 16, vHi
, am16
));
4879 /* --------- PUT --------- */
4881 IRType ty
= typeOfIRExpr(env
->type_env
, stmt
->Ist
.Put
.data
);
4882 if (ty
== Ity_I64
) {
4883 /* We're going to write to memory, so compute the RHS into an
4885 AMD64RI
* ri
= iselIntExpr_RI(env
, stmt
->Ist
.Put
.data
);
4890 AMD64AMode_IR(stmt
->Ist
.Put
.offset
,
4895 if (ty
== Ity_I8
|| ty
== Ity_I16
|| ty
== Ity_I32
) {
4896 HReg r
= iselIntExpr_R(env
, stmt
->Ist
.Put
.data
);
4897 addInstr(env
, AMD64Instr_Store(
4898 toUChar(ty
==Ity_I8
? 1 : (ty
==Ity_I16
? 2 : 4)),
4900 AMD64AMode_IR(stmt
->Ist
.Put
.offset
,
4904 if (ty
== Ity_F32
) {
4905 HReg f32
= iselFltExpr(env
, stmt
->Ist
.Put
.data
);
4906 AMD64AMode
* am
= AMD64AMode_IR(stmt
->Ist
.Put
.offset
, hregAMD64_RBP());
4907 set_SSE_rounding_default(env
); /* paranoia */
4908 addInstr(env
, AMD64Instr_SseLdSt( False
/*store*/, 4, f32
, am
));
4911 if (ty
== Ity_F64
) {
4912 HReg f64
= iselDblExpr(env
, stmt
->Ist
.Put
.data
);
4913 AMD64AMode
* am
= AMD64AMode_IR( stmt
->Ist
.Put
.offset
,
4915 addInstr(env
, AMD64Instr_SseLdSt( False
/*store*/, 8, f64
, am
));
4918 if (ty
== Ity_V128
) {
4919 HReg vec
= iselVecExpr(env
, stmt
->Ist
.Put
.data
);
4920 AMD64AMode
* am
= AMD64AMode_IR(stmt
->Ist
.Put
.offset
,
4922 addInstr(env
, AMD64Instr_SseLdSt(False
/*store*/, 16, vec
, am
));
4925 if (ty
== Ity_V256
) {
4927 iselDVecExpr(&vHi
, &vLo
, env
, stmt
->Ist
.Put
.data
);
4928 HReg rbp
= hregAMD64_RBP();
4929 AMD64AMode
* am0
= AMD64AMode_IR(stmt
->Ist
.Put
.offset
+ 0, rbp
);
4930 AMD64AMode
* am16
= AMD64AMode_IR(stmt
->Ist
.Put
.offset
+ 16, rbp
);
4931 addInstr(env
, AMD64Instr_SseLdSt(False
/*store*/, 16, vLo
, am0
));
4932 addInstr(env
, AMD64Instr_SseLdSt(False
/*store*/, 16, vHi
, am16
));
4938 /* --------- Indexed PUT --------- */
4940 IRPutI
*puti
= stmt
->Ist
.PutI
.details
;
4943 = genGuestArrayOffset(
4945 puti
->ix
, puti
->bias
);
4947 IRType ty
= typeOfIRExpr(env
->type_env
, puti
->data
);
4948 if (ty
== Ity_F64
) {
4949 HReg val
= iselDblExpr(env
, puti
->data
);
4950 addInstr(env
, AMD64Instr_SseLdSt( False
/*store*/, 8, val
, am
));
4954 HReg r
= iselIntExpr_R(env
, puti
->data
);
4955 addInstr(env
, AMD64Instr_Store( 1, r
, am
));
4958 if (ty
== Ity_I64
) {
4959 AMD64RI
* ri
= iselIntExpr_RI(env
, puti
->data
);
4960 addInstr(env
, AMD64Instr_Alu64M( Aalu_MOV
, ri
, am
));
4966 /* --------- TMP --------- */
4968 IRTemp tmp
= stmt
->Ist
.WrTmp
.tmp
;
4969 IRType ty
= typeOfIRTemp(env
->type_env
, tmp
);
4971 /* optimisation: if stmt->Ist.WrTmp.data is Add64(..,..),
4972 compute it into an AMode and then use LEA. This usually
4973 produces fewer instructions, often because (for memcheck
4974 created IR) we get t = address-expression, (t is later used
4975 twice) and so doing this naturally turns address-expression
4976 back into an AMD64 amode. */
4978 && stmt
->Ist
.WrTmp
.data
->tag
== Iex_Binop
4979 && stmt
->Ist
.WrTmp
.data
->Iex
.Binop
.op
== Iop_Add64
) {
4980 AMD64AMode
* am
= iselIntExpr_AMode(env
, stmt
->Ist
.WrTmp
.data
);
4981 HReg dst
= lookupIRTemp(env
, tmp
);
4982 if (am
->tag
== Aam_IR
&& am
->Aam
.IR
.imm
== 0) {
4983 /* Hmm, iselIntExpr_AMode wimped out and just computed the
4984 value into a register. Just emit a normal reg-reg move
4985 so reg-alloc can coalesce it away in the usual way. */
4986 HReg src
= am
->Aam
.IR
.reg
;
4987 addInstr(env
, AMD64Instr_Alu64R(Aalu_MOV
, AMD64RMI_Reg(src
), dst
));
4989 addInstr(env
, AMD64Instr_Lea64(am
,dst
));
4994 if (ty
== Ity_I64
|| ty
== Ity_I32
4995 || ty
== Ity_I16
|| ty
== Ity_I8
) {
4996 AMD64RMI
* rmi
= iselIntExpr_RMI(env
, stmt
->Ist
.WrTmp
.data
);
4997 HReg dst
= lookupIRTemp(env
, tmp
);
4998 addInstr(env
, AMD64Instr_Alu64R(Aalu_MOV
,rmi
,dst
));
5001 if (ty
== Ity_I128
) {
5002 HReg rHi
, rLo
, dstHi
, dstLo
;
5003 iselInt128Expr(&rHi
,&rLo
, env
, stmt
->Ist
.WrTmp
.data
);
5004 lookupIRTempPair( &dstHi
, &dstLo
, env
, tmp
);
5005 addInstr(env
, mk_iMOVsd_RR(rHi
,dstHi
) );
5006 addInstr(env
, mk_iMOVsd_RR(rLo
,dstLo
) );
5010 AMD64CondCode cond
= iselCondCode_C(env
, stmt
->Ist
.WrTmp
.data
);
5011 HReg dst
= lookupIRTemp(env
, tmp
);
5012 addInstr(env
, AMD64Instr_Set64(cond
, dst
));
5015 if (ty
== Ity_F64
) {
5016 HReg dst
= lookupIRTemp(env
, tmp
);
5017 HReg src
= iselDblExpr(env
, stmt
->Ist
.WrTmp
.data
);
5018 addInstr(env
, mk_vMOVsd_RR(src
, dst
));
5021 if (ty
== Ity_F32
) {
5022 HReg dst
= lookupIRTemp(env
, tmp
);
5023 HReg src
= iselFltExpr(env
, stmt
->Ist
.WrTmp
.data
);
5024 addInstr(env
, mk_vMOVsd_RR(src
, dst
));
5027 if (ty
== Ity_V128
) {
5028 HReg dst
= lookupIRTemp(env
, tmp
);
5029 HReg src
= iselVecExpr(env
, stmt
->Ist
.WrTmp
.data
);
5030 addInstr(env
, mk_vMOVsd_RR(src
, dst
));
5033 if (ty
== Ity_V256
) {
5034 HReg rHi
, rLo
, dstHi
, dstLo
;
5035 iselDVecExpr(&rHi
,&rLo
, env
, stmt
->Ist
.WrTmp
.data
);
5036 lookupIRTempPair( &dstHi
, &dstLo
, env
, tmp
);
5037 addInstr(env
, mk_vMOVsd_RR(rHi
,dstHi
) );
5038 addInstr(env
, mk_vMOVsd_RR(rLo
,dstLo
) );
5044 /* --------- Call to DIRTY helper --------- */
5046 IRDirty
* d
= stmt
->Ist
.Dirty
.details
;
5048 /* Figure out the return type, if any. */
5049 IRType retty
= Ity_INVALID
;
5050 if (d
->tmp
!= IRTemp_INVALID
)
5051 retty
= typeOfIRTemp(env
->type_env
, d
->tmp
);
5053 /* Throw out any return types we don't know about. */
5054 Bool retty_ok
= False
;
5056 case Ity_INVALID
: /* function doesn't return anything */
5057 case Ity_I64
: case Ity_I32
: case Ity_I16
: case Ity_I8
:
5058 case Ity_V128
: case Ity_V256
:
5059 retty_ok
= True
; break;
5064 break; /* will go to stmt_fail: */
5066 /* Marshal args, do the call, and set the return value to
5067 0x555..555 if this is a conditional call that returns a value
5068 and the call is skipped. */
5070 RetLoc rloc
= mk_RetLoc_INVALID();
5071 doHelperCall( &addToSp
, &rloc
, env
, d
->guard
, d
->cee
, retty
, d
->args
);
5072 vassert(is_sane_RetLoc(rloc
));
5074 /* Now figure out what to do with the returned value, if any. */
5077 /* No return value. Nothing to do. */
5078 vassert(d
->tmp
== IRTemp_INVALID
);
5079 vassert(rloc
.pri
== RLPri_None
);
5080 vassert(addToSp
== 0);
5083 case Ity_I64
: case Ity_I32
: case Ity_I16
: case Ity_I8
: {
5084 /* The returned value is in %rax. Park it in the register
5085 associated with tmp. */
5086 vassert(rloc
.pri
== RLPri_Int
);
5087 vassert(addToSp
== 0);
5088 HReg dst
= lookupIRTemp(env
, d
->tmp
);
5089 addInstr(env
, mk_iMOVsd_RR(hregAMD64_RAX(),dst
) );
5093 /* The returned value is on the stack, and rloc.spOff
5094 tells us where. Fish it off the stack and then move
5095 the stack pointer upwards to clear it, as directed by
5097 vassert(rloc
.pri
== RLPri_V128SpRel
);
5098 vassert(addToSp
>= 16);
5099 HReg dst
= lookupIRTemp(env
, d
->tmp
);
5100 AMD64AMode
* am
= AMD64AMode_IR(rloc
.spOff
, hregAMD64_RSP());
5101 addInstr(env
, AMD64Instr_SseLdSt( True
/*load*/, 16, dst
, am
));
5102 add_to_rsp(env
, addToSp
);
5106 /* See comments for Ity_V128. */
5107 vassert(rloc
.pri
== RLPri_V256SpRel
);
5108 vassert(addToSp
>= 32);
5110 lookupIRTempPair(&dstHi
, &dstLo
, env
, d
->tmp
);
5111 AMD64AMode
* amLo
= AMD64AMode_IR(rloc
.spOff
, hregAMD64_RSP());
5112 addInstr(env
, AMD64Instr_SseLdSt( True
/*load*/, 16, dstLo
, amLo
));
5113 AMD64AMode
* amHi
= AMD64AMode_IR(rloc
.spOff
+16, hregAMD64_RSP());
5114 addInstr(env
, AMD64Instr_SseLdSt( True
/*load*/, 16, dstHi
, amHi
));
5115 add_to_rsp(env
, addToSp
);
5125 /* --------- MEM FENCE --------- */
5127 switch (stmt
->Ist
.MBE
.event
) {
5129 addInstr(env
, AMD64Instr_MFence());
5136 /* --------- ACAS --------- */
5138 if (stmt
->Ist
.CAS
.details
->oldHi
== IRTemp_INVALID
) {
5139 /* "normal" singleton CAS */
5141 IRCAS
* cas
= stmt
->Ist
.CAS
.details
;
5142 IRType ty
= typeOfIRExpr(env
->type_env
, cas
->dataLo
);
5143 /* get: cas->expd into %rax, and cas->data into %rbx */
5144 AMD64AMode
* am
= iselIntExpr_AMode(env
, cas
->addr
);
5145 HReg rData
= iselIntExpr_R(env
, cas
->dataLo
);
5146 HReg rExpd
= iselIntExpr_R(env
, cas
->expdLo
);
5147 HReg rOld
= lookupIRTemp(env
, cas
->oldLo
);
5148 vassert(cas
->expdHi
== NULL
);
5149 vassert(cas
->dataHi
== NULL
);
5150 addInstr(env
, mk_iMOVsd_RR(rExpd
, rOld
));
5151 addInstr(env
, mk_iMOVsd_RR(rExpd
, hregAMD64_RAX()));
5152 addInstr(env
, mk_iMOVsd_RR(rData
, hregAMD64_RBX()));
5154 case Ity_I64
: sz
= 8; break;
5155 case Ity_I32
: sz
= 4; break;
5156 case Ity_I16
: sz
= 2; break;
5157 case Ity_I8
: sz
= 1; break;
5158 default: goto unhandled_cas
;
5160 addInstr(env
, AMD64Instr_ACAS(am
, sz
));
5161 addInstr(env
, AMD64Instr_CMov64(Acc_NZ
, hregAMD64_RAX(), rOld
));
5166 IRCAS
* cas
= stmt
->Ist
.CAS
.details
;
5167 IRType ty
= typeOfIRExpr(env
->type_env
, cas
->dataLo
);
5168 /* only 32-bit and 64-bit allowed in this case */
5169 /* get: cas->expdLo into %rax, and cas->dataLo into %rbx */
5170 /* get: cas->expdHi into %rdx, and cas->dataHi into %rcx */
5171 AMD64AMode
* am
= iselIntExpr_AMode(env
, cas
->addr
);
5172 HReg rDataHi
= iselIntExpr_R(env
, cas
->dataHi
);
5173 HReg rDataLo
= iselIntExpr_R(env
, cas
->dataLo
);
5174 HReg rExpdHi
= iselIntExpr_R(env
, cas
->expdHi
);
5175 HReg rExpdLo
= iselIntExpr_R(env
, cas
->expdLo
);
5176 HReg rOldHi
= lookupIRTemp(env
, cas
->oldHi
);
5177 HReg rOldLo
= lookupIRTemp(env
, cas
->oldLo
);
5180 if (!(env
->hwcaps
& VEX_HWCAPS_AMD64_CX16
))
5181 goto unhandled_cas
; /* we'd have to generate
5182 cmpxchg16b, but the host
5183 doesn't support that */
5192 addInstr(env
, mk_iMOVsd_RR(rExpdHi
, rOldHi
));
5193 addInstr(env
, mk_iMOVsd_RR(rExpdLo
, rOldLo
));
5194 addInstr(env
, mk_iMOVsd_RR(rExpdHi
, hregAMD64_RDX()));
5195 addInstr(env
, mk_iMOVsd_RR(rExpdLo
, hregAMD64_RAX()));
5196 addInstr(env
, mk_iMOVsd_RR(rDataHi
, hregAMD64_RCX()));
5197 addInstr(env
, mk_iMOVsd_RR(rDataLo
, hregAMD64_RBX()));
5198 addInstr(env
, AMD64Instr_DACAS(am
, sz
));
5199 addInstr(env
, AMD64Instr_CMov64(Acc_NZ
, hregAMD64_RDX(), rOldHi
));
5200 addInstr(env
, AMD64Instr_CMov64(Acc_NZ
, hregAMD64_RAX(), rOldLo
));
5206 /* --------- INSTR MARK --------- */
5207 /* Doesn't generate any executable code ... */
5211 /* --------- ABI HINT --------- */
5212 /* These have no meaning (denotation in the IR) and so we ignore
5213 them ... if any actually made it this far. */
5217 /* --------- NO-OP --------- */
5221 /* --------- EXIT --------- */
5223 if (stmt
->Ist
.Exit
.dst
->tag
!= Ico_U64
)
5224 vpanic("iselStmt(amd64): Ist_Exit: dst is not a 64-bit value");
5226 AMD64CondCode cc
= iselCondCode_C(env
, stmt
->Ist
.Exit
.guard
);
5227 AMD64AMode
* amRIP
= AMD64AMode_IR(stmt
->Ist
.Exit
.offsIP
,
5230 /* Case: boring transfer to known address */
5231 if (stmt
->Ist
.Exit
.jk
== Ijk_Boring
) {
5232 if (env
->chainingAllowed
) {
5233 /* .. almost always true .. */
5234 /* Skip the event check at the dst if this is a forwards
5237 = ((Addr64
)stmt
->Ist
.Exit
.dst
->Ico
.U64
) > env
->max_ga
;
5238 if (0) vex_printf("%s", toFastEP
? "Y" : ",");
5239 addInstr(env
, AMD64Instr_XDirect(stmt
->Ist
.Exit
.dst
->Ico
.U64
,
5240 amRIP
, cc
, toFastEP
));
5242 /* .. very occasionally .. */
5243 /* We can't use chaining, so ask for an assisted transfer,
5244 as that's the only alternative that is allowable. */
5245 HReg r
= iselIntExpr_R(env
, IRExpr_Const(stmt
->Ist
.Exit
.dst
));
5246 addInstr(env
, AMD64Instr_XAssisted(r
, amRIP
, cc
, Ijk_Boring
));
5251 /* Case: assisted transfer to arbitrary address */
5252 switch (stmt
->Ist
.Exit
.jk
) {
5253 /* Keep this list in sync with that in iselNext below */
5261 case Ijk_Sys_syscall
:
5262 case Ijk_Sys_int210
:
5263 case Ijk_InvalICache
:
5266 HReg r
= iselIntExpr_R(env
, IRExpr_Const(stmt
->Ist
.Exit
.dst
));
5267 addInstr(env
, AMD64Instr_XAssisted(r
, amRIP
, cc
, stmt
->Ist
.Exit
.jk
));
5274 /* Do we ever expect to see any other kind? */
5282 vpanic("iselStmt(amd64)");
5286 /*---------------------------------------------------------*/
5287 /*--- ISEL: Basic block terminators (Nexts) ---*/
5288 /*---------------------------------------------------------*/
5290 static void iselNext ( ISelEnv
* env
,
5291 IRExpr
* next
, IRJumpKind jk
, Int offsIP
)
5293 if (vex_traceflags
& VEX_TRACE_VCODE
) {
5294 vex_printf( "\n-- PUT(%d) = ", offsIP
);
5296 vex_printf( "; exit-");
   /* Case: boring transfer to known address */
   if (next->tag == Iex_Const) {
      IRConst* cdst = next->Iex.Const.con;
      vassert(cdst->tag == Ico_U64);
      if (jk == Ijk_Boring || jk == Ijk_Call) {
         /* Boring transfer to known address */
         AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
         if (env->chainingAllowed) {
            /* .. almost always true .. */
            /* Skip the event check at the dst if this is a forwards
               edge. */
            Bool toFastEP
               = ((Addr64)cdst->Ico.U64) > env->max_ga;
            if (0) vex_printf("%s", toFastEP ? "X" : ".");
            addInstr(env, AMD64Instr_XDirect(cdst->Ico.U64,
                                             amRIP, Acc_ALWAYS,
                                             toFastEP));
         } else {
            /* .. very occasionally .. */
            /* We can't use chaining, so ask for an indirect transfer,
               as that's the cheapest alternative that is
               allowable. */
            HReg r = iselIntExpr_R(env, next);
            addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS,
                                               Ijk_Boring));
         }
         return;
      }
   }
   /* Case: call/return (==boring) transfer to any address */
   switch (jk) {
      case Ijk_Boring: case Ijk_Ret: case Ijk_Call: {
         HReg        r     = iselIntExpr_R(env, next);
         AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
         if (env->chainingAllowed) {
            addInstr(env, AMD64Instr_XIndir(r, amRIP, Acc_ALWAYS));
         } else {
            addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS,
                                               Ijk_Boring));
         }
         return;
      }
      default:
         break;
   }
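   /* Explanatory note (not in the original source): roughly speaking,
      three flavours of transfer are generated here.  XDirect is a jump
      to a known guest address that can later be chained (patched) to go
      straight to the destination translation; XIndir is a jump to a
      guest address held in a register, routed through the dispatcher's
      fast lookup; XAssisted also returns to the dispatcher but carries
      a jump-kind code, and is the fallback whenever chaining is
      disallowed or the jump kind needs run-time assistance (syscalls,
      icache invalidation, and so on). */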
   /* Case: assisted transfer to arbitrary address */
   switch (jk) {
      /* Keep this list in sync with that for Ist_Exit above */
      case Ijk_Sys_syscall:
      case Ijk_Sys_int210:
      case Ijk_InvalICache:
      {
         HReg        r     = iselIntExpr_R(env, next);
         AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
         addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS, jk));
         return;
      }
      default:
         break;
   }

   vex_printf( "\n-- PUT(%d) = ", offsIP);
   ppIRExpr( next );
   vex_printf( "; exit-");
   ppIRJumpKind(jk);
   vex_printf( "\n");
   vassert(0); // are we expecting any other kind?
}
/*---------------------------------------------------------*/
/*--- Insn selector top-level                           ---*/
/*---------------------------------------------------------*/

/* Translate an entire SB to amd64 code. */
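/* Explanatory note (not in the original source): the shape of the
   generated code is, roughly: an event check first, then an optional
   profiling-counter increment, then the selected instructions for each
   IR statement in order, and finally the code for the block's
   terminator (iselNext).  Everything is emitted using virtual
   registers; the count recorded in env->code->n_vregs at the end is
   what the register allocator later uses to map them onto real
   registers. */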
HInstrArray* iselSB_AMD64 ( const IRSB* bb,
                            VexArch      arch_host,
                            const VexArchInfo* archinfo_host,
                            const VexAbiInfo*  vbi/*UNUSED*/,
                            Int offs_Host_EvC_Counter,
                            Int offs_Host_EvC_FailAddr,
                            Bool chainingAllowed,
                            Bool addProfInc,
                            Addr max_ga )
{
   Int        i, j;
   HReg       hreg, hregHI;
   ISelEnv*   env;
   UInt       hwcaps_host = archinfo_host->hwcaps;
   AMD64AMode *amCounter, *amFailAddr;

   /* sanity ... */
   vassert(arch_host == VexArchAMD64);
   vassert(0 == (hwcaps_host
                 & ~(VEX_HWCAPS_AMD64_SSE3
                     | VEX_HWCAPS_AMD64_SSSE3
                     | VEX_HWCAPS_AMD64_CX16
                     | VEX_HWCAPS_AMD64_LZCNT
                     | VEX_HWCAPS_AMD64_AVX
                     | VEX_HWCAPS_AMD64_RDTSCP
                     | VEX_HWCAPS_AMD64_BMI
                     | VEX_HWCAPS_AMD64_AVX2
                     | VEX_HWCAPS_AMD64_F16C
                     | VEX_HWCAPS_AMD64_RDRAND
                     | VEX_HWCAPS_AMD64_RDSEED
                     | VEX_HWCAPS_AMD64_FMA3
                     | VEX_HWCAPS_AMD64_FMA4
                     )));

   /* Check that the host's endianness is as expected. */
   vassert(archinfo_host->endness == VexEndnessLE);
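   /* Explanatory note (not in the original source): hwcaps_host is the
      set of optional host features the caller reported; it is stashed
      in env->hwcaps below and consulted during selection -- e.g. the
      VEX_HWCAPS_AMD64_CX16 test earlier in this file refuses to emit
      cmpxchg16b on hosts that lack it. */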
   /* Make up an initial environment to use. */
   env = LibVEX_Alloc_inline(sizeof(ISelEnv));

   /* Set up output code array. */
   env->code = newHInstrArray();

   /* Copy BB's type env. */
   env->type_env = bb->tyenv;

   /* Make up an IRTemp -> virtual HReg mapping.  This doesn't
      change as we go along. */
   env->n_vregmap = bb->tyenv->types_used;
   env->vregmap   = LibVEX_Alloc_inline(env->n_vregmap * sizeof(HReg));
   env->vregmapHI = LibVEX_Alloc_inline(env->n_vregmap * sizeof(HReg));

   /* and finally ... */
   env->chainingAllowed = chainingAllowed;
   env->hwcaps          = hwcaps_host;
   env->max_ga          = max_ga;
   /* For each IR temporary, allocate a suitably-kinded virtual
      register. */
   j = 0;
   for (i = 0; i < env->n_vregmap; i++) {
      hregHI = hreg = INVALID_HREG;
      switch (bb->tyenv->types[i]) {
         case Ity_I1:
         case Ity_I8: case Ity_I16: case Ity_I32: case Ity_I64:
            hreg = mkHReg(True, HRcInt64, 0, j++);
            break;
         case Ity_I128:
            hreg   = mkHReg(True, HRcInt64, 0, j++);
            hregHI = mkHReg(True, HRcInt64, 0, j++);
            break;
         case Ity_F32: case Ity_F64: case Ity_V128:
            hreg = mkHReg(True, HRcVec128, 0, j++);
            break;
         case Ity_V256:
            hreg   = mkHReg(True, HRcVec128, 0, j++);
            hregHI = mkHReg(True, HRcVec128, 0, j++);
            break;
         default:
            ppIRType(bb->tyenv->types[i]);
            vpanic("iselBB(amd64): IRTemp type");
      }
      env->vregmap[i]   = hreg;
      env->vregmapHI[i] = hregHI;
   }
   env->vreg_ctr = j;
   /* The very first instruction must be an event check. */
   amCounter  = AMD64AMode_IR(offs_Host_EvC_Counter,  hregAMD64_RBP());
   amFailAddr = AMD64AMode_IR(offs_Host_EvC_FailAddr, hregAMD64_RBP());
   addInstr(env, AMD64Instr_EvCheck(amCounter, amFailAddr));
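   /* Explanatory note (not in the original source): the event check
      emitted here amounts to roughly

         decl  <amCounter>         // one fewer event allowed
         jns   ok                  // still non-negative: keep going
         jmp  *<amFailAddr>        // counter expired: back to dispatcher
       ok:

      Both locations sit at fixed offsets from the guest-state pointer
      in %rbp, which is why the amodes are built relative to
      hregAMD64_RBP(), and why this must be the very first instruction
      of the translation. */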
   /* Possibly a block counter increment (for profiling).  At this
      point we don't know the address of the counter, so just pretend
      it is zero.  It will have to be patched later, but before this
      translation is used, by a call to LibVEX_PatchProfInc. */
   if (addProfInc) {
      addInstr(env, AMD64Instr_ProfInc());
   }
   /* Ok, finally we can iterate over the statements. */
   for (i = 0; i < bb->stmts_used; i++)
      if (bb->stmts[i])
         iselStmt(env, bb->stmts[i]);

   iselNext(env, bb->next, bb->jumpkind, bb->offsIP);

   /* record the number of vregs we used. */
   env->code->n_vregs = env->vreg_ctr;
   return env->code;
}
/*---------------------------------------------------------------*/
/*--- end                                   host_amd64_isel.c ---*/
/*---------------------------------------------------------------*/