2 /*---------------------------------------------------------------*/
3 /*--- begin host_amd64_isel.c ---*/
4 /*---------------------------------------------------------------*/
6 /*
7 This file is part of Valgrind, a dynamic binary instrumentation
8 framework.
10 Copyright (C) 2004-2017 OpenWorks LLP
11 info@open-works.net
13 This program is free software; you can redistribute it and/or
14 modify it under the terms of the GNU General Public License as
15 published by the Free Software Foundation; either version 2 of the
16 License, or (at your option) any later version.
18 This program is distributed in the hope that it will be useful, but
19 WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with this program; if not, see <http://www.gnu.org/licenses/>.
26 The GNU General Public License is contained in the file COPYING.
28 Neither the names of the U.S. Department of Energy nor the
29 University of California nor the names of its contributors may be
30 used to endorse or promote products derived from this software
31 without prior written permission.
34 #include "libvex_basictypes.h"
35 #include "libvex_ir.h"
36 #include "libvex.h"
38 #include "ir_match.h"
39 #include "main_util.h"
40 #include "main_globals.h"
41 #include "host_generic_regs.h"
42 #include "host_generic_simd64.h"
43 #include "host_generic_simd128.h"
44 #include "host_generic_simd256.h"
45 #include "host_generic_maddf.h"
46 #include "host_amd64_defs.h"
49 /*---------------------------------------------------------*/
50 /*--- x87/SSE control word stuff ---*/
51 /*---------------------------------------------------------*/
53 /* Vex-generated code expects to run with the FPU set as follows: all
54 exceptions masked, round-to-nearest, precision = 53 bits. This
55 corresponds to a FPU control word value of 0x027F.
57 Similarly the SSE control word (%mxcsr) should be 0x1F80.
59 %fpucw and %mxcsr should have these values on entry to
60 Vex-generated code, and those values should be
61 unchanged at exit.
64 #define DEFAULT_FPUCW 0x027F
66 #define DEFAULT_MXCSR 0x1F80
68 /* debugging only, do not use */
69 /* define DEFAULT_FPUCW 0x037F */
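/* A rough decoding of these values: in the x87 control word 0x027F,
   bits 0..5 are the exception masks (all set), bits 8..9 the precision
   control (10 = 53-bit) and bits 10..11 the rounding control (00 =
   nearest).  In %mxcsr = 0x1F80, bits 7..12 are the exception masks and
   bits 13..14 the rounding control (00 = nearest). */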
72 /*---------------------------------------------------------*/
73 /*--- misc helpers ---*/
74 /*---------------------------------------------------------*/
76 /* These are duplicated in guest-amd64/toIR.c */
77 static IRExpr* unop ( IROp op, IRExpr* a )
79 return IRExpr_Unop(op, a);
82 static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
84 return IRExpr_Binop(op, a1, a2);
87 static IRExpr* bind ( Int binder )
89 return IRExpr_Binder(binder);
92 static Bool isZeroU8 ( const IRExpr* e )
94 return e->tag == Iex_Const
95 && e->Iex.Const.con->tag == Ico_U8
96 && e->Iex.Const.con->Ico.U8 == 0;
100 /*---------------------------------------------------------*/
101 /*--- ISelEnv ---*/
102 /*---------------------------------------------------------*/
104 /* This carries around:
106 - A mapping from IRTemp to IRType, giving the type of any IRTemp we
107 might encounter. This is computed before insn selection starts,
108 and does not change.
110 - A mapping from IRTemp to HReg. This tells the insn selector
111 which virtual register is associated with each IRTemp
112 temporary. This is computed before insn selection starts, and
113 does not change. We expect this mapping to map precisely the
114 same set of IRTemps as the type mapping does.
116 - vregmap holds the primary register for the IRTemp.
117 - vregmapHI is only used for 128-bit integer-typed
118 IRTemps. It holds the identity of a second
119 64-bit virtual HReg, which holds the high half
120 of the value.
122 - The host subarchitecture we are selecting insns for.
123 This is set at the start and does not change.
125 - The code array, that is, the insns selected so far.
127 - A counter, for generating new virtual registers.
129 - A Bool for indicating whether we may generate chain-me
130 instructions for control flow transfers, or whether we must use
131 XAssisted.
133 - The maximum guest address of any guest insn in this block.
134 Actually, the address of the highest-addressed byte from any insn
135 in this block. Is set at the start and does not change. This is
136 used for detecting jumps which are definitely forward-edges from
137 this block, and therefore can be made (chained) to the fast entry
138 point of the destination, thereby avoiding the destination's
139 event check.
141 Note, this is all host-independent. (JRS 20050201: well, kinda
142 ... not completely. Compare with ISelEnv for X86.)
145 typedef
146 struct {
147 /* Constants -- set at the start and do not change. */
148 IRTypeEnv* type_env;
150 HReg* vregmap;
151 HReg* vregmapHI;
152 Int n_vregmap;
154 UInt hwcaps;
156 Bool chainingAllowed;
157 Addr64 max_ga;
159 /* These are modified as we go along. */
160 HInstrArray* code;
161 Int vreg_ctr;
163 ISelEnv;
166 static HReg lookupIRTemp ( ISelEnv* env, IRTemp tmp )
168 vassert(tmp >= 0);
169 vassert(tmp < env->n_vregmap);
170 return env->vregmap[tmp];
173 static void lookupIRTempPair ( HReg* vrHI, HReg* vrLO,
174 ISelEnv* env, IRTemp tmp )
176 vassert(tmp >= 0);
177 vassert(tmp < env->n_vregmap);
178 vassert(! hregIsInvalid(env->vregmapHI[tmp]));
179 *vrLO = env->vregmap[tmp];
180 *vrHI = env->vregmapHI[tmp];
183 static void addInstr ( ISelEnv* env, AMD64Instr* instr )
185 addHInstr(env->code, instr);
186 if (vex_traceflags & VEX_TRACE_VCODE) {
187 ppAMD64Instr(instr, True);
188 vex_printf("\n");
192 static HReg newVRegI ( ISelEnv* env )
194 HReg reg = mkHReg(True/*virtual reg*/, HRcInt64, 0/*enc*/, env->vreg_ctr);
195 env->vreg_ctr++;
196 return reg;
199 static HReg newVRegV ( ISelEnv* env )
201 HReg reg = mkHReg(True/*virtual reg*/, HRcVec128, 0/*enc*/, env->vreg_ctr);
202 env->vreg_ctr++;
203 return reg;
207 /*---------------------------------------------------------*/
208 /*--- ISEL: Forward declarations ---*/
209 /*---------------------------------------------------------*/
211 /* These are organised as iselXXX and iselXXX_wrk pairs. The
212 iselXXX_wrk do the real work, but are not to be called directly.
213 For each XXX, iselXXX calls its iselXXX_wrk counterpart, then
214 checks that all returned registers are virtual. You should not
215 call the _wrk version directly.
217 static AMD64RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, const IRExpr* e );
218 static AMD64RMI* iselIntExpr_RMI ( ISelEnv* env, const IRExpr* e );
220 static AMD64RI* iselIntExpr_RI_wrk ( ISelEnv* env, const IRExpr* e );
221 static AMD64RI* iselIntExpr_RI ( ISelEnv* env, const IRExpr* e );
223 static AMD64RM* iselIntExpr_RM_wrk ( ISelEnv* env, const IRExpr* e );
224 static AMD64RM* iselIntExpr_RM ( ISelEnv* env, const IRExpr* e );
226 static HReg iselIntExpr_R_wrk ( ISelEnv* env, const IRExpr* e );
227 static HReg iselIntExpr_R ( ISelEnv* env, const IRExpr* e );
229 static AMD64AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, const IRExpr* e );
230 static AMD64AMode* iselIntExpr_AMode ( ISelEnv* env, const IRExpr* e );
232 static void iselInt128Expr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
233 ISelEnv* env, const IRExpr* e );
234 static void iselInt128Expr ( /*OUT*/HReg* rHi, HReg* rLo,
235 ISelEnv* env, const IRExpr* e );
237 static AMD64CondCode iselCondCode_wrk ( ISelEnv* env, const IRExpr* e );
238 static AMD64CondCode iselCondCode ( ISelEnv* env, const IRExpr* e );
240 static HReg iselDblExpr_wrk ( ISelEnv* env, const IRExpr* e );
241 static HReg iselDblExpr ( ISelEnv* env, const IRExpr* e );
243 static HReg iselFltExpr_wrk ( ISelEnv* env, const IRExpr* e );
244 static HReg iselFltExpr ( ISelEnv* env, const IRExpr* e );
246 static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e );
247 static HReg iselVecExpr ( ISelEnv* env, const IRExpr* e );
249 static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
250 ISelEnv* env, const IRExpr* e );
251 static void iselDVecExpr ( /*OUT*/HReg* rHi, HReg* rLo,
252 ISelEnv* env, const IRExpr* e );
255 /*---------------------------------------------------------*/
256 /*--- ISEL: Misc helpers ---*/
257 /*---------------------------------------------------------*/
259 static Bool sane_AMode ( AMD64AMode* am )
261 switch (am->tag) {
262 case Aam_IR:
263 return
264 toBool( hregClass(am->Aam.IR.reg) == HRcInt64
265 && (hregIsVirtual(am->Aam.IR.reg)
266 || sameHReg(am->Aam.IR.reg, hregAMD64_RBP())) );
267 case Aam_IRRS:
268 return
269 toBool( hregClass(am->Aam.IRRS.base) == HRcInt64
270 && hregIsVirtual(am->Aam.IRRS.base)
271 && hregClass(am->Aam.IRRS.index) == HRcInt64
272 && hregIsVirtual(am->Aam.IRRS.index) );
273 default:
274 vpanic("sane_AMode: unknown amd64 amode tag");
279 /* Can the lower 32 bits be signedly widened to produce the whole
280 64-bit value? In other words, are the top 33 bits either all 0 or
281 all 1 ? */
282 static Bool fitsIn32Bits ( ULong x )
284 Long y1;
285 y1 = x << 32;
286 y1 >>=/*s*/ 32;
287 return toBool(x == y1);
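/* Illustrative examples: 0x000000007FFFFFFFULL and 0xFFFFFFFF80000000ULL
   fit (each equals the sign extension of its low 32 bits), whereas
   0x0000000080000000ULL does not. */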
290 /* Is this a 64-bit zero expression? */
292 static Bool isZeroU64 ( const IRExpr* e )
294 return e->tag == Iex_Const
295 && e->Iex.Const.con->tag == Ico_U64
296 && e->Iex.Const.con->Ico.U64 == 0ULL;
299 static Bool isZeroU32 ( const IRExpr* e )
301 return e->tag == Iex_Const
302 && e->Iex.Const.con->tag == Ico_U32
303 && e->Iex.Const.con->Ico.U32 == 0;
306 /* Are both args atoms and the same? This is a copy of eqIRAtom
307 that omits the assertions that the args are indeed atoms. */
309 static Bool areAtomsAndEqual ( const IRExpr* a1, const IRExpr* a2 )
311 if (a1->tag == Iex_RdTmp && a2->tag == Iex_RdTmp)
312 return toBool(a1->Iex.RdTmp.tmp == a2->Iex.RdTmp.tmp);
313 if (a1->tag == Iex_Const && a2->tag == Iex_Const)
314 return eqIRConst(a1->Iex.Const.con, a2->Iex.Const.con);
315 return False;
318 /* Make an int reg-reg move. */
320 static AMD64Instr* mk_iMOVsd_RR ( HReg src, HReg dst )
322 vassert(hregClass(src) == HRcInt64);
323 vassert(hregClass(dst) == HRcInt64);
324 return AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst);
327 /* Make a vector (128 bit) reg-reg move. */
329 static AMD64Instr* mk_vMOVsd_RR ( HReg src, HReg dst )
331 vassert(hregClass(src) == HRcVec128);
332 vassert(hregClass(dst) == HRcVec128);
333 return AMD64Instr_SseReRg(Asse_MOV, src, dst);
336 /* Advance/retreat %rsp by n. */
338 static void add_to_rsp ( ISelEnv* env, Int n )
340 vassert(n > 0 && n < 256 && (n%8) == 0);
341 addInstr(env,
342 AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(n),
343 hregAMD64_RSP()));
346 static void sub_from_rsp ( ISelEnv* env, Int n )
348 vassert(n > 0 && n < 256 && (n%8) == 0);
349 addInstr(env,
350 AMD64Instr_Alu64R(Aalu_SUB, AMD64RMI_Imm(n),
351 hregAMD64_RSP()));
354 /* Push 64-bit constants on the stack. */
355 static void push_uimm64( ISelEnv* env, ULong uimm64 )
357 /* If uimm64 can be expressed as the sign extension of its
358 lower 32 bits, we can do it the easy way. */
359 Long simm64 = (Long)uimm64;
360 if ( simm64 == ((Long)(uimm64 << 32) >> 32) ) {
361 addInstr( env, AMD64Instr_Push(AMD64RMI_Imm( (UInt)uimm64 )) );
362 } else {
363 HReg tmp = newVRegI(env);
364 addInstr( env, AMD64Instr_Imm64(uimm64, tmp) );
365 addInstr( env, AMD64Instr_Push(AMD64RMI_Reg(tmp)) );
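/* Illustrative example: 0xFFFFFFFF80000000ULL takes the single-push
   path, since 'pushq imm32' sign-extends to 64 bits, whereas
   0x1122334455667788ULL needs the Imm64-into-a-scratch-register path. */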
370 /* Used only in doHelperCall. If possible, produce a single
371 instruction which computes 'e' into 'dst'. If not possible, return
372 NULL. */
374 static AMD64Instr* iselIntExpr_single_instruction ( ISelEnv* env,
375 HReg dst,
376 IRExpr* e )
378 /* Per comments in doHelperCall below, appearance of
379 Iex_VECRET implies ill-formed IR. */
380 vassert(e->tag != Iex_VECRET);
382 /* In this case we give out a copy of the BaseBlock pointer. */
383 if (UNLIKELY(e->tag == Iex_GSPTR)) {
384 return mk_iMOVsd_RR( hregAMD64_RBP(), dst );
387 vassert(typeOfIRExpr(env->type_env, e) == Ity_I64);
389 if (e->tag == Iex_Const) {
390 vassert(e->Iex.Const.con->tag == Ico_U64);
391 if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
392 return AMD64Instr_Alu64R(
393 Aalu_MOV,
394 AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64)),
397 } else {
398 return AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, dst);
402 if (e->tag == Iex_RdTmp) {
403 HReg src = lookupIRTemp(env, e->Iex.RdTmp.tmp);
404 return mk_iMOVsd_RR(src, dst);
407 if (e->tag == Iex_Get) {
408 vassert(e->Iex.Get.ty == Ity_I64);
409 return AMD64Instr_Alu64R(
410 Aalu_MOV,
411 AMD64RMI_Mem(
412 AMD64AMode_IR(e->Iex.Get.offset,
413 hregAMD64_RBP())),
414 dst);
417 if (e->tag == Iex_Unop
418 && e->Iex.Unop.op == Iop_32Uto64
419 && e->Iex.Unop.arg->tag == Iex_RdTmp) {
420 HReg src = lookupIRTemp(env, e->Iex.Unop.arg->Iex.RdTmp.tmp);
421 return AMD64Instr_MovxLQ(False, src, dst);
424 if (0) { ppIRExpr(e); vex_printf("\n"); }
426 return NULL;
430 /* Do a complete function call. |guard| is a Ity_Bit expression
431 indicating whether or not the call happens. If guard==NULL, the
432 call is unconditional. |retloc| is set to indicate where the
433 return value is after the call. The caller (of this fn) must
434 generate code to add |stackAdjustAfterCall| to the stack pointer
435 after the call is done. */
437 static
438 void doHelperCall ( /*OUT*/UInt* stackAdjustAfterCall,
439 /*OUT*/RetLoc* retloc,
440 ISelEnv* env,
441 IRExpr* guard,
442 IRCallee* cee, IRType retTy, IRExpr** args )
444 AMD64CondCode cc;
445 HReg argregs[6];
446 HReg tmpregs[6];
447 AMD64Instr* fastinstrs[6];
448 UInt n_args, i;
450 /* Set default returns. We'll update them later if needed. */
451 *stackAdjustAfterCall = 0;
452 *retloc = mk_RetLoc_INVALID();
454 /* These are used for cross-checking that IR-level constraints on
455 the use of IRExpr_VECRET() and IRExpr_GSPTR() are observed. */
456 UInt nVECRETs = 0;
457 UInt nGSPTRs = 0;
459 /* Marshal args for a call and do the call.
461 This function only deals with a tiny set of possibilities, which
462 cover all helpers in practice. The restrictions are that only
463 arguments in registers are supported, hence only 6x64 integer
464 bits in total can be passed. In fact the only supported arg
465 type is I64.
467 The return type can be I{64,32,16,8} or V{128,256}. In the
468 latter two cases, it is expected that |args| will contain the
469 special node IRExpr_VECRET(), in which case this routine
470 generates code to allocate space on the stack for the vector
471 return value. Since we are not passing any scalars on the
472 stack, it is enough to preallocate the return space before
473 marshalling any arguments, in this case.
475 |args| may also contain IRExpr_GSPTR(), in which case the
476 value in %rbp is passed as the corresponding argument.
478 Generating code which is both efficient and correct when
479 parameters are to be passed in registers is difficult, for the
480 reasons elaborated in detail in comments attached to
481 doHelperCall() in priv/host-x86/isel.c. Here, we use a variant
482 of the method described in those comments.
484 The problem is split into two cases: the fast scheme and the
485 slow scheme. In the fast scheme, arguments are computed
486 directly into the target (real) registers. This is only safe
487 when we can be sure that computation of each argument will not
488 trash any real registers set by computation of any other
489 argument.
491 In the slow scheme, all args are first computed into vregs, and
492 once they are all done, they are moved to the relevant real
493 regs. This always gives correct code, but it also gives a bunch
494 of vreg-to-rreg moves which are usually redundant but are hard
495 for the register allocator to get rid of.
497 To decide which scheme to use, all argument expressions are
498 first examined. If they are all so simple that it is clear they
499 will be evaluated without use of any fixed registers, use the
500 fast scheme, else use the slow scheme. Note also that only
501 unconditional calls may use the fast scheme, since having to
502 compute a condition expression could itself trash real
503 registers. Note that for simplicity, in the case where
504 IRExpr_VECRET() is present, we use the slow scheme. This is
505 motivated by the desire to avoid any possible complexity
506 w.r.t. nested calls.
508 Note this requires being able to examine an expression and
509 determine whether or not evaluation of it might use a fixed
510 register. That requires knowledge of how the rest of this insn
511 selector works. Currently just the following 3 are regarded as
512 safe -- hopefully they cover the majority of arguments in
513 practice: IRExpr_Tmp IRExpr_Const IRExpr_Get.
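/* As an illustration: a call foo(t1, GSPTR, 0x42) can use the fast
   scheme, since each argument is a single move into %rdi/%rsi/%rdx,
   whereas foo(Add64(t1,t2)) forces the slow scheme because the Add64
   cannot be computed into its target register in one instruction. */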
516 /* Note that the cee->regparms field is meaningless on AMD64 host
517 (since there is only one calling convention) and so we always
518 ignore it. */
519 n_args = 0;
520 for (i = 0; args[i]; i++)
521 n_args++;
523 if (n_args > 6)
524 vpanic("doHelperCall(AMD64): cannot currently handle > 6 args");
526 argregs[0] = hregAMD64_RDI();
527 argregs[1] = hregAMD64_RSI();
528 argregs[2] = hregAMD64_RDX();
529 argregs[3] = hregAMD64_RCX();
530 argregs[4] = hregAMD64_R8();
531 argregs[5] = hregAMD64_R9();
533 tmpregs[0] = tmpregs[1] = tmpregs[2] =
534 tmpregs[3] = tmpregs[4] = tmpregs[5] = INVALID_HREG;
536 fastinstrs[0] = fastinstrs[1] = fastinstrs[2] =
537 fastinstrs[3] = fastinstrs[4] = fastinstrs[5] = NULL;
539 /* First decide which scheme (slow or fast) is to be used. First
540 assume the fast scheme, and select slow if any contraindications
541 (wow) appear. */
543 /* We'll need space on the stack for the return value. Avoid
544 possible complications with nested calls by using the slow
545 scheme. */
546 if (retTy == Ity_V128 || retTy == Ity_V256)
547 goto slowscheme;
549 if (guard) {
550 if (guard->tag == Iex_Const
551 && guard->Iex.Const.con->tag == Ico_U1
552 && guard->Iex.Const.con->Ico.U1 == True) {
553 /* unconditional */
554 } else {
555 /* Not manifestly unconditional -- be conservative. */
556 goto slowscheme;
560 /* Ok, let's try for the fast scheme. If it doesn't pan out, we'll
561 use the slow scheme. Because this is tentative, we can't call
562 addInstr (that is, commit to) any instructions until we're
563 handled all the arguments. So park the resulting instructions
564 in a buffer and emit that if we're successful. */
566 /* FAST SCHEME */
567 /* In this loop, we process args that can be computed into the
568 destination (real) register with a single instruction, without
569 using any fixed regs. That also includes IRExpr_GSPTR(), but
570 not IRExpr_VECRET(). Indeed, if the IR is well-formed, we can
571 never see IRExpr_VECRET() at this point, since the return-type
572 check above should ensure all those cases use the slow scheme
573 instead. */
574 vassert(n_args >= 0 && n_args <= 6);
575 for (i = 0; i < n_args; i++) {
576 IRExpr* arg = args[i];
577 if (LIKELY(!is_IRExpr_VECRET_or_GSPTR(arg))) {
578 vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
580 fastinstrs[i]
581 = iselIntExpr_single_instruction( env, argregs[i], args[i] );
582 if (fastinstrs[i] == NULL)
583 goto slowscheme;
586 /* Looks like we're in luck. Emit the accumulated instructions and
587 move on to doing the call itself. */
588 for (i = 0; i < n_args; i++)
589 addInstr(env, fastinstrs[i]);
591 /* Fast scheme only applies for unconditional calls. Hence: */
592 cc = Acc_ALWAYS;
594 goto handle_call;
597 /* SLOW SCHEME; move via temporaries */
598 slowscheme:
600 # if 0 /* debug only */
601 if (n_args > 0) {for (i = 0; args[i]; i++) {
602 ppIRExpr(args[i]); vex_printf(" "); }
603 vex_printf("\n");}
604 # endif
606 /* If we have a vector return type, allocate a place for it on the
607 stack and record its address. */
608 HReg r_vecRetAddr = INVALID_HREG;
609 if (retTy == Ity_V128) {
610 r_vecRetAddr = newVRegI(env);
611 sub_from_rsp(env, 16);
612 addInstr(env, mk_iMOVsd_RR( hregAMD64_RSP(), r_vecRetAddr ));
614 else if (retTy == Ity_V256) {
615 r_vecRetAddr = newVRegI(env);
616 sub_from_rsp(env, 32);
617 addInstr(env, mk_iMOVsd_RR( hregAMD64_RSP(), r_vecRetAddr ));
620 vassert(n_args >= 0 && n_args <= 6);
621 for (i = 0; i < n_args; i++) {
622 IRExpr* arg = args[i];
623 if (UNLIKELY(arg->tag == Iex_GSPTR)) {
624 tmpregs[i] = newVRegI(env);
625 addInstr(env, mk_iMOVsd_RR( hregAMD64_RBP(), tmpregs[i]));
626 nGSPTRs++;
628 else if (UNLIKELY(arg->tag == Iex_VECRET)) {
629 /* We stashed the address of the return slot earlier, so just
630 retrieve it now. */
631 vassert(!hregIsInvalid(r_vecRetAddr));
632 tmpregs[i] = r_vecRetAddr;
633 nVECRETs++;
635 else {
636 vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
637 tmpregs[i] = iselIntExpr_R(env, args[i]);
641 /* Now we can compute the condition. We can't do it earlier
642 because the argument computations could trash the condition
643 codes. Be a bit clever to handle the common case where the
644 guard is 1:Bit. */
645 cc = Acc_ALWAYS;
646 if (guard) {
647 if (guard->tag == Iex_Const
648 && guard->Iex.Const.con->tag == Ico_U1
649 && guard->Iex.Const.con->Ico.U1 == True) {
650 /* unconditional -- do nothing */
651 } else {
652 cc = iselCondCode( env, guard );
656 /* Move the args to their final destinations. */
657 for (i = 0; i < n_args; i++) {
658 /* None of these insns, including any spill code that might
659 be generated, may alter the condition codes. */
660 addInstr( env, mk_iMOVsd_RR( tmpregs[i], argregs[i] ) );
664 /* Do final checks, set the return values, and generate the call
665 instruction proper. */
666 handle_call:
668 if (retTy == Ity_V128 || retTy == Ity_V256) {
669 vassert(nVECRETs == 1);
670 } else {
671 vassert(nVECRETs == 0);
674 vassert(nGSPTRs == 0 || nGSPTRs == 1);
676 vassert(*stackAdjustAfterCall == 0);
677 vassert(is_RetLoc_INVALID(*retloc));
678 switch (retTy) {
679 case Ity_INVALID:
680 /* Function doesn't return a value. */
681 *retloc = mk_RetLoc_simple(RLPri_None);
682 break;
683 case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
684 *retloc = mk_RetLoc_simple(RLPri_Int);
685 break;
686 case Ity_V128:
687 *retloc = mk_RetLoc_spRel(RLPri_V128SpRel, 0);
688 *stackAdjustAfterCall = 16;
689 break;
690 case Ity_V256:
691 *retloc = mk_RetLoc_spRel(RLPri_V256SpRel, 0);
692 *stackAdjustAfterCall = 32;
693 break;
694 default:
695 /* IR can denote other possible return types, but we don't
696 handle those here. */
697 vassert(0);
700 /* Finally, generate the call itself. This needs the *retloc value
701 set in the switch above, which is why it's at the end. */
702 addInstr(env,
703 AMD64Instr_Call(cc, (Addr)cee->addr, n_args, *retloc));
707 /* Given a guest-state array descriptor, an index expression and a
708 bias, generate an AMD64AMode holding the relevant guest state
709 offset. */
711 static
712 AMD64AMode* genGuestArrayOffset ( ISelEnv* env, IRRegArray* descr,
713 IRExpr* off, Int bias )
715 HReg tmp, roff;
716 Int elemSz = sizeofIRType(descr->elemTy);
717 Int nElems = descr->nElems;
719 /* Throw out any cases not generated by an amd64 front end. In
720 theory there might be a day where we need to handle them -- if
721 we ever run non-amd64-guest on amd64 host. */
723 if (nElems != 8 || (elemSz != 1 && elemSz != 8))
724 vpanic("genGuestArrayOffset(amd64 host)");
726 /* Compute off into a reg, %off. Then return:
728 movq %off, %tmp
729 addq $bias, %tmp (if bias != 0)
730 andq $7, %tmp
731 ... base(%rbp, %tmp, shift) ...
733 tmp = newVRegI(env);
734 roff = iselIntExpr_R(env, off);
735 addInstr(env, mk_iMOVsd_RR(roff, tmp));
736 if (bias != 0) {
737 /* Make sure the bias is sane, in the sense that there are
738 no significant bits above bit 30 in it. */
739 vassert(-10000 < bias && bias < 10000);
740 addInstr(env,
741 AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(bias), tmp));
743 addInstr(env,
744 AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(7), tmp));
745 vassert(elemSz == 1 || elemSz == 8);
746 return
747 AMD64AMode_IRRS( descr->base, hregAMD64_RBP(), tmp,
748 elemSz==8 ? 3 : 0);
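/* The returned amode is of the form base(%rbp, %tmp, 1 or 8), i.e. the
   guest state offset of element ((off + bias) & 7) of an 8-entry array
   -- presumably the guest x87 register/tag files on an amd64 guest. */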
752 /* Set the SSE unit's rounding mode to default (%mxcsr = 0x1F80) */
753 static
754 void set_SSE_rounding_default ( ISelEnv* env )
756 /* pushq $DEFAULT_MXCSR
757 ldmxcsr 0(%rsp)
758 addq $8, %rsp
760 AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
761 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(DEFAULT_MXCSR)));
762 addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
763 add_to_rsp(env, 8);
766 /* Mess with the FPU's rounding mode: set to the default rounding mode
767 (DEFAULT_FPUCW). */
768 static
769 void set_FPU_rounding_default ( ISelEnv* env )
771 /* movq $DEFAULT_FPUCW, -8(%rsp)
772 fldcw -8(%rsp)
774 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
775 addInstr(env, AMD64Instr_Alu64M(
776 Aalu_MOV, AMD64RI_Imm(DEFAULT_FPUCW), m8_rsp));
777 addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
781 /* Mess with the SSE unit's rounding mode: 'mode' is an I32-typed
782 expression denoting a value in the range 0 .. 3, indicating a round
783 mode encoded as per type IRRoundingMode. Set the SSE machinery to
784 have the same rounding.
786 static
787 void set_SSE_rounding_mode ( ISelEnv* env, IRExpr* mode )
789 /* Note: this sequence only makes sense because DEFAULT_MXCSR has
790 both rounding bits == 0. If that wasn't the case, we couldn't
791 create a new rounding field simply by ORing the new value into
792 place. */
794 /* movq $3, %reg
795 andq [[mode]], %reg -- shouldn't be needed; paranoia
796 shlq $13, %reg
797 orq $DEFAULT_MXCSR, %reg
798 pushq %reg
799 ldmxcsr 0(%rsp)
800 addq $8, %rsp
802 HReg reg = newVRegI(env);
803 AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
804 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Imm(3), reg));
805 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
806 iselIntExpr_RMI(env, mode), reg));
807 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 13, reg));
808 addInstr(env, AMD64Instr_Alu64R(
809 Aalu_OR, AMD64RMI_Imm(DEFAULT_MXCSR), reg));
810 addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(reg)));
811 addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
812 add_to_rsp(env, 8);
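/* This works because IRRoundingMode and the MXCSR rounding-control
   field (bits 13..14) use the same 2-bit encoding, so shifting the IR
   value left by 13 drops it straight into place; set_FPU_rounding_mode
   below does the same for the x87 control word's RC field at bits
   10..11 (hence the shift by 10). */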
816 /* Mess with the FPU's rounding mode: 'mode' is an I32-typed
817 expression denoting a value in the range 0 .. 3, indicating a round
818 mode encoded as per type IRRoundingMode. Set the x87 FPU to have
819 the same rounding.
821 static
822 void set_FPU_rounding_mode ( ISelEnv* env, IRExpr* mode )
824 HReg rrm = iselIntExpr_R(env, mode);
825 HReg rrm2 = newVRegI(env);
826 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
828 /* movq %rrm, %rrm2
829 andq $3, %rrm2 -- shouldn't be needed; paranoia
830 shlq $10, %rrm2
831 orq $DEFAULT_FPUCW, %rrm2
832 movq %rrm2, -8(%rsp)
833 fldcw -8(%rsp)
835 addInstr(env, mk_iMOVsd_RR(rrm, rrm2));
836 addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(3), rrm2));
837 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 10, rrm2));
838 addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
839 AMD64RMI_Imm(DEFAULT_FPUCW), rrm2));
840 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,
841 AMD64RI_Reg(rrm2), m8_rsp));
842 addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
846 /* Generate all-zeroes into a new vector register.
848 static HReg generate_zeroes_V128 ( ISelEnv* env )
850 HReg dst = newVRegV(env);
851 addInstr(env, AMD64Instr_SseReRg(Asse_XOR, dst, dst));
852 return dst;
855 /* Generate all-ones into a new vector register.
857 static HReg generate_ones_V128 ( ISelEnv* env )
859 HReg dst = newVRegV(env);
860 addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, dst, dst));
861 return dst;
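/* (pcmpeqd of a register with itself sets every lane to all-ones,
   whatever the register previously contained.) */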
865 /* Generate !src into a new vector register. Amazing that there isn't
866 a less crappy way to do this.
868 static HReg do_sse_NotV128 ( ISelEnv* env, HReg src )
870 HReg dst = generate_ones_V128(env);
871 addInstr(env, AMD64Instr_SseReRg(Asse_XOR, src, dst));
872 return dst;
876 /* Expand the given byte into a 64-bit word, by cloning each bit
877 8 times. */
878 static ULong bitmask8_to_bytemask64 ( UShort w8 )
880 vassert(w8 == (w8 & 0xFF));
881 ULong w64 = 0;
882 Int i;
883 for (i = 0; i < 8; i++) {
884 if (w8 & (1<<i))
885 w64 |= (0xFFULL << (8 * i));
887 return w64;
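/* Illustrative example: bitmask8_to_bytemask64(0x05) == 0x0000000000FF00FFULL. */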
891 /*---------------------------------------------------------*/
892 /*--- ISEL: Integer expressions (64/32/16/8 bit) ---*/
893 /*---------------------------------------------------------*/
895 /* Select insns for an integer-typed expression, and add them to the
896 code list. Return a reg holding the result. This reg will be a
897 virtual register. THE RETURNED REG MUST NOT BE MODIFIED. If you
898 want to modify it, ask for a new vreg, copy it in there, and modify
899 the copy. The register allocator will do its best to map both
900 vregs to the same real register, so the copies will often disappear
901 later in the game.
903 This should handle expressions of 64, 32, 16 and 8-bit type. All
904 results are returned in a 64-bit register. For 32-, 16- and 8-bit
905 expressions, the upper 32/48/56 bits are arbitrary, so you should
906 mask or sign extend partial values if necessary.
909 static HReg iselIntExpr_R ( ISelEnv* env, const IRExpr* e )
911 HReg r = iselIntExpr_R_wrk(env, e);
912 /* sanity checks ... */
913 # if 0
914 vex_printf("\niselIntExpr_R: "); ppIRExpr(e); vex_printf("\n");
915 # endif
916 vassert(hregClass(r) == HRcInt64);
917 vassert(hregIsVirtual(r));
918 return r;
921 /* DO NOT CALL THIS DIRECTLY ! */
922 static HReg iselIntExpr_R_wrk ( ISelEnv* env, const IRExpr* e )
924 MatchInfo mi;
925 DECLARE_PATTERN(p_1Uto8_64to1);
926 DECLARE_PATTERN(p_LDle8_then_8Uto64);
927 DECLARE_PATTERN(p_LDle16_then_16Uto64);
929 IRType ty = typeOfIRExpr(env->type_env,e);
930 switch (ty) {
931 case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: break;
932 default: vassert(0);
935 switch (e->tag) {
937 /* --------- TEMP --------- */
938 case Iex_RdTmp: {
939 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
942 /* --------- LOAD --------- */
943 case Iex_Load: {
944 HReg dst = newVRegI(env);
945 AMD64AMode* amode = iselIntExpr_AMode ( env, e->Iex.Load.addr );
947 /* We can't handle big-endian loads, nor load-linked. */
948 if (e->Iex.Load.end != Iend_LE)
949 goto irreducible;
951 if (ty == Ity_I64) {
952 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
953 AMD64RMI_Mem(amode), dst) );
954 return dst;
956 if (ty == Ity_I32) {
957 addInstr(env, AMD64Instr_LoadEX(4,False,amode,dst));
958 return dst;
960 if (ty == Ity_I16) {
961 addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
962 return dst;
964 if (ty == Ity_I8) {
965 addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
966 return dst;
968 break;
971 /* --------- BINARY OP --------- */
972 case Iex_Binop: {
973 AMD64AluOp aluOp;
974 AMD64ShiftOp shOp;
976 /* Pattern: Sub64(0,x) */
977 /* and: Sub32(0,x) */
978 if ((e->Iex.Binop.op == Iop_Sub64 && isZeroU64(e->Iex.Binop.arg1))
979 || (e->Iex.Binop.op == Iop_Sub32 && isZeroU32(e->Iex.Binop.arg1))) {
980 HReg dst = newVRegI(env);
981 HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg2);
982 addInstr(env, mk_iMOVsd_RR(reg,dst));
983 addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
984 return dst;
987 /* Is it an addition or logical style op? */
988 switch (e->Iex.Binop.op) {
989 case Iop_Add8: case Iop_Add16: case Iop_Add32: case Iop_Add64:
990 aluOp = Aalu_ADD; break;
991 case Iop_Sub8: case Iop_Sub16: case Iop_Sub32: case Iop_Sub64:
992 aluOp = Aalu_SUB; break;
993 case Iop_And8: case Iop_And16: case Iop_And32: case Iop_And64:
994 aluOp = Aalu_AND; break;
995 case Iop_Or8: case Iop_Or16: case Iop_Or32: case Iop_Or64:
996 aluOp = Aalu_OR; break;
997 case Iop_Xor8: case Iop_Xor16: case Iop_Xor32: case Iop_Xor64:
998 aluOp = Aalu_XOR; break;
999 case Iop_Mul16: case Iop_Mul32: case Iop_Mul64:
1000 aluOp = Aalu_MUL; break;
1001 default:
1002 aluOp = Aalu_INVALID; break;
1004 /* For commutative ops we assume any literal
1005 values are on the second operand. */
1006 if (aluOp != Aalu_INVALID) {
1007 HReg dst = newVRegI(env);
1008 HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg1);
1009 AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
1010 addInstr(env, mk_iMOVsd_RR(reg,dst));
1011 addInstr(env, AMD64Instr_Alu64R(aluOp, rmi, dst));
1012 return dst;
1015 /* Perhaps a shift op? */
1016 switch (e->Iex.Binop.op) {
1017 case Iop_Shl64: case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
1018 shOp = Ash_SHL; break;
1019 case Iop_Shr64: case Iop_Shr32: case Iop_Shr16: case Iop_Shr8:
1020 shOp = Ash_SHR; break;
1021 case Iop_Sar64: case Iop_Sar32: case Iop_Sar16: case Iop_Sar8:
1022 shOp = Ash_SAR; break;
1023 default:
1024 shOp = Ash_INVALID; break;
1026 if (shOp != Ash_INVALID) {
1027 HReg dst = newVRegI(env);
1029 /* regL = the value to be shifted */
1030 HReg regL = iselIntExpr_R(env, e->Iex.Binop.arg1);
1031 addInstr(env, mk_iMOVsd_RR(regL,dst));
1033 /* Do any necessary widening for 32/16/8 bit operands */
1034 switch (e->Iex.Binop.op) {
1035 case Iop_Shr64: case Iop_Shl64: case Iop_Sar64:
1036 break;
1037 case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
1038 break;
1039 case Iop_Shr8:
1040 addInstr(env, AMD64Instr_Alu64R(
1041 Aalu_AND, AMD64RMI_Imm(0xFF), dst));
1042 break;
1043 case Iop_Shr16:
1044 addInstr(env, AMD64Instr_Alu64R(
1045 Aalu_AND, AMD64RMI_Imm(0xFFFF), dst));
1046 break;
1047 case Iop_Shr32:
1048 addInstr(env, AMD64Instr_MovxLQ(False, dst, dst));
1049 break;
1050 case Iop_Sar8:
1051 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 56, dst));
1052 addInstr(env, AMD64Instr_Sh64(Ash_SAR, 56, dst));
1053 break;
1054 case Iop_Sar16:
1055 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 48, dst));
1056 addInstr(env, AMD64Instr_Sh64(Ash_SAR, 48, dst));
1057 break;
1058 case Iop_Sar32:
1059 addInstr(env, AMD64Instr_MovxLQ(True, dst, dst));
1060 break;
1061 default:
1062 ppIROp(e->Iex.Binop.op);
1063 vassert(0);
1066 /* Now consider the shift amount. If it's a literal, we
1067 can do a much better job than the general case. */
1068 if (e->Iex.Binop.arg2->tag == Iex_Const) {
1069 /* assert that the IR is well-typed */
1070 Int nshift;
1071 vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
1072 nshift = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
1073 vassert(nshift >= 0);
1074 if (nshift > 0)
1075 /* Can't allow nshift==0 since that means %cl */
1076 addInstr(env, AMD64Instr_Sh64(shOp, nshift, dst));
1077 } else {
1078 /* General case; we have to force the amount into %cl. */
1079 HReg regR = iselIntExpr_R(env, e->Iex.Binop.arg2);
1080 addInstr(env, mk_iMOVsd_RR(regR,hregAMD64_RCX()));
1081 addInstr(env, AMD64Instr_Sh64(shOp, 0/* %cl */, dst));
1083 return dst;
1086 /* Handle misc other scalar ops. */
1087 if (e->Iex.Binop.op == Iop_Max32U) {
1088 HReg src1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1089 HReg dst = newVRegI(env);
1090 HReg src2 = iselIntExpr_R(env, e->Iex.Binop.arg2);
1091 addInstr(env, mk_iMOVsd_RR(src1, dst));
1092 addInstr(env, AMD64Instr_Alu32R(Aalu_CMP, AMD64RMI_Reg(src2), dst));
1093 addInstr(env, AMD64Instr_CMov64(Acc_B, src2, dst));
1094 return dst;
1097 if (e->Iex.Binop.op == Iop_DivModS64to32
1098 || e->Iex.Binop.op == Iop_DivModU64to32) {
1099 /* 64 x 32 -> (32(rem),32(div)) division */
1100 /* Get the 64-bit operand into edx:eax, and the other into
1101 any old R/M. */
1102 HReg rax = hregAMD64_RAX();
1103 HReg rdx = hregAMD64_RDX();
1104 HReg dst = newVRegI(env);
1105 Bool syned = toBool(e->Iex.Binop.op == Iop_DivModS64to32);
1106 AMD64RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
1107 /* Compute the left operand into a reg, and then
1108 put the top half in edx and the bottom in eax. */
1109 HReg left64 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1110 addInstr(env, mk_iMOVsd_RR(left64, rdx));
1111 addInstr(env, mk_iMOVsd_RR(left64, rax));
1112 addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, rdx));
1113 addInstr(env, AMD64Instr_Div(syned, 4, rmRight));
1114 addInstr(env, AMD64Instr_MovxLQ(False, rdx, rdx));
1115 addInstr(env, AMD64Instr_MovxLQ(False, rax, rax));
1116 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, rdx));
1117 addInstr(env, mk_iMOVsd_RR(rax, dst));
1118 addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(rdx), dst));
1119 return dst;
1122 if (e->Iex.Binop.op == Iop_32HLto64) {
1123 HReg hi32 = newVRegI(env);
1124 HReg lo32 = newVRegI(env);
1125 HReg hi32s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1126 HReg lo32s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1127 addInstr(env, mk_iMOVsd_RR(hi32s, hi32));
1128 addInstr(env, mk_iMOVsd_RR(lo32s, lo32));
1129 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, hi32));
1130 addInstr(env, AMD64Instr_MovxLQ(False, lo32, lo32));
1131 addInstr(env, AMD64Instr_Alu64R(
1132 Aalu_OR, AMD64RMI_Reg(lo32), hi32));
1133 return hi32;
1136 if (e->Iex.Binop.op == Iop_16HLto32) {
1137 HReg hi16 = newVRegI(env);
1138 HReg lo16 = newVRegI(env);
1139 HReg hi16s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1140 HReg lo16s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1141 addInstr(env, mk_iMOVsd_RR(hi16s, hi16));
1142 addInstr(env, mk_iMOVsd_RR(lo16s, lo16));
1143 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 16, hi16));
1144 addInstr(env, AMD64Instr_Alu64R(
1145 Aalu_AND, AMD64RMI_Imm(0xFFFF), lo16));
1146 addInstr(env, AMD64Instr_Alu64R(
1147 Aalu_OR, AMD64RMI_Reg(lo16), hi16));
1148 return hi16;
1151 if (e->Iex.Binop.op == Iop_8HLto16) {
1152 HReg hi8 = newVRegI(env);
1153 HReg lo8 = newVRegI(env);
1154 HReg hi8s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1155 HReg lo8s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1156 addInstr(env, mk_iMOVsd_RR(hi8s, hi8));
1157 addInstr(env, mk_iMOVsd_RR(lo8s, lo8));
1158 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 8, hi8));
1159 addInstr(env, AMD64Instr_Alu64R(
1160 Aalu_AND, AMD64RMI_Imm(0xFF), lo8));
1161 addInstr(env, AMD64Instr_Alu64R(
1162 Aalu_OR, AMD64RMI_Reg(lo8), hi8));
1163 return hi8;
1166 if (e->Iex.Binop.op == Iop_MullS32
1167 || e->Iex.Binop.op == Iop_MullS16
1168 || e->Iex.Binop.op == Iop_MullS8
1169 || e->Iex.Binop.op == Iop_MullU32
1170 || e->Iex.Binop.op == Iop_MullU16
1171 || e->Iex.Binop.op == Iop_MullU8) {
1172 HReg a32 = newVRegI(env);
1173 HReg b32 = newVRegI(env);
1174 HReg a32s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1175 HReg b32s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1176 Int shift = 0;
1177 AMD64ShiftOp shr_op = Ash_SHR;
1178 switch (e->Iex.Binop.op) {
1179 case Iop_MullS32: shr_op = Ash_SAR; shift = 32; break;
1180 case Iop_MullS16: shr_op = Ash_SAR; shift = 48; break;
1181 case Iop_MullS8: shr_op = Ash_SAR; shift = 56; break;
1182 case Iop_MullU32: shr_op = Ash_SHR; shift = 32; break;
1183 case Iop_MullU16: shr_op = Ash_SHR; shift = 48; break;
1184 case Iop_MullU8: shr_op = Ash_SHR; shift = 56; break;
1185 default: vassert(0);
1188 addInstr(env, mk_iMOVsd_RR(a32s, a32));
1189 addInstr(env, mk_iMOVsd_RR(b32s, b32));
1190 addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, a32));
1191 addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, b32));
1192 addInstr(env, AMD64Instr_Sh64(shr_op, shift, a32));
1193 addInstr(env, AMD64Instr_Sh64(shr_op, shift, b32));
1194 addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(a32), b32));
1195 return b32;
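/* (The SHL/SAR or SHL/SHR pairs above sign- or zero-extend each narrow
   operand in place, so the 64-bit multiply that follows them leaves the
   full double-width product in the low bits of b32.) */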
1198 if (e->Iex.Binop.op == Iop_CmpF64) {
1199 HReg fL = iselDblExpr(env, e->Iex.Binop.arg1);
1200 HReg fR = iselDblExpr(env, e->Iex.Binop.arg2);
1201 HReg dst = newVRegI(env);
1202 addInstr(env, AMD64Instr_SseUComIS(8,fL,fR,dst));
1203 /* Mask out irrelevant parts of the result so as to conform
1204 to the CmpF64 definition. */
1205 addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(0x45), dst));
1206 return dst;
1209 if (e->Iex.Binop.op == Iop_F64toI32S
1210 || e->Iex.Binop.op == Iop_F64toI64S) {
1211 Int szD = e->Iex.Binop.op==Iop_F64toI32S ? 4 : 8;
1212 HReg rf = iselDblExpr(env, e->Iex.Binop.arg2);
1213 HReg dst = newVRegI(env);
1214 set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
1215 addInstr(env, AMD64Instr_SseSF2SI( 8, szD, rf, dst ));
1216 set_SSE_rounding_default(env);
1217 return dst;
1220 /* Deal with 64-bit SIMD binary ops. For the most part these are doable
1221 by using the equivalent 128-bit operation and ignoring the upper half
1222 of the result. */
1223 AMD64SseOp op = Asse_INVALID;
1224 Bool arg1isEReg = False;
1225 Bool preShift32R = False;
1226 switch (e->Iex.Binop.op) {
1227 // The following 3 could be done with 128 bit insns too, but
1228 // first require the inputs to be reformatted.
1229 //case Iop_QNarrowBin32Sto16Sx4:
1230 //op = Asse_PACKSSD; arg1isEReg = True; break;
1231 //case Iop_QNarrowBin16Sto8Sx8:
1232 //op = Asse_PACKSSW; arg1isEReg = True; break;
1233 //case Iop_QNarrowBin16Sto8Ux8:
1234 //op = Asse_PACKUSW; arg1isEReg = True; break;
1236 case Iop_InterleaveHI8x8:
1237 op = Asse_UNPCKLB; arg1isEReg = True; preShift32R = True;
1238 break;
1239 case Iop_InterleaveHI16x4:
1240 op = Asse_UNPCKLW; arg1isEReg = True; preShift32R = True;
1241 break;
1242 case Iop_InterleaveHI32x2:
1243 op = Asse_UNPCKLD; arg1isEReg = True; preShift32R = True;
1244 break;
1245 case Iop_InterleaveLO8x8:
1246 op = Asse_UNPCKLB; arg1isEReg = True;
1247 break;
1248 case Iop_InterleaveLO16x4:
1249 op = Asse_UNPCKLW; arg1isEReg = True;
1250 break;
1251 case Iop_InterleaveLO32x2:
1252 op = Asse_UNPCKLD; arg1isEReg = True;
1253 break;
1255 case Iop_Add8x8: op = Asse_ADD8; break;
1256 case Iop_Add16x4: op = Asse_ADD16; break;
1257 case Iop_Add32x2: op = Asse_ADD32; break;
1258 case Iop_QAdd8Sx8: op = Asse_QADD8S; break;
1259 case Iop_QAdd16Sx4: op = Asse_QADD16S; break;
1260 case Iop_QAdd8Ux8: op = Asse_QADD8U; break;
1261 case Iop_QAdd16Ux4: op = Asse_QADD16U; break;
1262 case Iop_Avg8Ux8: op = Asse_AVG8U; break;
1263 case Iop_Avg16Ux4: op = Asse_AVG16U; break;
1264 case Iop_CmpEQ8x8: op = Asse_CMPEQ8; break;
1265 case Iop_CmpEQ16x4: op = Asse_CMPEQ16; break;
1266 case Iop_CmpEQ32x2: op = Asse_CMPEQ32; break;
1267 case Iop_CmpGT8Sx8: op = Asse_CMPGT8S; break;
1268 case Iop_CmpGT16Sx4: op = Asse_CMPGT16S; break;
1269 case Iop_CmpGT32Sx2: op = Asse_CMPGT32S; break;
1270 case Iop_Max16Sx4: op = Asse_MAX16S; break;
1271 case Iop_Max8Ux8: op = Asse_MAX8U; break;
1272 case Iop_Min16Sx4: op = Asse_MIN16S; break;
1273 case Iop_Min8Ux8: op = Asse_MIN8U; break;
1274 case Iop_MulHi16Ux4: op = Asse_MULHI16U; break;
1275 case Iop_MulHi16Sx4: op = Asse_MULHI16S; break;
1276 case Iop_Mul16x4: op = Asse_MUL16; break;
1277 case Iop_Sub8x8: op = Asse_SUB8; break;
1278 case Iop_Sub16x4: op = Asse_SUB16; break;
1279 case Iop_Sub32x2: op = Asse_SUB32; break;
1280 case Iop_QSub8Sx8: op = Asse_QSUB8S; break;
1281 case Iop_QSub16Sx4: op = Asse_QSUB16S; break;
1282 case Iop_QSub8Ux8: op = Asse_QSUB8U; break;
1283 case Iop_QSub16Ux4: op = Asse_QSUB16U; break;
1284 default: break;
1286 if (op != Asse_INVALID) {
1287 /* This isn't pretty, but .. move each arg to the low half of an XMM
1288 register, do the operation on the whole register, and move the
1289 result back to an integer register. */
1290 const IRExpr* arg1 = e->Iex.Binop.arg1;
1291 const IRExpr* arg2 = e->Iex.Binop.arg2;
1292 vassert(typeOfIRExpr(env->type_env, arg1) == Ity_I64);
1293 vassert(typeOfIRExpr(env->type_env, arg2) == Ity_I64);
1294 HReg iarg1 = iselIntExpr_R(env, arg1);
1295 HReg iarg2 = iselIntExpr_R(env, arg2);
1296 HReg varg1 = newVRegV(env);
1297 HReg varg2 = newVRegV(env);
1298 HReg idst = newVRegI(env);
1299 addInstr(env, AMD64Instr_SseMOVQ(iarg1, varg1, True/*toXMM*/));
1300 addInstr(env, AMD64Instr_SseMOVQ(iarg2, varg2, True/*toXMM*/));
1301 if (arg1isEReg) {
1302 if (preShift32R) {
1303 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 32, varg1));
1304 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 32, varg2));
1306 addInstr(env, AMD64Instr_SseReRg(op, varg1, varg2));
1307 addInstr(env, AMD64Instr_SseMOVQ(idst, varg2, False/*!toXMM*/));
1308 } else {
1309 vassert(!preShift32R);
1310 addInstr(env, AMD64Instr_SseReRg(op, varg2, varg1));
1311 addInstr(env, AMD64Instr_SseMOVQ(idst, varg1, False/*!toXMM*/));
1313 return idst;
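/* (For the InterleaveHI cases, the 32-bit right pre-shift moves the
   high half of each 64-bit operand into the low half of its XMM
   register, so the 128-bit UNPCKL* then interleaves exactly the lanes
   that InterleaveHI*x* requires.) */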
1316 UInt laneBits = 0;
1317 op = Asse_INVALID;
1318 switch (e->Iex.Binop.op) {
1319 case Iop_ShlN16x4: laneBits = 16; op = Asse_SHL16; break;
1320 case Iop_ShlN32x2: laneBits = 32; op = Asse_SHL32; break;
1321 case Iop_SarN16x4: laneBits = 16; op = Asse_SAR16; break;
1322 case Iop_SarN32x2: laneBits = 32; op = Asse_SAR32; break;
1323 case Iop_ShrN16x4: laneBits = 16; op = Asse_SHR16; break;
1324 case Iop_ShrN32x2: laneBits = 32; op = Asse_SHR32; break;
1325 default: break;
1327 if (op != Asse_INVALID) {
1328 const IRExpr* arg1 = e->Iex.Binop.arg1;
1329 const IRExpr* arg2 = e->Iex.Binop.arg2;
1330 vassert(typeOfIRExpr(env->type_env, arg1) == Ity_I64);
1331 vassert(typeOfIRExpr(env->type_env, arg2) == Ity_I8);
1332 HReg igreg = iselIntExpr_R(env, arg1);
1333 HReg vgreg = newVRegV(env);
1334 HReg idst = newVRegI(env);
1335 addInstr(env, AMD64Instr_SseMOVQ(igreg, vgreg, True/*toXMM*/));
1336 /* If it's a shift by an in-range immediate, generate a single
1337 instruction. */
1338 if (arg2->tag == Iex_Const) {
1339 IRConst* c = arg2->Iex.Const.con;
1340 vassert(c->tag == Ico_U8);
1341 UInt shift = c->Ico.U8;
1342 if (shift < laneBits) {
1343 addInstr(env, AMD64Instr_SseShiftN(op, shift, vgreg));
1344 addInstr(env, AMD64Instr_SseMOVQ(idst, vgreg, False/*!toXMM*/));
1345 return idst;
1348 /* Otherwise we have to do it the longwinded way. */
1349 HReg ishift = iselIntExpr_R(env, arg2);
1350 HReg vshift = newVRegV(env);
1351 addInstr(env, AMD64Instr_SseMOVQ(ishift, vshift, True/*toXMM*/));
1352 addInstr(env, AMD64Instr_SseReRg(op, vshift, vgreg));
1353 addInstr(env, AMD64Instr_SseMOVQ(idst, vgreg, False/*!toXMM*/));
1354 return idst;
1357 if (e->Iex.Binop.op == Iop_Mul32x2) {
1358 const IRExpr* arg1 = e->Iex.Binop.arg1;
1359 const IRExpr* arg2 = e->Iex.Binop.arg2;
1360 vassert(typeOfIRExpr(env->type_env, arg1) == Ity_I64);
1361 vassert(typeOfIRExpr(env->type_env, arg2) == Ity_I64);
1362 HReg s1 = iselIntExpr_R(env, arg1);
1363 HReg s2 = iselIntExpr_R(env, arg2);
1364 HReg resLo = newVRegI(env);
1365 // resLo = (s1 *64 s2) & 0xFFFF'FFFF
1366 addInstr(env, mk_iMOVsd_RR(s1, resLo));
1367 addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(s2), resLo));
1368 addInstr(env, AMD64Instr_MovxLQ(False, resLo, resLo));
1370 // resHi = ((s1 >>u 32) *64 (s2 >>u 32)) << 32;
1371 HReg resHi = newVRegI(env);
1372 addInstr(env, mk_iMOVsd_RR(s1, resHi));
1373 addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, resHi));
1374 HReg tmp = newVRegI(env);
1375 addInstr(env, mk_iMOVsd_RR(s2, tmp));
1376 addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, tmp));
1377 addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(tmp), resHi));
1378 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, resHi));
1380 // final result = resHi | resLo
1381 addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(resHi), resLo));
1382 return resLo;
1385 // A few remaining SIMD64 ops require helper functions, at least for
1386 // now.
1387 Bool second_is_UInt = False;
1388 HWord fn = 0;
1389 switch (e->Iex.Binop.op) {
1390 case Iop_CatOddLanes16x4:
1391 fn = (HWord)h_generic_calc_CatOddLanes16x4; break;
1392 case Iop_CatEvenLanes16x4:
1393 fn = (HWord)h_generic_calc_CatEvenLanes16x4; break;
1394 case Iop_PermOrZero8x8:
1395 fn = (HWord)h_generic_calc_PermOrZero8x8; break;
1397 case Iop_QNarrowBin32Sto16Sx4:
1398 fn = (HWord)h_generic_calc_QNarrowBin32Sto16Sx4; break;
1399 case Iop_QNarrowBin16Sto8Sx8:
1400 fn = (HWord)h_generic_calc_QNarrowBin16Sto8Sx8; break;
1401 case Iop_QNarrowBin16Sto8Ux8:
1402 fn = (HWord)h_generic_calc_QNarrowBin16Sto8Ux8; break;
1404 case Iop_NarrowBin16to8x8:
1405 fn = (HWord)h_generic_calc_NarrowBin16to8x8; break;
1406 case Iop_NarrowBin32to16x4:
1407 fn = (HWord)h_generic_calc_NarrowBin32to16x4; break;
1409 case Iop_SarN8x8:
1410 fn = (HWord)h_generic_calc_SarN8x8;
1411 second_is_UInt = True;
1412 break;
1414 default:
1415 fn = (HWord)0; break;
1417 if (fn != (HWord)0) {
1418 /* Note: the following assumes all helpers are of signature
1419 ULong fn ( ULong, ULong ), and they are
1420 not marked as regparm functions.
1422 HReg dst = newVRegI(env);
1423 HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
1424 HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
1425 if (second_is_UInt)
1426 addInstr(env, AMD64Instr_MovxLQ(False, argR, argR));
1427 addInstr(env, mk_iMOVsd_RR(argL, hregAMD64_RDI()) );
1428 addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RSI()) );
1429 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 2,
1430 mk_RetLoc_simple(RLPri_Int) ));
1431 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1432 return dst;
1435 // Half-float vector conversion
1436 if (e->Iex.Binop.op == Iop_F32toF16x4
1437 && (env->hwcaps & VEX_HWCAPS_AMD64_F16C)) {
1438 HReg srcV = iselVecExpr(env, e->Iex.Binop.arg2);
1439 HReg dstV = newVRegV(env);
1440 HReg dstI = newVRegI(env);
1441 set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
1442 addInstr(env, AMD64Instr_Sse32Fx4(Asse_F32toF16, srcV, dstV));
1443 set_SSE_rounding_default(env);
1444 addInstr(env, AMD64Instr_SseMOVQ(dstI, dstV, /*toXMM=*/False));
1445 return dstI;
1448 break;
1451 /* --------- UNARY OP --------- */
1452 case Iex_Unop: {
1454 /* 1Uto8(64to1(expr64)) */
1456 DEFINE_PATTERN( p_1Uto8_64to1,
1457 unop(Iop_1Uto8, unop(Iop_64to1, bind(0))) );
1458 if (matchIRExpr(&mi,p_1Uto8_64to1,e)) {
1459 const IRExpr* expr64 = mi.bindee[0];
1460 HReg dst = newVRegI(env);
1461 HReg src = iselIntExpr_R(env, expr64);
1462 addInstr(env, mk_iMOVsd_RR(src,dst) );
1463 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
1464 AMD64RMI_Imm(1), dst));
1465 return dst;
1469 /* 8Uto64(LDle(expr64)) */
1471 DEFINE_PATTERN(p_LDle8_then_8Uto64,
1472 unop(Iop_8Uto64,
1473 IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
1474 if (matchIRExpr(&mi,p_LDle8_then_8Uto64,e)) {
1475 HReg dst = newVRegI(env);
1476 AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
1477 addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
1478 return dst;
1482 /* 16Uto64(LDle(expr64)) */
1484 DEFINE_PATTERN(p_LDle16_then_16Uto64,
1485 unop(Iop_16Uto64,
1486 IRExpr_Load(Iend_LE,Ity_I16,bind(0))) );
1487 if (matchIRExpr(&mi,p_LDle16_then_16Uto64,e)) {
1488 HReg dst = newVRegI(env);
1489 AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
1490 addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
1491 return dst;
1495 /* 32Uto64( Add32/Sub32/And32/Or32/Xor32(expr32, expr32) )
1496 Use 32 bit arithmetic and let the default zero-extend rule
1497 do the 32Uto64 for free. */
1498 if (e->Iex.Unop.op == Iop_32Uto64 && e->Iex.Unop.arg->tag == Iex_Binop) {
1499 IROp opi = e->Iex.Unop.arg->Iex.Binop.op; /* inner op */
1500 IRExpr* argL = e->Iex.Unop.arg->Iex.Binop.arg1;
1501 IRExpr* argR = e->Iex.Unop.arg->Iex.Binop.arg2;
1502 AMD64AluOp aluOp = Aalu_INVALID;
1503 switch (opi) {
1504 case Iop_Add32: aluOp = Aalu_ADD; break;
1505 case Iop_Sub32: aluOp = Aalu_SUB; break;
1506 case Iop_And32: aluOp = Aalu_AND; break;
1507 case Iop_Or32: aluOp = Aalu_OR; break;
1508 case Iop_Xor32: aluOp = Aalu_XOR; break;
1509 default: break;
1511 if (aluOp != Aalu_INVALID) {
1512 /* For commutative ops we assume any literal values are on
1513 the second operand. */
1514 HReg dst = newVRegI(env);
1515 HReg reg = iselIntExpr_R(env, argL);
1516 AMD64RMI* rmi = iselIntExpr_RMI(env, argR);
1517 addInstr(env, mk_iMOVsd_RR(reg,dst));
1518 addInstr(env, AMD64Instr_Alu32R(aluOp, rmi, dst));
1519 return dst;
1521 /* just fall through to normal handling for Iop_32Uto64 */
1524 /* Fallback cases */
1525 switch (e->Iex.Unop.op) {
1526 case Iop_32Uto64:
1527 case Iop_32Sto64: {
1528 HReg dst = newVRegI(env);
1529 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1530 addInstr(env, AMD64Instr_MovxLQ(e->Iex.Unop.op == Iop_32Sto64,
1531 src, dst) );
1532 return dst;
1534 case Iop_128HIto64: {
1535 HReg rHi, rLo;
1536 iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
1537 return rHi; /* and abandon rLo */
1539 case Iop_128to64: {
1540 HReg rHi, rLo;
1541 iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
1542 return rLo; /* and abandon rHi */
1544 case Iop_8Uto16:
1545 case Iop_8Uto32:
1546 case Iop_8Uto64:
1547 case Iop_16Uto64:
1548 case Iop_16Uto32: {
1549 HReg dst = newVRegI(env);
1550 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1551 Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Uto32
1552 || e->Iex.Unop.op==Iop_16Uto64 );
1553 UInt mask = srcIs16 ? 0xFFFF : 0xFF;
1554 addInstr(env, mk_iMOVsd_RR(src,dst) );
1555 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
1556 AMD64RMI_Imm(mask), dst));
1557 return dst;
1559 case Iop_8Sto16:
1560 case Iop_8Sto64:
1561 case Iop_8Sto32:
1562 case Iop_16Sto32:
1563 case Iop_16Sto64: {
1564 HReg dst = newVRegI(env);
1565 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1566 Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Sto32
1567 || e->Iex.Unop.op==Iop_16Sto64 );
1568 UInt amt = srcIs16 ? 48 : 56;
1569 addInstr(env, mk_iMOVsd_RR(src,dst) );
1570 addInstr(env, AMD64Instr_Sh64(Ash_SHL, amt, dst));
1571 addInstr(env, AMD64Instr_Sh64(Ash_SAR, amt, dst));
1572 return dst;
1574 case Iop_Not8:
1575 case Iop_Not16:
1576 case Iop_Not32:
1577 case Iop_Not64: {
1578 HReg dst = newVRegI(env);
1579 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1580 addInstr(env, mk_iMOVsd_RR(src,dst) );
1581 addInstr(env, AMD64Instr_Unary64(Aun_NOT,dst));
1582 return dst;
1584 case Iop_16HIto8:
1585 case Iop_32HIto16:
1586 case Iop_64HIto32: {
1587 HReg dst = newVRegI(env);
1588 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1589 Int shift = 0;
1590 switch (e->Iex.Unop.op) {
1591 case Iop_16HIto8: shift = 8; break;
1592 case Iop_32HIto16: shift = 16; break;
1593 case Iop_64HIto32: shift = 32; break;
1594 default: vassert(0);
1596 addInstr(env, mk_iMOVsd_RR(src,dst) );
1597 addInstr(env, AMD64Instr_Sh64(Ash_SHR, shift, dst));
1598 return dst;
1600 case Iop_1Uto64:
1601 case Iop_1Uto32:
1602 case Iop_1Uto8: {
1603 HReg dst = newVRegI(env);
1604 AMD64CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
1605 addInstr(env, AMD64Instr_Set64(cond,dst));
1606 return dst;
1608 case Iop_1Sto8:
1609 case Iop_1Sto16:
1610 case Iop_1Sto32:
1611 case Iop_1Sto64: {
1612 /* could do better than this, but for now ... */
1613 HReg dst = newVRegI(env);
1614 AMD64CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
1615 addInstr(env, AMD64Instr_Set64(cond,dst));
1616 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 63, dst));
1617 addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1618 return dst;
1620 case Iop_Ctz64: {
1621 /* Count trailing zeroes, implemented by amd64 'bsfq' */
1622 HReg dst = newVRegI(env);
1623 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1624 addInstr(env, AMD64Instr_Bsfr64(True,src,dst));
1625 return dst;
1627 case Iop_Clz64: {
1628 /* Count leading zeroes. Do 'bsrq' to establish the index
1629 of the highest set bit, and subtract that value from
1630 63. */
1631 HReg tmp = newVRegI(env);
1632 HReg dst = newVRegI(env);
1633 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1634 addInstr(env, AMD64Instr_Bsfr64(False,src,tmp));
1635 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
1636 AMD64RMI_Imm(63), dst));
1637 addInstr(env, AMD64Instr_Alu64R(Aalu_SUB,
1638 AMD64RMI_Reg(tmp), dst));
1639 return dst;
1642 case Iop_CmpwNEZ64: {
1643 HReg dst = newVRegI(env);
1644 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1645 addInstr(env, mk_iMOVsd_RR(src,dst));
1646 addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
1647 addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
1648 AMD64RMI_Reg(src), dst));
1649 addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1650 return dst;
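/* (The neg/or pair leaves the sign bit of dst set iff src != 0, and
   the arithmetic right shift by 63 then copies that bit into all 64
   positions, giving 0 or all-ones as CmpwNEZ64 requires.) */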
1653 case Iop_CmpwNEZ32: {
1654 HReg src = newVRegI(env);
1655 HReg dst = newVRegI(env);
1656 HReg pre = iselIntExpr_R(env, e->Iex.Unop.arg);
1657 addInstr(env, mk_iMOVsd_RR(pre,src));
1658 addInstr(env, AMD64Instr_MovxLQ(False, src, src));
1659 addInstr(env, mk_iMOVsd_RR(src,dst));
1660 addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
1661 addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
1662 AMD64RMI_Reg(src), dst));
1663 addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1664 return dst;
1667 case Iop_Left8:
1668 case Iop_Left16:
1669 case Iop_Left32:
1670 case Iop_Left64: {
1671 HReg dst = newVRegI(env);
1672 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1673 addInstr(env, mk_iMOVsd_RR(src, dst));
1674 addInstr(env, AMD64Instr_Unary64(Aun_NEG, dst));
1675 addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(src), dst));
1676 return dst;
1679 case Iop_V128to32: {
1680 HReg dst = newVRegI(env);
1681 HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1682 AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
1683 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, rsp_m16));
1684 addInstr(env, AMD64Instr_LoadEX(4, False/*z-widen*/, rsp_m16, dst));
1685 return dst;
1688 /* V128{HI}to64 */
1689 case Iop_V128to64: {
1690 HReg dst = newVRegI(env);
1691 HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1692 addInstr(env, AMD64Instr_SseMOVQ(dst, vec, False/*!toXMM*/));
1693 return dst;
1695 case Iop_V128HIto64: {
1696 HReg dst = newVRegI(env);
1697 HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1698 HReg vec2 = newVRegV(env);
1699 addInstr(env, mk_vMOVsd_RR(vec, vec2));
1700 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, vec2));
1701 addInstr(env, AMD64Instr_SseMOVQ(dst, vec2, False/*!toXMM*/));
1702 return dst;
1705 /* V256to64_{3,2,1,0} */
1706 case Iop_V256to64_0: case Iop_V256to64_1:
1707 case Iop_V256to64_2: case Iop_V256to64_3: {
1708 HReg vHi, vLo, vec;
1709 iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg);
1710 /* Do the first part of the selection by deciding which of
1711 the 128 bit registers to look at, and the second part using
1712 the same scheme as for V128{HI}to64 above. */
1713 Bool low64of128 = True;
1714 switch (e->Iex.Unop.op) {
1715 case Iop_V256to64_0: vec = vLo; low64of128 = True; break;
1716 case Iop_V256to64_1: vec = vLo; low64of128 = False; break;
1717 case Iop_V256to64_2: vec = vHi; low64of128 = True; break;
1718 case Iop_V256to64_3: vec = vHi; low64of128 = False; break;
1719 default: vassert(0);
1721 HReg dst = newVRegI(env);
1722 if (low64of128) {
1723 addInstr(env, AMD64Instr_SseMOVQ(dst, vec, False/*!toXMM*/));
1724 } else {
1725 HReg vec2 = newVRegV(env);
1726 addInstr(env, mk_vMOVsd_RR(vec, vec2));
1727 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, vec2));
1728 addInstr(env, AMD64Instr_SseMOVQ(dst, vec2, False/*!toXMM*/));
1730 return dst;
1733 /* ReinterpF64asI64(e) */
1734 /* Given an IEEE754 double, produce an I64 with the same bit
1735 pattern. */
1736 case Iop_ReinterpF64asI64: {
1737 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1738 HReg dst = newVRegI(env);
1739 HReg src = iselDblExpr(env, e->Iex.Unop.arg);
1740 /* paranoia */
1741 set_SSE_rounding_default(env);
1742 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, src, m8_rsp));
1743 addInstr(env, AMD64Instr_Alu64R(
1744 Aalu_MOV, AMD64RMI_Mem(m8_rsp), dst));
1745 return dst;
1748 /* ReinterpF32asI32(e) */
1749 /* Given an IEEE754 single, produce an I64 with the same bit
1750 pattern in the lower half. */
1751 case Iop_ReinterpF32asI32: {
1752 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1753 HReg dst = newVRegI(env);
1754 HReg src = iselFltExpr(env, e->Iex.Unop.arg);
1755 /* paranoia */
1756 set_SSE_rounding_default(env);
1757 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, src, m8_rsp));
1758 addInstr(env, AMD64Instr_LoadEX(4, False/*unsigned*/, m8_rsp, dst ));
1759 return dst;
1762 case Iop_16to8:
1763 case Iop_32to8:
1764 case Iop_64to8:
1765 case Iop_32to16:
1766 case Iop_64to16:
1767 case Iop_64to32:
1768 /* These are no-ops. */
1769 return iselIntExpr_R(env, e->Iex.Unop.arg);
1771 case Iop_GetMSBs8x8: {
1772 /* Note: the following assumes the helper is of
1773 signature
1774 UInt fn ( ULong ), and is not a regparm fn. */
1776 HReg dst = newVRegI(env);
1777 HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
1778 HWord fn = (HWord)h_generic_calc_GetMSBs8x8;
1779 addInstr(env, mk_iMOVsd_RR(arg, hregAMD64_RDI()) );
1780 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
1781 1, mk_RetLoc_simple(RLPri_Int) ));
1782 /* MovxLQ is not exactly the right thing here. We just
1783 need to get the bottom 8 bits of RAX into dst, and zero
1784 out everything else. Assuming that the helper returns
1785 a UInt with the top 24 bits zeroed out, it'll do,
1786 though. */
1787 addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1788 return dst;
1791 case Iop_GetMSBs8x16: {
1792 /* Note: the following assumes the helper is of signature
1793 UInt fn ( ULong w64hi, ULong w64Lo ),
1794 and is not a regparm fn. */
1795 HReg dst = newVRegI(env);
1796 HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1797 HReg rsp = hregAMD64_RSP();
1798 HWord fn = (HWord)h_generic_calc_GetMSBs8x16;
1799 AMD64AMode* m8_rsp = AMD64AMode_IR( -8, rsp);
1800 AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
1801 addInstr(env, AMD64Instr_SseLdSt(False/*store*/,
1802 16, vec, m16_rsp));
1803 /* hi 64 bits into RDI -- the first arg */
1804 addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
1805 AMD64RMI_Mem(m8_rsp),
1806 hregAMD64_RDI() )); /* 1st arg */
1807 /* lo 64 bits into RSI -- the 2nd arg */
1808 addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
1809 AMD64RMI_Mem(m16_rsp),
1810 hregAMD64_RSI() )); /* 2nd arg */
1811 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
1812 2, mk_RetLoc_simple(RLPri_Int) ));
1813 /* MovxLQ is not exactly the right thing here. We just
1814 need to get the bottom 16 bits of RAX into dst, and zero
1815 out everything else. Assuming that the helper returns
1816 a UInt with the top 16 bits zeroed out, it'll do,
1817 though. */
1818 addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1819 return dst;
1822 default:
1823 break;
1826 /* Deal with unary 64-bit SIMD ops. */
1827 HWord fn = 0;
1828 switch (e->Iex.Unop.op) {
1829 case Iop_CmpNEZ32x2:
1830 fn = (HWord)h_generic_calc_CmpNEZ32x2; break;
1831 case Iop_CmpNEZ16x4:
1832 fn = (HWord)h_generic_calc_CmpNEZ16x4; break;
1833 case Iop_CmpNEZ8x8:
1834 fn = (HWord)h_generic_calc_CmpNEZ8x8; break;
1835 default:
1836 fn = (HWord)0; break;
1838 if (fn != (HWord)0) {
1839 /* Note: the following assumes all helpers are of
1840 signature
1841 ULong fn ( ULong ), and they are
1842 not marked as regparm functions. */
1844 HReg dst = newVRegI(env);
1845 HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
1846 addInstr(env, mk_iMOVsd_RR(arg, hregAMD64_RDI()) );
1847 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 1,
1848 mk_RetLoc_simple(RLPri_Int) ));
1849 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1850 return dst;
1853 break;
1856 /* --------- GET --------- */
1857 case Iex_Get: {
1858 if (ty == Ity_I64) {
1859 HReg dst = newVRegI(env);
1860 addInstr(env, AMD64Instr_Alu64R(
1861 Aalu_MOV,
1862 AMD64RMI_Mem(
1863 AMD64AMode_IR(e->Iex.Get.offset,
1864 hregAMD64_RBP())),
1865 dst));
1866 return dst;
1868 if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
1869 HReg dst = newVRegI(env);
1870 addInstr(env, AMD64Instr_LoadEX(
1871 toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
1872 False,
1873 AMD64AMode_IR(e->Iex.Get.offset,hregAMD64_RBP()),
1874 dst));
1875 return dst;
1877 break;
1880 case Iex_GetI: {
1881 AMD64AMode* am
1882 = genGuestArrayOffset(
1883 env, e->Iex.GetI.descr,
1884 e->Iex.GetI.ix, e->Iex.GetI.bias );
1885 HReg dst = newVRegI(env);
1886 if (ty == Ity_I8) {
1887 addInstr(env, AMD64Instr_LoadEX( 1, False, am, dst ));
1888 return dst;
1890 if (ty == Ity_I64) {
1891 addInstr(env, AMD64Instr_Alu64R( Aalu_MOV, AMD64RMI_Mem(am), dst ));
1892 return dst;
1894 break;
1897 /* --------- CCALL --------- */
1898 case Iex_CCall: {
1899 HReg dst = newVRegI(env);
1900 vassert(ty == e->Iex.CCall.retty);
1902 /* be very restrictive for now. Only 64-bit ints allowed for
1903 args, and 64 or 32 bits for return type. */
1904 if (e->Iex.CCall.retty != Ity_I64 && e->Iex.CCall.retty != Ity_I32)
1905 goto irreducible;
1907 /* Marshal args, do the call. */
1908 UInt addToSp = 0;
1909 RetLoc rloc = mk_RetLoc_INVALID();
1910 doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
1911 e->Iex.CCall.cee, e->Iex.CCall.retty, e->Iex.CCall.args );
1912 vassert(is_sane_RetLoc(rloc));
1913 vassert(rloc.pri == RLPri_Int);
1914 vassert(addToSp == 0);
1916 /* Move to dst, and zero out the top 32 bits if the result type is
1917 Ity_I32. Probably overkill, but still .. */
1918 if (e->Iex.CCall.retty == Ity_I64)
1919 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1920 else
1921 addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1923 return dst;
1926 /* --------- LITERAL --------- */
1927 /* 64/32/16/8-bit literals */
1928 case Iex_Const:
1929 if (ty == Ity_I64) {
1930 HReg r = newVRegI(env);
1931 addInstr(env, AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, r));
1932 return r;
1933 } else {
1934 AMD64RMI* rmi = iselIntExpr_RMI ( env, e );
1935 HReg r = newVRegI(env);
1936 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, rmi, r));
1937 return r;
1940 /* --------- MULTIPLEX --------- */
1941 case Iex_ITE: { // VFD
1942 if ((ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8)
1943 && typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1) {
1944 HReg r1 = iselIntExpr_R(env, e->Iex.ITE.iftrue);
1945 HReg r0 = iselIntExpr_R(env, e->Iex.ITE.iffalse);
1946 HReg dst = newVRegI(env);
1947 addInstr(env, mk_iMOVsd_RR(r1,dst));
1948 AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
1949 addInstr(env, AMD64Instr_CMov64(cc ^ 1, r0, dst));
1950 return dst;
1952 break;
1955 /* --------- TERNARY OP --------- */
1956 case Iex_Triop: {
1957 IRTriop *triop = e->Iex.Triop.details;
1958 /* C3210 flags following FPU partial remainder (fprem), both
1959 IEEE compliant (PREM1) and non-IEEE compliant (PREM). */
1960 if (triop->op == Iop_PRemC3210F64
1961 || triop->op == Iop_PRem1C3210F64) {
1962 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1963 HReg arg1 = iselDblExpr(env, triop->arg2);
1964 HReg arg2 = iselDblExpr(env, triop->arg3);
1965 HReg dst = newVRegI(env);
1966 addInstr(env, AMD64Instr_A87Free(2));
1968 /* one arg -> top of x87 stack */
1969 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg2, m8_rsp));
1970 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
1972 /* other arg -> top of x87 stack */
1973 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg1, m8_rsp));
1974 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
1976 switch (triop->op) {
1977 case Iop_PRemC3210F64:
1978 addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
1979 break;
1980 case Iop_PRem1C3210F64:
1981 addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
1982 break;
1983 default:
1984 vassert(0);
1986 /* Ignore the result, and instead make off with the FPU's
1987 C3210 flags (in the status word). */
1988 addInstr(env, AMD64Instr_A87StSW(m8_rsp));
1989 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Mem(m8_rsp),dst));
1990 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0x4700),dst));
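            /* 0x4700 keeps just the C3 (bit 14), C2 (bit 10), C1 (bit 9)
               and C0 (bit 8) condition bits of the x87 status word that
               A87StSW stored above. */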
1991 return dst;
1993 break;
1996 default:
1997 break;
1998 } /* switch (e->tag) */
2000 /* We get here if no pattern matched. */
2001 irreducible:
2002 ppIRExpr(e);
2003 vpanic("iselIntExpr_R(amd64): cannot reduce tree");
2007 /*---------------------------------------------------------*/
2008 /*--- ISEL: Integer expression auxiliaries ---*/
2009 /*---------------------------------------------------------*/
2011 /* --------------------- AMODEs --------------------- */
2013 /* Return an AMode which computes the value of the specified
2014 expression, possibly also adding insns to the code list as a
2015 result. The expression may only be a 64-bit one. */
2018 static AMD64AMode* iselIntExpr_AMode ( ISelEnv* env, const IRExpr* e )
2020 AMD64AMode* am = iselIntExpr_AMode_wrk(env, e);
2021 vassert(sane_AMode(am));
2022 return am;
2025 /* DO NOT CALL THIS DIRECTLY ! */
2026 static AMD64AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, const IRExpr* e )
2028 MatchInfo mi;
2029 DECLARE_PATTERN(p_complex);
2030 IRType ty = typeOfIRExpr(env->type_env,e);
2031 vassert(ty == Ity_I64);
2033 /* Add64( Add64(expr1, Shl64(expr2, imm8)), simm32 ) */
2034 /* bind0 bind1 bind2 bind3 */
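   /* For instance, Add64(Add64(t1, Shl64(t2, 3:I8)), 0x40:I64) would match
      with bindings t1, t2, 3, 0x40 and be turned into the addressing mode
      0x40(t1,t2,8), i.e. AMD64AMode_IRRS(0x40, t1, t2, 3). */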
2035 DEFINE_PATTERN(p_complex,
2036 binop( Iop_Add64,
2037 binop( Iop_Add64,
2038 bind(0),
2039 binop(Iop_Shl64, bind(1), bind(2))
2041 bind(3)
2044 if (matchIRExpr(&mi, p_complex, e)) {
2045 const IRExpr* expr1 = mi.bindee[0];
2046 const IRExpr* expr2 = mi.bindee[1];
2047 const IRExpr* imm8 = mi.bindee[2];
2048 const IRExpr* simm32 = mi.bindee[3];
2049 if (imm8->tag == Iex_Const
2050 && imm8->Iex.Const.con->tag == Ico_U8
2051 && imm8->Iex.Const.con->Ico.U8 < 4
2052 /* imm8 is OK, now check simm32 */
2053 && simm32->tag == Iex_Const
2054 && simm32->Iex.Const.con->tag == Ico_U64
2055 && fitsIn32Bits(simm32->Iex.Const.con->Ico.U64)) {
2056 UInt shift = imm8->Iex.Const.con->Ico.U8;
2057 UInt offset = toUInt(simm32->Iex.Const.con->Ico.U64);
2058 HReg r1 = iselIntExpr_R(env, expr1);
2059 HReg r2 = iselIntExpr_R(env, expr2);
2060 vassert(shift == 0 || shift == 1 || shift == 2 || shift == 3);
2061 return AMD64AMode_IRRS(offset, r1, r2, shift);
2065 /* Add64(expr1, Shl64(expr2, imm)) */
2066 if (e->tag == Iex_Binop
2067 && e->Iex.Binop.op == Iop_Add64
2068 && e->Iex.Binop.arg2->tag == Iex_Binop
2069 && e->Iex.Binop.arg2->Iex.Binop.op == Iop_Shl64
2070 && e->Iex.Binop.arg2->Iex.Binop.arg2->tag == Iex_Const
2071 && e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8) {
2072 UInt shift = e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
2073 if (shift == 1 || shift == 2 || shift == 3) {
2074 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2075 HReg r2 = iselIntExpr_R(env, e->Iex.Binop.arg2->Iex.Binop.arg1 );
2076 return AMD64AMode_IRRS(0, r1, r2, shift);
2080 /* Add64(expr,i) */
2081 if (e->tag == Iex_Binop
2082 && e->Iex.Binop.op == Iop_Add64
2083 && e->Iex.Binop.arg2->tag == Iex_Const
2084 && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U64
2085 && fitsIn32Bits(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)) {
2086 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2087 return AMD64AMode_IR(
2088 toUInt(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64),
2093 /* Doesn't match anything in particular. Generate it into
2094 a register and use that. */
2096 HReg r1 = iselIntExpr_R(env, e);
2097 return AMD64AMode_IR(0, r1);
2102 /* --------------------- RMIs --------------------- */
2104 /* Similarly, calculate an expression into an AMD64RMI operand. As with
2105 iselIntExpr_R, the expression can have type 64, 32, 16 or 8 bits. */
2107 static AMD64RMI* iselIntExpr_RMI ( ISelEnv* env, const IRExpr* e )
2109 AMD64RMI* rmi = iselIntExpr_RMI_wrk(env, e);
2110 /* sanity checks ... */
2111 switch (rmi->tag) {
2112 case Armi_Imm:
2113 return rmi;
2114 case Armi_Reg:
2115 vassert(hregClass(rmi->Armi.Reg.reg) == HRcInt64);
2116 vassert(hregIsVirtual(rmi->Armi.Reg.reg));
2117 return rmi;
2118 case Armi_Mem:
2119 vassert(sane_AMode(rmi->Armi.Mem.am));
2120 return rmi;
2121 default:
2122 vpanic("iselIntExpr_RMI: unknown amd64 RMI tag");
2126 /* DO NOT CALL THIS DIRECTLY ! */
2127 static AMD64RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, const IRExpr* e )
2129 IRType ty = typeOfIRExpr(env->type_env,e);
2130 vassert(ty == Ity_I64 || ty == Ity_I32
2131 || ty == Ity_I16 || ty == Ity_I8);
2133 /* special case: immediate 64/32/16/8 */
2134 if (e->tag == Iex_Const) {
2135 switch (e->Iex.Const.con->tag) {
2136 case Ico_U64:
2137 if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
2138 return AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
2140 break;
2141 case Ico_U32:
2142 return AMD64RMI_Imm(e->Iex.Const.con->Ico.U32); break;
2143 case Ico_U16:
2144 return AMD64RMI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16); break;
2145 case Ico_U8:
2146 return AMD64RMI_Imm(0xFF & e->Iex.Const.con->Ico.U8); break;
2147 default:
2148 vpanic("iselIntExpr_RMI.Iex_Const(amd64)");
2152 /* special case: 64-bit GET */
2153 if (e->tag == Iex_Get && ty == Ity_I64) {
2154 return AMD64RMI_Mem(AMD64AMode_IR(e->Iex.Get.offset,
2155 hregAMD64_RBP()));
2158 /* special case: 64-bit load from memory */
2159 if (e->tag == Iex_Load && ty == Ity_I64
2160 && e->Iex.Load.end == Iend_LE) {
2161 AMD64AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2162 return AMD64RMI_Mem(am);
2165 /* default case: calculate into a register and return that */
2167 HReg r = iselIntExpr_R ( env, e );
2168 return AMD64RMI_Reg(r);
2173 /* --------------------- RIs --------------------- */
2175 /* Calculate an expression into an AMD64RI operand. As with
2176 iselIntExpr_R, the expression can have type 64, 32, 16 or 8
2177 bits. */
2179 static AMD64RI* iselIntExpr_RI ( ISelEnv* env, const IRExpr* e )
2181 AMD64RI* ri = iselIntExpr_RI_wrk(env, e);
2182 /* sanity checks ... */
2183 switch (ri->tag) {
2184 case Ari_Imm:
2185 return ri;
2186 case Ari_Reg:
2187 vassert(hregClass(ri->Ari.Reg.reg) == HRcInt64);
2188 vassert(hregIsVirtual(ri->Ari.Reg.reg));
2189 return ri;
2190 default:
2191 vpanic("iselIntExpr_RI: unknown amd64 RI tag");
2195 /* DO NOT CALL THIS DIRECTLY ! */
2196 static AMD64RI* iselIntExpr_RI_wrk ( ISelEnv* env, const IRExpr* e )
2198 IRType ty = typeOfIRExpr(env->type_env,e);
2199 vassert(ty == Ity_I64 || ty == Ity_I32
2200 || ty == Ity_I16 || ty == Ity_I8);
2202 /* special case: immediate */
2203 if (e->tag == Iex_Const) {
2204 switch (e->Iex.Const.con->tag) {
2205 case Ico_U64:
2206 if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
2207 return AMD64RI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
2209 break;
2210 case Ico_U32:
2211 return AMD64RI_Imm(e->Iex.Const.con->Ico.U32);
2212 case Ico_U16:
2213 return AMD64RI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16);
2214 case Ico_U8:
2215 return AMD64RI_Imm(0xFF & e->Iex.Const.con->Ico.U8);
2216 default:
2217 vpanic("iselIntExpr_RI.Iex_Const(amd64)");
2221 /* default case: calculate into a register and return that */
2223 HReg r = iselIntExpr_R ( env, e );
2224 return AMD64RI_Reg(r);
2229 /* --------------------- RMs --------------------- */
2231 /* Similarly, calculate an expression into an AMD64RM operand. As
2232 with iselIntExpr_R, the expression can have type 64, 32, 16 or 8
2233 bits. */
2235 static AMD64RM* iselIntExpr_RM ( ISelEnv* env, const IRExpr* e )
2237 AMD64RM* rm = iselIntExpr_RM_wrk(env, e);
2238 /* sanity checks ... */
2239 switch (rm->tag) {
2240 case Arm_Reg:
2241 vassert(hregClass(rm->Arm.Reg.reg) == HRcInt64);
2242 vassert(hregIsVirtual(rm->Arm.Reg.reg));
2243 return rm;
2244 case Arm_Mem:
2245 vassert(sane_AMode(rm->Arm.Mem.am));
2246 return rm;
2247 default:
2248 vpanic("iselIntExpr_RM: unknown amd64 RM tag");
2252 /* DO NOT CALL THIS DIRECTLY ! */
2253 static AMD64RM* iselIntExpr_RM_wrk ( ISelEnv* env, const IRExpr* e )
2255 IRType ty = typeOfIRExpr(env->type_env,e);
2256 vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
2258 /* special case: 64-bit GET */
2259 if (e->tag == Iex_Get && ty == Ity_I64) {
2260 return AMD64RM_Mem(AMD64AMode_IR(e->Iex.Get.offset,
2261 hregAMD64_RBP()));
2264 /* special case: load from memory */
2266 /* default case: calculate into a register and return that */
2268 HReg r = iselIntExpr_R ( env, e );
2269 return AMD64RM_Reg(r);
2274 /* --------------------- CONDCODE --------------------- */
2276 /* Generate code to evaluate a bit-typed expression, returning the
2277 condition code which holds exactly when the expression would
2278 notionally have returned 1. */
2280 static AMD64CondCode iselCondCode ( ISelEnv* env, const IRExpr* e )
2282 /* Uh, there's nothing we can sanity check here, unfortunately. */
2283 return iselCondCode_wrk(env,e);
2286 /* DO NOT CALL THIS DIRECTLY ! */
2287 static AMD64CondCode iselCondCode_wrk ( ISelEnv* env, const IRExpr* e )
2289 vassert(e);
2290 vassert(typeOfIRExpr(env->type_env,e) == Ity_I1);
2292 /* var */
2293 if (e->tag == Iex_RdTmp) {
2294 HReg r64 = lookupIRTemp(env, e->Iex.RdTmp.tmp);
2295 HReg dst = newVRegI(env);
2296 addInstr(env, mk_iMOVsd_RR(r64,dst));
2297 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(1),dst));
2298 return Acc_NZ;
2301 /* Constant 1:Bit */
2302 if (e->tag == Iex_Const) {
2303 HReg r;
2304 vassert(e->Iex.Const.con->tag == Ico_U1);
2305 vassert(e->Iex.Const.con->Ico.U1 == True
2306 || e->Iex.Const.con->Ico.U1 == False);
2307 r = newVRegI(env);
2308 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Imm(0),r));
2309 addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,AMD64RMI_Reg(r),r));
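      /* xor-ing r with itself forces ZF to 1, so Acc_Z is a condition
         that is now always true and Acc_NZ one that is always false;
         hence the choice of return value below. */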
2310 return e->Iex.Const.con->Ico.U1 ? Acc_Z : Acc_NZ;
2313 /* Not1(...) */
2314 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_Not1) {
2315 /* Generate code for the arg, and negate the test condition */
2316 return 1 ^ iselCondCode(env, e->Iex.Unop.arg);
2319 /* --- patterns rooted at: 64to1 --- */
2321 /* 64to1 */
2322 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_64to1) {
2323 HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
2324 addInstr(env, AMD64Instr_Test64(1,reg));
2325 return Acc_NZ;
2328 /* --- patterns rooted at: 32to1 --- */
2330 /* 32to1 */
2331 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_32to1) {
2332 HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
2333 addInstr(env, AMD64Instr_Test64(1,reg));
2334 return Acc_NZ;
2337 /* --- patterns rooted at: CmpNEZ8 --- */
2339 /* CmpNEZ8(x) */
2340 if (e->tag == Iex_Unop
2341 && e->Iex.Unop.op == Iop_CmpNEZ8) {
2342 HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
2343 addInstr(env, AMD64Instr_Test64(0xFF,r));
2344 return Acc_NZ;
2347 /* --- patterns rooted at: CmpNEZ16 --- */
2349 /* CmpNEZ16(x) */
2350 if (e->tag == Iex_Unop
2351 && e->Iex.Unop.op == Iop_CmpNEZ16) {
2352 HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
2353 addInstr(env, AMD64Instr_Test64(0xFFFF,r));
2354 return Acc_NZ;
2357 /* --- patterns rooted at: CmpNEZ32 --- */
2359 if (e->tag == Iex_Unop
2360 && e->Iex.Unop.op == Iop_CmpNEZ32) {
2361 IRExpr* arg = e->Iex.Unop.arg;
2362 if (arg->tag == Iex_Binop
2363 && (arg->Iex.Binop.op == Iop_Or32
2364 || arg->Iex.Binop.op == Iop_And32)) {
2365 /* CmpNEZ32(Or32(x,y)) */
2366 /* CmpNEZ32(And32(x,y)) */
2367 HReg r0 = iselIntExpr_R(env, arg->Iex.Binop.arg1);
2368 AMD64RMI* rmi1 = iselIntExpr_RMI(env, arg->Iex.Binop.arg2);
2369 HReg tmp = newVRegI(env);
2370 addInstr(env, mk_iMOVsd_RR(r0, tmp));
2371 addInstr(env, AMD64Instr_Alu32R(
2372 arg->Iex.Binop.op == Iop_Or32 ? Aalu_OR : Aalu_AND,
2373 rmi1, tmp));
2374 return Acc_NZ;
2376 /* CmpNEZ32(x) */
2377 HReg r1 = iselIntExpr_R(env, arg);
2378 AMD64RMI* rmi2 = AMD64RMI_Imm(0);
2379 addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1));
2380 return Acc_NZ;
2383 /* --- patterns rooted at: CmpNEZ64 --- */
2385 if (e->tag == Iex_Unop
2386 && e->Iex.Unop.op == Iop_CmpNEZ64) {
2387 IRExpr* arg = e->Iex.Unop.arg;
2388 if (arg->tag == Iex_Binop
2389 && (arg->Iex.Binop.op == Iop_Or64
2390 || arg->Iex.Binop.op == Iop_And64)) {
2391 /* CmpNEZ64(Or64(x,y)) */
2392 /* CmpNEZ64(And64(x,y)) */
2393 HReg r0 = iselIntExpr_R(env, arg->Iex.Binop.arg1);
2394 AMD64RMI* rmi1 = iselIntExpr_RMI(env, arg->Iex.Binop.arg2);
2395 HReg tmp = newVRegI(env);
2396 addInstr(env, mk_iMOVsd_RR(r0, tmp));
2397 addInstr(env, AMD64Instr_Alu64R(
2398 arg->Iex.Binop.op == Iop_Or64 ? Aalu_OR : Aalu_AND,
2399 rmi1, tmp));
2400 return Acc_NZ;
2402 /* CmpNEZ64(x) */
2403 HReg r1 = iselIntExpr_R(env, arg);
2404 AMD64RMI* rmi2 = AMD64RMI_Imm(0);
2405 addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
2406 return Acc_NZ;
2409 /* --- patterns rooted at: Cmp{EQ,NE}{8,16,32} --- */
2411 /* CmpEQ8 / CmpNE8 */
2412 if (e->tag == Iex_Binop
2413 && (e->Iex.Binop.op == Iop_CmpEQ8
2414 || e->Iex.Binop.op == Iop_CmpNE8
2415 || e->Iex.Binop.op == Iop_CasCmpEQ8
2416 || e->Iex.Binop.op == Iop_CasCmpNE8)) {
2417 if (isZeroU8(e->Iex.Binop.arg2)) {
2418 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2419 addInstr(env, AMD64Instr_Test64(0xFF,r1));
2420 switch (e->Iex.Binop.op) {
2421 case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Acc_Z;
2422 case Iop_CmpNE8: case Iop_CasCmpNE8: return Acc_NZ;
2423 default: vpanic("iselCondCode(amd64): CmpXX8(expr,0:I8)");
2425 } else {
2426 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2427 AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2428 HReg r = newVRegI(env);
2429 addInstr(env, mk_iMOVsd_RR(r1,r));
2430 addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
2431 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFF),r));
2432 switch (e->Iex.Binop.op) {
2433 case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Acc_Z;
2434 case Iop_CmpNE8: case Iop_CasCmpNE8: return Acc_NZ;
2435 default: vpanic("iselCondCode(amd64): CmpXX8(expr,expr)");
2440 /* CmpEQ16 / CmpNE16 */
2441 if (e->tag == Iex_Binop
2442 && (e->Iex.Binop.op == Iop_CmpEQ16
2443 || e->Iex.Binop.op == Iop_CmpNE16
2444 || e->Iex.Binop.op == Iop_CasCmpEQ16
2445 || e->Iex.Binop.op == Iop_CasCmpNE16)) {
2446 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2447 AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2448 HReg r = newVRegI(env);
2449 addInstr(env, mk_iMOVsd_RR(r1,r));
2450 addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
2451 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFFFF),r));
2452 switch (e->Iex.Binop.op) {
2453 case Iop_CmpEQ16: case Iop_CasCmpEQ16: return Acc_Z;
2454 case Iop_CmpNE16: case Iop_CasCmpNE16: return Acc_NZ;
2455 default: vpanic("iselCondCode(amd64): CmpXX16");
2459 /* CmpNE64(ccall, 64-bit constant) (--smc-check=all optimisation).
2460 Saves a "movq %rax, %tmp" compared to the default route. */
2461 if (e->tag == Iex_Binop
2462 && e->Iex.Binop.op == Iop_CmpNE64
2463 && e->Iex.Binop.arg1->tag == Iex_CCall
2464 && e->Iex.Binop.arg2->tag == Iex_Const) {
2465 IRExpr* cal = e->Iex.Binop.arg1;
2466 IRExpr* con = e->Iex.Binop.arg2;
2467 HReg tmp = newVRegI(env);
2468 /* clone & partial-eval of generic Iex_CCall and Iex_Const cases */
2469 vassert(cal->Iex.CCall.retty == Ity_I64); /* else ill-typed IR */
2470 vassert(con->Iex.Const.con->tag == Ico_U64);
2471 /* Marshal args, do the call. */
2472 UInt addToSp = 0;
2473 RetLoc rloc = mk_RetLoc_INVALID();
2474 doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
2475 cal->Iex.CCall.cee,
2476 cal->Iex.CCall.retty, cal->Iex.CCall.args );
2477 vassert(is_sane_RetLoc(rloc));
2478 vassert(rloc.pri == RLPri_Int);
2479 vassert(addToSp == 0);
2480 /* */
2481 addInstr(env, AMD64Instr_Imm64(con->Iex.Const.con->Ico.U64, tmp));
2482 addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,
2483 AMD64RMI_Reg(hregAMD64_RAX()), tmp));
2484 return Acc_NZ;
2487 /* Cmp*64*(x,y) */
2488 if (e->tag == Iex_Binop
2489 && (e->Iex.Binop.op == Iop_CmpEQ64
2490 || e->Iex.Binop.op == Iop_CmpNE64
2491 || e->Iex.Binop.op == Iop_CmpLT64S
2492 || e->Iex.Binop.op == Iop_CmpLT64U
2493 || e->Iex.Binop.op == Iop_CmpLE64S
2494 || e->Iex.Binop.op == Iop_CmpLE64U
2495 || e->Iex.Binop.op == Iop_CasCmpEQ64
2496 || e->Iex.Binop.op == Iop_CasCmpNE64
2497 || e->Iex.Binop.op == Iop_ExpCmpNE64)) {
2498 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2499 AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2500 addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
2501 switch (e->Iex.Binop.op) {
2502 case Iop_CmpEQ64: case Iop_CasCmpEQ64: return Acc_Z;
2503 case Iop_CmpNE64:
2504 case Iop_CasCmpNE64: case Iop_ExpCmpNE64: return Acc_NZ;
2505 case Iop_CmpLT64S: return Acc_L;
2506 case Iop_CmpLT64U: return Acc_B;
2507 case Iop_CmpLE64S: return Acc_LE;
2508 case Iop_CmpLE64U: return Acc_BE;
2509 default: vpanic("iselCondCode(amd64): CmpXX64");
2513 /* Cmp*32*(x,y) */
2514 if (e->tag == Iex_Binop
2515 && (e->Iex.Binop.op == Iop_CmpEQ32
2516 || e->Iex.Binop.op == Iop_CmpNE32
2517 || e->Iex.Binop.op == Iop_CmpLT32S
2518 || e->Iex.Binop.op == Iop_CmpLT32U
2519 || e->Iex.Binop.op == Iop_CmpLE32S
2520 || e->Iex.Binop.op == Iop_CmpLE32U
2521 || e->Iex.Binop.op == Iop_CasCmpEQ32
2522 || e->Iex.Binop.op == Iop_CasCmpNE32
2523 || e->Iex.Binop.op == Iop_ExpCmpNE32)) {
2524 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2525 AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2526 addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1));
2527 switch (e->Iex.Binop.op) {
2528 case Iop_CmpEQ32: case Iop_CasCmpEQ32: return Acc_Z;
2529 case Iop_CmpNE32:
2530 case Iop_CasCmpNE32: case Iop_ExpCmpNE32: return Acc_NZ;
2531 case Iop_CmpLT32S: return Acc_L;
2532 case Iop_CmpLT32U: return Acc_B;
2533 case Iop_CmpLE32S: return Acc_LE;
2534 case Iop_CmpLE32U: return Acc_BE;
2535 default: vpanic("iselCondCode(amd64): CmpXX32");
2539 ppIRExpr(e);
2540 vpanic("iselCondCode(amd64)");
2544 /*---------------------------------------------------------*/
2545 /*--- ISEL: Integer expressions (128 bit) ---*/
2546 /*---------------------------------------------------------*/
2548 /* Compute a 128-bit value into a register pair, which is returned as
2549 the first two parameters. As with iselIntExpr_R, these may be
2550 either real or virtual regs; in any case they must not be changed
2551 by subsequent code emitted by the caller. */
2553 static void iselInt128Expr ( HReg* rHi, HReg* rLo,
2554 ISelEnv* env, const IRExpr* e )
2556 iselInt128Expr_wrk(rHi, rLo, env, e);
2557 # if 0
2558 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2559 # endif
2560 vassert(hregClass(*rHi) == HRcInt64);
2561 vassert(hregIsVirtual(*rHi));
2562 vassert(hregClass(*rLo) == HRcInt64);
2563 vassert(hregIsVirtual(*rLo));
2566 /* DO NOT CALL THIS DIRECTLY ! */
2567 static void iselInt128Expr_wrk ( HReg* rHi, HReg* rLo,
2568 ISelEnv* env, const IRExpr* e )
2570 vassert(e);
2571 vassert(typeOfIRExpr(env->type_env,e) == Ity_I128);
2573 /* read 128-bit IRTemp */
2574 if (e->tag == Iex_RdTmp) {
2575 lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);
2576 return;
2579 /* --------- BINARY ops --------- */
2580 if (e->tag == Iex_Binop) {
2581 switch (e->Iex.Binop.op) {
2582 /* 64 x 64 -> 128 multiply */
2583 case Iop_MullU64:
2584 case Iop_MullS64: {
2585 /* get one operand into %rax, and the other into a R/M.
2586 Need to make an educated guess about which is better in
2587 which. */
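            /* The sequence below is, in effect,
                  movq    rRight, %rax
                  [i]mulq rmLeft      -- 128-bit product in %rdx:%rax
               after which the two halves are copied out to fresh vregs. */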
2588 HReg tLo = newVRegI(env);
2589 HReg tHi = newVRegI(env);
2590 Bool syned = toBool(e->Iex.Binop.op == Iop_MullS64);
2591 AMD64RM* rmLeft = iselIntExpr_RM(env, e->Iex.Binop.arg1);
2592 HReg rRight = iselIntExpr_R(env, e->Iex.Binop.arg2);
2593 addInstr(env, mk_iMOVsd_RR(rRight, hregAMD64_RAX()));
2594 addInstr(env, AMD64Instr_MulL(syned, rmLeft));
2595 /* Result is now in RDX:RAX. Tell the caller. */
2596 addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi));
2597 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo));
2598 *rHi = tHi;
2599 *rLo = tLo;
2600 return;
2603 /* 128 x 64 -> (64(rem),64(div)) division */
2604 case Iop_DivModU128to64:
2605 case Iop_DivModS128to64: {
2606 /* Get the 128-bit operand into rdx:rax, and the other into
2607 any old R/M. */
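            /* [i]divq leaves the quotient in %rax and the remainder in
               %rdx, so tHi:tLo ends up as rem:div, matching the
               (64(rem),64(div)) result layout noted above. */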
2608 HReg sHi, sLo;
2609 HReg tLo = newVRegI(env);
2610 HReg tHi = newVRegI(env);
2611 Bool syned = toBool(e->Iex.Binop.op == Iop_DivModS128to64);
2612 AMD64RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
2613 iselInt128Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
2614 addInstr(env, mk_iMOVsd_RR(sHi, hregAMD64_RDX()));
2615 addInstr(env, mk_iMOVsd_RR(sLo, hregAMD64_RAX()));
2616 addInstr(env, AMD64Instr_Div(syned, 8, rmRight));
2617 addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi));
2618 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo));
2619 *rHi = tHi;
2620 *rLo = tLo;
2621 return;
2624 /* 64HLto128(e1,e2) */
2625 case Iop_64HLto128:
2626 *rHi = iselIntExpr_R(env, e->Iex.Binop.arg1);
2627 *rLo = iselIntExpr_R(env, e->Iex.Binop.arg2);
2628 return;
2630 default:
2631 break;
2633 } /* if (e->tag == Iex_Binop) */
2635 ppIRExpr(e);
2636 vpanic("iselInt128Expr");
2640 /*---------------------------------------------------------*/
2641 /*--- ISEL: Floating point expressions (32 bit) ---*/
2642 /*---------------------------------------------------------*/
2644 /* Nothing interesting here; really just wrappers for
2645 64-bit stuff. */
2647 static HReg iselFltExpr ( ISelEnv* env, const IRExpr* e )
2649 HReg r = iselFltExpr_wrk( env, e );
2650 # if 0
2651 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2652 # endif
2653 vassert(hregClass(r) == HRcVec128);
2654 vassert(hregIsVirtual(r));
2655 return r;
2658 /* DO NOT CALL THIS DIRECTLY */
2659 static HReg iselFltExpr_wrk ( ISelEnv* env, const IRExpr* e )
2661 IRType ty = typeOfIRExpr(env->type_env,e);
2662 vassert(ty == Ity_F32);
2664 if (e->tag == Iex_RdTmp) {
2665 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2668 if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
2669 AMD64AMode* am;
2670 HReg res = newVRegV(env);
2671 vassert(e->Iex.Load.ty == Ity_F32);
2672 am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2673 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, res, am));
2674 return res;
2677 if (e->tag == Iex_Binop
2678 && e->Iex.Binop.op == Iop_F64toF32) {
2679 /* Although the result is still held in a standard SSE register,
2680 we need to round it to reflect the loss of accuracy/range
2681 entailed in casting it to a 32-bit float. */
2682 HReg dst = newVRegV(env);
2683 HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
2684 set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
2685 addInstr(env, AMD64Instr_SseSDSS(True/*D->S*/,src,dst));
2686 set_SSE_rounding_default( env );
2687 return dst;
2690 if (e->tag == Iex_Get) {
2691 AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset,
2692 hregAMD64_RBP() );
2693 HReg res = newVRegV(env);
2694 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, res, am ));
2695 return res;
2698 if (e->tag == Iex_Unop
2699 && e->Iex.Unop.op == Iop_ReinterpI32asF32) {
2700 /* Given an I32, produce an IEEE754 float with the same bit
2701 pattern. */
2702 HReg dst = newVRegV(env);
2703 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
2704 AMD64AMode* m4_rsp = AMD64AMode_IR(-4, hregAMD64_RSP());
2705 addInstr(env, AMD64Instr_Store(4, src, m4_rsp));
2706 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, dst, m4_rsp ));
2707 return dst;
2710 if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF32toInt) {
2711 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
2712 HReg arg = iselFltExpr(env, e->Iex.Binop.arg2);
2713 HReg dst = newVRegV(env);
2715 /* arg now holds the value to be rounded. The first thing to do
2716 is set the FPU's rounding mode accordingly. */
2718 /* Set host x87 rounding mode */
2719 set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
2721 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, arg, m8_rsp));
2722 addInstr(env, AMD64Instr_A87Free(1));
2723 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 4));
2724 addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND));
2725 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 4));
2726 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, dst, m8_rsp));
2728 /* Restore default x87 rounding. */
2729 set_FPU_rounding_default( env );
2731 return dst;
2734 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_NegF32) {
2735 /* Sigh ... very rough code. Could do much better. */
2736 /* Get the 128-bit literal 00---0 10---0 into a register
2737 and xor it with the value to be negated. */
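      /* Concretely: the two pushes build, at 0(%rsp), a 16-byte constant
         whose only set bit is bit 31; loading it and xor-ing it with the
         operand flips just the sign bit of the low F32 lane. */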
2738 HReg r1 = newVRegI(env);
2739 HReg dst = newVRegV(env);
2740 HReg tmp = newVRegV(env);
2741 HReg src = iselFltExpr(env, e->Iex.Unop.arg);
2742 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
2743 addInstr(env, mk_vMOVsd_RR(src,tmp));
2744 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
2745 addInstr(env, AMD64Instr_Imm64( 1ULL<<31, r1 ));
2746 addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(r1)));
2747 addInstr(env, AMD64Instr_SseLdSt(True, 16, dst, rsp0));
2748 addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, dst));
2749 add_to_rsp(env, 16);
2750 return dst;
2753 if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_MAddF32) {
2754 IRQop *qop = e->Iex.Qop.details;
2755 HReg dst = newVRegV(env);
2756 HReg argX = iselFltExpr(env, qop->arg2);
2757 HReg argY = iselFltExpr(env, qop->arg3);
2758 HReg argZ = iselFltExpr(env, qop->arg4);
2759 /* XXXROUNDINGFIXME */
2760 /* set roundingmode here */
2761 /* subq $16, %rsp -- make a space*/
2762 sub_from_rsp(env, 16);
2763 /* Prepare 4 arg regs:
2764 leaq 0(%rsp), %rdi
2765 leaq 4(%rsp), %rsi
2766 leaq 8(%rsp), %rdx
2767 leaq 12(%rsp), %rcx */
2769 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, hregAMD64_RSP()),
2770 hregAMD64_RDI()));
2771 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(4, hregAMD64_RSP()),
2772 hregAMD64_RSI()));
2773 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(8, hregAMD64_RSP()),
2774 hregAMD64_RDX()));
2775 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(12, hregAMD64_RSP()),
2776 hregAMD64_RCX()));
2777 /* Store the three args, at (%rsi), (%rdx) and (%rcx):
2778 movss %argX, 0(%rsi)
2779 movss %argY, 0(%rdx)
2780 movss %argZ, 0(%rcx) */
2782 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argX,
2783 AMD64AMode_IR(0, hregAMD64_RSI())));
2784 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argY,
2785 AMD64AMode_IR(0, hregAMD64_RDX())));
2786 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argZ,
2787 AMD64AMode_IR(0, hregAMD64_RCX())));
2788 /* call the helper */
2789 addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
2790 (ULong)(HWord)h_generic_calc_MAddF32,
2791 4, mk_RetLoc_simple(RLPri_None) ));
2792 /* fetch the result back from memory: the helper wrote it through
2793 its first argument, which points at 0(%rsp). */
2794 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 4, dst,
2795 AMD64AMode_IR(0, hregAMD64_RSP())));
2796 /* and finally, clear the space */
2797 add_to_rsp(env, 16);
2798 return dst;
2801 ppIRExpr(e);
2802 vpanic("iselFltExpr_wrk");
2806 /*---------------------------------------------------------*/
2807 /*--- ISEL: Floating point expressions (64 bit) ---*/
2808 /*---------------------------------------------------------*/
2810 /* Compute a 64-bit floating point value into the lower half of an xmm
2811 register, the identity of which is returned. As with
2812 iselIntExpr_R, the returned reg will be virtual, and it must not be
2813 changed by subsequent code emitted by the caller. */
2816 /* IEEE 754 formats. From http://www.freesoft.org/CIE/RFC/1832/32.htm:
2818 Type S (1 bit) E (11 bits) F (52 bits)
2819 ---- --------- ----------- -----------
2820 signalling NaN u 2047 (max) .0uuuuu---u
2821 (with at least
2822 one 1 bit)
2823 quiet NaN u 2047 (max) .1uuuuu---u
2825 negative infinity 1 2047 (max) .000000---0
2827 positive infinity 0 2047 (max) .000000---0
2829 negative zero 1 0 .000000---0
2831 positive zero 0 0 .000000---0 */
2834 static HReg iselDblExpr ( ISelEnv* env, const IRExpr* e )
2836 HReg r = iselDblExpr_wrk( env, e );
2837 # if 0
2838 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2839 # endif
2840 vassert(hregClass(r) == HRcVec128);
2841 vassert(hregIsVirtual(r));
2842 return r;
2845 /* DO NOT CALL THIS DIRECTLY */
2846 static HReg iselDblExpr_wrk ( ISelEnv* env, const IRExpr* e )
2848 IRType ty = typeOfIRExpr(env->type_env,e);
2849 vassert(e);
2850 vassert(ty == Ity_F64);
2852 if (e->tag == Iex_RdTmp) {
2853 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2856 if (e->tag == Iex_Const) {
2857 union { ULong u64; Double f64; } u;
2858 HReg res = newVRegV(env);
2859 HReg tmp = newVRegI(env);
2860 vassert(sizeof(u) == 8);
2861 vassert(sizeof(u.u64) == 8);
2862 vassert(sizeof(u.f64) == 8);
2864 if (e->Iex.Const.con->tag == Ico_F64) {
2865 u.f64 = e->Iex.Const.con->Ico.F64;
2867 else if (e->Iex.Const.con->tag == Ico_F64i) {
2868 u.u64 = e->Iex.Const.con->Ico.F64i;
2870 else
2871 vpanic("iselDblExpr(amd64): const");
2873 addInstr(env, AMD64Instr_Imm64(u.u64, tmp));
2874 addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(tmp)));
2875 addInstr(env, AMD64Instr_SseLdSt(
2876 True/*load*/, 8, res,
2877 AMD64AMode_IR(0, hregAMD64_RSP())
2879 add_to_rsp(env, 8);
2880 return res;
2883 if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
2884 AMD64AMode* am;
2885 HReg res = newVRegV(env);
2886 vassert(e->Iex.Load.ty == Ity_F64);
2887 am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2888 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2889 return res;
2892 if (e->tag == Iex_Get) {
2893 AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset,
2894 hregAMD64_RBP() );
2895 HReg res = newVRegV(env);
2896 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2897 return res;
2900 if (e->tag == Iex_GetI) {
2901 AMD64AMode* am
2902 = genGuestArrayOffset(
2903 env, e->Iex.GetI.descr,
2904 e->Iex.GetI.ix, e->Iex.GetI.bias );
2905 HReg res = newVRegV(env);
2906 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2907 return res;
2910 if (e->tag == Iex_Triop) {
2911 IRTriop *triop = e->Iex.Triop.details;
2912 AMD64SseOp op = Asse_INVALID;
2913 switch (triop->op) {
2914 case Iop_AddF64: op = Asse_ADDF; break;
2915 case Iop_SubF64: op = Asse_SUBF; break;
2916 case Iop_MulF64: op = Asse_MULF; break;
2917 case Iop_DivF64: op = Asse_DIVF; break;
2918 default: break;
2920 if (op != Asse_INVALID) {
2921 HReg dst = newVRegV(env);
2922 HReg argL = iselDblExpr(env, triop->arg2);
2923 HReg argR = iselDblExpr(env, triop->arg3);
2924 addInstr(env, mk_vMOVsd_RR(argL, dst));
2925 /* XXXROUNDINGFIXME */
2926 /* set roundingmode here */
2927 addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
2928 return dst;
2932 if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_MAddF64) {
2933 IRQop *qop = e->Iex.Qop.details;
2934 HReg dst = newVRegV(env);
2935 HReg argX = iselDblExpr(env, qop->arg2);
2936 HReg argY = iselDblExpr(env, qop->arg3);
2937 HReg argZ = iselDblExpr(env, qop->arg4);
2938 /* XXXROUNDINGFIXME */
2939 /* set roundingmode here */
2940 /* subq $32, %rsp -- make a space*/
2941 sub_from_rsp(env, 32);
2942 /* Prepare 4 arg regs:
2943 leaq 0(%rsp), %rdi
2944 leaq 8(%rsp), %rsi
2945 leaq 16(%rsp), %rdx
2946 leaq 24(%rsp), %rcx */
2948 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, hregAMD64_RSP()),
2949 hregAMD64_RDI()));
2950 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(8, hregAMD64_RSP()),
2951 hregAMD64_RSI()));
2952 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, hregAMD64_RSP()),
2953 hregAMD64_RDX()));
2954 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(24, hregAMD64_RSP()),
2955 hregAMD64_RCX()));
2956 /* Store the three args, at (%rsi), (%rdx) and (%rcx):
2957 movsd %argX, 0(%rsi)
2958 movsd %argY, 0(%rdx)
2959 movsd %argZ, 0(%rcx) */
2961 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argX,
2962 AMD64AMode_IR(0, hregAMD64_RSI())));
2963 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argY,
2964 AMD64AMode_IR(0, hregAMD64_RDX())));
2965 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argZ,
2966 AMD64AMode_IR(0, hregAMD64_RCX())));
2967 /* call the helper */
2968 addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
2969 (ULong)(HWord)h_generic_calc_MAddF64,
2970 4, mk_RetLoc_simple(RLPri_None) ));
2971 /* fetch the result back from memory: the helper wrote it through
2972 its first argument, which points at 0(%rsp). */
2973 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 8, dst,
2974 AMD64AMode_IR(0, hregAMD64_RSP())));
2975 /* and finally, clear the space */
2976 add_to_rsp(env, 32);
2977 return dst;
2980 if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF64toInt) {
2981 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
2982 HReg arg = iselDblExpr(env, e->Iex.Binop.arg2);
2983 HReg dst = newVRegV(env);
2985 /* arg now holds the value to be rounded. The first thing to do
2986 is set the FPU's rounding mode accordingly. */
2988 /* Set host x87 rounding mode */
2989 set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
2991 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
2992 addInstr(env, AMD64Instr_A87Free(1));
2993 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
2994 addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND));
2995 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
2996 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
2998 /* Restore default x87 rounding. */
2999 set_FPU_rounding_default( env );
3001 return dst;
3004 IRTriop *triop = e->Iex.Triop.details;
3005 if (e->tag == Iex_Triop
3006 && (triop->op == Iop_ScaleF64
3007 || triop->op == Iop_AtanF64
3008 || triop->op == Iop_Yl2xF64
3009 || triop->op == Iop_Yl2xp1F64
3010 || triop->op == Iop_PRemF64
3011 || triop->op == Iop_PRem1F64)
3013 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3014 HReg arg1 = iselDblExpr(env, triop->arg2);
3015 HReg arg2 = iselDblExpr(env, triop->arg3);
3016 HReg dst = newVRegV(env);
3017 Bool arg2first = toBool(triop->op == Iop_ScaleF64
3018 || triop->op == Iop_PRemF64
3019 || triop->op == Iop_PRem1F64);
3020 addInstr(env, AMD64Instr_A87Free(2));
3022 /* one arg -> top of x87 stack */
3023 addInstr(env, AMD64Instr_SseLdSt(
3024 False/*store*/, 8, arg2first ? arg2 : arg1, m8_rsp));
3025 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3027 /* other arg -> top of x87 stack */
3028 addInstr(env, AMD64Instr_SseLdSt(
3029 False/*store*/, 8, arg2first ? arg1 : arg2, m8_rsp));
3030 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3032 /* do it */
3033 /* XXXROUNDINGFIXME */
3034 /* set roundingmode here */
3035 switch (triop->op) {
3036 case Iop_ScaleF64:
3037 addInstr(env, AMD64Instr_A87FpOp(Afp_SCALE));
3038 break;
3039 case Iop_AtanF64:
3040 addInstr(env, AMD64Instr_A87FpOp(Afp_ATAN));
3041 break;
3042 case Iop_Yl2xF64:
3043 addInstr(env, AMD64Instr_A87FpOp(Afp_YL2X));
3044 break;
3045 case Iop_Yl2xp1F64:
3046 addInstr(env, AMD64Instr_A87FpOp(Afp_YL2XP1));
3047 break;
3048 case Iop_PRemF64:
3049 addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
3050 break;
3051 case Iop_PRem1F64:
3052 addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
3053 break;
3054 default:
3055 vassert(0);
3058 /* save result */
3059 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
3060 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3061 return dst;
3064 if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_I64StoF64) {
3065 HReg dst = newVRegV(env);
3066 HReg src = iselIntExpr_R(env, e->Iex.Binop.arg2);
3067 set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
3068 addInstr(env, AMD64Instr_SseSI2SF( 8, 8, src, dst ));
3069 set_SSE_rounding_default( env );
3070 return dst;
3073 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_I32StoF64) {
3074 HReg dst = newVRegV(env);
3075 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
3076 set_SSE_rounding_default( env );
3077 addInstr(env, AMD64Instr_SseSI2SF( 4, 8, src, dst ));
3078 return dst;
3081 if (e->tag == Iex_Unop
3082 && (e->Iex.Unop.op == Iop_NegF64
3083 || e->Iex.Unop.op == Iop_AbsF64)) {
3084 /* Sigh ... very rough code. Could do much better. */
3085 /* Get the 128-bit literal 00---0 10---0 into a register
3086 and xor/nand it with the value to be negated. */
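      /* The pushes build a 16-byte constant whose only set bit is bit 63,
         so the XOR flips the sign bit of the low F64 lane (negation),
         while the ANDN (~mask & value) clears it (absolute value). */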
3087 HReg r1 = newVRegI(env);
3088 HReg dst = newVRegV(env);
3089 HReg tmp = newVRegV(env);
3090 HReg src = iselDblExpr(env, e->Iex.Unop.arg);
3091 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3092 addInstr(env, mk_vMOVsd_RR(src,tmp));
3093 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
3094 addInstr(env, AMD64Instr_Imm64( 1ULL<<63, r1 ));
3095 addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(r1)));
3096 addInstr(env, AMD64Instr_SseLdSt(True, 16, dst, rsp0));
3098 if (e->Iex.Unop.op == Iop_NegF64)
3099 addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, dst));
3100 else
3101 addInstr(env, AMD64Instr_SseReRg(Asse_ANDN, tmp, dst));
3103 add_to_rsp(env, 16);
3104 return dst;
3107 if (e->tag == Iex_Binop) {
3108 A87FpOp fpop = Afp_INVALID;
3109 switch (e->Iex.Binop.op) {
3110 case Iop_SqrtF64: fpop = Afp_SQRT; break;
3111 case Iop_SinF64: fpop = Afp_SIN; break;
3112 case Iop_CosF64: fpop = Afp_COS; break;
3113 case Iop_TanF64: fpop = Afp_TAN; break;
3114 case Iop_2xm1F64: fpop = Afp_2XM1; break;
3115 default: break;
3117 if (fpop != Afp_INVALID) {
3118 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3119 HReg arg = iselDblExpr(env, e->Iex.Binop.arg2);
3120 HReg dst = newVRegV(env);
3121 Int nNeeded = e->Iex.Binop.op==Iop_TanF64 ? 2 : 1;
3122 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
3123 addInstr(env, AMD64Instr_A87Free(nNeeded));
3124 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3125 /* XXXROUNDINGFIXME */
3126 /* set roundingmode here */
3127 /* Note that AMD64Instr_A87FpOp(Afp_TAN) sets the condition
3128 codes. I don't think that matters, since this insn
3129 selector never generates such an instruction intervening
3130 between an flag-setting instruction and a flag-using
3131 instruction. */
3132 addInstr(env, AMD64Instr_A87FpOp(fpop));
3133 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
3134 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3135 return dst;
3139 if (e->tag == Iex_Unop) {
3140 switch (e->Iex.Unop.op) {
3141 //.. case Iop_I32toF64: {
3142 //.. HReg dst = newVRegF(env);
3143 //.. HReg ri = iselIntExpr_R(env, e->Iex.Unop.arg);
3144 //.. addInstr(env, X86Instr_Push(X86RMI_Reg(ri)));
3145 //.. set_FPU_rounding_default(env);
3146 //.. addInstr(env, X86Instr_FpLdStI(
3147 //.. True/*load*/, 4, dst,
3148 //.. X86AMode_IR(0, hregX86_ESP())));
3149 //.. add_to_esp(env, 4);
3150 //.. return dst;
3151 //.. }
3152 case Iop_ReinterpI64asF64: {
3153 /* Given an I64, produce an IEEE754 double with the same
3154 bit pattern. */
3155 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3156 HReg dst = newVRegV(env);
3157 AMD64RI* src = iselIntExpr_RI(env, e->Iex.Unop.arg);
3158 /* paranoia */
3159 set_SSE_rounding_default(env);
3160 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, src, m8_rsp));
3161 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3162 return dst;
3164 case Iop_F32toF64: {
3165 HReg f32;
3166 HReg f64 = newVRegV(env);
3167 /* this shouldn't be necessary, but be paranoid ... */
3168 set_SSE_rounding_default(env);
3169 f32 = iselFltExpr(env, e->Iex.Unop.arg);
3170 addInstr(env, AMD64Instr_SseSDSS(False/*S->D*/, f32, f64));
3171 return f64;
3173 default:
3174 break;
3178 /* --------- MULTIPLEX --------- */
3179 if (e->tag == Iex_ITE) { // VFD
3180 HReg r1, r0, dst;
3181 vassert(ty == Ity_F64);
3182 vassert(typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1);
3183 r1 = iselDblExpr(env, e->Iex.ITE.iftrue);
3184 r0 = iselDblExpr(env, e->Iex.ITE.iffalse);
3185 dst = newVRegV(env);
3186 addInstr(env, mk_vMOVsd_RR(r1,dst));
3187 AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
3188 addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0, dst));
3189 return dst;
3192 ppIRExpr(e);
3193 vpanic("iselDblExpr_wrk");
3197 /*---------------------------------------------------------*/
3198 /*--- ISEL: SIMD (Vector) expressions, 128 bit. ---*/
3199 /*---------------------------------------------------------*/
3201 static HReg iselVecExpr ( ISelEnv* env, const IRExpr* e )
3203 HReg r = iselVecExpr_wrk( env, e );
3204 # if 0
3205 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
3206 # endif
3207 vassert(hregClass(r) == HRcVec128);
3208 vassert(hregIsVirtual(r));
3209 return r;
3213 /* DO NOT CALL THIS DIRECTLY */
3214 static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e )
3216 HWord fn = 0; /* address of helper fn, if required */
3217 Bool arg1isEReg = False;
3218 AMD64SseOp op = Asse_INVALID;
3219 vassert(e);
3220 IRType ty = typeOfIRExpr(env->type_env, e);
3221 vassert(ty == Ity_V128);
3222 UInt laneBits = 0;
3224 if (e->tag == Iex_RdTmp) {
3225 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
3228 if (e->tag == Iex_Get) {
3229 HReg dst = newVRegV(env);
3230 addInstr(env, AMD64Instr_SseLdSt(
3231 True/*load*/,
3233 dst,
3234 AMD64AMode_IR(e->Iex.Get.offset, hregAMD64_RBP())
3237 return dst;
3240 if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
3241 HReg dst = newVRegV(env);
3242 AMD64AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
3243 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, am ));
3244 return dst;
3247 if (e->tag == Iex_Const) {
3248 HReg dst = newVRegV(env);
3249 vassert(e->Iex.Const.con->tag == Ico_V128);
3250 switch (e->Iex.Const.con->Ico.V128) {
3251 case 0x0000:
3252 dst = generate_zeroes_V128(env);
3253 break;
3254 case 0xFFFF:
3255 dst = generate_ones_V128(env);
3256 break;
3257 default: {
3258 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3259 /* do push_uimm64 twice, first time for the high-order half. */
3260 push_uimm64(env, bitmask8_to_bytemask64(
3261 (e->Iex.Const.con->Ico.V128 >> 8) & 0xFF
3263 push_uimm64(env, bitmask8_to_bytemask64(
3264 (e->Iex.Const.con->Ico.V128 >> 0) & 0xFF
3266 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, rsp0 ));
3267 add_to_rsp(env, 16);
3268 break;
3271 return dst;
3274 if (e->tag == Iex_Unop) {
3275 switch (e->Iex.Unop.op) {
3277 case Iop_NotV128: {
3278 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3279 return do_sse_NotV128(env, arg);
3282 case Iop_CmpNEZ64x2: {
3283 /* We can use SSE2 instructions for this. */
3284 /* Ideally, we want to do a 64Ix2 comparison against zero of
3285 the operand. Problem is no such insn exists. Solution
3286 therefore is to do a 32Ix4 comparison instead, and bitwise-
3287 negate (NOT) the result. Let a,b,c,d be 32-bit lanes, and
3288 let the not'd result of this initial comparison be a:b:c:d.
3289 What we need to compute is (a|b):(a|b):(c|d):(c|d). So, use
3290 pshufd to create a value b:a:d:c, and OR that with a:b:c:d,
3291 giving the required result.
3293 The required selection sequence is 2,3,0,1, which
3294 according to Intel's documentation means the pshufd
3295 literal value is 0xB1, that is,
3296 (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0) */
3298 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3299 HReg tmp = generate_zeroes_V128(env);
3300 HReg dst = newVRegV(env);
3301 addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, arg, tmp));
3302 tmp = do_sse_NotV128(env, tmp);
3303 addInstr(env, AMD64Instr_SseShuf(0xB1, tmp, dst));
3304 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
3305 return dst;
3308 case Iop_CmpNEZ32x4: op = Asse_CMPEQ32; goto do_CmpNEZ_vector;
3309 case Iop_CmpNEZ16x8: op = Asse_CMPEQ16; goto do_CmpNEZ_vector;
3310 case Iop_CmpNEZ8x16: op = Asse_CMPEQ8; goto do_CmpNEZ_vector;
3311 do_CmpNEZ_vector:
3313 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3314 HReg tmp = newVRegV(env);
3315 HReg zero = generate_zeroes_V128(env);
3316 HReg dst;
3317 addInstr(env, mk_vMOVsd_RR(arg, tmp));
3318 addInstr(env, AMD64Instr_SseReRg(op, zero, tmp));
3319 dst = do_sse_NotV128(env, tmp);
3320 return dst;
3323 case Iop_RecipEst32Fx4: op = Asse_RCPF; goto do_32Fx4_unary;
3324 case Iop_RSqrtEst32Fx4: op = Asse_RSQRTF; goto do_32Fx4_unary;
3325 do_32Fx4_unary:
3327 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3328 HReg dst = newVRegV(env);
3329 addInstr(env, AMD64Instr_Sse32Fx4(op, arg, dst));
3330 return dst;
3333 case Iop_RecipEst32F0x4: op = Asse_RCPF; goto do_32F0x4_unary;
3334 case Iop_RSqrtEst32F0x4: op = Asse_RSQRTF; goto do_32F0x4_unary;
3335 case Iop_Sqrt32F0x4: op = Asse_SQRTF; goto do_32F0x4_unary;
3336 do_32F0x4_unary:
3338 /* A bit subtle. We have to copy the arg to the result
3339 register first, because actually doing the SSE scalar insn
3340 leaves the upper 3/4 of the destination register
3341 unchanged. Whereas the required semantics of these
3342 primops is that the upper 3/4 is simply copied in from the
3343 argument. */
3344 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3345 HReg dst = newVRegV(env);
3346 addInstr(env, mk_vMOVsd_RR(arg, dst));
3347 addInstr(env, AMD64Instr_Sse32FLo(op, arg, dst));
3348 return dst;
3351 case Iop_Sqrt64F0x2: op = Asse_SQRTF; goto do_64F0x2_unary;
3352 do_64F0x2_unary:
3354 /* A bit subtle. We have to copy the arg to the result
3355 register first, because actually doing the SSE scalar insn
3356 leaves the upper half of the destination register
3357 unchanged. Whereas the required semantics of these
3358 primops is that the upper half is simply copied in from the
3359 argument. */
3360 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3361 HReg dst = newVRegV(env);
3362 addInstr(env, mk_vMOVsd_RR(arg, dst));
3363 addInstr(env, AMD64Instr_Sse64FLo(op, arg, dst));
3364 return dst;
3367 case Iop_32UtoV128: {
3368 // FIXME maybe just use MOVQ here?
3369 HReg dst = newVRegV(env);
3370 AMD64AMode* rsp_m32 = AMD64AMode_IR(-32, hregAMD64_RSP());
3371 AMD64RI* ri = iselIntExpr_RI(env, e->Iex.Unop.arg);
3372 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, ri, rsp_m32));
3373 addInstr(env, AMD64Instr_SseLdzLO(4, dst, rsp_m32));
3374 return dst;
3377 case Iop_64UtoV128: {
3378 // FIXME maybe just use MOVQ here?
3379 HReg dst = newVRegV(env);
3380 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3381 AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Unop.arg);
3382 addInstr(env, AMD64Instr_Push(rmi));
3383 addInstr(env, AMD64Instr_SseLdzLO(8, dst, rsp0));
3384 add_to_rsp(env, 8);
3385 return dst;
3388 case Iop_V256toV128_0:
3389 case Iop_V256toV128_1: {
3390 HReg vHi, vLo;
3391 iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg);
3392 return (e->Iex.Unop.op == Iop_V256toV128_1) ? vHi : vLo;
3395 case Iop_F16toF32x4: {
3396 if (env->hwcaps & VEX_HWCAPS_AMD64_F16C) {
3397 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
3398 HReg dst = newVRegV(env);
3399 addInstr(env, AMD64Instr_SseMOVQ(src, dst, /*toXMM=*/True));
3400 addInstr(env, AMD64Instr_Sse32Fx4(Asse_F16toF32, dst, dst));
3401 return dst;
3403 break;
3406 default:
3407 break;
3408 } /* switch (e->Iex.Unop.op) */
3409 } /* if (e->tag == Iex_Unop) */
3411 if (e->tag == Iex_Binop) {
3412 switch (e->Iex.Binop.op) {
3414 case Iop_Sqrt64Fx2:
3415 case Iop_Sqrt32Fx4: {
3416 /* :: (rmode, vec) -> vec */
3417 HReg arg = iselVecExpr(env, e->Iex.Binop.arg2);
3418 HReg dst = newVRegV(env);
3419 /* XXXROUNDINGFIXME */
3420 /* set roundingmode here */
3421 addInstr(env, (e->Iex.Binop.op == Iop_Sqrt64Fx2
3422 ? AMD64Instr_Sse64Fx2 : AMD64Instr_Sse32Fx4)
3423 (Asse_SQRTF, arg, dst));
3424 return dst;
3427 /* FIXME: could we generate MOVQ here? */
3428 case Iop_SetV128lo64: {
3429 HReg dst = newVRegV(env);
3430 HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
3431 HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
3432 AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
3433 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16));
3434 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, AMD64RI_Reg(srcI), rsp_m16));
3435 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16));
3436 return dst;
3439 /* FIXME: could we generate MOVD here? */
3440 case Iop_SetV128lo32: {
3441 HReg dst = newVRegV(env);
3442 HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
3443 HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
3444 AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
3445 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16));
3446 addInstr(env, AMD64Instr_Store(4, srcI, rsp_m16));
3447 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16));
3448 return dst;
3451 case Iop_64HLtoV128: {
3452 const IRExpr* arg1 = e->Iex.Binop.arg1;
3453 const IRExpr* arg2 = e->Iex.Binop.arg2;
3454 HReg dst = newVRegV(env);
3455 HReg tmp = newVRegV(env);
3456 HReg qHi = iselIntExpr_R(env, arg1);
3457             // If the args are trivially the same (tmp or const), use the same
3458             // source register for both halves, and emit only one movq, since
3459             // movq is (relatively) expensive.
3460 if (areAtomsAndEqual(arg1, arg2)) {
3461 addInstr(env, AMD64Instr_SseMOVQ(qHi, dst, True/*toXMM*/));
3462 addInstr(env, mk_vMOVsd_RR(dst, tmp));
3463 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst));
3464 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
3465 } else {
3466 HReg qLo = iselIntExpr_R(env, arg2);
3467 addInstr(env, AMD64Instr_SseMOVQ(qHi, dst, True/*toXMM*/));
3468 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst));
3469 addInstr(env, AMD64Instr_SseMOVQ(qLo, tmp, True/*toXMM*/));
3470 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
3472 return dst;
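         /* The general (unequal-args) path above builds dst roughly as:
               movq qHi -> dst             dst = 0   : qHi
               shift dst left by 64 bits   dst = qHi : 0
               movq qLo -> tmp             tmp = 0   : qLo
               OR   tmp into dst           dst = qHi : qLo
            (Asse_SHL128 shifts the whole 128-bit value; the actual
            encodings are chosen in host_amd64_defs.c.) */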
3475 case Iop_CmpEQ32Fx4: op = Asse_CMPEQF; goto do_32Fx4;
3476 case Iop_CmpLT32Fx4: op = Asse_CMPLTF; goto do_32Fx4;
3477 case Iop_CmpLE32Fx4: op = Asse_CMPLEF; goto do_32Fx4;
3478 case Iop_CmpUN32Fx4: op = Asse_CMPUNF; goto do_32Fx4;
3479 case Iop_Max32Fx4: op = Asse_MAXF; goto do_32Fx4;
3480 case Iop_Min32Fx4: op = Asse_MINF; goto do_32Fx4;
3481 do_32Fx4:
3483 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3484 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3485 HReg dst = newVRegV(env);
3486 addInstr(env, mk_vMOVsd_RR(argL, dst));
3487 addInstr(env, AMD64Instr_Sse32Fx4(op, argR, dst));
3488 return dst;
3491 case Iop_CmpEQ64Fx2: op = Asse_CMPEQF; goto do_64Fx2;
3492 case Iop_CmpLT64Fx2: op = Asse_CMPLTF; goto do_64Fx2;
3493 case Iop_CmpLE64Fx2: op = Asse_CMPLEF; goto do_64Fx2;
3494 case Iop_CmpUN64Fx2: op = Asse_CMPUNF; goto do_64Fx2;
3495 case Iop_Max64Fx2: op = Asse_MAXF; goto do_64Fx2;
3496 case Iop_Min64Fx2: op = Asse_MINF; goto do_64Fx2;
3497 do_64Fx2:
3499 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3500 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3501 HReg dst = newVRegV(env);
3502 addInstr(env, mk_vMOVsd_RR(argL, dst));
3503 addInstr(env, AMD64Instr_Sse64Fx2(op, argR, dst));
3504 return dst;
3507 case Iop_CmpEQ32F0x4: op = Asse_CMPEQF; goto do_32F0x4;
3508 case Iop_CmpLT32F0x4: op = Asse_CMPLTF; goto do_32F0x4;
3509 case Iop_CmpLE32F0x4: op = Asse_CMPLEF; goto do_32F0x4;
3510 case Iop_CmpUN32F0x4: op = Asse_CMPUNF; goto do_32F0x4;
3511 case Iop_Add32F0x4: op = Asse_ADDF; goto do_32F0x4;
3512 case Iop_Div32F0x4: op = Asse_DIVF; goto do_32F0x4;
3513 case Iop_Max32F0x4: op = Asse_MAXF; goto do_32F0x4;
3514 case Iop_Min32F0x4: op = Asse_MINF; goto do_32F0x4;
3515 case Iop_Mul32F0x4: op = Asse_MULF; goto do_32F0x4;
3516 case Iop_Sub32F0x4: op = Asse_SUBF; goto do_32F0x4;
3517 do_32F0x4: {
3518 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3519 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3520 HReg dst = newVRegV(env);
3521 addInstr(env, mk_vMOVsd_RR(argL, dst));
3522 addInstr(env, AMD64Instr_Sse32FLo(op, argR, dst));
3523 return dst;
3526 case Iop_CmpEQ64F0x2: op = Asse_CMPEQF; goto do_64F0x2;
3527 case Iop_CmpLT64F0x2: op = Asse_CMPLTF; goto do_64F0x2;
3528 case Iop_CmpLE64F0x2: op = Asse_CMPLEF; goto do_64F0x2;
3529 case Iop_CmpUN64F0x2: op = Asse_CMPUNF; goto do_64F0x2;
3530 case Iop_Add64F0x2: op = Asse_ADDF; goto do_64F0x2;
3531 case Iop_Div64F0x2: op = Asse_DIVF; goto do_64F0x2;
3532 case Iop_Max64F0x2: op = Asse_MAXF; goto do_64F0x2;
3533 case Iop_Min64F0x2: op = Asse_MINF; goto do_64F0x2;
3534 case Iop_Mul64F0x2: op = Asse_MULF; goto do_64F0x2;
3535 case Iop_Sub64F0x2: op = Asse_SUBF; goto do_64F0x2;
3536 do_64F0x2: {
3537 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3538 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3539 HReg dst = newVRegV(env);
3540 addInstr(env, mk_vMOVsd_RR(argL, dst));
3541 addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
3542 return dst;
3545 case Iop_PermOrZero8x16:
3546 if (env->hwcaps & VEX_HWCAPS_AMD64_SSSE3) {
3547 op = Asse_PSHUFB;
3548 goto do_SseReRg;
3550 // Otherwise we'll have to generate a call to
3551 // h_generic_calc_PermOrZero8x16 (ATK). But that would only be for a
3552 // host which doesn't have SSSE3, in which case we don't expect this
3553 // IROp to enter the compilation pipeline in the first place.
3554 break;
3556 case Iop_PwExtUSMulQAdd8x16:
3557 if (env->hwcaps & VEX_HWCAPS_AMD64_SSSE3) {
3558 op = Asse_PMADDUBSW;
3559 goto do_SseReRg;
3561 break;
3563 case Iop_QNarrowBin32Sto16Sx8:
3564 op = Asse_PACKSSD; arg1isEReg = True; goto do_SseReRg;
3565 case Iop_QNarrowBin16Sto8Sx16:
3566 op = Asse_PACKSSW; arg1isEReg = True; goto do_SseReRg;
3567 case Iop_QNarrowBin16Sto8Ux16:
3568 op = Asse_PACKUSW; arg1isEReg = True; goto do_SseReRg;
3570 case Iop_InterleaveHI8x16:
3571 op = Asse_UNPCKHB; arg1isEReg = True; goto do_SseReRg;
3572 case Iop_InterleaveHI16x8:
3573 op = Asse_UNPCKHW; arg1isEReg = True; goto do_SseReRg;
3574 case Iop_InterleaveHI32x4:
3575 op = Asse_UNPCKHD; arg1isEReg = True; goto do_SseReRg;
3576 case Iop_InterleaveHI64x2:
3577 op = Asse_UNPCKHQ; arg1isEReg = True; goto do_SseReRg;
3579 case Iop_InterleaveLO8x16:
3580 op = Asse_UNPCKLB; arg1isEReg = True; goto do_SseReRg;
3581 case Iop_InterleaveLO16x8:
3582 op = Asse_UNPCKLW; arg1isEReg = True; goto do_SseReRg;
3583 case Iop_InterleaveLO32x4:
3584 op = Asse_UNPCKLD; arg1isEReg = True; goto do_SseReRg;
3585 case Iop_InterleaveLO64x2:
3586 op = Asse_UNPCKLQ; arg1isEReg = True; goto do_SseReRg;
3588 case Iop_AndV128: op = Asse_AND; goto do_SseReRg;
3589 case Iop_OrV128: op = Asse_OR; goto do_SseReRg;
3590 case Iop_XorV128: op = Asse_XOR; goto do_SseReRg;
3591 case Iop_Add8x16: op = Asse_ADD8; goto do_SseReRg;
3592 case Iop_Add16x8: op = Asse_ADD16; goto do_SseReRg;
3593 case Iop_Add32x4: op = Asse_ADD32; goto do_SseReRg;
3594 case Iop_Add64x2: op = Asse_ADD64; goto do_SseReRg;
3595 case Iop_QAdd8Sx16: op = Asse_QADD8S; goto do_SseReRg;
3596 case Iop_QAdd16Sx8: op = Asse_QADD16S; goto do_SseReRg;
3597 case Iop_QAdd8Ux16: op = Asse_QADD8U; goto do_SseReRg;
3598 case Iop_QAdd16Ux8: op = Asse_QADD16U; goto do_SseReRg;
3599 case Iop_Avg8Ux16: op = Asse_AVG8U; goto do_SseReRg;
3600 case Iop_Avg16Ux8: op = Asse_AVG16U; goto do_SseReRg;
3601 case Iop_CmpEQ8x16: op = Asse_CMPEQ8; goto do_SseReRg;
3602 case Iop_CmpEQ16x8: op = Asse_CMPEQ16; goto do_SseReRg;
3603 case Iop_CmpEQ32x4: op = Asse_CMPEQ32; goto do_SseReRg;
3604 case Iop_CmpGT8Sx16: op = Asse_CMPGT8S; goto do_SseReRg;
3605 case Iop_CmpGT16Sx8: op = Asse_CMPGT16S; goto do_SseReRg;
3606 case Iop_CmpGT32Sx4: op = Asse_CMPGT32S; goto do_SseReRg;
3607 case Iop_Max16Sx8: op = Asse_MAX16S; goto do_SseReRg;
3608 case Iop_Max8Ux16: op = Asse_MAX8U; goto do_SseReRg;
3609 case Iop_Min16Sx8: op = Asse_MIN16S; goto do_SseReRg;
3610 case Iop_Min8Ux16: op = Asse_MIN8U; goto do_SseReRg;
3611 case Iop_MulHi16Ux8: op = Asse_MULHI16U; goto do_SseReRg;
3612 case Iop_MulHi16Sx8: op = Asse_MULHI16S; goto do_SseReRg;
3613 case Iop_Mul16x8: op = Asse_MUL16; goto do_SseReRg;
3614 case Iop_Sub8x16: op = Asse_SUB8; goto do_SseReRg;
3615 case Iop_Sub16x8: op = Asse_SUB16; goto do_SseReRg;
3616 case Iop_Sub32x4: op = Asse_SUB32; goto do_SseReRg;
3617 case Iop_Sub64x2: op = Asse_SUB64; goto do_SseReRg;
3618 case Iop_QSub8Sx16: op = Asse_QSUB8S; goto do_SseReRg;
3619 case Iop_QSub16Sx8: op = Asse_QSUB16S; goto do_SseReRg;
3620 case Iop_QSub8Ux16: op = Asse_QSUB8U; goto do_SseReRg;
3621 case Iop_QSub16Ux8: op = Asse_QSUB16U; goto do_SseReRg;
3622 do_SseReRg: {
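         /* These SSE instructions compute dst = dst `op` src.  For the
            non-commutative cases above (the pack and unpack ops),
            arg1isEReg is set so that arg1 is routed to the src (E)
            position while arg2 seeds dst; for the commutative ones the
            natural arg1-into-dst order is used. */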
3623 HReg arg1 = iselVecExpr(env, e->Iex.Binop.arg1);
3624 HReg arg2 = iselVecExpr(env, e->Iex.Binop.arg2);
3625 HReg dst = newVRegV(env);
3626 if (arg1isEReg) {
3627 addInstr(env, mk_vMOVsd_RR(arg2, dst));
3628 addInstr(env, AMD64Instr_SseReRg(op, arg1, dst));
3629 } else {
3630 addInstr(env, mk_vMOVsd_RR(arg1, dst));
3631 addInstr(env, AMD64Instr_SseReRg(op, arg2, dst));
3633 return dst;
3636 case Iop_ShlN16x8: laneBits = 16; op = Asse_SHL16; goto do_SseShift;
3637 case Iop_ShlN32x4: laneBits = 32; op = Asse_SHL32; goto do_SseShift;
3638 case Iop_ShlN64x2: laneBits = 64; op = Asse_SHL64; goto do_SseShift;
3639 case Iop_SarN16x8: laneBits = 16; op = Asse_SAR16; goto do_SseShift;
3640 case Iop_SarN32x4: laneBits = 32; op = Asse_SAR32; goto do_SseShift;
3641 case Iop_ShrN16x8: laneBits = 16; op = Asse_SHR16; goto do_SseShift;
3642 case Iop_ShrN32x4: laneBits = 32; op = Asse_SHR32; goto do_SseShift;
3643 case Iop_ShrN64x2: laneBits = 64; op = Asse_SHR64; goto do_SseShift;
3644 do_SseShift: {
3645 HReg dst = newVRegV(env);
3646 HReg greg = iselVecExpr(env, e->Iex.Binop.arg1);
3647 /* If it's a shift by an in-range immediate, generate a single
3648 instruction. */
3649 if (e->Iex.Binop.arg2->tag == Iex_Const) {
3650 IRConst* c = e->Iex.Binop.arg2->Iex.Const.con;
3651 vassert(c->tag == Ico_U8);
3652 UInt shift = c->Ico.U8;
3653 if (shift < laneBits) {
3654 addInstr(env, mk_vMOVsd_RR(greg, dst));
3655 addInstr(env, AMD64Instr_SseShiftN(op, shift, dst));
3656 return dst;
3659 /* Otherwise we have to do it the longwinded way. */
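         /* (The vector shift instructions take a variable count from the
            low 64 bits of an XMM register, so the count is pushed onto
            the stack with a zero qword above it and the resulting 16
            bytes are loaded into ereg.) */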
3660 AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
3661 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3662 HReg ereg = newVRegV(env);
3663 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
3664 addInstr(env, AMD64Instr_Push(rmi));
3665 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
3666 addInstr(env, mk_vMOVsd_RR(greg, dst));
3667 addInstr(env, AMD64Instr_SseReRg(op, ereg, dst));
3668 add_to_rsp(env, 16);
3669 return dst;
3672 case Iop_Mul32x4: fn = (HWord)h_generic_calc_Mul32x4;
3673 goto do_SseAssistedBinary;
3674 case Iop_Max32Sx4: fn = (HWord)h_generic_calc_Max32Sx4;
3675 goto do_SseAssistedBinary;
3676 case Iop_Min32Sx4: fn = (HWord)h_generic_calc_Min32Sx4;
3677 goto do_SseAssistedBinary;
3678 case Iop_Max32Ux4: fn = (HWord)h_generic_calc_Max32Ux4;
3679 goto do_SseAssistedBinary;
3680 case Iop_Min32Ux4: fn = (HWord)h_generic_calc_Min32Ux4;
3681 goto do_SseAssistedBinary;
3682 case Iop_Max16Ux8: fn = (HWord)h_generic_calc_Max16Ux8;
3683 goto do_SseAssistedBinary;
3684 case Iop_Min16Ux8: fn = (HWord)h_generic_calc_Min16Ux8;
3685 goto do_SseAssistedBinary;
3686 case Iop_Max8Sx16: fn = (HWord)h_generic_calc_Max8Sx16;
3687 goto do_SseAssistedBinary;
3688 case Iop_Min8Sx16: fn = (HWord)h_generic_calc_Min8Sx16;
3689 goto do_SseAssistedBinary;
3690 case Iop_CmpEQ64x2: fn = (HWord)h_generic_calc_CmpEQ64x2;
3691 goto do_SseAssistedBinary;
3692 case Iop_CmpGT64Sx2: fn = (HWord)h_generic_calc_CmpGT64Sx2;
3693 goto do_SseAssistedBinary;
3694 case Iop_Perm32x4: fn = (HWord)h_generic_calc_Perm32x4;
3695 goto do_SseAssistedBinary;
3696 case Iop_QNarrowBin32Sto16Ux8:
3697 fn = (HWord)h_generic_calc_QNarrowBin32Sto16Ux8;
3698 goto do_SseAssistedBinary;
3699 case Iop_NarrowBin16to8x16:
3700 fn = (HWord)h_generic_calc_NarrowBin16to8x16;
3701 goto do_SseAssistedBinary;
3702 case Iop_NarrowBin32to16x8:
3703 fn = (HWord)h_generic_calc_NarrowBin32to16x8;
3704 goto do_SseAssistedBinary;
3705 do_SseAssistedBinary: {
3706 /* RRRufff! RRRufff code is what we're generating here. Oh
3707 well. */
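         /* A sketch of the scheme, assuming the helpers are declared
            along the lines of
               void fn ( V128* res, V128* argL, V128* argR );
            (see host_generic_simd128.h): a 16-aligned area is carved out
            of the 112 bytes grabbed below, with the result at 0(%r_argp),
            argL at 16(%r_argp) and argR at 32(%r_argp). */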
3708 vassert(fn != 0);
3709 HReg dst = newVRegV(env);
3710 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3711 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3712 HReg argp = newVRegI(env);
3713          /* subq $112, %rsp -- make a space */
3714 sub_from_rsp(env, 112);
3715 /* leaq 48(%rsp), %r_argp -- point into it */
3716 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
3717 argp));
3718 /* andq $-16, %r_argp -- 16-align the pointer */
3719 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
3720 AMD64RMI_Imm( ~(UInt)15 ),
3721 argp));
3722 /* Prepare 3 arg regs:
3723 leaq 0(%r_argp), %rdi
3724 leaq 16(%r_argp), %rsi
3725                leaq 32(%r_argp), %rdx
3726          */
3727 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
3728 hregAMD64_RDI()));
3729 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
3730 hregAMD64_RSI()));
3731 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
3732 hregAMD64_RDX()));
3733 /* Store the two args, at (%rsi) and (%rdx):
3734 movupd %argL, 0(%rsi)
3735                movupd %argR, 0(%rdx)
3736          */
3737 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
3738 AMD64AMode_IR(0, hregAMD64_RSI())));
3739 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argR,
3740 AMD64AMode_IR(0, hregAMD64_RDX())));
3741 /* call the helper */
3742 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
3743 3, mk_RetLoc_simple(RLPri_None) ));
3744 /* fetch the result from memory, using %r_argp, which the
3745 register allocator will keep alive across the call. */
3746 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
3747 AMD64AMode_IR(0, argp)));
3748 /* and finally, clear the space */
3749 add_to_rsp(env, 112);
3750 return dst;
3753 case Iop_SarN64x2: fn = (HWord)h_generic_calc_SarN64x2;
3754 goto do_SseAssistedVectorAndScalar;
3755 case Iop_SarN8x16: fn = (HWord)h_generic_calc_SarN8x16;
3756 goto do_SseAssistedVectorAndScalar;
3757 do_SseAssistedVectorAndScalar: {
3758 /* RRRufff! RRRufff code is what we're generating here. Oh
3759 well. */
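         /* Same layout trick as do_SseAssistedBinary above, except only
            one vector argument goes via memory (at 16(%r_argp)); the
            scalar second argument is passed directly in %rdx. */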
3760 vassert(fn != 0);
3761 HReg dst = newVRegV(env);
3762 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3763 HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
3764 HReg argp = newVRegI(env);
3765          /* subq $112, %rsp -- make a space */
3766 sub_from_rsp(env, 112);
3767 /* leaq 48(%rsp), %r_argp -- point into it */
3768 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
3769 argp));
3770 /* andq $-16, %r_argp -- 16-align the pointer */
3771 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
3772 AMD64RMI_Imm( ~(UInt)15 ),
3773 argp));
3774 /* Prepare 2 vector arg regs:
3775 leaq 0(%r_argp), %rdi
3776                leaq 16(%r_argp), %rsi
3777          */
3778 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
3779 hregAMD64_RDI()));
3780 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
3781 hregAMD64_RSI()));
3782 /* Store the vector arg, at (%rsi):
3783                movupd %argL, 0(%rsi)
3784          */
3785 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
3786 AMD64AMode_IR(0, hregAMD64_RSI())));
3787 /* And get the scalar value into rdx */
3788 addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RDX()));
3790 /* call the helper */
3791 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
3792 3, mk_RetLoc_simple(RLPri_None) ));
3793 /* fetch the result from memory, using %r_argp, which the
3794 register allocator will keep alive across the call. */
3795 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
3796 AMD64AMode_IR(0, argp)));
3797 /* and finally, clear the space */
3798 add_to_rsp(env, 112);
3799 return dst;
3802 case Iop_I32StoF32x4:
3803 case Iop_F32toI32Sx4: {
3804 HReg arg = iselVecExpr(env, e->Iex.Binop.arg2);
3805 HReg dst = newVRegV(env);
3806 AMD64SseOp mop
3807 = e->Iex.Binop.op == Iop_I32StoF32x4 ? Asse_I2F : Asse_F2I;
3808 set_SSE_rounding_mode(env, e->Iex.Binop.arg1);
3809 addInstr(env, AMD64Instr_Sse32Fx4(mop, arg, dst));
3810 set_SSE_rounding_default(env);
3811 return dst;
3814 // Half-float vector conversion
3815 case Iop_F32toF16x8: {
3816 if (env->hwcaps & VEX_HWCAPS_AMD64_F16C) {
3817 HReg srcHi, srcLo;
3818 iselDVecExpr(&srcHi, &srcLo, env, e->Iex.Binop.arg2);
3819 HReg dstHi = newVRegV(env);
3820 HReg dstLo = newVRegV(env);
3821 set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
3822 addInstr(env, AMD64Instr_Sse32Fx4(Asse_F32toF16, srcHi, dstHi));
3823 addInstr(env, AMD64Instr_Sse32Fx4(Asse_F32toF16, srcLo, dstLo));
3824 set_SSE_rounding_default(env);
3825 // Now we have the result in dstHi[63:0] and dstLo[63:0], but we
3826 // need to compact all that into one register. There's probably a
3827 // more elegant way to do this, but ..
3828 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstHi));
3829 // dstHi is now 127:64 = useful data, 63:0 = zero
3830 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstLo));
3831 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, dstLo));
3832 // dstLo is now 127:64 = zero, 63:0 = useful data
3833 addInstr(env, AMD64Instr_SseReRg(Asse_OR, dstHi, dstLo));
3834 return dstLo;
3836 break;
3839 default:
3840 break;
3841 } /* switch (e->Iex.Binop.op) */
3842 } /* if (e->tag == Iex_Binop) */
3844 if (e->tag == Iex_Triop) {
3845 IRTriop *triop = e->Iex.Triop.details;
3846 switch (triop->op) {
3848 case Iop_Add64Fx2: op = Asse_ADDF; goto do_64Fx2_w_rm;
3849 case Iop_Sub64Fx2: op = Asse_SUBF; goto do_64Fx2_w_rm;
3850 case Iop_Mul64Fx2: op = Asse_MULF; goto do_64Fx2_w_rm;
3851 case Iop_Div64Fx2: op = Asse_DIVF; goto do_64Fx2_w_rm;
3852 do_64Fx2_w_rm:
3854 HReg argL = iselVecExpr(env, triop->arg2);
3855 HReg argR = iselVecExpr(env, triop->arg3);
3856 HReg dst = newVRegV(env);
3857 addInstr(env, mk_vMOVsd_RR(argL, dst));
3858 /* XXXROUNDINGFIXME */
3859 /* set roundingmode here */
3860 addInstr(env, AMD64Instr_Sse64Fx2(op, argR, dst));
3861 return dst;
3864 case Iop_Add32Fx4: op = Asse_ADDF; goto do_32Fx4_w_rm;
3865 case Iop_Sub32Fx4: op = Asse_SUBF; goto do_32Fx4_w_rm;
3866 case Iop_Mul32Fx4: op = Asse_MULF; goto do_32Fx4_w_rm;
3867 case Iop_Div32Fx4: op = Asse_DIVF; goto do_32Fx4_w_rm;
3868 do_32Fx4_w_rm:
3870 HReg argL = iselVecExpr(env, triop->arg2);
3871 HReg argR = iselVecExpr(env, triop->arg3);
3872 HReg dst = newVRegV(env);
3873 addInstr(env, mk_vMOVsd_RR(argL, dst));
3874 /* XXXROUNDINGFIXME */
3875 /* set roundingmode here */
3876 addInstr(env, AMD64Instr_Sse32Fx4(op, argR, dst));
3877 return dst;
3880 default:
3881 break;
3882 } /* switch (triop->op) */
3883 } /* if (e->tag == Iex_Triop) */
3885 if (e->tag == Iex_ITE) { // VFD
3886 HReg r1 = iselVecExpr(env, e->Iex.ITE.iftrue);
3887 HReg r0 = iselVecExpr(env, e->Iex.ITE.iffalse);
3888 HReg dst = newVRegV(env);
3889 addInstr(env, mk_vMOVsd_RR(r1,dst));
3890 AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
3891 addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0, dst));
3892 return dst;
3895 //vec_fail:
3896 vex_printf("iselVecExpr (amd64, subarch = %s): can't reduce\n",
3897 LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps));
3898 ppIRExpr(e);
3899 vpanic("iselVecExpr_wrk");
3903 /*---------------------------------------------------------*/
3904 /*--- ISEL: SIMD (V256) expressions, into 2 XMM regs. --*/
3905 /*---------------------------------------------------------*/
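/* A V256 value is carried in two V128 virtual registers: *rHi holds
   lanes 255:128 and *rLo holds lanes 127:0.  This matches the Get/Put
   and Load/Store cases below, which use offset +0 for the low half and
   offset +16 for the high half. */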
3907 static void iselDVecExpr ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
3908 ISelEnv* env, const IRExpr* e )
3910 iselDVecExpr_wrk( rHi, rLo, env, e );
3911 # if 0
3912 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
3913 # endif
3914 vassert(hregClass(*rHi) == HRcVec128);
3915 vassert(hregClass(*rLo) == HRcVec128);
3916 vassert(hregIsVirtual(*rHi));
3917 vassert(hregIsVirtual(*rLo));
3921 /* DO NOT CALL THIS DIRECTLY */
3922 static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
3923 ISelEnv* env, const IRExpr* e )
3925 HWord fn = 0; /* address of helper fn, if required */
3926 vassert(e);
3927 IRType ty = typeOfIRExpr(env->type_env, e);
3928 vassert(ty == Ity_V256);
3929 UInt laneBits = 0;
3931 AMD64SseOp op = Asse_INVALID;
3933 /* read 256-bit IRTemp */
3934 if (e->tag == Iex_RdTmp) {
3935 lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);
3936 return;
3939 if (e->tag == Iex_Get) {
3940 HReg vHi = newVRegV(env);
3941 HReg vLo = newVRegV(env);
3942 HReg rbp = hregAMD64_RBP();
3943 AMD64AMode* am0 = AMD64AMode_IR(e->Iex.Get.offset + 0, rbp);
3944 AMD64AMode* am16 = AMD64AMode_IR(e->Iex.Get.offset + 16, rbp);
3945 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, am0));
3946 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, am16));
3947 *rHi = vHi;
3948 *rLo = vLo;
3949 return;
3952 if (e->tag == Iex_Load) {
3953 HReg vHi = newVRegV(env);
3954 HReg vLo = newVRegV(env);
3955 HReg rA = iselIntExpr_R(env, e->Iex.Load.addr);
3956 AMD64AMode* am0 = AMD64AMode_IR(0, rA);
3957 AMD64AMode* am16 = AMD64AMode_IR(16, rA);
3958 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, am0));
3959 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, am16));
3960 *rHi = vHi;
3961 *rLo = vLo;
3962 return;
3965 if (e->tag == Iex_Const) {
3966 vassert(e->Iex.Const.con->tag == Ico_V256);
3967 switch (e->Iex.Const.con->Ico.V256) {
3968 case 0x00000000: {
3969 HReg vHi = generate_zeroes_V128(env);
3970 HReg vLo = newVRegV(env);
3971 addInstr(env, mk_vMOVsd_RR(vHi, vLo));
3972 *rHi = vHi;
3973 *rLo = vLo;
3974 return;
3976 default:
3977 break; /* give up. Until such time as is necessary. */
3981 if (e->tag == Iex_Unop) {
3982 switch (e->Iex.Unop.op) {
3984 case Iop_NotV256: {
3985 HReg argHi, argLo;
3986 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
3987 *rHi = do_sse_NotV128(env, argHi);
3988 *rLo = do_sse_NotV128(env, argLo);
3989 return;
3992 case Iop_RecipEst32Fx8: op = Asse_RCPF; goto do_32Fx8_unary;
3993 case Iop_Sqrt32Fx8: op = Asse_SQRTF; goto do_32Fx8_unary;
3994 case Iop_RSqrtEst32Fx8: op = Asse_RSQRTF; goto do_32Fx8_unary;
3995 do_32Fx8_unary:
3997 HReg argHi, argLo;
3998 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
3999 HReg dstHi = newVRegV(env);
4000 HReg dstLo = newVRegV(env);
4001 addInstr(env, AMD64Instr_Sse32Fx4(op, argHi, dstHi));
4002 addInstr(env, AMD64Instr_Sse32Fx4(op, argLo, dstLo));
4003 *rHi = dstHi;
4004 *rLo = dstLo;
4005 return;
4008 case Iop_Sqrt64Fx4: op = Asse_SQRTF; goto do_64Fx4_unary;
4009 do_64Fx4_unary:
4011 HReg argHi, argLo;
4012 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
4013 HReg dstHi = newVRegV(env);
4014 HReg dstLo = newVRegV(env);
4015 addInstr(env, AMD64Instr_Sse64Fx2(op, argHi, dstHi));
4016 addInstr(env, AMD64Instr_Sse64Fx2(op, argLo, dstLo));
4017 *rHi = dstHi;
4018 *rLo = dstLo;
4019 return;
4022 case Iop_CmpNEZ64x4: {
4023 /* We can use SSE2 instructions for this. */
4024 /* Same scheme as Iop_CmpNEZ64x2, except twice as wide
4025 (obviously). See comment on Iop_CmpNEZ64x2 for
4026 explanation of what's going on here. */
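         /* Briefly: per 128-bit half, compare each 32-bit lane against an
            all-zeroes register (CMPEQ32), invert, then OR that with a
            copy whose 32-bit lanes are swapped within each 64-bit pair
            (the 0xB1 shuffle).  A 64-bit lane then ends up all-ones iff
            either of its 32-bit halves was nonzero. */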
4027 HReg argHi, argLo;
4028 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
4029 HReg tmpHi = generate_zeroes_V128(env);
4030 HReg tmpLo = newVRegV(env);
4031 addInstr(env, mk_vMOVsd_RR(tmpHi, tmpLo));
4032 HReg dstHi = newVRegV(env);
4033 HReg dstLo = newVRegV(env);
4034 addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, argHi, tmpHi));
4035 addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, argLo, tmpLo));
4036 tmpHi = do_sse_NotV128(env, tmpHi);
4037 tmpLo = do_sse_NotV128(env, tmpLo);
4038 addInstr(env, AMD64Instr_SseShuf(0xB1, tmpHi, dstHi));
4039 addInstr(env, AMD64Instr_SseShuf(0xB1, tmpLo, dstLo));
4040 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmpHi, dstHi));
4041 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmpLo, dstLo));
4042 *rHi = dstHi;
4043 *rLo = dstLo;
4044 return;
4047 case Iop_CmpNEZ32x8: op = Asse_CMPEQ32; goto do_CmpNEZ_vector;
4048 case Iop_CmpNEZ16x16: op = Asse_CMPEQ16; goto do_CmpNEZ_vector;
4049 case Iop_CmpNEZ8x32: op = Asse_CMPEQ8; goto do_CmpNEZ_vector;
4050 do_CmpNEZ_vector:
4052 HReg argHi, argLo;
4053 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
4054 HReg tmpHi = newVRegV(env);
4055 HReg tmpLo = newVRegV(env);
4056 HReg zero = generate_zeroes_V128(env);
4057 HReg dstHi, dstLo;
4058 addInstr(env, mk_vMOVsd_RR(argHi, tmpHi));
4059 addInstr(env, mk_vMOVsd_RR(argLo, tmpLo));
4060 addInstr(env, AMD64Instr_SseReRg(op, zero, tmpHi));
4061 addInstr(env, AMD64Instr_SseReRg(op, zero, tmpLo));
4062 dstHi = do_sse_NotV128(env, tmpHi);
4063 dstLo = do_sse_NotV128(env, tmpLo);
4064 *rHi = dstHi;
4065 *rLo = dstLo;
4066 return;
4069 case Iop_F16toF32x8: {
4070 if (env->hwcaps & VEX_HWCAPS_AMD64_F16C) {
4071 HReg src = iselVecExpr(env, e->Iex.Unop.arg);
4072 HReg srcCopy = newVRegV(env);
4073 HReg dstHi = newVRegV(env);
4074 HReg dstLo = newVRegV(env);
4075 // Copy src, since we'll need to modify it.
4076 addInstr(env, mk_vMOVsd_RR(src, srcCopy));
4077 addInstr(env, AMD64Instr_Sse32Fx4(Asse_F16toF32, srcCopy, dstLo));
4078 addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, srcCopy));
4079 addInstr(env, AMD64Instr_Sse32Fx4(Asse_F16toF32, srcCopy, dstHi));
4080 *rHi = dstHi;
4081 *rLo = dstLo;
4082 return;
4084 break;
4087 default:
4088 break;
4089 } /* switch (e->Iex.Unop.op) */
4090 } /* if (e->tag == Iex_Unop) */
4092 if (e->tag == Iex_Binop) {
4093 switch (e->Iex.Binop.op) {
4095 case Iop_Max64Fx4: op = Asse_MAXF; goto do_64Fx4;
4096 case Iop_Min64Fx4: op = Asse_MINF; goto do_64Fx4;
4097 do_64Fx4:
4099 HReg argLhi, argLlo, argRhi, argRlo;
4100 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4101 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4102 HReg dstHi = newVRegV(env);
4103 HReg dstLo = newVRegV(env);
4104 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4105 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4106 addInstr(env, AMD64Instr_Sse64Fx2(op, argRhi, dstHi));
4107 addInstr(env, AMD64Instr_Sse64Fx2(op, argRlo, dstLo));
4108 *rHi = dstHi;
4109 *rLo = dstLo;
4110 return;
4113 case Iop_Max32Fx8: op = Asse_MAXF; goto do_32Fx8;
4114 case Iop_Min32Fx8: op = Asse_MINF; goto do_32Fx8;
4115 do_32Fx8:
4117 HReg argLhi, argLlo, argRhi, argRlo;
4118 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4119 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4120 HReg dstHi = newVRegV(env);
4121 HReg dstLo = newVRegV(env);
4122 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4123 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4124 addInstr(env, AMD64Instr_Sse32Fx4(op, argRhi, dstHi));
4125 addInstr(env, AMD64Instr_Sse32Fx4(op, argRlo, dstLo));
4126 *rHi = dstHi;
4127 *rLo = dstLo;
4128 return;
4131 case Iop_AndV256: op = Asse_AND; goto do_SseReRg;
4132 case Iop_OrV256: op = Asse_OR; goto do_SseReRg;
4133 case Iop_XorV256: op = Asse_XOR; goto do_SseReRg;
4134 case Iop_Add8x32: op = Asse_ADD8; goto do_SseReRg;
4135 case Iop_Add16x16: op = Asse_ADD16; goto do_SseReRg;
4136 case Iop_Add32x8: op = Asse_ADD32; goto do_SseReRg;
4137 case Iop_Add64x4: op = Asse_ADD64; goto do_SseReRg;
4138 case Iop_QAdd8Sx32: op = Asse_QADD8S; goto do_SseReRg;
4139 case Iop_QAdd16Sx16: op = Asse_QADD16S; goto do_SseReRg;
4140 case Iop_QAdd8Ux32: op = Asse_QADD8U; goto do_SseReRg;
4141 case Iop_QAdd16Ux16: op = Asse_QADD16U; goto do_SseReRg;
4142 case Iop_Avg8Ux32: op = Asse_AVG8U; goto do_SseReRg;
4143 case Iop_Avg16Ux16: op = Asse_AVG16U; goto do_SseReRg;
4144 case Iop_CmpEQ8x32: op = Asse_CMPEQ8; goto do_SseReRg;
4145 case Iop_CmpEQ16x16: op = Asse_CMPEQ16; goto do_SseReRg;
4146 case Iop_CmpEQ32x8: op = Asse_CMPEQ32; goto do_SseReRg;
4147 case Iop_CmpGT8Sx32: op = Asse_CMPGT8S; goto do_SseReRg;
4148 case Iop_CmpGT16Sx16: op = Asse_CMPGT16S; goto do_SseReRg;
4149 case Iop_CmpGT32Sx8: op = Asse_CMPGT32S; goto do_SseReRg;
4150 case Iop_Max16Sx16: op = Asse_MAX16S; goto do_SseReRg;
4151 case Iop_Max8Ux32: op = Asse_MAX8U; goto do_SseReRg;
4152 case Iop_Min16Sx16: op = Asse_MIN16S; goto do_SseReRg;
4153 case Iop_Min8Ux32: op = Asse_MIN8U; goto do_SseReRg;
4154 case Iop_MulHi16Ux16: op = Asse_MULHI16U; goto do_SseReRg;
4155 case Iop_MulHi16Sx16: op = Asse_MULHI16S; goto do_SseReRg;
4156 case Iop_Mul16x16: op = Asse_MUL16; goto do_SseReRg;
4157 case Iop_Sub8x32: op = Asse_SUB8; goto do_SseReRg;
4158 case Iop_Sub16x16: op = Asse_SUB16; goto do_SseReRg;
4159 case Iop_Sub32x8: op = Asse_SUB32; goto do_SseReRg;
4160 case Iop_Sub64x4: op = Asse_SUB64; goto do_SseReRg;
4161 case Iop_QSub8Sx32: op = Asse_QSUB8S; goto do_SseReRg;
4162 case Iop_QSub16Sx16: op = Asse_QSUB16S; goto do_SseReRg;
4163 case Iop_QSub8Ux32: op = Asse_QSUB8U; goto do_SseReRg;
4164 case Iop_QSub16Ux16: op = Asse_QSUB16U; goto do_SseReRg;
4165 do_SseReRg:
4167 HReg argLhi, argLlo, argRhi, argRlo;
4168 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4169 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4170 HReg dstHi = newVRegV(env);
4171 HReg dstLo = newVRegV(env);
4172 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4173 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4174 addInstr(env, AMD64Instr_SseReRg(op, argRhi, dstHi));
4175 addInstr(env, AMD64Instr_SseReRg(op, argRlo, dstLo));
4176 *rHi = dstHi;
4177 *rLo = dstLo;
4178 return;
4181 case Iop_ShlN16x16: laneBits = 16; op = Asse_SHL16; goto do_SseShift;
4182 case Iop_ShlN32x8: laneBits = 32; op = Asse_SHL32; goto do_SseShift;
4183 case Iop_ShlN64x4: laneBits = 64; op = Asse_SHL64; goto do_SseShift;
4184 case Iop_SarN16x16: laneBits = 16; op = Asse_SAR16; goto do_SseShift;
4185 case Iop_SarN32x8: laneBits = 32; op = Asse_SAR32; goto do_SseShift;
4186 case Iop_ShrN16x16: laneBits = 16; op = Asse_SHR16; goto do_SseShift;
4187 case Iop_ShrN32x8: laneBits = 32; op = Asse_SHR32; goto do_SseShift;
4188 case Iop_ShrN64x4: laneBits = 64; op = Asse_SHR64; goto do_SseShift;
4189 do_SseShift: {
4190 HReg dstHi = newVRegV(env);
4191 HReg dstLo = newVRegV(env);
4192 HReg gregHi, gregLo;
4193 iselDVecExpr(&gregHi, &gregLo, env, e->Iex.Binop.arg1);
4194          /* If it's a shift by an in-range immediate, generate just two
4195             instructions, one per 128-bit half. */
4196 if (e->Iex.Binop.arg2->tag == Iex_Const) {
4197 IRConst* c = e->Iex.Binop.arg2->Iex.Const.con;
4198 vassert(c->tag == Ico_U8);
4199 UInt shift = c->Ico.U8;
4200 if (shift < laneBits) {
4201 addInstr(env, mk_vMOVsd_RR(gregHi, dstHi));
4202 addInstr(env, AMD64Instr_SseShiftN(op, shift, dstHi));
4203 addInstr(env, mk_vMOVsd_RR(gregLo, dstLo));
4204 addInstr(env, AMD64Instr_SseShiftN(op, shift, dstLo));
4205 *rHi = dstHi;
4206 *rLo = dstLo;
4207 return;
4210 /* Otherwise we have to do it the longwinded way. */
4211 AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
4212 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
4213 HReg ereg = newVRegV(env);
4214 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
4215 addInstr(env, AMD64Instr_Push(rmi));
4216 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
4217 addInstr(env, mk_vMOVsd_RR(gregHi, dstHi));
4218 addInstr(env, AMD64Instr_SseReRg(op, ereg, dstHi));
4219 addInstr(env, mk_vMOVsd_RR(gregLo, dstLo));
4220 addInstr(env, AMD64Instr_SseReRg(op, ereg, dstLo));
4221 add_to_rsp(env, 16);
4222 *rHi = dstHi;
4223 *rLo = dstLo;
4224 return;
4227 case Iop_V128HLtoV256: {
4228 // Curiously, there doesn't seem to be any benefit to be had here by
4229 // checking whether arg1 and arg2 are the same, in the style of how
4230 // (eg) 64HLtoV128 is handled elsewhere in this file.
4231 *rHi = iselVecExpr(env, e->Iex.Binop.arg1);
4232 *rLo = iselVecExpr(env, e->Iex.Binop.arg2);
4233 return;
4236 case Iop_Mul32x8: fn = (HWord)h_generic_calc_Mul32x4;
4237 goto do_SseAssistedBinary;
4238 case Iop_Max32Sx8: fn = (HWord)h_generic_calc_Max32Sx4;
4239 goto do_SseAssistedBinary;
4240 case Iop_Min32Sx8: fn = (HWord)h_generic_calc_Min32Sx4;
4241 goto do_SseAssistedBinary;
4242 case Iop_Max32Ux8: fn = (HWord)h_generic_calc_Max32Ux4;
4243 goto do_SseAssistedBinary;
4244 case Iop_Min32Ux8: fn = (HWord)h_generic_calc_Min32Ux4;
4245 goto do_SseAssistedBinary;
4246 case Iop_Max16Ux16: fn = (HWord)h_generic_calc_Max16Ux8;
4247 goto do_SseAssistedBinary;
4248 case Iop_Min16Ux16: fn = (HWord)h_generic_calc_Min16Ux8;
4249 goto do_SseAssistedBinary;
4250 case Iop_Max8Sx32: fn = (HWord)h_generic_calc_Max8Sx16;
4251 goto do_SseAssistedBinary;
4252 case Iop_Min8Sx32: fn = (HWord)h_generic_calc_Min8Sx16;
4253 goto do_SseAssistedBinary;
4254 case Iop_CmpEQ64x4: fn = (HWord)h_generic_calc_CmpEQ64x2;
4255 goto do_SseAssistedBinary;
4256 case Iop_CmpGT64Sx4: fn = (HWord)h_generic_calc_CmpGT64Sx2;
4257 goto do_SseAssistedBinary;
4258 do_SseAssistedBinary: {
4259 /* RRRufff! RRRufff code is what we're generating here. Oh
4260 well. */
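         /* Scheme: the available 128-bit helper is simply called twice,
            once per half.  Within the aligned area, 0(%r_argp) receives
            the high-half result, with its args at 16 and 32(%r_argp);
            48(%r_argp) receives the low-half result, with its args at 64
            and 80(%r_argp). */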
4261 vassert(fn != 0);
4262 HReg dstHi = newVRegV(env);
4263 HReg dstLo = newVRegV(env);
4264 HReg argLhi, argLlo, argRhi, argRlo;
4265 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4266 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4267 HReg argp = newVRegI(env);
4268          /* subq $160, %rsp -- make a space */
4269 sub_from_rsp(env, 160);
4270 /* leaq 48(%rsp), %r_argp -- point into it */
4271 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
4272 argp));
4273 /* andq $-16, %r_argp -- 16-align the pointer */
4274 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
4275 AMD64RMI_Imm( ~(UInt)15 ),
4276 argp));
4277 /* Prepare 3 arg regs:
4278 leaq 0(%r_argp), %rdi
4279 leaq 16(%r_argp), %rsi
4280               leaq 32(%r_argp), %rdx
4281          */
4282 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
4283 hregAMD64_RDI()));
4284 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
4285 hregAMD64_RSI()));
4286 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
4287 hregAMD64_RDX()));
4288 /* Store the two high args, at (%rsi) and (%rdx):
4289 movupd %argLhi, 0(%rsi)
4290               movupd %argRhi, 0(%rdx)
4291          */
4292 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLhi,
4293 AMD64AMode_IR(0, hregAMD64_RSI())));
4294 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRhi,
4295 AMD64AMode_IR(0, hregAMD64_RDX())));
4296 /* Store the two low args, at 48(%rsi) and 48(%rdx):
4297 movupd %argLlo, 48(%rsi)
4298               movupd %argRlo, 48(%rdx)
4299          */
4300 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLlo,
4301 AMD64AMode_IR(48, hregAMD64_RSI())));
4302 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRlo,
4303 AMD64AMode_IR(48, hregAMD64_RDX())));
4304 /* call the helper */
4305 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
4306 mk_RetLoc_simple(RLPri_None) ));
4307 /* Prepare 3 arg regs:
4308 leaq 48(%r_argp), %rdi
4309 leaq 64(%r_argp), %rsi
4310               leaq 80(%r_argp), %rdx
4311          */
4312 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, argp),
4313 hregAMD64_RDI()));
4314 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(64, argp),
4315 hregAMD64_RSI()));
4316 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(80, argp),
4317 hregAMD64_RDX()));
4318 /* call the helper */
4319 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
4320 mk_RetLoc_simple(RLPri_None) ));
4321 /* fetch the result from memory, using %r_argp, which the
4322 register allocator will keep alive across the call. */
4323 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstHi,
4324 AMD64AMode_IR(0, argp)));
4325 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstLo,
4326 AMD64AMode_IR(48, argp)));
4327 /* and finally, clear the space */
4328 add_to_rsp(env, 160);
4329 *rHi = dstHi;
4330 *rLo = dstLo;
4331 return;
4334 case Iop_Perm32x8: fn = (HWord)h_generic_calc_Perm32x8;
4335 goto do_SseAssistedBinary256;
4336 do_SseAssistedBinary256: {
4337 /* RRRufff! RRRufff code is what we're generating here. Oh
4338 well. */
4339 vassert(fn != 0);
4340 HReg dstHi = newVRegV(env);
4341 HReg dstLo = newVRegV(env);
4342 HReg argLhi, argLlo, argRhi, argRlo;
4343 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4344 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4345 HReg argp = newVRegI(env);
4346          /* subq $160, %rsp -- make a space */
4347 sub_from_rsp(env, 160);
4348 /* leaq 48(%rsp), %r_argp -- point into it */
4349 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
4350 argp));
4351 /* andq $-16, %r_argp -- 16-align the pointer */
4352 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
4353 AMD64RMI_Imm( ~(UInt)15 ),
4354 argp));
4355 /* Prepare 3 arg regs:
4356 leaq 0(%r_argp), %rdi
4357 leaq 32(%r_argp), %rsi
4358               leaq 64(%r_argp), %rdx
4359          */
4360 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
4361 hregAMD64_RDI()));
4362 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
4363 hregAMD64_RSI()));
4364 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(64, argp),
4365 hregAMD64_RDX()));
4366 /* Store the two args, at (%rsi) and (%rdx):
4367 movupd %argLlo, 0(%rsi)
4368 movupd %argLhi, 16(%rsi)
4369 movupd %argRlo, 0(%rdx)
4370               movupd %argRhi, 16(%rdx)
4371          */
4372 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLlo,
4373 AMD64AMode_IR(0, hregAMD64_RSI())));
4374 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLhi,
4375 AMD64AMode_IR(16, hregAMD64_RSI())));
4376 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRlo,
4377 AMD64AMode_IR(0, hregAMD64_RDX())));
4378 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRhi,
4379 AMD64AMode_IR(16, hregAMD64_RDX())));
4380 /* call the helper */
4381 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
4382 mk_RetLoc_simple(RLPri_None) ));
4383 /* fetch the result from memory, using %r_argp, which the
4384 register allocator will keep alive across the call. */
4385 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstLo,
4386 AMD64AMode_IR(0, argp)));
4387 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstHi,
4388 AMD64AMode_IR(16, argp)));
4389 /* and finally, clear the space */
4390 add_to_rsp(env, 160);
4391 *rHi = dstHi;
4392 *rLo = dstLo;
4393 return;
4396 case Iop_I32StoF32x8:
4397 case Iop_F32toI32Sx8: {
4398 HReg argHi, argLo;
4399 iselDVecExpr(&argHi, &argLo, env, e->Iex.Binop.arg2);
4400 HReg dstHi = newVRegV(env);
4401 HReg dstLo = newVRegV(env);
4402 AMD64SseOp mop
4403 = e->Iex.Binop.op == Iop_I32StoF32x8 ? Asse_I2F : Asse_F2I;
4404 set_SSE_rounding_mode(env, e->Iex.Binop.arg1);
4405 addInstr(env, AMD64Instr_Sse32Fx4(mop, argHi, dstHi));
4406 addInstr(env, AMD64Instr_Sse32Fx4(mop, argLo, dstLo));
4407 set_SSE_rounding_default(env);
4408 *rHi = dstHi;
4409 *rLo = dstLo;
4410 return;
4413 default:
4414 break;
4415 } /* switch (e->Iex.Binop.op) */
4416 } /* if (e->tag == Iex_Binop) */
4418 if (e->tag == Iex_Triop) {
4419 IRTriop *triop = e->Iex.Triop.details;
4420 switch (triop->op) {
4422 case Iop_Add64Fx4: op = Asse_ADDF; goto do_64Fx4_w_rm;
4423 case Iop_Sub64Fx4: op = Asse_SUBF; goto do_64Fx4_w_rm;
4424 case Iop_Mul64Fx4: op = Asse_MULF; goto do_64Fx4_w_rm;
4425 case Iop_Div64Fx4: op = Asse_DIVF; goto do_64Fx4_w_rm;
4426 do_64Fx4_w_rm:
4428 HReg argLhi, argLlo, argRhi, argRlo;
4429 iselDVecExpr(&argLhi, &argLlo, env, triop->arg2);
4430 iselDVecExpr(&argRhi, &argRlo, env, triop->arg3);
4431 HReg dstHi = newVRegV(env);
4432 HReg dstLo = newVRegV(env);
4433 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4434 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4435 /* XXXROUNDINGFIXME */
4436 /* set roundingmode here */
4437 addInstr(env, AMD64Instr_Sse64Fx2(op, argRhi, dstHi));
4438 addInstr(env, AMD64Instr_Sse64Fx2(op, argRlo, dstLo));
4439 *rHi = dstHi;
4440 *rLo = dstLo;
4441 return;
4444 case Iop_Add32Fx8: op = Asse_ADDF; goto do_32Fx8_w_rm;
4445 case Iop_Sub32Fx8: op = Asse_SUBF; goto do_32Fx8_w_rm;
4446 case Iop_Mul32Fx8: op = Asse_MULF; goto do_32Fx8_w_rm;
4447 case Iop_Div32Fx8: op = Asse_DIVF; goto do_32Fx8_w_rm;
4448 do_32Fx8_w_rm:
4450 HReg argLhi, argLlo, argRhi, argRlo;
4451 iselDVecExpr(&argLhi, &argLlo, env, triop->arg2);
4452 iselDVecExpr(&argRhi, &argRlo, env, triop->arg3);
4453 HReg dstHi = newVRegV(env);
4454 HReg dstLo = newVRegV(env);
4455 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4456 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4457 /* XXXROUNDINGFIXME */
4458 /* set roundingmode here */
4459 addInstr(env, AMD64Instr_Sse32Fx4(op, argRhi, dstHi));
4460 addInstr(env, AMD64Instr_Sse32Fx4(op, argRlo, dstLo));
4461 *rHi = dstHi;
4462 *rLo = dstLo;
4463 return;
4466 default:
4467 break;
4468 } /* switch (triop->op) */
4469 } /* if (e->tag == Iex_Triop) */
4472 if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_64x4toV256) {
4473 const IRExpr* arg1 = e->Iex.Qop.details->arg1;
4474 const IRExpr* arg2 = e->Iex.Qop.details->arg2;
4475 const IRExpr* arg3 = e->Iex.Qop.details->arg3;
4476 const IRExpr* arg4 = e->Iex.Qop.details->arg4;
4477       // If the args are trivially the same (tmp or const), use the same
4478       // source register for all four, and emit only one movq, since
4479       // movq is (relatively) expensive.
4480 if (areAtomsAndEqual(arg1, arg2)
4481 && areAtomsAndEqual(arg1, arg3) && areAtomsAndEqual(arg1, arg4)) {
4482 HReg q3 = iselIntExpr_R(env, e->Iex.Qop.details->arg1);
4483 HReg tmp = newVRegV(env);
4484 HReg dst = newVRegV(env);
4485 addInstr(env, AMD64Instr_SseMOVQ(q3, dst, True/*toXMM*/));
4486 addInstr(env, mk_vMOVsd_RR(dst, tmp));
4487 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst));
4488 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
4489 *rHi = dst;
4490 *rLo = dst;
4491 } else {
4492 /* arg1 is the most significant (Q3), arg4 the least (Q0) */
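         /* The code below builds dstHi = q3:q2 and dstLo = q1:q0, using
            for each half the same movq / shift-left-by-64 / movq / OR
            pattern as the Iop_64HLtoV128 case earlier in this file. */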
4493 HReg q3 = iselIntExpr_R(env, arg1);
4494 HReg q2 = iselIntExpr_R(env, arg2);
4495 HReg q1 = iselIntExpr_R(env, arg3);
4496 HReg q0 = iselIntExpr_R(env, arg4);
4497 HReg tmp = newVRegV(env);
4498 HReg dstHi = newVRegV(env);
4499 HReg dstLo = newVRegV(env);
4500 addInstr(env, AMD64Instr_SseMOVQ(q3, dstHi, True/*toXMM*/));
4501 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstHi));
4502 addInstr(env, AMD64Instr_SseMOVQ(q2, tmp, True/*toXMM*/));
4503 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dstHi));
4504 addInstr(env, AMD64Instr_SseMOVQ(q1, dstLo, True/*toXMM*/));
4505 addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstLo));
4506 addInstr(env, AMD64Instr_SseMOVQ(q0, tmp, True/*toXMM*/));
4507 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dstLo));
4508 *rHi = dstHi;
4509 *rLo = dstLo;
4511 return;
4514 if (e->tag == Iex_ITE) {
4515 HReg r1Hi, r1Lo, r0Hi, r0Lo;
4516 iselDVecExpr(&r1Hi, &r1Lo, env, e->Iex.ITE.iftrue);
4517 iselDVecExpr(&r0Hi, &r0Lo, env, e->Iex.ITE.iffalse);
4518 HReg dstHi = newVRegV(env);
4519 HReg dstLo = newVRegV(env);
4520 addInstr(env, mk_vMOVsd_RR(r1Hi,dstHi));
4521 addInstr(env, mk_vMOVsd_RR(r1Lo,dstLo));
4522 AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
4523 addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0Hi, dstHi));
4524 addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0Lo, dstLo));
4525 *rHi = dstHi;
4526 *rLo = dstLo;
4527 return;
4530 //avx_fail:
4531 vex_printf("iselDVecExpr (amd64, subarch = %s): can't reduce\n",
4532 LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps));
4533 ppIRExpr(e);
4534 vpanic("iselDVecExpr_wrk");
4538 /*---------------------------------------------------------*/
4539 /*--- ISEL: Statements ---*/
4540 /*---------------------------------------------------------*/
4542 static void iselStmt ( ISelEnv* env, IRStmt* stmt )
4544 if (vex_traceflags & VEX_TRACE_VCODE) {
4545 vex_printf("\n-- ");
4546 ppIRStmt(stmt);
4547 vex_printf("\n");
4550 switch (stmt->tag) {
4552 /* --------- LOADG (guarded load) --------- */
4553 case Ist_LoadG: {
4554 IRLoadG* lg = stmt->Ist.LoadG.details;
4555 if (lg->end != Iend_LE)
4556 goto stmt_fail;
4558 UChar szB = 0; /* invalid */
4559 switch (lg->cvt) {
4560 case ILGop_Ident32: szB = 4; break;
4561 case ILGop_Ident64: szB = 8; break;
4562 case ILGop_IdentV128: szB = 16; break;
4563 default: break;
4565 if (szB == 0)
4566 goto stmt_fail;
4568 AMD64AMode* amAddr
4569 = iselIntExpr_AMode(env, lg->addr);
4570 HReg rAlt
4571 = szB == 16 ? iselVecExpr(env, lg->alt)
4572 : iselIntExpr_R(env, lg->alt);
4573 HReg rDst
4574 = lookupIRTemp(env, lg->dst);
4576 /* Get the alt value into the dst. We'll do a conditional load
4577 which overwrites it -- or not -- with loaded data. */
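         /* Net effect: rDst = guard ? *amAddr : alt.  The conditional
            load instructions used below (CLoad / SseCLoad) only touch
            memory when the condition holds, which is what makes this a
            faithful rendering of a guarded load. */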
4578 if (szB == 16) {
4579 addInstr(env, mk_vMOVsd_RR(rAlt, rDst));
4580 } else {
4581 addInstr(env, mk_iMOVsd_RR(rAlt, rDst));
4583 AMD64CondCode cc = iselCondCode(env, lg->guard);
4584 if (szB == 16) {
4585 addInstr(env, AMD64Instr_SseCLoad(cc, amAddr, rDst));
4586 } else {
4587 addInstr(env, AMD64Instr_CLoad(cc, szB, amAddr, rDst));
4589 return;
4592 /* --------- STOREG (guarded store) --------- */
4593 case Ist_StoreG: {
4594 IRStoreG* sg = stmt->Ist.StoreG.details;
4595 if (sg->end != Iend_LE)
4596 goto stmt_fail;
4598 UChar szB = 0; /* invalid */
4599 switch (typeOfIRExpr(env->type_env, sg->data)) {
4600 case Ity_I32: szB = 4; break;
4601 case Ity_I64: szB = 8; break;
4602 case Ity_V128: szB = 16; break;
4603 default: break;
4605 if (szB == 0)
4606 goto stmt_fail;
4608 AMD64AMode* amAddr
4609 = iselIntExpr_AMode(env, sg->addr);
4610 HReg rSrc
4611 = szB == 16 ? iselVecExpr(env, sg->data)
4612 : iselIntExpr_R(env, sg->data);
4613 AMD64CondCode cc
4614 = iselCondCode(env, sg->guard);
4615 if (szB == 16) {
4616 addInstr(env, AMD64Instr_SseCStore(cc, rSrc, amAddr));
4617 } else {
4618 addInstr(env, AMD64Instr_CStore(cc, szB, rSrc, amAddr));
4620 return;
4623 /* --------- STORE --------- */
4624 case Ist_Store: {
4625 IRType tya = typeOfIRExpr(env->type_env, stmt->Ist.Store.addr);
4626 IRType tyd = typeOfIRExpr(env->type_env, stmt->Ist.Store.data);
4627 IREndness end = stmt->Ist.Store.end;
4629 if (tya != Ity_I64 || end != Iend_LE)
4630 goto stmt_fail;
4632 if (tyd == Ity_I64) {
4633 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4634 AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Store.data);
4635 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,ri,am));
4636 return;
4638 if (tyd == Ity_I8 || tyd == Ity_I16 || tyd == Ity_I32) {
4639 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4640 HReg r = iselIntExpr_R(env, stmt->Ist.Store.data);
4641 addInstr(env, AMD64Instr_Store(
4642 toUChar(tyd==Ity_I8 ? 1 : (tyd==Ity_I16 ? 2 : 4)),
4643 r,am));
4644 return;
4646 if (tyd == Ity_F64) {
4647 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4648 HReg r = iselDblExpr(env, stmt->Ist.Store.data);
4649 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, r, am));
4650 return;
4652 if (tyd == Ity_F32) {
4653 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4654 HReg r = iselFltExpr(env, stmt->Ist.Store.data);
4655 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, r, am));
4656 return;
4658 if (tyd == Ity_V128) {
4659 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4660 HReg r = iselVecExpr(env, stmt->Ist.Store.data);
4661 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, r, am));
4662 return;
4664 if (tyd == Ity_V256) {
4665 HReg rA = iselIntExpr_R(env, stmt->Ist.Store.addr);
4666 AMD64AMode* am0 = AMD64AMode_IR(0, rA);
4667 AMD64AMode* am16 = AMD64AMode_IR(16, rA);
4668 HReg vHi, vLo;
4669 iselDVecExpr(&vHi, &vLo, env, stmt->Ist.Store.data);
4670 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vLo, am0));
4671 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vHi, am16));
4672 return;
4674 break;
4677 /* --------- PUT --------- */
4678 case Ist_Put: {
4679 IRType ty = typeOfIRExpr(env->type_env, stmt->Ist.Put.data);
4680 if (ty == Ity_I64) {
4681 /* We're going to write to memory, so compute the RHS into an
4682 AMD64RI. */
4683 AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Put.data);
4684 addInstr(env,
4685 AMD64Instr_Alu64M(
4686                          Aalu_MOV,
4687                          ri,
4688                          AMD64AMode_IR(stmt->Ist.Put.offset,
4689                                        hregAMD64_RBP())
4690                      ));
4691          return;
4693 if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
4694 HReg r = iselIntExpr_R(env, stmt->Ist.Put.data);
4695 addInstr(env, AMD64Instr_Store(
4696                              toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
4697                              r,
4698                              AMD64AMode_IR(stmt->Ist.Put.offset,
4699 hregAMD64_RBP())));
4700 return;
4702 if (ty == Ity_F32) {
4703 HReg f32 = iselFltExpr(env, stmt->Ist.Put.data);
4704 AMD64AMode* am = AMD64AMode_IR(stmt->Ist.Put.offset, hregAMD64_RBP());
4705 set_SSE_rounding_default(env); /* paranoia */
4706 addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 4, f32, am ));
4707 return;
4709 if (ty == Ity_F64) {
4710 HReg f64 = iselDblExpr(env, stmt->Ist.Put.data);
4711 AMD64AMode* am = AMD64AMode_IR( stmt->Ist.Put.offset,
4712 hregAMD64_RBP() );
4713 addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, f64, am ));
4714 return;
4716 if (ty == Ity_V128) {
4717 HReg vec = iselVecExpr(env, stmt->Ist.Put.data);
4718 AMD64AMode* am = AMD64AMode_IR(stmt->Ist.Put.offset,
4719 hregAMD64_RBP());
4720 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, am));
4721 return;
4723 if (ty == Ity_V256) {
4724 HReg vHi, vLo;
4725 iselDVecExpr(&vHi, &vLo, env, stmt->Ist.Put.data);
4726 HReg rbp = hregAMD64_RBP();
4727 AMD64AMode* am0 = AMD64AMode_IR(stmt->Ist.Put.offset + 0, rbp);
4728 AMD64AMode* am16 = AMD64AMode_IR(stmt->Ist.Put.offset + 16, rbp);
4729 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vLo, am0));
4730 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vHi, am16));
4731 return;
4733 break;
4736 /* --------- Indexed PUT --------- */
4737 case Ist_PutI: {
4738 IRPutI *puti = stmt->Ist.PutI.details;
4740 AMD64AMode* am
4741 = genGuestArrayOffset(
4742 env, puti->descr,
4743 puti->ix, puti->bias );
4745 IRType ty = typeOfIRExpr(env->type_env, puti->data);
4746 if (ty == Ity_F64) {
4747 HReg val = iselDblExpr(env, puti->data);
4748 addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, val, am ));
4749 return;
4751 if (ty == Ity_I8) {
4752 HReg r = iselIntExpr_R(env, puti->data);
4753 addInstr(env, AMD64Instr_Store( 1, r, am ));
4754 return;
4756 if (ty == Ity_I64) {
4757 AMD64RI* ri = iselIntExpr_RI(env, puti->data);
4758 addInstr(env, AMD64Instr_Alu64M( Aalu_MOV, ri, am ));
4759 return;
4761 break;
4764 /* --------- TMP --------- */
4765 case Ist_WrTmp: {
4766 IRTemp tmp = stmt->Ist.WrTmp.tmp;
4767 IRType ty = typeOfIRTemp(env->type_env, tmp);
4769 /* optimisation: if stmt->Ist.WrTmp.data is Add64(..,..),
4770 compute it into an AMode and then use LEA. This usually
4771 produces fewer instructions, often because (for memcheck
4772 created IR) we get t = address-expression, (t is later used
4773 twice) and so doing this naturally turns address-expression
4774 back into an AMD64 amode. */
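         /* For example, t = Add64(Add64(base, Shl64(ix, 3)), 16) can be
            matched as the amode 16(base,ix,8) and then emitted as a
            single leaq. */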
4775 if (ty == Ity_I64
4776 && stmt->Ist.WrTmp.data->tag == Iex_Binop
4777 && stmt->Ist.WrTmp.data->Iex.Binop.op == Iop_Add64) {
4778 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.WrTmp.data);
4779 HReg dst = lookupIRTemp(env, tmp);
4780 if (am->tag == Aam_IR && am->Aam.IR.imm == 0) {
4781 /* Hmm, iselIntExpr_AMode wimped out and just computed the
4782 value into a register. Just emit a normal reg-reg move
4783 so reg-alloc can coalesce it away in the usual way. */
4784 HReg src = am->Aam.IR.reg;
4785 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst));
4786 } else {
4787 addInstr(env, AMD64Instr_Lea64(am,dst));
4789 return;
4792 if (ty == Ity_I64 || ty == Ity_I32
4793 || ty == Ity_I16 || ty == Ity_I8) {
4794 AMD64RMI* rmi = iselIntExpr_RMI(env, stmt->Ist.WrTmp.data);
4795 HReg dst = lookupIRTemp(env, tmp);
4796 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,rmi,dst));
4797 return;
4799 if (ty == Ity_I128) {
4800 HReg rHi, rLo, dstHi, dstLo;
4801 iselInt128Expr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
4802 lookupIRTempPair( &dstHi, &dstLo, env, tmp);
4803 addInstr(env, mk_iMOVsd_RR(rHi,dstHi) );
4804 addInstr(env, mk_iMOVsd_RR(rLo,dstLo) );
4805 return;
4807 if (ty == Ity_I1) {
4808 AMD64CondCode cond = iselCondCode(env, stmt->Ist.WrTmp.data);
4809 HReg dst = lookupIRTemp(env, tmp);
4810 addInstr(env, AMD64Instr_Set64(cond, dst));
4811 return;
4813 if (ty == Ity_F64) {
4814 HReg dst = lookupIRTemp(env, tmp);
4815 HReg src = iselDblExpr(env, stmt->Ist.WrTmp.data);
4816 addInstr(env, mk_vMOVsd_RR(src, dst));
4817 return;
4819 if (ty == Ity_F32) {
4820 HReg dst = lookupIRTemp(env, tmp);
4821 HReg src = iselFltExpr(env, stmt->Ist.WrTmp.data);
4822 addInstr(env, mk_vMOVsd_RR(src, dst));
4823 return;
4825 if (ty == Ity_V128) {
4826 HReg dst = lookupIRTemp(env, tmp);
4827 HReg src = iselVecExpr(env, stmt->Ist.WrTmp.data);
4828 addInstr(env, mk_vMOVsd_RR(src, dst));
4829 return;
4831 if (ty == Ity_V256) {
4832 HReg rHi, rLo, dstHi, dstLo;
4833 iselDVecExpr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
4834 lookupIRTempPair( &dstHi, &dstLo, env, tmp);
4835 addInstr(env, mk_vMOVsd_RR(rHi,dstHi) );
4836 addInstr(env, mk_vMOVsd_RR(rLo,dstLo) );
4837 return;
4839 break;
4842 /* --------- Call to DIRTY helper --------- */
4843 case Ist_Dirty: {
4844 IRDirty* d = stmt->Ist.Dirty.details;
4846 /* Figure out the return type, if any. */
4847 IRType retty = Ity_INVALID;
4848 if (d->tmp != IRTemp_INVALID)
4849 retty = typeOfIRTemp(env->type_env, d->tmp);
4851 /* Throw out any return types we don't know about. */
4852 Bool retty_ok = False;
4853 switch (retty) {
4854 case Ity_INVALID: /* function doesn't return anything */
4855 case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
4856 case Ity_V128: case Ity_V256:
4857 retty_ok = True; break;
4858 default:
4859 break;
4861 if (!retty_ok)
4862 break; /* will go to stmt_fail: */
4864 /* Marshal args, do the call, and set the return value to
4865 0x555..555 if this is a conditional call that returns a value
4866 and the call is skipped. */
4867 UInt addToSp = 0;
4868 RetLoc rloc = mk_RetLoc_INVALID();
4869 doHelperCall( &addToSp, &rloc, env, d->guard, d->cee, retty, d->args );
4870 vassert(is_sane_RetLoc(rloc));
4872 /* Now figure out what to do with the returned value, if any. */
4873 switch (retty) {
4874 case Ity_INVALID: {
4875 /* No return value. Nothing to do. */
4876 vassert(d->tmp == IRTemp_INVALID);
4877 vassert(rloc.pri == RLPri_None);
4878 vassert(addToSp == 0);
4879 return;
4881 case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: {
4882 /* The returned value is in %rax. Park it in the register
4883 associated with tmp. */
4884 vassert(rloc.pri == RLPri_Int);
4885 vassert(addToSp == 0);
4886 HReg dst = lookupIRTemp(env, d->tmp);
4887 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(),dst) );
4888 return;
4890 case Ity_V128: {
4891 /* The returned value is on the stack, and rloc.spOff
4892 tells us where. Fish it off the stack and then move
4893 the stack pointer upwards to clear it, as directed by
4894 doHelperCall. */
4895 vassert(rloc.pri == RLPri_V128SpRel);
4896 vassert(addToSp >= 16);
4897 HReg dst = lookupIRTemp(env, d->tmp);
4898 AMD64AMode* am = AMD64AMode_IR(rloc.spOff, hregAMD64_RSP());
4899 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, am ));
4900 add_to_rsp(env, addToSp);
4901 return;
4903 case Ity_V256: {
4904 /* See comments for Ity_V128. */
4905 vassert(rloc.pri == RLPri_V256SpRel);
4906 vassert(addToSp >= 32);
4907 HReg dstLo, dstHi;
4908 lookupIRTempPair(&dstHi, &dstLo, env, d->tmp);
4909 AMD64AMode* amLo = AMD64AMode_IR(rloc.spOff, hregAMD64_RSP());
4910 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dstLo, amLo ));
4911 AMD64AMode* amHi = AMD64AMode_IR(rloc.spOff+16, hregAMD64_RSP());
4912 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dstHi, amHi ));
4913 add_to_rsp(env, addToSp);
4914 return;
4916 default:
4917 /*NOTREACHED*/
4918 vassert(0);
4920 break;
4923 /* --------- MEM FENCE --------- */
4924 case Ist_MBE:
4925 switch (stmt->Ist.MBE.event) {
4926 case Imbe_Fence:
4927 addInstr(env, AMD64Instr_MFence());
4928 return;
4929 default:
4930 break;
4932 break;
4934 /* --------- ACAS --------- */
4935 case Ist_CAS:
4936 if (stmt->Ist.CAS.details->oldHi == IRTemp_INVALID) {
4937 /* "normal" singleton CAS */
4938 UChar sz;
4939 IRCAS* cas = stmt->Ist.CAS.details;
4940 IRType ty = typeOfIRExpr(env->type_env, cas->dataLo);
4941 /* get: cas->expd into %rax, and cas->data into %rbx */
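            /* Plan: rOld starts out holding the expected value.  After
               the locked cmpxchg, %rax holds the value observed in memory
               exactly when the CAS failed (Z clear), so the CMov-on-NZ
               below copies %rax into rOld only in that case; either way
               rOld ends up with the old memory value. */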
4942 AMD64AMode* am = iselIntExpr_AMode(env, cas->addr);
4943 HReg rData = iselIntExpr_R(env, cas->dataLo);
4944 HReg rExpd = iselIntExpr_R(env, cas->expdLo);
4945 HReg rOld = lookupIRTemp(env, cas->oldLo);
4946 vassert(cas->expdHi == NULL);
4947 vassert(cas->dataHi == NULL);
4948 addInstr(env, mk_iMOVsd_RR(rExpd, rOld));
4949 addInstr(env, mk_iMOVsd_RR(rExpd, hregAMD64_RAX()));
4950 addInstr(env, mk_iMOVsd_RR(rData, hregAMD64_RBX()));
4951 switch (ty) {
4952 case Ity_I64: sz = 8; break;
4953 case Ity_I32: sz = 4; break;
4954 case Ity_I16: sz = 2; break;
4955 case Ity_I8: sz = 1; break;
4956 default: goto unhandled_cas;
4958 addInstr(env, AMD64Instr_ACAS(am, sz));
4959 addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RAX(), rOld));
4960 return;
4961 } else {
4962 /* double CAS */
4963 UChar sz;
4964 IRCAS* cas = stmt->Ist.CAS.details;
4965 IRType ty = typeOfIRExpr(env->type_env, cas->dataLo);
4966 /* only 32-bit and 64-bit allowed in this case */
4967 /* get: cas->expdLo into %rax, and cas->dataLo into %rbx */
4968 /* get: cas->expdHi into %rdx, and cas->dataHi into %rcx */
4969 AMD64AMode* am = iselIntExpr_AMode(env, cas->addr);
4970 HReg rDataHi = iselIntExpr_R(env, cas->dataHi);
4971 HReg rDataLo = iselIntExpr_R(env, cas->dataLo);
4972 HReg rExpdHi = iselIntExpr_R(env, cas->expdHi);
4973 HReg rExpdLo = iselIntExpr_R(env, cas->expdLo);
4974 HReg rOldHi = lookupIRTemp(env, cas->oldHi);
4975 HReg rOldLo = lookupIRTemp(env, cas->oldLo);
4976 switch (ty) {
4977 case Ity_I64:
4978 if (!(env->hwcaps & VEX_HWCAPS_AMD64_CX16))
4979 goto unhandled_cas; /* we'd have to generate
4980 cmpxchg16b, but the host
4981 doesn't support that */
4982 sz = 8;
4983 break;
4984 case Ity_I32:
4985 sz = 4;
4986 break;
4987 default:
4988 goto unhandled_cas;
         addInstr(env, mk_iMOVsd_RR(rExpdHi, rOldHi));
         addInstr(env, mk_iMOVsd_RR(rExpdLo, rOldLo));
         addInstr(env, mk_iMOVsd_RR(rExpdHi, hregAMD64_RDX()));
         addInstr(env, mk_iMOVsd_RR(rExpdLo, hregAMD64_RAX()));
         addInstr(env, mk_iMOVsd_RR(rDataHi, hregAMD64_RCX()));
         addInstr(env, mk_iMOVsd_RR(rDataLo, hregAMD64_RBX()));
         addInstr(env, AMD64Instr_DACAS(am, sz));
         addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RDX(), rOldHi));
         addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RAX(), rOldLo));
         return;
      }
      unhandled_cas:
      break;

   /* --------- INSTR MARK --------- */
   /* Doesn't generate any executable code ... */
   case Ist_IMark:
      return;

   /* --------- ABI HINT --------- */
   /* These have no meaning (denotation in the IR) and so we ignore
      them ... if any actually made it this far. */
   case Ist_AbiHint:
      return;

   /* --------- NO-OP --------- */
   case Ist_NoOp:
      return;

   /* --------- EXIT --------- */
   case Ist_Exit: {
      if (stmt->Ist.Exit.dst->tag != Ico_U64)
         vpanic("iselStmt(amd64): Ist_Exit: dst is not a 64-bit value");

      AMD64CondCode cc = iselCondCode(env, stmt->Ist.Exit.guard);
      AMD64AMode* amRIP = AMD64AMode_IR(stmt->Ist.Exit.offsIP,
                                        hregAMD64_RBP());
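
      /* Every exit first writes the destination guest address into the
         guest RIP slot (amRIP, addressed off %rbp, the guest state
         pointer) so the dispatcher knows where the guest is headed, and
         only then transfers control. */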

      /* Case: boring transfer to known address */
      if (stmt->Ist.Exit.jk == Ijk_Boring) {
         if (env->chainingAllowed) {
            /* .. almost always true .. */
            /* Skip the event check at the dst if this is a forwards
               edge. */
            Bool toFastEP
               = ((Addr64)stmt->Ist.Exit.dst->Ico.U64) > env->max_ga;
            if (0) vex_printf("%s", toFastEP ? "Y" : ",");
            addInstr(env, AMD64Instr_XDirect(stmt->Ist.Exit.dst->Ico.U64,
                                             amRIP, cc, toFastEP));
         } else {
            /* .. very occasionally .. */
            /* We can't use chaining, so ask for an assisted transfer,
               as that's the only alternative that is allowable. */
            HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
            addInstr(env, AMD64Instr_XAssisted(r, amRIP, cc, Ijk_Boring));
         }
         return;
      }

      /* Case: assisted transfer to arbitrary address */
      switch (stmt->Ist.Exit.jk) {
         /* Keep this list in sync with that in iselNext below */
         case Ijk_ClientReq:
         case Ijk_EmWarn:
         case Ijk_NoDecode:
         case Ijk_NoRedir:
         case Ijk_SigSEGV:
         case Ijk_SigTRAP:
         case Ijk_Sys_syscall:
         case Ijk_Sys_int210:
         case Ijk_InvalICache:
         case Ijk_Yield:
         {
            HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
            addInstr(env, AMD64Instr_XAssisted(r, amRIP, cc, stmt->Ist.Exit.jk));
            return;
         }
         default:
            break;
      }

      /* Do we ever expect to see any other kind? */
      goto stmt_fail;
   }

   default: break;
   }
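
   /* Anything that drops out of the switch without returning is a
      statement kind we don't know how to select code for. */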
  stmt_fail:
   ppIRStmt(stmt);
   vpanic("iselStmt(amd64)");
}


/*---------------------------------------------------------*/
/*--- ISEL: Basic block terminators (Nexts)             ---*/
/*---------------------------------------------------------*/

static void iselNext ( ISelEnv* env,
                       IRExpr* next, IRJumpKind jk, Int offsIP )
{
   if (vex_traceflags & VEX_TRACE_VCODE) {
      vex_printf( "\n-- PUT(%d) = ", offsIP);
      ppIRExpr( next );
      vex_printf( "; exit-");
      ppIRJumpKind(jk);
      vex_printf( "\n");
   }
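
   /* What follows chooses among three transfer mechanisms, in decreasing
      order of preference: a directly-chained jump (XDirect) when the
      destination is a known constant and chaining is allowed, an
      indirectly-chained jump (XIndir) for computed boring/call/return
      targets, and an assisted transfer (XAssisted) for everything that
      has to be handed back to the scheduler. */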

   /* Case: boring transfer to known address */
   if (next->tag == Iex_Const) {
      IRConst* cdst = next->Iex.Const.con;
      vassert(cdst->tag == Ico_U64);
      if (jk == Ijk_Boring || jk == Ijk_Call) {
         /* Boring transfer to known address */
         AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
         if (env->chainingAllowed) {
            /* .. almost always true .. */
            /* Skip the event check at the dst if this is a forwards
               edge. */
            Bool toFastEP
               = ((Addr64)cdst->Ico.U64) > env->max_ga;
            if (0) vex_printf("%s", toFastEP ? "X" : ".");
            addInstr(env, AMD64Instr_XDirect(cdst->Ico.U64,
                                             amRIP, Acc_ALWAYS,
                                             toFastEP));
         } else {
            /* .. very occasionally .. */
            /* We can't use chaining, so ask for an indirect transfer,
               as that's the cheapest alternative that is
               allowable. */
            HReg r = iselIntExpr_R(env, next);
            addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS,
                                               Ijk_Boring));
         }
         return;
      }
   }

   /* Case: call/return (==boring) transfer to any address */
   switch (jk) {
      case Ijk_Boring: case Ijk_Ret: case Ijk_Call: {
         HReg r = iselIntExpr_R(env, next);
         AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
         if (env->chainingAllowed) {
            addInstr(env, AMD64Instr_XIndir(r, amRIP, Acc_ALWAYS));
         } else {
            addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS,
                                               Ijk_Boring));
         }
         return;
      }
      default:
         break;
   }

   /* Case: assisted transfer to arbitrary address */
   switch (jk) {
      /* Keep this list in sync with that for Ist_Exit above */
      case Ijk_ClientReq:
      case Ijk_EmWarn:
      case Ijk_NoDecode:
      case Ijk_NoRedir:
      case Ijk_SigSEGV:
      case Ijk_SigTRAP:
      case Ijk_Sys_syscall:
      case Ijk_Sys_int210:
      case Ijk_InvalICache:
      case Ijk_Yield: {
         HReg r = iselIntExpr_R(env, next);
         AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
         addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS, jk));
         return;
      }
      default:
         break;
   }

   vex_printf( "\n-- PUT(%d) = ", offsIP);
   ppIRExpr( next );
   vex_printf( "; exit-");
   ppIRJumpKind(jk);
   vex_printf( "\n");
   vassert(0); // are we expecting any other kind?
}


/*---------------------------------------------------------*/
/*--- Insn selector top-level                           ---*/
/*---------------------------------------------------------*/

/* Translate an entire SB to amd64 code. */

HInstrArray* iselSB_AMD64 ( const IRSB* bb,
                            VexArch arch_host,
                            const VexArchInfo* archinfo_host,
                            const VexAbiInfo* vbi/*UNUSED*/,
                            Int offs_Host_EvC_Counter,
                            Int offs_Host_EvC_FailAddr,
                            Bool chainingAllowed,
                            Bool addProfInc,
                            Addr max_ga )
{
   Int i, j;
   HReg hreg, hregHI;
   ISelEnv* env;
   UInt hwcaps_host = archinfo_host->hwcaps;
   AMD64AMode *amCounter, *amFailAddr;

   /* sanity ... */
   vassert(arch_host == VexArchAMD64);
   vassert(0 == (hwcaps_host
                 & ~(VEX_HWCAPS_AMD64_SSE3
                     | VEX_HWCAPS_AMD64_SSSE3
                     | VEX_HWCAPS_AMD64_CX16
                     | VEX_HWCAPS_AMD64_LZCNT
                     | VEX_HWCAPS_AMD64_AVX
                     | VEX_HWCAPS_AMD64_RDTSCP
                     | VEX_HWCAPS_AMD64_BMI
                     | VEX_HWCAPS_AMD64_AVX2
                     | VEX_HWCAPS_AMD64_F16C
                     | VEX_HWCAPS_AMD64_RDRAND)));
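
   /* The mask above lists every hwcaps bit this backend knows how to
      handle; any other bit arriving from the front end trips the
      assertion rather than being silently ignored. */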

   /* Check that the host's endianness is as expected. */
   vassert(archinfo_host->endness == VexEndnessLE);

   /* Make up an initial environment to use. */
   env = LibVEX_Alloc_inline(sizeof(ISelEnv));
   env->vreg_ctr = 0;

   /* Set up output code array. */
   env->code = newHInstrArray();

   /* Copy BB's type env. */
   env->type_env = bb->tyenv;

   /* Make up an IRTemp -> virtual HReg mapping.  This doesn't
      change as we go along. */
   env->n_vregmap = bb->tyenv->types_used;
   env->vregmap = LibVEX_Alloc_inline(env->n_vregmap * sizeof(HReg));
   env->vregmapHI = LibVEX_Alloc_inline(env->n_vregmap * sizeof(HReg));

   /* and finally ... */
   env->chainingAllowed = chainingAllowed;
   env->hwcaps = hwcaps_host;
   env->max_ga = max_ga;

   /* For each IR temporary, allocate a suitably-kinded virtual
      register. */
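   /* Ity_I128 and Ity_V256 temporaries don't fit in a single host
      register, so they get a (vregmap, vregmapHI) pair holding the low
      and high halves; every other type leaves vregmapHI as
      INVALID_HREG. */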
   j = 0;
   for (i = 0; i < env->n_vregmap; i++) {
      hregHI = hreg = INVALID_HREG;
      switch (bb->tyenv->types[i]) {
         case Ity_I1:
         case Ity_I8: case Ity_I16: case Ity_I32: case Ity_I64:
            hreg = mkHReg(True, HRcInt64, 0, j++);
            break;
         case Ity_I128:
            hreg = mkHReg(True, HRcInt64, 0, j++);
            hregHI = mkHReg(True, HRcInt64, 0, j++);
            break;
         case Ity_F32:
         case Ity_F64:
         case Ity_V128:
            hreg = mkHReg(True, HRcVec128, 0, j++);
            break;
         case Ity_V256:
            hreg = mkHReg(True, HRcVec128, 0, j++);
            hregHI = mkHReg(True, HRcVec128, 0, j++);
            break;
         default:
            ppIRType(bb->tyenv->types[i]);
            vpanic("iselBB(amd64): IRTemp type");
      }
      env->vregmap[i] = hreg;
      env->vregmapHI[i] = hregHI;
   }
   env->vreg_ctr = j;
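
   /* An "event check" decrements the dispatch-events counter held in the
      guest state and, when it goes negative, exits through the failure
      address, so the scheduler regains control at block granularity. */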
   /* The very first instruction must be an event check. */
   amCounter = AMD64AMode_IR(offs_Host_EvC_Counter, hregAMD64_RBP());
   amFailAddr = AMD64AMode_IR(offs_Host_EvC_FailAddr, hregAMD64_RBP());
   addInstr(env, AMD64Instr_EvCheck(amCounter, amFailAddr));

   /* Possibly a block counter increment (for profiling).  At this
      point we don't know the address of the counter, so just pretend
      it is zero.  It will have to be patched later, but before this
      translation is used, by a call to LibVEX_patchProfCtr. */
   if (addProfInc) {
      addInstr(env, AMD64Instr_ProfInc());
   }

   /* Ok, finally we can iterate over the statements. */
   for (i = 0; i < bb->stmts_used; i++)
      if (bb->stmts[i])
         iselStmt(env, bb->stmts[i]);

   iselNext(env, bb->next, bb->jumpkind, bb->offsIP);

   /* record the number of vregs we used. */
   env->code->n_vregs = env->vreg_ctr;
   return env->code;
}


/*---------------------------------------------------------------*/
/*--- end                                  host_amd64_isel.c ---*/
/*---------------------------------------------------------------*/